Joey Hess 2017-11-28 17:17:40 -04:00
parent 53f91bddfa
commit 3febb79c8f
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 149 additions and 80 deletions

View file

@@ -3,6 +3,10 @@ git-annex (6.20171125) UNRELEASED; urgency=medium
* Use youtube-dl rather than quvi to download media from web pages,
since quvi is not being actively developed and youtube-dl supports
many more sites.
* addurl --relaxed no longer checks for embedded media, since youtube-dl
does not allow doing so without hitting the network, which would make
this no faster than addurl --fast. Use addurl --fast instead if you
want embedded media to be downloaded.
-- Joey Hess <id@joeyh.name> Tue, 28 Nov 2017 13:48:44 -0400

View file

@@ -28,9 +28,8 @@ import Annex.FileMatcher
import Logs.Location
import Utility.Metered
import Utility.FileSystemEncoding
import Utility.HtmlDetect
import qualified Annex.Transfer as Transfer
import Annex.Quvi
import qualified Utility.Quvi as Quvi
cmd :: Command
cmd = notBareRepo $ withGlobalOptions [jobsOption, jsonOption, jsonProgressOption] $
@@ -85,7 +84,7 @@ parseRelaxedOption = switch
parseRawOption :: Parser Bool
parseRawOption = switch
( long "raw"
<> help "disable special handling for torrents, quvi, etc"
<> help "disable special handling for torrents, youtube-dl, etc"
)
seek :: AddUrlOptions -> CommandSeek
@@ -121,7 +120,7 @@ checkUrl r o u = do
where
go _ (Left e) = void $ commandAction $ do
showStart "addurl" u
showStart' "addurl" (Just u)
warning (show e)
next $ next $ return False
go deffile (Right (UrlContents sz mf)) = do
@@ -144,8 +143,9 @@ startRemote :: Remote -> Bool -> FilePath -> URLString -> Maybe Integer -> Comma
startRemote r relaxed file uri sz = do
pathmax <- liftIO $ fileNameLengthLimit "."
let file' = joinPath $ map (truncateFilePath pathmax) $ splitDirectories file
showStart "addurl" file'
showStart' "addurl" (Just uri)
showNote $ "from " ++ Remote.name r
showDestinationFile file'
next $ performRemote r relaxed uri file' sz
performRemote :: Remote -> Bool -> URLString -> FilePath -> Maybe Integer -> CommandPerform
@@ -181,19 +181,13 @@ downloadRemoteFile r relaxed uri file sz = checkCanAdd file $ do
where
loguri = setDownloader uri OtherDownloader
startWeb :: AddUrlOptions -> String -> CommandStart
startWeb o s = go $ fromMaybe bad $ parseURI urlstring
startWeb :: AddUrlOptions -> URLString -> CommandStart
startWeb o urlstring = go $ fromMaybe bad $ parseURI urlstring
where
(urlstring, downloader) = getDownloader s
bad = fromMaybe (giveup $ "bad url " ++ urlstring) $
Url.parseURIRelaxed $ urlstring
go url = case downloader of
QuviDownloader -> usequvi
_ -> ifM (quviSupported urlstring)
( usequvi
, regulardownload url
)
regulardownload url = do
go url = do
showStart' "addurl" (Just urlstring)
pathmax <- liftIO $ fileNameLengthLimit "."
urlinfo <- if relaxedOption o
then pure Url.assumeUrlExists
@@ -209,25 +203,14 @@ startWeb o s = go $ fromMaybe bad $ parseURI urlstring
( pure $ url2file url (pathdepthOption o) pathmax
, pure f
)
showStart "addurl" file
next $ performWeb (relaxedOption o) urlstring file urlinfo
badquvi = giveup $ "quvi does not know how to download url " ++ urlstring
usequvi = do
page <- fromMaybe badquvi
<$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] urlstring
let link = fromMaybe badquvi $ headMaybe $ Quvi.pageLinks page
pathmax <- liftIO $ fileNameLengthLimit "."
let file = adjustFile o $ flip fromMaybe (fileOption o) $
truncateFilePath pathmax $ sanitizeFilePath $
Quvi.pageTitle page ++ "." ++ fromMaybe "m" (Quvi.linkSuffix link)
showStart "addurl" file
next $ performQuvi (relaxedOption o) urlstring (Quvi.linkUrl link) file
next $ performWeb o urlstring file urlinfo
performWeb :: Bool -> URLString -> FilePath -> Url.UrlInfo -> CommandPerform
performWeb relaxed url file urlinfo = ifAnnexed file addurl geturl
performWeb :: AddUrlOptions -> URLString -> FilePath -> Url.UrlInfo -> CommandPerform
performWeb o url file urlinfo = ifAnnexed file addurl geturl
where
geturl = next $ isJust <$> addUrlFile relaxed url urlinfo file
addurl = addUrlChecked relaxed url webUUID $ \k -> return $
geturl = next $ isJust <$> addUrlFile (relaxedOption o) url urlinfo file
-- TODO youtube-dl
addurl = addUrlChecked (relaxedOption o) url webUUID $ \k -> return $
(Url.urlExists urlinfo, Url.urlSize urlinfo == keySize k)
performQuvi :: Bool -> URLString -> URLString -> FilePath -> CommandPerform
@@ -290,53 +273,111 @@ addUrlChecked relaxed url u checkexistssize key
stop
)
{- Adds an url, normally to the specified FilePath. But, if youtube-dl
- supports the url, it will be written to a different file, based on the
- title of the media.
-}
addUrlFile :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
addUrlFile relaxed url urlinfo file = checkCanAdd file $ do
liftIO $ createDirectoryIfMissing True (parentDir file)
ifM (Annex.getState Annex.fast <||> pure relaxed)
( nodownload url urlinfo file
addUrlFile relaxed url urlinfo file
| relaxed = checkCanAdd file $ do
liftIO $ createDirectoryIfMissing True (parentDir file)
nodownload url urlinfo file
| otherwise = ifM (Annex.getState Annex.fast)
( checkCanAdd file $ do
liftIO $ createDirectoryIfMissing True (parentDir file)
nodownload url urlinfo file
, downloadWeb url urlinfo file
)
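
Spelled out, the guard structure above reduces to this decision (a restatement for clarity, not code from the diff):

-- | How addUrlFile decides, per the guards above:
--   relaxed           -> nodownload  (record url; no download, no size check)
--   fast, not relaxed -> nodownload  (record url; size from the earlier HEAD check)
--   otherwise         -> downloadWeb (download the content now)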
downloadWeb :: URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
downloadWeb url urlinfo file = do
let dummykey = addSizeUrlKey urlinfo $ Backend.URL.fromUrl url Nothing
let downloader f p = do
downloadWeb url urlinfo file =
go =<< downloadWith' downloader dummykey webUUID url (AssociatedFile (Just file))
where
dummykey = addSizeUrlKey urlinfo $ Backend.URL.fromUrl url Nothing
downloader f p = do
showOutput
downloadUrl dummykey p [url] f
showAction $ "downloading " ++ url ++ " "
downloadWith downloader dummykey webUUID url file
go Nothing = return Nothing
-- If we downloaded a html file, try to use youtube-dl to
-- extract embedded media.
go (Just tmp) = ifM (liftIO $ isHtml <$> readFile tmp)
( do
-- TODO need a directory based on dummykey,
-- which unused needs to clean up like
-- it does gitAnnexTmpObjectLocation
tmpdir <- undefined
liftIO $ createDirectoryIfMissing True tmpdir
mf <- youtubeDl url tmpdir
case mf of
Just mediafile -> do
liftIO $ nukeFile tmp
let mediaurl = setDownloader url YoutubeDownloader
let key = Backend.URL.fromUrl mediaurl Nothing
let dest = takeFileName mediafile
showDestinationFile dest
cleanup webUUID mediaurl dest key (Just mediafile)
return (Just key)
Nothing -> normalfinish tmp
, normalfinish tmp
)
normalfinish tmp = do
showDestinationFile file
liftIO $ createDirectoryIfMissing True (parentDir file)
finishDownloadWith tmp webUUID url file
youtubeDl :: URLString -> FilePath -> Annex (Maybe FilePath)
youtubeDl = undefined -- TODO
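
The stub above is left for a later commit. Purely as an illustration, here is a minimal standalone sketch of such a wrapper, in plain IO rather than the Annex monad, with the flag choice taken from the playlist discussion in the todo item at the end of this diff; none of this is code from the commit itself:

import System.Directory (listDirectory)
import System.Exit (ExitCode(..))
import System.Process (cwd, proc, readCreateProcessWithExitCode)

-- Run youtube-dl in a work directory and return the media file it
-- produced, if any. New files are found by diffing directory listings;
-- that and the fixed flag list are simplifying assumptions.
youtubeDlSketch :: String -> FilePath -> IO (Maybe FilePath)
youtubeDlSketch url workdir = do
    before <- listDirectory workdir
    (code, _out, _err) <- readCreateProcessWithExitCode
        ((proc "youtube-dl" ["--no-playlist", "--playlist-items", "0", url])
            { cwd = Just workdir })
        ""
    after <- listDirectory workdir
    case (code, filter (`notElem` before) after) of
        (ExitSuccess, (f:_)) -> return (Just f)
        _ -> return Nothing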
showDestinationFile :: FilePath -> Annex ()
showDestinationFile file = do
showNote ("to " ++ file)
maybeShowJSON $ JSONChunk [("file", file)]
{- The Key should be a dummy key, based on the URL, which is used
- for this download, before we can examine the file and find its real key.
- For resuming downloads to work, the dummy key for a given url should be
- stable. -}
- stable. For disk space checking to work, the dummy key should have
- the size of the url already set.
-
- Downloads the url, sets up the worktree file, and returns the
- real key.
-}
downloadWith :: (FilePath -> MeterUpdate -> Annex Bool) -> Key -> UUID -> URLString -> FilePath -> Annex (Maybe Key)
downloadWith downloader dummykey u url file =
go =<< downloadWith' downloader dummykey u url afile
where
afile = AssociatedFile (Just file)
go Nothing = return Nothing
go (Just tmp) = finishDownloadWith tmp u url file
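
To make the dummy key comment above concrete: URL keys encode the url in the key name, folding in the size when known, so a given url always yields the same key. A simplified sketch of that shape (the real Key type and Backend.URL serialization carry more structure; this is an illustration, not the implementation):

-- Stable, size-carrying dummy key for an url, rendered as a string.
dummyKeySketch :: String -> Maybe Integer -> String
dummyKeySketch url msize =
    "URL" ++ maybe "" (\sz -> "-s" ++ show sz) msize ++ "--" ++ url

-- For example:
--   dummyKeySketch "http://example.com/big.iso" (Just 12345)
--     == "URL-s12345--http://example.com/big.iso"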
{- Like downloadWith, but leaves the dummy key content in
- the returned location. -}
downloadWith' :: (FilePath -> MeterUpdate -> Annex Bool) -> Key -> UUID -> URLString -> AssociatedFile -> Annex (Maybe FilePath)
downloadWith' downloader dummykey u url afile =
checkDiskSpaceToGet dummykey Nothing $ do
tmp <- fromRepo $ gitAnnexTmpObjectLocation dummykey
ifM (runtransfer tmp)
( do
backend <- chooseBackend file
let source = KeySource
{ keyFilename = file
, contentLocation = tmp
, inodeCache = Nothing
}
k <- genKey source backend
case k of
Nothing -> return Nothing
Just (key, _) -> do
cleanup u url file key (Just tmp)
return (Just key)
, return Nothing
)
where
runtransfer tmp = Transfer.notifyTransfer Transfer.Download afile $
Transfer.download u dummykey afile Transfer.forwardRetry $ \p -> do
liftIO $ createDirectoryIfMissing True (parentDir tmp)
downloader tmp p
afile = AssociatedFile (Just file)
ok <- Transfer.notifyTransfer Transfer.Download url $
Transfer.download u dummykey afile Transfer.forwardRetry $ \p -> do
liftIO $ createDirectoryIfMissing True (parentDir tmp)
downloader tmp p
if ok
then return (Just tmp)
else return Nothing
finishDownloadWith :: FilePath -> UUID -> URLString -> FilePath -> Annex (Maybe Key)
finishDownloadWith tmp u url file = do
backend <- chooseBackend file
let source = KeySource
{ keyFilename = file
, contentLocation = tmp
, inodeCache = Nothing
}
k <- genKey source backend
case k of
Nothing -> return Nothing
Just (key, _) -> do
cleanup u url file key (Just tmp)
return (Just key)
{- Adds the url size to the Key. -}
addSizeUrlKey :: Url.UrlInfo -> Key -> Key
@@ -369,6 +410,7 @@ cleanup u url file key mtmp = case mtmp of
, liftIO $ maybe noop nukeFile mtmp
)
-- TODO youtube-dl
nodownload :: URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
nodownload url urlinfo file
| Url.urlExists urlinfo = do

View file

@@ -100,15 +100,15 @@ removeTempUrl :: Key -> Annex ()
removeTempUrl key = Annex.changeState $ \s ->
s { Annex.tempurls = M.delete key (Annex.tempurls s) }
data Downloader = WebDownloader | QuviDownloader | OtherDownloader
data Downloader = WebDownloader | YoutubeDownloader | QuviDownloader | OtherDownloader
deriving (Eq, Show)
{- To keep track of how an url is downloaded, it's mangled slightly in
- the log. For quvi, "quvi:" is prefixed. For urls that are handled by
- some other remote, ":" is prefixed. -}
- the log, with a prefix indicating when a Downloader is used. -}
setDownloader :: URLString -> Downloader -> String
setDownloader u WebDownloader = u
setDownloader u QuviDownloader = "quvi:" ++ u
setDownloader u YoutubeDownloader = "yt:" ++ u
setDownloader u OtherDownloader = ":" ++ u
setDownloader' :: URLString -> Remote -> String
@@ -118,6 +118,9 @@ setDownloader' u r
getDownloader :: URLString -> (URLString, Downloader)
getDownloader u = case separate (== ':') u of
("quvi", u') -> (u', QuviDownloader)
("yt", u') -> (u', YoutubeDownloader)
-- quvi is not used any longer; youtube-dl should be able to handle
-- all urls it did.
("quvi", u') -> (u', YoutubeDownloader)
("", u') -> (u', OtherDownloader)
_ -> (u, WebDownloader)
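
For reference, a few examples of how these prefixes round-trip under the definitions above (example url made up):

--   setDownloader "http://e.com/v" YoutubeDownloader == "yt:http://e.com/v"
--   getDownloader "yt:http://e.com/v"   == ("http://e.com/v", YoutubeDownloader)
--   getDownloader "quvi:http://e.com/v" == ("http://e.com/v", YoutubeDownloader)
--   getDownloader ":special"            == ("special", OtherDownloader)
--   getDownloader "http://e.com/v"      == ("http://e.com/v", WebDownloader)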

View file

@@ -18,11 +18,11 @@ import Data.Char
-- Html fragments like "<p>this</p>" are not detected as being html,
-- although some browsers may choose to render them as html.
isHtml :: String -> Bool
isHtml = evaluate . canonicalizeTags . parseTags . truncate
isHtml = evaluate . canonicalizeTags . parseTags . shorten
where
-- We only care about the beginning of the file,
-- so although tagsoup parses lazily anyway, truncate it.
truncate = take 16384
shorten = take 16384
evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html"
evaluate (TagOpen "html" _:_) = True
-- Allow some leading whitespace before the tag.
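
A few examples of what this detector accepts and rejects, consistent with the cases above (the leading-whitespace case itself is cut off by the hunk):

--   isHtml "<!DOCTYPE html><html><body>hi</body></html>" == True
--   isHtml "<html><head><title>t</title></head></html>"  == True
--   isHtml "  <html>"      == True   (leading whitespace is skipped)
--   isHtml "<p>this</p>"   == False  (fragment, per the comment above)
--   isHtml "plain text"    == False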

View file

@@ -10,8 +10,8 @@ git annex addurl `[url ...]`
Downloads each url to its own file, which is added to the annex.
When `youtube-dl` is installed, and the url is to a web page,
it's used to download any video that the web page embeds.
When `youtube-dl` is installed, it's used to download videos
embedded on web pages.
Urls to torrent files (including magnet links) will cause the content of
the torrent to be downloaded, using `aria2c`.
@@ -28,10 +28,20 @@ be used to get better filenames.
Avoid immediately downloading the url. The url is still checked
(via HEAD) to verify that it exists, and to get its size if possible.
When `youtube-dl` is installed, videos embedded on web pages
will be added. To avoid the extra work of checking for videos,
add the `--raw` option.
* `--relaxed`
Avoid storing the size of the url's content, and accept whatever
content is there at a future point. (Implies `--fast`.)
Don't immediately download the url, and avoid storing the size of the
url's content. This makes git-annex accept whatever content is there
at a future point. This is also the fastest option, since it does not
hit the network at all.
Note that this does *not* check for embedded videos using `youtube-dl`,
although it used to in previous versions of git-annex.
Use --fast instead if you want to do that.
* `--raw`

View file

@@ -16,12 +16,12 @@ urls, see for example http://bugs.debian.org/874321)
So, switching to youtube-dl would probably need a new switch, like `git
annex addurl --rip` that enables using it.
Currently `git annex importfeed` automatically tests for video urls with
quvi; it would also need to support `--rip`.
(Importfeed only treats links in the feed as video urls, not enclosures,
so this problem does not affect it and it would not need a new switch.)
Both of those changes would need changes to user's workflows and cron jobs.
git-annex could keep supporting quvi for some time, and warn when it uses
quvi, to help with the transition.
That would need changes to users' workflows. git-annex could keep
supporting quvi for some time, and warn when it uses quvi, to
help with the transition.
> Alternatively, git-annex addurl could download the url first, and then
> check the file to see if it looks like html. If so, run youtube-dl (which
@@ -30,12 +30,22 @@ quvi, to help with the transition.
> overhead, and the redundant download is fairly small compared to ripping
> the media. Only the unusual case where addurl is being used on html that
> does not contain media becomes more expensive.
>
> However, for --relaxed, running youtube-dl --get-filename would be
> significantly more expensive since it hits the network. It seems that
> --relaxed would need to change to not rip videos; users who want that
> could use --fast.
>
> --fast already hits the network, but
> if it uses youtube-dl --get-filename, it would fall afoul of
> bugs like <http://bugs.debian.org/874321>, although those can be worked
> around (/dev/null stderr in case youtube-dl crashes)
Another gotcha is playlists. youtube-dl downloads playlists automatically.
But, git-annex needs to record an url that downloads a single file so that
`git annex get` works right. So, playlists will need to be disabled when
git-annex runs youtube-dl. But, `--no-playlist` does not always disable
playlists. Best option seems to be `--playlist-items 0` which works for
playlists. Best option seems to be `--no-playlist --playlist-items 0` which works for
non-playlists, and downloads only 1 item from playlists (hopefully a fairly
stable item, but who knows..).