commit 3febb79c8f
parent 53f91bddfa
Author: Joey Hess
Date: 2017-11-28 17:17:40 -04:00
GPG key ID DB12DB0FF05F8F38 (no known key found for this signature in database)

6 changed files with 149 additions and 80 deletions

@@ -3,6 +3,10 @@ git-annex (6.20171125) UNRELEASED; urgency=medium
   * Use youtube-dl rather than quvi to download media from web pages,
     since quvi is not being actively developed and youtube-dl supports
     many more sites.
+  * addurl --relaxed no longer checks for embedded media, since youtube-dl
+    does not allow doing so without hitting the network, which would make
+    this no faster than addurl --fast. Use addurl --fast instead if you
+    want embedded media to be downloaded.
 
  -- Joey Hess <id@joeyh.name>  Tue, 28 Nov 2017 13:48:44 -0400

@@ -28,9 +28,8 @@ import Annex.FileMatcher
 import Logs.Location
 import Utility.Metered
 import Utility.FileSystemEncoding
+import Utility.HtmlDetect
 import qualified Annex.Transfer as Transfer
-import Annex.Quvi
-import qualified Utility.Quvi as Quvi
 
 cmd :: Command
 cmd = notBareRepo $ withGlobalOptions [jobsOption, jsonOption, jsonProgressOption] $
@@ -85,7 +84,7 @@ parseRelaxedOption = switch
 parseRawOption :: Parser Bool
 parseRawOption = switch
 	( long "raw"
-	<> help "disable special handling for torrents, quvi, etc"
+	<> help "disable special handling for torrents, youtube-dl, etc"
 	)
 
 seek :: AddUrlOptions -> CommandSeek
@@ -121,7 +120,7 @@ checkUrl r o u = do
   where
 	go _ (Left e) = void $ commandAction $ do
-		showStart "addurl" u
+		showStart' "addurl" (Just u)
 		warning (show e)
 		next $ next $ return False
 	go deffile (Right (UrlContents sz mf)) = do
@@ -144,8 +143,9 @@ startRemote :: Remote -> Bool -> FilePath -> URLString -> Maybe Integer -> CommandStart
 startRemote r relaxed file uri sz = do
 	pathmax <- liftIO $ fileNameLengthLimit "."
 	let file' = joinPath $ map (truncateFilePath pathmax) $ splitDirectories file
-	showStart "addurl" file'
+	showStart' "addurl" (Just uri)
 	showNote $ "from " ++ Remote.name r
+	showDestinationFile file'
 	next $ performRemote r relaxed uri file' sz
 
 performRemote :: Remote -> Bool -> URLString -> FilePath -> Maybe Integer -> CommandPerform
@@ -181,19 +181,13 @@ downloadRemoteFile r relaxed uri file sz = checkCanAdd file $ do
   where
 	loguri = setDownloader uri OtherDownloader
 
-startWeb :: AddUrlOptions -> String -> CommandStart
-startWeb o s = go $ fromMaybe bad $ parseURI urlstring
+startWeb :: AddUrlOptions -> URLString -> CommandStart
+startWeb o urlstring = go $ fromMaybe bad $ parseURI urlstring
   where
-	(urlstring, downloader) = getDownloader s
 	bad = fromMaybe (giveup $ "bad url " ++ urlstring) $
 		Url.parseURIRelaxed $ urlstring
-	go url = case downloader of
-		QuviDownloader -> usequvi
-		_ -> ifM (quviSupported urlstring)
-			( usequvi
-			, regulardownload url
-			)
-	regulardownload url = do
+	go url = do
+		showStart' "addurl" (Just urlstring)
 		pathmax <- liftIO $ fileNameLengthLimit "."
 		urlinfo <- if relaxedOption o
 			then pure Url.assumeUrlExists
@@ -209,25 +203,14 @@ startWeb o s = go $ fromMaybe bad $ parseURI urlstring
 			( pure $ url2file url (pathdepthOption o) pathmax
 			, pure f
 			)
-		showStart "addurl" file
-		next $ performWeb (relaxedOption o) urlstring file urlinfo
-	badquvi = giveup $ "quvi does not know how to download url " ++ urlstring
-	usequvi = do
-		page <- fromMaybe badquvi
-			<$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] urlstring
-		let link = fromMaybe badquvi $ headMaybe $ Quvi.pageLinks page
-		pathmax <- liftIO $ fileNameLengthLimit "."
-		let file = adjustFile o $ flip fromMaybe (fileOption o) $
-			truncateFilePath pathmax $ sanitizeFilePath $
-				Quvi.pageTitle page ++ "." ++ fromMaybe "m" (Quvi.linkSuffix link)
-		showStart "addurl" file
-		next $ performQuvi (relaxedOption o) urlstring (Quvi.linkUrl link) file
+		next $ performWeb o urlstring file urlinfo
 
-performWeb :: Bool -> URLString -> FilePath -> Url.UrlInfo -> CommandPerform
-performWeb relaxed url file urlinfo = ifAnnexed file addurl geturl
+performWeb :: AddUrlOptions -> URLString -> FilePath -> Url.UrlInfo -> CommandPerform
+performWeb o url file urlinfo = ifAnnexed file addurl geturl
   where
-	geturl = next $ isJust <$> addUrlFile relaxed url urlinfo file
-	addurl = addUrlChecked relaxed url webUUID $ \k -> return $
+	geturl = next $ isJust <$> addUrlFile (relaxedOption o) url urlinfo file
+	-- TODO youtube-dl
+	addurl = addUrlChecked (relaxedOption o) url webUUID $ \k -> return $
 		(Url.urlExists urlinfo, Url.urlSize urlinfo == keySize k)
 
 performQuvi :: Bool -> URLString -> URLString -> FilePath -> CommandPerform
@@ -290,33 +273,99 @@ addUrlChecked relaxed url u checkexistssize key
 		stop
 		)
 
+{- Adds an url, normally to the specified FilePath. But, if youtube-dl
+ - supports the url, it will be written to a different file, based on the
+ - title of the media.
+ -}
 addUrlFile :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
-addUrlFile relaxed url urlinfo file = checkCanAdd file $ do
-	liftIO $ createDirectoryIfMissing True (parentDir file)
-	ifM (Annex.getState Annex.fast <||> pure relaxed)
-		( nodownload url urlinfo file
+addUrlFile relaxed url urlinfo file
+	| relaxed = checkCanAdd file $ do
+		liftIO $ createDirectoryIfMissing True (parentDir file)
+		nodownload url urlinfo file
+	| otherwise = ifM (Annex.getState Annex.fast)
+		( checkCanAdd file $ do
+			liftIO $ createDirectoryIfMissing True (parentDir file)
+			nodownload url urlinfo file
 		, downloadWeb url urlinfo file
 		)
 
 downloadWeb :: URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
-downloadWeb url urlinfo file = do
-	let dummykey = addSizeUrlKey urlinfo $ Backend.URL.fromUrl url Nothing
-	let downloader f p = do
+downloadWeb url urlinfo file =
+	go =<< downloadWith' downloader dummykey webUUID url (AssociatedFile (Just file))
+  where
+	dummykey = addSizeUrlKey urlinfo $ Backend.URL.fromUrl url Nothing
+	downloader f p = do
 		showOutput
 		downloadUrl dummykey p [url] f
-	showAction $ "downloading " ++ url ++ " "
-	downloadWith downloader dummykey webUUID url file
+	go Nothing = return Nothing
+	-- If we downloaded a html file, try to use youtube-dl to
+	-- extract embedded media.
+	go (Just tmp) = ifM (liftIO $ isHtml <$> readFile tmp)
+		( do
+			-- TODO need a directory based on dummykey,
+			-- which unused needs to clean up like
+			-- it does gitAnnexTmpObjectLocation
+			tmpdir <- undefined
+			liftIO $ createDirectoryIfMissing True tmpdir
+			mf <- youtubeDl url tmpdir
+			case mf of
+				Just mediafile -> do
+					liftIO $ nukeFile tmp
+					let mediaurl = setDownloader url YoutubeDownloader
+					let key = Backend.URL.fromUrl mediaurl Nothing
+					let dest = takeFileName mediafile
+					showDestinationFile dest
+					cleanup webUUID mediaurl dest key (Just mediafile)
+					return (Just key)
+				Nothing -> normalfinish tmp
+		, normalfinish tmp
+		)
+	normalfinish tmp = do
+		showDestinationFile file
+		liftIO $ createDirectoryIfMissing True (parentDir file)
+		finishDownloadWith tmp webUUID url file
+
+youtubeDl :: URLString -> FilePath -> Annex (Maybe FilePath)
+youtubeDl = undefined -- TODO
+
+showDestinationFile :: FilePath -> Annex ()
+showDestinationFile file = do
+	showNote ("to " ++ file)
+	maybeShowJSON $ JSONChunk [("file", file)]
 
 {- The Key should be a dummy key, based on the URL, which is used
  - for this download, before we can examine the file and find its real key.
  - For resuming downloads to work, the dummy key for a given url should be
- - stable. -}
+ - stable. For disk space checking to work, the dummy key should have
+ - the size of the url already set.
+ -
+ - Downloads the url, sets up the worktree file, and returns the
+ - real key.
+ -}
 downloadWith :: (FilePath -> MeterUpdate -> Annex Bool) -> Key -> UUID -> URLString -> FilePath -> Annex (Maybe Key)
 downloadWith downloader dummykey u url file =
+	go =<< downloadWith' downloader dummykey u url afile
+  where
+	afile = AssociatedFile (Just file)
+	go Nothing = return Nothing
+	go (Just tmp) = finishDownloadWith tmp u url file
+
+{- Like downloadWith, but leaves the dummy key content in
+ - the returned location. -}
+downloadWith' :: (FilePath -> MeterUpdate -> Annex Bool) -> Key -> UUID -> URLString -> AssociatedFile -> Annex (Maybe FilePath)
+downloadWith' downloader dummykey u url afile =
 	checkDiskSpaceToGet dummykey Nothing $ do
 		tmp <- fromRepo $ gitAnnexTmpObjectLocation dummykey
-		ifM (runtransfer tmp)
-			( do
+		ok <- Transfer.notifyTransfer Transfer.Download url $
+			Transfer.download u dummykey afile Transfer.forwardRetry $ \p -> do
+				liftIO $ createDirectoryIfMissing True (parentDir tmp)
+				downloader tmp p
+		if ok
+			then return (Just tmp)
+			else return Nothing
+
+finishDownloadWith :: FilePath -> UUID -> URLString -> FilePath -> Annex (Maybe Key)
+finishDownloadWith tmp u url file = do
 	backend <- chooseBackend file
 	let source = KeySource
 		{ keyFilename = file
@@ -329,14 +378,6 @@ downloadWith downloader dummykey u url file =
 		Just (key, _) -> do
 			cleanup u url file key (Just tmp)
 			return (Just key)
-		, return Nothing
-		)
-  where
-	runtransfer tmp = Transfer.notifyTransfer Transfer.Download afile $
-		Transfer.download u dummykey afile Transfer.forwardRetry $ \p -> do
-			liftIO $ createDirectoryIfMissing True (parentDir tmp)
-			downloader tmp p
-	afile = AssociatedFile (Just file)
 
 {- Adds the url size to the Key. -}
 addSizeUrlKey :: Url.UrlInfo -> Key -> Key
@@ -369,6 +410,7 @@ cleanup u url file key mtmp = case mtmp of
 	, liftIO $ maybe noop nukeFile mtmp
 	)
 
+-- TODO youtube-dl
 nodownload :: URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
 nodownload url urlinfo file
 	| Url.urlExists urlinfo = do
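
The `youtubeDl` function above is deliberately left as an `undefined -- TODO` stub in this commit. For illustration only, here is a minimal sketch of the kind of implementation the stub points at, written as plain `IO` rather than in the `Annex` monad, and using the `--no-playlist --playlist-items 0` flags settled on in the todo discussion below; the details are assumptions, not part of this commit:

```haskell
import System.Directory (getDirectoryContents)
import System.Exit (ExitCode(..))
import System.FilePath ((</>))
import System.Process (CreateProcess(..), proc, readCreateProcessWithExitCode)

-- Run youtube-dl in an (assumed empty) work directory with playlist
-- handling disabled, and return the single file it downloaded, if any.
youtubeDl :: String -> FilePath -> IO (Maybe FilePath)
youtubeDl url workdir = do
	(code, _out, _err) <- readCreateProcessWithExitCode
		((proc "youtube-dl" ["--no-playlist", "--playlist-items", "0", url])
			{ cwd = Just workdir })
		""
	case code of
		ExitSuccess -> do
			fs <- filter (`notElem` [".", ".."]) <$> getDirectoryContents workdir
			case fs of
				[f] -> return (Just (workdir </> f))
				-- Zero or several files: don't guess which one to record.
				_ -> return Nothing
		_ -> return Nothing
```

Returning `Nothing` unless exactly one file was produced sidesteps the playlist ambiguity discussed in the todo file at the end of this commit.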

@@ -100,15 +100,15 @@ removeTempUrl :: Key -> Annex ()
 removeTempUrl key = Annex.changeState $ \s ->
 	s { Annex.tempurls = M.delete key (Annex.tempurls s) }
 
-data Downloader = WebDownloader | QuviDownloader | OtherDownloader
+data Downloader = WebDownloader | YoutubeDownloader | QuviDownloader | OtherDownloader
 	deriving (Eq, Show)
 
 {- To keep track of how an url is downloaded, it's mangled slightly in
- - the log. For quvi, "quvi:" is prefixed. For urls that are handled by
- - some other remote, ":" is prefixed. -}
+ - the log, with a prefix indicating when a Downloader is used. -}
 setDownloader :: URLString -> Downloader -> String
 setDownloader u WebDownloader = u
 setDownloader u QuviDownloader = "quvi:" ++ u
+setDownloader u YoutubeDownloader = "yt:" ++ u
 setDownloader u OtherDownloader = ":" ++ u
 
 setDownloader' :: URLString -> Remote -> String
@@ -118,6 +118,9 @@ setDownloader' u r
 
 getDownloader :: URLString -> (URLString, Downloader)
 getDownloader u = case separate (== ':') u of
-	("quvi", u') -> (u', QuviDownloader)
+	("yt", u') -> (u', YoutubeDownloader)
+	-- quvi is not used any longer; youtube-dl should be able to handle
+	-- all urls it did.
+	("quvi", u') -> (u', YoutubeDownloader)
 	("", u') -> (u', OtherDownloader)
 	_ -> (u, WebDownloader)
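
To make the url-mangling scheme concrete, here is a small self-contained sketch that mirrors the mapping above; it is an illustration, not the real module, and re-derives the behavior with `break` rather than git-annex's `separate`:

```haskell
data Downloader = WebDownloader | YoutubeDownloader | QuviDownloader | OtherDownloader
	deriving (Eq, Show)

-- Mirrors the patched getDownloader: "yt:" and legacy "quvi:" prefixes
-- both map to YoutubeDownloader, a bare ":" means some other remote,
-- and an unprefixed url is a plain web download.
getDownloader :: String -> (String, Downloader)
getDownloader u = case break (== ':') u of
	("yt", ':':u') -> (u', YoutubeDownloader)
	("quvi", ':':u') -> (u', YoutubeDownloader)
	("", ':':u') -> (u', OtherDownloader)
	_ -> (u, WebDownloader)

main :: IO ()
main = mapM_ (print . getDownloader)
	[ "yt:http://example.com/video"
	, "quvi:http://example.com/video"
	, ":http://example.com/video"
	, "http://example.com/video"
	]
```

Running it prints `YoutubeDownloader` for both the new `yt:` prefix and legacy `quvi:` urls, which is how files added with quvi keep working after this change.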

@@ -18,11 +18,11 @@ import Data.Char
 -- Html fragments like "<p>this</p>" are not detected as being html,
 -- although some browsers may chose to render them as html.
 isHtml :: String -> Bool
-isHtml = evaluate . canonicalizeTags . parseTags . truncate
+isHtml = evaluate . canonicalizeTags . parseTags . shorten
   where
 	-- We only care about the beginning of the file,
 	-- so although tagsoup parses lazily anyway, truncate it.
-	truncate = take 16384
+	shorten = take 16384
 	evaluate (TagOpen "!DOCTYPE" ((t, _):_):_) = map toLower t == "html"
 	evaluate (TagOpen "html" _:_) = True
 	-- Allow some leading whitespace before the tag.
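
A short usage sketch of the renamed function, assuming the patched `Utility.HtmlDetect` module is importable; the expected results follow from the comments in the hunk above:

```haskell
import Utility.HtmlDetect (isHtml)

main :: IO ()
main = mapM_ (print . isHtml)
	[ "<!DOCTYPE html><html><body>hi</body></html>"  -- True
	, "  \n <html><head></head></html>"              -- True: leading whitespace allowed
	, "<p>this</p>"                                   -- False: fragments don't count
	, "plain text, not a web page"                    -- False
	]
```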

@@ -10,8 +10,8 @@ git annex addurl `[url ...]`
 
 Downloads each url to its own file, which is added to the annex.
 
-When `youtube-dl` is installed, and the url is to a web page,
-it's used to download any video that the web page embeds.
+When `youtube-dl` is installed, it's used to download videos
+embedded on web pages.
 
 Urls to torrent files (including magnet links) will cause the content of
 the torrent to be downloaded, using `aria2c`.
@@ -28,10 +28,20 @@ be used to get better filenames.
 
   Avoid immediately downloading the url. The url is still checked
   (via HEAD) to verify that it exists, and to get its size if possible.
 
+  When `youtube-dl` is installed, videos embedded on web pages
+  will be added. To avoid the extra work of checking for videos,
+  add the `--raw` option.
+
 * `--relaxed`
 
-  Avoid storing the size of the url's content, and accept whatever
-  content is there at a future point. (Implies `--fast`.)
+  Don't immediately download the url, and avoid storing the size of the
+  url's content. This makes git-annex accept whatever content is there
+  at a future point. This is also the fastest option, since it does not
+  hit the network at all.
+
+  Note that this does *not* check for embedded videos using `youtube-dl`,
+  although it used to in previous versions of git-annex.
+  Use --fast instead if you want to do that.
 
 * `--raw`

@@ -16,12 +16,12 @@ urls, see for example http://bugs.debian.org/874321)
 So, switching to youtube-dl would probably need a new switch, like `git
 annex addurl --rip` that enables using it.
 
-Currently `git annex importfeed` automatically tests for video urls with
-quvi; it would also need to support `--rip`.
+(Importfeed only treats links in the feed as video urls, not enclosures,
+so this problem does not affect it and it would not need a new switch.)
 
-Both of those changes would need changes to user's workflows and cron jobs.
-git-annex could keep supporting quvi for some time, and warn when it uses
-quvi, to help with the transition.
+That would need changes to users' workflows. git-annex could keep
+supporting quvi for some time, and warn when it uses quvi, to
+help with the transition.
 
 > Alternatively, git-annex addurl could download the url first, and then
 > check the file to see if it looks like html. If so, run youtube-dl (which
@@ -30,12 +30,22 @@ quvi, to help with the transition.
 > overhead, and the redundant download is fairly small compared to ripping
 > the media. Only the unusual case where addurl is being used on html that
 > does not contain media becomes more expensive.
+>
+> However, for --relaxed, running youtube-dl --get-filename would be
+> significantly more expensive since it hits the network. It seems that
+> --relaxed would need to change to not rip videos; users who want that
+> could use --fast.
+>
+> --fast already hits the network, but
+> if it uses youtube-dl --get-filename, it would fall afoul of
+> bugs like <http://bugs.debian.org/874321>, although those can be worked
+> around (/dev/null stderr in case youtube-dl crashes)
 
 Another gotcha is playlists. youtube-dl downloads playlists automatically.
 But, git-annex needs to record an url that downloads a single file so that
 `git annex get` works right. So, playlists will need to be disabled when
 git-annex runs youtube-dl. But, `--no-playlist` does not always disable
-playlists. Best option seems to be `--playlist-items 0` which works for
+playlists. Best option seems to be `--no-playlist --playlist-items 0` which works for
 non-playlists, and downloads only 1 item from playlists (hopefully a fairly
 stable item, but who knows..).
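
As a footnote to the `--get-filename` idea in the quoted notes above, probing for embedded media without downloading it could look something like the following sketch. The helper name is hypothetical, and stderr is captured and discarded so a crashing youtube-dl (as in the Debian bug mentioned) cannot pollute the result:

```haskell
import System.Exit (ExitCode(..))
import System.Process (proc, readCreateProcessWithExitCode)

-- Ask youtube-dl what it would name the media file for an url, without
-- downloading anything. This hits the network, so it would suit --fast
-- but not --relaxed, matching the conclusion above.
mediaFilename :: String -> IO (Maybe FilePath)
mediaFilename url = do
	(code, out, _err) <- readCreateProcessWithExitCode
		(proc "youtube-dl" ["--get-filename", "--no-playlist", url]) ""
	return $ case (code, lines out) of
		(ExitSuccess, f:_) -> Just f
		_ -> Nothing
```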