fix regression in addurl --fast caused by youtube-dl support

Similar to c6e4bc0a22 but another code
path. As well as using youtube-dl unecessarily, it used the filename it
comes up with, which while nice for youtube videos, is not right for
other files.

This means more work is done for urls that youtube-dl does support,
but is probably more efficient for other urls, since it only downloads
the first chunk of content, while youtube-dl probably downloads more.

This commit was supported by the NSF-funded DataLad project.
This commit is contained in:
Joey Hess 2017-12-08 14:49:55 -04:00
parent 5b98da60b1
commit 8990afaef0
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 46 additions and 25 deletions

View file

@ -106,42 +106,49 @@ youtubeDlTo key url dest = do
return Nothing return Nothing
return (fromMaybe False res) return (fromMaybe False res)
youtubeDlSupported :: URLString -> Annex Bool
youtubeDlSupported url = either (const False) id <$> youtubeDlCheck url
-- Check if youtube-dl can find media in an url.
--
-- youtube-dl supports downloading urls that are not html pages, -- youtube-dl supports downloading urls that are not html pages,
-- but we don't want to use it for such urls, since they can be downloaded -- but we don't want to use it for such urls, since they can be downloaded
-- without it. So, this first downloads part of the content and checks -- without it. So, this first downloads part of the content and checks
-- if it's a html page; only then is youtube-dl used. -- if it's a html page; only then is youtube-dl used.
youtubeDlCheck :: URLString -> Annex (Either String Bool) htmlOnly :: URLString -> a -> Annex a -> Annex a
youtubeDlCheck url = catchMsgIO $ do htmlOnly url fallback a = do
uo <- getUrlOptions uo <- getUrlOptions
liftIO (downloadPartial url uo htmlPrefixLength) >>= \case liftIO (downloadPartial url uo htmlPrefixLength) >>= \case
Just bs | isHtmlBs bs -> do Just bs | isHtmlBs bs -> a
opts <- youtubeDlOpts [ Param url, Param "--simulate" ] _ -> return fallback
liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing
_ -> return False youtubeDlSupported :: URLString -> Annex Bool
youtubeDlSupported url = either (const False) id <$> youtubeDlCheck url
-- Check if youtube-dl can find media in an url.
youtubeDlCheck :: URLString -> Annex (Either String Bool)
youtubeDlCheck url = catchMsgIO $ htmlOnly url False $ do
opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
liftIO $ snd <$> processTranscript "youtube-dl" (toCommand opts) Nothing
-- Ask youtube-dl for the filename of media in an url. -- Ask youtube-dl for the filename of media in an url.
-- --
-- (This is not always identical to the filename it uses when downloading.) -- (This is not always identical to the filename it uses when downloading.)
youtubeDlFileName :: URLString -> Annex (Either String FilePath) youtubeDlFileName :: URLString -> Annex (Either String FilePath)
youtubeDlFileName url = flip catchIO (pure . Left . show) $ do youtubeDlFileName url = flip catchIO (pure . Left . show) $
-- Sometimes youtube-dl will fail with an ugly backtrace htmlOnly url nomedia $ do
-- (eg, http://bugs.debian.org/874321) -- Sometimes youtube-dl will fail with an ugly backtrace
-- so catch stderr as well as stdout to avoid the user seeing it. -- (eg, http://bugs.debian.org/874321)
-- --no-warnings avoids warning messages that are output to stdout. -- so catch stderr as well as stdout to avoid the user
opts <- youtubeDlOpts -- seeing it. --no-warnings avoids warning messages that
[ Param url -- are output to stdout.
, Param "--get-filename" opts <- youtubeDlOpts
, Param "--no-warnings" [ Param url
] , Param "--get-filename"
(output, ok) <- liftIO $ processTranscript "youtube-dl" (toCommand opts) Nothing , Param "--no-warnings"
return $ case (ok, lines output) of ]
(True, (f:_)) | not (null f) -> Right f (output, ok) <- liftIO $ processTranscript "youtube-dl"
_ -> Left "no media in url" (toCommand opts) Nothing
return $ case (ok, lines output) of
(True, (f:_)) | not (null f) -> Right f
_ -> nomedia
where
nomedia = Left "no media in url"
youtubeDlOpts :: [CommandParam] -> Annex [CommandParam] youtubeDlOpts :: [CommandParam] -> Annex [CommandParam]
youtubeDlOpts addopts = do youtubeDlOpts addopts = do

View file

@ -0,0 +1,6 @@
[[!comment format=mdwn
username="joey"
subject="""comment 2"""
date="2017-12-06T16:02:55Z"
content="""
"""]]

View file

@ -0,0 +1,8 @@
[[!comment format=mdwn
username="joey"
subject="""comment 4"""
date="2017-12-08T18:48:46Z"
content="""
That one happened with `git annex addurl --fast $url` so a different code
path. Had to add a html page check to youtubeDlFileName to fix it.
"""]]