2017-11-29 19:49:05 +00:00
|
|
|
{- youtube-dl integration for git-annex
|
|
|
|
-
|
|
|
|
- Copyright 2017 Joey Hess <id@joeyh.name>
|
|
|
|
-
|
|
|
|
- Licensed under the GNU GPL version 3 or higher.
|
|
|
|
-}
|
|
|
|
|
2017-12-11 16:46:34 +00:00
|
|
|
module Annex.YoutubeDl (
|
|
|
|
youtubeDl,
|
|
|
|
youtubeDlTo,
|
|
|
|
youtubeDlSupported,
|
|
|
|
youtubeDlCheck,
|
|
|
|
youtubeDlFileName,
|
2017-12-31 18:55:51 +00:00
|
|
|
youtubeDlFileName',
|
2017-12-11 16:46:34 +00:00
|
|
|
) where
|
2017-11-29 19:49:05 +00:00
|
|
|
|
|
|
|
import Annex.Common
import qualified Annex
import Annex.Content
import Annex.Url
import Utility.Url (URLString)
import Utility.DiskFree
import Utility.HtmlDetect
import Utility.Process.Transcript
import Logs.Transfer

import Network.URI
import Control.Concurrent.Async
import Data.Char (toLower)
|
2017-12-11 16:46:34 +00:00
|
|
|
|
2017-11-29 19:49:05 +00:00
|
|
|
-- Runs youtube-dl in a work directory, to download a single media file
-- from the url. Returns the path to the media file in the work directory.
--
-- Possible results:
--
--   Right (Just f)  youtube-dl succeeded and exactly one file, f, is
--                   present in the work directory.
--   Right Nothing   the url's scheme is not supported, youtube-dl is not
--                   in PATH, or youtube-dl failed without leaving any
--                   files in the work directory.
--   Left msg        the size limit could not be determined, youtube-dl
--                   succeeded but left zero or several files, or it
--                   failed after leaving files behind (an incomplete
--                   download).
--
-- The work directory can contain files from a previous run of youtube-dl
-- and it will resume. It should not contain any other files though,
-- and youtube-dl needs to finish up with only one file in the directory
-- so we know which one it downloaded.
--
-- (Note that we can't use --output to specify the file to download to,
-- due to <https://github.com/rg3/youtube-dl/issues/14864>)
youtubeDl :: URLString -> FilePath -> Annex (Either String (Maybe FilePath))
youtubeDl url workdir
    | not (supportedScheme url) = notapplicable
    | otherwise = ifM (liftIO $ inPath "youtube-dl")
        ( download
        , notapplicable
        )
  where
    notapplicable = return (Right Nothing)

    -- The work directory is only inspected once youtube-dl has actually
    -- been run; a Left from runcmd is passed straight through.
    download = do
        result <- runcmd
        case result of
            Left msg -> return (Left msg)
            Right ok -> do
                fs <- workdirfiles
                return $ if ok then succeeded fs else failed fs

    -- youtube-dl exited successfully; exactly one file is expected.
    succeeded [f] = Right (Just f)
    succeeded [] = nofiles
    succeeded fs = toomanyfiles fs

    -- youtube-dl exited unsuccessfully; leftover files mean a download
    -- was started but not finished.
    failed [] = Right Nothing
    failed _ = Left "youtube-dl download is incomplete. Run the command again to resume."

    nofiles = Left "youtube-dl did not put any media in its work directory, perhaps it's been configured to store files somewhere else?"
    toomanyfiles fs = Left $ "youtube-dl downloaded multiple media files; git-annex is only able to deal with one per url: " ++ show fs

    workdirfiles = liftIO $ do
        contents <- dirContents workdir
        filterM doesFileExist contents

    runcmd = do
        limit <- youtubeDlMaxSize workdir
        case limit of
            Left msg -> return (Left msg)
            Right maxsize -> do
                quiet <- commandProgressDisabled
                let quietopts = [ Param "--quiet" | quiet ]
                opts <- youtubeDlOpts (dlopts ++ maxsize ++ quietopts)
                Right <$> liftIO (boolSystem' "youtube-dl" opts inworkdir)

    -- youtube-dl is run with the work directory as its cwd.
    inworkdir p = p { cwd = Just workdir }

    dlopts =
        [ Param url
        -- To make youtube-dl only download one file when given a
        -- page with a video and a playlist, download only the video.
        , Param "--no-playlist"
        -- And when given a page with only a playlist, download only
        -- the first video on the playlist. (Assumes the video is
        -- somewhat stable, but this is the only way to prevent
        -- youtube-dl from downloading the whole playlist.)
        -- NOTE(review): the index passed here is 0 while the comment
        -- says "first video"; confirm how youtube-dl numbers
        -- --playlist-items.
        , Param "--playlist-items", Param "0"
        ]
|
|
|
|
|
|
|
|
-- To honor annex.diskreserve, ask youtube-dl to not download too
-- large a media file. Factors in other downloads that are in progress,
-- and any files in the workdir that it may have partially downloaded
-- before.
--
-- Returns the extra youtube-dl parameters to use (none when --force is
-- in effect or the free disk space cannot be determined), or Left with
-- a message when there is no room left for the download.
youtubeDlMaxSize :: FilePath -> Annex (Either String [CommandParam])
youtubeDlMaxSize workdir = do
    forced <- Annex.getState Annex.force
    if forced
        then unlimited
        else maybe unlimited limitedby =<< liftIO (getDiskFree workdir)
  where
    unlimited = return (Right [])

    limitedby have = do
        inprogress <- sizeOfDownloadsInProgress (const True)
        -- Whatever is already in the workdir is counted back in, since
        -- it is added to the allowance computed below.
        partial <- liftIO $ do
            contents <- dirContents workdir
            sum <$> mapM getFileSize contents
        reserve <- annexDiskReserve <$> Annex.getGitConfig
        let maxsize = have - reserve - inprogress + partial
        return $ if maxsize > 0
            then Right
                [ Param "--max-filesize"
                , Param (show maxsize)
                ]
            else Left $ needMoreDiskSpace $ negate maxsize + 1024
|
2017-11-29 19:49:05 +00:00
|
|
|
|
|
|
|
-- Download a media file to a destination.
--
-- Runs youtubeDl in a temporary work directory for the key and, when it
-- yields a media file, renames that file to dest. Returns True only
-- when the file was put in place. A Left from youtubeDl is displayed
-- as a warning.
youtubeDlTo :: Key -> URLString -> FilePath -> Annex Bool
youtubeDlTo key url dest = fromMaybe False <$> withTmpWorkDir key download
  where
    download workdir = do
        result <- youtubeDl url workdir
        case result of
            Left msg -> do
                warning msg
                return Nothing
            Right Nothing -> return (Just False)
            Right (Just mediafile) -> do
                liftIO $ renameFile mediafile dest
                return (Just True)
|
2017-11-29 19:49:05 +00:00
|
|
|
|
2017-12-06 17:16:06 +00:00
|
|
|
-- youtube-dl supports downloading urls that are not html pages,
-- but we don't want to use it for such urls, since they can be downloaded
-- without it. So, this first downloads part of the content and checks
-- if it's a html page; only then is the provided action run. Otherwise
-- (including when the partial download yields nothing) the fallback
-- value is returned.
htmlOnly :: URLString -> a -> Annex a -> Annex a
htmlOnly url fallback a = withUrlOptions $ \uo -> do
    prefix <- liftIO $ downloadPartial url uo htmlPrefixLength
    if maybe False isHtmlBs prefix
        then a
        else return fallback
|
|
|
|
|
|
|
|
-- Whether youtube-dl can find media in an url. Any error reported by
-- youtubeDlCheck is treated as "not supported".
youtubeDlSupported :: URLString -> Annex Bool
youtubeDlSupported url = do
    checked <- youtubeDlCheck url
    return $ case checked of
        Right supported -> supported
        Left _ -> False
|
|
|
|
|
|
|
|
-- Check if youtube-dl can find media in an url.
--
-- Urls with an unsupported scheme, and urls that are not html pages,
-- yield Right False without running youtube-dl. Otherwise the result is
-- whether "youtube-dl --simulate" succeeded; its output is discarded.
youtubeDlCheck :: URLString -> Annex (Either String Bool)
youtubeDlCheck url
    | not (supportedScheme url) = return (Right False)
    | otherwise = catchMsgIO (htmlOnly url False simulate)
  where
    simulate = do
        opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
        (_transcript, ok) <- liftIO $
            processTranscript "youtube-dl" (toCommand opts) Nothing
        return ok
|
2017-11-30 18:35:25 +00:00
|
|
|
|
|
|
|
-- Ask youtube-dl for the filename of media in an url.
--
-- (This is not always identical to the filename it uses when downloading.)
--
-- Urls with an unsupported scheme, and urls that are not html pages,
-- yield Left "no media in url". An IO exception is converted to a Left
-- containing the shown exception.
youtubeDlFileName :: URLString -> Annex (Either String FilePath)
youtubeDlFileName url
    | supportedScheme url =
        htmlOnly url nomedia (youtubeDlFileName' url)
            `catchIO` (return . Left . show)
    | otherwise = return nomedia
  where
    nomedia = Left "no media in url"
|
|
|
|
|
|
|
|
-- Like youtubeDlFileName, but does not check if the url contains html;
-- use when that's already been verified.
--
-- The filename is the first line youtube-dl writes to stdout, and is only
-- used when youtube-dl exits successfully and that line is not empty.
youtubeDlFileName' :: URLString -> Annex (Either String FilePath)
youtubeDlFileName' url
    | supportedScheme url = go `catchIO` (return . Left . show)
    | otherwise = return nomedia
  where
    go = do
        -- Sometimes youtube-dl will fail with an ugly backtrace
        -- (eg, http://bugs.debian.org/874321)
        -- so catch stderr as well as stdout to avoid the user
        -- seeing it. --no-warnings avoids warning messages that
        -- are output to stdout.
        opts <- youtubeDlOpts
            [ Param url
            , Param "--get-filename"
            , Param "--no-warnings"
            ]
        let p = (proc "youtube-dl" (toCommand opts))
                { std_out = CreatePipe
                , std_err = CreatePipe
                }
        (Nothing, Just o, Just e, pid) <- liftIO $ createProcess p
        -- Both pipes are read at the same time; only what was written
        -- to stdout is kept.
        (output, _errors) <- liftIO $
            concurrently (hGetContentsStrict o) (hGetContentsStrict e)
        ok <- liftIO $ checkSuccessProcess pid
        return $ firstline ok (lines output)

    firstline True (f:_) | not (null f) = Right f
    firstline _ _ = nomedia

    nomedia = Left "no media in url"
|
2017-11-30 18:35:25 +00:00
|
|
|
|
|
|
|
-- Builds the parameters to pass to youtube-dl: the options from the git
-- config (annexYoutubeDlOptions) come first, followed by the provided
-- ones.
youtubeDlOpts :: [CommandParam] -> Annex [CommandParam]
youtubeDlOpts addopts = do
    cfg <- Annex.getGitConfig
    let configured = map Param (annexYoutubeDlOptions cfg)
    return (configured ++ addopts)
|
2017-12-11 16:46:34 +00:00
|
|
|
|
|
|
|
-- Whether youtube-dl should be tried at all for an url, judging only by
-- the url's scheme. Urls that cannot be parsed are not supported.
--
-- URI schemes are case-insensitive, so the scheme is lowercased before
-- it is compared; otherwise eg "FTP://..." would slip past the
-- exclusions below.
supportedScheme :: URLString -> Bool
supportedScheme url = case map toLower . uriScheme <$> parseURIRelaxed url of
    Nothing -> False
    -- avoid ugly message from youtube-dl about not supporting file:
    Just "file:" -> False
    -- ftp indexes may look like html pages, and there's no point
    -- involving youtube-dl in a ftp download
    Just "ftp:" -> False
    Just _ -> True
|