git-annex/Annex/YoutubeDl.hs
Joey Hess e82823d448
nub list of files
yt-dlp when resumed was observed having written the same filename twice
into the file list. Perhaps once by the first download and once by the
resumed one?
2023-07-09 14:18:25 -04:00

326 lines
12 KiB
Haskell

{- yt-dlp (and deprecated youtube-dl) integration for git-annex
-
- Copyright 2017-2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Annex.YoutubeDl (
youtubeDl,
youtubeDlTo,
youtubeDlSupported,
youtubeDlCheck,
youtubeDlFileName,
youtubeDlFileNameHtmlOnly,
youtubeDlCommand,
) where
import Annex.Common
import qualified Annex
import Annex.Content
import Annex.Url
import Utility.DiskFree
import Utility.HtmlDetect
import Utility.Process.Transcript
import Utility.Metered
import Messages.Progress
import Logs.Transfer
import Network.URI
import Control.Concurrent.Async
import Text.Read
-- youtube-dl can follow redirects to anywhere, including potentially
-- localhost or a private address. So, it's only allowed to download
-- content if the user has allowed access to all addresses.
youtubeDlAllowed :: Annex Bool
youtubeDlAllowed = ipAddressesUnlimited
youtubeDlNotAllowedMessage :: String
youtubeDlNotAllowedMessage = unwords
[ "This url is supported by yt-dlp, but"
, "yt-dlp could potentially access any address, and the"
, "configuration of annex.security.allowed-ip-addresses"
, "does not allow that. Not using yt-dlp (or youtube-dl)."
]
-- Runs youtube-dl in a work directory, to download a single media file
-- from the url. Returns the path to the media file in the work directory.
--
-- Displays a progress meter as youtube-dl downloads.
--
-- If no file is downloaded, or the program is not installed,
-- returns Right Nothing.
--
-- youtube-dl can write to multiple files, either temporary files, or
-- multiple videos found at the url, and git-annex needs only one file.
-- So we need to find the destination file, and make sure there is not
-- more than one. With yt-dlp use --print-to-file to make it record the
-- file(s) it downloads. With youtube-dl, the best that can be done is
-- to require that the work directory end up with only 1 file in it.
-- (This can fail, but youtube-dl is deprecated, and they closed my
-- issue requesting something like --print-to-file;
-- <https://github.com/rg3/youtube-dl/issues/14864>)
youtubeDl :: URLString -> FilePath -> MeterUpdate -> Annex (Either String (Maybe FilePath))
youtubeDl url workdir p = ifM ipAddressesUnlimited
( withUrlOptions $ youtubeDl' url workdir p
, return $ Left youtubeDlNotAllowedMessage
)
youtubeDl' :: URLString -> FilePath -> MeterUpdate -> UrlOptions -> Annex (Either String (Maybe FilePath))
youtubeDl' url workdir p uo
| supportedScheme uo url = do
cmd <- youtubeDlCommand
ifM (liftIO $ inSearchPath cmd)
( runcmd cmd >>= \case
Right True -> downloadedfiles cmd >>= \case
(f:[]) -> return (Right (Just f))
[] -> return (nofiles cmd)
fs -> return (toomanyfiles cmd fs)
Right False -> workdirfiles >>= \case
[] -> return (Right Nothing)
_ -> return (Left $ cmd ++ " download is incomplete. Run the command again to resume.")
Left msg -> return (Left msg)
, return (Right Nothing)
)
| otherwise = return (Right Nothing)
where
nofiles cmd = Left $ cmd ++ " did not put any media in its work directory, perhaps it's been configured to store files somewhere else?"
toomanyfiles cmd fs = Left $ cmd ++ " downloaded multiple media files; git-annex is only able to deal with one per url: " ++ show fs
downloadedfiles cmd
| isytdlp cmd = liftIO $
(nub . lines <$> readFile filelistfile)
`catchIO` (pure . const [])
| otherwise = workdirfiles
workdirfiles = liftIO $ filter (/= filelistfile)
<$> (filterM (doesFileExist) =<< dirContents workdir)
filelistfile = workdir </> filelistfilebase
filelistfilebase = "git-annex-file-list-file"
isytdlp cmd = cmd == "yt-dlp"
runcmd cmd = youtubeDlMaxSize workdir >>= \case
Left msg -> return (Left msg)
Right maxsize -> do
opts <- youtubeDlOpts (dlopts cmd ++ maxsize)
oh <- mkOutputHandlerQuiet
-- The size is unknown to start. Once youtube-dl
-- outputs some progress, the meter will be updated
-- with the size, which is why it's important the
-- meter is passed into commandMeter'
let unknownsize = Nothing :: Maybe FileSize
ok <- metered (Just p) unknownsize Nothing $ \meter meterupdate ->
liftIO $ commandMeter'
(if isytdlp cmd then parseYtdlpProgress else parseYoutubeDlProgress)
oh (Just meter) meterupdate cmd opts
(\pr -> pr { cwd = Just workdir })
return (Right ok)
dlopts cmd =
[ Param url
-- To make it only download one file when given a
-- page with a video and a playlist, download only the video.
, Param "--no-playlist"
-- And when given a page with only a playlist, download only
-- the first video on the playlist. (Assumes the video is
-- somewhat stable, but this is the only way to prevent
-- it from downloading the whole playlist.)
, Param "--playlist-items", Param "0"
] ++
if isytdlp cmd
then
[ Param "--progress-template"
, Param progressTemplate
, Param "--print-to-file"
, Param "after_move:filepath"
, Param filelistfilebase
]
else []
-- To honor annex.diskreserve, ask youtube-dl to not download too
-- large a media file. Factors in other downloads that are in progress,
-- and any files in the workdir that it may have partially downloaded
-- before.
youtubeDlMaxSize :: FilePath -> Annex (Either String [CommandParam])
youtubeDlMaxSize workdir = ifM (Annex.getRead Annex.force)
( return $ Right []
, liftIO (getDiskFree workdir) >>= \case
Just have -> do
inprogress <- sizeOfDownloadsInProgress (const True)
partial <- liftIO $ sum
<$> (mapM (getFileSize . toRawFilePath) =<< dirContents workdir)
reserve <- annexDiskReserve <$> Annex.getGitConfig
let maxsize = have - reserve - inprogress + partial
if maxsize > 0
then return $ Right
[ Param "--max-filesize"
, Param (show maxsize)
]
else return $ Left $
needMoreDiskSpace $
negate maxsize + 1024
Nothing -> return $ Right []
)
-- Download a media file to a destination,
youtubeDlTo :: Key -> URLString -> FilePath -> MeterUpdate -> Annex Bool
youtubeDlTo key url dest p = do
res <- withTmpWorkDir key $ \workdir ->
youtubeDl url (fromRawFilePath workdir) p >>= \case
Right (Just mediafile) -> do
liftIO $ moveFile (toRawFilePath mediafile) (toRawFilePath dest)
return (Just True)
Right Nothing -> return (Just False)
Left msg -> do
warning (UnquotedString msg)
return Nothing
return (fromMaybe False res)
-- youtube-dl supports downloading urls that are not html pages,
-- but we don't want to use it for such urls, since they can be downloaded
-- without it. So, this first downloads part of the content and checks
-- if it's a html page; only then is youtube-dl used.
htmlOnly :: URLString -> a -> Annex a -> Annex a
htmlOnly url fallback a = withUrlOptions $ \uo ->
liftIO (downloadPartial url uo htmlPrefixLength) >>= \case
Just bs | isHtmlBs bs -> a
_ -> return fallback
-- Check if youtube-dl supports downloading content from an url.
youtubeDlSupported :: URLString -> Annex Bool
youtubeDlSupported url = either (const False) id
<$> withUrlOptions (youtubeDlCheck' url)
-- Check if youtube-dl can find media in an url.
--
-- While this does not download anything, it checks youtubeDlAllowed
-- for symmetry with youtubeDl; the check should not succeed if the
-- download won't succeed.
youtubeDlCheck :: URLString -> Annex (Either String Bool)
youtubeDlCheck url = ifM youtubeDlAllowed
( withUrlOptions $ youtubeDlCheck' url
, return $ Left youtubeDlNotAllowedMessage
)
youtubeDlCheck' :: URLString -> UrlOptions -> Annex (Either String Bool)
youtubeDlCheck' url uo
| supportedScheme uo url = catchMsgIO $ htmlOnly url False $ do
opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
cmd <- youtubeDlCommand
liftIO $ snd <$> processTranscript cmd (toCommand opts) Nothing
| otherwise = return (Right False)
-- Ask youtube-dl for the filename of media in an url.
--
-- (This is not always identical to the filename it uses when downloading.)
youtubeDlFileName :: URLString -> Annex (Either String FilePath)
youtubeDlFileName url = withUrlOptions go
where
go uo
| supportedScheme uo url = flip catchIO (pure . Left . show) $
htmlOnly url nomedia (youtubeDlFileNameHtmlOnly' url uo)
| otherwise = return nomedia
nomedia = Left "no media in url"
-- Does not check if the url contains htmlOnly; use when that's already
-- been verified.
youtubeDlFileNameHtmlOnly :: URLString -> Annex (Either String FilePath)
youtubeDlFileNameHtmlOnly = withUrlOptions . youtubeDlFileNameHtmlOnly'
youtubeDlFileNameHtmlOnly' :: URLString -> UrlOptions -> Annex (Either String FilePath)
youtubeDlFileNameHtmlOnly' url uo
| supportedScheme uo url = flip catchIO (pure . Left . show) go
| otherwise = return nomedia
where
go = do
-- Sometimes youtube-dl will fail with an ugly backtrace
-- (eg, http://bugs.debian.org/874321)
-- so catch stderr as well as stdout to avoid the user
-- seeing it. --no-warnings avoids warning messages that
-- are output to stdout.
opts <- youtubeDlOpts
[ Param url
, Param "--get-filename"
, Param "--no-warnings"
, Param "--no-playlist"
]
cmd <- youtubeDlCommand
let p = (proc cmd (toCommand opts))
{ std_out = CreatePipe
, std_err = CreatePipe
}
liftIO $ withCreateProcess p waitproc
waitproc Nothing (Just o) (Just e) pid = do
errt <- async $ discardstderr pid e
output <- hGetContentsStrict o
ok <- liftIO $ checkSuccessProcess pid
wait errt
return $ case (ok, lines output) of
(True, (f:_)) | not (null f) -> Right f
_ -> nomedia
waitproc _ _ _ _ = error "internal"
discardstderr pid e = hGetLineUntilExitOrEOF pid e >>= \case
Nothing -> return ()
Just _ -> discardstderr pid e
nomedia = Left "no media in url"
youtubeDlOpts :: [CommandParam] -> Annex [CommandParam]
youtubeDlOpts addopts = do
opts <- map Param . annexYoutubeDlOptions <$> Annex.getGitConfig
return (opts ++ addopts)
youtubeDlCommand :: Annex String
youtubeDlCommand = annexYoutubeDlCommand <$> Annex.getGitConfig >>= \case
Just c -> pure c
Nothing -> ifM (liftIO $ inSearchPath "yt-dlp")
( return "yt-dlp"
, return "youtube-dl"
)
supportedScheme :: UrlOptions -> URLString -> Bool
supportedScheme uo url = case parseURIRelaxed url of
Nothing -> False
Just u -> case uriScheme u of
-- avoid ugly message from youtube-dl about not supporting file:
"file:" -> False
-- ftp indexes may look like html pages, and there's no point
-- involving youtube-dl in a ftp download
"ftp:" -> False
_ -> allowedScheme uo u
progressTemplate :: String
progressTemplate = "ANNEX %(progress.downloaded_bytes)i %(progress.total_bytes_estimate)i %(progress.total_bytes)i ANNEX"
{- The progressTemplate makes output look like "ANNEX 10 100 NA ANNEX" or
- "ANNEX 10 NA 100 ANNEX" depending on whether the total bytes are estimated
- or known. That makes parsing much easier (and less fragile) than parsing
- the usual progress output.
-}
parseYtdlpProgress :: ProgressParser
parseYtdlpProgress = go [] . reverse . progresschunks
where
delim = '\r'
progresschunks = splitc delim
go remainder [] = (Nothing, Nothing, remainder)
go remainder (x:xs) = case splitc ' ' x of
("ANNEX":downloaded_bytes_s:total_bytes_estimate_s:total_bytes_s:"ANNEX":[]) ->
case (readMaybe downloaded_bytes_s, readMaybe total_bytes_estimate_s, readMaybe total_bytes_s) of
(Just downloaded_bytes, Nothing, Just total_bytes) ->
( Just (BytesProcessed downloaded_bytes)
, Just (TotalSize total_bytes)
, remainder
)
(Just downloaded_bytes, Just total_bytes_estimate, _) ->
( Just (BytesProcessed downloaded_bytes)
, Just (TotalSize total_bytes_estimate)
, remainder
)
_ -> go (remainder++x) xs
_ -> go (remainder++x) xs
{- youtube-dl is deprecated, parsing its progress was attempted before but
- was buggy and is no longer done. -}
parseYoutubeDlProgress :: ProgressParser
parseYoutubeDlProgress _ = (Nothing, Nothing, "")