Improve resuming interrupted download when using yt-dlp

Sometimes yt-dlp fails to resume an interrupted download, and instead
downloads more files with different names. That results in the workdir
having multiple files at the end, which causes git-annex to give up because
it does not know which file was downloaded.

To fix this, use a yt-dlp feature (--print-to-file), which appends to a file
the name of each file once it has finished downloading. That way, the
presence of other cruft in the workdir will not confuse git-annex.
This commit is contained in:
Joey Hess 2023-06-19 14:23:14 -04:00
parent 90d410b382
commit a36a81dea3
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
2 changed files with 48 additions and 30 deletions

View file

@ -48,16 +48,18 @@ youtubeDlNotAllowedMessage = unwords
-- --
-- Displays a progress meter as youtube-dl downloads. -- Displays a progress meter as youtube-dl downloads.
-- --
-- If youtube-dl fails without writing any files to the work directory, -- If no file is downloaded, or the program is not installed,
-- or is not installed, returns Right Nothing. -- returns Right Nothing.
-- --
-- The work directory can contain files from a previous run of youtube-dl -- youtube-dl can write to multiple files, either temporary files, or
-- and it will resume. It should not contain any other files though, -- multiple videos found at the url, and git-annex needs only one file.
-- and youtube-dl needs to finish up with only one file in the directory -- So we need to find the destination file, and make sure there is not
-- so we know which one it downloaded. -- more than one. With yt-dlp use --print-to-file to make it record the
-- -- file(s) it downloads. With youtube-dl, the best that can be done is
-- (Note that we can't use --output to specify the file to download to, -- to require that the work directory end up with only 1 file in it.
-- due to <https://github.com/rg3/youtube-dl/issues/14864>) -- (This can fail, but youtube-dl is deprecated, and they closed my
-- issue requesting something like --print-to-file;
-- <https://github.com/rg3/youtube-dl/issues/14864>)
youtubeDl :: URLString -> FilePath -> MeterUpdate -> Annex (Either String (Maybe FilePath)) youtubeDl :: URLString -> FilePath -> MeterUpdate -> Annex (Either String (Maybe FilePath))
youtubeDl url workdir p = ifM ipAddressesUnlimited youtubeDl url workdir p = ifM ipAddressesUnlimited
( withUrlOptions $ youtubeDl' url workdir p ( withUrlOptions $ youtubeDl' url workdir p
@ -66,29 +68,38 @@ youtubeDl url workdir p = ifM ipAddressesUnlimited
youtubeDl' :: URLString -> FilePath -> MeterUpdate -> UrlOptions -> Annex (Either String (Maybe FilePath)) youtubeDl' :: URLString -> FilePath -> MeterUpdate -> UrlOptions -> Annex (Either String (Maybe FilePath))
youtubeDl' url workdir p uo youtubeDl' url workdir p uo
| supportedScheme uo url = ifM (liftIO . inSearchPath =<< youtubeDlCommand) | supportedScheme uo url = do
( runcmd >>= \case cmd <- youtubeDlCommand
Right True -> workdirfiles >>= \case ifM (liftIO $ inSearchPath cmd)
(f:[]) -> return (Right (Just f)) ( runcmd cmd >>= \case
[] -> return nofiles Right True -> downloadedfiles cmd >>= \case
fs -> return (toomanyfiles fs) (f:[]) -> return (Right (Just f))
Right False -> workdirfiles >>= \case [] -> return nofiles
[] -> return (Right Nothing) fs -> return (toomanyfiles fs)
_ -> return (Left "yt-dlp download is incomplete. Run the command again to resume.") Right False -> workdirfiles >>= \case
Left msg -> return (Left msg) [] -> return (Right Nothing)
, return (Right Nothing) _ -> return (Left "yt-dlp download is incomplete. Run the command again to resume.")
) Left msg -> return (Left msg)
, return (Right Nothing)
)
| otherwise = return (Right Nothing) | otherwise = return (Right Nothing)
where where
nofiles = Left "yt-dlp did not put any media in its work directory, perhaps it's been configured to store files somewhere else?" nofiles = Left "yt-dlp did not put any media in its work directory, perhaps it's been configured to store files somewhere else?"
toomanyfiles fs = Left $ "yt-dlp downloaded multiple media files; git-annex is only able to deal with one per url: " ++ show fs toomanyfiles fs = Left $ "yt-dlp downloaded multiple media files; git-annex is only able to deal with one per url: " ++ show fs
workdirfiles = liftIO $ filterM (doesFileExist) =<< dirContents workdir downloadedfiles cmd
runcmd = youtubeDlMaxSize workdir >>= \case | isytdlp cmd = liftIO $
(lines <$> readFile filelistfile)
`catchIO` (pure . const [])
| otherwise = workdirfiles
workdirfiles = liftIO $ filter (/= filelistfile)
<$> (filterM (doesFileExist) =<< dirContents workdir)
filelistfile = workdir </> filelistfilebase
filelistfilebase = "git-annex-file-list-file"
isytdlp cmd = "yt-dlp" `isInfixOf` cmd
runcmd cmd = youtubeDlMaxSize workdir >>= \case
Left msg -> return (Left msg) Left msg -> return (Left msg)
Right maxsize -> do Right maxsize -> do
cmd <- youtubeDlCommand opts <- youtubeDlOpts (dlopts cmd ++ maxsize)
let isytdlp = "yt-dlp" `isInfixOf` cmd
opts <- youtubeDlOpts (dlopts isytdlp ++ maxsize)
oh <- mkOutputHandlerQuiet oh <- mkOutputHandlerQuiet
-- The size is unknown to start. Once youtube-dl -- The size is unknown to start. Once youtube-dl
-- outputs some progress, the meter will be updated -- outputs some progress, the meter will be updated
@ -97,11 +108,11 @@ youtubeDl' url workdir p uo
let unknownsize = Nothing :: Maybe FileSize let unknownsize = Nothing :: Maybe FileSize
ok <- metered (Just p) unknownsize Nothing $ \meter meterupdate -> ok <- metered (Just p) unknownsize Nothing $ \meter meterupdate ->
liftIO $ commandMeter' liftIO $ commandMeter'
(if isytdlp then parseYtdlpProgress else parseYoutubeDlProgress) (if isytdlp cmd then parseYtdlpProgress else parseYoutubeDlProgress)
oh (Just meter) meterupdate cmd opts oh (Just meter) meterupdate cmd opts
(\pr -> pr { cwd = Just workdir }) (\pr -> pr { cwd = Just workdir })
return (Right ok) return (Right ok)
dlopts isytdlp = dlopts cmd =
[ Param url [ Param url
-- To make it only download one file when given a -- To make it only download one file when given a
-- page with a video and a playlist, download only the video. -- page with a video and a playlist, download only the video.
@ -112,8 +123,14 @@ youtubeDl' url workdir p uo
-- it from downloading the whole playlist.) -- it from downloading the whole playlist.)
, Param "--playlist-items", Param "0" , Param "--playlist-items", Param "0"
] ++ ] ++
if isytdlp if isytdlp cmd
then [Param "--progress-template", Param progressTemplate] then
[ Param "--progress-template"
, Param progressTemplate
, Param "--print-to-file"
, Param "after_move:filepath"
, Param filelistfilebase
]
else [] else []
-- To honor annex.diskreserve, ask youtube-dl to not download too -- To honor annex.diskreserve, ask youtube-dl to not download too

View file

@ -93,6 +93,7 @@ git-annex (10.20230408) UNRELEASED; urgency=medium
* Fix display when run with -J1. * Fix display when run with -J1.
* assistant: Fix a crash when a small file is deleted immediately after * assistant: Fix a crash when a small file is deleted immediately after
being created. being created.
* Improve resuming interrupted download when using yt-dlp.
-- Joey Hess <id@joeyh.name> Sat, 08 Apr 2023 13:57:18 -0400 -- Joey Hess <id@joeyh.name> Sat, 08 Apr 2023 13:57:18 -0400