Improve resuming interrupted download when using yt-dlp

Sometimes resuming an interrupted download will fail to resume and download
more files with different names. That resulted in the workdir having
multiple files at the end, which causes git-annex to give up because it
does not know what was downloaded.

To fix this, use a yt-dlp feature, which appends to a file the name of each
file after it's finished downloading it. So the presence of other cruft in
the workdir will not confuse git-annex.
This commit is contained in:
Joey Hess 2023-06-19 14:23:14 -04:00
parent 90d410b382
commit a36a81dea3
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
2 changed files with 48 additions and 30 deletions

View file

@ -48,16 +48,18 @@ youtubeDlNotAllowedMessage = unwords
--
-- Displays a progress meter as youtube-dl downloads.
--
-- If youtube-dl fails without writing any files to the work directory,
-- or is not installed, returns Right Nothing.
-- If no file is downloaded, or the program is not installed,
-- returns Right Nothing.
--
-- The work directory can contain files from a previous run of youtube-dl
-- and it will resume. It should not contain any other files though,
-- and youtube-dl needs to finish up with only one file in the directory
-- so we know which one it downloaded.
--
-- (Note that we can't use --output to specify the file to download to,
-- due to <https://github.com/rg3/youtube-dl/issues/14864>)
-- youtube-dl can write to multiple files, either temporary files, or
-- multiple videos found at the url, and git-annex needs only one file.
-- So we need to find the destination file, and make sure there is not
-- more than one. With yt-dlp use --print-to-file to make it record the
-- file(s) it downloads. With youtube-dl, the best that can be done is
-- to require that the work directory end up with only 1 file in it.
-- (This can fail, but youtube-dl is deprecated, and they closed my
-- issue requesting something like --print-to-file;
-- <https://github.com/rg3/youtube-dl/issues/14864>)
youtubeDl :: URLString -> FilePath -> MeterUpdate -> Annex (Either String (Maybe FilePath))
youtubeDl url workdir p = ifM ipAddressesUnlimited
( withUrlOptions $ youtubeDl' url workdir p
@ -66,29 +68,38 @@ youtubeDl url workdir p = ifM ipAddressesUnlimited
youtubeDl' :: URLString -> FilePath -> MeterUpdate -> UrlOptions -> Annex (Either String (Maybe FilePath))
youtubeDl' url workdir p uo
| supportedScheme uo url = ifM (liftIO . inSearchPath =<< youtubeDlCommand)
( runcmd >>= \case
Right True -> workdirfiles >>= \case
(f:[]) -> return (Right (Just f))
[] -> return nofiles
fs -> return (toomanyfiles fs)
Right False -> workdirfiles >>= \case
[] -> return (Right Nothing)
_ -> return (Left "yt-dlp download is incomplete. Run the command again to resume.")
Left msg -> return (Left msg)
, return (Right Nothing)
)
| supportedScheme uo url = do
cmd <- youtubeDlCommand
ifM (liftIO $ inSearchPath cmd)
( runcmd cmd >>= \case
Right True -> downloadedfiles cmd >>= \case
(f:[]) -> return (Right (Just f))
[] -> return nofiles
fs -> return (toomanyfiles fs)
Right False -> workdirfiles >>= \case
[] -> return (Right Nothing)
_ -> return (Left "yt-dlp download is incomplete. Run the command again to resume.")
Left msg -> return (Left msg)
, return (Right Nothing)
)
| otherwise = return (Right Nothing)
where
nofiles = Left "yt-dlp did not put any media in its work directory, perhaps it's been configured to store files somewhere else?"
toomanyfiles fs = Left $ "yt-dlp downloaded multiple media files; git-annex is only able to deal with one per url: " ++ show fs
workdirfiles = liftIO $ filterM (doesFileExist) =<< dirContents workdir
runcmd = youtubeDlMaxSize workdir >>= \case
downloadedfiles cmd
| isytdlp cmd = liftIO $
(lines <$> readFile filelistfile)
`catchIO` (pure . const [])
| otherwise = workdirfiles
workdirfiles = liftIO $ filter (/= filelistfile)
<$> (filterM (doesFileExist) =<< dirContents workdir)
filelistfile = workdir </> filelistfilebase
filelistfilebase = "git-annex-file-list-file"
isytdlp cmd = "yt-dlp" `isInfixOf` cmd
runcmd cmd = youtubeDlMaxSize workdir >>= \case
Left msg -> return (Left msg)
Right maxsize -> do
cmd <- youtubeDlCommand
let isytdlp = "yt-dlp" `isInfixOf` cmd
opts <- youtubeDlOpts (dlopts isytdlp ++ maxsize)
opts <- youtubeDlOpts (dlopts cmd ++ maxsize)
oh <- mkOutputHandlerQuiet
-- The size is unknown to start. Once youtube-dl
-- outputs some progress, the meter will be updated
@ -97,11 +108,11 @@ youtubeDl' url workdir p uo
let unknownsize = Nothing :: Maybe FileSize
ok <- metered (Just p) unknownsize Nothing $ \meter meterupdate ->
liftIO $ commandMeter'
(if isytdlp then parseYtdlpProgress else parseYoutubeDlProgress)
(if isytdlp cmd then parseYtdlpProgress else parseYoutubeDlProgress)
oh (Just meter) meterupdate cmd opts
(\pr -> pr { cwd = Just workdir })
return (Right ok)
dlopts isytdlp =
dlopts cmd =
[ Param url
-- To make it only download one file when given a
-- page with a video and a playlist, download only the video.
@ -112,8 +123,14 @@ youtubeDl' url workdir p uo
-- it from downloading the whole playlist.)
, Param "--playlist-items", Param "0"
] ++
if isytdlp
then [Param "--progress-template", Param progressTemplate]
if isytdlp cmd
then
[ Param "--progress-template"
, Param progressTemplate
, Param "--print-to-file"
, Param "after_move:filepath"
, Param filelistfilebase
]
else []
-- To honor annex.diskreserve, ask youtube-dl to not download too

View file

@ -93,6 +93,7 @@ git-annex (10.20230408) UNRELEASED; urgency=medium
* Fix display when run with -J1.
* assistant: Fix a crash when a small file is deleted immediately after
being created.
* Improve resuming interrupted download when using yt-dlp.
-- Joey Hess <id@joeyh.name> Sat, 08 Apr 2023 13:57:18 -0400