
This is a per-remote version of the annex.web-options config. It required plumbing RemoteGitConfig through to getUrlOptions. In cases where a special remote does not use curl, there was no need to do that, and Nothing is used instead. In the case of the addurl and importfeed commands, it seemed best to say that running these commands is not using the web special remote per se, so the config is not used for those commands.
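As a rough sketch of that plumbing (the remote, gitconfig, and geturl names below are stand-ins; only withUrlOptions and the Nothing case appear in the file that follows), a caller acting for a special remote would pass that remote's RemoteGitConfig, so its per-remote web options (presumably remote.&lt;name&gt;.annex-web-options) are honored, while addurl and importfeed pass Nothing:

    -- A special remote passes its own RemoteGitConfig, so its
    -- per-remote web options take effect for this download.
    withUrlOptions (Just (gitconfig remote)) $ \uo -> geturl uo

    -- addurl and importfeed are not using the web special remote per se,
    -- so they pass Nothing and only the global annex.web-options applies.
    withUrlOptions Nothing $ \uo -> geturl uo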
{- yt-dlp (and deprecated youtube-dl) integration for git-annex
 -
 - Copyright 2017-2024 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}

module Annex.YoutubeDl (
	youtubeDl,
	youtubeDlTo,
	youtubeDlSupported,
	youtubeDlCheck,
	youtubeDlFileName,
	youtubeDlFileNameHtmlOnly,
	youtubeDlCommand,
	youtubePlaylist,
	YoutubePlaylistItem(..),
) where

import Annex.Common
import qualified Annex
import Annex.Content
import Annex.Url
import Utility.DiskFree
import Utility.HtmlDetect
import Utility.Process.Transcript
import Utility.Metered
import Utility.Tmp
import Messages.Progress
import Logs.Transfer
import qualified Utility.FileIO as F

import Network.URI
import Control.Concurrent.Async
import Text.Read
import Data.Either
import qualified Data.Aeson as Aeson
import GHC.Generics
import qualified Data.ByteString.Char8 as B8

-- youtube-dl can follow redirects to anywhere, including potentially
-- localhost or a private address. So, it's only allowed to download
-- content if the user has allowed access to all addresses.
youtubeDlAllowed :: Annex Bool
youtubeDlAllowed = ipAddressesUnlimited

youtubeDlNotAllowedMessage :: String
youtubeDlNotAllowedMessage = unwords
	[ "This url is supported by yt-dlp, but"
	, "yt-dlp could potentially access any address, and the"
	, "configuration of annex.security.allowed-ip-addresses"
	, "does not allow that. Not using yt-dlp (or youtube-dl)."
	]

-- Runs youtube-dl in a work directory, to download a single media file
-- from the url. Returns the path to the media file in the work directory.
--
-- Displays a progress meter as youtube-dl downloads.
--
-- If no file is downloaded, or the program is not installed,
-- returns Right Nothing.
--
-- youtube-dl can write to multiple files, either temporary files, or
-- multiple videos found at the url, and git-annex needs only one file.
-- So we need to find the destination file, and make sure there is not
-- more than one. With yt-dlp use --print-to-file to make it record the
-- file(s) it downloads. With youtube-dl, the best that can be done is
-- to require that the work directory end up with only 1 file in it.
-- (This can fail, but youtube-dl is deprecated, and they closed my
-- issue requesting something like --print-to-file;
-- <https://github.com/rg3/youtube-dl/issues/14864>)
youtubeDl :: URLString -> OsPath -> MeterUpdate -> Annex (Either String (Maybe OsPath))
youtubeDl url workdir p = ifM ipAddressesUnlimited
	( withUrlOptions Nothing $ youtubeDl' url workdir p
	, return $ Left youtubeDlNotAllowedMessage
	)

youtubeDl' :: URLString -> OsPath -> MeterUpdate -> UrlOptions -> Annex (Either String (Maybe OsPath))
youtubeDl' url workdir p uo
	| supportedScheme uo url = do
		cmd <- youtubeDlCommand
		ifM (liftIO $ inSearchPath cmd)
			( runcmd cmd >>= \case
				Right True -> downloadedfiles cmd >>= \case
					(f:[]) -> return $
						Right (Just (toOsPath f))
					[] -> return (nofiles cmd)
					fs -> return (toomanyfiles cmd fs)
				Right False -> workdirfiles >>= \case
					[] -> return (Right Nothing)
					_ -> return (Left $ cmd ++ " download is incomplete. Run the command again to resume.")
				Left msg -> return (Left msg)
			, return (Right Nothing)
			)
	| otherwise = return (Right Nothing)
  where
	nofiles cmd = Left $ cmd ++ " did not put any media in its work directory; perhaps it's been configured to store files somewhere else?"
	toomanyfiles cmd fs = Left $ cmd ++ " downloaded multiple media files; git-annex is only able to deal with one per url: " ++ show fs
	downloadedfiles cmd
		| isytdlp cmd = liftIO $
			(nub . lines <$> readFile (fromOsPath filelistfile))
				`catchIO` (pure . const [])
		| otherwise = map fromOsPath <$> workdirfiles
	workdirfiles = liftIO $ filter (/= filelistfile)
		<$> (filterM doesFileExist =<< dirContents workdir)
	filelistfile = workdir </> filelistfilebase
	filelistfilebase = literalOsPath "git-annex-file-list-file"
	isytdlp cmd = cmd == "yt-dlp"
	runcmd cmd = youtubeDlMaxSize workdir >>= \case
		Left msg -> return (Left msg)
		Right maxsize -> do
			opts <- youtubeDlOpts (dlopts cmd ++ maxsize)
			oh <- mkOutputHandlerQuiet
			-- The size is unknown to start. Once youtube-dl
			-- outputs some progress, the meter will be updated
			-- with the size, which is why it's important that
			-- the meter is passed into commandMeter'
			let unknownsize = Nothing :: Maybe FileSize
			ok <- metered (Just p) unknownsize Nothing $ \meter meterupdate ->
				liftIO $ commandMeter'
					(if isytdlp cmd then parseYtdlpProgress else parseYoutubeDlProgress)
					oh (Just meter) meterupdate cmd opts
					(\pr -> pr { cwd = Just (fromOsPath workdir) })
			return (Right ok)
	dlopts cmd =
		[ Param url
		-- To make it only download one file when given a
		-- page with a video and a playlist, download only the video.
		, Param "--no-playlist"
		-- And when given a page with only a playlist, download only
		-- the first video on the playlist. (Assumes the video is
		-- somewhat stable, but this is the only way to prevent
		-- it from downloading the whole playlist.)
		, Param "--playlist-items", Param "0"
		] ++
		if isytdlp cmd
			then
				-- Avoid warnings, which go to
				-- stderr and may mess up
				-- git-annex's display.
				[ Param "--no-warnings"
				, Param "--progress-template"
				, Param progressTemplate
				, Param "--print-to-file"
				, Param "after_move:filepath"
				, Param (fromOsPath filelistfilebase)
				]
			else []

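-- For a yt-dlp download, the options above combine into a command line
-- roughly like the following (url and size are illustrative; any
-- annex.youtube-dl-options configured by the user come first, and the
-- --max-filesize comes from youtubeDlMaxSize below):
--
--   yt-dlp 'https://example.com/watch?v=xyz' --no-playlist
--     --playlist-items 0 --no-warnings
--     --progress-template 'ANNEX ... ANNEX'
--     --print-to-file after_move:filepath git-annex-file-list-file
--     --max-filesize 7500000000
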
-- To honor annex.diskreserve, ask youtube-dl to not download too
-- large a media file. Factors in other downloads that are in progress,
-- and any files in the workdir that it may have partially downloaded
-- before.
youtubeDlMaxSize :: OsPath -> Annex (Either String [CommandParam])
youtubeDlMaxSize workdir = ifM (Annex.getRead Annex.force)
	( return $ Right []
	, liftIO (getDiskFree (fromOsPath workdir)) >>= \case
		Just have -> do
			inprogress <- sizeOfDownloadsInProgress (const True)
			partial <- liftIO $ sum
				<$> (mapM getFileSize =<< dirContents workdir)
			reserve <- annexDiskReserve <$> Annex.getGitConfig
			let maxsize = have - reserve - inprogress + partial
			if maxsize > 0
				then return $ Right
					[ Param "--max-filesize"
					, Param (show maxsize)
					]
				else return $ Left $
					needMoreDiskSpace $
						negate maxsize + 1024
		Nothing -> return $ Right []
	)

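-- For example (illustrative numbers): with 10000000000 bytes free,
-- annex.diskreserve of 1000000000, 2000000000 bytes of other downloads
-- in progress, and 500000000 bytes already partially downloaded into the
-- workdir, maxsize = 10000000000 - 1000000000 - 2000000000 + 500000000
-- = 7500000000, so youtube-dl is run with --max-filesize 7500000000.
-- The partial download has already consumed free space and will not be
-- downloaded again, which is why it is added back in.
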
-- Download a media file to a destination.
youtubeDlTo :: Key -> URLString -> OsPath -> MeterUpdate -> Annex Bool
youtubeDlTo key url dest p = do
	res <- withTmpWorkDir key $ \workdir ->
		youtubeDl url workdir p >>= \case
			Right (Just mediafile) -> do
				liftIO $ moveFile mediafile dest
				return (Just True)
			Right Nothing -> return (Just False)
			Left msg -> do
				warning (UnquotedString msg)
				return Nothing
	return (fromMaybe False res)

-- youtube-dl supports downloading urls that are not html pages,
-- but we don't want to use it for such urls, since they can be downloaded
-- without it. So, this first downloads part of the content and checks
-- if it's a html page; only then is youtube-dl used.
htmlOnly :: URLString -> a -> Annex a -> Annex a
htmlOnly url fallback a = withUrlOptions Nothing $ \uo ->
	liftIO (downloadPartial url uo htmlPrefixLength) >>= \case
		Just bs | isHtmlBs bs -> a
		_ -> return fallback

-- Check if youtube-dl supports downloading content from an url.
youtubeDlSupported :: URLString -> Annex Bool
youtubeDlSupported url = either (const False) id
	<$> withUrlOptions Nothing (youtubeDlCheck' url)

-- Check if youtube-dl can find media in an url.
--
-- While this does not download anything, it checks youtubeDlAllowed
-- for symmetry with youtubeDl; the check should not succeed if the
-- download won't succeed.
youtubeDlCheck :: URLString -> Annex (Either String Bool)
youtubeDlCheck url = ifM youtubeDlAllowed
	( withUrlOptions Nothing $ youtubeDlCheck' url
	, return $ Left youtubeDlNotAllowedMessage
	)

youtubeDlCheck' :: URLString -> UrlOptions -> Annex (Either String Bool)
youtubeDlCheck' url uo
	| supportedScheme uo url = catchMsgIO $ htmlOnly url False $ do
		opts <- youtubeDlOpts [ Param url, Param "--simulate" ]
		cmd <- youtubeDlCommand
		liftIO $ snd <$> processTranscript cmd (toCommand opts) Nothing
	| otherwise = return (Right False)

-- Ask youtube-dl for the filename of media in an url.
--
-- (This is not always identical to the filename it uses when downloading.)
youtubeDlFileName :: URLString -> Annex (Either String OsPath)
youtubeDlFileName url = withUrlOptions Nothing go
  where
	go uo
		| supportedScheme uo url = flip catchIO (pure . Left . show) $
			htmlOnly url nomedia (youtubeDlFileNameHtmlOnly' url uo)
		| otherwise = return nomedia
	nomedia = Left "no media in url"

-- Does not do the htmlOnly check; use when the url has already been
-- verified to be an html page.
youtubeDlFileNameHtmlOnly :: URLString -> Annex (Either String OsPath)
youtubeDlFileNameHtmlOnly = withUrlOptions Nothing . youtubeDlFileNameHtmlOnly'

youtubeDlFileNameHtmlOnly' :: URLString -> UrlOptions -> Annex (Either String OsPath)
youtubeDlFileNameHtmlOnly' url uo
	| supportedScheme uo url = flip catchIO (pure . Left . show) go
	| otherwise = return nomedia
  where
	go = do
		-- Sometimes youtube-dl will fail with an ugly backtrace
		-- (eg, http://bugs.debian.org/874321)
		-- so catch stderr as well as stdout to avoid the user
		-- seeing it. --no-warnings avoids warning messages that
		-- are output to stdout.
		opts <- youtubeDlOpts
			[ Param url
			, Param "--get-filename"
			, Param "--no-warnings"
			, Param "--no-playlist"
			]
		cmd <- youtubeDlCommand
		let p = (proc cmd (toCommand opts))
			{ std_out = CreatePipe
			, std_err = CreatePipe
			}
		liftIO $ withCreateProcess p waitproc

	waitproc Nothing (Just o) (Just e) pid = do
		errt <- async $ discardstderr pid e
		output <- hGetContentsStrict o
		ok <- liftIO $ checkSuccessProcess pid
		wait errt
		return $ case (ok, lines output) of
			(True, (f:_)) | not (null f) -> Right (toOsPath f)
			_ -> nomedia
	waitproc _ _ _ _ = error "internal"

	discardstderr pid e = hGetLineUntilExitOrEOF pid e >>= \case
		Nothing -> return ()
		Just _ -> discardstderr pid e

	nomedia = Left "no media in url"

youtubeDlOpts :: [CommandParam] -> Annex [CommandParam]
youtubeDlOpts addopts = do
	opts <- map Param . annexYoutubeDlOptions <$> Annex.getGitConfig
	return (opts ++ addopts)

youtubeDlCommand :: Annex String
youtubeDlCommand = annexYoutubeDlCommand <$> Annex.getGitConfig >>= \case
	Just c -> pure c
	Nothing -> ifM (liftIO $ inSearchPath "yt-dlp")
		( return "yt-dlp"
		, return "youtube-dl"
		)

supportedScheme :: UrlOptions -> URLString -> Bool
supportedScheme uo url = case parseURIRelaxed url of
	Nothing -> False
	Just u -> case uriScheme u of
		-- avoid ugly message from youtube-dl about not supporting file:
		"file:" -> False
		-- ftp indexes may look like html pages, and there's no point
		-- involving youtube-dl in a ftp download
		"ftp:" -> False
		_ -> allowedScheme uo u

progressTemplate :: String
progressTemplate = "ANNEX %(progress.downloaded_bytes)i %(progress.total_bytes_estimate)i %(progress.total_bytes)i ANNEX"

{- The progressTemplate makes output look like "ANNEX 10 100 NA ANNEX" or
 - "ANNEX 10 NA 100 ANNEX" depending on whether the total bytes are estimated
 - or known. That makes parsing much easier (and less fragile) than parsing
 - the usual progress output.
 -}
parseYtdlpProgress :: ProgressParser
parseYtdlpProgress = go [] . reverse . progresschunks
  where
	delim = '\r'

	progresschunks = splitc delim

	go remainder [] = (Nothing, Nothing, remainder)
	go remainder (x:xs) = case splitc ' ' x of
		("ANNEX":downloaded_bytes_s:total_bytes_estimate_s:total_bytes_s:"ANNEX":[]) ->
			case (readMaybe downloaded_bytes_s, readMaybe total_bytes_estimate_s, readMaybe total_bytes_s) of
				(Just downloaded_bytes, Nothing, Just total_bytes) ->
					( Just (BytesProcessed downloaded_bytes)
					, Just (TotalSize total_bytes)
					, remainder
					)
				(Just downloaded_bytes, Just total_bytes_estimate, _) ->
					( Just (BytesProcessed downloaded_bytes)
					, Just (TotalSize total_bytes_estimate)
					, remainder
					)
				_ -> go (remainder++x) xs
		_ -> go (remainder++x) xs

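-- For example, when the collected output ends with the chunk
-- "ANNEX 1024 NA 4096 ANNEX" (1024 bytes downloaded so far, no estimate,
-- known total of 4096 bytes), this yields
-- (Just (BytesProcessed 1024), Just (TotalSize 4096), "").
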
{- youtube-dl is deprecated; parsing its progress was attempted before but
 - was buggy and is no longer done. -}
parseYoutubeDlProgress :: ProgressParser
parseYoutubeDlProgress _ = (Nothing, Nothing, "")

{- List the items that yt-dlp can download from an url.
 -
 - Note that this does not check youtubeDlAllowed because it does not
 - download content.
 -}
youtubePlaylist :: URLString -> Annex (Either String [YoutubePlaylistItem])
youtubePlaylist url = do
	cmd <- youtubeDlCommand
	if cmd == "yt-dlp"
		then liftIO $ youtubePlaylist' url cmd
		else return $ Left $ "Scraping needs yt-dlp, but git-annex has been configured to use " ++ cmd

youtubePlaylist' :: URLString -> String -> IO (Either String [YoutubePlaylistItem])
youtubePlaylist' url cmd = withTmpFile (literalOsPath "yt-dlp") $ \tmpfile h -> do
	hClose h
	(outerr, ok) <- processTranscript cmd
		[ "--simulate"
		, "--flat-playlist"
		-- Skip live videos in progress
		, "--match-filter", "!is_live"
		, "--print-to-file"
		-- Write json with selected fields.
		, "%(.{" ++ intercalate "," youtubePlaylistItemFields ++ "})j"
		, fromOsPath tmpfile
		, url
		]
		Nothing
	if ok
		then flip catchIO (pure . Left . show) $ do
			v <- map Aeson.eitherDecodeStrict . B8.lines
				<$> F.readFile' tmpfile
			return $ case partitionEithers v of
				((parserr:_), _) ->
					Left $ "yt-dlp json parse error: " ++ parserr
				([], r) -> Right r
		else return $ Left $ if null outerr
			then "yt-dlp failed"
			else "yt-dlp failed: " ++ outerr

-- There are other fields that yt-dlp can extract, but these are similar to
-- the information from an RSS feed.
youtubePlaylistItemFields :: [String]
youtubePlaylistItemFields =
	[ "playlist_title"
	, "playlist_uploader"
	, "title"
	, "description"
	, "license"
	, "url"
	, "timestamp"
	]

-- Parse JSON generated by yt-dlp for a playlist. Note that any field
-- may be omitted when that information is not supported for a given website.
data YoutubePlaylistItem = YoutubePlaylistItem
	{ youtube_playlist_title :: Maybe String
	, youtube_playlist_uploader :: Maybe String
	, youtube_title :: Maybe String
	, youtube_description :: Maybe String
	, youtube_license :: Maybe String
	, youtube_url :: Maybe String
	, youtube_timestamp :: Maybe Integer -- ^ unix timestamp
	} deriving (Generic, Show)

instance Aeson.FromJSON YoutubePlaylistItem
  where
	parseJSON = Aeson.genericParseJSON Aeson.defaultOptions
		{ Aeson.fieldLabelModifier =
			drop (length ("youtube_" :: String))
		}
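-- For example, a json line such as (values illustrative)
--   {"playlist_title":"Some Channel","title":"Episode 1","url":"https://example.com/v1","timestamp":1700000000}
-- decodes to a YoutubePlaylistItem with those four fields set and the
-- remaining fields Nothing, since the fieldLabelModifier strips the
-- "youtube_" prefix to line the record fields up with yt-dlp's json keys.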