From 6b38d0c427a645d421ebf2d21111c0ca224dfef3 Mon Sep 17 00:00:00 2001
From: Joey Hess
Date: Mon, 5 Feb 2024 15:16:25 -0400
Subject: [PATCH] addurl, importfeed: Added --raw-except option

--raw-except=web allows using yt-dlp but not any other special remotes.

Currently this option can only be used once; trying to use it repeatedly
will make option parsing fail. Perhaps it ought to support being used
more than once, but needing that seemed like an unlikely use case.

Note that getParsed is called repeatedly when the option is used with
several urls. While implementing DeferredParseClass would avoid that
inefficiency, it didn't seem worth the added boilerplate since getParsed
only calls byNameWithUUID, which does minimal work.

Sponsored-by: Dartmouth College's DANDI project
---
 CHANGELOG                                  |  1 +
 Command/AddUrl.hs                          | 30 +++++++++++++++++--
 Command/ImportFeed.hs                      | 11 ++++---
 Remote.hs                                  | 13 ++++++--
 doc/git-annex-addurl.mdwn                  | 10 +++++--
 doc/git-annex-importfeed.mdwn              |  2 +-
 .../addurl_--raw-except_REMOTEs__63__.mdwn |  2 ++
 7 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 2a751cfd7a..6dd0e36837 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -7,6 +7,7 @@ git-annex (10.20240130) UNRELEASED; urgency=medium
     10.20230626.)
   * importfeed --force: Avoid creating duplicates of existing
     already downloaded files when yt-dlp or a special remote was used.
+  * addurl, importfeed: Added --raw-except option.
 
  -- Joey Hess  Mon, 29 Jan 2024 15:59:33 -0400
 
diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs
index d98c1f95c0..d67456a05d 100644
--- a/Command/AddUrl.hs
+++ b/Command/AddUrl.hs
@@ -64,6 +64,7 @@ data DownloadOptions = DownloadOptions
 	{ relaxedOption :: Bool
 	, rawOption :: Bool
 	, noRawOption :: Bool
+	, rawExceptOption :: Maybe (DeferredParse Remote)
 	, fileOption :: Maybe FilePath
 	, preserveFilenameOption :: Bool
 	, checkGitIgnoreOption :: CheckGitIgnore
@@ -105,6 +106,13 @@ parseDownloadOptions withfileoptions = DownloadOptions
 		( long "no-raw"
 		<> help "prevent downloading raw url content, must use special handling"
 		)
+	<*> optional
+		(mkParseRemoteOption <$> strOption
+			( long "raw-except" <> metavar paramRemote
+			<> help "disable special handling except by this remote"
+			<> completeRemotes
+			)
+		)
 	<*> (if withfileoptions
 		then optional (strOption
 			( long "file" <> metavar paramFile
@@ -123,7 +131,7 @@ seek :: AddUrlOptions -> CommandSeek
 seek o = startConcurrency commandStages $ do
 	addunlockedmatcher <- addUnlockedMatcher
 	let go (si, (o', u)) = do
-		r <- Remote.claimingUrl u
+		r <- checkClaimingUrl (downloadOptions o) u
 		if Remote.uuid r == webUUID || rawOption (downloadOptions o')
 			then void $ commandAction $
 				startWeb addunlockedmatcher o' si u
@@ -133,6 +141,13 @@ seek o = startConcurrency commandStages $ do
 			batchInput fmt (pure . parseBatchInput o) go
 		NoBatch -> forM_ (addUrls o) (\u -> go (SeekInput [u], (o, u)))
 
+checkClaimingUrl :: DownloadOptions -> URLString -> Annex Remote
+checkClaimingUrl o u = do
+	allowedremote <- case rawExceptOption o of
+		Nothing -> pure (const True)
+		Just f -> (==) <$> getParsed f
+	Remote.claimingUrl' allowedremote u
+
 parseBatchInput :: AddUrlOptions -> String -> Either String (AddUrlOptions, URLString)
 parseBatchInput o s
 	| batchFilesOption o =
@@ -284,7 +299,7 @@ performWeb addunlockedmatcher o url file urlinfo = lookupKey file >>= \case
   where
 	geturl = next $ isJust <$> addUrlFile addunlockedmatcher (downloadOptions o) url urlinfo file
 	addurl = addUrlChecked o url file webUUID $ \k ->
-		ifM (pure (not (rawOption (downloadOptions o))) <&&> youtubeDlSupported url)
+		ifM (useYoutubeDl (downloadOptions o) <&&> youtubeDlSupported url)
 			( return (Just (True, True, setDownloader url YoutubeDownloader))
 			, checkRaw Nothing (downloadOptions o) (pure Nothing) $
 				return (Just (Url.urlExists urlinfo, Url.urlSize urlinfo == fromKey keySize k, url))
@@ -332,7 +347,7 @@ downloadWeb addunlockedmatcher o url urlinfo file =
 	urlkey = addSizeUrlKey urlinfo $ Backend.URL.fromUrl url Nothing
 	downloader f p = Url.withUrlOptions $ downloadUrl False urlkey p Nothing [url] f
 	go Nothing = return Nothing
-	go (Just (tmp, backend)) = ifM (pure (not (rawOption o)) <&&> liftIO (isHtmlFile (fromRawFilePath tmp)))
+	go (Just (tmp, backend)) = ifM (useYoutubeDl o <&&> liftIO (isHtmlFile (fromRawFilePath tmp)))
 		( tryyoutubedl tmp backend
 		, normalfinish tmp backend
 		)
@@ -394,6 +409,15 @@ checkRaw failreason o f a
 		f
 	| otherwise = a
 
+useYoutubeDl :: DownloadOptions -> Annex Bool
+useYoutubeDl o
+	| rawOption o = pure False
+	| otherwise = case rawExceptOption o of
+		Nothing -> pure True
+		Just f -> do
+			remote <- getParsed f
+			pure (Remote.uuid remote == webUUID)
+
 {- The destination file is not known at start time unless the user provided
  - a filename. It's not displayed then for output consistency,
  - but is added to the json when available. -}
diff --git a/Command/ImportFeed.hs b/Command/ImportFeed.hs
index a8d04ef2da..f96bcd869c 100644
--- a/Command/ImportFeed.hs
+++ b/Command/ImportFeed.hs
@@ -38,7 +38,7 @@ import Logs.File
 import qualified Utility.Format
 import Utility.Tmp
 import Utility.Metered
-import Command.AddUrl (addUrlFile, downloadRemoteFile, parseDownloadOptions, DownloadOptions(..), checkCanAdd, addWorkTree, checkRaw)
+import Command.AddUrl (addUrlFile, downloadRemoteFile, parseDownloadOptions, DownloadOptions(..), checkClaimingUrl, checkCanAdd, addWorkTree, checkRaw, useYoutubeDl)
 import Annex.UUID
 import Backend.URL (fromUrl)
 import Annex.Content
@@ -317,9 +317,8 @@ startDownload addunlockedmatcher opts cache cv todownload = case location todown
 	startdownloadenclosure url = checkknown url $
 		startUrlDownload cv todownload url $
 			downloadEnclosure addunlockedmatcher opts cache cv todownload url
-	downloadmedia linkurl mediaurl mediakey
-		| rawOption (downloadOptions opts) = startdownloadlink
-		| otherwise = ifM (youtubeDlSupported linkurl)
+	downloadmedia linkurl mediaurl mediakey =
+		ifM (useYoutubeDl (downloadOptions opts) <&&> youtubeDlSupported linkurl)
 			( startUrlDownload cv todownload linkurl $
 				withTmpWorkDir mediakey $ \workdir -> do
 					dl <- youtubeDl linkurl (fromRawFilePath workdir) nullMeterUpdate
@@ -348,7 +347,7 @@ startDownload addunlockedmatcher opts cache cv todownload = case location todown
 	contdownloadlink = downloadEnclosure addunlockedmatcher opts cache cv todownload linkurl
 
 	addmediafast linkurl mediaurl mediakey =
-		ifM (pure (not (rawOption (downloadOptions opts)))
+		ifM (useYoutubeDl (downloadOptions opts)
 			<&&> (pure (youtubedlscraped todownload) <||> youtubeDlSupported linkurl))
 			( startUrlDownload cv todownload linkurl $ do
 				runDownload todownload linkurl ".m" cache cv $ \f ->
@@ -362,7 +361,7 @@ downloadEnclosure :: AddUnlockedMatcher -> ImportFeedOptions -> Cache -> TMVar B
 downloadEnclosure addunlockedmatcher opts cache cv todownload url =
 	runDownload todownload url (takeWhile (/= '?') $ takeExtension url) cache cv $ \f -> do
 		let f' = fromRawFilePath f
-		r <- Remote.claimingUrl url
+		r <- checkClaimingUrl (downloadOptions opts) url
 		if Remote.uuid r == webUUID || rawOption (downloadOptions opts)
 			then checkRaw (Just url) (downloadOptions opts) (pure Nothing) $ do
 				let dlopts = (downloadOptions opts)
diff --git a/Remote.hs b/Remote.hs
index 1638777277..dbbca32a5e 100644
--- a/Remote.hs
+++ b/Remote.hs
@@ -1,6 +1,6 @@
 {- git-annex remotes
  -
- - Copyright 2011-2020 Joey Hess
+ - Copyright 2011-2024 Joey Hess
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
@@ -59,6 +59,7 @@ module Remote (
 	logStatus,
 	checkAvailable,
 	claimingUrl,
+	claimingUrl',
 	isExportSupported,
 ) where
 
@@ -432,9 +433,15 @@ hasKeyCheap = checkPresentCheap
 
 {- The web special remote claims urls by default. -}
 claimingUrl :: URLString -> Annex Remote
-claimingUrl url = do
+claimingUrl = claimingUrl' (const True)
+
+{- The web special remote still claims urls if there is no
+ - other remote that does, even when the remotefilter does
+ - not include it. -}
+claimingUrl' :: (Remote -> Bool) -> URLString -> Annex Remote
+claimingUrl' remotefilter url = do
 	rs <- remoteList
 	let web = Prelude.head $ filter (\r -> uuid r == webUUID) rs
-	fromMaybe web <$> firstM checkclaim rs
+	fromMaybe web <$> firstM checkclaim (filter remotefilter rs)
   where
 	checkclaim = maybe (pure False) (`id` url) . claimUrl
diff --git a/doc/git-annex-addurl.mdwn b/doc/git-annex-addurl.mdwn
index dd2bfee1a9..370e5be986 100644
--- a/doc/git-annex-addurl.mdwn
+++ b/doc/git-annex-addurl.mdwn
@@ -45,8 +45,8 @@ be used to get better filenames.
 
 * `--raw`
 
-  Prevent special handling of urls by yt-dlp, bittorrent, and other
-  special remotes. This will for example, make addurl
+  Prevent special handling of urls by yt-dlp, and by bittorrent
+  and other special remotes. This will, for example, make addurl
   download the .torrent file and not the contents it points to.
 
 * `--no-raw`
@@ -55,6 +55,12 @@ be used to get better filenames.
   or a special remote, rather than the raw content of the url.
   if that cannot be done, the add will fail.
 
+* `--raw-except=remote`
+
+  Prevent special handling of urls by all special remotes except
+  for the specified one. To allow special handling only
+  by yt-dlp, use `--raw-except=web`.
+
 * `--file=name`
 
   Use with a filename that does not yet exist to add a new file
diff --git a/doc/git-annex-importfeed.mdwn b/doc/git-annex-importfeed.mdwn
index 402841eb5e..7379981a8d 100644
--- a/doc/git-annex-importfeed.mdwn
+++ b/doc/git-annex-importfeed.mdwn
@@ -37,7 +37,7 @@ resulting in the new url being downloaded to such a filename.
 
   Force downloading items it's seen before.
 
-* `--relaxed`, `--fast`, `--raw`
+* `--relaxed`, `--fast`, `--raw`, `--raw-except`
 
   These options behave the same as when using [[git-annex-addurl]](1).
 
diff --git a/doc/todo/addurl_--raw-except_REMOTEs__63__.mdwn b/doc/todo/addurl_--raw-except_REMOTEs__63__.mdwn
index 757beb5f92..a7f8cbc3d1 100644
--- a/doc/todo/addurl_--raw-except_REMOTEs__63__.mdwn
+++ b/doc/todo/addurl_--raw-except_REMOTEs__63__.mdwn
@@ -3,3 +3,5 @@ I think generally we would still prefer to use `--raw` as to avoid possible side
 
 [[!meta author=yoh]]
 [[!tag projects/dandi]]
+
+> [[done]] --[[Joey]]
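
For readers skimming the patch, here is a minimal standalone sketch of the idea behind `Remote.claimingUrl'` and the `--raw-except` predicate built in `checkClaimingUrl`: remotes that do not pass the filter are never asked to claim the url, and the web remote still claims it when no allowed remote does (so `git annex addurl --raw-except=web URL` leaves only yt-dlp's special handling enabled). This is not git-annex code; `Remote`, `claimingUrlFiltered`, and `rawExceptFilter` are hypothetical stand-ins.

```haskell
import Data.List (find)
import Data.Maybe (fromMaybe)

-- Hypothetical stand-in for git-annex's Remote type: a name plus a
-- predicate saying which urls the remote claims.
data Remote = Remote { remoteName :: String, claims :: String -> Bool }

-- Pick the remote that handles a url, consulting only remotes that pass
-- the filter, and falling back to the web remote (which claims urls by
-- default) when no allowed remote claims it.
claimingUrlFiltered :: Remote -> [Remote] -> (Remote -> Bool) -> String -> Remote
claimingUrlFiltered web remotes allowed url =
	fromMaybe web (find (\r -> allowed r && claims r url) remotes)

-- The predicate that --raw-except builds: without the option every remote
-- is allowed; with --raw-except=NAME only that remote is.
rawExceptFilter :: Maybe String -> Remote -> Bool
rawExceptFilter Nothing     _ = True
rawExceptFilter (Just name) r = remoteName r == name

main :: IO ()
main = do
	let web     = Remote "web"   (const True)
	    special = Remote "mydlp" (\u -> take 6 u == "dlp://")
	    remotes = [special, web]
	    url     = "dlp://example.com/video"
	-- Without --raw-except, the special remote claims the url.
	putStrLn (remoteName (claimingUrlFiltered web remotes (rawExceptFilter Nothing) url))
	-- With --raw-except=web, only the web remote may claim it.
	putStrLn (remoteName (claimingUrlFiltered web remotes (rawExceptFilter (Just "web")) url))
```

In the actual patch the filter is a UUID comparison against the remote resolved by `getParsed`, and `useYoutubeDl` applies the same rule to decide whether yt-dlp handling is still in play; the sketch only illustrates the fallback-to-web behaviour documented in `claimingUrl'`.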