addurl, importfeed: Added --raw-except option

--raw-except=web allows using yt-dlp but not any other special remotes.

Currently this option can only be used once, trying to use it repeatedly
will make option parsing fail. Perhaps it ought to support being used more
than once, but it seemed like an unlikely use case to need that.

Note that getParsed is called repeatedly when the option is used with
several urls. While implementing DeferredParseClass would avoid that
innefficiency, it didn't seem worth the added boilerplate since
getParsed only calls byNameWithUUID which does minimal work.

Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
Joey Hess 2024-02-05 15:16:25 -04:00
parent c0c85a7de4
commit 6b38d0c427
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
7 changed files with 54 additions and 15 deletions

View file

@ -7,6 +7,7 @@ git-annex (10.20240130) UNRELEASED; urgency=medium
10.20230626.) 10.20230626.)
* importfeed --force: Avoid creating duplicates of existing * importfeed --force: Avoid creating duplicates of existing
already downloaded files when yt-dlp or a special remote was used. already downloaded files when yt-dlp or a special remote was used.
* addurl, importfeed: Added --raw-except option.
-- Joey Hess <id@joeyh.name> Mon, 29 Jan 2024 15:59:33 -0400 -- Joey Hess <id@joeyh.name> Mon, 29 Jan 2024 15:59:33 -0400

View file

@ -64,6 +64,7 @@ data DownloadOptions = DownloadOptions
{ relaxedOption :: Bool { relaxedOption :: Bool
, rawOption :: Bool , rawOption :: Bool
, noRawOption :: Bool , noRawOption :: Bool
, rawExceptOption :: Maybe (DeferredParse Remote)
, fileOption :: Maybe FilePath , fileOption :: Maybe FilePath
, preserveFilenameOption :: Bool , preserveFilenameOption :: Bool
, checkGitIgnoreOption :: CheckGitIgnore , checkGitIgnoreOption :: CheckGitIgnore
@ -105,6 +106,13 @@ parseDownloadOptions withfileoptions = DownloadOptions
( long "no-raw" ( long "no-raw"
<> help "prevent downloading raw url content, must use special handling" <> help "prevent downloading raw url content, must use special handling"
) )
<*> optional
(mkParseRemoteOption <$> strOption
( long "raw-except" <> metavar paramRemote
<> help "disable special handling except by this remote"
<> completeRemotes
)
)
<*> (if withfileoptions <*> (if withfileoptions
then optional (strOption then optional (strOption
( long "file" <> metavar paramFile ( long "file" <> metavar paramFile
@ -123,7 +131,7 @@ seek :: AddUrlOptions -> CommandSeek
seek o = startConcurrency commandStages $ do seek o = startConcurrency commandStages $ do
addunlockedmatcher <- addUnlockedMatcher addunlockedmatcher <- addUnlockedMatcher
let go (si, (o', u)) = do let go (si, (o', u)) = do
r <- Remote.claimingUrl u r <- checkClaimingUrl (downloadOptions o) u
if Remote.uuid r == webUUID || rawOption (downloadOptions o') if Remote.uuid r == webUUID || rawOption (downloadOptions o')
then void $ commandAction $ then void $ commandAction $
startWeb addunlockedmatcher o' si u startWeb addunlockedmatcher o' si u
@ -133,6 +141,13 @@ seek o = startConcurrency commandStages $ do
batchInput fmt (pure . parseBatchInput o) go batchInput fmt (pure . parseBatchInput o) go
NoBatch -> forM_ (addUrls o) (\u -> go (SeekInput [u], (o, u))) NoBatch -> forM_ (addUrls o) (\u -> go (SeekInput [u], (o, u)))
checkClaimingUrl :: DownloadOptions -> URLString -> Annex Remote
checkClaimingUrl o u = do
allowedremote <- case rawExceptOption o of
Nothing -> pure (const True)
Just f -> (==) <$> getParsed f
Remote.claimingUrl' allowedremote u
parseBatchInput :: AddUrlOptions -> String -> Either String (AddUrlOptions, URLString) parseBatchInput :: AddUrlOptions -> String -> Either String (AddUrlOptions, URLString)
parseBatchInput o s parseBatchInput o s
| batchFilesOption o = | batchFilesOption o =
@ -284,7 +299,7 @@ performWeb addunlockedmatcher o url file urlinfo = lookupKey file >>= \case
where where
geturl = next $ isJust <$> addUrlFile addunlockedmatcher (downloadOptions o) url urlinfo file geturl = next $ isJust <$> addUrlFile addunlockedmatcher (downloadOptions o) url urlinfo file
addurl = addUrlChecked o url file webUUID $ \k -> addurl = addUrlChecked o url file webUUID $ \k ->
ifM (pure (not (rawOption (downloadOptions o))) <&&> youtubeDlSupported url) ifM (useYoutubeDl (downloadOptions o) <&&> youtubeDlSupported url)
( return (Just (True, True, setDownloader url YoutubeDownloader)) ( return (Just (True, True, setDownloader url YoutubeDownloader))
, checkRaw Nothing (downloadOptions o) (pure Nothing) $ , checkRaw Nothing (downloadOptions o) (pure Nothing) $
return (Just (Url.urlExists urlinfo, Url.urlSize urlinfo == fromKey keySize k, url)) return (Just (Url.urlExists urlinfo, Url.urlSize urlinfo == fromKey keySize k, url))
@ -332,7 +347,7 @@ downloadWeb addunlockedmatcher o url urlinfo file =
urlkey = addSizeUrlKey urlinfo $ Backend.URL.fromUrl url Nothing urlkey = addSizeUrlKey urlinfo $ Backend.URL.fromUrl url Nothing
downloader f p = Url.withUrlOptions $ downloadUrl False urlkey p Nothing [url] f downloader f p = Url.withUrlOptions $ downloadUrl False urlkey p Nothing [url] f
go Nothing = return Nothing go Nothing = return Nothing
go (Just (tmp, backend)) = ifM (pure (not (rawOption o)) <&&> liftIO (isHtmlFile (fromRawFilePath tmp))) go (Just (tmp, backend)) = ifM (useYoutubeDl o <&&> liftIO (isHtmlFile (fromRawFilePath tmp)))
( tryyoutubedl tmp backend ( tryyoutubedl tmp backend
, normalfinish tmp backend , normalfinish tmp backend
) )
@ -394,6 +409,15 @@ checkRaw failreason o f a
f f
| otherwise = a | otherwise = a
useYoutubeDl :: DownloadOptions -> Annex Bool
useYoutubeDl o
| rawOption o = pure False
| otherwise = case rawExceptOption o of
Nothing -> pure True
Just f -> do
remote <- getParsed f
pure (Remote.uuid remote == webUUID)
{- The destination file is not known at start time unless the user provided {- The destination file is not known at start time unless the user provided
- a filename. It's not displayed then for output consistency, - a filename. It's not displayed then for output consistency,
- but is added to the json when available. -} - but is added to the json when available. -}

View file

@ -38,7 +38,7 @@ import Logs.File
import qualified Utility.Format import qualified Utility.Format
import Utility.Tmp import Utility.Tmp
import Utility.Metered import Utility.Metered
import Command.AddUrl (addUrlFile, downloadRemoteFile, parseDownloadOptions, DownloadOptions(..), checkCanAdd, addWorkTree, checkRaw) import Command.AddUrl (addUrlFile, downloadRemoteFile, parseDownloadOptions, DownloadOptions(..), checkClaimingUrl, checkCanAdd, addWorkTree, checkRaw, useYoutubeDl)
import Annex.UUID import Annex.UUID
import Backend.URL (fromUrl) import Backend.URL (fromUrl)
import Annex.Content import Annex.Content
@ -317,9 +317,8 @@ startDownload addunlockedmatcher opts cache cv todownload = case location todown
startdownloadenclosure url = checkknown url $ startUrlDownload cv todownload url $ startdownloadenclosure url = checkknown url $ startUrlDownload cv todownload url $
downloadEnclosure addunlockedmatcher opts cache cv todownload url downloadEnclosure addunlockedmatcher opts cache cv todownload url
downloadmedia linkurl mediaurl mediakey downloadmedia linkurl mediaurl mediakey =
| rawOption (downloadOptions opts) = startdownloadlink ifM (useYoutubeDl (downloadOptions opts) <&&> youtubeDlSupported linkurl)
| otherwise = ifM (youtubeDlSupported linkurl)
( startUrlDownload cv todownload linkurl $ ( startUrlDownload cv todownload linkurl $
withTmpWorkDir mediakey $ \workdir -> do withTmpWorkDir mediakey $ \workdir -> do
dl <- youtubeDl linkurl (fromRawFilePath workdir) nullMeterUpdate dl <- youtubeDl linkurl (fromRawFilePath workdir) nullMeterUpdate
@ -348,7 +347,7 @@ startDownload addunlockedmatcher opts cache cv todownload = case location todown
contdownloadlink = downloadEnclosure addunlockedmatcher opts cache cv todownload linkurl contdownloadlink = downloadEnclosure addunlockedmatcher opts cache cv todownload linkurl
addmediafast linkurl mediaurl mediakey = addmediafast linkurl mediaurl mediakey =
ifM (pure (not (rawOption (downloadOptions opts))) ifM (useYoutubeDl (downloadOptions opts)
<&&> (pure (youtubedlscraped todownload) <||> youtubeDlSupported linkurl)) <&&> (pure (youtubedlscraped todownload) <||> youtubeDlSupported linkurl))
( startUrlDownload cv todownload linkurl $ do ( startUrlDownload cv todownload linkurl $ do
runDownload todownload linkurl ".m" cache cv $ \f -> runDownload todownload linkurl ".m" cache cv $ \f ->
@ -362,7 +361,7 @@ downloadEnclosure :: AddUnlockedMatcher -> ImportFeedOptions -> Cache -> TMVar B
downloadEnclosure addunlockedmatcher opts cache cv todownload url = downloadEnclosure addunlockedmatcher opts cache cv todownload url =
runDownload todownload url (takeWhile (/= '?') $ takeExtension url) cache cv $ \f -> do runDownload todownload url (takeWhile (/= '?') $ takeExtension url) cache cv $ \f -> do
let f' = fromRawFilePath f let f' = fromRawFilePath f
r <- Remote.claimingUrl url r <- checkClaimingUrl (downloadOptions opts) url
if Remote.uuid r == webUUID || rawOption (downloadOptions opts) if Remote.uuid r == webUUID || rawOption (downloadOptions opts)
then checkRaw (Just url) (downloadOptions opts) (pure Nothing) $ do then checkRaw (Just url) (downloadOptions opts) (pure Nothing) $ do
let dlopts = (downloadOptions opts) let dlopts = (downloadOptions opts)

View file

@ -1,6 +1,6 @@
{- git-annex remotes {- git-annex remotes
- -
- Copyright 2011-2020 Joey Hess <id@joeyh.name> - Copyright 2011-2024 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -59,6 +59,7 @@ module Remote (
logStatus, logStatus,
checkAvailable, checkAvailable,
claimingUrl, claimingUrl,
claimingUrl',
isExportSupported, isExportSupported,
) where ) where
@ -432,9 +433,15 @@ hasKeyCheap = checkPresentCheap
{- The web special remote claims urls by default. -} {- The web special remote claims urls by default. -}
claimingUrl :: URLString -> Annex Remote claimingUrl :: URLString -> Annex Remote
claimingUrl url = do claimingUrl = claimingUrl' (const True)
{- The web special remote still claims urls if there is no
- other remote that does, even when the remotefilter does
- not include it. -}
claimingUrl' :: (Remote -> Bool) -> URLString -> Annex Remote
claimingUrl' remotefilter url = do
rs <- remoteList rs <- remoteList
let web = Prelude.head $ filter (\r -> uuid r == webUUID) rs let web = Prelude.head $ filter (\r -> uuid r == webUUID) rs
fromMaybe web <$> firstM checkclaim rs fromMaybe web <$> firstM checkclaim (filter remotefilter rs)
where where
checkclaim = maybe (pure False) (`id` url) . claimUrl checkclaim = maybe (pure False) (`id` url) . claimUrl

View file

@ -45,8 +45,8 @@ be used to get better filenames.
* `--raw` * `--raw`
Prevent special handling of urls by yt-dlp, bittorrent, and other Prevent special handling of urls by yt-dlp, and by bittorrent
special remotes. This will for example, make addurl and other special remotes. This will for example, make addurl
download the .torrent file and not the contents it points to. download the .torrent file and not the contents it points to.
* `--no-raw` * `--no-raw`
@ -55,6 +55,12 @@ be used to get better filenames.
or a special remote, rather than the raw content of the url. if that or a special remote, rather than the raw content of the url. if that
cannot be done, the add will fail. cannot be done, the add will fail.
* `--raw-except=remote`
Prevent special handling of urls by all special remotes except
for the specified one. To allow special handling only
by yt-dlp, use `--raw-except=web`.
* `--file=name` * `--file=name`
Use with a filename that does not yet exist to add a new file Use with a filename that does not yet exist to add a new file

View file

@ -37,7 +37,7 @@ resulting in the new url being downloaded to such a filename.
Force downloading items it's seen before. Force downloading items it's seen before.
* `--relaxed`, `--fast`, `--raw` * `--relaxed`, `--fast`, `--raw`, `--raw-except`
These options behave the same as when using [[git-annex-addurl]](1). These options behave the same as when using [[git-annex-addurl]](1).

View file

@ -3,3 +3,5 @@ I think generally we would still prefer to use `--raw` as to avoid possible side
[[!meta author=yoh]] [[!meta author=yoh]]
[[!tag projects/dandi]] [[!tag projects/dandi]]
> [[done]] --[[Joey]]