import: Added --no-content option, which avoids downloading files from a special remote

Only supported by some special remotes: directory
I need to check the rest and they're currently missing methods until I do.

git-annex sync --no-content does not yet use this to do imports
This commit is contained in:
Joey Hess 2020-07-03 13:41:57 -04:00
parent a8099b9896
commit 85506a7015
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
12 changed files with 204 additions and 60 deletions

View file

@ -20,6 +20,7 @@ module Annex.Content (
getViaTmp,
getViaTmpFromDisk,
checkDiskSpaceToGet,
checkSecureHashes,
prepTmp,
withTmp,
checkDiskSpace,
@ -473,7 +474,7 @@ withTmp key action = do
- case. May also throw exceptions in some cases.
-}
moveAnnex :: Key -> FilePath -> Annex Bool
moveAnnex key src = ifM (checkSecureHashes key)
moveAnnex key src = ifM (checkSecureHashes' key)
( do
withObjectLoc key storeobject
return True
@ -496,22 +497,27 @@ moveAnnex key src = ifM (checkSecureHashes key)
dest' = fromRawFilePath dest
alreadyhave = liftIO $ removeFile src
checkSecureHashes :: Key -> Annex Bool
checkSecureHashes :: Key -> Annex (Maybe String)
checkSecureHashes key
| cryptographicallySecure (fromKey keyVariety key) = return True
| cryptographicallySecure (fromKey keyVariety key) = return Nothing
| otherwise = ifM (annexSecureHashesOnly <$> Annex.getGitConfig)
( do
warning $ "annex.securehashesonly blocked adding " ++ decodeBS (formatKeyVariety (fromKey keyVariety key)) ++ " key to annex objects"
return False
, return True
( return $ Just $ "annex.securehashesonly blocked adding " ++ decodeBS (formatKeyVariety (fromKey keyVariety key)) ++ " key"
, return Nothing
)
checkSecureHashes' :: Key -> Annex Bool
checkSecureHashes' key = checkSecureHashes key >>= \case
Nothing -> return True
Just msg -> do
warning $ msg ++ "to annex objects"
return False
data LinkAnnexResult = LinkAnnexOk | LinkAnnexFailed | LinkAnnexNoop
{- Populates the annex object file by hard linking or copying a source
- file to it. -}
linkToAnnex :: Key -> FilePath -> Maybe InodeCache -> Annex LinkAnnexResult
linkToAnnex key src srcic = ifM (checkSecureHashes key)
linkToAnnex key src srcic = ifM (checkSecureHashes' key)
( do
dest <- fromRawFilePath <$> calcRepo (gitAnnexLocation key)
modifyContent dest $ linkAnnex To key src srcic dest Nothing

View file

@ -12,7 +12,7 @@ module Annex.Import (
ImportCommitConfig(..),
buildImportCommit,
buildImportTrees,
downloadImport,
importKeys,
filterImportableContents,
makeImportMatcher,
listImportableContents,
@ -280,44 +280,57 @@ buildImportTrees basetree msubdir importable = History
topf = asTopFilePath $
maybe lf (\sd -> getTopFilePath sd P.</> lf) msubdir
{- Downloads all new ContentIdentifiers. Supports concurrency when enabled.
-
- If any download fails, the whole thing fails with Nothing,
- but it will resume where it left off.
{- Downloads all new ContentIdentifiers, or when importcontent is False,
- generates Keys without downloading.
-
- Generates either a Key or a git Sha, depending on annex.largefiles.
- Note that, when a ContentIdentifiers has been imported before,
- annex.largefiles is not reapplied, so will result in how ever that
- content was stored in the repo before.
- But when importcontent is False, it cannot match on annex.largefiles
- (or generate a git Sha), so always generates Keys.
-
- Supports concurrency when enabled.
-
- If it fails on any file, the whole thing fails with Nothing,
- but it will resume where it left off.
-
- Note that, when a ContentIdentifier has been imported before,
- generates the same thing that was imported before, so annex.largefiles
- is not reapplied.
-}
downloadImport :: Remote -> ImportTreeConfig -> ImportableContents (ContentIdentifier, ByteSize) -> Annex (Maybe (ImportableContents (Either Sha Key)))
downloadImport remote importtreeconfig importablecontents = do
importKeys
:: Remote
-> ImportTreeConfig
-> Bool
-> ImportableContents (ContentIdentifier, ByteSize)
-> Annex (Maybe (ImportableContents (Either Sha Key)))
importKeys remote importtreeconfig importcontent importablecontents = do
when (not importcontent && isNothing (Remote.importKey ia)) $
giveup "This remote does not support importing without downloading content."
-- This map is used to remember content identifiers that
-- were just downloaded, before they have necessarily been
-- were just imported, before they have necessarily been
-- stored in the database. This way, if the same content
-- identifier appears multiple times in the
-- importablecontents (eg when it has a history),
-- they will only be downloaded once.
-- they will only be imported once.
cidmap <- liftIO $ newTVarIO M.empty
-- When concurrency is enabled, this set is needed to
-- avoid two threads both downloading the same content identifier.
downloading <- liftIO $ newTVarIO S.empty
-- avoid two threads both importing the same content identifier.
importing <- liftIO $ newTVarIO S.empty
withExclusiveLock gitAnnexContentIdentifierLock $
bracket CIDDb.openDb CIDDb.closeDb $ \db -> do
CIDDb.needsUpdateFromLog db
>>= maybe noop (CIDDb.updateFromLog db)
go False cidmap downloading importablecontents db
go False cidmap importing importablecontents db
where
go oldversion cidmap downloading (ImportableContents l h) db = do
go oldversion cidmap importing (ImportableContents l h) db = do
largematcher <- largeFilesMatcher
jobs <- forM l $ \i ->
startdownload cidmap downloading db i oldversion largematcher
startimport cidmap importing db i oldversion largematcher
l' <- liftIO $ forM jobs $
either pure (atomically . takeTMVar)
if any isNothing l'
then return Nothing
else do
h' <- mapM (\ic -> go True cidmap downloading ic db) h
h' <- mapM (\ic -> go True cidmap importing ic db) h
if any isNothing h'
then return Nothing
else return $ Just $
@ -325,17 +338,17 @@ downloadImport remote importtreeconfig importablecontents = do
(catMaybes l')
(catMaybes h')
waitstart downloading cid = liftIO $ atomically $ do
s <- readTVar downloading
waitstart importing cid = liftIO $ atomically $ do
s <- readTVar importing
if S.member cid s
then retry
else writeTVar downloading $ S.insert cid s
else writeTVar importing $ S.insert cid s
signaldone downloading cid = liftIO $ atomically $ do
s <- readTVar downloading
writeTVar downloading $ S.delete cid s
signaldone importing cid = liftIO $ atomically $ do
s <- readTVar importing
writeTVar importing $ S.delete cid s
startdownload cidmap downloading db i@(loc, (cid, _sz)) oldversion largematcher = getcidkey cidmap db cid >>= \case
startimport cidmap importing db i@(loc, (cid, _sz)) oldversion largematcher = getcidkey cidmap db cid >>= \case
(k:ks) ->
-- If the same content was imported before
-- yeilding multiple different keys, it's not clear
@ -350,10 +363,10 @@ downloadImport remote importtreeconfig importablecontents = do
[] -> do
job <- liftIO $ newEmptyTMVarIO
let ai = ActionItemOther (Just (fromRawFilePath (fromImportLocation loc)))
let downloadaction = starting ("import " ++ Remote.name remote) ai $ do
let importaction = starting ("import " ++ Remote.name remote) ai $ do
when oldversion $
showNote "old version"
tryNonAsync (download cidmap db i largematcher) >>= \case
tryNonAsync (importordownload cidmap db i largematcher) >>= \case
Left e -> next $ do
warning (show e)
liftIO $ atomically $
@ -364,12 +377,36 @@ downloadImport remote importtreeconfig importablecontents = do
putTMVar job r
return True
commandAction $ bracket_
(waitstart downloading cid)
(signaldone downloading cid)
downloadaction
(waitstart importing cid)
(signaldone importing cid)
importaction
return (Right job)
download cidmap db (loc, (cid, sz)) largematcher = do
importordownload
| not importcontent = doimport
| otherwise = dodownload
doimport cidmap db (loc, (cid, sz)) _largematcher =
case Remote.importKey ia of
Nothing -> error "internal" -- checked earlier
Just a -> do
let importer p = do
k <- a loc cid sz p
checkSecureHashes k >>= \case
Nothing -> do
recordcidkey cidmap db cid k
logChange k (Remote.uuid remote) InfoPresent
return (Right k)
Just msg -> giveup (msg ++ " to import")
let runimport p = tryNonAsync (importer p) >>= \case
Right k -> return $ Just (loc, k)
Left e -> do
warning (show e)
return Nothing
metered Nothing sz $
const runimport
dodownload cidmap db (loc, (cid, sz)) largematcher = do
let downloader tmpfile p = do
k <- Remote.retrieveExportWithContentIdentifier ia loc cid tmpfile (mkkey loc tmpfile largematcher) p
case keyGitSha k of
@ -394,8 +431,9 @@ downloadImport remote importtreeconfig importablecontents = do
metered Nothing tmpkey $
const (rundownload tmpfile)
where
ia = Remote.importActions remote
tmpkey = importKey cid sz
ia = Remote.importActions remote
mkkey loc tmpfile largematcher = do
f <- fromRepo $ fromTopFilePath $ locworktreefilename loc

View file

@ -1,6 +1,10 @@
git-annex (8.20200618) UNRELEASED; urgency=medium
* import: Added --no-content option, which avoids downloading files
from a special remote. Only supported by some special remotes:
directory
* Honor annex.largefiles when importing a tree from a special remote.
(Except for when --no-content is used.)
* Fix a deadlock that could occur after git-annex got an unlocked
file, causing the command to hang indefinitely. Known to happen on
vfat filesystems, possibly others.
@ -15,7 +19,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
itempubminute, itempubsecond.
* Made several special remotes support locking content on them,
which allows dropping from other special remotes in some situations
where it was not possible before.
where it was not possible before. Supported special remotes:
S3 (with versioning=yes), git-lfs, tahoe
* Fix reversion that broke passing annex.* and remote.*.annex-*
git configs with -c. (Since version 8.20200330.)

View file

@ -1,6 +1,6 @@
{- git-annex command
-
- Copyright 2012-2019 Joey Hess <id@joeyh.name>
- Copyright 2012-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -17,6 +17,7 @@ import qualified Command.Reinject
import qualified Types.Remote as Remote
import qualified Git.Ref
import Utility.CopyFile
import Utility.OptParse
import Backend
import Types.KeySource
import Annex.CheckIgnore
@ -53,12 +54,16 @@ data ImportOptions
{ importFromRemote :: DeferredParse Remote
, importToBranch :: Branch
, importToSubDir :: Maybe FilePath
, importContent :: Bool
}
optParser :: CmdParamsDesc -> Parser ImportOptions
optParser desc = do
ps <- cmdParams desc
mfromremote <- optional $ parseRemoteOption <$> parseFromOption
content <- invertableSwitch "content" True
( help "do not get contents of imported files"
)
dupmode <- fromMaybe Default <$> optional duplicateModeParser
return $ case mfromremote of
Nothing -> LocalImportOptions ps dupmode
@ -68,6 +73,7 @@ optParser desc = do
in RemoteImportOptions r
(Ref (encodeBS' branch))
(if null subdir then Nothing else Just subdir)
content
_ -> giveup "expected BRANCH[:SUBDIR]"
data DuplicateMode = Default | Duplicate | DeDuplicate | CleanDuplicates | SkipDuplicates | ReinjectDuplicates
@ -114,7 +120,7 @@ seek o@(RemoteImportOptions {}) = startConcurrency commandStages $ do
(pure Nothing)
(Just <$$> inRepo . toTopFilePath . toRawFilePath)
(importToSubDir o)
seekRemote r (importToBranch o) subdir
seekRemote r (importToBranch o) subdir (importContent o)
startLocal :: AddUnlockedMatcher -> GetFileMatcher -> DuplicateMode -> (FilePath, FilePath) -> CommandStart
startLocal addunlockedmatcher largematcher mode (srcfile, destfile) =
@ -258,8 +264,8 @@ verifyExisting key destfile (yes, no) = do
verifyEnoughCopiesToDrop [] key Nothing need [] preverified tocheck
(const yes) no
seekRemote :: Remote -> Branch -> Maybe TopFilePath -> CommandSeek
seekRemote remote branch msubdir = do
seekRemote :: Remote -> Branch -> Maybe TopFilePath -> Bool -> CommandSeek
seekRemote remote branch msubdir importcontent = do
importtreeconfig <- case msubdir of
Nothing -> return ImportTree
Just subdir ->
@ -279,7 +285,7 @@ seekRemote remote branch msubdir = do
void $ includeCommandAction (listContents remote importabletvar)
liftIO (atomically (readTVar importabletvar)) >>= \case
Nothing -> return ()
Just importable -> downloadImport remote importtreeconfig importable >>= \case
Just importable -> importKeys remote importtreeconfig importcontent importable >>= \case
Nothing -> warning $ concat
[ "Failed to import some files from "
, Remote.name remote

View file

@ -455,7 +455,7 @@ importRemote o mergeconfig remote currbranch
let subdir = if S.null p
then Nothing
else Just (asTopFilePath p)
Command.Import.seekRemote remote branch subdir
Command.Import.seekRemote remote branch subdir True
void $ mergeRemote remote currbranch mergeconfig o
where
wantpull = remoteAnnexPull (Remote.gitconfig remote)

View file

@ -32,6 +32,8 @@ import Types.Import
import qualified Remote.Directory.LegacyChunked as Legacy
import Annex.Content
import Annex.UUID
import Backend
import Types.KeySource
import Utility.Metered
import Utility.Tmp
import Utility.InodeCache
@ -88,6 +90,7 @@ gen r u rc gc rs = do
}
, importActions = ImportActions
{ listImportableContents = listImportableContentsM dir
, importKey = Just (importKeyM dir)
, retrieveExportWithContentIdentifier = retrieveExportWithContentIdentifierM dir
, storeExportWithContentIdentifier = storeExportWithContentIdentifierM dir
, removeExportWithContentIdentifier = removeExportWithContentIdentifierM dir
@ -342,6 +345,26 @@ mkContentIdentifier f st =
fmap (ContentIdentifier . encodeBS . showInodeCache)
<$> toInodeCache noTSDelta f st
guardSameContentIdentifiers :: a -> ContentIdentifier -> Maybe ContentIdentifier -> a
guardSameContentIdentifiers cont old new
| new == Just old = cont
| otherwise = giveup "file content has changed"
importKeyM :: FilePath -> ExportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> Annex Key
importKeyM dir loc cid sz p = do
backend <- chooseBackend (fromRawFilePath f)
k <- fst <$> genKey ks p backend
currcid <- liftIO $ mkContentIdentifier absf =<< getFileStatus absf
guardSameContentIdentifiers (return k) cid currcid
where
f = fromExportLocation loc
absf = dir </> fromRawFilePath f
ks = KeySource
{ keyFilename = f
, contentLocation = toRawFilePath absf
, inodeCache = Nothing
}
retrieveExportWithContentIdentifierM :: FilePath -> ExportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key
retrieveExportWithContentIdentifierM dir loc cid dest mkkey p =
precheck $ docopy postcheck
@ -376,7 +399,7 @@ retrieveExportWithContentIdentifierM dir loc cid dest mkkey p =
-- Check before copy, to avoid expensive copy of wrong file
-- content.
precheck cont = comparecid cont
precheck cont = guardSameContentIdentifiers cont cid
=<< liftIO . mkContentIdentifier f
=<< liftIO (getFileStatus f)
@ -404,11 +427,7 @@ retrieveExportWithContentIdentifierM dir loc cid dest mkkey p =
#else
=<< getFileStatus f
#endif
comparecid cont currcid
comparecid cont currcid
| currcid == Just cid = cont
| otherwise = giveup "file content has changed"
guardSameContentIdentifiers cont cid currcid
storeExportWithContentIdentifierM :: FilePath -> FilePath -> Key -> ExportLocation -> [ContentIdentifier] -> MeterUpdate -> Annex ContentIdentifier
storeExportWithContentIdentifierM dir src _k loc overwritablecids p = do

View file

@ -56,6 +56,7 @@ instance HasImportUnsupported (ParsedRemoteConfig -> RemoteGitConfig -> Annex Bo
instance HasImportUnsupported (ImportActions Annex) where
importUnsupported = ImportActions
{ listImportableContents = return Nothing
, importKey = Nothing
, retrieveExportWithContentIdentifier = nope
, storeExportWithContentIdentifier = nope
, removeExportWithContentIdentifier = nope

View file

@ -276,6 +276,18 @@ data ImportActions a = ImportActions
-- May also find old versions of files that are still stored in the
-- remote.
{ listImportableContents :: a (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
-- Imports a file from the remote, without downloading it,
-- by generating a Key (of any type).
--
-- May update the progress meter if it needs to perform an
-- expensive operation, such as hashing a local file.
--
-- Ensures that the key corresponds to the ContentIdentifier,
-- bearing in mind that the file on the remote may have changed
-- since the ContentIdentifier was generated.
--
-- Throws exception on failure.
, importKey :: Maybe (ExportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> a Key)
-- Retrieves a file from the remote. Ensures that the file
-- it retrieves has the requested ContentIdentifier.
--

View file

@ -14,11 +14,11 @@ or from a directory.
# IMPORTING FROM A SPECIAL REMOTE
Importing from a special remote first downloads all new content from it,
and then constructs a git commit that reflects files that have changed on
the special remote since the last time git-annex looked at it. Merging that
commit into your repository will update it to reflect changes made on the
special remote.
Importing from a special remote first downloads or hashes all new content
from it, and then constructs a git commit that reflects files that have
changed on the special remote since the last time git-annex looked at it.
Merging that commit into your repository will update it to reflect changes
made on the special remote.
This way, something can be using the special remote for file storage,
adding files, modifying files, and deleting files, and you can track those
@ -84,6 +84,38 @@ set. This includes expressions containing "copies=", "metadata=", and other
things that depend on the key. Preferred content expressions containing
"include=", "exclude=" "smallerthan=", "largerthan=" will work.
# OPTIONS FOR IMPORTING FROM A SPECIAL REMOTE
* `--content`, `--no-content`
Controls whether content is downloaded from the special remote.
The default is to download content into the git-annex repository.
With --no-content, git-annex keys are generated from information
provided by the special remote, without downloading it. Commands like
`git-annex get` can later be used to download files, as desired.
The --no-content option is not supported by all special remotes,
and the kind of git-annex key that is generated is left up to
each special remote. So while the directory special remote hashes
the file and generates the same key it usually would, other
special remotes may use unusual keys like SHA1, or WORM, depending
on the limitations of the special remote.
The annex.securehashesonly configuration, if set, will prevent
--no-content importing from a special remote that uses insecure keys.
Using --no-content prevents annex.largefiles from being checked,
because the files are not downloaded. So, when using --no-content,
files that would usually be considered non-large will be added to the
annex, rather than adding them directly to the git repository.
Note that a different git tree will often be generated when using
--no-content than would be generated when using --content, because
the options cause different kinds of keys to be used when importing
new/changed files. So mixing uses of --content and --no-content can
lead to merge conflicts in some situations.
# IMPORTING FROM A DIRECTORY
When run with a path, `git annex import` moves files from somewhere outside

View file

@ -856,8 +856,10 @@ Like other git commands, git-annex is configured via `.git/config`.
When this is set, the contents of files using cryptographically
insecure hashes will not be allowed to be added to the repository.
Also, git-annex fsck` will complain about any files present in
the repository that use insecure hashes.
Also, `git-annex fsck` will complain about any files present in
the repository that use insecure hashes. And,
`git-annex import --no-content` will refuse to import files
from special remotes using insecure hashes.
To configure the behavior in new clones of the repository,
this can be set using [[git-annex-config]].

View file

@ -0,0 +1,12 @@
[[!comment format=mdwn
username="joey"
subject="""comment 7"""
date="2020-07-03T16:27:10Z"
content="""
Hmm, --fast is not very descriptive for this when it's used with a
directory special remote, because hashing is almost as slow as copying.
Probably better to use --no-content and --content, same as sync.
(Though unfortunately with an opposite default though iirc there are plans
somewhere to transition sync to default to --content).
"""]]

View file

@ -0,0 +1,12 @@
[[!comment format=mdwn
username="joey"
subject="""comment 8"""
date="2020-07-03T17:39:19Z"
content="""
Note that, since exporttree remotes are always untrusted, after importing
--no-content from one, fsck is going to complain about it being the only
location with the content.
Which seems right.. That content could be overwritten at any time and the
only copy lost. But still worth keeping in mind.
"""]]