diff --git a/Annex/Content.hs b/Annex/Content.hs
index 0e9fd9bab9..55ffd5bdbd 100644
--- a/Annex/Content.hs
+++ b/Annex/Content.hs
@@ -20,6 +20,7 @@ module Annex.Content (
 	getViaTmp,
 	getViaTmpFromDisk,
 	checkDiskSpaceToGet,
+	checkSecureHashes,
 	prepTmp,
 	withTmp,
 	checkDiskSpace,
@@ -473,7 +474,7 @@ withTmp key action = do
  - case. May also throw exceptions in some cases.
  -}
 moveAnnex :: Key -> FilePath -> Annex Bool
-moveAnnex key src = ifM (checkSecureHashes key)
+moveAnnex key src = ifM (checkSecureHashes' key)
 	( do
 		withObjectLoc key storeobject
 		return True
@@ -496,22 +497,27 @@ moveAnnex key src = ifM (checkSecureHashes key)
 	dest' = fromRawFilePath dest
 	alreadyhave = liftIO $ removeFile src
 
-checkSecureHashes :: Key -> Annex Bool
+checkSecureHashes :: Key -> Annex (Maybe String)
 checkSecureHashes key
-	| cryptographicallySecure (fromKey keyVariety key) = return True
+	| cryptographicallySecure (fromKey keyVariety key) = return Nothing
 	| otherwise = ifM (annexSecureHashesOnly <$> Annex.getGitConfig)
-		( do
-			warning $ "annex.securehashesonly blocked adding " ++ decodeBS (formatKeyVariety (fromKey keyVariety key)) ++ " key to annex objects"
-			return False
-		, return True
+		( return $ Just $ "annex.securehashesonly blocked adding " ++ decodeBS (formatKeyVariety (fromKey keyVariety key)) ++ " key"
+		, return Nothing
 		)
 
+checkSecureHashes' :: Key -> Annex Bool
+checkSecureHashes' key = checkSecureHashes key >>= \case
+	Nothing -> return True
+	Just msg -> do
+		warning $ msg ++ "to annex objects"
+		return False
+
 data LinkAnnexResult = LinkAnnexOk | LinkAnnexFailed | LinkAnnexNoop
 
 {- Populates the annex object file by hard linking or copying a source
  - file to it. -}
 linkToAnnex :: Key -> FilePath -> Maybe InodeCache -> Annex LinkAnnexResult
-linkToAnnex key src srcic = ifM (checkSecureHashes key)
+linkToAnnex key src srcic = ifM (checkSecureHashes' key)
 	( do
 		dest <- fromRawFilePath <$> calcRepo (gitAnnexLocation key)
 		modifyContent dest $ linkAnnex To key src srcic dest Nothing
diff --git a/Annex/Import.hs b/Annex/Import.hs
index a504c3be13..76ac6cff7f 100644
--- a/Annex/Import.hs
+++ b/Annex/Import.hs
@@ -12,7 +12,7 @@ module Annex.Import (
 	ImportCommitConfig(..),
 	buildImportCommit,
 	buildImportTrees,
-	downloadImport,
+	importKeys,
 	filterImportableContents,
 	makeImportMatcher,
 	listImportableContents,
@@ -280,44 +280,57 @@ buildImportTrees basetree msubdir importable = History
 		topf = asTopFilePath $
 			maybe lf (\sd -> getTopFilePath sd P.</> lf) msubdir
 
-{- Downloads all new ContentIdentifiers. Supports concurrency when enabled.
- -
- - If any download fails, the whole thing fails with Nothing,
- - but it will resume where it left off.
+{- Downloads all new ContentIdentifiers, or when importcontent is False,
+ - generates Keys without downloading.
  -
  - Generates either a Key or a git Sha, depending on annex.largefiles.
- - Note that, when a ContentIdentifiers has been imported before,
- - annex.largefiles is not reapplied, so will result in how ever that
- - content was stored in the repo before.
+ - But when importcontent is False, it cannot match on annex.largefiles
+ - (or generate a git Sha), so always generates Keys.
+ -
+ - Supports concurrency when enabled.
+ -
+ - If it fails on any file, the whole thing fails with Nothing,
+ - but it will resume where it left off.
+ -
+ - Note that, when a ContentIdentifier has been imported before,
+ - generates the same thing that was imported before, so annex.largefiles
+ - is not reapplied.
 -}
-downloadImport :: Remote -> ImportTreeConfig -> ImportableContents (ContentIdentifier, ByteSize) -> Annex (Maybe (ImportableContents (Either Sha Key)))
-downloadImport remote importtreeconfig importablecontents = do
+importKeys
+	:: Remote
+	-> ImportTreeConfig
+	-> Bool
+	-> ImportableContents (ContentIdentifier, ByteSize)
+	-> Annex (Maybe (ImportableContents (Either Sha Key)))
+importKeys remote importtreeconfig importcontent importablecontents = do
+	when (not importcontent && isNothing (Remote.importKey ia)) $
+		giveup "This remote does not support importing without downloading content."
 	-- This map is used to remember content identifiers that
-	-- were just downloaded, before they have necessarily been
+	-- were just imported, before they have necessarily been
 	-- stored in the database. This way, if the same content
 	-- identifier appears multiple times in the
 	-- importablecontents (eg when it has a history),
-	-- they will only be downloaded once.
+	-- they will only be imported once.
 	cidmap <- liftIO $ newTVarIO M.empty
 	-- When concurrency is enabled, this set is needed to
-	-- avoid two threads both downloading the same content identifier.
-	downloading <- liftIO $ newTVarIO S.empty
+	-- avoid two threads both importing the same content identifier.
+	importing <- liftIO $ newTVarIO S.empty
 	withExclusiveLock gitAnnexContentIdentifierLock $
 		bracket CIDDb.openDb CIDDb.closeDb $ \db -> do
 			CIDDb.needsUpdateFromLog db
 				>>= maybe noop (CIDDb.updateFromLog db)
-			go False cidmap downloading importablecontents db
+			go False cidmap importing importablecontents db
   where
-	go oldversion cidmap downloading (ImportableContents l h) db = do
+	go oldversion cidmap importing (ImportableContents l h) db = do
 		largematcher <- largeFilesMatcher
 		jobs <- forM l $ \i ->
-			startdownload cidmap downloading db i oldversion largematcher
+			startimport cidmap importing db i oldversion largematcher
 		l' <- liftIO $ forM jobs $
 			either pure (atomically . takeTMVar)
 		if any isNothing l'
 			then return Nothing
 			else do
-				h' <- mapM (\ic -> go True cidmap downloading ic db) h
+				h' <- mapM (\ic -> go True cidmap importing ic db) h
 				if any isNothing h'
 					then return Nothing
 					else return $ Just $
@@ -325,17 +338,17 @@ downloadImport remote importtreeconfig importablecontents = do
 							(catMaybes l')
 							(catMaybes h')
 
-	waitstart downloading cid = liftIO $ atomically $ do
-		s <- readTVar downloading
+	waitstart importing cid = liftIO $ atomically $ do
+		s <- readTVar importing
 		if S.member cid s
 			then retry
-			else writeTVar downloading $ S.insert cid s
+			else writeTVar importing $ S.insert cid s
 
-	signaldone downloading cid = liftIO $ atomically $ do
-		s <- readTVar downloading
-		writeTVar downloading $ S.delete cid s
+	signaldone importing cid = liftIO $ atomically $ do
+		s <- readTVar importing
+		writeTVar importing $ S.delete cid s
 
-	startdownload cidmap downloading db i@(loc, (cid, _sz)) oldversion largematcher = getcidkey cidmap db cid >>= \case
+	startimport cidmap importing db i@(loc, (cid, _sz)) oldversion largematcher = getcidkey cidmap db cid >>= \case
 		(k:ks) ->
 			-- If the same content was imported before
 			-- yeilding multiple different keys, it's not clear
@@ -350,10 +363,10 @@ downloadImport remote importtreeconfig importablecontents = do
 		[] -> do
 			job <- liftIO $ newEmptyTMVarIO
 			let ai = ActionItemOther (Just (fromRawFilePath (fromImportLocation loc)))
-			let downloadaction = starting ("import " ++ Remote.name remote) ai $ do
+			let importaction = starting ("import " ++ Remote.name remote) ai $ do
 				when oldversion $
 					showNote "old version"
-				tryNonAsync (download cidmap db i largematcher) >>= \case
+				tryNonAsync (importordownload cidmap db i largematcher) >>= \case
 					Left e -> next $ do
 						warning (show e)
 						liftIO $ atomically $
@@ -364,12 +377,36 @@ downloadImport remote importtreeconfig importablecontents = do
 							putTMVar job r
 						return True
 			commandAction $ bracket_
-				(waitstart downloading cid)
-				(signaldone downloading cid)
-				downloadaction
+				(waitstart importing cid)
+				(signaldone importing cid)
+				importaction
 			return (Right job)
 
-	download cidmap db (loc, (cid, sz)) largematcher = do
+	importordownload
+		| not importcontent = doimport
+		| otherwise = dodownload
+
+	doimport cidmap db (loc, (cid, sz)) _largematcher =
+		case Remote.importKey ia of
+			Nothing -> error "internal" -- checked earlier
+			Just a -> do
+				let importer p = do
+					k <- a loc cid sz p
+					checkSecureHashes k >>= \case
+						Nothing -> do
+							recordcidkey cidmap db cid k
+							logChange k (Remote.uuid remote) InfoPresent
+							return (Right k)
+						Just msg -> giveup (msg ++ " to import")
+				let runimport p = tryNonAsync (importer p) >>= \case
+					Right k -> return $ Just (loc, k)
+					Left e -> do
+						warning (show e)
+						return Nothing
+				metered Nothing sz $
+					const runimport
+
+	dodownload cidmap db (loc, (cid, sz)) largematcher = do
 		let downloader tmpfile p = do
 			k <- Remote.retrieveExportWithContentIdentifier ia loc cid tmpfile (mkkey loc tmpfile largematcher) p
 			case keyGitSha k of
@@ -394,8 +431,9 @@ downloadImport remote importtreeconfig importablecontents = do
 			metered Nothing tmpkey $
 				const (rundownload tmpfile)
 	  where
-		ia = Remote.importActions remote
 		tmpkey = importKey cid sz
+
+	ia = Remote.importActions remote
 
 	mkkey loc tmpfile largematcher = do
 		f <- fromRepo $ fromTopFilePath $ locworktreefilename loc
diff --git a/CHANGELOG b/CHANGELOG
index e6ea084c91..e8226113c2 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,10 @@
 git-annex (8.20200618) UNRELEASED; urgency=medium
 
+  * import: Added --no-content option, which avoids downloading files
+    from a special remote. Only supported by some special remotes:
+    directory
   * Honor annex.largefiles when importing a tree from a special remote.
+    (Except for when --no-content is used.)
   * Fix a deadlock that could occur after git-annex got an unlocked file,
     causing the command to hang indefinitely. Known to happen on vfat
     filesystems, possibly others.
@@ -15,7 +19,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
     itempubminute, itempubsecond.
   * Made several special remotes support locking content on them,
    which allows dropping from other special remotes in some situations
-    where it was not possible before.
+    where it was not possible before. Supported special remotes:
+    S3 (with versioning=yes), git-lfs, tahoe
   * Fix reversion that broke passing annex.* and remote.*.annex-* git
     configs with -c. (Since version 8.20200330.)
diff --git a/Command/Import.hs b/Command/Import.hs
index 094fb89e11..793375d942 100644
--- a/Command/Import.hs
+++ b/Command/Import.hs
@@ -1,6 +1,6 @@
 {- git-annex command
  -
- - Copyright 2012-2019 Joey Hess <id@joeyh.name>
+ - Copyright 2012-2020 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
@@ -17,6 +17,7 @@ import qualified Command.Reinject
 import qualified Types.Remote as Remote
 import qualified Git.Ref
 import Utility.CopyFile
+import Utility.OptParse
 import Backend
 import Types.KeySource
 import Annex.CheckIgnore
@@ -53,12 +54,16 @@ data ImportOptions
 		{ importFromRemote :: DeferredParse Remote
 		, importToBranch :: Branch
 		, importToSubDir :: Maybe FilePath
+		, importContent :: Bool
 		}
 
 optParser :: CmdParamsDesc -> Parser ImportOptions
 optParser desc = do
 	ps <- cmdParams desc
 	mfromremote <- optional $ parseRemoteOption <$> parseFromOption
+	content <- invertableSwitch "content" True
+		( help "do not get contents of imported files"
+		)
 	dupmode <- fromMaybe Default <$> optional duplicateModeParser
 	return $ case mfromremote of
 		Nothing -> LocalImportOptions ps dupmode
@@ -68,6 +73,7 @@ optParser desc = do
 			in RemoteImportOptions r
 				(Ref (encodeBS' branch))
 				(if null subdir then Nothing else Just subdir)
+				content
 		_ -> giveup "expected BRANCH[:SUBDIR]"
 
 data DuplicateMode = Default | Duplicate | DeDuplicate | CleanDuplicates | SkipDuplicates | ReinjectDuplicates
@@ -114,7 +120,7 @@ seek o@(RemoteImportOptions {}) = startConcurrency commandStages $ do
 		(pure Nothing)
 		(Just <$$> inRepo . toTopFilePath . toRawFilePath)
 		(importToSubDir o)
-	seekRemote r (importToBranch o) subdir
+	seekRemote r (importToBranch o) subdir (importContent o)
 
 startLocal :: AddUnlockedMatcher -> GetFileMatcher -> DuplicateMode -> (FilePath, FilePath) -> CommandStart
 startLocal addunlockedmatcher largematcher mode (srcfile, destfile) =
@@ -258,8 +264,8 @@ verifyExisting key destfile (yes, no) = do
 	verifyEnoughCopiesToDrop [] key Nothing need [] preverified tocheck
 		(const yes) no
 
-seekRemote :: Remote -> Branch -> Maybe TopFilePath -> CommandSeek
-seekRemote remote branch msubdir = do
+seekRemote :: Remote -> Branch -> Maybe TopFilePath -> Bool -> CommandSeek
+seekRemote remote branch msubdir importcontent = do
 	importtreeconfig <- case msubdir of
 		Nothing -> return ImportTree
 		Just subdir ->
@@ -279,7 +285,7 @@ seekRemote remote branch msubdir = do
 	void $ includeCommandAction (listContents remote importabletvar)
 	liftIO (atomically (readTVar importabletvar)) >>= \case
 		Nothing -> return ()
-		Just importable -> downloadImport remote importtreeconfig importable >>= \case
+		Just importable -> importKeys remote importtreeconfig importcontent importable >>= \case
 			Nothing -> warning $ concat
 				[ "Failed to import some files from "
 				, Remote.name remote
diff --git a/Command/Sync.hs b/Command/Sync.hs
index f8a6852a38..4e4940fafd 100644
--- a/Command/Sync.hs
+++ b/Command/Sync.hs
@@ -455,7 +455,7 @@ importRemote o mergeconfig remote currbranch
 			let subdir = if S.null p
 				then Nothing
 				else Just (asTopFilePath p)
-			Command.Import.seekRemote remote branch subdir
+			Command.Import.seekRemote remote branch subdir True
 			void $ mergeRemote remote currbranch mergeconfig o
   where
 	wantpull = remoteAnnexPull (Remote.gitconfig remote)
diff --git a/Remote/Directory.hs b/Remote/Directory.hs
index 29c277bf0a..9b22b1f8d1 100644
--- a/Remote/Directory.hs
+++ b/Remote/Directory.hs
@@ -32,6 +32,8 @@ import Types.Import
 import qualified Remote.Directory.LegacyChunked as Legacy
 import Annex.Content
 import Annex.UUID
+import Backend
+import Types.KeySource
 import Utility.Metered
 import Utility.Tmp
 import Utility.InodeCache
@@ -88,6 +90,7 @@ gen r u rc gc rs = do
 		}
 	, importActions = ImportActions
 		{ listImportableContents = listImportableContentsM dir
+		, importKey = Just (importKeyM dir)
 		, retrieveExportWithContentIdentifier = retrieveExportWithContentIdentifierM dir
 		, storeExportWithContentIdentifier = storeExportWithContentIdentifierM dir
 		, removeExportWithContentIdentifier = removeExportWithContentIdentifierM dir
@@ -342,6 +345,26 @@ mkContentIdentifier f st =
 	fmap (ContentIdentifier . encodeBS . showInodeCache)
 		<$> toInodeCache noTSDelta f st
 
+guardSameContentIdentifiers :: a -> ContentIdentifier -> Maybe ContentIdentifier -> a
+guardSameContentIdentifiers cont old new
+	| new == Just old = cont
+	| otherwise = giveup "file content has changed"
+
+importKeyM :: FilePath -> ExportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> Annex Key
+importKeyM dir loc cid sz p = do
+	backend <- chooseBackend (fromRawFilePath f)
+	k <- fst <$> genKey ks p backend
+	currcid <- liftIO $ mkContentIdentifier absf =<< getFileStatus absf
+	guardSameContentIdentifiers (return k) cid currcid
+  where
+	f = fromExportLocation loc
+	absf = dir </> fromRawFilePath f
+	ks = KeySource
+		{ keyFilename = f
+		, contentLocation = toRawFilePath absf
+		, inodeCache = Nothing
+		}
+
 retrieveExportWithContentIdentifierM :: FilePath -> ExportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key
 retrieveExportWithContentIdentifierM dir loc cid dest mkkey p =
 	precheck $ docopy postcheck
@@ -376,7 +399,7 @@ retrieveExportWithContentIdentifierM dir loc cid dest mkkey p =
 
 	-- Check before copy, to avoid expensive copy of wrong file
 	-- content.
-	precheck cont = comparecid cont
+	precheck cont = guardSameContentIdentifiers cont cid
 		=<< liftIO . mkContentIdentifier f
 		=<< liftIO (getFileStatus f)
 
@@ -404,11 +427,7 @@ retrieveExportWithContentIdentifierM dir loc cid dest mkkey p =
 #else
 		=<< getFileStatus f
 #endif
-		comparecid cont currcid
-
-	comparecid cont currcid
-		| currcid == Just cid = cont
-		| otherwise = giveup "file content has changed"
+		guardSameContentIdentifiers cont cid currcid
 
 storeExportWithContentIdentifierM :: FilePath -> FilePath -> Key -> ExportLocation -> [ContentIdentifier] -> MeterUpdate -> Annex ContentIdentifier
 storeExportWithContentIdentifierM dir src _k loc overwritablecids p = do
diff --git a/Remote/Helper/ExportImport.hs b/Remote/Helper/ExportImport.hs
index fdc3d2e2c3..4a5776b932 100644
--- a/Remote/Helper/ExportImport.hs
+++ b/Remote/Helper/ExportImport.hs
@@ -56,6 +56,7 @@ instance HasImportUnsupported (ParsedRemoteConfig -> RemoteGitConfig -> Annex Bo
 instance HasImportUnsupported (ImportActions Annex) where
 	importUnsupported = ImportActions
 		{ listImportableContents = return Nothing
+		, importKey = Nothing
 		, retrieveExportWithContentIdentifier = nope
 		, storeExportWithContentIdentifier = nope
 		, removeExportWithContentIdentifier = nope
diff --git a/Types/Remote.hs b/Types/Remote.hs
index a5efff16b1..f04266a890 100644
--- a/Types/Remote.hs
+++ b/Types/Remote.hs
@@ -276,6 +276,18 @@ data ImportActions a = ImportActions
 	-- May also find old versions of files that are still stored in the
 	-- remote.
 	{ listImportableContents :: a (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
+	-- Imports a file from the remote, without downloading it,
+	-- by generating a Key (of any type).
+	--
+	-- May update the progress meter if it needs to perform an
+	-- expensive operation, such as hashing a local file.
+	--
+	-- Ensures that the key corresponds to the ContentIdentifier,
+	-- bearing in mind that the file on the remote may have changed
+	-- since the ContentIdentifier was generated.
+	--
+	-- Throws exception on failure.
+	, importKey :: Maybe (ExportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> a Key)
 	-- Retrieves a file from the remote. Ensures that the file
 	-- it retrieves has the requested ContentIdentifier.
 	--
diff --git a/doc/git-annex-import.mdwn b/doc/git-annex-import.mdwn
index 566dacc273..7566bb5df6 100644
--- a/doc/git-annex-import.mdwn
+++ b/doc/git-annex-import.mdwn
@@ -14,11 +14,11 @@ or from a directory.
 
 # IMPORTING FROM A SPECIAL REMOTE
 
-Importing from a special remote first downloads all new content from it,
-and then constructs a git commit that reflects files that have changed on
-the special remote since the last time git-annex looked at it. Merging that
-commit into your repository will update it to reflect changes made on the
-special remote.
+Importing from a special remote first downloads or hashes all new content
+from it, and then constructs a git commit that reflects files that have
+changed on the special remote since the last time git-annex looked at it.
+Merging that commit into your repository will update it to reflect changes
+made on the special remote.
 
 This way, something can be using the special remote for file storage,
 adding files, modifying files, and deleting files, and you can track those
@@ -84,6 +84,38 @@ set. This includes expressions containing "copies=", "metadata=",
 and other things that depend on the key. Preferred content expressions
 containing "include=", "exclude=" "smallerthan=", "largerthan=" will work.
 
+# OPTIONS FOR IMPORTING FROM A SPECIAL REMOTE
+
+* `--content`, `--no-content`
+
+  Controls whether content is downloaded from the special remote.
+  The default is to download content into the git-annex repository.
+
+  With --no-content, git-annex keys are generated from information
+  provided by the special remote, without downloading it. Commands like
+  `git-annex get` can later be used to download files, as desired.
+
+  The --no-content option is not supported by all special remotes,
+  and the kind of git-annex key that is generated is left up to
+  each special remote. So while the directory special remote hashes
+  the file and generates the same key it usually would, other
+  special remotes may use unusual keys like SHA1, or WORM, depending
+  on the limitations of the special remote.
+
+  The annex.securehashesonly configuration, if set, will prevent
+  --no-content importing from a special remote that uses insecure keys.
+
+  Using --no-content prevents annex.largefiles from being checked,
+  because the files are not downloaded. So, when using --no-content,
+  files that would usually be considered non-large will be added to the
+  annex, rather than adding them directly to the git repository.
+
+  Note that a different git tree will often be generated when using
+  --no-content than would be generated when using --content, because
+  the options cause different kinds of keys to be used when importing
+  new/changed files. So mixing uses of --content and --no-content can
+  lead to merge conflicts in some situations.
+
 # IMPORTING FROM A DIRECTORY
 
 When run with a path, `git annex import` moves files from somewhere outside
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index 8eae776463..1e65761e4e 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -856,8 +856,10 @@ Like other git commands, git-annex is configured via `.git/config`.
   When this is set, the contents of files using cryptographically
   insecure hashes will not be allowed to be added to the repository.
 
-  Also, git-annex fsck` will complain about any files present in
-  the repository that use insecure hashes.
+  Also, `git-annex fsck` will complain about any files present in
+  the repository that use insecure hashes. And,
+  `git-annex import --no-content` will refuse to import files
+  from special remotes using insecure hashes.
 
   To configure the behavior in new clones of the repository,
   this can be set using [[git-annex-config]].
diff --git a/doc/todo/importing_from_special_remote_without_downloading/comment_7_87dda7df06cde981f142781849793c08._comment b/doc/todo/importing_from_special_remote_without_downloading/comment_7_87dda7df06cde981f142781849793c08._comment
new file mode 100644
index 0000000000..b340bd6954
--- /dev/null
+++ b/doc/todo/importing_from_special_remote_without_downloading/comment_7_87dda7df06cde981f142781849793c08._comment
@@ -0,0 +1,12 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 7"""
+ date="2020-07-03T16:27:10Z"
+ content="""
+Hmm, --fast is not very descriptive for this when it's used with a
+directory special remote, because hashing is almost as slow as copying.
+
+Probably better to use --no-content and --content, same as sync.
+(Though unfortunately with an opposite default though iirc there are plans
+somewhere to transition sync to default to --content).
+"""]]
diff --git a/doc/todo/importing_from_special_remote_without_downloading/comment_8_69d49ecfaf18d85a535fd7b52e443cc2._comment b/doc/todo/importing_from_special_remote_without_downloading/comment_8_69d49ecfaf18d85a535fd7b52e443cc2._comment
new file mode 100644
index 0000000000..8d57141205
--- /dev/null
+++ b/doc/todo/importing_from_special_remote_without_downloading/comment_8_69d49ecfaf18d85a535fd7b52e443cc2._comment
@@ -0,0 +1,12 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 8"""
+ date="2020-07-03T17:39:19Z"
+ content="""
+Note that, since exporttree remotes are always untrusted, after importing
+--no-content from one, fsck is going to complain about it being the only
+location with the content.
+
+Which seems right.. That content could be overwritten at any time and the
+only copy lost. But still worth keeping in mind.
+"""]]
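For reference, a minimal usage sketch of the new option. The remote and branch names here are hypothetical; it assumes a directory special remote already configured with importtree=yes, as described in git-annex-import:

    # generate keys from the remote's files without downloading them
    git annex import master --from usbdrive --no-content
    # merge the imported tree into the local branch
    git merge usbdrive/master
    # download any of the content later, as desired
    git annex get .

Because --no-content skips the download, annex.largefiles is not consulted, so every imported file is annexed rather than checked directly into git.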