From ef8c36254a2f94b05e766e96588fde9cc51603b5 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 17 Dec 2020 13:12:35 -0400 Subject: [PATCH 01/24] docs for borg special remote (which DNE yet) --- doc/special_remotes.mdwn | 1 + doc/special_remotes/borg.mdwn | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 doc/special_remotes/borg.mdwn diff --git a/doc/special_remotes.mdwn b/doc/special_remotes.mdwn index 29a1f36347..515699313e 100644 --- a/doc/special_remotes.mdwn +++ b/doc/special_remotes.mdwn @@ -25,6 +25,7 @@ the git history is not stored in them. * [[webdav]] * [[git]] * [[httpalso]] +* [[borg]] * [[xmpp]] The above special remotes are built into git-annex, and can be used diff --git a/doc/special_remotes/borg.mdwn b/doc/special_remotes/borg.mdwn new file mode 100644 index 0000000000..b2faa308d9 --- /dev/null +++ b/doc/special_remotes/borg.mdwn @@ -0,0 +1,24 @@ +This special remote type accesses files stored in a +[borg](https://www.borgbackup.org/) repository. + +This allows retrieving annexed files from borg repositories, and since +git-annex knows which files are stored in a borg repository, it can be +configured to treat the repository as one copy of the file. + +Unlike most special remotes, this is read-only; it cannot change what +is stored in a borg repository. You do that by using borg as usual, +and then `git-annex sync` will learn about the files that are stored +in the borg repository. + +## configuration + +These parameters can be passed to `git annex initremote` to configure the +remote: + +* `repository` - The location of a borg repository, eg a path, or + `user@host:path` for ssh access. + +Setup example: + + # borg init --encryption=keyfile /path/to/repo + # git annex initremote borg type=borg repository=/path/to/repo From f930176d6ec03e55b048c879ea5507aef113e618 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 17 Dec 2020 17:06:50 -0400 Subject: [PATCH 02/24] change info from export=yes to exporttree=yes and same for import for consistency --- Remote/Helper/ExportImport.hs | 46 +++++++++++++++++------------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/Remote/Helper/ExportImport.hs b/Remote/Helper/ExportImport.hs index 9ff6f92800..b8d6390e4a 100644 --- a/Remote/Helper/ExportImport.hs +++ b/Remote/Helper/ExportImport.hs @@ -102,35 +102,35 @@ adjustExportImportRemoteType rt = rt { setup = setup' } -- | Adjust a remote to support exporttree=yes and/or importree=yes. adjustExportImport :: Remote -> RemoteStateHandle -> Annex Remote adjustExportImport r rs = do - isexport <- pure (exportTree (config r)) <&&> isExportSupported r - isimport <- pure (importTree (config r)) <&&> isImportSupported r + isexporttree <- pure (exportTree (config r)) <&&> isexporttreeSupported r + isimporttree <- pure (importTree (config r)) <&&> isimporttreeSupported r let r' = r { remotetype = (remotetype r) - { exportSupported = if isexport + { exportSupported = if isexporttree then exportSupported (remotetype r) else exportUnsupported - , importSupported = if isimport + , importSupported = if isimporttree then importSupported (remotetype r) else importUnsupported } } - if not isexport && not isimport + if not isexporttree && not isimporttree then return r' - else adjustExportImport' isexport isimport r' rs + else adjustExportImport' isexporttree isimporttree r' rs adjustExportImport' :: Bool -> Bool -> Remote -> RemoteStateHandle -> Annex Remote -adjustExportImport' isexport isimport r rs = do +adjustExportImport' isexporttree isimporttree r rs = do dbv <- prepdbv ciddbv <- prepciddb - let normal = not isexport && not isimport + let normal = not isexporttree && not isimporttree let iskeyvaluestore = normal || appendonly r return $ r - { exportActions = if isexport - then if isimport + { exportActions = if isexporttree + then if isimporttree then exportActionsForImport dbv ciddbv (exportActions r) else exportActions r else exportUnsupported - , importActions = if isimport + , importActions = if isimporttree then importActions r else importUnsupported , storeKey = \k af p -> @@ -139,9 +139,9 @@ adjustExportImport' isexport isimport r rs = do -- when another repository has already stored the -- key, and the local repository does not know -- about it. To avoid unnecessary costs, don't do it. - if isexport + if isexporttree then giveup "remote is configured with exporttree=yes; use `git-annex export` to store content on it" - else if isimport + else if isimporttree then giveup "remote is configured with importtree=yes and without exporttree=yes; cannot modify content stored on it" else storeKey r k af p , removeKey = \k -> @@ -151,19 +151,19 @@ adjustExportImport' isexport isimport r rs = do -- files would not be dealt with correctly. -- There does not seem to be a good use case for -- removing a key from an export in any case. - if isexport + if isexporttree then giveup "dropping content from an export is not supported; use `git annex export` to export a tree that lacks the files you want to remove" - else if isimport + else if isimporttree then giveup "dropping content from this remote is not supported because it is configured with importtree=yes" else removeKey r k , lockContent = if iskeyvaluestore then lockContent r else Nothing , retrieveKeyFile = \k af dest p -> - if isimport + if isimporttree then supportappendonlyretrieve k af dest p $ retrieveKeyFileFromImport dbv ciddbv k af dest p - else if isexport + else if isexporttree then supportappendonlyretrieve k af dest p $ retrieveKeyFileFromExport dbv k af dest p else retrieveKeyFile r k af dest p @@ -172,10 +172,10 @@ adjustExportImport' isexport isimport r rs = do else Nothing , checkPresent = \k -> if appendonly r then checkPresent r k - else if isimport + else if isimporttree then anyM (checkPresentImport ciddbv k) =<< getexportlocs dbv k - else if isexport + else if isexporttree -- Check if any of the files a key -- was exported to are present. This -- doesn't guarantee the export @@ -201,14 +201,14 @@ adjustExportImport' isexport isimport r rs = do else return Nothing , getInfo = do is <- getInfo r - is' <- if isexport + is' <- if isexporttree then do ts <- map fromRef . exportedTreeishes <$> getExport (uuid r) - return (is++[("export", "yes"), ("exportedtree", unwords ts)]) + return (is++[("exporttree", "yes"), ("exportedtree", unwords ts)]) else return is - return $ if isimport - then (is'++[("import", "yes")]) + return $ if isimporttree + then (is'++[("importtree", "yes")]) else is' } where From 037f8b68635072daa48ea13a1cfd25875daf3f44 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 18 Dec 2020 11:06:23 -0400 Subject: [PATCH 03/24] update --- doc/special_remotes/borg.mdwn | 64 ++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/doc/special_remotes/borg.mdwn b/doc/special_remotes/borg.mdwn index b2faa308d9..8869288e88 100644 --- a/doc/special_remotes/borg.mdwn +++ b/doc/special_remotes/borg.mdwn @@ -1,14 +1,10 @@ -This special remote type accesses files stored in a +This special remote type accesses annexed files stored in a [borg](https://www.borgbackup.org/) repository. -This allows retrieving annexed files from borg repositories, and since -git-annex knows which files are stored in a borg repository, it can be -configured to treat the repository as one copy of the file. - -Unlike most special remotes, this is read-only; it cannot change what -is stored in a borg repository. You do that by using borg as usual, -and then `git-annex sync` will learn about the files that are stored -in the borg repository. +Unlike most special remotes, git-annex cannot be used to store annexed +files in this special remote. You store files by using borg as usual, to +back up the git-annex repository. Then `git-annex sync` will learn about +the annexed files that are stored in the borg repository. ## configuration @@ -18,7 +14,51 @@ remote: * `repository` - The location of a borg repository, eg a path, or `user@host:path` for ssh access. -Setup example: +* `scan` - The path, within the borg repository, to scan for + annex object files. This can be the path to a git-annex repository, + or perhaps a non-encrypted special remote, or a path that contains + several repositories. - # borg init --encryption=keyfile /path/to/repo - # git annex initremote borg type=borg repository=/path/to/repo + Information about all annex objects in the path will be + added to the git-annex branch when syncing with the borg repository. + So, it's best to avoid a path that contains object files for unrelated + git-annex repositories. + +## setup example + + # borg init --encryption=keyfile /path/to/borgrepo + # git annex initremote borg type=borg repository=/path/to/borgrepo scan=`pwd` + # borg create /path/to/borgrepo `pwd`::{now} + # git annex sync borg + +## trust levels, borg delete and borg prune + +git-annex will by default treat the borg special remote as untrusted, so +will not trust it to continue to contain a [[copy|copies]] of any annexed +file. This is necessary because you could run `borg delete` or `borg prune` +and remove the copy from the borg repository. If you choose to set the +trust level of the borg repository to a higher level, you need to avoid +using such commands with that borg repository. + +Consider this example: + + git-annex add annexedfile + borg create /path/to/borgrepo `pwd`::foo + git-annex sync borg + git-annex semitrust borg + git-annex drop annexedfile + +Now the only copy of annexedfile is in the borg repository. + + borg create /path/to/borgrepo `pwd`::bar + borg delete /path/to/borgrepo::foo + git-annex sync borg + git-annex whereis annexedfile + +Now no copies of annexfile remain, because the "foo" archive +in the borg repository was the only one to contain it, and it was deleted. + +So either keep the borg special remote as untrusted, and use such borg +commands to delete old archives as needed, or avoid using `borg delete` +and `borg prune`, and then the remote can safely be made semitrusted or +trusted. From 9a2c8757f37696ef4c56fdf155863846e235ab16 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 18 Dec 2020 14:52:57 -0400 Subject: [PATCH 04/24] add thirdPartyPopulated interface This is to support, eg a borg repo as a special remote, which is populated not by running git-annex commands, but by using borg. Then git-annex sync lists the content of the remote, learns which files are annex objects, and treats those as present in the remote. So, most of the import machinery is reused, to a new purpose. While normally importtree maintains a remote tracking branch, this does not, because the files stored in the remote are annex object files, not user-visible filenames. But, internally, a git tree is still generated, of the files on the remote that are annex objects. This tree is used by retrieveExportWithContentIdentifier, etc. As with other import/export remotes, that the tree is recorded in the export log, and gets grafted into the git-annex branch. importKey changed to be able to return Nothing, to indicate when an ImportLocation is not an annex object and so should be skipped from being included in the tree. It did not seem to make sense to have git-annex import do this, since from the user's perspective, it's not like other imports. So only git-annex sync does it. Note that, git-annex sync does not yet download objects from such remotes that are preferred content. importKeys is run with content downloading disabled, to avoid getting the content of all objects. Perhaps what's needed is for seekSyncContent to be run with these remotes, but I don't know if it will just work (in particular, it needs to avoid trying to transfer objects to them), so I skipped that for now. (Untested and unused as of yet.) This commit was sponsored by Jochen Bartl on Patreon. --- Annex/Import.hs | 91 +++++++++++++++++++++-------------- Command/Import.hs | 2 +- Command/Sync.hs | 34 ++++++++++++- Remote/Adb.hs | 1 + Remote/BitTorrent.hs | 1 + Remote/Bup.hs | 1 + Remote/Ddar.hs | 1 + Remote/Directory.hs | 5 +- Remote/External.hs | 1 + Remote/GCrypt.hs | 1 + Remote/Git.hs | 1 + Remote/GitLFS.hs | 1 + Remote/Glacier.hs | 1 + Remote/Helper/ExportImport.hs | 82 +++++++++++++++++++------------ Remote/Hook.hs | 1 + Remote/HttpAlso.hs | 1 + Remote/P2P.hs | 1 + Remote/Rsync.hs | 1 + Remote/S3.hs | 1 + Remote/Tahoe.hs | 1 + Remote/Web.hs | 1 + Remote/WebDAV.hs | 1 + Types/Remote.hs | 22 ++++++--- 23 files changed, 176 insertions(+), 77 deletions(-) diff --git a/Annex/Import.hs b/Annex/Import.hs index 7c59f9a5ec..c5a16f3d86 100644 --- a/Annex/Import.hs +++ b/Annex/Import.hs @@ -12,6 +12,7 @@ module Annex.Import ( ImportCommitConfig(..), buildImportCommit, buildImportTrees, + recordImportTree, canImportKeys, importKeys, makeImportMatcher, @@ -104,6 +105,28 @@ buildImportCommit remote importtreeconfig importcommitconfig importable = Just trackingcommit -> inRepo (Git.Ref.tree trackingcommit) >>= \case Nothing -> go Nothing Just _ -> go (Just trackingcommit) + where + go trackingcommit = do + (imported, updatestate) <- recordImportTree remote importtreeconfig importable + buildImportCommit' remote importcommitconfig trackingcommit imported >>= \case + Just finalcommit -> do + updatestate + return (Just finalcommit) + Nothing -> return Nothing + +{- Builds a tree for an import from a special remote. + - + - Also returns an action that can be used to update + - all the other state to record the import. + -} +recordImportTree + :: Remote + -> ImportTreeConfig + -> ImportableContents (Either Sha Key) + -> Annex (History Sha, Annex ()) +recordImportTree remote importtreeconfig importable = do + imported@(History finaltree _) <- buildImportTrees basetree subdir importable + return (imported, updatestate finaltree) where basetree = case importtreeconfig of ImportTree -> emptyTree @@ -112,21 +135,12 @@ buildImportCommit remote importtreeconfig importcommitconfig importable = ImportTree -> Nothing ImportSubTree dir _ -> Just dir - go trackingcommit = do - imported@(History finaltree _) <- - buildImportTrees basetree subdir importable - buildImportCommit' remote importcommitconfig trackingcommit imported >>= \case - Just finalcommit -> do - updatestate finaltree - return (Just finalcommit) - Nothing -> return Nothing - - updatestate committedtree = do + updatestate finaltree = do importedtree <- case subdir of - Nothing -> pure committedtree + Nothing -> pure finaltree Just dir -> let subtreeref = Ref $ - fromRef' committedtree + fromRef' finaltree <> ":" <> getTopFilePath dir in fromMaybe emptyTree @@ -308,9 +322,10 @@ importKeys :: Remote -> ImportTreeConfig -> Bool + -> Bool -> ImportableContents (ContentIdentifier, ByteSize) -> Annex (Maybe (ImportableContents (Either Sha Key))) -importKeys remote importtreeconfig importcontent importablecontents = do +importKeys remote importtreeconfig importcontent ignorelargefilesconfig importablecontents = do unless (canImportKeys remote importcontent) $ giveup "This remote does not support importing without downloading content." -- This map is used to remember content identifiers that @@ -400,7 +415,7 @@ importKeys remote importtreeconfig importcontent importablecontents = do let act = if importcontent then case Remote.importKey ia of Nothing -> dodownload - Just _ -> if Utility.Matcher.introspect matchNeedsFileContent matcher + Just _ -> if not ignorelargefilesconfig && Utility.Matcher.introspect matchNeedsFileContent matcher then dodownload else doimport else doimport @@ -410,7 +425,7 @@ importKeys remote importtreeconfig importcontent importablecontents = do case Remote.importKey ia of Nothing -> error "internal" -- checked earlier Just importkey -> do - when (Utility.Matcher.introspect matchNeedsFileContent matcher) $ + when (not ignorelargefilesconfig && Utility.Matcher.introspect matchNeedsFileContent matcher) $ giveup "annex.largefiles configuration examines file contents, so cannot import without content." let mi = MatchingInfo ProvidedInfo { providedFilePath = f @@ -419,7 +434,9 @@ importKeys remote importtreeconfig importcontent importablecontents = do , providedMimeType = Nothing , providedMimeEncoding = Nothing } - islargefile <- checkMatcher' matcher mi mempty + islargefile <- if ignorelargefilesconfig + then pure True + else checkMatcher' matcher mi mempty metered Nothing sz $ const $ if islargefile then doimportlarge importkey cidmap db loc cid sz f else doimportsmall cidmap db loc cid sz @@ -433,24 +450,26 @@ importKeys remote importtreeconfig importcontent importablecontents = do return Nothing where importer = do - unsizedk <- importkey loc cid - -- Don't display progress when generating - -- key, if the content will later be - -- downloaded, which is a more expensive - -- operation generally. - (if importcontent then nullMeterUpdate else p) - -- This avoids every remote needing - -- to add the size. - let k = alterKey unsizedk $ \kd -> kd - { keySize = keySize kd <|> Just sz } - checkSecureHashes k >>= \case - Nothing -> do - recordcidkey cidmap db cid k - logChange k (Remote.uuid remote) InfoPresent - if importcontent - then getcontent k - else return (Just (k, True)) - Just msg -> giveup (msg ++ " to import") + -- Don't display progress when generating + -- key, if the content will later be + -- downloaded, which is a more expensive + -- operation generally. + let p' = if importcontent then nullMeterUpdate else p + importkey loc cid p' >>= \case + Nothing -> return Nothing + Just unsizedk -> do + -- This avoids every remote needing + -- to add the size. + let k = alterKey unsizedk $ \kd -> kd + { keySize = keySize kd <|> Just sz } + checkSecureHashes k >>= \case + Nothing -> do + recordcidkey cidmap db cid k + logChange k (Remote.uuid remote) InfoPresent + if importcontent + then getcontent k + else return (Just (k, True)) + Just msg -> giveup (msg ++ " to import") getcontent :: Key -> Annex (Maybe (Key, Bool)) getcontent k = do @@ -533,7 +552,9 @@ importKeys remote importtreeconfig importcontent importablecontents = do , contentFile = Just tmpfile , matchKey = Nothing } - islargefile <- checkMatcher' matcher mi mempty + islargefile <- if ignorelargefilesconfig + then pure True + else checkMatcher' matcher mi mempty if islargefile then do backend <- chooseBackend f diff --git a/Command/Import.hs b/Command/Import.hs index fd788438f4..f383fd843f 100644 --- a/Command/Import.hs +++ b/Command/Import.hs @@ -306,7 +306,7 @@ seekRemote remote branch msubdir importcontent ci = do void $ includeCommandAction (listContents remote importtreeconfig ci importabletvar) liftIO (atomically (readTVar importabletvar)) >>= \case Nothing -> return () - Just importable -> importKeys remote importtreeconfig importcontent importable >>= \case + Just importable -> importKeys remote importtreeconfig importcontent False importable >>= \case Nothing -> warning $ concat [ "Failed to import some files from " , Remote.name remote diff --git a/Command/Sync.hs b/Command/Sync.hs index 1f1afc9c13..bb938fe42a 100644 --- a/Command/Sync.hs +++ b/Command/Sync.hs @@ -67,7 +67,7 @@ import Annex.UpdateInstead import Annex.Export import Annex.TaggedPush import Annex.CurrentBranch -import Annex.Import (canImportKeys) +import Annex.Import import Annex.CheckIgnore import Types.FileMatcher import qualified Database.Export as Export @@ -77,6 +77,7 @@ import Utility.Process.Transcript import Utility.Tuple import Control.Concurrent.MVar +import Control.Concurrent.STM import qualified Data.Map as M import qualified Data.ByteString as S import Data.Char @@ -463,6 +464,9 @@ pullRemote o mergeconfig remote branch = stopUnless (pure $ pullOption o && want importRemote :: Bool -> SyncOptions -> [Git.Merge.MergeConfig] -> Remote -> CurrBranch -> CommandSeek importRemote importcontent o mergeconfig remote currbranch | not (pullOption o) || not wantpull = noop + | Remote.thirdPartyPopulated (Remote.remotetype remote) = + when (canImportKeys remote importcontent) $ + importThirdPartyPopulated remote | otherwise = case remoteAnnexTrackingBranch (Remote.gitconfig remote) of Nothing -> noop Just tb -> do @@ -479,6 +483,34 @@ importRemote importcontent o mergeconfig remote currbranch where wantpull = remoteAnnexPull (Remote.gitconfig remote) +{- Import from a remote that is populated by a third party, by listing + - the contents of the remote, and then adding only the files on it that + - importKey identifies to a tree. The tree is only used to keep track + - of where keys are located on the remote, no remote tracking branch is + - updated, because the filenames are the names of annex object files, + - not suitable for a tracking branch. Does not transfer any content. -} +importThirdPartyPopulated :: Remote -> CommandSeek +importThirdPartyPopulated remote = do + importabletvar <- liftIO $ newTVarIO Nothing + void $ includeCommandAction (Command.Import.listContents remote ImportTree (CheckGitIgnore False) importabletvar) + liftIO (atomically (readTVar importabletvar)) >>= \case + Nothing -> return () + Just importable -> + importKeys remote ImportTree False False importable >>= \case + Just importablekeys -> go importablekeys + Nothing -> warning $ concat + [ "Failed to import from" + , Remote.name remote + ] + where + go importablekeys = void $ includeCommandAction $ starting "pull" ai si $ do + (_imported, updatestate) <- recordImportTree remote ImportTree importablekeys + next $ do + updatestate + return True + ai = ActionItemOther (Just (Remote.name remote)) + si = SeekInput [] + {- The remote probably has both a master and a synced/master branch. - Which to merge from? Well, the master has whatever latest changes - were committed (or pushed changes, if this is a bare remote), diff --git a/Remote/Adb.hs b/Remote/Adb.hs index de5b62d030..82459c8599 100644 --- a/Remote/Adb.hs +++ b/Remote/Adb.hs @@ -46,6 +46,7 @@ remote = specialRemoteType $ RemoteType , setup = adbSetup , exportSupported = exportIsSupported , importSupported = importIsSupported + , thirdPartyPopulated = False } androiddirectoryField :: RemoteConfigField diff --git a/Remote/BitTorrent.hs b/Remote/BitTorrent.hs index a60b58506c..61e660270a 100644 --- a/Remote/BitTorrent.hs +++ b/Remote/BitTorrent.hs @@ -49,6 +49,7 @@ remote = RemoteType , setup = error "not supported" , exportSupported = exportUnsupported , importSupported = importUnsupported + , thirdPartyPopulated = False } -- There is only one bittorrent remote, and it always exists. diff --git a/Remote/Bup.hs b/Remote/Bup.hs index f87888b688..8950ac4670 100644 --- a/Remote/Bup.hs +++ b/Remote/Bup.hs @@ -50,6 +50,7 @@ remote = specialRemoteType $ RemoteType , setup = bupSetup , exportSupported = exportUnsupported , importSupported = importUnsupported + , thirdPartyPopulated = False } buprepoField :: RemoteConfigField diff --git a/Remote/Ddar.hs b/Remote/Ddar.hs index 514b978474..4c544d28cd 100644 --- a/Remote/Ddar.hs +++ b/Remote/Ddar.hs @@ -45,6 +45,7 @@ remote = specialRemoteType $ RemoteType , setup = ddarSetup , exportSupported = exportUnsupported , importSupported = importUnsupported + , thirdPartyPopulated = False } ddarrepoField :: RemoteConfigField diff --git a/Remote/Directory.hs b/Remote/Directory.hs index 4be5850bb5..fd3072c14f 100644 --- a/Remote/Directory.hs +++ b/Remote/Directory.hs @@ -56,6 +56,7 @@ remote = specialRemoteType $ RemoteType , setup = directorySetup , exportSupported = exportIsSupported , importSupported = importIsSupported + , thirdPartyPopulated = False } directoryField :: RemoteConfigField @@ -369,13 +370,13 @@ guardSameContentIdentifiers cont old new | new == Just old = cont | otherwise = giveup "file content has changed" -importKeyM :: RawFilePath -> ExportLocation -> ContentIdentifier -> MeterUpdate -> Annex Key +importKeyM :: RawFilePath -> ExportLocation -> ContentIdentifier -> MeterUpdate -> Annex (Maybe Key) importKeyM dir loc cid p = do backend <- chooseBackend f k <- fst <$> genKey ks p backend currcid <- liftIO $ mkContentIdentifier absf =<< R.getFileStatus absf - guardSameContentIdentifiers (return k) cid currcid + guardSameContentIdentifiers (return (Just k)) cid currcid where f = fromExportLocation loc absf = dir P. f diff --git a/Remote/External.hs b/Remote/External.hs index 255c5e2456..4921bb5027 100644 --- a/Remote/External.hs +++ b/Remote/External.hs @@ -53,6 +53,7 @@ remote = specialRemoteType $ RemoteType , setup = externalSetup , exportSupported = checkExportSupported , importSupported = importUnsupported + , thirdPartyPopulated = False } externaltypeField :: RemoteConfigField diff --git a/Remote/GCrypt.hs b/Remote/GCrypt.hs index 73c5b7b50c..af1ddc712f 100644 --- a/Remote/GCrypt.hs +++ b/Remote/GCrypt.hs @@ -78,6 +78,7 @@ remote = specialRemoteType $ RemoteType , setup = gCryptSetup , exportSupported = exportUnsupported , importSupported = importUnsupported + , thirdPartyPopulated = False } gitRepoField :: RemoteConfigField diff --git a/Remote/Git.hs b/Remote/Git.hs index f9e0711401..62751a5607 100644 --- a/Remote/Git.hs +++ b/Remote/Git.hs @@ -87,6 +87,7 @@ remote = RemoteType , setup = gitSetup , exportSupported = exportUnsupported , importSupported = importUnsupported + , thirdPartyPopulated = False } locationField :: RemoteConfigField diff --git a/Remote/GitLFS.hs b/Remote/GitLFS.hs index d2c749359f..f9cf7f7512 100644 --- a/Remote/GitLFS.hs +++ b/Remote/GitLFS.hs @@ -74,6 +74,7 @@ remote = specialRemoteType $ RemoteType , setup = mySetup , exportSupported = exportUnsupported , importSupported = importUnsupported + , thirdPartyPopulated = False } urlField :: RemoteConfigField diff --git a/Remote/Glacier.hs b/Remote/Glacier.hs index a5420d9987..d4b2365226 100644 --- a/Remote/Glacier.hs +++ b/Remote/Glacier.hs @@ -48,6 +48,7 @@ remote = specialRemoteType $ RemoteType , setup = glacierSetup , exportSupported = exportUnsupported , importSupported = importUnsupported + , thirdPartyPopulated = False } datacenterField :: RemoteConfigField diff --git a/Remote/Helper/ExportImport.hs b/Remote/Helper/ExportImport.hs index b8d6390e4a..985cca4084 100644 --- a/Remote/Helper/ExportImport.hs +++ b/Remote/Helper/ExportImport.hs @@ -72,7 +72,7 @@ importIsSupported :: ParsedRemoteConfig -> RemoteGitConfig -> Annex Bool importIsSupported = \_ _ -> return True -- | Prevent or allow exporttree=yes and importtree=yes when --- setting up a new remote, depending on exportSupported and importSupported. +-- setting up a new remote, depending on the remote's capabilities. adjustExportImportRemoteType :: RemoteType -> RemoteType adjustExportImportRemoteType rt = rt { setup = setup' } where @@ -80,7 +80,7 @@ adjustExportImportRemoteType rt = rt { setup = setup' } pc <- either giveup return . parseRemoteConfig c =<< configParser rt c let checkconfig supported configured configfield cont = - ifM (supported rt pc gc) + ifM (supported rt pc gc <&&> pure (not (thirdPartyPopulated rt))) ( case st of Init | configured pc && encryptionIsEnabled pc -> @@ -102,35 +102,40 @@ adjustExportImportRemoteType rt = rt { setup = setup' } -- | Adjust a remote to support exporttree=yes and/or importree=yes. adjustExportImport :: Remote -> RemoteStateHandle -> Annex Remote adjustExportImport r rs = do - isexporttree <- pure (exportTree (config r)) <&&> isexporttreeSupported r - isimporttree <- pure (importTree (config r)) <&&> isimporttreeSupported r + isexport <- pure (exportTree (config r)) + <&&> isExportSupported r + -- When thirdPartyPopulated is True, the remote + -- does not need to be configured with importTree to support + -- imports. + isimport <- pure (importTree (config r) || not (thirdPartyPopulated (remotetype r))) + <&&> isImportSupported r let r' = r { remotetype = (remotetype r) - { exportSupported = if isexporttree + { exportSupported = if isexport then exportSupported (remotetype r) else exportUnsupported - , importSupported = if isimporttree + , importSupported = if isimport then importSupported (remotetype r) else importUnsupported } } - if not isexporttree && not isimporttree + if not isexport && not isimport then return r' - else adjustExportImport' isexporttree isimporttree r' rs + else adjustExportImport' isexport isimport r' rs adjustExportImport' :: Bool -> Bool -> Remote -> RemoteStateHandle -> Annex Remote -adjustExportImport' isexporttree isimporttree r rs = do +adjustExportImport' isexport isimport r rs = do dbv <- prepdbv ciddbv <- prepciddb - let normal = not isexporttree && not isimporttree + let normal = not isexport && not isimport let iskeyvaluestore = normal || appendonly r return $ r - { exportActions = if isexporttree - then if isimporttree + { exportActions = if isexport + then if isimport then exportActionsForImport dbv ciddbv (exportActions r) else exportActions r else exportUnsupported - , importActions = if isimporttree + , importActions = if isimport then importActions r else importUnsupported , storeKey = \k af p -> @@ -139,11 +144,13 @@ adjustExportImport' isexporttree isimporttree r rs = do -- when another repository has already stored the -- key, and the local repository does not know -- about it. To avoid unnecessary costs, don't do it. - if isexporttree - then giveup "remote is configured with exporttree=yes; use `git-annex export` to store content on it" - else if isimporttree - then giveup "remote is configured with importtree=yes and without exporttree=yes; cannot modify content stored on it" - else storeKey r k af p + if mergeable + then if isexport + then giveup "remote is configured with exporttree=yes; use `git-annex export` to store content on it" + else if isimport + then giveup "remote is configured with importtree=yes and without exporttree=yes; cannot modify content stored on it" + else storeKey r k af p + else storeKey r k af p , removeKey = \k -> -- Removing a key from an export would need to -- change the tree in the export log to not include @@ -151,19 +158,21 @@ adjustExportImport' isexporttree isimporttree r rs = do -- files would not be dealt with correctly. -- There does not seem to be a good use case for -- removing a key from an export in any case. - if isexporttree - then giveup "dropping content from an export is not supported; use `git annex export` to export a tree that lacks the files you want to remove" - else if isimporttree - then giveup "dropping content from this remote is not supported because it is configured with importtree=yes" - else removeKey r k - , lockContent = if iskeyvaluestore + if mergeable + then if isexport + then giveup "dropping content from an export is not supported; use `git annex export` to export a tree that lacks the files you want to remove" + else if isimport + then giveup "dropping content from this remote is not supported because it is configured with importtree=yes" + else removeKey r k + else removeKey r k + , lockContent = if iskeyvaluestore || not mergeable then lockContent r else Nothing , retrieveKeyFile = \k af dest p -> - if isimporttree + if isimport then supportappendonlyretrieve k af dest p $ retrieveKeyFileFromImport dbv ciddbv k af dest p - else if isexporttree + else if isexport then supportappendonlyretrieve k af dest p $ retrieveKeyFileFromExport dbv k af dest p else retrieveKeyFile r k af dest p @@ -172,16 +181,19 @@ adjustExportImport' isexporttree isimporttree r rs = do else Nothing , checkPresent = \k -> if appendonly r then checkPresent r k - else if isimporttree + else if isimport then anyM (checkPresentImport ciddbv k) =<< getexportlocs dbv k - else if isexporttree + else if isexport -- Check if any of the files a key -- was exported to are present. This -- doesn't guarantee the export -- contains the right content, - -- which is why export remotes - -- are untrusted. + -- if the remote is an export, + -- or if something else can write + -- to it. Remotes that have such + -- problems are made untrusted, + -- so it's not worried about here. then anyM (checkPresentExport (exportActions r) k) =<< getexportlocs dbv k else checkPresent r k @@ -201,17 +213,23 @@ adjustExportImport' isexporttree isimporttree r rs = do else return Nothing , getInfo = do is <- getInfo r - is' <- if isexporttree + is' <- if isexport && not mergeable then do ts <- map fromRef . exportedTreeishes <$> getExport (uuid r) return (is++[("exporttree", "yes"), ("exportedtree", unwords ts)]) else return is - return $ if isimporttree + return $ if isimport && not mergeable then (is'++[("importtree", "yes")]) else is' } where + -- When a remote is populated by a third party, a tree can be + -- imported from it, but that tree is not mergeable into the + -- user's own git branch. But annex objects found in the tree + -- (identified by importKey) can still be retrieved from the remote. + mergeable = thirdPartyPopulated (remotetype r) + -- exportActions adjusted to use the equivilant import actions, -- which take ContentIdentifiers into account. exportActionsForImport dbv ciddbv ea = ea diff --git a/Remote/Hook.hs b/Remote/Hook.hs index cc0ead39e7..89611113b7 100644 --- a/Remote/Hook.hs +++ b/Remote/Hook.hs @@ -40,6 +40,7 @@ remote = specialRemoteType $ RemoteType , setup = hookSetup , exportSupported = exportUnsupported , importSupported = importUnsupported + , thirdPartyPopulated = False } hooktypeField :: RemoteConfigField diff --git a/Remote/HttpAlso.hs b/Remote/HttpAlso.hs index 9411f3a842..7beb52426a 100644 --- a/Remote/HttpAlso.hs +++ b/Remote/HttpAlso.hs @@ -41,6 +41,7 @@ remote = RemoteType , setup = httpAlsoSetup , exportSupported = exportIsSupported , importSupported = importUnsupported + , thirdPartyPopulated = False } urlField :: RemoteConfigField diff --git a/Remote/P2P.hs b/Remote/P2P.hs index 5016c9f059..859205bfcc 100644 --- a/Remote/P2P.hs +++ b/Remote/P2P.hs @@ -41,6 +41,7 @@ remote = RemoteType , setup = error "P2P remotes are set up using git-annex p2p" , exportSupported = exportUnsupported , importSupported = importUnsupported + , thirdPartyPopulated = False } chainGen :: P2PAddress -> Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle -> Annex (Maybe Remote) diff --git a/Remote/Rsync.hs b/Remote/Rsync.hs index 813c0f1bbd..7627fbd2c6 100644 --- a/Remote/Rsync.hs +++ b/Remote/Rsync.hs @@ -60,6 +60,7 @@ remote = specialRemoteType $ RemoteType , setup = rsyncSetup , exportSupported = exportIsSupported , importSupported = importUnsupported + , thirdPartyPopulated = False } shellEscapeField :: RemoteConfigField diff --git a/Remote/S3.hs b/Remote/S3.hs index b35a62186e..661e0ab7fa 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -118,6 +118,7 @@ remote = specialRemoteType $ RemoteType , setup = s3Setup , exportSupported = exportIsSupported , importSupported = importIsSupported + , thirdPartyPopulated = False } bucketField :: RemoteConfigField diff --git a/Remote/Tahoe.hs b/Remote/Tahoe.hs index 72df4c70e8..3fbae0df9b 100644 --- a/Remote/Tahoe.hs +++ b/Remote/Tahoe.hs @@ -67,6 +67,7 @@ remote = specialRemoteType $ RemoteType , setup = tahoeSetup , exportSupported = exportUnsupported , importSupported = importUnsupported + , thirdPartyPopulated = False } scsField :: RemoteConfigField diff --git a/Remote/Web.hs b/Remote/Web.hs index bbe24b38fc..f5de143c27 100644 --- a/Remote/Web.hs +++ b/Remote/Web.hs @@ -32,6 +32,7 @@ remote = RemoteType , setup = error "not supported" , exportSupported = exportUnsupported , importSupported = importUnsupported + , thirdPartyPopulated = False } -- There is only one web remote, and it always exists. diff --git a/Remote/WebDAV.hs b/Remote/WebDAV.hs index b649ae0444..5163942b47 100644 --- a/Remote/WebDAV.hs +++ b/Remote/WebDAV.hs @@ -57,6 +57,7 @@ remote = specialRemoteType $ RemoteType , setup = webdavSetup , exportSupported = exportIsSupported , importSupported = importUnsupported + , thirdPartyPopulated = False } urlField :: RemoteConfigField diff --git a/Types/Remote.hs b/Types/Remote.hs index 7b5f1ac0e5..e04ff8d938 100644 --- a/Types/Remote.hs +++ b/Types/Remote.hs @@ -63,10 +63,15 @@ data RemoteTypeA a = RemoteType , configParser :: RemoteConfig -> a RemoteConfigParser -- initializes or enables a remote , setup :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> a (RemoteConfig, UUID) - -- check if a remote of this type is able to support export of trees + -- check if a remote of this type is able to support export , exportSupported :: ParsedRemoteConfig -> RemoteGitConfig -> a Bool - -- check if a remote of this type is able to support import of trees + -- check if a remote of this type is able to support import , importSupported :: ParsedRemoteConfig -> RemoteGitConfig -> a Bool + -- is a remote of this type not a usual key/value store, + -- or export/import of a tree of files, but instead a collection + -- of files, populated by something outside git-annex, some of + -- which may be annex objects? + , thirdPartyPopulated :: Bool } instance Eq (RemoteTypeA a) where @@ -113,9 +118,9 @@ data RemoteA a = Remote -- Some remotes can checkPresent without an expensive network -- operation. , checkPresentCheap :: Bool - -- Some remotes support export of trees of files. + -- Some remotes support export. , exportActions :: ExportActions a - -- Some remotes support import of trees of files. + -- Some remotes support import. , importActions :: ImportActions a -- Some remotes can provide additional details for whereis. , whereisKey :: Maybe (Key -> a [String]) @@ -288,8 +293,13 @@ data ImportActions a = ImportActions -- bearing in mind that the file on the remote may have changed -- since the ContentIdentifier was generated. -- - -- Throws exception on failure. - , importKey :: Maybe (ImportLocation -> ContentIdentifier -> MeterUpdate -> a Key) + -- When the remote is thirdPartyPopulated, this should check if the + -- file stored on the remote is the content of an annex object, + -- and return its Key, or Nothing if it is not. Should not + -- otherwise return Nothing. + -- + -- Throws exception on failure to access the remote. + , importKey :: Maybe (ImportLocation -> ContentIdentifier -> MeterUpdate -> a (Maybe Key)) -- Retrieves a file from the remote. Ensures that the file -- it retrieves has the requested ContentIdentifier. -- From 3207e8293b4bd1aee8d1ea599a537bfb80290d1b Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 18 Dec 2020 16:03:51 -0400 Subject: [PATCH 05/24] start borg special remote Compiles, but unusable so far. --- Remote/Borg.hs | 105 +++++++++++++++++++ Remote/List.hs | 2 + Types/GitConfig.hs | 2 + doc/devblog/day_637__thirdparty_of_borg.mdwn | 17 +++ doc/git-annex.mdwn | 6 ++ doc/special_remotes/borg.mdwn | 4 +- git-annex.cabal | 1 + 7 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 Remote/Borg.hs create mode 100644 doc/devblog/day_637__thirdparty_of_borg.mdwn diff --git a/Remote/Borg.hs b/Remote/Borg.hs new file mode 100644 index 0000000000..80e5be2964 --- /dev/null +++ b/Remote/Borg.hs @@ -0,0 +1,105 @@ +{- Using borg as a remote. + - + - Copyright 2020 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. + -} + +module Remote.Borg (remote) where + +import Annex.Common +import Types.Remote +import Types.Creds +import qualified Git +import Config +import Config.Cost +import Annex.SpecialRemote.Config +import Remote.Helper.Special +import Remote.Helper.ExportImport +import Annex.UUID +import Types.ProposedAccepted + +import qualified Data.Map as M + +type BorgRepo = String + +remote :: RemoteType +remote = RemoteType + { typename = "borg" + , enumerate = const (findSpecialRemotes "borgrepo") + , generate = gen + , configParser = mkRemoteConfigParser + [ optionalStringParser borgrepoField + (FieldDesc "(required) borg repository to use") + ] + , setup = borgSetup + , exportSupported = exportUnsupported + , importSupported = importIsSupported + , thirdPartyPopulated = True + } + +borgrepoField :: RemoteConfigField +borgrepoField = Accepted "borgrepo" + +gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle -> Annex (Maybe Remote) +gen r u rc gc rs = do + c <- parsedRemoteConfig remote rc + cst <- remoteCost gc $ + if borgLocal borgrepo + then nearlyCheapRemoteCost + else expensiveRemoteCost + return $ Just $ Remote + { uuid = u + , cost = cst + , name = Git.repoDescribe r + , storeKey = storeKeyDummy + , retrieveKeyFile = retrieveKeyFileDummy + , retrieveKeyFileCheap = Nothing + -- Borg cryptographically verifies content. + , retrievalSecurityPolicy = RetrievalAllKeysSecure + , removeKey = removeKeyDummy + , lockContent = Nothing + , checkPresent = checkPresentDummy + , checkPresentCheap = borgLocal borgrepo + , exportActions = exportUnsupported + , importActions = importUnsupported + , whereisKey = Nothing + , remoteFsck = Nothing + , repairRepo = Nothing + , config = c + , getRepo = return r + , gitconfig = gc + , localpath = if borgLocal borgrepo && not (null borgrepo) + then Just borgrepo + else Nothing + , remotetype = remote + , availability = if borgLocal borgrepo then LocallyAvailable else GloballyAvailable + , readonly = False + , appendonly = False + , mkUnavailable = return Nothing + , getInfo = return [("repo", borgrepo)] + , claimUrl = Nothing + , checkUrl = Nothing + , remoteStateHandle = rs + } + where + borgrepo = fromMaybe (giveup "missing borgrepo") $ remoteAnnexBorgRepo gc + +borgSetup :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID) +borgSetup _ mu _ c _gc = do + u <- maybe (liftIO genUUID) return mu + + -- verify configuration is sane + let borgrepo = maybe (giveup "Specify borgrepo=") fromProposedAccepted $ + M.lookup borgrepoField c + + -- The borgrepo is stored in git config, as well as this repo's + -- persistant state, so it can vary between hosts. + gitConfigSpecialRemote u c [("borgrepo", borgrepo)] + + -- TODO: untrusted by default, but allow overriding that + + return (c, u) + +borgLocal :: BorgRepo -> Bool +borgLocal = notElem ':' diff --git a/Remote/List.hs b/Remote/List.hs index 7695eec902..8ca9d8f794 100644 --- a/Remote/List.hs +++ b/Remote/List.hs @@ -36,6 +36,7 @@ import qualified Remote.Glacier import qualified Remote.Ddar import qualified Remote.GitLFS import qualified Remote.HttpAlso +import qualified Remote.Borg import qualified Remote.Hook import qualified Remote.External @@ -57,6 +58,7 @@ remoteTypes = map adjustExportImportRemoteType , Remote.Ddar.remote , Remote.GitLFS.remote , Remote.HttpAlso.remote + , Remote.Borg.remote , Remote.Hook.remote , Remote.External.remote ] diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs index cb50c79666..74df213419 100644 --- a/Types/GitConfig.hs +++ b/Types/GitConfig.hs @@ -327,6 +327,7 @@ data RemoteGitConfig = RemoteGitConfig , remoteAnnexGnupgDecryptOptions :: [String] , remoteAnnexRsyncUrl :: Maybe String , remoteAnnexBupRepo :: Maybe String + , remoteAnnexBorgRepo :: Maybe String , remoteAnnexTahoe :: Maybe FilePath , remoteAnnexBupSplitOptions :: [String] , remoteAnnexDirectory :: Maybe FilePath @@ -391,6 +392,7 @@ extractRemoteGitConfig r remotename = do , remoteAnnexGnupgDecryptOptions = getoptions "gnupg-decrypt-options" , remoteAnnexRsyncUrl = notempty $ getmaybe "rsyncurl" , remoteAnnexBupRepo = getmaybe "buprepo" + , remoteAnnexBorgRepo = getmaybe "borgrepo" , remoteAnnexTahoe = getmaybe "tahoe" , remoteAnnexBupSplitOptions = getoptions "bup-split-options" , remoteAnnexDirectory = notempty $ getmaybe "directory" diff --git a/doc/devblog/day_637__thirdparty_of_borg.mdwn b/doc/devblog/day_637__thirdparty_of_borg.mdwn new file mode 100644 index 0000000000..5d4a34c999 --- /dev/null +++ b/doc/devblog/day_637__thirdparty_of_borg.mdwn @@ -0,0 +1,17 @@ +Finally gotten started on the borg special remote idea. A prerequisite of +that is remotes that can be imported from, but not exported to. So I +actually started by allowing setting importtree=yes without +exporttree=yes. A lot of code had assumptions about that not being allowed, +so it took a while to chase down everything. Finished most of that yesterday. + +What I've done today is added a `thirdPartyPopulated` type of remote, +which `git-annex sync` can "pull" from by using the existing import +interface to list files on it, and determine which of them are annex object +files. I have not started on the actual borg remote at all, but this should +be all the groundwork for it done. + +(I also finished up annex.stalldetection earlier this week.) + +--- + +This work was sponsored by Jake Vosloo [on Patreon](https://patreon.com/joeyh). diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 6280bcb864..45306545c7 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -1541,6 +1541,12 @@ Remotes are configured using these settings in `.git/config`. the location of the bup repository to use. Normally this is automatically set up by `git annex initremote`, but you can change it if needed. +* `remote..annex-borgrepo` + + Used by borg special remotes, this configures + the location of the borg repository to use. Normally this is automatically + set up by `git annex initremote`, but you can change it if needed. + * `remote..annex-ddarrepo` Used by ddar special remotes, this configures diff --git a/doc/special_remotes/borg.mdwn b/doc/special_remotes/borg.mdwn index 8869288e88..42f7fefde1 100644 --- a/doc/special_remotes/borg.mdwn +++ b/doc/special_remotes/borg.mdwn @@ -11,7 +11,7 @@ the annexed files that are stored in the borg repository. These parameters can be passed to `git annex initremote` to configure the remote: -* `repository` - The location of a borg repository, eg a path, or +* `borgrepo` - The location of a borg repository, eg a path, or `user@host:path` for ssh access. * `scan` - The path, within the borg repository, to scan for @@ -27,7 +27,7 @@ remote: ## setup example # borg init --encryption=keyfile /path/to/borgrepo - # git annex initremote borg type=borg repository=/path/to/borgrepo scan=`pwd` + # git annex initremote borg type=borg borgrepo=/path/to/borgrepo scan=`pwd` # borg create /path/to/borgrepo `pwd`::{now} # git annex sync borg diff --git a/git-annex.cabal b/git-annex.cabal index 48a1ed5d5c..e035eedd28 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -940,6 +940,7 @@ Executable git-annex Remote Remote.Adb Remote.BitTorrent + Remote.Borg Remote.Bup Remote.Ddar Remote.Directory From 1c054f1cf7fc0f800fec02fb857159f6bf1f9603 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 18 Dec 2020 16:52:49 -0400 Subject: [PATCH 06/24] started borg special remote Still need to implement 3 methods, but importKeyM looks like it will work well to find annex object files. --- Annex/Import.hs | 2 +- Remote/Borg.hs | 66 ++++++++++++++++++++++++++++++++++++++++++++- Remote/Directory.hs | 4 +-- Types/Remote.hs | 2 +- 4 files changed, 69 insertions(+), 5 deletions(-) diff --git a/Annex/Import.hs b/Annex/Import.hs index c5a16f3d86..ffcf5484f6 100644 --- a/Annex/Import.hs +++ b/Annex/Import.hs @@ -455,7 +455,7 @@ importKeys remote importtreeconfig importcontent ignorelargefilesconfig importab -- downloaded, which is a more expensive -- operation generally. let p' = if importcontent then nullMeterUpdate else p - importkey loc cid p' >>= \case + importkey loc cid sz p' >>= \case Nothing -> return Nothing Just unsizedk -> do -- This avoids every remote needing diff --git a/Remote/Borg.hs b/Remote/Borg.hs index 80e5be2964..823cceca21 100644 --- a/Remote/Borg.hs +++ b/Remote/Borg.hs @@ -5,11 +5,14 @@ - Licensed under the GNU AGPL version 3 or higher. -} +{-# LANGUAGE OverloadedStrings #-} + module Remote.Borg (remote) where import Annex.Common import Types.Remote import Types.Creds +import Types.Import import qualified Git import Config import Config.Cost @@ -18,7 +21,10 @@ import Remote.Helper.Special import Remote.Helper.ExportImport import Annex.UUID import Types.ProposedAccepted +import Crypto (isEncKey) +import Utility.Metered +import qualified System.FilePath.ByteString as P import qualified Data.Map as M type BorgRepo = String @@ -62,7 +68,17 @@ gen r u rc gc rs = do , checkPresent = checkPresentDummy , checkPresentCheap = borgLocal borgrepo , exportActions = exportUnsupported - , importActions = importUnsupported + , importActions = ImportActions + { listImportableContents = listImportableContentsM borgrepo + , importKey = Just importKeyM + , retrieveExportWithContentIdentifier = retrieveExportWithContentIdentifierM borgrepo + , checkPresentExportWithContentIdentifier = checkPresentExportWithContentIdentifierM borgrepo + -- This remote is thirdPartyPopulated, so these + -- actions will never be used. + , storeExportWithContentIdentifier = storeExportWithContentIdentifier importUnsupported + , removeExportDirectoryWhenEmpty = removeExportDirectoryWhenEmpty importUnsupported + , removeExportWithContentIdentifier = removeExportWithContentIdentifier importUnsupported + } , whereisKey = Nothing , remoteFsck = Nothing , repairRepo = Nothing @@ -103,3 +119,51 @@ borgSetup _ mu _ c _gc = do borgLocal :: BorgRepo -> Bool borgLocal = notElem ':' + +listImportableContentsM :: BorgRepo -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) +listImportableContentsM borgrepo = error "TODO" + +-- Since this remote is thirdPartyPopulated, this needs to +-- find only those ImportLocations that are annex object files. All other +-- files in the borg backup are ignored. +importKeyM :: ImportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> Annex (Maybe Key) +importKeyM loc cid sz _ = return $ case deserializeKey' f of + Just k + -- Annex objects always are in a subdirectory with the same + -- name as the filename. If this is not the case for the file + -- that was backed up, it is probably not a valid annex object. + -- Eg, it could be something in annex/bad/, or annex/tmp/. + -- Or it could be a file that only happens to have a name + -- like an annex object. + -- (This does unfortunately prevent recognizing files that are + -- part of special remotes that don't use that layout. The most + -- likely special remote to be in a backup, the directory + -- special remote, does use that layout at least.) + | lastMaybe (P.splitDirectories (P.dropFileName p)) /= Just f -> Nothing + -- Chunked or encrypted keys used in special remotes are not + -- supported. + | isChunkKey k || isEncKey k -> Nothing + -- Check that the size of the key is the same as the size of the + -- file stored in borg. This is a cheap way to make sure it's + -- probabably the actual content of the file. We don't fully + -- verify the content here because that could be a very + -- expensive operation for a large repository; if the user + -- wants to detect every possible data corruption problem + -- (eg, wrong data read off disk during backup, or the object + -- was corrupt in the git-annex repo and that bad object got + -- backed up), they can fsck the remote. + | otherwise -> case fromKey keySize k of + Just sz' + | sz' == sz -> Just k + | otherwise -> Nothing + Nothing -> Just k + Nothing -> Nothing + where + p = fromImportLocation loc + f = P.takeFileName p + +retrieveExportWithContentIdentifierM :: BorgRepo -> ImportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key +retrieveExportWithContentIdentifierM borgrepo loc cid dest k p = error "TODO" + +checkPresentExportWithContentIdentifierM :: BorgRepo -> Key -> ImportLocation -> [ContentIdentifier] -> Annex Bool +checkPresentExportWithContentIdentifierM borgrepo k loc cids = error "TODO" diff --git a/Remote/Directory.hs b/Remote/Directory.hs index fd3072c14f..009c160339 100644 --- a/Remote/Directory.hs +++ b/Remote/Directory.hs @@ -370,8 +370,8 @@ guardSameContentIdentifiers cont old new | new == Just old = cont | otherwise = giveup "file content has changed" -importKeyM :: RawFilePath -> ExportLocation -> ContentIdentifier -> MeterUpdate -> Annex (Maybe Key) -importKeyM dir loc cid p = do +importKeyM :: RawFilePath -> ExportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> Annex (Maybe Key) +importKeyM dir loc cid _sz p = do backend <- chooseBackend f k <- fst <$> genKey ks p backend currcid <- liftIO $ mkContentIdentifier absf diff --git a/Types/Remote.hs b/Types/Remote.hs index e04ff8d938..9085b7311a 100644 --- a/Types/Remote.hs +++ b/Types/Remote.hs @@ -299,7 +299,7 @@ data ImportActions a = ImportActions -- otherwise return Nothing. -- -- Throws exception on failure to access the remote. - , importKey :: Maybe (ImportLocation -> ContentIdentifier -> MeterUpdate -> a (Maybe Key)) + , importKey :: Maybe (ImportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> a (Maybe Key)) -- Retrieves a file from the remote. Ensures that the file -- it retrieves has the requested ContentIdentifier. -- From ca31d7e54fefe6e88880caaa12a090041cef7ba9 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 18 Dec 2020 17:04:46 -0400 Subject: [PATCH 07/24] refactor That code was not borg specific, and I can see making more remotes for other backup software. --- Remote/Borg.hs | 44 ++--------------------------- Remote/Helper/ThirdParty.hs | 56 +++++++++++++++++++++++++++++++++++++ git-annex.cabal | 1 + 3 files changed, 59 insertions(+), 42 deletions(-) create mode 100644 Remote/Helper/ThirdParty.hs diff --git a/Remote/Borg.hs b/Remote/Borg.hs index 823cceca21..3f0c2b68cc 100644 --- a/Remote/Borg.hs +++ b/Remote/Borg.hs @@ -21,10 +21,9 @@ import Remote.Helper.Special import Remote.Helper.ExportImport import Annex.UUID import Types.ProposedAccepted -import Crypto (isEncKey) import Utility.Metered +import qualified Remote.Helper.ThirdParty as ThirdParty -import qualified System.FilePath.ByteString as P import qualified Data.Map as M type BorgRepo = String @@ -70,7 +69,7 @@ gen r u rc gc rs = do , exportActions = exportUnsupported , importActions = ImportActions { listImportableContents = listImportableContentsM borgrepo - , importKey = Just importKeyM + , importKey = Just ThirdParty.importKey , retrieveExportWithContentIdentifier = retrieveExportWithContentIdentifierM borgrepo , checkPresentExportWithContentIdentifier = checkPresentExportWithContentIdentifierM borgrepo -- This remote is thirdPartyPopulated, so these @@ -123,45 +122,6 @@ borgLocal = notElem ':' listImportableContentsM :: BorgRepo -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) listImportableContentsM borgrepo = error "TODO" --- Since this remote is thirdPartyPopulated, this needs to --- find only those ImportLocations that are annex object files. All other --- files in the borg backup are ignored. -importKeyM :: ImportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> Annex (Maybe Key) -importKeyM loc cid sz _ = return $ case deserializeKey' f of - Just k - -- Annex objects always are in a subdirectory with the same - -- name as the filename. If this is not the case for the file - -- that was backed up, it is probably not a valid annex object. - -- Eg, it could be something in annex/bad/, or annex/tmp/. - -- Or it could be a file that only happens to have a name - -- like an annex object. - -- (This does unfortunately prevent recognizing files that are - -- part of special remotes that don't use that layout. The most - -- likely special remote to be in a backup, the directory - -- special remote, does use that layout at least.) - | lastMaybe (P.splitDirectories (P.dropFileName p)) /= Just f -> Nothing - -- Chunked or encrypted keys used in special remotes are not - -- supported. - | isChunkKey k || isEncKey k -> Nothing - -- Check that the size of the key is the same as the size of the - -- file stored in borg. This is a cheap way to make sure it's - -- probabably the actual content of the file. We don't fully - -- verify the content here because that could be a very - -- expensive operation for a large repository; if the user - -- wants to detect every possible data corruption problem - -- (eg, wrong data read off disk during backup, or the object - -- was corrupt in the git-annex repo and that bad object got - -- backed up), they can fsck the remote. - | otherwise -> case fromKey keySize k of - Just sz' - | sz' == sz -> Just k - | otherwise -> Nothing - Nothing -> Just k - Nothing -> Nothing - where - p = fromImportLocation loc - f = P.takeFileName p - retrieveExportWithContentIdentifierM :: BorgRepo -> ImportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key retrieveExportWithContentIdentifierM borgrepo loc cid dest k p = error "TODO" diff --git a/Remote/Helper/ThirdParty.hs b/Remote/Helper/ThirdParty.hs new file mode 100644 index 0000000000..6ce106fa12 --- /dev/null +++ b/Remote/Helper/ThirdParty.hs @@ -0,0 +1,56 @@ +{- Helpers for thirdPartyPopulated remotes + - + - Copyright 2020 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. + -} + +module Remote.Helper.ThirdParty where + +import Annex.Common +import Types.Remote +import Types.Import +import Crypto (isEncKey) +import Utility.Metered + +import qualified System.FilePath.ByteString as P + +-- When a remote is thirdPartyPopulated, and contains a backup of a +-- git-annex repository or some special remotes, this can be used to +-- find only those ImportLocations that are annex object files. +-- All other ImportLocations are ignored. +importKey :: ImportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> Annex (Maybe Key) +importKey loc _cid sz _ = return $ case deserializeKey' f of + Just k + -- Annex objects always are in a subdirectory with the same + -- name as the filename. If this is not the case for the file + -- that was backed up, it is probably not a valid annex object. + -- Eg, it could be something in annex/bad/, or annex/tmp/. + -- Or it could be a file that only happens to have a name + -- like an annex object. + -- (This does unfortunately prevent recognizing files that are + -- part of special remotes that don't use that layout. The most + -- likely special remote to be in a backup, the directory + -- special remote, does use that layout at least.) + | lastMaybe (P.splitDirectories (P.dropFileName p)) /= Just f -> Nothing + -- Chunked or encrypted keys used in special remotes are not + -- supported. + | isChunkKey k || isEncKey k -> Nothing + -- Check that the size of the key is the same as the size of the + -- file stored in the backup. This is a cheap way to make sure it's + -- probabably the actual content of the file. We don't fully + -- verify the content here because that could be a very + -- expensive operation for a large repository; if the user + -- wants to detect every possible data corruption problem + -- (eg, wrong data read off disk during backup, or the object + -- was corrupt in the git-annex repo and that bad object got + -- backed up), they can fsck the remote. + | otherwise -> case fromKey keySize k of + Just sz' + | sz' == sz -> Just k + | otherwise -> Nothing + Nothing -> Just k + Nothing -> Nothing + where + p = fromImportLocation loc + f = P.takeFileName p diff --git a/git-annex.cabal b/git-annex.cabal index e035eedd28..f39146f25b 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -963,6 +963,7 @@ Executable git-annex Remote.Helper.Messages Remote.Helper.P2P Remote.Helper.ReadOnly + Remote.Helper.ThirdParty Remote.Helper.Special Remote.Helper.Ssh Remote.HttpAlso From 706e2a63fb536d8bb4ce156c20e5a2f6f734d8b7 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 21 Dec 2020 13:24:07 -0400 Subject: [PATCH 08/24] fix logic error in thirdPartyPopulated handling --- Remote/Helper/ExportImport.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Remote/Helper/ExportImport.hs b/Remote/Helper/ExportImport.hs index 985cca4084..573db92147 100644 --- a/Remote/Helper/ExportImport.hs +++ b/Remote/Helper/ExportImport.hs @@ -107,7 +107,7 @@ adjustExportImport r rs = do -- When thirdPartyPopulated is True, the remote -- does not need to be configured with importTree to support -- imports. - isimport <- pure (importTree (config r) || not (thirdPartyPopulated (remotetype r))) + isimport <- pure (importTree (config r) || thirdPartyPopulated (remotetype r)) <&&> isImportSupported r let r' = r { remotetype = (remotetype r) From 57b03630b36bf0ed9920ffd5151d14d62ae8f6a9 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 21 Dec 2020 13:46:04 -0400 Subject: [PATCH 09/24] support thirdPartyPopulated These don't have importTree in their config, because they don't support tree import, but they do still support import, and do not support export or key/value modification. --- Annex/Drop.hs | 3 ++- Assistant/DaemonStatus.hs | 3 ++- Command/Sync.hs | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Annex/Drop.hs b/Annex/Drop.hs index 1a6af18586..fca96c1530 100644 --- a/Annex/Drop.hs +++ b/Annex/Drop.hs @@ -11,7 +11,7 @@ import Annex.Common import qualified Annex import Logs.Trust import Annex.NumCopies -import Types.Remote (uuid, appendonly, config) +import Types.Remote (uuid, appendonly, config, remotetype, thirdPartyPopulated) import qualified Remote import qualified Command.Drop import Command @@ -88,6 +88,7 @@ handleDropsFrom locs rs reason fromhere key afile si preverified runner = do | appendonly r = go fs rest n | exportTree (config r) = go fs rest n | importTree (config r) = go fs rest n + | thirdPartyPopulated (remotetype r) = go fs rest n | checkcopies n (Just $ Remote.uuid r) = dropr fs r n >>= go fs rest | otherwise = pure n diff --git a/Assistant/DaemonStatus.hs b/Assistant/DaemonStatus.hs index d7c5819951..34e34073ba 100644 --- a/Assistant/DaemonStatus.hs +++ b/Assistant/DaemonStatus.hs @@ -57,7 +57,8 @@ calcSyncRemotes = do contentremotes <- filterM (not <$$> liftIO . getDynamicConfig . remoteAnnexIgnore . Remote.gitconfig) $ filter (\r -> Remote.uuid r /= NoUUID) syncable let (exportremotes, nonexportremotes) = partition (exportTree . Remote.config) contentremotes - let dataremotes = filter (not . importTree . Remote.config) nonexportremotes + let isimport r = importTree (Remote.config r) || Remote.thirdPartyPopulated (Remote.remotetype r) + let dataremotes = filter (not . isimport) nonexportremotes return $ \dstatus -> dstatus { syncRemotes = syncable diff --git a/Command/Sync.hs b/Command/Sync.hs index b4154b798d..613afe8639 100644 --- a/Command/Sync.hs +++ b/Command/Sync.hs @@ -212,8 +212,9 @@ seek' o = do dataremotes <- filter (\r -> Remote.uuid r /= NoUUID) <$> filterM (not <$$> liftIO . getDynamicConfig . remoteAnnexIgnore . Remote.gitconfig) remotes let (exportremotes, nonexportremotes) = partition (exportTree . Remote.config) dataremotes - let importremotes = filter (importTree . Remote.config) dataremotes - let keyvalueremotes = filter (not . importTree . Remote.config) nonexportremotes + let isimport r = importTree (Remote.config r) || Remote.thirdPartyPopulated (Remote.remotetype r) + let importremotes = filter isimport dataremotes + let keyvalueremotes = filter (not . isimport) nonexportremotes if cleanupOption o then do From 15000dee07a06d04285351616915794bd6ec7f14 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 21 Dec 2020 16:03:27 -0400 Subject: [PATCH 10/24] improve thirdpartypopulated support May actually work now. Note that, importKey now has to add the size to the key if it's supposed to have size. Remote.Directory relied on the importer adding the size, which is no longer done, so it was changed; it was the only one. This way, importKey does not need to behave differently between regular and thirdpartypopulated imports. --- Annex/Import.hs | 55 +++++++++++-------- Command/Import.hs | 16 ++++-- Command/Sync.hs | 29 ++++------ Remote/Directory.hs | 6 +- .../{ThirdParty.hs => ThirdPartyPopulated.hs} | 0 5 files changed, 56 insertions(+), 50 deletions(-) rename Remote/Helper/{ThirdParty.hs => ThirdPartyPopulated.hs} (100%) diff --git a/Annex/Import.hs b/Annex/Import.hs index ffcf5484f6..0bc0c44c83 100644 --- a/Annex/Import.hs +++ b/Annex/Import.hs @@ -325,7 +325,7 @@ importKeys -> Bool -> ImportableContents (ContentIdentifier, ByteSize) -> Annex (Maybe (ImportableContents (Either Sha Key))) -importKeys remote importtreeconfig importcontent ignorelargefilesconfig importablecontents = do +importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents = do unless (canImportKeys remote importcontent) $ giveup "This remote does not support importing without downloading content." -- This map is used to remember content identifiers that @@ -347,7 +347,9 @@ importKeys remote importtreeconfig importcontent ignorelargefilesconfig importab go oldversion cidmap importing (ImportableContents l h) db = do largematcher <- largeFilesMatcher jobs <- forM l $ \i -> - startimport cidmap importing db i oldversion largematcher + if thirdpartypopulated + then thirdpartypopulatedimport cidmap db i + else startimport cidmap importing db i oldversion largematcher l' <- liftIO $ forM jobs $ either pure (atomically . takeTMVar) if any isNothing l' @@ -406,6 +408,20 @@ importKeys remote importtreeconfig importcontent ignorelargefilesconfig importab importaction return (Right job) + thirdpartypopulatedimport cidmap db (loc, (cid, sz)) = + case Remote.importKey ia of + Nothing -> return $ Left Nothing + Just importkey -> + tryNonAsync (importkey loc cid sz nullMeterUpdate) >>= \case + Right (Just k) -> do + recordcidkey cidmap db cid k + logChange k (Remote.uuid remote) InfoPresent + return $ Left $ Just (loc, Right k) + Right Nothing -> return $ Left Nothing + Left e -> do + warning (show e) + return $ Left Nothing + importordownload cidmap db (loc, (cid, sz)) largematcher= do f <- locworktreefile loc matcher <- largematcher f @@ -415,7 +431,7 @@ importKeys remote importtreeconfig importcontent ignorelargefilesconfig importab let act = if importcontent then case Remote.importKey ia of Nothing -> dodownload - Just _ -> if not ignorelargefilesconfig && Utility.Matcher.introspect matchNeedsFileContent matcher + Just _ -> if Utility.Matcher.introspect matchNeedsFileContent matcher then dodownload else doimport else doimport @@ -425,7 +441,7 @@ importKeys remote importtreeconfig importcontent ignorelargefilesconfig importab case Remote.importKey ia of Nothing -> error "internal" -- checked earlier Just importkey -> do - when (not ignorelargefilesconfig && Utility.Matcher.introspect matchNeedsFileContent matcher) $ + when (Utility.Matcher.introspect matchNeedsFileContent matcher) $ giveup "annex.largefiles configuration examines file contents, so cannot import without content." let mi = MatchingInfo ProvidedInfo { providedFilePath = f @@ -434,9 +450,7 @@ importKeys remote importtreeconfig importcontent ignorelargefilesconfig importab , providedMimeType = Nothing , providedMimeEncoding = Nothing } - islargefile <- if ignorelargefilesconfig - then pure True - else checkMatcher' matcher mi mempty + islargefile <- checkMatcher' matcher mi mempty metered Nothing sz $ const $ if islargefile then doimportlarge importkey cidmap db loc cid sz f else doimportsmall cidmap db loc cid sz @@ -457,20 +471,15 @@ importKeys remote importtreeconfig importcontent ignorelargefilesconfig importab let p' = if importcontent then nullMeterUpdate else p importkey loc cid sz p' >>= \case Nothing -> return Nothing - Just unsizedk -> do - -- This avoids every remote needing - -- to add the size. - let k = alterKey unsizedk $ \kd -> kd - { keySize = keySize kd <|> Just sz } - checkSecureHashes k >>= \case - Nothing -> do - recordcidkey cidmap db cid k - logChange k (Remote.uuid remote) InfoPresent - if importcontent - then getcontent k - else return (Just (k, True)) - Just msg -> giveup (msg ++ " to import") - + Just k -> checkSecureHashes k >>= \case + Nothing -> do + recordcidkey cidmap db cid k + logChange k (Remote.uuid remote) InfoPresent + if importcontent + then getcontent k + else return (Just (k, True)) + Just msg -> giveup (msg ++ " to import") + getcontent :: Key -> Annex (Maybe (Key, Bool)) getcontent k = do let af = AssociatedFile (Just f) @@ -552,9 +561,7 @@ importKeys remote importtreeconfig importcontent ignorelargefilesconfig importab , contentFile = Just tmpfile , matchKey = Nothing } - islargefile <- if ignorelargefilesconfig - then pure True - else checkMatcher' matcher mi mempty + islargefile <- checkMatcher' matcher mi mempty if islargefile then do backend <- chooseBackend f diff --git a/Command/Import.hs b/Command/Import.hs index f383fd843f..472d82d4e7 100644 --- a/Command/Import.hs +++ b/Command/Import.hs @@ -324,11 +324,18 @@ seekRemote remote branch msubdir importcontent ci = do listContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> TVar (Maybe (ImportableContents (ContentIdentifier, Remote.ByteSize))) -> CommandStart listContents remote importtreeconfig ci tvar = starting "list" ai si $ + listContents' remote importtreeconfig ci $ \importable -> do + liftIO $ atomically $ writeTVar tvar (Just importable) + next $ return True + where + ai = ActionItemOther (Just (Remote.name remote)) + si = SeekInput [] + +listContents' :: Remote -> ImportTreeConfig -> CheckGitIgnore -> (ImportableContents (ContentIdentifier, Remote.ByteSize) -> Annex a) -> Annex a +listContents' remote importtreeconfig ci a = makeImportMatcher remote >>= \case Right matcher -> getImportableContents remote importtreeconfig ci matcher >>= \case - Just importable -> next $ do - liftIO $ atomically $ writeTVar tvar (Just importable) - return True + Just importable -> a importable Nothing -> giveup $ "Unable to list contents of " ++ Remote.name remote Left err -> giveup $ unwords [ "Cannot import from" @@ -336,9 +343,6 @@ listContents remote importtreeconfig ci tvar = starting "list" ai si $ , "because of a problem with its configuration:" , err ] - where - ai = ActionItemOther (Just (Remote.name remote)) - si = SeekInput [] commitRemote :: Remote -> Branch -> RemoteTrackingBranch -> Maybe Sha -> ImportTreeConfig -> ImportCommitConfig -> ImportableContents (Either Sha Key) -> CommandStart commitRemote remote branch tb trackingcommit importtreeconfig importcommitconfig importable = diff --git a/Command/Sync.hs b/Command/Sync.hs index 613afe8639..3a3839b050 100644 --- a/Command/Sync.hs +++ b/Command/Sync.hs @@ -77,7 +77,6 @@ import Utility.Process.Transcript import Utility.Tuple import Control.Concurrent.MVar -import Control.Concurrent.STM import qualified Data.Map as M import qualified Data.ByteString as S import Data.Char @@ -492,24 +491,18 @@ importRemote importcontent o mergeconfig remote currbranch - updated, because the filenames are the names of annex object files, - not suitable for a tracking branch. Does not transfer any content. -} importThirdPartyPopulated :: Remote -> CommandSeek -importThirdPartyPopulated remote = do - importabletvar <- liftIO $ newTVarIO Nothing - void $ includeCommandAction (Command.Import.listContents remote ImportTree (CheckGitIgnore False) importabletvar) - liftIO (atomically (readTVar importabletvar)) >>= \case - Nothing -> return () - Just importable -> - importKeys remote ImportTree False False importable >>= \case - Just importablekeys -> go importablekeys - Nothing -> warning $ concat - [ "Failed to import from" - , Remote.name remote - ] +importThirdPartyPopulated remote = + void $ includeCommandAction $ starting "list" ai si $ + Command.Import.listContents' remote ImportTree (CheckGitIgnore False) go where - go importablekeys = void $ includeCommandAction $ starting "pull" ai si $ do - (_imported, updatestate) <- recordImportTree remote ImportTree importablekeys - next $ do - updatestate - return True + go importable = importKeys remote ImportTree False True importable >>= \case + Just importablekeys -> do + (_imported, updatestate) <- recordImportTree remote ImportTree importablekeys + next $ do + updatestate + return True + Nothing -> next $ return False + ai = ActionItemOther (Just (Remote.name remote)) si = SeekInput [] diff --git a/Remote/Directory.hs b/Remote/Directory.hs index 009c160339..91ea86019a 100644 --- a/Remote/Directory.hs +++ b/Remote/Directory.hs @@ -371,9 +371,11 @@ guardSameContentIdentifiers cont old new | otherwise = giveup "file content has changed" importKeyM :: RawFilePath -> ExportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> Annex (Maybe Key) -importKeyM dir loc cid _sz p = do +importKeyM dir loc cid sz p = do backend <- chooseBackend f - k <- fst <$> genKey ks p backend + unsizedk <- fst <$> genKey ks p backend + let k = alterKey unsizedk $ \kd -> kd + { keySize = keySize kd <|> Just sz } currcid <- liftIO $ mkContentIdentifier absf =<< R.getFileStatus absf guardSameContentIdentifiers (return (Just k)) cid currcid diff --git a/Remote/Helper/ThirdParty.hs b/Remote/Helper/ThirdPartyPopulated.hs similarity index 100% rename from Remote/Helper/ThirdParty.hs rename to Remote/Helper/ThirdPartyPopulated.hs From bcd55b365c5617fe7e5438b44d30d8f3af7ec1d0 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 21 Dec 2020 16:20:58 -0400 Subject: [PATCH 11/24] import from borg is basically working Still some issues to deal with, see TODO and XXX. Here's what gets logged, for each key: cid log: 1608582045.832799227s 6720ebad-b20e-4460-a8f2-2477361aea75 !MjAyMC0xMi0yMVQxMTozMzoxNw==:!MjAyMC0xMi0yMVQxMzowNzoyNg== The "!Mj" are base64 encoded borg archive names, since mine were dates and contained some characters not allowed in cid logs unescaped. There were archives that each contained the key. This list will grow as more borg backups are done and learned about. tree generated: 120000 blob 5ef6a4615c084819b44cd4e3a31657664ddf643b x/dotgit/annex/objects/06/mv/SHA256E-s30--a5d8532e64ec28f5491e25e7a6c1cb68f80507c1be6c1b35f8ec53d25413e5da/SHA256E-s30--a5d8532e64ec28f5491e25e7a6c1cb68f80507c1be6c1b35f8ec53d25413e5da 120000 blob 063a139d3021c8db60f5c576d29fada2b824d91c x/dotgit/annex/objects/72/PP/SHA256E-s30--e80b09a854b4e4d99a76caaa6983b34272480e0b4fdb95d04234a54b4849b893/SHA256E-s30--e80b09a854b4e4d99a76caaa6983b34272480e0b4fdb95d04234a54b4849b893 120000 blob b53b54916fd6abf21fedf796deca08d5ac7a75af x/dotgit/annex/objects/Ww/pk/SHA256E-s30--6aac072a8ebf02a5807c4f15e77ed585a6c87b3b333ba625a3c8d6b4dc50a9f2/SHA256E-s30--6aac072a8ebf02a5807c4f15e77ed585a6c87b3b333ba625a3c8d6b4dc50a9f2 This commit was sponsored by Denis Dzyubenko on Patreon. --- Remote/Borg.hs | 67 +++++++++++++++++++++++++--- Remote/Helper/ThirdPartyPopulated.hs | 34 +++++++++++++- git-annex.cabal | 2 +- 3 files changed, 95 insertions(+), 8 deletions(-) diff --git a/Remote/Borg.hs b/Remote/Borg.hs index 3f0c2b68cc..e753982d55 100644 --- a/Remote/Borg.hs +++ b/Remote/Borg.hs @@ -5,8 +5,6 @@ - Licensed under the GNU AGPL version 3 or higher. -} -{-# LANGUAGE OverloadedStrings #-} - module Remote.Borg (remote) where import Annex.Common @@ -22,9 +20,13 @@ import Remote.Helper.ExportImport import Annex.UUID import Types.ProposedAccepted import Utility.Metered -import qualified Remote.Helper.ThirdParty as ThirdParty +import qualified Remote.Helper.ThirdPartyPopulated as ThirdPartyPopulated +import Text.Read +import Control.Exception (evaluate) +import Control.DeepSeq import qualified Data.Map as M +import qualified Data.ByteString.Lazy as L type BorgRepo = String @@ -69,7 +71,7 @@ gen r u rc gc rs = do , exportActions = exportUnsupported , importActions = ImportActions { listImportableContents = listImportableContentsM borgrepo - , importKey = Just ThirdParty.importKey + , importKey = Just ThirdPartyPopulated.importKey , retrieveExportWithContentIdentifier = retrieveExportWithContentIdentifierM borgrepo , checkPresentExportWithContentIdentifier = checkPresentExportWithContentIdentifierM borgrepo -- This remote is thirdPartyPopulated, so these @@ -119,8 +121,63 @@ borgSetup _ mu _ c _gc = do borgLocal :: BorgRepo -> Bool borgLocal = notElem ':' +-- TODO avoid rescanning archives that have already been scanned +-- +-- XXX importableHistory should probably not be populated. git-annex +-- only stores and uses the most recent imported tree, not the whole history, +-- I think. So a key that's only in a previous archive would not have +-- a known ImportLocation when retrieving it. +-- Instead, maybe need to include the archive names at the top of the +-- importlocation? (Then would not need them in the ContentIdentifier.) +-- +-- XXX the tree generated by using this does not seem to get grafted into +-- the git-annex branch, so would be subject to being lost to GC> +-- Is this a general problem affecting importtree too? listImportableContentsM :: BorgRepo -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) -listImportableContentsM borgrepo = error "TODO" +listImportableContentsM borgrepo = prompt $ do + ls <- withborglist borgrepo "{barchive}{NUL}" $ \as -> + forM as $ \archive -> + let archive' = borgrepo ++ "::" ++ decodeBS' archive + in withborglist archive' "{size}{NUL}{path}{NUL}" $ + liftIO . evaluate . force . parsefilelist archive + return (Just (mkimportablecontents (reverse ls))) + where + withborglist what format a = do + let p = (proc "borg" ["list", what, "--format", format]) + { std_out = CreatePipe } + (Nothing, Just h, Nothing, pid) <- liftIO $ createProcess p + l <- liftIO $ map L.toStrict + . filter (not . L.null) + . L.split 0 + <$> L.hGetContents h + let cleanup = liftIO $ do + hClose h + forceSuccessProcess p pid + a l `finally` cleanup + + parsefilelist archive (bsz:f:rest) = case readMaybe (fromRawFilePath bsz) of + Nothing -> parsefilelist archive rest + Just sz -> + let loc = ThirdPartyPopulated.mkThirdPartyImportLocation f + -- This does a little unncessary work to parse the + -- key, which is then thrown away. But, it lets the + -- file list be shrank down to only the ones that are + -- importable keys, so avoids needing to buffer all + -- the rest of the files in memory. + in case ThirdPartyPopulated.importKey' loc sz of + Just k -> (loc, (ContentIdentifier archive, sz)) + : parsefilelist archive rest + Nothing -> parsefilelist archive rest + parsefilelist _ _ = [] + + mkimportablecontents [] = ImportableContents + { importableContents = [] + , importableHistory = [] + } + mkimportablecontents (v:vs) = ImportableContents + { importableContents = v + , importableHistory = [mkimportablecontents vs] + } retrieveExportWithContentIdentifierM :: BorgRepo -> ImportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key retrieveExportWithContentIdentifierM borgrepo loc cid dest k p = error "TODO" diff --git a/Remote/Helper/ThirdPartyPopulated.hs b/Remote/Helper/ThirdPartyPopulated.hs index 6ce106fa12..c10a5c25cb 100644 --- a/Remote/Helper/ThirdPartyPopulated.hs +++ b/Remote/Helper/ThirdPartyPopulated.hs @@ -5,7 +5,9 @@ - Licensed under the GNU AGPL version 3 or higher. -} -module Remote.Helper.ThirdParty where +{-# LANGUAGE OverloadedStrings #-} + +module Remote.Helper.ThirdPartyPopulated where import Annex.Common import Types.Remote @@ -14,13 +16,41 @@ import Crypto (isEncKey) import Utility.Metered import qualified System.FilePath.ByteString as P +import qualified Data.ByteString as S + +-- When a remote is thirdPartyPopulated, the files we want are probably +-- in the .git directory. But, git does not really support .git in paths +-- in a git tree. (Such a tree can be built, but it will lead to problems.) +-- And so anything in .git is prevented from being imported. +-- To work around that, this renames that directory when generating an +-- ImportLocation. +mkThirdPartyImportLocation :: RawFilePath -> ImportLocation +mkThirdPartyImportLocation = + mkImportLocation . P.joinPath . map esc . P.splitDirectories + where + esc ".git" = "dotgit" + esc x + | "dotgit" `S.isSuffixOf` x = "dot" <> x + | otherwise = x + +fromThirdPartyImportLocation :: ImportLocation -> RawFilePath +fromThirdPartyImportLocation = + P.joinPath . map unesc . P.splitDirectories . fromImportLocation + where + unesc "dotgit" = ".git" + unesc x + | "dotgit" `S.isSuffixOf` x = S.drop 3 x + | otherwise = x -- When a remote is thirdPartyPopulated, and contains a backup of a -- git-annex repository or some special remotes, this can be used to -- find only those ImportLocations that are annex object files. -- All other ImportLocations are ignored. importKey :: ImportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> Annex (Maybe Key) -importKey loc _cid sz _ = return $ case deserializeKey' f of +importKey loc _cid sz _ = return $ importKey' loc sz + +importKey' :: ImportLocation -> ByteSize -> Maybe Key +importKey' loc sz = case deserializeKey' f of Just k -- Annex objects always are in a subdirectory with the same -- name as the filename. If this is not the case for the file diff --git a/git-annex.cabal b/git-annex.cabal index f39146f25b..91bf59d44d 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -963,7 +963,7 @@ Executable git-annex Remote.Helper.Messages Remote.Helper.P2P Remote.Helper.ReadOnly - Remote.Helper.ThirdParty + Remote.Helper.ThirdPartyPopulated Remote.Helper.Special Remote.Helper.Ssh Remote.HttpAlso From c2d6f335a6f72988591c8e127ee183df2ab83470 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 11:23:00 -0400 Subject: [PATCH 12/24] notes on ImportableContents history not being used for retrieval --- Types/Import.hs | 6 ++++++ .../export_and_import_appendix.mdwn | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Types/Import.hs b/Types/Import.hs index 2724ddc7ef..2013e44cde 100644 --- a/Types/Import.hs +++ b/Types/Import.hs @@ -59,6 +59,12 @@ data ImportableContents info = ImportableContents -- ^ Used by remotes that support importing historical versions of -- files that are stored in them. This is equivilant to a git -- commit history. + -- + -- When retrieving a historical version of a file, + -- old ImportLocations from importableHistory are not used; + -- the content is no longer expected to be present at those + -- locations. So, if a remote does not support Key/Value access, + -- it should not populate the importableHistory. } deriving (Show, Generic) diff --git a/doc/design/external_special_remote_protocol/export_and_import_appendix.mdwn b/doc/design/external_special_remote_protocol/export_and_import_appendix.mdwn index d1b255a8d0..9d474dbcbf 100644 --- a/doc/design/external_special_remote_protocol/export_and_import_appendix.mdwn +++ b/doc/design/external_special_remote_protocol/export_and_import_appendix.mdwn @@ -167,7 +167,9 @@ support a request, it can reply with `UNSUPPORTED-REQUEST`. this can be used to list those versions. It opens a new block of responses. This can be repeated any number of times (indicating a branching history), and histories can also - be nested multiple levels deep. + be nested multiple levels deep. + This should only be used when the remote supports using + "TRANSFER RECEIVE Key" to retrieve historical versions of files. * `END` Indicates the end of a block of responses. * `LOCATION Name` From 7f7094a7cbb8f66259c0317f86fa3a8ce0c71a46 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 11:53:00 -0400 Subject: [PATCH 13/24] include borg archive name in tree, use empty ContentIdentifier It's unusual to use a ContentIdentifier that is not semi-unique for different contents. Note that in importKeys, it checks if a content identifier is one that's known before, to avoid downloading the same content twice. But that's done in a code path not used for borg repos, because they are thirdpartypopulated. --- Remote/Borg.hs | 54 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/Remote/Borg.hs b/Remote/Borg.hs index e753982d55..a1dc10b0f8 100644 --- a/Remote/Borg.hs +++ b/Remote/Borg.hs @@ -26,10 +26,14 @@ import Text.Read import Control.Exception (evaluate) import Control.DeepSeq import qualified Data.Map as M +import qualified Data.ByteString as S import qualified Data.ByteString.Lazy as L +import qualified System.FilePath.ByteString as P type BorgRepo = String +type BorgArchiveName = S.ByteString + remote :: RemoteType remote = RemoteType { typename = "borg" @@ -123,15 +127,8 @@ borgLocal = notElem ':' -- TODO avoid rescanning archives that have already been scanned -- --- XXX importableHistory should probably not be populated. git-annex --- only stores and uses the most recent imported tree, not the whole history, --- I think. So a key that's only in a previous archive would not have --- a known ImportLocation when retrieving it. --- Instead, maybe need to include the archive names at the top of the --- importlocation? (Then would not need them in the ContentIdentifier.) --- -- XXX the tree generated by using this does not seem to get grafted into --- the git-annex branch, so would be subject to being lost to GC> +-- the git-annex branch, so would be subject to being lost to GC. -- Is this a general problem affecting importtree too? listImportableContentsM :: BorgRepo -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) listImportableContentsM borgrepo = prompt $ do @@ -140,7 +137,7 @@ listImportableContentsM borgrepo = prompt $ do let archive' = borgrepo ++ "::" ++ decodeBS' archive in withborglist archive' "{size}{NUL}{path}{NUL}" $ liftIO . evaluate . force . parsefilelist archive - return (Just (mkimportablecontents (reverse ls))) + return (Just (mkimportablecontents ls)) where withborglist what format a = do let p = (proc "borg" ["list", what, "--format", format]) @@ -158,26 +155,49 @@ listImportableContentsM borgrepo = prompt $ do parsefilelist archive (bsz:f:rest) = case readMaybe (fromRawFilePath bsz) of Nothing -> parsefilelist archive rest Just sz -> - let loc = ThirdPartyPopulated.mkThirdPartyImportLocation f + let loc = genImportLocation archive f -- This does a little unncessary work to parse the -- key, which is then thrown away. But, it lets the -- file list be shrank down to only the ones that are -- importable keys, so avoids needing to buffer all -- the rest of the files in memory. in case ThirdPartyPopulated.importKey' loc sz of - Just k -> (loc, (ContentIdentifier archive, sz)) + Just k -> (loc, (borgContentIdentifier, sz)) : parsefilelist archive rest Nothing -> parsefilelist archive rest parsefilelist _ _ = [] - mkimportablecontents [] = ImportableContents - { importableContents = [] + -- importableHistory is not used for retrieval, so is not + -- populated with old archives. Instead, a tree of archives + -- is constructed, by genImportLocation including the archive + -- name in the ImportLocation. + mkimportablecontents (l) = ImportableContents + { importableContents = concat l , importableHistory = [] } - mkimportablecontents (v:vs) = ImportableContents - { importableContents = v - , importableHistory = [mkimportablecontents vs] - } + +-- Borg does not allow / in the name of an archive, so the archive +-- name will always be the first directory in the ImportLocation. +-- +-- Paths in a borg archive are always relative, not absolute, so the use of +-- to combine the archive name with the path will always work. +genImportLocation :: BorgArchiveName -> RawFilePath -> ImportLocation +genImportLocation archivename p = + ThirdPartyPopulated.mkThirdPartyImportLocation $ + archivename P. p + +extractImportLocation :: ImportLocation -> (BorgArchiveName, RawFilePath) +extractImportLocation loc = go $ P.splitDirectories $ + ThirdPartyPopulated.fromThirdPartyImportLocation loc + where + go (archivename:rest) = (archivename, P.joinPath rest) + go _ = giveup $ "Unable to parse import location " ++ fromRawFilePath (fromImportLocation loc) + +-- We do not need a ContentIdentifier in order to retrieve a file from +-- borg; the ImportLocation contains all that's needed. So, this is left +-- empty. +borgContentIdentifier :: ContentIdentifier +borgContentIdentifier = ContentIdentifier mempty retrieveExportWithContentIdentifierM :: BorgRepo -> ImportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key retrieveExportWithContentIdentifierM borgrepo loc cid dest k p = error "TODO" From 06ef1b7d68ee2b149e663f2362e5685303d0bd3a Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 12:00:11 -0400 Subject: [PATCH 14/24] improve storage of redundant ContentIdentifiers When a ContentIdentifier is already recorded, don't add it to the log again, and avoid updating the log. --- Logs/ContentIdentifier.hs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Logs/ContentIdentifier.hs b/Logs/ContentIdentifier.hs index 38f904ae2b..16c3969155 100644 --- a/Logs/ContentIdentifier.hs +++ b/Logs/ContentIdentifier.hs @@ -32,12 +32,16 @@ recordContentIdentifier :: RemoteStateHandle -> ContentIdentifier -> Key -> Anne recordContentIdentifier (RemoteStateHandle u) cid k = do c <- liftIO currentVectorClock config <- Annex.getGitConfig - Annex.Branch.change (remoteContentIdentifierLogFile config k) $ - buildLog . addcid c . parseLog + Annex.Branch.maybeChange (remoteContentIdentifierLogFile config k) $ + addcid c . parseLog where - addcid c l = changeMapLog c u (cid :| contentIdentifierList (M.lookup u m)) l + addcid c v + | cid `elem` l = Nothing -- no change needed + | otherwise = Just $ buildLog $ + changeMapLog c u (cid :| l) v where - m = simpleMap l + m = simpleMap v + l = contentIdentifierList (M.lookup u m) -- | Get all known content identifiers for a key. getContentIdentifiers :: Key -> Annex [(RemoteStateHandle, [ContentIdentifier])] From 5d8e4a7c741de0935f2bc5f0f8d06d9c9f31383d Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 14:06:40 -0400 Subject: [PATCH 15/24] avoid borg list of archives that have been listed before This makes sync a lot faster in the common case where there's no new backup. There's still room for it to be faster. Currently the old imported tree has to be traversed, to generate the ImportableContents. Which then gets turned around to generate the new imported tree, which is identical. So, it would be possible to just return a "no new imports", or an ImportableContents that has a way to graft in a tree. The latter is probably too far to go to optimise this, unless other things need it. The former might be worth it, but it's already pretty fast, since git ls-tree is pretty fast. --- Git/LsTree.hs | 23 ++++++++++--- Git/Types.hs | 11 +++++- Logs/Export.hs | 3 +- Remote/Borg.hs | 90 ++++++++++++++++++++++++++++++++++++++------------ 4 files changed, 98 insertions(+), 29 deletions(-) diff --git a/Git/LsTree.hs b/Git/LsTree.hs index ead501f0dc..cd0d406edf 100644 --- a/Git/LsTree.hs +++ b/Git/LsTree.hs @@ -1,17 +1,17 @@ {- git ls-tree interface - - - Copyright 2011-2019 Joey Hess + - Copyright 2011-2020 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} -{-# LANGUAGE BangPatterns #-} - module Git.LsTree ( TreeItem(..), LsTreeMode(..), lsTree, lsTree', + lsTreeStrict, + lsTreeStrict', lsTreeParams, lsTreeFiles, parseLsTree, @@ -30,6 +30,7 @@ import Data.Either import System.Posix.Types import qualified Data.ByteString as S import qualified Data.ByteString.Lazy as L +import qualified Data.Attoparsec.ByteString as AS import qualified Data.Attoparsec.ByteString.Lazy as A import qualified Data.Attoparsec.ByteString.Char8 as A8 @@ -38,7 +39,7 @@ data TreeItem = TreeItem , typeobj :: S.ByteString , sha :: Ref , file :: TopFilePath - } deriving Show + } deriving (Show) data LsTreeMode = LsTreeRecursive | LsTreeNonRecursive @@ -51,6 +52,13 @@ lsTree' ps lsmode t repo = do (l, cleanup) <- pipeNullSplit (lsTreeParams lsmode t ps) repo return (rights (map parseLsTree l), cleanup) +lsTreeStrict :: LsTreeMode -> Ref -> Repo -> IO [TreeItem] +lsTreeStrict = lsTreeStrict' [] + +lsTreeStrict' :: [CommandParam] -> LsTreeMode -> Ref -> Repo -> IO [TreeItem] +lsTreeStrict' ps lsmode t repo = rights . map parseLsTreeStrict + <$> pipeNullSplitStrict (lsTreeParams lsmode t ps) repo + lsTreeParams :: LsTreeMode -> Ref -> [CommandParam] -> [CommandParam] lsTreeParams lsmode r ps = [ Param "ls-tree" @@ -83,6 +91,13 @@ parseLsTree b = case A.parse parserLsTree b of A.Done _ r -> Right r A.Fail _ _ err -> Left err +parseLsTreeStrict :: S.ByteString -> Either String TreeItem +parseLsTreeStrict b = go (AS.parse parserLsTree b) + where + go (AS.Done _ r) = Right r + go (AS.Fail _ _ err) = Left err + go (AS.Partial c) = go (c mempty) + {- Parses a line of ls-tree output, in format: - mode SP type SP sha TAB file - diff --git a/Git/Types.hs b/Git/Types.hs index 77a52d1e45..73c4fe62de 100644 --- a/Git/Types.hs +++ b/Git/Types.hs @@ -135,7 +135,12 @@ fmtObjectType CommitObject = "commit" fmtObjectType TreeObject = "tree" {- Types of items in a tree. -} -data TreeItemType = TreeFile | TreeExecutable | TreeSymlink | TreeSubmodule +data TreeItemType + = TreeFile + | TreeExecutable + | TreeSymlink + | TreeSubmodule + | TreeSubtree deriving (Eq, Show) {- Git uses magic numbers to denote the type of a tree item. -} @@ -144,6 +149,7 @@ readTreeItemType "100644" = Just TreeFile readTreeItemType "100755" = Just TreeExecutable readTreeItemType "120000" = Just TreeSymlink readTreeItemType "160000" = Just TreeSubmodule +readTreeItemType "040000" = Just TreeSubtree readTreeItemType _ = Nothing fmtTreeItemType :: TreeItemType -> S.ByteString @@ -151,12 +157,14 @@ fmtTreeItemType TreeFile = "100644" fmtTreeItemType TreeExecutable = "100755" fmtTreeItemType TreeSymlink = "120000" fmtTreeItemType TreeSubmodule = "160000" +fmtTreeItemType TreeSubtree = "040000" toTreeItemType :: FileMode -> Maybe TreeItemType toTreeItemType 0o100644 = Just TreeFile toTreeItemType 0o100755 = Just TreeExecutable toTreeItemType 0o120000 = Just TreeSymlink toTreeItemType 0o160000 = Just TreeSubmodule +toTreeItemType 0o040000 = Just TreeSubtree toTreeItemType _ = Nothing fromTreeItemType :: TreeItemType -> FileMode @@ -164,6 +172,7 @@ fromTreeItemType TreeFile = 0o100644 fromTreeItemType TreeExecutable = 0o100755 fromTreeItemType TreeSymlink = 0o120000 fromTreeItemType TreeSubmodule = 0o160000 +fromTreeItemType TreeSubtree = 0o040000 data Commit = Commit { commitTree :: Sha diff --git a/Logs/Export.hs b/Logs/Export.hs index 1c198b7992..50b2ea1378 100644 --- a/Logs/Export.hs +++ b/Logs/Export.hs @@ -1,4 +1,4 @@ -{- git-annex export log +{- git-annex export log (also used to log imports) - - Copyright 2017-2019 Joey Hess - @@ -64,7 +64,6 @@ exportedTreeishes = nub . map exportedTreeish incompleteExportedTreeishes :: [Exported] -> [Git.Ref] incompleteExportedTreeishes = concatMap incompleteExportedTreeish - data ExportParticipants = ExportParticipants { exportFrom :: UUID , exportTo :: UUID diff --git a/Remote/Borg.hs b/Remote/Borg.hs index a1dc10b0f8..0e7dd50e6f 100644 --- a/Remote/Borg.hs +++ b/Remote/Borg.hs @@ -12,6 +12,9 @@ import Types.Remote import Types.Creds import Types.Import import qualified Git +import qualified Git.LsTree as LsTree +import Git.Types (toTreeItemType, TreeItemType(..)) +import Git.FilePath import Config import Config.Cost import Annex.SpecialRemote.Config @@ -21,6 +24,7 @@ import Annex.UUID import Types.ProposedAccepted import Utility.Metered import qualified Remote.Helper.ThirdPartyPopulated as ThirdPartyPopulated +import Logs.Export import Text.Read import Control.Exception (evaluate) @@ -74,7 +78,7 @@ gen r u rc gc rs = do , checkPresentCheap = borgLocal borgrepo , exportActions = exportUnsupported , importActions = ImportActions - { listImportableContents = listImportableContentsM borgrepo + { listImportableContents = listImportableContentsM u borgrepo , importKey = Just ThirdPartyPopulated.importKey , retrieveExportWithContentIdentifier = retrieveExportWithContentIdentifierM borgrepo , checkPresentExportWithContentIdentifier = checkPresentExportWithContentIdentifierM borgrepo @@ -125,19 +129,21 @@ borgSetup _ mu _ c _gc = do borgLocal :: BorgRepo -> Bool borgLocal = notElem ':' --- TODO avoid rescanning archives that have already been scanned --- -- XXX the tree generated by using this does not seem to get grafted into -- the git-annex branch, so would be subject to being lost to GC. -- Is this a general problem affecting importtree too? -listImportableContentsM :: BorgRepo -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) -listImportableContentsM borgrepo = prompt $ do +listImportableContentsM :: UUID -> BorgRepo -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) +listImportableContentsM u borgrepo = prompt $ do + imported <- getImported u ls <- withborglist borgrepo "{barchive}{NUL}" $ \as -> - forM as $ \archive -> - let archive' = borgrepo ++ "::" ++ decodeBS' archive - in withborglist archive' "{size}{NUL}{path}{NUL}" $ - liftIO . evaluate . force . parsefilelist archive - return (Just (mkimportablecontents ls)) + forM as $ \archivename -> + case M.lookup archivename imported of + Just getfast -> getfast + Nothing -> + let archive = borgrepo ++ "::" ++ decodeBS' archivename + in withborglist archive "{size}{NUL}{path}{NUL}" $ + liftIO . evaluate . force . parsefilelist archivename + return $ Just $ mkimportablecontents ls where withborglist what format a = do let p = (proc "borg" ["list", what, "--format", format]) @@ -152,10 +158,10 @@ listImportableContentsM borgrepo = prompt $ do forceSuccessProcess p pid a l `finally` cleanup - parsefilelist archive (bsz:f:rest) = case readMaybe (fromRawFilePath bsz) of - Nothing -> parsefilelist archive rest + parsefilelist archivename (bsz:f:rest) = case readMaybe (fromRawFilePath bsz) of + Nothing -> parsefilelist archivename rest Just sz -> - let loc = genImportLocation archive f + let loc = genImportLocation archivename f -- This does a little unncessary work to parse the -- key, which is then thrown away. But, it lets the -- file list be shrank down to only the ones that are @@ -163,19 +169,25 @@ listImportableContentsM borgrepo = prompt $ do -- the rest of the files in memory. in case ThirdPartyPopulated.importKey' loc sz of Just k -> (loc, (borgContentIdentifier, sz)) - : parsefilelist archive rest - Nothing -> parsefilelist archive rest + : parsefilelist archivename rest + Nothing -> parsefilelist archivename rest parsefilelist _ _ = [] -- importableHistory is not used for retrieval, so is not -- populated with old archives. Instead, a tree of archives -- is constructed, by genImportLocation including the archive -- name in the ImportLocation. - mkimportablecontents (l) = ImportableContents + mkimportablecontents l = ImportableContents { importableContents = concat l , importableHistory = [] } - + +-- We do not need a ContentIdentifier in order to retrieve a file from +-- borg; the ImportLocation contains all that's needed. So, this is left +-- empty. +borgContentIdentifier :: ContentIdentifier +borgContentIdentifier = ContentIdentifier mempty + -- Borg does not allow / in the name of an archive, so the archive -- name will always be the first directory in the ImportLocation. -- @@ -193,11 +205,45 @@ extractImportLocation loc = go $ P.splitDirectories $ go (archivename:rest) = (archivename, P.joinPath rest) go _ = giveup $ "Unable to parse import location " ++ fromRawFilePath (fromImportLocation loc) --- We do not need a ContentIdentifier in order to retrieve a file from --- borg; the ImportLocation contains all that's needed. So, this is left --- empty. -borgContentIdentifier :: ContentIdentifier -borgContentIdentifier = ContentIdentifier mempty +-- Since the ImportLocation starts with the archive name, a list of all +-- archive names we've already imported can be found by just listing the +-- last imported tree. And the contents of those archives can be retrieved +-- by listing the subtree recursively, which will likely be quite a lot +-- faster than running borg. +getImported :: UUID -> Annex (M.Map BorgArchiveName (Annex [(ImportLocation, (ContentIdentifier, ByteSize))])) +getImported u = M.unions <$> (mapM go . exportedTreeishes =<< getExport u) + where + go t = M.fromList . mapMaybe mk + <$> inRepo (LsTree.lsTreeStrict LsTree.LsTreeNonRecursive t) + + mk ti + | toTreeItemType (LsTree.mode ti) == Just TreeSubtree = Just + ( getTopFilePath (LsTree.file ti) + , getcontents + (getTopFilePath (LsTree.file ti)) + (LsTree.sha ti) + ) + | otherwise = Nothing + + getcontents archivename t = mapMaybe (mkcontents archivename) + <$> inRepo (LsTree.lsTreeStrict LsTree.LsTreeRecursive t) + + mkcontents archivename ti = do + let f = ThirdPartyPopulated.fromThirdPartyImportLocation $ + mkImportLocation $ getTopFilePath $ LsTree.file ti + k <- deserializeKey' (P.takeFileName f) + return + ( genImportLocation archivename f + , + ( borgContentIdentifier + -- defaulting to 0 size is ok, this size + -- only gets used by + -- ThirdPartyPopulated.importKey, + -- which ignores the size when the key + -- does not have a size. + , fromMaybe 0 (fromKey keySize k) + ) + ) retrieveExportWithContentIdentifierM :: BorgRepo -> ImportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key retrieveExportWithContentIdentifierM borgrepo loc cid dest k p = error "TODO" From e1ac42be7781b3435ca0d76e26273884d23973e1 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 14:20:11 -0400 Subject: [PATCH 16/24] convert listImportableContents to throwing exceptions --- Annex/Import.hs | 14 +++++++------- Command/Import.hs | 6 +++--- Remote/Adb.hs | 12 ++++++------ Remote/Borg.hs | 4 ++-- Remote/Directory.hs | 4 ++-- Remote/Helper/ExportImport.hs | 2 +- Remote/S3.hs | 8 +++----- Types/Remote.hs | 4 +++- 8 files changed, 27 insertions(+), 27 deletions(-) diff --git a/Annex/Import.hs b/Annex/Import.hs index 0bc0c44c83..9c113002d5 100644 --- a/Annex/Import.hs +++ b/Annex/Import.hs @@ -658,14 +658,14 @@ makeImportMatcher r = load preferredContentKeylessTokens >>= \case - regardless. (Similar to how git add behaves on gitignored files.) - This avoids creating a remote tracking branch that, when merged, - would delete the files. + - + - Throws exception if unable to contact the remote. -} -getImportableContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> FileMatcher Annex -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) -getImportableContents r importtreeconfig ci matcher = - Remote.listImportableContents (Remote.importActions r) >>= \case - Nothing -> return Nothing - Just importable -> do - dbhandle <- Export.openDb (Remote.uuid r) - Just <$> filterunwanted dbhandle importable +getImportableContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> FileMatcher Annex -> Annex (ImportableContents (ContentIdentifier, ByteSize)) +getImportableContents r importtreeconfig ci matcher = do + importable <- Remote.listImportableContents (Remote.importActions r) + dbhandle <- Export.openDb (Remote.uuid r) + filterunwanted dbhandle importable where filterunwanted dbhandle ic = ImportableContents <$> filterM (wanted dbhandle) (importableContents ic) diff --git a/Command/Import.hs b/Command/Import.hs index 472d82d4e7..b09984c2c3 100644 --- a/Command/Import.hs +++ b/Command/Import.hs @@ -334,9 +334,9 @@ listContents remote importtreeconfig ci tvar = starting "list" ai si $ listContents' :: Remote -> ImportTreeConfig -> CheckGitIgnore -> (ImportableContents (ContentIdentifier, Remote.ByteSize) -> Annex a) -> Annex a listContents' remote importtreeconfig ci a = makeImportMatcher remote >>= \case - Right matcher -> getImportableContents remote importtreeconfig ci matcher >>= \case - Just importable -> a importable - Nothing -> giveup $ "Unable to list contents of " ++ Remote.name remote + Right matcher -> tryNonAsync (getImportableContents remote importtreeconfig ci matcher) >>= \case + Right importable -> a importable + Left e -> giveup $ "Unable to list contents of " ++ Remote.name remote ++ ": " ++ show e Left err -> giveup $ unwords [ "Cannot import from" , Remote.name remote diff --git a/Remote/Adb.hs b/Remote/Adb.hs index 82459c8599..3543051030 100644 --- a/Remote/Adb.hs +++ b/Remote/Adb.hs @@ -286,9 +286,12 @@ renameExportM serial adir _k old new = do , File newloc ] -listImportableContentsM :: AndroidSerial -> AndroidPath -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) -listImportableContentsM serial adir = - process <$> adbShell serial +listImportableContentsM :: AndroidSerial -> AndroidPath -> Annex (ImportableContents (ContentIdentifier, ByteSize)) +listImportableContentsM serial adir = adbfind >>= \case + Just ls -> return $ ImportableContents (mapMaybe mk ls) [] + Nothing -> giveup "adb find failed" + where + adbfind = adbShell serial [ Param "find" -- trailing slash is needed, or android's find command -- won't recurse into the directory @@ -298,9 +301,6 @@ listImportableContentsM serial adir = , Param "-c", Param statformat , Param "{}", Param "+" ] - where - process Nothing = Nothing - process (Just ls) = Just $ ImportableContents (mapMaybe mk ls) [] statformat = adbStatFormat ++ "\t%n" diff --git a/Remote/Borg.hs b/Remote/Borg.hs index 0e7dd50e6f..7cc4d5e0fa 100644 --- a/Remote/Borg.hs +++ b/Remote/Borg.hs @@ -132,7 +132,7 @@ borgLocal = notElem ':' -- XXX the tree generated by using this does not seem to get grafted into -- the git-annex branch, so would be subject to being lost to GC. -- Is this a general problem affecting importtree too? -listImportableContentsM :: UUID -> BorgRepo -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) +listImportableContentsM :: UUID -> BorgRepo -> Annex (ImportableContents (ContentIdentifier, ByteSize)) listImportableContentsM u borgrepo = prompt $ do imported <- getImported u ls <- withborglist borgrepo "{barchive}{NUL}" $ \as -> @@ -143,7 +143,7 @@ listImportableContentsM u borgrepo = prompt $ do let archive = borgrepo ++ "::" ++ decodeBS' archivename in withborglist archive "{size}{NUL}{path}{NUL}" $ liftIO . evaluate . force . parsefilelist archivename - return $ Just $ mkimportablecontents ls + return $ mkimportablecontents ls where withborglist what format a = do let p = (proc "borg" ["list", what, "--format", format]) diff --git a/Remote/Directory.hs b/Remote/Directory.hs index 91ea86019a..ed2a9dce74 100644 --- a/Remote/Directory.hs +++ b/Remote/Directory.hs @@ -337,8 +337,8 @@ removeExportLocation topdir loc = mkExportLocation loc' in go (upFrom loc') =<< tryIO (removeDirectory p) -listImportableContentsM :: RawFilePath -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) -listImportableContentsM dir = catchMaybeIO $ liftIO $ do +listImportableContentsM :: RawFilePath -> Annex (ImportableContents (ContentIdentifier, ByteSize)) +listImportableContentsM dir = liftIO $ do l <- dirContentsRecursive (fromRawFilePath dir) l' <- mapM (go . toRawFilePath) l return $ ImportableContents (catMaybes l') [] diff --git a/Remote/Helper/ExportImport.hs b/Remote/Helper/ExportImport.hs index 573db92147..9e2840e2f5 100644 --- a/Remote/Helper/ExportImport.hs +++ b/Remote/Helper/ExportImport.hs @@ -54,7 +54,7 @@ instance HasImportUnsupported (ParsedRemoteConfig -> RemoteGitConfig -> Annex Bo instance HasImportUnsupported (ImportActions Annex) where importUnsupported = ImportActions - { listImportableContents = return Nothing + { listImportableContents = nope , importKey = Nothing , retrieveExportWithContentIdentifier = nope , storeExportWithContentIdentifier = nope diff --git a/Remote/S3.hs b/Remote/S3.hs index 661e0ab7fa..ec9a16d1b4 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -550,13 +550,11 @@ renameExportS3 hv r rs info k src dest = Just <$> go srcobject = T.pack $ bucketExportLocation info src dstobject = T.pack $ bucketExportLocation info dest -listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) +listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (ImportableContents (ContentIdentifier, ByteSize)) listImportableContentsS3 hv r info = withS3Handle hv $ \case - Nothing -> do - warning $ needS3Creds (uuid r) - return Nothing - Just h -> catchMaybeIO $ liftIO $ runResourceT $ + Nothing -> giveup $ needS3Creds (uuid r) + Just h -> liftIO $ runResourceT $ extractFromResourceT =<< startlist h where startlist h diff --git a/Types/Remote.hs b/Types/Remote.hs index 9085b7311a..2404c52364 100644 --- a/Types/Remote.hs +++ b/Types/Remote.hs @@ -281,7 +281,9 @@ data ImportActions a = ImportActions -- -- May also find old versions of files that are still stored in the -- remote. - { listImportableContents :: a (Maybe (ImportableContents (ContentIdentifier, ByteSize))) + -- + -- Throws exception on failure to access the remote. + { listImportableContents :: a (ImportableContents (ContentIdentifier, ByteSize)) -- Generates a Key (of any type) for the file stored on the -- remote at the ImportLocation. Does not download the file -- from the remote. From 4f9969d0a157f8ac06030526324141250b2503e1 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 14:35:02 -0400 Subject: [PATCH 17/24] optimisation for borg Skip needing to list importable contents when unchanged since last time. --- Annex/Import.hs | 11 +++++++---- Command/Import.hs | 4 ++-- Command/Sync.hs | 3 ++- Remote/Adb.hs | 4 ++-- Remote/Borg.hs | 13 +++++++------ Remote/Directory.hs | 4 ++-- Remote/S3.hs | 7 ++++--- Types/Remote.hs | 3 ++- doc/special_remotes/borg.mdwn | 4 ++++ 9 files changed, 32 insertions(+), 21 deletions(-) diff --git a/Annex/Import.hs b/Annex/Import.hs index 9c113002d5..201d9e5f7e 100644 --- a/Annex/Import.hs +++ b/Annex/Import.hs @@ -660,12 +660,15 @@ makeImportMatcher r = load preferredContentKeylessTokens >>= \case - would delete the files. - - Throws exception if unable to contact the remote. + - Returns Nothing when there is no change since last time. -} -getImportableContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> FileMatcher Annex -> Annex (ImportableContents (ContentIdentifier, ByteSize)) +getImportableContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> FileMatcher Annex -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) getImportableContents r importtreeconfig ci matcher = do - importable <- Remote.listImportableContents (Remote.importActions r) - dbhandle <- Export.openDb (Remote.uuid r) - filterunwanted dbhandle importable + Remote.listImportableContents (Remote.importActions r) >>= \case + Just importable -> do + dbhandle <- Export.openDb (Remote.uuid r) + Just <$> filterunwanted dbhandle importable + Nothing -> return Nothing where filterunwanted dbhandle ic = ImportableContents <$> filterM (wanted dbhandle) (importableContents ic) diff --git a/Command/Import.hs b/Command/Import.hs index b09984c2c3..e1560ce93a 100644 --- a/Command/Import.hs +++ b/Command/Import.hs @@ -325,13 +325,13 @@ seekRemote remote branch msubdir importcontent ci = do listContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> TVar (Maybe (ImportableContents (ContentIdentifier, Remote.ByteSize))) -> CommandStart listContents remote importtreeconfig ci tvar = starting "list" ai si $ listContents' remote importtreeconfig ci $ \importable -> do - liftIO $ atomically $ writeTVar tvar (Just importable) + liftIO $ atomically $ writeTVar tvar importable next $ return True where ai = ActionItemOther (Just (Remote.name remote)) si = SeekInput [] -listContents' :: Remote -> ImportTreeConfig -> CheckGitIgnore -> (ImportableContents (ContentIdentifier, Remote.ByteSize) -> Annex a) -> Annex a +listContents' :: Remote -> ImportTreeConfig -> CheckGitIgnore -> (Maybe (ImportableContents (ContentIdentifier, Remote.ByteSize)) -> Annex a) -> Annex a listContents' remote importtreeconfig ci a = makeImportMatcher remote >>= \case Right matcher -> tryNonAsync (getImportableContents remote importtreeconfig ci matcher) >>= \case diff --git a/Command/Sync.hs b/Command/Sync.hs index 3a3839b050..0bfe4241fb 100644 --- a/Command/Sync.hs +++ b/Command/Sync.hs @@ -495,13 +495,14 @@ importThirdPartyPopulated remote = void $ includeCommandAction $ starting "list" ai si $ Command.Import.listContents' remote ImportTree (CheckGitIgnore False) go where - go importable = importKeys remote ImportTree False True importable >>= \case + go (Just importable) = importKeys remote ImportTree False True importable >>= \case Just importablekeys -> do (_imported, updatestate) <- recordImportTree remote ImportTree importablekeys next $ do updatestate return True Nothing -> next $ return False + go Nothing = next $ return True -- unchanged from before ai = ActionItemOther (Just (Remote.name remote)) si = SeekInput [] diff --git a/Remote/Adb.hs b/Remote/Adb.hs index 3543051030..f67df51754 100644 --- a/Remote/Adb.hs +++ b/Remote/Adb.hs @@ -286,9 +286,9 @@ renameExportM serial adir _k old new = do , File newloc ] -listImportableContentsM :: AndroidSerial -> AndroidPath -> Annex (ImportableContents (ContentIdentifier, ByteSize)) +listImportableContentsM :: AndroidSerial -> AndroidPath -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) listImportableContentsM serial adir = adbfind >>= \case - Just ls -> return $ ImportableContents (mapMaybe mk ls) [] + Just ls -> return $ Just $ ImportableContents (mapMaybe mk ls) [] Nothing -> giveup "adb find failed" where adbfind = adbShell serial diff --git a/Remote/Borg.hs b/Remote/Borg.hs index 7cc4d5e0fa..b1f5cd3921 100644 --- a/Remote/Borg.hs +++ b/Remote/Borg.hs @@ -26,6 +26,7 @@ import Utility.Metered import qualified Remote.Helper.ThirdPartyPopulated as ThirdPartyPopulated import Logs.Export +import Data.Either import Text.Read import Control.Exception (evaluate) import Control.DeepSeq @@ -122,8 +123,6 @@ borgSetup _ mu _ c _gc = do -- persistant state, so it can vary between hosts. gitConfigSpecialRemote u c [("borgrepo", borgrepo)] - -- TODO: untrusted by default, but allow overriding that - return (c, u) borgLocal :: BorgRepo -> Bool @@ -132,18 +131,20 @@ borgLocal = notElem ':' -- XXX the tree generated by using this does not seem to get grafted into -- the git-annex branch, so would be subject to being lost to GC. -- Is this a general problem affecting importtree too? -listImportableContentsM :: UUID -> BorgRepo -> Annex (ImportableContents (ContentIdentifier, ByteSize)) +listImportableContentsM :: UUID -> BorgRepo -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) listImportableContentsM u borgrepo = prompt $ do imported <- getImported u ls <- withborglist borgrepo "{barchive}{NUL}" $ \as -> forM as $ \archivename -> case M.lookup archivename imported of - Just getfast -> getfast - Nothing -> + Just getfast -> return $ Left getfast + Nothing -> Right <$> let archive = borgrepo ++ "::" ++ decodeBS' archivename in withborglist archive "{size}{NUL}{path}{NUL}" $ liftIO . evaluate . force . parsefilelist archivename - return $ mkimportablecontents ls + if all isLeft ls + then return Nothing -- unchanged since last time, avoid work + else Just . mkimportablecontents <$> mapM (either id pure) ls where withborglist what format a = do let p = (proc "borg" ["list", what, "--format", format]) diff --git a/Remote/Directory.hs b/Remote/Directory.hs index ed2a9dce74..6de71fffe9 100644 --- a/Remote/Directory.hs +++ b/Remote/Directory.hs @@ -337,11 +337,11 @@ removeExportLocation topdir loc = mkExportLocation loc' in go (upFrom loc') =<< tryIO (removeDirectory p) -listImportableContentsM :: RawFilePath -> Annex (ImportableContents (ContentIdentifier, ByteSize)) +listImportableContentsM :: RawFilePath -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) listImportableContentsM dir = liftIO $ do l <- dirContentsRecursive (fromRawFilePath dir) l' <- mapM (go . toRawFilePath) l - return $ ImportableContents (catMaybes l') [] + return $ Just $ ImportableContents (catMaybes l') [] where go f = do st <- R.getFileStatus f diff --git a/Remote/S3.hs b/Remote/S3.hs index ec9a16d1b4..90db63bb1d 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -550,13 +550,14 @@ renameExportS3 hv r rs info k src dest = Just <$> go srcobject = T.pack $ bucketExportLocation info src dstobject = T.pack $ bucketExportLocation info dest -listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (ImportableContents (ContentIdentifier, ByteSize)) +listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) listImportableContentsS3 hv r info = withS3Handle hv $ \case Nothing -> giveup $ needS3Creds (uuid r) - Just h -> liftIO $ runResourceT $ - extractFromResourceT =<< startlist h + Just h -> Just <$> go h where + go h = liftIO $ runResourceT $ extractFromResourceT =<< startlist h + startlist h | versioning info = do rsp <- sendS3Handle h $ diff --git a/Types/Remote.hs b/Types/Remote.hs index 2404c52364..cc5fb47a23 100644 --- a/Types/Remote.hs +++ b/Types/Remote.hs @@ -283,7 +283,8 @@ data ImportActions a = ImportActions -- remote. -- -- Throws exception on failure to access the remote. - { listImportableContents :: a (ImportableContents (ContentIdentifier, ByteSize)) + -- May return Nothing when the remote is unchanged since last time. + { listImportableContents :: a (Maybe (ImportableContents (ContentIdentifier, ByteSize))) -- Generates a Key (of any type) for the file stored on the -- remote at the ImportLocation. Does not download the file -- from the remote. diff --git a/doc/special_remotes/borg.mdwn b/doc/special_remotes/borg.mdwn index 42f7fefde1..a97afd0b2c 100644 --- a/doc/special_remotes/borg.mdwn +++ b/doc/special_remotes/borg.mdwn @@ -62,3 +62,7 @@ So either keep the borg special remote as untrusted, and use such borg commands to delete old archives as needed, or avoid using `borg delete` and `borg prune`, and then the remote can safely be made semitrusted or trusted. + +Also, if you do choose to delete old archives, make sure to never reuse +that archive name for a new archive. git-annex may think it's the same +archive it saw before, and not notice the change. From 82e43da936e2f2f666b08b2ecd38c30687d151d8 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 15:00:11 -0400 Subject: [PATCH 18/24] todo --- ...low_overriding_untrust_of_import_remotes.mdwn | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 doc/todo/allow_overriding_untrust_of_import_remotes.mdwn diff --git a/doc/todo/allow_overriding_untrust_of_import_remotes.mdwn b/doc/todo/allow_overriding_untrust_of_import_remotes.mdwn new file mode 100644 index 0000000000..ab39488bc8 --- /dev/null +++ b/doc/todo/allow_overriding_untrust_of_import_remotes.mdwn @@ -0,0 +1,16 @@ +importtree=yes remotes are untrusted, because something is modifying that +remote other than git-annex, and it could change a file at any time, so +git-annex can't rely on the file being there. However, it's possible the user +has a policy of not letting files on the remote be modified. It may even be +that some remotes use storage that avoids such problems. So, there should be +some way to override the default trust level for such remotes. + +Currently: + + joey@darkstar:/tmp/y8>git annex semitrust borg + semitrust borg + This remote's trust level is overridden to untrusted. + +The borg special remote is one example of one where it's easy for the user to +decide they're going to not delete old archives from it, and so want git-annex +to trust it. From f31bdd0b19bf32a4fba51e4623ab91d87ccb85e8 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 15:01:07 -0400 Subject: [PATCH 19/24] todo --- doc/special_remotes/borg.mdwn | 36 ----------------- ..._overriding_untrust_of_import_remotes.mdwn | 39 +++++++++++++++++++ 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/doc/special_remotes/borg.mdwn b/doc/special_remotes/borg.mdwn index a97afd0b2c..02f8258559 100644 --- a/doc/special_remotes/borg.mdwn +++ b/doc/special_remotes/borg.mdwn @@ -30,39 +30,3 @@ remote: # git annex initremote borg type=borg borgrepo=/path/to/borgrepo scan=`pwd` # borg create /path/to/borgrepo `pwd`::{now} # git annex sync borg - -## trust levels, borg delete and borg prune - -git-annex will by default treat the borg special remote as untrusted, so -will not trust it to continue to contain a [[copy|copies]] of any annexed -file. This is necessary because you could run `borg delete` or `borg prune` -and remove the copy from the borg repository. If you choose to set the -trust level of the borg repository to a higher level, you need to avoid -using such commands with that borg repository. - -Consider this example: - - git-annex add annexedfile - borg create /path/to/borgrepo `pwd`::foo - git-annex sync borg - git-annex semitrust borg - git-annex drop annexedfile - -Now the only copy of annexedfile is in the borg repository. - - borg create /path/to/borgrepo `pwd`::bar - borg delete /path/to/borgrepo::foo - git-annex sync borg - git-annex whereis annexedfile - -Now no copies of annexfile remain, because the "foo" archive -in the borg repository was the only one to contain it, and it was deleted. - -So either keep the borg special remote as untrusted, and use such borg -commands to delete old archives as needed, or avoid using `borg delete` -and `borg prune`, and then the remote can safely be made semitrusted or -trusted. - -Also, if you do choose to delete old archives, make sure to never reuse -that archive name for a new archive. git-annex may think it's the same -archive it saw before, and not notice the change. diff --git a/doc/todo/allow_overriding_untrust_of_import_remotes.mdwn b/doc/todo/allow_overriding_untrust_of_import_remotes.mdwn index ab39488bc8..99d6009d52 100644 --- a/doc/todo/allow_overriding_untrust_of_import_remotes.mdwn +++ b/doc/todo/allow_overriding_untrust_of_import_remotes.mdwn @@ -14,3 +14,42 @@ Currently: The borg special remote is one example of one where it's easy for the user to decide they're going to not delete old archives from it, and so want git-annex to trust it. + +Below is some docs I wrote for the borg special remote page, should be +moved there when this gets fixed. --[[Joey]] + +## trust levels, borg delete and borg prune + +git-annex will by default treat the borg special remote as untrusted, so +will not trust it to continue to contain a [[copy|copies]] of any annexed +file. This is necessary because you could run `borg delete` or `borg prune` +and remove the copy from the borg repository. If you choose to set the +trust level of the borg repository to a higher level, you need to avoid +using such commands with that borg repository. + +Consider this example: + + git-annex add annexedfile + borg create /path/to/borgrepo `pwd`::foo + git-annex sync borg + git-annex semitrust borg + git-annex drop annexedfile + +Now the only copy of annexedfile is in the borg repository. + + borg create /path/to/borgrepo `pwd`::bar + borg delete /path/to/borgrepo::foo + git-annex sync borg + git-annex whereis annexedfile + +Now no copies of annexfile remain, because the "foo" archive +in the borg repository was the only one to contain it, and it was deleted. + +So either keep the borg special remote as untrusted, and use such borg +commands to delete old archives as needed, or avoid using `borg delete` +and `borg prune`, and then the remote can safely be made semitrusted or +trusted. + +Also, if you do choose to delete old archives, make sure to never reuse +that archive name for a new archive. git-annex may think it's the same +archive it saw before, and not notice the change. From 523b7143e088b6605b14ec6cf99675f00bbbf60a Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 15:34:41 -0400 Subject: [PATCH 20/24] implemented checkPresentExportWithContentIdentifier --- Remote/Borg.hs | 51 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/Remote/Borg.hs b/Remote/Borg.hs index b1f5cd3921..13b1e3e3b3 100644 --- a/Remote/Borg.hs +++ b/Remote/Borg.hs @@ -128,6 +128,9 @@ borgSetup _ mu _ c _gc = do borgLocal :: BorgRepo -> Bool borgLocal = notElem ':' +borgArchive :: BorgRepo -> BorgArchiveName -> String +borgArchive r n = r ++ "::" ++ decodeBS' n + -- XXX the tree generated by using this does not seem to get grafted into -- the git-annex branch, so would be subject to being lost to GC. -- Is this a general problem affecting importtree too? @@ -139,7 +142,7 @@ listImportableContentsM u borgrepo = prompt $ do case M.lookup archivename imported of Just getfast -> return $ Left getfast Nothing -> Right <$> - let archive = borgrepo ++ "::" ++ decodeBS' archivename + let archive = borgArchive borgrepo archivename in withborglist archive "{size}{NUL}{path}{NUL}" $ liftIO . evaluate . force . parsefilelist archivename if all isLeft ls @@ -249,5 +252,49 @@ getImported u = M.unions <$> (mapM go . exportedTreeishes =<< getExport u) retrieveExportWithContentIdentifierM :: BorgRepo -> ImportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key retrieveExportWithContentIdentifierM borgrepo loc cid dest k p = error "TODO" +-- Check if the file is still there in the borg archive. +-- Does not check that the content is unchanged; we assume that +-- the content of files in borg archives does not change, which is normally +-- the case. But archives may be deleted, and files may be deleted. checkPresentExportWithContentIdentifierM :: BorgRepo -> Key -> ImportLocation -> [ContentIdentifier] -> Annex Bool -checkPresentExportWithContentIdentifierM borgrepo k loc cids = error "TODO" +checkPresentExportWithContentIdentifierM borgrepo k loc cids = liftIO $ do + let p = proc "borg" + [ "list" + , "--format" + , "1" + , borgArchive borgrepo archivename + , fromRawFilePath archivefile + ] + -- borg list exits nonzero with an error message if an archive + -- no longer exists. But, the user can delete archives at any + -- time they want. So, hide errors, and if it exists nonzero, + -- check if the borg repository still exists, and only throw an + -- exception if not. + (Nothing, Just h, Nothing, pid) <- withNullHandle $ \nullh -> + createProcess $ p + { std_out = CreatePipe + , std_err = UseHandle nullh + } + ok <- (== "1") <$> hGetContentsStrict h + hClose h + ifM (checkSuccessProcess pid) + ( return ok + , checkrepoexists + ) + where + (archivename, archivefile) = extractImportLocation loc + + checkrepoexists = do + let p = proc "borg" + [ "list" + , "--format" + , "1" + , borgrepo + ] + (Nothing, Nothing, Nothing, pid) <- withNullHandle $ \nullh -> + createProcess $ p + { std_out = UseHandle nullh } + ifM (checkSuccessProcess pid) + ( return False -- repo exists, content not in it + , giveup $ "Unable to access borg repository " ++ borgrepo + ) From df4942e179240a86a3db3ea1ce5ccbda0549f00f Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 15:45:06 -0400 Subject: [PATCH 21/24] notice when an archive that was seen before gets deleted --- Remote/Borg.hs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Remote/Borg.hs b/Remote/Borg.hs index 13b1e3e3b3..cd60cd5ea0 100644 --- a/Remote/Borg.hs +++ b/Remote/Borg.hs @@ -140,14 +140,14 @@ listImportableContentsM u borgrepo = prompt $ do ls <- withborglist borgrepo "{barchive}{NUL}" $ \as -> forM as $ \archivename -> case M.lookup archivename imported of - Just getfast -> return $ Left getfast + Just getfast -> return $ Left (archivename, getfast) Nothing -> Right <$> let archive = borgArchive borgrepo archivename in withborglist archive "{size}{NUL}{path}{NUL}" $ liftIO . evaluate . force . parsefilelist archivename - if all isLeft ls + if all isLeft ls && M.null (M.difference imported (M.fromList (lefts ls))) then return Nothing -- unchanged since last time, avoid work - else Just . mkimportablecontents <$> mapM (either id pure) ls + else Just . mkimportablecontents <$> mapM (either snd pure) ls where withborglist what format a = do let p = (proc "borg" ["list", what, "--format", format]) From a9d639c5b5374b600863469621304786ca9e6f64 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 15:48:17 -0400 Subject: [PATCH 22/24] borg can prompt --- Remote/Borg.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Remote/Borg.hs b/Remote/Borg.hs index cd60cd5ea0..460be074fe 100644 --- a/Remote/Borg.hs +++ b/Remote/Borg.hs @@ -257,7 +257,7 @@ retrieveExportWithContentIdentifierM borgrepo loc cid dest k p = error "TODO" -- the content of files in borg archives does not change, which is normally -- the case. But archives may be deleted, and files may be deleted. checkPresentExportWithContentIdentifierM :: BorgRepo -> Key -> ImportLocation -> [ContentIdentifier] -> Annex Bool -checkPresentExportWithContentIdentifierM borgrepo k loc cids = liftIO $ do +checkPresentExportWithContentIdentifierM borgrepo k loc cids = prompt $ liftIO $ do let p = proc "borg" [ "list" , "--format" From 4254e2297db006b912da22ac9776c41ee06be906 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 16:07:53 -0400 Subject: [PATCH 23/24] implement retrieveExportWithContentIdentifier Moved out an XXX to a todo This seems about ready to merge.. --- Remote/Borg.hs | 34 ++++++++++++++++++------ doc/todo/borg_sync_tree_not_grafted.mdwn | 5 ++++ 2 files changed, 31 insertions(+), 8 deletions(-) create mode 100644 doc/todo/borg_sync_tree_not_grafted.mdwn diff --git a/Remote/Borg.hs b/Remote/Borg.hs index 460be074fe..70a01f8ec1 100644 --- a/Remote/Borg.hs +++ b/Remote/Borg.hs @@ -17,6 +17,7 @@ import Git.Types (toTreeItemType, TreeItemType(..)) import Git.FilePath import Config import Config.Cost +import Annex.Tmp import Annex.SpecialRemote.Config import Remote.Helper.Special import Remote.Helper.ExportImport @@ -131,9 +132,6 @@ borgLocal = notElem ':' borgArchive :: BorgRepo -> BorgArchiveName -> String borgArchive r n = r ++ "::" ++ decodeBS' n --- XXX the tree generated by using this does not seem to get grafted into --- the git-annex branch, so would be subject to being lost to GC. --- Is this a general problem affecting importtree too? listImportableContentsM :: UUID -> BorgRepo -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize))) listImportableContentsM u borgrepo = prompt $ do imported <- getImported u @@ -172,7 +170,7 @@ listImportableContentsM u borgrepo = prompt $ do -- importable keys, so avoids needing to buffer all -- the rest of the files in memory. in case ThirdPartyPopulated.importKey' loc sz of - Just k -> (loc, (borgContentIdentifier, sz)) + Just _k -> (loc, (borgContentIdentifier, sz)) : parsefilelist archivename rest Nothing -> parsefilelist archivename rest parsefilelist _ _ = [] @@ -249,15 +247,12 @@ getImported u = M.unions <$> (mapM go . exportedTreeishes =<< getExport u) ) ) -retrieveExportWithContentIdentifierM :: BorgRepo -> ImportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key -retrieveExportWithContentIdentifierM borgrepo loc cid dest k p = error "TODO" - -- Check if the file is still there in the borg archive. -- Does not check that the content is unchanged; we assume that -- the content of files in borg archives does not change, which is normally -- the case. But archives may be deleted, and files may be deleted. checkPresentExportWithContentIdentifierM :: BorgRepo -> Key -> ImportLocation -> [ContentIdentifier] -> Annex Bool -checkPresentExportWithContentIdentifierM borgrepo k loc cids = prompt $ liftIO $ do +checkPresentExportWithContentIdentifierM borgrepo _ loc _ = prompt $ liftIO $ do let p = proc "borg" [ "list" , "--format" @@ -298,3 +293,26 @@ checkPresentExportWithContentIdentifierM borgrepo k loc cids = prompt $ liftIO $ ( return False -- repo exists, content not in it , giveup $ "Unable to access borg repository " ++ borgrepo ) + +retrieveExportWithContentIdentifierM :: BorgRepo -> ImportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key +retrieveExportWithContentIdentifierM borgrepo loc _ dest mkk _ = do + showOutput + prompt $ withOtherTmp $ \othertmp -> liftIO $ do + -- borgrepo could be relative, and borg has to be run + -- in the temp directory to get it to write there + absborgrepo <- fromRawFilePath <$> absPath (toRawFilePath borgrepo) + let p = proc "borg" + [ "extract" + , borgArchive absborgrepo archivename + , fromRawFilePath archivefile + ] + (Nothing, Nothing, Nothing, pid) <- createProcess $ p + { cwd = Just (fromRawFilePath othertmp) } + forceSuccessProcess p pid + -- Filepaths in borg archives are relative, so it's ok to + -- combine with + moveFile (fromRawFilePath othertmp fromRawFilePath archivefile) dest + removeDirectoryRecursive (fromRawFilePath othertmp) + mkk + where + (archivename, archivefile) = extractImportLocation loc diff --git a/doc/todo/borg_sync_tree_not_grafted.mdwn b/doc/todo/borg_sync_tree_not_grafted.mdwn new file mode 100644 index 0000000000..a4423706e1 --- /dev/null +++ b/doc/todo/borg_sync_tree_not_grafted.mdwn @@ -0,0 +1,5 @@ +The tree generated by git-annex sync with a borg remote +does not seem to get grafted into the git-annex branch, so +would be subject to being lost to GC. + +Is this a general problem affecting importtree too? From 2335476e1ed27b1b2b8db12df501b9f2e41f8f16 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 22 Dec 2020 16:19:02 -0400 Subject: [PATCH 24/24] todo --- doc/todo/sync_--content_with_borg_does_not_get_content.mdwn | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 doc/todo/sync_--content_with_borg_does_not_get_content.mdwn diff --git a/doc/todo/sync_--content_with_borg_does_not_get_content.mdwn b/doc/todo/sync_--content_with_borg_does_not_get_content.mdwn new file mode 100644 index 0000000000..a8755fd3d2 --- /dev/null +++ b/doc/todo/sync_--content_with_borg_does_not_get_content.mdwn @@ -0,0 +1,2 @@ +Subject says it all really, sync does not try to get content +from remotes that are thirdPartyPopulated yet.