avoid borg list of archives that have been listed before

This makes sync a lot faster in the common case where there's no new
backup.

There's still room for it to be faster. Currently the old imported tree
has to be traversed, to generate the ImportableContents. Which then
gets turned around to generate the new imported tree, which is
identical. So, it would be possible to just return a "no new imports",
or an ImportableContents that has a way to graft in a tree. The latter
is probably too far to go to optimise this, unless other things need it.
The former might be worth it, but it's already pretty fast, since git
ls-tree is pretty fast.
This commit is contained in:
Joey Hess 2020-12-22 14:06:40 -04:00
parent 06ef1b7d68
commit 5d8e4a7c74
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 98 additions and 29 deletions

View file

@ -12,6 +12,9 @@ import Types.Remote
import Types.Creds
import Types.Import
import qualified Git
import qualified Git.LsTree as LsTree
import Git.Types (toTreeItemType, TreeItemType(..))
import Git.FilePath
import Config
import Config.Cost
import Annex.SpecialRemote.Config
@ -21,6 +24,7 @@ import Annex.UUID
import Types.ProposedAccepted
import Utility.Metered
import qualified Remote.Helper.ThirdPartyPopulated as ThirdPartyPopulated
import Logs.Export
import Text.Read
import Control.Exception (evaluate)
@ -74,7 +78,7 @@ gen r u rc gc rs = do
, checkPresentCheap = borgLocal borgrepo
, exportActions = exportUnsupported
, importActions = ImportActions
{ listImportableContents = listImportableContentsM borgrepo
{ listImportableContents = listImportableContentsM u borgrepo
, importKey = Just ThirdPartyPopulated.importKey
, retrieveExportWithContentIdentifier = retrieveExportWithContentIdentifierM borgrepo
, checkPresentExportWithContentIdentifier = checkPresentExportWithContentIdentifierM borgrepo
@ -125,19 +129,21 @@ borgSetup _ mu _ c _gc = do
borgLocal :: BorgRepo -> Bool
borgLocal = notElem ':'
-- TODO avoid rescanning archives that have already been scanned
--
-- XXX the tree generated by using this does not seem to get grafted into
-- the git-annex branch, so would be subject to being lost to GC.
-- Is this a general problem affecting importtree too?
listImportableContentsM :: BorgRepo -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
listImportableContentsM borgrepo = prompt $ do
listImportableContentsM :: UUID -> BorgRepo -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
listImportableContentsM u borgrepo = prompt $ do
imported <- getImported u
ls <- withborglist borgrepo "{barchive}{NUL}" $ \as ->
forM as $ \archive ->
let archive' = borgrepo ++ "::" ++ decodeBS' archive
in withborglist archive' "{size}{NUL}{path}{NUL}" $
liftIO . evaluate . force . parsefilelist archive
return (Just (mkimportablecontents ls))
forM as $ \archivename ->
case M.lookup archivename imported of
Just getfast -> getfast
Nothing ->
let archive = borgrepo ++ "::" ++ decodeBS' archivename
in withborglist archive "{size}{NUL}{path}{NUL}" $
liftIO . evaluate . force . parsefilelist archivename
return $ Just $ mkimportablecontents ls
where
withborglist what format a = do
let p = (proc "borg" ["list", what, "--format", format])
@ -152,10 +158,10 @@ listImportableContentsM borgrepo = prompt $ do
forceSuccessProcess p pid
a l `finally` cleanup
parsefilelist archive (bsz:f:rest) = case readMaybe (fromRawFilePath bsz) of
Nothing -> parsefilelist archive rest
parsefilelist archivename (bsz:f:rest) = case readMaybe (fromRawFilePath bsz) of
Nothing -> parsefilelist archivename rest
Just sz ->
let loc = genImportLocation archive f
let loc = genImportLocation archivename f
-- This does a little unncessary work to parse the
-- key, which is then thrown away. But, it lets the
-- file list be shrank down to only the ones that are
@ -163,19 +169,25 @@ listImportableContentsM borgrepo = prompt $ do
-- the rest of the files in memory.
in case ThirdPartyPopulated.importKey' loc sz of
Just k -> (loc, (borgContentIdentifier, sz))
: parsefilelist archive rest
Nothing -> parsefilelist archive rest
: parsefilelist archivename rest
Nothing -> parsefilelist archivename rest
parsefilelist _ _ = []
-- importableHistory is not used for retrieval, so is not
-- populated with old archives. Instead, a tree of archives
-- is constructed, by genImportLocation including the archive
-- name in the ImportLocation.
mkimportablecontents (l) = ImportableContents
mkimportablecontents l = ImportableContents
{ importableContents = concat l
, importableHistory = []
}
-- We do not need a ContentIdentifier in order to retrieve a file from
-- borg; the ImportLocation contains all that's needed. So, this is left
-- empty.
borgContentIdentifier :: ContentIdentifier
borgContentIdentifier = ContentIdentifier mempty
-- Borg does not allow / in the name of an archive, so the archive
-- name will always be the first directory in the ImportLocation.
--
@ -193,11 +205,45 @@ extractImportLocation loc = go $ P.splitDirectories $
go (archivename:rest) = (archivename, P.joinPath rest)
go _ = giveup $ "Unable to parse import location " ++ fromRawFilePath (fromImportLocation loc)
-- We do not need a ContentIdentifier in order to retrieve a file from
-- borg; the ImportLocation contains all that's needed. So, this is left
-- empty.
borgContentIdentifier :: ContentIdentifier
borgContentIdentifier = ContentIdentifier mempty
-- Since the ImportLocation starts with the archive name, a list of all
-- archive names we've already imported can be found by just listing the
-- last imported tree. And the contents of those archives can be retrieved
-- by listing the subtree recursively, which will likely be quite a lot
-- faster than running borg.
getImported :: UUID -> Annex (M.Map BorgArchiveName (Annex [(ImportLocation, (ContentIdentifier, ByteSize))]))
getImported u = M.unions <$> (mapM go . exportedTreeishes =<< getExport u)
where
go t = M.fromList . mapMaybe mk
<$> inRepo (LsTree.lsTreeStrict LsTree.LsTreeNonRecursive t)
mk ti
| toTreeItemType (LsTree.mode ti) == Just TreeSubtree = Just
( getTopFilePath (LsTree.file ti)
, getcontents
(getTopFilePath (LsTree.file ti))
(LsTree.sha ti)
)
| otherwise = Nothing
getcontents archivename t = mapMaybe (mkcontents archivename)
<$> inRepo (LsTree.lsTreeStrict LsTree.LsTreeRecursive t)
mkcontents archivename ti = do
let f = ThirdPartyPopulated.fromThirdPartyImportLocation $
mkImportLocation $ getTopFilePath $ LsTree.file ti
k <- deserializeKey' (P.takeFileName f)
return
( genImportLocation archivename f
,
( borgContentIdentifier
-- defaulting to 0 size is ok, this size
-- only gets used by
-- ThirdPartyPopulated.importKey,
-- which ignores the size when the key
-- does not have a size.
, fromMaybe 0 (fromKey keySize k)
)
)
retrieveExportWithContentIdentifierM :: BorgRepo -> ImportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key
retrieveExportWithContentIdentifierM borgrepo loc cid dest k p = error "TODO"