ImportableContentsChunkable

This improves memory usage of the borg special remote by letting it
load only one archive's worth of filenames into memory at a time, and
build the larger tree up out of those chunks.
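
For reference, the new interface has roughly this shape, reconstructed
from its use in the diffs below (a sketch only; the exact definition
and field names in git-annex may differ in detail):

	data ImportableContentsChunkable m info
		= ImportableContentsComplete (ImportableContents info)
		-- ^ for remotes whose whole listing fits in memory
		| ImportableContentsChunked
			{ importableContentsChunk :: ImportableContentsChunk m info
			, importableHistoryComplete :: [ImportableContents info]
			}

	data ImportableContentsChunk m info = ImportableContentsChunk
		{ importableContentsSubDir :: ImportChunkSubDir
		-- ^ the subtree this chunk populates (for borg, the archive name)
		, importableContentsSubTree :: [(RawFilePath, info)]
		-- ^ the files within that subtree
		, importableContentsNextChunk :: m (Maybe (ImportableContentsChunk m info))
		-- ^ continuation producing the next chunk, if any
		}

	newtype ImportChunkSubDir = ImportChunkSubDir RawFilePath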

Previously, git-annex could easily OOM when a borg repository had many
archives. Now it uses memory proportional only to the number of annexed
keys in a single archive.

Minor implementation wart: each new chunk re-opens the content
identifier database, and a new vector clock is used for each chunk.
This is only a minor inefficiency; the use of continuations makes it
hard to avoid, although putting the database handle into a Reader monad
would be one way to fix it.
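
A hypothetical sketch of that Reader-based fix; the type and function
names here are placeholders, not git-annex API:

	import Control.Monad.IO.Class (liftIO)
	import Control.Monad.Reader (ReaderT, asks, runReaderT)

	-- Placeholders standing in for git-annex's real handle and clock.
	data CidDb = CidDb
	data VectorClock = VectorClock

	data ChunkEnv = ChunkEnv { chunkDb :: CidDb, chunkClock :: VectorClock }

	-- Chunk continuations would run in this monad, so they all share
	-- the handle and clock that were opened once up front.
	type ChunkM = ReaderT ChunkEnv IO

	withChunkEnv :: IO CidDb -> IO VectorClock -> ChunkM a -> IO a
	withChunkEnv opendb getclock act = do
		db <- opendb
		clock <- getclock
		runReaderT act (ChunkEnv db clock)

	-- A per-chunk action fetches the shared handle instead of
	-- re-opening the database.
	recordChunk :: (CidDb -> VectorClock -> IO ()) -> ChunkM ()
	recordChunk f = do
		db <- asks chunkDb
		clock <- asks chunkClock
		liftIO (f db clock)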

It may later be possible to extend the ImportableContentsChunkable
interface to remotes that are not third-party populated. However, that
would perhaps need an interface that does not use continuations.

The ImportableContentsChunkable interface currently does not allow
populating the top of the tree with anything other than subtrees. It
would be easy to extend it to allow putting files in that tree, but borg
doesn't need that, so I left it out for now.
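
For illustration, one hypothetical way the chunk type could later be
extended to carry top-of-tree files (not part of this commit):

	-- Hypothetical only; borg does not need this, since every file
	-- it imports lives under an archive-name subdirectory.
	data ImportableContentsChunk m info = ImportableContentsChunk
		{ importableContentsSubDir :: ImportChunkSubDir
		, importableContentsSubTree :: [(RawFilePath, info)]
		-- hypothetical extra field for files at the top of the tree:
		, importableContentsTopLevel :: [(RawFilePath, info)]
		, importableContentsNextChunk :: m (Maybe (ImportableContentsChunk m info))
		}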

Sponsored-by: Noam Kremen on Patreon
Joey Hess 2021-10-06 17:05:32 -04:00
parent 153f3600fb
commit 69f8e6c7c0
GPG key ID: DB12DB0FF05F8F38
13 changed files with 286 additions and 92 deletions

@@ -288,9 +288,10 @@ renameExportM serial adir _k old new = do
 		, File newloc
 		]
 
-listImportableContentsM :: AndroidSerial -> AndroidPath -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
+listImportableContentsM :: AndroidSerial -> AndroidPath -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
 listImportableContentsM serial adir = adbfind >>= \case
-	Just ls -> return $ Just $ ImportableContents (mapMaybe mk ls) []
+	Just ls -> return $ Just $ ImportableContentsComplete $
+		ImportableContents (mapMaybe mk ls) []
 	Nothing -> giveup "adb find failed"
   where
 	adbfind = adbShell serial

@@ -162,20 +162,21 @@ borgRepoLocalPath r@(BorgRepo p)
 	| borgLocal r && not (null p) = Just p
 	| otherwise = Nothing
 
-listImportableContentsM :: UUID -> BorgRepo -> ParsedRemoteConfig -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
+listImportableContentsM :: UUID -> BorgRepo -> ParsedRemoteConfig -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
 listImportableContentsM u borgrepo c = prompt $ do
 	imported <- getImported u
 	ls <- withborglist (locBorgRepo borgrepo) Nothing formatarchivelist $ \as ->
 		forM (filter (not . S.null) as) $ \archivename ->
-			case M.lookup archivename imported of
-				Just getfast -> return $ Left (archivename, getfast)
-				Nothing -> Right <$>
+			return $ case M.lookup archivename imported of
+				Just getlist -> Left (archivename, getlist)
+				Nothing ->
 					let archive = borgArchive borgrepo archivename
-					in withborglist archive subdir formatfilelist $
+					    getlist = withborglist archive subdir formatfilelist $
 						liftIO . evaluate . force . parsefilelist archivename
+					in Right (archivename, getlist)
 	if all isLeft ls && M.null (M.difference imported (M.fromList (lefts ls)))
 		then return Nothing -- unchanged since last time, avoid work
-		else Just . mkimportablecontents <$> mapM (either snd pure) ls
+		else Just <$> mkimportablecontents (map (either id id) ls)
   where
 	withborglist what addparam format a = do
 		environ <- liftIO getEnvironment
@@ -210,7 +211,7 @@ listImportableContentsM u borgrepo c = prompt $ do
 	parsefilelist archivename (bsz:f:extra:rest) = case readMaybe (fromRawFilePath bsz) of
 		Nothing -> parsefilelist archivename rest
 		Just sz ->
-			let loc = genImportLocation archivename f
+			let loc = genImportLocation f
 			-- borg list reports hard links as 0 byte files,
 			-- with the extra field set to " link to ".
 			-- When the annex object is a hard link to
@@ -234,12 +235,27 @@ listImportableContentsM u borgrepo c = prompt $ do
 	-- importableHistory is not used for retrieval, so is not
 	-- populated with old archives. Instead, a tree of archives
-	-- is constructed, by genImportLocation including the archive
-	-- name in the ImportLocation.
-	mkimportablecontents l = ImportableContents
-		{ importableContents = concat l
-		, importableHistory = []
-		}
+	-- is constructed, with a subtree for each archive.
+	mkimportablecontents [] = return $ ImportableContentsComplete $
+		ImportableContents
+			{ importableContents = []
+			, importableHistory = []
+			}
+	mkimportablecontents (l:ls) = ImportableContentsChunked
+		<$> mkimportablecontentschunk l ls
+		<*> pure []
+	mkimportablecontentschunk (archivename, getlist) rest = do
+		l <- getlist
+		return $ ImportableContentsChunk
+			{ importableContentsSubDir =
+				genImportChunkSubDir archivename
+			, importableContentsSubTree = l
+			, importableContentsNextChunk = case rest of
+				(getlist':rest') -> Just
+					<$> mkimportablecontentschunk getlist' rest'
+				[] -> return Nothing
+			}
 
 -- We do not need a ContentIdentifier in order to retrieve a file from
 -- borg; the ImportLocation contains all that's needed. So, this is left
@@ -247,15 +263,20 @@ listImportableContentsM u borgrepo c = prompt $ do
 borgContentIdentifier :: ContentIdentifier
 borgContentIdentifier = ContentIdentifier mempty
 
 -- Convert a path file a borg archive to a path that can be used as an
 -- ImportLocation. The archive name gets used as a subdirectory,
 -- which this path is inside.
 --
 -- Borg does not allow / in the name of an archive, so the archive
 -- name will always be the first directory in the ImportLocation.
 --
--- Paths in a borg archive are always relative, not absolute, so the use of
--- </> to combine the archive name with the path will always work.
-genImportLocation :: BorgArchiveName -> RawFilePath -> ImportLocation
-genImportLocation archivename p =
-	ThirdPartyPopulated.mkThirdPartyImportLocation $
-		archivename P.</> p
+-- This scheme also relies on the fact that paths in a borg archive are
+-- always relative, not absolute.
+genImportLocation :: RawFilePath -> RawFilePath
+genImportLocation = fromImportLocation . ThirdPartyPopulated.mkThirdPartyImportLocation
+
+genImportChunkSubDir :: BorgArchiveName -> ImportChunkSubDir
+genImportChunkSubDir = ImportChunkSubDir . fromImportLocation . ThirdPartyPopulated.mkThirdPartyImportLocation
 
 extractImportLocation :: ImportLocation -> (BorgArchiveName, RawFilePath)
 extractImportLocation loc = go $ P.splitDirectories $
@@ -269,7 +290,7 @@ extractImportLocation loc = go $ P.splitDirectories $
 -- last imported tree. And the contents of those archives can be retrieved
 -- by listing the subtree recursively, which will likely be quite a lot
 -- faster than running borg.
-getImported :: UUID -> Annex (M.Map BorgArchiveName (Annex [(ImportLocation, (ContentIdentifier, ByteSize))]))
+getImported :: UUID -> Annex (M.Map BorgArchiveName (Annex [(RawFilePath, (ContentIdentifier, ByteSize))]))
 getImported u = M.unions <$> (mapM go . exportedTreeishes =<< getExport u)
   where
 	go t = M.fromList . mapMaybe mk
@@ -278,21 +299,19 @@ getImported u = M.unions <$> (mapM go . exportedTreeishes =<< getExport u)
 	mk ti
 		| toTreeItemType (LsTree.mode ti) == Just TreeSubtree = Just
 			( getTopFilePath (LsTree.file ti)
-			, getcontents
-				(getTopFilePath (LsTree.file ti))
-				(LsTree.sha ti)
+			, getcontents (LsTree.sha ti)
 			)
 		| otherwise = Nothing
 
-	getcontents archivename t = mapMaybe (mkcontents archivename)
+	getcontents t = mapMaybe mkcontents
 		<$> inRepo (LsTree.lsTreeStrict LsTree.LsTreeRecursive (LsTree.LsTreeLong False) t)
 
-	mkcontents archivename ti = do
+	mkcontents ti = do
 		let f = ThirdPartyPopulated.fromThirdPartyImportLocation $
 			mkImportLocation $ getTopFilePath $ LsTree.file ti
 		k <- fileKey (P.takeFileName f)
 		return
-			( genImportLocation archivename f
+			( genImportLocation f
 			,
 			( borgContentIdentifier
 			-- defaulting to 0 size is ok, this size

@@ -351,11 +351,12 @@ removeExportLocation topdir loc =
 			mkExportLocation loc'
 		in go (upFrom loc') =<< tryIO (removeDirectory p)
 
-listImportableContentsM :: RawFilePath -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
+listImportableContentsM :: RawFilePath -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
 listImportableContentsM dir = liftIO $ do
 	l <- dirContentsRecursive (fromRawFilePath dir)
 	l' <- mapM (go . toRawFilePath) l
-	return $ Just $ ImportableContents (catMaybes l') []
+	return $ Just $ ImportableContentsComplete $
+		ImportableContents (catMaybes l') []
   where
 	go f = do
 		st <- R.getFileStatus f

@@ -47,10 +47,10 @@ fromThirdPartyImportLocation =
 -- find only those ImportLocations that are annex object files.
 -- All other ImportLocations are ignored.
 importKey :: ImportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> Annex (Maybe Key)
-importKey loc _cid sz _ = return $ importKey' loc (Just sz)
+importKey loc _cid sz _ = return $ importKey' (fromImportLocation loc) (Just sz)
 
-importKey' :: ImportLocation -> Maybe ByteSize -> Maybe Key
-importKey' loc msz = case fileKey f of
+importKey' :: RawFilePath -> Maybe ByteSize -> Maybe Key
+importKey' p msz = case fileKey f of
 	Just k
 		-- Annex objects always are in a subdirectory with the same
 		-- name as the filename. If this is not the case for the file
@@ -82,5 +82,4 @@ importKey' loc msz = case fileKey f of
 		_ -> Just k
 	Nothing -> Nothing
   where
-	p = fromImportLocation loc
 	f = P.takeFileName p

@@ -549,13 +549,15 @@ renameExportS3 hv r rs info k src dest = Just <$> go
 	srcobject = T.pack $ bucketExportLocation info src
 	dstobject = T.pack $ bucketExportLocation info dest
 
-listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
+listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
 listImportableContentsS3 hv r info =
 	withS3Handle hv $ \case
 		Nothing -> giveup $ needS3Creds (uuid r)
 		Just h -> Just <$> go h
   where
-	go h = liftIO $ runResourceT $ extractFromResourceT =<< startlist h
+	go h = do
+		ic <- liftIO $ runResourceT $ extractFromResourceT =<< startlist h
+		return (ImportableContentsComplete ic)
 
 	startlist h
 		| versioning info = do