Merge branch 'borgchunks'

This commit is contained in:
Joey Hess 2021-10-08 13:26:45 -04:00
commit 022bb6174c
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
14 changed files with 306 additions and 92 deletions

View file

@ -1,6 +1,6 @@
{- git-annex import from remotes
-
- Copyright 2019-2020 Joey Hess <id@joeyh.name>
- Copyright 2019-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -98,7 +98,7 @@ buildImportCommit
:: Remote
-> ImportTreeConfig
-> ImportCommitConfig
-> ImportableContents (Either Sha Key)
-> ImportableContentsChunkable Annex (Either Sha Key)
-> Annex (Maybe Ref)
buildImportCommit remote importtreeconfig importcommitconfig importable =
case importCommitTracking importcommitconfig of
@ -123,7 +123,7 @@ buildImportCommit remote importtreeconfig importcommitconfig importable =
recordImportTree
:: Remote
-> ImportTreeConfig
-> ImportableContents (Either Sha Key)
-> ImportableContentsChunkable Annex (Either Sha Key)
-> Annex (History Sha, Annex ())
recordImportTree remote importtreeconfig importable = do
imported@(History finaltree _) <- buildImportTrees basetree subdir importable
@ -264,25 +264,75 @@ buildImportCommit' remote importcommitconfig mtrackingcommit imported@(History t
buildImportTrees
:: Ref
-> Maybe TopFilePath
-> ImportableContents (Either Sha Key)
-> ImportableContentsChunkable Annex (Either Sha Key)
-> Annex (History Sha)
buildImportTrees basetree msubdir importable = History
<$> (buildtree (importableContents importable) =<< Annex.gitRepo)
<*> buildhistory
buildImportTrees basetree msubdir (ImportableContentsComplete importable) = do
repo <- Annex.gitRepo
withMkTreeHandle repo $ buildImportTrees' basetree msubdir importable
buildImportTrees basetree msubdir importable@(ImportableContentsChunked {}) = do
repo <- Annex.gitRepo
withMkTreeHandle repo $ \hdl ->
History
<$> go hdl
<*> buildImportTreesHistory basetree msubdir
(importableHistoryComplete importable) hdl
where
go hdl = do
tree <- gochunks [] (importableContentsChunk importable) hdl
importtree <- liftIO $ recordTree' hdl tree
graftImportTree basetree msubdir importtree hdl
gochunks l c hdl = do
let subdir = importChunkSubDir $ importableContentsSubDir c
-- Full directory prefix where the sub tree is located.
let fullprefix = asTopFilePath $ case msubdir of
Nothing -> subdir
Just d -> getTopFilePath d Posix.</> subdir
Tree ts <- convertImportTree (Just fullprefix) $
map (\(p, i) -> (mkImportLocation p, i))
(importableContentsSubTree c)
-- Record this subtree before getting next chunk, this
-- avoids buffering all the chunks into memory.
tc <- liftIO $ recordSubTree hdl $
NewSubTree (asTopFilePath subdir) ts
importableContentsNextChunk c >>= \case
Nothing -> return (Tree (tc:l))
Just c' -> gochunks (tc:l) c' hdl
buildImportTrees'
:: Ref
-> Maybe TopFilePath
-> ImportableContents (Either Sha Key)
-> MkTreeHandle
-> Annex (History Sha)
buildImportTrees' basetree msubdir importable hdl = History
<$> buildImportTree basetree msubdir (importableContents importable) hdl
<*> buildImportTreesHistory basetree msubdir (importableHistory importable) hdl
buildImportTree
:: Ref
-> Maybe TopFilePath
-> [(ImportLocation, Either Sha Key)]
-> MkTreeHandle
-> Annex Sha
buildImportTree basetree msubdir ls hdl = do
importtree <- liftIO . recordTree' hdl =<< convertImportTree msubdir ls
graftImportTree basetree msubdir importtree hdl
graftImportTree
:: Ref
-> Maybe TopFilePath
-> Sha
-> MkTreeHandle
-> Annex Sha
graftImportTree basetree msubdir tree hdl = case msubdir of
Nothing -> return tree
Just subdir -> inRepo $ \repo ->
graftTree' tree subdir basetree repo hdl
convertImportTree :: Maybe TopFilePath -> [(ImportLocation, Either Sha Key)] -> Annex Tree
convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls
where
buildhistory = S.fromList
<$> mapM (buildImportTrees basetree msubdir)
(importableHistory importable)
buildtree ls repo = withMkTreeHandle repo $ \hdl -> do
importtree <- liftIO . recordTree' hdl
. treeItemsToTree
=<< mapM mktreeitem ls
case msubdir of
Nothing -> return importtree
Just subdir -> liftIO $
graftTree' importtree subdir basetree repo hdl
mktreeitem (loc, v) = case v of
Right k -> do
relf <- fromRepo $ fromTopFilePath topf
@ -297,6 +347,15 @@ buildImportTrees basetree msubdir importable = History
topf = asTopFilePath $
maybe lf (\sd -> getTopFilePath sd P.</> lf) msubdir
buildImportTreesHistory
:: Ref
-> Maybe TopFilePath
-> [ImportableContents (Either Sha Key)]
-> MkTreeHandle
-> Annex (S.Set (History Sha))
buildImportTreesHistory basetree msubdir history hdl = S.fromList
<$> mapM (\ic -> buildImportTrees' basetree msubdir ic hdl) history
canImportKeys :: Remote -> Bool -> Bool
canImportKeys remote importcontent =
importcontent || isJust (Remote.importKey ia)
@ -324,8 +383,8 @@ importKeys
-> ImportTreeConfig
-> Bool
-> Bool
-> ImportableContents (ContentIdentifier, ByteSize)
-> Annex (Maybe (ImportableContents (Either Sha Key)))
-> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
-> Annex (Maybe (ImportableContentsChunkable Annex (Either Sha Key)))
importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
unless (canImportKeys remote importcontent) $
giveup "This remote does not support importing without downloading content."
@ -339,40 +398,82 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
-- When concurrency is enabled, this set is needed to
-- avoid two threads both importing the same content identifier.
importing <- liftIO $ newTVarIO S.empty
withExclusiveLock gitAnnexContentIdentifierLock $
bracket CIDDb.openDb CIDDb.closeDb $ \db -> do
CIDDb.needsUpdateFromLog db
>>= maybe noop (CIDDb.updateFromLog db)
(run (go False cidmap importing importablecontents db))
withciddb $ \db -> do
CIDDb.needsUpdateFromLog db
>>= maybe noop (CIDDb.updateFromLog db)
(prepclock (run cidmap importing db))
where
-- When not importing content, reuse the same vector
-- clock for all state that's recorded. This can save
-- a little bit of disk space. Individual file downloads
-- while downloading take too long for this optimisation
-- to be safe to do.
run a
prepclock a
| importcontent = a
| otherwise = reuseVectorClockWhile a
go oldversion cidmap importing (ImportableContents l h) db = do
withciddb = withExclusiveLock gitAnnexContentIdentifierLock .
bracket CIDDb.openDb CIDDb.closeDb
run cidmap importing db = do
largematcher <- largeFilesMatcher
case importablecontents of
ImportableContentsComplete ic ->
go False largematcher cidmap importing db ic >>= return . \case
Nothing -> Nothing
Just v -> Just $ ImportableContentsComplete v
ImportableContentsChunked {} -> do
c <- gochunked db (importableContentsChunk importablecontents)
gohistory largematcher cidmap importing db (importableHistoryComplete importablecontents) >>= return . \case
Nothing -> Nothing
Just h -> Just $ ImportableContentsChunked
{ importableContentsChunk = c
, importableHistoryComplete = h
}
go oldversion largematcher cidmap importing db (ImportableContents l h) = do
jobs <- forM l $ \i ->
if thirdpartypopulated
then thirdpartypopulatedimport cidmap db i
then Left <$> thirdpartypopulatedimport db i
else startimport cidmap importing db i oldversion largematcher
l' <- liftIO $ forM jobs $
either pure (atomically . takeTMVar)
if any isNothing l'
then return Nothing
else do
h' <- mapM (\ic -> go True cidmap importing ic db) h
if any isNothing h'
then return Nothing
else return $ Just $
ImportableContents
(catMaybes l')
(catMaybes h')
else gohistory largematcher cidmap importing db h >>= return . \case
Nothing -> Nothing
Just h' -> Just $ ImportableContents (catMaybes l') h'
gohistory largematcher cidmap importing db h = do
h' <- mapM (go True largematcher cidmap importing db) h
if any isNothing h'
then return Nothing
else return $ Just $ catMaybes h'
gochunked db c
-- Downloading cannot be done when chunked, since only
-- the first chunk is processed before returning.
| importcontent = error "importKeys does not support downloading chunked import"
-- Chunked import is currently only used by thirdpartypopulated
-- remotes.
| not thirdpartypopulated = error "importKeys does not support chunked import when not thirdpartypopulated"
| otherwise = do
l <- forM (importableContentsSubTree c) $ \(loc, i) -> do
let loc' = importableContentsChunkFullLocation (importableContentsSubDir c) loc
thirdpartypopulatedimport db (loc', i) >>= return . \case
Just (_loc, k) -> Just (loc, k)
Nothing -> Nothing
return $ ImportableContentsChunk
{ importableContentsSubDir = importableContentsSubDir c
, importableContentsSubTree = catMaybes l
, importableContentsNextChunk =
importableContentsNextChunk c >>= \case
Nothing -> return Nothing
Just c' -> withciddb $ \db' ->
prepclock $
Just <$> gochunked db' c'
}
waitstart importing cid = liftIO $ atomically $ do
s <- readTVar importing
if S.member cid s
@ -418,19 +519,19 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
importaction
return (Right job)
thirdpartypopulatedimport cidmap db (loc, (cid, sz)) =
thirdpartypopulatedimport db (loc, (cid, sz)) =
case Remote.importKey ia of
Nothing -> return $ Left Nothing
Nothing -> return Nothing
Just importkey ->
tryNonAsync (importkey loc cid sz nullMeterUpdate) >>= \case
Right (Just k) -> do
recordcidkey cidmap db cid k
logChange k (Remote.uuid remote) InfoPresent
return $ Left $ Just (loc, Right k)
Right Nothing -> return $ Left Nothing
recordcidkey' db cid k
logChange k (Remote.uuid remote) InfoPresent
return $ Just (loc, Right k)
Right Nothing -> return Nothing
Left e -> do
warning (show e)
return $ Left Nothing
return Nothing
importordownload cidmap db (loc, (cid, sz)) largematcher= do
f <- locworktreefile loc
@ -603,6 +704,8 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
recordcidkey cidmap db cid k = do
liftIO $ atomically $ modifyTVar' cidmap $
M.insert cid k
recordcidkey' db cid k
recordcidkey' db cid k = do
liftIO $ CIDDb.recordContentIdentifier db rs cid k
CIDLog.recordContentIdentifier rs cid k
@ -675,18 +778,38 @@ makeImportMatcher r = load preferredContentKeylessTokens >>= \case
- Throws exception if unable to contact the remote.
- Returns Nothing when there is no change since last time.
-}
getImportableContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> FileMatcher Annex -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
getImportableContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> FileMatcher Annex -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
getImportableContents r importtreeconfig ci matcher = do
Remote.listImportableContents (Remote.importActions r) >>= \case
Just importable -> do
dbhandle <- Export.openDb (Remote.uuid r)
Just <$> filterunwanted dbhandle importable
Just (ImportableContentsComplete ic) -> do
dbhandle <- opendbhandle
Just . ImportableContentsComplete
<$> filterunwanted dbhandle ic
Just (c@(ImportableContentsChunked {})) -> do
dbhandle <- opendbhandle
Just <$> filterunwantedchunked dbhandle c
Nothing -> return Nothing
where
filterunwanted dbhandle ic = ImportableContents
<$> filterM (wanted dbhandle) (importableContents ic)
<*> mapM (filterunwanted dbhandle) (importableHistory ic)
filterunwantedchunked dbhandle c = ImportableContentsChunked
<$> filterunwantedchunk dbhandle (importableContentsChunk c)
<*> mapM (filterunwanted dbhandle) (importableHistoryComplete c)
filterunwantedchunk dbhandle c = ImportableContentsChunk
<$> pure (importableContentsSubDir c)
<*> filterM (wantedunder dbhandle (importableContentsSubDir c))
(importableContentsSubTree c)
<*> pure (
importableContentsNextChunk c >>= \case
Nothing -> return Nothing
Just c' -> Just <$> filterunwantedchunk dbhandle c'
)
opendbhandle = Export.openDb (Remote.uuid r)
wanted dbhandle (loc, (_cid, sz))
| ingitdir = pure False
| otherwise =
@ -697,6 +820,9 @@ getImportableContents r importtreeconfig ci matcher = do
matches = matchesImportLocation matcher loc sz
isknown = isKnownImportLocation dbhandle loc
notignored = notIgnoredImportLocation importtreeconfig ci loc
wantedunder dbhandle root (loc, v) =
wanted dbhandle (importableContentsChunkFullLocation root loc, v)
isKnownImportLocation :: Export.ExportHandle -> ImportLocation -> Annex Bool
isKnownImportLocation dbhandle loc = liftIO $

View file

@ -6,6 +6,8 @@ git-annex (8.20210904) UNRELEASED; urgency=medium
* Bug fix: Git configs such as annex.verify were incorrectly overriding
per-remote git configs such as remote.name.annex-verify.
(Reversion in version 4.20130323)
* borg: Significantly improved memory use when a borg repository
contains many archives.
* borg: Avoid trying to extract xattrs, ACLS, and bsdflags when
retrieving from a borg repository.
* Sped up git-annex smudge --clean by 25%.
@ -15,7 +17,6 @@ git-annex (8.20210904) UNRELEASED; urgency=medium
incrementally verified, when used on NTFS and perhaps other filesystems.
* reinject: Fix crash when reinjecting a file from outside the repository.
(Reversion in version 8.20210621)
* borg: Some improvements to memory use when importing a lot of archives.
* Avoid cursor jitter when updating progress display.
-- Joey Hess <id@joeyh.name> Fri, 03 Sep 2021 12:02:55 -0400

View file

@ -346,7 +346,7 @@ seekRemote remote branch msubdir importcontent ci = do
fromtrackingbranch a = inRepo $ a (fromRemoteTrackingBranch tb)
listContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> TVar (Maybe (ImportableContents (ContentIdentifier, Remote.ByteSize))) -> CommandStart
listContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> TVar (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, Remote.ByteSize))) -> CommandStart
listContents remote importtreeconfig ci tvar = starting "list" ai si $
listContents' remote importtreeconfig ci $ \importable -> do
liftIO $ atomically $ writeTVar tvar importable
@ -355,7 +355,7 @@ listContents remote importtreeconfig ci tvar = starting "list" ai si $
ai = ActionItemOther (Just (Remote.name remote))
si = SeekInput []
listContents' :: Remote -> ImportTreeConfig -> CheckGitIgnore -> (Maybe (ImportableContents (ContentIdentifier, Remote.ByteSize)) -> Annex a) -> Annex a
listContents' :: Remote -> ImportTreeConfig -> CheckGitIgnore -> (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, Remote.ByteSize)) -> Annex a) -> Annex a
listContents' remote importtreeconfig ci a =
makeImportMatcher remote >>= \case
Right matcher -> tryNonAsync (getImportableContents remote importtreeconfig ci matcher) >>= \case
@ -368,7 +368,7 @@ listContents' remote importtreeconfig ci a =
, err
]
commitRemote :: Remote -> Branch -> RemoteTrackingBranch -> Maybe Sha -> ImportTreeConfig -> ImportCommitConfig -> ImportableContents (Either Sha Key) -> CommandStart
commitRemote :: Remote -> Branch -> RemoteTrackingBranch -> Maybe Sha -> ImportTreeConfig -> ImportCommitConfig -> ImportableContentsChunkable Annex (Either Sha Key) -> CommandStart
commitRemote remote branch tb trackingcommit importtreeconfig importcommitconfig importable =
starting "update" ai si $ do
importcommit <- buildImportCommit remote importtreeconfig importcommitconfig importable

View file

@ -13,6 +13,7 @@ module Git.Tree (
getTree,
recordTree,
recordTree',
recordSubTree,
TreeItem(..),
treeItemsToTree,
treeItemToLsTreeItem,
@ -21,6 +22,7 @@ module Git.Tree (
graftTree,
graftTree',
withMkTreeHandle,
MkTreeHandle,
treeMode,
) where

View file

@ -288,9 +288,10 @@ renameExportM serial adir _k old new = do
, File newloc
]
listImportableContentsM :: AndroidSerial -> AndroidPath -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
listImportableContentsM :: AndroidSerial -> AndroidPath -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsM serial adir = adbfind >>= \case
Just ls -> return $ Just $ ImportableContents (mapMaybe mk ls) []
Just ls -> return $ Just $ ImportableContentsComplete $
ImportableContents (mapMaybe mk ls) []
Nothing -> giveup "adb find failed"
where
adbfind = adbShell serial

View file

@ -162,20 +162,21 @@ borgRepoLocalPath r@(BorgRepo p)
| borgLocal r && not (null p) = Just p
| otherwise = Nothing
listImportableContentsM :: UUID -> BorgRepo -> ParsedRemoteConfig -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
listImportableContentsM :: UUID -> BorgRepo -> ParsedRemoteConfig -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsM u borgrepo c = prompt $ do
imported <- getImported u
ls <- withborglist (locBorgRepo borgrepo) Nothing formatarchivelist $ \as ->
forM (filter (not . S.null) as) $ \archivename ->
case M.lookup archivename imported of
Just getfast -> return $ Left (archivename, getfast)
Nothing -> Right <$>
return $ case M.lookup archivename imported of
Just getlist -> Left (archivename, getlist)
Nothing ->
let archive = borgArchive borgrepo archivename
in withborglist archive subdir formatfilelist $
getlist = withborglist archive subdir formatfilelist $
liftIO . evaluate . force . parsefilelist archivename
in Right (archivename, getlist)
if all isLeft ls && M.null (M.difference imported (M.fromList (lefts ls)))
then return Nothing -- unchanged since last time, avoid work
else Just . mkimportablecontents <$> mapM (either snd pure) ls
else Just <$> mkimportablecontents (map (either id id) ls)
where
withborglist what addparam format a = do
environ <- liftIO getEnvironment
@ -210,7 +211,7 @@ listImportableContentsM u borgrepo c = prompt $ do
parsefilelist archivename (bsz:f:extra:rest) = case readMaybe (fromRawFilePath bsz) of
Nothing -> parsefilelist archivename rest
Just sz ->
let loc = genImportLocation archivename f
let loc = genImportLocation f
-- borg list reports hard links as 0 byte files,
-- with the extra field set to " link to ".
-- When the annex object is a hard link to
@ -234,12 +235,27 @@ listImportableContentsM u borgrepo c = prompt $ do
-- importableHistory is not used for retrieval, so is not
-- populated with old archives. Instead, a tree of archives
-- is constructed, by genImportLocation including the archive
-- name in the ImportLocation.
mkimportablecontents l = ImportableContents
{ importableContents = concat l
, importableHistory = []
}
-- is constructed, with a subtree for each archive.
mkimportablecontents [] = return $ ImportableContentsComplete $
ImportableContents
{ importableContents = []
, importableHistory = []
}
mkimportablecontents (l:ls) = ImportableContentsChunked
<$> mkimportablecontentschunk l ls
<*> pure []
mkimportablecontentschunk (archivename, getlist) rest = do
l <- getlist
return $ ImportableContentsChunk
{ importableContentsSubDir =
genImportChunkSubDir archivename
, importableContentsSubTree = l
, importableContentsNextChunk = case rest of
(getlist':rest') -> Just
<$> mkimportablecontentschunk getlist' rest'
[] -> return Nothing
}
-- We do not need a ContentIdentifier in order to retrieve a file from
-- borg; the ImportLocation contains all that's needed. So, this is left
@ -247,15 +263,20 @@ listImportableContentsM u borgrepo c = prompt $ do
borgContentIdentifier :: ContentIdentifier
borgContentIdentifier = ContentIdentifier mempty
-- Convert a path file a borg archive to a path that can be used as an
-- ImportLocation. The archive name gets used as a subdirectory,
-- which this path is inside.
--
-- Borg does not allow / in the name of an archive, so the archive
-- name will always be the first directory in the ImportLocation.
--
-- Paths in a borg archive are always relative, not absolute, so the use of
-- </> to combine the archive name with the path will always work.
genImportLocation :: BorgArchiveName -> RawFilePath -> ImportLocation
genImportLocation archivename p =
ThirdPartyPopulated.mkThirdPartyImportLocation $
archivename P.</> p
-- This scheme also relies on the fact that paths in a borg archive are
-- always relative, not absolute.
genImportLocation :: RawFilePath -> RawFilePath
genImportLocation = fromImportLocation . ThirdPartyPopulated.mkThirdPartyImportLocation
genImportChunkSubDir :: BorgArchiveName -> ImportChunkSubDir
genImportChunkSubDir = ImportChunkSubDir . fromImportLocation . ThirdPartyPopulated.mkThirdPartyImportLocation
extractImportLocation :: ImportLocation -> (BorgArchiveName, RawFilePath)
extractImportLocation loc = go $ P.splitDirectories $
@ -269,7 +290,7 @@ extractImportLocation loc = go $ P.splitDirectories $
-- last imported tree. And the contents of those archives can be retrieved
-- by listing the subtree recursively, which will likely be quite a lot
-- faster than running borg.
getImported :: UUID -> Annex (M.Map BorgArchiveName (Annex [(ImportLocation, (ContentIdentifier, ByteSize))]))
getImported :: UUID -> Annex (M.Map BorgArchiveName (Annex [(RawFilePath, (ContentIdentifier, ByteSize))]))
getImported u = M.unions <$> (mapM go . exportedTreeishes =<< getExport u)
where
go t = M.fromList . mapMaybe mk
@ -278,21 +299,19 @@ getImported u = M.unions <$> (mapM go . exportedTreeishes =<< getExport u)
mk ti
| toTreeItemType (LsTree.mode ti) == Just TreeSubtree = Just
( getTopFilePath (LsTree.file ti)
, getcontents
(getTopFilePath (LsTree.file ti))
(LsTree.sha ti)
, getcontents (LsTree.sha ti)
)
| otherwise = Nothing
getcontents archivename t = mapMaybe (mkcontents archivename)
getcontents t = mapMaybe mkcontents
<$> inRepo (LsTree.lsTreeStrict LsTree.LsTreeRecursive (LsTree.LsTreeLong False) t)
mkcontents archivename ti = do
mkcontents ti = do
let f = ThirdPartyPopulated.fromThirdPartyImportLocation $
mkImportLocation $ getTopFilePath $ LsTree.file ti
k <- fileKey (P.takeFileName f)
return
( genImportLocation archivename f
( genImportLocation f
,
( borgContentIdentifier
-- defaulting to 0 size is ok, this size

View file

@ -351,11 +351,12 @@ removeExportLocation topdir loc =
mkExportLocation loc'
in go (upFrom loc') =<< tryIO (removeDirectory p)
listImportableContentsM :: RawFilePath -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
listImportableContentsM :: RawFilePath -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsM dir = liftIO $ do
l <- dirContentsRecursive (fromRawFilePath dir)
l' <- mapM (go . toRawFilePath) l
return $ Just $ ImportableContents (catMaybes l') []
return $ Just $ ImportableContentsComplete $
ImportableContents (catMaybes l') []
where
go f = do
st <- R.getFileStatus f

View file

@ -47,10 +47,10 @@ fromThirdPartyImportLocation =
-- find only those ImportLocations that are annex object files.
-- All other ImportLocations are ignored.
importKey :: ImportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> Annex (Maybe Key)
importKey loc _cid sz _ = return $ importKey' loc (Just sz)
importKey loc _cid sz _ = return $ importKey' (fromImportLocation loc) (Just sz)
importKey' :: ImportLocation -> Maybe ByteSize -> Maybe Key
importKey' loc msz = case fileKey f of
importKey' :: RawFilePath -> Maybe ByteSize -> Maybe Key
importKey' p msz = case fileKey f of
Just k
-- Annex objects always are in a subdirectory with the same
-- name as the filename. If this is not the case for the file
@ -82,5 +82,4 @@ importKey' loc msz = case fileKey f of
_ -> Just k
Nothing -> Nothing
where
p = fromImportLocation loc
f = P.takeFileName p

View file

@ -549,13 +549,15 @@ renameExportS3 hv r rs info k src dest = Just <$> go
srcobject = T.pack $ bucketExportLocation info src
dstobject = T.pack $ bucketExportLocation info dest
listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsS3 hv r info =
withS3Handle hv $ \case
Nothing -> giveup $ needS3Creds (uuid r)
Just h -> Just <$> go h
where
go h = liftIO $ runResourceT $ extractFromResourceT =<< startlist h
go h = do
ic <- liftIO $ runResourceT $ extractFromResourceT =<< startlist h
return (ImportableContentsComplete ic)
startlist h
| versioning info = do

View file

@ -1,6 +1,6 @@
{- git-annex import types
-
- Copyright 2019 Joey Hess <id@joeyh.name>
- Copyright 2019-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -13,6 +13,7 @@ import qualified Data.ByteString as S
import Data.Char
import Control.DeepSeq
import GHC.Generics
import qualified System.FilePath.Posix.ByteString as Posix
import Types.Export
import Utility.QuickCheck
@ -69,3 +70,34 @@ data ImportableContents info = ImportableContents
deriving (Show, Generic)
instance NFData info => NFData (ImportableContents info)
{- ImportableContents, but it can be chunked into subtrees to avoid
- all needing to fit in memory at the same time. -}
data ImportableContentsChunkable m info
= ImportableContentsComplete (ImportableContents info)
-- ^ Used when not chunking
| ImportableContentsChunked
{ importableContentsChunk :: ImportableContentsChunk m info
, importableHistoryComplete :: [ImportableContents info]
-- ^ Chunking the history is not supported
}
{- A chunk of ImportableContents, which is the entire content of a subtree
- of the main tree. Nested subtrees are not allowed. -}
data ImportableContentsChunk m info = ImportableContentsChunk
{ importableContentsSubDir :: ImportChunkSubDir
, importableContentsSubTree :: [(RawFilePath, info)]
-- ^ locations are relative to importableContentsSubDir
, importableContentsNextChunk :: m (Maybe (ImportableContentsChunk m info))
-- ^ Continuation to get the next chunk.
-- Returns Nothing when there are no more chunks.
}
newtype ImportChunkSubDir = ImportChunkSubDir { importChunkSubDir :: RawFilePath }
importableContentsChunkFullLocation
:: ImportChunkSubDir
-> RawFilePath
-> ImportLocation
importableContentsChunkFullLocation (ImportChunkSubDir root) loc =
mkImportLocation $ Posix.combine root loc

View file

@ -309,7 +309,7 @@ data ImportActions a = ImportActions
--
-- Throws exception on failure to access the remote.
-- May return Nothing when the remote is unchanged since last time.
{ listImportableContents :: a (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
{ listImportableContents :: a (Maybe (ImportableContentsChunkable a (ContentIdentifier, ByteSize)))
-- Generates a Key (of any type) for the file stored on the
-- remote at the ImportLocation. Does not download the file
-- from the remote.
@ -322,7 +322,7 @@ data ImportActions a = ImportActions
-- since the ContentIdentifier was generated.
--
-- When it returns nothing, the file at the ImportLocation
-- not by included in the imported tree.
-- will not be included in the imported tree.
--
-- When the remote is thirdPartyPopulated, this should check if the
-- file stored on the remote is the content of an annex object,

View file

@ -3,3 +3,5 @@ memory, then got OOM-killed.
I don't know if this is a memory leak or just trying to load too much, but it seems like this is a thing you should be able to do on
a machine with 64G of RAM.
> [[fixed|done]] --[[Joey]]

View file

@ -0,0 +1,9 @@
[[!comment format=mdwn
username="joey"
subject="""comment 12"""
date="2021-10-08T17:06:05Z"
content="""
I've fixed this problem, my test case tops out at 160 mb now, and adding more
archives to the borg repo no longer increases memory use. Memory use is now
proportional to the number of annexed objects in a borg archive.
"""]]

View file

@ -0,0 +1,20 @@
Borg uses an empty ContentIdentifier for everything; it does not need to
record anything. But that empty value gets stored in the log for each key
that is stored in borg. This unncessarily bloats the size of the git-annex
branch, by one content identifier per key stored in borg.
I think that it also slows down importing many archives from borg,
because for each of them it has to record the content identifier,
which is always the same, but still results in a db write.
Omitting storing any ContentIdentifier would break code such as
Remote.Helper.ExportImport's retrieveKeyFileFromImport.
If the borg Remote could indicate with a flag that it does not use
ContentIdentifiers, then code like that could pass it a null
ContentIdentifier without needing to read it from the db.
Annex.Import uses getContentIdentifierKeys, but only when it's not
thirdpartypopulated. So this change would not break that for borg,
but a clean way to handle that would be to make it also return a null
ContentIdentifier when the remote has the flag set. --[[Joey]]