diff --git a/Annex/Import.hs b/Annex/Import.hs
index 2d15c11b99..2e4275fa9a 100644
--- a/Annex/Import.hs
+++ b/Annex/Import.hs
@@ -1,6 +1,6 @@
 {- git-annex import from remotes
  -
- - Copyright 2019-2020 Joey Hess <id@joeyh.name>
+ - Copyright 2019-2021 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
@@ -98,7 +98,7 @@ buildImportCommit
 	:: Remote
 	-> ImportTreeConfig
 	-> ImportCommitConfig
-	-> ImportableContents (Either Sha Key)
+	-> ImportableContentsChunkable Annex (Either Sha Key)
 	-> Annex (Maybe Ref)
 buildImportCommit remote importtreeconfig importcommitconfig importable =
 	case importCommitTracking importcommitconfig of
@@ -123,7 +123,7 @@ buildImportCommit remote importtreeconfig importcommitconfig importable =
 recordImportTree
 	:: Remote
 	-> ImportTreeConfig
-	-> ImportableContents (Either Sha Key)
+	-> ImportableContentsChunkable Annex (Either Sha Key)
 	-> Annex (History Sha, Annex ())
 recordImportTree remote importtreeconfig importable = do
 	imported@(History finaltree _) <- buildImportTrees basetree subdir importable
@@ -264,25 +264,75 @@ buildImportCommit' remote importcommitconfig mtrackingcommit imported@(History t
 buildImportTrees
 	:: Ref
 	-> Maybe TopFilePath
-	-> ImportableContents (Either Sha Key)
+	-> ImportableContentsChunkable Annex (Either Sha Key)
 	-> Annex (History Sha)
-buildImportTrees basetree msubdir importable = History
-	<$> (buildtree (importableContents importable) =<< Annex.gitRepo)
-	<*> buildhistory
+buildImportTrees basetree msubdir (ImportableContentsComplete importable) = do
+	repo <- Annex.gitRepo
+	withMkTreeHandle repo $ buildImportTrees' basetree msubdir importable
+buildImportTrees basetree msubdir importable@(ImportableContentsChunked {}) = do
+	repo <- Annex.gitRepo
+	withMkTreeHandle repo $ \hdl ->
+		History
+			<$> go hdl
+			<*> buildImportTreesHistory basetree msubdir
+				(importableHistoryComplete importable) hdl
+  where
+	go hdl = do
+		tree <- gochunks [] (importableContentsChunk importable) hdl
+		importtree <- liftIO $ recordTree' hdl tree
+		graftImportTree basetree msubdir importtree hdl
+
+	gochunks l c hdl = do
+		let subdir = importChunkSubDir $ importableContentsSubDir c
+		-- Full directory prefix where the sub tree is located.
+		let fullprefix = asTopFilePath $ case msubdir of
+			Nothing -> subdir
+			Just d -> getTopFilePath d Posix.</> subdir
+		Tree ts <- convertImportTree (Just fullprefix) $
+			map (\(p, i) -> (mkImportLocation p, i))
+				(importableContentsSubTree c)
+		-- Record this subtree before getting the next chunk;
+		-- this avoids buffering all the chunks into memory.
+		tc <- liftIO $ recordSubTree hdl $
+			NewSubTree (asTopFilePath subdir) ts
+		importableContentsNextChunk c >>= \case
+			Nothing -> return (Tree (tc:l))
+			Just c' -> gochunks (tc:l) c' hdl
+
+buildImportTrees'
+	:: Ref
+	-> Maybe TopFilePath
+	-> ImportableContents (Either Sha Key)
+	-> MkTreeHandle
+	-> Annex (History Sha)
+buildImportTrees' basetree msubdir importable hdl = History
+	<$> buildImportTree basetree msubdir (importableContents importable) hdl
+	<*> buildImportTreesHistory basetree msubdir (importableHistory importable) hdl
+
+buildImportTree
+	:: Ref
+	-> Maybe TopFilePath
+	-> [(ImportLocation, Either Sha Key)]
+	-> MkTreeHandle
+	-> Annex Sha
+buildImportTree basetree msubdir ls hdl = do
+	importtree <- liftIO . recordTree' hdl =<< convertImportTree msubdir ls
+	graftImportTree basetree msubdir importtree hdl
+
+graftImportTree
+	:: Ref
+	-> Maybe TopFilePath
+	-> Sha
+	-> MkTreeHandle
+	-> Annex Sha
+graftImportTree basetree msubdir tree hdl = case msubdir of
+	Nothing -> return tree
+	Just subdir -> inRepo $ \repo ->
+		graftTree' tree subdir basetree repo hdl
+
+convertImportTree :: Maybe TopFilePath -> [(ImportLocation, Either Sha Key)] -> Annex Tree
+convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls
   where
-	buildhistory = S.fromList
-		<$> mapM (buildImportTrees basetree msubdir)
-			(importableHistory importable)
-
-	buildtree ls repo = withMkTreeHandle repo $ \hdl -> do
-		importtree <- liftIO . recordTree' hdl
-			. treeItemsToTree
-			=<< mapM mktreeitem ls
-		case msubdir of
-			Nothing -> return importtree
-			Just subdir -> liftIO $
-				graftTree' importtree subdir basetree repo hdl
-
 	mktreeitem (loc, v) = case v of
 		Right k -> do
 			relf <- fromRepo $ fromTopFilePath topf
@@ -297,6 +347,15 @@ buildImportTrees basetree msubdir importable = History
 	topf = asTopFilePath $
 		maybe lf (\sd -> getTopFilePath sd P.</> lf) msubdir
 
+buildImportTreesHistory
+	:: Ref
+	-> Maybe TopFilePath
+	-> [ImportableContents (Either Sha Key)]
+	-> MkTreeHandle
+	-> Annex (S.Set (History Sha))
+buildImportTreesHistory basetree msubdir history hdl = S.fromList
+	<$> mapM (\ic -> buildImportTrees' basetree msubdir ic hdl) history
+
 canImportKeys :: Remote -> Bool -> Bool
 canImportKeys remote importcontent =
 	importcontent || isJust (Remote.importKey ia)
@@ -324,8 +383,8 @@ importKeys
 	-> ImportTreeConfig
 	-> Bool
 	-> Bool
-	-> ImportableContents (ContentIdentifier, ByteSize)
-	-> Annex (Maybe (ImportableContents (Either Sha Key)))
+	-> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
+	-> Annex (Maybe (ImportableContentsChunkable Annex (Either Sha Key)))
 importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
 	unless (canImportKeys remote importcontent) $
 		giveup "This remote does not support importing without downloading content."
@@ -339,40 +398,82 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
 	-- When concurrency is enabled, this set is needed to
 	-- avoid two threads both importing the same content identifier.
 	importing <- liftIO $ newTVarIO S.empty
-	withExclusiveLock gitAnnexContentIdentifierLock $
-		bracket CIDDb.openDb CIDDb.closeDb $ \db -> do
-			CIDDb.needsUpdateFromLog db
-				>>= maybe noop (CIDDb.updateFromLog db)
-			(run (go False cidmap importing importablecontents db))
+	withciddb $ \db -> do
+		CIDDb.needsUpdateFromLog db
+			>>= maybe noop (CIDDb.updateFromLog db)
+		(prepclock (run cidmap importing db))
  where
	-- When not importing content, reuse the same vector
	-- clock for all state that's recorded. This can save
	-- a little bit of disk space. Individual file downloads
	-- while downloading take too long for this optimisation
	-- to be safe to do.
-	run a
+	prepclock a
 		| importcontent = a
 		| otherwise = reuseVectorClockWhile a
 
-	go oldversion cidmap importing (ImportableContents l h) db = do
+	withciddb = withExclusiveLock gitAnnexContentIdentifierLock .
+		bracket CIDDb.openDb CIDDb.closeDb
+
+	run cidmap importing db = do
 		largematcher <- largeFilesMatcher
+		case importablecontents of
+			ImportableContentsComplete ic ->
+				go False largematcher cidmap importing db ic >>= return . \case
+					Nothing -> Nothing
+					Just v -> Just $ ImportableContentsComplete v
+			ImportableContentsChunked {} -> do
+				c <- gochunked db (importableContentsChunk importablecontents)
+				gohistory largematcher cidmap importing db (importableHistoryComplete importablecontents) >>= return . \case
+					Nothing -> Nothing
+					Just h -> Just $ ImportableContentsChunked
+						{ importableContentsChunk = c
+						, importableHistoryComplete = h
+						}
+
+	go oldversion largematcher cidmap importing db (ImportableContents l h) = do
 		jobs <- forM l $ \i ->
 			if thirdpartypopulated
-				then thirdpartypopulatedimport cidmap db i
+				then Left <$> thirdpartypopulatedimport db i
 				else startimport cidmap importing db i oldversion largematcher
 		l' <- liftIO $ forM jobs $
 			either pure (atomically . takeTMVar)
 		if any isNothing l'
 			then return Nothing
-			else do
-				h' <- mapM (\ic -> go True cidmap importing ic db) h
-				if any isNothing h'
-					then return Nothing
-					else return $ Just $
-						ImportableContents
-							(catMaybes l')
-							(catMaybes h')
+			else gohistory largematcher cidmap importing db h >>= return . \case
+				Nothing -> Nothing
+				Just h' -> Just $ ImportableContents (catMaybes l') h'
 
+	gohistory largematcher cidmap importing db h = do
+		h' <- mapM (go True largematcher cidmap importing db) h
+		if any isNothing h'
+			then return Nothing
+			else return $ Just $ catMaybes h'
+
+	gochunked db c
+		-- Downloading cannot be done when chunked, since only
+		-- the first chunk is processed before returning.
+		| importcontent = error "importKeys does not support downloading chunked import"
+		-- Chunked import is currently only used by thirdpartypopulated
+		-- remotes.
+		| not thirdpartypopulated = error "importKeys does not support chunked import when not thirdpartypopulated"
+		| otherwise = do
+			l <- forM (importableContentsSubTree c) $ \(loc, i) -> do
+				let loc' = importableContentsChunkFullLocation (importableContentsSubDir c) loc
+				thirdpartypopulatedimport db (loc', i) >>= return . \case
+					Just (_loc, k) -> Just (loc, k)
+					Nothing -> Nothing
+			return $ ImportableContentsChunk
+				{ importableContentsSubDir = importableContentsSubDir c
+				, importableContentsSubTree = catMaybes l
+				, importableContentsNextChunk =
+					importableContentsNextChunk c >>= \case
+						Nothing -> return Nothing
+						Just c' -> withciddb $ \db' ->
+							prepclock $
+								Just <$> gochunked db' c'
+				}
+
 	waitstart importing cid = liftIO $ atomically $ do
 		s <- readTVar importing
 		if S.member cid s
@@ -418,19 +519,19 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
 				importaction
 		return (Right job)
 
-	thirdpartypopulatedimport cidmap db (loc, (cid, sz)) =
+	thirdpartypopulatedimport db (loc, (cid, sz)) =
 		case Remote.importKey ia of
-			Nothing -> return $ Left Nothing
+			Nothing -> return Nothing
 			Just importkey ->
 				tryNonAsync (importkey loc cid sz nullMeterUpdate) >>= \case
 					Right (Just k) -> do
-						recordcidkey cidmap db cid k
-						logChange k (Remote.uuid remote) InfoPresent
-						return $ Left $ Just (loc, Right k)
-					Right Nothing -> return $ Left Nothing
+						recordcidkey' db cid k
+						logChange k (Remote.uuid remote) InfoPresent
+						return $ Just (loc, Right k)
+					Right Nothing -> return Nothing
 					Left e -> do
 						warning (show e)
-						return $ Left Nothing
+						return Nothing
 
 	importordownload cidmap db (loc, (cid, sz)) largematcher= do
 		f <- locworktreefile loc
@@ -603,6 +704,8 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
 	recordcidkey cidmap db cid k = do
 		liftIO $ atomically $ modifyTVar' cidmap $
 			M.insert cid k
+		recordcidkey' db cid k
+
+	recordcidkey' db cid k = do
 		liftIO $ CIDDb.recordContentIdentifier db rs cid k
 		CIDLog.recordContentIdentifier rs cid k
@@ -675,18 +778,38 @@ makeImportMatcher r = load preferredContentKeylessTokens >>= \case
  - Throws exception if unable to contact the remote.
  - Returns Nothing when there is no change since last time.
  -}
-getImportableContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> FileMatcher Annex -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
+getImportableContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> FileMatcher Annex -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
 getImportableContents r importtreeconfig ci matcher = do
 	Remote.listImportableContents (Remote.importActions r) >>= \case
-		Just importable -> do
-			dbhandle <- Export.openDb (Remote.uuid r)
-			Just <$> filterunwanted dbhandle importable
+		Just (ImportableContentsComplete ic) -> do
+			dbhandle <- opendbhandle
+			Just . ImportableContentsComplete
+				<$> filterunwanted dbhandle ic
+		Just (c@(ImportableContentsChunked {})) -> do
+			dbhandle <- opendbhandle
+			Just <$> filterunwantedchunked dbhandle c
 		Nothing -> return Nothing
  where
	filterunwanted dbhandle ic = ImportableContents
		<$> filterM (wanted dbhandle) (importableContents ic)
		<*> mapM (filterunwanted dbhandle) (importableHistory ic)
 
+	filterunwantedchunked dbhandle c = ImportableContentsChunked
+		<$> filterunwantedchunk dbhandle (importableContentsChunk c)
+		<*> mapM (filterunwanted dbhandle) (importableHistoryComplete c)
+
+	filterunwantedchunk dbhandle c = ImportableContentsChunk
+		<$> pure (importableContentsSubDir c)
+		<*> filterM (wantedunder dbhandle (importableContentsSubDir c))
+			(importableContentsSubTree c)
+		<*> pure (
+			importableContentsNextChunk c >>= \case
+				Nothing -> return Nothing
+				Just c' -> Just <$> filterunwantedchunk dbhandle c'
+			)
+
+	opendbhandle = Export.openDb (Remote.uuid r)
+
 	wanted dbhandle (loc, (_cid, sz))
 		| ingitdir = pure False
 		| otherwise =
@@ -697,6 +820,9 @@ getImportableContents r importtreeconfig ci matcher = do
 		matches = matchesImportLocation matcher loc sz
 		isknown = isKnownImportLocation dbhandle loc
 		notignored = notIgnoredImportLocation importtreeconfig ci loc
+
+	wantedunder dbhandle root (loc, v) =
+		wanted dbhandle (importableContentsChunkFullLocation root loc, v)
 
 isKnownImportLocation :: Export.ExportHandle -> ImportLocation -> Annex Bool
 isKnownImportLocation dbhandle loc = liftIO $
diff --git a/CHANGELOG b/CHANGELOG
index a66d4524d1..6c66eaf91f 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,6 +6,8 @@ git-annex (8.20210904) UNRELEASED; urgency=medium
   * Bug fix: Git configs such as annex.verify were incorrectly overriding
     per-remote git configs such as remote.name.annex-verify.
     (Reversion in version 4.20130323)
+  * borg: Significantly improved memory use when a borg repository
+    contains many archives.
   * borg: Avoid trying to extract xattrs, ACLS, and bsdflags when
     retrieving from a borg repository.
   * Sped up git-annex smudge --clean by 25%.
@@ -15,7 +17,6 @@ git-annex (8.20210904) UNRELEASED; urgency=medium
     incrementally verified, when used on NTFS and perhaps other
     filesystems.
   * reinject: Fix crash when reinjecting a file from outside the repository.
     (Reversion in version 8.20210621)
-  * borg: Some improvements to memory use when importing a lot of archives.
   * Avoid cursor jitter when updating progress display.
 
  -- Joey Hess <id@joeyh.name>  Fri, 03 Sep 2021 12:02:55 -0400
diff --git a/Command/Import.hs b/Command/Import.hs
index f31cdb18fb..e0a96643c1 100644
--- a/Command/Import.hs
+++ b/Command/Import.hs
@@ -346,7 +346,7 @@ seekRemote remote branch msubdir importcontent ci = do
 	fromtrackingbranch a = inRepo $ a (fromRemoteTrackingBranch tb)
 
-listContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> TVar (Maybe (ImportableContents (ContentIdentifier, Remote.ByteSize))) -> CommandStart
+listContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> TVar (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, Remote.ByteSize))) -> CommandStart
 listContents remote importtreeconfig ci tvar = starting "list" ai si $
 	listContents' remote importtreeconfig ci $ \importable -> do
 		liftIO $ atomically $ writeTVar tvar importable
@@ -355,7 +355,7 @@ listContents remote importtreeconfig ci tvar = starting "list" ai si $
 	ai = ActionItemOther (Just (Remote.name remote))
 	si = SeekInput []
 
-listContents' :: Remote -> ImportTreeConfig -> CheckGitIgnore -> (Maybe (ImportableContents (ContentIdentifier, Remote.ByteSize)) -> Annex a) -> Annex a
+listContents' :: Remote -> ImportTreeConfig -> CheckGitIgnore -> (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, Remote.ByteSize)) -> Annex a) -> Annex a
 listContents' remote importtreeconfig ci a =
 	makeImportMatcher remote >>= \case
 		Right matcher -> tryNonAsync (getImportableContents remote importtreeconfig ci matcher) >>= \case
@@ -368,7 +368,7 @@ listContents' remote importtreeconfig ci a =
 			, err
 			]
 
-commitRemote :: Remote -> Branch -> RemoteTrackingBranch -> Maybe Sha -> ImportTreeConfig -> ImportCommitConfig -> ImportableContents (Either Sha Key) -> CommandStart
+commitRemote :: Remote -> Branch -> RemoteTrackingBranch -> Maybe Sha -> ImportTreeConfig -> ImportCommitConfig -> ImportableContentsChunkable Annex (Either Sha Key) -> CommandStart
 commitRemote remote branch tb trackingcommit importtreeconfig importcommitconfig importable =
 	starting "update" ai si $ do
 		importcommit <- buildImportCommit remote importtreeconfig importcommitconfig importable
diff --git a/Git/Tree.hs b/Git/Tree.hs
index 4c48c75da3..48ed126cfe 100644
--- a/Git/Tree.hs
+++ b/Git/Tree.hs
@@ -13,6 +13,7 @@ module Git.Tree (
 	getTree,
 	recordTree,
 	recordTree',
+	recordSubTree,
 	TreeItem(..),
 	treeItemsToTree,
 	treeItemToLsTreeItem,
@@ -21,6 +22,7 @@ module Git.Tree (
 	graftTree,
 	graftTree',
 	withMkTreeHandle,
+	MkTreeHandle,
 	treeMode,
 ) where
diff --git a/Remote/Adb.hs b/Remote/Adb.hs
index 5f07fe8aa7..060366a9fb 100644
--- a/Remote/Adb.hs
+++ b/Remote/Adb.hs
@@ -288,9 +288,10 @@ renameExportM serial adir _k old new = do
 		, File newloc
 		]
 
-listImportableContentsM :: AndroidSerial -> AndroidPath -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
+listImportableContentsM :: AndroidSerial -> AndroidPath -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
 listImportableContentsM serial adir = adbfind >>= \case
-	Just ls -> return $ Just $ ImportableContents (mapMaybe mk ls) []
+	Just ls -> return $ Just $ ImportableContentsComplete $
+		ImportableContents (mapMaybe mk ls) []
 	Nothing -> giveup "adb find failed"
  where
	adbfind = adbShell serial
diff --git a/Remote/Borg.hs b/Remote/Borg.hs
index 86db4bf21c..af3a010703 100644
--- a/Remote/Borg.hs
+++ b/Remote/Borg.hs
@@ -162,20 +162,21 @@ borgRepoLocalPath r@(BorgRepo p)
 	| borgLocal r && not (null p) = Just p
 	| otherwise = Nothing
 
-listImportableContentsM :: UUID -> BorgRepo -> ParsedRemoteConfig -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
+listImportableContentsM :: UUID -> BorgRepo -> ParsedRemoteConfig -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
 listImportableContentsM u borgrepo c = prompt $ do
 	imported <- getImported u
 	ls <- withborglist (locBorgRepo borgrepo) Nothing formatarchivelist $ \as ->
 		forM (filter (not . S.null) as) $ \archivename ->
-			case M.lookup archivename imported of
-				Just getfast -> return $ Left (archivename, getfast)
-				Nothing -> Right <$>
+			return $ case M.lookup archivename imported of
+				Just getlist -> Left (archivename, getlist)
+				Nothing ->
 					let archive = borgArchive borgrepo archivename
-					in withborglist archive subdir formatfilelist $
+					    getlist = withborglist archive subdir formatfilelist $
 						liftIO . evaluate . force . parsefilelist archivename
+					in Right (archivename, getlist)
 	if all isLeft ls && M.null (M.difference imported (M.fromList (lefts ls)))
 		then return Nothing -- unchanged since last time, avoid work
-		else Just . mkimportablecontents <$> mapM (either snd pure) ls
+		else Just <$> mkimportablecontents (map (either id id) ls)
  where
	withborglist what addparam format a = do
		environ <- liftIO getEnvironment
@@ -210,7 +211,7 @@ listImportableContentsM u borgrepo c = prompt $ do
 	parsefilelist archivename (bsz:f:extra:rest) = case readMaybe (fromRawFilePath bsz) of
 		Nothing -> parsefilelist archivename rest
 		Just sz ->
-			let loc = genImportLocation archivename f
+			let loc = genImportLocation f
 			-- borg list reports hard links as 0 byte files,
 			-- with the extra field set to " link to ".
 			-- When the annex object is a hard link to
@@ -234,12 +235,27 @@ listImportableContentsM u borgrepo c = prompt $ do
 	-- importableHistory is not used for retrieval, so is not
 	-- populated with old archives. Instead, a tree of archives
-	-- is constructed, by genImportLocation including the archive
-	-- name in the ImportLocation.
-	mkimportablecontents l = ImportableContents
-		{ importableContents = concat l
-		, importableHistory = []
-		}
+	-- is constructed, with a subtree for each archive.
+	mkimportablecontents [] = return $ ImportableContentsComplete $
+		ImportableContents
+			{ importableContents = []
+			, importableHistory = []
+			}
+	mkimportablecontents (l:ls) = ImportableContentsChunked
+		<$> mkimportablecontentschunk l ls
+		<*> pure []
+
+	mkimportablecontentschunk (archivename, getlist) rest = do
+		l <- getlist
+		return $ ImportableContentsChunk
+			{ importableContentsSubDir =
+				genImportChunkSubDir archivename
+			, importableContentsSubTree = l
+			, importableContentsNextChunk = case rest of
+				(getlist':rest') -> Just
+					<$> mkimportablecontentschunk getlist' rest'
+				[] -> return Nothing
+			}
 
 -- We do not need a ContentIdentifier in order to retrieve a file from
 -- borg; the ImportLocation contains all that's needed. So, this is left
@@ -247,15 +263,20 @@ borgContentIdentifier :: ContentIdentifier
 borgContentIdentifier = ContentIdentifier mempty
 
+-- Convert the path of a file in a borg archive to a path that can be
+-- used as an ImportLocation. The archive name gets used as a
+-- subdirectory, which this path is inside.
+--
 -- Borg does not allow / in the name of an archive, so the archive
 -- name will always be the first directory in the ImportLocation.
 --
--- Paths in a borg archive are always relative, not absolute, so the use of
--- </> to combine the archive name with the path will always work.
-genImportLocation :: BorgArchiveName -> RawFilePath -> ImportLocation
-genImportLocation archivename p =
-	ThirdPartyPopulated.mkThirdPartyImportLocation $
-		archivename P.</> p
+-- This scheme also relies on the fact that paths in a borg archive are
+-- always relative, not absolute.
+genImportLocation :: RawFilePath -> RawFilePath
+genImportLocation = fromImportLocation . ThirdPartyPopulated.mkThirdPartyImportLocation
+
+genImportChunkSubDir :: BorgArchiveName -> ImportChunkSubDir
+genImportChunkSubDir = ImportChunkSubDir . fromImportLocation . ThirdPartyPopulated.mkThirdPartyImportLocation
 
 extractImportLocation :: ImportLocation -> (BorgArchiveName, RawFilePath)
 extractImportLocation loc = go $ P.splitDirectories $
@@ -269,7 +290,7 @@ extractImportLocation loc = go $ P.splitDirectories $
 -- last imported tree. And the contents of those archives can be retrieved
 -- by listing the subtree recursively, which will likely be quite a lot
 -- faster than running borg.
-getImported :: UUID -> Annex (M.Map BorgArchiveName (Annex [(ImportLocation, (ContentIdentifier, ByteSize))]))
+getImported :: UUID -> Annex (M.Map BorgArchiveName (Annex [(RawFilePath, (ContentIdentifier, ByteSize))]))
 getImported u = M.unions <$> (mapM go . exportedTreeishes =<< getExport u)
  where
	go t = M.fromList . mapMaybe mk
@@ -278,21 +299,19 @@ getImported u = M.unions <$> (mapM go . exportedTreeishes =<< getExport u)
 	mk ti
 		| toTreeItemType (LsTree.mode ti) == Just TreeSubtree = Just
 			( getTopFilePath (LsTree.file ti)
-			, getcontents
-				(getTopFilePath (LsTree.file ti))
-				(LsTree.sha ti)
+			, getcontents (LsTree.sha ti)
 			)
 		| otherwise = Nothing
 
-	getcontents archivename t = mapMaybe (mkcontents archivename)
+	getcontents t = mapMaybe mkcontents
 		<$> inRepo (LsTree.lsTreeStrict LsTree.LsTreeRecursive (LsTree.LsTreeLong False) t)
 
-	mkcontents archivename ti = do
+	mkcontents ti = do
 		let f = ThirdPartyPopulated.fromThirdPartyImportLocation $
 			mkImportLocation $ getTopFilePath $ LsTree.file ti
 		k <- fileKey (P.takeFileName f)
 		return
-			( genImportLocation archivename f
+			( genImportLocation f
 			,
 				( borgContentIdentifier
 				-- defaulting to 0 size is ok, this size
diff --git a/Remote/Directory.hs b/Remote/Directory.hs
index 6a34df6bd0..4650f054d3 100644
--- a/Remote/Directory.hs
+++ b/Remote/Directory.hs
@@ -351,11 +351,12 @@ removeExportLocation topdir loc =
 			mkExportLocation loc'
 		in go (upFrom loc') =<< tryIO (removeDirectory p)
 
-listImportableContentsM :: RawFilePath -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
+listImportableContentsM :: RawFilePath -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
 listImportableContentsM dir = liftIO $ do
 	l <- dirContentsRecursive (fromRawFilePath dir)
 	l' <- mapM (go . toRawFilePath) l
-	return $ Just $ ImportableContents (catMaybes l') []
+	return $ Just $ ImportableContentsComplete $
+		ImportableContents (catMaybes l') []
  where
	go f = do
		st <- R.getFileStatus f
diff --git a/Remote/Helper/ThirdPartyPopulated.hs b/Remote/Helper/ThirdPartyPopulated.hs
index 170dab4784..beeadd3109 100644
--- a/Remote/Helper/ThirdPartyPopulated.hs
+++ b/Remote/Helper/ThirdPartyPopulated.hs
@@ -47,10 +47,10 @@ fromThirdPartyImportLocation =
 -- find only those ImportLocations that are annex object files.
 -- All other ImportLocations are ignored.
 importKey :: ImportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> Annex (Maybe Key)
-importKey loc _cid sz _ = return $ importKey' loc (Just sz)
+importKey loc _cid sz _ = return $ importKey' (fromImportLocation loc) (Just sz)
 
-importKey' :: ImportLocation -> Maybe ByteSize -> Maybe Key
-importKey' loc msz = case fileKey f of
+importKey' :: RawFilePath -> Maybe ByteSize -> Maybe Key
+importKey' p msz = case fileKey f of
 	Just k
 		-- Annex objects always are in a subdirectory with the same
 		-- name as the filename. If this is not the case for the file
@@ -82,5 +82,4 @@ importKey' loc msz = case fileKey f of
 		_ -> Just k
 	Nothing -> Nothing
  where
-	p = fromImportLocation loc
 	f = P.takeFileName p
diff --git a/Remote/S3.hs b/Remote/S3.hs
index 7a4a8541b2..9ca1d7c87d 100644
--- a/Remote/S3.hs
+++ b/Remote/S3.hs
@@ -549,13 +549,15 @@ renameExportS3 hv r rs info k src dest = Just <$> go
 	srcobject = T.pack $ bucketExportLocation info src
 	dstobject = T.pack $ bucketExportLocation info dest
 
-listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
+listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
 listImportableContentsS3 hv r info =
 	withS3Handle hv $ \case
 		Nothing -> giveup $ needS3Creds (uuid r)
 		Just h -> Just <$> go h
  where
-	go h = liftIO $ runResourceT $ extractFromResourceT =<< startlist h
+	go h = do
+		ic <- liftIO $ runResourceT $ extractFromResourceT =<< startlist h
+		return (ImportableContentsComplete ic)
 
 	startlist h
 		| versioning info = do
diff --git a/Types/Import.hs b/Types/Import.hs
index 2013e44cde..c7add2610c 100644
--- a/Types/Import.hs
+++ b/Types/Import.hs
@@ -1,6 +1,6 @@
 {- git-annex import types
  -
- - Copyright 2019 Joey Hess <id@joeyh.name>
+ - Copyright 2019-2021 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
@@ -13,6 +13,7 @@ import qualified Data.ByteString as S
 import Data.Char
 import Control.DeepSeq
 import GHC.Generics
+import qualified System.FilePath.Posix.ByteString as Posix
 
 import Types.Export
 import Utility.QuickCheck
@@ -69,3 +70,34 @@ data ImportableContents info = ImportableContents
 	deriving (Show, Generic)
 
 instance NFData info => NFData (ImportableContents info)
+
+{- ImportableContents, but it can be chunked into subtrees to avoid
+ - all needing to fit in memory at the same time. -}
+data ImportableContentsChunkable m info
+	= ImportableContentsComplete (ImportableContents info)
+	-- ^ Used when not chunking
+	| ImportableContentsChunked
+		{ importableContentsChunk :: ImportableContentsChunk m info
+		, importableHistoryComplete :: [ImportableContents info]
+		-- ^ Chunking the history is not supported
+		}
+
+{- A chunk of ImportableContents, which is the entire content of a subtree
+ - of the main tree. Nested subtrees are not allowed. -}
+data ImportableContentsChunk m info = ImportableContentsChunk
+	{ importableContentsSubDir :: ImportChunkSubDir
+	, importableContentsSubTree :: [(RawFilePath, info)]
+	-- ^ locations are relative to importableContentsSubDir
+	, importableContentsNextChunk :: m (Maybe (ImportableContentsChunk m info))
+	-- ^ Continuation to get the next chunk.
+	-- Returns Nothing when there are no more chunks.
+	}
+
+newtype ImportChunkSubDir = ImportChunkSubDir { importChunkSubDir :: RawFilePath }
+
+importableContentsChunkFullLocation
+	:: ImportChunkSubDir
+	-> RawFilePath
+	-> ImportLocation
+importableContentsChunkFullLocation (ImportChunkSubDir root) loc =
+	mkImportLocation $ Posix.combine root loc
diff --git a/Types/Remote.hs b/Types/Remote.hs
index de04e69859..e8b25c2a81 100644
--- a/Types/Remote.hs
+++ b/Types/Remote.hs
@@ -309,7 +309,7 @@ data ImportActions a = ImportActions
 	--
 	-- Throws exception on failure to access the remote.
 	-- May return Nothing when the remote is unchanged since last time.
-	{ listImportableContents :: a (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
+	{ listImportableContents :: a (Maybe (ImportableContentsChunkable a (ContentIdentifier, ByteSize)))
 	-- Generates a Key (of any type) for the file stored on the
 	-- remote at the ImportLocation. Does not download the file
 	-- from the remote.
@@ -322,7 +322,7 @@ data ImportActions a = ImportActions
 	-- since the ContentIdentifier was generated.
 	--
 	-- When it returns nothing, the file at the ImportLocation
-	-- not by included in the imported tree.
+	-- will not be included in the imported tree.
 	--
 	-- When the remote is thirdPartyPopulated, this should check if the
 	-- file stored on the remote is the content of an annex object,
diff --git a/doc/bugs/borg_special_remote_memory_usage_high_for_large_borg_repo.mdwn b/doc/bugs/borg_special_remote_memory_usage_high_for_large_borg_repo.mdwn
index 375a5cfdd3..cc5393c8c9 100644
--- a/doc/bugs/borg_special_remote_memory_usage_high_for_large_borg_repo.mdwn
+++ b/doc/bugs/borg_special_remote_memory_usage_high_for_large_borg_repo.mdwn
@@ -3,3 +3,5 @@ memory, then got OOM-killed. I don't know if this is a memory leak or just
 trying to load too much, but it seems like this is a thing you should be
 able to do on a machine with 64G of RAM.
+
+> [[fixed|done]] --[[Joey]]
diff --git a/doc/bugs/borg_special_remote_memory_usage_high_for_large_borg_repo/comment_12_f0eacfc77f5083b45a694552009ad0f6._comment b/doc/bugs/borg_special_remote_memory_usage_high_for_large_borg_repo/comment_12_f0eacfc77f5083b45a694552009ad0f6._comment
new file mode 100644
index 0000000000..22c1d22d1c
--- /dev/null
+++ b/doc/bugs/borg_special_remote_memory_usage_high_for_large_borg_repo/comment_12_f0eacfc77f5083b45a694552009ad0f6._comment
@@ -0,0 +1,9 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 12"""
+ date="2021-10-08T17:06:05Z"
+ content="""
+I've fixed this problem; my test case tops out at 160 mb now, and adding more
+archives to the borg repo no longer increases memory use. Memory use is now
+proportional to the number of annexed objects in a borg archive.
+"""]]
diff --git a/doc/todo/avoid_storing_contentidentifier_log_for_borg.mdwn b/doc/todo/avoid_storing_contentidentifier_log_for_borg.mdwn
new file mode 100644
index 0000000000..453d31291f
--- /dev/null
+++ b/doc/todo/avoid_storing_contentidentifier_log_for_borg.mdwn
@@ -0,0 +1,20 @@
+Borg uses an empty ContentIdentifier for everything; it does not need to
+record anything. But that empty value gets stored in the log for each key
+that is stored in borg. This unnecessarily bloats the size of the git-annex
+branch, by one content identifier per key stored in borg.
+
+I think that it also slows down importing many archives from borg,
+because for each of them it has to record the content identifier,
+which is always the same, but still results in a db write.
+
+Omitting storing any ContentIdentifier would break code such as
+Remote.Helper.ExportImport's retrieveKeyFileFromImport.
+
+If the borg Remote could indicate with a flag that it does not use
+ContentIdentifiers, then code like that could pass it a null
+ContentIdentifier without needing to read it from the db.
+
+Annex.Import uses getContentIdentifierKeys, but only when it's not
+thirdpartypopulated. So this change would not break that for borg,
+but a clean way to handle that would be to make it also return a null
+ContentIdentifier when the remote has the flag set. --[[Joey]]
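
The shape of the new API introduced above: a remote whose listing cannot fit in
memory returns ImportableContentsChunked, and each ImportableContentsChunk
carries a continuation that produces the next chunk only on demand. The
following is a minimal sketch, not part of the patch, of how another remote's
listImportableContents could use it; listChunked and enumerate are hypothetical
names, and only the types added in Types/Import.hs above are assumed. It
mirrors mkimportablecontentschunk in Remote/Borg.hs.

	-- Minimal sketch; listChunked and enumerate are hypothetical names,
	-- not part of git-annex.
	listChunked
		:: [RawFilePath]
		-- ^ subdirectories of the remote, one per chunk
		-> (RawFilePath -> Annex [(RawFilePath, (ContentIdentifier, ByteSize))])
		-- ^ hypothetical action listing a single subdirectory
		-> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
	listChunked [] _ = return $ Just $ ImportableContentsComplete $
		ImportableContents [] []
	listChunked (d:ds) enumerate = do
		c <- gochunk d ds
		return $ Just $ ImportableContentsChunked
			{ importableContentsChunk = c
			, importableHistoryComplete = []
			}
	  where
		-- Only one chunk's worth of entries is in memory at a time;
		-- the continuation lists the next subdirectory only when the
		-- consumer demands it, so already-recorded chunks can be
		-- garbage collected.
		gochunk d' rest = do
			l <- enumerate d'
			return $ ImportableContentsChunk
				{ importableContentsSubDir = ImportChunkSubDir d'
				, importableContentsSubTree = l
				, importableContentsNextChunk = case rest of
					(d'':rest') -> Just <$> gochunk d'' rest'
					[] -> return Nothing
				}

Note that, per gochunked in Annex/Import.hs above, importKeys currently only
accepts a chunked listing from thirdPartyPopulated remotes when not downloading
content, so a sketch like this applies to remotes in that category.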