From bea0ad220a68138dc0a43d0c86bb2353ecf92d2c Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 26 Apr 2018 14:21:27 -0400 Subject: [PATCH] avoid --all buffering list of all keys In Annex.Branch.branch, the (++) was killing laziness. Rewrote so it streams lazily. filterM also kills laziness, so made loggedKeys use a Unchecked type, and check if the key is dead in the seek loop. Note that loggedKeysFor still buffers, so git-annex info and git-annex unused --from remote still use more memory than necessary. Also removed some unused functions from Annex.Journal. --- Annex/Branch.hs | 27 +++++++++++++++++++++------ Annex/Journal.hs | 21 --------------------- CHANGELOG | 1 + CmdLine/Seek.hs | 15 ++++++++------- Command/Unused.hs | 3 ++- Logs/Location.hs | 29 ++++++++++++++++++++--------- 6 files changed, 52 insertions(+), 44 deletions(-) diff --git a/Annex/Branch.hs b/Annex/Branch.hs index 36cfb8b558..73b98a9239 100644 --- a/Annex/Branch.hs +++ b/Annex/Branch.hs @@ -34,6 +34,7 @@ import qualified Data.Map as M import Data.Function import Data.Char import Control.Concurrent (threadDelay) +import System.IO.Unsafe (unsafeInterleaveIO) import Annex.Common import Annex.BranchState @@ -333,19 +334,33 @@ commitIndex' jl branchref message basemessage retrynum parents = do let racemessage = basemessage ++ " (recovery from race #" ++ show retrynum' ++ "; expected commit parent " ++ show branchref ++ " but found " ++ show lostrefs ++ " )" commitIndex' jl committedref racemessage basemessage retrynum' [committedref] -{- Lists all files on the branch. There may be duplicates in the list. -} +{- Lists all files on the branch. including ones in the journal + - that have not been committed yet. There may be duplicates in the list. + - Streams lazily. -} files :: Annex [FilePath] files = do update - (++) - <$> branchFiles - <*> getJournalledFilesStale + withIndex $ do + g <- gitRepo + withJournalHandle (go g) + where + go g jh = readDirectory jh >>= \case + Nothing -> branchFiles' g + Just file + | dirCruft file -> go g jh + | otherwise -> do + let branchfile = fileJournal file + rest <- unsafeInterleaveIO (go g jh) + return (branchfile:rest) {- Files in the branch, not including any from journalled changes, - and without updating the branch. -} branchFiles :: Annex [FilePath] -branchFiles = withIndex $ inRepo $ Git.Command.pipeNullSplitZombie $ - lsTreeParams fullname [Param "--name-only"] +branchFiles = withIndex $ inRepo branchFiles' + +branchFiles' :: Git.Repo -> IO [FilePath] +branchFiles' = Git.Command.pipeNullSplitZombie + (lsTreeParams fullname [Param "--name-only"]) {- Populates the branch's index file with the current branch contents. - diff --git a/Annex/Journal.hs b/Annex/Journal.hs index 0ff95ffe54..ea2648c5ac 100644 --- a/Annex/Journal.hs +++ b/Annex/Journal.hs @@ -55,27 +55,6 @@ getJournalFileStale :: FilePath -> Annex (Maybe String) getJournalFileStale file = inRepo $ \g -> catchMaybeIO $ readFileStrict $ journalFile file g -{- List of files that have updated content in the journal. -} -getJournalledFiles :: JournalLocked -> Annex [FilePath] -getJournalledFiles jl = map fileJournal <$> getJournalFiles jl - -getJournalledFilesStale :: Annex [FilePath] -getJournalledFilesStale = map fileJournal <$> getJournalFilesStale - -{- List of existing journal files. -} -getJournalFiles :: JournalLocked -> Annex [FilePath] -getJournalFiles _jl = getJournalFilesStale - -{- List of existing journal files, but without locking, may miss new ones - - just being added, or may have false positives if the journal is staged - - as it is run. -} -getJournalFilesStale :: Annex [FilePath] -getJournalFilesStale = do - g <- gitRepo - fs <- liftIO $ catchDefaultIO [] $ - getDirectoryContents $ gitAnnexJournalDir g - return $ filter (`notElem` [".", ".."]) fs - withJournalHandle :: (DirectoryHandle -> IO a) -> Annex a withJournalHandle a = do d <- fromRepo gitAnnexJournalDir diff --git a/CHANGELOG b/CHANGELOG index 2053973e1e..ebadab2e88 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -23,6 +23,7 @@ git-annex (6.20180410) UNRELEASED; urgency=medium * Assistant: Fix installation of menus, icons, etc when run from within runshell. * import: Avoid buffering all filenames to be imported in memory. + * Improve memory use and speed of --all, by not buffering list of all keys. -- Joey Hess Mon, 09 Apr 2018 14:03:28 -0400 diff --git a/CmdLine/Seek.hs b/CmdLine/Seek.hs index e0284f54eb..ec002dbdd9 100644 --- a/CmdLine/Seek.hs +++ b/CmdLine/Seek.hs @@ -196,14 +196,14 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do giveup "Cannot use --auto in a bare repository" case (null params, ko) of (True, Nothing) - | bare -> noauto $ runkeyaction loggedKeys + | bare -> noauto $ runkeyaction finishCheck loggedKeys | otherwise -> fallbackaction params (False, Nothing) -> fallbackaction params - (True, Just WantAllKeys) -> noauto $ runkeyaction loggedKeys - (True, Just WantUnusedKeys) -> noauto $ runkeyaction unusedKeys' + (True, Just WantAllKeys) -> noauto $ runkeyaction finishCheck loggedKeys + (True, Just WantUnusedKeys) -> noauto $ runkeyaction (pure . Just) unusedKeys' (True, Just WantFailedTransfers) -> noauto runfailedtransfers - (True, Just (WantSpecificKey k)) -> noauto $ runkeyaction (return [k]) - (True, Just WantIncompleteKeys) -> noauto $ runkeyaction incompletekeys + (True, Just (WantSpecificKey k)) -> noauto $ runkeyaction (pure . Just) (return [k]) + (True, Just WantIncompleteKeys) -> noauto $ runkeyaction (pure . Just) incompletekeys (True, Just (WantBranchKeys bs)) -> noauto $ runbranchkeys bs (False, Just _) -> giveup "Can only specify one of file names, --all, --branch, --unused, --failed, --key, or --incomplete" where @@ -211,10 +211,11 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do | auto = giveup "Cannot use --auto with --all or --branch or --unused or --key or --incomplete" | otherwise = a incompletekeys = staleKeysPrune gitAnnexTmpObjectDir True - runkeyaction getks = do + runkeyaction checker getks = do keyaction <- mkkeyaction ks <- getks - forM_ ks $ \k -> keyaction k (mkActionItem k) + forM_ ks $ checker >=> maybe noop + (\k -> keyaction k (mkActionItem k)) runbranchkeys bs = do keyaction <- mkkeyaction forM_ bs $ \b -> do diff --git a/Command/Unused.hs b/Command/Unused.hs index 27018cf382..c68dc4a9d0 100644 --- a/Command/Unused.hs +++ b/Command/Unused.hs @@ -100,7 +100,8 @@ checkRemoteUnused name refspec = go =<< fromJust <$> Remote.byNameWithUUID (Just showAction "checking for unused data" _ <- check "" (remoteUnusedMsg r) (remoteunused r) 0 next $ return True - remoteunused r = excludeReferenced refspec <=< loggedKeysFor $ Remote.uuid r + remoteunused r = excludeReferenced refspec + <=< loggedKeysFor $ Remote.uuid r check :: FilePath -> ([(Int, Key)] -> String) -> Annex [Key] -> Int -> Annex Int check file msg a c = do diff --git a/Logs/Location.hs b/Logs/Location.hs index a94dc9089b..621cca9716 100644 --- a/Logs/Location.hs +++ b/Logs/Location.hs @@ -8,7 +8,7 @@ - Repositories record their UUID and the date when they --get or --drop - a value. - - - Copyright 2010-2015 Joey Hess + - Copyright 2010-2018 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -23,6 +23,8 @@ module Logs.Location ( isKnownKey, checkDead, setDead, + Unchecked, + finishCheck, loggedKeys, loggedKeysFor, ) where @@ -114,24 +116,33 @@ setDead' l = l Unknown -> Unknown } +data Unchecked a = Unchecked (Annex (Maybe a)) + +finishCheck :: Unchecked a -> Annex (Maybe a) +finishCheck (Unchecked a) = a + {- Finds all keys that have location log information. - (There may be duplicate keys in the list.) - - Keys that have been marked as dead are not included. -} -loggedKeys :: Annex [Key] +loggedKeys :: Annex [Unchecked Key] loggedKeys = loggedKeys' (not <$$> checkDead) -{- Note that sel should be strict, to avoid the filterM building many - - thunks. -} -loggedKeys' :: (Key -> Annex Bool) -> Annex [Key] -loggedKeys' sel = filterM sel =<< - (mapMaybe locationLogFileKey <$> Annex.Branch.files) +loggedKeys' :: (Key -> Annex Bool) -> Annex [Unchecked Key] +loggedKeys' check = mapMaybe (defercheck <$$> locationLogFileKey) + <$> Annex.Branch.files + where + defercheck k = Unchecked $ ifM (check k) + ( return (Just k) + , return Nothing + ) {- Finds all keys that have location log information indicating - - they are present for the specified repository. -} + - they are present for the specified repository. + -} loggedKeysFor :: UUID -> Annex [Key] -loggedKeysFor u = loggedKeys' isthere +loggedKeysFor u = catMaybes <$> (mapM finishCheck =<< loggedKeys' isthere) where isthere k = do us <- loggedLocations k