avoid --all buffering list of all keys

In Annex.Branch.branch, the (++) was killing laziness.
Rewrote so it streams lazily.

filterM also kills laziness, so made loggedKeys use a Unchecked type,
and check if the key is dead in the seek loop.

Note that loggedKeysFor still buffers, so git-annex info <remote> and
git-annex unused --from remote still use more memory than necessary.

Also removed some unused functions from Annex.Journal.
This commit is contained in:
Joey Hess 2018-04-26 14:21:27 -04:00
parent a8c91ce69a
commit bea0ad220a
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 52 additions and 44 deletions

View file

@ -34,6 +34,7 @@ import qualified Data.Map as M
import Data.Function
import Data.Char
import Control.Concurrent (threadDelay)
import System.IO.Unsafe (unsafeInterleaveIO)
import Annex.Common
import Annex.BranchState
@ -333,19 +334,33 @@ commitIndex' jl branchref message basemessage retrynum parents = do
let racemessage = basemessage ++ " (recovery from race #" ++ show retrynum' ++ "; expected commit parent " ++ show branchref ++ " but found " ++ show lostrefs ++ " )"
commitIndex' jl committedref racemessage basemessage retrynum' [committedref]
{- Lists all files on the branch. There may be duplicates in the list. -}
{- Lists all files on the branch. including ones in the journal
- that have not been committed yet. There may be duplicates in the list.
- Streams lazily. -}
files :: Annex [FilePath]
files = do
update
(++)
<$> branchFiles
<*> getJournalledFilesStale
withIndex $ do
g <- gitRepo
withJournalHandle (go g)
where
go g jh = readDirectory jh >>= \case
Nothing -> branchFiles' g
Just file
| dirCruft file -> go g jh
| otherwise -> do
let branchfile = fileJournal file
rest <- unsafeInterleaveIO (go g jh)
return (branchfile:rest)
{- Files in the branch, not including any from journalled changes,
- and without updating the branch. -}
branchFiles :: Annex [FilePath]
branchFiles = withIndex $ inRepo $ Git.Command.pipeNullSplitZombie $
lsTreeParams fullname [Param "--name-only"]
branchFiles = withIndex $ inRepo branchFiles'
branchFiles' :: Git.Repo -> IO [FilePath]
branchFiles' = Git.Command.pipeNullSplitZombie
(lsTreeParams fullname [Param "--name-only"])
{- Populates the branch's index file with the current branch contents.
-

View file

@ -55,27 +55,6 @@ getJournalFileStale :: FilePath -> Annex (Maybe String)
getJournalFileStale file = inRepo $ \g -> catchMaybeIO $
readFileStrict $ journalFile file g
{- List of files that have updated content in the journal. -}
getJournalledFiles :: JournalLocked -> Annex [FilePath]
getJournalledFiles jl = map fileJournal <$> getJournalFiles jl
getJournalledFilesStale :: Annex [FilePath]
getJournalledFilesStale = map fileJournal <$> getJournalFilesStale
{- List of existing journal files. -}
getJournalFiles :: JournalLocked -> Annex [FilePath]
getJournalFiles _jl = getJournalFilesStale
{- List of existing journal files, but without locking, may miss new ones
- just being added, or may have false positives if the journal is staged
- as it is run. -}
getJournalFilesStale :: Annex [FilePath]
getJournalFilesStale = do
g <- gitRepo
fs <- liftIO $ catchDefaultIO [] $
getDirectoryContents $ gitAnnexJournalDir g
return $ filter (`notElem` [".", ".."]) fs
withJournalHandle :: (DirectoryHandle -> IO a) -> Annex a
withJournalHandle a = do
d <- fromRepo gitAnnexJournalDir

View file

@ -23,6 +23,7 @@ git-annex (6.20180410) UNRELEASED; urgency=medium
* Assistant: Fix installation of menus, icons, etc when run
from within runshell.
* import: Avoid buffering all filenames to be imported in memory.
* Improve memory use and speed of --all, by not buffering list of all keys.
-- Joey Hess <id@joeyh.name> Mon, 09 Apr 2018 14:03:28 -0400

View file

@ -196,14 +196,14 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
giveup "Cannot use --auto in a bare repository"
case (null params, ko) of
(True, Nothing)
| bare -> noauto $ runkeyaction loggedKeys
| bare -> noauto $ runkeyaction finishCheck loggedKeys
| otherwise -> fallbackaction params
(False, Nothing) -> fallbackaction params
(True, Just WantAllKeys) -> noauto $ runkeyaction loggedKeys
(True, Just WantUnusedKeys) -> noauto $ runkeyaction unusedKeys'
(True, Just WantAllKeys) -> noauto $ runkeyaction finishCheck loggedKeys
(True, Just WantUnusedKeys) -> noauto $ runkeyaction (pure . Just) unusedKeys'
(True, Just WantFailedTransfers) -> noauto runfailedtransfers
(True, Just (WantSpecificKey k)) -> noauto $ runkeyaction (return [k])
(True, Just WantIncompleteKeys) -> noauto $ runkeyaction incompletekeys
(True, Just (WantSpecificKey k)) -> noauto $ runkeyaction (pure . Just) (return [k])
(True, Just WantIncompleteKeys) -> noauto $ runkeyaction (pure . Just) incompletekeys
(True, Just (WantBranchKeys bs)) -> noauto $ runbranchkeys bs
(False, Just _) -> giveup "Can only specify one of file names, --all, --branch, --unused, --failed, --key, or --incomplete"
where
@ -211,10 +211,11 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
| auto = giveup "Cannot use --auto with --all or --branch or --unused or --key or --incomplete"
| otherwise = a
incompletekeys = staleKeysPrune gitAnnexTmpObjectDir True
runkeyaction getks = do
runkeyaction checker getks = do
keyaction <- mkkeyaction
ks <- getks
forM_ ks $ \k -> keyaction k (mkActionItem k)
forM_ ks $ checker >=> maybe noop
(\k -> keyaction k (mkActionItem k))
runbranchkeys bs = do
keyaction <- mkkeyaction
forM_ bs $ \b -> do

View file

@ -100,7 +100,8 @@ checkRemoteUnused name refspec = go =<< fromJust <$> Remote.byNameWithUUID (Just
showAction "checking for unused data"
_ <- check "" (remoteUnusedMsg r) (remoteunused r) 0
next $ return True
remoteunused r = excludeReferenced refspec <=< loggedKeysFor $ Remote.uuid r
remoteunused r = excludeReferenced refspec
<=< loggedKeysFor $ Remote.uuid r
check :: FilePath -> ([(Int, Key)] -> String) -> Annex [Key] -> Int -> Annex Int
check file msg a c = do

View file

@ -8,7 +8,7 @@
- Repositories record their UUID and the date when they --get or --drop
- a value.
-
- Copyright 2010-2015 Joey Hess <id@joeyh.name>
- Copyright 2010-2018 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@ -23,6 +23,8 @@ module Logs.Location (
isKnownKey,
checkDead,
setDead,
Unchecked,
finishCheck,
loggedKeys,
loggedKeysFor,
) where
@ -114,24 +116,33 @@ setDead' l = l
Unknown -> Unknown
}
data Unchecked a = Unchecked (Annex (Maybe a))
finishCheck :: Unchecked a -> Annex (Maybe a)
finishCheck (Unchecked a) = a
{- Finds all keys that have location log information.
- (There may be duplicate keys in the list.)
-
- Keys that have been marked as dead are not included.
-}
loggedKeys :: Annex [Key]
loggedKeys :: Annex [Unchecked Key]
loggedKeys = loggedKeys' (not <$$> checkDead)
{- Note that sel should be strict, to avoid the filterM building many
- thunks. -}
loggedKeys' :: (Key -> Annex Bool) -> Annex [Key]
loggedKeys' sel = filterM sel =<<
(mapMaybe locationLogFileKey <$> Annex.Branch.files)
loggedKeys' :: (Key -> Annex Bool) -> Annex [Unchecked Key]
loggedKeys' check = mapMaybe (defercheck <$$> locationLogFileKey)
<$> Annex.Branch.files
where
defercheck k = Unchecked $ ifM (check k)
( return (Just k)
, return Nothing
)
{- Finds all keys that have location log information indicating
- they are present for the specified repository. -}
- they are present for the specified repository.
-}
loggedKeysFor :: UUID -> Annex [Key]
loggedKeysFor u = loggedKeys' isthere
loggedKeysFor u = catMaybes <$> (mapM finishCheck =<< loggedKeys' isthere)
where
isthere k = do
us <- loggedLocations k