getKeysPresent is now fully lazy

.. Allowing it to be used by things in constant space!

Random statistics: git annex status has gone from taking 239 mb
of memory and 26 seconds in a repo, to 8 mb and 13 seconds.

The trick here is the unsafeInterleaveIO, and the form of the function's
recursion, which I cribbed heavily from System.IO.HVFS.Utils.recurseDirStat.
The difference is, this one goes to a limited depth and avoids statting
everything.
This commit is contained in:
Joey Hess 2012-03-11 18:04:58 -04:00
parent ff3644ad38
commit b325694645
2 changed files with 16 additions and 14 deletions

View file

@ -27,6 +27,7 @@ module Annex.Content (
import Control.Exception (bracket_)
import System.Posix.Types
import System.IO.Unsafe (unsafeInterleaveIO)
import Common.Annex
import Logs.Location
@ -290,19 +291,20 @@ moveBad key = do
{- List of keys whose content exists in .git/annex/objects/ -}
getKeysPresent :: Annex [Key]
getKeysPresent = getKeysPresent' =<< fromRepo gitAnnexObjectDir
getKeysPresent' :: FilePath -> Annex [Key]
getKeysPresent' dir = do
exists <- liftIO $ doesDirectoryExist dir
if not exists
then return []
else liftIO $ do
-- 2 levels of hashing
levela <- dirContents dir
levelb <- mapM dirContents levela
contents <- unsafeInterleaveIO $ mapM dirContents (concat levelb)
let files = concat contents
return $ mapMaybe (fileKey . takeFileName) files
getKeysPresent = liftIO . traverse (2 :: Int) =<< fromRepo gitAnnexObjectDir
where
traverse depth dir = do
contents <- catchDefaultIO (dirContents dir) []
if depth == 0
then continue (mapMaybe (fileKey . takeFileName) contents) []
else do
let deeper = traverse (depth - 1)
continue [] (map deeper contents)
continue keys [] = return keys
continue keys (a:as) = do
{- Force lazy traversal with unsafeInterleaveIO. -}
morekeys <- unsafeInterleaveIO a
continue (morekeys++keys) as
{- Things to do to record changes to content when shutting down.
-

2
debian/changelog vendored
View file

@ -6,7 +6,7 @@ git-annex (3.20120310) UNRELEASED; urgency=low
* unused: Reduce memory usage significantly. Still not constant
space, but now only needs to store the set of file contents that
are present in the annex in memory.
* status: Fixed to run in nearly constant space.
* status: Fixed to run in constant space.
-- Joey Hess <joeyh@debian.org> Sat, 10 Mar 2012 14:03:22 -0400