optimisation: stream keys through git cat-file --buffer

This is only implemented for git-annex get so far. It makes git-annex
get nearly twice as fast in a repo with 10k files, all of them present!

But, see the TODO for some caveats.
This commit is contained in:
Joey Hess 2020-07-10 13:54:52 -04:00
parent 1df9e72a78
commit 4c9ad1de46
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 78 additions and 39 deletions

View file

@ -29,25 +29,35 @@ import Logs.Transfer
import Remote.List import Remote.List
import qualified Remote import qualified Remote
import Annex.CatFile import Annex.CatFile
import Git.CatFile (catObjectStream) import Git.CatFile
import Annex.CurrentBranch import Annex.CurrentBranch
import Annex.Content import Annex.Content
import Annex.Link
import Annex.InodeSentinal import Annex.InodeSentinal
import Annex.Concurrent
import qualified Annex.Branch import qualified Annex.Branch
import qualified Annex.BranchState import qualified Annex.BranchState
import qualified Database.Keys import qualified Database.Keys
import qualified Utility.RawFilePath as R import qualified Utility.RawFilePath as R
import Utility.Tuple
import Control.Concurrent.Async
import System.Posix.Types
withFilesInGit :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek withFilesInGit :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withFilesInGit ww a l = seekActions $ prepFiltered a $ withFilesInGit ww a l = seekFiltered a $
seekHelper ww LsFiles.inRepo l seekHelper id ww LsFiles.inRepo l
withFilesInGitAnnex :: WarnUnmatchWhen -> (RawFilePath -> Key -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withFilesInGitAnnex ww a l = seekFiltered' a $
seekHelper fst3 ww LsFiles.inRepoDetails l
withFilesInGitNonRecursive :: WarnUnmatchWhen -> String -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek withFilesInGitNonRecursive :: WarnUnmatchWhen -> String -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withFilesInGitNonRecursive ww needforce a l = ifM (Annex.getState Annex.force) withFilesInGitNonRecursive ww needforce a l = ifM (Annex.getState Annex.force)
( withFilesInGit ww a l ( withFilesInGit ww a l
, if null l , if null l
then giveup needforce then giveup needforce
else seekActions $ prepFiltered a (getfiles [] l) else seekFiltered a (getfiles [] l)
) )
where where
getfiles c [] = return (reverse c) getfiles c [] = return (reverse c)
@ -71,8 +81,8 @@ withFilesNotInGit a l = go =<< seek
g <- gitRepo g <- gitRepo
liftIO $ Git.Command.leaveZombie liftIO $ Git.Command.leaveZombie
<$> LsFiles.notInRepo [] force (map (\(WorkTreeItem f) -> toRawFilePath f) l) g <$> LsFiles.notInRepo [] force (map (\(WorkTreeItem f) -> toRawFilePath f) l) g
go fs = seekActions $ prepFiltered a $ go fs = seekFiltered a $
return $ concat $ segmentPaths (map (\(WorkTreeItem f) -> toRawFilePath f) l) fs return $ concat $ segmentPaths id (map (\(WorkTreeItem f) -> toRawFilePath f) l) fs
withPathContents :: ((FilePath, FilePath) -> CommandSeek) -> CmdParams -> CommandSeek withPathContents :: ((FilePath, FilePath) -> CommandSeek) -> CmdParams -> CommandSeek
withPathContents a params = do withPathContents a params = do
@ -94,21 +104,21 @@ withPathContents a params = do
} }
withWords :: ([String] -> CommandSeek) -> CmdParams -> CommandSeek withWords :: ([String] -> CommandSeek) -> CmdParams -> CommandSeek
withWords a params = seekActions $ return [a params] withWords a params = a params
withStrings :: (String -> CommandSeek) -> CmdParams -> CommandSeek withStrings :: (String -> CommandSeek) -> CmdParams -> CommandSeek
withStrings a params = seekActions $ return $ map a params withStrings a params = sequence_ $ map a params
withPairs :: ((String, String) -> CommandSeek) -> CmdParams -> CommandSeek withPairs :: ((String, String) -> CommandSeek) -> CmdParams -> CommandSeek
withPairs a params = seekActions $ return $ map a $ pairs [] params withPairs a params = sequence_ $ map a $ pairs [] params
where where
pairs c [] = reverse c pairs c [] = reverse c
pairs c (x:y:xs) = pairs ((x,y):c) xs pairs c (x:y:xs) = pairs ((x,y):c) xs
pairs _ _ = giveup "expected pairs" pairs _ _ = giveup "expected pairs"
withFilesToBeCommitted :: (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek withFilesToBeCommitted :: (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withFilesToBeCommitted a l = seekActions $ prepFiltered a $ withFilesToBeCommitted a l = seekFiltered a $
seekHelper WarnUnmatchWorkTreeItems (const LsFiles.stagedNotDeleted) l seekHelper id WarnUnmatchWorkTreeItems (const LsFiles.stagedNotDeleted) l
isOldUnlocked :: RawFilePath -> Annex Bool isOldUnlocked :: RawFilePath -> Annex Bool
isOldUnlocked f = liftIO (notSymlink f) <&&> isOldUnlocked f = liftIO (notSymlink f) <&&>
@ -117,11 +127,10 @@ isOldUnlocked f = liftIO (notSymlink f) <&&>
{- unlocked pointer files that are staged, and whose content has not been {- unlocked pointer files that are staged, and whose content has not been
- modified-} - modified-}
withUnmodifiedUnlockedPointers :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek withUnmodifiedUnlockedPointers :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withUnmodifiedUnlockedPointers ww a l = seekActions $ withUnmodifiedUnlockedPointers ww a l = seekFiltered a unlockedfiles
prepFiltered a unlockedfiles
where where
unlockedfiles = filterM isUnmodifiedUnlocked unlockedfiles = filterM isUnmodifiedUnlocked
=<< seekHelper ww (const LsFiles.typeChangedStaged) l =<< seekHelper id ww (const LsFiles.typeChangedStaged) l
isUnmodifiedUnlocked :: RawFilePath -> Annex Bool isUnmodifiedUnlocked :: RawFilePath -> Annex Bool
isUnmodifiedUnlocked f = catKeyFile f >>= \case isUnmodifiedUnlocked f = catKeyFile f >>= \case
@ -130,11 +139,11 @@ isUnmodifiedUnlocked f = catKeyFile f >>= \case
{- Finds files that may be modified. -} {- Finds files that may be modified. -}
withFilesMaybeModified :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek withFilesMaybeModified :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withFilesMaybeModified ww a params = seekActions $ withFilesMaybeModified ww a params = seekFiltered a $
prepFiltered a $ seekHelper ww LsFiles.modified params seekHelper id ww LsFiles.modified params
withKeys :: (Key -> CommandSeek) -> CmdParams -> CommandSeek withKeys :: (Key -> CommandSeek) -> CmdParams -> CommandSeek
withKeys a l = seekActions $ return $ map (a . parse) l withKeys a l = sequence_ $ map (a . parse) l
where where
parse p = fromMaybe (giveup "bad key") $ deserializeKey p parse p = fromMaybe (giveup "bad key") $ deserializeKey p
@ -251,23 +260,51 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
forM_ ts $ \(t, i) -> forM_ ts $ \(t, i) ->
keyaction (transferKey t, mkActionItem (t, i)) keyaction (transferKey t, mkActionItem (t, i))
prepFiltered :: (RawFilePath -> CommandSeek) -> Annex [RawFilePath] -> Annex [CommandSeek] seekFiltered :: (RawFilePath -> CommandSeek) -> Annex [RawFilePath] -> Annex ()
prepFiltered a fs = do seekFiltered a fs = do
matcher <- Limit.getMatcher matcher <- Limit.getMatcher
map (process matcher) <$> fs sequence_ =<< (map (process matcher) <$> fs)
where where
process matcher f = process matcher f =
whenM (matcher $ MatchingFile $ FileInfo f f) $ a f whenM (matcher $ MatchingFile $ FileInfo f f) $ a f
seekActions :: Annex [CommandSeek] -> Annex () seekFiltered' :: (RawFilePath -> Key -> CommandSeek) -> Annex [(RawFilePath, Git.Sha, FileMode)] -> Annex ()
seekActions gen = sequence_ =<< gen seekFiltered' a fs = do
g <- Annex.gitRepo
catObjectStream' g $ \feeder closer reader -> do
tid <- liftIO . async =<< forkState (gofeed feeder closer)
goread reader
join (liftIO (wait tid))
where
gofeed feeder closer = do
matcher <- Limit.getMatcher
l <- fs
forM_ l $ process matcher feeder
liftIO closer
seekHelper :: WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([RawFilePath], IO Bool)) -> [WorkTreeItem] -> Annex [RawFilePath] process matcher feeder (f, sha, mode) =
seekHelper ww a l = do -- TODO handle non-symlink separately to avoid
-- catting large files
-- If the matcher needs to look up a key, it should be run
-- in goread, not here, and the key passed in. OTOH, if
-- the matcher does not need to look up a key, it's more
-- efficient to put it here, to avoid catting files that
-- will not be matched.
whenM (matcher $ MatchingFile $ FileInfo f f) $
liftIO $ feeder (f, sha)
goread reader = liftIO reader >>= \case
Just (f, content) -> do
maybe noop (a f) (parseLinkTargetOrPointerLazy =<< content)
goread reader
_ -> return ()
seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> [WorkTreeItem] -> Annex [a]
seekHelper c ww a l = do
os <- seekOptions ww os <- seekOptions ww
inRepo $ \g -> inRepo $ \g ->
concat . concat <$> forM (segmentXargsOrdered l') concat . concat <$> forM (segmentXargsOrdered l')
(runSegmentPaths (\fs -> Git.Command.leaveZombie <$> a os fs g) . map toRawFilePath) (runSegmentPaths c (\fs -> Git.Command.leaveZombie <$> a os fs g) . map toRawFilePath)
where where
l' = map (\(WorkTreeItem f) -> f) l l' = map (\(WorkTreeItem f) -> f) l

View file

@ -200,7 +200,7 @@ changeExport r db (PreferredFiltered new) = do
mapdiff a oldtreesha newtreesha = do mapdiff a oldtreesha newtreesha = do
(diff, cleanup) <- inRepo $ (diff, cleanup) <- inRepo $
Git.DiffTree.diffTreeRecursive oldtreesha newtreesha Git.DiffTree.diffTreeRecursive oldtreesha newtreesha
seekActions $ pure $ map a diff sequence_ $ map a diff
void $ liftIO cleanup void $ liftIO cleanup
-- Map of old and new filenames for each changed ExportKey in a diff. -- Map of old and new filenames for each changed ExportKey in a diff.

View file

@ -40,12 +40,13 @@ optParser desc = GetOptions
seek :: GetOptions -> CommandSeek seek :: GetOptions -> CommandSeek
seek o = startConcurrency downloadStages $ do seek o = startConcurrency downloadStages $ do
from <- maybe (pure Nothing) (Just <$$> getParsed) (getFrom o) from <- maybe (pure Nothing) (Just <$$> getParsed) (getFrom o)
let go = whenAnnexed $ start o from let go = start o from
case batchOption o of case batchOption o of
Batch fmt -> batchFilesMatching fmt (go . toRawFilePath) Batch fmt -> batchFilesMatching fmt
(whenAnnexed go . toRawFilePath)
NoBatch -> withKeyOptions (keyOptions o) (autoMode o) NoBatch -> withKeyOptions (keyOptions o) (autoMode o)
(commandAction . startKeys from) (commandAction . startKeys from)
(withFilesInGit ww (commandAction . go)) (withFilesInGitAnnex ww (\f k -> commandAction (go f k)))
=<< workTreeItems ww (getFiles o) =<< workTreeItems ww (getFiles o)
where where
ww = WarnUnmatchLsFiles ww = WarnUnmatchLsFiles

View file

@ -130,7 +130,7 @@ send ups fs = do
starting "sending files" (ActionItemOther Nothing) $ starting "sending files" (ActionItemOther Nothing) $
withTmpFile "send" $ \t h -> do withTmpFile "send" $ \t h -> do
let ww = WarnUnmatchLsFiles let ww = WarnUnmatchLsFiles
fs' <- seekHelper ww LsFiles.inRepo fs' <- seekHelper id ww LsFiles.inRepo
=<< workTreeItems ww fs =<< workTreeItems ww fs
matcher <- Limit.getMatcher matcher <- Limit.getMatcher
let addlist f o = whenM (matcher $ MatchingFile $ FileInfo f f) $ let addlist f o = whenM (matcher $ MatchingFile $ FileInfo f f) $

View file

@ -652,11 +652,11 @@ seekSyncContent o rs currbranch = do
liftIO $ not <$> isEmptyMVar mvar liftIO $ not <$> isEmptyMVar mvar
where where
seekworktree mvar l bloomfeeder = seekworktree mvar l bloomfeeder =
seekHelper ww LsFiles.inRepo l seekHelper id ww LsFiles.inRepo l
>>= gofiles bloomfeeder mvar >>= gofiles bloomfeeder mvar
seekincludinghidden origbranch mvar l bloomfeeder = seekincludinghidden origbranch mvar l bloomfeeder =
seekHelper ww (LsFiles.inRepoOrBranch origbranch) l seekHelper id ww (LsFiles.inRepoOrBranch origbranch) l
>>= gofiles bloomfeeder mvar >>= gofiles bloomfeeder mvar
ww = WarnUnmatchLsFiles ww = WarnUnmatchLsFiles

View file

@ -214,22 +214,23 @@ prop_relPathDirToFile_regressionTest = same_dir_shortcurcuits_at_difference
- we stop preserving ordering at that point. Presumably a user passing - we stop preserving ordering at that point. Presumably a user passing
- that many paths in doesn't care too much about order of the later ones. - that many paths in doesn't care too much about order of the later ones.
-} -}
segmentPaths :: [RawFilePath] -> [RawFilePath] -> [[RawFilePath]] segmentPaths :: (a -> RawFilePath) -> [RawFilePath] -> [a] -> [[a]]
segmentPaths [] new = [new] segmentPaths _ [] new = [new]
segmentPaths [_] new = [new] -- optimisation segmentPaths _ [_] new = [new] -- optimisation
segmentPaths (l:ls) new = found : segmentPaths ls rest segmentPaths c (l:ls) new = found : segmentPaths c ls rest
where where
(found, rest) = if length ls < 100 (found, rest) = if length ls < 100
then partition inl new then partition inl new
else break (not . inl) new else break (not . inl) new
inl f = fromRawFilePath l `dirContains` fromRawFilePath f inl f = l' `dirContains` fromRawFilePath (c f)
l' = fromRawFilePath l
{- This assumes that it's cheaper to call segmentPaths on the result, {- This assumes that it's cheaper to call segmentPaths on the result,
- than it would be to run the action separately with each path. In - than it would be to run the action separately with each path. In
- the case of git file list commands, that assumption tends to hold. - the case of git file list commands, that assumption tends to hold.
-} -}
runSegmentPaths :: ([RawFilePath] -> IO [RawFilePath]) -> [RawFilePath] -> IO [[RawFilePath]] runSegmentPaths :: (a -> RawFilePath) -> ([RawFilePath] -> IO [a]) -> [RawFilePath] -> IO [[a]]
runSegmentPaths a paths = segmentPaths paths <$> a paths runSegmentPaths c a paths = segmentPaths c paths <$> a paths
{- Converts paths in the home directory to use ~/ -} {- Converts paths in the home directory to use ~/ -}
relHome :: FilePath -> IO String relHome :: FilePath -> IO String