optimisation: stream keys through git cat-file --buffer

This is only implemented for git-annex get so far. It makes git-annex
get nearly twice as fast in a repo with 10k files, all of them present!

But, see the TODO for some caveats.
This commit is contained in:
Joey Hess 2020-07-10 13:54:52 -04:00
parent 1df9e72a78
commit 4c9ad1de46
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 78 additions and 39 deletions

View file

@ -29,25 +29,35 @@ import Logs.Transfer
import Remote.List
import qualified Remote
import Annex.CatFile
import Git.CatFile (catObjectStream)
import Git.CatFile
import Annex.CurrentBranch
import Annex.Content
import Annex.Link
import Annex.InodeSentinal
import Annex.Concurrent
import qualified Annex.Branch
import qualified Annex.BranchState
import qualified Database.Keys
import qualified Utility.RawFilePath as R
import Utility.Tuple
import Control.Concurrent.Async
import System.Posix.Types
withFilesInGit :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withFilesInGit ww a l = seekActions $ prepFiltered a $
seekHelper ww LsFiles.inRepo l
withFilesInGit ww a l = seekFiltered a $
seekHelper id ww LsFiles.inRepo l
withFilesInGitAnnex :: WarnUnmatchWhen -> (RawFilePath -> Key -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withFilesInGitAnnex ww a l = seekFiltered' a $
seekHelper fst3 ww LsFiles.inRepoDetails l
withFilesInGitNonRecursive :: WarnUnmatchWhen -> String -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withFilesInGitNonRecursive ww needforce a l = ifM (Annex.getState Annex.force)
( withFilesInGit ww a l
, if null l
then giveup needforce
else seekActions $ prepFiltered a (getfiles [] l)
else seekFiltered a (getfiles [] l)
)
where
getfiles c [] = return (reverse c)
@ -71,8 +81,8 @@ withFilesNotInGit a l = go =<< seek
g <- gitRepo
liftIO $ Git.Command.leaveZombie
<$> LsFiles.notInRepo [] force (map (\(WorkTreeItem f) -> toRawFilePath f) l) g
go fs = seekActions $ prepFiltered a $
return $ concat $ segmentPaths (map (\(WorkTreeItem f) -> toRawFilePath f) l) fs
go fs = seekFiltered a $
return $ concat $ segmentPaths id (map (\(WorkTreeItem f) -> toRawFilePath f) l) fs
withPathContents :: ((FilePath, FilePath) -> CommandSeek) -> CmdParams -> CommandSeek
withPathContents a params = do
@ -94,21 +104,21 @@ withPathContents a params = do
}
withWords :: ([String] -> CommandSeek) -> CmdParams -> CommandSeek
withWords a params = seekActions $ return [a params]
withWords a params = a params
withStrings :: (String -> CommandSeek) -> CmdParams -> CommandSeek
withStrings a params = seekActions $ return $ map a params
withStrings a params = sequence_ $ map a params
withPairs :: ((String, String) -> CommandSeek) -> CmdParams -> CommandSeek
withPairs a params = seekActions $ return $ map a $ pairs [] params
withPairs a params = sequence_ $ map a $ pairs [] params
where
pairs c [] = reverse c
pairs c (x:y:xs) = pairs ((x,y):c) xs
pairs _ _ = giveup "expected pairs"
withFilesToBeCommitted :: (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withFilesToBeCommitted a l = seekActions $ prepFiltered a $
seekHelper WarnUnmatchWorkTreeItems (const LsFiles.stagedNotDeleted) l
withFilesToBeCommitted a l = seekFiltered a $
seekHelper id WarnUnmatchWorkTreeItems (const LsFiles.stagedNotDeleted) l
isOldUnlocked :: RawFilePath -> Annex Bool
isOldUnlocked f = liftIO (notSymlink f) <&&>
@ -117,11 +127,10 @@ isOldUnlocked f = liftIO (notSymlink f) <&&>
{- unlocked pointer files that are staged, and whose content has not been
- modified-}
withUnmodifiedUnlockedPointers :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withUnmodifiedUnlockedPointers ww a l = seekActions $
prepFiltered a unlockedfiles
withUnmodifiedUnlockedPointers ww a l = seekFiltered a unlockedfiles
where
unlockedfiles = filterM isUnmodifiedUnlocked
=<< seekHelper ww (const LsFiles.typeChangedStaged) l
=<< seekHelper id ww (const LsFiles.typeChangedStaged) l
isUnmodifiedUnlocked :: RawFilePath -> Annex Bool
isUnmodifiedUnlocked f = catKeyFile f >>= \case
@ -130,11 +139,11 @@ isUnmodifiedUnlocked f = catKeyFile f >>= \case
{- Finds files that may be modified. -}
withFilesMaybeModified :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withFilesMaybeModified ww a params = seekActions $
prepFiltered a $ seekHelper ww LsFiles.modified params
withFilesMaybeModified ww a params = seekFiltered a $
seekHelper id ww LsFiles.modified params
withKeys :: (Key -> CommandSeek) -> CmdParams -> CommandSeek
withKeys a l = seekActions $ return $ map (a . parse) l
withKeys a l = sequence_ $ map (a . parse) l
where
parse p = fromMaybe (giveup "bad key") $ deserializeKey p
@ -251,23 +260,51 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
forM_ ts $ \(t, i) ->
keyaction (transferKey t, mkActionItem (t, i))
prepFiltered :: (RawFilePath -> CommandSeek) -> Annex [RawFilePath] -> Annex [CommandSeek]
prepFiltered a fs = do
seekFiltered :: (RawFilePath -> CommandSeek) -> Annex [RawFilePath] -> Annex ()
seekFiltered a fs = do
matcher <- Limit.getMatcher
map (process matcher) <$> fs
sequence_ =<< (map (process matcher) <$> fs)
where
process matcher f =
whenM (matcher $ MatchingFile $ FileInfo f f) $ a f
seekActions :: Annex [CommandSeek] -> Annex ()
seekActions gen = sequence_ =<< gen
seekFiltered' :: (RawFilePath -> Key -> CommandSeek) -> Annex [(RawFilePath, Git.Sha, FileMode)] -> Annex ()
seekFiltered' a fs = do
g <- Annex.gitRepo
catObjectStream' g $ \feeder closer reader -> do
tid <- liftIO . async =<< forkState (gofeed feeder closer)
goread reader
join (liftIO (wait tid))
where
gofeed feeder closer = do
matcher <- Limit.getMatcher
l <- fs
forM_ l $ process matcher feeder
liftIO closer
process matcher feeder (f, sha, mode) =
-- TODO handle non-symlink separately to avoid
-- catting large files
-- If the matcher needs to look up a key, it should be run
-- in goread, not here, and the key passed in. OTOH, if
-- the matcher does not need to look up a key, it's more
-- efficient to put it here, to avoid catting files that
-- will not be matched.
whenM (matcher $ MatchingFile $ FileInfo f f) $
liftIO $ feeder (f, sha)
seekHelper :: WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([RawFilePath], IO Bool)) -> [WorkTreeItem] -> Annex [RawFilePath]
seekHelper ww a l = do
goread reader = liftIO reader >>= \case
Just (f, content) -> do
maybe noop (a f) (parseLinkTargetOrPointerLazy =<< content)
goread reader
_ -> return ()
seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> [WorkTreeItem] -> Annex [a]
seekHelper c ww a l = do
os <- seekOptions ww
inRepo $ \g ->
concat . concat <$> forM (segmentXargsOrdered l')
(runSegmentPaths (\fs -> Git.Command.leaveZombie <$> a os fs g) . map toRawFilePath)
(runSegmentPaths c (\fs -> Git.Command.leaveZombie <$> a os fs g) . map toRawFilePath)
where
l' = map (\(WorkTreeItem f) -> f) l

View file

@ -200,7 +200,7 @@ changeExport r db (PreferredFiltered new) = do
mapdiff a oldtreesha newtreesha = do
(diff, cleanup) <- inRepo $
Git.DiffTree.diffTreeRecursive oldtreesha newtreesha
seekActions $ pure $ map a diff
sequence_ $ map a diff
void $ liftIO cleanup
-- Map of old and new filenames for each changed ExportKey in a diff.

View file

@ -40,12 +40,13 @@ optParser desc = GetOptions
seek :: GetOptions -> CommandSeek
seek o = startConcurrency downloadStages $ do
from <- maybe (pure Nothing) (Just <$$> getParsed) (getFrom o)
let go = whenAnnexed $ start o from
let go = start o from
case batchOption o of
Batch fmt -> batchFilesMatching fmt (go . toRawFilePath)
Batch fmt -> batchFilesMatching fmt
(whenAnnexed go . toRawFilePath)
NoBatch -> withKeyOptions (keyOptions o) (autoMode o)
(commandAction . startKeys from)
(withFilesInGit ww (commandAction . go))
(withFilesInGitAnnex ww (\f k -> commandAction (go f k)))
=<< workTreeItems ww (getFiles o)
where
ww = WarnUnmatchLsFiles

View file

@ -130,7 +130,7 @@ send ups fs = do
starting "sending files" (ActionItemOther Nothing) $
withTmpFile "send" $ \t h -> do
let ww = WarnUnmatchLsFiles
fs' <- seekHelper ww LsFiles.inRepo
fs' <- seekHelper id ww LsFiles.inRepo
=<< workTreeItems ww fs
matcher <- Limit.getMatcher
let addlist f o = whenM (matcher $ MatchingFile $ FileInfo f f) $

View file

@ -652,11 +652,11 @@ seekSyncContent o rs currbranch = do
liftIO $ not <$> isEmptyMVar mvar
where
seekworktree mvar l bloomfeeder =
seekHelper ww LsFiles.inRepo l
seekHelper id ww LsFiles.inRepo l
>>= gofiles bloomfeeder mvar
seekincludinghidden origbranch mvar l bloomfeeder =
seekHelper ww (LsFiles.inRepoOrBranch origbranch) l
seekHelper id ww (LsFiles.inRepoOrBranch origbranch) l
>>= gofiles bloomfeeder mvar
ww = WarnUnmatchLsFiles

View file

@ -214,22 +214,23 @@ prop_relPathDirToFile_regressionTest = same_dir_shortcurcuits_at_difference
- we stop preserving ordering at that point. Presumably a user passing
- that many paths in doesn't care too much about order of the later ones.
-}
segmentPaths :: [RawFilePath] -> [RawFilePath] -> [[RawFilePath]]
segmentPaths [] new = [new]
segmentPaths [_] new = [new] -- optimisation
segmentPaths (l:ls) new = found : segmentPaths ls rest
segmentPaths :: (a -> RawFilePath) -> [RawFilePath] -> [a] -> [[a]]
segmentPaths _ [] new = [new]
segmentPaths _ [_] new = [new] -- optimisation
segmentPaths c (l:ls) new = found : segmentPaths c ls rest
where
(found, rest) = if length ls < 100
then partition inl new
else break (not . inl) new
inl f = fromRawFilePath l `dirContains` fromRawFilePath f
inl f = l' `dirContains` fromRawFilePath (c f)
l' = fromRawFilePath l
{- This assumes that it's cheaper to call segmentPaths on the result,
- than it would be to run the action separately with each path. In
- the case of git file list commands, that assumption tends to hold.
-}
runSegmentPaths :: ([RawFilePath] -> IO [RawFilePath]) -> [RawFilePath] -> IO [[RawFilePath]]
runSegmentPaths a paths = segmentPaths paths <$> a paths
runSegmentPaths :: (a -> RawFilePath) -> ([RawFilePath] -> IO [a]) -> [RawFilePath] -> IO [[a]]
runSegmentPaths c a paths = segmentPaths c paths <$> a paths
{- Converts paths in the home directory to use ~/ -}
relHome :: FilePath -> IO String