optimisation: stream keys through git cat-file --buffer
This is only implemented for git-annex get so far. It makes git-annex get nearly twice as fast in a repo with 10k files, all of them present! But, see the TODO for some caveats.
This commit is contained in:
parent
1df9e72a78
commit
4c9ad1de46
6 changed files with 78 additions and 39 deletions
|
@ -29,25 +29,35 @@ import Logs.Transfer
|
||||||
import Remote.List
|
import Remote.List
|
||||||
import qualified Remote
|
import qualified Remote
|
||||||
import Annex.CatFile
|
import Annex.CatFile
|
||||||
import Git.CatFile (catObjectStream)
|
import Git.CatFile
|
||||||
import Annex.CurrentBranch
|
import Annex.CurrentBranch
|
||||||
import Annex.Content
|
import Annex.Content
|
||||||
|
import Annex.Link
|
||||||
import Annex.InodeSentinal
|
import Annex.InodeSentinal
|
||||||
|
import Annex.Concurrent
|
||||||
import qualified Annex.Branch
|
import qualified Annex.Branch
|
||||||
import qualified Annex.BranchState
|
import qualified Annex.BranchState
|
||||||
import qualified Database.Keys
|
import qualified Database.Keys
|
||||||
import qualified Utility.RawFilePath as R
|
import qualified Utility.RawFilePath as R
|
||||||
|
import Utility.Tuple
|
||||||
|
|
||||||
|
import Control.Concurrent.Async
|
||||||
|
import System.Posix.Types
|
||||||
|
|
||||||
withFilesInGit :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
|
withFilesInGit :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
|
||||||
withFilesInGit ww a l = seekActions $ prepFiltered a $
|
withFilesInGit ww a l = seekFiltered a $
|
||||||
seekHelper ww LsFiles.inRepo l
|
seekHelper id ww LsFiles.inRepo l
|
||||||
|
|
||||||
|
withFilesInGitAnnex :: WarnUnmatchWhen -> (RawFilePath -> Key -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
|
||||||
|
withFilesInGitAnnex ww a l = seekFiltered' a $
|
||||||
|
seekHelper fst3 ww LsFiles.inRepoDetails l
|
||||||
|
|
||||||
withFilesInGitNonRecursive :: WarnUnmatchWhen -> String -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
|
withFilesInGitNonRecursive :: WarnUnmatchWhen -> String -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
|
||||||
withFilesInGitNonRecursive ww needforce a l = ifM (Annex.getState Annex.force)
|
withFilesInGitNonRecursive ww needforce a l = ifM (Annex.getState Annex.force)
|
||||||
( withFilesInGit ww a l
|
( withFilesInGit ww a l
|
||||||
, if null l
|
, if null l
|
||||||
then giveup needforce
|
then giveup needforce
|
||||||
else seekActions $ prepFiltered a (getfiles [] l)
|
else seekFiltered a (getfiles [] l)
|
||||||
)
|
)
|
||||||
where
|
where
|
||||||
getfiles c [] = return (reverse c)
|
getfiles c [] = return (reverse c)
|
||||||
|
@ -71,8 +81,8 @@ withFilesNotInGit a l = go =<< seek
|
||||||
g <- gitRepo
|
g <- gitRepo
|
||||||
liftIO $ Git.Command.leaveZombie
|
liftIO $ Git.Command.leaveZombie
|
||||||
<$> LsFiles.notInRepo [] force (map (\(WorkTreeItem f) -> toRawFilePath f) l) g
|
<$> LsFiles.notInRepo [] force (map (\(WorkTreeItem f) -> toRawFilePath f) l) g
|
||||||
go fs = seekActions $ prepFiltered a $
|
go fs = seekFiltered a $
|
||||||
return $ concat $ segmentPaths (map (\(WorkTreeItem f) -> toRawFilePath f) l) fs
|
return $ concat $ segmentPaths id (map (\(WorkTreeItem f) -> toRawFilePath f) l) fs
|
||||||
|
|
||||||
withPathContents :: ((FilePath, FilePath) -> CommandSeek) -> CmdParams -> CommandSeek
|
withPathContents :: ((FilePath, FilePath) -> CommandSeek) -> CmdParams -> CommandSeek
|
||||||
withPathContents a params = do
|
withPathContents a params = do
|
||||||
|
@ -94,21 +104,21 @@ withPathContents a params = do
|
||||||
}
|
}
|
||||||
|
|
||||||
withWords :: ([String] -> CommandSeek) -> CmdParams -> CommandSeek
|
withWords :: ([String] -> CommandSeek) -> CmdParams -> CommandSeek
|
||||||
withWords a params = seekActions $ return [a params]
|
withWords a params = a params
|
||||||
|
|
||||||
withStrings :: (String -> CommandSeek) -> CmdParams -> CommandSeek
|
withStrings :: (String -> CommandSeek) -> CmdParams -> CommandSeek
|
||||||
withStrings a params = seekActions $ return $ map a params
|
withStrings a params = sequence_ $ map a params
|
||||||
|
|
||||||
withPairs :: ((String, String) -> CommandSeek) -> CmdParams -> CommandSeek
|
withPairs :: ((String, String) -> CommandSeek) -> CmdParams -> CommandSeek
|
||||||
withPairs a params = seekActions $ return $ map a $ pairs [] params
|
withPairs a params = sequence_ $ map a $ pairs [] params
|
||||||
where
|
where
|
||||||
pairs c [] = reverse c
|
pairs c [] = reverse c
|
||||||
pairs c (x:y:xs) = pairs ((x,y):c) xs
|
pairs c (x:y:xs) = pairs ((x,y):c) xs
|
||||||
pairs _ _ = giveup "expected pairs"
|
pairs _ _ = giveup "expected pairs"
|
||||||
|
|
||||||
withFilesToBeCommitted :: (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
|
withFilesToBeCommitted :: (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
|
||||||
withFilesToBeCommitted a l = seekActions $ prepFiltered a $
|
withFilesToBeCommitted a l = seekFiltered a $
|
||||||
seekHelper WarnUnmatchWorkTreeItems (const LsFiles.stagedNotDeleted) l
|
seekHelper id WarnUnmatchWorkTreeItems (const LsFiles.stagedNotDeleted) l
|
||||||
|
|
||||||
isOldUnlocked :: RawFilePath -> Annex Bool
|
isOldUnlocked :: RawFilePath -> Annex Bool
|
||||||
isOldUnlocked f = liftIO (notSymlink f) <&&>
|
isOldUnlocked f = liftIO (notSymlink f) <&&>
|
||||||
|
@ -117,11 +127,10 @@ isOldUnlocked f = liftIO (notSymlink f) <&&>
|
||||||
{- unlocked pointer files that are staged, and whose content has not been
|
{- unlocked pointer files that are staged, and whose content has not been
|
||||||
- modified-}
|
- modified-}
|
||||||
withUnmodifiedUnlockedPointers :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
|
withUnmodifiedUnlockedPointers :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
|
||||||
withUnmodifiedUnlockedPointers ww a l = seekActions $
|
withUnmodifiedUnlockedPointers ww a l = seekFiltered a unlockedfiles
|
||||||
prepFiltered a unlockedfiles
|
|
||||||
where
|
where
|
||||||
unlockedfiles = filterM isUnmodifiedUnlocked
|
unlockedfiles = filterM isUnmodifiedUnlocked
|
||||||
=<< seekHelper ww (const LsFiles.typeChangedStaged) l
|
=<< seekHelper id ww (const LsFiles.typeChangedStaged) l
|
||||||
|
|
||||||
isUnmodifiedUnlocked :: RawFilePath -> Annex Bool
|
isUnmodifiedUnlocked :: RawFilePath -> Annex Bool
|
||||||
isUnmodifiedUnlocked f = catKeyFile f >>= \case
|
isUnmodifiedUnlocked f = catKeyFile f >>= \case
|
||||||
|
@ -130,11 +139,11 @@ isUnmodifiedUnlocked f = catKeyFile f >>= \case
|
||||||
|
|
||||||
{- Finds files that may be modified. -}
|
{- Finds files that may be modified. -}
|
||||||
withFilesMaybeModified :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
|
withFilesMaybeModified :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
|
||||||
withFilesMaybeModified ww a params = seekActions $
|
withFilesMaybeModified ww a params = seekFiltered a $
|
||||||
prepFiltered a $ seekHelper ww LsFiles.modified params
|
seekHelper id ww LsFiles.modified params
|
||||||
|
|
||||||
withKeys :: (Key -> CommandSeek) -> CmdParams -> CommandSeek
|
withKeys :: (Key -> CommandSeek) -> CmdParams -> CommandSeek
|
||||||
withKeys a l = seekActions $ return $ map (a . parse) l
|
withKeys a l = sequence_ $ map (a . parse) l
|
||||||
where
|
where
|
||||||
parse p = fromMaybe (giveup "bad key") $ deserializeKey p
|
parse p = fromMaybe (giveup "bad key") $ deserializeKey p
|
||||||
|
|
||||||
|
@ -251,23 +260,51 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
|
||||||
forM_ ts $ \(t, i) ->
|
forM_ ts $ \(t, i) ->
|
||||||
keyaction (transferKey t, mkActionItem (t, i))
|
keyaction (transferKey t, mkActionItem (t, i))
|
||||||
|
|
||||||
prepFiltered :: (RawFilePath -> CommandSeek) -> Annex [RawFilePath] -> Annex [CommandSeek]
|
seekFiltered :: (RawFilePath -> CommandSeek) -> Annex [RawFilePath] -> Annex ()
|
||||||
prepFiltered a fs = do
|
seekFiltered a fs = do
|
||||||
matcher <- Limit.getMatcher
|
matcher <- Limit.getMatcher
|
||||||
map (process matcher) <$> fs
|
sequence_ =<< (map (process matcher) <$> fs)
|
||||||
where
|
where
|
||||||
process matcher f =
|
process matcher f =
|
||||||
whenM (matcher $ MatchingFile $ FileInfo f f) $ a f
|
whenM (matcher $ MatchingFile $ FileInfo f f) $ a f
|
||||||
|
|
||||||
seekActions :: Annex [CommandSeek] -> Annex ()
|
seekFiltered' :: (RawFilePath -> Key -> CommandSeek) -> Annex [(RawFilePath, Git.Sha, FileMode)] -> Annex ()
|
||||||
seekActions gen = sequence_ =<< gen
|
seekFiltered' a fs = do
|
||||||
|
g <- Annex.gitRepo
|
||||||
|
catObjectStream' g $ \feeder closer reader -> do
|
||||||
|
tid <- liftIO . async =<< forkState (gofeed feeder closer)
|
||||||
|
goread reader
|
||||||
|
join (liftIO (wait tid))
|
||||||
|
where
|
||||||
|
gofeed feeder closer = do
|
||||||
|
matcher <- Limit.getMatcher
|
||||||
|
l <- fs
|
||||||
|
forM_ l $ process matcher feeder
|
||||||
|
liftIO closer
|
||||||
|
|
||||||
seekHelper :: WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([RawFilePath], IO Bool)) -> [WorkTreeItem] -> Annex [RawFilePath]
|
process matcher feeder (f, sha, mode) =
|
||||||
seekHelper ww a l = do
|
-- TODO handle non-symlink separately to avoid
|
||||||
|
-- catting large files
|
||||||
|
-- If the matcher needs to look up a key, it should be run
|
||||||
|
-- in goread, not here, and the key passed in. OTOH, if
|
||||||
|
-- the matcher does not need to look up a key, it's more
|
||||||
|
-- efficient to put it here, to avoid catting files that
|
||||||
|
-- will not be matched.
|
||||||
|
whenM (matcher $ MatchingFile $ FileInfo f f) $
|
||||||
|
liftIO $ feeder (f, sha)
|
||||||
|
|
||||||
|
goread reader = liftIO reader >>= \case
|
||||||
|
Just (f, content) -> do
|
||||||
|
maybe noop (a f) (parseLinkTargetOrPointerLazy =<< content)
|
||||||
|
goread reader
|
||||||
|
_ -> return ()
|
||||||
|
|
||||||
|
seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> [WorkTreeItem] -> Annex [a]
|
||||||
|
seekHelper c ww a l = do
|
||||||
os <- seekOptions ww
|
os <- seekOptions ww
|
||||||
inRepo $ \g ->
|
inRepo $ \g ->
|
||||||
concat . concat <$> forM (segmentXargsOrdered l')
|
concat . concat <$> forM (segmentXargsOrdered l')
|
||||||
(runSegmentPaths (\fs -> Git.Command.leaveZombie <$> a os fs g) . map toRawFilePath)
|
(runSegmentPaths c (\fs -> Git.Command.leaveZombie <$> a os fs g) . map toRawFilePath)
|
||||||
where
|
where
|
||||||
l' = map (\(WorkTreeItem f) -> f) l
|
l' = map (\(WorkTreeItem f) -> f) l
|
||||||
|
|
||||||
|
|
|
@ -200,7 +200,7 @@ changeExport r db (PreferredFiltered new) = do
|
||||||
mapdiff a oldtreesha newtreesha = do
|
mapdiff a oldtreesha newtreesha = do
|
||||||
(diff, cleanup) <- inRepo $
|
(diff, cleanup) <- inRepo $
|
||||||
Git.DiffTree.diffTreeRecursive oldtreesha newtreesha
|
Git.DiffTree.diffTreeRecursive oldtreesha newtreesha
|
||||||
seekActions $ pure $ map a diff
|
sequence_ $ map a diff
|
||||||
void $ liftIO cleanup
|
void $ liftIO cleanup
|
||||||
|
|
||||||
-- Map of old and new filenames for each changed ExportKey in a diff.
|
-- Map of old and new filenames for each changed ExportKey in a diff.
|
||||||
|
|
|
@ -40,12 +40,13 @@ optParser desc = GetOptions
|
||||||
seek :: GetOptions -> CommandSeek
|
seek :: GetOptions -> CommandSeek
|
||||||
seek o = startConcurrency downloadStages $ do
|
seek o = startConcurrency downloadStages $ do
|
||||||
from <- maybe (pure Nothing) (Just <$$> getParsed) (getFrom o)
|
from <- maybe (pure Nothing) (Just <$$> getParsed) (getFrom o)
|
||||||
let go = whenAnnexed $ start o from
|
let go = start o from
|
||||||
case batchOption o of
|
case batchOption o of
|
||||||
Batch fmt -> batchFilesMatching fmt (go . toRawFilePath)
|
Batch fmt -> batchFilesMatching fmt
|
||||||
|
(whenAnnexed go . toRawFilePath)
|
||||||
NoBatch -> withKeyOptions (keyOptions o) (autoMode o)
|
NoBatch -> withKeyOptions (keyOptions o) (autoMode o)
|
||||||
(commandAction . startKeys from)
|
(commandAction . startKeys from)
|
||||||
(withFilesInGit ww (commandAction . go))
|
(withFilesInGitAnnex ww (\f k -> commandAction (go f k)))
|
||||||
=<< workTreeItems ww (getFiles o)
|
=<< workTreeItems ww (getFiles o)
|
||||||
where
|
where
|
||||||
ww = WarnUnmatchLsFiles
|
ww = WarnUnmatchLsFiles
|
||||||
|
|
|
@ -130,7 +130,7 @@ send ups fs = do
|
||||||
starting "sending files" (ActionItemOther Nothing) $
|
starting "sending files" (ActionItemOther Nothing) $
|
||||||
withTmpFile "send" $ \t h -> do
|
withTmpFile "send" $ \t h -> do
|
||||||
let ww = WarnUnmatchLsFiles
|
let ww = WarnUnmatchLsFiles
|
||||||
fs' <- seekHelper ww LsFiles.inRepo
|
fs' <- seekHelper id ww LsFiles.inRepo
|
||||||
=<< workTreeItems ww fs
|
=<< workTreeItems ww fs
|
||||||
matcher <- Limit.getMatcher
|
matcher <- Limit.getMatcher
|
||||||
let addlist f o = whenM (matcher $ MatchingFile $ FileInfo f f) $
|
let addlist f o = whenM (matcher $ MatchingFile $ FileInfo f f) $
|
||||||
|
|
|
@ -652,11 +652,11 @@ seekSyncContent o rs currbranch = do
|
||||||
liftIO $ not <$> isEmptyMVar mvar
|
liftIO $ not <$> isEmptyMVar mvar
|
||||||
where
|
where
|
||||||
seekworktree mvar l bloomfeeder =
|
seekworktree mvar l bloomfeeder =
|
||||||
seekHelper ww LsFiles.inRepo l
|
seekHelper id ww LsFiles.inRepo l
|
||||||
>>= gofiles bloomfeeder mvar
|
>>= gofiles bloomfeeder mvar
|
||||||
|
|
||||||
seekincludinghidden origbranch mvar l bloomfeeder =
|
seekincludinghidden origbranch mvar l bloomfeeder =
|
||||||
seekHelper ww (LsFiles.inRepoOrBranch origbranch) l
|
seekHelper id ww (LsFiles.inRepoOrBranch origbranch) l
|
||||||
>>= gofiles bloomfeeder mvar
|
>>= gofiles bloomfeeder mvar
|
||||||
|
|
||||||
ww = WarnUnmatchLsFiles
|
ww = WarnUnmatchLsFiles
|
||||||
|
|
|
@ -214,22 +214,23 @@ prop_relPathDirToFile_regressionTest = same_dir_shortcurcuits_at_difference
|
||||||
- we stop preserving ordering at that point. Presumably a user passing
|
- we stop preserving ordering at that point. Presumably a user passing
|
||||||
- that many paths in doesn't care too much about order of the later ones.
|
- that many paths in doesn't care too much about order of the later ones.
|
||||||
-}
|
-}
|
||||||
segmentPaths :: [RawFilePath] -> [RawFilePath] -> [[RawFilePath]]
|
segmentPaths :: (a -> RawFilePath) -> [RawFilePath] -> [a] -> [[a]]
|
||||||
segmentPaths [] new = [new]
|
segmentPaths _ [] new = [new]
|
||||||
segmentPaths [_] new = [new] -- optimisation
|
segmentPaths _ [_] new = [new] -- optimisation
|
||||||
segmentPaths (l:ls) new = found : segmentPaths ls rest
|
segmentPaths c (l:ls) new = found : segmentPaths c ls rest
|
||||||
where
|
where
|
||||||
(found, rest) = if length ls < 100
|
(found, rest) = if length ls < 100
|
||||||
then partition inl new
|
then partition inl new
|
||||||
else break (not . inl) new
|
else break (not . inl) new
|
||||||
inl f = fromRawFilePath l `dirContains` fromRawFilePath f
|
inl f = l' `dirContains` fromRawFilePath (c f)
|
||||||
|
l' = fromRawFilePath l
|
||||||
|
|
||||||
{- This assumes that it's cheaper to call segmentPaths on the result,
|
{- This assumes that it's cheaper to call segmentPaths on the result,
|
||||||
- than it would be to run the action separately with each path. In
|
- than it would be to run the action separately with each path. In
|
||||||
- the case of git file list commands, that assumption tends to hold.
|
- the case of git file list commands, that assumption tends to hold.
|
||||||
-}
|
-}
|
||||||
runSegmentPaths :: ([RawFilePath] -> IO [RawFilePath]) -> [RawFilePath] -> IO [[RawFilePath]]
|
runSegmentPaths :: (a -> RawFilePath) -> ([RawFilePath] -> IO [a]) -> [RawFilePath] -> IO [[a]]
|
||||||
runSegmentPaths a paths = segmentPaths paths <$> a paths
|
runSegmentPaths c a paths = segmentPaths c paths <$> a paths
|
||||||
|
|
||||||
{- Converts paths in the home directory to use ~/ -}
|
{- Converts paths in the home directory to use ~/ -}
|
||||||
relHome :: FilePath -> IO String
|
relHome :: FilePath -> IO String
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue