mostly done with location log precaching

Some nice wins.
This commit is contained in:
Joey Hess 2020-07-13 17:04:02 -04:00
parent df58609804
commit 75aab72d23
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
22 changed files with 217 additions and 68 deletions

View file

@ -30,6 +30,8 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
* Sped up the --all option by 2x to 16x by using git cat-file --buffer.
Thanks to Lukey for finding this optimisation.
* Sped up seeking for annexed files to operate on by a factor of nearly 2x.
* Sped up sync --content by 100% (without --all).
* Sped up some other commands like fsck --fast and whereis by around 50%.
* fsck: Detect if WORM keys contain a carriage return, and recommend
upgrading the key. (git-annex could have maybe created such keys back
in 2013).

View file

@ -20,6 +20,7 @@ import qualified Git.Command
import qualified Git.LsFiles as LsFiles
import qualified Git.LsTree as LsTree
import qualified Git.Types as Git
import qualified Git.Ref
import Git.FilePath
import qualified Limit
import CmdLine.GitAnnex.Options
@ -49,11 +50,17 @@ withFilesInGit :: WarnUnmatchWhen -> (RawFilePath -> CommandSeek) -> [WorkTreeIt
withFilesInGit ww a l = seekFiltered a $
seekHelper id ww LsFiles.inRepo l
withFilesInGitAnnex :: WarnUnmatchWhen -> (RawFilePath -> Key -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
data AnnexedFileSeeker = AnnexedFileSeeker
{ seekAction :: RawFilePath -> Key -> CommandSeek
, checkContentPresent :: Maybe Bool
, usesLocationLog :: Bool
}
withFilesInGitAnnex :: WarnUnmatchWhen -> AnnexedFileSeeker -> [WorkTreeItem] -> CommandSeek
withFilesInGitAnnex ww a l = seekFilteredKeys a $
seekHelper fst3 ww LsFiles.inRepoDetails l
withFilesInGitAnnexNonRecursive :: WarnUnmatchWhen -> String -> (RawFilePath -> Key -> CommandSeek) -> [WorkTreeItem] -> CommandSeek
withFilesInGitAnnexNonRecursive :: WarnUnmatchWhen -> String -> AnnexedFileSeeker -> [WorkTreeItem] -> CommandSeek
withFilesInGitAnnexNonRecursive ww needforce a l = ifM (Annex.getState Annex.force)
( withFilesInGitAnnex ww a l
, if null l
@ -265,35 +272,72 @@ seekFiltered a fs = do
process matcher f =
whenM (matcher $ MatchingFile $ FileInfo f f) $ a f
-- This is significantly faster than using lookupKey after seekFiltered.
seekFilteredKeys :: (RawFilePath -> Key -> CommandSeek) -> Annex [(RawFilePath, Git.Sha, FileMode)] -> Annex ()
seekFilteredKeys a listfs = do
-- This is significantly faster than using lookupKey after seekFiltered,
-- because of the way data is streamed through git cat-file.
--
-- It can also precache location logs using the same efficient streaming.
seekFilteredKeys :: AnnexedFileSeeker -> Annex [(RawFilePath, Git.Sha, FileMode)] -> Annex ()
seekFilteredKeys seeker listfs = do
g <- Annex.gitRepo
matcher <- Limit.getMatcher
config <- Annex.getGitConfig
-- Run here, not in the async, because it could throw an exception
-- The list should be built lazily.
l <- listfs
catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader ->
catObjectStream g $ \feeder closer reader -> do
catObjectStream g $ \ofeeder ocloser oreader -> do
processertid <- liftIO . async =<< forkState
(process matcher feeder mdfeeder mdcloser False l)
(process matcher ofeeder mdfeeder mdcloser False l)
mdprocessertid <- liftIO . async =<< forkState
(mdprocess matcher mdreader feeder closer)
goread reader
(mdprocess matcher mdreader ofeeder ocloser)
if usesLocationLog seeker
then catObjectStream g $ \lfeeder lcloser lreader -> do
precachertid <- liftIO . async =<< forkState
(precacher config oreader lfeeder lcloser)
precachefinisher lreader
join (liftIO (wait precachertid))
else finisher oreader
join (liftIO (wait mdprocessertid))
join (liftIO (wait processertid))
where
goread reader = liftIO reader >>= \case
checkpresence k cont = case checkContentPresent seeker of
Just v -> do
present <- inAnnex k
when (present == v) cont
Nothing -> cont
finisher oreader = liftIO oreader >>= \case
Just (f, content) -> do
maybe noop (a f) (parseLinkTargetOrPointerLazy =<< content)
goread reader
case parseLinkTargetOrPointerLazy =<< content of
Just k -> checkpresence k $
seekAction seeker f k
Nothing -> noop
finisher oreader
Nothing -> return ()
precachefinisher lreader = liftIO lreader >>= \case
Just ((logf, f, k), logcontent) -> do
maybe noop (Annex.BranchState.setCache logf) logcontent
seekAction seeker f k
precachefinisher lreader
Nothing -> return ()
feedmatches matcher feeder f sha =
precacher config oreader lfeeder lcloser = liftIO oreader >>= \case
Just (f, content) -> do
case parseLinkTargetOrPointerLazy =<< content of
Just k -> checkpresence k $
let logf = locationLogFile config k
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
in liftIO $ lfeeder ((logf, f, k), ref)
Nothing -> noop
precacher config oreader lfeeder lcloser
Nothing -> liftIO $ void lcloser
feedmatches matcher ofeeder f sha =
whenM (matcher $ MatchingFile $ FileInfo f f) $
liftIO $ feeder (f, sha)
liftIO $ ofeeder (f, sha)
process matcher feeder mdfeeder mdcloser seenpointer ((f, sha, mode):rest) =
process matcher ofeeder mdfeeder mdcloser seenpointer ((f, sha, mode):rest) =
case Git.toTreeItemType mode of
Just Git.TreeSymlink -> do
-- Once a pointer file has been seen,
@ -303,27 +347,27 @@ seekFilteredKeys a listfs = do
-- file order.
if seenpointer
then liftIO $ mdfeeder (f, sha)
else feedmatches matcher feeder f sha
process matcher feeder mdfeeder mdcloser seenpointer rest
else feedmatches matcher ofeeder f sha
process matcher ofeeder mdfeeder mdcloser seenpointer rest
Just Git.TreeSubmodule ->
process matcher feeder mdfeeder mdcloser seenpointer rest
process matcher ofeeder mdfeeder mdcloser seenpointer rest
-- Might be a pointer file, might be other
-- file in git, possibly large. Avoid catting
-- large files by first looking up the size.
Just _ -> do
liftIO $ mdfeeder (f, sha)
process matcher feeder mdfeeder mdcloser True rest
process matcher ofeeder mdfeeder mdcloser True rest
Nothing ->
process matcher feeder mdfeeder mdcloser seenpointer rest
process matcher ofeeder mdfeeder mdcloser seenpointer rest
process _ _ _ mdcloser _ [] = liftIO $ void mdcloser
mdprocess matcher mdreader feeder closer = liftIO mdreader >>= \case
mdprocess matcher mdreader ofeeder ocloser = liftIO mdreader >>= \case
Just (f, Just (sha, size, _type))
| size < maxPointerSz -> do
feedmatches matcher feeder f sha
mdprocess matcher mdreader feeder closer
Just _ -> mdprocess matcher mdreader feeder closer
Nothing -> liftIO $ void closer
feedmatches matcher ofeeder f sha
mdprocess matcher mdreader ofeeder ocloser
Just _ -> mdprocess matcher mdreader ofeeder ocloser
Nothing -> liftIO $ void ocloser
seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> [WorkTreeItem] -> Annex [a]
seekHelper c ww a l = do

View file

@ -46,13 +46,18 @@ instance DeferredParseClass CopyOptions where
seek :: CopyOptions -> CommandSeek
seek o = startConcurrency commandStages $ do
let go = start o
let seeker = AnnexedFileSeeker
{ seekAction = commandAction' go
, checkContentPresent = Nothing
, usesLocationLog = False
}
case batchOption o of
Batch fmt -> batchFilesMatching fmt
(whenAnnexed go . toRawFilePath)
NoBatch -> withKeyOptions
(keyOptions o) (autoMode o)
(commandAction . Command.Move.startKey (fromToOptions o) Command.Move.RemoveNever)
(withFilesInGitAnnex ww (commandAction' go))
(withFilesInGitAnnex ww seeker)
=<< workTreeItems ww (copyFiles o)
where
ww = WarnUnmatchLsFiles

View file

@ -58,12 +58,18 @@ seek o = startConcurrency commandStages $
(whenAnnexed go . toRawFilePath)
NoBatch -> withKeyOptions (keyOptions o) (autoMode o)
(commandAction . startKeys o)
(withFilesInGitAnnex ww (commandAction' go))
(withFilesInGitAnnex ww seeker)
=<< workTreeItems ww (dropFiles o)
where
go = start o
ww = WarnUnmatchLsFiles
seeker = AnnexedFileSeeker
{ seekAction = commandAction' go
, checkContentPresent = Nothing
, usesLocationLog = False
}
start :: DropOptions -> RawFilePath -> Key -> CommandStart
start o file key = start' o key afile ai
where

View file

@ -13,7 +13,6 @@ import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
import Command
import Annex.Content
import Limit
import Types.Key
import Git.FilePath
@ -55,24 +54,31 @@ parseFormatOption =
seek :: FindOptions -> CommandSeek
seek o = case batchOption o of
NoBatch -> withKeyOptions (keyOptions o) False
(commandAction . startKeys o)
(withFilesInGitAnnex ww (commandAction' go))
=<< workTreeItems ww (findThese o)
NoBatch -> do
islimited <- limited
let seeker = AnnexedFileSeeker
{ seekAction = commandAction' go
-- only files with content present are shown, unless
-- the user has requested others via a limit
, checkContentPresent = if islimited
then Nothing
else Just True
, usesLocationLog = False
}
withKeyOptions (keyOptions o) False
(commandAction . startKeys o)
(withFilesInGitAnnex ww seeker)
=<< workTreeItems ww (findThese o)
Batch fmt -> batchFilesMatching fmt
(whenAnnexed go . toRawFilePath)
where
go = start o
ww = WarnUnmatchLsFiles
-- only files inAnnex are shown, unless the user has requested
-- others via a limit
start :: FindOptions -> RawFilePath -> Key -> CommandStart
start o file key =
stopUnless (limited <||> inAnnex key) $
startingCustomOutput key $ do
showFormatted (formatOption o) file $ ("file", fromRawFilePath file) : keyVars key
next $ return True
start o file key = startingCustomOutput key $ do
showFormatted (formatOption o) file $ ("file", fromRawFilePath file) : keyVars key
next $ return True
startKeys :: FindOptions -> (Key, ActionItem) -> CommandStart
startKeys o (key, ActionItemBranchFilePath (BranchFilePath _ topf) _) =

View file

@ -32,10 +32,14 @@ cmd = noCommit $ withGlobalOptions [annexedMatchingOptions] $
seek :: CmdParams -> CommandSeek
seek ps = unlessM crippledFileSystem $
withFilesInGitAnnex ww (commandAction' (start FixAll))
=<< workTreeItems ww ps
withFilesInGitAnnex ww seeker =<< workTreeItems ww ps
where
ww = WarnUnmatchLsFiles
seeker = AnnexedFileSeeker
{ seekAction = commandAction' (start FixAll)
, checkContentPresent = Nothing
, usesLocationLog = False
}
data FixWhat = FixSymlinks | FixAll

View file

@ -92,9 +92,14 @@ seek o = startConcurrency commandStages $ do
u <- maybe getUUID (pure . Remote.uuid) from
checkDeadRepo u
i <- prepIncremental u (incrementalOpt o)
let seeker = AnnexedFileSeeker
{ seekAction = commandAction' (start from i)
, checkContentPresent = Just True
, usesLocationLog = True
}
withKeyOptions (keyOptions o) False
(\kai -> commandAction . startKey from i kai =<< getNumCopies)
(withFilesInGit ww $ commandAction . (whenAnnexed (start from i)))
(withFilesInGitAnnex ww seeker)
=<< workTreeItems ww (fsckFiles o)
cleanupIncremental i
void $ tryIO $ recordActivity Fsck u

View file

@ -41,12 +41,17 @@ seek :: GetOptions -> CommandSeek
seek o = startConcurrency downloadStages $ do
from <- maybe (pure Nothing) (Just <$$> getParsed) (getFrom o)
let go = start o from
let seeker = AnnexedFileSeeker
{ seekAction = commandAction' go
, checkContentPresent = Just False
, usesLocationLog = True
}
case batchOption o of
Batch fmt -> batchFilesMatching fmt
(whenAnnexed go . toRawFilePath)
NoBatch -> withKeyOptions (keyOptions o) (autoMode o)
(commandAction . startKeys from)
(withFilesInGitAnnex ww (commandAction' go))
(withFilesInGitAnnex ww seeker)
=<< workTreeItems ww (getFiles o)
where
ww = WarnUnmatchLsFiles

View file

@ -38,8 +38,12 @@ seek o = do
| otherwise -> commandAction stop
_ -> do
let s = S.fromList ts
withFilesInGitAnnex ww
(commandAction' (start s))
let seeker = AnnexedFileSeeker
{ seekAction = commandAction' (start s)
, checkContentPresent = Nothing
, usesLocationLog = False
}
withFilesInGitAnnex ww seeker
=<< workTreeItems ww (inprogressFiles o)
where
ww = WarnUnmatchLsFiles

View file

@ -44,8 +44,12 @@ seek :: ListOptions -> CommandSeek
seek o = do
list <- getList o
printHeader list
withFilesInGitAnnex ww (commandAction' (start list))
=<< workTreeItems ww (listThese o)
let seeker = AnnexedFileSeeker
{ seekAction = commandAction' (start list)
, checkContentPresent = Nothing
, usesLocationLog = True
}
withFilesInGitAnnex ww seeker =<< workTreeItems ww (listThese o)
where
ww = WarnUnmatchLsFiles

View file

@ -28,11 +28,14 @@ cmd = withGlobalOptions [jsonOptions, annexedMatchingOptions] $
paramPaths (withParams seek)
seek :: CmdParams -> CommandSeek
seek ps = do
l <- workTreeItems ww ps
withFilesInGitAnnex ww (commandAction' start) l
seek ps = withFilesInGitAnnex ww seeker =<< workTreeItems ww ps
where
ww = WarnUnmatchLsFiles
seeker = AnnexedFileSeeker
{ seekAction = commandAction' start
, checkContentPresent = Nothing
, usesLocationLog = False
}
start :: RawFilePath -> Key -> CommandStart
start file key = ifM (isJust <$> isAnnexLink file)

View file

@ -85,9 +85,15 @@ seek o = do
m <- Remote.uuidDescriptions
zone <- liftIO getCurrentTimeZone
let outputter = mkOutputter m zone o
let seeker = AnnexedFileSeeker
{ seekAction = commandAction' (start o outputter)
, checkContentPresent = Nothing
-- the way this uses the location log would not be helped
-- by precaching the current value
, usesLocationLog = False
}
case (logFiles o, allOption o) of
(fs, False) -> withFilesInGitAnnex ww
(commandAction' (start o outputter))
(fs, False) -> withFilesInGitAnnex ww seeker
=<< workTreeItems ww fs
([], True) -> commandAction (startAll o outputter)
(_, True) -> giveup "Cannot specify both files and --all"

View file

@ -76,14 +76,19 @@ seek o = case batchOption o of
NoBatch -> do
c <- liftIO currentVectorClock
let ww = WarnUnmatchLsFiles
let seeker = case getSet o of
let seeker = AnnexedFileSeeker
{ seekAction = commandAction' (start c o)
, checkContentPresent = Nothing
, usesLocationLog = False
}
let seekaction = case getSet o of
Get _ -> withFilesInGitAnnex ww
GetAll -> withFilesInGitAnnex ww
Set _ -> withFilesInGitAnnexNonRecursive ww
"Not recursively setting metadata. Use --force to do that."
withKeyOptions (keyOptions o) False
(commandAction . startKeys c o)
(seeker (commandAction' (start c o)))
(seekaction seeker)
=<< workTreeItems ww (forFiles o)
Batch fmt -> withMessageState $ \s -> case outputType s of
JSONOutput _ -> ifM limited

View file

@ -26,10 +26,14 @@ cmd = withGlobalOptions [annexedMatchingOptions] $
paramPaths (withParams seek)
seek :: CmdParams -> CommandSeek
seek = withFilesInGitAnnex ww (commandAction' start)
<=< workTreeItems ww
seek = withFilesInGitAnnex ww seeker <=< workTreeItems ww
where
ww = WarnUnmatchLsFiles
seeker = AnnexedFileSeeker
{ seekAction = commandAction' start
, checkContentPresent = Nothing
, usesLocationLog = False
}
start :: RawFilePath -> Key -> CommandStart
start file key = do

View file

@ -44,13 +44,18 @@ seek :: MirrorOptions -> CommandSeek
seek o = startConcurrency stages $
withKeyOptions (keyOptions o) False
(commandAction . startKey o (AssociatedFile Nothing))
(withFilesInGitAnnex ww (commandAction' (start o)))
(withFilesInGitAnnex ww seeker)
=<< workTreeItems ww (mirrorFiles o)
where
stages = case fromToOptions o of
FromRemote _ -> downloadStages
ToRemote _ -> commandStages
ww = WarnUnmatchLsFiles
seeker = AnnexedFileSeeker
{ seekAction = commandAction' (start o)
, checkContentPresent = Nothing
, usesLocationLog = False
}
start :: MirrorOptions -> RawFilePath -> Key -> CommandStart
start o file k = startKey o afile (k, ai)

View file

@ -56,12 +56,17 @@ data RemoveWhen = RemoveSafe | RemoveNever
seek :: MoveOptions -> CommandSeek
seek o = startConcurrency stages $ do
let go = start (fromToOptions o) (removeWhen o)
let seeker = AnnexedFileSeeker
{ seekAction = commandAction' go
, checkContentPresent = Nothing
, usesLocationLog = False
}
case batchOption o of
Batch fmt -> batchFilesMatching fmt
(whenAnnexed go . toRawFilePath)
NoBatch -> withKeyOptions (keyOptions o) False
(commandAction . startKey (fromToOptions o) (removeWhen o))
(withFilesInGitAnnex ww (commandAction' go))
(withFilesInGitAnnex ww seeker)
=<< workTreeItems ww (moveFiles o)
where
stages = case fromToOptions o of

View file

@ -652,8 +652,13 @@ seekSyncContent o rs currbranch = do
waitForAllRunningCommandActions
liftIO $ not <$> isEmptyMVar mvar
where
seekworktree mvar l bloomfeeder =
seekFilteredKeys (gofile bloomfeeder mvar) $
seekworktree mvar l bloomfeeder = do
let seeker = AnnexedFileSeeker
{ seekAction = gofile bloomfeeder mvar
, checkContentPresent = Nothing
, usesLocationLog = True
}
seekFilteredKeys seeker $
seekHelper fst3 ww LsFiles.inRepoDetails l
seekincludinghidden origbranch mvar l bloomfeeder =

View file

@ -23,13 +23,19 @@ cmd = withGlobalOptions [annexedMatchingOptions] $
paramPaths (withParams seek)
seek :: CmdParams -> CommandSeek
seek ps = (withFilesInGitAnnex ww (commandAction' start))
=<< workTreeItems ww ps
seek ps = withFilesInGitAnnex ww seeker =<< workTreeItems ww ps
where
ww = WarnUnmatchLsFiles
seeker :: AnnexedFileSeeker
seeker = AnnexedFileSeeker
{ seekAction = commandAction' start
, checkContentPresent = Nothing
, usesLocationLog = False
}
start :: RawFilePath -> Key -> CommandStart
start file key = stopUnless (inAnnex key) $
start file key =
starting "unannex" (mkActionItem (key, file)) $
perform file key

View file

@ -44,7 +44,7 @@ seek ps = do
l <- workTreeItems ww ps
withFilesNotInGit (commandAction . whenAnnexed (startCheckIncomplete . fromRawFilePath)) l
Annex.changeState $ \s -> s { Annex.fast = True }
withFilesInGitAnnex ww (commandAction' Command.Unannex.start) l
withFilesInGitAnnex ww Command.Unannex.seeker l
finish
where
ww = WarnUnmatchLsFiles

View file

@ -27,10 +27,14 @@ mkcmd n d = withGlobalOptions [jsonOptions, annexedMatchingOptions] $
command n SectionCommon d paramPaths (withParams seek)
seek :: CmdParams -> CommandSeek
seek ps = withFilesInGitAnnex ww (commandAction' start)
=<< workTreeItems ww ps
seek ps = withFilesInGitAnnex ww seeker =<< workTreeItems ww ps
where
ww = WarnUnmatchLsFiles
seeker = AnnexedFileSeeker
{ seekAction = commandAction' start
, checkContentPresent = Nothing
, usesLocationLog = False
}
start :: RawFilePath -> Key -> CommandStart
start file key = ifM (isJust <$> isAnnexLink file)

View file

@ -55,10 +55,15 @@ seek o = do
case batchOption o of
Batch fmt -> batchFilesMatching fmt
(whenAnnexed go . toRawFilePath)
NoBatch ->
NoBatch -> do
let seeker = AnnexedFileSeeker
{ seekAction = commandAction' go
, checkContentPresent = Nothing
, usesLocationLog = True
}
withKeyOptions (keyOptions o) False
(commandAction . startKeys o m)
(withFilesInGitAnnex ww (commandAction' go))
(withFilesInGitAnnex ww seeker)
=<< workTreeItems ww (whereisFiles o)
where
ww = WarnUnmatchLsFiles

View file

@ -25,6 +25,22 @@ and precache them.
> >
> > So, this needs some more work, but is promising.
> > > Second try at this, results:
> > >
> > > * `get` in a full repo is not any slower. And presumably in an
> > > empty repo, `get` is faster, but I didn't try it and the transfers
> > > will dominate that anyway
> > > * `sync --content` 2x speedup!
> > > * `fsck --fast` 1.5x speedup
> > > * `whereis` 1.5x speedup
> > >
> > > Still todo:
> > >
> > > * move, copy, drop, and mirror were left not using the location log caching yet
> > > * get is left with an unncessary inAnnex check so could be sped up
> > > a little bit more
> > >
Another thing that the same cat-file --buffer approach could be used with
is to cat the annex links. Git.LsFiles.inRepoDetails provides the Sha
of file contents, which can be fed through cat-file --buffer to get keys.