optimize smudge --clean of unmodified file

Usually, git won't run the clean filter when a file is unmodified. But, when
git checkout runs git annex smudge --update, it populates the pointer file and
runs git update-index, which sees the file has changed and runs
git annex smudge --clean, which was checksumming the file unnecessarily
as it re-ingested it.

With annex.thin set, this is the difference between git checkout of a
branch with a 1 gb file taking 30s and 0.1s.

This commit was sponsored by Brett Eisenberg on Patreon.
This commit is contained in:
Joey Hess 2018-10-25 16:38:04 -04:00
parent daa259ec6a
commit c28ca8294f
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
2 changed files with 40 additions and 15 deletions

View file

@ -46,6 +46,7 @@ module Annex.Content (
staleKeysPrune,
pruneTmpWorkDirBefore,
isUnmodified,
isUnmodifiedCheap,
verifyKeyContent,
VerifyConfig(..),
Verification(..),
@ -746,25 +747,38 @@ isUnmodified :: Key -> FilePath -> Annex Bool
-- Checks whether the file f holds the unmodified content of the key.
-- Returns False when no InodeCache can be generated for the file.
isUnmodified key f = go =<< geti
  where
    go Nothing = return False
    -- Compare InodeCaches, or failing that, verify the content.
    go (Just fc) = isUnmodifiedCheap' key fc <||> expensivecheck fc
    expensivecheck fc = ifM (verifyKeyContent RetrievalAllKeysSecure AlwaysVerify UnVerified key f)
        ( -- The file could have been modified while it was
          -- being verified. Detect that by regenerating its
          -- InodeCache and comparing it with the one taken earlier.
          ifM (geti >>= maybe (return False) (compareInodeCaches fc))
            ( do
                -- Update the InodeCache to avoid
                -- performing this expensive check again.
                Database.Keys.addInodeCaches key [fc]
                return True
            , return False
            )
        , return False
        )
    -- Generates the file's current InodeCache, if one can be generated.
    geti = withTSDelta (liftIO . genInodeCache f)
{- Cheap test of whether a file contains the unmodified content of the
 - key. Only the file's InodeCache is compared against the key's
 - InodeCaches. Returns False when no InodeCache can be generated for
 - the file.
 -}
isUnmodifiedCheap :: Key -> FilePath -> Annex Bool
isUnmodifiedCheap key f = do
    mcache <- withTSDelta (liftIO . genInodeCache f)
    case mcache of
        Nothing -> return False
        Just fc -> isUnmodifiedCheap' key fc
{- Checks whether the given InodeCache matches any of the InodeCaches
 - that Database.Keys.getInodeCaches returns for the key. -}
isUnmodifiedCheap' :: Key -> InodeCache -> Annex Bool
isUnmodifiedCheap' key fc = do
    known <- Database.Keys.getInodeCaches key
    anyM (compareInodeCaches fc) known
{- Moves a key out of .git/annex/objects/ into .git/annex/bad, and
- returns the file it was moved to. -}
moveBad :: Key -> Annex FilePath

View file

@ -95,19 +95,30 @@ clean file = do
if Git.BuildVersion.older "2.5"
then B.length b `seq` return ()
else liftIO $ hClose stdin
-- Look up the backend that was used for this file
-- before, so that when git re-cleans a file its
-- backend does not change.
let oldbackend = maybe Nothing (maybeLookupBackendVariety . keyVariety) oldkey
-- Can't restage associated files because git add
-- runs this and has the index locked.
let norestage = Restage False
liftIO . emitPointer
=<< postingest
=<< (\ld -> ingest' oldbackend ld Nothing norestage)
=<< lockDown cfg file
-- Optimization when the file is already annexed
-- and is unmodified.
case oldkey of
Nothing -> ingest oldkey
Just ko -> ifM (isUnmodifiedCheap ko file)
( liftIO $ emitPointer ko
, ingest oldkey
)
, liftIO $ B.hPut stdout b
)
ingest oldkey = do
-- Look up the backend that was used for this file
-- before, so that when git re-cleans a file its
-- backend does not change.
let oldbackend = maybe Nothing (maybeLookupBackendVariety . keyVariety) oldkey
-- Can't restage associated files because git add
-- runs this and has the index locked.
let norestage = Restage False
liftIO . emitPointer
=<< postingest
=<< (\ld -> ingest' oldbackend ld Nothing norestage)
=<< lockDown cfg file
postingest (Just k, _) = do
logStatus k InfoPresent