optimize smudge --clean of unmodified file

Usually, git won't run the clean filter when a file is unmodified. But, when
git checkout runs git annex smudge --update, it populates the pointer and
runs git update-index, which sees the file has changed and runs
git annex smudge --clean, which was checksumming the file unnecessarily
as it re-ingested it.

With annex.thin set, this is the difference between git checkout of a
branch with a 1 gb file taking 30s and 0.1s.

This commit was sponsored by Brett Eisenberg on Patreon.
This commit is contained in:
Joey Hess 2018-10-25 16:38:04 -04:00
parent daa259ec6a
commit c28ca8294f
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
2 changed files with 40 additions and 15 deletions

View file

@ -46,6 +46,7 @@ module Annex.Content (
staleKeysPrune, staleKeysPrune,
pruneTmpWorkDirBefore, pruneTmpWorkDirBefore,
isUnmodified, isUnmodified,
isUnmodifiedCheap,
verifyKeyContent, verifyKeyContent,
VerifyConfig(..), VerifyConfig(..),
Verification(..), Verification(..),
@ -746,25 +747,38 @@ isUnmodified :: Key -> FilePath -> Annex Bool
{- Checks if the file f still contains the unmodified content of the key.
 -
 - The cheap InodeCache comparison is tried first; only when it does not
 - match is the content of the file verified. Returns False when no
 - InodeCache can be generated for the file.
 -
 - Nothing is printed here: stdout is used for the filter output elsewhere
 - in the program, so stray debugging output must not be written to it.
 -}
isUnmodified key f = go =<< geti
  where
	go Nothing = return False
	go (Just fc) = isUnmodifiedCheap' key fc <||> expensivecheck fc
	expensivecheck fc = ifM (verifyKeyContent RetrievalAllKeysSecure AlwaysVerify UnVerified key f)
		( do
			-- The file could have been modified while it was
			-- being verified. Detect that.
			ifM (geti >>= maybe (return False) (compareInodeCaches fc))
				( do
					-- Update the InodeCache to avoid
					-- performing this expensive check again.
					Database.Keys.addInodeCaches key [fc]
					return True
				, return False
				)
		, return False
		)
	-- Generates a fresh InodeCache for the file each time it is run.
	geti = withTSDelta (liftIO . genInodeCache f)
{- Cheap test of whether a file still holds the unmodified content of
 - the key. Only the InodeCache of the file is compared with the ones
 - recorded for the key; the content itself is never read.
 -
 - Yields False when no InodeCache can be generated for the file.
 -}
isUnmodifiedCheap :: Key -> FilePath -> Annex Bool
isUnmodifiedCheap key f = do
	mfc <- withTSDelta (liftIO . genInodeCache f)
	case mfc of
		Nothing -> return False
		Just fc -> isUnmodifiedCheap' key fc
{- Variant of isUnmodifiedCheap for use when an InodeCache of the file
 - is already at hand. True when any InodeCache recorded for the key
 - compares equal to the one given.
 -}
isUnmodifiedCheap' :: Key -> InodeCache -> Annex Bool
isUnmodifiedCheap' key fc = do
	recorded <- Database.Keys.getInodeCaches key
	anyM (compareInodeCaches fc) recorded
{- Moves a key out of .git/annex/objects/ into .git/annex/bad, and {- Moves a key out of .git/annex/objects/ into .git/annex/bad, and
- returns the file it was moved to. -} - returns the file it was moved to. -}
moveBad :: Key -> Annex FilePath moveBad :: Key -> Annex FilePath

View file

@ -95,20 +95,31 @@ clean file = do
if Git.BuildVersion.older "2.5" if Git.BuildVersion.older "2.5"
then B.length b `seq` return () then B.length b `seq` return ()
else liftIO $ hClose stdin else liftIO $ hClose stdin
-- Look up the backend that was used for this file
-- before, so that when git re-cleans a file its -- Optimization when the file is already annexed
-- backend does not change. -- and is unmodified.
let oldbackend = maybe Nothing (maybeLookupBackendVariety . keyVariety) oldkey case oldkey of
-- Can't restage associated files because git add Nothing -> ingest oldkey
-- runs this and has the index locked. Just ko -> ifM (isUnmodifiedCheap ko file)
let norestage = Restage False ( liftIO $ emitPointer ko
liftIO . emitPointer , ingest oldkey
=<< postingest )
=<< (\ld -> ingest' oldbackend ld Nothing norestage)
=<< lockDown cfg file
, liftIO $ B.hPut stdout b , liftIO $ B.hPut stdout b
) )
ingest oldkey = do
-- Look up the backend that was used for this file
-- before, so that when git re-cleans a file its
-- backend does not change.
let oldbackend = maybe Nothing (maybeLookupBackendVariety . keyVariety) oldkey
-- Can't restage associated files because git add
-- runs this and has the index locked.
let norestage = Restage False
liftIO . emitPointer
=<< postingest
=<< (\ld -> ingest' oldbackend ld Nothing norestage)
=<< lockDown cfg file
postingest (Just k, _) = do postingest (Just k, _) = do
logStatus k InfoPresent logStatus k InfoPresent
return k return k