03cb2c8ece put a cat-file into the fast
bloomfilter generation path. Instead, add another bloom filter which diffs
from the work tree to the index.

Also, pull the sha of the changed object out of the diffs, and cat that
object directly, rather than indirecting through the filename.

Finally, removed some hacks that are unncessary thanks to the worktree to
index diff.
This commit is contained in:
Joey Hess 2016-01-06 20:38:02 -04:00
parent b26776d92f
commit 3320870bad
Failed to extract signature
2 changed files with 54 additions and 52 deletions

View file

@ -30,8 +30,8 @@ import Annex.Link
import Annex.CatFile import Annex.CatFile
import Types.Key import Types.Key
import Types.RefSpec import Types.RefSpec
import Git.FilePath
import Git.Types import Git.Types
import Git.Sha
import Logs.View (is_branchView) import Logs.View (is_branchView)
import Annex.BloomFilter import Annex.BloomFilter
@ -158,28 +158,23 @@ dropMsg' s = "\nTo remove unwanted data: git-annex dropunused" ++ s ++ " NUMBER\
- -
- Strategy: - Strategy:
- -
- * Build a bloom filter of all keys referenced by symlinks. This - Pass keys through 3 bloom filters in order, only creating each bloom
- is the fastest one to build and will filter out most keys. - filter on demand if the previous one didn't filter out all keys.
- * If keys remain, build a second bloom filter of keys referenced by -
- branches maching the RefSpec. - 1. All keys referenced by files in the work tree.
- * The list is streamed through these bloom filters lazily, so both will - This is the fastest one to build and will filter out most keys.
- exist at the same time. This means that twice the memory is used, - 2. All keys in the diff from the work tree to the index.
- but they're relatively small, so the added complexity of using a - 3. All keys in the diffs between the index and branches matching the
- mutable bloom filter does not seem worthwhile. - RefSpec. (This can take quite a while).
- * Generating the second bloom filter can take quite a while, since
- it needs enumerating all keys in all git branches. But, the common
- case, if the second filter is needed, is for some keys to be globally
- unused, and in that case, no short-circuit is possible.
- Short-circuiting if the first filter filters all the keys handles the
- other common case.
-} -}
excludeReferenced :: RefSpec -> [Key] -> Annex [Key] excludeReferenced :: RefSpec -> [Key] -> Annex [Key]
excludeReferenced refspec ks = runfilter firstlevel ks >>= runfilter secondlevel excludeReferenced refspec ks =
runfilter withKeysReferencedM ks
>>= runfilter withKeysReferencedDiffIndex
>>= runfilter (withKeysReferencedDiffGitRefs refspec)
where where
runfilter _ [] = return [] -- optimisation runfilter _ [] = return [] -- optimisation
runfilter a l = bloomFilter l <$> genBloomFilter a runfilter a l = bloomFilter l <$> genBloomFilter a
firstlevel = withKeysReferencedM
secondlevel = withKeysReferencedInGit refspec
{- Given an initial value, folds it with each key referenced by {- Given an initial value, folds it with each key referenced by
- files in the working tree. -} - files in the working tree. -}
@ -218,7 +213,6 @@ withKeysReferenced' mdir initial a = do
mk <- getM id mk <- getM id
[ isAnnexLink f [ isAnnexLink f
, liftIO (isPointerFile f) , liftIO (isPointerFile f)
, catKeyFile f
] ]
case mk of case mk of
Nothing -> go v fs Nothing -> go v fs
@ -226,53 +220,56 @@ withKeysReferenced' mdir initial a = do
!v' <- a k f v !v' <- a k f v
go v' fs go v' fs
withKeysReferencedInGit :: RefSpec -> (Key -> Annex ()) -> Annex () withKeysReferencedDiffGitRefs :: RefSpec -> (Key -> Annex ()) -> Annex ()
withKeysReferencedInGit refspec a = do withKeysReferencedDiffGitRefs refspec a = do
current <- inRepo Git.Branch.currentUnsafe rs <- relevantrefs <$> inRepo (Git.Command.pipeReadStrict [Param "show-ref"])
shaHead <- maybe (return Nothing) (inRepo . Git.Ref.sha) current shaHead <- maybe (return Nothing) (inRepo . Git.Ref.sha)
rs <- relevantrefs (shaHead, current) =<< inRepo Git.Branch.currentUnsafe
<$> inRepo (Git.Command.pipeReadStrict [Param "show-ref"]) let haveHead = any (\(shaRef, _) -> Just shaRef == shaHead) rs
usedrefs <- applyRefSpec refspec rs (getreflog rs) let rs' = map snd (nubRefs rs)
forM_ usedrefs $ usedrefs <- applyRefSpec refspec rs' (getreflog rs')
withKeysReferencedInGitRef a forM_ (if haveHead then usedrefs else Git.Ref.headRef : usedrefs) $
withKeysReferencedDiffGitRef a
where where
relevantrefs headRef = addHead headRef . relevantrefs = map (\(r, h) -> (Git.Ref r, Git.Ref h)) .
filter ourbranches . filter ourbranches .
map (separate (== ' ')) . map (separate (== ' ')) .
lines lines
nubRefs = map (Git.Ref . snd) . nubBy (\(x, _) (y, _) -> x == y) nubRefs = nubBy (\(x, _) (y, _) -> x == y)
ourbranchend = '/' : Git.fromRef Annex.Branch.name ourbranchend = '/' : Git.fromRef Annex.Branch.name
ourbranches (_, b) = not (ourbranchend `isSuffixOf` b) ourbranches (_, b) = not (ourbranchend `isSuffixOf` b)
&& not ("refs/synced/" `isPrefixOf` b) && not ("refs/synced/" `isPrefixOf` b)
&& not (is_branchView (Git.Ref b)) && not (is_branchView (Git.Ref b))
addHead headRef refs = case headRef of
-- if HEAD diverges from all branches (except the branch it
-- points to), run the actions on staged keys (and keys
-- that are only present in the work tree if the repo is
-- non bare)
(Just (Git.Ref x), Just (Git.Ref b))
| all (\(x',b') -> x /= x' || b == b') refs ->
Git.Ref.headRef
: nubRefs (filter ((/= x) . fst) refs)
_ -> nubRefs refs
getreflog rs = inRepo $ Git.RefLog.getMulti rs getreflog rs = inRepo $ Git.RefLog.getMulti rs
{- Runs an action on keys referenced in the given Git reference which {- Runs an action on keys referenced in the given Git reference which
- differ from those referenced in the work tree. -} - differ from those referenced in the index. -}
withKeysReferencedInGitRef :: (Key -> Annex ()) -> Git.Ref -> Annex () withKeysReferencedDiffGitRef :: (Key -> Annex ()) -> Git.Ref -> Annex ()
withKeysReferencedInGitRef a ref = do withKeysReferencedDiffGitRef a ref = do
showAction $ "checking " ++ Git.Ref.describe ref showAction $ "checking " ++ Git.Ref.describe ref
bare <- isBareRepo withKeysReferencedDiff a
(ts,clean) <- inRepo $ if bare (inRepo $ DiffTree.diffIndex ref)
then DiffTree.diffIndex ref DiffTree.srcsha
else DiffTree.diffWorkTree ref
let lookAtWorkingTree = not bare && ref == Git.Ref.headRef {- Runs an action on keys referenced in the index which differ from the
forM_ ts $ tKey lookAtWorkingTree >=> maybe noop a - work tree. -}
withKeysReferencedDiffIndex :: (Key -> Annex ()) -> Annex ()
withKeysReferencedDiffIndex a = unlessM (isBareRepo) $
withKeysReferencedDiff a
(inRepo $ DiffTree.diffFiles [])
DiffTree.srcsha
withKeysReferencedDiff :: (Key -> Annex ()) -> (Annex ([DiffTree.DiffTreeItem], IO Bool)) -> (DiffTree.DiffTreeItem -> Sha) -> Annex ()
withKeysReferencedDiff a getdiff extractsha = do
(ds, clean) <- getdiff
forM_ ds go
liftIO $ void clean liftIO $ void clean
where where
tKey True = lookupFile . getTopFilePath . DiffTree.file go d = do
tKey False = parseLinkOrPointer let sha = extractsha d
<$$> catFile ref . getTopFilePath . DiffTree.file unless (sha == nullSha) $
(parseLinkOrPointer <$> catObject sha)
>>= maybe noop a
data UnusedMaps = UnusedMaps data UnusedMaps = UnusedMaps
{ unusedMap :: UnusedMap { unusedMap :: UnusedMap

View file

@ -12,6 +12,7 @@ module Git.DiffTree (
diffTreeRecursive, diffTreeRecursive,
diffIndex, diffIndex,
diffWorkTree, diffWorkTree,
diffFiles,
diffLog, diffLog,
) where ) where
@ -66,6 +67,10 @@ diffIndex' ref params repo =
, return ([], return True) , return ([], return True)
) )
{- Diff between the index and work tree. -}
diffFiles :: [CommandParam] -> Repo -> IO ([DiffTreeItem], IO Bool)
diffFiles = getdiff (Param "diff-files")
{- Runs git log in --raw mode to get the changes that were made in {- Runs git log in --raw mode to get the changes that were made in
- a particular commit. The output format is adjusted to be the same - a particular commit. The output format is adjusted to be the same
- as diff-tree --raw._-} - as diff-tree --raw._-}