use bloom filter in second pass of sync --all --content

This is needed because when preferred content matches on files,
the second pass would otherwise want to drop all keys. Using a bloom filter
avoids this, and in the case of a false positive, a key will be left
undropped that preferred content would allow dropping. Chances of that
happening are a mere 1 in 1 million.
This commit is contained in:
Joey Hess 2015-06-16 18:38:12 -04:00
parent a0a8127956
commit adba0595bd
4 changed files with 46 additions and 21 deletions

View file

@ -40,11 +40,11 @@ bloomBitsHashes = do
- Once the action completes, the mutable filter is frozen - Once the action completes, the mutable filter is frozen
- for later use. - for later use.
-} -}
genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex b) -> Annex (Bloom v) genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v)
genBloomFilter populate = do genBloomFilter populate = do
(numbits, numhashes) <- bloomBitsHashes (numbits, numhashes) <- bloomBitsHashes
bloom <- lift $ newMB (cheapHashes numhashes) numbits bloom <- lift $ newMB (cheapHashes numhashes) numbits
_ <- populate $ \v -> lift $ insertMB bloom v populate $ \v -> lift $ insertMB bloom v
lift $ unsafeFreezeMB bloom lift $ unsafeFreezeMB bloom
where where
lift = liftIO . stToIO lift = liftIO . stToIO

View file

@ -45,6 +45,7 @@ import Annex.UUID
import Logs.UUID import Logs.UUID
import Annex.AutoMerge import Annex.AutoMerge
import Annex.Ssh import Annex.Ssh
import Annex.BloomFilter
import Utility.Bloom import Utility.Bloom
import Control.Concurrent.MVar import Control.Concurrent.MVar
@ -358,7 +359,12 @@ newer remote b = do
, return True , return True
) )
{- If it's preferred content, and we don't have it, get it from one of the {- Without --all, only looks at files in the work tree. With --all,
- makes 2 passes, first looking at the work tree and then all keys.
- This ensures that preferred content expressions that match on
- filenames work, even when in --all mode.
-
- If it's preferred content, and we don't have it, get it from one of the
- listed remotes (preferring the cheaper earlier ones). - listed remotes (preferring the cheaper earlier ones).
- -
- Send it to each remote that doesn't have it, and for which it's - Send it to each remote that doesn't have it, and for which it's
@ -374,22 +380,20 @@ newer remote b = do
seekSyncContent :: [Remote] -> Annex Bool seekSyncContent :: [Remote] -> Annex Bool
seekSyncContent rs = do seekSyncContent rs = do
mvar <- liftIO newEmptyMVar mvar <- liftIO newEmptyMVar
-- Always start with the work tree; this ensures that preferred bloom <- genBloomFilter (seekworktree mvar [])
-- content expressions that match files match, even when in --all withKeyOptions' False (seekkeys mvar bloom) (const noop) []
-- mode.
seekworktree mvar []
withKeyOptions' False (seekkeys mvar) (const noop) []
liftIO $ not <$> isEmptyMVar mvar liftIO $ not <$> isEmptyMVar mvar
where where
seekworktree mvar = seekHelper LsFiles.inRepo >=> seekworktree mvar l bloomfeeder = seekHelper LsFiles.inRepo l >>=
mapM_ (\f -> ifAnnexed f (go mvar (Just f)) noop) mapM_ (\f -> ifAnnexed f (go (Right bloomfeeder) mvar (Just f)) noop)
seekkeys mvar getkeys = mapM_ (go mvar Nothing) =<< getkeys seekkeys mvar bloom getkeys =
go mvar af k = do mapM_ (go (Left bloom) mvar Nothing) =<< getkeys
go ebloom mvar af k = do
void $ liftIO $ tryPutMVar mvar () void $ liftIO $ tryPutMVar mvar ()
syncFile rs af k syncFile ebloom rs af k
syncFile :: [Remote] -> AssociatedFile -> Key -> Annex () syncFile :: Either (Bloom Key) (Key -> Annex ()) -> [Remote] -> AssociatedFile -> Key -> Annex ()
syncFile rs af k = do syncFile ebloom rs af k = do
locs <- loggedLocations k locs <- loggedLocations k
let (have, lack) = partition (\r -> Remote.uuid r `elem` locs) rs let (have, lack) = partition (\r -> Remote.uuid r `elem` locs) rs
@ -399,8 +403,22 @@ syncFile rs af k = do
u <- getUUID u <- getUUID
let locs' = concat [[u | got], putrs, locs] let locs' = concat [[u | got], putrs, locs]
-- Using callCommandAction rather than includeCommandAction for drops, -- A bloom filter is populated with all the keys in the first pass.
-- because a failure to drop does not mean the sync failed. -- On the second pass, avoid dropping keys that were seen in the
-- first pass, which would happen otherwise when preferred content
-- matches on the filename, which is not available in the second
-- pass.
--
-- When there's a false positive in the bloom filter, the result
-- is keeping a key that preferred content doesn't really want.
seenbloom <- case ebloom of
Left bloom -> pure (elemB k bloom)
Right bloomfeeder -> bloomfeeder k >> return False
unless seenbloom $
-- Using callCommandAction rather than
-- includeCommandAction for drops,
-- because a failure to drop does not mean
-- the sync failed.
handleDropsFrom locs' rs "unwanted" True k af handleDropsFrom locs' rs "unwanted" True k af
Nothing callCommandAction Nothing callCommandAction
where where

View file

@ -132,8 +132,8 @@ instance Arbitrary Key where
<*> ((succ . abs <$>) <$> arbitrary) -- chunknum cannot be 0 or negative <*> ((succ . abs <$>) <$> arbitrary) -- chunknum cannot be 0 or negative
instance Hashable Key where instance Hashable Key where
hashIO32 = hashIO32 . show hashIO32 = hashIO32 . key2file
hashIO64 = hashIO64 . show hashIO64 = hashIO64 . key2file
prop_idempotent_key_encode :: Key -> Bool prop_idempotent_key_encode :: Key -> Bool
prop_idempotent_key_encode k = Just k == (file2key . key2file) k prop_idempotent_key_encode k = Just k == (file2key . key2file) k

View file

@ -12,6 +12,7 @@ module Utility.Bloom (
safeSuggestSizing, safeSuggestSizing,
Hashable(..), Hashable(..),
cheapHashes, cheapHashes,
elemB,
notElemB, notElemB,
newMB, newMB,
@ -34,6 +35,9 @@ import Control.Monad.ST (ST)
notElemB :: a -> Bloom a -> Bool notElemB :: a -> Bloom a -> Bool
notElemB = Bloom.notElem notElemB = Bloom.notElem
elemB :: a -> Bloom a -> Bool
elemB = Bloom.elem
newMB :: (a -> [Bloom.Hash]) -> Int -> ST s (MBloom.MBloom s a) newMB :: (a -> [Bloom.Hash]) -> Int -> ST s (MBloom.MBloom s a)
newMB = MBloom.new newMB = MBloom.new
@ -48,6 +52,9 @@ unsafeFreezeMB = Bloom.unsafeFreeze
notElemB :: a -> Bloom a -> Bool notElemB :: a -> Bloom a -> Bool
notElemB = Bloom.notElemB notElemB = Bloom.notElemB
elemB :: a -> Bloom a -> Bool
elemB = Bloom.elem
newMB :: (a -> [Bloom.Hash]) -> Int -> ST s (Bloom.MBloom s a) newMB :: (a -> [Bloom.Hash]) -> Int -> ST s (Bloom.MBloom s a)
newMB = Bloom.newMB newMB = Bloom.newMB