use bloom filter in second pass of sync --all --content
This is needed because when preferred content matches on files, the second pass would otherwise want to drop all keys. Using a bloom filter avoids this, and in the case of a false positive, a key will be left undropped that preferred content would allow dropping. Chances of that happening are a mere 1 in 1 million.
This commit is contained in:
parent
a0a8127956
commit
adba0595bd
4 changed files with 46 additions and 21 deletions
|
@ -40,11 +40,11 @@ bloomBitsHashes = do
|
|||
- Once the action completes, the mutable filter is frozen
|
||||
- for later use.
|
||||
-}
|
||||
genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex b) -> Annex (Bloom v)
|
||||
genBloomFilter :: Hashable v => ((v -> Annex ()) -> Annex ()) -> Annex (Bloom v)
|
||||
genBloomFilter populate = do
|
||||
(numbits, numhashes) <- bloomBitsHashes
|
||||
bloom <- lift $ newMB (cheapHashes numhashes) numbits
|
||||
_ <- populate $ \v -> lift $ insertMB bloom v
|
||||
populate $ \v -> lift $ insertMB bloom v
|
||||
lift $ unsafeFreezeMB bloom
|
||||
where
|
||||
lift = liftIO . stToIO
|
||||
|
|
|
@ -45,6 +45,7 @@ import Annex.UUID
|
|||
import Logs.UUID
|
||||
import Annex.AutoMerge
|
||||
import Annex.Ssh
|
||||
import Annex.BloomFilter
|
||||
import Utility.Bloom
|
||||
|
||||
import Control.Concurrent.MVar
|
||||
|
@ -358,7 +359,12 @@ newer remote b = do
|
|||
, return True
|
||||
)
|
||||
|
||||
{- If it's preferred content, and we don't have it, get it from one of the
|
||||
{- Without --all, only looks at files in the work tree. With --all,
|
||||
- makes 2 passes, first looking at the work tree and then all keys.
|
||||
- This ensures that preferred content expressions that match on
|
||||
- filenames work, even when in --all mode.
|
||||
-
|
||||
- If it's preferred content, and we don't have it, get it from one of the
|
||||
- listed remotes (preferring the cheaper earlier ones).
|
||||
-
|
||||
- Send it to each remote that doesn't have it, and for which it's
|
||||
|
@ -374,22 +380,20 @@ newer remote b = do
|
|||
seekSyncContent :: [Remote] -> Annex Bool
|
||||
seekSyncContent rs = do
|
||||
mvar <- liftIO newEmptyMVar
|
||||
-- Always start with the work tree; this ensures that preferred
|
||||
-- content expressions that match files match, even when in --all
|
||||
-- mode.
|
||||
seekworktree mvar []
|
||||
withKeyOptions' False (seekkeys mvar) (const noop) []
|
||||
bloom <- genBloomFilter (seekworktree mvar [])
|
||||
withKeyOptions' False (seekkeys mvar bloom) (const noop) []
|
||||
liftIO $ not <$> isEmptyMVar mvar
|
||||
where
|
||||
seekworktree mvar = seekHelper LsFiles.inRepo >=>
|
||||
mapM_ (\f -> ifAnnexed f (go mvar (Just f)) noop)
|
||||
seekkeys mvar getkeys = mapM_ (go mvar Nothing) =<< getkeys
|
||||
go mvar af k = do
|
||||
seekworktree mvar l bloomfeeder = seekHelper LsFiles.inRepo l >>=
|
||||
mapM_ (\f -> ifAnnexed f (go (Right bloomfeeder) mvar (Just f)) noop)
|
||||
seekkeys mvar bloom getkeys =
|
||||
mapM_ (go (Left bloom) mvar Nothing) =<< getkeys
|
||||
go ebloom mvar af k = do
|
||||
void $ liftIO $ tryPutMVar mvar ()
|
||||
syncFile rs af k
|
||||
syncFile ebloom rs af k
|
||||
|
||||
syncFile :: [Remote] -> AssociatedFile -> Key -> Annex ()
|
||||
syncFile rs af k = do
|
||||
syncFile :: Either (Bloom Key) (Key -> Annex ()) -> [Remote] -> AssociatedFile -> Key -> Annex ()
|
||||
syncFile ebloom rs af k = do
|
||||
locs <- loggedLocations k
|
||||
let (have, lack) = partition (\r -> Remote.uuid r `elem` locs) rs
|
||||
|
||||
|
@ -399,10 +403,24 @@ syncFile rs af k = do
|
|||
u <- getUUID
|
||||
let locs' = concat [[u | got], putrs, locs]
|
||||
|
||||
-- Using callCommandAction rather than includeCommandAction for drops,
|
||||
-- because a failure to drop does not mean the sync failed.
|
||||
handleDropsFrom locs' rs "unwanted" True k af
|
||||
Nothing callCommandAction
|
||||
-- A bloom filter is populated with all the keys in the first pass.
|
||||
-- On the second pass, avoid dropping keys that were seen in the
|
||||
-- first pass, which would happen otherwise when preferred content
|
||||
-- matches on the filename, which is not available in the second
|
||||
-- pass.
|
||||
--
|
||||
-- When there's a false positive in the bloom filter, the result
|
||||
-- is keeping a key that preferred content doesn't really want.
|
||||
seenbloom <- case ebloom of
|
||||
Left bloom -> pure (elemB k bloom)
|
||||
Right bloomfeeder -> bloomfeeder k >> return False
|
||||
unless seenbloom $
|
||||
-- Using callCommandAction rather than
|
||||
-- includeCommandAction for drops,
|
||||
-- because a failure to drop does not mean
|
||||
-- the sync failed.
|
||||
handleDropsFrom locs' rs "unwanted" True k af
|
||||
Nothing callCommandAction
|
||||
where
|
||||
wantget have = allM id
|
||||
[ pure (not $ null have)
|
||||
|
|
|
@ -132,8 +132,8 @@ instance Arbitrary Key where
|
|||
<*> ((succ . abs <$>) <$> arbitrary) -- chunknum cannot be 0 or negative
|
||||
|
||||
instance Hashable Key where
|
||||
hashIO32 = hashIO32 . show
|
||||
hashIO64 = hashIO64 . show
|
||||
hashIO32 = hashIO32 . key2file
|
||||
hashIO64 = hashIO64 . key2file
|
||||
|
||||
prop_idempotent_key_encode :: Key -> Bool
|
||||
prop_idempotent_key_encode k = Just k == (file2key . key2file) k
|
||||
|
|
|
@ -12,6 +12,7 @@ module Utility.Bloom (
|
|||
safeSuggestSizing,
|
||||
Hashable(..),
|
||||
cheapHashes,
|
||||
elemB,
|
||||
notElemB,
|
||||
|
||||
newMB,
|
||||
|
@ -34,6 +35,9 @@ import Control.Monad.ST (ST)
|
|||
notElemB :: a -> Bloom a -> Bool
|
||||
notElemB = Bloom.notElem
|
||||
|
||||
elemB :: a -> Bloom a -> Bool
|
||||
elemB = Bloom.elem
|
||||
|
||||
newMB :: (a -> [Bloom.Hash]) -> Int -> ST s (MBloom.MBloom s a)
|
||||
newMB = MBloom.new
|
||||
|
||||
|
@ -48,6 +52,9 @@ unsafeFreezeMB = Bloom.unsafeFreeze
|
|||
notElemB :: a -> Bloom a -> Bool
|
||||
notElemB = Bloom.notElemB
|
||||
|
||||
elemB :: a -> Bloom a -> Bool
|
||||
elemB = Bloom.elem
|
||||
|
||||
newMB :: (a -> [Bloom.Hash]) -> Int -> ST s (Bloom.MBloom s a)
|
||||
newMB = Bloom.newMB
|
||||
|
||||
|
|
Loading…
Reference in a new issue