incremental checksum for local remotes
This benchmarks only slightly faster than the old git-annex. Eg, for a 1 GB file, 14.56s vs 15.57s. (On a ram disk; there would certainly be more of an effect if the file was written to disk and didn't stay in cache.) Commenting out the updateIncremental calls makes the same benchmark run in 6.31s. It may be that overhead in the implementation, other than the actual checksumming, is slowing it down. Eg, MVar access. (I also tried using 10x larger chunks, which did not change the speed.)
This commit is contained in:
parent
48f63c2798
commit
f44d4704c6
4 changed files with 42 additions and 19 deletions
|
@ -24,8 +24,8 @@ git-annex (8.20210128) UNRELEASED; urgency=medium
|
||||||
* Include libkqueue.h file needed to build the assistant on BSDs.
|
* Include libkqueue.h file needed to build the assistant on BSDs.
|
||||||
* Tahoe: Avoid verifying hash after download, since tahoe does sufficient
|
* Tahoe: Avoid verifying hash after download, since tahoe does sufficient
|
||||||
verification itself.
|
verification itself.
|
||||||
* Checksum as content is received from a remote git-annex repository
|
* Checksum as content is received from a remote git-annex repository,
|
||||||
over ssh/p2p protocols, rather than doing it in a second pass.
|
rather than doing it in a second pass.
|
||||||
* Bugfix: fsck --from a ssh remote did not actually check that the
|
* Bugfix: fsck --from a ssh remote did not actually check that the
|
||||||
content on the remote is not corrupted.
|
content on the remote is not corrupted.
|
||||||
|
|
||||||
|
|
|
@ -62,7 +62,10 @@ import Annex.Path
|
||||||
import Creds
|
import Creds
|
||||||
import Types.NumCopies
|
import Types.NumCopies
|
||||||
import Types.ProposedAccepted
|
import Types.ProposedAccepted
|
||||||
|
import Types.Backend
|
||||||
|
import Backend
|
||||||
import Annex.Action
|
import Annex.Action
|
||||||
|
import Annex.Verify
|
||||||
import Messages.Progress
|
import Messages.Progress
|
||||||
|
|
||||||
#ifndef mingw32_HOST_OS
|
#ifndef mingw32_HOST_OS
|
||||||
|
@ -542,11 +545,12 @@ copyFromRemote'' repo forcersync r st@(State connpool _ _ _ _) key file dest met
|
||||||
-- run copy from perspective of remote
|
-- run copy from perspective of remote
|
||||||
onLocalFast st $ Annex.Content.prepSendAnnex key >>= \case
|
onLocalFast st $ Annex.Content.prepSendAnnex key >>= \case
|
||||||
Just (object, checksuccess) -> do
|
Just (object, checksuccess) -> do
|
||||||
|
let verify = Annex.Content.RemoteVerify r
|
||||||
copier <- mkCopier hardlink st
|
copier <- mkCopier hardlink st
|
||||||
(ok, v) <- runTransfer (Transfer Download u (fromKey id key))
|
(ok, v) <- runTransfer (Transfer Download u (fromKey id key))
|
||||||
file Nothing stdRetry $ \p ->
|
file Nothing stdRetry $ \p ->
|
||||||
metered (Just (combineMeterUpdate p meterupdate)) key $ \_ p' ->
|
metered (Just (combineMeterUpdate p meterupdate)) key $ \_ p' ->
|
||||||
copier object dest key p' checksuccess
|
copier object dest key p' checksuccess verify
|
||||||
if ok
|
if ok
|
||||||
then return v
|
then return v
|
||||||
else giveup "failed to retrieve content from remote"
|
else giveup "failed to retrieve content from remote"
|
||||||
|
@ -685,12 +689,12 @@ copyToRemote' repo r st@(State connpool duc _ _ _) key file meterupdate
|
||||||
res <- onLocalFast st $ ifM (Annex.Content.inAnnex key)
|
res <- onLocalFast st $ ifM (Annex.Content.inAnnex key)
|
||||||
( return True
|
( return True
|
||||||
, runTransfer (Transfer Download u (fromKey id key)) file Nothing stdRetry $ \p -> do
|
, runTransfer (Transfer Download u (fromKey id key)) file Nothing stdRetry $ \p -> do
|
||||||
copier <- mkCopier hardlink st
|
|
||||||
let verify = Annex.Content.RemoteVerify r
|
let verify = Annex.Content.RemoteVerify r
|
||||||
|
copier <- mkCopier hardlink st
|
||||||
let rsp = RetrievalAllKeysSecure
|
let rsp = RetrievalAllKeysSecure
|
||||||
res <- logStatusAfter key $ Annex.Content.getViaTmp rsp verify key file $ \dest ->
|
res <- logStatusAfter key $ Annex.Content.getViaTmp rsp verify key file $ \dest ->
|
||||||
metered (Just (combineMeterUpdate meterupdate p)) key $ \_ p' ->
|
metered (Just (combineMeterUpdate meterupdate p)) key $ \_ p' ->
|
||||||
copier object (fromRawFilePath dest) key p' (liftIO checksuccessio)
|
copier object (fromRawFilePath dest) key p' (liftIO checksuccessio) verify
|
||||||
Annex.Content.saveState True
|
Annex.Content.saveState True
|
||||||
return res
|
return res
|
||||||
)
|
)
|
||||||
|
@ -825,7 +829,7 @@ wantHardLink = (annexHardLink <$> Annex.getGitConfig)
|
||||||
-- from is implicitly trusted, so no expensive verification needs to be
|
-- from is implicitly trusted, so no expensive verification needs to be
|
||||||
-- done. Also returns Verified if the key's content is verified while
|
-- done. Also returns Verified if the key's content is verified while
|
||||||
-- copying it.
|
-- copying it.
|
||||||
type Copier = FilePath -> FilePath -> Key -> MeterUpdate -> Annex Bool -> Annex (Bool, Verification)
|
type Copier = FilePath -> FilePath -> Key -> MeterUpdate -> Annex Bool -> VerifyConfig -> Annex (Bool, Verification)
|
||||||
|
|
||||||
mkCopier :: Bool -> State -> Annex Copier
|
mkCopier :: Bool -> State -> Annex Copier
|
||||||
mkCopier remotewanthardlink st = do
|
mkCopier remotewanthardlink st = do
|
||||||
|
@ -833,13 +837,13 @@ mkCopier remotewanthardlink st = do
|
||||||
localwanthardlink <- wantHardLink
|
localwanthardlink <- wantHardLink
|
||||||
let linker = \src dest -> createLink src dest >> return True
|
let linker = \src dest -> createLink src dest >> return True
|
||||||
if remotewanthardlink || localwanthardlink
|
if remotewanthardlink || localwanthardlink
|
||||||
then return $ \src dest k p check ->
|
then return $ \src dest k p check verifyconfig ->
|
||||||
ifM (liftIO (catchBoolIO (linker src dest)))
|
ifM (liftIO (catchBoolIO (linker src dest)))
|
||||||
( ifM check
|
( ifM check
|
||||||
( return (True, Verified)
|
( return (True, Verified)
|
||||||
, return (False, UnVerified)
|
, return (False, UnVerified)
|
||||||
)
|
)
|
||||||
, copier src dest k p check
|
, copier src dest k p check verifyconfig
|
||||||
)
|
)
|
||||||
else return copier
|
else return copier
|
||||||
|
|
||||||
|
@ -922,9 +926,9 @@ newCopyCoWTried = CopyCoWTried <$> newEmptyMVar
|
||||||
-}
|
-}
|
||||||
fileCopier :: State -> Copier
|
fileCopier :: State -> Copier
|
||||||
#ifdef mingw32_HOST_OS
|
#ifdef mingw32_HOST_OS
|
||||||
fileCopier _st src dest k meterupdate check = docopy
|
fileCopier _st src dest k meterupdate check verifyconfig = docopy
|
||||||
#else
|
#else
|
||||||
fileCopier st src dest k meterupdate check =
|
fileCopier st src dest k meterupdate check verifyconfig =
|
||||||
-- If multiple threads reach this at the same time, they
|
-- If multiple threads reach this at the same time, they
|
||||||
-- will both try CoW, which is acceptable.
|
-- will both try CoW, which is acceptable.
|
||||||
ifM (liftIO $ isEmptyMVar copycowtried)
|
ifM (liftIO $ isEmptyMVar copycowtried)
|
||||||
|
@ -953,14 +957,16 @@ fileCopier st src dest k meterupdate check =
|
||||||
dest' = toRawFilePath dest
|
dest' = toRawFilePath dest
|
||||||
|
|
||||||
docopy = do
|
docopy = do
|
||||||
|
iv <- startVerifyKeyContentIncrementally verifyconfig k
|
||||||
|
|
||||||
-- The file might have had the write bit removed,
|
-- The file might have had the write bit removed,
|
||||||
-- so make sure we can write to it.
|
-- so make sure we can write to it.
|
||||||
void $ liftIO $ tryIO $ allowWrite dest'
|
void $ liftIO $ tryIO $ allowWrite dest'
|
||||||
|
|
||||||
liftIO $ withBinaryFile dest ReadWriteMode $ \hdest ->
|
liftIO $ withBinaryFile dest ReadWriteMode $ \hdest ->
|
||||||
withBinaryFile src ReadMode $ \hsrc -> do
|
withBinaryFile src ReadMode $ \hsrc -> do
|
||||||
sofar <- compareexisting hdest hsrc zeroBytesProcessed
|
sofar <- compareexisting iv hdest hsrc zeroBytesProcessed
|
||||||
docopy' hdest hsrc sofar
|
docopy' iv hdest hsrc sofar
|
||||||
|
|
||||||
-- Copy src mode and mtime.
|
-- Copy src mode and mtime.
|
||||||
mode <- liftIO $ fileMode <$> getFileStatus src
|
mode <- liftIO $ fileMode <$> getFileStatus src
|
||||||
|
@ -969,24 +975,30 @@ fileCopier st src dest k meterupdate check =
|
||||||
liftIO $ touch dest' mtime False
|
liftIO $ touch dest' mtime False
|
||||||
|
|
||||||
ifM check
|
ifM check
|
||||||
( return (True, UnVerified)
|
( case iv of
|
||||||
|
Just x -> ifM (liftIO $ finalizeIncremental x)
|
||||||
|
( return (True, Verified)
|
||||||
|
, return (False, UnVerified)
|
||||||
|
)
|
||||||
|
Nothing -> return (True, UnVerified)
|
||||||
, return (False, UnVerified)
|
, return (False, UnVerified)
|
||||||
)
|
)
|
||||||
|
|
||||||
docopy' hdest hsrc sofar = do
|
docopy' iv hdest hsrc sofar = do
|
||||||
s <- S.hGet hsrc defaultChunkSize
|
s <- S.hGet hsrc defaultChunkSize
|
||||||
if s == S.empty
|
if s == S.empty
|
||||||
then return ()
|
then return ()
|
||||||
else do
|
else do
|
||||||
let sofar' = addBytesProcessed sofar (S.length s)
|
let sofar' = addBytesProcessed sofar (S.length s)
|
||||||
S.hPut hdest s
|
S.hPut hdest s
|
||||||
|
maybe noop (flip updateIncremental s) iv
|
||||||
meterupdate sofar'
|
meterupdate sofar'
|
||||||
docopy' hdest hsrc sofar'
|
docopy' iv hdest hsrc sofar'
|
||||||
|
|
||||||
-- Leaves hdest and hsrc seeked to wherever the two diverge,
|
-- Leaves hdest and hsrc seeked to wherever the two diverge,
|
||||||
-- so typically hdest will be seeked to end, and hsrc to the same
|
-- so typically hdest will be seeked to end, and hsrc to the same
|
||||||
-- position.
|
-- position.
|
||||||
compareexisting hdest hsrc sofar = do
|
compareexisting iv hdest hsrc sofar = do
|
||||||
s <- S.hGet hdest defaultChunkSize
|
s <- S.hGet hdest defaultChunkSize
|
||||||
if s == S.empty
|
if s == S.empty
|
||||||
then return sofar
|
then return sofar
|
||||||
|
@ -994,9 +1006,10 @@ fileCopier st src dest k meterupdate check =
|
||||||
s' <- getnoshort (S.length s) hsrc
|
s' <- getnoshort (S.length s) hsrc
|
||||||
if s == s'
|
if s == s'
|
||||||
then do
|
then do
|
||||||
|
maybe noop (flip updateIncremental s) iv
|
||||||
let sofar' = addBytesProcessed sofar (S.length s)
|
let sofar' = addBytesProcessed sofar (S.length s)
|
||||||
meterupdate sofar'
|
meterupdate sofar'
|
||||||
compareexisting hdest hsrc sofar'
|
compareexisting iv hdest hsrc sofar'
|
||||||
else do
|
else do
|
||||||
seekbefore hdest s
|
seekbefore hdest s
|
||||||
seekbefore hsrc s'
|
seekbefore hsrc s'
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
[[!comment format=mdwn
|
||||||
|
username="joey"
|
||||||
|
subject="""comment 10"""
|
||||||
|
date="2021-02-10T19:48:58Z"
|
||||||
|
content="""
|
||||||
|
Incremental hashing implemented for local git remotes.
|
||||||
|
|
||||||
|
Next step should be a special remote, such as directory,
|
||||||
|
that uses byteRetriever. Chunking and encryption will complicate them.
|
||||||
|
"""]]
|
|
@ -20,6 +20,6 @@ checksum.
|
||||||
|
|
||||||
Urk: Using rsync currently protects against
|
Urk: Using rsync currently protects against
|
||||||
[[bugs/URL_key_potential_data_loss]], so the replacement would also need to
|
[[bugs/URL_key_potential_data_loss]], so the replacement would also need to
|
||||||
deal with that. Probably by refusing to resume a partial transfer of an
|
deal with that. Eg, by comparing the temp file content with the start of
|
||||||
affected key. (Or it could just fall back to rsync for such keys.)
|
the object when resuming.
|
||||||
"""]]
|
"""]]
|
||||||
|
|
Loading…
Reference in a new issue