From f44d4704c638e2fdb48dd0667809b16fac377b5b Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 10 Feb 2021 16:05:24 -0400 Subject: [PATCH] incremental checksum for local remotes This benchmarks only slightly faster than the old git-annex. Eg, for a 1 gb file, 14.56s vs 15.57s. (On a ram disk; there would certianly be more of an effect if the file was written to disk and didn't stay in cache.) Commenting out the updateIncremental calls make the same run in 6.31s. May be that overhead in the implementation, other than the actual checksumming, is slowing it down. Eg, MVar access. (I also tried using 10x larger chunks, which did not change the speed.) --- CHANGELOG | 4 +- Remote/Git.hs | 43 ++++++++++++------- ..._695d1269ab20c66630ddfa2d8cbabbef._comment | 10 +++++ ..._4f4f4a42adafe52207ed32a5d20e94be._comment | 4 +- 4 files changed, 42 insertions(+), 19 deletions(-) create mode 100644 doc/todo/OPT__58_____34__bundle__34___get_+_check___40__of_checksum__41___in_a_single_operation/comment_10_695d1269ab20c66630ddfa2d8cbabbef._comment diff --git a/CHANGELOG b/CHANGELOG index 580d5832ad..b54546442e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -24,8 +24,8 @@ git-annex (8.20210128) UNRELEASED; urgency=medium * Include libkqueue.h file needed to build the assistant on BSDs. * Tahoe: Avoid verifying hash after download, since tahoe does sufficient verification itself. - * Checksum as content is received from a remote git-annex repository - over ssh/p2p protocols, rather than doing it in a second pass. + * Checksum as content is received from a remote git-annex repository, + rather than doing it in a second pass. * Bugfix: fsck --from a ssh remote did not actually check that the content on the remote is not corrupted. diff --git a/Remote/Git.hs b/Remote/Git.hs index ec20a3cc82..3ca65c4f65 100644 --- a/Remote/Git.hs +++ b/Remote/Git.hs @@ -62,7 +62,10 @@ import Annex.Path import Creds import Types.NumCopies import Types.ProposedAccepted +import Types.Backend +import Backend import Annex.Action +import Annex.Verify import Messages.Progress #ifndef mingw32_HOST_OS @@ -542,11 +545,12 @@ copyFromRemote'' repo forcersync r st@(State connpool _ _ _ _) key file dest met -- run copy from perspective of remote onLocalFast st $ Annex.Content.prepSendAnnex key >>= \case Just (object, checksuccess) -> do + let verify = Annex.Content.RemoteVerify r copier <- mkCopier hardlink st (ok, v) <- runTransfer (Transfer Download u (fromKey id key)) file Nothing stdRetry $ \p -> metered (Just (combineMeterUpdate p meterupdate)) key $ \_ p' -> - copier object dest key p' checksuccess + copier object dest key p' checksuccess verify if ok then return v else giveup "failed to retrieve content from remote" @@ -685,12 +689,12 @@ copyToRemote' repo r st@(State connpool duc _ _ _) key file meterupdate res <- onLocalFast st $ ifM (Annex.Content.inAnnex key) ( return True , runTransfer (Transfer Download u (fromKey id key)) file Nothing stdRetry $ \p -> do - copier <- mkCopier hardlink st let verify = Annex.Content.RemoteVerify r + copier <- mkCopier hardlink st let rsp = RetrievalAllKeysSecure res <- logStatusAfter key $ Annex.Content.getViaTmp rsp verify key file $ \dest -> metered (Just (combineMeterUpdate meterupdate p)) key $ \_ p' -> - copier object (fromRawFilePath dest) key p' (liftIO checksuccessio) + copier object (fromRawFilePath dest) key p' (liftIO checksuccessio) verify Annex.Content.saveState True return res ) @@ -825,7 +829,7 @@ wantHardLink = (annexHardLink <$> Annex.getGitConfig) -- from is implicitly trusted, so no expensive verification needs to be -- done. Also returns Verified if the key's content is verified while -- copying it. -type Copier = FilePath -> FilePath -> Key -> MeterUpdate -> Annex Bool -> Annex (Bool, Verification) +type Copier = FilePath -> FilePath -> Key -> MeterUpdate -> Annex Bool -> VerifyConfig -> Annex (Bool, Verification) mkCopier :: Bool -> State -> Annex Copier mkCopier remotewanthardlink st = do @@ -833,13 +837,13 @@ mkCopier remotewanthardlink st = do localwanthardlink <- wantHardLink let linker = \src dest -> createLink src dest >> return True if remotewanthardlink || localwanthardlink - then return $ \src dest k p check -> + then return $ \src dest k p check verifyconfig -> ifM (liftIO (catchBoolIO (linker src dest))) ( ifM check ( return (True, Verified) , return (False, UnVerified) ) - , copier src dest k p check + , copier src dest k p check verifyconfig ) else return copier @@ -922,9 +926,9 @@ newCopyCoWTried = CopyCoWTried <$> newEmptyMVar -} fileCopier :: State -> Copier #ifdef mingw32_HOST_OS -fileCopier _st src dest k meterupdate check = docopy +fileCopier _st src dest k meterupdate check verifyconfig = docopy #else -fileCopier st src dest k meterupdate check = +fileCopier st src dest k meterupdate check verifyconfig = -- If multiple threads reach this at the same time, they -- will both try CoW, which is acceptable. ifM (liftIO $ isEmptyMVar copycowtried) @@ -953,14 +957,16 @@ fileCopier st src dest k meterupdate check = dest' = toRawFilePath dest docopy = do + iv <- startVerifyKeyContentIncrementally verifyconfig k + -- The file might have had the write bit removed, -- so make sure we can write to it. void $ liftIO $ tryIO $ allowWrite dest' liftIO $ withBinaryFile dest ReadWriteMode $ \hdest -> withBinaryFile src ReadMode $ \hsrc -> do - sofar <- compareexisting hdest hsrc zeroBytesProcessed - docopy' hdest hsrc sofar + sofar <- compareexisting iv hdest hsrc zeroBytesProcessed + docopy' iv hdest hsrc sofar -- Copy src mode and mtime. mode <- liftIO $ fileMode <$> getFileStatus src @@ -969,24 +975,30 @@ fileCopier st src dest k meterupdate check = liftIO $ touch dest' mtime False ifM check - ( return (True, UnVerified) + ( case iv of + Just x -> ifM (liftIO $ finalizeIncremental x) + ( return (True, Verified) + , return (False, UnVerified) + ) + Nothing -> return (True, UnVerified) , return (False, UnVerified) ) - docopy' hdest hsrc sofar = do + docopy' iv hdest hsrc sofar = do s <- S.hGet hsrc defaultChunkSize if s == S.empty then return () else do let sofar' = addBytesProcessed sofar (S.length s) S.hPut hdest s + maybe noop (flip updateIncremental s) iv meterupdate sofar' - docopy' hdest hsrc sofar' + docopy' iv hdest hsrc sofar' -- Leaves hdest and hsrc seeked to wherever the two diverge, -- so typically hdest will be seeked to end, and hsrc to the same -- position. - compareexisting hdest hsrc sofar = do + compareexisting iv hdest hsrc sofar = do s <- S.hGet hdest defaultChunkSize if s == S.empty then return sofar @@ -994,9 +1006,10 @@ fileCopier st src dest k meterupdate check = s' <- getnoshort (S.length s) hsrc if s == s' then do + maybe noop (flip updateIncremental s) iv let sofar' = addBytesProcessed sofar (S.length s) meterupdate sofar' - compareexisting hdest hsrc sofar' + compareexisting iv hdest hsrc sofar' else do seekbefore hdest s seekbefore hsrc s' diff --git a/doc/todo/OPT__58_____34__bundle__34___get_+_check___40__of_checksum__41___in_a_single_operation/comment_10_695d1269ab20c66630ddfa2d8cbabbef._comment b/doc/todo/OPT__58_____34__bundle__34___get_+_check___40__of_checksum__41___in_a_single_operation/comment_10_695d1269ab20c66630ddfa2d8cbabbef._comment new file mode 100644 index 0000000000..4e754e0b88 --- /dev/null +++ b/doc/todo/OPT__58_____34__bundle__34___get_+_check___40__of_checksum__41___in_a_single_operation/comment_10_695d1269ab20c66630ddfa2d8cbabbef._comment @@ -0,0 +1,10 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 10""" + date="2021-02-10T19:48:58Z" + content=""" +Incremental hashing implemented for local git remotes. + +Next step should be a special remote, such as directory, +that uses byteRetriever. Chunking and encryption will complicate them.. +"""]] diff --git a/doc/todo/OPT__58_____34__bundle__34___get_+_check___40__of_checksum__41___in_a_single_operation/comment_9_4f4f4a42adafe52207ed32a5d20e94be._comment b/doc/todo/OPT__58_____34__bundle__34___get_+_check___40__of_checksum__41___in_a_single_operation/comment_9_4f4f4a42adafe52207ed32a5d20e94be._comment index 8992a4c1d8..befeac793c 100644 --- a/doc/todo/OPT__58_____34__bundle__34___get_+_check___40__of_checksum__41___in_a_single_operation/comment_9_4f4f4a42adafe52207ed32a5d20e94be._comment +++ b/doc/todo/OPT__58_____34__bundle__34___get_+_check___40__of_checksum__41___in_a_single_operation/comment_9_4f4f4a42adafe52207ed32a5d20e94be._comment @@ -20,6 +20,6 @@ checksum. Urk: Using rsync currently protects against [[bugs/URL_key_potential_data_loss]], so the replacement would also need to -deal with that. Probably by refusing to resume a partial transfer of an -affected key. (Or it could just fall back to rsync for such keys.) +deal with that. Eg, by comparing the temp file content with the start of +the object when resuming. """]]