From 62e152f2107a10a7a5c84eea01521a79b6c6da47 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 9 Feb 2021 17:03:27 -0400 Subject: [PATCH] incremental checksum on download from ssh or p2p Checksum as content is received from a remote git-annex repository, rather than doing it in a second pass. Not tested at all yet, but I imagine it will work! Not implemented for any special remotes, and also not implemented for copies from local remotes. It may be that, for local remotes, it will suffice to use rsync, rely on its checksumming, and simply return Verified. (It would still make a checksumming pass when cp is used for COW, I guess.) --- Annex/Content/Presence.hs | 15 +--------- Annex/Verify.hs | 25 ++++++++++++++++ Backend.hs | 15 +++++++++- Backend/Hash.hs | 3 +- CHANGELOG | 2 ++ P2P/Annex.hs | 53 ++++++++++++++++++++++++++------- P2P/IO.hs | 2 +- P2P/Protocol.hs | 11 +++---- Remote/Bup.hs | 2 +- Remote/Git.hs | 1 + Remote/Glacier.hs | 3 +- Remote/Helper/Chunked/Legacy.hs | 3 +- Remote/Helper/P2P.hs | 10 ++++--- Remote/Helper/Special.hs | 2 +- Remote/P2P.hs | 3 +- Utility/Metered.hs | 17 ++++++----- git-annex.cabal | 3 +- 17 files changed, 118 insertions(+), 52 deletions(-) create mode 100644 Annex/Verify.hs diff --git a/Annex/Content/Presence.hs b/Annex/Content/Presence.hs index 05ff5715e1..6d9d158649 100644 --- a/Annex/Content/Presence.hs +++ b/Annex/Content/Presence.hs @@ -26,10 +26,10 @@ module Annex.Content.Presence ( import Annex.Common import qualified Annex +import Annex.Verify import Annex.LockPool import Annex.WorkerPool import Types.Remote (unVerified, Verification(..), RetrievalSecurityPolicy(..)) -import qualified Types.Remote import qualified Types.Backend import qualified Backend import qualified Database.Keys @@ -231,16 +231,3 @@ warnUnverifiableInsecure k = warning $ unwords ] where kv = decodeBS (formatKeyVariety (fromKey keyVariety k)) - -data VerifyConfig = AlwaysVerify | NoVerify | RemoteVerify Remote | DefaultVerify - -shouldVerify :: VerifyConfig -> Annex Bool -shouldVerify AlwaysVerify = return True -shouldVerify NoVerify = return False -shouldVerify DefaultVerify = annexVerify <$> Annex.getGitConfig -shouldVerify (RemoteVerify r) = - (shouldVerify DefaultVerify - <&&> pure (remoteAnnexVerify (Types.Remote.gitconfig r))) - -- Export remotes are not key/value stores, so always verify - -- content from them even when verification is disabled. - <||> Types.Remote.isExportSupported r diff --git a/Annex/Verify.hs b/Annex/Verify.hs new file mode 100644 index 0000000000..6d1a6ab37f --- /dev/null +++ b/Annex/Verify.hs @@ -0,0 +1,25 @@ +{- verification + - + - Copyright 2010-2021 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. + -} + +module Annex.Verify where + +import Annex.Common +import qualified Annex +import qualified Types.Remote + +data VerifyConfig = AlwaysVerify | NoVerify | RemoteVerify Remote | DefaultVerify + +shouldVerify :: VerifyConfig -> Annex Bool +shouldVerify AlwaysVerify = return True +shouldVerify NoVerify = return False +shouldVerify DefaultVerify = annexVerify <$> Annex.getGitConfig +shouldVerify (RemoteVerify r) = + (shouldVerify DefaultVerify + <&&> pure (remoteAnnexVerify (Types.Remote.gitconfig r))) + -- Export remotes are not key/value stores, so always verify + -- content from them even when verification is disabled. + <||> Types.Remote.isExportSupported r diff --git a/Backend.hs b/Backend.hs index 5769c840cc..76ba12313a 100644 --- a/Backend.hs +++ b/Backend.hs @@ -1,6 +1,6 @@ {- git-annex key/value backends - - - Copyright 2010-2020 Joey Hess + - Copyright 2010-2021 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -17,11 +17,13 @@ module Backend ( isStableKey, isCryptographicallySecure, isVerifiable, + startVerifyKeyContentIncrementally, ) where import Annex.Common import qualified Annex import Annex.CheckAttr +import Annex.Verify import Types.Key import Types.KeySource import qualified Types.Backend as B @@ -127,3 +129,14 @@ isCryptographicallySecure k = maybe False (`B.isCryptographicallySecure` k) isVerifiable :: Key -> Annex Bool isVerifiable k = maybe False (isJust . B.verifyKeyContent) <$> maybeLookupBackendVariety (fromKey keyVariety k) + +startVerifyKeyContentIncrementally :: VerifyConfig -> Key -> Annex (Maybe B.IncrementalVerifier) +startVerifyKeyContentIncrementally verifyconfig k = + ifM (shouldVerify verifyconfig) + ( maybeLookupBackendVariety (fromKey keyVariety k) >>= \case + Just b -> case B.verifyKeyContentIncrementally b of + Just v -> Just <$> v k + Nothing -> return Nothing + Nothing -> return Nothing + , return Nothing + ) diff --git a/Backend/Hash.hs b/Backend/Hash.hs index 12ad921aaf..0e723dea5d 100644 --- a/Backend/Hash.hs +++ b/Backend/Hash.hs @@ -145,8 +145,7 @@ sameCheckSum key s expected = decodeBS (keyHash key) checkKeyChecksumIncremental :: Hash -> Key -> Annex IncrementalVerifier -checkKeyChecksumIncremental hash key = liftIO $ - (\h -> snd h key) (hasher hash) +checkKeyChecksumIncremental hash key = liftIO $ (snd $ hasher hash) key keyHash :: Key -> S.ByteString keyHash = fst . splitKeyNameExtension diff --git a/CHANGELOG b/CHANGELOG index c8ed840b86..b889050c88 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -24,6 +24,8 @@ git-annex (8.20210128) UNRELEASED; urgency=medium * Include libkqueue.h file needed to build the assistant on BSDs. * Tahoe: Avoid verifying hash after download, since tahoe does sufficient verification itself. + * Checksum as content is received from a remote git-annex repository, + rather than doing it in a second pass. -- Joey Hess Thu, 28 Jan 2021 12:34:32 -0400 diff --git a/P2P/Annex.hs b/P2P/Annex.hs index b933575c41..c28c3eba71 100644 --- a/P2P/Annex.hs +++ b/P2P/Annex.hs @@ -1,6 +1,6 @@ {- P2P protocol, Annex implementation - - - Copyright 2016-2018 Joey Hess + - Copyright 2016-2021 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -23,9 +23,11 @@ import P2P.IO import Logs.Location import Types.NumCopies import Utility.Metered +import Types.Backend (IncrementalVerifier(..)) import Control.Monad.Free import Control.Concurrent.STM +import qualified Data.ByteString as S -- Full interpreter for Proto, that can receive and send objects. runFullProto :: RunState -> P2PConnection -> Proto a -> Annex (Either ProtoFailure a) @@ -77,7 +79,7 @@ runLocal runst runner a = case a of let runtransfer ti = Right <$> transfer download' k af Nothing (\p -> logStatusAfter k $ getViaTmp rsp DefaultVerify k af $ \tmp -> - storefile (fromRawFilePath tmp) o l getb validitycheck p ti) + storefile (fromRawFilePath tmp) o l getb Nothing validitycheck p ti) let fallback = return $ Left $ ProtoFailureMessage "transfer already in progress, or unable to take transfer lock" checktransfer runtransfer fallback @@ -85,10 +87,10 @@ runLocal runst runner a = case a of Left e -> return $ Left $ ProtoFailureException e Right (Left e) -> return $ Left e Right (Right ok) -> runner (next ok) - StoreContentTo dest o l getb validitycheck next -> do + StoreContentTo dest incrementalverifier o l getb validitycheck next -> do v <- tryNonAsync $ do let runtransfer ti = Right - <$> storefile dest o l getb validitycheck nullMeterUpdate ti + <$> storefile dest o l getb incrementalverifier validitycheck nullMeterUpdate ti let fallback = return $ Left $ ProtoFailureMessage "transfer failed" checktransfer runtransfer fallback @@ -153,16 +155,41 @@ runLocal runst runner a = case a of -- Transfer logs are updated higher in the stack when -- a client. Client _ -> ta nullMeterUpdate + + resumefromoffset o incrementalverifier p h + | o /= 0 = do + p' <- case incrementalverifier of + Just iv -> do + go iv o + return p + _ -> return $ offsetMeterUpdate p (toBytesProcessed o) + -- Make sure the handle is seeked to the offset. + -- (Reading the file probably left it there + -- when that was done, but let's be sure.) + hSeek h AbsoluteSeek o + return p' + | otherwise = return p + where + go iv n + | n == 0 = return () + | n > fromIntegral defaultChunkSize = do + updateIncremental iv =<< S.hGet h defaultChunkSize + go iv (n - fromIntegral defaultChunkSize) + | otherwise = + updateIncremental iv =<< S.hGet h (fromIntegral n) - storefile dest (Offset o) (Len l) getb validitycheck p ti = do - let p' = offsetMeterUpdate p (toBytesProcessed o) + storefile dest (Offset o) (Len l) getb incrementalverifier validitycheck p ti = do v <- runner getb case v of Right b -> do liftIO $ withBinaryFile dest ReadWriteMode $ \h -> do - when (o /= 0) $ - hSeek h AbsoluteSeek o - meteredWrite p' h b + p' <- resumefromoffset o incrementalverifier p h + let writechunk = case incrementalverifier of + Nothing -> \c -> S.hPut h c + Just iv -> \c -> do + S.hPut h c + updateIncremental iv c + meteredWrite p' writechunk b indicatetransferred ti rightsize <- do @@ -170,8 +197,12 @@ runLocal runst runner a = case a of return (toInteger sz == l + o) runner validitycheck >>= \case - Right (Just Valid) -> - return (rightsize, UnVerified) + Right (Just Valid) -> case incrementalverifier of + Just iv -> ifM (liftIO (finalizeIncremental iv) <&&> pure rightsize) + ( return (True, Verified) + , return (False, UnVerified) + ) + Nothing -> return (rightsize, UnVerified) Right (Just Invalid) | l == 0 -> -- Special case, for when -- content was not diff --git a/P2P/IO.hs b/P2P/IO.hs index 9a71ba89f0..d089f1eb00 100644 --- a/P2P/IO.hs +++ b/P2P/IO.hs @@ -259,7 +259,7 @@ debugMessage conn prefix m = do -- connection. False is returned to indicate this problem. sendExactly :: Len -> L.ByteString -> Handle -> MeterUpdate -> IO Bool sendExactly (Len n) b h p = do - sent <- meteredWrite' p h (L.take (fromIntegral n) b) + sent <- meteredWrite' p (B.hPut h) (L.take (fromIntegral n) b) return (fromBytesProcessed sent == n) receiveExactly :: Len -> Handle -> MeterUpdate -> IO L.ByteString diff --git a/P2P/Protocol.hs b/P2P/Protocol.hs index 13d5ea8c3f..568bccc5cf 100644 --- a/P2P/Protocol.hs +++ b/P2P/Protocol.hs @@ -2,7 +2,7 @@ - - See doc/design/p2p_protocol.mdwn - - - Copyright 2016-2020 Joey Hess + - Copyright 2016-2021 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -18,6 +18,7 @@ import Types (Annex) import Types.Key import Types.UUID import Types.Remote (Verification(..), unVerified) +import Types.Backend (IncrementalVerifier(..)) import Utility.AuthToken import Utility.Applicative import Utility.PartialPrelude @@ -266,7 +267,7 @@ data LocalF c -- Note: The ByteString may not contain the entire remaining content -- of the key. Only once the temp file size == Len has the whole -- content been transferred. - | StoreContentTo FilePath Offset Len (Proto L.ByteString) (Proto (Maybe Validity)) ((Bool, Verification) -> c) + | StoreContentTo FilePath (Maybe IncrementalVerifier) Offset Len (Proto L.ByteString) (Proto (Maybe Validity)) ((Bool, Verification) -> c) -- ^ Like StoreContent, but stores the content to a temp file. | SetPresent Key UUID c | CheckContentPresent Key (Bool -> c) @@ -351,13 +352,13 @@ remove key = do net $ sendMessage (REMOVE key) checkSuccess -get :: FilePath -> Key -> AssociatedFile -> Meter -> MeterUpdate -> Proto (Bool, Verification) -get dest key af m p = +get :: FilePath -> Key -> Maybe IncrementalVerifier -> AssociatedFile -> Meter -> MeterUpdate -> Proto (Bool, Verification) +get dest key iv af m p = receiveContent (Just m) p sizer storer $ \offset -> GET offset (ProtoAssociatedFile af) key where sizer = fileSize dest - storer = storeContentTo dest + storer = storeContentTo dest iv put :: Key -> AssociatedFile -> MeterUpdate -> Proto Bool put key af p = do diff --git a/Remote/Bup.hs b/Remote/Bup.hs index 2da6642d26..9b14480d54 100644 --- a/Remote/Bup.hs +++ b/Remote/Bup.hs @@ -167,7 +167,7 @@ store r buprepo = byteStorer $ \k b p -> do } else cmd feeder = \h -> do - meteredWrite p h b + meteredWrite p (S.hPut h) b hClose h in withCreateProcess cmd' (go feeder cmd') where diff --git a/Remote/Git.hs b/Remote/Git.hs index 1bc602a612..9deb67508b 100644 --- a/Remote/Git.hs +++ b/Remote/Git.hs @@ -557,6 +557,7 @@ copyFromRemote'' repo forcersync r st@(State connpool _ _ _ _) key file dest met then return v else giveup "failed to retrieve content from remote" else P2PHelper.retrieve + (Annex.Content.RemoteVerify r) (\p -> Ssh.runProto r connpool (return (False, UnVerified)) (fallback p)) key file dest meterupdate | otherwise = giveup "copying from non-ssh, non-http remote not supported" diff --git a/Remote/Glacier.hs b/Remote/Glacier.hs index 5b6f1ce93b..ee1377ae68 100644 --- a/Remote/Glacier.hs +++ b/Remote/Glacier.hs @@ -9,6 +9,7 @@ module Remote.Glacier (remote, jobList, checkSaneGlacierCommand) where import qualified Data.Map as M import qualified Data.Text as T +import qualified Data.ByteString as S import qualified Data.ByteString.Lazy as L import Annex.Common @@ -168,7 +169,7 @@ store' r k b p = go =<< glacierEnv c gc u { std_in = CreatePipe } in liftIO $ withCreateProcess cmd (go' cmd) go' cmd (Just hin) _ _ pid = do - meteredWrite p hin b + meteredWrite p (S.hPut hin) b hClose hin forceSuccessProcess cmd pid go' _ _ _ _ _ = error "internal" diff --git a/Remote/Helper/Chunked/Legacy.hs b/Remote/Helper/Chunked/Legacy.hs index b236b8cb18..b3454ab2dc 100644 --- a/Remote/Helper/Chunked/Legacy.hs +++ b/Remote/Helper/Chunked/Legacy.hs @@ -11,6 +11,7 @@ import Annex.Common import Remote.Helper.Chunked import Utility.Metered +import qualified Data.ByteString as S import qualified Data.ByteString.Lazy as L {- This is an extension that's added to the usual file (or whatever) @@ -117,4 +118,4 @@ meteredWriteFileChunks :: MeterUpdate -> FilePath -> [v] -> (v -> IO L.ByteStrin meteredWriteFileChunks meterupdate dest chunks feeder = withBinaryFile dest WriteMode $ \h -> forM_ chunks $ - meteredWrite meterupdate h <=< feeder + meteredWrite meterupdate (S.hPut h) <=< feeder diff --git a/Remote/Helper/P2P.hs b/Remote/Helper/P2P.hs index 844c4b9b82..7e2d13f2e2 100644 --- a/Remote/Helper/P2P.hs +++ b/Remote/Helper/P2P.hs @@ -1,6 +1,6 @@ {- Helpers for remotes using the git-annex P2P protocol. - - - Copyright 2016-2020 Joey Hess + - Copyright 2016-2021 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -17,6 +17,7 @@ import Annex.Content import Messages.Progress import Utility.Metered import Types.NumCopies +import Backend import Control.Concurrent @@ -39,10 +40,11 @@ store runner k af p = do Just False -> giveup "transfer failed" Nothing -> remoteUnavail -retrieve :: (MeterUpdate -> ProtoRunner (Bool, Verification)) -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex Verification -retrieve runner k af dest p = +retrieve :: VerifyConfig -> (MeterUpdate -> ProtoRunner (Bool, Verification)) -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex Verification +retrieve verifyconfig runner k af dest p = do + iv <- startVerifyKeyContentIncrementally verifyconfig k metered (Just p) k $ \m p' -> - runner p' (P2P.get dest k af m p') >>= \case + runner p' (P2P.get dest k iv af m p') >>= \case Just (True, v) -> return v Just (False, _) -> giveup "transfer failed" Nothing -> remoteUnavail diff --git a/Remote/Helper/Special.hs b/Remote/Helper/Special.hs index ce1ebe22a0..646ce4fdb5 100644 --- a/Remote/Helper/Special.hs +++ b/Remote/Helper/Special.hs @@ -295,7 +295,7 @@ sink dest enc c mh mp content = case (enc, mh, content) of Just h -> liftIO $ b `streamto` h Nothing -> liftIO $ bracket opendest hClose (b `streamto`) streamto b h = case mp of - Just p -> meteredWrite p h b + Just p -> meteredWrite p (S.hPut h) b Nothing -> L.hPut h b opendest = openBinaryFile dest WriteMode diff --git a/Remote/P2P.hs b/Remote/P2P.hs index 21cf5b42e1..cc39ea9e0c 100644 --- a/Remote/P2P.hs +++ b/Remote/P2P.hs @@ -13,6 +13,7 @@ module Remote.P2P ( import Annex.Common import qualified Annex import qualified P2P.Protocol as P2P +import qualified Annex.Content import P2P.Address import P2P.Annex import P2P.IO @@ -56,7 +57,7 @@ chainGen addr r u rc gc rs = do , cost = cst , name = Git.repoDescribe r , storeKey = store (const protorunner) - , retrieveKeyFile = retrieve (const protorunner) + , retrieveKeyFile = retrieve (Annex.Content.RemoteVerify this) (const protorunner) , retrieveKeyFileCheap = Nothing , retrievalSecurityPolicy = RetrievalAllKeysSecure , removeKey = remove protorunner diff --git a/Utility/Metered.hs b/Utility/Metered.hs index 1715f0bf77..4683c13d6d 100644 --- a/Utility/Metered.hs +++ b/Utility/Metered.hs @@ -1,6 +1,6 @@ {- Metered IO and actions - - - Copyright 2012-2020 Joey Hess + - Copyright 2012-2021 Joey Hess - - License: BSD-2-clause -} @@ -118,23 +118,24 @@ withMeteredFile :: FilePath -> MeterUpdate -> (L.ByteString -> IO a) -> IO a withMeteredFile f meterupdate a = withBinaryFile f ReadMode $ \h -> hGetContentsMetered h meterupdate >>= a -{- Writes a ByteString to a Handle, updating a meter as it's written. -} -meteredWrite :: MeterUpdate -> Handle -> L.ByteString -> IO () -meteredWrite meterupdate h = void . meteredWrite' meterupdate h +{- Calls the action repeatedly with chunks from the lazy ByteString. + - Updates the meter after each chunk is processed. -} +meteredWrite :: MeterUpdate -> (S.ByteString -> IO ()) -> L.ByteString -> IO () +meteredWrite meterupdate a = void . meteredWrite' meterupdate a -meteredWrite' :: MeterUpdate -> Handle -> L.ByteString -> IO BytesProcessed -meteredWrite' meterupdate h = go zeroBytesProcessed . L.toChunks +meteredWrite' :: MeterUpdate -> (S.ByteString -> IO ()) -> L.ByteString -> IO BytesProcessed +meteredWrite' meterupdate a = go zeroBytesProcessed . L.toChunks where go sofar [] = return sofar go sofar (c:cs) = do - S.hPut h c + a c let !sofar' = addBytesProcessed sofar $ S.length c meterupdate sofar' go sofar' cs meteredWriteFile :: MeterUpdate -> FilePath -> L.ByteString -> IO () meteredWriteFile meterupdate f b = withBinaryFile f WriteMode $ \h -> - meteredWrite meterupdate h b + meteredWrite meterupdate (S.hPut h) b {- Applies an offset to a MeterUpdate. This can be useful when - performing a sequence of actions, such as multiple meteredWriteFiles, diff --git a/git-annex.cabal b/git-annex.cabal index 187aa31761..86f3a7829a 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -671,9 +671,10 @@ Executable git-annex Annex.UpdateInstead Annex.UUID Annex.Url + Annex.VariantFile Annex.VectorClock Annex.VectorClock.Utility - Annex.VariantFile + Annex.Verify Annex.Version Annex.View Annex.View.ViewedFile