incremental verify for byteRetriever special remotes

Several special remotes verify content while it is being retrieved,
avoiding a separate checksum pass. They are: S3, bup, ddar, and
gcrypt (with a local repository).

Not done when using chunking, yet.

Complicated by Retriever needing to change to be polymorphic. Which in turn
meant RankNTypes is needed, and also needed some code changes. The
change in Remote.External does not change behavior at all but avoids
the type checking failing because of a "rigid, skolem type" which
"would escape its scope". So I refactored slightly to make the type
checker's job easier there.

Unfortunately, directory uses fileRetriever (except when chunked),
so it is not amoung the improved ones. Fixing that would need a way for
FileRetriever to return a Verification. But, since the file retrieved
may be encrypted or chunked, it would be extra work to always
incrementally checksum the file while retrieving it. Hm.

Some other special remotes use fileRetriever, and so don't get incremental
verification, but could be converted to byteRetriever later. One is
GitLFS, which uses downloadConduit, which writes to the file, so could
verify as it goes. Other special remotes like web could too, but don't
use Remote.Helper.Special and so will need to be addressed separately.

Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
Joey Hess 2021-08-11 13:43:30 -04:00
parent 9518aca2f5
commit c20358b671
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
16 changed files with 111 additions and 50 deletions

View file

@ -8,6 +8,10 @@ git-annex (8.20210804) UNRELEASED; urgency=medium
* add: When adding a dotfile, avoid treating its name as an extension.
* rsync special remote: Stop displaying rsync progress, and use
git-annex's own progress display.
* Several special remotes verify content while it is being retrieved,
avoiding a separate checksum pass. They are: S3, bup, ddar,
and gcrypt (with a local repository). This optimisation is not yet
available when chunks are used.
-- Joey Hess <id@joeyh.name> Tue, 03 Aug 2021 12:22:45 -0400

View file

@ -5,6 +5,8 @@
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE RankNTypes #-}
module Remote.Adb (remote) where
import Annex.Common

View file

@ -5,6 +5,8 @@
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE RankNTypes #-}
module Remote.Bup (remote) where
import qualified Data.Map as M
@ -184,10 +186,11 @@ retrieve buprepo = byteRetriever $ \k sink -> do
bracketIO (createProcess p) cleanupProcess (go sink p)
where
go sink p (_, Just h, _, pid) = do
() <- sink =<< liftIO (L.hGetContents h)
r <- sink =<< liftIO (L.hGetContents h)
liftIO $ do
hClose h
forceSuccessProcess p pid
return r
go _ _ _ = error "internal"
{- Cannot revert having stored a key in bup, but at least the data for the

View file

@ -7,6 +7,7 @@
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE RankNTypes #-}
module Remote.Ddar (remote) where
@ -166,10 +167,11 @@ retrieve ddarrepo = byteRetriever $ \k sink -> do
bracketIO (createProcess p) cleanupProcess (go sink p)
where
go sink p (_, Just h, _, pid) = do
() <- sink =<< liftIO (L.hGetContents h)
r <- sink =<< liftIO (L.hGetContents h)
liftIO $ do
hClose h
forceSuccessProcess p pid
return r
go _ _ _ = error "internal"
remove :: DdarRepo -> Remover

View file

@ -6,6 +6,7 @@
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE CPP #-}
module Remote.Directory (

View file

@ -8,6 +8,7 @@
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
module Remote.Directory.LegacyChunked where

View file

@ -7,6 +7,7 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE RankNTypes #-}
module Remote.External (remote) where
@ -66,17 +67,19 @@ gen r u rc gc rs
| externaltype == "readonly" = do
c <- parsedRemoteConfig remote rc
cst <- remoteCost gc expensiveRemoteCost
mk c cst GloballyAvailable
readonlyStorer
retrieveUrl
readonlyRemoveKey
checkKeyUrl
let rmt = mk c cst GloballyAvailable
Nothing
(externalInfo externaltype)
Nothing
Nothing
exportUnsupported
exportUnsupported
return $ Just $ specialRemote c
readonlyStorer
retrieveUrl
readonlyRemoveKey
checkKeyUrl
rmt
| otherwise = do
c <- parsedRemoteConfig remote rc
external <- newExternal externaltype (Just u) c (Just gc)
@ -103,20 +106,22 @@ gen r u rc gc rs
let cheapexportsupported = if exportsupported
then exportIsSupported
else exportUnsupported
mk c cst avail
(storeKeyM external)
(retrieveKeyFileM external)
(removeKeyM external)
(checkPresentM external)
let rmt = mk c cst avail
(Just (whereisKeyM external))
(getInfoM external)
(Just (claimUrlM external))
(Just (checkUrlM external))
exportactions
cheapexportsupported
return $ Just $ specialRemote c
(storeKeyM external)
(retrieveKeyFileM external)
(removeKeyM external)
(checkPresentM external)
rmt
where
mk c cst avail tostore toretrieve toremove tocheckkey towhereis togetinfo toclaimurl tocheckurl exportactions cheapexportsupported = do
let rmt = Remote
mk c cst avail towhereis togetinfo toclaimurl tocheckurl exportactions cheapexportsupported =
Remote
{ uuid = u
, cost = cst
, name = Git.repoDescribe r
@ -154,12 +159,6 @@ gen r u rc gc rs
, checkUrl = tocheckurl
, remoteStateHandle = rs
}
return $ Just $ specialRemote c
tostore
toretrieve
toremove
tocheckkey
rmt
externaltype = fromMaybe (giveup "missing externaltype") (remoteAnnexExternalType gc)
externalSetup :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)

View file

@ -6,6 +6,7 @@
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
module Remote.GCrypt (
remote,

View file

@ -6,6 +6,7 @@
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE CPP #-}
module Remote.GitLFS (remote, gen, configKnownUrl) where

View file

@ -5,6 +5,8 @@
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE RankNTypes #-}
module Remote.Glacier (remote, jobList, checkSaneGlacierCommand) where
import qualified Data.Map as M
@ -176,7 +178,7 @@ store' r k b p = go =<< glacierEnv c gc u
retrieve :: Remote -> Retriever
retrieve = byteRetriever . retrieve'
retrieve' :: Remote -> Key -> (L.ByteString -> Annex ()) -> Annex ()
retrieve' :: Remote -> Key -> (L.ByteString -> Annex a) -> Annex a
retrieve' r k sink = go =<< glacierEnv c gc u
where
c = config r

View file

@ -1,10 +1,12 @@
{- git-annex chunked remotes
-
- Copyright 2014-2020 Joey Hess <id@joeyh.name>
- Copyright 2014-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE RankNTypes #-}
module Remote.Helper.Chunked (
ChunkSize,
ChunkConfig(..),
@ -30,6 +32,7 @@ import Utility.Metered
import Crypto
import Backend (isStableKey)
import Annex.SpecialRemote.Config
import Annex.Verify
import qualified Utility.RawFilePath as R
import qualified Data.ByteString as S
@ -250,6 +253,7 @@ retrieveChunks
:: LensGpgEncParams encc
=> Retriever
-> UUID
-> VerifyConfig
-> ChunkConfig
-> EncKey
-> Key
@ -257,15 +261,26 @@ retrieveChunks
-> MeterUpdate
-> Maybe (Cipher, EncKey)
-> encc
-> Annex ()
retrieveChunks retriever u chunkconfig encryptor basek dest basep enc encc
| noChunks chunkconfig =
-> Annex Verification
retrieveChunks retriever u vc chunkconfig encryptor basek dest basep enc encc
| noChunks chunkconfig = do
-- Optimisation: Try the unchunked key first, to avoid
-- looking in the git-annex branch for chunk counts
-- that are likely not there.
getunchunked `catchNonAsync`
(\e -> go (Just e) =<< chunkKeysOnly u chunkconfig basek)
| otherwise = go Nothing =<< chunkKeys u chunkconfig basek
iv <- startVerifyKeyContentIncrementally vc basek
tryNonAsync (getunchunked iv) >>= \case
Right Nothing -> return UnVerified
Right (Just iv') ->
ifM (liftIO $ finalizeIncremental iv')
( return Verified
, return UnVerified
)
Left e -> do
go (Just e) =<< chunkKeysOnly u chunkconfig basek
return UnVerified
| otherwise = do
go Nothing =<< chunkKeys u chunkconfig basek
return UnVerified
where
go pe cks = do
let ls = map chunkKeyList cks
@ -279,7 +294,7 @@ retrieveChunks retriever u chunkconfig encryptor basek dest basep enc encc
firstavail (Just e) _ [] = throwM e
firstavail pe currsize ([]:ls) = firstavail pe currsize ls
firstavail _ currsize ((k:ks):ls)
| k == basek = getunchunked
| k == basek = void (getunchunked Nothing)
`catchNonAsync` (\e -> firstavail (Just e) currsize ls)
| otherwise = do
let offset = resumeOffset currsize k
@ -289,7 +304,7 @@ retrieveChunks retriever u chunkconfig encryptor basek dest basep enc encc
v <- tryNonAsync $
retriever (encryptor k) p $ \content ->
bracketIO (maybe opennew openresume offset) hClose $ \h -> do
retrieved (Just h) p content
void $ retrieved Nothing (Just h) p content
let sz = toBytesProcessed $
fromMaybe 0 $ fromKey keyChunkSize k
getrest p h sz sz ks
@ -303,10 +318,12 @@ retrieveChunks retriever u chunkconfig encryptor basek dest basep enc encc
getrest p h sz bytesprocessed (k:ks) = do
let p' = offsetMeterUpdate p bytesprocessed
liftIO $ p' zeroBytesProcessed
retriever (encryptor k) p' $ retrieved (Just h) p'
retriever (encryptor k) p' $
void . retrieved Nothing (Just h) p'
getrest p h sz (addBytesProcessed bytesprocessed sz) ks
getunchunked = retriever (encryptor basek) basep $ retrieved Nothing basep
getunchunked iv = retriever (encryptor basek) basep $
retrieved iv Nothing basep
opennew = openBinaryFile dest WriteMode
@ -326,21 +343,27 @@ retrieveChunks retriever u chunkconfig encryptor basek dest basep enc encc
- Instead, writeRetrievedContent is passed a meter to update
- as it consumes the ByteString.
-}
retrieved h p content = writeRetrievedContent dest enc encc h p' content
retrieved iv h p content =
writeRetrievedContent dest enc encc h p' content iv
where
p'
| isByteContent content = Just p
| otherwise = Nothing
{- Writes retrieved file content into the provided Handle, decrypting it
{- Writes retrieved file content to the provided Handle, decrypting it
- first if necessary.
-
- If the remote did not store the content using chunks, no Handle
- will be provided, and it's up to us to open the destination file.
- will be provided, and instead the content will be written to the
- dest file.
-
- Note that when neither chunking nor encryption is used, and the remote
- provides FileContent, that file only needs to be renamed
- into place. (And it may even already be in the right place..)
-
- The IncrementalVerifier is updated as the file content is read.
- If it was not able to be updated, due to the file not needing to be read,
- Nothing will be returned.
-}
writeRetrievedContent
:: LensGpgEncParams encc
@ -350,31 +373,45 @@ writeRetrievedContent
-> Maybe Handle
-> Maybe MeterUpdate
-> ContentSource
-> Annex ()
writeRetrievedContent dest enc encc mh mp content = case (enc, mh, content) of
-> Maybe IncrementalVerifier
-> Annex (Maybe IncrementalVerifier)
writeRetrievedContent dest enc encc mh mp content miv = case (enc, mh, content) of
(Nothing, Nothing, FileContent f)
| f == dest -> noop
| otherwise -> liftIO $ moveFile f dest
| f == dest -> return Nothing
| otherwise -> do
liftIO $ moveFile f dest
return Nothing
(Just (cipher, _), _, ByteContent b) -> do
cmd <- gpgCmd <$> Annex.getGitConfig
decrypt cmd encc cipher (feedBytes b) $
readBytes write
return miv
(Just (cipher, _), _, FileContent f) -> do
cmd <- gpgCmd <$> Annex.getGitConfig
withBytes content $ \b ->
decrypt cmd encc cipher (feedBytes b) $
readBytes write
liftIO $ removeWhenExistsWith R.removeLink (toRawFilePath f)
return miv
(Nothing, _, FileContent f) -> do
withBytes content write
liftIO $ removeWhenExistsWith R.removeLink (toRawFilePath f)
(Nothing, _, ByteContent b) -> write b
return miv
(Nothing, _, ByteContent b) -> do
write b
return miv
where
write b = case mh of
Just h -> liftIO $ b `streamto` h
Nothing -> liftIO $ bracket opendest hClose (b `streamto`)
streamto b h = case mp of
Just p -> meteredWrite p (S.hPut h) b
Just h -> liftIO $ write' b h
Nothing -> liftIO $ bracket opendest hClose (write' b)
write' b h = case mp of
Just p ->
let writer = case miv of
Just iv -> \s -> do
updateIncremental iv s
S.hPut h s
Nothing -> S.hPut h
in meteredWrite p writer b
Nothing -> L.hPut h b
opendest = openBinaryFile dest WriteMode

View file

@ -5,6 +5,7 @@
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE OverloadedStrings #-}
module Remote.Helper.Special (
@ -111,7 +112,7 @@ fileRetriever a k m callback = do
-- A Retriever that generates a lazy ByteString containing the Key's
-- content, and passes it to a callback action which will fully consume it
-- before returning.
byteRetriever :: (Key -> (L.ByteString -> Annex ()) -> Annex ()) -> Retriever
byteRetriever :: (Key -> (L.ByteString -> Annex a) -> Annex a) -> Key -> MeterUpdate -> (ContentSource -> Annex a) -> Annex a
byteRetriever a k _m callback = a k (callback . ByteContent)
{- The base Remote that is provided to specialRemote needs to have
@ -216,11 +217,12 @@ specialRemote' cfg c storer retriever remover checkpresent baser = encr
enck = maybe id snd enc
-- call retriever to get chunks; decrypt them; stream to dest file
retrieveKeyFileGen k dest p enc = do
retrieveKeyFileGen k dest p enc =
displayprogress p k Nothing $ \p' ->
retrieveChunks retriever (uuid baser) chunkconfig
enck k dest p' enc encr
return UnVerified
retrieveChunks retriever
(uuid baser)
(RemoteVerify baser)
chunkconfig enck k dest p' enc encr
where
enck = maybe id snd enc

View file

@ -5,6 +5,8 @@
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE RankNTypes #-}
module Remote.Hook (remote) where
import Annex.Common

View file

@ -8,6 +8,7 @@
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE CPP #-}
module Remote.S3 (remote, iaHost, configIA, iaItemUrl) where

View file

@ -7,6 +7,7 @@
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
module Remote.WebDAV (remote, davCreds, configUrl) where

View file

@ -5,6 +5,8 @@
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE RankNTypes #-}
module Types.StoreRetrieve where
import Annex.Common
@ -28,7 +30,7 @@ type Storer = Key -> ContentSource -> MeterUpdate -> Annex ()
-- Action that retrieves a Key's content from a remote, passing it to a
-- callback, which will fully consume the content before returning.
-- Throws exception if key is not present, or remote is not accessible.
type Retriever = Key -> MeterUpdate -> (ContentSource -> Annex ()) -> Annex ()
type Retriever = forall a. Key -> MeterUpdate -> (ContentSource -> Annex a) -> Annex a
-- Action that removes a Key's content from a remote.
-- Succeeds if key is already not present.