incremental verification for retrieval from import remotes

Sponsored-by: Dartmouth College's Datalad project
Joey Hess 2022-05-09 15:38:21 -04:00
parent 2f2701137d
commit e8a601aa24
GPG key ID: DB12DB0FF05F8F38
12 changed files with 129 additions and 83 deletions
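
Every import remote's retrieveExportWithContentIdentifier implementation changes along the same lines in this commit: it now receives either the already-known Key or a callback that derives a key from the downloaded content, and it reports back how much verification it managed to do. A minimal sketch of that dispatch, assuming git-annex's internal helpers (verifyKeyContentIncrementally, tailVerify, as used in the diffs below) are in scope; the wrapper name and the fetchIntoDest action are illustrative, not part of the commit:

-- Sketch only: the remote-specific download is abstracted as fetchIntoDest.
retrieveSketch
	:: FilePath                 -- destination file the download writes to
	-> Either Key (Annex Key)   -- known key, or callback that derives one
	-> Annex ()                 -- remote-specific download into the destination
	-> Annex (Key, Verification)
retrieveSketch dest gk fetchIntoDest = case gk of
	-- Key not yet known (importing new content): download first and
	-- derive the key afterwards; nothing can be verified incrementally.
	Right mkkey -> do
		fetchIntoDest
		k <- mkkey
		return (k, UnVerified)
	-- Key already known: checksum the destination file as the download
	-- writes it, by tailing the file with an incremental verifier.
	Left k -> do
		v <- verifyKeyContentIncrementally DefaultVerify k
			(\iv -> tailVerify iv (toRawFilePath dest) fetchIntoDest)
		return (k, v)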

View file

@@ -1,6 +1,6 @@
 {- Copying files.
  -
- - Copyright 2011-2021 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2022 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
@@ -79,40 +79,47 @@ data CopyMethod = CopiedCoW | Copied
  - (eg when isStableKey is false), and doing this avoids getting a
  - corrupted file in such cases.
  -}
-fileCopier :: CopyCoWTried -> FilePath -> FilePath -> MeterUpdate -> Maybe IncrementalVerifier -> Annex CopyMethod
+fileCopier :: CopyCoWTried -> FilePath -> FilePath -> MeterUpdate -> Maybe IncrementalVerifier -> IO CopyMethod
 #ifdef mingw32_HOST_OS
 fileCopier _ src dest meterupdate iv = docopy
 #else
 fileCopier copycowtried src dest meterupdate iv =
-	ifM (liftIO $ tryCopyCoW copycowtried src dest meterupdate)
+	ifM (tryCopyCoW copycowtried src dest meterupdate)
 		( do
-			liftIO $ maybe noop unableIncrementalVerifier iv
+			maybe noop unableIncrementalVerifier iv
 			return CopiedCoW
 		, docopy
 		)
 #endif
   where
-	dest' = toRawFilePath dest
-
 	docopy = do
 		-- The file might have had the write bit removed,
 		-- so make sure we can write to it.
-		void $ liftIO $ tryIO $ allowWrite dest'
+		void $ tryIO $ allowWrite dest'
 
-		liftIO $ withBinaryFile dest ReadWriteMode $ \hdest ->
-			withBinaryFile src ReadMode $ \hsrc -> do
-				sofar <- compareexisting hdest hsrc zeroBytesProcessed
-				docopy' hdest hsrc sofar
+		withBinaryFile src ReadMode $ \hsrc ->
+			fileContentCopier hsrc dest meterupdate iv
 
 		-- Copy src mode and mtime.
-		mode <- liftIO $ fileMode <$> getFileStatus src
-		mtime <- liftIO $ utcTimeToPOSIXSeconds <$> getModificationTime src
-		liftIO $ setFileMode dest mode
-		liftIO $ touch dest' mtime False
+		mode <- fileMode <$> getFileStatus src
+		mtime <- utcTimeToPOSIXSeconds <$> getModificationTime src
+		setFileMode dest mode
+		touch dest' mtime False
 
 		return Copied
 
-	docopy' hdest hsrc sofar = do
+	dest' = toRawFilePath dest
+
+{- Copies content from a handle to a destination file. Does not
+ - use copy-on-write, and does not copy file mode and mtime.
+ -}
+fileContentCopier :: Handle -> FilePath -> MeterUpdate -> Maybe IncrementalVerifier -> IO ()
+fileContentCopier hsrc dest meterupdate iv =
+	withBinaryFile dest ReadWriteMode $ \hdest -> do
+		sofar <- compareexisting hdest zeroBytesProcessed
+		docopy hdest sofar
+  where
+	docopy hdest sofar = do
 		s <- S.hGet hsrc defaultChunkSize
 		if s == S.empty
 			then return ()
@@ -121,12 +128,12 @@ fileCopier copycowtried src dest meterupdate iv =
 				S.hPut hdest s
 				maybe noop (flip updateIncrementalVerifier s) iv
 				meterupdate sofar'
-				docopy' hdest hsrc sofar'
+				docopy hdest sofar'
 
 	-- Leaves hdest and hsrc seeked to wherever the two diverge,
 	-- so typically hdest will be seeked to end, and hsrc to the same
 	-- position.
-	compareexisting hdest hsrc sofar = do
+	compareexisting hdest sofar = do
 		s <- S.hGet hdest defaultChunkSize
 		if s == S.empty
 			then return sofar
@@ -137,7 +144,7 @@ fileCopier copycowtried src dest meterupdate iv =
 						maybe noop (flip updateIncrementalVerifier s) iv
 						let sofar' = addBytesProcessed sofar (S.length s)
 						meterupdate sofar'
-						compareexisting hdest hsrc sofar'
+						compareexisting hdest sofar'
 					else do
 						seekbefore hdest s
 						seekbefore hsrc s'
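
Since fileCopier now runs in IO, the incremental verifier is set up by the Annex-level caller and passed in, as the retrieveExportM change in Remote/Directory.hs below shows. A hedged usage sketch (copyAndVerify is an illustrative name, not a function this commit adds):

-- Copy src to dest and verify the key's content while copying.
-- In the non-CoW path, any bytes already present in a partially copied
-- dest are compared against src by compareexisting and also fed to the
-- verifier, so a resumed copy still verifies end to end.
copyAndVerify :: CopyCoWTried -> Key -> FilePath -> FilePath -> MeterUpdate -> Annex Verification
copyAndVerify cow k src dest meterupdate =
	verifyKeyContentIncrementally AlwaysVerify k $ \iv ->
		void $ liftIO $ fileCopier cow src dest meterupdate iv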

View file

@@ -597,14 +597,14 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
 	getcontent k = do
 		let af = AssociatedFile (Just f)
 		let downloader p' tmpfile = do
-			k' <- Remote.retrieveExportWithContentIdentifier
+			_ <- Remote.retrieveExportWithContentIdentifier
 				ia loc cid (fromRawFilePath tmpfile)
-				(pure k)
+				(Left k)
 				(combineMeterUpdate p' p)
-			ok <- moveAnnex k' af tmpfile
+			ok <- moveAnnex k af tmpfile
 			when ok $
 				logStatus k InfoPresent
-			return (Just (k', ok))
+			return (Just (k, ok))
 		checkDiskSpaceToGet k Nothing $
 			notifyTransfer Download af $
 				download' (Remote.uuid remote) k af Nothing stdRetry $ \p' ->
@@ -615,9 +615,9 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
 	-- need to retrieve this file.
 	doimportsmall cidmap db loc cid sz p = do
 		let downloader tmpfile = do
-			k <- Remote.retrieveExportWithContentIdentifier
+			(k, _) <- Remote.retrieveExportWithContentIdentifier
 				ia loc cid (fromRawFilePath tmpfile)
-				(mkkey tmpfile)
+				(Right (mkkey tmpfile))
 				p
 			case keyGitSha k of
 				Just sha -> do
@@ -638,9 +638,9 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
 	dodownload cidmap db (loc, (cid, sz)) f matcher = do
 		let af = AssociatedFile (Just f)
 		let downloader tmpfile p = do
-			k <- Remote.retrieveExportWithContentIdentifier
+			(k, _) <- Remote.retrieveExportWithContentIdentifier
 				ia loc cid (fromRawFilePath tmpfile)
-				(mkkey tmpfile)
+				(Right (mkkey tmpfile))
 				p
 			case keyGitSha k of
 				Nothing -> do

View file

@@ -5,6 +5,9 @@ git-annex (10.20220505) UNRELEASED; urgency=medium
     data units. Note that the short form is "Mbit" not "Mb" because
     that differs from "MB" only in case, and git-annex parses units
     case-insensitively.
+  * Special remotes using exporttree=yes and/or importtree=yes now
+    checksum content while it is being retrieved, instead of in a separate
+    pass at the end.
 
  -- Joey Hess <id@joeyh.name>  Thu, 05 May 2022 15:08:07 -0400

View file

@@ -338,15 +338,23 @@ listImportableContentsM serial adir c = adbfind >>= \case
 -- connection is resonably fast, it's probably as good as
 -- git's handling of similar situations with files being modified while
 -- it's updating the working tree for a merge.
-retrieveExportWithContentIdentifierM :: AndroidSerial -> AndroidPath -> ExportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key
-retrieveExportWithContentIdentifierM serial adir loc cid dest mkkey _p = do
-	retrieve' serial src dest
-	k <- mkkey
-	currcid <- getExportContentIdentifier serial adir loc
-	if currcid == Right (Just cid)
-		then return k
-		else giveup "the file on the android device has changed"
+retrieveExportWithContentIdentifierM :: AndroidSerial -> AndroidPath -> ExportLocation -> ContentIdentifier -> FilePath -> Either Key (Annex Key) -> MeterUpdate -> Annex (Key, Verification)
+retrieveExportWithContentIdentifierM serial adir loc cid dest gk _p = do
+	case gk of
+		Right mkkey -> do
+			go
+			k <- mkkey
+			return (k, UnVerified)
+		Left k -> do
+			v <- verifyKeyContentIncrementally DefaultVerify k
+				(\iv -> tailVerify iv (toRawFilePath dest) go)
+			return (k, v)
   where
+	go = do
+		retrieve' serial src dest
+		currcid <- getExportContentIdentifier serial adir loc
+		when (currcid /= Right (Just cid)) $
+			giveup "the file on the android device has changed"
 	src = androidExportLocation adir loc
 
 storeExportWithContentIdentifierM :: AndroidSerial -> AndroidPath -> FilePath -> Key -> ExportLocation -> [ContentIdentifier] -> MeterUpdate -> Annex ContentIdentifier

View file

@@ -29,6 +29,7 @@ import Utility.Metered
 import Logs.Export
 import qualified Remote.Helper.ThirdPartyPopulated as ThirdPartyPopulated
 import Utility.Env
+import Annex.Verify
 
 import Data.Either
 import Text.Read
@@ -370,10 +371,20 @@ checkPresentExportWithContentIdentifierM borgrepo _ loc _ = prompt $ liftIO $ do
 	, giveup $ "Unable to access borg repository " ++ locBorgRepo borgrepo
 	)
 
-retrieveExportWithContentIdentifierM :: BorgRepo -> ImportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key
-retrieveExportWithContentIdentifierM borgrepo loc _ dest mkk _ = do
+retrieveExportWithContentIdentifierM :: BorgRepo -> ImportLocation -> ContentIdentifier -> FilePath -> Either Key (Annex Key) -> MeterUpdate -> Annex (Key, Verification)
+retrieveExportWithContentIdentifierM borgrepo loc _ dest gk _ = do
 	showOutput
-	prompt $ withOtherTmp $ \othertmp -> liftIO $ do
+	case gk of
+		Right mkkey -> do
+			go
+			k <- mkkey
+			return (k, UnVerified)
+		Left k -> do
+			v <- verifyKeyContentIncrementally DefaultVerify k
+				(\iv -> tailVerify iv (toRawFilePath dest) go)
+			return (k, v)
+  where
+	go = prompt $ withOtherTmp $ \othertmp -> liftIO $ do
 		-- borgrepo could be relative, and borg has to be run
 		-- in the temp directory to get it to write there
 		absborgrepo <- absBorgRepo borgrepo
@@ -398,6 +409,5 @@ retrieveExportWithContentIdentifierM borgrepo loc _ dest mkk _ = do
 		-- combine with </>
 		moveFile (fromRawFilePath othertmp </> fromRawFilePath archivefile) dest
 		removeDirectoryRecursive (fromRawFilePath othertmp)
-	mkk
-  where
+
 	(archivename, archivefile) = extractImportLocation loc

View file

@@ -198,9 +198,9 @@ storeKeyM d chunkconfig cow k c m =
 						(fromRawFilePath destdir)
 				in byteStorer go k c m
 			NoChunks ->
-				let go _k src p = do
+				let go _k src p = liftIO $ do
 					void $ fileCopier cow src tmpf p Nothing
-					liftIO $ finalizeStoreGeneric d tmpdir destdir
+					finalizeStoreGeneric d tmpdir destdir
 				in fileStorer go k c m
 			_ ->
 				let go _k b p = liftIO $ do
@@ -242,7 +242,7 @@ retrieveKeyFileM :: RawFilePath -> ChunkConfig -> CopyCoWTried -> Retriever
 retrieveKeyFileM d (LegacyChunks _) _ = Legacy.retrieve locations d
 retrieveKeyFileM d NoChunks cow = fileRetriever' $ \dest k p iv -> do
 	src <- liftIO $ fromRawFilePath <$> getLocation d k
-	void $ fileCopier cow src (fromRawFilePath dest) p iv
+	void $ liftIO $ fileCopier cow src (fromRawFilePath dest) p iv
 retrieveKeyFileM d _ _ = byteRetriever $ \k sink ->
 	sink =<< liftIO (L.readFile . fromRawFilePath =<< getLocation d k)
@@ -315,12 +315,12 @@ storeExportM d cow src _k loc p = do
 	viaTmp go (fromRawFilePath dest) ()
   where
 	dest = exportPath d loc
-	go tmp () = void $ fileCopier cow src tmp p Nothing
+	go tmp () = void $ liftIO $ fileCopier cow src tmp p Nothing
 
 retrieveExportM :: RawFilePath -> CopyCoWTried -> Key -> ExportLocation -> FilePath -> MeterUpdate -> Annex Verification
 retrieveExportM d cow k loc dest p =
 	verifyKeyContentIncrementally AlwaysVerify k $ \iv ->
-		void $ fileCopier cow src dest p iv
+		void $ liftIO $ fileCopier cow src dest p iv
   where
 	src = fromRawFilePath $ exportPath d loc
@@ -413,25 +413,31 @@ importKeyM ii dir loc cid sz p = do
 	, inodeCache = Nothing
 	}
 
-retrieveExportWithContentIdentifierM :: IgnoreInodes -> RawFilePath -> CopyCoWTried -> ExportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key
-retrieveExportWithContentIdentifierM ii dir cow loc cid dest mkkey p =
-	precheck docopy
+retrieveExportWithContentIdentifierM :: IgnoreInodes -> RawFilePath -> CopyCoWTried -> ExportLocation -> ContentIdentifier -> FilePath -> Either Key (Annex Key) -> MeterUpdate -> Annex (Key, Verification)
+retrieveExportWithContentIdentifierM ii dir cow loc cid dest gk p =
+	case gk of
+		Right mkkey -> do
+			go Nothing
+			k <- mkkey
+			return (k, UnVerified)
+		Left k -> do
+			v <- verifyKeyContentIncrementally DefaultVerify k go
+			return (k, v)
   where
 	f = exportPath dir loc
 	f' = fromRawFilePath f
 
-	docopy = ifM (liftIO $ tryCopyCoW cow f' dest p)
-		( do
-			k <- mkkey
-			postcheckcow (return k)
-		, docopynoncow
+	go iv = precheck (docopy iv)
+
+	docopy iv = ifM (liftIO $ tryCopyCoW cow f' dest p)
+		( postcheckcow (liftIO $ maybe noop unableIncrementalVerifier iv)
+		, docopynoncow iv
 		)
 
-	docopynoncow = do
+	docopynoncow iv = do
 #ifndef mingw32_HOST_OS
 		let open = do
-			-- Need a duplicate fd for the post check, since
-			-- hGetContentsMetered closes its handle.
+			-- Need a duplicate fd for the post check.
 			fd <- openFd f' ReadOnly Nothing defaultFileFlags
 			dupfd <- dup fd
 			h <- fdToHandle fd
@@ -445,12 +451,11 @@ retrieveExportWithContentIdentifierM ii dir cow loc cid dest mkkey p =
 		let close = hClose
 		bracketIO open close $ \h -> do
 #endif
-			liftIO $ hGetContentsMetered h p >>= L.writeFile dest
-			k <- mkkey
+			liftIO $ fileContentCopier h dest p iv
 #ifndef mingw32_HOST_OS
-			postchecknoncow dupfd (return k)
+			postchecknoncow dupfd (return ())
 #else
-			postchecknoncow (return k)
+			postchecknoncow (return ())
 #endif
 
 -- Check before copy, to avoid expensive copy of wrong file
@@ -500,7 +505,7 @@ storeExportWithContentIdentifierM ii dir cow src _k loc overwritablecids p = do
 	liftIO $ createDirectoryUnder dir (toRawFilePath destdir)
 	withTmpFileIn destdir template $ \tmpf tmph -> do
 		liftIO $ hClose tmph
-		void $ fileCopier cow src tmpf p Nothing
+		void $ liftIO $ fileCopier cow src tmpf p Nothing
 		let tmpf' = toRawFilePath tmpf
 		resetAnnexFilePerm tmpf'
 		liftIO (getFileStatus tmpf) >>= liftIO . mkContentIdentifier ii tmpf' >>= \case

View file

@@ -715,7 +715,7 @@ mkFileCopier remotewanthardlink (State _ _ copycowtried _ _) = do
   where
 	copier src dest k p check verifyconfig = do
 		iv <- startVerifyKeyContentIncrementally verifyconfig k
-		fileCopier copycowtried src dest p iv >>= \case
+		liftIO (fileCopier copycowtried src dest p iv) >>= \case
 			Copied -> ifM check
 				( finishVerifyKeyContentIncrementally iv
 				, do

View file

@@ -360,8 +360,7 @@ adjustExportImport' isexport isimport r rs = do
 				getkeycids ciddbv k >>= \case
 					(cid:_) -> do
 						l <- getfirstexportloc dbv k
-						void $ retrieveExportWithContentIdentifier (importActions r) l cid dest (pure k) p
-						return UnVerified
+						snd <$> retrieveExportWithContentIdentifier (importActions r) l cid dest (Left k) p
 					-- In case a content identifier is somehow missing,
 					-- try this instead.
 					[] -> if isexport

View file

@@ -649,22 +649,31 @@ mkImportableContentsVersioned info = build . groupfiles
 		| otherwise =
 			i : removemostrecent mtime rest
 
-retrieveExportWithContentIdentifierS3 :: S3HandleVar -> Remote -> RemoteStateHandle -> S3Info -> ExportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key
-retrieveExportWithContentIdentifierS3 hv r rs info loc cid dest mkkey p = withS3Handle hv $ \case
-	Just h -> do
-		rewritePreconditionException $ retrieveHelper' h dest p Nothing $
-			limitGetToContentIdentifier cid $
-				S3.getObject (bucket info) o
-		k <- mkkey
-		case extractContentIdentifier cid o of
-			Right vid -> do
-				vids <- getS3VersionID rs k
-				unless (vid `elem` map Just vids) $
-					setS3VersionID info rs k vid
-			Left _ -> noop
-		return k
-	Nothing -> giveup $ needS3Creds (uuid r)
+retrieveExportWithContentIdentifierS3 :: S3HandleVar -> Remote -> RemoteStateHandle -> S3Info -> ExportLocation -> ContentIdentifier -> FilePath -> Either Key (Annex Key) -> MeterUpdate -> Annex (Key, Verification)
+retrieveExportWithContentIdentifierS3 hv r rs info loc cid dest gk p =
+	case gk of
+		Right _mkkey -> do
+			k <- go Nothing
+			return (k, UnVerified)
+		Left k -> do
+			v <- verifyKeyContentIncrementally DefaultVerify k
+				(void . go)
+			return (k, v)
   where
+	go iv = withS3Handle hv $ \case
+		Just h -> do
+			rewritePreconditionException $ retrieveHelper' h dest p iv $
+				limitGetToContentIdentifier cid $
+					S3.getObject (bucket info) o
+			k <- either return id gk
+			case extractContentIdentifier cid o of
+				Right vid -> do
+					vids <- getS3VersionID rs k
+					unless (vid `elem` map Just vids) $
+						setS3VersionID info rs k vid
+				Left _ -> noop
+			return k
+		Nothing -> giveup $ needS3Creds (uuid r)
 	o = T.pack $ bucketExportLocation info loc
 
 {- Catch exception getObject returns when a precondition is not met,

View file

@@ -346,10 +346,11 @@ data ImportActions a = ImportActions
 		-> ContentIdentifier
 		-- file to write content to
 		-> FilePath
-		-- callback that generates a key from the downloaded content
-		-> a Key
+		-- Either the key, or when it's not yet known, a callback
+		-- that generates a key from the downloaded content.
+		-> Either Key (a Key)
 		-> MeterUpdate
-		-> a Key
+		-> a (Key, Verification)
 	-- Exports content to an ExportLocation, and returns the
 	-- ContentIdentifier corresponding to the content it stored.
 	--
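
On the caller side there are now two conventions, visible in the Annex/Import.hs and Remote/Helper/ExportImport.hs hunks above: pass Left with the key when retrieving content whose key is already known, and use the returned Verification; or pass Right with a key-generating callback when importing unknown content, in which case the result is UnVerified. A condensed sketch, assuming remote, loc, cid, dest, p, and mkkey are in scope:

-- Known key: the remote verifies while it downloads.
retrieveKnown :: Key -> Annex Verification
retrieveKnown k = snd <$> retrieveExportWithContentIdentifier
	(importActions remote) loc cid dest (Left k) p

-- Unknown key: derive it from the downloaded file; not verified here.
importNew :: Annex Key
importNew = fst <$> retrieveExportWithContentIdentifier
	(importActions remote) loc cid dest (Right mkkey) p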

View file

@@ -12,3 +12,5 @@ If needed example, here is http://datasets.datalad.org/allen-brain-observatory/v
 [[!meta author=yoh]]
 [[!tag projects/dandi]]
+
+> [[done]] --[[Joey]]

View file

@@ -6,4 +6,6 @@
 Update: incremental hashing is also now done for all export remotes.
 Only import (and export+import) remotes don't support incremental hashing
 now.
+
+Update 2: Now also done for import remotes. All done!
 """]]