From e95747a1494b2c1a858394f0e422e906b1651b22 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 7 Jan 2022 13:17:43 -0400 Subject: [PATCH] fix handling of corrupted data received from git remote Recover from corrupted content being received from a git remote due eg to a wire error, by deleting the temporary file when it fails to verify. This prevents a retry from failing again. Reversion introduced in version 8.20210903, when incremental verification was added. Only the git remote seems to be affected, although it is certianly possible that other remotes could later have the same issue. This only affects things passed to getViaTmp that return (False, UnVerified) due to verification failing. As far as getViaTmp can tell, that could just as well mean that the transfer failed in a way that would resume, so it cannot delete the temp file itself. Remote.Git and P2P.Annex use getViaTmp internally, while other remotes do not, which is why only it seems affected. A better fix perhaps would be to improve the types of the callback passed to getViaTmp, so that some other value could be used to indicate the state where the transfer succeeded but verification failed. Sponsored-by: Boyd Stephen Smith Jr. --- Annex/Content.hs | 33 ++++++++++++------- CHANGELOG | 8 ++--- P2P/Annex.hs | 6 ++-- Remote/Git.hs | 9 +++-- ..._to_get_small_files_over_P2P_protocol.mdwn | 3 ++ 5 files changed, 39 insertions(+), 20 deletions(-) diff --git a/Annex/Content.hs b/Annex/Content.hs index e48e9d6d32..68d21506f1 100644 --- a/Annex/Content.hs +++ b/Annex/Content.hs @@ -1,6 +1,6 @@ {- git-annex file content managing - - - Copyright 2010-2021 Joey Hess + - Copyright 2010-2022 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -19,6 +19,7 @@ module Annex.Content ( RetrievalSecurityPolicy(..), getViaTmp, getViaTmpFromDisk, + verificationOfContentFailed, checkDiskSpaceToGet, checkSecureHashes, prepTmp, @@ -222,7 +223,6 @@ getViaTmpFromDisk rsp v key af action = checkallowed $ do tmpfile <- prepTmp key resuming <- liftIO $ R.doesPathExist tmpfile (ok, verification) <- action tmpfile - liftIO $ print ok -- When the temp file already had content, we don't know if -- that content is good or not, so only trust if it the action -- Verified it in passing. Otherwise, force verification even @@ -236,16 +236,7 @@ getViaTmpFromDisk rsp v key af action = checkallowed $ do then ifM (verifyKeyContentPostRetrieval rsp v verification' key tmpfile) ( pruneTmpWorkDirBefore tmpfile (moveAnnex key af) , do - warning "verification of content failed" - -- The bad content is not retained, because - -- a retry should not try to resume from it - -- since it's apparently corrupted. - -- Also, the bad content could be any data, - -- including perhaps the content of another - -- file than the one that was requested, - -- and so it's best not to keep it on disk. - pruneTmpWorkDirBefore tmpfile - (liftIO . removeWhenExistsWith R.removeLink) + verificationOfContentFailed tmpfile return False ) -- On transfer failure, the tmp file is left behind, in case @@ -264,6 +255,24 @@ getViaTmpFromDisk rsp v key af action = checkallowed $ do ) ) +{- When the content of a file that was successfully transferred from a remote + - fails to verify, use this to display a message so the user knows why it + - failed, and to clean up the corrupted content. + - + - The bad content is not retained, because the transfer of it succeeded. + - So it's not incomplete and a resume using it will not work. While + - some protocols like rsync could recover such a bad content file, + - they are assumed to not write out bad data to a file in the first place. + - Most protocols, including the P2P protocol, pick up downloads where they + - left off, and so if the bad content were not deleted, repeated downloads + - would continue to fail. + -} +verificationOfContentFailed :: RawFilePath -> Annex () +verificationOfContentFailed tmpfile = do + warning "Verification of content failed" + pruneTmpWorkDirBefore tmpfile + (liftIO . removeWhenExistsWith R.removeLink) + {- Checks if there is enough free disk space to download a key - to its temp file. - diff --git a/CHANGELOG b/CHANGELOG index c6c35d37ba..79d655a6b4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,10 +5,10 @@ git-annex (8.20211232) UNRELEASED; urgency=medium preserve it in the imported tree so it does not get deleted. * enableremote, renameremote: Better handling of the unusual case where multiple special remotes have been initialized with the same name. - * Recover from over the wire errors when downloading from remotes, - by deleting the object file when verification of it fails. This allows - the next attempt at a download to succeed, rather than using the same - content and failing again. + * Recover from corrupted content being received from a git remote, + by deleting the temporary file when it fails to verify. This prevents + a retry from failing again. + (reversion introduced in version 8.20210903) -- Joey Hess Mon, 03 Jan 2022 14:01:14 -0400 diff --git a/P2P/Annex.hs b/P2P/Annex.hs index a2641d38da..08177bec6e 100644 --- a/P2P/Annex.hs +++ b/P2P/Annex.hs @@ -1,6 +1,6 @@ {- P2P protocol, Annex implementation - - - Copyright 2016-2021 Joey Hess + - Copyright 2016-2022 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -206,7 +206,9 @@ runLocal runst runner a = case a of | rightsize -> liftIO (finalizeIncrementalVerifier iv) >>= \case Nothing -> return (True, UnVerified) Just True -> return (True, Verified) - Just False -> return (False, UnVerified) + Just False -> do + verificationOfContentFailed (toRawFilePath dest) + return (False, UnVerified) | otherwise -> return (False, UnVerified) Nothing -> return (rightsize, UnVerified) Right (Just Invalid) | l == 0 -> diff --git a/Remote/Git.hs b/Remote/Git.hs index 99ccc60330..45e79ee348 100644 --- a/Remote/Git.hs +++ b/Remote/Git.hs @@ -30,6 +30,7 @@ import Logs.Presence import Annex.Transfer import Annex.CopyFile import Annex.Verify +import Annex.Content (verificationOfContentFailed) import Annex.UUID import qualified Annex.Content import qualified Annex.BranchState @@ -706,7 +707,9 @@ mkFileCopier remotewanthardlink (State _ _ copycowtried _ _) = do ifM (liftIO (catchBoolIO (linker src dest))) ( ifM check ( return (True, Verified) - , return (False, UnVerified) + , do + verificationOfContentFailed (toRawFilePath dest) + return (False, UnVerified) ) , copier src dest k p check verifyconfig ) @@ -717,7 +720,9 @@ mkFileCopier remotewanthardlink (State _ _ copycowtried _ _) = do fileCopier copycowtried src dest p iv >>= \case Copied -> ifM check ( finishVerifyKeyContentIncrementally iv - , return (False, UnVerified) + , do + verificationOfContentFailed (toRawFilePath dest) + return (False, UnVerified) ) CopiedCoW -> unVerified check diff --git a/doc/bugs/Failure_to_get_small_files_over_P2P_protocol.mdwn b/doc/bugs/Failure_to_get_small_files_over_P2P_protocol.mdwn index 2b8eb58278..cb0a2b663b 100644 --- a/doc/bugs/Failure_to_get_small_files_over_P2P_protocol.mdwn +++ b/doc/bugs/Failure_to_get_small_files_over_P2P_protocol.mdwn @@ -115,3 +115,6 @@ Both servers are debian. I've been using git annex for years now, some periods more intensively than others. The only data I've lost I can only blame on my own stupidity, even after a terminal disk failure I managed to recover almost everything with a reinject. I'm still using it entirely cli though, no assistants or gui for me. [[!meta title="corrupted tmp file can prevent getting an object"]] + +> [[fixed|done]] (the related issue that I discovered in comment #1 +> specifically) --[[Joey]]