From 64ccb4734e0be51e803685350a9416cbf4480dc8 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 23 Feb 2022 15:17:08 -0400 Subject: [PATCH] smudge: Warn when encountering a pointer file that has other content appended to it It will then proceed to add the file the same as if it were any other file containing possibly annexable content. Usually the file is one that was annexed before, so the new, probably corrupt content will also be added to the annex. If the file was not annexed before, the content will be added to git. It's not possible for the smudge filter to throw an error here, because git then just adds the file to git anyway. Sponsored-by: Dartmouth College's Datalad project --- Annex/Link.hs | 29 +++++++++++----- CHANGELOG | 2 ++ Command/FilterProcess.hs | 2 +- Command/Smudge.hs | 33 +++++++++++-------- ...nt_silent_data_loss_on_unlocked_files.mdwn | 2 ++ ..._adb69305a31cf55ff62497b6de8e2f6a._comment | 10 ++++++ 6 files changed, 55 insertions(+), 23 deletions(-) create mode 100644 doc/bugs/prevent_silent_data_loss_on_unlocked_files/comment_5_adb69305a31cf55ff62497b6de8e2f6a._comment diff --git a/Annex/Link.hs b/Annex/Link.hs index bac3531b41..a9b4108ebe 100644 --- a/Annex/Link.hs +++ b/Annex/Link.hs @@ -305,13 +305,20 @@ unableToRestage mf = unwords - valid pointer file. -} parseLinkTargetOrPointer :: S.ByteString -> Maybe Key -parseLinkTargetOrPointer b - | S.length b <= maxValidPointerSz = - let (firstline, rest) = S8.span (/= '\n') b - in case parsekey $ droptrailing '\r' firstline of - Just k | restvalid (dropleading '\n' rest) -> Just k - _ -> Nothing - | otherwise = Nothing +parseLinkTargetOrPointer = either (const Nothing) id + . parseLinkTargetOrPointer' + +data InvalidAppendedPointerFile = InvalidAppendedPointerFile + +parseLinkTargetOrPointer' :: S.ByteString -> Either InvalidAppendedPointerFile (Maybe Key) +parseLinkTargetOrPointer' b = + let (firstline, rest) = S8.span (/= '\n') b + in case parsekey $ droptrailing '\r' firstline of + Just k + | S.length b > maxValidPointerSz -> Left InvalidAppendedPointerFile + | restvalid (dropleading '\n' rest) -> Right (Just k) + | otherwise -> Left InvalidAppendedPointerFile + Nothing -> Right Nothing where parsekey l | isLinkToAnnex l = fileKey $ snd $ S8.breakEnd pathsep l @@ -344,9 +351,13 @@ parseLinkTargetOrPointer b {- Avoid looking at more of the lazy ByteString than necessary since it - could be reading from a large file that is not a pointer file. -} parseLinkTargetOrPointerLazy :: L.ByteString -> Maybe Key -parseLinkTargetOrPointerLazy b = +parseLinkTargetOrPointerLazy = either (const Nothing) id + . parseLinkTargetOrPointerLazy' + +parseLinkTargetOrPointerLazy' :: L.ByteString -> Either InvalidAppendedPointerFile (Maybe Key) +parseLinkTargetOrPointerLazy' b = let b' = L.take (fromIntegral maxPointerSz) b - in parseLinkTargetOrPointer (L.toStrict b') + in parseLinkTargetOrPointer' (L.toStrict b') formatPointer :: Key -> S.ByteString formatPointer k = prefix <> keyFile k <> nl diff --git a/CHANGELOG b/CHANGELOG index 76ca493c55..13306b9802 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,8 @@ git-annex (10.20220223) UNRELEASED; urgency=medium some other content appended to it, and avoid treating it as a pointer file, so that appended content will not be checked into git, but will be annexed like any other file. + * smudge: Warn when encountering a pointer file that has other content + appended to it. -- Joey Hess Wed, 23 Feb 2022 14:14:09 -0400 diff --git a/Command/FilterProcess.hs b/Command/FilterProcess.hs index d53303c6fd..ff20dd7268 100644 --- a/Command/FilterProcess.hs +++ b/Command/FilterProcess.hs @@ -83,7 +83,7 @@ clean file = do -- hash the content provided by git, but Backend does not currently -- have an interface to do so. Command.Smudge.clean' (toRawFilePath file) - (parseLinkTargetOrPointer b) + (parseLinkTargetOrPointer' b) passthrough discardreststdin emitpointer diff --git a/Command/Smudge.hs b/Command/Smudge.hs index 02043341d2..a5d8871998 100644 --- a/Command/Smudge.hs +++ b/Command/Smudge.hs @@ -1,6 +1,6 @@ {- git-annex command - - - Copyright 2015-2021 Joey Hess + - Copyright 2015-2022 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -105,7 +105,7 @@ clean file = do then L.length b `seq` return () else liftIO $ hClose stdin let emitpointer = liftIO . S.hPut stdout . formatPointer - clean' file (parseLinkTargetOrPointerLazy b) + clean' file (parseLinkTargetOrPointerLazy' b) passthrough discardreststdin emitpointer @@ -115,7 +115,7 @@ clean file = do -- Handles everything except the IO of the file content. clean' :: RawFilePath - -> Maybe Key + -> Either InvalidAppendedPointerFile (Maybe Key) -- ^ If the content provided by git is an annex pointer, -- this is the key it points to. -> Annex () @@ -135,19 +135,26 @@ clean' file mk passthrough discardreststdin emitpointer = where go = case mk of - Just k -> do + Right (Just k) -> do addingExistingLink file k $ do getMoveRaceRecovery k file passthrough - Nothing -> inRepo (Git.Ref.fileRef file) >>= \case - Just fileref -> do - indexmeta <- catObjectMetaData fileref - oldkey <- case indexmeta of - Just (_, sz, _) -> catKey' fileref sz - Nothing -> return Nothing - go' indexmeta oldkey - Nothing -> passthrough - go' indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey) + Right Nothing -> notpointer + Left InvalidAppendedPointerFile -> do + toplevelWarning False $ + "The file \"" ++ fromRawFilePath file ++ "\" looks like git-annex pointer file that has had other content appended to it" + notpointer + + notpointer = inRepo (Git.Ref.fileRef file) >>= \case + Just fileref -> do + indexmeta <- catObjectMetaData fileref + oldkey <- case indexmeta of + Just (_, sz, _) -> catKey' fileref sz + Nothing -> return Nothing + notpointer' indexmeta oldkey + Nothing -> passthrough + + notpointer' indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey) ( do discardreststdin diff --git a/doc/bugs/prevent_silent_data_loss_on_unlocked_files.mdwn b/doc/bugs/prevent_silent_data_loss_on_unlocked_files.mdwn index a378101743..ea3e135cef 100644 --- a/doc/bugs/prevent_silent_data_loss_on_unlocked_files.mdwn +++ b/doc/bugs/prevent_silent_data_loss_on_unlocked_files.mdwn @@ -72,3 +72,5 @@ may be situation is even more "dire" because git-annex still considers this file [[!meta author=yoh]] [[!tag projects/datalad]] + +> [[fixed|done]] (at least as far as it can be fixed) --[[Joey]] diff --git a/doc/bugs/prevent_silent_data_loss_on_unlocked_files/comment_5_adb69305a31cf55ff62497b6de8e2f6a._comment b/doc/bugs/prevent_silent_data_loss_on_unlocked_files/comment_5_adb69305a31cf55ff62497b6de8e2f6a._comment new file mode 100644 index 0000000000..362dd8a378 --- /dev/null +++ b/doc/bugs/prevent_silent_data_loss_on_unlocked_files/comment_5_adb69305a31cf55ff62497b6de8e2f6a._comment @@ -0,0 +1,10 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 5""" + date="2022-02-23T19:13:25Z" + content=""" + joey@darkstar:/tmp/r>git add xx + git-annex: The file "xx" looks like git-annex pointer file that has had other content appended to it + +I think this is as far as git-annex can do toward preventing foot shooting here. +"""]]