fix a case where file tracked by git unexpectedly becomes annex pointer file

smudge: When annex.largefiles=anything, files that were already stored in
git, and have not been modified could sometimes be converted to being
stored in the annex. Changes in 7.20191024 made this more of a problem.
This case is now detected and prevented.
This commit is contained in:
Joey Hess 2019-12-27 14:58:10 -04:00
parent b9481c6ba0
commit ea3cb7d277
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
10 changed files with 90 additions and 34 deletions

View file

@ -1,6 +1,6 @@
{- git cat-file interface, with handle automatically stored in the Annex monad
-
- Copyright 2011-2018 Joey Hess <id@joeyh.name>
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -16,6 +16,7 @@ module Annex.CatFile (
catObjectMetaData,
catFileStop,
catKey,
catKey',
catSymLinkTarget,
catKeyFile,
catKeyFileHEAD,
@ -54,7 +55,7 @@ catObject ref = do
h <- catFileHandle
liftIO $ Git.CatFile.catObject h ref
catObjectMetaData :: Git.Ref -> Annex (Maybe (Integer, ObjectType))
catObjectMetaData :: Git.Ref -> Annex (Maybe (Sha, Integer, ObjectType))
catObjectMetaData ref = do
h <- catFileHandle
liftIO $ Git.CatFile.catObjectMetaData h ref
@ -99,14 +100,16 @@ catFileStop = do
{- From ref to a symlink or a pointer file, get the key. -}
catKey :: Ref -> Annex (Maybe Key)
catKey ref = go =<< catObjectMetaData ref
where
go (Just (sz, _))
-- Avoid catting large files, that cannot be symlinks or
-- pointer files, which would require buffering their
-- content in memory, as well as a lot of IO.
| sz <= maxPointerSz = parseLinkTargetOrPointer . L.toStrict <$> catObject ref
go _ = return Nothing
catKey ref = catKey' ref =<< catObjectMetaData ref
catKey' :: Ref -> Maybe (Sha, Integer, ObjectType) -> Annex (Maybe Key)
catKey' ref (Just (_, sz, _))
-- Avoid catting large files, that cannot be symlinks or
-- pointer files, which would require buffering their
-- content in memory, as well as a lot of IO.
| sz <= maxPointerSz =
parseLinkTargetOrPointer . L.toStrict <$> catObject ref
catKey' _ _ = return Nothing
{- Gets a symlink target. -}
catSymLinkTarget :: Sha -> Annex RawFilePath
@ -151,7 +154,7 @@ catKeyFileHEAD f = catKey $ Git.Ref.fileFromRef Git.Ref.headRef f
catKeyFileHidden :: RawFilePath -> CurrBranch -> Annex (Maybe Key)
catKeyFileHidden = hiddenCat catKey
catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Integer, ObjectType))
catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Sha, Integer, ObjectType))
catObjectMetaDataHidden = hiddenCat catObjectMetaData
hiddenCat :: (Ref -> Annex (Maybe a)) -> RawFilePath -> CurrBranch -> Annex (Maybe a)

View file

@ -21,7 +21,7 @@ hashObjectHandle :: Annex Git.HashObject.HashObjectHandle
hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle
where
startup = do
h <- inRepo $ Git.HashObject.hashObjectStart
h <- inRepo $ Git.HashObject.hashObjectStart True
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h }
return h

View file

@ -19,6 +19,10 @@ git-annex (7.20191219) UNRELEASED; urgency=medium
setting, except for in the special case of annex.securehashesonly.
* Improve file ordering behavior when one parameter is "." and other
parameters are other directories.
* smudge: When annex.largefiles=anything, files that were already
stored in git, and have not been modified could sometimes be converted
to being stored in the annex. Changes in 7.20191024 made this more
of a problem. This case is now detected and prevented.
-- Joey Hess <id@joeyh.name> Wed, 18 Dec 2019 15:12:40 -0400

View file

@ -18,7 +18,10 @@ import Logs.Location
import qualified Database.Keys
import qualified Git.BuildVersion
import Git.FilePath
import Git.Types
import Git.HashObject
import qualified Git
import qualified Git.Ref
import qualified Annex
import Backend
import Utility.Metered
@ -88,11 +91,14 @@ clean file = do
Just k -> do
getMoveRaceRecovery k (toRawFilePath file)
liftIO $ L.hPut stdout b
Nothing -> go b =<< catKeyFile (toRawFilePath file)
Nothing -> do
let fileref = Git.Ref.fileRef (toRawFilePath file)
indexmeta <- catObjectMetaData fileref
go b indexmeta =<< catKey' fileref indexmeta
)
stop
where
go b oldkey = ifM (shouldAnnex file oldkey)
go b indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
( do
-- Before git 2.5, failing to consume all stdin here
-- would cause a SIGPIPE and crash it.
@ -153,17 +159,17 @@ clean file = do
-- added to the annex, so will be added to git. But some heuristics
-- are used to avoid bad behavior:
--
-- If the index already contains the file, preserve its annexed/not annexed
-- state. This prevents accidental conversions.
-- If the file is annexed in the index, keep it annexed.
-- This prevents accidental conversions.
--
-- Otherwise, when the file's inode is the same as one that was used for
-- annexed content before, annex it. This handles cases such as renaming an
-- unlocked annexed file followed by git add, which the user naturally
-- expects to behave the same as git mv.
shouldAnnex :: FilePath -> Maybe Key -> Annex Bool
shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
( checkmatcher checkheuristics
, checkheuristics
shouldAnnex :: FilePath -> Maybe (Sha, FileSize, ObjectType) -> Maybe Key -> Annex Bool
shouldAnnex file indexmeta moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
( checkunchangedgitfile $ checkmatcher checkheuristics
, checkunchangedgitfile checkheuristics
)
where
checkmatcher d = do
@ -178,6 +184,31 @@ shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
Nothing -> pure False
Just ic -> Database.Keys.isInodeKnown ic =<< sentinalStatus
-- This checks for a case where the file had been added to git
-- previously, not to the annex before, and its content is not
-- changed, but git is running the clean filter again on it
-- (eg because its mtime or inode changed, or just because git feels
-- like it). Such a file should not be added to the annex, even if
-- annex.largefiles now matches it, because the content is not
-- changed.
checkunchangedgitfile cont = case (moldkey, indexmeta) of
(Nothing, Just (sha, sz, _)) -> liftIO (catchMaybeIO (getFileSize file)) >>= \case
Just sz' | sz' == sz -> do
-- The size is the same, so the file
-- is not much larger than what was stored
-- in git before, so it won't be out of
-- line to hash it. However, the content
-- is prevented from being stored in git
-- when hashing.
h <- inRepo $ hashObjectStart False
sha' <- liftIO $ hashFile h file
liftIO $ hashObjectStop h
if sha' == sha
then return False
else cont
_ -> cont
_ -> cont
emitPointer :: Key -> IO ()
emitPointer = S.putStr . formatPointer

View file

@ -1,6 +1,6 @@
{- git cat-file interface
-
- Copyright 2011-2018 Joey Hess <id@joeyh.name>
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -33,6 +33,7 @@ import Text.Read
import Common
import Git
import Git.Sha
import qualified Git.Ref
import Git.Command
import Git.Types
import Git.FilePath
@ -109,22 +110,23 @@ catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \f
return (Just (content, sha, objtype))
{- Gets the size and type of an object, without reading its content. -}
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Integer, ObjectType))
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType))
catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do
resp <- hGetLine from
case parseResp object resp of
Just (ParsedResp _ size objtype) ->
return $ Just (size, objtype)
Just (ParsedResp sha size objtype) ->
return $ Just (sha, size, objtype)
Just DNE -> return Nothing
Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object)
where
-- Slow fallback path for filenames containing newlines.
newlinefallback = do
sha <- Git.Ref.sha object (gitRepo h)
sz <- querySize object (gitRepo h)
objtype <- queryObjectType object (gitRepo h)
return $ (,) <$> sz <*> objtype
return $ (,,) <$> sha <*> sz <*> objtype
data ParsedResp = ParsedResp Sha Integer ObjectType | DNE
data ParsedResp = ParsedResp Sha FileSize ObjectType | DNE
query :: CoProcess.CoProcessHandle -> Ref -> IO a -> (Handle -> IO a) -> IO a
query hdl object newlinefallback receive
@ -180,7 +182,7 @@ querySingle o r repo reader = assertLocal repo $
, return Nothing
)
querySize :: Ref -> Repo -> IO (Maybe Integer)
querySize :: Ref -> Repo -> IO (Maybe FileSize)
querySize r repo = maybe Nothing (readMaybe . takeWhile (/= '\n'))
<$> querySingle (Param "-s") r repo hGetContentsStrict

View file

@ -23,12 +23,12 @@ import Data.ByteString.Builder
type HashObjectHandle = CoProcess.CoProcessHandle
hashObjectStart :: Repo -> IO HashObjectHandle
hashObjectStart = gitCoProcessStart True
[ Param "hash-object"
, Param "-w"
, Param "--stdin-paths"
, Param "--no-filters"
hashObjectStart :: Bool -> Repo -> IO HashObjectHandle
hashObjectStart writeobject = gitCoProcessStart True $ catMaybes
[ Just (Param "hash-object")
, if writeobject then Just (Param "-w") else Nothing
, Just (Param "--stdin-paths")
, Just (Param "--no-filters")
]
hashObjectStop :: HashObjectHandle -> IO ()

View file

@ -31,7 +31,7 @@ import Git.FilePath
-}
merge :: Ref -> Ref -> Repo -> IO ()
merge x y repo = do
hashhandle <- hashObjectStart repo
hashhandle <- hashObjectStart True repo
ch <- catFileStart repo
streamUpdateIndex repo
[ lsTree x repo

View file

@ -75,3 +75,5 @@ Thanks for having a look.
[[!meta author=kyle]]
[[!tag projects/datalad]]
> [[fixed|done]] --[[Joey]]

View file

@ -41,4 +41,5 @@ add a specific file to annex or git, bypassing annex.largefiles and all
other configuration and state. This could also be used to easily switch
a file from one storage to the other. I'd hope the existence of that
would prevent one-off setting of annex.largefiles=anything/nothing.
[[todo/git_annex_add_option_to_control_to_where]]
"""]]

View file

@ -0,0 +1,13 @@
Make `git-annex add --annex` and `git-annex add --git` add a specific file to
annex or git, bypassing annex.largefiles and all other configuration and state.
One reason to want this is that it avoids users doing stuff like this:
git -c annex.largefiles=anything annex add foo.c
Such a temporary setting of annex.largefiles can be problimatic, as explored in
<https://git-annex.branchable.com/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/>
Also, this could also be used to easily switch a file from one storage to
the other. I suppose the file would have to be touched first to make git-annex
add process it?