fix a case where file tracked by git unexpectedly becomes annex pointer file

smudge: When annex.largefiles=anything, files that were already stored in
git, and have not been modified could sometimes be converted to being
stored in the annex. Changes in 7.20191024 made this more of a problem.
This case is now detected and prevented.
This commit is contained in:
Joey Hess 2019-12-27 14:58:10 -04:00
parent b9481c6ba0
commit ea3cb7d277
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
10 changed files with 90 additions and 34 deletions

View file

@ -1,6 +1,6 @@
{- git cat-file interface, with handle automatically stored in the Annex monad {- git cat-file interface, with handle automatically stored in the Annex monad
- -
- Copyright 2011-2018 Joey Hess <id@joeyh.name> - Copyright 2011-2019 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -16,6 +16,7 @@ module Annex.CatFile (
catObjectMetaData, catObjectMetaData,
catFileStop, catFileStop,
catKey, catKey,
catKey',
catSymLinkTarget, catSymLinkTarget,
catKeyFile, catKeyFile,
catKeyFileHEAD, catKeyFileHEAD,
@ -54,7 +55,7 @@ catObject ref = do
h <- catFileHandle h <- catFileHandle
liftIO $ Git.CatFile.catObject h ref liftIO $ Git.CatFile.catObject h ref
catObjectMetaData :: Git.Ref -> Annex (Maybe (Integer, ObjectType)) catObjectMetaData :: Git.Ref -> Annex (Maybe (Sha, Integer, ObjectType))
catObjectMetaData ref = do catObjectMetaData ref = do
h <- catFileHandle h <- catFileHandle
liftIO $ Git.CatFile.catObjectMetaData h ref liftIO $ Git.CatFile.catObjectMetaData h ref
@ -99,14 +100,16 @@ catFileStop = do
{- From ref to a symlink or a pointer file, get the key. -} {- From ref to a symlink or a pointer file, get the key. -}
catKey :: Ref -> Annex (Maybe Key) catKey :: Ref -> Annex (Maybe Key)
catKey ref = go =<< catObjectMetaData ref catKey ref = catKey' ref =<< catObjectMetaData ref
where
go (Just (sz, _)) catKey' :: Ref -> Maybe (Sha, Integer, ObjectType) -> Annex (Maybe Key)
-- Avoid catting large files, that cannot be symlinks or catKey' ref (Just (_, sz, _))
-- pointer files, which would require buffering their -- Avoid catting large files, that cannot be symlinks or
-- content in memory, as well as a lot of IO. -- pointer files, which would require buffering their
| sz <= maxPointerSz = parseLinkTargetOrPointer . L.toStrict <$> catObject ref -- content in memory, as well as a lot of IO.
go _ = return Nothing | sz <= maxPointerSz =
parseLinkTargetOrPointer . L.toStrict <$> catObject ref
catKey' _ _ = return Nothing
{- Gets a symlink target. -} {- Gets a symlink target. -}
catSymLinkTarget :: Sha -> Annex RawFilePath catSymLinkTarget :: Sha -> Annex RawFilePath
@ -151,7 +154,7 @@ catKeyFileHEAD f = catKey $ Git.Ref.fileFromRef Git.Ref.headRef f
catKeyFileHidden :: RawFilePath -> CurrBranch -> Annex (Maybe Key) catKeyFileHidden :: RawFilePath -> CurrBranch -> Annex (Maybe Key)
catKeyFileHidden = hiddenCat catKey catKeyFileHidden = hiddenCat catKey
catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Integer, ObjectType)) catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Sha, Integer, ObjectType))
catObjectMetaDataHidden = hiddenCat catObjectMetaData catObjectMetaDataHidden = hiddenCat catObjectMetaData
hiddenCat :: (Ref -> Annex (Maybe a)) -> RawFilePath -> CurrBranch -> Annex (Maybe a) hiddenCat :: (Ref -> Annex (Maybe a)) -> RawFilePath -> CurrBranch -> Annex (Maybe a)

View file

@ -21,7 +21,7 @@ hashObjectHandle :: Annex Git.HashObject.HashObjectHandle
hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle
where where
startup = do startup = do
h <- inRepo $ Git.HashObject.hashObjectStart h <- inRepo $ Git.HashObject.hashObjectStart True
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h } Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h }
return h return h

View file

@ -19,6 +19,10 @@ git-annex (7.20191219) UNRELEASED; urgency=medium
setting, except for in the special case of annex.securehashesonly. setting, except for in the special case of annex.securehashesonly.
* Improve file ordering behavior when one parameter is "." and other * Improve file ordering behavior when one parameter is "." and other
parameters are other directories. parameters are other directories.
* smudge: When annex.largefiles=anything, files that were already
stored in git, and have not been modified could sometimes be converted
to being stored in the annex. Changes in 7.20191024 made this more
of a problem. This case is now detected and prevented.
-- Joey Hess <id@joeyh.name> Wed, 18 Dec 2019 15:12:40 -0400 -- Joey Hess <id@joeyh.name> Wed, 18 Dec 2019 15:12:40 -0400

View file

@ -18,7 +18,10 @@ import Logs.Location
import qualified Database.Keys import qualified Database.Keys
import qualified Git.BuildVersion import qualified Git.BuildVersion
import Git.FilePath import Git.FilePath
import Git.Types
import Git.HashObject
import qualified Git import qualified Git
import qualified Git.Ref
import qualified Annex import qualified Annex
import Backend import Backend
import Utility.Metered import Utility.Metered
@ -88,11 +91,14 @@ clean file = do
Just k -> do Just k -> do
getMoveRaceRecovery k (toRawFilePath file) getMoveRaceRecovery k (toRawFilePath file)
liftIO $ L.hPut stdout b liftIO $ L.hPut stdout b
Nothing -> go b =<< catKeyFile (toRawFilePath file) Nothing -> do
let fileref = Git.Ref.fileRef (toRawFilePath file)
indexmeta <- catObjectMetaData fileref
go b indexmeta =<< catKey' fileref indexmeta
) )
stop stop
where where
go b oldkey = ifM (shouldAnnex file oldkey) go b indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
( do ( do
-- Before git 2.5, failing to consume all stdin here -- Before git 2.5, failing to consume all stdin here
-- would cause a SIGPIPE and crash it. -- would cause a SIGPIPE and crash it.
@ -153,17 +159,17 @@ clean file = do
-- added to the annex, so will be added to git. But some heuristics -- added to the annex, so will be added to git. But some heuristics
-- are used to avoid bad behavior: -- are used to avoid bad behavior:
-- --
-- If the index already contains the file, preserve its annexed/not annexed -- If the file is annexed in the index, keep it annexed.
-- state. This prevents accidental conversions. -- This prevents accidental conversions.
-- --
-- Otherwise, when the file's inode is the same as one that was used for -- Otherwise, when the file's inode is the same as one that was used for
-- annexed content before, annex it. This handles cases such as renaming an -- annexed content before, annex it. This handles cases such as renaming an
-- unlocked annexed file followed by git add, which the user naturally -- unlocked annexed file followed by git add, which the user naturally
-- expects to behave the same as git mv. -- expects to behave the same as git mv.
shouldAnnex :: FilePath -> Maybe Key -> Annex Bool shouldAnnex :: FilePath -> Maybe (Sha, FileSize, ObjectType) -> Maybe Key -> Annex Bool
shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig) shouldAnnex file indexmeta moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
( checkmatcher checkheuristics ( checkunchangedgitfile $ checkmatcher checkheuristics
, checkheuristics , checkunchangedgitfile checkheuristics
) )
where where
checkmatcher d = do checkmatcher d = do
@ -178,6 +184,31 @@ shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
Nothing -> pure False Nothing -> pure False
Just ic -> Database.Keys.isInodeKnown ic =<< sentinalStatus Just ic -> Database.Keys.isInodeKnown ic =<< sentinalStatus
-- This checks for a case where the file had been added to git
-- previously, not to the annex before, and its content is not
-- changed, but git is running the clean filter again on it
-- (eg because its mtime or inode changed, or just because git feels
-- like it). Such a file should not be added to the annex, even if
-- annex.largefiles now matches it, because the content is not
-- changed.
checkunchangedgitfile cont = case (moldkey, indexmeta) of
(Nothing, Just (sha, sz, _)) -> liftIO (catchMaybeIO (getFileSize file)) >>= \case
Just sz' | sz' == sz -> do
-- The size is the same, so the file
-- is not much larger than what was stored
-- in git before, so it won't be out of
-- line to hash it. However, the content
-- is prevented from being stored in git
-- when hashing.
h <- inRepo $ hashObjectStart False
sha' <- liftIO $ hashFile h file
liftIO $ hashObjectStop h
if sha' == sha
then return False
else cont
_ -> cont
_ -> cont
emitPointer :: Key -> IO () emitPointer :: Key -> IO ()
emitPointer = S.putStr . formatPointer emitPointer = S.putStr . formatPointer

View file

@ -1,6 +1,6 @@
{- git cat-file interface {- git cat-file interface
- -
- Copyright 2011-2018 Joey Hess <id@joeyh.name> - Copyright 2011-2019 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -33,6 +33,7 @@ import Text.Read
import Common import Common
import Git import Git
import Git.Sha import Git.Sha
import qualified Git.Ref
import Git.Command import Git.Command
import Git.Types import Git.Types
import Git.FilePath import Git.FilePath
@ -109,22 +110,23 @@ catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \f
return (Just (content, sha, objtype)) return (Just (content, sha, objtype))
{- Gets the size and type of an object, without reading its content. -} {- Gets the size and type of an object, without reading its content. -}
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Integer, ObjectType)) catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType))
catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do
resp <- hGetLine from resp <- hGetLine from
case parseResp object resp of case parseResp object resp of
Just (ParsedResp _ size objtype) -> Just (ParsedResp sha size objtype) ->
return $ Just (size, objtype) return $ Just (sha, size, objtype)
Just DNE -> return Nothing Just DNE -> return Nothing
Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object) Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object)
where where
-- Slow fallback path for filenames containing newlines. -- Slow fallback path for filenames containing newlines.
newlinefallback = do newlinefallback = do
sha <- Git.Ref.sha object (gitRepo h)
sz <- querySize object (gitRepo h) sz <- querySize object (gitRepo h)
objtype <- queryObjectType object (gitRepo h) objtype <- queryObjectType object (gitRepo h)
return $ (,) <$> sz <*> objtype return $ (,,) <$> sha <*> sz <*> objtype
data ParsedResp = ParsedResp Sha Integer ObjectType | DNE data ParsedResp = ParsedResp Sha FileSize ObjectType | DNE
query :: CoProcess.CoProcessHandle -> Ref -> IO a -> (Handle -> IO a) -> IO a query :: CoProcess.CoProcessHandle -> Ref -> IO a -> (Handle -> IO a) -> IO a
query hdl object newlinefallback receive query hdl object newlinefallback receive
@ -180,7 +182,7 @@ querySingle o r repo reader = assertLocal repo $
, return Nothing , return Nothing
) )
querySize :: Ref -> Repo -> IO (Maybe Integer) querySize :: Ref -> Repo -> IO (Maybe FileSize)
querySize r repo = maybe Nothing (readMaybe . takeWhile (/= '\n')) querySize r repo = maybe Nothing (readMaybe . takeWhile (/= '\n'))
<$> querySingle (Param "-s") r repo hGetContentsStrict <$> querySingle (Param "-s") r repo hGetContentsStrict

View file

@ -23,12 +23,12 @@ import Data.ByteString.Builder
type HashObjectHandle = CoProcess.CoProcessHandle type HashObjectHandle = CoProcess.CoProcessHandle
hashObjectStart :: Repo -> IO HashObjectHandle hashObjectStart :: Bool -> Repo -> IO HashObjectHandle
hashObjectStart = gitCoProcessStart True hashObjectStart writeobject = gitCoProcessStart True $ catMaybes
[ Param "hash-object" [ Just (Param "hash-object")
, Param "-w" , if writeobject then Just (Param "-w") else Nothing
, Param "--stdin-paths" , Just (Param "--stdin-paths")
, Param "--no-filters" , Just (Param "--no-filters")
] ]
hashObjectStop :: HashObjectHandle -> IO () hashObjectStop :: HashObjectHandle -> IO ()

View file

@ -31,7 +31,7 @@ import Git.FilePath
-} -}
merge :: Ref -> Ref -> Repo -> IO () merge :: Ref -> Ref -> Repo -> IO ()
merge x y repo = do merge x y repo = do
hashhandle <- hashObjectStart repo hashhandle <- hashObjectStart True repo
ch <- catFileStart repo ch <- catFileStart repo
streamUpdateIndex repo streamUpdateIndex repo
[ lsTree x repo [ lsTree x repo

View file

@ -75,3 +75,5 @@ Thanks for having a look.
[[!meta author=kyle]] [[!meta author=kyle]]
[[!tag projects/datalad]] [[!tag projects/datalad]]
> [[fixed|done]] --[[Joey]]

View file

@ -41,4 +41,5 @@ add a specific file to annex or git, bypassing annex.largefiles and all
other configuration and state. This could also be used to easily switch other configuration and state. This could also be used to easily switch
a file from one storage to the other. I'd hope the existence of that a file from one storage to the other. I'd hope the existence of that
would prevent one-off setting of annex.largefiles=anything/nothing. would prevent one-off setting of annex.largefiles=anything/nothing.
[[todo/git_annex_add_option_to_control_to_where]]
"""]] """]]

View file

@ -0,0 +1,13 @@
Make `git-annex add --annex` and `git-annex add --git` add a specific file to
annex or git, bypassing annex.largefiles and all other configuration and state.
One reason to want this is that it avoids users doing stuff like this:
git -c annex.largefiles=anything annex add foo.c
Such a temporary setting of annex.largefiles can be problimatic, as explored in
<https://git-annex.branchable.com/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/>
Also, this could also be used to easily switch a file from one storage to
the other. I suppose the file would have to be touched first to make git-annex
add process it?