fix a case where file tracked by git unexpectedly becomes annex pointer file
smudge: When annex.largefiles=anything, files that were already stored in git, and have not been modified could sometimes be converted to being stored in the annex. Changes in 7.20191024 made this more of a problem. This case is now detected and prevented.
This commit is contained in:
parent
b9481c6ba0
commit
ea3cb7d277
10 changed files with 90 additions and 34 deletions
|
@ -1,6 +1,6 @@
|
|||
{- git cat-file interface, with handle automatically stored in the Annex monad
|
||||
-
|
||||
- Copyright 2011-2018 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
@ -16,6 +16,7 @@ module Annex.CatFile (
|
|||
catObjectMetaData,
|
||||
catFileStop,
|
||||
catKey,
|
||||
catKey',
|
||||
catSymLinkTarget,
|
||||
catKeyFile,
|
||||
catKeyFileHEAD,
|
||||
|
@ -54,7 +55,7 @@ catObject ref = do
|
|||
h <- catFileHandle
|
||||
liftIO $ Git.CatFile.catObject h ref
|
||||
|
||||
catObjectMetaData :: Git.Ref -> Annex (Maybe (Integer, ObjectType))
|
||||
catObjectMetaData :: Git.Ref -> Annex (Maybe (Sha, Integer, ObjectType))
|
||||
catObjectMetaData ref = do
|
||||
h <- catFileHandle
|
||||
liftIO $ Git.CatFile.catObjectMetaData h ref
|
||||
|
@ -99,14 +100,16 @@ catFileStop = do
|
|||
|
||||
{- From ref to a symlink or a pointer file, get the key. -}
|
||||
catKey :: Ref -> Annex (Maybe Key)
|
||||
catKey ref = go =<< catObjectMetaData ref
|
||||
where
|
||||
go (Just (sz, _))
|
||||
-- Avoid catting large files, that cannot be symlinks or
|
||||
-- pointer files, which would require buffering their
|
||||
-- content in memory, as well as a lot of IO.
|
||||
| sz <= maxPointerSz = parseLinkTargetOrPointer . L.toStrict <$> catObject ref
|
||||
go _ = return Nothing
|
||||
catKey ref = catKey' ref =<< catObjectMetaData ref
|
||||
|
||||
catKey' :: Ref -> Maybe (Sha, Integer, ObjectType) -> Annex (Maybe Key)
|
||||
catKey' ref (Just (_, sz, _))
|
||||
-- Avoid catting large files, that cannot be symlinks or
|
||||
-- pointer files, which would require buffering their
|
||||
-- content in memory, as well as a lot of IO.
|
||||
| sz <= maxPointerSz =
|
||||
parseLinkTargetOrPointer . L.toStrict <$> catObject ref
|
||||
catKey' _ _ = return Nothing
|
||||
|
||||
{- Gets a symlink target. -}
|
||||
catSymLinkTarget :: Sha -> Annex RawFilePath
|
||||
|
@ -151,7 +154,7 @@ catKeyFileHEAD f = catKey $ Git.Ref.fileFromRef Git.Ref.headRef f
|
|||
catKeyFileHidden :: RawFilePath -> CurrBranch -> Annex (Maybe Key)
|
||||
catKeyFileHidden = hiddenCat catKey
|
||||
|
||||
catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Integer, ObjectType))
|
||||
catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Sha, Integer, ObjectType))
|
||||
catObjectMetaDataHidden = hiddenCat catObjectMetaData
|
||||
|
||||
hiddenCat :: (Ref -> Annex (Maybe a)) -> RawFilePath -> CurrBranch -> Annex (Maybe a)
|
||||
|
|
|
@ -21,7 +21,7 @@ hashObjectHandle :: Annex Git.HashObject.HashObjectHandle
|
|||
hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle
|
||||
where
|
||||
startup = do
|
||||
h <- inRepo $ Git.HashObject.hashObjectStart
|
||||
h <- inRepo $ Git.HashObject.hashObjectStart True
|
||||
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h }
|
||||
return h
|
||||
|
||||
|
|
|
@ -19,6 +19,10 @@ git-annex (7.20191219) UNRELEASED; urgency=medium
|
|||
setting, except for in the special case of annex.securehashesonly.
|
||||
* Improve file ordering behavior when one parameter is "." and other
|
||||
parameters are other directories.
|
||||
* smudge: When annex.largefiles=anything, files that were already
|
||||
stored in git, and have not been modified could sometimes be converted
|
||||
to being stored in the annex. Changes in 7.20191024 made this more
|
||||
of a problem. This case is now detected and prevented.
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Wed, 18 Dec 2019 15:12:40 -0400
|
||||
|
||||
|
|
|
@ -18,7 +18,10 @@ import Logs.Location
|
|||
import qualified Database.Keys
|
||||
import qualified Git.BuildVersion
|
||||
import Git.FilePath
|
||||
import Git.Types
|
||||
import Git.HashObject
|
||||
import qualified Git
|
||||
import qualified Git.Ref
|
||||
import qualified Annex
|
||||
import Backend
|
||||
import Utility.Metered
|
||||
|
@ -88,11 +91,14 @@ clean file = do
|
|||
Just k -> do
|
||||
getMoveRaceRecovery k (toRawFilePath file)
|
||||
liftIO $ L.hPut stdout b
|
||||
Nothing -> go b =<< catKeyFile (toRawFilePath file)
|
||||
Nothing -> do
|
||||
let fileref = Git.Ref.fileRef (toRawFilePath file)
|
||||
indexmeta <- catObjectMetaData fileref
|
||||
go b indexmeta =<< catKey' fileref indexmeta
|
||||
)
|
||||
stop
|
||||
where
|
||||
go b oldkey = ifM (shouldAnnex file oldkey)
|
||||
go b indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
|
||||
( do
|
||||
-- Before git 2.5, failing to consume all stdin here
|
||||
-- would cause a SIGPIPE and crash it.
|
||||
|
@ -153,17 +159,17 @@ clean file = do
|
|||
-- added to the annex, so will be added to git. But some heuristics
|
||||
-- are used to avoid bad behavior:
|
||||
--
|
||||
-- If the index already contains the file, preserve its annexed/not annexed
|
||||
-- state. This prevents accidental conversions.
|
||||
-- If the file is annexed in the index, keep it annexed.
|
||||
-- This prevents accidental conversions.
|
||||
--
|
||||
-- Otherwise, when the file's inode is the same as one that was used for
|
||||
-- annexed content before, annex it. This handles cases such as renaming an
|
||||
-- unlocked annexed file followed by git add, which the user naturally
|
||||
-- expects to behave the same as git mv.
|
||||
shouldAnnex :: FilePath -> Maybe Key -> Annex Bool
|
||||
shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
|
||||
( checkmatcher checkheuristics
|
||||
, checkheuristics
|
||||
shouldAnnex :: FilePath -> Maybe (Sha, FileSize, ObjectType) -> Maybe Key -> Annex Bool
|
||||
shouldAnnex file indexmeta moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
|
||||
( checkunchangedgitfile $ checkmatcher checkheuristics
|
||||
, checkunchangedgitfile checkheuristics
|
||||
)
|
||||
where
|
||||
checkmatcher d = do
|
||||
|
@ -178,6 +184,31 @@ shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
|
|||
Nothing -> pure False
|
||||
Just ic -> Database.Keys.isInodeKnown ic =<< sentinalStatus
|
||||
|
||||
-- This checks for a case where the file had been added to git
|
||||
-- previously, not to the annex before, and its content is not
|
||||
-- changed, but git is running the clean filter again on it
|
||||
-- (eg because its mtime or inode changed, or just because git feels
|
||||
-- like it). Such a file should not be added to the annex, even if
|
||||
-- annex.largefiles now matches it, because the content is not
|
||||
-- changed.
|
||||
checkunchangedgitfile cont = case (moldkey, indexmeta) of
|
||||
(Nothing, Just (sha, sz, _)) -> liftIO (catchMaybeIO (getFileSize file)) >>= \case
|
||||
Just sz' | sz' == sz -> do
|
||||
-- The size is the same, so the file
|
||||
-- is not much larger than what was stored
|
||||
-- in git before, so it won't be out of
|
||||
-- line to hash it. However, the content
|
||||
-- is prevented from being stored in git
|
||||
-- when hashing.
|
||||
h <- inRepo $ hashObjectStart False
|
||||
sha' <- liftIO $ hashFile h file
|
||||
liftIO $ hashObjectStop h
|
||||
if sha' == sha
|
||||
then return False
|
||||
else cont
|
||||
_ -> cont
|
||||
_ -> cont
|
||||
|
||||
emitPointer :: Key -> IO ()
|
||||
emitPointer = S.putStr . formatPointer
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{- git cat-file interface
|
||||
-
|
||||
- Copyright 2011-2018 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
@ -33,6 +33,7 @@ import Text.Read
|
|||
import Common
|
||||
import Git
|
||||
import Git.Sha
|
||||
import qualified Git.Ref
|
||||
import Git.Command
|
||||
import Git.Types
|
||||
import Git.FilePath
|
||||
|
@ -109,22 +110,23 @@ catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \f
|
|||
return (Just (content, sha, objtype))
|
||||
|
||||
{- Gets the size and type of an object, without reading its content. -}
|
||||
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Integer, ObjectType))
|
||||
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType))
|
||||
catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do
|
||||
resp <- hGetLine from
|
||||
case parseResp object resp of
|
||||
Just (ParsedResp _ size objtype) ->
|
||||
return $ Just (size, objtype)
|
||||
Just (ParsedResp sha size objtype) ->
|
||||
return $ Just (sha, size, objtype)
|
||||
Just DNE -> return Nothing
|
||||
Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object)
|
||||
where
|
||||
-- Slow fallback path for filenames containing newlines.
|
||||
newlinefallback = do
|
||||
sha <- Git.Ref.sha object (gitRepo h)
|
||||
sz <- querySize object (gitRepo h)
|
||||
objtype <- queryObjectType object (gitRepo h)
|
||||
return $ (,) <$> sz <*> objtype
|
||||
return $ (,,) <$> sha <*> sz <*> objtype
|
||||
|
||||
data ParsedResp = ParsedResp Sha Integer ObjectType | DNE
|
||||
data ParsedResp = ParsedResp Sha FileSize ObjectType | DNE
|
||||
|
||||
query :: CoProcess.CoProcessHandle -> Ref -> IO a -> (Handle -> IO a) -> IO a
|
||||
query hdl object newlinefallback receive
|
||||
|
@ -180,7 +182,7 @@ querySingle o r repo reader = assertLocal repo $
|
|||
, return Nothing
|
||||
)
|
||||
|
||||
querySize :: Ref -> Repo -> IO (Maybe Integer)
|
||||
querySize :: Ref -> Repo -> IO (Maybe FileSize)
|
||||
querySize r repo = maybe Nothing (readMaybe . takeWhile (/= '\n'))
|
||||
<$> querySingle (Param "-s") r repo hGetContentsStrict
|
||||
|
||||
|
|
|
@ -23,12 +23,12 @@ import Data.ByteString.Builder
|
|||
|
||||
type HashObjectHandle = CoProcess.CoProcessHandle
|
||||
|
||||
hashObjectStart :: Repo -> IO HashObjectHandle
|
||||
hashObjectStart = gitCoProcessStart True
|
||||
[ Param "hash-object"
|
||||
, Param "-w"
|
||||
, Param "--stdin-paths"
|
||||
, Param "--no-filters"
|
||||
hashObjectStart :: Bool -> Repo -> IO HashObjectHandle
|
||||
hashObjectStart writeobject = gitCoProcessStart True $ catMaybes
|
||||
[ Just (Param "hash-object")
|
||||
, if writeobject then Just (Param "-w") else Nothing
|
||||
, Just (Param "--stdin-paths")
|
||||
, Just (Param "--no-filters")
|
||||
]
|
||||
|
||||
hashObjectStop :: HashObjectHandle -> IO ()
|
||||
|
|
|
@ -31,7 +31,7 @@ import Git.FilePath
|
|||
-}
|
||||
merge :: Ref -> Ref -> Repo -> IO ()
|
||||
merge x y repo = do
|
||||
hashhandle <- hashObjectStart repo
|
||||
hashhandle <- hashObjectStart True repo
|
||||
ch <- catFileStart repo
|
||||
streamUpdateIndex repo
|
||||
[ lsTree x repo
|
||||
|
|
|
@ -75,3 +75,5 @@ Thanks for having a look.
|
|||
|
||||
[[!meta author=kyle]]
|
||||
[[!tag projects/datalad]]
|
||||
|
||||
> [[fixed|done]] --[[Joey]]
|
||||
|
|
|
@ -41,4 +41,5 @@ add a specific file to annex or git, bypassing annex.largefiles and all
|
|||
other configuration and state. This could also be used to easily switch
|
||||
a file from one storage to the other. I'd hope the existence of that
|
||||
would prevent one-off setting of annex.largefiles=anything/nothing.
|
||||
[[todo/git_annex_add_option_to_control_to_where]]
|
||||
"""]]
|
||||
|
|
13
doc/todo/git_annex_add_option_to_control_to_where.mdwn
Normal file
13
doc/todo/git_annex_add_option_to_control_to_where.mdwn
Normal file
|
@ -0,0 +1,13 @@
|
|||
Make `git-annex add --annex` and `git-annex add --git` add a specific file to
|
||||
annex or git, bypassing annex.largefiles and all other configuration and state.
|
||||
|
||||
One reason to want this is that it avoids users doing stuff like this:
|
||||
|
||||
git -c annex.largefiles=anything annex add foo.c
|
||||
|
||||
Such a temporary setting of annex.largefiles can be problimatic, as explored in
|
||||
<https://git-annex.branchable.com/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/>
|
||||
|
||||
Also, this could also be used to easily switch a file from one storage to
|
||||
the other. I suppose the file would have to be touched first to make git-annex
|
||||
add process it?
|
Loading…
Add table
Add a link
Reference in a new issue