fix a case where file tracked by git unexpectedly becomes annex pointer file
smudge: When annex.largefiles=anything, files that were already stored in git, and have not been modified could sometimes be converted to being stored in the annex. Changes in 7.20191024 made this more of a problem. This case is now detected and prevented.
This commit is contained in:
parent
b9481c6ba0
commit
ea3cb7d277
10 changed files with 90 additions and 34 deletions
|
@ -1,6 +1,6 @@
|
||||||
{- git cat-file interface, with handle automatically stored in the Annex monad
|
{- git cat-file interface, with handle automatically stored in the Annex monad
|
||||||
-
|
-
|
||||||
- Copyright 2011-2018 Joey Hess <id@joeyh.name>
|
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU AGPL version 3 or higher.
|
- Licensed under the GNU AGPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
@ -16,6 +16,7 @@ module Annex.CatFile (
|
||||||
catObjectMetaData,
|
catObjectMetaData,
|
||||||
catFileStop,
|
catFileStop,
|
||||||
catKey,
|
catKey,
|
||||||
|
catKey',
|
||||||
catSymLinkTarget,
|
catSymLinkTarget,
|
||||||
catKeyFile,
|
catKeyFile,
|
||||||
catKeyFileHEAD,
|
catKeyFileHEAD,
|
||||||
|
@ -54,7 +55,7 @@ catObject ref = do
|
||||||
h <- catFileHandle
|
h <- catFileHandle
|
||||||
liftIO $ Git.CatFile.catObject h ref
|
liftIO $ Git.CatFile.catObject h ref
|
||||||
|
|
||||||
catObjectMetaData :: Git.Ref -> Annex (Maybe (Integer, ObjectType))
|
catObjectMetaData :: Git.Ref -> Annex (Maybe (Sha, Integer, ObjectType))
|
||||||
catObjectMetaData ref = do
|
catObjectMetaData ref = do
|
||||||
h <- catFileHandle
|
h <- catFileHandle
|
||||||
liftIO $ Git.CatFile.catObjectMetaData h ref
|
liftIO $ Git.CatFile.catObjectMetaData h ref
|
||||||
|
@ -99,14 +100,16 @@ catFileStop = do
|
||||||
|
|
||||||
{- From ref to a symlink or a pointer file, get the key. -}
|
{- From ref to a symlink or a pointer file, get the key. -}
|
||||||
catKey :: Ref -> Annex (Maybe Key)
|
catKey :: Ref -> Annex (Maybe Key)
|
||||||
catKey ref = go =<< catObjectMetaData ref
|
catKey ref = catKey' ref =<< catObjectMetaData ref
|
||||||
where
|
|
||||||
go (Just (sz, _))
|
catKey' :: Ref -> Maybe (Sha, Integer, ObjectType) -> Annex (Maybe Key)
|
||||||
-- Avoid catting large files, that cannot be symlinks or
|
catKey' ref (Just (_, sz, _))
|
||||||
-- pointer files, which would require buffering their
|
-- Avoid catting large files, that cannot be symlinks or
|
||||||
-- content in memory, as well as a lot of IO.
|
-- pointer files, which would require buffering their
|
||||||
| sz <= maxPointerSz = parseLinkTargetOrPointer . L.toStrict <$> catObject ref
|
-- content in memory, as well as a lot of IO.
|
||||||
go _ = return Nothing
|
| sz <= maxPointerSz =
|
||||||
|
parseLinkTargetOrPointer . L.toStrict <$> catObject ref
|
||||||
|
catKey' _ _ = return Nothing
|
||||||
|
|
||||||
{- Gets a symlink target. -}
|
{- Gets a symlink target. -}
|
||||||
catSymLinkTarget :: Sha -> Annex RawFilePath
|
catSymLinkTarget :: Sha -> Annex RawFilePath
|
||||||
|
@ -151,7 +154,7 @@ catKeyFileHEAD f = catKey $ Git.Ref.fileFromRef Git.Ref.headRef f
|
||||||
catKeyFileHidden :: RawFilePath -> CurrBranch -> Annex (Maybe Key)
|
catKeyFileHidden :: RawFilePath -> CurrBranch -> Annex (Maybe Key)
|
||||||
catKeyFileHidden = hiddenCat catKey
|
catKeyFileHidden = hiddenCat catKey
|
||||||
|
|
||||||
catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Integer, ObjectType))
|
catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Sha, Integer, ObjectType))
|
||||||
catObjectMetaDataHidden = hiddenCat catObjectMetaData
|
catObjectMetaDataHidden = hiddenCat catObjectMetaData
|
||||||
|
|
||||||
hiddenCat :: (Ref -> Annex (Maybe a)) -> RawFilePath -> CurrBranch -> Annex (Maybe a)
|
hiddenCat :: (Ref -> Annex (Maybe a)) -> RawFilePath -> CurrBranch -> Annex (Maybe a)
|
||||||
|
|
|
@ -21,7 +21,7 @@ hashObjectHandle :: Annex Git.HashObject.HashObjectHandle
|
||||||
hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle
|
hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle
|
||||||
where
|
where
|
||||||
startup = do
|
startup = do
|
||||||
h <- inRepo $ Git.HashObject.hashObjectStart
|
h <- inRepo $ Git.HashObject.hashObjectStart True
|
||||||
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h }
|
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h }
|
||||||
return h
|
return h
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,10 @@ git-annex (7.20191219) UNRELEASED; urgency=medium
|
||||||
setting, except for in the special case of annex.securehashesonly.
|
setting, except for in the special case of annex.securehashesonly.
|
||||||
* Improve file ordering behavior when one parameter is "." and other
|
* Improve file ordering behavior when one parameter is "." and other
|
||||||
parameters are other directories.
|
parameters are other directories.
|
||||||
|
* smudge: When annex.largefiles=anything, files that were already
|
||||||
|
stored in git, and have not been modified could sometimes be converted
|
||||||
|
to being stored in the annex. Changes in 7.20191024 made this more
|
||||||
|
of a problem. This case is now detected and prevented.
|
||||||
|
|
||||||
-- Joey Hess <id@joeyh.name> Wed, 18 Dec 2019 15:12:40 -0400
|
-- Joey Hess <id@joeyh.name> Wed, 18 Dec 2019 15:12:40 -0400
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,10 @@ import Logs.Location
|
||||||
import qualified Database.Keys
|
import qualified Database.Keys
|
||||||
import qualified Git.BuildVersion
|
import qualified Git.BuildVersion
|
||||||
import Git.FilePath
|
import Git.FilePath
|
||||||
|
import Git.Types
|
||||||
|
import Git.HashObject
|
||||||
import qualified Git
|
import qualified Git
|
||||||
|
import qualified Git.Ref
|
||||||
import qualified Annex
|
import qualified Annex
|
||||||
import Backend
|
import Backend
|
||||||
import Utility.Metered
|
import Utility.Metered
|
||||||
|
@ -88,11 +91,14 @@ clean file = do
|
||||||
Just k -> do
|
Just k -> do
|
||||||
getMoveRaceRecovery k (toRawFilePath file)
|
getMoveRaceRecovery k (toRawFilePath file)
|
||||||
liftIO $ L.hPut stdout b
|
liftIO $ L.hPut stdout b
|
||||||
Nothing -> go b =<< catKeyFile (toRawFilePath file)
|
Nothing -> do
|
||||||
|
let fileref = Git.Ref.fileRef (toRawFilePath file)
|
||||||
|
indexmeta <- catObjectMetaData fileref
|
||||||
|
go b indexmeta =<< catKey' fileref indexmeta
|
||||||
)
|
)
|
||||||
stop
|
stop
|
||||||
where
|
where
|
||||||
go b oldkey = ifM (shouldAnnex file oldkey)
|
go b indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
|
||||||
( do
|
( do
|
||||||
-- Before git 2.5, failing to consume all stdin here
|
-- Before git 2.5, failing to consume all stdin here
|
||||||
-- would cause a SIGPIPE and crash it.
|
-- would cause a SIGPIPE and crash it.
|
||||||
|
@ -153,17 +159,17 @@ clean file = do
|
||||||
-- added to the annex, so will be added to git. But some heuristics
|
-- added to the annex, so will be added to git. But some heuristics
|
||||||
-- are used to avoid bad behavior:
|
-- are used to avoid bad behavior:
|
||||||
--
|
--
|
||||||
-- If the index already contains the file, preserve its annexed/not annexed
|
-- If the file is annexed in the index, keep it annexed.
|
||||||
-- state. This prevents accidental conversions.
|
-- This prevents accidental conversions.
|
||||||
--
|
--
|
||||||
-- Otherwise, when the file's inode is the same as one that was used for
|
-- Otherwise, when the file's inode is the same as one that was used for
|
||||||
-- annexed content before, annex it. This handles cases such as renaming an
|
-- annexed content before, annex it. This handles cases such as renaming an
|
||||||
-- unlocked annexed file followed by git add, which the user naturally
|
-- unlocked annexed file followed by git add, which the user naturally
|
||||||
-- expects to behave the same as git mv.
|
-- expects to behave the same as git mv.
|
||||||
shouldAnnex :: FilePath -> Maybe Key -> Annex Bool
|
shouldAnnex :: FilePath -> Maybe (Sha, FileSize, ObjectType) -> Maybe Key -> Annex Bool
|
||||||
shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
|
shouldAnnex file indexmeta moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
|
||||||
( checkmatcher checkheuristics
|
( checkunchangedgitfile $ checkmatcher checkheuristics
|
||||||
, checkheuristics
|
, checkunchangedgitfile checkheuristics
|
||||||
)
|
)
|
||||||
where
|
where
|
||||||
checkmatcher d = do
|
checkmatcher d = do
|
||||||
|
@ -178,6 +184,31 @@ shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
|
||||||
Nothing -> pure False
|
Nothing -> pure False
|
||||||
Just ic -> Database.Keys.isInodeKnown ic =<< sentinalStatus
|
Just ic -> Database.Keys.isInodeKnown ic =<< sentinalStatus
|
||||||
|
|
||||||
|
-- This checks for a case where the file had been added to git
|
||||||
|
-- previously, not to the annex before, and its content is not
|
||||||
|
-- changed, but git is running the clean filter again on it
|
||||||
|
-- (eg because its mtime or inode changed, or just because git feels
|
||||||
|
-- like it). Such a file should not be added to the annex, even if
|
||||||
|
-- annex.largefiles now matches it, because the content is not
|
||||||
|
-- changed.
|
||||||
|
checkunchangedgitfile cont = case (moldkey, indexmeta) of
|
||||||
|
(Nothing, Just (sha, sz, _)) -> liftIO (catchMaybeIO (getFileSize file)) >>= \case
|
||||||
|
Just sz' | sz' == sz -> do
|
||||||
|
-- The size is the same, so the file
|
||||||
|
-- is not much larger than what was stored
|
||||||
|
-- in git before, so it won't be out of
|
||||||
|
-- line to hash it. However, the content
|
||||||
|
-- is prevented from being stored in git
|
||||||
|
-- when hashing.
|
||||||
|
h <- inRepo $ hashObjectStart False
|
||||||
|
sha' <- liftIO $ hashFile h file
|
||||||
|
liftIO $ hashObjectStop h
|
||||||
|
if sha' == sha
|
||||||
|
then return False
|
||||||
|
else cont
|
||||||
|
_ -> cont
|
||||||
|
_ -> cont
|
||||||
|
|
||||||
emitPointer :: Key -> IO ()
|
emitPointer :: Key -> IO ()
|
||||||
emitPointer = S.putStr . formatPointer
|
emitPointer = S.putStr . formatPointer
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{- git cat-file interface
|
{- git cat-file interface
|
||||||
-
|
-
|
||||||
- Copyright 2011-2018 Joey Hess <id@joeyh.name>
|
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU AGPL version 3 or higher.
|
- Licensed under the GNU AGPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
@ -33,6 +33,7 @@ import Text.Read
|
||||||
import Common
|
import Common
|
||||||
import Git
|
import Git
|
||||||
import Git.Sha
|
import Git.Sha
|
||||||
|
import qualified Git.Ref
|
||||||
import Git.Command
|
import Git.Command
|
||||||
import Git.Types
|
import Git.Types
|
||||||
import Git.FilePath
|
import Git.FilePath
|
||||||
|
@ -109,22 +110,23 @@ catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \f
|
||||||
return (Just (content, sha, objtype))
|
return (Just (content, sha, objtype))
|
||||||
|
|
||||||
{- Gets the size and type of an object, without reading its content. -}
|
{- Gets the size and type of an object, without reading its content. -}
|
||||||
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Integer, ObjectType))
|
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType))
|
||||||
catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do
|
catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do
|
||||||
resp <- hGetLine from
|
resp <- hGetLine from
|
||||||
case parseResp object resp of
|
case parseResp object resp of
|
||||||
Just (ParsedResp _ size objtype) ->
|
Just (ParsedResp sha size objtype) ->
|
||||||
return $ Just (size, objtype)
|
return $ Just (sha, size, objtype)
|
||||||
Just DNE -> return Nothing
|
Just DNE -> return Nothing
|
||||||
Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object)
|
Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object)
|
||||||
where
|
where
|
||||||
-- Slow fallback path for filenames containing newlines.
|
-- Slow fallback path for filenames containing newlines.
|
||||||
newlinefallback = do
|
newlinefallback = do
|
||||||
|
sha <- Git.Ref.sha object (gitRepo h)
|
||||||
sz <- querySize object (gitRepo h)
|
sz <- querySize object (gitRepo h)
|
||||||
objtype <- queryObjectType object (gitRepo h)
|
objtype <- queryObjectType object (gitRepo h)
|
||||||
return $ (,) <$> sz <*> objtype
|
return $ (,,) <$> sha <*> sz <*> objtype
|
||||||
|
|
||||||
data ParsedResp = ParsedResp Sha Integer ObjectType | DNE
|
data ParsedResp = ParsedResp Sha FileSize ObjectType | DNE
|
||||||
|
|
||||||
query :: CoProcess.CoProcessHandle -> Ref -> IO a -> (Handle -> IO a) -> IO a
|
query :: CoProcess.CoProcessHandle -> Ref -> IO a -> (Handle -> IO a) -> IO a
|
||||||
query hdl object newlinefallback receive
|
query hdl object newlinefallback receive
|
||||||
|
@ -180,7 +182,7 @@ querySingle o r repo reader = assertLocal repo $
|
||||||
, return Nothing
|
, return Nothing
|
||||||
)
|
)
|
||||||
|
|
||||||
querySize :: Ref -> Repo -> IO (Maybe Integer)
|
querySize :: Ref -> Repo -> IO (Maybe FileSize)
|
||||||
querySize r repo = maybe Nothing (readMaybe . takeWhile (/= '\n'))
|
querySize r repo = maybe Nothing (readMaybe . takeWhile (/= '\n'))
|
||||||
<$> querySingle (Param "-s") r repo hGetContentsStrict
|
<$> querySingle (Param "-s") r repo hGetContentsStrict
|
||||||
|
|
||||||
|
|
|
@ -23,12 +23,12 @@ import Data.ByteString.Builder
|
||||||
|
|
||||||
type HashObjectHandle = CoProcess.CoProcessHandle
|
type HashObjectHandle = CoProcess.CoProcessHandle
|
||||||
|
|
||||||
hashObjectStart :: Repo -> IO HashObjectHandle
|
hashObjectStart :: Bool -> Repo -> IO HashObjectHandle
|
||||||
hashObjectStart = gitCoProcessStart True
|
hashObjectStart writeobject = gitCoProcessStart True $ catMaybes
|
||||||
[ Param "hash-object"
|
[ Just (Param "hash-object")
|
||||||
, Param "-w"
|
, if writeobject then Just (Param "-w") else Nothing
|
||||||
, Param "--stdin-paths"
|
, Just (Param "--stdin-paths")
|
||||||
, Param "--no-filters"
|
, Just (Param "--no-filters")
|
||||||
]
|
]
|
||||||
|
|
||||||
hashObjectStop :: HashObjectHandle -> IO ()
|
hashObjectStop :: HashObjectHandle -> IO ()
|
||||||
|
|
|
@ -31,7 +31,7 @@ import Git.FilePath
|
||||||
-}
|
-}
|
||||||
merge :: Ref -> Ref -> Repo -> IO ()
|
merge :: Ref -> Ref -> Repo -> IO ()
|
||||||
merge x y repo = do
|
merge x y repo = do
|
||||||
hashhandle <- hashObjectStart repo
|
hashhandle <- hashObjectStart True repo
|
||||||
ch <- catFileStart repo
|
ch <- catFileStart repo
|
||||||
streamUpdateIndex repo
|
streamUpdateIndex repo
|
||||||
[ lsTree x repo
|
[ lsTree x repo
|
||||||
|
|
|
@ -75,3 +75,5 @@ Thanks for having a look.
|
||||||
|
|
||||||
[[!meta author=kyle]]
|
[[!meta author=kyle]]
|
||||||
[[!tag projects/datalad]]
|
[[!tag projects/datalad]]
|
||||||
|
|
||||||
|
> [[fixed|done]] --[[Joey]]
|
||||||
|
|
|
@ -41,4 +41,5 @@ add a specific file to annex or git, bypassing annex.largefiles and all
|
||||||
other configuration and state. This could also be used to easily switch
|
other configuration and state. This could also be used to easily switch
|
||||||
a file from one storage to the other. I'd hope the existence of that
|
a file from one storage to the other. I'd hope the existence of that
|
||||||
would prevent one-off setting of annex.largefiles=anything/nothing.
|
would prevent one-off setting of annex.largefiles=anything/nothing.
|
||||||
|
[[todo/git_annex_add_option_to_control_to_where]]
|
||||||
"""]]
|
"""]]
|
||||||
|
|
13
doc/todo/git_annex_add_option_to_control_to_where.mdwn
Normal file
13
doc/todo/git_annex_add_option_to_control_to_where.mdwn
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
Make `git-annex add --annex` and `git-annex add --git` add a specific file to
|
||||||
|
annex or git, bypassing annex.largefiles and all other configuration and state.
|
||||||
|
|
||||||
|
One reason to want this is that it avoids users doing stuff like this:
|
||||||
|
|
||||||
|
git -c annex.largefiles=anything annex add foo.c
|
||||||
|
|
||||||
|
Such a temporary setting of annex.largefiles can be problimatic, as explored in
|
||||||
|
<https://git-annex.branchable.com/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/>
|
||||||
|
|
||||||
|
Also, this could also be used to easily switch a file from one storage to
|
||||||
|
the other. I suppose the file would have to be touched first to make git-annex
|
||||||
|
add process it?
|
Loading…
Add table
Add a link
Reference in a new issue