diff --git a/Annex/CatFile.hs b/Annex/CatFile.hs index 2037693e91..427d80db20 100644 --- a/Annex/CatFile.hs +++ b/Annex/CatFile.hs @@ -1,6 +1,6 @@ {- git cat-file interface, with handle automatically stored in the Annex monad - - - Copyright 2011-2018 Joey Hess + - Copyright 2011-2019 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -16,6 +16,7 @@ module Annex.CatFile ( catObjectMetaData, catFileStop, catKey, + catKey', catSymLinkTarget, catKeyFile, catKeyFileHEAD, @@ -54,7 +55,7 @@ catObject ref = do h <- catFileHandle liftIO $ Git.CatFile.catObject h ref -catObjectMetaData :: Git.Ref -> Annex (Maybe (Integer, ObjectType)) +catObjectMetaData :: Git.Ref -> Annex (Maybe (Sha, Integer, ObjectType)) catObjectMetaData ref = do h <- catFileHandle liftIO $ Git.CatFile.catObjectMetaData h ref @@ -99,14 +100,16 @@ catFileStop = do {- From ref to a symlink or a pointer file, get the key. -} catKey :: Ref -> Annex (Maybe Key) -catKey ref = go =<< catObjectMetaData ref - where - go (Just (sz, _)) - -- Avoid catting large files, that cannot be symlinks or - -- pointer files, which would require buffering their - -- content in memory, as well as a lot of IO. - | sz <= maxPointerSz = parseLinkTargetOrPointer . L.toStrict <$> catObject ref - go _ = return Nothing +catKey ref = catKey' ref =<< catObjectMetaData ref + +catKey' :: Ref -> Maybe (Sha, Integer, ObjectType) -> Annex (Maybe Key) +catKey' ref (Just (_, sz, _)) + -- Avoid catting large files, that cannot be symlinks or + -- pointer files, which would require buffering their + -- content in memory, as well as a lot of IO. + | sz <= maxPointerSz = + parseLinkTargetOrPointer . L.toStrict <$> catObject ref +catKey' _ _ = return Nothing {- Gets a symlink target. -} catSymLinkTarget :: Sha -> Annex RawFilePath @@ -151,7 +154,7 @@ catKeyFileHEAD f = catKey $ Git.Ref.fileFromRef Git.Ref.headRef f catKeyFileHidden :: RawFilePath -> CurrBranch -> Annex (Maybe Key) catKeyFileHidden = hiddenCat catKey -catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Integer, ObjectType)) +catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Sha, Integer, ObjectType)) catObjectMetaDataHidden = hiddenCat catObjectMetaData hiddenCat :: (Ref -> Annex (Maybe a)) -> RawFilePath -> CurrBranch -> Annex (Maybe a) diff --git a/Annex/HashObject.hs b/Annex/HashObject.hs index d368318255..7871fefc93 100644 --- a/Annex/HashObject.hs +++ b/Annex/HashObject.hs @@ -21,7 +21,7 @@ hashObjectHandle :: Annex Git.HashObject.HashObjectHandle hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle where startup = do - h <- inRepo $ Git.HashObject.hashObjectStart + h <- inRepo $ Git.HashObject.hashObjectStart True Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h } return h diff --git a/CHANGELOG b/CHANGELOG index 4cde472e26..657de265a8 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -19,6 +19,10 @@ git-annex (7.20191219) UNRELEASED; urgency=medium setting, except for in the special case of annex.securehashesonly. * Improve file ordering behavior when one parameter is "." and other parameters are other directories. + * smudge: When annex.largefiles=anything, files that were already + stored in git, and have not been modified could sometimes be converted + to being stored in the annex. Changes in 7.20191024 made this more + of a problem. This case is now detected and prevented. -- Joey Hess Wed, 18 Dec 2019 15:12:40 -0400 diff --git a/Command/Smudge.hs b/Command/Smudge.hs index d8f6c08454..c2aee6bd2b 100644 --- a/Command/Smudge.hs +++ b/Command/Smudge.hs @@ -18,7 +18,10 @@ import Logs.Location import qualified Database.Keys import qualified Git.BuildVersion import Git.FilePath +import Git.Types +import Git.HashObject import qualified Git +import qualified Git.Ref import qualified Annex import Backend import Utility.Metered @@ -88,11 +91,14 @@ clean file = do Just k -> do getMoveRaceRecovery k (toRawFilePath file) liftIO $ L.hPut stdout b - Nothing -> go b =<< catKeyFile (toRawFilePath file) + Nothing -> do + let fileref = Git.Ref.fileRef (toRawFilePath file) + indexmeta <- catObjectMetaData fileref + go b indexmeta =<< catKey' fileref indexmeta ) stop where - go b oldkey = ifM (shouldAnnex file oldkey) + go b indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey) ( do -- Before git 2.5, failing to consume all stdin here -- would cause a SIGPIPE and crash it. @@ -153,17 +159,17 @@ clean file = do -- added to the annex, so will be added to git. But some heuristics -- are used to avoid bad behavior: -- --- If the index already contains the file, preserve its annexed/not annexed --- state. This prevents accidental conversions. +-- If the file is annexed in the index, keep it annexed. +-- This prevents accidental conversions. -- -- Otherwise, when the file's inode is the same as one that was used for -- annexed content before, annex it. This handles cases such as renaming an -- unlocked annexed file followed by git add, which the user naturally -- expects to behave the same as git mv. -shouldAnnex :: FilePath -> Maybe Key -> Annex Bool -shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig) - ( checkmatcher checkheuristics - , checkheuristics +shouldAnnex :: FilePath -> Maybe (Sha, FileSize, ObjectType) -> Maybe Key -> Annex Bool +shouldAnnex file indexmeta moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig) + ( checkunchangedgitfile $ checkmatcher checkheuristics + , checkunchangedgitfile checkheuristics ) where checkmatcher d = do @@ -178,6 +184,31 @@ shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig) Nothing -> pure False Just ic -> Database.Keys.isInodeKnown ic =<< sentinalStatus + -- This checks for a case where the file had been added to git + -- previously, not to the annex before, and its content is not + -- changed, but git is running the clean filter again on it + -- (eg because its mtime or inode changed, or just because git feels + -- like it). Such a file should not be added to the annex, even if + -- annex.largefiles now matches it, because the content is not + -- changed. + checkunchangedgitfile cont = case (moldkey, indexmeta) of + (Nothing, Just (sha, sz, _)) -> liftIO (catchMaybeIO (getFileSize file)) >>= \case + Just sz' | sz' == sz -> do + -- The size is the same, so the file + -- is not much larger than what was stored + -- in git before, so it won't be out of + -- line to hash it. However, the content + -- is prevented from being stored in git + -- when hashing. + h <- inRepo $ hashObjectStart False + sha' <- liftIO $ hashFile h file + liftIO $ hashObjectStop h + if sha' == sha + then return False + else cont + _ -> cont + _ -> cont + emitPointer :: Key -> IO () emitPointer = S.putStr . formatPointer diff --git a/Git/CatFile.hs b/Git/CatFile.hs index 732c18a643..6402001ebd 100644 --- a/Git/CatFile.hs +++ b/Git/CatFile.hs @@ -1,6 +1,6 @@ {- git cat-file interface - - - Copyright 2011-2018 Joey Hess + - Copyright 2011-2019 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -33,6 +33,7 @@ import Text.Read import Common import Git import Git.Sha +import qualified Git.Ref import Git.Command import Git.Types import Git.FilePath @@ -109,22 +110,23 @@ catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \f return (Just (content, sha, objtype)) {- Gets the size and type of an object, without reading its content. -} -catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Integer, ObjectType)) +catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType)) catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do resp <- hGetLine from case parseResp object resp of - Just (ParsedResp _ size objtype) -> - return $ Just (size, objtype) + Just (ParsedResp sha size objtype) -> + return $ Just (sha, size, objtype) Just DNE -> return Nothing Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object) where -- Slow fallback path for filenames containing newlines. newlinefallback = do + sha <- Git.Ref.sha object (gitRepo h) sz <- querySize object (gitRepo h) objtype <- queryObjectType object (gitRepo h) - return $ (,) <$> sz <*> objtype + return $ (,,) <$> sha <*> sz <*> objtype -data ParsedResp = ParsedResp Sha Integer ObjectType | DNE +data ParsedResp = ParsedResp Sha FileSize ObjectType | DNE query :: CoProcess.CoProcessHandle -> Ref -> IO a -> (Handle -> IO a) -> IO a query hdl object newlinefallback receive @@ -180,7 +182,7 @@ querySingle o r repo reader = assertLocal repo $ , return Nothing ) -querySize :: Ref -> Repo -> IO (Maybe Integer) +querySize :: Ref -> Repo -> IO (Maybe FileSize) querySize r repo = maybe Nothing (readMaybe . takeWhile (/= '\n')) <$> querySingle (Param "-s") r repo hGetContentsStrict diff --git a/Git/HashObject.hs b/Git/HashObject.hs index 605e6d504c..3787c9cb57 100644 --- a/Git/HashObject.hs +++ b/Git/HashObject.hs @@ -23,12 +23,12 @@ import Data.ByteString.Builder type HashObjectHandle = CoProcess.CoProcessHandle -hashObjectStart :: Repo -> IO HashObjectHandle -hashObjectStart = gitCoProcessStart True - [ Param "hash-object" - , Param "-w" - , Param "--stdin-paths" - , Param "--no-filters" +hashObjectStart :: Bool -> Repo -> IO HashObjectHandle +hashObjectStart writeobject = gitCoProcessStart True $ catMaybes + [ Just (Param "hash-object") + , if writeobject then Just (Param "-w") else Nothing + , Just (Param "--stdin-paths") + , Just (Param "--no-filters") ] hashObjectStop :: HashObjectHandle -> IO () diff --git a/Git/UnionMerge.hs b/Git/UnionMerge.hs index e046895a1c..c88b36c1b2 100644 --- a/Git/UnionMerge.hs +++ b/Git/UnionMerge.hs @@ -31,7 +31,7 @@ import Git.FilePath -} merge :: Ref -> Ref -> Repo -> IO () merge x y repo = do - hashhandle <- hashObjectStart repo + hashhandle <- hashObjectStart True repo ch <- catFileStart repo streamUpdateIndex repo [ lsTree x repo diff --git a/doc/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file.mdwn b/doc/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file.mdwn index 80691280b5..eb5b79a5f4 100644 --- a/doc/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file.mdwn +++ b/doc/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file.mdwn @@ -75,3 +75,5 @@ Thanks for having a look. [[!meta author=kyle]] [[!tag projects/datalad]] + +> [[fixed|done]] --[[Joey]] diff --git a/doc/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/comment_4_4c3b233aa1c6b0b6c0b0d7519b5877a1._comment b/doc/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/comment_4_4c3b233aa1c6b0b6c0b0d7519b5877a1._comment index d9c7bfe0e0..22b7852084 100644 --- a/doc/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/comment_4_4c3b233aa1c6b0b6c0b0d7519b5877a1._comment +++ b/doc/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/comment_4_4c3b233aa1c6b0b6c0b0d7519b5877a1._comment @@ -41,4 +41,5 @@ add a specific file to annex or git, bypassing annex.largefiles and all other configuration and state. This could also be used to easily switch a file from one storage to the other. I'd hope the existence of that would prevent one-off setting of annex.largefiles=anything/nothing. +[[todo/git_annex_add_option_to_control_to_where]] """]] diff --git a/doc/todo/git_annex_add_option_to_control_to_where.mdwn b/doc/todo/git_annex_add_option_to_control_to_where.mdwn new file mode 100644 index 0000000000..cbc1a9582d --- /dev/null +++ b/doc/todo/git_annex_add_option_to_control_to_where.mdwn @@ -0,0 +1,13 @@ +Make `git-annex add --annex` and `git-annex add --git` add a specific file to +annex or git, bypassing annex.largefiles and all other configuration and state. + +One reason to want this is that it avoids users doing stuff like this: + + git -c annex.largefiles=anything annex add foo.c + +Such a temporary setting of annex.largefiles can be problimatic, as explored in + + +Also, this could also be used to easily switch a file from one storage to +the other. I suppose the file would have to be touched first to make git-annex +add process it?