Preserve metadata when staging a new version of an annexed file.

Performance impact: When adding a large tree of new files, this needs
to do some git cat-file queries to check if any of the files already
existed and might need a metadata copy. I tried a benchmark in a copy
of my sound repository (so there was already a significant git tree
to check against).

Adding 10000 small files, with a cold cache:
  before: 1m48.539s
  after:  1m52.791s

So, impact is 0.0004 seconds per file added. Which seems acceptable, so did
not add some kind of configuration to enable/disable this.

This commit was sponsored by Lisa Feilen.
This commit is contained in:
Joey Hess 2014-02-24 14:41:33 -04:00
parent e7252cf192
commit 8d5158fa31
5 changed files with 41 additions and 14 deletions

View file

@ -87,8 +87,7 @@ catKey' modeguaranteed ref mode
| modeguaranteed = catObject ref
| otherwise = L.take 8192 <$> catObject ref
{- Looks up the file mode corresponding to the Ref using the running
- cat-file.
{- Looks up the key corresponding to the Ref using the running cat-file.
-
- Currently this always has to look in HEAD, because cat-file --batch
- does not offer a way to specify that we want to look up a tree object

View file

@ -11,6 +11,7 @@ import Common.Annex
import qualified Annex
import Types.MetaData
import Logs.MetaData
import Annex.CatFile
import qualified Data.Set as S
import qualified Data.Map as M
@ -27,18 +28,27 @@ yearMetaField = MetaField "year"
monthMetaField :: MetaField
monthMetaField = MetaField "month"
{- Generates metadata for a file that has just been ingested into the
- annex. Passed the FileStatus of the content file.
{- Adds metadata for a file that has just been ingested into the
- annex, but has not yet been committed to git.
-
- Does not overwrite any existing metadata values for the key.
- When the file has been modified, the metadata is copied over
- from the old key to the new key. Note that it looks at the old key as
- committed to HEAD -- the new key may or may not have already been staged
- in the annex.
-
- Also, can generate new metadata, if configured to do so.
-}
genMetaData :: Key -> FileStatus -> Annex ()
genMetaData key status = whenM (annexGenMetaData <$> Annex.getGitConfig) $ do
metadata <- getCurrentMetaData key
let metadata' = genMetaData' status metadata
unless (metadata' == emptyMetaData) $
addMetaData key metadata'
genMetaData :: Key -> FilePath -> FileStatus -> Annex ()
genMetaData key file status = do
-- If catKeyFileHEAD yields a key for this file, copy that key's metadata
-- onto the new key (flip makes the looked-up key the source and `key` the
-- destination); when it yields Nothing, do nothing.
maybe noop (flip copyMetaData key) =<< catKeyFileHEAD file
-- Generating fresh metadata is optional: it only runs when the
-- annexGenMetaData setting of the git config is enabled.
whenM (annexGenMetaData <$> Annex.getGitConfig) $ do
metadata <- getCurrentMetaData key
let metadata' = genMetaData' status metadata
-- Avoid recording anything when no new metadata was generated.
unless (metadata' == emptyMetaData) $
addMetaData key metadata'
{- Generates metadata from the FileStatus.
- Does not overwrite any existing metadata values. -}
genMetaData' :: FileStatus -> MetaData -> MetaData
genMetaData' status old = MetaData $ M.fromList $ filter isnew
[ (yearMetaField, S.singleton $ toMetaValue $ show y)

View file

@ -161,14 +161,14 @@ ingest (Just source) = do
goindirect (Just (key, _)) mcache ms = do
catchAnnex (moveAnnex key $ contentLocation source)
(undo (keyFilename source) key)
maybe noop (genMetaData key) ms
maybe noop (genMetaData key (keyFilename source)) ms
liftIO $ nukeFile $ keyFilename source
return $ (Just key, mcache)
goindirect _ _ _ = failure "failed to generate a key"
godirect (Just (key, _)) (Just cache) ms = do
addInodeCache key cache
maybe noop (genMetaData key) ms
maybe noop (genMetaData key (keyFilename source)) ms
finishIngestDirect key source
return $ (Just key, Just cache)
godirect _ _ _ = failure "failed to generate a key"

View file

@ -28,10 +28,10 @@
module Logs.MetaData (
getCurrentMetaData,
getMetaData,
addMetaData,
addMetaData',
currentMetaData,
copyMetaData,
) where
import Common.Annex
@ -135,3 +135,20 @@ simplifyLog s = case sl of
where
older = value l
unique = older `differenceMetaData` newer
{- Copies the metadata from the old key to the new key.
-
- The exact content of the metadata file is copied, so that the timestamps
- remain the same, and because this is more space-efficient in the git
- repository.
-
- Any metadata already attached to the new key is not preserved.
-}
copyMetaData :: Key -> Key -> Annex ()
copyMetaData oldkey newkey
-- Source and destination are the same key, so there is nothing to copy.
| oldkey == newkey = noop
| otherwise = do
l <- getMetaData oldkey
-- Only touch the new key's log file when the old key's log is non-empty
-- (S is presumably Data.Set; the import is outside this hunk).
unless (S.null l) $
Annex.Branch.change (metaDataLogFile newkey) $
-- `const` discards whatever the new key's log file currently holds and
-- substitutes the serialized log of the old key.
const $ showLog l

1
debian/changelog vendored
View file

@ -11,6 +11,7 @@ git-annex (5.20140222) UNRELEASED; urgency=medium
tag/showname.
* annex.genmetadata can be set to make git-annex automatically set
metadata (year and month) when adding files.
* Preserve metadata when staging a new version of an annexed file.
* metadata: Field names limited to alphanumerics and a few whitelisted
punctuation characters to avoid issues with views, etc.
* metadata: Support --json