From 67e817c6a1c6a96a5507b001b0e70297dc1ba73b Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 29 Mar 2013 16:17:13 -0400 Subject: [PATCH] New annex.largefiles setting, which configures which files `git annex add` and the assistant add to the annex. I would have sort of liked to put this in .gitattributes, but it seems it does not support multi-word attribute values. Also, making this a single config setting makes it easy to only parse the expression once. A natural next step would be to make the assistant `git add` files that are not annex.largefiles. OTOH, I don't think `git annex add` should `git add` such files, because git-annex command line tools are not in the business of wrapping git command line tools. --- Annex/FileMatcher.hs | 86 ++++++++++++++++++++++++++++++++++++ Assistant/Threads/Watcher.hs | 24 ++++++---- Command/Add.hs | 15 +++++-- Limit.hs | 11 +++-- Logs/PreferredContent.hs | 54 +++++----------------- Types/GitConfig.hs | 2 + debian/changelog | 2 + doc/git-annex.mdwn | 38 ++++++++++++++-- 8 files changed, 169 insertions(+), 63 deletions(-) create mode 100644 Annex/FileMatcher.hs diff --git a/Annex/FileMatcher.hs b/Annex/FileMatcher.hs new file mode 100644 index 0000000000..c32402baf0 --- /dev/null +++ b/Annex/FileMatcher.hs @@ -0,0 +1,86 @@ +{- git-annex file matching + - + - Copyright 2012, 2013 Joey Hess + - + - Licensed under the GNU GPL version 3 or higher. + -} + +module Annex.FileMatcher where + +import qualified Data.Map as M + +import Common.Annex +import Limit +import Utility.Matcher +import Types.Group +import Logs.Group +import Annex.UUID +import qualified Annex +import Git.FilePath + +import Data.Either +import qualified Data.Set as S + +type FileMatcher = Matcher MatchFiles + +checkFileMatcher :: FileMatcher -> FilePath -> Annex Bool +checkFileMatcher matcher file = checkFileMatcher' matcher file S.empty True + +checkFileMatcher' :: FileMatcher -> FilePath -> AssumeNotPresent -> Bool -> Annex Bool +checkFileMatcher' matcher file notpresent def + | isEmpty matcher = return def + | otherwise = do + matchfile <- getTopFilePath <$> inRepo (toTopFilePath file) + let fi = Annex.FileInfo + { Annex.matchFile = matchfile + , Annex.relFile = file + } + matchMrun matcher $ \a -> a notpresent fi + +matchAll :: FileMatcher +matchAll = generate [] + +parsedToMatcher :: [Either String (Token MatchFiles)] -> Either String FileMatcher +parsedToMatcher parsed = case partitionEithers parsed of + ([], vs) -> Right $ generate vs + (es, _) -> Left $ unwords $ map ("Parse failure: " ++) es + +parseToken :: MkLimit -> GroupMap -> String -> Either String (Token MatchFiles) +parseToken checkpresent groupmap t + | any (== t) Utility.Matcher.tokens = Right $ Utility.Matcher.token t + | t == "present" = use checkpresent + | otherwise = maybe (Left $ "near " ++ show t) use $ M.lookup k $ + M.fromList + [ ("include", limitInclude) + , ("exclude", limitExclude) + , ("copies", limitCopies) + , ("inbackend", limitInBackend) + , ("largerthan", limitSize (>)) + , ("smallerthan", limitSize (<)) + , ("inallgroup", limitInAllGroup groupmap) + ] + where + (k, v) = separate (== '=') t + use a = Utility.Matcher.Operation <$> a v + +{- This is really dumb tokenization; there's no support for quoted values. + - Open and close parens are always treated as standalone tokens; + - otherwise tokens must be separated by whitespace. -} +tokenizeMatcher :: String -> [String] +tokenizeMatcher = filter (not . null ) . concatMap splitparens . words + where + splitparens = segmentDelim (`elem` "()") + +{- Generates a matcher for files large enough (or meeting other criteria) + - to be added to the annex, rather than directly to git. -} +largeFilesMatcher :: Annex FileMatcher +largeFilesMatcher = go =<< annexLargeFiles <$> Annex.getGitConfig + where + go Nothing = return $ matchAll + go (Just expr) = do + m <- groupMap + u <- getUUID + either badexpr return $ parsedToMatcher $ + map (parseToken (limitPresent $ Just u) m) + (tokenizeMatcher expr) + badexpr e = error $ "bad annex.largefiles configuration: " ++ e diff --git a/Assistant/Threads/Watcher.hs b/Assistant/Threads/Watcher.hs index 8d06e6659f..84193de208 100644 --- a/Assistant/Threads/Watcher.hs +++ b/Assistant/Threads/Watcher.hs @@ -35,6 +35,7 @@ import Annex.Direct import Annex.Content.Direct import Annex.CatFile import Annex.Link +import Annex.FileMatcher import Git.Types import Config import Utility.ThreadScheduler @@ -77,8 +78,9 @@ watchThread = namedThread "Watcher" $ runWatcher :: Assistant () runWatcher = do startup <- asIO1 startupScan + matcher <- liftAnnex $ largeFilesMatcher direct <- liftAnnex isDirect - addhook <- hook $ if direct then onAddDirect else onAdd + addhook <- hook $ if direct then onAddDirect matcher else onAdd matcher delhook <- hook onDel addsymlinkhook <- hook $ onAddSymlink direct deldirhook <- hook onDelDir @@ -166,16 +168,22 @@ runHandler handler file filestatus = void $ do liftAnnex $ Annex.Queue.flushWhenFull recordChange change -onAdd :: Handler -onAdd file filestatus - | maybe False isRegularFile filestatus = pendingAddChange file +checkAdd :: FileMatcher -> FilePath -> Assistant (Maybe Change) +checkAdd matcher file = ifM (liftAnnex $ checkFileMatcher matcher file) + ( pendingAddChange file + , noChange + ) + +onAdd :: FileMatcher -> Handler +onAdd matcher file filestatus + | maybe False isRegularFile filestatus = checkAdd matcher file | otherwise = noChange {- In direct mode, add events are received for both new files, and - modified existing files. Or, in some cases, existing files that have not - really been modified. -} -onAddDirect :: Handler -onAddDirect file fs = do +onAddDirect :: FileMatcher -> Handler +onAddDirect matcher file fs = do debug ["add direct", file] v <- liftAnnex $ catKeyFile file case (v, fs) of @@ -184,9 +192,9 @@ onAddDirect file fs = do ( noChange , do liftAnnex $ changedDirect key file - pendingAddChange file + checkAdd matcher file ) - _ -> pendingAddChange file + _ -> checkAdd matcher file {- A symlink might be an arbitrary symlink, which is just added. - Or, if it is a git-annex symlink, ensure it points to the content diff --git a/Command/Add.hs b/Command/Add.hs index cf2a55c50f..83b1ca22cd 100644 --- a/Command/Add.hs +++ b/Command/Add.hs @@ -1,6 +1,6 @@ {- git-annex command - - - Copyright 2010 Joey Hess + - Copyright 2010, 2013 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -27,6 +27,7 @@ import Utility.Touch import Utility.FileMode import Config import Utility.InodeCache +import Annex.FileMatcher def :: [Command] def = [notBareRepo $ command "add" paramPaths seek SectionCommon @@ -37,10 +38,16 @@ def = [notBareRepo $ command "add" paramPaths seek SectionCommon - In direct mode, it acts on any files that have changed. -} seek :: [CommandSeek] seek = - [ withFilesNotInGit start - , whenNotDirect $ withFilesUnlocked start - , whenDirect $ withFilesMaybeModified start + [ go withFilesNotInGit + , whenNotDirect $ go withFilesUnlocked + , whenDirect $ go withFilesMaybeModified ] + where + go a = withValue largeFilesMatcher $ \matcher -> + a $ \file -> ifM (checkFileMatcher matcher file) + ( start file + , stop + ) {- The add subcommand annexes a file, generating a key for it using a - backend, and then moving it into the annex directory and setting up diff --git a/Limit.hs b/Limit.hs index d7f82eb84f..98227144d2 100644 --- a/Limit.hs +++ b/Limit.hs @@ -204,10 +204,15 @@ addSmallerThan = addLimit . limitSize (<) limitSize :: (Maybe Integer -> Maybe Integer -> Bool) -> MkLimit limitSize vs s = case readSize dataUnits s of Nothing -> Left "bad size" - Just sz -> Right $ const $ lookupFile >=> check sz + Just sz -> Right $ go sz where - check _ Nothing = return False - check sz (Just (key, _)) = return $ keySize key `vs` Just sz + go sz _ fi = lookupFile fi >>= check fi sz + check _ sz (Just (key, _)) = return $ keySize key `vs` Just sz + check fi sz Nothing = do + filesize <- liftIO $ catchMaybeIO $ + fromIntegral . fileSize + <$> getFileStatus (Annex.relFile fi) + return $ filesize `vs` Just sz addTimeLimit :: String -> Annex () addTimeLimit s = do diff --git a/Logs/PreferredContent.hs b/Logs/PreferredContent.hs index 3340cf5ef7..0efe42e174 100644 --- a/Logs/PreferredContent.hs +++ b/Logs/PreferredContent.hs @@ -27,8 +27,8 @@ import qualified Annex import Logs.UUIDBased import Limit import qualified Utility.Matcher +import Annex.FileMatcher import Annex.UUID -import Git.FilePath import Types.Group import Logs.Group import Types.StandardGroups @@ -50,19 +50,11 @@ preferredContentSet NoUUID _ = error "unknown UUID; cannot modify" - (or the current repository if none is specified). -} isPreferredContent :: Maybe UUID -> AssumeNotPresent -> FilePath -> Bool -> Annex Bool isPreferredContent mu notpresent file def = do - matchfile <- getTopFilePath <$> inRepo (toTopFilePath file) - let fi = Annex.FileInfo - { Annex.matchFile = matchfile - , Annex.relFile = file - } u <- maybe getUUID return mu m <- preferredContentMap case M.lookup u m of Nothing -> return def - Just matcher - | Utility.Matcher.isEmpty matcher -> return def - | otherwise -> Utility.Matcher.matchMrun matcher $ - \a -> a notpresent fi + Just matcher -> checkFileMatcher' matcher file notpresent def {- The map is cached for speed. -} preferredContentMap :: Annex Annex.PreferredContentMap @@ -87,56 +79,30 @@ preferredContentMapRaw = simpleMap . parseLog Just - because the configuration is shared amoung repositories and newer - versions of git-annex may add new features. Instead, parse errors - result in a Matcher that will always succeed. -} -makeMatcher :: GroupMap -> UUID -> String -> Utility.Matcher.Matcher MatchFiles +makeMatcher :: GroupMap -> UUID -> String -> FileMatcher makeMatcher groupmap u s | s == "standard" = standardMatcher groupmap u | null (lefts tokens) = Utility.Matcher.generate $ rights tokens | otherwise = matchAll where - tokens = map (parseToken (Just u) groupmap) (tokenizeMatcher s) + tokens = map (parseToken (limitPresent $ Just u) groupmap) (tokenizeMatcher s) {- Standard matchers are pre-defined for some groups. If none is defined, - or a repository is in multiple groups with standard matchers, match all. -} -standardMatcher :: GroupMap -> UUID -> Utility.Matcher.Matcher MatchFiles +standardMatcher :: GroupMap -> UUID -> FileMatcher standardMatcher m u = maybe matchAll (makeMatcher m u . preferredContent) $ getStandardGroup =<< u `M.lookup` groupsByUUID m -matchAll :: Utility.Matcher.Matcher MatchFiles -matchAll = Utility.Matcher.generate [] - {- Checks if an expression can be parsed, if not returns Just error -} checkPreferredContentExpression :: String -> Maybe String checkPreferredContentExpression s | s == "standard" = Nothing - | otherwise = case lefts $ map (parseToken Nothing emptyGroupMap) (tokenizeMatcher s) of - [] -> Nothing - l -> Just $ unwords $ map ("Parse failure: " ++) l - -parseToken :: (Maybe UUID) -> GroupMap -> String -> Either String (Utility.Matcher.Token MatchFiles) -parseToken mu groupmap t - | any (== t) Utility.Matcher.tokens = Right $ Utility.Matcher.token t - | t == "present" = use $ limitPresent mu - | otherwise = maybe (Left $ "near " ++ show t) use $ M.lookup k $ - M.fromList - [ ("include", limitInclude) - , ("exclude", limitExclude) - , ("copies", limitCopies) - , ("inbackend", limitInBackend) - , ("largerthan", limitSize (>)) - , ("smallerthan", limitSize (<)) - , ("inallgroup", limitInAllGroup groupmap) - ] + | otherwise = case parsedToMatcher vs of + Left e -> Just e + Right _ -> Nothing where - (k, v) = separate (== '=') t - use a = Utility.Matcher.Operation <$> a v - -{- This is really dumb tokenization; there's no support for quoted values. - - Open and close parens are always treated as standalone tokens; - - otherwise tokens must be separated by whitespace. -} -tokenizeMatcher :: String -> [String] -tokenizeMatcher = filter (not . null ) . concatMap splitparens . words - where - splitparens = segmentDelim (`elem` "()") + vs = map (parseToken (limitPresent Nothing) emptyGroupMap) + (tokenizeMatcher s) {- Puts a UUID in a standard group, and sets its preferred content to use - the standard expression for that group, unless something is already set. -} diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs index b42f8f2293..246c320d07 100644 --- a/Types/GitConfig.hs +++ b/Types/GitConfig.hs @@ -37,6 +37,7 @@ data GitConfig = GitConfig , annexAutoCommit :: Bool , annexWebOptions :: [String] , annexCrippledFileSystem :: Bool + , annexLargeFiles :: Maybe String , coreSymlinks :: Bool } @@ -59,6 +60,7 @@ extractGitConfig r = GitConfig , annexAutoCommit = getbool (annex "autocommit") True , annexWebOptions = getwords (annex "web-options") , annexCrippledFileSystem = getbool (annex "crippledfilesystem") False + , annexLargeFiles = getmaybe (annex "largefiles") , coreSymlinks = getbool "core.symlinks" True } where diff --git a/debian/changelog b/debian/changelog index e7b79f34b0..241fd4bded 100644 --- a/debian/changelog +++ b/debian/changelog @@ -10,6 +10,8 @@ git-annex (4.20130324) UNRELEASED; urgency=low * webapp: Run ssh server probes in a way that will work when the login shell is a monstrosity that should have died 25 years ago, such as csh. + * New annex.largefiles setting, which configures which files + `git annex add` and the assistant add to the annex. -- Joey Hess Mon, 25 Mar 2013 10:21:46 -0400 diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 35a1b2cdf5..502c1e1687 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -735,6 +735,23 @@ file contents are present at either of two repositories. Closes a group of file matching options. +# PREFERRED CONTENT + +Each repository has a preferred content setting, which specifies content +that the repository wants to have present. These settings can be configured +using `git annex vicfg`. They are used by the `--auto` option, and +by the git-annex assistant. + +The preferred content settings are similar, but not identical to +the file matching options specified above, just without the dashes. +For example: + + exclude=archive/* and (include=*.mp3 or smallerthan=1mb) + +The main differences are that `exclude=` and `include=` always +match relative to the top of the git repository, and that there is +no equivilant to --in. + # CONFIGURATION Like other git commands, git-annex is configured via `.git/config`. @@ -765,6 +782,19 @@ Here are all the supported configuration settings. The default reserve is 1 megabyte. +* `annex.largefiles` + + Allows configuring which files `git annex add` and the assistant consider + to be large enough to need to be added to the annex. By default, + all files are added to the annex. + + The value is a preferred content expression. See PREFERRED CONTENT + for details. + + Example: + + annex.largefiles = largerthan=100kb or include=*.mp3 + * `annex.queuesize` git-annex builds a queue of git commands, in order to combine similar @@ -790,10 +820,6 @@ Here are all the supported configuration settings. the accuracy will make `git annex unused` consume more memory; run `git annex status` for memory usage numbers. -* `annex.version` - - Automatically maintained, and used to automate upgrades between versions. - * `annex.sshcaching` By default, git-annex caches ssh connections @@ -819,6 +845,10 @@ Here are all the supported configuration settings. Set to false to prevent the git-annex assistant from automatically committing changes to files in the repository. +* `annex.version` + + Automatically maintained, and used to automate upgrades between versions. + * `annex.direct` Set to true when the repository is in direct mode. Should not be set