git-annex/Annex/FileMatcher.hs
Joey Hess 067aabdd48
wip RawFilePath 2x git-annex find speedup
Finally builds (oh the agoncy of making it build), but still very
unmergable, only Command.Find is included and lots of stuff is badly
hacked to make it compile.

Benchmarking vs master, this git-annex find is significantly faster!
Specifically:

	num files	old	new	speedup
	48500		4.77	3.73	28%
	12500		1.36	1.02	66%
	20		0.075	0.074	0% (so startup time is unchanged)

That's without really finishing the optimization. Things still to do:

* Eliminate all the fromRawFilePath, toRawFilePath, encodeBS,
  decodeBS conversions.
* Use versions of IO actions like getFileStatus that take a RawFilePath.
* Eliminate some Data.ByteString.Lazy.toStrict, which is a slow copy.
* Use ByteString for parsing git config to speed up startup.

It's likely several of those will speed up git-annex find further.
And other commands will certianly benefit even more.
2019-11-26 16:01:58 -04:00

230 lines
7.5 KiB
Haskell

{- git-annex file matching
-
- Copyright 2012-2019 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE CPP #-}
module Annex.FileMatcher (
GetFileMatcher,
checkFileMatcher,
checkFileMatcher',
checkMatcher,
checkMatcher',
matchAll,
PreferredContentData(..),
preferredContentTokens,
preferredContentKeylessTokens,
preferredContentParser,
ParseToken,
parsedToMatcher,
mkLargeFilesParser,
largeFilesMatcher,
) where
import qualified Data.Map as M
import Annex.Common
import Limit
import Utility.Matcher
import Types.Group
import qualified Annex
import Types.FileMatcher
import Git.FilePath
import Types.Remote (RemoteConfig)
import Annex.CheckAttr
import Git.CheckAttr (unspecifiedAttr)
#ifdef WITH_MAGICMIME
import Annex.Magic
#endif
import Data.Either
import qualified Data.Set as S
type GetFileMatcher = FilePath -> Annex (FileMatcher Annex)
checkFileMatcher :: GetFileMatcher -> FilePath -> Annex Bool
checkFileMatcher getmatcher file = checkFileMatcher' getmatcher file (return True)
-- | Allows running an action when no matcher is configured for the file.
checkFileMatcher' :: GetFileMatcher -> FilePath -> Annex Bool -> Annex Bool
checkFileMatcher' getmatcher file notconfigured = do
matcher <- getmatcher file
checkMatcher matcher Nothing afile S.empty notconfigured d
where
afile = AssociatedFile (Just (toRawFilePath file))
-- checkMatcher will never use this, because afile is provided.
d = return True
checkMatcher :: FileMatcher Annex -> Maybe Key -> AssociatedFile -> AssumeNotPresent -> Annex Bool -> Annex Bool -> Annex Bool
checkMatcher matcher mkey afile notpresent notconfigured d
| isEmpty matcher = notconfigured
| otherwise = case (mkey, afile) of
(_, AssociatedFile (Just file)) -> go =<< fileMatchInfo (fromRawFilePath file)
(Just key, _) -> go (MatchingKey key afile)
_ -> d
where
go mi = checkMatcher' matcher mi notpresent
checkMatcher' :: FileMatcher Annex -> MatchInfo -> AssumeNotPresent -> Annex Bool
checkMatcher' matcher mi notpresent =
matchMrun matcher $ \a -> a notpresent mi
fileMatchInfo :: FilePath -> Annex MatchInfo
fileMatchInfo file = do
matchfile <- getTopFilePath <$> inRepo (toTopFilePath file)
return $ MatchingFile FileInfo
{ matchFile = matchfile
, currFile = file
}
matchAll :: FileMatcher Annex
matchAll = generate []
parsedToMatcher :: [ParseResult (MatchFiles Annex)] -> Either String (FileMatcher Annex)
parsedToMatcher parsed = case partitionEithers parsed of
([], vs) -> Right $ generate vs
(es, _) -> Left $ unwords $ map ("Parse failure: " ++) es
data ParseToken t
= SimpleToken String (ParseResult t)
| ValueToken String (String -> ParseResult t)
type ParseResult t = Either String (Token t)
parseToken :: [ParseToken t] -> String -> ParseResult t
parseToken l t = case syntaxToken t of
Right st -> Right st
Left _ -> go l
where
go [] = Left $ "near " ++ show t
go (SimpleToken s r : _) | s == t = r
go (ValueToken s mkr : _) | s == k = mkr v
go (_ : ps) = go ps
(k, v) = separate (== '=') t
{- This is really dumb tokenization; there's no support for quoted values.
- Open and close parens are always treated as standalone tokens;
- otherwise tokens must be separated by whitespace. -}
tokenizeMatcher :: String -> [String]
tokenizeMatcher = filter (not . null) . concatMap splitparens . words
where
splitparens = segmentDelim (`elem` "()")
commonKeylessTokens :: LimitBy -> [ParseToken (MatchFiles Annex)]
commonKeylessTokens lb =
[ SimpleToken "anything" (simply limitAnything)
, SimpleToken "nothing" (simply limitNothing)
, ValueToken "include" (usev limitInclude)
, ValueToken "exclude" (usev limitExclude)
, ValueToken "largerthan" (usev $ limitSize lb (>))
, ValueToken "smallerthan" (usev $ limitSize lb (<))
]
commonKeyedTokens :: [ParseToken (MatchFiles Annex)]
commonKeyedTokens =
[ SimpleToken "unused" (simply limitUnused)
]
data PreferredContentData = PCD
{ matchStandard :: Either String (FileMatcher Annex)
, matchGroupWanted :: Either String (FileMatcher Annex)
, getGroupMap :: Annex GroupMap
, configMap :: M.Map UUID RemoteConfig
, repoUUID :: Maybe UUID
}
-- Tokens of preferred content expressions that do not need a Key to be
-- known.
--
-- When importing from a special remote, this is used to match
-- some preferred content expressions before the content is downloaded,
-- so the Key is not known.
preferredContentKeylessTokens :: PreferredContentData -> [ParseToken (MatchFiles Annex)]
preferredContentKeylessTokens pcd =
[ SimpleToken "standard" (call $ matchStandard pcd)
, SimpleToken "groupwanted" (call $ matchGroupWanted pcd)
, SimpleToken "inpreferreddir" (simply $ limitInDir preferreddir)
] ++ commonKeylessTokens LimitAnnexFiles
where
preferreddir = fromMaybe "public" $
M.lookup "preferreddir" =<< (`M.lookup` configMap pcd) =<< repoUUID pcd
preferredContentKeyedTokens :: PreferredContentData -> [ParseToken (MatchFiles Annex)]
preferredContentKeyedTokens pcd =
[ SimpleToken "present" (simply $ limitPresent $ repoUUID pcd)
, SimpleToken "securehash" (simply limitSecureHash)
, ValueToken "copies" (usev limitCopies)
, ValueToken "lackingcopies" (usev $ limitLackingCopies False)
, ValueToken "approxlackingcopies" (usev $ limitLackingCopies True)
, ValueToken "inbacked" (usev limitInBackend)
, ValueToken "metadata" (usev limitMetaData)
, ValueToken "inallgroup" (usev $ limitInAllGroup $ getGroupMap pcd)
] ++ commonKeyedTokens
preferredContentTokens :: PreferredContentData -> [ParseToken (MatchFiles Annex)]
preferredContentTokens pcd = concat
[ preferredContentKeylessTokens pcd
, preferredContentKeyedTokens pcd
]
preferredContentParser :: [ParseToken (MatchFiles Annex)] -> String -> [ParseResult (MatchFiles Annex)]
preferredContentParser tokens = map (parseToken tokens) . tokenizeMatcher
mkLargeFilesParser :: Annex (String -> [ParseResult (MatchFiles Annex)])
mkLargeFilesParser = do
#ifdef WITH_MAGICMIME
magicmime <- liftIO initMagicMime
let mimer n f = ValueToken n (usev $ f magicmime)
#else
let mimer n = ValueToken n $
const $ Left $ "\""++n++"\" not supported; not built with MagicMime support"
#endif
let parse = parseToken $
commonKeyedTokens ++
commonKeylessTokens LimitDiskFiles ++
#ifdef WITH_MAGICMIME
[ mimer "mimetype" $
matchMagic "mimetype" getMagicMimeType providedMimeType
, mimer "mimeencoding" $
matchMagic "mimeencoding" getMagicMimeEncoding providedMimeEncoding
]
#else
[ mimer "mimetype"
, mimer "mimeencoding"
]
#endif
return $ map parse . tokenizeMatcher
where
{- Generates a matcher for files large enough (or meeting other criteria)
- to be added to the annex, rather than directly to git. -}
largeFilesMatcher :: Annex GetFileMatcher
largeFilesMatcher = go =<< annexLargeFiles <$> Annex.getGitConfig
where
go (Just expr) = do
matcher <- mkmatcher expr
return $ const $ return matcher
go Nothing = return $ \file -> do
expr <- checkAttr "annex.largefiles" file
if null expr || expr == unspecifiedAttr
then return matchAll
else mkmatcher expr
mkmatcher expr = do
parser <- mkLargeFilesParser
either badexpr return $ parsedToMatcher $ parser expr
badexpr e = giveup $ "bad annex.largefiles configuration: " ++ e
simply :: MatchFiles Annex -> ParseResult (MatchFiles Annex)
simply = Right . Operation
usev :: MkLimit Annex -> String -> ParseResult (MatchFiles Annex)
usev a v = Operation <$> a v
call :: Either String (FileMatcher Annex) -> ParseResult (MatchFiles Annex)
call (Right sub) = Right $ Operation $ \notpresent mi ->
matchMrun sub $ \a -> a notpresent mi
call (Left err) = Left err