New matching options --excludesamecontent and --includesamecontent

The normalisation of filenames turns out to be the tricky part here,
because the associated files coming out of the keys db may look like
"./foo/bar" or "../bar". For the former to match a glob like "foo/*",
it needs to be normalised.

Note that, on windows, normalise "./foo/bar" = "foo\\bar"
which a glob like "foo/*" won't match. So the glob is matched a second
time, on the toInternalGitPath, so allowing the user to provide a glob
with the slashes in either direction. However, this still won't support
some wacky edge cases like the user providing a glob of "foo/bar\\*"

Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
Joey Hess 2021-05-25 13:05:42 -04:00
parent cd73fcc92c
commit b5f5475ed6
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
5 changed files with 102 additions and 3 deletions

View file

@ -1,10 +1,12 @@
{- user-specified limits on files to act on
-
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
- Copyright 2011-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE CPP #-}
module Limit where
import Annex.Common
@ -29,16 +31,20 @@ import Logs.MetaData
import Logs.Group
import Logs.Unused
import Logs.Location
import Annex.CatFile
import Git.FilePath
import Git.Types (RefDate(..))
import Utility.Glob
import Utility.HumanTime
import Utility.DataUnits
import qualified Database.Keys
import qualified Utility.RawFilePath as R
import Backend
import Data.Time.Clock.POSIX
import qualified Data.Set as S
import qualified Data.Map as M
import qualified System.FilePath.ByteString as P
{- Some limits can look at the current status of files on
- disk, or in the annex. This allows controlling which happens. -}
@ -122,6 +128,65 @@ matchGlobFile glob = go
Nothing -> False
go (MatchingUserInfo p) = matchGlob cglob <$> getUserInfo (userProvidedFilePath p)
{- Add a limit to skip files when there is no other file using the same
- content, with a name matching the glob. -}
addIncludeSameContent :: String -> Annex ()
addIncludeSameContent = addLimit . limitIncludeSameContent
limitIncludeSameContent :: MkLimit Annex
limitIncludeSameContent glob = Right $ MatchFiles
{ matchAction = const $ matchSameContentGlob glob
, matchNeedsFileName = True
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
{- Add a limit to skip files when there is no other file using the same
- content, with a name matching the glob. -}
addExcludeSameContent :: String -> Annex ()
addExcludeSameContent = addLimit . limitExcludeSameContent
limitExcludeSameContent :: MkLimit Annex
limitExcludeSameContent glob = Right $ MatchFiles
{ matchAction = const $ not <$$> matchSameContentGlob glob
, matchNeedsFileName = True
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
matchSameContentGlob :: String -> MatchInfo -> Annex Bool
matchSameContentGlob glob mi = checkKey (go mi) mi
where
go (MatchingFile fi) k = check k (matchFile fi)
go (MatchingInfo p) k = case providedFilePath p of
Just f -> check k f
Nothing -> return False
go (MatchingUserInfo p) k =
check k . toRawFilePath
=<< getUserInfo (userProvidedFilePath p)
cglob = compileGlob glob CaseSensative (GlobFilePath True) -- memoized
matchesglob f = matchGlob cglob (fromRawFilePath f)
#ifdef mingw32_HOST_OS
|| matchGlob cglob (fromRawFilePath (toInternalGitPath f))
#endif
check k skipf = do
-- Find other files with the same content, with filenames
-- matching the glob.
g <- Annex.gitRepo
fs <- filter (/= P.normalise skipf)
. filter matchesglob
. map (\f -> P.normalise (fromTopFilePath f g))
<$> Database.Keys.getAssociatedFiles k
-- Some associated files in the keys database may no longer
-- correspond to files in the repository. This is checked
-- last as it's most expensive.
anyM (\f -> maybe False (== k) <$> catKeyFile f) fs
addMimeType :: String -> Annex ()
addMimeType = addMagicLimit "mimetype" getMagicMimeType providedMimeType userProvidedMimeType