New matching options --excludesamecontent and --includesamecontent

The normalisation of filenames turns out to be the tricky part here,
because the associated files coming out of the keys db may look like
"./foo/bar" or "../bar". For the former to match a glob like "foo/*",
it needs to be normalised.

Note that, on windows, normalise "./foo/bar" = "foo\\bar"
which a glob like "foo/*" won't match. So the glob is matched a second
time, on the toInternalGitPath, so allowing the user to provide a glob
with the slashes in either direction. However, this still won't support
some wacky edge cases like the user providing a glob of "foo/bar\\*"

Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
Joey Hess 2021-05-25 13:05:42 -04:00
parent cd73fcc92c
commit b5f5475ed6
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
5 changed files with 102 additions and 3 deletions

View file

@ -1,5 +1,6 @@
git-annex (8.20210429) UNRELEASED; urgency=medium
* New matching options --excludesamecontent and --includesamecontent
* When two files have the same content, and a required content expression
matches one but not the other, dropping the latter file will fail as it
would also remove the content of the required file.

View file

@ -338,6 +338,16 @@ fileMatchingOptions' lb =
<> help "limit to files matching the glob pattern"
<> hidden
)
, globalOption (setAnnexState . Limit.addExcludeSameContent) $ strOption
( long "excludesamecontent" <> short 'x' <> metavar paramGlob
<> help "skip files whose content is the same as another file matching the glob pattern"
<> hidden
)
, globalOption (setAnnexState . Limit.addIncludeSameContent) $ strOption
( long "includesamecontent" <> short 'I' <> metavar paramGlob
<> help "limit to files whose content is the same as another file matching the glob pattern"
<> hidden
)
, globalOption (setAnnexState . Limit.addLargerThan lb) $ strOption
( long "largerthan" <> metavar paramSize
<> help "match files larger than a size"

View file

@ -1,10 +1,12 @@
{- user-specified limits on files to act on
-
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
- Copyright 2011-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE CPP #-}
module Limit where
import Annex.Common
@ -29,16 +31,20 @@ import Logs.MetaData
import Logs.Group
import Logs.Unused
import Logs.Location
import Annex.CatFile
import Git.FilePath
import Git.Types (RefDate(..))
import Utility.Glob
import Utility.HumanTime
import Utility.DataUnits
import qualified Database.Keys
import qualified Utility.RawFilePath as R
import Backend
import Data.Time.Clock.POSIX
import qualified Data.Set as S
import qualified Data.Map as M
import qualified System.FilePath.ByteString as P
{- Some limits can look at the current status of files on
- disk, or in the annex. This allows controlling which happens. -}
@ -122,6 +128,65 @@ matchGlobFile glob = go
Nothing -> False
go (MatchingUserInfo p) = matchGlob cglob <$> getUserInfo (userProvidedFilePath p)
{- Add a limit to skip files when there is no other file using the same
- content, with a name matching the glob. -}
addIncludeSameContent :: String -> Annex ()
addIncludeSameContent = addLimit . limitIncludeSameContent
limitIncludeSameContent :: MkLimit Annex
limitIncludeSameContent glob = Right $ MatchFiles
{ matchAction = const $ matchSameContentGlob glob
, matchNeedsFileName = True
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
{- Add a limit to skip files when there is no other file using the same
- content, with a name matching the glob. -}
addExcludeSameContent :: String -> Annex ()
addExcludeSameContent = addLimit . limitExcludeSameContent
limitExcludeSameContent :: MkLimit Annex
limitExcludeSameContent glob = Right $ MatchFiles
{ matchAction = const $ not <$$> matchSameContentGlob glob
, matchNeedsFileName = True
, matchNeedsFileContent = False
, matchNeedsKey = False
, matchNeedsLocationLog = False
}
matchSameContentGlob :: String -> MatchInfo -> Annex Bool
matchSameContentGlob glob mi = checkKey (go mi) mi
where
go (MatchingFile fi) k = check k (matchFile fi)
go (MatchingInfo p) k = case providedFilePath p of
Just f -> check k f
Nothing -> return False
go (MatchingUserInfo p) k =
check k . toRawFilePath
=<< getUserInfo (userProvidedFilePath p)
cglob = compileGlob glob CaseSensative (GlobFilePath True) -- memoized
matchesglob f = matchGlob cglob (fromRawFilePath f)
#ifdef mingw32_HOST_OS
|| matchGlob cglob (fromRawFilePath (toInternalGitPath f))
#endif
check k skipf = do
-- Find other files with the same content, with filenames
-- matching the glob.
g <- Annex.gitRepo
fs <- filter (/= P.normalise skipf)
. filter matchesglob
. map (\f -> P.normalise (fromTopFilePath f g))
<$> Database.Keys.getAssociatedFiles k
-- Some associated files in the keys database may no longer
-- correspond to files in the repository. This is checked
-- last as it's most expensive.
anyM (\f -> maybe False (== k) <$> catKeyFile f) fs
addMimeType :: String -> Annex ()
addMimeType = addMagicLimit "mimetype" getMagicMimeType providedMimeType userProvidedMimeType

View file

@ -22,7 +22,7 @@ in either of two repositories.
Skips files matching the glob pattern. The glob is matched relative to
the current directory. For example:
--exclude='*.mp3' --exclude='subdir/*'
git annex get --exclude='*.mp3' --exclude='subdir/*'
Note that this will not match anything when using --all or --unused.
@ -31,10 +31,31 @@ in either of two repositories.
Skips files not matching the glob pattern. (Same as `--not --exclude`.)
For example, to include only mp3 and ogg files:
--include='*.mp3' --or --include='*.ogg'
git annex get --include='*.mp3' --or --include='*.ogg'
Note that this will not skip anything when using --all or --unused.
* `--excludesamecontent=glob`
Skips a file when there is another file with the same content,
whose name matches the glob. The glob is matched relative to the current
directory.
For example, to drop files in the archive directory, but not when the same
content is used by a file in the work directory:
git annex drop archive/ --excludesamecontent='work/*'
* `--includesamecontent=glob`
Skips files when there is no other file with the same content
whose name matches the glob. (Same as `--not --includesamecontent`)
For example, if you have inbox and outbox directories, and want to find
anything in the inbox that has the same content as something in the outbox:
git annex find inbox --includesamecontent='outbox/*'
* `--in=repository`
Matches only files that git-annex believes have their contents present

View file

@ -6,3 +6,5 @@ In our repositories/workflows we quite often encounter cases where multiple subf
[[!meta author=yoh]]
[[!tag projects/datalad]]
> [[done]] --[[Joey]]