New matching options --excludesamecontent and --includesamecontent
The normalisation of filenames turns out to be the tricky part here, because the associated files coming out of the keys db may look like "./foo/bar" or "../bar". For the former to match a glob like "foo/*", it needs to be normalised. Note that, on Windows, normalise "./foo/bar" = "foo\\bar", which a glob like "foo/*" won't match. So the glob is matched a second time, against the toInternalGitPath form, allowing the user to provide a glob with the slashes in either direction. However, this still won't support some wacky edge cases, like the user providing a glob of "foo/bar\\*". Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
parent
cd73fcc92c
commit
b5f5475ed6
5 changed files with 102 additions and 3 deletions
|
@ -1,5 +1,6 @@
|
|||
git-annex (8.20210429) UNRELEASED; urgency=medium
|
||||
|
||||
* New matching options --excludesamecontent and --includesamecontent
|
||||
* When two files have the same content, and a required content expression
|
||||
matches one but not the other, dropping the latter file will fail as it
|
||||
would also remove the content of the required file.
|
||||
|
|
|
@ -338,6 +338,16 @@ fileMatchingOptions' lb =
|
|||
<> help "limit to files matching the glob pattern"
|
||||
<> hidden
|
||||
)
|
||||
, globalOption (setAnnexState . Limit.addExcludeSameContent) $ strOption
|
||||
( long "excludesamecontent" <> short 'x' <> metavar paramGlob
|
||||
<> help "skip files whose content is the same as another file matching the glob pattern"
|
||||
<> hidden
|
||||
)
|
||||
, globalOption (setAnnexState . Limit.addIncludeSameContent) $ strOption
|
||||
( long "includesamecontent" <> short 'I' <> metavar paramGlob
|
||||
<> help "limit to files whose content is the same as another file matching the glob pattern"
|
||||
<> hidden
|
||||
)
|
||||
, globalOption (setAnnexState . Limit.addLargerThan lb) $ strOption
|
||||
( long "largerthan" <> metavar paramSize
|
||||
<> help "match files larger than a size"
|
||||
|
|
67
Limit.hs
67
Limit.hs
|
@ -1,10 +1,12 @@
|
|||
{- user-specified limits on files to act on
|
||||
-
|
||||
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2011-2021 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
{-# LANGUAGE CPP #-}
|
||||
|
||||
module Limit where
|
||||
|
||||
import Annex.Common
|
||||
|
@ -29,16 +31,20 @@ import Logs.MetaData
|
|||
import Logs.Group
|
||||
import Logs.Unused
|
||||
import Logs.Location
|
||||
import Annex.CatFile
|
||||
import Git.FilePath
|
||||
import Git.Types (RefDate(..))
|
||||
import Utility.Glob
|
||||
import Utility.HumanTime
|
||||
import Utility.DataUnits
|
||||
import qualified Database.Keys
|
||||
import qualified Utility.RawFilePath as R
|
||||
import Backend
|
||||
|
||||
import Data.Time.Clock.POSIX
|
||||
import qualified Data.Set as S
|
||||
import qualified Data.Map as M
|
||||
import qualified System.FilePath.ByteString as P
|
||||
|
||||
{- Some limits can look at the current status of files on
|
||||
- disk, or in the annex. This allows controlling which happens. -}
|
||||
|
@ -122,6 +128,65 @@ matchGlobFile glob = go
|
|||
Nothing -> False
|
||||
go (MatchingUserInfo p) = matchGlob cglob <$> getUserInfo (userProvidedFilePath p)
|
||||
|
||||
{- Add a limit that skips a file unless some other file has the same
|
||||
- content and a name matching the glob. Backs --includesamecontent. -}
|
||||
addIncludeSameContent :: String -> Annex ()
|
||||
addIncludeSameContent = addLimit . limitIncludeSameContent
|
||||
|
||||
{- Builds the limit installed by addIncludeSameContent. Construction never
 - fails (the result is always Right). Matching is delegated to
 - matchSameContentGlob; the first argument given to matchAction is
 - ignored. -}
limitIncludeSameContent :: MkLimit Annex
|
||||
limitIncludeSameContent glob = Right $ MatchFiles
|
||||
{ matchAction = const $ matchSameContentGlob glob
|
||||
, matchNeedsFileName = True
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
|
||||
{- Add a limit to skip files when there is another file using the same
|
||||
- content, with a name matching the glob. -}
|
||||
addExcludeSameContent :: String -> Annex ()
|
||||
addExcludeSameContent = addLimit . limitExcludeSameContent
|
||||
|
||||
{- Builds the limit installed by addExcludeSameContent. Construction never
 - fails (the result is always Right). The match is the negation of
 - matchSameContentGlob (not is applied to its result via <$$>); the first
 - argument given to matchAction is ignored. -}
limitExcludeSameContent :: MkLimit Annex
|
||||
limitExcludeSameContent glob = Right $ MatchFiles
|
||||
{ matchAction = const $ not <$$> matchSameContentGlob glob
|
||||
, matchNeedsFileName = True
|
||||
, matchNeedsFileContent = False
|
||||
, matchNeedsKey = False
|
||||
, matchNeedsLocationLog = False
|
||||
}
|
||||
|
||||
{- Checks whether some file other than the one being matched has the same
 - key and a name matching the glob.
 -
 - The key is obtained through checkKey, which is handed the callback
 - built by go (presumably it looks up the key for the MatchInfo; confirm
 - against checkKey's definition). -}
matchSameContentGlob :: String -> MatchInfo -> Annex Bool
|
||||
matchSameContentGlob glob mi = checkKey (go mi) mi
|
||||
where
|
||||
-- Pick the filename of the item being matched, so that check can
-- exclude it from the candidates.
go (MatchingFile fi) k = check k (matchFile fi)
|
||||
go (MatchingInfo p) k = case providedFilePath p of
|
||||
Just f -> check k f
|
||||
-- No filename was provided, so nothing can be compared.
Nothing -> return False
|
||||
go (MatchingUserInfo p) k =
|
||||
check k . toRawFilePath
|
||||
=<< getUserInfo (userProvidedFilePath p)
|
||||
|
||||
-- NOTE(review): cglob is bound under both the glob and mi arguments, so
-- whether it is really shared between calls is not evident here; confirm.
cglob = compileGlob glob CaseSensative (GlobFilePath True) -- memoized
|
||||
|
||||
-- On Windows builds the glob is tried a second time against the
-- toInternalGitPath form of the name, so that a glob written with
-- either slash direction can match.
matchesglob f = matchGlob cglob (fromRawFilePath f)
|
||||
#ifdef mingw32_HOST_OS
|
||||
|| matchGlob cglob (fromRawFilePath (toInternalGitPath f))
|
||||
#endif
|
||||
|
||||
check k skipf = do
|
||||
-- Find other files with the same content, with filenames
|
||||
-- matching the glob.
|
||||
g <- Annex.gitRepo
|
||||
-- Both the associated files and skipf are normalised before being
-- compared, and the glob is applied to the normalised name.
fs <- filter (/= P.normalise skipf)
|
||||
. filter matchesglob
|
||||
. map (\f -> P.normalise (fromTopFilePath f g))
|
||||
<$> Database.Keys.getAssociatedFiles k
|
||||
-- Some associated files in the keys database may no longer
|
||||
-- correspond to files in the repository. This is checked
|
||||
-- last as it's most expensive.
|
||||
anyM (\f -> maybe False (== k) <$> catKeyFile f) fs
|
||||
|
||||
{- Add a limit on the "mimetype" magic value. Delegates to addMagicLimit,
 - passing getMagicMimeType together with the providedMimeType and
 - userProvidedMimeType accessors. -}
addMimeType :: String -> Annex ()
|
||||
addMimeType = addMagicLimit "mimetype" getMagicMimeType providedMimeType userProvidedMimeType
|
||||
|
||||
|
|
|
@ -22,7 +22,7 @@ in either of two repositories.
|
|||
Skips files matching the glob pattern. The glob is matched relative to
|
||||
the current directory. For example:
|
||||
|
||||
--exclude='*.mp3' --exclude='subdir/*'
|
||||
git annex get --exclude='*.mp3' --exclude='subdir/*'
|
||||
|
||||
Note that this will not match anything when using --all or --unused.
|
||||
|
||||
|
@ -31,10 +31,31 @@ in either of two repositories.
|
|||
Skips files not matching the glob pattern. (Same as `--not --exclude`.)
|
||||
For example, to include only mp3 and ogg files:
|
||||
|
||||
--include='*.mp3' --or --include='*.ogg'
|
||||
git annex get --include='*.mp3' --or --include='*.ogg'
|
||||
|
||||
Note that this will not skip anything when using --all or --unused.
|
||||
|
||||
* `--excludesamecontent=glob`
|
||||
|
||||
Skips a file when there is another file with the same content,
|
||||
whose name matches the glob. The glob is matched relative to the current
|
||||
directory.
|
||||
|
||||
For example, to drop files in the archive directory, but not when the same
|
||||
content is used by a file in the work directory:
|
||||
|
||||
git annex drop archive/ --excludesamecontent='work/*'
|
||||
|
||||
* `--includesamecontent=glob`
|
||||
|
||||
Skips files when there is no other file with the same content
|
||||
whose name matches the glob. (Same as `--not --excludesamecontent`)
|
||||
|
||||
For example, if you have inbox and outbox directories, and want to find
|
||||
anything in the inbox that has the same content as something in the outbox:
|
||||
|
||||
git annex find inbox --includesamecontent='outbox/*'
|
||||
|
||||
* `--in=repository`
|
||||
|
||||
Matches only files that git-annex believes have their contents present
|
||||
|
|
|
@ -6,3 +6,5 @@ In our repositories/workflows we quite often encounter cases where multiple subf
|
|||
|
||||
[[!meta author=yoh]]
|
||||
[[!tag projects/datalad]]
|
||||
|
||||
> [[done]] --[[Joey]]
|
||||
|
|
Loading…
Reference in a new issue