New matching options --excludesamecontent and --includesamecontent
The normalisation of filenames turns out to be the tricky part here, because the associated files coming out of the keys db may look like "./foo/bar" or "../bar". For the former to match a glob like "foo/*", it needs to be normalised. Note that, on Windows, normalise "./foo/bar" = "foo\\bar" which a glob like "foo/*" won't match. So the glob is matched a second time, on the toInternalGitPath, so allowing the user to provide a glob with the slashes in either direction. However, this still won't support some wacky edge cases like the user providing a glob of "foo/bar\\*". Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
parent
cd73fcc92c
commit
b5f5475ed6
5 changed files with 102 additions and 3 deletions
|
@ -1,5 +1,6 @@
|
||||||
git-annex (8.20210429) UNRELEASED; urgency=medium
|
git-annex (8.20210429) UNRELEASED; urgency=medium
|
||||||
|
|
||||||
|
* New matching options --excludesamecontent and --includesamecontent
|
||||||
* When two files have the same content, and a required content expression
|
* When two files have the same content, and a required content expression
|
||||||
matches one but not the other, dropping the latter file will fail as it
|
matches one but not the other, dropping the latter file will fail as it
|
||||||
would also remove the content of the required file.
|
would also remove the content of the required file.
|
||||||
|
|
|
@ -338,6 +338,16 @@ fileMatchingOptions' lb =
|
||||||
<> help "limit to files matching the glob pattern"
|
<> help "limit to files matching the glob pattern"
|
||||||
<> hidden
|
<> hidden
|
||||||
)
|
)
|
||||||
|
, globalOption (setAnnexState . Limit.addExcludeSameContent) $ strOption
|
||||||
|
( long "excludesamecontent" <> short 'x' <> metavar paramGlob
|
||||||
|
<> help "skip files whose content is the same as another file matching the glob pattern"
|
||||||
|
<> hidden
|
||||||
|
)
|
||||||
|
, globalOption (setAnnexState . Limit.addIncludeSameContent) $ strOption
|
||||||
|
( long "includesamecontent" <> short 'I' <> metavar paramGlob
|
||||||
|
<> help "limit to files whose content is the same as another file matching the glob pattern"
|
||||||
|
<> hidden
|
||||||
|
)
|
||||||
, globalOption (setAnnexState . Limit.addLargerThan lb) $ strOption
|
, globalOption (setAnnexState . Limit.addLargerThan lb) $ strOption
|
||||||
( long "largerthan" <> metavar paramSize
|
( long "largerthan" <> metavar paramSize
|
||||||
<> help "match files larger than a size"
|
<> help "match files larger than a size"
|
||||||
|
|
67
Limit.hs
67
Limit.hs
|
@ -1,10 +1,12 @@
|
||||||
{- user-specified limits on files to act on
|
{- user-specified limits on files to act on
|
||||||
-
|
-
|
||||||
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
|
- Copyright 2011-2021 Joey Hess <id@joeyh.name>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU AGPL version 3 or higher.
|
- Licensed under the GNU AGPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
|
||||||
|
{-# LANGUAGE CPP #-}
|
||||||
|
|
||||||
module Limit where
|
module Limit where
|
||||||
|
|
||||||
import Annex.Common
|
import Annex.Common
|
||||||
|
@ -29,16 +31,20 @@ import Logs.MetaData
|
||||||
import Logs.Group
|
import Logs.Group
|
||||||
import Logs.Unused
|
import Logs.Unused
|
||||||
import Logs.Location
|
import Logs.Location
|
||||||
|
import Annex.CatFile
|
||||||
|
import Git.FilePath
|
||||||
import Git.Types (RefDate(..))
|
import Git.Types (RefDate(..))
|
||||||
import Utility.Glob
|
import Utility.Glob
|
||||||
import Utility.HumanTime
|
import Utility.HumanTime
|
||||||
import Utility.DataUnits
|
import Utility.DataUnits
|
||||||
|
import qualified Database.Keys
|
||||||
import qualified Utility.RawFilePath as R
|
import qualified Utility.RawFilePath as R
|
||||||
import Backend
|
import Backend
|
||||||
|
|
||||||
import Data.Time.Clock.POSIX
|
import Data.Time.Clock.POSIX
|
||||||
import qualified Data.Set as S
|
import qualified Data.Set as S
|
||||||
import qualified Data.Map as M
|
import qualified Data.Map as M
|
||||||
|
import qualified System.FilePath.ByteString as P
|
||||||
|
|
||||||
{- Some limits can look at the current status of files on
|
{- Some limits can look at the current status of files on
|
||||||
- disk, or in the annex. This allows controlling which happens. -}
|
- disk, or in the annex. This allows controlling which happens. -}
|
||||||
|
@ -122,6 +128,65 @@ matchGlobFile glob = go
|
||||||
Nothing -> False
|
Nothing -> False
|
||||||
go (MatchingUserInfo p) = matchGlob cglob <$> getUserInfo (userProvidedFilePath p)
|
go (MatchingUserInfo p) = matchGlob cglob <$> getUserInfo (userProvidedFilePath p)
|
||||||
|
|
||||||
|
{- Add a limit to skip files when there is no other file using the same
|
||||||
|
- content, with a name matching the glob. -}
|
||||||
|
addIncludeSameContent :: String -> Annex ()
|
||||||
|
addIncludeSameContent = addLimit . limitIncludeSameContent
|
||||||
|
|
||||||
|
limitIncludeSameContent :: MkLimit Annex
|
||||||
|
limitIncludeSameContent glob = Right $ MatchFiles
|
||||||
|
{ matchAction = const $ matchSameContentGlob glob
|
||||||
|
, matchNeedsFileName = True
|
||||||
|
, matchNeedsFileContent = False
|
||||||
|
, matchNeedsKey = False
|
||||||
|
, matchNeedsLocationLog = False
|
||||||
|
}
|
||||||
|
|
||||||
|
{- Add a limit to skip files when there is another file using the same
|
||||||
|
- content, with a name matching the glob. -}
|
||||||
|
addExcludeSameContent :: String -> Annex ()
|
||||||
|
addExcludeSameContent = addLimit . limitExcludeSameContent
|
||||||
|
|
||||||
|
limitExcludeSameContent :: MkLimit Annex
|
||||||
|
limitExcludeSameContent glob = Right $ MatchFiles
|
||||||
|
{ matchAction = const $ not <$$> matchSameContentGlob glob
|
||||||
|
, matchNeedsFileName = True
|
||||||
|
, matchNeedsFileContent = False
|
||||||
|
, matchNeedsKey = False
|
||||||
|
, matchNeedsLocationLog = False
|
||||||
|
}
|
||||||
|
|
||||||
|
matchSameContentGlob :: String -> MatchInfo -> Annex Bool
|
||||||
|
matchSameContentGlob glob mi = checkKey (go mi) mi
|
||||||
|
where
|
||||||
|
go (MatchingFile fi) k = check k (matchFile fi)
|
||||||
|
go (MatchingInfo p) k = case providedFilePath p of
|
||||||
|
Just f -> check k f
|
||||||
|
Nothing -> return False
|
||||||
|
go (MatchingUserInfo p) k =
|
||||||
|
check k . toRawFilePath
|
||||||
|
=<< getUserInfo (userProvidedFilePath p)
|
||||||
|
|
||||||
|
cglob = compileGlob glob CaseSensative (GlobFilePath True) -- memoized
|
||||||
|
|
||||||
|
matchesglob f = matchGlob cglob (fromRawFilePath f)
|
||||||
|
#ifdef mingw32_HOST_OS
|
||||||
|
|| matchGlob cglob (fromRawFilePath (toInternalGitPath f))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
check k skipf = do
|
||||||
|
-- Find other files with the same content, with filenames
|
||||||
|
-- matching the glob.
|
||||||
|
g <- Annex.gitRepo
|
||||||
|
fs <- filter (/= P.normalise skipf)
|
||||||
|
. filter matchesglob
|
||||||
|
. map (\f -> P.normalise (fromTopFilePath f g))
|
||||||
|
<$> Database.Keys.getAssociatedFiles k
|
||||||
|
-- Some associated files in the keys database may no longer
|
||||||
|
-- correspond to files in the repository. This is checked
|
||||||
|
-- last as it's most expensive.
|
||||||
|
anyM (\f -> maybe False (== k) <$> catKeyFile f) fs
|
||||||
|
|
||||||
addMimeType :: String -> Annex ()
|
addMimeType :: String -> Annex ()
|
||||||
addMimeType = addMagicLimit "mimetype" getMagicMimeType providedMimeType userProvidedMimeType
|
addMimeType = addMagicLimit "mimetype" getMagicMimeType providedMimeType userProvidedMimeType
|
||||||
|
|
||||||
|
|
|
@ -22,7 +22,7 @@ in either of two repositories.
|
||||||
Skips files matching the glob pattern. The glob is matched relative to
|
Skips files matching the glob pattern. The glob is matched relative to
|
||||||
the current directory. For example:
|
the current directory. For example:
|
||||||
|
|
||||||
--exclude='*.mp3' --exclude='subdir/*'
|
git annex get --exclude='*.mp3' --exclude='subdir/*'
|
||||||
|
|
||||||
Note that this will not match anything when using --all or --unused.
|
Note that this will not match anything when using --all or --unused.
|
||||||
|
|
||||||
|
@ -31,10 +31,31 @@ in either of two repositories.
|
||||||
Skips files not matching the glob pattern. (Same as `--not --exclude`.)
|
Skips files not matching the glob pattern. (Same as `--not --exclude`.)
|
||||||
For example, to include only mp3 and ogg files:
|
For example, to include only mp3 and ogg files:
|
||||||
|
|
||||||
--include='*.mp3' --or --include='*.ogg'
|
git annex get --include='*.mp3' --or --include='*.ogg'
|
||||||
|
|
||||||
Note that this will not skip anything when using --all or --unused.
|
Note that this will not skip anything when using --all or --unused.
|
||||||
|
|
||||||
|
* `--excludesamecontent=glob`
|
||||||
|
|
||||||
|
Skips a file when there is another file with the same content,
|
||||||
|
whose name matches the glob. The glob is matched relative to the current
|
||||||
|
directory.
|
||||||
|
|
||||||
|
For example, to drop files in the archive directory, but not when the same
|
||||||
|
content is used by a file in the work directory:
|
||||||
|
|
||||||
|
git annex drop archive/ --excludesamecontent='work/*'
|
||||||
|
|
||||||
|
* `--includesamecontent=glob`
|
||||||
|
|
||||||
|
Skips files when there is no other file with the same content
|
||||||
|
whose name matches the glob. (Same as `--not --excludesamecontent`)
|
||||||
|
|
||||||
|
For example, if you have inbox and outbox directories, and want to find
|
||||||
|
anything in the inbox that has the same content as something in the outbox:
|
||||||
|
|
||||||
|
git annex find inbox --includesamecontent='outbox/*'
|
||||||
|
|
||||||
* `--in=repository`
|
* `--in=repository`
|
||||||
|
|
||||||
Matches only files that git-annex believes have their contents present
|
Matches only files that git-annex believes have their contents present
|
||||||
|
|
|
@ -6,3 +6,5 @@ In our repositories/workflows we quite often encounter cases where multiple subf
|
||||||
|
|
||||||
[[!meta author=yoh]]
|
[[!meta author=yoh]]
|
||||||
[[!tag projects/datalad]]
|
[[!tag projects/datalad]]
|
||||||
|
|
||||||
|
> [[done]] --[[Joey]]
|
||||||
|
|
Loading…
Reference in a new issue