Merge branch 'filter-branch'

This commit is contained in:
Joey Hess 2021-05-17 14:16:13 -04:00
commit 40f093775c
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
13 changed files with 386 additions and 20 deletions

View file

@ -24,6 +24,7 @@ module Annex.Branch (
change,
maybeChange,
commitMessage,
createMessage,
commit,
forceCommit,
getBranch,
@ -129,7 +130,8 @@ getBranch = maybe (hasOrigin >>= go >>= use) return =<< branchsha
<$> branchsha
go False = withIndex' True $ do
cmode <- annexCommitMode <$> Annex.getGitConfig
inRepo $ Git.Branch.commitAlways cmode "branch created" fullname []
cmessage <- createMessage
inRepo $ Git.Branch.commitAlways cmode cmessage fullname []
use sha = do
setIndexSha sha
return sha
@ -363,6 +365,10 @@ set jl ru f c = do
commitMessage :: Annex String
commitMessage = fromMaybe "update" . annexCommitMessage <$> Annex.getGitConfig
{- Commit message used when creating the branch. -}
createMessage :: Annex String
createMessage = fromMaybe "branch created" . annexCommitMessage <$> Annex.getGitConfig
{- Stages the journal, and commits staged changes to the branch. -}
commit :: String -> Annex ()
commit = whenM (journalDirty gitAnnexJournalDir) . forceCommit

View file

@ -7,7 +7,8 @@
module Annex.Branch.Transitions (
FileTransition(..),
getTransitionCalculator
getTransitionCalculator,
filterBranch,
) where
import Common

View file

@ -11,6 +11,8 @@ git-annex (8.20210429) UNRELEASED; urgency=medium
* reinject: Error out when run on a file that is not annexed, rather
than silently skipping it.
* assistant: Fix a crash on startup by avoiding using forkProcess.
* init: When annex.commitmessage is set, use that message for the commit
that creates the git-annex branch.
-- Joey Hess <id@joeyh.name> Mon, 03 May 2021 10:33:10 -0400

View file

@ -68,6 +68,7 @@ import qualified Command.Unlock
import qualified Command.Lock
import qualified Command.PreCommit
import qualified Command.PostReceive
import qualified Command.FilterBranch
import qualified Command.Find
import qualified Command.FindRef
import qualified Command.Whereis
@ -202,6 +203,7 @@ cmds testoptparser testrunner mkbenchmarkgenerator = map addGitAnnexGlobalOption
, Command.Unused.cmd
, Command.DropUnused.cmd
, Command.AddUnused.cmd
, Command.FilterBranch.cmd
, Command.Find.cmd
, Command.FindRef.cmd
, Command.Whereis.cmd

194
Command/FilterBranch.hs Normal file
View file

@ -0,0 +1,194 @@
{- git-annex command
-
- Copyright 2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Command.FilterBranch where
import Command
import qualified Annex
import qualified Annex.Branch
import Annex.Branch.Transitions (filterBranch, FileTransition(..))
import Annex.HashObject
import Annex.Tmp
import Annex.SpecialRemote.Config
import Types.ProposedAccepted
import Logs
import Logs.Remote
import Git.Types
import Git.FilePath
import Git.Index
import Git.Env
import Git.UpdateIndex
import qualified Git.LsTree as LsTree
import qualified Git.Branch as Git
import Utility.RawFilePath
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Builder
import qualified System.FilePath.ByteString as P
cmd :: Command
cmd = noMessages $ withGlobalOptions [annexedMatchingOptions] $
command "filter-branch" SectionMaintenance
"filter information from the git-annex branch"
paramPaths (seek <$$> optParser)
data FilterBranchOptions = FilterBranchOptions
{ includeFiles :: CmdParams
, keyOptions :: Maybe KeyOptions
, keyInformation :: [IncludeExclude (DeferredParse UUID)]
, repoConfig :: [IncludeExclude (DeferredParse UUID)]
, includeGlobalConfig :: Bool
}
optParser :: CmdParamsDesc -> Parser FilterBranchOptions
optParser desc = FilterBranchOptions
<$> cmdParams desc
<*> optional parseKeyOptions
<*> many (parseIncludeExclude "key-information")
<*> many (parseIncludeExclude "repo-config")
<*> switch
( long "include-global-config"
<> help "include global configuration"
)
data IncludeExclude t
= Include t
| Exclude t
| IncludeAll
deriving (Show, Eq, Ord)
isInclude :: IncludeExclude t -> Bool
isInclude (Include _) = True
isInclude IncludeAll = True
isInclude (Exclude _) = False
parseIncludeExclude :: String -> Parser (IncludeExclude (DeferredParse UUID))
parseIncludeExclude s =
( Include <$> parseRepositoryOption
("include-" ++ s ++ "-for")
"include information about a repository"
) <|>
( Exclude <$> parseRepositoryOption
("exclude-" ++ s ++ "-for")
"exclude information about a repository"
) <|>
( flag' IncludeAll
( long ("include-all-" ++ s)
<> help "include information about all non-excluded repositories"
)
)
parseRepositoryOption :: String -> String -> Parser (DeferredParse UUID)
parseRepositoryOption s h = parseUUIDOption <$> strOption
( long s
<> help h
<> metavar (paramRemote `paramOr` paramDesc `paramOr` paramUUID)
<> completeRemotes
)
mkUUIDMatcher :: [IncludeExclude (DeferredParse UUID)] -> Annex (UUID -> Bool)
mkUUIDMatcher l = do
sameasmap <- M.mapMaybe
(toUUID . fromProposedAccepted <$$> M.lookup sameasUUIDField)
<$> remoteConfigMap
mkUUIDMatcher' sameasmap <$> mapM get l
where
get (Include v) = Include <$> getParsed v
get (Exclude v) = Exclude <$> getParsed v
get IncludeAll = pure IncludeAll
mkUUIDMatcher' :: M.Map UUID UUID -> [IncludeExclude UUID] -> (UUID -> Bool)
mkUUIDMatcher' sameasmap l = \u ->
let sameas = M.lookup u sameasmap
in ( S.member (Include u) includes
|| S.member IncludeAll includes
|| maybe False (\u' -> S.member (Include u') includes) sameas
)
&& S.notMember (Exclude u) excludes
&& maybe True (\u' -> S.notMember (Exclude u') excludes) sameas
where
(includes, excludes) = (S.partition isInclude (S.fromList l))
seek :: FilterBranchOptions -> CommandSeek
seek o = withOtherTmp $ \tmpdir -> do
let tmpindex = tmpdir P.</> "index"
gc <- Annex.getGitConfig
tmpindexrepo <- Annex.inRepo $ \r ->
addGitEnv r indexEnv (fromRawFilePath tmpindex)
withUpdateIndex tmpindexrepo $ \h -> do
keyinfomatcher <- mkUUIDMatcher (keyInformation o)
repoconfigmatcher <- mkUUIDMatcher (repoConfig o)
let addtoindex f sha = liftIO $ streamUpdateIndex' h $
pureStreamer $ L.fromStrict $ LsTree.formatLsTree $ LsTree.TreeItem
{ LsTree.mode = fromTreeItemType TreeFile
, LsTree.typeobj = fmtObjectType BlobObject
, LsTree.sha = sha
, LsTree.size = Nothing
, LsTree.file = asTopFilePath f
}
let filterbanch matcher f c
| L.null c = noop
| otherwise = case filterBranch matcher gc f c of
ChangeFile builder -> do
let c' = toLazyByteString builder
unless (L.null c') $
addtoindex f =<< hashBlob c'
-- This could perhaps be optimised by looking
-- up the sha of the file in the branch.
PreserveFile -> addtoindex f =<< hashBlob c
-- Add information for all keys that are being included,
-- filtering out information for repositories that are not
-- being included.
let addkeyinfo k = startingCustomOutput k $ do
forM_ (keyLogFiles gc k) $ \f ->
filterbanch keyinfomatcher f
=<< Annex.Branch.get f
next (return True)
let seeker = AnnexedFileSeeker
{ startAction = \_ _ k -> addkeyinfo k
, checkContentPresent = Nothing
, usesLocationLog = True
}
-- Avoid the usual default of all files in the current
-- directory and below, because this command is documented
-- as only including the information it has explicitly been
-- told to include.
when (not (null (includeFiles o)) || isJust (keyOptions o)) $
withKeyOptions (keyOptions o) False seeker
(commandAction . \(_, k, _) -> addkeyinfo k)
(withFilesInGitAnnex ww seeker)
=<< workTreeItems ww (includeFiles o)
-- Add repository configs for all repositories that are
-- being included.
forM_ topLevelUUIDBasedLogs $ \f ->
filterbanch repoconfigmatcher f
=<< Annex.Branch.get f
-- Add global configs when included.
when (includeGlobalConfig o) $
forM_ otherTopLevelLogs $ \f -> do
c <- Annex.Branch.get f
unless (L.null c) $
addtoindex f =<< hashBlob c
-- Commit the temporary index, and output the result.
t <- liftIO $ Git.writeTree tmpindexrepo
liftIO $ removeWhenExistsWith removeLink tmpindex
cmode <- annexCommitMode <$> Annex.getGitConfig
cmessage <- Annex.Branch.commitMessage
c <- inRepo $ Git.commitTree cmode cmessage [] t
liftIO $ putStrLn (fromRef c)
where
ww = WarnUnmatchLsFiles

View file

@ -1,6 +1,6 @@
{- git branch stuff
-
- Copyright 2011 Joey Hess <id@joeyh.name>
- Copyright 2011-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -166,7 +166,7 @@ commitCommand' runner commitmode ps = runner $
-}
commit :: CommitMode -> Bool -> String -> Branch -> [Ref] -> Repo -> IO (Maybe Sha)
commit commitmode allowempty message branch parentrefs repo = do
tree <- getSha "write-tree" $ pipeReadStrict [Param "write-tree"] repo
tree <- writeTree repo
ifM (cancommit tree)
( do
sha <- commitTree commitmode message parentrefs tree repo
@ -185,6 +185,9 @@ commitAlways :: CommitMode -> String -> Branch -> [Ref] -> Repo -> IO Sha
commitAlways commitmode message branch parentrefs repo = fromJust
<$> commit commitmode True message branch parentrefs repo
writeTree :: Repo -> IO Sha
writeTree repo = getSha "write-tree" $ pipeReadStrict [Param "write-tree"] repo
commitTree :: CommitMode -> String -> [Ref] -> Ref -> Repo -> IO Sha
commitTree commitmode message parentrefs tree repo =
getSha "commit-tree" $

View file

@ -28,6 +28,7 @@ import Utility.Attoparsec
import Numeric
import Data.Either
import Data.Char
import System.Posix.Types
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
@ -146,9 +147,9 @@ parserLsTree long = case long of
{- Inverse of parseLsTree. Note that the long output format is not
- generated, so any size information is not included. -}
formatLsTree :: TreeItem -> String
formatLsTree ti = unwords
[ showOct (mode ti) ""
, decodeBS (typeobj ti)
, fromRef (sha ti)
] ++ ('\t' : fromRawFilePath (getTopFilePath (file ti)))
formatLsTree :: TreeItem -> S.ByteString
formatLsTree ti = S.intercalate (S.singleton (fromIntegral (ord ' ')))
[ encodeBS' (showOct (mode ti) "")
, typeobj ti
, fromRef' (sha ti)
] <> (S.cons (fromIntegral (ord '\t')) (getTopFilePath (file ti)))

10
Logs.hs
View file

@ -72,13 +72,9 @@ keyLogFiles config k =
, chunkLogFile config k
] ++ oldurlLogs config k
{- All the log files that do not contain information specific to a key. -}
nonKeyLogFiles :: [RawFilePath]
nonKeyLogFiles = concat
[ topLevelNewUUIDBasedLogs
, topLevelOldUUIDBasedLogs
, otherTopLevelLogs
]
{- All uuid-based logs stored in the top of the git-annex branch. -}
topLevelUUIDBasedLogs :: [RawFilePath]
topLevelUUIDBasedLogs = topLevelNewUUIDBasedLogs ++ topLevelOldUUIDBasedLogs
{- All the old-format uuid-based logs stored in the top of the git-annex branch. -}
topLevelOldUUIDBasedLogs :: [RawFilePath]

View file

@ -23,6 +23,7 @@ module Logs.Export (
) where
import qualified Data.Map as M
import qualified Data.ByteString as B
import Annex.Common
import qualified Annex.Branch
@ -119,10 +120,10 @@ logExportExcluded :: UUID -> ((Git.Tree.TreeItem -> IO ()) -> Annex a) -> Annex
logExportExcluded u a = do
logf <- fromRepo $ gitAnnexExportExcludeLog u
withLogHandle logf $ \logh -> do
liftIO $ hSetNewlineMode logh noNewlineTranslation
a (writer logh)
where
writer logh = hPutStrLn logh
writer logh = B.hPutStr logh
. flip B.snoc (fromIntegral (ord '\n'))
. Git.LsTree.formatLsTree
. Git.Tree.treeItemToLsTreeItem

View file

@ -0,0 +1,151 @@
# NAME
git-annex filter-branch - filter information from the git-annex branch
# SYNOPSIS
git annex filter-branch [...]
# DESCRIPTION
This copies selected information from the git-annex branch into a git
commit object, and outputs its hash. The git commit can be transported
to another git repository, and given a branch name such as "foo/git-annex",
and git-annex there will automatically merge that into its git-annex
branch. This allows publishing some information from your git-annex branch,
without publishing the whole thing.
Other ways to avoid publishing information from a git-annex branch,
or remove information from it include [[git-annex-forget]](1), the
`annex.private` git config, and the `--private` option to
[[git-annex-initremote](1). Those are much easier to use, but this
provides full control for those who need it.
With no options, no information at all will be included from the git-annex
branch. Use options to specify what to include. All options can be specified
multiple times.
When the repository contains information about a private
repository (due to `annex.private` being set, or `git-annex initremote
--private` being used), that private information will be included when
allowed by the options, even though it is not recorded on the git-annex
branch.
When a repository was created with `git annex initremote --sameas=foo`,
its information will be included when the information for foo is,
and excluded when foo is excluded.
When a special remote is configured with importtree=yes or exporttree=yes,
normally the git tree corresponding to the repository is included in
the git-annex branch, to make sure it does not get garbage collected
by `git gc`. Those trees are *not* included when filtering the git-annex
branch. Usually this will not cause any problems, but if such a tree does
get garbage collected, it will prevent accessing files on the special
remote, until the next time a tree is imported or exported to it.
# OPTIONS
* `path`
Include information about all keys of annexed files in the path.
* file matching options
The [[git-annex-matching-options]](1)
can be used to specify which files in a path to include.
* `--branch=ref`
Include information about keys referred of annexed files in the branch
or treeish.
* `--key=key`
Include information about a specific key.
* `--all`
Include information about all keys.
* `--include-key-information-for=repo`
When including information about a key, include information specific to
this repository. The repository can be specified with a uuid or the name
of a remote. This option can be used repeatedly to include several
repositories.
* `--include-all-key-information`
Include key information for all repositories, except any excluded with
the `--exclude-key-information-for` option.
* `--exclude-key-information-for=repo`
When including information about a key, exclude information specific to
this repository. The repository can be specified with a uuid or the name
of a remote. This option can be used repeatedly to exclude
several repositories.
* `--include-repo-config-for=repo`
Include configuration specific to this repository.
The repository can be specified with a uuid or the name of a remote.
This includes the configuration of special remotes, which may include
embedded credentials, or encryption parameters. It also includes trust
settings, preferred content, etc. It does not include information
about any git-annex keys. This option can be used repeatedly to include
several repositories.
* `--include-all-repo-config`
Include the configuration of all repositories, except for any excluded
with the `--exclude-repo-config-for` option.
* `--exclude-repo-config-for=repo`
Exclude configuration specific to this repository.
The repository can be specified with a uuid or the name of a remote.
This option can be used repeatedly to exclude several repositories.
* `--include-global-config`
Include global configuration, that is not specific to any repository.
This includes configs stored by [[git-annex-numcopies]](1),
[[git-annex-config]](1), etc.
# EXAMPLES
You have a big git-annex repository and are splitting the directory "foo"
out, to make a smaller repository. You want the smaller repo's git-annex
branch to contain all the information about remotes and other configuration,
but only information about keys in that directory.
git-annex filter-branch foo --include-all-key-information \
--include-all-repo-config --include-global-config
That only includes information about the keys that are currently
in the directory "foo", not keys used by old versions of files.
To also include information about the version of the subdir in
tag "1.0", add the option `--branch=1.0:foo`
Your repository has a special remote "bar", and you want to share information
about which annexed files are stored in it, but without sharing anything
about the configuration of the remote.
git-annex filter-branch --all --include-all-key-information \
--include-all-repo-config --exclude-repo-config-for=bar \
--include-global-config
# SEE ALSO
[[git-annex]](1)
[[git-annex-forget]](1)
# AUTHOR
Joey Hess <id@joeyh.name>
Warning: Automatically converted into a man page by mdwn2man. Edit with care.

View file

@ -32,6 +32,8 @@ git to push the branch to any git repositories not running git-annex.)
[[git-annex]](1)
[[git-annex-filter-branch]](1)
# AUTHOR
Joey Hess <id@joeyh.name>

View file

@ -404,6 +404,12 @@ content from the key-value store.
See [[git-annex-forget]](1) for details.
* `filter-branch`
Produces a filtered version of the git-annex branch.
See [[git-annex-filter-branch]](1) for details.
* `repair`
This can repair many of the problems with git repositories that `git fsck`
@ -1008,7 +1014,7 @@ repository, using [[git-annex-config]]. See its man page for a list.)
* `annex.commitmessage`
When git-annex updates the git-annex branch, it usually makes up
its own commit message ("update"), since users rarely look at or
its own commit message (eg "update"), since users rarely look at or
care about changes to that branch. If you do care, you can
specify this setting by running commands with
`-c annex.commitmessage=whatever`

View file

@ -742,6 +742,7 @@ Executable git-annex
Command.ExamineKey
Command.Expire
Command.Export
Command.FilterBranch
Command.Find
Command.FindRef
Command.Fix