add annex.alwayscompact

Added annex.alwayscompact setting which can be unset to speed up writes to
the git-annex branch in some cases.

Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
Joey Hess 2022-07-18 15:50:36 -04:00
parent ccff639651
commit 36f0bdcd57
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 78 additions and 33 deletions

View file

@ -411,13 +411,39 @@ data ChangeOrAppend t = Change t | Append t
{- Applies a function that can either modify the content of the file,
- or append to the file. Appending can be more efficient when several
- lines are written to a file in succession.
-
- When annex.alwayscompact=false, the function is not passed the content
- of the journal file when the journal file already exists, and whatever
- value it provides is always appended to the journal file. That avoids
- reading the journal file, and so can be faster when many lines are being
- written to it. The information that is recorded will be effectively the
- same, only obsolate log lines will not get compacted.
-}
changeOrAppend :: Journalable content => RegardingUUID -> RawFilePath -> (L.ByteString -> ChangeOrAppend content) -> Annex ()
changeOrAppend ru file f = lockJournal $ \jl -> do
changeOrAppend ru file f = lockJournal $ \jl ->
checkCanAppendJournalFile jl ru file >>= \case
Just appendable -> ifM (annexAlwaysCompact <$> Annex.getGitConfig)
( do
oldc <- getToChange ru file
case f oldc of
Change newc -> set jl ru file newc
Append toappend -> append jl ru file oldc toappend
Append toappend -> append jl file appendable toappend
, case f mempty of
-- Append even though a change was
-- requested; since mempty was passed in,
-- the lines requested to change are
-- minimized.
Change newc -> append jl file appendable newc
Append toappend -> append jl file appendable toappend
)
Nothing -> do
oldc <- getToChange ru file
case f oldc of
Change newc -> set jl ru file newc
-- Journal file does not exist yet, so
-- cannot append and have to write it all.
Append toappend -> set jl ru file $
oldc <> journalableByteString toappend
{- Only get private information when the RegardingUUID is itself private. -}
getToChange :: RegardingUUID -> RawFilePath -> Annex L.ByteString
@ -443,17 +469,11 @@ set jl ru f c = do
-- a log file immediately after writing it.
invalidateCache
{- Appends content to the journal file.
-
- The ByteString is the content that the file had before appending.
- It is either the content of the journal file or the content from the
- branch. When the journal file does not exist yet, this content is
- written to it before appending.
-}
append :: Journalable content => JournalLocked -> RegardingUUID -> RawFilePath -> L.ByteString -> content -> Annex ()
append jl ru f oldc toappend = do
{- Appends content to the journal file. -}
append :: Journalable content => JournalLocked -> RawFilePath -> AppendableJournalFile -> content -> Annex ()
append jl f appendable toappend = do
journalChanged
appendJournalFile jl ru f oldc toappend
appendJournalFile jl appendable toappend
fastDebug "Annex.Branch" ("append " ++ fromRawFilePath f)
invalidateCache

View file

@ -4,7 +4,7 @@
- git-annex branch. Among other things, it ensures that if git-annex is
- interrupted, its recorded data is not lost.
-
- Copyright 2011-2021 Joey Hess <id@joeyh.name>
- Copyright 2011-2022 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -20,6 +20,7 @@ import Annex.Perms
import Annex.Tmp
import Annex.LockFile
import Utility.Directory.Stream
import qualified Utility.RawFilePath as R
import qualified Data.Set as S
import qualified Data.ByteString.Lazy as L
@ -91,27 +92,30 @@ setJournalFile _jl ru file content = withOtherTmp $ \tmp -> do
-- exists
mv `catchIO` (const (createAnnexDirectory jd >> mv))
{- Appends content to a journal file.
-
- The oldcontent is whatever is in the git-annex branch.
- When the journal file does not yet exist, the oldcontent
- is first written to the journal file.
-
- TODO: Unsafe! Does not append atomically. -}
appendJournalFile :: Journalable content => JournalLocked -> RegardingUUID -> RawFilePath -> L.ByteString -> content -> Annex ()
appendJournalFile _jl ru file oldcontent toappend = do
newtype AppendableJournalFile = AppendableJournalFile (RawFilePath, RawFilePath)
{- If the journal file does not exist, it cannot be appended to, because
- that would overwrite whatever content the file has in the git-annex
- branch. -}
checkCanAppendJournalFile :: JournalLocked -> RegardingUUID -> RawFilePath -> Annex (Maybe AppendableJournalFile)
checkCanAppendJournalFile _jl ru file = do
jd <- fromRepo =<< ifM (regardingPrivateUUID ru)
( return gitAnnexPrivateJournalDir
, return gitAnnexJournalDir
)
let jfile = fromRawFilePath $ jd P.</> journalFile file
let write = liftIO $ ifM (doesFileExist jfile)
( withFile jfile AppendMode $ \h ->
writeJournalHandle h toappend
, withFile jfile WriteMode $ \h -> do
writeJournalHandle h oldcontent
writeJournalHandle h toappend
let jfile = jd P.</> journalFile file
ifM (liftIO $ R.doesPathExist jfile)
( return (Just (AppendableJournalFile (jd, jfile)))
, return Nothing
)
{- Appends content to an existing journal file.
-
- TODO: Unsafe! Does not append atomically. -}
appendJournalFile :: Journalable content => JournalLocked -> AppendableJournalFile -> content -> Annex ()
appendJournalFile _jl (AppendableJournalFile (jd, jfile)) content = do
let write = liftIO $ withFile (fromRawFilePath jfile) AppendMode $ \h ->
writeJournalHandle h content
write `catchIO` (const (createAnnexDirectory jd >> write))
data JournalledContent

View file

@ -19,6 +19,8 @@ git-annex (10.20220625) UNRELEASED; urgency=medium
importtree=yes or exporttree=yes.
* Fix a reversion that prevented --batch commands (and the assistant)
from noticing data written to the journal by other commands.
* Added annex.alwayscompact setting which can be unset to speed up
writes to the git-annex branch in some cases.
-- Joey Hess <id@joeyh.name> Tue, 28 Jun 2022 14:49:17 -0400

View file

@ -57,7 +57,11 @@ getUrlsWithPrefix key prefix = filter (prefix `isPrefixOf`)
setUrlPresent :: Key -> URLString -> Annex ()
setUrlPresent key url = do
us <- getUrls key
-- Avoid reading the url log when not compacting, for speed.
us <- ifM (annexAlwaysCompact <$> Annex.getGitConfig)
( getUrls key
, pure mempty
)
unless (url `elem` us) $ do
config <- Annex.getGitConfig
addLog (Annex.Branch.RegardingUUID []) (urlLogFile config key)

View file

@ -82,6 +82,7 @@ data GitConfig = GitConfig
, annexBloomAccuracy :: Maybe Int
, annexSshCaching :: Maybe Bool
, annexAlwaysCommit :: Bool
, annexAlwaysCompact :: Bool
, annexCommitMessage :: Maybe String
, annexMergeAnnexBranches :: Bool
, annexDelayAdd :: Maybe Int
@ -164,6 +165,7 @@ extractGitConfig configsource r = GitConfig
, annexBloomAccuracy = getmayberead (annexConfig "bloomaccuracy")
, annexSshCaching = getmaybebool (annexConfig "sshcaching")
, annexAlwaysCommit = getbool (annexConfig "alwayscommit") True
, annexAlwaysCompact = getbool (annexConfig "alwayscompact") True
, annexCommitMessage = getmaybe (annexConfig "commitmessage")
, annexMergeAnnexBranches = getbool (annexConfig "merge-annex-branches") True
, annexDelayAdd = getmayberead (annexConfig "delayadd")

View file

@ -1038,6 +1038,19 @@ repository, using [[git-annex-config]]. See its man page for a list.)
This works well in combination with annex.alwayscommit=false,
to gather up a set of changes and commit them with a message you specify.
* `annex.alwayscompact`
By default, git-annex compacts data it records in the git-annex branch.
Setting this to false avoids doing that compaction in some cases, which
can speed up operations that populate the git-annex branch with a lot
of data. However, when used with operations that overwrite old values in
the git-annex branch, that may cause the git-annex branch to use more disk
space, or slow down reading data from it.
An example of a command that can be sped up by using
`-c annex.alwayscompact=false` is `git-annex registerurl --batch`,
when adding a large number of urls to the same key.
* `annex.allowsign`
By default git-annex avoids gpg signing commits that it makes when