diff --git a/Annex/Branch.hs b/Annex/Branch.hs index 93dae25617..456d9d6619 100644 --- a/Annex/Branch.hs +++ b/Annex/Branch.hs @@ -411,13 +411,39 @@ data ChangeOrAppend t = Change t | Append t {- Applies a function that can either modify the content of the file, - or append to the file. Appending can be more efficient when several - lines are written to a file in succession. + - + - When annex.alwayscompact=false, the function is not passed the content + - of the journal file when the journal file already exists, and whatever + - value it provides is always appended to the journal file. That avoids + - reading the journal file, and so can be faster when many lines are being + - written to it. The information that is recorded will be effectively the + - same, only obsolate log lines will not get compacted. -} changeOrAppend :: Journalable content => RegardingUUID -> RawFilePath -> (L.ByteString -> ChangeOrAppend content) -> Annex () -changeOrAppend ru file f = lockJournal $ \jl -> do - oldc <- getToChange ru file - case f oldc of - Change newc -> set jl ru file newc - Append toappend -> append jl ru file oldc toappend +changeOrAppend ru file f = lockJournal $ \jl -> + checkCanAppendJournalFile jl ru file >>= \case + Just appendable -> ifM (annexAlwaysCompact <$> Annex.getGitConfig) + ( do + oldc <- getToChange ru file + case f oldc of + Change newc -> set jl ru file newc + Append toappend -> append jl file appendable toappend + , case f mempty of + -- Append even though a change was + -- requested; since mempty was passed in, + -- the lines requested to change are + -- minimized. + Change newc -> append jl file appendable newc + Append toappend -> append jl file appendable toappend + ) + Nothing -> do + oldc <- getToChange ru file + case f oldc of + Change newc -> set jl ru file newc + -- Journal file does not exist yet, so + -- cannot append and have to write it all. + Append toappend -> set jl ru file $ + oldc <> journalableByteString toappend {- Only get private information when the RegardingUUID is itself private. -} getToChange :: RegardingUUID -> RawFilePath -> Annex L.ByteString @@ -443,17 +469,11 @@ set jl ru f c = do -- a log file immediately after writing it. invalidateCache -{- Appends content to the journal file. - - - - The ByteString is the content that the file had before appending. - - It is either the content of the journal file or the content from the - - branch. When the journal file does not exist yet, this content is - - written to it before appending. - -} -append :: Journalable content => JournalLocked -> RegardingUUID -> RawFilePath -> L.ByteString -> content -> Annex () -append jl ru f oldc toappend = do +{- Appends content to the journal file. -} +append :: Journalable content => JournalLocked -> RawFilePath -> AppendableJournalFile -> content -> Annex () +append jl f appendable toappend = do journalChanged - appendJournalFile jl ru f oldc toappend + appendJournalFile jl appendable toappend fastDebug "Annex.Branch" ("append " ++ fromRawFilePath f) invalidateCache diff --git a/Annex/Journal.hs b/Annex/Journal.hs index 5a87d0f7e4..acf07dc959 100644 --- a/Annex/Journal.hs +++ b/Annex/Journal.hs @@ -4,7 +4,7 @@ - git-annex branch. Among other things, it ensures that if git-annex is - interrupted, its recorded data is not lost. - - - Copyright 2011-2021 Joey Hess + - Copyright 2011-2022 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -20,6 +20,7 @@ import Annex.Perms import Annex.Tmp import Annex.LockFile import Utility.Directory.Stream +import qualified Utility.RawFilePath as R import qualified Data.Set as S import qualified Data.ByteString.Lazy as L @@ -91,27 +92,30 @@ setJournalFile _jl ru file content = withOtherTmp $ \tmp -> do -- exists mv `catchIO` (const (createAnnexDirectory jd >> mv)) -{- Appends content to a journal file. - - - - The oldcontent is whatever is in the git-annex branch. - - When the journal file does not yet exist, the oldcontent - - is first written to the journal file. - - - - TODO: Unsafe! Does not append atomically. -} -appendJournalFile :: Journalable content => JournalLocked -> RegardingUUID -> RawFilePath -> L.ByteString -> content -> Annex () -appendJournalFile _jl ru file oldcontent toappend = do +newtype AppendableJournalFile = AppendableJournalFile (RawFilePath, RawFilePath) + +{- If the journal file does not exist, it cannot be appended to, because + - that would overwrite whatever content the file has in the git-annex + - branch. -} +checkCanAppendJournalFile :: JournalLocked -> RegardingUUID -> RawFilePath -> Annex (Maybe AppendableJournalFile) +checkCanAppendJournalFile _jl ru file = do jd <- fromRepo =<< ifM (regardingPrivateUUID ru) ( return gitAnnexPrivateJournalDir , return gitAnnexJournalDir ) - let jfile = fromRawFilePath $ jd P. journalFile file - let write = liftIO $ ifM (doesFileExist jfile) - ( withFile jfile AppendMode $ \h -> - writeJournalHandle h toappend - , withFile jfile WriteMode $ \h -> do - writeJournalHandle h oldcontent - writeJournalHandle h toappend + let jfile = jd P. journalFile file + ifM (liftIO $ R.doesPathExist jfile) + ( return (Just (AppendableJournalFile (jd, jfile))) + , return Nothing ) + +{- Appends content to an existing journal file. + - + - TODO: Unsafe! Does not append atomically. -} +appendJournalFile :: Journalable content => JournalLocked -> AppendableJournalFile -> content -> Annex () +appendJournalFile _jl (AppendableJournalFile (jd, jfile)) content = do + let write = liftIO $ withFile (fromRawFilePath jfile) AppendMode $ \h -> + writeJournalHandle h content write `catchIO` (const (createAnnexDirectory jd >> write)) data JournalledContent diff --git a/CHANGELOG b/CHANGELOG index 1c2c561f33..9f6d22fcd7 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -19,6 +19,8 @@ git-annex (10.20220625) UNRELEASED; urgency=medium importtree=yes or exporttree=yes. * Fix a reversion that prevented --batch commands (and the assistant) from noticing data written to the journal by other commands. + * Added annex.alwayscompact setting which can be unset to speed up + writes to the git-annex branch in some cases. -- Joey Hess Tue, 28 Jun 2022 14:49:17 -0400 diff --git a/Logs/Web.hs b/Logs/Web.hs index a84fc1cc76..cce855f105 100644 --- a/Logs/Web.hs +++ b/Logs/Web.hs @@ -57,7 +57,11 @@ getUrlsWithPrefix key prefix = filter (prefix `isPrefixOf`) setUrlPresent :: Key -> URLString -> Annex () setUrlPresent key url = do - us <- getUrls key + -- Avoid reading the url log when not compacting, for speed. + us <- ifM (annexAlwaysCompact <$> Annex.getGitConfig) + ( getUrls key + , pure mempty + ) unless (url `elem` us) $ do config <- Annex.getGitConfig addLog (Annex.Branch.RegardingUUID []) (urlLogFile config key) diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs index 097a015c31..a97fb00b68 100644 --- a/Types/GitConfig.hs +++ b/Types/GitConfig.hs @@ -82,6 +82,7 @@ data GitConfig = GitConfig , annexBloomAccuracy :: Maybe Int , annexSshCaching :: Maybe Bool , annexAlwaysCommit :: Bool + , annexAlwaysCompact :: Bool , annexCommitMessage :: Maybe String , annexMergeAnnexBranches :: Bool , annexDelayAdd :: Maybe Int @@ -164,6 +165,7 @@ extractGitConfig configsource r = GitConfig , annexBloomAccuracy = getmayberead (annexConfig "bloomaccuracy") , annexSshCaching = getmaybebool (annexConfig "sshcaching") , annexAlwaysCommit = getbool (annexConfig "alwayscommit") True + , annexAlwaysCompact = getbool (annexConfig "alwayscompact") True , annexCommitMessage = getmaybe (annexConfig "commitmessage") , annexMergeAnnexBranches = getbool (annexConfig "merge-annex-branches") True , annexDelayAdd = getmayberead (annexConfig "delayadd") diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index cbaa52d223..45c7f9f0b2 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -1038,6 +1038,19 @@ repository, using [[git-annex-config]]. See its man page for a list.) This works well in combination with annex.alwayscommit=false, to gather up a set of changes and commit them with a message you specify. +* `annex.alwayscompact` + + By default, git-annex compacts data it records in the git-annex branch. + Setting this to false avoids doing that compaction in some cases, which + can speed up operations that populate the git-annex branch with a lot + of data. However, when used with operations that overwrite old values in + the git-annex branch, that may cause the git-annex branch to use more disk + space, or slow down reading data from it. + + An example of a command that can be sped up by using + `-c annex.alwayscompact=false` is `git-annex registerurl --batch`, + when adding a large number of urls to the same key. + * `annex.allowsign` By default git-annex avoids gpg signing commits that it makes when