Merge branch 'append'

This commit is contained in:
Joey Hess 2022-07-20 13:24:04 -04:00
commit 05b96a1acf
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
9 changed files with 194 additions and 20 deletions

View file

@ -1,6 +1,6 @@
{- management of the git-annex branch
-
- Copyright 2011-2021 Joey Hess <id@joeyh.name>
- Copyright 2011-2022 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -23,6 +23,8 @@ module Annex.Branch (
getUnmergedRefs,
RegardingUUID(..),
change,
ChangeOrAppend(..),
changeOrAppend,
maybeChange,
commitMessage,
createMessage,
@ -48,7 +50,7 @@ import Control.Concurrent (threadDelay)
import Control.Concurrent.MVar
import qualified System.FilePath.ByteString as P
import Annex.Common
import Annex.Common hiding (append)
import Types.BranchState
import Annex.BranchState
import Annex.Journal
@ -404,6 +406,56 @@ maybeChange ru file f = lockJournal $ \jl -> do
in when (v /= b) $ set jl ru file b
_ -> noop
data ChangeOrAppend t = Change t | Append t
{- Applies a function that can either modify the content of the file,
- or append to the file. Appending can be more efficient when several
- lines are written to a file in succession.
-
- When annex.alwayscompact=false, the function is not passed the content
- of the journal file when the journal file already exists, and whatever
- value it provides is always appended to the journal file. That avoids
- reading the journal file, and so can be faster when many lines are being
- written to it. The information that is recorded will be effectively the
- same, only obsolate log lines will not get compacted.
-
- Currently, only appends when annex.alwayscompact=false. That is to
- avoid appending when an older version of git-annex is also in use in the
- same repository. An interrupted append could leave the journal file in a
- state that would confuse the older version. This is planned to be
- changed in a future repository version.
-}
changeOrAppend :: Journalable content => RegardingUUID -> RawFilePath -> (L.ByteString -> ChangeOrAppend content) -> Annex ()
changeOrAppend ru file f = lockJournal $ \jl ->
checkCanAppendJournalFile jl ru file >>= \case
Just appendable -> ifM (annexAlwaysCompact <$> Annex.getGitConfig)
( do
oldc <- getToChange ru file
case f oldc of
Change newc -> set jl ru file newc
Append toappend ->
set jl ru file $
oldc <> journalableByteString toappend
-- Use this instead in v11
-- or whatever.
-- append jl file appendable toappend
, case f mempty of
-- Append even though a change was
-- requested; since mempty was passed in,
-- the lines requested to change are
-- minimized.
Change newc -> append jl file appendable newc
Append toappend -> append jl file appendable toappend
)
Nothing -> do
oldc <- getToChange ru file
case f oldc of
Change newc -> set jl ru file newc
-- Journal file does not exist yet, so
-- cannot append and have to write it all.
Append toappend -> set jl ru file $
oldc <> journalableByteString toappend
{- Only get private information when the RegardingUUID is itself private. -}
getToChange :: RegardingUUID -> RawFilePath -> Annex L.ByteString
getToChange ru f = flip getLocal' f . GetPrivate =<< regardingPrivateUUID ru
@ -428,6 +480,14 @@ set jl ru f c = do
-- a log file immediately after writing it.
invalidateCache
{- Appends content to the journal file. -}
append :: Journalable content => JournalLocked -> RawFilePath -> AppendableJournalFile -> content -> Annex ()
append jl f appendable toappend = do
journalChanged
appendJournalFile jl appendable toappend
fastDebug "Annex.Branch" ("append " ++ fromRawFilePath f)
invalidateCache
{- Commit message used when making a commit of whatever data has changed
- to the git-annex brach. -}
commitMessage :: Annex String

View file

@ -4,12 +4,16 @@
- git-annex branch. Among other things, it ensures that if git-annex is
- interrupted, its recorded data is not lost.
-
- Copyright 2011-2021 Joey Hess <id@joeyh.name>
- All files in the journal must be a series of lines separated by
- newlines.
-
- Copyright 2011-2022 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE CPP #-}
module Annex.Journal where
@ -20,6 +24,7 @@ import Annex.Perms
import Annex.Tmp
import Annex.LockFile
import Utility.Directory.Stream
import qualified Utility.RawFilePath as R
import qualified Data.Set as S
import qualified Data.ByteString.Lazy as L
@ -71,9 +76,9 @@ privateUUIDsKnown' = not . S.null . annexPrivateRepos . Annex.gitconfig
- Using the journal, rather than immediatly staging content to the index
- avoids git needing to rewrite the index after every change.
-
- The file in the journal is updated atomically, which allows
- getJournalFileStale to always return a consistent journal file
- content, although possibly not the most current one.
- The file in the journal is updated atomically. This avoids an
- interrupted write truncating information that was earlier read from the
- file, and so losing data.
-}
setJournalFile :: Journalable content => JournalLocked -> RegardingUUID -> RawFilePath -> content -> Annex ()
setJournalFile _jl ru file content = withOtherTmp $ \tmp -> do
@ -91,6 +96,50 @@ setJournalFile _jl ru file content = withOtherTmp $ \tmp -> do
-- exists
mv `catchIO` (const (createAnnexDirectory jd >> mv))
newtype AppendableJournalFile = AppendableJournalFile (RawFilePath, RawFilePath)
{- If the journal file does not exist, it cannot be appended to, because
- that would overwrite whatever content the file has in the git-annex
- branch. -}
checkCanAppendJournalFile :: JournalLocked -> RegardingUUID -> RawFilePath -> Annex (Maybe AppendableJournalFile)
checkCanAppendJournalFile _jl ru file = do
jd <- fromRepo =<< ifM (regardingPrivateUUID ru)
( return gitAnnexPrivateJournalDir
, return gitAnnexJournalDir
)
let jfile = jd P.</> journalFile file
ifM (liftIO $ R.doesPathExist jfile)
( return (Just (AppendableJournalFile (jd, jfile)))
, return Nothing
)
{- Appends content to an existing journal file.
-
- Appends are not necessarily atomic, though short appends often are.
- So, when this is interrupted, it can leave only part of the content
- written to the file. To deal with that situation, both this and
- getJournalFileStale check if the file ends with a newline, and if
- not discard the incomplete line.
-
- Due to the lack of atomicity, this should not be used when multiple
- lines need to be written to the file as an atomic unit.
-}
appendJournalFile :: Journalable content => JournalLocked -> AppendableJournalFile -> content -> Annex ()
appendJournalFile _jl (AppendableJournalFile (jd, jfile)) content = do
let write = liftIO $ withFile (fromRawFilePath jfile) ReadWriteMode $ \h -> do
sz <- hFileSize h
when (sz /= 0) $ do
hSeek h SeekFromEnd (-1)
lastchar <- B.hGet h 1
unless (lastchar == "\n") $ do
hSeek h AbsoluteSeek 0
goodpart <- L.length . discardIncompleteAppend
<$> L.hGet h (fromIntegral sz)
hSetFileSize h (fromIntegral goodpart)
hSeek h SeekFromEnd 0
writeJournalHandle h content
write `catchIO` (const (createAnnexDirectory jd >> write))
data JournalledContent
= NoJournalledContent
| JournalledContent L.ByteString
@ -110,15 +159,17 @@ getJournalFile _jl = getJournalFileStale
data GetPrivate = GetPrivate Bool
{- Without locking, this is not guaranteed to be the most recent
- version of the file in the journal, so should not be used as a basis for
- changes.
- content of the file in the journal, so should not be used as a basis for
- making changes to the file.
-
- The file is read strictly so that its content can safely be fed into
- an operation that modifies the file. While setJournalFile doesn't
- write directly to journal files and so probably avoids problems with
- writing to the same file that's being read, but there could be
- concurrency or other issues with a lazy read, and the minor loss of
- laziness doesn't matter much, as the files are not very large.
- an operation that modifies the file (when getJournalFile calls this).
- The minor loss of laziness doesn't matter much, as the files are not
- very large.
-
- To recover from an append of a line that is interrupted part way through
- (or is in progress when this is called), if the file content does not end
- with a newline, it is truncated back to the previous newline.
-}
getJournalFileStale :: GetPrivate -> RawFilePath -> Annex JournalledContent
getJournalFileStale (GetPrivate getprivate) file = do
@ -145,7 +196,22 @@ getJournalFileStale (GetPrivate getprivate) file = do
where
jfile = journalFile file
getfrom d = catchMaybeIO $
L.fromStrict <$> B.readFile (fromRawFilePath (d P.</> jfile))
discardIncompleteAppend
<$> L.readFile (fromRawFilePath (d P.</> jfile))
-- Note that this forces read of the whole lazy bytestring.
discardIncompleteAppend :: L.ByteString -> L.ByteString
discardIncompleteAppend v
| L.null v = v
| L.last v == nl = v
| otherwise = dropwhileend (/= nl) v
where
nl = fromIntegral (ord '\n')
#if MIN_VERSION_bytestring(0,11,2)
dropwhileend = L.dropWhileEnd
#else
dropwhileend p = L.reverse . L.dropWhile p . L.reverse
#endif
{- List of existing journal files in a journal directory, but without locking,
- may miss new ones just being added, or may have false positives if the

View file

@ -1,4 +1,4 @@
git-annex (10.20220625) UNRELEASED; urgency=medium
git-annex (10.20220724) UNRELEASED; urgency=medium
* Improve handling of parallelization with -J when copying content
from/to a git remote that is a local path.
@ -19,6 +19,9 @@ git-annex (10.20220625) UNRELEASED; urgency=medium
importtree=yes or exporttree=yes.
* Fix a reversion that prevented --batch commands (and the assistant)
from noticing data written to the journal by other commands.
* Added annex.alwayscompact setting which can be unset to speed up
writes to the git-annex branch in some cases. See its documentation
for important notes on when it's appropariate to use.
-- Joey Hess <id@joeyh.name> Tue, 28 Jun 2022 14:49:17 -0400

View file

@ -6,7 +6,7 @@
- A line of the log will look like: "date N INFO"
- Where N=1 when the INFO is present, 0 otherwise.
-
- Copyright 2010-2021 Joey Hess <id@joeyh.name>
- Copyright 2010-2022 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -35,10 +35,13 @@ addLog ru file logstatus loginfo =
addLog' :: Annex.Branch.RegardingUUID -> RawFilePath -> LogStatus -> LogInfo -> CandidateVectorClock -> Annex ()
addLog' ru file logstatus loginfo c =
Annex.Branch.change ru file $ \b ->
Annex.Branch.changeOrAppend ru file $ \b ->
let old = parseLog b
line = genLine logstatus loginfo c old
in buildLog $ compactLog (line : old)
in if isNewInfo line old
then Annex.Branch.Append $ buildLog [line]
else Annex.Branch.Change $ buildLog $
compactLog (line : old)
{- When a LogLine already exists with the same status and info, but an
- older timestamp, that LogLine is preserved, rather than updating the log

View file

@ -1,6 +1,6 @@
{- git-annex presence log, pure operations
-
- Copyright 2010-2019 Joey Hess <id@joeyh.name>
- Copyright 2010-2021 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -91,6 +91,12 @@ mapLog = M.elems
logMap :: [LogLine] -> LogMap
logMap = foldr insertNewerLogLine M.empty
{- Check if the info of the given line is not in the list of LogLines. -}
isNewInfo :: LogLine -> [LogLine] -> Bool
isNewInfo l old = not (any issame old)
where
issame l' = info l' == info l
insertBetter :: (LogLine -> Bool) -> LogLine -> LogMap -> Maybe LogMap
insertBetter betterthan l m
| better = Just (M.insert i l m)

View file

@ -57,7 +57,11 @@ getUrlsWithPrefix key prefix = filter (prefix `isPrefixOf`)
setUrlPresent :: Key -> URLString -> Annex ()
setUrlPresent key url = do
us <- getUrls key
-- Avoid reading the url log when not compacting, for speed.
us <- ifM (annexAlwaysCompact <$> Annex.getGitConfig)
( getUrls key
, pure mempty
)
unless (url `elem` us) $ do
config <- Annex.getGitConfig
addLog (Annex.Branch.RegardingUUID []) (urlLogFile config key)

View file

@ -82,6 +82,7 @@ data GitConfig = GitConfig
, annexBloomAccuracy :: Maybe Int
, annexSshCaching :: Maybe Bool
, annexAlwaysCommit :: Bool
, annexAlwaysCompact :: Bool
, annexCommitMessage :: Maybe String
, annexMergeAnnexBranches :: Bool
, annexDelayAdd :: Maybe Int
@ -164,6 +165,7 @@ extractGitConfig configsource r = GitConfig
, annexBloomAccuracy = getmayberead (annexConfig "bloomaccuracy")
, annexSshCaching = getmaybebool (annexConfig "sshcaching")
, annexAlwaysCommit = getbool (annexConfig "alwayscommit") True
, annexAlwaysCompact = getbool (annexConfig "alwayscompact") True
, annexCommitMessage = getmaybe (annexConfig "commitmessage")
, annexMergeAnnexBranches = getbool (annexConfig "merge-annex-branches") True
, annexDelayAdd = getmayberead (annexConfig "delayadd")

View file

@ -1038,6 +1038,24 @@ repository, using [[git-annex-config]]. See its man page for a list.)
This works well in combination with annex.alwayscommit=false,
to gather up a set of changes and commit them with a message you specify.
* `annex.alwayscompact`
By default, git-annex compacts data it records in the git-annex branch.
Setting this to false avoids doing that compaction in some cases, which
can speed up operations that populate the git-annex branch with a lot
of data. However, when used with operations that overwrite old values in
the git-annex branch, that may cause the git-annex branch to use more disk
space, and so slow down reading data from it.
An example of a command that can be sped up by using
`-c annex.alwayscompact=false` is `git-annex registerurl --batch`,
when adding a large number of urls to the same key.
This option was first supported by git-annex version 10.20220724.
It is not entirely safe to set this option in a repository that may also
be used by an older version of git-annex at the same time as a version
that supports this option.
* `annex.allowsign`
By default git-annex avoids gpg signing commits that it makes when

12
doc/todo/v11_changes.mdwn Normal file
View file

@ -0,0 +1,12 @@
This is a todo for collecting changes that could lead to a v11 repository
version.
* Append to journal files even when annex.alwayscompact=true.
This can make it a lot faster in some cases.
See note in Annex.Branch.changeOrAppend.
It's important that this only happen when no git-annex version
older than 10.20220724 can plausibly be running in a repository
after upgrading to the repo version that enables this. Depending on the
timing of v11, this may need to be put in a v12 upgrade that is delayed
some amount of time (eg 1 year) after v11.