Merge branch 'append'
This commit is contained in:
commit
05b96a1acf
9 changed files with 194 additions and 20 deletions
|
@ -1,6 +1,6 @@
|
|||
{- management of the git-annex branch
|
||||
-
|
||||
- Copyright 2011-2021 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2011-2022 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
@ -23,6 +23,8 @@ module Annex.Branch (
|
|||
getUnmergedRefs,
|
||||
RegardingUUID(..),
|
||||
change,
|
||||
ChangeOrAppend(..),
|
||||
changeOrAppend,
|
||||
maybeChange,
|
||||
commitMessage,
|
||||
createMessage,
|
||||
|
@ -48,7 +50,7 @@ import Control.Concurrent (threadDelay)
|
|||
import Control.Concurrent.MVar
|
||||
import qualified System.FilePath.ByteString as P
|
||||
|
||||
import Annex.Common
|
||||
import Annex.Common hiding (append)
|
||||
import Types.BranchState
|
||||
import Annex.BranchState
|
||||
import Annex.Journal
|
||||
|
@ -404,6 +406,56 @@ maybeChange ru file f = lockJournal $ \jl -> do
|
|||
in when (v /= b) $ set jl ru file b
|
||||
_ -> noop
|
||||
|
||||
data ChangeOrAppend t = Change t | Append t
|
||||
|
||||
{- Applies a function that can either modify the content of the file,
|
||||
- or append to the file. Appending can be more efficient when several
|
||||
- lines are written to a file in succession.
|
||||
-
|
||||
- When annex.alwayscompact=false, the function is not passed the content
|
||||
- of the journal file when the journal file already exists, and whatever
|
||||
- value it provides is always appended to the journal file. That avoids
|
||||
- reading the journal file, and so can be faster when many lines are being
|
||||
- written to it. The information that is recorded will be effectively the
|
||||
- same, only obsolate log lines will not get compacted.
|
||||
-
|
||||
- Currently, only appends when annex.alwayscompact=false. That is to
|
||||
- avoid appending when an older version of git-annex is also in use in the
|
||||
- same repository. An interrupted append could leave the journal file in a
|
||||
- state that would confuse the older version. This is planned to be
|
||||
- changed in a future repository version.
|
||||
-}
|
||||
changeOrAppend :: Journalable content => RegardingUUID -> RawFilePath -> (L.ByteString -> ChangeOrAppend content) -> Annex ()
|
||||
changeOrAppend ru file f = lockJournal $ \jl ->
|
||||
checkCanAppendJournalFile jl ru file >>= \case
|
||||
Just appendable -> ifM (annexAlwaysCompact <$> Annex.getGitConfig)
|
||||
( do
|
||||
oldc <- getToChange ru file
|
||||
case f oldc of
|
||||
Change newc -> set jl ru file newc
|
||||
Append toappend ->
|
||||
set jl ru file $
|
||||
oldc <> journalableByteString toappend
|
||||
-- Use this instead in v11
|
||||
-- or whatever.
|
||||
-- append jl file appendable toappend
|
||||
, case f mempty of
|
||||
-- Append even though a change was
|
||||
-- requested; since mempty was passed in,
|
||||
-- the lines requested to change are
|
||||
-- minimized.
|
||||
Change newc -> append jl file appendable newc
|
||||
Append toappend -> append jl file appendable toappend
|
||||
)
|
||||
Nothing -> do
|
||||
oldc <- getToChange ru file
|
||||
case f oldc of
|
||||
Change newc -> set jl ru file newc
|
||||
-- Journal file does not exist yet, so
|
||||
-- cannot append and have to write it all.
|
||||
Append toappend -> set jl ru file $
|
||||
oldc <> journalableByteString toappend
|
||||
|
||||
{- Only get private information when the RegardingUUID is itself private. -}
|
||||
getToChange :: RegardingUUID -> RawFilePath -> Annex L.ByteString
|
||||
getToChange ru f = flip getLocal' f . GetPrivate =<< regardingPrivateUUID ru
|
||||
|
@ -428,6 +480,14 @@ set jl ru f c = do
|
|||
-- a log file immediately after writing it.
|
||||
invalidateCache
|
||||
|
||||
{- Appends content to the journal file. -}
|
||||
append :: Journalable content => JournalLocked -> RawFilePath -> AppendableJournalFile -> content -> Annex ()
|
||||
append jl f appendable toappend = do
|
||||
journalChanged
|
||||
appendJournalFile jl appendable toappend
|
||||
fastDebug "Annex.Branch" ("append " ++ fromRawFilePath f)
|
||||
invalidateCache
|
||||
|
||||
{- Commit message used when making a commit of whatever data has changed
|
||||
- to the git-annex brach. -}
|
||||
commitMessage :: Annex String
|
||||
|
|
|
@ -4,12 +4,16 @@
|
|||
- git-annex branch. Among other things, it ensures that if git-annex is
|
||||
- interrupted, its recorded data is not lost.
|
||||
-
|
||||
- Copyright 2011-2021 Joey Hess <id@joeyh.name>
|
||||
- All files in the journal must be a series of lines separated by
|
||||
- newlines.
|
||||
-
|
||||
- Copyright 2011-2022 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE CPP #-}
|
||||
|
||||
module Annex.Journal where
|
||||
|
||||
|
@ -20,6 +24,7 @@ import Annex.Perms
|
|||
import Annex.Tmp
|
||||
import Annex.LockFile
|
||||
import Utility.Directory.Stream
|
||||
import qualified Utility.RawFilePath as R
|
||||
|
||||
import qualified Data.Set as S
|
||||
import qualified Data.ByteString.Lazy as L
|
||||
|
@ -71,9 +76,9 @@ privateUUIDsKnown' = not . S.null . annexPrivateRepos . Annex.gitconfig
|
|||
- Using the journal, rather than immediatly staging content to the index
|
||||
- avoids git needing to rewrite the index after every change.
|
||||
-
|
||||
- The file in the journal is updated atomically, which allows
|
||||
- getJournalFileStale to always return a consistent journal file
|
||||
- content, although possibly not the most current one.
|
||||
- The file in the journal is updated atomically. This avoids an
|
||||
- interrupted write truncating information that was earlier read from the
|
||||
- file, and so losing data.
|
||||
-}
|
||||
setJournalFile :: Journalable content => JournalLocked -> RegardingUUID -> RawFilePath -> content -> Annex ()
|
||||
setJournalFile _jl ru file content = withOtherTmp $ \tmp -> do
|
||||
|
@ -91,6 +96,50 @@ setJournalFile _jl ru file content = withOtherTmp $ \tmp -> do
|
|||
-- exists
|
||||
mv `catchIO` (const (createAnnexDirectory jd >> mv))
|
||||
|
||||
newtype AppendableJournalFile = AppendableJournalFile (RawFilePath, RawFilePath)
|
||||
|
||||
{- If the journal file does not exist, it cannot be appended to, because
|
||||
- that would overwrite whatever content the file has in the git-annex
|
||||
- branch. -}
|
||||
checkCanAppendJournalFile :: JournalLocked -> RegardingUUID -> RawFilePath -> Annex (Maybe AppendableJournalFile)
|
||||
checkCanAppendJournalFile _jl ru file = do
|
||||
jd <- fromRepo =<< ifM (regardingPrivateUUID ru)
|
||||
( return gitAnnexPrivateJournalDir
|
||||
, return gitAnnexJournalDir
|
||||
)
|
||||
let jfile = jd P.</> journalFile file
|
||||
ifM (liftIO $ R.doesPathExist jfile)
|
||||
( return (Just (AppendableJournalFile (jd, jfile)))
|
||||
, return Nothing
|
||||
)
|
||||
|
||||
{- Appends content to an existing journal file.
|
||||
-
|
||||
- Appends are not necessarily atomic, though short appends often are.
|
||||
- So, when this is interrupted, it can leave only part of the content
|
||||
- written to the file. To deal with that situation, both this and
|
||||
- getJournalFileStale check if the file ends with a newline, and if
|
||||
- not discard the incomplete line.
|
||||
-
|
||||
- Due to the lack of atomicity, this should not be used when multiple
|
||||
- lines need to be written to the file as an atomic unit.
|
||||
-}
|
||||
appendJournalFile :: Journalable content => JournalLocked -> AppendableJournalFile -> content -> Annex ()
|
||||
appendJournalFile _jl (AppendableJournalFile (jd, jfile)) content = do
|
||||
let write = liftIO $ withFile (fromRawFilePath jfile) ReadWriteMode $ \h -> do
|
||||
sz <- hFileSize h
|
||||
when (sz /= 0) $ do
|
||||
hSeek h SeekFromEnd (-1)
|
||||
lastchar <- B.hGet h 1
|
||||
unless (lastchar == "\n") $ do
|
||||
hSeek h AbsoluteSeek 0
|
||||
goodpart <- L.length . discardIncompleteAppend
|
||||
<$> L.hGet h (fromIntegral sz)
|
||||
hSetFileSize h (fromIntegral goodpart)
|
||||
hSeek h SeekFromEnd 0
|
||||
writeJournalHandle h content
|
||||
write `catchIO` (const (createAnnexDirectory jd >> write))
|
||||
|
||||
data JournalledContent
|
||||
= NoJournalledContent
|
||||
| JournalledContent L.ByteString
|
||||
|
@ -110,15 +159,17 @@ getJournalFile _jl = getJournalFileStale
|
|||
data GetPrivate = GetPrivate Bool
|
||||
|
||||
{- Without locking, this is not guaranteed to be the most recent
|
||||
- version of the file in the journal, so should not be used as a basis for
|
||||
- changes.
|
||||
- content of the file in the journal, so should not be used as a basis for
|
||||
- making changes to the file.
|
||||
-
|
||||
- The file is read strictly so that its content can safely be fed into
|
||||
- an operation that modifies the file. While setJournalFile doesn't
|
||||
- write directly to journal files and so probably avoids problems with
|
||||
- writing to the same file that's being read, but there could be
|
||||
- concurrency or other issues with a lazy read, and the minor loss of
|
||||
- laziness doesn't matter much, as the files are not very large.
|
||||
- an operation that modifies the file (when getJournalFile calls this).
|
||||
- The minor loss of laziness doesn't matter much, as the files are not
|
||||
- very large.
|
||||
-
|
||||
- To recover from an append of a line that is interrupted part way through
|
||||
- (or is in progress when this is called), if the file content does not end
|
||||
- with a newline, it is truncated back to the previous newline.
|
||||
-}
|
||||
getJournalFileStale :: GetPrivate -> RawFilePath -> Annex JournalledContent
|
||||
getJournalFileStale (GetPrivate getprivate) file = do
|
||||
|
@ -145,7 +196,22 @@ getJournalFileStale (GetPrivate getprivate) file = do
|
|||
where
|
||||
jfile = journalFile file
|
||||
getfrom d = catchMaybeIO $
|
||||
L.fromStrict <$> B.readFile (fromRawFilePath (d P.</> jfile))
|
||||
discardIncompleteAppend
|
||||
<$> L.readFile (fromRawFilePath (d P.</> jfile))
|
||||
|
||||
-- Note that this forces read of the whole lazy bytestring.
|
||||
discardIncompleteAppend :: L.ByteString -> L.ByteString
|
||||
discardIncompleteAppend v
|
||||
| L.null v = v
|
||||
| L.last v == nl = v
|
||||
| otherwise = dropwhileend (/= nl) v
|
||||
where
|
||||
nl = fromIntegral (ord '\n')
|
||||
#if MIN_VERSION_bytestring(0,11,2)
|
||||
dropwhileend = L.dropWhileEnd
|
||||
#else
|
||||
dropwhileend p = L.reverse . L.dropWhile p . L.reverse
|
||||
#endif
|
||||
|
||||
{- List of existing journal files in a journal directory, but without locking,
|
||||
- may miss new ones just being added, or may have false positives if the
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
git-annex (10.20220625) UNRELEASED; urgency=medium
|
||||
git-annex (10.20220724) UNRELEASED; urgency=medium
|
||||
|
||||
* Improve handling of parallelization with -J when copying content
|
||||
from/to a git remote that is a local path.
|
||||
|
@ -19,6 +19,9 @@ git-annex (10.20220625) UNRELEASED; urgency=medium
|
|||
importtree=yes or exporttree=yes.
|
||||
* Fix a reversion that prevented --batch commands (and the assistant)
|
||||
from noticing data written to the journal by other commands.
|
||||
* Added annex.alwayscompact setting which can be unset to speed up
|
||||
writes to the git-annex branch in some cases. See its documentation
|
||||
for important notes on when it's appropariate to use.
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Tue, 28 Jun 2022 14:49:17 -0400
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
- A line of the log will look like: "date N INFO"
|
||||
- Where N=1 when the INFO is present, 0 otherwise.
|
||||
-
|
||||
- Copyright 2010-2021 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2010-2022 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
@ -35,10 +35,13 @@ addLog ru file logstatus loginfo =
|
|||
|
||||
addLog' :: Annex.Branch.RegardingUUID -> RawFilePath -> LogStatus -> LogInfo -> CandidateVectorClock -> Annex ()
|
||||
addLog' ru file logstatus loginfo c =
|
||||
Annex.Branch.change ru file $ \b ->
|
||||
Annex.Branch.changeOrAppend ru file $ \b ->
|
||||
let old = parseLog b
|
||||
line = genLine logstatus loginfo c old
|
||||
in buildLog $ compactLog (line : old)
|
||||
in if isNewInfo line old
|
||||
then Annex.Branch.Append $ buildLog [line]
|
||||
else Annex.Branch.Change $ buildLog $
|
||||
compactLog (line : old)
|
||||
|
||||
{- When a LogLine already exists with the same status and info, but an
|
||||
- older timestamp, that LogLine is preserved, rather than updating the log
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{- git-annex presence log, pure operations
|
||||
-
|
||||
- Copyright 2010-2019 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2010-2021 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
@ -91,6 +91,12 @@ mapLog = M.elems
|
|||
logMap :: [LogLine] -> LogMap
|
||||
logMap = foldr insertNewerLogLine M.empty
|
||||
|
||||
{- Check if the info of the given line is not in the list of LogLines. -}
|
||||
isNewInfo :: LogLine -> [LogLine] -> Bool
|
||||
isNewInfo l old = not (any issame old)
|
||||
where
|
||||
issame l' = info l' == info l
|
||||
|
||||
insertBetter :: (LogLine -> Bool) -> LogLine -> LogMap -> Maybe LogMap
|
||||
insertBetter betterthan l m
|
||||
| better = Just (M.insert i l m)
|
||||
|
|
|
@ -57,7 +57,11 @@ getUrlsWithPrefix key prefix = filter (prefix `isPrefixOf`)
|
|||
|
||||
setUrlPresent :: Key -> URLString -> Annex ()
|
||||
setUrlPresent key url = do
|
||||
us <- getUrls key
|
||||
-- Avoid reading the url log when not compacting, for speed.
|
||||
us <- ifM (annexAlwaysCompact <$> Annex.getGitConfig)
|
||||
( getUrls key
|
||||
, pure mempty
|
||||
)
|
||||
unless (url `elem` us) $ do
|
||||
config <- Annex.getGitConfig
|
||||
addLog (Annex.Branch.RegardingUUID []) (urlLogFile config key)
|
||||
|
|
|
@ -82,6 +82,7 @@ data GitConfig = GitConfig
|
|||
, annexBloomAccuracy :: Maybe Int
|
||||
, annexSshCaching :: Maybe Bool
|
||||
, annexAlwaysCommit :: Bool
|
||||
, annexAlwaysCompact :: Bool
|
||||
, annexCommitMessage :: Maybe String
|
||||
, annexMergeAnnexBranches :: Bool
|
||||
, annexDelayAdd :: Maybe Int
|
||||
|
@ -164,6 +165,7 @@ extractGitConfig configsource r = GitConfig
|
|||
, annexBloomAccuracy = getmayberead (annexConfig "bloomaccuracy")
|
||||
, annexSshCaching = getmaybebool (annexConfig "sshcaching")
|
||||
, annexAlwaysCommit = getbool (annexConfig "alwayscommit") True
|
||||
, annexAlwaysCompact = getbool (annexConfig "alwayscompact") True
|
||||
, annexCommitMessage = getmaybe (annexConfig "commitmessage")
|
||||
, annexMergeAnnexBranches = getbool (annexConfig "merge-annex-branches") True
|
||||
, annexDelayAdd = getmayberead (annexConfig "delayadd")
|
||||
|
|
|
@ -1038,6 +1038,24 @@ repository, using [[git-annex-config]]. See its man page for a list.)
|
|||
This works well in combination with annex.alwayscommit=false,
|
||||
to gather up a set of changes and commit them with a message you specify.
|
||||
|
||||
* `annex.alwayscompact`
|
||||
|
||||
By default, git-annex compacts data it records in the git-annex branch.
|
||||
Setting this to false avoids doing that compaction in some cases, which
|
||||
can speed up operations that populate the git-annex branch with a lot
|
||||
of data. However, when used with operations that overwrite old values in
|
||||
the git-annex branch, that may cause the git-annex branch to use more disk
|
||||
space, and so slow down reading data from it.
|
||||
|
||||
An example of a command that can be sped up by using
|
||||
`-c annex.alwayscompact=false` is `git-annex registerurl --batch`,
|
||||
when adding a large number of urls to the same key.
|
||||
|
||||
This option was first supported by git-annex version 10.20220724.
|
||||
It is not entirely safe to set this option in a repository that may also
|
||||
be used by an older version of git-annex at the same time as a version
|
||||
that supports this option.
|
||||
|
||||
* `annex.allowsign`
|
||||
|
||||
By default git-annex avoids gpg signing commits that it makes when
|
||||
|
|
12
doc/todo/v11_changes.mdwn
Normal file
12
doc/todo/v11_changes.mdwn
Normal file
|
@ -0,0 +1,12 @@
|
|||
This is a todo for collecting changes that could lead to a v11 repository
|
||||
version.
|
||||
|
||||
* Append to journal files even when annex.alwayscompact=true.
|
||||
This can make it a lot faster in some cases.
|
||||
See note in Annex.Branch.changeOrAppend.
|
||||
|
||||
It's important that this only happen when no git-annex version
|
||||
older than 10.20220724 can plausibly be running in a repository
|
||||
after upgrading to the repo version that enables this. Depending on the
|
||||
timing of v11, this may need to be put in a v12 upgrade that is delayed
|
||||
some amount of time (eg 1 year) after v11.
|
Loading…
Reference in a new issue