2012-10-19 18:25:15 +00:00
|
|
|
{- management of the git-annex journal
|
2011-12-12 22:03:28 +00:00
|
|
|
-
|
|
|
|
- The journal is used to queue up changes before they are committed to the
|
2013-10-03 18:41:57 +00:00
|
|
|
- git-annex branch. Among other things, it ensures that if git-annex is
|
2011-12-12 22:03:28 +00:00
|
|
|
- interrupted, its recorded data is not lost.
|
|
|
|
-
|
2022-07-20 16:39:03 +00:00
|
|
|
- All files in the journal must be a series of lines separated by
|
|
|
|
- newlines.
|
|
|
|
-
|
2024-05-15 21:33:38 +00:00
|
|
|
- Copyright 2011-2024 Joey Hess <id@joeyh.name>
|
2011-12-12 22:03:28 +00:00
|
|
|
-
|
2019-03-13 19:48:14 +00:00
|
|
|
- Licensed under the GNU AGPL version 3 or higher.
|
2011-12-12 22:03:28 +00:00
|
|
|
-}
|
|
|
|
|
2021-04-20 17:13:45 +00:00
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
2022-07-20 16:39:03 +00:00
|
|
|
{-# LANGUAGE CPP #-}
|
2021-04-20 17:13:45 +00:00
|
|
|
|
2011-12-12 22:03:28 +00:00
|
|
|
module Annex.Journal where
|
|
|
|
|
2016-01-20 20:36:33 +00:00
|
|
|
import Annex.Common
|
2021-04-23 18:21:57 +00:00
|
|
|
import qualified Annex
|
2011-12-12 22:03:28 +00:00
|
|
|
import qualified Git
|
2012-04-21 20:59:49 +00:00
|
|
|
import Annex.Perms
|
2019-01-17 19:40:44 +00:00
|
|
|
import Annex.Tmp
|
2014-07-10 04:32:23 +00:00
|
|
|
import Annex.LockFile
|
2024-05-15 21:33:38 +00:00
|
|
|
import Annex.BranchState
|
|
|
|
import Types.BranchState
|
2017-12-31 20:08:31 +00:00
|
|
|
import Utility.Directory.Stream
|
2022-07-18 19:50:36 +00:00
|
|
|
import qualified Utility.RawFilePath as R
|
2014-01-28 18:17:14 +00:00
|
|
|
|
2021-04-23 18:21:57 +00:00
|
|
|
import qualified Data.Set as S
|
2019-01-03 17:21:48 +00:00
|
|
|
import qualified Data.ByteString.Lazy as L
|
2021-04-23 18:21:57 +00:00
|
|
|
import qualified Data.ByteString as B
|
2019-12-11 18:12:22 +00:00
|
|
|
import qualified System.FilePath.ByteString as P
|
2019-01-03 17:21:48 +00:00
|
|
|
import Data.ByteString.Builder
|
2019-12-11 18:12:22 +00:00
|
|
|
import Data.Char
|
2019-01-03 17:21:48 +00:00
|
|
|
|
|
|
|
class Journalable t where
|
|
|
|
writeJournalHandle :: Handle -> t -> IO ()
|
|
|
|
journalableByteString :: t -> L.ByteString
|
|
|
|
|
|
|
|
instance Journalable L.ByteString where
|
|
|
|
writeJournalHandle = L.hPut
|
|
|
|
journalableByteString = id
|
|
|
|
|
|
|
|
-- This is more efficient than the ByteString instance.
|
|
|
|
instance Journalable Builder where
|
|
|
|
writeJournalHandle = hPutBuilder
|
|
|
|
journalableByteString = toLazyByteString
|
|
|
|
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
{- When a file in the git-annex branch is changed, this indicates what
|
|
|
|
- repository UUID (or in some cases, UUIDs) a change is regarding.
|
|
|
|
-
|
2021-04-23 18:21:57 +00:00
|
|
|
- Using this lets changes regarding private UUIDs be stored separately
|
|
|
|
- from the git-annex branch, so its information does not get exposed
|
|
|
|
- outside the repo.
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
-}
|
|
|
|
data RegardingUUID = RegardingUUID [UUID]
|
|
|
|
|
2021-04-23 18:21:57 +00:00
|
|
|
regardingPrivateUUID :: RegardingUUID -> Annex Bool
|
|
|
|
regardingPrivateUUID (RegardingUUID []) = pure False
|
|
|
|
regardingPrivateUUID (RegardingUUID us) = do
|
|
|
|
s <- annexPrivateRepos <$> Annex.getGitConfig
|
|
|
|
return (any (flip S.member s) us)
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
|
2021-04-23 18:21:57 +00:00
|
|
|
{- Are any private UUIDs known to exist? If so, extra work has to be done,
|
|
|
|
- to check for information separately recorded for them, outside the usual
|
|
|
|
- locations.
|
|
|
|
-}
|
|
|
|
privateUUIDsKnown :: Annex Bool
|
|
|
|
privateUUIDsKnown = privateUUIDsKnown' <$> Annex.getState id
|
|
|
|
|
|
|
|
privateUUIDsKnown' :: Annex.AnnexState -> Bool
|
|
|
|
privateUUIDsKnown' = not . S.null . annexPrivateRepos . Annex.gitconfig
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
|
2011-12-12 22:03:28 +00:00
|
|
|
{- Records content for a file in the branch to the journal.
|
|
|
|
-
|
2023-03-14 02:39:16 +00:00
|
|
|
- Using the journal, rather than immediately staging content to the index
|
2013-10-03 18:41:57 +00:00
|
|
|
- avoids git needing to rewrite the index after every change.
|
|
|
|
-
|
2022-07-20 16:39:03 +00:00
|
|
|
- The file in the journal is updated atomically. This avoids an
|
|
|
|
- interrupted write truncating information that was earlier read from the
|
|
|
|
- file, and so losing data.
|
2013-10-03 18:41:57 +00:00
|
|
|
-}
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
setJournalFile :: Journalable content => JournalLocked -> RegardingUUID -> RawFilePath -> content -> Annex ()
|
|
|
|
setJournalFile _jl ru file content = withOtherTmp $ \tmp -> do
|
2024-05-15 21:33:38 +00:00
|
|
|
st <- getState
|
2021-04-23 18:21:57 +00:00
|
|
|
jd <- fromRepo =<< ifM (regardingPrivateUUID ru)
|
2024-05-15 21:33:38 +00:00
|
|
|
( return (gitAnnexPrivateJournalDir st)
|
|
|
|
, return (gitAnnexJournalDir st)
|
2021-04-23 18:21:57 +00:00
|
|
|
)
|
2012-04-21 20:59:49 +00:00
|
|
|
-- journal file is written atomically
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
let jfile = journalFile file
|
2022-06-22 20:47:34 +00:00
|
|
|
let tmpfile = tmp P.</> jfile
|
2022-07-18 17:49:17 +00:00
|
|
|
liftIO $ withFile (fromRawFilePath tmpfile) WriteMode $ \h ->
|
|
|
|
writeJournalHandle h content
|
2023-04-26 20:22:15 +00:00
|
|
|
let dest = jd P.</> jfile
|
|
|
|
let mv = do
|
|
|
|
liftIO $ moveFile tmpfile dest
|
|
|
|
setAnnexFilePerm dest
|
2022-07-14 16:28:16 +00:00
|
|
|
-- avoid overhead of creating the journal directory when it already
|
|
|
|
-- exists
|
2022-07-18 17:49:17 +00:00
|
|
|
mv `catchIO` (const (createAnnexDirectory jd >> mv))
|
2011-12-12 22:03:28 +00:00
|
|
|
|
2022-07-18 19:50:36 +00:00
|
|
|
newtype AppendableJournalFile = AppendableJournalFile (RawFilePath, RawFilePath)
|
|
|
|
|
|
|
|
{- If the journal file does not exist, it cannot be appended to, because
|
|
|
|
- that would overwrite whatever content the file has in the git-annex
|
|
|
|
- branch. -}
|
|
|
|
checkCanAppendJournalFile :: JournalLocked -> RegardingUUID -> RawFilePath -> Annex (Maybe AppendableJournalFile)
|
|
|
|
checkCanAppendJournalFile _jl ru file = do
|
2024-05-15 21:33:38 +00:00
|
|
|
st <- getState
|
split out appending to journal from writing, high level only
Currently this is not an improvement, but it allows for optimising
appendJournalFile later. With an optimised appendJournalFile, this will
greatly speed up access patterns like git-annex addurl of a lot of urls
to the same key, where the log file can grow rather large. Appending
rather than re-writing the journal file for each line can save a lot of
disk writes.
It still has to read the current journal or branch file, to check
if it can append to it, and so when the journal file does not exist yet,
it can write the old content from the branch to it. Probably the re-reads
are better cached by the filesystem than repeated writes. (If the
re-reads turn out to keep performance bad, they could be eliminated, at
the cost of not being able to compact the log when replacing old
information in it. That could be enabled by a switch.)
While the immediate need is to affect addurl writes, it was implemented
at the level of presence logs, so will also perhaps speed up location logs.
The only added overhead is the call to isNewInfo, which only needs to
compare ByteStrings. Helping to balance that out, it avoids compactLog
when it's able to append.
Sponsored-by: Dartmouth College's DANDI project
2022-07-18 17:22:50 +00:00
|
|
|
jd <- fromRepo =<< ifM (regardingPrivateUUID ru)
|
2024-05-15 21:33:38 +00:00
|
|
|
( return (gitAnnexPrivateJournalDir st)
|
|
|
|
, return (gitAnnexJournalDir st)
|
split out appending to journal from writing, high level only
Currently this is not an improvement, but it allows for optimising
appendJournalFile later. With an optimised appendJournalFile, this will
greatly speed up access patterns like git-annex addurl of a lot of urls
to the same key, where the log file can grow rather large. Appending
rather than re-writing the journal file for each line can save a lot of
disk writes.
It still has to read the current journal or branch file, to check
if it can append to it, and so when the journal file does not exist yet,
it can write the old content from the branch to it. Probably the re-reads
are better cached by the filesystem than repeated writes. (If the
re-reads turn out to keep performance bad, they could be eliminated, at
the cost of not being able to compact the log when replacing old
information in it. That could be enabled by a switch.)
While the immediate need is to affect addurl writes, it was implemented
at the level of presence logs, so will also perhaps speed up location logs.
The only added overhead is the call to isNewInfo, which only needs to
compare ByteStrings. Helping to balance that out, it avoids compactLog
when it's able to append.
Sponsored-by: Dartmouth College's DANDI project
2022-07-18 17:22:50 +00:00
|
|
|
)
|
2022-07-18 19:50:36 +00:00
|
|
|
let jfile = jd P.</> journalFile file
|
|
|
|
ifM (liftIO $ R.doesPathExist jfile)
|
|
|
|
( return (Just (AppendableJournalFile (jd, jfile)))
|
|
|
|
, return Nothing
|
2022-07-18 17:47:56 +00:00
|
|
|
)
|
2022-07-18 19:50:36 +00:00
|
|
|
|
|
|
|
{- Appends content to an existing journal file.
|
|
|
|
-
|
2022-07-20 16:39:03 +00:00
|
|
|
- Appends are not necessarily atomic, though short appends often are.
|
|
|
|
- So, when this is interrupted, it can leave only part of the content
|
|
|
|
- written to the file. To deal with that situation, both this and
|
|
|
|
- getJournalFileStale check if the file ends with a newline, and if
|
|
|
|
- not discard the incomplete line.
|
|
|
|
-
|
|
|
|
- Due to the lack of atomicity, this should not be used when multiple
|
|
|
|
- lines need to be written to the file as an atomic unit.
|
|
|
|
-}
|
2022-07-18 19:50:36 +00:00
|
|
|
appendJournalFile :: Journalable content => JournalLocked -> AppendableJournalFile -> content -> Annex ()
|
|
|
|
appendJournalFile _jl (AppendableJournalFile (jd, jfile)) content = do
|
2022-07-20 16:39:03 +00:00
|
|
|
let write = liftIO $ withFile (fromRawFilePath jfile) ReadWriteMode $ \h -> do
|
|
|
|
sz <- hFileSize h
|
|
|
|
when (sz /= 0) $ do
|
|
|
|
hSeek h SeekFromEnd (-1)
|
|
|
|
lastchar <- B.hGet h 1
|
|
|
|
unless (lastchar == "\n") $ do
|
|
|
|
hSeek h AbsoluteSeek 0
|
|
|
|
goodpart <- L.length . discardIncompleteAppend
|
|
|
|
<$> L.hGet h (fromIntegral sz)
|
|
|
|
hSetFileSize h (fromIntegral goodpart)
|
|
|
|
hSeek h SeekFromEnd 0
|
2022-07-18 19:50:36 +00:00
|
|
|
writeJournalHandle h content
|
split out appending to journal from writing, high level only
Currently this is not an improvement, but it allows for optimising
appendJournalFile later. With an optimised appendJournalFile, this will
greatly speed up access patterns like git-annex addurl of a lot of urls
to the same key, where the log file can grow rather large. Appending
rather than re-writing the journal file for each line can save a lot of
disk writes.
It still has to read the current journal or branch file, to check
if it can append to it, and so when the journal file does not exist yet,
it can write the old content from the branch to it. Probably the re-reads
are better cached by the filesystem than repeated writes. (If the
re-reads turn out to keep performance bad, they could be eliminated, at
the cost of not being able to compact the log when replacing old
information in it. That could be enabled by a switch.)
While the immediate need is to affect addurl writes, it was implemented
at the level of presence logs, so will also perhaps speed up location logs.
The only added overhead is the call to isNewInfo, which only needs to
compare ByteStrings. Helping to balance that out, it avoids compactLog
when it's able to append.
Sponsored-by: Dartmouth College's DANDI project
2022-07-18 17:22:50 +00:00
|
|
|
write `catchIO` (const (createAnnexDirectory jd >> write))
|
|
|
|
|
2021-10-26 17:43:50 +00:00
|
|
|
data JournalledContent
|
|
|
|
= NoJournalledContent
|
|
|
|
| JournalledContent L.ByteString
|
|
|
|
| PossiblyStaleJournalledContent L.ByteString
|
|
|
|
-- ^ This is used when the journalled content may have been
|
|
|
|
-- supersceded by content in the git-annex branch. The returned
|
|
|
|
-- content should be combined with content from the git-annex branch.
|
|
|
|
-- This is particularly the case when a file is in the private
|
|
|
|
-- journal, which does not get written to the git-annex branch,
|
|
|
|
-- and so the git-annex branch can contain changes to non-private
|
|
|
|
-- information that were made after that journal file was written.
|
|
|
|
|
2011-12-12 22:03:28 +00:00
|
|
|
{- Gets any journalled content for a file in the branch. -}
|
2021-10-26 17:43:50 +00:00
|
|
|
getJournalFile :: JournalLocked -> GetPrivate -> RawFilePath -> Annex JournalledContent
|
2013-10-03 18:41:57 +00:00
|
|
|
getJournalFile _jl = getJournalFileStale
|
|
|
|
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
data GetPrivate = GetPrivate Bool
|
|
|
|
|
2013-10-03 18:41:57 +00:00
|
|
|
{- Without locking, this is not guaranteed to be the most recent
|
2022-07-20 16:39:03 +00:00
|
|
|
- content of the file in the journal, so should not be used as a basis for
|
|
|
|
- making changes to the file.
|
2019-01-03 17:21:48 +00:00
|
|
|
-
|
|
|
|
- The file is read strictly so that its content can safely be fed into
|
2022-07-20 16:39:03 +00:00
|
|
|
- an operation that modifies the file (when getJournalFile calls this).
|
|
|
|
- The minor loss of laziness doesn't matter much, as the files are not
|
|
|
|
- very large.
|
|
|
|
-
|
|
|
|
- To recover from an append of a line that is interrupted part way through
|
|
|
|
- (or is in progress when this is called), if the file content does not end
|
|
|
|
- with a newline, it is truncated back to the previous newline.
|
2019-01-03 17:21:48 +00:00
|
|
|
-}
|
2021-10-26 17:43:50 +00:00
|
|
|
getJournalFileStale :: GetPrivate -> RawFilePath -> Annex JournalledContent
|
2021-04-23 18:21:57 +00:00
|
|
|
getJournalFileStale (GetPrivate getprivate) file = do
|
|
|
|
st <- Annex.getState id
|
|
|
|
liftIO $
|
|
|
|
if getprivate && privateUUIDsKnown' st
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
then do
|
2024-05-15 21:33:38 +00:00
|
|
|
x <- getfrom (gitAnnexJournalDir (Annex.branchstate st) (Annex.repo st))
|
|
|
|
getfrom (gitAnnexPrivateJournalDir (Annex.branchstate st) (Annex.repo st)) >>= \case
|
2021-10-26 17:43:50 +00:00
|
|
|
Nothing -> return $ case x of
|
|
|
|
Nothing -> NoJournalledContent
|
|
|
|
Just b -> JournalledContent b
|
|
|
|
Just y -> return $ PossiblyStaleJournalledContent $ case x of
|
|
|
|
Nothing -> y
|
|
|
|
-- This concacenation is the same as
|
|
|
|
-- happens in a merge of two
|
|
|
|
-- git-annex branches.
|
|
|
|
Just x' -> x' <> y
|
2024-05-15 21:33:38 +00:00
|
|
|
else getfrom (gitAnnexJournalDir (Annex.branchstate st) (Annex.repo st)) >>= return . \case
|
2021-10-26 17:43:50 +00:00
|
|
|
Nothing -> NoJournalledContent
|
|
|
|
Just b -> JournalledContent b
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
where
|
|
|
|
jfile = journalFile file
|
|
|
|
getfrom d = catchMaybeIO $
|
2022-07-22 15:36:21 +00:00
|
|
|
discardIncompleteAppend . L.fromStrict
|
|
|
|
<$> B.readFile (fromRawFilePath (d P.</> jfile))
|
2022-07-20 16:39:03 +00:00
|
|
|
|
|
|
|
-- Note that this forces read of the whole lazy bytestring.
|
|
|
|
discardIncompleteAppend :: L.ByteString -> L.ByteString
|
|
|
|
discardIncompleteAppend v
|
|
|
|
| L.null v = v
|
|
|
|
| L.last v == nl = v
|
|
|
|
| otherwise = dropwhileend (/= nl) v
|
|
|
|
where
|
|
|
|
nl = fromIntegral (ord '\n')
|
|
|
|
#if MIN_VERSION_bytestring(0,11,2)
|
|
|
|
dropwhileend = L.dropWhileEnd
|
|
|
|
#else
|
|
|
|
dropwhileend p = L.reverse . L.dropWhile p . L.reverse
|
|
|
|
#endif
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
|
|
|
|
{- List of existing journal files in a journal directory, but without locking,
|
|
|
|
- may miss new ones just being added, or may have false positives if the
|
|
|
|
- journal is staged as it is run. -}
|
2024-05-15 21:33:38 +00:00
|
|
|
getJournalledFilesStale :: (BranchState -> Git.Repo -> RawFilePath) -> Annex [RawFilePath]
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
getJournalledFilesStale getjournaldir = do
|
2024-05-15 21:33:38 +00:00
|
|
|
st <- Annex.getState id
|
|
|
|
let d = getjournaldir (Annex.branchstate st) (Annex.repo st)
|
|
|
|
fs <- liftIO $ catchDefaultIO [] $
|
|
|
|
getDirectoryContents (fromRawFilePath d)
|
2019-12-11 18:12:22 +00:00
|
|
|
return $ filter (`notElem` [".", ".."]) $
|
2021-04-20 17:13:45 +00:00
|
|
|
map (fileJournal . toRawFilePath) fs
|
2018-05-08 17:54:42 +00:00
|
|
|
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
{- Directory handle open on a journal directory. -}
|
2024-05-15 21:33:38 +00:00
|
|
|
withJournalHandle :: (BranchState -> Git.Repo -> RawFilePath) -> (DirectoryHandle -> IO a) -> Annex a
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
withJournalHandle getjournaldir a = do
|
2024-05-15 21:33:38 +00:00
|
|
|
st <- Annex.getState id
|
|
|
|
let d = getjournaldir (Annex.branchstate st) (Annex.repo st)
|
2023-06-21 19:23:59 +00:00
|
|
|
bracket (opendir d) (liftIO . closeDirectory) (liftIO . a)
|
|
|
|
where
|
|
|
|
-- avoid overhead of creating the journal directory when it already
|
|
|
|
-- exists
|
|
|
|
opendir d = liftIO (openDirectory (fromRawFilePath d))
|
|
|
|
`catchIO` (const (createAnnexDirectory d >> opendir d))
|
2014-07-10 03:36:53 +00:00
|
|
|
|
2011-12-12 22:03:28 +00:00
|
|
|
{- Checks if there are changes in the journal. -}
|
2024-05-15 21:33:38 +00:00
|
|
|
journalDirty :: (BranchState -> Git.Repo -> RawFilePath) -> Annex Bool
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
journalDirty getjournaldir = do
|
2024-05-15 21:33:38 +00:00
|
|
|
st <- getState
|
|
|
|
d <- fromRawFilePath <$> fromRepo (getjournaldir st)
|
2014-07-10 04:16:53 +00:00
|
|
|
liftIO $
|
|
|
|
(not <$> isDirectoryEmpty d)
|
|
|
|
`catchIO` (const $ doesDirectoryExist d)
|
2011-12-12 22:03:28 +00:00
|
|
|
|
|
|
|
{- Produces a filename to use in the journal for a file on the branch.
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
- The filename does not include the journal directory.
|
2011-12-12 22:03:28 +00:00
|
|
|
-
|
|
|
|
- The journal typically won't have a lot of files in it, so the hashing
|
|
|
|
- used in the branch is not necessary, and all the files are put directly
|
|
|
|
- in the journal directory.
|
|
|
|
-}
|
start implementing hidden git-annex repositories
This adds a separate journal, which does not currently get committed to
an index, but is planned to be committed to .git/annex/index-private.
Changes that are regarding a UUID that is private will get written to
this journal, and so will not be published into the git-annex branch.
All log writing should have been made to indicate the UUID it's
regarding, though I've not verified this yet.
Currently, no UUIDs are treated as private yet, a way to configure that
is needed.
The implementation is careful to not add any additional IO work when
privateUUIDsKnown is False. It will skip looking at the private journal
at all. So this should be free, or nearly so, unless the feature is
used. When it is used, all branch reads will be about twice as expensive.
It is very lucky -- or very prudent design -- that Annex.Branch.change
and maybeChange are the only ways to change a file on the branch,
and Annex.Branch.set is only internal use. That let Annex.Branch.get
always yield any private information that has been recorded, without
the risk that Annex.Branch.set might be called, with a non-private UUID,
and end up leaking the private information into the git-annex branch.
And, this relies on the way git-annex union merges the git-annex branch.
When reading a file, there can be a public and a private version, and
they are just concacenated together. That will be handled the same as if
there were two diverged git-annex branches that got union merged.
2021-04-20 18:32:41 +00:00
|
|
|
journalFile :: RawFilePath -> RawFilePath
|
2021-04-23 18:21:57 +00:00
|
|
|
journalFile file = B.concatMap mangle file
|
2012-12-13 04:24:19 +00:00
|
|
|
where
|
2013-05-12 20:37:32 +00:00
|
|
|
mangle c
|
2021-04-23 18:21:57 +00:00
|
|
|
| P.isPathSeparator c = B.singleton underscore
|
|
|
|
| c == underscore = B.pack [underscore, underscore]
|
|
|
|
| otherwise = B.singleton c
|
2019-12-18 15:29:34 +00:00
|
|
|
underscore = fromIntegral (ord '_')
|
2011-12-12 22:03:28 +00:00
|
|
|
|
|
|
|
{- Converts a journal file (relative to the journal dir) back to the
|
|
|
|
- filename on the branch. -}
|
2019-12-11 18:12:22 +00:00
|
|
|
fileJournal :: RawFilePath -> RawFilePath
|
2019-12-18 15:29:34 +00:00
|
|
|
fileJournal = go
|
2019-12-11 18:12:22 +00:00
|
|
|
where
|
2019-12-18 15:29:34 +00:00
|
|
|
go b =
|
2021-04-23 18:21:57 +00:00
|
|
|
let (h, t) = B.break (== underscore) b
|
|
|
|
in h <> case B.uncons t of
|
2019-12-18 15:29:34 +00:00
|
|
|
Nothing -> t
|
2021-04-23 18:21:57 +00:00
|
|
|
Just (_u, t') -> case B.uncons t' of
|
2019-12-18 15:29:34 +00:00
|
|
|
Nothing -> t'
|
|
|
|
Just (w, t'')
|
|
|
|
| w == underscore ->
|
2021-04-23 18:21:57 +00:00
|
|
|
B.cons underscore (go t'')
|
2019-12-18 15:29:34 +00:00
|
|
|
| otherwise ->
|
2021-04-23 18:21:57 +00:00
|
|
|
B.cons P.pathSeparator (go t')
|
2019-12-18 15:29:34 +00:00
|
|
|
|
|
|
|
underscore = fromIntegral (ord '_')
|
2011-12-12 22:03:28 +00:00
|
|
|
|
2013-10-03 18:41:57 +00:00
|
|
|
{- Sentinal value, only produced by lockJournal; required
|
|
|
|
- as a parameter by things that need to ensure the journal is
|
|
|
|
- locked. -}
|
|
|
|
data JournalLocked = ProduceJournalLocked
|
|
|
|
|
2011-12-12 22:03:28 +00:00
|
|
|
{- Runs an action that modifies the journal, using locking to avoid
|
|
|
|
- contention with other git-annex processes. -}
|
2013-10-03 18:41:57 +00:00
|
|
|
lockJournal :: (JournalLocked -> Annex a) -> Annex a
|
2022-08-11 20:57:44 +00:00
|
|
|
lockJournal a = do
|
|
|
|
lck <- fromRepo gitAnnexJournalLock
|
|
|
|
withExclusiveLock lck $ a ProduceJournalLocked
|