git-annex/Annex/Journal.hs
Joey Hess 928b2a4839
create journal directory in withJournalHandle
Fixes a crash by git-annex repair when .git/annex/journal/ does not exist.

Normally the journal directory is created before withJournalHandle gets
run, but git-annex repair can be run in a situation where it does not
exist.
2023-06-21 15:23:59 -04:00

294 lines
10 KiB
Haskell

{- management of the git-annex journal
-
- The journal is used to queue up changes before they are committed to the
- git-annex branch. Among other things, it ensures that if git-annex is
- interrupted, its recorded data is not lost.
-
- All files in the journal must be a series of lines separated by
- newlines.
-
- Copyright 2011-2022 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE CPP #-}
module Annex.Journal where
import Annex.Common
import qualified Annex
import qualified Git
import Annex.Perms
import Annex.Tmp
import Annex.LockFile
import Utility.Directory.Stream
import qualified Utility.RawFilePath as R
import qualified Data.Set as S
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString as B
import qualified System.FilePath.ByteString as P
import Data.ByteString.Builder
import Data.Char
class Journalable t where
writeJournalHandle :: Handle -> t -> IO ()
journalableByteString :: t -> L.ByteString
instance Journalable L.ByteString where
writeJournalHandle = L.hPut
journalableByteString = id
-- This is more efficient than the ByteString instance.
instance Journalable Builder where
writeJournalHandle = hPutBuilder
journalableByteString = toLazyByteString
{- When a file in the git-annex branch is changed, this indicates what
- repository UUID (or in some cases, UUIDs) a change is regarding.
-
- Using this lets changes regarding private UUIDs be stored separately
- from the git-annex branch, so its information does not get exposed
- outside the repo.
-}
data RegardingUUID = RegardingUUID [UUID]
regardingPrivateUUID :: RegardingUUID -> Annex Bool
regardingPrivateUUID (RegardingUUID []) = pure False
regardingPrivateUUID (RegardingUUID us) = do
s <- annexPrivateRepos <$> Annex.getGitConfig
return (any (flip S.member s) us)
{- Are any private UUIDs known to exist? If so, extra work has to be done,
- to check for information separately recorded for them, outside the usual
- locations.
-}
privateUUIDsKnown :: Annex Bool
privateUUIDsKnown = privateUUIDsKnown' <$> Annex.getState id
privateUUIDsKnown' :: Annex.AnnexState -> Bool
privateUUIDsKnown' = not . S.null . annexPrivateRepos . Annex.gitconfig
{- Records content for a file in the branch to the journal.
-
- Using the journal, rather than immediately staging content to the index
- avoids git needing to rewrite the index after every change.
-
- The file in the journal is updated atomically. This avoids an
- interrupted write truncating information that was earlier read from the
- file, and so losing data.
-}
setJournalFile :: Journalable content => JournalLocked -> RegardingUUID -> RawFilePath -> content -> Annex ()
setJournalFile _jl ru file content = withOtherTmp $ \tmp -> do
jd <- fromRepo =<< ifM (regardingPrivateUUID ru)
( return gitAnnexPrivateJournalDir
, return gitAnnexJournalDir
)
-- journal file is written atomically
let jfile = journalFile file
let tmpfile = tmp P.</> jfile
liftIO $ withFile (fromRawFilePath tmpfile) WriteMode $ \h ->
writeJournalHandle h content
let dest = jd P.</> jfile
let mv = do
liftIO $ moveFile tmpfile dest
setAnnexFilePerm dest
-- avoid overhead of creating the journal directory when it already
-- exists
mv `catchIO` (const (createAnnexDirectory jd >> mv))
newtype AppendableJournalFile = AppendableJournalFile (RawFilePath, RawFilePath)
{- If the journal file does not exist, it cannot be appended to, because
- that would overwrite whatever content the file has in the git-annex
- branch. -}
checkCanAppendJournalFile :: JournalLocked -> RegardingUUID -> RawFilePath -> Annex (Maybe AppendableJournalFile)
checkCanAppendJournalFile _jl ru file = do
jd <- fromRepo =<< ifM (regardingPrivateUUID ru)
( return gitAnnexPrivateJournalDir
, return gitAnnexJournalDir
)
let jfile = jd P.</> journalFile file
ifM (liftIO $ R.doesPathExist jfile)
( return (Just (AppendableJournalFile (jd, jfile)))
, return Nothing
)
{- Appends content to an existing journal file.
-
- Appends are not necessarily atomic, though short appends often are.
- So, when this is interrupted, it can leave only part of the content
- written to the file. To deal with that situation, both this and
- getJournalFileStale check if the file ends with a newline, and if
- not discard the incomplete line.
-
- Due to the lack of atomicity, this should not be used when multiple
- lines need to be written to the file as an atomic unit.
-}
appendJournalFile :: Journalable content => JournalLocked -> AppendableJournalFile -> content -> Annex ()
appendJournalFile _jl (AppendableJournalFile (jd, jfile)) content = do
let write = liftIO $ withFile (fromRawFilePath jfile) ReadWriteMode $ \h -> do
sz <- hFileSize h
when (sz /= 0) $ do
hSeek h SeekFromEnd (-1)
lastchar <- B.hGet h 1
unless (lastchar == "\n") $ do
hSeek h AbsoluteSeek 0
goodpart <- L.length . discardIncompleteAppend
<$> L.hGet h (fromIntegral sz)
hSetFileSize h (fromIntegral goodpart)
hSeek h SeekFromEnd 0
writeJournalHandle h content
write `catchIO` (const (createAnnexDirectory jd >> write))
data JournalledContent
= NoJournalledContent
| JournalledContent L.ByteString
| PossiblyStaleJournalledContent L.ByteString
-- ^ This is used when the journalled content may have been
-- supersceded by content in the git-annex branch. The returned
-- content should be combined with content from the git-annex branch.
-- This is particularly the case when a file is in the private
-- journal, which does not get written to the git-annex branch,
-- and so the git-annex branch can contain changes to non-private
-- information that were made after that journal file was written.
{- Gets any journalled content for a file in the branch. -}
getJournalFile :: JournalLocked -> GetPrivate -> RawFilePath -> Annex JournalledContent
getJournalFile _jl = getJournalFileStale
data GetPrivate = GetPrivate Bool
{- Without locking, this is not guaranteed to be the most recent
- content of the file in the journal, so should not be used as a basis for
- making changes to the file.
-
- The file is read strictly so that its content can safely be fed into
- an operation that modifies the file (when getJournalFile calls this).
- The minor loss of laziness doesn't matter much, as the files are not
- very large.
-
- To recover from an append of a line that is interrupted part way through
- (or is in progress when this is called), if the file content does not end
- with a newline, it is truncated back to the previous newline.
-}
getJournalFileStale :: GetPrivate -> RawFilePath -> Annex JournalledContent
getJournalFileStale (GetPrivate getprivate) file = do
-- Optimisation to avoid a second MVar access.
st <- Annex.getState id
let g = Annex.repo st
liftIO $
if getprivate && privateUUIDsKnown' st
then do
x <- getfrom (gitAnnexJournalDir g)
getfrom (gitAnnexPrivateJournalDir g) >>= \case
Nothing -> return $ case x of
Nothing -> NoJournalledContent
Just b -> JournalledContent b
Just y -> return $ PossiblyStaleJournalledContent $ case x of
Nothing -> y
-- This concacenation is the same as
-- happens in a merge of two
-- git-annex branches.
Just x' -> x' <> y
else getfrom (gitAnnexJournalDir g) >>= return . \case
Nothing -> NoJournalledContent
Just b -> JournalledContent b
where
jfile = journalFile file
getfrom d = catchMaybeIO $
discardIncompleteAppend . L.fromStrict
<$> B.readFile (fromRawFilePath (d P.</> jfile))
-- Note that this forces read of the whole lazy bytestring.
discardIncompleteAppend :: L.ByteString -> L.ByteString
discardIncompleteAppend v
| L.null v = v
| L.last v == nl = v
| otherwise = dropwhileend (/= nl) v
where
nl = fromIntegral (ord '\n')
#if MIN_VERSION_bytestring(0,11,2)
dropwhileend = L.dropWhileEnd
#else
dropwhileend p = L.reverse . L.dropWhile p . L.reverse
#endif
{- List of existing journal files in a journal directory, but without locking,
- may miss new ones just being added, or may have false positives if the
- journal is staged as it is run. -}
getJournalledFilesStale :: (Git.Repo -> RawFilePath) -> Annex [RawFilePath]
getJournalledFilesStale getjournaldir = do
g <- gitRepo
fs <- liftIO $ catchDefaultIO [] $
getDirectoryContents $ fromRawFilePath (getjournaldir g)
return $ filter (`notElem` [".", ".."]) $
map (fileJournal . toRawFilePath) fs
{- Directory handle open on a journal directory. -}
withJournalHandle :: (Git.Repo -> RawFilePath) -> (DirectoryHandle -> IO a) -> Annex a
withJournalHandle getjournaldir a = do
d <- fromRepo getjournaldir
bracket (opendir d) (liftIO . closeDirectory) (liftIO . a)
where
-- avoid overhead of creating the journal directory when it already
-- exists
opendir d = liftIO (openDirectory (fromRawFilePath d))
`catchIO` (const (createAnnexDirectory d >> opendir d))
{- Checks if there are changes in the journal. -}
journalDirty :: (Git.Repo -> RawFilePath) -> Annex Bool
journalDirty getjournaldir = do
d <- fromRawFilePath <$> fromRepo getjournaldir
liftIO $
(not <$> isDirectoryEmpty d)
`catchIO` (const $ doesDirectoryExist d)
{- Produces a filename to use in the journal for a file on the branch.
- The filename does not include the journal directory.
-
- The journal typically won't have a lot of files in it, so the hashing
- used in the branch is not necessary, and all the files are put directly
- in the journal directory.
-}
journalFile :: RawFilePath -> RawFilePath
journalFile file = B.concatMap mangle file
where
mangle c
| P.isPathSeparator c = B.singleton underscore
| c == underscore = B.pack [underscore, underscore]
| otherwise = B.singleton c
underscore = fromIntegral (ord '_')
{- Converts a journal file (relative to the journal dir) back to the
- filename on the branch. -}
fileJournal :: RawFilePath -> RawFilePath
fileJournal = go
where
go b =
let (h, t) = B.break (== underscore) b
in h <> case B.uncons t of
Nothing -> t
Just (_u, t') -> case B.uncons t' of
Nothing -> t'
Just (w, t'')
| w == underscore ->
B.cons underscore (go t'')
| otherwise ->
B.cons P.pathSeparator (go t')
underscore = fromIntegral (ord '_')
{- Sentinal value, only produced by lockJournal; required
- as a parameter by things that need to ensure the journal is
- locked. -}
data JournalLocked = ProduceJournalLocked
{- Runs an action that modifies the journal, using locking to avoid
- contention with other git-annex processes. -}
lockJournal :: (JournalLocked -> Annex a) -> Annex a
lockJournal a = do
lck <- fromRepo gitAnnexJournalLock
withExclusiveLock lck $ a ProduceJournalLocked