de18d92de6
This is only for checking performance, it's not safe. Sponsored-by: Dartmouth College's DANDI project
242 lines
8.8 KiB
Haskell
242 lines
8.8 KiB
Haskell
{- management of the git-annex journal
|
|
-
|
|
- The journal is used to queue up changes before they are committed to the
|
|
- git-annex branch. Among other things, it ensures that if git-annex is
|
|
- interrupted, its recorded data is not lost.
|
|
-
|
|
- Copyright 2011-2021 Joey Hess <id@joeyh.name>
|
|
-
|
|
- Licensed under the GNU AGPL version 3 or higher.
|
|
-}
|
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
module Annex.Journal where
|
|
|
|
import Annex.Common
|
|
import qualified Annex
|
|
import qualified Git
|
|
import Annex.Perms
|
|
import Annex.Tmp
|
|
import Annex.LockFile
|
|
import Utility.Directory.Stream
|
|
|
|
import qualified Data.Set as S
|
|
import qualified Data.ByteString.Lazy as L
|
|
import qualified Data.ByteString as B
|
|
import qualified System.FilePath.ByteString as P
|
|
import Data.ByteString.Builder
|
|
import Data.Char
|
|
|
|
class Journalable t where
|
|
writeJournalHandle :: Handle -> t -> IO ()
|
|
journalableByteString :: t -> L.ByteString
|
|
|
|
instance Journalable L.ByteString where
|
|
writeJournalHandle = L.hPut
|
|
journalableByteString = id
|
|
|
|
-- This is more efficient than the ByteString instance.
|
|
instance Journalable Builder where
|
|
writeJournalHandle = hPutBuilder
|
|
journalableByteString = toLazyByteString
|
|
|
|
{- When a file in the git-annex branch is changed, this indicates what
|
|
- repository UUID (or in some cases, UUIDs) a change is regarding.
|
|
-
|
|
- Using this lets changes regarding private UUIDs be stored separately
|
|
- from the git-annex branch, so its information does not get exposed
|
|
- outside the repo.
|
|
-}
|
|
data RegardingUUID = RegardingUUID [UUID]
|
|
|
|
regardingPrivateUUID :: RegardingUUID -> Annex Bool
|
|
regardingPrivateUUID (RegardingUUID []) = pure False
|
|
regardingPrivateUUID (RegardingUUID us) = do
|
|
s <- annexPrivateRepos <$> Annex.getGitConfig
|
|
return (any (flip S.member s) us)
|
|
|
|
{- Are any private UUIDs known to exist? If so, extra work has to be done,
|
|
- to check for information separately recorded for them, outside the usual
|
|
- locations.
|
|
-}
|
|
privateUUIDsKnown :: Annex Bool
|
|
privateUUIDsKnown = privateUUIDsKnown' <$> Annex.getState id
|
|
|
|
privateUUIDsKnown' :: Annex.AnnexState -> Bool
|
|
privateUUIDsKnown' = not . S.null . annexPrivateRepos . Annex.gitconfig
|
|
|
|
{- Records content for a file in the branch to the journal.
|
|
-
|
|
- Using the journal, rather than immediatly staging content to the index
|
|
- avoids git needing to rewrite the index after every change.
|
|
-
|
|
- The file in the journal is updated atomically, which allows
|
|
- getJournalFileStale to always return a consistent journal file
|
|
- content, although possibly not the most current one.
|
|
-}
|
|
setJournalFile :: Journalable content => JournalLocked -> RegardingUUID -> RawFilePath -> content -> Annex ()
|
|
setJournalFile _jl ru file content = withOtherTmp $ \tmp -> do
|
|
jd <- fromRepo =<< ifM (regardingPrivateUUID ru)
|
|
( return gitAnnexPrivateJournalDir
|
|
, return gitAnnexJournalDir
|
|
)
|
|
-- journal file is written atomically
|
|
let jfile = journalFile file
|
|
let tmpfile = tmp P.</> jfile
|
|
let write = liftIO $ do
|
|
withFile (fromRawFilePath tmpfile) WriteMode $ \h ->
|
|
writeJournalHandle h content
|
|
moveFile tmpfile (jd P.</> jfile)
|
|
-- avoid overhead of creating the journal directory when it already
|
|
-- exists
|
|
write `catchIO` (const (createAnnexDirectory jd >> write))
|
|
|
|
{- Appends content to a journal file.
|
|
-
|
|
- The oldcontent is whatever is in the git-annex branch.
|
|
- When the journal file does not yet exist, the oldcontent
|
|
- is first written to the journal file.
|
|
-
|
|
- TODO: Unsafe! Does not append atomically. -}
|
|
appendJournalFile :: Journalable content => JournalLocked -> RegardingUUID -> RawFilePath -> L.ByteString -> content -> Annex ()
|
|
appendJournalFile _jl ru file oldcontent toappend = do
|
|
jd <- fromRepo =<< ifM (regardingPrivateUUID ru)
|
|
( return gitAnnexPrivateJournalDir
|
|
, return gitAnnexJournalDir
|
|
)
|
|
let jfile = fromRawFilePath $ jd P.</> journalFile file
|
|
let write = liftIO $ ifM (doesFileExist jfile)
|
|
( withFile jfile AppendMode $ \h ->
|
|
writeJournalHandle h toappend
|
|
, withFile jfile WriteMode $ \h -> do
|
|
writeJournalHandle h oldcontent
|
|
writeJournalHandle h toappend
|
|
)
|
|
write `catchIO` (const (createAnnexDirectory jd >> write))
|
|
|
|
data JournalledContent
|
|
= NoJournalledContent
|
|
| JournalledContent L.ByteString
|
|
| PossiblyStaleJournalledContent L.ByteString
|
|
-- ^ This is used when the journalled content may have been
|
|
-- supersceded by content in the git-annex branch. The returned
|
|
-- content should be combined with content from the git-annex branch.
|
|
-- This is particularly the case when a file is in the private
|
|
-- journal, which does not get written to the git-annex branch,
|
|
-- and so the git-annex branch can contain changes to non-private
|
|
-- information that were made after that journal file was written.
|
|
|
|
{- Gets any journalled content for a file in the branch. -}
|
|
getJournalFile :: JournalLocked -> GetPrivate -> RawFilePath -> Annex JournalledContent
|
|
getJournalFile _jl = getJournalFileStale
|
|
|
|
data GetPrivate = GetPrivate Bool
|
|
|
|
{- Without locking, this is not guaranteed to be the most recent
|
|
- version of the file in the journal, so should not be used as a basis for
|
|
- changes.
|
|
-
|
|
- The file is read strictly so that its content can safely be fed into
|
|
- an operation that modifies the file. While setJournalFile doesn't
|
|
- write directly to journal files and so probably avoids problems with
|
|
- writing to the same file that's being read, but there could be
|
|
- concurrency or other issues with a lazy read, and the minor loss of
|
|
- laziness doesn't matter much, as the files are not very large.
|
|
-}
|
|
getJournalFileStale :: GetPrivate -> RawFilePath -> Annex JournalledContent
|
|
getJournalFileStale (GetPrivate getprivate) file = do
|
|
-- Optimisation to avoid a second MVar access.
|
|
st <- Annex.getState id
|
|
let g = Annex.repo st
|
|
liftIO $
|
|
if getprivate && privateUUIDsKnown' st
|
|
then do
|
|
x <- getfrom (gitAnnexJournalDir g)
|
|
getfrom (gitAnnexPrivateJournalDir g) >>= \case
|
|
Nothing -> return $ case x of
|
|
Nothing -> NoJournalledContent
|
|
Just b -> JournalledContent b
|
|
Just y -> return $ PossiblyStaleJournalledContent $ case x of
|
|
Nothing -> y
|
|
-- This concacenation is the same as
|
|
-- happens in a merge of two
|
|
-- git-annex branches.
|
|
Just x' -> x' <> y
|
|
else getfrom (gitAnnexJournalDir g) >>= return . \case
|
|
Nothing -> NoJournalledContent
|
|
Just b -> JournalledContent b
|
|
where
|
|
jfile = journalFile file
|
|
getfrom d = catchMaybeIO $
|
|
L.fromStrict <$> B.readFile (fromRawFilePath (d P.</> jfile))
|
|
|
|
{- List of existing journal files in a journal directory, but without locking,
|
|
- may miss new ones just being added, or may have false positives if the
|
|
- journal is staged as it is run. -}
|
|
getJournalledFilesStale :: (Git.Repo -> RawFilePath) -> Annex [RawFilePath]
|
|
getJournalledFilesStale getjournaldir = do
|
|
g <- gitRepo
|
|
fs <- liftIO $ catchDefaultIO [] $
|
|
getDirectoryContents $ fromRawFilePath (getjournaldir g)
|
|
return $ filter (`notElem` [".", ".."]) $
|
|
map (fileJournal . toRawFilePath) fs
|
|
|
|
{- Directory handle open on a journal directory. -}
|
|
withJournalHandle :: (Git.Repo -> RawFilePath) -> (DirectoryHandle -> IO a) -> Annex a
|
|
withJournalHandle getjournaldir a = do
|
|
d <- fromRawFilePath <$> fromRepo getjournaldir
|
|
bracketIO (openDirectory d) closeDirectory (liftIO . a)
|
|
|
|
{- Checks if there are changes in the journal. -}
|
|
journalDirty :: (Git.Repo -> RawFilePath) -> Annex Bool
|
|
journalDirty getjournaldir = do
|
|
d <- fromRawFilePath <$> fromRepo getjournaldir
|
|
liftIO $
|
|
(not <$> isDirectoryEmpty d)
|
|
`catchIO` (const $ doesDirectoryExist d)
|
|
|
|
{- Produces a filename to use in the journal for a file on the branch.
|
|
- The filename does not include the journal directory.
|
|
-
|
|
- The journal typically won't have a lot of files in it, so the hashing
|
|
- used in the branch is not necessary, and all the files are put directly
|
|
- in the journal directory.
|
|
-}
|
|
journalFile :: RawFilePath -> RawFilePath
|
|
journalFile file = B.concatMap mangle file
|
|
where
|
|
mangle c
|
|
| P.isPathSeparator c = B.singleton underscore
|
|
| c == underscore = B.pack [underscore, underscore]
|
|
| otherwise = B.singleton c
|
|
underscore = fromIntegral (ord '_')
|
|
|
|
{- Converts a journal file (relative to the journal dir) back to the
|
|
- filename on the branch. -}
|
|
fileJournal :: RawFilePath -> RawFilePath
|
|
fileJournal = go
|
|
where
|
|
go b =
|
|
let (h, t) = B.break (== underscore) b
|
|
in h <> case B.uncons t of
|
|
Nothing -> t
|
|
Just (_u, t') -> case B.uncons t' of
|
|
Nothing -> t'
|
|
Just (w, t'')
|
|
| w == underscore ->
|
|
B.cons underscore (go t'')
|
|
| otherwise ->
|
|
B.cons P.pathSeparator (go t')
|
|
|
|
underscore = fromIntegral (ord '_')
|
|
|
|
{- Sentinal value, only produced by lockJournal; required
|
|
- as a parameter by things that need to ensure the journal is
|
|
- locked. -}
|
|
data JournalLocked = ProduceJournalLocked
|
|
|
|
{- Runs an action that modifies the journal, using locking to avoid
|
|
- contention with other git-annex processes. -}
|
|
lockJournal :: (JournalLocked -> Annex a) -> Annex a
|
|
lockJournal a = withExclusiveLock gitAnnexJournalLock $ a ProduceJournalLocked
|