From 6a3bd283b8af53f810982e002e435c0d7c040c59 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 23 Sep 2022 14:38:59 -0400 Subject: [PATCH] add restage log When pointer files need to be restaged, they're first written to the log, and then when the restage operation runs, it reads the log. This way, if the git-annex process is interrupted before it can do the restaging, a later git-annex process can do it. Currently, this lets a git-annex get/drop command be interrupted and then re-ran, and as long as it gets/drops additional files, it will clean up after the interrupted command. But more changes are needed to make it easier to restage after an interrupted process. Kept using the git queue to run the restage action, even though the list of files that it builds up for that action is not actually used by the action. This could perhaps be simplified to make restaging a cleanup action that gets registered, rather than using the git queue for it. But I wasn't sure if that would cause visible behavior changes, when eg dropping a large number of files, currently the git queue flushes periodically, and so it restages incrementally, rather than all at the end. In restagePointerFiles, it reads the restage log twice, once to get the number of files and size, and a second time to process it. This seemed better than reading the whole file into memory, since potentially a huge number of files could be in there. Probably the OS will cache the file in memory and there will not be much performance impact. It might be better to keep running tallies in another file though. But updating that atomically with the log seems hard. Also note that it's possible for calcRestageLog to see a different file than streamRestageLog does. More files may be added to the log in between. That is ok, it will only cause the filterprocessfaster heuristic to operate with slightly out of date information, so it may make the wrong choice for the files that got added and be a little slower than ideal. Sponsored-by: Dartmouth College's DANDI project --- Annex/Link.hs | 100 +++++++++++++++++++++++++++------------------ Annex/Locations.hs | 12 +++++- Annex/PidLock.hs | 14 ++++--- Annex/Queue.hs | 2 +- Git/Queue.hs | 6 +-- Git/UpdateIndex.hs | 22 ++++++---- Logs/File.hs | 20 +++++++++ Logs/Restage.hs | 42 ++++++++++++------- git-annex.cabal | 1 + 9 files changed, 148 insertions(+), 71 deletions(-) diff --git a/Annex/Link.hs b/Annex/Link.hs index c1d15d411e..0849993d19 100644 --- a/Annex/Link.hs +++ b/Annex/Link.hs @@ -25,6 +25,7 @@ import qualified Git.Index import qualified Git.LockFile import qualified Git.Env import qualified Git +import Logs.Restage import Git.Types import Git.FilePath import Git.Config @@ -35,7 +36,6 @@ import Utility.FileMode import Utility.InodeCache import Utility.Tmp.Dir import Utility.CopyFile -import Utility.Tuple import qualified Database.Keys.Handle import qualified Utility.RawFilePath as R @@ -155,6 +155,10 @@ newtype Restage = Restage Bool - when content is added/removed, to prevent git status from showing - it as modified. - + - The InodeCache is for the worktree file. It is used to detect when + - the worktree file is changed by something else before git update-index + - gets to look at it. + - - Asks git to refresh its index information for the file. - That in turn runs the clean filter on the file; when the clean - filter produces the same pointer that was in the index before, git @@ -165,46 +169,46 @@ newtype Restage = Restage Bool - that. So it's safe to call at any time and any situation. - - If the index is known to be locked (eg, git add has run git-annex), - - that would fail. Restage False will prevent the index being updated. - - Will display a message to help the user understand why - - the file will appear to be modified. + - that would fail. Restage False will prevent the index being updated, + - and will store it in the restage log. Displays a message to help the + - user understand why the file will appear to be modified. - - This uses the git queue, so the update is not performed immediately, - - and this can be run multiple times cheaply. - - - - The InodeCache is for the worktree file. It is used to detect when - - the worktree file is changed by something else before git update-index - - gets to look at it. + - and this can be run multiple times cheaply. Using the git queue also + - prevents building up too large a number of updates when many files + - are being processed. It's also recorded in the restage log so that, + - if the process is interrupted before the git queue is fulushed, the + - restage will be taken care of later. -} restagePointerFile :: Restage -> RawFilePath -> InodeCache -> Annex () restagePointerFile (Restage False) f orig = do + flip writeRestageLog orig =<< inRepo (toTopFilePath f) toplevelWarning True $ unableToRestage $ Just $ fromRawFilePath f -restagePointerFile (Restage True) f orig = withTSDelta $ \tsd -> +restagePointerFile (Restage True) f orig = do + flip writeRestageLog orig =<< inRepo (toTopFilePath f) -- Avoid refreshing the index if run by the -- smudge clean filter, because git uses that when -- it's already refreshing the index, probably because -- this very action is running. Running it again would likely -- deadlock. - unlessM (Annex.getState Annex.insmudgecleanfilter) $ do - -- update-index is documented as picky about "./file" and it - -- fails on "../../repo/path/file" when cwd is not in the repo - -- being acted on. Avoid these problems with an absolute path. - absf <- liftIO $ absPath f - Annex.Queue.addFlushAction restagePointerFileRunner - [(absf, isunmodified tsd, inodeCacheFileSize orig)] - where - isunmodified tsd = genInodeCache f tsd >>= return . \case - Nothing -> False - Just new -> compareStrong orig new + unlessM (Annex.getState Annex.insmudgecleanfilter) $ + Annex.Queue.addFlushAction restagePointerFileRunner [f] +restagePointerFileRunner :: Git.Queue.FlushActionRunner Annex +restagePointerFileRunner = + Git.Queue.FlushActionRunner "restagePointerFiles" $ \r _fs -> + restagePointerFiles r + +-- Restage all files in the restage log that have not been modified. +-- -- Other changes to the files may have been staged before this -- gets a chance to run. To avoid a race with any staging of -- changes, first lock the index file. Then run git update-index -- on all still-unmodified files, using a copy of the index file, -- to bypass the lock. Then replace the old index file with the new -- updated index file. -restagePointerFileRunner :: Git.Queue.FlushActionRunner Annex -restagePointerFileRunner = Git.Queue.FlushActionRunner "restagePointerFile" $ \r l -> do +restagePointerFiles :: Git.Repo -> Annex () +restagePointerFiles r = unlessM (Annex.getState Annex.insmudgecleanfilter) $ do -- Flush any queued changes to the keys database, so they -- are visible to child processes. -- The database is closed because that may improve behavior @@ -219,7 +223,10 @@ restagePointerFileRunner = Git.Queue.FlushActionRunner "restagePointerFile" $ \r showwarning = warning $ unableToRestage Nothing go Nothing = showwarning go (Just _) = withTmpDirIn (fromRawFilePath $ Git.localGitDir r) "annexindex" $ \tmpdir -> do + tsd <- getTSDelta let tmpindex = toRawFilePath (tmpdir "index") + let replaceindex = liftIO $ + moveFile tmpindex realindex let updatetmpindex = do r' <- liftIO $ Git.Env.addGitEnv r Git.Index.indexEnv =<< Git.Index.indexEnvVal tmpindex @@ -228,30 +235,45 @@ restagePointerFileRunner = Git.Queue.FlushActionRunner "restagePointerFile" $ \r [ Param "-c" , Param $ "core.safecrlf=" ++ boolConfig False ] } - configfilterprocess l $ runsGitAnnexChildProcessViaGit' r'' $ \r''' -> - liftIO $ Git.UpdateIndex.refreshIndex r''' $ \feed -> - forM_ l $ \(f', checkunmodified, _) -> - whenM checkunmodified $ - feed f' - let replaceindex = catchBoolIO $ do - moveFile tmpindex realindex + numsz <- calcRestageLog (0, 0) $ \(_f, ic) (numfiles, sizefiles) -> + (numfiles+1, sizefiles + inodeCacheFileSize ic) + configfilterprocess numsz $ runsGitAnnexChildProcessViaGit' r'' $ \r''' -> + Git.UpdateIndex.refreshIndex r''' $ \feeder -> do + let atend = do + -- wait for index write + liftIO $ feeder Nothing + replaceindex + streamRestageLog atend $ \topf ic -> do + let f = fromTopFilePath topf r''' + liftIO $ whenM (isunmodified tsd f ic) $ + feedupdateindex f feeder return True ok <- liftIO (createLinkOrCopy realindex tmpindex) - <&&> updatetmpindex - <&&> liftIO replaceindex + <&&> catchBoolIO updatetmpindex unless ok showwarning bracket lockindex unlockindex go where + isunmodified tsd f orig = + genInodeCache f tsd >>= return . \case + Nothing -> False + Just new -> compareStrong orig new + + {- update-index is documented as picky about "./file" and it + - fails on "../../repo/path/file" when cwd is not in the repo + - being acted on. Avoid these problems with an absolute path. + -} + feedupdateindex f feeder = do + absf <- absPath f + feeder (Just absf) + {- filter.annex.process configured to use git-annex filter-process - is sometimes faster and sometimes slower than using - git-annex smudge. The latter is run once per file, while - the former has the content of files piped to it. -} - filterprocessfaster l = - let numfiles = genericLength l - sizefiles = sum (map thd3 l) - -- estimates based on benchmarking - estimate_enabled = sizefiles `div` 191739611 + filterprocessfaster :: (Integer, FileSize) -> Bool + filterprocessfaster (numfiles, sizefiles) = + let estimate_enabled = sizefiles `div` 191739611 estimate_disabled = numfiles `div` 7 in estimate_enabled <= estimate_disabled @@ -263,10 +285,10 @@ restagePointerFileRunner = Git.Queue.FlushActionRunner "restagePointerFile" $ \r - case this process is terminated early, the next time this - runs it will take care of reversing the modification. -} - configfilterprocess l = bracket setup cleanup . const + configfilterprocess numsz = bracket setup cleanup . const where setup - | filterprocessfaster l = return Nothing + | filterprocessfaster numsz = return Nothing | otherwise = fromRepo (Git.Config.getMaybe ck) >>= \case Nothing -> return Nothing Just v -> do diff --git a/Annex/Locations.hs b/Annex/Locations.hs index f119c9dca0..ace33a5da2 100644 --- a/Annex/Locations.hs +++ b/Annex/Locations.hs @@ -49,6 +49,8 @@ module Annex.Locations ( gitAnnexUpgradeLock, gitAnnexSmudgeLog, gitAnnexSmudgeLock, + gitAnnexRestageLog, + gitAnnexRestageLock, gitAnnexMoveLog, gitAnnexMoveLock, gitAnnexExportDir, @@ -370,7 +372,7 @@ gitAnnexUpgradeLog r = gitAnnexDir r P. "upgrade.log" gitAnnexUpgradeLock :: Git.Repo -> RawFilePath gitAnnexUpgradeLock r = gitAnnexDir r P. "upgrade.lck" -{- .git/annex/smudge.log is used to log smudges worktree files that need to +{- .git/annex/smudge.log is used to log smudged worktree files that need to - be updated. -} gitAnnexSmudgeLog :: Git.Repo -> RawFilePath gitAnnexSmudgeLog r = gitAnnexDir r P. "smudge.log" @@ -378,6 +380,14 @@ gitAnnexSmudgeLog r = gitAnnexDir r P. "smudge.log" gitAnnexSmudgeLock :: Git.Repo -> RawFilePath gitAnnexSmudgeLock r = gitAnnexDir r P. "smudge.lck" +{- .git/annex/restage.log is used to log worktree files that need to be + - restaged in git -} +gitAnnexRestageLog :: Git.Repo -> RawFilePath +gitAnnexRestageLog r = gitAnnexDir r P. "restage.log" + +gitAnnexRestageLock :: Git.Repo -> RawFilePath +gitAnnexRestageLock r = gitAnnexDir r P. "restage.lck" + {- .git/annex/move.log is used to log moves that are in progress, - to better support resuming an interrupted move. -} gitAnnexMoveLog :: Git.Repo -> RawFilePath diff --git a/Annex/PidLock.hs b/Annex/PidLock.hs index d69b03476a..ddc7529a2b 100644 --- a/Annex/PidLock.hs +++ b/Annex/PidLock.hs @@ -106,11 +106,15 @@ runsGitAnnexChildProcessViaGit a = pidLockFile >>= \case runsGitAnnexChildProcessViaGit a = a #endif -runsGitAnnexChildProcessViaGit' :: Git.Repo -> (Git.Repo -> IO a) -> Annex a +{- Like runsGitAnnexChildProcessViaGit, but the Annex state is not + - modified. Instead the input Repo's state is modified to set the + - necessary env var when git is run in that Repo. + -} +runsGitAnnexChildProcessViaGit' :: Git.Repo -> (Git.Repo -> Annex a) -> Annex a #ifndef mingw32_HOST_OS runsGitAnnexChildProcessViaGit' r a = pidLockFile >>= \case - Nothing -> liftIO $ a r - Just pidlock -> liftIO $ bracket (setup pidlock) cleanup (go pidlock) + Nothing -> a r + Just pidlock -> bracketIO (setup pidlock) cleanup (go pidlock) where setup pidlock = fmap fst <$> PidP.tryLock' pidlock @@ -119,8 +123,8 @@ runsGitAnnexChildProcessViaGit' r a = pidLockFile >>= \case go _ Nothing = a r go pidlock (Just _h) = do - v <- PidF.pidLockEnv pidlock - r' <- addGitEnv r v PidF.pidLockEnvValue + v <- liftIO $ PidF.pidLockEnv pidlock + r' <- liftIO $ addGitEnv r v PidF.pidLockEnvValue a r' #else runsGitAnnexChildProcessViaGit' r a = liftIO $ a r diff --git a/Annex/Queue.hs b/Annex/Queue.hs index f11681cbaa..b2b28bccb5 100644 --- a/Annex/Queue.hs +++ b/Annex/Queue.hs @@ -31,7 +31,7 @@ addCommand commonparams command params files = do store =<< flushWhenFull =<< (Git.Queue.addCommand commonparams command params files q =<< gitRepo) -addFlushAction :: Git.Queue.FlushActionRunner Annex -> [(RawFilePath, IO Bool, FileSize)] -> Annex () +addFlushAction :: Git.Queue.FlushActionRunner Annex -> [RawFilePath] -> Annex () addFlushAction runner files = do q <- get store =<< flushWhenFull =<< diff --git a/Git/Queue.hs b/Git/Queue.hs index bf7d33ab3e..dd77b35bf0 100644 --- a/Git/Queue.hs +++ b/Git/Queue.hs @@ -53,11 +53,11 @@ data Action m - those will be run before the FlushAction is. -} | FlushAction { getFlushActionRunner :: FlushActionRunner m - , getFlushActionFiles :: [(RawFilePath, IO Bool, FileSize)] + , getFlushActionFiles :: [RawFilePath] } {- The String must be unique for each flush action. -} -data FlushActionRunner m = FlushActionRunner String (Repo -> [(RawFilePath, IO Bool, FileSize)] -> m ()) +data FlushActionRunner m = FlushActionRunner String (Repo -> [RawFilePath] -> m ()) instance Eq (FlushActionRunner m) where FlushActionRunner s1 _ == FlushActionRunner s2 _ = s1 == s2 @@ -140,7 +140,7 @@ addCommand commonparams subcommand params files q repo = {- Adds an flush action to the queue. This can co-exist with anything else - that gets added to the queue, and when the queue is eventually flushed, - it will be run after the other things in the queue. -} -addFlushAction :: MonadIO m => FlushActionRunner m -> [(RawFilePath, IO Bool, FileSize)] -> Queue m -> Repo -> m (Queue m) +addFlushAction :: MonadIO m => FlushActionRunner m -> [RawFilePath] -> Queue m -> Repo -> m (Queue m) addFlushAction runner files q repo = updateQueue action (const False) (length files) q repo where diff --git a/Git/UpdateIndex.hs b/Git/UpdateIndex.hs index c7e42804bf..ee34afe983 100644 --- a/Git/UpdateIndex.hs +++ b/Git/UpdateIndex.hs @@ -135,8 +135,13 @@ stageDiffTreeItem d = case toTreeItemType (Diff.dstmode d) of indexPath :: TopFilePath -> InternalGitPath indexPath = toInternalGitPath . getTopFilePath -{- Refreshes the index, by checking file stat information. -} -refreshIndex :: (MonadIO m, MonadMask m) => Repo -> ((RawFilePath -> IO ()) -> m ()) -> m Bool +{- Refreshes the index, by checking file stat information. + - + - The action is passed a callback that it can use to send filenames to + - update-index. Sending Nothing will wait for update-index to finish + - updating the index. + -} +refreshIndex :: (MonadIO m, MonadMask m) => Repo -> ((Maybe RawFilePath -> IO ()) -> m ()) -> m () refreshIndex repo feeder = bracket (liftIO $ createProcess p) (liftIO . cleanupProcess) @@ -154,9 +159,12 @@ refreshIndex repo feeder = bracket { std_in = CreatePipe } go (Just h, _, _, pid) = do - feeder $ \f -> - S.hPut h (S.snoc f 0) - liftIO $ hFlush h - liftIO $ hClose h - liftIO $ checkSuccessProcess pid + let closer = do + hFlush h + hClose h + forceSuccessProcess p pid + feeder $ \case + Just f -> S.hPut h (S.snoc f 0) + Nothing -> closer + liftIO $ closer go _ = error "internal" diff --git a/Logs/File.hs b/Logs/File.hs index 7622399108..f70f8f79d4 100644 --- a/Logs/File.hs +++ b/Logs/File.hs @@ -14,6 +14,7 @@ module Logs.File ( modifyLogFile, streamLogFile, checkLogFile, + calcLogFile, ) where import Annex.Common @@ -99,6 +100,25 @@ checkLogFile f matchf = bracket setup cleanup go !r <- liftIO (any matchf . fullLines <$> L.hGetContents h) return r +-- | Folds a function over lines of a log file to calculate a value. +-- +-- This can safely be used while appendLogFile or any atomic +-- action is concurrently modifying the file. It does not lock the file, +-- for speed, but instead relies on the fact that a log file usually +-- ends in a newline. +calcLogFile :: FilePath -> t -> (L.ByteString -> t -> t) -> Annex t +calcLogFile f start update = bracket setup cleanup go + where + setup = liftIO $ tryWhenExists $ openFile f ReadMode + cleanup Nothing = noop + cleanup (Just h) = liftIO $ hClose h + go Nothing = return start + go (Just h) = go' start =<< liftIO (fullLines <$> L.hGetContents h) + go' v [] = return v + go' v (l:ls) = do + let !v' = update l v + go' v' ls + -- | Gets only the lines that end in a newline. If the last part of a file -- does not, it's assumed to be a new line being logged that is incomplete, -- and is omitted. diff --git a/Logs/Restage.hs b/Logs/Restage.hs index 75bba857c9..4e0f3e5146 100644 --- a/Logs/Restage.hs +++ b/Logs/Restage.hs @@ -12,7 +12,9 @@ module Logs.Restage where import Annex.Common import Git.FilePath import Logs.File +import Utility.InodeCache +import qualified Data.ByteString as S import qualified Data.ByteString.Lazy as L -- | Log a file whose pointer needs to be restaged in git. @@ -23,29 +25,39 @@ writeRestageLog :: TopFilePath -> InodeCache -> Annex () writeRestageLog f ic = do logf <- fromRepo gitAnnexRestageLog lckf <- fromRepo gitAnnexRestageLock - appendLogFile logf lckf $ L.fromStrict $ - encodeBS (showInodeCache ic) <> ":" <> getTopFilePath f + appendLogFile logf lckf $ L.fromStrict $ formatRestageLog f ic -- | Streams the content of the restage log, and then empties the log at -- the end. -- --- If the action is interrupted or throws an exception, the log file is --- left unchanged. +-- If the processor or finalizer is interrupted or throws an exception, +-- the log file is left unchanged. -- -- Locking is used to prevent new items being added to the log while this -- is running. -streamSmudged :: (TopFilePath -> InodeCache -> Annex ()) -> Annex () -streamSmudged a = do +streamRestageLog :: Annex () -> (TopFilePath -> InodeCache -> Annex ()) -> Annex () +streamRestageLog finalizer processor = do logf <- fromRepo gitAnnexRestageLog lckf <- fromRepo gitAnnexRestageLock - streamLogFile (fromRawFilePath logf) lckf $ \l -> - case parse l of + streamLogFile (fromRawFilePath logf) lckf finalizer $ \l -> + case parseRestageLog l of + Just (f, ic) -> processor f ic Nothing -> noop - Just (k, f) -> a f ic - where - parse l = - let (ics, f) = separate (== ':') l - in do - ic <- readInodeCache ics - return (asTopFilePath (toRawFilePath f), ic) +calcRestageLog :: t -> ((TopFilePath, InodeCache) -> t -> t) -> Annex t +calcRestageLog start proc = do + logf <- fromRepo gitAnnexRestageLog + calcLogFile (fromRawFilePath logf) start $ \l v -> + case parseRestageLog (decodeBL l) of + Just pl -> proc pl v + Nothing -> v + +formatRestageLog :: TopFilePath -> InodeCache -> S.ByteString +formatRestageLog f ic = encodeBS (showInodeCache ic) <> ":" <> getTopFilePath f + +parseRestageLog :: String -> Maybe (TopFilePath, InodeCache) +parseRestageLog l = + let (ics, f) = separate (== ':') l + in do + ic <- readInodeCache ics + return (asTopFilePath (toRawFilePath f), ic) diff --git a/git-annex.cabal b/git-annex.cabal index 3e6fa86d55..052db279db 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -909,6 +909,7 @@ Executable git-annex Logs.Remote Logs.Remote.Pure Logs.RemoteState + Logs.Restage Logs.Schedule Logs.SingleValue Logs.SingleValue.Pure