diff --git a/Annex/RepoSize.hs b/Annex/RepoSize.hs index a75e0440ba..14b6da4a92 100644 --- a/Annex/RepoSize.hs +++ b/Annex/RepoSize.hs @@ -17,12 +17,18 @@ import qualified Annex import Annex.Branch (UnmergedBranches(..), getBranch) import Types.RepoSize import qualified Database.RepoSize as Db +import Logs import Logs.Location import Logs.UUID import Git.Types (Sha) +import Git.FilePath +import Git.CatFile +import qualified Git.DiffTree as DiffTree import Control.Concurrent +import Control.Concurrent.Async import qualified Data.Map.Strict as M +import qualified Data.Set as S {- Gets the repo size map. Cached for speed. -} getRepoSizes :: Annex (M.Map UUID RepoSize) @@ -49,10 +55,7 @@ calcRepoSizes rsv = bracket setup cleanup $ \h -> go h `onException` failed currbranchsha <- getBranch if oldbranchsha == currbranchsha then calcJournalledRepoSizes oldsizemap oldbranchsha - else do - -- XXX todo incremental update by diffing - -- from old to new branch. - calculatefromscratch h + else incrementalupdate h oldsizemap oldbranchsha currbranchsha liftIO $ putMVar rsv (Just sizemap) return sizemap @@ -62,6 +65,11 @@ calcRepoSizes rsv = bracket setup cleanup $ \h -> go h `onException` failed liftIO $ Db.setRepoSizes h sizemap branchsha calcJournalledRepoSizes sizemap branchsha + incrementalupdate h oldsizemap oldbranchsha currbranchsha = do + (sizemap, branchsha) <- diffBranchRepoSizes oldsizemap oldbranchsha currbranchsha + liftIO $ Db.setRepoSizes h sizemap branchsha + calcJournalledRepoSizes sizemap branchsha + setup = Db.openDb cleanup = Db.closeDb @@ -75,7 +83,7 @@ calcRepoSizes rsv = bracket setup cleanup $ \h -> go h `onException` failed - branch commit that was used. - - The map includes the UUIDs of all known repositories, including - - repositories that are empty. + - repositories that are empty. But clusters are not included. - - Note that private repositories, which do not get recorded in - the git-annex branch, will have 0 size. journalledRepoSizes @@ -100,8 +108,48 @@ calcJournalledRepoSizes -> Sha -> Annex (M.Map UUID RepoSize) calcJournalledRepoSizes startmap branchsha = - overLocationLogsJournal startmap branchsha accumsizes Nothing + overLocationLogsJournal startmap branchsha + (\k v m -> pure (accumRepoSizes k v m)) + Nothing + +{- Incremental update by diffing. -} +diffBranchRepoSizes :: M.Map UUID RepoSize -> Sha -> Sha -> Annex (M.Map UUID RepoSize, Sha) +diffBranchRepoSizes oldsizemap oldbranchsha newbranchsha = do + g <- Annex.gitRepo + catObjectStream g $ \feeder closer reader -> do + (l, cleanup) <- inRepo $ + DiffTree.diffTreeRecursive oldbranchsha newbranchsha + feedtid <- liftIO $ async $ do + forM_ l $ feedpairs feeder + closer + newsizemap <- readpairs 500000 reader oldsizemap Nothing + liftIO $ wait feedtid + ifM (liftIO cleanup) + ( return (newsizemap, newbranchsha) + , return (oldsizemap, oldbranchsha) + ) where - accumsizes k (newlocs, removedlocs) m = return $ - let m' = foldl' (flip $ M.alter $ addKeyRepoSize k) m newlocs - in foldl' (flip $ M.alter $ removeKeyRepoSize k) m' removedlocs + feedpairs feeder ti = + let f = getTopFilePath (DiffTree.file ti) + in case extLogFileKey locationLogExt f of + Nothing -> noop + Just k -> do + feeder (k, DiffTree.srcsha ti) + feeder (k, DiffTree.dstsha ti) + + readpairs n reader sizemap Nothing = liftIO reader >>= \case + Just (_k, oldcontent) -> readpairs n reader sizemap (Just oldcontent) + Nothing -> return sizemap + readpairs n reader sizemap (Just oldcontent) = liftIO reader >>= \case + Just (k, newcontent) -> + let prevlog = parselog oldcontent + currlog = parselog newcontent + newlocs = S.difference currlog prevlog + removedlocs = S.difference prevlog currlog + !sizemap' = accumRepoSizes k (newlocs, removedlocs) sizemap + in do + n' <- countdownToMessage n $ + showSideAction "calculating repository sizes" + readpairs n' reader sizemap' Nothing + Nothing -> return sizemap + parselog = maybe mempty (S.fromList . parseLoggedLocationsWithoutClusters) diff --git a/Annex/RepoSize/LiveUpdate.hs b/Annex/RepoSize/LiveUpdate.hs index ebd656b4bf..3b8cb40d4f 100644 --- a/Annex/RepoSize/LiveUpdate.hs +++ b/Annex/RepoSize/LiveUpdate.hs @@ -16,6 +16,7 @@ import Logs.Presence.Pure import Control.Concurrent import qualified Data.Map.Strict as M +import qualified Data.Set as S updateRepoSize :: UUID -> Key -> LogStatus -> Annex () updateRepoSize u k s = do @@ -46,3 +47,8 @@ removeKeyRepoSize k mrs = case mrs of Nothing -> Nothing where ksz = fromMaybe 0 $ fromKey keySize k + +accumRepoSizes :: Key -> (S.Set UUID, S.Set UUID) -> M.Map UUID RepoSize -> M.Map UUID RepoSize +accumRepoSizes k (newlocs, removedlocs) sizemap = + let !sizemap' = foldl' (flip $ M.alter $ addKeyRepoSize k) sizemap newlocs + in foldl' (flip $ M.alter $ removeKeyRepoSize k) sizemap' removedlocs diff --git a/Database/Keys.hs b/Database/Keys.hs index 0af2a4446c..9704b6ff4c 100644 --- a/Database/Keys.hs +++ b/Database/Keys.hs @@ -476,19 +476,11 @@ reconcileStaged dbisnew qh = ifM isBareRepo dbwriter dbchanged n catreader = liftIO catreader >>= \case Just (ka, content) -> do changed <- ka (parseLinkTargetOrPointerLazy =<< content) - !n' <- countdownToMessage n + n' <- countdownToMessage n $ + showSideAction "scanning for annexed files" dbwriter (dbchanged || changed) n' catreader Nothing -> return dbchanged - -- When the diff is large, the scan can take a while, - -- so let the user know what's going on. - countdownToMessage n - | n < 1 = return 0 - | n == 1 = do - showSideAction "scanning for annexed files" - return 0 - | otherwise = return (pred n) - -- How large is large? Too large and there will be a long -- delay before the message is shown; too short and the message -- will clutter things up unnecessarily. It's uncommon for 1000 diff --git a/Logs.hs b/Logs.hs index 91d4566bdd..52968ca575 100644 --- a/Logs.hs +++ b/Logs.hs @@ -179,7 +179,10 @@ migrationTreeGraftPoint = "migrate.tree" {- The pathname of the location log file for a given key. -} locationLogFile :: GitConfig -> Key -> RawFilePath locationLogFile config key = - branchHashDir config key P.</> keyFile key <> ".log" + branchHashDir config key P.</> keyFile key <> locationLogExt + +locationLogExt :: S.ByteString +locationLogExt = ".log" {- The filename of the url log for a given key. -} urlLogFile :: GitConfig -> Key -> RawFilePath diff --git a/Logs/Location.hs b/Logs/Location.hs index dad2ddc808..3948c71a33 100644 --- a/Logs/Location.hs +++ b/Logs/Location.hs @@ -35,6 +35,8 @@ module Logs.Location ( overLocationLogs, overLocationLogs', overLocationLogsJournal, + parseLoggedLocations, + parseLoggedLocationsWithoutClusters, ) where import Annex.Common @@ -110,7 +112,10 @@ loggedLocationsHistorical = getLoggedLocations . historicalLogInfo loggedLocationsRef :: Ref -> Annex [UUID] loggedLocationsRef ref = map (toUUID . fromLogInfo) . getLog <$> catObject ref -{- Parses the content of a log file and gets the locations in it. -} +{- Parses the content of a log file and gets the locations in it. + - + - Adds the UUIDs of any clusters whose nodes are in the list. + -} parseLoggedLocations :: Clusters -> L.ByteString -> [UUID] parseLoggedLocations clusters = addClusterUUIDs clusters . parseLoggedLocationsWithoutClusters @@ -127,7 +132,6 @@ getLoggedLocations getter key = do clusters <- getClusters return $ addClusterUUIDs clusters locs --- Add UUIDs of any clusters whose nodes are in the list. addClusterUUIDs :: Clusters -> [UUID] -> [UUID] addClusterUUIDs clusters locs | M.null clustermap = locs diff --git a/Messages.hs b/Messages.hs index 89329592dc..c6ba6ed40a 100644 --- a/Messages.hs +++ b/Messages.hs @@ -5,7 +5,7 @@ - Licensed under the GNU AGPL version 3 or higher. -} -{-# LANGUAGE OverloadedStrings, ScopedTypeVariables, CPP #-} +{-# LANGUAGE OverloadedStrings, ScopedTypeVariables, BangPatterns, CPP #-} module Messages ( showStartMessage, @@ -54,6 +54,7 @@ module Messages ( prompt, mkPrompter, sanitizeTopLevelExceptionMessages, + countdownToMessage, ) where import Control.Concurrent @@ -364,3 +365,17 @@ sanitizeTopLevelExceptionMessages a = a `catches` go e = do hPutStrLn stderr $ safeOutput $ toplevelMsg (show e) exitWith $ ExitFailure 1 + +{- Used to only run an action that displays a message after the specified + - number of steps. This is useful when performing an action that can + - sometimes take a long time, but often does not. + -} +countdownToMessage :: Int -> Annex () -> Annex Int +countdownToMessage n showmsg + | n < 1 = return 0 + | n == 1 = do + showmsg + return 0 + | otherwise = do + let !n' = pred n + return n' diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index f7da42a4bd..5d8cd5c5fa 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -30,19 +30,11 @@ Planned schedule of work: ## work notes -* Implement [[track_free_space_in_repos_via_git-annex_branch]]: - - * updateRepoSizes incrementally when the git-annex branch sha in the - database is older than the current git-annex branch. Diff from old to - new branch to efficiently update. - - Note ideas in above todo about doing this at git-annex branch merge - time to reuse the git diff done there. +* Concurrency issues with RepoSizes calculation and balanced content: * What if 2 concurrent threads are considering sending two different keys to a repo at the same time. It can hold either but not both. - It should avoid sending both in this situation. (Also discussed in - above todo) + It should avoid sending both in this situation. * There can also be a race with 2 concurrent threads where one just finished sending to a repo, but has not yet updated the location log. @@ -101,6 +93,7 @@ Planned schedule of work: * Balanced preferred content basic implementation, including --rebalance option. +* Implemented [[track_free_space_in_repos_via_git-annex_branch]] ## completed items for August's work on git-annex proxy support for exporttre diff --git a/doc/todo/optimise_git-annex_branch_merge_and_database_updates.mdwn b/doc/todo/optimise_git-annex_branch_merge_and_database_updates.mdwn new file mode 100644 index 0000000000..ddc1b67092 --- /dev/null +++ b/doc/todo/optimise_git-annex_branch_merge_and_database_updates.mdwn @@ -0,0 +1,15 @@ +When git-annex merges a remote into the git-annex branch, it uses +a CatFileHandle, making a query get the contents of each file in the +diff. It would be faster for it to use catObjectStream. +[[!commit d010ab04be5a8d74fe85a2fa27a853784d1f9009]] saw a 2x-16x +improvement to a similar process. + +Also, Database.ContentIdentifier.updateFromLog, +Database.ImportFeed.updateFromLog, and Annex.RepoSize.diffBranchRepoSizes +each do a similar diff and cat-file to update information cached from the +git-annex branch into a database. (diffBranchRepoSizes does use +catObjectStream, the others don't.) + +It seems like it might be possible to +make merging the git-annex branch do these updates in passing, and reduce +the overhead of diff and cat-file 4x. --[[Joey]] diff --git a/doc/todo/track_free_space_in_repos_via_git-annex_branch.mdwn b/doc/todo/track_free_space_in_repos_via_git-annex_branch.mdwn index 760a6de331..bace415615 100644 --- a/doc/todo/track_free_space_in_repos_via_git-annex_branch.mdwn +++ b/doc/todo/track_free_space_in_repos_via_git-annex_branch.mdwn @@ -92,6 +92,9 @@ merge time. Those are less expensive than diffing the location logs only because the logs they diff are less often used, and the work is only done when relevant commands are run. +(Opened [[todo/optimise_git-annex_branch_merge_and_database_updates]] +about that possibility.) + ## concurrency Suppose a repository is almost full. Two concurrent threads or processes @@ -106,3 +109,6 @@ sizeOfDownloadsInProgress. It would be possible to make a `sizeOfUploadsInProgressToRemote r` similarly. [[!tag projects/openneuro]] + +> Current status: This is implemented, but concurrency issues remain. +> --[[Joey]]