From eac4e9391b4a8bb363e87c2bfb2acc89f3de17d0 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 15 Aug 2024 11:50:01 -0400 Subject: [PATCH] finalize RepoSize database Including locking on creation, handling of permissions errors, and setting repo sizes. I'm confident that locking is not needed while using this database. Since writes happen in a single transaction. When there are two writers that are recording sizes based on different git-annex branch commits, one will overwrite what the other one recorded. Which is fine, it's only necessary that the database stays consistent with the content of a git-annex branch commit. --- Annex/Locations.hs | 5 ++ Database/RepoSize.hs | 116 ++++++++++++++++++++++---------- doc/todo/git-annex_proxies.mdwn | 2 - 3 files changed, 86 insertions(+), 37 deletions(-) diff --git a/Annex/Locations.hs b/Annex/Locations.hs index 0b1ad4d556..18b4dd2ed0 100644 --- a/Annex/Locations.hs +++ b/Annex/Locations.hs @@ -76,6 +76,7 @@ module Annex.Locations ( gitAnnexImportFeedDbDir, gitAnnexImportFeedDbLock, gitAnnexRepoSizeDbDir, + gitAnnexRepoSizeDbLock, gitAnnexScheduleState, gitAnnexTransferDir, gitAnnexCredsDir, @@ -521,6 +522,10 @@ gitAnnexRepoSizeDbDir :: Git.Repo -> GitConfig -> RawFilePath gitAnnexRepoSizeDbDir r c = fromMaybe (gitAnnexDir r) (annexDbDir c) P. "reposize" +{- Lock file for the reposize database. -} +gitAnnexRepoSizeDbLock :: Git.Repo -> GitConfig -> RawFilePath +gitAnnexRepoSizeDbLock r c = gitAnnexRepoSizeDbDir r c <> ".lck" + {- .git/annex/schedulestate is used to store information about when - scheduled jobs were last run. -} gitAnnexScheduleState :: Git.Repo -> RawFilePath diff --git a/Database/RepoSize.hs b/Database/RepoSize.hs index c4a6814e1a..ff70affbfc 100644 --- a/Database/RepoSize.hs +++ b/Database/RepoSize.hs @@ -22,17 +22,18 @@ module Database.RepoSize ( RepoSizeHandle, openDb, closeDb, - getRepoSizes, - setRepoSize, - updateRepoSize, + getRepoSizes, + setRepoSizes, ) where +import Annex.Common +import Annex.LockFile import Types.RepoSize -import Database.Types () +import Git.Types import qualified Database.Queue as H import Database.Init -import Annex.Locations -import Annex.Common +import Database.Utility +import Database.Types import qualified Utility.RawFilePath as R import Database.Persist.Sql hiding (Key) @@ -40,62 +41,107 @@ import Database.Persist.TH import qualified System.FilePath.ByteString as P import qualified Data.Map as M -newtype RepoSizeHandle = RepoSizeHandle H.DbQueue +newtype RepoSizeHandle = RepoSizeHandle (Maybe H.DbQueue) share [mkPersist sqlSettings, mkMigrate "migrateRepoSizes"] [persistLowerCase| +-- Corresponds to location log information from the git-annex branch. RepoSizes repo UUID size Integer UniqueRepo repo +-- The last git-annex branch commit that was used to update RepoSizes. +AnnexBranch + commit SSha + UniqueCommit commit |] {- Opens the database, creating it if it doesn't exist yet. - - - No locking is done by this, so caller must prevent multiple processes - - running this at the same time. + - Multiple readers and writers can have the database open at the same + - time. Database.Handle deals with the concurrency issues. + - The lock is held while opening the database, so that when + - the database doesn't exist yet, one caller wins the lock and + - can create it undisturbed. -} openDb :: Annex RepoSizeHandle openDb = do - dbdir <- calcRepo' gitAnnexRepoSizeDbDir - let db = dbdir P. "db" - unlessM (liftIO $ R.doesPathExist db) $ do - initDb db $ void $ - runMigrationSilent migrateRepoSizes - h <- liftIO $ H.openDbQueue db "reposizes" - return $ RepoSizeHandle h + lck <- calcRepo' gitAnnexRepoSizeDbLock + catchPermissionDenied permerr $ withExclusiveLock lck $ do + dbdir <- calcRepo' gitAnnexRepoSizeDbDir + let db = dbdir P. "db" + unlessM (liftIO $ R.doesPathExist db) $ do + initDb db $ void $ + runMigrationSilent migrateRepoSizes + h <- liftIO $ H.openDbQueue db "repo_sizes" + return $ RepoSizeHandle (Just h) + where + -- If permissions don't allow opening the database, + -- just don't use it. Since this database is just a cache + -- of information available in the git-annex branch, the same + -- information can be queried from the branch, though much less + -- efficiently. + permerr _e = return (RepoSizeHandle Nothing) closeDb :: RepoSizeHandle -> Annex () -closeDb (RepoSizeHandle h) = liftIO $ H.closeDbQueue h +closeDb (RepoSizeHandle (Just h)) = liftIO $ H.closeDbQueue h +closeDb (RepoSizeHandle Nothing) = noop -{- Doesn't see changes that were just made with setRepoSize or - - updateRepoSize before flushing the queue. -} -getRepoSizes :: RepoSizeHandle -> IO (M.Map UUID RepoSize) -getRepoSizes (RepoSizeHandle h) = H.queryDbQueue h $ - M.fromList . map conv <$> getRepoSizes' +getRepoSizes :: RepoSizeHandle -> IO (M.Map UUID RepoSize, Maybe Sha) +getRepoSizes (RepoSizeHandle (Just h)) = H.queryDbQueue h $ do + sizemap <- M.fromList . map conv <$> getRepoSizes' + annexbranchsha <- getAnnexBranchCommit + return (sizemap, annexbranchsha) where conv entity = let RepoSizes u sz = entityVal entity in (u, RepoSize sz) +getRepoSizes (RepoSizeHandle Nothing) = return (mempty, Nothing) getRepoSizes' :: SqlPersistM [Entity RepoSizes] getRepoSizes' = selectList [] [] -setRepoSize :: UUID -> RepoSize -> RepoSizeHandle -> IO () -setRepoSize u (RepoSize sz) (RepoSizeHandle h) = H.queueDb h checkCommit $ +getAnnexBranchCommit :: SqlPersistM (Maybe Sha) +getAnnexBranchCommit = do + l <- selectList ([] :: [Filter AnnexBranch]) [] + case l of + (s:[]) -> return $ Just $ fromSSha $ + annexBranchCommit $ entityVal s + _ -> return Nothing + +{- Updates the recorded sizes of all repositories. + - + - This can be called without locking since the update runs in a single + - transaction. + - + - Any repositories that are not in the provided map, but do have a size + - recorded in the database will have it cleared. This is unlikely to + - happen, but ensures that the database is consistent. + -} +setRepoSizes :: RepoSizeHandle -> M.Map UUID RepoSize -> Sha -> IO () +setRepoSizes (RepoSizeHandle (Just h)) sizemap branchcommitsha = + H.queueDb h commitimmediately $ do + l <- getRepoSizes' + forM_ (map entityVal l) $ \(RepoSizes u _) -> + unless (M.member u sizemap) $ + unsetRepoSize u + forM_ (M.toList sizemap) $ + uncurry setRepoSize + recordAnnexBranchCommit branchcommitsha + where + commitimmediately _ _ = pure True +setRepoSizes (RepoSizeHandle Nothing) _ _ = noop + +setRepoSize :: UUID -> RepoSize -> SqlPersistM () +setRepoSize u (RepoSize sz) = void $ upsertBy (UniqueRepo u) (RepoSizes u sz) [RepoSizesSize =. sz] -{- Applies an offset to the size. If no size is recorded for the repo, does - - nothing. -} -updateRepoSize :: UUID -> Integer -> RepoSizeHandle -> IO () -updateRepoSize u offset (RepoSizeHandle h) = H.queueDb h checkCommit $ - void $ updateWhere - [RepoSizesRepo ==. u] - [RepoSizesSize +=. offset] +unsetRepoSize :: UUID -> SqlPersistM () +unsetRepoSize u = deleteWhere [RepoSizesRepo ==. u] -checkCommit :: H.QueueSize -> H.LastCommitTime -> IO Bool -checkCommit sz _lastcommittime - | sz > 1000 = return True - | otherwise = return False +recordAnnexBranchCommit :: Sha -> SqlPersistM () +recordAnnexBranchCommit branchcommitsha = do + deleteWhere ([] :: [Filter AnnexBranch]) + void $ insertUniqueFast $ AnnexBranch $ toSSha branchcommitsha diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index e049ed3c13..a93f462829 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -55,8 +55,6 @@ Planned schedule of work: * Goal is for limitFullyBalanced not to need to calcRepoSizes. - * Add git-annex branch sha to Database.RepoSizes. - * When Annex.reposizes does not list the size of a UUID, and that UUID's size is needed eg for balanced preferred content, use calcRepoSizes and store in