added reposize database

The idea is that upon a merge of the git-annex branch, or a commit to
the git-annex branch, the reposize database will be updated. So it
should always accurately reflect the location log sizes, but it will
often be behind the actual current sizes.

Annex.reposizes will start with the value from the database, and get
updated with each transfer, so it will reflect a process's best
understanding of the current sizes.

When there are multiple processes all transferring to the same repo,
Annex.reposize will not reflect transfers made by the other processes
since the current process started. So when using balanced preferred
content, it may make suboptimal choices, including trying to transfer
content to the repo when another process has already filled it up.
But this is the same as if there are multiple processes running on
ifferent machines, so is acceptable. The reposize will eventually
get an accurate value reflecting changes made by other processes or in
other repos.
This commit is contained in:
Joey Hess 2024-08-12 11:19:58 -04:00
parent 71043fe9f7
commit 99a126bebb
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
8 changed files with 130 additions and 14 deletions

View file

@ -75,7 +75,7 @@ import Types.RemoteConfig
import Types.TransferrerPool
import Types.VectorClock
import Types.Cluster
import Types.MaxSize
import Types.RepoSize
import Annex.VectorClock.Utility
import Annex.Debug.Utility
import qualified Database.Keys.Handle as Keys
@ -202,6 +202,7 @@ data AnnexState = AnnexState
, remoteconfigmap :: Maybe (M.Map UUID RemoteConfig)
, clusters :: Maybe (Annex Clusters)
, maxsizes :: Maybe (M.Map UUID MaxSize)
, reposizes :: Maybe (M.Map UUID RepoSize)
, forcetrust :: TrustMap
, trustmap :: Maybe TrustMap
, groupmap :: Maybe GroupMap
@ -257,6 +258,7 @@ newAnnexState c r = do
, remoteconfigmap = Nothing
, clusters = Nothing
, maxsizes = Nothing
, reposizes = Nothing
, forcetrust = M.empty
, trustmap = Nothing
, groupmap = Nothing

View file

@ -75,6 +75,7 @@ module Annex.Locations (
gitAnnexContentIdentifierLock,
gitAnnexImportFeedDbDir,
gitAnnexImportFeedDbLock,
gitAnnexRepoSizeDbDir,
gitAnnexScheduleState,
gitAnnexTransferDir,
gitAnnexCredsDir,
@ -515,6 +516,11 @@ gitAnnexImportFeedDbDir r c =
gitAnnexImportFeedDbLock :: Git.Repo -> GitConfig -> RawFilePath
gitAnnexImportFeedDbLock r c = gitAnnexImportFeedDbDir r c <> ".lck"
{- Directory containing reposize database. -}
gitAnnexRepoSizeDbDir :: Git.Repo -> GitConfig -> RawFilePath
gitAnnexRepoSizeDbDir r c =
fromMaybe (gitAnnexDir r) (annexDbDir c) P.</> "reposize"
{- .git/annex/schedulestate is used to store information about when
- scheduled jobs were last run. -}
gitAnnexScheduleState :: Git.Repo -> RawFilePath

View file

@ -14,6 +14,7 @@ module Database.Queue (
closeDbQueue,
flushDbQueue,
QueueSize,
LastCommitTime,
queueDb,
) where

101
Database/RepoSize.hs Normal file
View file

@ -0,0 +1,101 @@
{- Sqlite database used to track the sizes of repositories.
-
- Copyright 2024 Joey Hess <id@joeyh.name>
-:
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE QuasiQuotes, TypeFamilies, TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts #-}
{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-}
{-# LANGUAGE DataKinds, FlexibleInstances #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE UndecidableInstances #-}
{-# LANGUAGE TypeOperators #-}
#if MIN_VERSION_persistent_template(2,8,0)
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE StandaloneDeriving #-}
#endif
module Database.RepoSize (
RepoSizeHandle,
openDb,
closeDb,
getRepoSizes,
setRepoSize,
updateRepoSize,
) where
import Types.RepoSize
import Database.Types ()
import qualified Database.Queue as H
import Database.Init
import Annex.Locations
import Annex.Common
import qualified Utility.RawFilePath as R
import Database.Persist.Sql hiding (Key)
import Database.Persist.TH
import qualified System.FilePath.ByteString as P
import qualified Data.Map as M
newtype RepoSizeHandle = RepoSizeHandle H.DbQueue
share [mkPersist sqlSettings, mkMigrate "migrateRepoSizes"] [persistLowerCase|
RepoSizes
repo UUID
size Integer
UniqueRepo repo
|]
{- Opens the database, creating it if it doesn't exist yet.
-
- No locking is done by this, so caller must prevent multiple processes
- running this at the same time.
-}
openDb :: Annex RepoSizeHandle
openDb = do
dbdir <- calcRepo' gitAnnexRepoSizeDbDir
let db = dbdir P.</> "db"
unlessM (liftIO $ R.doesPathExist db) $ do
initDb db $ void $
runMigrationSilent migrateRepoSizes
h <- liftIO $ H.openDbQueue db "reposizes"
return $ RepoSizeHandle h
closeDb :: RepoSizeHandle -> Annex ()
closeDb (RepoSizeHandle h) = liftIO $ H.closeDbQueue h
{- Doesn't see changes that were just made with setRepoSize or
- updateRepoSize before flushing the queue. -}
getRepoSizes :: RepoSizeHandle -> IO (M.Map UUID RepoSize)
getRepoSizes (RepoSizeHandle h) = H.queryDbQueue h $
M.fromList . map conv <$> getRepoSizes'
where
conv entity =
let RepoSizes u sz = entityVal entity
in (u, RepoSize sz)
getRepoSizes' :: SqlPersistM [Entity RepoSizes]
getRepoSizes' = selectList [] []
setRepoSize :: UUID -> RepoSize -> RepoSizeHandle -> IO ()
setRepoSize u (RepoSize sz) (RepoSizeHandle h) = H.queueDb h checkCommit $
void $ upsertBy
(UniqueRepo u)
(RepoSizes u sz)
[RepoSizesSize =. sz]
{- Applies an offset to the size. If no size is recorded for the repo, does
- nothing. -}
updateRepoSize :: UUID -> Integer -> RepoSizeHandle -> IO ()
updateRepoSize u offset (RepoSizeHandle h) = H.queueDb h checkCommit $
void $ updateWhere
[RepoSizesRepo ==. u]
[RepoSizesSize +=. offset]
checkCommit :: H.QueueSize -> H.LastCommitTime -> IO Bool
checkCommit sz _lastcommittime
| sz > 1000 = return True
| otherwise = return False

View file

@ -13,7 +13,7 @@ module Logs.MaxSize (
import qualified Annex
import Annex.Common
import Types.MaxSize
import Types.RepoSize
import Logs
import Logs.UUIDBased
import Logs.MapLog

View file

@ -1,11 +0,0 @@
{- git-annex maxsize type
-
- Copyright 2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Types.MaxSize where
newtype MaxSize = MaxSize Integer
deriving (Show, Eq, Ord)

16
Types/RepoSize.hs Normal file
View file

@ -0,0 +1,16 @@
{- git-annex repo sizes types
-
- Copyright 2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Types.RepoSize where
-- The current size of a repo.
newtype RepoSize = RepoSize Integer
deriving (Show, Eq, Ord)
-- The maximum size of a repo.
newtype MaxSize = MaxSize Integer
deriving (Show, Eq, Ord)

View file

@ -783,6 +783,7 @@ Executable git-annex
Database.Keys.SQL
Database.Queue
Database.RawFilePath
Database.RepoSize
Database.Types
Database.Utility
Git
@ -988,7 +989,6 @@ Executable git-annex
Types.KeySource
Types.Link
Types.LockCache
Types.MaxSize
Types.Messages
Types.MetaData
Types.Mime
@ -998,6 +998,7 @@ Executable git-annex
Types.Remote
Types.RemoteConfig
Types.RemoteState
Types.RepoSize
Types.RepoVersion
Types.ScheduledActivity
Types.StandardGroups