git-annex/Database/RepoSize.hs
Joey Hess f89a1b8216
remove stale live changes from reposize database
Reorganized the reposize database directory, and split up a column.

checkStaleSizeChanges needs to run before needLiveUpdate,
otherwise the process won't be holding a lock on its pid file, and
another process could go in and expire the live update it records. It
just so happens that they do get called in the correct order, since
checking balanced preferred content calls getLiveRepoSizes before
needLiveUpdate.

The 1 minute delay between checks is arbitrary, but will avoid excess
work. The downside of it is that, if a process is dropping a file and
gets interrupted, for 1 minute another process can expect a repository
will soon be smaller than it is. And so a process might send data to a
repository when a file is not really going to be dropped from it. But
note that can already happen if a drop takes some time in eg locking and
then fails. So it seems possible that live updates should only be
allowed to increase, rather than decrease the size of a repository.
2024-08-28 13:57:25 -04:00

414 lines
14 KiB
Haskell

{- Sqlite database used to track the sizes of repositories.
-
- Copyright 2024 Joey Hess <id@joeyh.name>
-:
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE QuasiQuotes, TypeFamilies, TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts #-}
{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-}
{-# LANGUAGE DataKinds, FlexibleInstances #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE UndecidableInstances #-}
{-# LANGUAGE TypeOperators #-}
#if MIN_VERSION_persistent_template(2,8,0)
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE StandaloneDeriving #-}
#endif
module Database.RepoSize (
RepoSizeHandle,
getRepoSizeHandle,
openDb,
closeDb,
lockDbWhile,
getRepoSizes,
setRepoSizes,
startingLiveSizeChange,
successfullyFinishedLiveSizeChange,
removeStaleLiveSizeChange,
removeStaleLiveSizeChanges,
recordedRepoOffsets,
liveRepoOffsets,
) where
import Annex.Common
import qualified Annex
import Database.RepoSize.Handle
import qualified Database.Handle as H
import Database.Init
import Database.Utility
import Database.Types
import Annex.LockFile
import Git.Types
import qualified Utility.RawFilePath as R
import Database.Persist.Sql hiding (Key)
import Database.Persist.TH
import qualified System.FilePath.ByteString as P
import qualified Data.Map.Strict as M
import qualified Data.Set as S
import Control.Exception
import Control.Concurrent
share [mkPersist sqlSettings, mkMigrate "migrateRepoSizes"] [persistLowerCase|
-- Corresponds to location log information from the git-annex branch.
RepoSizes
repo UUID
size FileSize
UniqueRepo repo
-- The last git-annex branch commit that was used to update RepoSizes.
AnnexBranch
commit SSha
UniqueCommit commit
-- Changes that are currently being made that affect repo sizes.
-- (Only updated when preferred content expressions are in use that need
-- live size changes.)
LiveSizeChanges
repo UUID
key Key
changeid SizeChangeUniqueId
changepid SizeChangeProcessId
change SizeChange
UniqueLiveSizeChange repo key changeid changepid
-- A rolling total of size changes that were removed from LiveSizeChanges
-- upon successful completion.
SizeChanges
repo UUID
rollingtotal FileSize
UniqueRepoRollingTotal repo
-- The most recent size changes that were removed from LiveSizeChanges
-- upon successful completion.
RecentChanges
repo UUID
key Key
change SizeChange
UniqueRecentChange repo key
|]
{- Gets a handle to the database. It's cached in Annex state. -}
getRepoSizeHandle :: Annex RepoSizeHandle
getRepoSizeHandle = Annex.getState Annex.reposizehandle >>= \case
Just h -> return h
Nothing -> do
h <- openDb
Annex.changeState $ \s -> s { Annex.reposizehandle = Just h }
return h
{- Opens the database, creating it if it doesn't exist yet.
-
- Multiple readers and writers can have the database open at the same
- time. Database.Handle deals with the concurrency issues.
- The lock is held while opening the database, so that when
- the database doesn't exist yet, one caller wins the lock and
- can create it undisturbed.
-}
openDb :: Annex RepoSizeHandle
openDb = lockDbWhile permerr $ do
dbdir <- calcRepo' gitAnnexRepoSizeDbDir
let db = dbdir P.</> "db"
unlessM (liftIO $ R.doesPathExist db) $ do
initDb db $ void $
runMigrationSilent migrateRepoSizes
h <- liftIO $ H.openDb db "repo_sizes"
mkhandle (Just h)
where
mkhandle mh = do
livev <- liftIO $ newMVar Nothing
return $ RepoSizeHandle mh livev
-- If permissions don't allow opening the database,
-- just don't use it. Since this database is just a cache
-- of information available in the git-annex branch, the same
-- information can be queried from the branch, though much less
-- efficiently.
permerr _e = mkhandle Nothing
closeDb :: RepoSizeHandle -> Annex ()
closeDb (RepoSizeHandle (Just h) _) = liftIO $ H.closeDb h
closeDb (RepoSizeHandle Nothing _) = noop
-- This does not prevent another process that has already
-- opened the db from changing it at the same time.
lockDbWhile :: (IOException -> Annex a) -> Annex a -> Annex a
lockDbWhile permerr a = do
lck <- calcRepo' gitAnnexRepoSizeDbLock
catchPermissionDenied permerr $ withExclusiveLock lck a
{- Gets the sizes of repositories as of a commit to the git-annex
- branch. -}
getRepoSizes :: RepoSizeHandle -> IO (M.Map UUID RepoSize, Maybe Sha)
getRepoSizes (RepoSizeHandle (Just h) _) = H.queryDb h $ do
sizemap <- M.fromList <$> getRepoSizes'
annexbranchsha <- getAnnexBranchCommit
return (sizemap, annexbranchsha)
getRepoSizes (RepoSizeHandle Nothing _) = return (mempty, Nothing)
getRepoSizes' :: SqlPersistM [(UUID, RepoSize)]
getRepoSizes' = map conv <$> selectList [] []
where
conv entity =
let RepoSizes u sz = entityVal entity
in (u, RepoSize sz)
getAnnexBranchCommit :: SqlPersistM (Maybe Sha)
getAnnexBranchCommit = do
l <- selectList ([] :: [Filter AnnexBranch]) []
case l of
(s:[]) -> return $ Just $ fromSSha $
annexBranchCommit $ entityVal s
_ -> return Nothing
{- Updates the recorded sizes of all repositories.
-
- This can be called without locking since the update runs in a single
- transaction.
-
- Any repositories that are not in the provided map, but do have a size
- recorded in the database will have it cleared. This is unlikely to
- happen, but ensures that the database is consistent.
-}
setRepoSizes :: RepoSizeHandle -> M.Map UUID RepoSize -> Sha -> IO ()
setRepoSizes (RepoSizeHandle (Just h) _) sizemap branchcommitsha =
H.commitDb h $ do
l <- getRepoSizes'
forM_ (map fst l) $ \u ->
unless (M.member u sizemap) $
unsetRepoSize u
forM_ (M.toList sizemap) $
uncurry setRepoSize
clearRecentChanges
recordAnnexBranchCommit branchcommitsha
setRepoSizes (RepoSizeHandle Nothing _) _ _ = noop
setRepoSize :: UUID -> RepoSize -> SqlPersistM ()
setRepoSize u (RepoSize sz) =
void $ upsertBy
(UniqueRepo u)
(RepoSizes u sz)
[RepoSizesSize =. sz]
unsetRepoSize :: UUID -> SqlPersistM ()
unsetRepoSize u = deleteWhere [RepoSizesRepo ==. u]
recordAnnexBranchCommit :: Sha -> SqlPersistM ()
recordAnnexBranchCommit branchcommitsha = do
deleteWhere ([] :: [Filter AnnexBranch])
void $ insertUniqueFast $ AnnexBranch $ toSSha branchcommitsha
startingLiveSizeChange :: RepoSizeHandle -> UUID -> Key -> SizeChange -> SizeChangeId -> IO ()
startingLiveSizeChange (RepoSizeHandle (Just h) _) u k sc cid =
H.commitDb h $ void $ upsertBy
(UniqueLiveSizeChange u k
(sizeChangeUniqueId cid)
(sizeChangeProcessId cid))
(LiveSizeChanges u k
(sizeChangeUniqueId cid)
(sizeChangeProcessId cid)
sc)
[ LiveSizeChangesChange =. sc
, LiveSizeChangesChangeid =. sizeChangeUniqueId cid
, LiveSizeChangesChangepid =. sizeChangeProcessId cid
]
startingLiveSizeChange (RepoSizeHandle Nothing _) _ _ _ _ = noop
{- A live size change has successfully finished.
-
- Update the rolling total, add as a recent change,
- and remove the live change in the same transaction.
-
- But, it's possible that the same change has been done by two
- different processes or threads. If there is a matching recent change,
- then this one is redundant, so remove it without updating the rolling
- total.
-}
successfullyFinishedLiveSizeChange :: RepoSizeHandle -> UUID -> Key -> SizeChange -> SizeChangeId -> IO ()
successfullyFinishedLiveSizeChange (RepoSizeHandle (Just h) _) u k sc cid =
H.commitDb h $ do
getRecentChange u k >>= \case
Just sc' | sc == sc' -> remove
_ -> go
where
go = do
rollingtotal <- getSizeChangeFor u
setSizeChangeFor u (updateRollingTotal rollingtotal sc k)
addRecentChange u k sc
remove
remove = removeLiveSizeChange u k sc cid
successfullyFinishedLiveSizeChange (RepoSizeHandle Nothing _) _ _ _ _ = noop
updateRollingTotal :: FileSize -> SizeChange -> Key -> FileSize
updateRollingTotal t sc k = case sc of
AddingKey -> t + ksz
RemovingKey -> t - ksz
where
ksz = fromMaybe 0 $ fromKey keySize k
removeStaleLiveSizeChange :: RepoSizeHandle -> UUID -> Key -> SizeChange -> SizeChangeId -> IO ()
removeStaleLiveSizeChange (RepoSizeHandle (Just h) _) u k sc cid =
H.commitDb h $ removeLiveSizeChange u k sc cid
removeStaleLiveSizeChange (RepoSizeHandle Nothing _) _ _ _ _ = noop
removeLiveSizeChange :: UUID -> Key -> SizeChange -> SizeChangeId -> SqlPersistM ()
removeLiveSizeChange u k sc cid =
deleteWhere
[ LiveSizeChangesRepo ==. u
, LiveSizeChangesKey ==. k
, LiveSizeChangesChangeid ==. sizeChangeUniqueId cid
, LiveSizeChangesChangepid ==. sizeChangeProcessId cid
, LiveSizeChangesChange ==. sc
]
removeStaleLiveSizeChanges :: RepoSizeHandle -> [StaleSizeChanger] -> IO ()
removeStaleLiveSizeChanges (RepoSizeHandle (Just h) _) stale = do
let stalepids = map staleSizeChangerProcessId stale
H.commitDb h $ deleteWhere [ LiveSizeChangesChangepid <-. stalepids ]
removeStaleLiveSizeChanges (RepoSizeHandle Nothing _) _ = noop
getLiveSizeChangesMap :: SqlPersistM (M.Map UUID [(Key, (SizeChange, SizeChangeId))])
getLiveSizeChangesMap = M.fromListWith (++) . map conv <$> getLiveSizeChanges
where
conv (LiveSizeChanges u k cid pid sc) = (u, [(k, (sc, sid))])
where
sid = SizeChangeId cid pid
getLiveSizeChangesList :: SqlPersistM [(UUID, Key, SizeChange)]
getLiveSizeChangesList = map conv <$> getLiveSizeChanges
where
conv (LiveSizeChanges u k _cid _pid sc) = (u, k, sc)
getLiveSizeChanges :: SqlPersistM [LiveSizeChanges]
getLiveSizeChanges = map entityVal <$> selectList [] []
getSizeChanges :: SqlPersistM (M.Map UUID FileSize)
getSizeChanges = M.fromList . map conv <$> selectList [] []
where
conv entity =
let SizeChanges u n = entityVal entity
in (u, n)
getSizeChangeFor :: UUID -> SqlPersistM FileSize
getSizeChangeFor u = do
l <- selectList [SizeChangesRepo ==. u] []
return $ case l of
(s:_) -> sizeChangesRollingtotal $ entityVal s
[] -> 0
setSizeChangeFor :: UUID -> FileSize -> SqlPersistM ()
setSizeChangeFor u sz =
void $ upsertBy
(UniqueRepoRollingTotal u)
(SizeChanges u sz)
[SizeChangesRollingtotal =. sz]
addRecentChange :: UUID -> Key -> SizeChange -> SqlPersistM ()
addRecentChange u k sc =
void $ upsertBy
(UniqueRecentChange u k)
(RecentChanges u k sc)
[RecentChangesChange =. sc]
getRecentChange :: UUID -> Key -> SqlPersistM (Maybe SizeChange)
getRecentChange u k = do
l <- selectList
[ RecentChangesRepo ==. u
, RecentChangesKey ==. k
] []
return $ case l of
(s:_) -> Just $ recentChangesChange $ entityVal s
[] -> Nothing
getRecentChanges :: SqlPersistM [(UUID, Key, SizeChange)]
getRecentChanges = map conv <$> selectList [] []
where
conv entity =
let RecentChanges u k sc = entityVal entity
in (u, k, sc)
{- Clears recent changes, except when there is a live change that is
- redundant with a recent change. -}
clearRecentChanges :: SqlPersistM ()
clearRecentChanges = do
live <- getLiveSizeChangesList
if null live
then deleteWhere ([] :: [Filter RecentChanges])
else do
let liveset = S.fromList live
rcs <- getRecentChanges
forM_ rcs $ \rc@(u, k, sc) ->
when (S.notMember rc liveset) $
deleteWhere
[ RecentChangesRepo ==. u
, RecentChangesKey ==. k
, RecentChangesChange ==. sc
]
{- Gets the recorded offsets to sizes of Repos, not including live
- changes. -}
recordedRepoOffsets :: RepoSizeHandle -> IO (M.Map UUID SizeOffset)
recordedRepoOffsets (RepoSizeHandle (Just h) _) =
M.map SizeOffset <$> H.queryDb h getSizeChanges
recordedRepoOffsets (RepoSizeHandle Nothing _) = pure mempty
{- Gets the offsets to sizes of Repos, including all live changes that
- are happening now.
-
- This does not necessarily include all changes that have been made,
- only ones that had startingLiveSizeChange called for them will be
- included.
-
- In the unlikely case where two live changes are occurring, one
- adding a key and the other removing the same key, the one
- adding the key is used, in order to err on the side of a larger
- repository size.
-
- In the case where the same live change is recorded by two different
- processes or threads, the first to complete will record it as a recent
- change. This omits live changes that are redundant due to a recent
- change already being recorded for the same change.
-
- This is only expensive when there are a lot of live changes happening at
- the same time.
-}
liveRepoOffsets :: RepoSizeHandle -> IO (M.Map UUID SizeOffset)
liveRepoOffsets (RepoSizeHandle (Just h) _) = H.queryDb h $ do
sizechanges <- getSizeChanges
livechanges <- getLiveSizeChangesMap
let us = S.toList $ S.fromList $
M.keys sizechanges ++ M.keys livechanges
M.fromList <$> forM us (go sizechanges livechanges)
where
go sizechanges livechanges u = do
let livechangesbykey =
M.fromListWith (++) $
map (\(k, v) -> (k, [v])) $
fromMaybe [] $
M.lookup u livechanges
livechanges' <- combinelikelivechanges <$>
filterM (nonredundantlivechange livechangesbykey u)
(fromMaybe [] $ M.lookup u livechanges)
let sizechange = foldl'
(\t (k, sc) -> updateRollingTotal t sc k)
(fromMaybe 0 (M.lookup u sizechanges))
livechanges'
return (u, SizeOffset sizechange)
combinelikelivechanges =
S.elems
. S.fromList
. map (\(k, (sc, _)) -> (k, sc))
nonredundantlivechange livechangesbykey u (k, (sc, cid))
| null (competinglivechanges livechangesbykey k sc cid) =
getRecentChange u k >>= pure . \case
Nothing -> True
Just sc' -> sc /= sc'
| otherwise = pure False
competinglivechanges livechangesbykey k RemovingKey cid =
filter (\(sc', cid') -> cid /= cid' && sc' == AddingKey)
(fromMaybe [] $ M.lookup k livechangesbykey)
competinglivechanges _ _ AddingKey _ = []
liveRepoOffsets (RepoSizeHandle Nothing _) = pure mempty