take maxsize into account for balanced preferred content

This is very innefficient, it will need to be optimised not to
calculate the sizes of repos every time.

Also, fixed a bug in balancedPicker that caused it to pick a too high
index when some repos were excluded due to being full.
This commit is contained in:
Joey Hess 2024-08-13 11:00:20 -04:00
parent b201792391
commit 745bc5c547
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
5 changed files with 60 additions and 11 deletions

View file

@ -21,14 +21,14 @@ type BalancedPicker = S.Set UUID -> Key -> UUID
-- The set of UUIDs provided here are all the UUIDs that are ever
-- expected to be picked amoung. A subset of that can be provided
-- when later using the BalancedPicker.
-- when later using the BalancedPicker. Neither set can be empty.
balancedPicker :: S.Set UUID -> BalancedPicker
balancedPicker s = \s' key ->
let n = calcMac tointeger HmacSha256 combineduuids (serializeKey' key)
m = fromIntegral (S.size s')
in S.elemAt (fromIntegral (n `mod` m)) s'
where
combineduuids = mconcat (map fromUUID (S.toAscList s))
m = fromIntegral (S.size s)
tointeger :: Digest a -> Integer
tointeger = foldl' (\i b -> (i `shiftL` 8) + fromIntegral b) 0

33
Annex/RepoSize.hs Normal file
View file

@ -0,0 +1,33 @@
{- git-annex repo sizes
-
- Copyright 2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Annex.RepoSize where
import Annex.Common
import Types.RepoSize
import Logs.Location
import Logs.UUID
import qualified Data.Map.Strict as M
{- Sum up the sizes of all keys in all repositories, from the information
- in the git-annex branch. Can be slow.
-
- The map includes the UUIDs of all known repositories, including
- repositories that are empty.
-}
calcRepoSizes :: Annex (M.Map UUID RepoSize)
calcRepoSizes = do
knownuuids <- M.keys <$> uuidDescMap
let startmap = M.fromList $ map (\u -> (u, RepoSize 0)) knownuuids
overLocationLogs startmap $ \k locs m ->
return $
let sz = fromMaybe 0 $ fromKey keySize k
in foldl' (flip $ M.alter $ addksz sz) m locs
where
addksz ksz (Just (RepoSize sz)) = Just $ RepoSize $ sz + ksz
addksz ksz Nothing = Just $ RepoSize ksz

View file

@ -17,6 +17,9 @@ import Annex.Content
import Annex.WorkTree
import Annex.UUID
import Annex.Magic
import Annex.RepoSize
import Types.RepoSize
import Logs.MaxSize
import Annex.Link
import Types.Link
import Logs.Trust
@ -590,14 +593,24 @@ limitBalanced mu getgroupmap groupname = do
limitFullyBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitFullyBalanced mu getgroupmap groupname = Right $ MatchFiles
{ matchAction = const $ checkKey $ \key -> do
{ matchAction = \notpresent -> checkKey $ \key -> do
gm <- getgroupmap
let groupmembers = fromMaybe S.empty $
M.lookup g (uuidsByGroup gm)
-- TODO free space checking
return $ case (mu, M.lookup g (balancedPickerByGroup gm)) of
(Just u, Just picker) -> u == picker groupmembers key
_ -> False
maxsizes <- getMaxSizes
-- XXX do not calc this every time!
sizemap <- calcRepoSizes
let hasspace u = case (M.lookup u maxsizes, M.lookup u sizemap) of
(Just (MaxSize maxsize), Just (RepoSize reposize)) ->
reposize + fromMaybe 0 (fromKey keySize key)
<= maxsize
_ -> True
let candidates = S.filter hasspace groupmembers
return $ if S.null candidates
then False
else case (mu, M.lookup g (balancedPickerByGroup gm)) of
(Just u, Just picker) -> u == picker candidates key
_ -> False
, matchNeedsFileName = False
, matchNeedsFileContent = False
, matchNeedsKey = True

View file

@ -45,13 +45,15 @@ Planned schedule of work:
Also note that "fullybalanced=foo:2" is not currently actually
implemented!
* implement size-based balancing, either as the default or as another
preferred content expression.
* implement size-based balancing, so all balanced repositories are around
the same percent full, either as the default or as another preferred
content expression.
* `git-annex info` can use maxsize to display how full repositories are
* balanced= and fullybalanced= need to limit the set of repositories to
ones with enough free space to contain a key.
* --rebalance is not stable. It will drop a key that was just stored in a
repo. Seems that limitFullyBalanced needs to take AssumeNotPresent
into account to handle dropping correctly.
* Implement [[track_free_space_in_repos_via_git-annex_branch]]:

View file

@ -574,6 +574,7 @@ Executable git-annex
Annex.Queue
Annex.ReplaceFile
Annex.RemoteTrackingBranch
Annex.RepoSize
Annex.SafeDropProof
Annex.SpecialRemote
Annex.SpecialRemote.Config