Support "sizebalanced=" and "fullysizebalanced=" too

Might want to make --rebalance turn balanced=group:N where N > 1
to fullysizebalanced=group:N. Have not yet determined if that will
improve situations enough to be worth the extra work.
This commit is contained in:
Joey Hess 2024-08-21 15:01:54 -04:00
parent 4e1dcc0372
commit 9e87061de2
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 157 additions and 36 deletions

View file

@ -173,6 +173,8 @@ preferredContentTokens pcd =
, ValueToken "onlyingroup" (usev $ limitOnlyInGroup $ getGroupMap pcd) , ValueToken "onlyingroup" (usev $ limitOnlyInGroup $ getGroupMap pcd)
, ValueToken "balanced" (usev $ limitBalanced (repoUUID pcd) (getGroupMap pcd)) , ValueToken "balanced" (usev $ limitBalanced (repoUUID pcd) (getGroupMap pcd))
, ValueToken "fullybalanced" (usev $ limitFullyBalanced (repoUUID pcd) (getGroupMap pcd)) , ValueToken "fullybalanced" (usev $ limitFullyBalanced (repoUUID pcd) (getGroupMap pcd))
, ValueToken "sizebalanced" (usev $ limitSizeBalanced (repoUUID pcd) (getGroupMap pcd))
, ValueToken "fullysizebalanced" (usev $ limitFullySizeBalanced (repoUUID pcd) (getGroupMap pcd))
] ++ commonTokens LimitAnnexFiles ] ++ commonTokens LimitAnnexFiles
where where
preferreddir = maybe "public" fromProposedAccepted $ preferreddir = maybe "public" fromProposedAccepted $

View file

@ -19,6 +19,7 @@ git-annex (10.20240831) UNRELEASED; urgency=medium
remotes. External special remotes should not use that config for their remotes. External special remotes should not use that config for their
own purposes. own purposes.
* Support "balanced=" and "fullybalanced=" in preferred content expressions. * Support "balanced=" and "fullybalanced=" in preferred content expressions.
* Support "sizebalanced=" and "fullysizebalanced=" too.
* Added --rebalance option. * Added --rebalance option.
* maxsize: New command to tell git-annex how large the expected maximum * maxsize: New command to tell git-annex how large the expected maximum
size of a repository is, and to display repository sizes. size of a repository is, and to display repository sizes.

108
Limit.hs
View file

@ -558,7 +558,11 @@ limitOnlyInGroup getgroupmap groupname = Right $ MatchFiles
limitBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex limitBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitBalanced mu getgroupmap groupname = do limitBalanced mu getgroupmap groupname = do
fullybalanced <- limitFullyBalanced mu getgroupmap groupname fullybalanced <- limitFullyBalanced' "balanced" mu getgroupmap groupname
limitBalanced' "balanced" fullybalanced mu groupname
limitBalanced' :: String -> MatchFiles Annex -> Maybe UUID -> MkLimit Annex
limitBalanced' termname fullybalanced mu groupname = do
copies <- limitCopies $ if ':' `elem` groupname copies <- limitCopies $ if ':' `elem` groupname
then groupname then groupname
else groupname ++ ":1" else groupname ++ ":1"
@ -588,38 +592,65 @@ limitBalanced mu getgroupmap groupname = do
matchNeedsLocationLog present || matchNeedsLocationLog present ||
matchNeedsLocationLog fullybalanced || matchNeedsLocationLog fullybalanced ||
matchNeedsLocationLog copies matchNeedsLocationLog copies
, matchDesc = "balanced" =? groupname , matchDesc = termname =? groupname
} }
limitFullyBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex limitFullyBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitFullyBalanced mu getgroupmap want = limitFullyBalanced = limitFullyBalanced' "fullybalanced"
{- Like limitFullyBalanced, but the name of the preferred content term
 - is passed in as the first parameter, and handed on to
 - limitFullyBalanced''.
 -
 - The group's members are narrowed down to the repositories that
 - have space for the key. -}
limitFullyBalanced' :: String -> Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitFullyBalanced' = limitFullyBalanced'' filtercandidates
where
-- The first parameter (the number from the term's value) is not
-- needed here, since only free space is considered.
filtercandidates _ key candidates = do
maxsizes <- getMaxSizes
sizemap <- getRepoSizes False
currentlocs <- S.fromList <$> loggedLocations key
-- A key whose size is not known is treated as having size 0.
let keysize = fromMaybe 0 (fromKey keySize key)
let hasspace u = case (M.lookup u maxsizes, M.lookup u sizemap) of
(Just maxsize, Just reposize) ->
repoHasSpace keysize (u `S.member` currentlocs) reposize maxsize
-- When either the maximum size or the current size of
-- a repository is missing, it is kept as a candidate.
_ -> True
return $ S.filter hasspace candidates
{- Checks if a repository has space for a key of the given size.
 -
 - The Bool is True when the key is already located in the repository.
 - In that case the key's size is not added to the repository size
 - before comparing it with the maximum size. (Presumably the repository
 - size already includes the key then; confirm against getRepoSizes.) -}
repoHasSpace :: Integer -> Bool -> RepoSize -> MaxSize -> Bool
repoHasSpace keysize inrepo (RepoSize reposize) (MaxSize maxsize)
| inrepo =
reposize <= maxsize
| otherwise =
reposize + keysize <= maxsize
limitFullyBalanced''
:: (Int -> Key -> S.Set UUID -> Annex (S.Set UUID))
-> String
-> Maybe UUID
-> Annex GroupMap
-> MkLimit Annex
limitFullyBalanced'' filtercandidates termname mu getgroupmap want =
case splitc ':' want of case splitc ':' want of
[g] -> go g 1 [g] -> go g 1
[g, n] -> maybe [g, n] -> maybe
(Left "bad number for fullybalanced") (Left $ "bad number for " ++ termname)
(go g) (go g)
(readish n) (readish n)
_ -> Left "bad value for fullybalanced" _ -> Left $ "bad value for " ++ termname
where where
go s n = limitFullyBalanced' mu getgroupmap (toGroup s) n want go s n = limitFullyBalanced''' filtercandidates termname mu
getgroupmap (toGroup s) n want
limitFullyBalanced' :: Maybe UUID -> Annex GroupMap -> Group -> Int -> MkLimit Annex limitFullyBalanced'''
limitFullyBalanced' mu getgroupmap g n want = Right $ MatchFiles :: (Int -> Key -> S.Set UUID -> Annex (S.Set UUID))
-> String
-> Maybe UUID
-> Annex GroupMap
-> Group
-> Int
-> MkLimit Annex
limitFullyBalanced''' filtercandidates termname mu getgroupmap g n want = Right $ MatchFiles
{ matchAction = const $ checkKey $ \key -> do { matchAction = const $ checkKey $ \key -> do
gm <- getgroupmap gm <- getgroupmap
let groupmembers = fromMaybe S.empty $ let groupmembers = fromMaybe S.empty $
M.lookup g (uuidsByGroup gm) M.lookup g (uuidsByGroup gm)
maxsizes <- getMaxSizes candidates <- filtercandidates n key groupmembers
sizemap <- getRepoSizes False
let keysize = fromMaybe 0 (fromKey keySize key)
currentlocs <- S.fromList <$> loggedLocations key
let hasspace u = case (M.lookup u maxsizes, M.lookup u sizemap) of
(Just (MaxSize maxsize), Just (RepoSize reposize)) ->
if u `S.member` currentlocs
then reposize <= maxsize
else reposize + keysize <= maxsize
_ -> True
let candidates = S.filter hasspace groupmembers
return $ if S.null candidates return $ if S.null candidates
then False then False
else case (mu, M.lookup g (balancedPickerByGroup gm)) of else case (mu, M.lookup g (balancedPickerByGroup gm)) of
@ -630,9 +661,46 @@ limitFullyBalanced' mu getgroupmap g n want = Right $ MatchFiles
, matchNeedsFileContent = False , matchNeedsFileContent = False
, matchNeedsKey = True , matchNeedsKey = True
, matchNeedsLocationLog = False , matchNeedsLocationLog = False
, matchDesc = "fullybalanced" =? want , matchDesc = termname =? want
} }
{- Limit for the "sizebalanced" term.
 -
 - Builds the same matcher that limitFullySizeBalanced uses, but named
 - "sizebalanced", and passes it to limitBalanced', in the same way
 - that limitBalanced does with the "balanced" matcher. -}
limitSizeBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitSizeBalanced mu getgroupmap groupname = do
fullysizebalanced <- limitFullySizeBalanced' "sizebalanced" mu getgroupmap groupname
limitBalanced' "sizebalanced" fullysizebalanced mu groupname
{- Limit for the "fullysizebalanced" term. -}
limitFullySizeBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitFullySizeBalanced = limitFullySizeBalanced' "fullysizebalanced"
{- Like limitFullySizeBalanced, but the name of the preferred content
 - term is passed in as the first parameter, and handed on to
 - limitFullyBalanced''.
 -
 - The group's members are narrowed down to at most n repositories
 - that have space for the key, picking the ones that have the largest
 - proportion of their maximum size free. -}
limitFullySizeBalanced' :: String -> Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitFullySizeBalanced' = limitFullyBalanced'' filtercandidates
where
filtercandidates n key candidates = do
maxsizes <- getMaxSizes
sizemap <- getRepoSizes False
currentlocs <- S.fromList <$> loggedLocations key
-- A key whose size is not known is treated as having size 0.
let keysize = fromMaybe 0 (fromKey keySize key)
-- Pairs a repository with its proportion of free space, or Nothing
-- when it is not a candidate.
let go u = case (M.lookup u maxsizes, M.lookup u sizemap, u `S.member` currentlocs) of
(Just maxsize, Just reposize, inrepo)
| repoHasSpace keysize inrepo reposize maxsize ->
proportionfree keysize inrepo u reposize maxsize
| otherwise -> Nothing
-- Unlike in limitFullyBalanced', a repository whose maximum size
-- or current size is missing is not a candidate.
_ -> Nothing
-- Sorting is ascending by proportion free, so reverse to put the
-- most free repositories first before taking n of them.
return $ S.fromList $
map fst $ take n $ reverse $ sortOn snd $
mapMaybe go $ S.toList candidates
-- Calculates the fraction of the maximum size that is free.
-- Yields Nothing when the maximum size is not positive, which also
-- avoids dividing by zero.
proportionfree keysize inrepo u (RepoSize reposize) (MaxSize maxsize)
| maxsize > 0 = Just
( u
, fromIntegral freespacesanskey / fromIntegral maxsize
:: Double
)
| otherwise = Nothing
where
-- When the key is already in the repository, its size is added back
-- to the free space, so the result is the free space the repository
-- would have without the key.
freespacesanskey = maxsize - reposize +
if inrepo then keysize else 0
{- Adds a limit to skip files not using a specified key-value backend. -} {- Adds a limit to skip files not using a specified key-value backend. -}
addInBackend :: String -> Annex () addInBackend :: String -> Annex ()
addInBackend = addLimit . limitInBackend addInBackend = addLimit . limitInBackend

View file

@ -269,37 +269,39 @@ elsewhere to allow removing it).
The number is the number of repositories in the group that will The number is the number of repositories in the group that will
want each file. When not specified, the default is 1. want each file. When not specified, the default is 1.
For example, "balanced=backup:2", when there are 3 members of the backup
group, will make each backup repository want 2/3rds of the files.
For this to work, each repository in the group should have its preferred For this to work, each repository in the group should have its preferred
content set to the same expression. Using `groupwanted` is a good content set to the same expression. Using `groupwanted` is a good
way to do that. way to do that.
For example, "balanced=backup:2", when there are 3 members of the backup
group, will make each backup repository want 2/3rds of the files.
The sizes of files are not taken into account, so it's possible for The sizes of files are not taken into account, so it's possible for
one repository to get larger than usual files and so fill up before one repository to get larger than usual files and so fill up before
the other repositories. But files are only wanted by repositories that the other repositories. But files are only wanted by repositories that
have enough free space to hold them. So once a repository is full, have enough free space to hold them. So once a repository is full,
the remaining repositories will have any additional files balanced the remaining repositories will have any additional files balanced
among them. In order for this to work, you must use among them. For git-annex to know when a repository is full,
[[git-annex-maxsize]](1) to specify the size of each repository in the you must use [[git-annex-maxsize]](1) to specify the size of each
group. repository in the group.
This usually avoids moving files between repositories of the group, even This usually avoids moving files between repositories, even
if that means that things are not optimally balanced. Some of the ways if that means that things are not optimally balanced. Some of the ways
that it can get out of balance include adding a new repository to the that it can get out of balance include adding a new repository to the
group, or a file getting copied into more repositories in the group than group, or a file getting copied into more repositories in the
the specified number. Running git-annex commands with the `--rebalance` group than the specified number, or some of the repositories filling up.
option will make this expression instead behave like the `fullybalanced`
expression, which will make repositories want to move files around as
necessary in order to get fully balanced.
Using this in a perferred content expression makes git-annex need to do Running git-annex commands with the `--rebalance` option will make this
expression instead behave like the `fullybalanced` expression, which will
make repositories want to move files around as necessary in order to get
fully balanced.
Using this in a preferred content expression makes git-annex need to do
some additional work to keep track of how full repositories are. Usually some additional work to keep track of how full repositories are. Usually
that won't affect performance much. However, the first time git-annex that won't affect performance much. However, the first time git-annex
processes this in a given git repository, it will need to examine processes this expression in a given git repository, it will need to
all the locations of all files, which can be slow when there are a lot of calculate the sizes of all repositories, which can be slow when there are
them. When this causes git-annex to do a lot of work, it will a lot of files. When this causes git-annex to do a lot of work, it will
display "(calculating repository sizes)". display "(calculating repository sizes)".
Note that `not balanced` is a bad thing to put in a preferred content Note that `not balanced` is a bad thing to put in a preferred content
@ -316,6 +318,54 @@ elsewhere to allow removing it).
When the `--rebalance` option is used, `balanced` is the same as When the `--rebalance` option is used, `balanced` is the same as
`fullybalanced`. `fullybalanced`.
* `sizebalanced=groupname:number`
Distributes content among repositories in the group, keeping
repositories proportionally full.
The number is the number of repositories in the group that will
want each file. When not specified, the default is 1.
For this to work, you must use [[git-annex-maxsize]](1) to specify
the size of each repository in the group. When a repository's
maxsize has not been specified, it will not want any files.
For example, if one repository in the group has a maximum size of
100 gb with 60 gb used, and another has a maximum size of 50 gb with
25 gb used, the smaller one will want files (that fit in it),
and the larger one won't want any files
(that would fit in the smaller one)
until the smaller one gets equally full.
Note that, once a repository contains a file, it will continue to want
it, even if it's more full than other repositories. This is to avoid
churn in moving files around.
This is more likely to get out of balance than the `balanced=` expression
is, because git-annex does not always have a consistent knowledge of
how full repositories are. Consider for example if a laptop and a desktop
are each sending a new file to the group. They will both pick whichever
repository was least full, but that means both files go to the same
repository, when a better solution might have been to send the smaller
file to a different repository. When using `balanced=` in the same
situation, it's less likely that a repository will want both files.
Running git-annex commands with the `--rebalance` option will make this
expression instead behave like the `fullysizebalanced` expression, which
will make repositories want to move files around as necessary in order to
get fully balanced.
* `fullysizebalanced=groupname:number`
This is like `sizebalanced`, but allows moving content between repositories
in the group at any time to keep it fully balanced.
Normally "sizebalanced=groupname:number" is the same as
"(fullysizebalanced=groupname:number and not copies=groupname:number) or present"
When the `--rebalance` option is used, `sizebalanced` is the same as
`fullysizebalanced`.
* `anything` * `anything`
Always matches. Always matches.