Support "sizebalanced=" and "fullysizebalanced=" too

Might want to make --rebalance turn balanced=group:N where N > 1
into fullysizebalanced=group:N. Have not yet determined if that will
improve situations enough to be worth the extra work.
This commit is contained in:
Joey Hess 2024-08-21 15:01:54 -04:00
parent 4e1dcc0372
commit 9e87061de2
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 157 additions and 36 deletions

View file

@ -173,6 +173,8 @@ preferredContentTokens pcd =
, ValueToken "onlyingroup" (usev $ limitOnlyInGroup $ getGroupMap pcd)
, ValueToken "balanced" (usev $ limitBalanced (repoUUID pcd) (getGroupMap pcd))
, ValueToken "fullybalanced" (usev $ limitFullyBalanced (repoUUID pcd) (getGroupMap pcd))
, ValueToken "sizebalanced" (usev $ limitSizeBalanced (repoUUID pcd) (getGroupMap pcd))
, ValueToken "fullysizebalanced" (usev $ limitFullySizeBalanced (repoUUID pcd) (getGroupMap pcd))
] ++ commonTokens LimitAnnexFiles
where
preferreddir = maybe "public" fromProposedAccepted $

View file

@ -19,6 +19,7 @@ git-annex (10.20240831) UNRELEASED; urgency=medium
remotes. External special remotes should not use that config for their
own purposes.
* Support "balanced=" and "fullybalanced=" in preferred content expressions.
* Support "sizebalanced=" and "fullysizebalanced=" too.
* Added --rebalance option.
* maxsize: New command to tell git-annex how large the expected maximum
size of a repository is, and to display repository sizes.

108
Limit.hs
View file

@ -558,7 +558,11 @@ limitOnlyInGroup getgroupmap groupname = Right $ MatchFiles
limitBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitBalanced mu getgroupmap groupname = do
fullybalanced <- limitFullyBalanced mu getgroupmap groupname
fullybalanced <- limitFullyBalanced' "balanced" mu getgroupmap groupname
limitBalanced' "balanced" fullybalanced mu groupname
limitBalanced' :: String -> MatchFiles Annex -> Maybe UUID -> MkLimit Annex
limitBalanced' termname fullybalanced mu groupname = do
copies <- limitCopies $ if ':' `elem` groupname
then groupname
else groupname ++ ":1"
@ -588,38 +592,65 @@ limitBalanced mu getgroupmap groupname = do
matchNeedsLocationLog present ||
matchNeedsLocationLog fullybalanced ||
matchNeedsLocationLog copies
, matchDesc = "balanced" =? groupname
, matchDesc = termname =? groupname
}
limitFullyBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitFullyBalanced mu getgroupmap want =
limitFullyBalanced = limitFullyBalanced' "fullybalanced"
{- Builds a limit via limitFullyBalanced'', taking the name of the
 - preferred content term as its first parameter.
 -
 - The candidate repositories are filtered down to those that have
 - space for the key. The Int passed to the filter is ignored here. -}
limitFullyBalanced' :: String -> Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitFullyBalanced' = limitFullyBalanced'' filtercandidates
where
filtercandidates _ key candidates = do
maxsizes <- getMaxSizes
sizemap <- getRepoSizes False
currentlocs <- S.fromList <$> loggedLocations key
-- A key without a known size is counted as taking up no space.
let keysize = fromMaybe 0 (fromKey keySize key)
let hasspace u = case (M.lookup u maxsizes, M.lookup u sizemap) of
(Just maxsize, Just reposize) ->
repoHasSpace keysize (u `S.member` currentlocs) reposize maxsize
-- A repository that is missing from either map stays a candidate.
_ -> True
return $ S.filter hasspace candidates
{- Checks whether a repository of the given current size stays within
 - its maximum size. The size of the key is only added to the current
 - size when the key is not already in the repository. -}
repoHasSpace :: Integer -> Bool -> RepoSize -> MaxSize -> Bool
repoHasSpace keysize inrepo (RepoSize reposize) (MaxSize maxsize) =
    reposize + extra <= maxsize
  where
    extra = if inrepo then 0 else keysize
limitFullyBalanced''
:: (Int -> Key -> S.Set UUID -> Annex (S.Set UUID))
-> String
-> Maybe UUID
-> Annex GroupMap
-> MkLimit Annex
limitFullyBalanced'' filtercandidates termname mu getgroupmap want =
case splitc ':' want of
[g] -> go g 1
[g, n] -> maybe
(Left "bad number for fullybalanced")
(Left $ "bad number for " ++ termname)
(go g)
(readish n)
_ -> Left "bad value for fullybalanced"
_ -> Left $ "bad value for " ++ termname
where
go s n = limitFullyBalanced' mu getgroupmap (toGroup s) n want
go s n = limitFullyBalanced''' filtercandidates termname mu
getgroupmap (toGroup s) n want
limitFullyBalanced' :: Maybe UUID -> Annex GroupMap -> Group -> Int -> MkLimit Annex
limitFullyBalanced' mu getgroupmap g n want = Right $ MatchFiles
limitFullyBalanced'''
:: (Int -> Key -> S.Set UUID -> Annex (S.Set UUID))
-> String
-> Maybe UUID
-> Annex GroupMap
-> Group
-> Int
-> MkLimit Annex
limitFullyBalanced''' filtercandidates termname mu getgroupmap g n want = Right $ MatchFiles
{ matchAction = const $ checkKey $ \key -> do
gm <- getgroupmap
let groupmembers = fromMaybe S.empty $
M.lookup g (uuidsByGroup gm)
maxsizes <- getMaxSizes
sizemap <- getRepoSizes False
let keysize = fromMaybe 0 (fromKey keySize key)
currentlocs <- S.fromList <$> loggedLocations key
let hasspace u = case (M.lookup u maxsizes, M.lookup u sizemap) of
(Just (MaxSize maxsize), Just (RepoSize reposize)) ->
if u `S.member` currentlocs
then reposize <= maxsize
else reposize + keysize <= maxsize
_ -> True
let candidates = S.filter hasspace groupmembers
candidates <- filtercandidates n key groupmembers
return $ if S.null candidates
then False
else case (mu, M.lookup g (balancedPickerByGroup gm)) of
@ -630,9 +661,46 @@ limitFullyBalanced' mu getgroupmap g n want = Right $ MatchFiles
, matchNeedsFileContent = False
, matchNeedsKey = True
, matchNeedsLocationLog = False
, matchDesc = "fullybalanced" =? want
, matchDesc = termname =? want
}
{- Limit for the "sizebalanced" term. The matcher produced by
 - limitFullySizeBalanced' is handed to limitBalanced', which builds
 - the final limit from it. -}
limitSizeBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitSizeBalanced mu getgroupmap groupname =
    limitFullySizeBalanced' "sizebalanced" mu getgroupmap groupname
        >>= \matcher -> limitBalanced' "sizebalanced" matcher mu groupname
{- Limit for the "fullysizebalanced" term; limitFullySizeBalanced'
 - with the term name filled in. -}
limitFullySizeBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitFullySizeBalanced mu getgroupmap =
    limitFullySizeBalanced' "fullysizebalanced" mu getgroupmap
{- Builds a limit via limitFullyBalanced'', taking the name of the
 - preferred content term as its first parameter.
 -
 - The candidates are narrowed to at most n repositories: those with a
 - known maxsize and size that have space for the key, taking the ones
 - with the largest proportion of their maxsize free first. -}
limitFullySizeBalanced' :: String -> Maybe UUID -> Annex GroupMap -> MkLimit Annex
limitFullySizeBalanced' = limitFullyBalanced'' filtercandidates
where
filtercandidates n key candidates = do
maxsizes <- getMaxSizes
sizemap <- getRepoSizes False
currentlocs <- S.fromList <$> loggedLocations key
-- A key without a known size is counted as taking up no space.
let keysize = fromMaybe 0 (fromKey keySize key)
let go u = case (M.lookup u maxsizes, M.lookup u sizemap, u `S.member` currentlocs) of
(Just maxsize, Just reposize, inrepo)
| repoHasSpace keysize inrepo reposize maxsize ->
proportionfree keysize inrepo u reposize maxsize
| otherwise -> Nothing
-- A repository that is missing from either map is not a candidate.
_ -> Nothing
-- Sort ascending by proportion free, then reverse so that the
-- repositories with the most free space come first.
return $ S.fromList $
map fst $ take n $ reverse $ sortOn snd $
mapMaybe go $ S.toList candidates
-- Pairs the repository with the fraction of its maxsize that is free,
-- not counting the key. Nothing unless maxsize is positive, so the
-- division always has a positive divisor.
proportionfree keysize inrepo u (RepoSize reposize) (MaxSize maxsize)
| maxsize > 0 = Just
( u
, fromIntegral freespacesanskey / fromIntegral maxsize
:: Double
)
| otherwise = Nothing
where
-- When the key is already in the repository, its size is added back,
-- giving the free space as if the key were not present.
freespacesanskey = maxsize - reposize +
if inrepo then keysize else 0
{- Adds a limit so that files not using the specified key-value backend
 - are skipped. -}
addInBackend :: String -> Annex ()
addInBackend backendname = addLimit (limitInBackend backendname)

View file

@ -268,38 +268,40 @@ elsewhere to allow removing it).
The number is the number of repositories in the group that will
want each file. When not specified, the default is 1.
For example, "balanced=backup:2", when there are 3 members of the backup
group, will make each backup repository want 2/3rds of the files.
For this to work, each repository in the group should have its preferred
content set to the same expression. Using `groupwanted` is a good
way to do that.
For example, "balanced=backup:2", when there are 3 members of the backup
group, will make each backup repository want 2/3rds of the files.
The sizes of files are not taken into account, so it's possible for
one repository to get larger than usual files and so fill up before
the other repositories. But files are only wanted by repositories that
have enough free space to hold them. So once a repository is full,
the remaining repositories will have any additional files balanced
amoung them. In order for this to work, you must use
[[git-annex-maxsize]](1) to specify the size of each repository in the
group.
among them. For git-annex to know when a repository is full,
you must use [[git-annex-maxsize]](1) to specify the size of each
repository in the group.
This usually avoids moving files between repositories of the group, even
This usually avoids moving files between repositories, even
if that means that things are not optimally balanced. Some of the ways
that it can get out of balance include adding a new repository to the
group, or a file getting copied into more repositories in the group than
the specified number. Running git-annex commands with the `--rebalance`
option will make this expression instead behave like the `fullybalanced`
expression, which will make repositories want to move files around as
necessary in order to get fully balanced.
group, or a file getting copied into more repositories in the
group than the specified number, or some of the repositories filling up.
Using this in a perferred content expression makes git-annex need to do
Running git-annex commands with the `--rebalance` option will make this
expression instead behave like the `fullybalanced` expression, which will
make repositories want to move files around as necessary in order to get
fully balanced.
Using this in a preferred content expression makes git-annex need to do
some additional work to keep track of how full repositories are. Usually
that won't affect performance much. However, the first time git-annex
processes this in a given git repository, it will need to examine
all the locations of all files, which can be slow when there are a lot of
them. When this causes git-annex to do a lot of work, it will
processes this expression in a given git repository, it will need to
calculate the sizes of all repositories, which can be slow when there are
a lot of files. When this causes git-annex to do a lot of work, it will
display "(calculating repository sizes)".
Note that `not balanced` is a bad thing to put in a preferred content
@ -316,6 +318,54 @@ elsewhere to allow removing it).
When the `--rebalance` option is used, `balanced` is the same as
`fullybalanced`.
* `sizebalanced=groupname:number`
Distributes content among repositories in the group, keeping
repositories proportionally full.
The number is the number of repositories in the group that will
want each file. When not specified, the default is 1.
For this to work, you must use [[git-annex-maxsize]](1) to specify
the size of each repository in the group. When a repository's
maxsize has not been specified, it will not want any files.
For example, if one repository in the group has a maximum size of
100 gb with 60 gb used, and another has a maximum size of 50 gb with
25 gb used, the smaller one will want files (that fit in it),
and the larger one won't want any files
(that would fit in the smaller one)
until the smaller one gets equally full.
Note that, once a repository contains a file, it will continue to want
it, even if it's more full than other repositories. This is to avoid
churn in moving files around.
This is more likely to get out of balance than the `balanced=` expression
is, because git-annex does not always have a consistent knowledge of
how full repositories are. Consider for example if a laptop and a desktop
are each sending a new file to the group. They will both pick whichever
repository was least full, but that means both files go to the same
repository, when a better solution might have been to send the smaller
file to a different repository. When using `balanced=` in the same
situation, it's less likely that a repository will want both files.
Running git-annex commands with the `--rebalance` option will make this
expression instead behave like the `fullysizebalanced` expression, which
will make repositories want to move files around as necessary in order to
get fully balanced.
* `fullysizebalanced=groupname:number`
This is like `sizebalanced`, but allows moving content between repositories
in the group at any time to keep it fully balanced.
Normally "sizebalanced=groupname:number" is the same as
"(fullysizebalanced=groupname:number and not copies=groupname:number) or present"
When the `--rebalance` option is used, `sizebalanced` is the same as
`fullysizebalanced`.
* `anything`
Always matches.