From 76ece2a699a9cb14c4e7424b5b2412c3cb94aacb Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 21 Aug 2024 17:56:06 -0400 Subject: [PATCH] make --rebalance of balanced use fullysizebalanced when useful When the specified number of copies is > 1, and some repositories are too full, it can be better to move content from them to other less full repositories, in order to make space for new content. annex.fullybalancedthreshhold is documented, but not implemented yet This is not tested very well yet, and is known to sometimes take several runs to stabalize. --- Limit.hs | 77 ++++++++++++++++++---------- doc/git-annex-preferred-content.mdwn | 10 ++++ doc/git-annex.mdwn | 6 +++ doc/todo/git-annex_proxies.mdwn | 5 ++ 4 files changed, 70 insertions(+), 28 deletions(-) diff --git a/Limit.hs b/Limit.hs index 9d403c5ff2..4ed4853f78 100644 --- a/Limit.hs +++ b/Limit.hs @@ -599,18 +599,31 @@ limitFullyBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex limitFullyBalanced = limitFullyBalanced' "fullybalanced" limitFullyBalanced' :: String -> Maybe UUID -> Annex GroupMap -> MkLimit Annex -limitFullyBalanced' = limitFullyBalanced'' filtercandidates - where - filtercandidates _ key candidates = do - maxsizes <- getMaxSizes - sizemap <- getRepoSizes False - currentlocs <- S.fromList <$> loggedLocations key - let keysize = fromMaybe 0 (fromKey keySize key) - let hasspace u = case (M.lookup u maxsizes, M.lookup u sizemap) of - (Just maxsize, Just reposize) -> - repoHasSpace keysize (u `S.member` currentlocs) reposize maxsize - _ -> True - return $ S.filter hasspace candidates +limitFullyBalanced' = limitFullyBalanced'' $ \n key candidates -> do + maxsizes <- getMaxSizes + sizemap <- getRepoSizes False + let threshhold = 0.9 :: Double + let toofull u = + case (M.lookup u maxsizes, M.lookup u sizemap) of + (Just (MaxSize maxsize), Just (RepoSize reposize)) -> + fromIntegral reposize >= fromIntegral maxsize * threshhold + _ -> False + needsizebalance <- ifM (Annex.getRead 
Annex.rebalance) + ( return $ n > 1 && + n > S.size candidates + - S.size (S.filter toofull candidates) + , return False + ) + if needsizebalance + then filterCandidatesFullySizeBalanced maxsizes sizemap n key candidates + else do + currentlocs <- S.fromList <$> loggedLocations key + let keysize = fromMaybe 0 (fromKey keySize key) + let hasspace u = case (M.lookup u maxsizes, M.lookup u sizemap) of + (Just maxsize, Just reposize) -> + repoHasSpace keysize (u `S.member` currentlocs) reposize maxsize + _ -> True + return $ S.filter hasspace candidates repoHasSpace :: Integer -> Bool -> RepoSize -> MaxSize -> Bool repoHasSpace keysize inrepo (RepoSize reposize) (MaxSize maxsize) @@ -673,23 +686,31 @@ limitFullySizeBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex limitFullySizeBalanced = limitFullySizeBalanced' "fullysizebalanced" limitFullySizeBalanced' :: String -> Maybe UUID -> Annex GroupMap -> MkLimit Annex -limitFullySizeBalanced' = limitFullyBalanced'' filtercandidates +limitFullySizeBalanced' = limitFullyBalanced'' $ \n key candidates -> do + maxsizes <- getMaxSizes + sizemap <- getRepoSizes False + filterCandidatesFullySizeBalanced maxsizes sizemap n key candidates + +filterCandidatesFullySizeBalanced + :: M.Map UUID MaxSize + -> M.Map UUID RepoSize + -> Int + -> Key + -> S.Set UUID + -> Annex (S.Set UUID) +filterCandidatesFullySizeBalanced maxsizes sizemap n key candidates = do + currentlocs <- S.fromList <$> loggedLocations key + let keysize = fromMaybe 0 (fromKey keySize key) + let go u = case (M.lookup u maxsizes, M.lookup u sizemap, u `S.member` currentlocs) of + (Just maxsize, Just reposize, inrepo) + | repoHasSpace keysize inrepo reposize maxsize -> + proportionfree keysize inrepo u reposize maxsize + | otherwise -> Nothing + _ -> Nothing + return $ S.fromList $ + map fst $ take n $ reverse $ sortOn snd $ + mapMaybe go $ S.toList candidates where - filtercandidates n key candidates = do - maxsizes <- getMaxSizes - sizemap <- getRepoSizes False 
- currentlocs <- S.fromList <$> loggedLocations key - let keysize = fromMaybe 0 (fromKey keySize key) - let go u = case (M.lookup u maxsizes, M.lookup u sizemap, u `S.member` currentlocs) of - (Just maxsize, Just reposize, inrepo) - | repoHasSpace keysize inrepo reposize maxsize -> - proportionfree keysize inrepo u reposize maxsize - | otherwise -> Nothing - _ -> Nothing - return $ S.fromList $ - map fst $ take n $ reverse $ sortOn snd $ - mapMaybe go $ S.toList candidates - proportionfree keysize inrepo u (RepoSize reposize) (MaxSize maxsize) | maxsize > 0 = Just ( u diff --git a/doc/git-annex-preferred-content.mdwn b/doc/git-annex-preferred-content.mdwn index f8b8a1db26..a9e1cb0b5f 100644 --- a/doc/git-annex-preferred-content.mdwn +++ b/doc/git-annex-preferred-content.mdwn @@ -318,6 +318,16 @@ elsewhere to allow removing it). When the `--rebalance` option is used, `balanced` is the same as `fullybalanced`. + When the specified number is greater than 1, and too many repositories + in the group are more than 90% full (as configured by + annex.fullybalancedthreshhold), this behaves like `fullysizebalanced`. + + For example, `fullybalanced=foo:3`, when group foo has 5 repositories, + two 50% full and three 99% full, will make some content move from the + full repositories to the others. Moving content like that is expensive, + but it allows new files to continue to be stored on the specified number + of repositories. + * `sizebalanced=groupname:number` Distributes content amoung repositories in the group, keeping diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index b46526e8dc..2615ff4d2c 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -928,6 +928,12 @@ repository, using [[git-annex-config]]. See its man page for a list.) The default reserve is 100 megabytes. +* `annex.fullybalancedthreshhold` + + Configures the percent full a repository must be in order for + the "fullybalanced" preferred content expression to consider it + to be full. 
The default is 90. + * `annex.skipunknown` Set to true to make commands like "git-annex get" silently skip over diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index 2c14e07fef..f8a7dbdd06 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -30,6 +30,11 @@ Planned schedule of work: ## work notes +* Implement annex.fullybalancedthreshhold + +* `git-annex assist --rebalance` of `balanced=foo:2` + sometimes needs several runs to stabilize. + * Bug: git init foo