diff --git a/CHANGELOG b/CHANGELOG index ef7fa0cf77..27cd108252 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,8 @@ git-annex (10.20250417) UNRELEASED; urgency=medium + * Preferred content now supports "balanced=groupname:lackingcopies" + to make files be evenly balanced amoung as many repositories as are + needed to satisfy numcopies. * map: Fix buggy handling of remotes that are bare git repositories accessed via ssh. * map: Avoid looping forever with mutually recursive paths between diff --git a/Limit.hs b/Limit.hs index 4bdd7f6e1b..d090e09d88 100644 --- a/Limit.hs +++ b/Limit.hs @@ -1,6 +1,6 @@ {- user-specified limits on files to act on - - - Copyright 2011-2024 Joey Hess + - Copyright 2011-2025 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -459,19 +459,26 @@ limitLackingCopies desc approx want = case readish want of } Nothing -> Left "bad value for number of lacking copies" where - go mi needed notpresent key = do - numcopies <- if approx - then approxNumCopies - else case mi of - MatchingFile fi -> getGlobalFileNumCopies $ - matchFile fi - MatchingInfo {} -> approxNumCopies - MatchingUserInfo {} -> approxNumCopies - us <- filter (`S.notMember` notpresent) - <$> (trustExclude UnTrusted =<< Remote.keyLocations key) - let vs nhave numcopies' = numcopies' - nhave >= needed - return $ numCopiesCheck'' us vs numcopies + go mi needed notpresent key = + limitCheckNumCopies approx mi notpresent key vs + where + vs nhave numcopies' = numcopies' - nhave >= needed + +limitCheckNumCopies :: Bool -> MatchInfo -> AssumeNotPresent -> Key -> (Int -> Int -> v) -> Annex v +limitCheckNumCopies approx mi notpresent key vs = do + numcopies <- if approx + then approxNumCopies + else case mi of + MatchingFile fi -> getGlobalFileNumCopies $ + matchFile fi + MatchingInfo {} -> approxNumCopies + MatchingUserInfo {} -> approxNumCopies + us <- filter (`S.notMember` notpresent) + <$> (trustExclude UnTrusted =<< Remote.keyLocations key) + return $ 
numCopiesCheck'' us vs numcopies + where approxNumCopies = fromMaybe defaultNumCopies <$> getGlobalNumCopies + {- Match keys that are unused. - @@ -597,17 +604,21 @@ limitBalanced mu getgroupmap groupname = do limitBalanced' :: String -> MatchFiles Annex -> Maybe UUID -> MkLimit Annex limitBalanced' termname fullybalanced mu groupname = do - copies <- limitCopies $ if ':' `elem` groupname - then groupname - else groupname ++ ":1" + let checknumcopies = ":lackingcopies" `isSuffixOf` groupname + enoughcopies <- if checknumcopies + then limitLackingCopies termname False "1" + else limitCopies $ if ':' `elem` groupname + then groupname + else groupname ++ ":1" + let checkenoughcopies = if checknumcopies then id else not let present = limitPresent mu - let combo f = f present || f fullybalanced || f copies + let combo f = f present || f fullybalanced || f enoughcopies Right $ MatchFiles { matchAction = \lu a i -> ifM (Annex.getRead Annex.rebalance) ( matchAction fullybalanced lu a i , matchAction present lu a i <||> - ((not <$> matchAction copies lu a i) + ((checkenoughcopies <$> matchAction enoughcopies lu a i) <&&> matchAction fullybalanced lu a i ) ) @@ -667,11 +678,16 @@ limitFullyBalanced'' -> MkLimit Annex limitFullyBalanced'' filtercandidates termname mu getgroupmap want = case splitc ':' want of - [g] -> go g 1 - [g, n] -> maybe - (Left $ "bad number for " ++ termname) - (go g) - (readish n) + [g] -> go g (Right 1) + [g, n] + | n == "lackingcopies" -> go g $ + Left $ \mi notpresent key -> + let vs nhave numcopies = numcopies - nhave + in limitCheckNumCopies False mi notpresent key vs + | otherwise -> maybe + (Left $ "bad number for " ++ termname) + (go g . 
Right) + (readish n) _ -> Left $ "bad value for " ++ termname where go s n = limitFullyBalanced''' filtercandidates termname mu @@ -683,13 +699,16 @@ limitFullyBalanced''' -> Maybe UUID -> Annex GroupMap -> Group - -> Int + -> Either (MatchInfo -> AssumeNotPresent -> Key -> Annex Int) Int -> MkLimit Annex -limitFullyBalanced''' filtercandidates termname mu getgroupmap g n want = Right $ MatchFiles - { matchAction = \lu -> const $ checkKey $ \key -> do +limitFullyBalanced''' filtercandidates termname mu getgroupmap g getn want = Right $ MatchFiles + { matchAction = \lu notpresent mi -> flip checkKey mi $ \key -> do gm <- getgroupmap let groupmembers = fromMaybe S.empty $ M.lookup g (uuidsByGroup gm) + n <- case getn of + Right n -> pure n + Left a -> a mi notpresent key candidates <- filtercandidates n key groupmembers let wanted = if S.null candidates then False diff --git a/doc/git-annex-preferred-content.mdwn b/doc/git-annex-preferred-content.mdwn index 68769484dc..52c6ff225e 100644 --- a/doc/git-annex-preferred-content.mdwn +++ b/doc/git-annex-preferred-content.mdwn @@ -267,7 +267,7 @@ content not being configured. says it wants them. (Or, if annex.expireunused is set, it may just delete them.) -* `balanced=groupname[:number]` +* `balanced=groupname[:number|:lackingcopies]` Makes content be evenly balanced amoung repositories in the group. @@ -277,9 +277,15 @@ content not being configured. For example, "balanced=backup:2", when there are 3 members of the backup group, will make each backup repository want 2/3rds of the files. - For this to work, each repository in the group should have its preferred - content set to the same expression. Using `groupwanted` is a good - way to do that. + Using "lackingcopies" rather than a number makes each file be balanced + amoung as many repositories in the group as are needed to satisfy + its numcopies configuration. 
Eg, "balanced=backup:lackingcopies", when + numcopies is 3 and there is 1 other copy will behave the same as + "balanced=backup:2". + + For balancing to work, each repository in the group should have its + preferred content set to the same expression. Using `groupwanted` is a + good way to do that. The sizes of files are not taken into account, so it's possible for one repository to get larger than usual files and so fill up before @@ -312,7 +318,7 @@ content not being configured. Note that `not balanced` not a reasonable thing to use in a preferred content expression for the same reasons as `not present`. -* `fullybalanced=groupname[:number]` +* `fullybalanced=groupname[:number|:lackingcopies]` This is like `balanced`, but allows moving content between repositories in the group at any time to keep it fully balanced. @@ -333,7 +339,7 @@ content not being configured. but it allows new files to continue to be stored on the specified number of repositories. -* `sizebalanced=groupname:number` +* `sizebalanced=groupname[:number|:lackingcopies]` Distributes content amoung repositories in the group, keeping repositories proportionally full. @@ -373,7 +379,7 @@ content not being configured. Note that `not sizebalanced` not a reasonable thing to use in a preferred content expression for the same reasons as `not present`. -* `fullysizebalanced=groupname:number` +* `fullysizebalanced=groupname[:number|:lackingcopies]` This is like `sizebalanced`, but allows moving content between repositories in the group at any time to keep it fully balanced. 
diff --git a/doc/todo/balanced_preferred_content_taking_numcopies_into_account.mdwn b/doc/todo/balanced_preferred_content_taking_numcopies_into_account.mdwn new file mode 100644 index 0000000000..b98a2d0e39 --- /dev/null +++ b/doc/todo/balanced_preferred_content_taking_numcopies_into_account.mdwn @@ -0,0 +1,21 @@ +Using `balanced` or similar preferred content expressions makes files be +spread among repositories in a group, but does not take numcopies into +account. Could this be made to support numcopies? + +The number of nodes to store each file on can be specified +(`balanced=foo:N`). That is a separate number from numcopies. When +different files have different numcopies, a single number there is not +always useful. (Although it may be that someone always wants each file in +the cluster to have, say, 3 copies, regardless of numcopies.) + +Could `balanced=foo:lackingcopies` just make enough nodes want copies to +satisfy numcopies? (Assuming there are enough nodes.) + +That would not be the same as using `balanced=foo or lackingcopies=1`, +because that makes every node want every file until numcopies is satisfied. +So that does not evenly balance files among nodes. Eg, if `git-annex +sync` is syncing with 3 nodes and happens to always try sending to A first, +A would get every file, and B and C would not get any files with that +preferred content expression. + +> Implemented! [[done]] --[[Joey]]