From 3ce2e95a5f867e09ffeb1f3eaa6990d9f8fc00e7 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 9 Aug 2024 14:16:09 -0400 Subject: [PATCH] balanced preferred content and --rebalance This all works fine. But it doesn't check repository sizes yet, and without repository size checking, once a repository gets full, there will be no other repository that will want its files. Use of sha2 seems unnecessary, probably adler32 or md5 or crc would have been enough. Possibly just summing up the bytes of the key mod the number of repositories would have sufficed. But sha2 is there, and probably hardware accelerated. I doubt very much there is any security benefit to using it though. If someone wants to construct a key that will be balanced onto a given repository, sha2 is certainly not going to stop them. --- Annex.hs | 2 + Annex/FileMatcher.hs | 4 +- CHANGELOG | 2 + CmdLine/GitAnnex/Options.hs | 6 ++ Limit.hs | 75 +++++++++++++++++++++- Utility/Hash.hs | 6 +- doc/design/balanced_preferred_content.mdwn | 22 +++---- doc/git-annex-common-options.mdwn | 6 ++ doc/git-annex-preferred-content.mdwn | 48 ++++++++++++++ doc/preferred_content.mdwn | 1 + doc/todo/git-annex_proxies.mdwn | 14 +++- 11 files changed, 169 insertions(+), 17 deletions(-) diff --git a/Annex.hs b/Annex.hs index eaba4703cf..0c41ad95c1 100644 --- a/Annex.hs +++ b/Annex.hs @@ -131,6 +131,7 @@ data AnnexRead = AnnexRead , forcenumcopies :: Maybe NumCopies , forcemincopies :: Maybe MinCopies , forcebackend :: Maybe String + , rebalance :: Bool , useragent :: Maybe String , desktopnotify :: DesktopNotify , gitcredentialcache :: TMVar CredentialCache @@ -164,6 +165,7 @@ newAnnexRead c = do , forcebackend = Nothing , forcenumcopies = Nothing , forcemincopies = Nothing + , rebalance = False , useragent = Nothing , desktopnotify = mempty , gitcredentialcache = cc diff --git a/Annex/FileMatcher.hs b/Annex/FileMatcher.hs index e48931f360..b7b5ca1553 100644 --- a/Annex/FileMatcher.hs +++ b/Annex/FileMatcher.hs @@ -1,6 +1,6 @@ 
{- git-annex file matching - - - Copyright 2012-2023 Joey Hess + - Copyright 2012-2024 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -171,6 +171,8 @@ preferredContentTokens pcd = , ValueToken "metadata" (usev limitMetaData) , ValueToken "inallgroup" (usev $ limitInAllGroup $ getGroupMap pcd) , ValueToken "onlyingroup" (usev $ limitOnlyInGroup $ getGroupMap pcd) + , ValueToken "balanced" (usev $ limitBalanced (repoUUID pcd) (getGroupMap pcd)) + , ValueToken "fullybalanced" (usev $ limitFullyBalanced (repoUUID pcd) (getGroupMap pcd)) ] ++ commonTokens LimitAnnexFiles where preferreddir = maybe "public" fromProposedAccepted $ diff --git a/CHANGELOG b/CHANGELOG index 37b3688b86..13659b244c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -18,6 +18,8 @@ git-annex (10.20240831) UNRELEASED; urgency=medium * The config versioning=true is now reserved for use by versioned special remotes. External special remotes should not use that config for their own purposes. + * Support "balanced=" and "fullybalanced=" in preferred content expressions. + * Added --rebalance option. -- Joey Hess Wed, 31 Jul 2024 15:52:03 -0400 diff --git a/CmdLine/GitAnnex/Options.hs b/CmdLine/GitAnnex/Options.hs index 7c1bd29d7a..00a23484f8 100644 --- a/CmdLine/GitAnnex/Options.hs +++ b/CmdLine/GitAnnex/Options.hs @@ -56,6 +56,11 @@ gitAnnexCommonOptions = commonOptions ++ <> help "override minimum number of copies" <> hidden ) + , annexFlag (setrebalance True) + ( long "rebalance" + <> help "move content as needed to improve balance" + <> hidden + ) , annexOption (setAnnexState . 
Remote.forceTrust Trusted) $ strOption ( long "trust" <> metavar paramRemote <> help "deprecated, does not override trust setting" @@ -103,6 +108,7 @@ gitAnnexCommonOptions = commonOptions ++ where setnumcopies n = setAnnexRead $ \rd -> rd { Annex.forcenumcopies = Just $ configuredNumCopies n } setmincopies n = setAnnexRead $ \rd -> rd { Annex.forcemincopies = Just $ configuredMinCopies n } + setrebalance v = setAnnexRead $ \rd -> rd { Annex.rebalance = v } setuseragent v = setAnnexRead $ \rd -> rd { Annex.useragent = Just v } setdesktopnotify v = setAnnexRead $ \rd -> rd { Annex.desktopnotify = Annex.desktopnotify rd <> v } setgitconfig v = Annex.addGitConfigOverride v diff --git a/Limit.hs b/Limit.hs index 252b3dd493..edbb5999c6 100644 --- a/Limit.hs +++ b/Limit.hs @@ -1,6 +1,6 @@ {- user-specified limits on files to act on - - - Copyright 2011-2023 Joey Hess + - Copyright 2011-2024 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -37,6 +37,7 @@ import Git.Types (RefDate(..)) import Utility.Glob import Utility.HumanTime import Utility.DataUnits +import Utility.Hash import qualified Database.Keys import qualified Utility.RawFilePath as R import Backend @@ -47,6 +48,8 @@ import qualified Data.Set as S import qualified Data.Map as M import qualified System.FilePath.ByteString as P import System.PosixCompat.Files (accessTime, isSymbolicLink) +import qualified Data.ByteArray as BA +import Data.Bits (shiftL) {- Some limits can look at the current status of files on - disk, or in the annex. This allows controlling which happens. 
-} @@ -553,6 +556,76 @@ limitOnlyInGroup getgroupmap groupname = Right $ MatchFiles return $ not (S.null $ present `S.intersection` want) && S.null (S.filter (`S.notMember` want) present) +limitBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex +limitBalanced mu getgroupmap groupname = do + fullybalanced <- limitFullyBalanced mu getgroupmap groupname + copies <- limitCopies $ if ':' `elem` groupname + then groupname + else groupname ++ ":1" + let present = limitPresent mu + Right $ MatchFiles + { matchAction = \a i -> + ifM (Annex.getRead Annex.rebalance) + ( matchAction fullybalanced a i + , matchAction present a i <||> + ((not <$> matchAction copies a i) + <&&> matchAction fullybalanced a i + ) + ) + , matchNeedsFileName = + matchNeedsFileName present || + matchNeedsFileName fullybalanced || + matchNeedsFileName copies + , matchNeedsFileContent = + matchNeedsFileContent present || + matchNeedsFileContent fullybalanced || + matchNeedsFileContent copies + , matchNeedsKey = + matchNeedsKey present || + matchNeedsKey fullybalanced || + matchNeedsKey copies + , matchNeedsLocationLog = + matchNeedsLocationLog present || + matchNeedsLocationLog fullybalanced || + matchNeedsLocationLog copies + , matchDesc = "balanced" =? groupname + } + +limitFullyBalanced :: Maybe UUID -> Annex GroupMap -> MkLimit Annex +limitFullyBalanced mu getgroupmap groupname = Right $ MatchFiles + { matchAction = const $ checkKey $ \key -> do + groupmembers <- fromMaybe S.empty + . M.lookup (toGroup groupname) + . uuidsByGroup + <$> getgroupmap + -- TODO free space checking + return $ case mu of + Just u -> u == pickBalanced key groupmembers + Nothing -> False + , matchNeedsFileName = False + , matchNeedsFileContent = False + , matchNeedsKey = True + , matchNeedsLocationLog = False + , matchDesc = "fullybalanced" =? 
groupname + } + where + +pickBalanced :: Key -> S.Set UUID -> UUID +pickBalanced key s = + let m = fromIntegral (S.size s) + n = keyToInteger key + in S.elemAt (fromIntegral (n `mod` m)) s + +{- Converts a Key into a stable Integer. + - + - The SHA2 hash of the key is used to constrain the size of the Integer + - and to get an even distribution. + -} +keyToInteger :: Key -> Integer +keyToInteger key = + foldl' (\i b -> (i `shiftL` 8) + fromIntegral b) 0 $ + BA.unpack (sha2_256s (serializeKey' key)) + {- Adds a limit to skip files not using a specified key-value backend. -} addInBackend :: String -> Annex () addInBackend = addLimit . limitInBackend diff --git a/Utility/Hash.hs b/Utility/Hash.hs index 81674a8d2b..84b4718aba 100644 --- a/Utility/Hash.hs +++ b/Utility/Hash.hs @@ -1,6 +1,6 @@ {- Convenience wrapper around cryptonite's hashing. - - - Copyright 2013-2021 Joey Hess + - Copyright 2013-2024 Joey Hess - - License: BSD-2-clause -} @@ -16,6 +16,7 @@ module Utility.Hash ( sha2_224_context, sha2_256, sha2_256_context, + sha2_256s, sha2_384, sha2_384_context, sha2_512, @@ -106,6 +107,9 @@ sha2_256 = hashlazy sha2_256_context :: Context SHA256 sha2_256_context = hashInit +sha2_256s :: S.ByteString -> Digest SHA256 +sha2_256s = hash + sha2_384 :: L.ByteString -> Digest SHA384 sha2_384 = hashlazy diff --git a/doc/design/balanced_preferred_content.mdwn b/doc/design/balanced_preferred_content.mdwn index a41483f329..23d0b12421 100644 --- a/doc/design/balanced_preferred_content.mdwn +++ b/doc/design/balanced_preferred_content.mdwn @@ -13,7 +13,7 @@ that entirely: Existing preferred content expressions such as the one for archive group have this problem. 
-So, let's add a new expression: `balanced(group)` +So, let's add a new expression: `balanced=group` ## implementation @@ -47,7 +47,7 @@ repo1 and repo2 want to swap some files between them, So, we'll want to add some precautions to avoid a lot of data moving around in such a case: - ((balanced(backup) and not (copies=backup:1)) or present + (balanced=backup and not (copies=backup:1)) or present So once file lands on a backup drive, it stays there, even if more backup drives change the balancing. @@ -56,7 +56,7 @@ drives change the balancing. What if we have 5 backup repos and want each key to be stored in 3 of them? There's a simple change that can support that: -`balanced(group:3)` +`balanced=group:3` This works the same as before, but rather than just `N mod M`, take `N+I mod M` where I is [0..2] to get the list of 3 repositories that want a @@ -78,10 +78,10 @@ number of drives. Each file should have 1 copy stored in each datacenter, on some drive there. This can be implemented by making a group for each datacenter, which all of -its drives are in, and using `balanced()` to pick the drive that holds the +its drives are in, and using `balanced` to pick the drive that holds the copy of the file. The preferred content expression would be eg: - ((balanced(datacenterA) and not (copies=datacenterA:1)) or present + (balanced=datacenterA and not copies=datacenterA:1) or present In such a situation, to avoid a `N^2` remote interconnect, there might be a transfer repository in each datacenter, that is in front of its drives. The @@ -90,7 +90,7 @@ destination drive. How to write a preferred content expression for that? It might be sufficient to use `copies=datacenterA:1`, so long as the file reaching any drive in the datacenter is enough. But may want to add something analagous to `inallgroup=` that checks if a file is in -the place that `balanced()` picks for a group. Eg, +the place that `balanced` picks for a group. 
Eg, `balancedgroup=datacenterA` for 1 copy and `balancedgroup=group:datacenterA:2` for N copies. @@ -143,18 +143,18 @@ the cluster. In several examples above, we have preferred content expressions in this form: - ((balanced(group:N) and not (copies=group:N)) or present + (balanced=group:N and not copies=group:N) or present In order to rebalance, that needs to be changed to: - balanced(group:N) + balanced=group:N What could be done is make `balanced()` usually expand to the former, but when --rebalance is used, it only expands to the latter. -(Might make the fully balanced behavior available as `fullybalanced()` for +(Might make the fully balanced behavior available as `fullybalanced` for users who want it, then -`balanced() == ((fullybalanced(group:N) and not (copies=group:N)) or present` -usually and when --rebalance is used, `balanced() == fullybalanced(group:N)` +`balanced=group:N == (fullybalanced=group:N and not copies=group:N) or present` +usually and when --rebalance is used, `balanced=group:N == fullybalanced=group:N`) diff --git a/doc/git-annex-common-options.mdwn b/doc/git-annex-common-options.mdwn index 44412a982c..c978c96bbc 100644 --- a/doc/git-annex-common-options.mdwn +++ b/doc/git-annex-common-options.mdwn @@ -76,6 +76,12 @@ Most of these options are accepted by all git-annex commands. Overrides the mincopies setting. +* `--rebalance` + + Changes the behavior of the "balanced" preferred content expression + to be the same as "fullybalanced". When that expression is used, + this can cause a lot of work to be done to rebalance repositories. + * `--time-limit=time` Limits how long a git-annex command runs. The time can be something diff --git a/doc/git-annex-preferred-content.mdwn b/doc/git-annex-preferred-content.mdwn index 466215792a..b23e9677fd 100644 --- a/doc/git-annex-preferred-content.mdwn +++ b/doc/git-annex-preferred-content.mdwn @@ -262,6 +262,52 @@ elsewhere to allow removing it). says it wants them. 
(Or, if annex.expireunused is set, it may just delete them.) +* `balanced=groupname[:number]` + + Makes content be evenly balanced among repositories in the group. + + The number is the number of repositories in the group that will + want each file. When not specified, the default is 1. + + For this to work, each repository in the group should have its preferred + content set to the same expression. Using `groupwanted` is a good + way to do that. + + For example, "balanced=backup:2", when there are 3 members of the backup + group, will make each backup repository want 2/3rds of the files. + + The sizes of files are not taken into account, so it's possible for + one repository to get larger than usual files and so fill up before + the other repositories. But files are only wanted by repositories that + have enough free space to hold them. So once a repository is full, + the remaining repositories will have any additional files balanced + among them. In order for this to work, you must use + [[git-annex-size]](1) to specify the size of each repository in the + group. + + This usually avoids moving files between repositories of the group, even + if that means that things are not optimally balanced. Some of the ways + that it can get out of balance include adding a new repository to the + group, or a file getting copied into more repositories in the group than + the specified number. Running git-annex commands with the `--rebalance` + option will make this expression instead behave like the `fullybalanced` + expression, which will make repositories want to move files around as + necessary in order to get fully balanced. + + Note that `not balanced` is a bad thing to put in a preferred content + expression for the same reason `not present` is. + +* `fullybalanced=groupname` + + This is like `balanced`, but allows moving content between repositories + in the group at any time to keep it fully balanced. 
+ + Normally "balanced=groupname:number" is the same as + "(fullybalanced=groupname:number and not copies=groupname:number) or present" + + When the `--rebalance` option is used, `balanced` is the same as + `fullybalanced`. + * `anything` Always matches. @@ -304,6 +350,8 @@ for example `"exclude=* and copies=1"` will be displayed as [[git-annex-wanted]](1) +[[git-annex-size]](1) + diff --git a/doc/preferred_content.mdwn b/doc/preferred_content.mdwn index 19670723cc..0efe5c674e 100644 --- a/doc/preferred_content.mdwn +++ b/doc/preferred_content.mdwn @@ -58,6 +58,7 @@ it assumes all files that are currently present are preferred content. Here are recent changes to preferred content expressions, and the version they were added in. +* "balanced=", "fullybalanced=" 10.20240831 * "securehash" 6.20170228 * "nothing" 6.201600202 * "anything" 5.20150616 diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index 893166db1e..c153ae585a 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -30,9 +30,17 @@ Planned schedule of work: ## work notes -* onward to balanced preferred content! But it depends on - [[track_free_space_in_repos_via_git-annex_branch]] so that will be the - first task. +* balanced= and fullybalanced= need to limit the set of repositories to + ones with enough free space to contain a key. + +* Add `git-annex size` command. + +* Implement [[track_free_space_in_repos_via_git-annex_branch]] + +## completed items for August's work on balanced preferred content + +* Balanced preferred content basic implementation, including --rebalance + option. ## completed items for August's work on git-annex proxy support for exporttre