diff --git a/Annex/Export.hs b/Annex/Export.hs index a01d263cf1..3ab68ad530 100644 --- a/Annex/Export.hs +++ b/Annex/Export.hs @@ -18,6 +18,7 @@ import qualified Types.Remote as Remote import Messages import Data.Maybe +import qualified Data.ByteString.Short as S (fromShort, toShort) -- From a sha pointing to the content of a file to the key -- to use to export it. When the file is annexed, it's the annexed key. @@ -39,7 +40,7 @@ exportKey sha = mk <$> catKey sha -- only checksum the content. gitShaKey :: Git.Sha -> Key gitShaKey (Git.Ref s) = mkKey $ \kd -> kd - { keyName = s + { keyName = S.toShort s , keyVariety = OtherKey "GIT" } @@ -47,7 +48,7 @@ gitShaKey (Git.Ref s) = mkKey $ \kd -> kd keyGitSha :: Key -> Maybe Git.Sha keyGitSha k | fromKey keyVariety k == OtherKey "GIT" = - Just (Git.Ref (fromKey keyName k)) + Just (Git.Ref (S.fromShort (fromKey keyName k))) | otherwise = Nothing -- Is a key storing a git sha, and not used for an annexed file? diff --git a/Backend.hs b/Backend.hs index d327fde3d3..3a00115368 100644 --- a/Backend.hs +++ b/Backend.hs @@ -33,6 +33,7 @@ import qualified Backend.URL import qualified Backend.External import qualified Data.Map as M +import qualified Data.ByteString.Short as S (toShort, fromShort) import qualified Data.ByteString.Char8 as S8 {- Build-in backends. Does not include externals. -} @@ -67,7 +68,7 @@ genKey source meterupdate preferredbackend = do where -- keyNames should not contain newline characters. makesane k = alterKey k $ \d -> d - { keyName = S8.map fixbadchar (fromKey keyName k) + { keyName = S.toShort (S8.map fixbadchar (S.fromShort (fromKey keyName k))) } fixbadchar c | c == '\n' = '_' diff --git a/Backend/External.hs b/Backend/External.hs index c353e049c7..fe7449f1f9 100644 --- a/Backend/External.hs +++ b/Backend/External.hs @@ -20,6 +20,7 @@ import Utility.Metered import qualified Utility.SimpleProtocol as Proto import qualified Data.ByteString as S +import qualified Data.ByteString.Short as S (toShort, fromShort) import qualified Data.Map.Strict as M import Data.Char import Control.Concurrent @@ -285,7 +286,7 @@ toProtoKey k = ProtoKey $ alterKey k $ \d -> d -- The extension can be easily removed, because the protocol -- documentation does not allow '.' to be used in the keyName, -- so the first one is the extension. - { keyName = S.takeWhile (/= dot) (keyName d) + { keyName = S.toShort (S.takeWhile (/= dot) (S.fromShort (keyName d))) , keyVariety = setHasExt (HasExt False) (keyVariety d) } where diff --git a/Backend/Hash.hs b/Backend/Hash.hs index bd66cb698e..4ffbcbbdee 100644 --- a/Backend/Hash.hs +++ b/Backend/Hash.hs @@ -24,6 +24,7 @@ import Utility.Metered import qualified Utility.RawFilePath as R import qualified Data.ByteString as S +import qualified Data.ByteString.Short as S (toShort, fromShort) import qualified Data.ByteString.Char8 as S8 import qualified Data.ByteString.Lazy as L import Control.DeepSeq @@ -106,7 +107,7 @@ keyValue hash source meterupdate = do filesize <- liftIO $ getFileSize file s <- hashFile hash file meterupdate return $ mkKey $ \k -> k - { keyName = encodeBS s + { keyName = S.toShort (encodeBS s) , keyVariety = hashKeyVariety hash (HasExt False) , keySize = Just filesize } @@ -160,7 +161,7 @@ needsUpgrade :: Key -> Bool needsUpgrade key = or [ "\\" `S8.isPrefixOf` keyHash key , S.any (not . validInExtension) (snd $ splitKeyNameExtension key) - , not (hasExt (fromKey keyVariety key)) && keyHash key /= fromKey keyName key + , not (hasExt (fromKey keyVariety key)) && keyHash key /= S.fromShort (fromKey keyName key) ] trivialMigrate :: Key -> Backend -> AssociatedFile -> Annex (Maybe Key) @@ -171,14 +172,14 @@ trivialMigrate' :: Key -> Backend -> AssociatedFile -> Maybe Int -> Maybe Key trivialMigrate' oldkey newbackend afile maxextlen {- Fast migration from hashE to hash backend. -} | migratable && hasExt oldvariety = Just $ alterKey oldkey $ \d -> d - { keyName = keyHash oldkey + { keyName = S.toShort (keyHash oldkey) , keyVariety = newvariety } {- Fast migration from hash to hashE backend. -} | migratable && hasExt newvariety = case afile of AssociatedFile Nothing -> Nothing AssociatedFile (Just file) -> Just $ alterKey oldkey $ \d -> d - { keyName = keyHash oldkey + { keyName = S.toShort $ keyHash oldkey <> selectExtension maxextlen file , keyVariety = newvariety } @@ -186,9 +187,9 @@ trivialMigrate' oldkey newbackend afile maxextlen - non-extension preserving key, with an extension - in its keyName. -} | newvariety == oldvariety && not (hasExt oldvariety) && - keyHash oldkey /= fromKey keyName oldkey = + keyHash oldkey /= S.fromShort (fromKey keyName oldkey) = Just $ alterKey oldkey $ \d -> d - { keyName = keyHash oldkey + { keyName = S.toShort (keyHash oldkey) } | otherwise = Nothing where diff --git a/Backend/Utilities.hs b/Backend/Utilities.hs index 7121d4f2f5..58ba880f94 100644 --- a/Backend/Utilities.hs +++ b/Backend/Utilities.hs @@ -16,6 +16,7 @@ import Types.Key import Types.KeySource import qualified Data.ByteString as S +import qualified Data.ByteString.Short as S (ShortByteString, toShort) import qualified Data.ByteString.Lazy as L import qualified System.FilePath.ByteString as P import Data.Char @@ -25,13 +26,13 @@ import Data.Word - If it's not too long, the full string is used as the keyName. - Otherwise, it's truncated, and its md5 is prepended to ensure a unique - key. -} -genKeyName :: String -> S.ByteString +genKeyName :: String -> S.ShortByteString genKeyName s -- Avoid making keys longer than the length of a SHA256 checksum. - | bytelen > sha256len = encodeBS $ + | bytelen > sha256len = S.toShort $ encodeBS $ truncateFilePath (sha256len - md5len - 1) s' ++ "-" ++ show (md5 bl) - | otherwise = encodeBS s' + | otherwise = S.toShort $ encodeBS s' where s' = preSanitizeKeyName s bl = encodeBL s @@ -47,7 +48,7 @@ addE source sethasext k = do maxlen <- annexMaxExtensionLength <$> Annex.getGitConfig let ext = selectExtension maxlen (keyFilename source) return $ alterKey k $ \d -> d - { keyName = keyName d <> ext + { keyName = keyName d <> S.toShort ext , keyVariety = sethasext (keyVariety d) } diff --git a/Backend/WORM.hs b/Backend/WORM.hs index af116a8077..233ca92e68 100644 --- a/Backend/WORM.hs +++ b/Backend/WORM.hs @@ -17,6 +17,7 @@ import Utility.Metered import qualified Data.ByteString.Char8 as S8 import qualified Utility.RawFilePath as R +import qualified Data.ByteString.Short as S (toShort, fromShort) backends :: [Backend] backends = [backend] @@ -53,12 +54,13 @@ keyValue source _ = do {- Old WORM keys could contain spaces and carriage returns, - and can be upgraded to remove them. -} needsUpgrade :: Key -> Bool -needsUpgrade key = any (`S8.elem` fromKey keyName key) [' ', '\r'] +needsUpgrade key = + any (`S8.elem` S.fromShort (fromKey keyName key)) [' ', '\r'] removeProblemChars :: Key -> Backend -> AssociatedFile -> Annex (Maybe Key) removeProblemChars oldkey newbackend _ | migratable = return $ Just $ alterKey oldkey $ \d -> d - { keyName = encodeBS $ reSanitizeKeyName $ decodeBS $ keyName d } + { keyName = S.toShort $ encodeBS $ reSanitizeKeyName $ decodeBS $ S.fromShort $ keyName d } | otherwise = return Nothing where migratable = oldvariety == newvariety diff --git a/Command/Find.hs b/Command/Find.hs index d89ff2b967..0a5544e437 100644 --- a/Command/Find.hs +++ b/Command/Find.hs @@ -10,6 +10,7 @@ module Command.Find where import Data.Default import qualified Data.Map as M import qualified Data.ByteString as S +import qualified Data.ByteString.Short as S (fromShort) import qualified Data.ByteString.Char8 as S8 import Command @@ -100,7 +101,7 @@ formatVars key (AssociatedFile af) = , ("backend", decodeBS $ formatKeyVariety $ fromKey keyVariety key) , ("bytesize", size show) , ("humansize", size $ roughSize storageUnits True) - , ("keyname", decodeBS $ fromKey keyName key) + , ("keyname", decodeBS $ S.fromShort $ fromKey keyName key) , ("hashdirlower", fromRawFilePath $ hashDirLower def key) , ("hashdirmixed", fromRawFilePath $ hashDirMixed def key) , ("mtime", whenavail show $ fromKey keyMtime key) diff --git a/Crypto.hs b/Crypto.hs index ca10576eb8..751c1cd256 100644 --- a/Crypto.hs +++ b/Crypto.hs @@ -47,6 +47,7 @@ import Types.Crypto import Types.Remote import Types.Key import Annex.SpecialRemote.Config +import qualified Data.ByteString.Short as S (toShort) {- The beginning of a Cipher is used for MAC'ing; the remainder is used - as the GPG symmetric encryption passphrase when using the hybrid @@ -163,7 +164,7 @@ type EncKey = Key -> Key - on content. It does need to be repeatable. -} encryptKey :: Mac -> Cipher -> EncKey encryptKey mac c k = mkKey $ \d -> d - { keyName = encodeBS (macWithCipher mac c (serializeKey k)) + { keyName = S.toShort $ encodeBS $ macWithCipher mac c (serializeKey k) , keyVariety = OtherKey $ encryptedBackendNamePrefix <> encodeBS (showMac mac) } diff --git a/Database/Benchmark.hs b/Database/Benchmark.hs index 9174ac7f09..f895defdf6 100644 --- a/Database/Benchmark.hs +++ b/Database/Benchmark.hs @@ -22,6 +22,7 @@ import Types.Key import Utility.DataUnits import Criterion.Main +import qualified Data.ByteString.Short as S (toShort) import qualified Data.ByteString.Char8 as B8 import System.Random import Control.Concurrent @@ -87,7 +88,7 @@ populateAssociatedFiles h num = do keyN :: Integer -> Key keyN n = mkKey $ \k -> k - { keyName = B8.pack $ "key" ++ show n + { keyName = S.toShort (B8.pack $ "key" ++ show n) , keyVariety = OtherKey "BENCH" } diff --git a/Key.hs b/Key.hs index 4d31dcda36..b19aee8040 100644 --- a/Key.hs +++ b/Key.hs @@ -31,6 +31,7 @@ module Key ( import qualified Data.Text as T import qualified Data.ByteString as S +import qualified Data.ByteString.Short as S (toShort, fromShort) import qualified Data.Attoparsec.ByteString as A import Common @@ -62,7 +63,7 @@ serializeKey :: Key -> String serializeKey = decodeBS . serializeKey' serializeKey' :: Key -> S.ByteString -serializeKey' = keySerialization +serializeKey' = S.fromShort . keySerialization deserializeKey :: String -> Maybe Key deserializeKey = deserializeKey' . encodeBS @@ -72,7 +73,7 @@ deserializeKey' = eitherToMaybe . A.parseOnly keyParser instance Arbitrary KeyData where arbitrary = Key - <$> (encodeBS <$> (listOf1 $ elements $ ['A'..'Z'] ++ ['a'..'z'] ++ ['0'..'9'] ++ "-_\r\n \t")) + <$> (S.toShort . encodeBS <$> (listOf1 $ elements $ ['A'..'Z'] ++ ['a'..'z'] ++ ['0'..'9'] ++ "-_\r\n \t")) <*> (parseKeyVariety . encodeBS <$> (listOf1 $ elements ['A'..'Z'])) -- BACKEND <*> ((abs <$>) <$> arbitrary) -- size cannot be negative <*> ((abs . fromInteger <$>) <$> arbitrary) -- mtime cannot be negative diff --git a/Remote/External/Types.hs b/Remote/External/Types.hs index b259a5be30..a12a0d70eb 100644 --- a/Remote/External/Types.hs +++ b/Remote/External/Types.hs @@ -60,6 +60,7 @@ import Control.Concurrent.STM import Network.URI import Data.Char import Text.Read +import qualified Data.ByteString.Short as S (fromShort) data External = External { externalType :: ExternalType @@ -138,7 +139,7 @@ newtype SafeKey = SafeKey Key mkSafeKey :: Key -> Either String SafeKey mkSafeKey k - | any isSpace (decodeBS $ fromKey keyName k) = Left $ concat + | any isSpace (decodeBS $ S.fromShort $ fromKey keyName k) = Left $ concat [ "Sorry, this file cannot be stored on an external special remote because its key's name contains a space. " , "To avoid this problem, you can run: git-annex migrate --backend=" , decodeBS (formatKeyVariety (fromKey keyVariety k)) diff --git a/Remote/GitLFS.hs b/Remote/GitLFS.hs index 20e755e445..941bf68ce8 100644 --- a/Remote/GitLFS.hs +++ b/Remote/GitLFS.hs @@ -56,6 +56,7 @@ import Network.HTTP.Types import Network.HTTP.Client hiding (port) import qualified Data.Map as M import qualified Data.ByteString.Lazy as L +import qualified Data.ByteString.Short as S (fromShort) import qualified Data.Text as T import qualified Data.Text.Encoding as E import qualified Control.Concurrent.MSemN as MSemN @@ -374,7 +375,7 @@ extractKeySha256 :: Key -> Maybe LFS.SHA256 extractKeySha256 k = case fromKey keyVariety k of SHA2Key (HashSize 256) (HasExt hasext) | hasext -> eitherToMaybe $ E.decodeUtf8' (keyHash k) - | otherwise -> eitherToMaybe $ E.decodeUtf8' (fromKey keyName k) + | otherwise -> eitherToMaybe $ E.decodeUtf8' $ S.fromShort (fromKey keyName k) _ -> Nothing -- The size of an encrypted key is the size of the input data, but we need diff --git a/Types/Key.hs b/Types/Key.hs index b8dd77c59d..7e1b59cdc6 100644 --- a/Types/Key.hs +++ b/Types/Key.hs @@ -29,6 +29,7 @@ module Types.Key ( ) where import qualified Data.ByteString as S +import qualified Data.ByteString.Short as S (ShortByteString, toShort, fromShort) import qualified Data.ByteString.Char8 as S8 import qualified Data.ByteString.Lazy as L import Data.ByteString.Builder @@ -49,7 +50,7 @@ import Prelude {- A Key has a unique name, which is derived from a particular backend, - and may contain other optional metadata. -} data KeyData = Key - { keyName :: S.ByteString + { keyName :: S.ShortByteString , keyVariety :: KeyVariety , keySize :: Maybe Integer , keyMtime :: Maybe EpochTime @@ -66,7 +67,7 @@ instance NFData KeyData -} data Key = MkKey { keyData :: KeyData - , keySerialization :: S.ByteString + , keySerialization :: S.ShortByteString } deriving (Show, Generic) instance Eq Key where @@ -111,8 +112,8 @@ isKeyPrefix s = [fieldSep, fieldSep] `isInfixOf` s fieldSep :: Char fieldSep = '-' -mkKeySerialization :: KeyData -> S.ByteString -mkKeySerialization = L.toStrict +mkKeySerialization :: KeyData -> S.ShortByteString +mkKeySerialization = S.toShort . L.toStrict . toLazyByteStringWith (safeStrategy 128 smallChunkSize) L.empty . buildKeyData @@ -127,7 +128,7 @@ buildKeyData k = byteString (formatKeyVariety (keyVariety k)) <> 'm' ?: (integerDec . (\(CTime t) -> fromIntegral t) <$> keyMtime k) <> 'S' ?: (integerDec <$> keyChunkSize k) <> 'C' ?: (integerDec <$> keyChunkNum k) - <> sepbefore (sepbefore (byteString (keyName k))) + <> sepbefore (sepbefore (byteString (S.fromShort (keyName k)))) where sepbefore s = char7 fieldSep <> s c ?: (Just b) = sepbefore (char7 c <> b) @@ -156,7 +157,7 @@ keyParser = do if validKeyName v n then let d = Key - { keyName = n + { keyName = S.toShort n , keyVariety = v , keySize = s , keyMtime = m @@ -195,7 +196,7 @@ validKeyName kv name - keyName minus extension, and the extension (including leading dot). -} splitKeyNameExtension :: Key -> (S.ByteString, S.ByteString) -splitKeyNameExtension = splitKeyNameExtension' . keyName . keyData +splitKeyNameExtension = splitKeyNameExtension' . S.fromShort . keyName . keyData splitKeyNameExtension' :: S.ByteString -> (S.ByteString, S.ByteString) splitKeyNameExtension' keyname = S8.span (/= '.') keyname diff --git a/Upgrade/V1.hs b/Upgrade/V1.hs index 8befeb0640..bb2abd743d 100644 --- a/Upgrade/V1.hs +++ b/Upgrade/V1.hs @@ -12,6 +12,7 @@ import Data.Char import Data.Default import Data.ByteString.Builder import qualified Data.ByteString as S +import qualified Data.ByteString.Short as S (toShort, fromShort) import qualified Data.ByteString.Lazy as L import qualified System.FilePath.ByteString as P @@ -138,7 +139,7 @@ oldlog2key l where len = length l - 4 k = readKey1 (take len l) - sane = (not . S.null $ fromKey keyName k) && (not . S.null $ formatKeyVariety $ fromKey keyVariety k) + sane = (not . S.null $ S.fromShort $ fromKey keyName k) && (not . S.null $ formatKeyVariety $ fromKey keyVariety k) -- WORM backend keys: "WORM:mtime:size:filename" -- all the rest: "backend:key" @@ -150,7 +151,7 @@ readKey1 :: String -> Key readKey1 v | mixup = fromJust $ deserializeKey $ intercalate ":" $ Prelude.tail bits | otherwise = mkKey $ \d -> d - { keyName = encodeBS n + { keyName = S.toShort (encodeBS n) , keyVariety = parseKeyVariety (encodeBS b) , keySize = s , keyMtime = t @@ -175,7 +176,7 @@ showKey1 k = intercalate ":" $ filter (not . null) showifhere Nothing = "" showifhere (Just x) = show x b = decodeBS $ formatKeyVariety v - n = fromKey keyName k + n = S.fromShort $ fromKey keyName k v = fromKey keyVariety k s = fromKey keySize k t = fromKey keyMtime k @@ -212,7 +213,7 @@ lookupKey1 file = do where k = fileKey1 l bname = decodeBS (formatKeyVariety (fromKey keyVariety k)) - kname = decodeBS (fromKey keyName k) + kname = decodeBS (S.fromShort (fromKey keyName k)) skip = "skipping " ++ file ++ " (unknown backend " ++ bname ++ ")" diff --git a/doc/bugs/borg_special_remote_memory_usage_high_for_large_borg_repo/comment_7_f59d9c51716892240ebd12fa80a2e58b._comment b/doc/bugs/borg_special_remote_memory_usage_high_for_large_borg_repo/comment_7_f59d9c51716892240ebd12fa80a2e58b._comment index d308db7522..0d8cad31c7 100644 --- a/doc/bugs/borg_special_remote_memory_usage_high_for_large_borg_repo/comment_7_f59d9c51716892240ebd12fa80a2e58b._comment +++ b/doc/bugs/borg_special_remote_memory_usage_high_for_large_borg_repo/comment_7_f59d9c51716892240ebd12fa80a2e58b._comment @@ -4,6 +4,20 @@ date="2021-10-05T23:00:18Z" content=""" I tried converting Ref to use ShortByteString. Memory use did not improve -and the -hc profile is unchanged. So the pinned memory is not in refs. My -guess is it must be filenames in the tree then. +and the -hc profile is unchanged. So the pinned memory is not in refs. + +Also tried converting Key to use ShortByteString. That was a win! +My 20 borg archive test case is down from 320 mb to 242 mb. + +Looking at Command.SyncpullThirdPartyPopulated, +it calls listContents, which calls borg's listImportableContents, +and produces an `ImportableContents (ContentIdentifier, ByteSize)` +then that gets passed through importKeys to produce +an `ImportableContents (Either Sha Key)`. Probably +double memory is used while doing that conversion, unless +the GC manages to free the first one while it's traversed. + +If borg's listImportableContents included a Key (which it does +produce already only to throw away!) that might +eliminate the big spike just before treeItemsToTree. """]]