convert Key to ShortByteString

This adds the overhead of a copy when serializing and deserializing keys.
I have not benchmarked much, but runtimes seem barely changed at all by that.

When a lot of keys are in memory, it improves memory use.

And, it prevents keys sometimes getting PINNED in memory and failing to GC,
which is a problem ByteString has sometimes. In particular, git-annex sync
from a borg special remote had that problem and this improved its memory
use by a large amount.

Sponsored-by: Shae Erisson on Patreon
This commit is contained in:
Joey Hess 2021-10-05 20:20:08 -04:00
parent 012b71e471
commit 19e78816f0
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
15 changed files with 65 additions and 36 deletions

View file

@ -18,6 +18,7 @@ import qualified Types.Remote as Remote
import Messages
import Data.Maybe
import qualified Data.ByteString.Short as S (fromShort, toShort)
-- From a sha pointing to the content of a file to the key
-- to use to export it. When the file is annexed, it's the annexed key.
@ -39,7 +40,7 @@ exportKey sha = mk <$> catKey sha
-- only checksum the content.
gitShaKey :: Git.Sha -> Key
gitShaKey (Git.Ref s) = mkKey $ \kd -> kd
{ keyName = s
{ keyName = S.toShort s
, keyVariety = OtherKey "GIT"
}
@ -47,7 +48,7 @@ gitShaKey (Git.Ref s) = mkKey $ \kd -> kd
keyGitSha :: Key -> Maybe Git.Sha
keyGitSha k
| fromKey keyVariety k == OtherKey "GIT" =
Just (Git.Ref (fromKey keyName k))
Just (Git.Ref (S.fromShort (fromKey keyName k)))
| otherwise = Nothing
-- Is a key storing a git sha, and not used for an annexed file?

View file

@ -33,6 +33,7 @@ import qualified Backend.URL
import qualified Backend.External
import qualified Data.Map as M
import qualified Data.ByteString.Short as S (toShort, fromShort)
import qualified Data.ByteString.Char8 as S8
{- Build-in backends. Does not include externals. -}
@ -67,7 +68,7 @@ genKey source meterupdate preferredbackend = do
where
-- keyNames should not contain newline characters.
makesane k = alterKey k $ \d -> d
{ keyName = S8.map fixbadchar (fromKey keyName k)
{ keyName = S.toShort (S8.map fixbadchar (S.fromShort (fromKey keyName k)))
}
fixbadchar c
| c == '\n' = '_'

View file

@ -20,6 +20,7 @@ import Utility.Metered
import qualified Utility.SimpleProtocol as Proto
import qualified Data.ByteString as S
import qualified Data.ByteString.Short as S (toShort, fromShort)
import qualified Data.Map.Strict as M
import Data.Char
import Control.Concurrent
@ -285,7 +286,7 @@ toProtoKey k = ProtoKey $ alterKey k $ \d -> d
-- The extension can be easily removed, because the protocol
-- documentation does not allow '.' to be used in the keyName,
-- so the first one is the extension.
{ keyName = S.takeWhile (/= dot) (keyName d)
{ keyName = S.toShort (S.takeWhile (/= dot) (S.fromShort (keyName d)))
, keyVariety = setHasExt (HasExt False) (keyVariety d)
}
where

View file

@ -24,6 +24,7 @@ import Utility.Metered
import qualified Utility.RawFilePath as R
import qualified Data.ByteString as S
import qualified Data.ByteString.Short as S (toShort, fromShort)
import qualified Data.ByteString.Char8 as S8
import qualified Data.ByteString.Lazy as L
import Control.DeepSeq
@ -106,7 +107,7 @@ keyValue hash source meterupdate = do
filesize <- liftIO $ getFileSize file
s <- hashFile hash file meterupdate
return $ mkKey $ \k -> k
{ keyName = encodeBS s
{ keyName = S.toShort (encodeBS s)
, keyVariety = hashKeyVariety hash (HasExt False)
, keySize = Just filesize
}
@ -160,7 +161,7 @@ needsUpgrade :: Key -> Bool
needsUpgrade key = or
[ "\\" `S8.isPrefixOf` keyHash key
, S.any (not . validInExtension) (snd $ splitKeyNameExtension key)
, not (hasExt (fromKey keyVariety key)) && keyHash key /= fromKey keyName key
, not (hasExt (fromKey keyVariety key)) && keyHash key /= S.fromShort (fromKey keyName key)
]
trivialMigrate :: Key -> Backend -> AssociatedFile -> Annex (Maybe Key)
@ -171,14 +172,14 @@ trivialMigrate' :: Key -> Backend -> AssociatedFile -> Maybe Int -> Maybe Key
trivialMigrate' oldkey newbackend afile maxextlen
{- Fast migration from hashE to hash backend. -}
| migratable && hasExt oldvariety = Just $ alterKey oldkey $ \d -> d
{ keyName = keyHash oldkey
{ keyName = S.toShort (keyHash oldkey)
, keyVariety = newvariety
}
{- Fast migration from hash to hashE backend. -}
| migratable && hasExt newvariety = case afile of
AssociatedFile Nothing -> Nothing
AssociatedFile (Just file) -> Just $ alterKey oldkey $ \d -> d
{ keyName = keyHash oldkey
{ keyName = S.toShort $ keyHash oldkey
<> selectExtension maxextlen file
, keyVariety = newvariety
}
@ -186,9 +187,9 @@ trivialMigrate' oldkey newbackend afile maxextlen
- non-extension preserving key, with an extension
- in its keyName. -}
| newvariety == oldvariety && not (hasExt oldvariety) &&
keyHash oldkey /= fromKey keyName oldkey =
keyHash oldkey /= S.fromShort (fromKey keyName oldkey) =
Just $ alterKey oldkey $ \d -> d
{ keyName = keyHash oldkey
{ keyName = S.toShort (keyHash oldkey)
}
| otherwise = Nothing
where

View file

@ -16,6 +16,7 @@ import Types.Key
import Types.KeySource
import qualified Data.ByteString as S
import qualified Data.ByteString.Short as S (ShortByteString, toShort)
import qualified Data.ByteString.Lazy as L
import qualified System.FilePath.ByteString as P
import Data.Char
@ -25,13 +26,13 @@ import Data.Word
- If it's not too long, the full string is used as the keyName.
- Otherwise, it's truncated, and its md5 is prepended to ensure a unique
- key. -}
genKeyName :: String -> S.ByteString
genKeyName :: String -> S.ShortByteString
genKeyName s
-- Avoid making keys longer than the length of a SHA256 checksum.
| bytelen > sha256len = encodeBS $
| bytelen > sha256len = S.toShort $ encodeBS $
truncateFilePath (sha256len - md5len - 1) s' ++ "-" ++
show (md5 bl)
| otherwise = encodeBS s'
| otherwise = S.toShort $ encodeBS s'
where
s' = preSanitizeKeyName s
bl = encodeBL s
@ -47,7 +48,7 @@ addE source sethasext k = do
maxlen <- annexMaxExtensionLength <$> Annex.getGitConfig
let ext = selectExtension maxlen (keyFilename source)
return $ alterKey k $ \d -> d
{ keyName = keyName d <> ext
{ keyName = keyName d <> S.toShort ext
, keyVariety = sethasext (keyVariety d)
}

View file

@ -17,6 +17,7 @@ import Utility.Metered
import qualified Data.ByteString.Char8 as S8
import qualified Utility.RawFilePath as R
import qualified Data.ByteString.Short as S (toShort, fromShort)
backends :: [Backend]
backends = [backend]
@ -53,12 +54,13 @@ keyValue source _ = do
{- Old WORM keys could contain spaces and carriage returns,
- and can be upgraded to remove them. -}
needsUpgrade :: Key -> Bool
needsUpgrade key = any (`S8.elem` fromKey keyName key) [' ', '\r']
needsUpgrade key =
any (`S8.elem` S.fromShort (fromKey keyName key)) [' ', '\r']
removeProblemChars :: Key -> Backend -> AssociatedFile -> Annex (Maybe Key)
removeProblemChars oldkey newbackend _
| migratable = return $ Just $ alterKey oldkey $ \d -> d
{ keyName = encodeBS $ reSanitizeKeyName $ decodeBS $ keyName d }
{ keyName = S.toShort $ encodeBS $ reSanitizeKeyName $ decodeBS $ S.fromShort $ keyName d }
| otherwise = return Nothing
where
migratable = oldvariety == newvariety

View file

@ -10,6 +10,7 @@ module Command.Find where
import Data.Default
import qualified Data.Map as M
import qualified Data.ByteString as S
import qualified Data.ByteString.Short as S (fromShort)
import qualified Data.ByteString.Char8 as S8
import Command
@ -100,7 +101,7 @@ formatVars key (AssociatedFile af) =
, ("backend", decodeBS $ formatKeyVariety $ fromKey keyVariety key)
, ("bytesize", size show)
, ("humansize", size $ roughSize storageUnits True)
, ("keyname", decodeBS $ fromKey keyName key)
, ("keyname", decodeBS $ S.fromShort $ fromKey keyName key)
, ("hashdirlower", fromRawFilePath $ hashDirLower def key)
, ("hashdirmixed", fromRawFilePath $ hashDirMixed def key)
, ("mtime", whenavail show $ fromKey keyMtime key)

View file

@ -47,6 +47,7 @@ import Types.Crypto
import Types.Remote
import Types.Key
import Annex.SpecialRemote.Config
import qualified Data.ByteString.Short as S (toShort)
{- The beginning of a Cipher is used for MAC'ing; the remainder is used
- as the GPG symmetric encryption passphrase when using the hybrid
@ -163,7 +164,7 @@ type EncKey = Key -> Key
- on content. It does need to be repeatable. -}
encryptKey :: Mac -> Cipher -> EncKey
encryptKey mac c k = mkKey $ \d -> d
{ keyName = encodeBS (macWithCipher mac c (serializeKey k))
{ keyName = S.toShort $ encodeBS $ macWithCipher mac c (serializeKey k)
, keyVariety = OtherKey $
encryptedBackendNamePrefix <> encodeBS (showMac mac)
}

View file

@ -22,6 +22,7 @@ import Types.Key
import Utility.DataUnits
import Criterion.Main
import qualified Data.ByteString.Short as S (toShort)
import qualified Data.ByteString.Char8 as B8
import System.Random
import Control.Concurrent
@ -87,7 +88,7 @@ populateAssociatedFiles h num = do
keyN :: Integer -> Key
keyN n = mkKey $ \k -> k
{ keyName = B8.pack $ "key" ++ show n
{ keyName = S.toShort (B8.pack $ "key" ++ show n)
, keyVariety = OtherKey "BENCH"
}

5
Key.hs
View file

@ -31,6 +31,7 @@ module Key (
import qualified Data.Text as T
import qualified Data.ByteString as S
import qualified Data.ByteString.Short as S (toShort, fromShort)
import qualified Data.Attoparsec.ByteString as A
import Common
@ -62,7 +63,7 @@ serializeKey :: Key -> String
serializeKey = decodeBS . serializeKey'
serializeKey' :: Key -> S.ByteString
serializeKey' = keySerialization
serializeKey' = S.fromShort . keySerialization
deserializeKey :: String -> Maybe Key
deserializeKey = deserializeKey' . encodeBS
@ -72,7 +73,7 @@ deserializeKey' = eitherToMaybe . A.parseOnly keyParser
instance Arbitrary KeyData where
arbitrary = Key
<$> (encodeBS <$> (listOf1 $ elements $ ['A'..'Z'] ++ ['a'..'z'] ++ ['0'..'9'] ++ "-_\r\n \t"))
<$> (S.toShort . encodeBS <$> (listOf1 $ elements $ ['A'..'Z'] ++ ['a'..'z'] ++ ['0'..'9'] ++ "-_\r\n \t"))
<*> (parseKeyVariety . encodeBS <$> (listOf1 $ elements ['A'..'Z'])) -- BACKEND
<*> ((abs <$>) <$> arbitrary) -- size cannot be negative
<*> ((abs . fromInteger <$>) <$> arbitrary) -- mtime cannot be negative

View file

@ -60,6 +60,7 @@ import Control.Concurrent.STM
import Network.URI
import Data.Char
import Text.Read
import qualified Data.ByteString.Short as S (fromShort)
data External = External
{ externalType :: ExternalType
@ -138,7 +139,7 @@ newtype SafeKey = SafeKey Key
mkSafeKey :: Key -> Either String SafeKey
mkSafeKey k
| any isSpace (decodeBS $ fromKey keyName k) = Left $ concat
| any isSpace (decodeBS $ S.fromShort $ fromKey keyName k) = Left $ concat
[ "Sorry, this file cannot be stored on an external special remote because its key's name contains a space. "
, "To avoid this problem, you can run: git-annex migrate --backend="
, decodeBS (formatKeyVariety (fromKey keyVariety k))

View file

@ -56,6 +56,7 @@ import Network.HTTP.Types
import Network.HTTP.Client hiding (port)
import qualified Data.Map as M
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Short as S (fromShort)
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
import qualified Control.Concurrent.MSemN as MSemN
@ -374,7 +375,7 @@ extractKeySha256 :: Key -> Maybe LFS.SHA256
extractKeySha256 k = case fromKey keyVariety k of
SHA2Key (HashSize 256) (HasExt hasext)
| hasext -> eitherToMaybe $ E.decodeUtf8' (keyHash k)
| otherwise -> eitherToMaybe $ E.decodeUtf8' (fromKey keyName k)
| otherwise -> eitherToMaybe $ E.decodeUtf8' $ S.fromShort (fromKey keyName k)
_ -> Nothing
-- The size of an encrypted key is the size of the input data, but we need

View file

@ -29,6 +29,7 @@ module Types.Key (
) where
import qualified Data.ByteString as S
import qualified Data.ByteString.Short as S (ShortByteString, toShort, fromShort)
import qualified Data.ByteString.Char8 as S8
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Builder
@ -49,7 +50,7 @@ import Prelude
{- A Key has a unique name, which is derived from a particular backend,
- and may contain other optional metadata. -}
data KeyData = Key
{ keyName :: S.ByteString
{ keyName :: S.ShortByteString
, keyVariety :: KeyVariety
, keySize :: Maybe Integer
, keyMtime :: Maybe EpochTime
@ -66,7 +67,7 @@ instance NFData KeyData
-}
data Key = MkKey
{ keyData :: KeyData
, keySerialization :: S.ByteString
, keySerialization :: S.ShortByteString
} deriving (Show, Generic)
instance Eq Key where
@ -111,8 +112,8 @@ isKeyPrefix s = [fieldSep, fieldSep] `isInfixOf` s
fieldSep :: Char
fieldSep = '-'
mkKeySerialization :: KeyData -> S.ByteString
mkKeySerialization = L.toStrict
mkKeySerialization :: KeyData -> S.ShortByteString
mkKeySerialization = S.toShort . L.toStrict
. toLazyByteStringWith (safeStrategy 128 smallChunkSize) L.empty
. buildKeyData
@ -127,7 +128,7 @@ buildKeyData k = byteString (formatKeyVariety (keyVariety k))
<> 'm' ?: (integerDec . (\(CTime t) -> fromIntegral t) <$> keyMtime k)
<> 'S' ?: (integerDec <$> keyChunkSize k)
<> 'C' ?: (integerDec <$> keyChunkNum k)
<> sepbefore (sepbefore (byteString (keyName k)))
<> sepbefore (sepbefore (byteString (S.fromShort (keyName k))))
where
sepbefore s = char7 fieldSep <> s
c ?: (Just b) = sepbefore (char7 c <> b)
@ -156,7 +157,7 @@ keyParser = do
if validKeyName v n
then
let d = Key
{ keyName = n
{ keyName = S.toShort n
, keyVariety = v
, keySize = s
, keyMtime = m
@ -195,7 +196,7 @@ validKeyName kv name
- keyName minus extension, and the extension (including leading dot).
-}
splitKeyNameExtension :: Key -> (S.ByteString, S.ByteString)
splitKeyNameExtension = splitKeyNameExtension' . keyName . keyData
splitKeyNameExtension = splitKeyNameExtension' . S.fromShort . keyName . keyData
splitKeyNameExtension' :: S.ByteString -> (S.ByteString, S.ByteString)
splitKeyNameExtension' keyname = S8.span (/= '.') keyname

View file

@ -12,6 +12,7 @@ import Data.Char
import Data.Default
import Data.ByteString.Builder
import qualified Data.ByteString as S
import qualified Data.ByteString.Short as S (toShort, fromShort)
import qualified Data.ByteString.Lazy as L
import qualified System.FilePath.ByteString as P
@ -138,7 +139,7 @@ oldlog2key l
where
len = length l - 4
k = readKey1 (take len l)
sane = (not . S.null $ fromKey keyName k) && (not . S.null $ formatKeyVariety $ fromKey keyVariety k)
sane = (not . S.null $ S.fromShort $ fromKey keyName k) && (not . S.null $ formatKeyVariety $ fromKey keyVariety k)
-- WORM backend keys: "WORM:mtime:size:filename"
-- all the rest: "backend:key"
@ -150,7 +151,7 @@ readKey1 :: String -> Key
readKey1 v
| mixup = fromJust $ deserializeKey $ intercalate ":" $ Prelude.tail bits
| otherwise = mkKey $ \d -> d
{ keyName = encodeBS n
{ keyName = S.toShort (encodeBS n)
, keyVariety = parseKeyVariety (encodeBS b)
, keySize = s
, keyMtime = t
@ -175,7 +176,7 @@ showKey1 k = intercalate ":" $ filter (not . null)
showifhere Nothing = ""
showifhere (Just x) = show x
b = decodeBS $ formatKeyVariety v
n = fromKey keyName k
n = S.fromShort $ fromKey keyName k
v = fromKey keyVariety k
s = fromKey keySize k
t = fromKey keyMtime k
@ -212,7 +213,7 @@ lookupKey1 file = do
where
k = fileKey1 l
bname = decodeBS (formatKeyVariety (fromKey keyVariety k))
kname = decodeBS (fromKey keyName k)
kname = decodeBS (S.fromShort (fromKey keyName k))
skip = "skipping " ++ file ++
" (unknown backend " ++ bname ++ ")"

View file

@ -4,6 +4,20 @@
date="2021-10-05T23:00:18Z"
content="""
I tried converting Ref to use ShortByteString. Memory use did not improve
and the -hc profile is unchanged. So the pinned memory is not in refs. My
guess is it must be filenames in the tree then.
and the -hc profile is unchanged. So the pinned memory is not in refs.
Also tried converting Key to use ShortByteString. That was a win!
My 20 borg archive test case is down from 320 mb to 242 mb.
Looking at Command.SyncpullThirdPartyPopulated,
it calls listContents, which calls borg's listImportableContents,
and produces an `ImportableContents (ContentIdentifier, ByteSize)`
then that gets passed through importKeys to produce
an `ImportableContents (Either Sha Key)`. Probably
double memory is used while doing that conversion, unless
the GC manages to free the first one while it's traversed.
If borg's listImportableContents included a Key (which it does
produce already only to throw away!) that might
eliminate the big spike just before treeItemsToTree.
"""]]