cache the serialization of a Key

This will speed up the common case where a Key is deserialized from
disk, but is then serialized to build eg, the path to the annex object.

Previously attempted in 4536c93bb2
and reverted in 96aba8eff7.
The problems mentioned in the latter commit are addressed now:

Read/Show of KeyData is backwards-compatible with Read/Show of Key from before
this change, so Types.Distribution will keep working.

The Eq instance is fixed.

Also, Key has smart constructors, avoiding needing to remember to update
the cached serialization.

Used git-annex benchmark:
  find is 7% faster
  whereis is 3% faster
  get when all files are already present is 5% faster
Generally, the benchmarks are running 0.1 seconds faster per 2000 files,
on a ram disk in my laptop.
This commit is contained in:
Joey Hess 2019-11-22 16:24:04 -04:00
parent e296637737
commit 81d402216d
53 changed files with 388 additions and 289 deletions

View file

@ -7,19 +7,47 @@
{-# LANGUAGE OverloadedStrings, DeriveGeneric #-}
module Types.Key where
module Types.Key (
KeyData(..),
Key,
fromKey,
mkKey,
alterKey,
isKeyPrefix,
splitKeyNameExtension,
keyParser,
keySerialization,
AssociatedFile(..),
KeyVariety(..),
HasExt(..),
HashSize(..),
hasExt,
sameExceptExt,
cryptographicallySecure,
isVerifiable,
formatKeyVariety,
parseKeyVariety,
) where
import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Builder
import Data.ByteString.Builder.Extra
import qualified Data.Attoparsec.ByteString as A
import qualified Data.Attoparsec.ByteString.Char8 as A8
import Data.List
import System.Posix.Types
import Foreign.C.Types
import Data.Monoid
import Control.Applicative
import GHC.Generics
import Control.DeepSeq
import Prelude
{- A Key has a unique name, which is derived from a particular backend,
- and may contain other optional metadata. -}
data Key = Key
data KeyData = Key
{ keyName :: S.ByteString
, keyVariety :: KeyVariety
, keySize :: Maybe Integer
@ -28,8 +56,149 @@ data Key = Key
, keyChunkNum :: Maybe Integer
} deriving (Eq, Ord, Read, Show, Generic)
instance NFData KeyData
{- Caching the seralization of a key is an optimization.
-
- This constructor is not exported, and all smart constructors maintain
- the serialization.
-}
data Key = MkKey
{ keyData :: KeyData
, keySerialization :: S.ByteString
} deriving (Show, Generic)
instance Eq Key where
-- comparing the serialization would be unncessary work
a == b = keyData a == keyData b
instance Ord Key where
compare a b = compare (keyData a) (keyData b)
instance NFData Key
{- Access a field of data from the KeyData. -}
{-# INLINE fromKey #-}
fromKey :: (KeyData -> a) -> Key -> a
fromKey f = f . keyData
{- Smart constructor for a Key. The provided KeyData has all values empty. -}
mkKey :: (KeyData -> KeyData) -> Key
mkKey f =
let d = f stub
in MkKey d (mkKeySerialization d)
where
stub = Key
{ keyName = mempty
, keyVariety = OtherKey mempty
, keySize = Nothing
, keyMtime = Nothing
, keyChunkSize = Nothing
, keyChunkNum = Nothing
}
{- Alter a Key's data. -}
alterKey :: Key -> (KeyData -> KeyData) -> Key
alterKey k f =
let d = f (keyData k)
in MkKey d (mkKeySerialization d)
-- Checks if a string looks like at least the start of a key.
isKeyPrefix :: String -> Bool
isKeyPrefix s = [fieldSep, fieldSep] `isInfixOf` s
fieldSep :: Char
fieldSep = '-'
mkKeySerialization :: KeyData -> S.ByteString
mkKeySerialization = L.toStrict
. toLazyByteStringWith (safeStrategy 128 smallChunkSize) L.empty
. buildKeyData
{- Builds a ByteString from a KeyData.
-
- The name field is always shown last, separated by doubled fieldSeps,
- and is the only field allowed to contain the fieldSep.
-}
buildKeyData :: KeyData -> Builder
buildKeyData k = byteString (formatKeyVariety (keyVariety k))
<> 's' ?: (integerDec <$> keySize k)
<> 'm' ?: (integerDec . (\(CTime t) -> fromIntegral t) <$> keyMtime k)
<> 'S' ?: (integerDec <$> keyChunkSize k)
<> 'C' ?: (integerDec <$> keyChunkNum k)
<> sepbefore (sepbefore (byteString (keyName k)))
where
sepbefore s = char7 fieldSep <> s
c ?: (Just b) = sepbefore (char7 c <> b)
_ ?: Nothing = mempty
{- This is a strict parser for security reasons; a key
- can contain only 4 fields, which all consist only of numbers.
- Any key containing other fields, or non-numeric data will fail
- to parse.
-
- If a key contained non-numeric fields, they could be used to
- embed data used in a SHA1 collision attack, which would be a
- problem since the keys are committed to git.
-}
keyParser :: A.Parser Key
keyParser = do
-- key variety cannot be empty
v <- (parseKeyVariety <$> A8.takeWhile1 (/= fieldSep))
s <- parsesize
m <- parsemtime
cs <- parsechunksize
cn <- parsechunknum
_ <- A8.char fieldSep
_ <- A8.char fieldSep
n <- A.takeByteString
if validKeyName v n
then
let d = Key
{ keyName = n
, keyVariety = v
, keySize = s
, keyMtime = m
, keyChunkSize = cs
, keyChunkNum = cn
}
in pure $ MkKey d (mkKeySerialization d)
else fail "invalid keyName"
where
parseopt p = (Just <$> (A8.char fieldSep *> p)) <|> pure Nothing
parsesize = parseopt $ A8.char 's' *> A8.decimal
parsemtime = parseopt $ CTime <$> (A8.char 'm' *> A8.decimal)
parsechunksize = parseopt $ A8.char 'S' *> A8.decimal
parsechunknum = parseopt $ A8.char 'C' *> A8.decimal
{- Limits the length of the extension in the keyName to mitigate against
- SHA1 collision attacks.
-
- In such an attack, the extension of the key could be made to contain
- the collision generation data, with the result that a signed git commit
- including such keys would not be secure.
-
- The maximum extension length ever generated for such a key was 8
- characters, but they may be unicode which could use up to 4 bytes each,
- so 32 bytes. 64 bytes is used here to give a little future wiggle-room.
- The SHA1 common-prefix attack needs 128 bytes of data.
-}
validKeyName :: KeyVariety -> S.ByteString -> Bool
validKeyName kv name
| hasExt kv =
let ext = snd $ splitKeyNameExtension' name
in S.length ext <= 64
| otherwise = True
{- This splits any extension out of the keyName, returning the
- keyName minus extension, and the extension (including leading dot).
-}
splitKeyNameExtension :: Key -> (S.ByteString, S.ByteString)
splitKeyNameExtension = splitKeyNameExtension' . keyName . keyData
splitKeyNameExtension' :: S.ByteString -> (S.ByteString, S.ByteString)
splitKeyNameExtension' keyname = S8.span (/= '.') keyname
{- A filename may be associated with a Key. -}
newtype AssociatedFile = AssociatedFile (Maybe FilePath)
deriving (Show, Eq, Ord)