git-annex/Key.hs

109 lines
2.8 KiB
Haskell
Raw Permalink Normal View History

2017-02-24 17:42:30 +00:00
{- git-annex Keys
-
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
2017-02-24 17:42:30 +00:00
-
- Licensed under the GNU AGPL version 3 or higher.
2017-02-24 17:42:30 +00:00
-}
{-# LANGUAGE OverloadedStrings #-}
2017-02-24 17:42:30 +00:00
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Key (
Key,
KeyData(..),
AssociatedFile(..),
fromKey,
mkKey,
alterKey,
keyParser,
serializeKey,
2019-01-14 17:17:47 +00:00
serializeKey',
deserializeKey,
deserializeKey',
2017-02-24 17:42:30 +00:00
nonChunkKey,
chunkKeyOffset,
isChunkKey,
isKeyPrefix,
convert key2file and file2key to use builder and attoparsec The new parser is significantly stricter than the old one: The old file2key allowed the fields to come in any order, but the new one requires the fixed order that git-annex has always used. Hopefully this will not cause any breakage. And the old file2key allowed eg SHA1-m1-m2-m3-m4-m5-m6--xxxx while the new does not allow duplication of fields. This could potentially improve security, because allowing lots of extra junk like that in a key could potentially be used in a SHA1 collision attack, although the current attacks need binary data and not this kind of structured numeric data. Speed improved of course, and fairly substantially, in microbenchmarks: benchmarking old/key2file time 2.264 μs (2.257 μs .. 2.273 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 2.265 μs (2.260 μs .. 2.275 μs) std dev 21.17 ns (13.06 ns .. 39.26 ns) benchmarking new/key2file' time 1.744 μs (1.741 μs .. 1.747 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.745 μs (1.742 μs .. 1.751 μs) std dev 13.55 ns (9.099 ns .. 21.89 ns) benchmarking old/file2key time 6.114 μs (6.102 μs .. 6.129 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 6.118 μs (6.106 μs .. 6.143 μs) std dev 55.00 ns (30.08 ns .. 100.2 ns) benchmarking new/file2key' time 1.791 μs (1.782 μs .. 1.801 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 1.792 μs (1.785 μs .. 1.804 μs) std dev 32.46 ns (20.59 ns .. 50.82 ns) variance introduced by outliers: 19% (moderately inflated)
2019-01-11 20:33:42 +00:00
splitKeyNameExtension,
2017-02-24 17:42:30 +00:00
prop_isomorphic_key_encode
2017-02-24 17:42:30 +00:00
) where
import qualified Data.Text as T
convert key2file and file2key to use builder and attoparsec The new parser is significantly stricter than the old one: The old file2key allowed the fields to come in any order, but the new one requires the fixed order that git-annex has always used. Hopefully this will not cause any breakage. And the old file2key allowed eg SHA1-m1-m2-m3-m4-m5-m6--xxxx while the new does not allow duplication of fields. This could potentially improve security, because allowing lots of extra junk like that in a key could potentially be used in a SHA1 collision attack, although the current attacks need binary data and not this kind of structured numeric data. Speed improved of course, and fairly substantially, in microbenchmarks: benchmarking old/key2file time 2.264 μs (2.257 μs .. 2.273 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 2.265 μs (2.260 μs .. 2.275 μs) std dev 21.17 ns (13.06 ns .. 39.26 ns) benchmarking new/key2file' time 1.744 μs (1.741 μs .. 1.747 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.745 μs (1.742 μs .. 1.751 μs) std dev 13.55 ns (9.099 ns .. 21.89 ns) benchmarking old/file2key time 6.114 μs (6.102 μs .. 6.129 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 6.118 μs (6.106 μs .. 6.143 μs) std dev 55.00 ns (30.08 ns .. 100.2 ns) benchmarking new/file2key' time 1.791 μs (1.782 μs .. 1.801 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 1.792 μs (1.785 μs .. 1.804 μs) std dev 32.46 ns (20.59 ns .. 50.82 ns) variance introduced by outliers: 19% (moderately inflated)
2019-01-11 20:33:42 +00:00
import qualified Data.ByteString as S
import qualified Data.ByteString.Short as S (toShort, fromShort)
convert key2file and file2key to use builder and attoparsec The new parser is significantly stricter than the old one: The old file2key allowed the fields to come in any order, but the new one requires the fixed order that git-annex has always used. Hopefully this will not cause any breakage. And the old file2key allowed eg SHA1-m1-m2-m3-m4-m5-m6--xxxx while the new does not allow duplication of fields. This could potentially improve security, because allowing lots of extra junk like that in a key could potentially be used in a SHA1 collision attack, although the current attacks need binary data and not this kind of structured numeric data. Speed improved of course, and fairly substantially, in microbenchmarks: benchmarking old/key2file time 2.264 μs (2.257 μs .. 2.273 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 2.265 μs (2.260 μs .. 2.275 μs) std dev 21.17 ns (13.06 ns .. 39.26 ns) benchmarking new/key2file' time 1.744 μs (1.741 μs .. 1.747 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.745 μs (1.742 μs .. 1.751 μs) std dev 13.55 ns (9.099 ns .. 21.89 ns) benchmarking old/file2key time 6.114 μs (6.102 μs .. 6.129 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 6.118 μs (6.106 μs .. 6.143 μs) std dev 55.00 ns (30.08 ns .. 100.2 ns) benchmarking new/file2key' time 1.791 μs (1.782 μs .. 1.801 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 1.792 μs (1.785 μs .. 1.804 μs) std dev 32.46 ns (20.59 ns .. 50.82 ns) variance introduced by outliers: 19% (moderately inflated)
2019-01-11 20:33:42 +00:00
import qualified Data.Attoparsec.ByteString as A
2017-02-24 17:42:30 +00:00
import Common
import Types.Key
import Utility.QuickCheck
import Utility.Bloom
import Utility.Aeson
2017-02-24 17:42:30 +00:00
import qualified Utility.SimpleProtocol as Proto
-- Gets the parent of a chunk key.
nonChunkKey :: Key -> Key
nonChunkKey k
| fromKey keyChunkSize k == Nothing && fromKey keyChunkNum k == Nothing = k
| otherwise = alterKey k $ \d -> d
{ keyChunkSize = Nothing
, keyChunkNum = Nothing
}
2017-02-24 17:42:30 +00:00
-- Where a chunk key is offset within its parent.
chunkKeyOffset :: Key -> Maybe Integer
chunkKeyOffset k = (*)
<$> fromKey keyChunkSize k
<*> (pred <$> fromKey keyChunkNum k)
2017-02-24 17:42:30 +00:00
isChunkKey :: Key -> Bool
isChunkKey k = isJust (fromKey keyChunkSize k) && isJust (fromKey keyChunkNum k)
convert key2file and file2key to use builder and attoparsec The new parser is significantly stricter than the old one: The old file2key allowed the fields to come in any order, but the new one requires the fixed order that git-annex has always used. Hopefully this will not cause any breakage. And the old file2key allowed eg SHA1-m1-m2-m3-m4-m5-m6--xxxx while the new does not allow duplication of fields. This could potentially improve security, because allowing lots of extra junk like that in a key could potentially be used in a SHA1 collision attack, although the current attacks need binary data and not this kind of structured numeric data. Speed improved of course, and fairly substantially, in microbenchmarks: benchmarking old/key2file time 2.264 μs (2.257 μs .. 2.273 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 2.265 μs (2.260 μs .. 2.275 μs) std dev 21.17 ns (13.06 ns .. 39.26 ns) benchmarking new/key2file' time 1.744 μs (1.741 μs .. 1.747 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.745 μs (1.742 μs .. 1.751 μs) std dev 13.55 ns (9.099 ns .. 21.89 ns) benchmarking old/file2key time 6.114 μs (6.102 μs .. 6.129 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 6.118 μs (6.106 μs .. 6.143 μs) std dev 55.00 ns (30.08 ns .. 100.2 ns) benchmarking new/file2key' time 1.791 μs (1.782 μs .. 1.801 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 1.792 μs (1.785 μs .. 1.804 μs) std dev 32.46 ns (20.59 ns .. 50.82 ns) variance introduced by outliers: 19% (moderately inflated)
2019-01-11 20:33:42 +00:00
serializeKey :: Key -> String
serializeKey = decodeBS . serializeKey'
convert key2file and file2key to use builder and attoparsec The new parser is significantly stricter than the old one: The old file2key allowed the fields to come in any order, but the new one requires the fixed order that git-annex has always used. Hopefully this will not cause any breakage. And the old file2key allowed eg SHA1-m1-m2-m3-m4-m5-m6--xxxx while the new does not allow duplication of fields. This could potentially improve security, because allowing lots of extra junk like that in a key could potentially be used in a SHA1 collision attack, although the current attacks need binary data and not this kind of structured numeric data. Speed improved of course, and fairly substantially, in microbenchmarks: benchmarking old/key2file time 2.264 μs (2.257 μs .. 2.273 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 2.265 μs (2.260 μs .. 2.275 μs) std dev 21.17 ns (13.06 ns .. 39.26 ns) benchmarking new/key2file' time 1.744 μs (1.741 μs .. 1.747 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.745 μs (1.742 μs .. 1.751 μs) std dev 13.55 ns (9.099 ns .. 21.89 ns) benchmarking old/file2key time 6.114 μs (6.102 μs .. 6.129 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 6.118 μs (6.106 μs .. 6.143 μs) std dev 55.00 ns (30.08 ns .. 100.2 ns) benchmarking new/file2key' time 1.791 μs (1.782 μs .. 1.801 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 1.792 μs (1.785 μs .. 1.804 μs) std dev 32.46 ns (20.59 ns .. 50.82 ns) variance introduced by outliers: 19% (moderately inflated)
2019-01-11 20:33:42 +00:00
serializeKey' :: Key -> S.ByteString
serializeKey' = S.fromShort . keySerialization
2017-02-24 17:42:30 +00:00
deserializeKey :: String -> Maybe Key
deserializeKey = deserializeKey' . encodeBS
convert key2file and file2key to use builder and attoparsec The new parser is significantly stricter than the old one: The old file2key allowed the fields to come in any order, but the new one requires the fixed order that git-annex has always used. Hopefully this will not cause any breakage. And the old file2key allowed eg SHA1-m1-m2-m3-m4-m5-m6--xxxx while the new does not allow duplication of fields. This could potentially improve security, because allowing lots of extra junk like that in a key could potentially be used in a SHA1 collision attack, although the current attacks need binary data and not this kind of structured numeric data. Speed improved of course, and fairly substantially, in microbenchmarks: benchmarking old/key2file time 2.264 μs (2.257 μs .. 2.273 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 2.265 μs (2.260 μs .. 2.275 μs) std dev 21.17 ns (13.06 ns .. 39.26 ns) benchmarking new/key2file' time 1.744 μs (1.741 μs .. 1.747 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.745 μs (1.742 μs .. 1.751 μs) std dev 13.55 ns (9.099 ns .. 21.89 ns) benchmarking old/file2key time 6.114 μs (6.102 μs .. 6.129 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 6.118 μs (6.106 μs .. 6.143 μs) std dev 55.00 ns (30.08 ns .. 100.2 ns) benchmarking new/file2key' time 1.791 μs (1.782 μs .. 1.801 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 1.792 μs (1.785 μs .. 1.804 μs) std dev 32.46 ns (20.59 ns .. 50.82 ns) variance introduced by outliers: 19% (moderately inflated)
2019-01-11 20:33:42 +00:00
deserializeKey' :: S.ByteString -> Maybe Key
deserializeKey' = eitherToMaybe . A.parseOnly keyParser
convert key2file and file2key to use builder and attoparsec The new parser is significantly stricter than the old one: The old file2key allowed the fields to come in any order, but the new one requires the fixed order that git-annex has always used. Hopefully this will not cause any breakage. And the old file2key allowed eg SHA1-m1-m2-m3-m4-m5-m6--xxxx while the new does not allow duplication of fields. This could potentially improve security, because allowing lots of extra junk like that in a key could potentially be used in a SHA1 collision attack, although the current attacks need binary data and not this kind of structured numeric data. Speed improved of course, and fairly substantially, in microbenchmarks: benchmarking old/key2file time 2.264 μs (2.257 μs .. 2.273 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 2.265 μs (2.260 μs .. 2.275 μs) std dev 21.17 ns (13.06 ns .. 39.26 ns) benchmarking new/key2file' time 1.744 μs (1.741 μs .. 1.747 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.745 μs (1.742 μs .. 1.751 μs) std dev 13.55 ns (9.099 ns .. 21.89 ns) benchmarking old/file2key time 6.114 μs (6.102 μs .. 6.129 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 6.118 μs (6.106 μs .. 6.143 μs) std dev 55.00 ns (30.08 ns .. 100.2 ns) benchmarking new/file2key' time 1.791 μs (1.782 μs .. 1.801 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 1.792 μs (1.785 μs .. 1.804 μs) std dev 32.46 ns (20.59 ns .. 50.82 ns) variance introduced by outliers: 19% (moderately inflated)
2019-01-11 20:33:42 +00:00
instance Arbitrary KeyData where
2017-02-24 17:42:30 +00:00
arbitrary = Key
<$> (S.toShort . encodeBS <$> (listOf1 $ elements $ ['A'..'Z'] ++ ['a'..'z'] ++ ['0'..'9'] ++ "-_\r\n \t"))
convert key2file and file2key to use builder and attoparsec The new parser is significantly stricter than the old one: The old file2key allowed the fields to come in any order, but the new one requires the fixed order that git-annex has always used. Hopefully this will not cause any breakage. And the old file2key allowed eg SHA1-m1-m2-m3-m4-m5-m6--xxxx while the new does not allow duplication of fields. This could potentially improve security, because allowing lots of extra junk like that in a key could potentially be used in a SHA1 collision attack, although the current attacks need binary data and not this kind of structured numeric data. Speed improved of course, and fairly substantially, in microbenchmarks: benchmarking old/key2file time 2.264 μs (2.257 μs .. 2.273 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 2.265 μs (2.260 μs .. 2.275 μs) std dev 21.17 ns (13.06 ns .. 39.26 ns) benchmarking new/key2file' time 1.744 μs (1.741 μs .. 1.747 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.745 μs (1.742 μs .. 1.751 μs) std dev 13.55 ns (9.099 ns .. 21.89 ns) benchmarking old/file2key time 6.114 μs (6.102 μs .. 6.129 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 6.118 μs (6.106 μs .. 6.143 μs) std dev 55.00 ns (30.08 ns .. 100.2 ns) benchmarking new/file2key' time 1.791 μs (1.782 μs .. 1.801 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 1.792 μs (1.785 μs .. 1.804 μs) std dev 32.46 ns (20.59 ns .. 50.82 ns) variance introduced by outliers: 19% (moderately inflated)
2019-01-11 20:33:42 +00:00
<*> (parseKeyVariety . encodeBS <$> (listOf1 $ elements ['A'..'Z'])) -- BACKEND
2017-02-24 17:42:30 +00:00
<*> ((abs <$>) <$> arbitrary) -- size cannot be negative
<*> ((abs . fromInteger <$>) <$> arbitrary) -- mtime cannot be negative
2017-02-24 17:42:30 +00:00
<*> ((abs <$>) <$> arbitrary) -- chunksize cannot be negative
<*> ((succ . abs <$>) <$> arbitrary) -- chunknum cannot be 0 or negative
instance Arbitrary AssociatedFile where
arbitrary = AssociatedFile
. fmap (toRawFilePath . fromTestableFilePath)
<$> arbitrary
instance Arbitrary Key where
arbitrary = mkKey . const <$> arbitrary
2017-02-24 17:42:30 +00:00
instance Hashable Key where
2019-01-14 17:17:47 +00:00
hashIO32 = hashIO32 . serializeKey'
hashIO64 = hashIO64 . serializeKey'
2017-02-24 17:42:30 +00:00
instance ToJSON' Key where
toJSON' = toJSON' . serializeKey
2017-02-24 17:42:30 +00:00
instance FromJSON Key where
parseJSON (String t) = maybe mempty pure $ deserializeKey $ T.unpack t
2017-02-24 17:42:30 +00:00
parseJSON _ = mempty
instance Proto.Serializable Key where
serialize = serializeKey
deserialize = deserializeKey
2017-02-24 17:42:30 +00:00
prop_isomorphic_key_encode :: Key -> Bool
prop_isomorphic_key_encode k = Just k == (deserializeKey . serializeKey) k