2017-02-24 17:42:30 +00:00
|
|
|
{- git-annex Keys
|
|
|
|
-
|
2019-01-11 20:33:42 +00:00
|
|
|
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
|
2017-02-24 17:42:30 +00:00
|
|
|
-
|
|
|
|
- Licensed under the GNU GPL version 3 or higher.
|
|
|
|
-}
|
|
|
|
|
|
|
|
{-# OPTIONS_GHC -fno-warn-orphans #-}
|
|
|
|
|
|
|
|
module Key (
|
|
|
|
Key(..),
|
2017-03-10 17:12:24 +00:00
|
|
|
AssociatedFile(..),
|
2017-02-24 17:42:30 +00:00
|
|
|
stubKey,
|
2019-01-14 17:03:35 +00:00
|
|
|
buildKey,
|
|
|
|
keyParser,
|
|
|
|
serializeKey,
|
2019-01-14 17:17:47 +00:00
|
|
|
serializeKey',
|
|
|
|
deserializeKey,
|
2019-01-14 17:03:35 +00:00
|
|
|
deserializeKey',
|
2017-02-24 17:42:30 +00:00
|
|
|
nonChunkKey,
|
|
|
|
chunkKeyOffset,
|
|
|
|
isChunkKey,
|
|
|
|
isKeyPrefix,
|
2019-01-11 20:33:42 +00:00
|
|
|
splitKeyNameExtension,
|
2017-02-24 17:42:30 +00:00
|
|
|
|
2019-01-14 17:03:35 +00:00
|
|
|
prop_isomorphic_key_encode
|
2017-02-24 17:42:30 +00:00
|
|
|
) where
|
|
|
|
|
|
|
|
import qualified Data.Text as T
|
2019-01-11 20:33:42 +00:00
|
|
|
import qualified Data.ByteString as S
|
|
|
|
import qualified Data.ByteString.Char8 as S8
|
|
|
|
import qualified Data.ByteString.Lazy as L
|
|
|
|
import Data.ByteString.Builder
|
|
|
|
import Data.ByteString.Builder.Extra
|
|
|
|
import qualified Data.Attoparsec.ByteString as A
|
|
|
|
import qualified Data.Attoparsec.ByteString.Char8 as A8
|
|
|
|
import Foreign.C.Types
|
2017-02-24 17:42:30 +00:00
|
|
|
|
|
|
|
import Common
|
|
|
|
import Types.Key
|
|
|
|
import Utility.QuickCheck
|
|
|
|
import Utility.Bloom
|
Fix mangling of --json output of utf-8 characters when not running in a utf-8 locale
As long as all code imports Utility.Aeson rather than Data.Aeson,
and no Strings that may contain utf-8 characters are used for eg, object
keys via T.pack, this is guaranteed to fix the problem everywhere that
git-annex generates json.
It's kind of annoying to need to wrap ToJSON with a ToJSON', especially
since every data type that has a ToJSON instance has to be ported over.
However, that only took 50 lines of code, which is worth it to ensure full
coverage. I initially tried an alternative approach of a newtype FileEncoded,
which had to be used everywhere a String was fed into aeson, and chasing
down all the sites would have been far too hard. Did consider creating an
intentionally overlapping instance ToJSON String, and letting ghc fail
to build anything that passed in a String, but am not sure that wouldn't
pollute some library that git-annex depends on that happens to use ToJSON
String internally.
This commit was supported by the NSF-funded DataLad project.
2018-04-16 19:42:45 +00:00
|
|
|
import Utility.Aeson
|
2017-02-24 17:42:30 +00:00
|
|
|
import qualified Utility.SimpleProtocol as Proto
|
|
|
|
|
|
|
|
stubKey :: Key
|
|
|
|
stubKey = Key
|
2019-01-11 20:33:42 +00:00
|
|
|
{ keyName = mempty
|
|
|
|
, keyVariety = OtherKey mempty
|
2017-02-24 17:42:30 +00:00
|
|
|
, keySize = Nothing
|
|
|
|
, keyMtime = Nothing
|
|
|
|
, keyChunkSize = Nothing
|
|
|
|
, keyChunkNum = Nothing
|
|
|
|
}
|
|
|
|
|
|
|
|
-- Gets the parent of a chunk key.
|
|
|
|
nonChunkKey :: Key -> Key
|
2019-01-16 20:09:53 +00:00
|
|
|
nonChunkKey k = k
|
|
|
|
{ keyChunkSize = Nothing
|
|
|
|
, keyChunkNum = Nothing
|
|
|
|
}
|
2017-02-24 17:42:30 +00:00
|
|
|
|
|
|
|
-- Where a chunk key is offset within its parent.
|
|
|
|
chunkKeyOffset :: Key -> Maybe Integer
|
|
|
|
chunkKeyOffset k = (*)
|
|
|
|
<$> keyChunkSize k
|
|
|
|
<*> (pred <$> keyChunkNum k)
|
|
|
|
|
|
|
|
isChunkKey :: Key -> Bool
|
|
|
|
isChunkKey k = isJust (keyChunkSize k) && isJust (keyChunkNum k)
|
|
|
|
|
|
|
|
-- Checks if a string looks like at least the start of a key.
|
|
|
|
isKeyPrefix :: String -> Bool
|
|
|
|
isKeyPrefix s = [fieldSep, fieldSep] `isInfixOf` s
|
|
|
|
|
|
|
|
fieldSep :: Char
|
|
|
|
fieldSep = '-'
|
|
|
|
|
2019-01-14 17:03:35 +00:00
|
|
|
{- Builds a ByteString from a Key.
|
|
|
|
-
|
|
|
|
- The name field is always shown last, separated by doubled fieldSeps,
|
|
|
|
- and is the only field allowed to contain the fieldSep.
|
|
|
|
-}
|
|
|
|
buildKey :: Key -> Builder
|
|
|
|
buildKey k = byteString (formatKeyVariety (keyVariety k))
|
2019-01-11 20:33:42 +00:00
|
|
|
<> 's' ?: (integerDec <$> keySize k)
|
|
|
|
<> 'm' ?: (integerDec . (\(CTime t) -> fromIntegral t) <$> keyMtime k)
|
|
|
|
<> 'S' ?: (integerDec <$> keyChunkSize k)
|
|
|
|
<> 'C' ?: (integerDec <$> keyChunkNum k)
|
|
|
|
<> sepbefore (sepbefore (byteString (keyName k)))
|
|
|
|
where
|
|
|
|
sepbefore s = char7 fieldSep <> s
|
|
|
|
c ?: (Just b) = sepbefore (char7 c <> b)
|
|
|
|
_ ?: Nothing = mempty
|
|
|
|
|
2019-01-14 17:03:35 +00:00
|
|
|
serializeKey :: Key -> String
|
2019-01-16 20:09:53 +00:00
|
|
|
serializeKey = decodeBL' . serializeKey'
|
2019-01-11 20:33:42 +00:00
|
|
|
|
2019-01-16 20:09:53 +00:00
|
|
|
serializeKey' :: Key -> L.ByteString
|
|
|
|
serializeKey' = toLazyByteStringWith (safeStrategy 128 smallChunkSize) L.empty . buildKey
|
2019-01-11 20:33:42 +00:00
|
|
|
|
|
|
|
{- This is a strict parser for security reasons; a key
|
|
|
|
- can contain only 4 fields, which all consist only of numbers.
|
|
|
|
- Any key containing other fields, or non-numeric data will fail
|
|
|
|
- to parse.
|
|
|
|
-
|
|
|
|
- If a key contained non-numeric fields, they could be used to
|
|
|
|
- embed data used in a SHA1 collision attack, which would be a
|
|
|
|
- problem since the keys are committed to git.
|
|
|
|
-}
|
2019-01-14 17:03:35 +00:00
|
|
|
keyParser :: A.Parser Key
|
|
|
|
keyParser = do
|
2019-01-11 20:33:42 +00:00
|
|
|
-- key variety cannot be empty
|
|
|
|
v <- (parseKeyVariety <$> A8.takeWhile1 (/= fieldSep))
|
|
|
|
s <- parsesize
|
|
|
|
m <- parsemtime
|
|
|
|
cs <- parsechunksize
|
|
|
|
cn <- parsechunknum
|
|
|
|
_ <- A8.char fieldSep
|
|
|
|
_ <- A8.char fieldSep
|
|
|
|
n <- A.takeByteString
|
|
|
|
if validKeyName v n
|
|
|
|
then return $ Key
|
|
|
|
{ keyName = n
|
|
|
|
, keyVariety = v
|
|
|
|
, keySize = s
|
|
|
|
, keyMtime = m
|
|
|
|
, keyChunkSize = cs
|
|
|
|
, keyChunkNum = cn
|
|
|
|
}
|
|
|
|
else fail "invalid keyName"
|
2017-02-24 17:42:30 +00:00
|
|
|
where
|
2019-01-11 20:33:42 +00:00
|
|
|
parseopt p = (Just <$> (A8.char fieldSep *> p)) <|> pure Nothing
|
|
|
|
parsesize = parseopt $ A8.char 's' *> A8.decimal
|
|
|
|
parsemtime = parseopt $ CTime <$> (A8.char 'm' *> A8.decimal)
|
|
|
|
parsechunksize = parseopt $ A8.char 'S' *> A8.decimal
|
|
|
|
parsechunknum = parseopt $ A8.char 'C' *> A8.decimal
|
2017-02-24 17:42:30 +00:00
|
|
|
|
2019-01-14 17:03:35 +00:00
|
|
|
deserializeKey :: String -> Maybe Key
|
|
|
|
deserializeKey = deserializeKey' . encodeBS'
|
2019-01-11 20:33:42 +00:00
|
|
|
|
2019-01-14 17:03:35 +00:00
|
|
|
deserializeKey' :: S.ByteString -> Maybe Key
|
2019-01-16 20:09:53 +00:00
|
|
|
deserializeKey' b = eitherToMaybe $ A.parseOnly keyParser b
|
2019-01-11 20:33:42 +00:00
|
|
|
|
|
|
|
{- This splits any extension out of the keyName, returning the
|
|
|
|
- keyName minus extension, and the extension (including leading dot).
|
|
|
|
-}
|
|
|
|
splitKeyNameExtension :: Key -> (S.ByteString, S.ByteString)
|
|
|
|
splitKeyNameExtension = splitKeyNameExtension' . keyName
|
|
|
|
|
|
|
|
splitKeyNameExtension' :: S.ByteString -> (S.ByteString, S.ByteString)
|
|
|
|
splitKeyNameExtension' keyname = S8.span (/= '.') keyname
|
|
|
|
|
|
|
|
{- Limits the length of the extension in the keyName to mitigate against
|
|
|
|
- SHA1 collision attacks.
|
2017-02-24 17:42:30 +00:00
|
|
|
-
|
|
|
|
- In such an attack, the extension of the key could be made to contain
|
|
|
|
- the collision generation data, with the result that a signed git commit
|
2017-02-24 23:54:36 +00:00
|
|
|
- including such keys would not be secure.
|
2017-02-24 17:42:30 +00:00
|
|
|
-
|
|
|
|
- The maximum extension length ever generated for such a key was 8
|
2019-01-14 23:03:25 +00:00
|
|
|
- characters, but they may be unicode which could use up to 4 bytes each,
|
|
|
|
- so 32 bytes. 64 bytes is used here to give a little future wiggle-room.
|
2017-02-24 19:16:56 +00:00
|
|
|
- The SHA1 common-prefix attack needs 128 bytes of data.
|
2017-02-24 17:42:30 +00:00
|
|
|
-}
|
2019-01-11 20:33:42 +00:00
|
|
|
validKeyName :: KeyVariety -> S.ByteString -> Bool
|
|
|
|
validKeyName kv name
|
|
|
|
| hasExt kv =
|
|
|
|
let ext = snd $ splitKeyNameExtension' name
|
2019-01-14 23:03:25 +00:00
|
|
|
in S.length ext <= 64
|
2017-02-24 17:42:30 +00:00
|
|
|
| otherwise = True
|
|
|
|
|
|
|
|
instance Arbitrary Key where
|
|
|
|
arbitrary = Key
|
2019-01-11 20:33:42 +00:00
|
|
|
<$> (encodeBS <$> (listOf1 $ elements $ ['A'..'Z'] ++ ['a'..'z'] ++ ['0'..'9'] ++ "-_\r\n \t"))
|
|
|
|
<*> (parseKeyVariety . encodeBS <$> (listOf1 $ elements ['A'..'Z'])) -- BACKEND
|
2017-02-24 17:42:30 +00:00
|
|
|
<*> ((abs <$>) <$> arbitrary) -- size cannot be negative
|
2017-06-17 17:04:48 +00:00
|
|
|
<*> ((abs . fromInteger <$>) <$> arbitrary) -- mtime cannot be negative
|
2017-02-24 17:42:30 +00:00
|
|
|
<*> ((abs <$>) <$> arbitrary) -- chunksize cannot be negative
|
|
|
|
<*> ((succ . abs <$>) <$> arbitrary) -- chunknum cannot be 0 or negative
|
|
|
|
|
|
|
|
instance Hashable Key where
|
2019-01-14 17:17:47 +00:00
|
|
|
hashIO32 = hashIO32 . serializeKey'
|
|
|
|
hashIO64 = hashIO64 . serializeKey'
|
2017-02-24 17:42:30 +00:00
|
|
|
|
Fix mangling of --json output of utf-8 characters when not running in a utf-8 locale
As long as all code imports Utility.Aeson rather than Data.Aeson,
and no Strings that may contain utf-8 characters are used for eg, object
keys via T.pack, this is guaranteed to fix the problem everywhere that
git-annex generates json.
It's kind of annoying to need to wrap ToJSON with a ToJSON', especially
since every data type that has a ToJSON instance has to be ported over.
However, that only took 50 lines of code, which is worth it to ensure full
coverage. I initially tried an alternative approach of a newtype FileEncoded,
which had to be used everywhere a String was fed into aeson, and chasing
down all the sites would have been far too hard. Did consider creating an
intentionally overlapping instance ToJSON String, and letting ghc fail
to build anything that passed in a String, but am not sure that wouldn't
pollute some library that git-annex depends on that happens to use ToJSON
String internally.
This commit was supported by the NSF-funded DataLad project.
2018-04-16 19:42:45 +00:00
|
|
|
instance ToJSON' Key where
|
2019-01-14 17:03:35 +00:00
|
|
|
toJSON' = toJSON' . serializeKey
|
2017-02-24 17:42:30 +00:00
|
|
|
|
|
|
|
instance FromJSON Key where
|
2019-01-14 17:03:35 +00:00
|
|
|
parseJSON (String t) = maybe mempty pure $ deserializeKey $ T.unpack t
|
2017-02-24 17:42:30 +00:00
|
|
|
parseJSON _ = mempty
|
|
|
|
|
|
|
|
instance Proto.Serializable Key where
|
2019-01-14 17:03:35 +00:00
|
|
|
serialize = serializeKey
|
|
|
|
deserialize = deserializeKey
|
2017-02-24 17:42:30 +00:00
|
|
|
|
|
|
|
prop_isomorphic_key_encode :: Key -> Bool
|
2019-01-14 17:03:35 +00:00
|
|
|
prop_isomorphic_key_encode k = Just k == (deserializeKey . serializeKey) k
|