2017-02-24 17:42:30 +00:00
|
|
|
{- git-annex Keys
|
|
|
|
-
|
2020-02-02 20:01:46 +00:00
|
|
|
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
|
2017-02-24 17:42:30 +00:00
|
|
|
-
|
2019-03-13 19:48:14 +00:00
|
|
|
- Licensed under the GNU AGPL version 3 or higher.
|
2017-02-24 17:42:30 +00:00
|
|
|
-}
|
|
|
|
|
|
|
|
{-# OPTIONS_GHC -fno-warn-orphans #-}
|
|
|
|
|
|
|
|
module Key (
|
2019-11-22 20:24:04 +00:00
|
|
|
Key,
|
|
|
|
KeyData(..),
|
2017-03-10 17:12:24 +00:00
|
|
|
AssociatedFile(..),
|
2019-11-22 20:24:04 +00:00
|
|
|
fromKey,
|
|
|
|
mkKey,
|
|
|
|
alterKey,
|
2019-01-14 17:03:35 +00:00
|
|
|
keyParser,
|
|
|
|
serializeKey,
|
2019-01-14 17:17:47 +00:00
|
|
|
serializeKey',
|
|
|
|
deserializeKey,
|
2019-01-14 17:03:35 +00:00
|
|
|
deserializeKey',
|
2017-02-24 17:42:30 +00:00
|
|
|
nonChunkKey,
|
|
|
|
chunkKeyOffset,
|
|
|
|
isChunkKey,
|
|
|
|
isKeyPrefix,
|
2019-01-11 20:33:42 +00:00
|
|
|
splitKeyNameExtension,
|
2017-02-24 17:42:30 +00:00
|
|
|
|
2019-01-14 17:03:35 +00:00
|
|
|
prop_isomorphic_key_encode
|
2017-02-24 17:42:30 +00:00
|
|
|
) where
|
|
|
|
|
2020-02-02 20:01:46 +00:00
|
|
|
import Data.Char
|
2017-02-24 17:42:30 +00:00
|
|
|
import qualified Data.Text as T
|
2019-01-11 20:33:42 +00:00
|
|
|
import qualified Data.ByteString as S
|
|
|
|
import qualified Data.Attoparsec.ByteString as A
|
2017-02-24 17:42:30 +00:00
|
|
|
|
|
|
|
import Common
|
|
|
|
import Types.Key
|
|
|
|
import Utility.QuickCheck
|
|
|
|
import Utility.Bloom
|
Fix mangling of --json output of utf-8 characters when not running in a utf-8 locale
As long as all code imports Utility.Aeson rather than Data.Aeson,
and no Strings that may contain utf-8 characters are used for eg, object
keys via T.pack, this is guaranteed to fix the problem everywhere that
git-annex generates json.
It's kind of annoying to need to wrap ToJSON with a ToJSON', especially
since every data type that has a ToJSON instance has to be ported over.
However, that only took 50 lines of code, which is worth it to ensure full
coverage. I initially tried an alternative approach of a newtype FileEncoded,
which had to be used everywhere a String was fed into aeson, and chasing
down all the sites would have been far too hard. Did consider creating an
intentionally overlapping instance ToJSON String, and letting ghc fail
to build anything that passed in a String, but am not sure that wouldn't
pollute some library that git-annex depends on that happens to use ToJSON
String internally.
This commit was supported by the NSF-funded DataLad project.
2018-04-16 19:42:45 +00:00
|
|
|
import Utility.Aeson
|
2017-02-24 17:42:30 +00:00
|
|
|
import qualified Utility.SimpleProtocol as Proto
|
|
|
|
|
|
|
|
-- Gets the parent of a chunk key: the same key with both chunk fields
-- cleared. A key with neither chunk field set is returned unchanged.
nonChunkKey :: Key -> Key
nonChunkKey k = case (fromKey keyChunkSize k, fromKey keyChunkNum k) of
    (Nothing, Nothing) -> k
    _ -> alterKey k dropChunkInfo
  where
    -- Clears both chunk-related fields of the key data.
    dropChunkInfo d = d
        { keyChunkSize = Nothing
        , keyChunkNum = Nothing
        }
|
2017-02-24 17:42:30 +00:00
|
|
|
|
|
|
|
-- Where a chunk key is offset within its parent: the chunk size
-- multiplied by the predecessor of the chunk number. Nothing when
-- either the chunk size or the chunk number is not set.
chunkKeyOffset :: Key -> Maybe Integer
chunkKeyOffset k = do
    chunksize <- fromKey keyChunkSize k
    chunknum <- fromKey keyChunkNum k
    return (chunksize * pred chunknum)
|
2017-02-24 17:42:30 +00:00
|
|
|
|
|
|
|
-- True only when the key has both a chunk size and a chunk number.
isChunkKey :: Key -> Bool
isChunkKey k = case (fromKey keyChunkSize k, fromKey keyChunkNum k) of
    (Just _, Just _) -> True
    _ -> False
|
2019-01-11 20:33:42 +00:00
|
|
|
|
2019-01-14 17:03:35 +00:00
|
|
|
-- String form of a key: the ByteString serialization passed through
-- decodeBS'.
serializeKey :: Key -> String
serializeKey k = decodeBS' (serializeKey' k)
|
2019-01-11 20:33:42 +00:00
|
|
|
|
2019-11-22 20:24:04 +00:00
|
|
|
-- ByteString form of a key, as given by keySerialization.
serializeKey' :: Key -> S.ByteString
serializeKey' k = keySerialization k
|
2017-02-24 17:42:30 +00:00
|
|
|
|
2019-01-14 17:03:35 +00:00
|
|
|
-- Parses a key from a String by converting it with encodeBS' and
-- handing the result to deserializeKey'.
deserializeKey :: String -> Maybe Key
deserializeKey s = deserializeKey' (encodeBS' s)
|
2019-01-11 20:33:42 +00:00
|
|
|
|
2019-01-14 17:03:35 +00:00
|
|
|
-- Runs keyParser over the whole ByteString; a parse failure yields
-- Nothing.
deserializeKey' :: S.ByteString -> Maybe Key
deserializeKey' b = eitherToMaybe (A.parseOnly keyParser b)
|
2019-01-11 20:33:42 +00:00
|
|
|
|
2019-11-22 20:24:04 +00:00
|
|
|
-- Generator for key data. Each field has its own generator, and they
-- are combined applicatively in the order of the Key constructor's
-- fields.
instance Arbitrary KeyData where
    arbitrary = Key
        <$> genName
        <*> genVariety
        <*> genSize
        <*> genMtime
        <*> genChunkSize
        <*> genChunkNum
      where
        -- Non-empty name built from letters, digits, '-', '_' and some
        -- whitespace characters.
        genName = encodeBS <$> listOf1 (elements nameChars)
        nameChars = ['A'..'Z'] ++ ['a'..'z'] ++ ['0'..'9'] ++ "-_\r\n \t"
        -- BACKEND: a non-empty run of upper-case letters.
        genVariety = parseKeyVariety . encodeBS <$> listOf1 (elements ['A'..'Z'])
        -- size cannot be negative
        genSize = fmap abs <$> arbitrary
        -- mtime cannot be negative
        genMtime = fmap (abs . fromInteger) <$> arbitrary
        -- chunksize cannot be negative
        genChunkSize = fmap abs <$> arbitrary
        -- chunknum cannot be 0 or negative
        genChunkNum = fmap (succ . abs) <$> arbitrary
|
|
|
|
|
fix Arbitrary AssociatedFile
Empty filenames were already filtered out as not allowed. But before
the change to ByteString, a NUL could appear in an Arbitrary String,
and so Arbitrary AssociatedFile sometimes generated illegal filenames,
as NUL never appears in a filename. The change to ByteString meant the
String was run through toRawFilePath, which assumes a filename never
contains a NUL. That truncated the String at the NUL, which could
result in an AssociatedFile being generated with an empty filename.
The filtering of NUL added here is not really necessary, because
of the truncation, but it makes explicit that NUL is not allowed.
The real fix is that the suchThat now applies to the final
AssociatedFile, so will catch any empty ones however generated.
This raises the more general question of whether toRawFilePath might
truncate other strings that later get used as filenames. I think new
bugs probably won't be introduced by that. Before, a FilePath that got
read from somewhere (eg an attacker) and contained a NUL would perhaps
be printed out by git-annex, including the NUL, or written to disk
inside a file, or what have you. But as soon as that FilePath gets
passed to any IO action that treats it as a filename, it gets truncated
after the NUL. Eg, writeFile "foo\NULbar" "bar" writes to file "foo".
Now toRawFilePath will make the truncation happen earlier, but at most
this will affect what gets printed out or is written to disk inside a
file; actually using the RawFilePath as a filename will not change from
using the FilePath as a filename.
2019-12-06 16:44:18 +00:00
|
|
|
-- AssociatedFile cannot be empty, and cannot contain a NUL
-- (but can be Nothing).
instance Arbitrary AssociatedFile where
    arbitrary = candidate `suchThat` hasName `suchThat` nulFree
      where
        candidate = AssociatedFile . fmap asciiPath <$> arbitrary
        -- Rejects a present-but-empty filename.
        hasName af = af /= AssociatedFile (Just S.empty)
        -- Rejects a filename containing a zero byte; Nothing passes.
        nulFree (AssociatedFile mf) = maybe True (S.notElem 0) mf
        -- Generating arbitrary unicode leads to encoding errors
        -- when LANG=C, so limit to ascii.
        asciiPath = toRawFilePath . filter isAscii
|
2019-11-26 19:27:22 +00:00
|
|
|
|
2019-11-22 20:24:04 +00:00
|
|
|
-- Builds a Key from arbitrary key data: mkKey is given a function that
-- ignores its argument and returns the generated data.
instance Arbitrary Key where
    arbitrary = fmap (\kd -> mkKey (const kd)) arbitrary
|
|
|
|
|
2017-02-24 17:42:30 +00:00
|
|
|
-- A key is hashed by hashing its ByteString serialization.
instance Hashable Key where
    hashIO32 k = hashIO32 (serializeKey' k)
    hashIO64 k = hashIO64 (serializeKey' k)
|
2017-02-24 17:42:30 +00:00
|
|
|
|
Fix mangling of --json output of utf-8 characters when not running in a utf-8 locale
As long as all code imports Utility.Aeson rather than Data.Aeson,
and no Strings that may contain utf-8 characters are used for eg, object
keys via T.pack, this is guaranteed to fix the problem everywhere that
git-annex generates json.
It's kind of annoying to need to wrap ToJSON with a ToJSON', especially
since every data type that has a ToJSON instance has to be ported over.
However, that only took 50 lines of code, which is worth it to ensure full
coverage. I initially tried an alternative approach of a newtype FileEncoded,
which had to be used everywhere a String was fed into aeson, and chasing
down all the sites would have been far too hard. Did consider creating an
intentionally overlapping instance ToJSON String, and letting ghc fail
to build anything that passed in a String, but am not sure that wouldn't
pollute some library that git-annex depends on that happens to use ToJSON
String internally.
This commit was supported by the NSF-funded DataLad project.
2018-04-16 19:42:45 +00:00
|
|
|
-- A key is rendered to JSON via its String serialization.
instance ToJSON' Key where
    toJSON' k = toJSON' (serializeKey k)
|
2017-02-24 17:42:30 +00:00
|
|
|
|
|
|
|
-- Only a JSON string that deserializes to a key is accepted; any other
-- value, or a string that does not deserialize, yields mempty.
instance FromJSON Key where
    parseJSON v = case v of
        String t -> case deserializeKey (T.unpack t) of
            Just k -> pure k
            Nothing -> mempty
        _ -> mempty
|
|
|
|
|
|
|
|
-- Protocol (de)serialization uses the String form of a key.
instance Proto.Serializable Key where
    serialize k = serializeKey k
    deserialize s = deserializeKey s
|
2017-02-24 17:42:30 +00:00
|
|
|
|
|
|
|
-- Property: serializing a key and deserializing the result gives back
-- the same key.
prop_isomorphic_key_encode :: Key -> Bool
prop_isomorphic_key_encode k = Just k == roundTrip
  where
    roundTrip = deserializeKey (serializeKey k)
|
2019-11-22 20:24:04 +00:00
|
|
|
|