diff --git a/Remote/Helper/Encryptable.hs b/Remote/Helper/Encryptable.hs index c1243a5188..2c1935ba93 100644 --- a/Remote/Helper/Encryptable.hs +++ b/Remote/Helper/Encryptable.hs @@ -20,13 +20,14 @@ module Remote.Helper.Encryptable ( ) where import qualified Data.Map as M +import qualified "dataenc" Codec.Binary.Base64 as B64 +import Data.Bits.Utils import Common.Annex import Types.Remote import Crypto import Types.Crypto import qualified Annex -import Utility.Base64 -- Used to ensure that encryption has been set up before trying to -- eg, store creds in the remote config that would need to use the @@ -137,9 +138,9 @@ cipherKey c = fmap make <$> remoteCipher c {- Stores an StorableCipher in a remote's configuration. -} storeCipher :: RemoteConfig -> StorableCipher -> RemoteConfig -storeCipher c (SharedCipher t) = M.insert "cipher" (toB64 t) c +storeCipher c (SharedCipher t) = M.insert "cipher" (toB64bs t) c storeCipher c (EncryptedCipher t _ ks) = - M.insert "cipher" (toB64 t) $ M.insert "cipherkeys" (showkeys ks) c + M.insert "cipher" (toB64bs t) $ M.insert "cipherkeys" (showkeys ks) c where showkeys (KeyIds l) = intercalate "," l @@ -149,11 +150,11 @@ extractCipher c = case (M.lookup "cipher" c, M.lookup "cipherkeys" c, M.lookup "encryption" c) of (Just t, Just ks, encryption) | maybe True (== "hybrid") encryption -> - Just $ EncryptedCipher (fromB64 t) Hybrid (readkeys ks) + Just $ EncryptedCipher (fromB64bs t) Hybrid (readkeys ks) (Just t, Just ks, Just "pubkey") -> - Just $ EncryptedCipher (fromB64 t) PubKey (readkeys ks) + Just $ EncryptedCipher (fromB64bs t) PubKey (readkeys ks) (Just t, Nothing, encryption) | maybe True (== "shared") encryption -> - Just $ SharedCipher (fromB64 t) + Just $ SharedCipher (fromB64bs t) _ -> Nothing where readkeys = KeyIds . split "," @@ -169,3 +170,14 @@ describeEncryption c = case extractCipher c of PubKey -> Nothing Hybrid -> Just "(hybrid mode)" ] + +{- Not using Utility.Base64 because these "Strings" are really + - bags of bytes and that would convert to unicode and not roung-trip + - cleanly. -} +toB64bs :: String -> String +toB64bs = B64.encode . s2w8 + +fromB64bs :: String -> String +fromB64bs s = fromMaybe bad $ w82s <$> B64.decode s + where + bad = error "bad base64 encoded data" diff --git a/Test.hs b/Test.hs index 290ff0b699..5286bc6f61 100644 --- a/Test.hs +++ b/Test.hs @@ -71,6 +71,7 @@ import qualified Utility.Hash import qualified Utility.Scheduled import qualified Utility.HumanTime import qualified Utility.ThreadScheduler +import qualified Utility.Base64 import qualified Command.Uninit import qualified CmdLine.GitAnnex as GitAnnex #ifndef mingw32_HOST_OS @@ -163,6 +164,7 @@ properties = localOption (QuickCheckTests 1000) $ testGroup "QuickCheck" , testProperty "prop_branchView_legal" Logs.View.prop_branchView_legal , testProperty "prop_view_roundtrips" Annex.View.prop_view_roundtrips , testProperty "prop_viewedFile_rountrips" Annex.View.ViewedFile.prop_viewedFile_roundtrips + , testProperty "prop_b64_roundtrips" Utility.Base64.prop_b64_roundtrips ] {- These tests set up the test environment, but also test some basic parts diff --git a/Types/MetaData.hs b/Types/MetaData.hs index 2a6b3b8646..1e8fb4aa2f 100644 --- a/Types/MetaData.hs +++ b/Types/MetaData.hs @@ -224,6 +224,7 @@ data ModMeta | DelMeta MetaField MetaValue | SetMeta MetaField MetaValue -- removes any existing values | MaybeSetMeta MetaField MetaValue -- when field has no existing value + deriving (Show) {- Applies a ModMeta, generating the new MetaData. - Note that the new MetaData does not include all the diff --git a/Utility/Base64.hs b/Utility/Base64.hs index 56637a1178..80cc122a17 100644 --- a/Utility/Base64.hs +++ b/Utility/Base64.hs @@ -1,24 +1,28 @@ -{- Simple Base64 access +{- Simple Base64 encoding of Strings - - Copyright 2011 Joey Hess - - License: BSD-2-clause -} -module Utility.Base64 (toB64, fromB64Maybe, fromB64) where +module Utility.Base64 (toB64, fromB64Maybe, fromB64, prop_b64_roundtrips) where -import "dataenc" Codec.Binary.Base64 -import Data.Bits.Utils +import qualified "dataenc" Codec.Binary.Base64 as B64 import Control.Applicative import Data.Maybe +import qualified Data.ByteString.Lazy as L +import Data.ByteString.Lazy.UTF8 (fromString, toString) -toB64 :: String -> String -toB64 = encode . s2w8 +toB64 :: String -> String +toB64 = B64.encode . L.unpack . fromString fromB64Maybe :: String -> Maybe String -fromB64Maybe s = w82s <$> decode s +fromB64Maybe s = toString . L.pack <$> B64.decode s fromB64 :: String -> String fromB64 = fromMaybe bad . fromB64Maybe where bad = error "bad base64 encoded data" + +prop_b64_roundtrips :: String -> Bool +prop_b64_roundtrips s = s == fromB64 (toB64 s) diff --git a/Utility/FileSystemEncoding.hs b/Utility/FileSystemEncoding.hs index 844e81e59c..139b74fe4c 100644 --- a/Utility/FileSystemEncoding.hs +++ b/Utility/FileSystemEncoding.hs @@ -14,6 +14,8 @@ module Utility.FileSystemEncoding ( decodeBS, decodeW8, encodeW8, + encodeW8NUL, + decodeW8NUL, truncateFilePath, ) where @@ -25,6 +27,7 @@ import System.IO.Unsafe import qualified Data.Hash.MD5 as MD5 import Data.Word import Data.Bits.Utils +import Data.List.Utils import qualified Data.ByteString.Lazy as L #ifdef mingw32_HOST_OS import qualified Data.ByteString.Lazy.UTF8 as L8 @@ -89,6 +92,9 @@ decodeBS = L8.toString - w82c produces a String, which may contain Chars that are invalid - unicode. From there, this is really a simple matter of applying the - file system encoding, only complicated by GHC's interface to doing so. + - + - Note that the encoding stops at any NUL in the input. FilePaths + - do not normally contain embedded NUL, but Haskell Strings may. -} {-# NOINLINE encodeW8 #-} encodeW8 :: [Word8] -> FilePath @@ -101,6 +107,17 @@ encodeW8 w8 = unsafePerformIO $ do decodeW8 :: FilePath -> [Word8] decodeW8 = s2w8 . _encodeFilePath +{- Like encodeW8 and decodeW8, but NULs are passed through unchanged. -} +encodeW8NUL :: [Word8] -> FilePath +encodeW8NUL = join nul . map encodeW8 . split (s2w8 nul) + where + nul = ['\NUL'] + +decodeW8NUL :: FilePath -> [Word8] +decodeW8NUL = join (s2w8 nul) . map decodeW8 . split nul + where + nul = ['\NUL'] + {- Truncates a FilePath to the given number of bytes (or less), - as represented on disk. - diff --git a/debian/changelog b/debian/changelog index 9fd6440f61..921c0ff4ab 100644 --- a/debian/changelog +++ b/debian/changelog @@ -22,6 +22,11 @@ git-annex (5.2015022) UNRELEASED; urgency=medium * When re-execing git-annex, use current program location, rather than ~/.config/git-annex/program, when possible. * Submodules are now supported by git-annex! + * metadata: Fix encoding problem that led to mojibake when storing + metadata strings that contained both unicode characters and a space + (or '!') character. + * Also potentially fixes encoding problem when embedding credentials + that contain unicode characters. -- Joey Hess Thu, 19 Feb 2015 14:16:03 -0400 diff --git a/doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn b/doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn index 8d7475163e..ebfef3b772 100644 --- a/doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn +++ b/doc/bugs/Unicode_characters_lost__47__converted_in_metadata.mdwn @@ -13,3 +13,5 @@ Unicode characters in metadata are pruned/converted/lost: ### What version of git-annex are you using? On what operating system? 5.20141125 Debian + +> [[fixed|done]]; test pass. --[[Joey]] diff --git a/doc/bugs/Unicode_characters_lost__47__converted_in_metadata/comment_1_799fd2dc62dc51a6a8fa8a6d15d7a1f9._comment b/doc/bugs/Unicode_characters_lost__47__converted_in_metadata/comment_1_799fd2dc62dc51a6a8fa8a6d15d7a1f9._comment new file mode 100644 index 0000000000..1ae2782899 --- /dev/null +++ b/doc/bugs/Unicode_characters_lost__47__converted_in_metadata/comment_1_799fd2dc62dc51a6a8fa8a6d15d7a1f9._comment @@ -0,0 +1,32 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2015-03-04T14:31:21Z" + content=""" +What I'm seeing is the unicode arrow is replaced with 0092 and the elipsis +with &. It's losing the other byte. + +The problem seems to be in the base64 encoding that's done, when the metadata +value contains spaces or a few other problem characters. These same +unicode characters roundtrip through without a problem when not embedded +in a string with spaces. + +
+*Utility.Base64> let s = "…"
+*Utility.Base64> (s, fromB64 $ toB64 s)
+("\8230","&")
+
+ +git-annex also uses base64 for encoding some creds +(and for tagged pushes over XMPP, but only the JID is encoded). + +The real culprit is the use of `w82s`, which doesn't handle multi-byte +characters. I can easily fix this by using `encodeW8` instead. +Audited git-annex for other problem w82s uses and don't see any, so will +only need to fix this once. + +Added a quickcheck test for fromB64 . toB64 roundtripping. + +Unfortunately, the entered unicode characters didn't get saved right, +so git-annex can do nothing to fix data that was already entered. +"""]]