From 1052eefacef44df67b151c9e4566b41cee11238d Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 19 Mar 2014 14:49:01 -0400 Subject: [PATCH] Windows: Fix some filename encoding bugs. http://git-annex.branchable.com/bugs/Unicode_file_names_ignored_on_Windows/ Not a complete fix yet. --- Annex/CatFile.hs | 2 +- Command/Unused.hs | 3 +- Git/CatFile.hs | 2 +- Test.hs | 3 +- Utility/FileSystemEncoding.hs | 43 ++++++++++++++++++- debian/changelog | 1 + ...Unicode_file_names_ignored_on_Windows.mdwn | 4 ++ doc/todo/windows_support.mdwn | 36 ++++++++++++++++ 8 files changed, 86 insertions(+), 8 deletions(-) diff --git a/Annex/CatFile.hs b/Annex/CatFile.hs index fc722c8e71..6a778db039 100644 --- a/Annex/CatFile.hs +++ b/Annex/CatFile.hs @@ -80,7 +80,7 @@ catKey = catKey' True catKey' :: Bool -> Ref -> FileMode -> Annex (Maybe Key) catKey' modeguaranteed ref mode | isSymLink mode = do - l <- fromInternalGitPath . encodeW8 . L.unpack <$> get + l <- fromInternalGitPath . decodeBS <$> get return $ if isLinkToAnnex l then fileKey $ takeFileName l else Nothing diff --git a/Command/Unused.hs b/Command/Unused.hs index c174cd2566..3e844e5a8b 100644 --- a/Command/Unused.hs +++ b/Command/Unused.hs @@ -10,7 +10,6 @@ module Command.Unused where import qualified Data.Set as S -import qualified Data.ByteString.Lazy as L import Data.BloomFilter import Data.BloomFilter.Easy import Data.BloomFilter.Hash @@ -296,7 +295,7 @@ withKeysReferencedInGitRef a ref = do liftIO $ void clean where tKey True = fmap fst <$$> Backend.lookupFile . getTopFilePath . DiffTree.file - tKey False = fileKey . takeFileName . encodeW8 . L.unpack <$$> + tKey False = fileKey . takeFileName . decodeBS <$$> catFile ref . getTopFilePath . DiffTree.file {- Looks in the specified directory for bad/tmp keys, and returns a list diff --git a/Git/CatFile.hs b/Git/CatFile.hs index c7c51b8943..8e64fc5589 100644 --- a/Git/CatFile.hs +++ b/Git/CatFile.hs @@ -108,6 +108,6 @@ catTree h treeref = go <$> catObjectDetails h treeref dropsha = L.drop 21 parsemodefile b = - let (modestr, file) = separate (== ' ') (encodeW8 $ L.unpack b) + let (modestr, file) = separate (== ' ') (decodeBS b) in (file, readmode modestr) readmode = fst . fromMaybe (0, undefined) . headMaybe . readOct diff --git a/Test.hs b/Test.hs index 1c448b357b..209242fb3d 100644 --- a/Test.hs +++ b/Test.hs @@ -22,7 +22,6 @@ import qualified Data.Map as M import System.IO.HVFS (SystemFS(..)) import qualified Text.JSON import System.Path -import qualified Data.ByteString.Lazy as L import Common @@ -1272,7 +1271,7 @@ test_add_subdirs env = intmpclonerepo env $ do {- Regression test for Windows bug where symlinks were not - calculated correctly for files in subdirs. -} git_annex env "sync" [] @? "sync failed" - l <- annexeval $ encodeW8 . L.unpack <$> Annex.CatFile.catObject (Git.Types.Ref "HEAD:dir/foo") + l <- annexeval $ decodeBS <$> Annex.CatFile.catObject (Git.Types.Ref "HEAD:dir/foo") "../.git/annex/" `isPrefixOf` l @? ("symlink from subdir to .git/annex is wrong: " ++ l) createDirectory "dir2" diff --git a/Utility/FileSystemEncoding.hs b/Utility/FileSystemEncoding.hs index ac105e73d2..690942cbab 100644 --- a/Utility/FileSystemEncoding.hs +++ b/Utility/FileSystemEncoding.hs @@ -1,14 +1,17 @@ {- GHC File system encoding handling. - - - Copyright 2012-2013 Joey Hess + - Copyright 2012-2014 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} +{-# LANGUAGE CPP #-} + module Utility.FileSystemEncoding ( fileEncoding, withFilePath, md5FilePath, + decodeBS, decodeW8, encodeW8, truncateFilePath, @@ -22,13 +25,24 @@ import System.IO.Unsafe import qualified Data.Hash.MD5 as MD5 import Data.Word import Data.Bits.Utils +import qualified Data.ByteString.Lazy as L +#ifdef mingw32_HOST_OS +import qualified Data.ByteString.Lazy.UTF8 as L8 +#endif {- Sets a Handle to use the filesystem encoding. This causes data - written or read from it to be encoded/decoded the same - as ghc 7.4 does to filenames etc. This special encoding - - allows "arbitrary undecodable bytes to be round-tripped through it". -} + - allows "arbitrary undecodable bytes to be round-tripped through it". + -} fileEncoding :: Handle -> IO () +#ifndef mingw32_HOST_OS fileEncoding h = hSetEncoding h =<< Encoding.getFileSystemEncoding +#else +{- The file system encoding does not work well on Windows, + - and Windows only has utf FilePaths anyway. -} +fileEncoding h = hSetEncoding h Encoding.utf8 +#endif {- Marshal a Haskell FilePath into a NUL terminated C string using temporary - storage. The FilePath is encoded using the filesystem encoding, @@ -60,6 +74,16 @@ _encodeFilePath fp = unsafePerformIO $ do md5FilePath :: FilePath -> MD5.Str md5FilePath = MD5.Str . _encodeFilePath +{- Decodes a ByteString into a FilePath, applying the filesystem encoding. -} +decodeBS :: L.ByteString -> FilePath +#ifndef mingw32_HOST_OS +decodeBS = encodeW8 . L.unpack +#else +{- On Windows, we assume that the ByteString is utf-8, since Windows + - only uses unicode for filenames. -} +decodeBS = L8.toString +#endif + {- Converts a [Word8] to a FilePath, encoding using the filesystem encoding. - - w82c produces a String, which may contain Chars that are invalid @@ -84,6 +108,7 @@ decodeW8 = s2w8 . _encodeFilePath - cost of efficiency when running on a large FilePath. -} truncateFilePath :: Int -> FilePath -> FilePath +#ifndef mingw32_HOST_OS truncateFilePath n = go . reverse where go f = @@ -91,3 +116,17 @@ truncateFilePath n = go . reverse in if length bytes <= n then reverse f else go (drop 1 f) +#else +{- On Windows, count the number of bytes used by each utf8 character. -} +truncateFilePath n = reverse . go [] n . L8.fromString + where + go coll cnt bs + | cnt <= 0 = coll + | otherwise = case L8.decode bs of + Just (c, x) | c /= L8.replacement_char -> + let x' = fromIntegral x + in if cnt - x' < 0 + then coll + else go (c:coll) (cnt - x') (L8.drop 1 bs) + _ -> coll +#endif diff --git a/debian/changelog b/debian/changelog index 6ef4e5e84f..f082cbbfbb 100644 --- a/debian/changelog +++ b/debian/changelog @@ -30,6 +30,7 @@ git-annex (5.20140307) UNRELEASED; urgency=medium * Each for each metadata field, there's now an automatically maintained "$field-lastchanged" that gives the timestamp of the last change to that field. + * Windows: Fix some filename encoding bugs. -- Joey Hess Thu, 06 Mar 2014 16:17:01 -0400 diff --git a/doc/bugs/Unicode_file_names_ignored_on_Windows.mdwn b/doc/bugs/Unicode_file_names_ignored_on_Windows.mdwn index af3877dbe3..b58cf25712 100644 --- a/doc/bugs/Unicode_file_names_ignored_on_Windows.mdwn +++ b/doc/bugs/Unicode_file_names_ignored_on_Windows.mdwn @@ -35,3 +35,7 @@ According to https://github.com/msysgit/msysgit/wiki/Git-for-Windows-Unicode-Sup [2014-03-18 14:28:03 Central Europe Standard Time] read: git ["--git-dir=D:\\anntest\\.git","--work-tree=D:\\anntest","-c","core.bare=false","ls-files","--modified","-z","--","h\225\269ky.txt"] I can provide additional information, just tell me what you need. + +> [[fixed|done]], although this is not the end of encoding issues +> on Windows. Updating [[windows_support]] to discuss some other ones. +> --[[Joey]] diff --git a/doc/todo/windows_support.mdwn b/doc/todo/windows_support.mdwn index 17accd62e5..af78d517f4 100644 --- a/doc/todo/windows_support.mdwn +++ b/doc/todo/windows_support.mdwn @@ -29,6 +29,42 @@ now! --[[Joey]] * Deleting a git repository from inside the webapp fails "RemoveDirectory permision denied ... file is being used by another process" +## potential encoding problems + +[[bugs/Unicode_file_names_ignored_on_Windows]] is fixed, but some potential +problems remain, since the FileSystemEncoding that git-annex relies on +seems unreliable/broken on Windows. + +* When git-annex displays a filename that it's acting on, there + can be mojibake on Windows. For example, "háčky.txt" displays + the accented characters as instead the pairs of bytes making + up the utf-8. Tried doing various things to the stdout handle + to avoid this, but only ended up with encoding crashes, or worse + mojibake than this. + +* `md5FilePath` still uses the filesystem encoding, and so may produce the + wrong value on Windows. This would impact keys that contain problem characters + (probably coming from the filename extension), and might cause + interoperability problems when git-annex generates the hash directories of a + remote, for example a rsync remote. + +* `encodeW8` is used in Git.UnionMerge, and while I fixed the other calls to + encodeW8, which all involved ByteStrings reading from git and so can just + treat it as utf-8 on Windows (via `decodeBS`), in the union merge case, + the ByteString has no defined encoding. It may have been written on Unix + and contain keys with invalid unicode in them. On windows, the union + merge code should probably check if it's valid utf-8, and if not, + abort the merge. + +* If interoperating with a git-annex repository from a unix system, it's + possible for a key to contain some invalid utf-8, which means its filename + cannot even be represented on Windows, so who knows what will happen in that + case -- probably it will fail in some way when adding the object file + to the Windows repo. + +* If data from the git repo does not have a unicode encoding, it will be + mangled in various places on Windows, which can lead to undefined behavior. + ## minor problems * rsync special remotes with a rsyncurl of a local directory are known