Windows: Fix some filename encoding bugs.

http://git-annex.branchable.com/bugs/Unicode_file_names_ignored_on_Windows/

Not a complete fix yet.
This commit is contained in:
Joey Hess 2014-03-19 14:49:01 -04:00
parent 2f52f727c0
commit 1052eeface
8 changed files with 86 additions and 8 deletions

View file

@ -80,7 +80,7 @@ catKey = catKey' True
catKey' :: Bool -> Ref -> FileMode -> Annex (Maybe Key) catKey' :: Bool -> Ref -> FileMode -> Annex (Maybe Key)
catKey' modeguaranteed ref mode catKey' modeguaranteed ref mode
| isSymLink mode = do | isSymLink mode = do
l <- fromInternalGitPath . encodeW8 . L.unpack <$> get l <- fromInternalGitPath . decodeBS <$> get
return $ if isLinkToAnnex l return $ if isLinkToAnnex l
then fileKey $ takeFileName l then fileKey $ takeFileName l
else Nothing else Nothing

View file

@ -10,7 +10,6 @@
module Command.Unused where module Command.Unused where
import qualified Data.Set as S import qualified Data.Set as S
import qualified Data.ByteString.Lazy as L
import Data.BloomFilter import Data.BloomFilter
import Data.BloomFilter.Easy import Data.BloomFilter.Easy
import Data.BloomFilter.Hash import Data.BloomFilter.Hash
@ -296,7 +295,7 @@ withKeysReferencedInGitRef a ref = do
liftIO $ void clean liftIO $ void clean
where where
tKey True = fmap fst <$$> Backend.lookupFile . getTopFilePath . DiffTree.file tKey True = fmap fst <$$> Backend.lookupFile . getTopFilePath . DiffTree.file
tKey False = fileKey . takeFileName . encodeW8 . L.unpack <$$> tKey False = fileKey . takeFileName . decodeBS <$$>
catFile ref . getTopFilePath . DiffTree.file catFile ref . getTopFilePath . DiffTree.file
{- Looks in the specified directory for bad/tmp keys, and returns a list {- Looks in the specified directory for bad/tmp keys, and returns a list

View file

@ -108,6 +108,6 @@ catTree h treeref = go <$> catObjectDetails h treeref
dropsha = L.drop 21 dropsha = L.drop 21
parsemodefile b = parsemodefile b =
let (modestr, file) = separate (== ' ') (encodeW8 $ L.unpack b) let (modestr, file) = separate (== ' ') (decodeBS b)
in (file, readmode modestr) in (file, readmode modestr)
readmode = fst . fromMaybe (0, undefined) . headMaybe . readOct readmode = fst . fromMaybe (0, undefined) . headMaybe . readOct

View file

@ -22,7 +22,6 @@ import qualified Data.Map as M
import System.IO.HVFS (SystemFS(..)) import System.IO.HVFS (SystemFS(..))
import qualified Text.JSON import qualified Text.JSON
import System.Path import System.Path
import qualified Data.ByteString.Lazy as L
import Common import Common
@ -1272,7 +1271,7 @@ test_add_subdirs env = intmpclonerepo env $ do
{- Regression test for Windows bug where symlinks were not {- Regression test for Windows bug where symlinks were not
- calculated correctly for files in subdirs. -} - calculated correctly for files in subdirs. -}
git_annex env "sync" [] @? "sync failed" git_annex env "sync" [] @? "sync failed"
l <- annexeval $ encodeW8 . L.unpack <$> Annex.CatFile.catObject (Git.Types.Ref "HEAD:dir/foo") l <- annexeval $ decodeBS <$> Annex.CatFile.catObject (Git.Types.Ref "HEAD:dir/foo")
"../.git/annex/" `isPrefixOf` l @? ("symlink from subdir to .git/annex is wrong: " ++ l) "../.git/annex/" `isPrefixOf` l @? ("symlink from subdir to .git/annex is wrong: " ++ l)
createDirectory "dir2" createDirectory "dir2"

View file

@ -1,14 +1,17 @@
{- GHC File system encoding handling. {- GHC File system encoding handling.
- -
- Copyright 2012-2013 Joey Hess <joey@kitenet.net> - Copyright 2012-2014 Joey Hess <joey@kitenet.net>
- -
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
{-# LANGUAGE CPP #-}
module Utility.FileSystemEncoding ( module Utility.FileSystemEncoding (
fileEncoding, fileEncoding,
withFilePath, withFilePath,
md5FilePath, md5FilePath,
decodeBS,
decodeW8, decodeW8,
encodeW8, encodeW8,
truncateFilePath, truncateFilePath,
@ -22,13 +25,24 @@ import System.IO.Unsafe
import qualified Data.Hash.MD5 as MD5 import qualified Data.Hash.MD5 as MD5
import Data.Word import Data.Word
import Data.Bits.Utils import Data.Bits.Utils
import qualified Data.ByteString.Lazy as L
#ifdef mingw32_HOST_OS
import qualified Data.ByteString.Lazy.UTF8 as L8
#endif
{- Sets a Handle to use the filesystem encoding. This causes data {- Sets a Handle to use the filesystem encoding. This causes data
- written or read from it to be encoded/decoded the same - written or read from it to be encoded/decoded the same
- as ghc 7.4 does to filenames etc. This special encoding - as ghc 7.4 does to filenames etc. This special encoding
- allows "arbitrary undecodable bytes to be round-tripped through it". -} - allows "arbitrary undecodable bytes to be round-tripped through it".
-}
fileEncoding :: Handle -> IO () fileEncoding :: Handle -> IO ()
#ifndef mingw32_HOST_OS
fileEncoding h = hSetEncoding h =<< Encoding.getFileSystemEncoding fileEncoding h = hSetEncoding h =<< Encoding.getFileSystemEncoding
#else
{- The file system encoding does not work well on Windows,
- and Windows only has utf FilePaths anyway. -}
fileEncoding h = hSetEncoding h Encoding.utf8
#endif
{- Marshal a Haskell FilePath into a NUL terminated C string using temporary {- Marshal a Haskell FilePath into a NUL terminated C string using temporary
- storage. The FilePath is encoded using the filesystem encoding, - storage. The FilePath is encoded using the filesystem encoding,
@ -60,6 +74,16 @@ _encodeFilePath fp = unsafePerformIO $ do
md5FilePath :: FilePath -> MD5.Str md5FilePath :: FilePath -> MD5.Str
md5FilePath = MD5.Str . _encodeFilePath md5FilePath = MD5.Str . _encodeFilePath
{- Decodes a ByteString into a FilePath, applying the filesystem encoding. -}
decodeBS :: L.ByteString -> FilePath
#ifndef mingw32_HOST_OS
decodeBS = encodeW8 . L.unpack
#else
{- On Windows, we assume that the ByteString is utf-8, since Windows
- only uses unicode for filenames. -}
decodeBS = L8.toString
#endif
{- Converts a [Word8] to a FilePath, encoding using the filesystem encoding. {- Converts a [Word8] to a FilePath, encoding using the filesystem encoding.
- -
- w82c produces a String, which may contain Chars that are invalid - w82c produces a String, which may contain Chars that are invalid
@ -84,6 +108,7 @@ decodeW8 = s2w8 . _encodeFilePath
- cost of efficiency when running on a large FilePath. - cost of efficiency when running on a large FilePath.
-} -}
truncateFilePath :: Int -> FilePath -> FilePath truncateFilePath :: Int -> FilePath -> FilePath
#ifndef mingw32_HOST_OS
truncateFilePath n = go . reverse truncateFilePath n = go . reverse
where where
go f = go f =
@ -91,3 +116,17 @@ truncateFilePath n = go . reverse
in if length bytes <= n in if length bytes <= n
then reverse f then reverse f
else go (drop 1 f) else go (drop 1 f)
#else
{- On Windows, count the number of bytes used by each utf8 character. -}
truncateFilePath n = reverse . go [] n . L8.fromString
where
go coll cnt bs
| cnt <= 0 = coll
| otherwise = case L8.decode bs of
Just (c, x) | c /= L8.replacement_char ->
let x' = fromIntegral x
in if cnt - x' < 0
then coll
else go (c:coll) (cnt - x') (L8.drop 1 bs)
_ -> coll
#endif

1
debian/changelog vendored
View file

@ -30,6 +30,7 @@ git-annex (5.20140307) UNRELEASED; urgency=medium
* Each for each metadata field, there's now an automatically maintained * Each for each metadata field, there's now an automatically maintained
"$field-lastchanged" that gives the timestamp of the last change to that "$field-lastchanged" that gives the timestamp of the last change to that
field. field.
* Windows: Fix some filename encoding bugs.
-- Joey Hess <joeyh@debian.org> Thu, 06 Mar 2014 16:17:01 -0400 -- Joey Hess <joeyh@debian.org> Thu, 06 Mar 2014 16:17:01 -0400

View file

@ -35,3 +35,7 @@ According to https://github.com/msysgit/msysgit/wiki/Git-for-Windows-Unicode-Sup
[2014-03-18 14:28:03 Central Europe Standard Time] read: git ["--git-dir=D:\\anntest\\.git","--work-tree=D:\\anntest","-c","core.bare=false","ls-files","--modified","-z","--","h\225\269ky.txt"] [2014-03-18 14:28:03 Central Europe Standard Time] read: git ["--git-dir=D:\\anntest\\.git","--work-tree=D:\\anntest","-c","core.bare=false","ls-files","--modified","-z","--","h\225\269ky.txt"]
I can provide additional information, just tell me what you need. I can provide additional information, just tell me what you need.
> [[fixed|done]], although this is not the end of encoding issues
> on Windows. Updating [[windows_support]] to discuss some other ones.
> --[[Joey]]

View file

@ -29,6 +29,42 @@ now! --[[Joey]]
* Deleting a git repository from inside the webapp fails "RemoveDirectory * Deleting a git repository from inside the webapp fails "RemoveDirectory
permision denied ... file is being used by another process" permision denied ... file is being used by another process"
## potential encoding problems
[[bugs/Unicode_file_names_ignored_on_Windows]] is fixed, but some potential
problems remain, since the FileSystemEncoding that git-annex relies on
seems unreliable/broken on Windows.
* When git-annex displays a filename that it's acting on, there
can be mojibake on Windows. For example, "háčky.txt" displays
the accented characters as instead the pairs of bytes making
up the utf-8. Tried doing various things to the stdout handle
to avoid this, but only ended up with encoding crashes, or worse
mojibake than this.
* `md5FilePath` still uses the filesystem encoding, and so may produce the
wrong value on Windows. This would impact keys that contain problem characters
(probably coming from the filename extension), and might cause
interoperability problems when git-annex generates the hash directories of a
remote, for example a rsync remote.
* `encodeW8` is used in Git.UnionMerge, and while I fixed the other calls to
encodeW8, which all involved ByteStrings reading from git and so can just
treat it as utf-8 on Windows (via `decodeBS`), in the union merge case,
the ByteString has no defined encoding. It may have been written on Unix
and contain keys with invalid unicode in them. On windows, the union
merge code should probably check if it's valid utf-8, and if not,
abort the merge.
* If interoperating with a git-annex repository from a unix system, it's
possible for a key to contain some invalid utf-8, which means its filename
cannot even be represented on Windows, so who knows what will happen in that
case -- probably it will fail in some way when adding the object file
to the Windows repo.
* If data from the git repo does not have a unicode encoding, it will be
mangled in various places on Windows, which can lead to undefined behavior.
## minor problems ## minor problems
* rsync special remotes with a rsyncurl of a local directory are known * rsync special remotes with a rsyncurl of a local directory are known