Windows: Fix some filename encoding bugs.

http://git-annex.branchable.com/bugs/Unicode_file_names_ignored_on_Windows/

Not a complete fix yet.
This commit is contained in:
Joey Hess 2014-03-19 14:49:01 -04:00
parent 2f52f727c0
commit 1052eeface
8 changed files with 86 additions and 8 deletions

View file

@ -80,7 +80,7 @@ catKey = catKey' True
catKey' :: Bool -> Ref -> FileMode -> Annex (Maybe Key)
catKey' modeguaranteed ref mode
| isSymLink mode = do
l <- fromInternalGitPath . encodeW8 . L.unpack <$> get
l <- fromInternalGitPath . decodeBS <$> get
return $ if isLinkToAnnex l
then fileKey $ takeFileName l
else Nothing

View file

@ -10,7 +10,6 @@
module Command.Unused where
import qualified Data.Set as S
import qualified Data.ByteString.Lazy as L
import Data.BloomFilter
import Data.BloomFilter.Easy
import Data.BloomFilter.Hash
@ -296,7 +295,7 @@ withKeysReferencedInGitRef a ref = do
liftIO $ void clean
where
tKey True = fmap fst <$$> Backend.lookupFile . getTopFilePath . DiffTree.file
tKey False = fileKey . takeFileName . encodeW8 . L.unpack <$$>
tKey False = fileKey . takeFileName . decodeBS <$$>
catFile ref . getTopFilePath . DiffTree.file
{- Looks in the specified directory for bad/tmp keys, and returns a list

View file

@ -108,6 +108,6 @@ catTree h treeref = go <$> catObjectDetails h treeref
dropsha = L.drop 21
parsemodefile b =
let (modestr, file) = separate (== ' ') (encodeW8 $ L.unpack b)
let (modestr, file) = separate (== ' ') (decodeBS b)
in (file, readmode modestr)
readmode = fst . fromMaybe (0, undefined) . headMaybe . readOct

View file

@ -22,7 +22,6 @@ import qualified Data.Map as M
import System.IO.HVFS (SystemFS(..))
import qualified Text.JSON
import System.Path
import qualified Data.ByteString.Lazy as L
import Common
@ -1272,7 +1271,7 @@ test_add_subdirs env = intmpclonerepo env $ do
{- Regression test for Windows bug where symlinks were not
- calculated correctly for files in subdirs. -}
git_annex env "sync" [] @? "sync failed"
l <- annexeval $ encodeW8 . L.unpack <$> Annex.CatFile.catObject (Git.Types.Ref "HEAD:dir/foo")
l <- annexeval $ decodeBS <$> Annex.CatFile.catObject (Git.Types.Ref "HEAD:dir/foo")
"../.git/annex/" `isPrefixOf` l @? ("symlink from subdir to .git/annex is wrong: " ++ l)
createDirectory "dir2"

View file

@ -1,14 +1,17 @@
{- GHC File system encoding handling.
-
- Copyright 2012-2013 Joey Hess <joey@kitenet.net>
- Copyright 2012-2014 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE CPP #-}
module Utility.FileSystemEncoding (
fileEncoding,
withFilePath,
md5FilePath,
decodeBS,
decodeW8,
encodeW8,
truncateFilePath,
@ -22,13 +25,24 @@ import System.IO.Unsafe
import qualified Data.Hash.MD5 as MD5
import Data.Word
import Data.Bits.Utils
import qualified Data.ByteString.Lazy as L
#ifdef mingw32_HOST_OS
import qualified Data.ByteString.Lazy.UTF8 as L8
#endif
{- Sets a Handle to use the filesystem encoding. This causes data
- written or read from it to be encoded/decoded the same
- as ghc 7.4 does to filenames etc. This special encoding
- allows "arbitrary undecodable bytes to be round-tripped through it". -}
- allows "arbitrary undecodable bytes to be round-tripped through it".
-}
fileEncoding :: Handle -> IO ()
#ifndef mingw32_HOST_OS
fileEncoding h = hSetEncoding h =<< Encoding.getFileSystemEncoding
#else
{- The file system encoding does not work well on Windows,
- and Windows only has utf FilePaths anyway. -}
fileEncoding h = hSetEncoding h Encoding.utf8
#endif
{- Marshal a Haskell FilePath into a NUL terminated C string using temporary
- storage. The FilePath is encoded using the filesystem encoding,
@ -60,6 +74,16 @@ _encodeFilePath fp = unsafePerformIO $ do
md5FilePath :: FilePath -> MD5.Str
md5FilePath = MD5.Str . _encodeFilePath
{- Decodes a ByteString into a FilePath, applying the filesystem encoding. -}
decodeBS :: L.ByteString -> FilePath
#ifndef mingw32_HOST_OS
decodeBS = encodeW8 . L.unpack
#else
{- On Windows, we assume that the ByteString is utf-8, since Windows
- only uses unicode for filenames. -}
decodeBS = L8.toString
#endif
{- Converts a [Word8] to a FilePath, encoding using the filesystem encoding.
-
- w82c produces a String, which may contain Chars that are invalid
@ -84,6 +108,7 @@ decodeW8 = s2w8 . _encodeFilePath
- cost of efficiency when running on a large FilePath.
-}
truncateFilePath :: Int -> FilePath -> FilePath
#ifndef mingw32_HOST_OS
truncateFilePath n = go . reverse
where
go f =
@ -91,3 +116,17 @@ truncateFilePath n = go . reverse
in if length bytes <= n
then reverse f
else go (drop 1 f)
#else
{- On Windows, count the number of bytes used by each utf8 character. -}
truncateFilePath n = reverse . go [] n . L8.fromString
where
go coll cnt bs
| cnt <= 0 = coll
| otherwise = case L8.decode bs of
Just (c, x) | c /= L8.replacement_char ->
let x' = fromIntegral x
in if cnt - x' < 0
then coll
else go (c:coll) (cnt - x') (L8.drop 1 bs)
_ -> coll
#endif

1
debian/changelog vendored
View file

@ -30,6 +30,7 @@ git-annex (5.20140307) UNRELEASED; urgency=medium
* Each for each metadata field, there's now an automatically maintained
"$field-lastchanged" that gives the timestamp of the last change to that
field.
* Windows: Fix some filename encoding bugs.
-- Joey Hess <joeyh@debian.org> Thu, 06 Mar 2014 16:17:01 -0400

View file

@ -35,3 +35,7 @@ According to https://github.com/msysgit/msysgit/wiki/Git-for-Windows-Unicode-Sup
[2014-03-18 14:28:03 Central Europe Standard Time] read: git ["--git-dir=D:\\anntest\\.git","--work-tree=D:\\anntest","-c","core.bare=false","ls-files","--modified","-z","--","h\225\269ky.txt"]
I can provide additional information, just tell me what you need.
> [[fixed|done]], although this is not the end of encoding issues
> on Windows. Updating [[windows_support]] to discuss some other ones.
> --[[Joey]]

View file

@ -29,6 +29,42 @@ now! --[[Joey]]
* Deleting a git repository from inside the webapp fails "RemoveDirectory
permision denied ... file is being used by another process"
## potential encoding problems
[[bugs/Unicode_file_names_ignored_on_Windows]] is fixed, but some potential
problems remain, since the FileSystemEncoding that git-annex relies on
seems unreliable/broken on Windows.
* When git-annex displays a filename that it's acting on, there
can be mojibake on Windows. For example, "háčky.txt" displays
the accented characters as instead the pairs of bytes making
up the utf-8. Tried doing various things to the stdout handle
to avoid this, but only ended up with encoding crashes, or worse
mojibake than this.
* `md5FilePath` still uses the filesystem encoding, and so may produce the
wrong value on Windows. This would impact keys that contain problem characters
(probably coming from the filename extension), and might cause
interoperability problems when git-annex generates the hash directories of a
remote, for example a rsync remote.
* `encodeW8` is used in Git.UnionMerge, and while I fixed the other calls to
encodeW8, which all involved ByteStrings reading from git and so can just
treat it as utf-8 on Windows (via `decodeBS`), in the union merge case,
the ByteString has no defined encoding. It may have been written on Unix
and contain keys with invalid unicode in them. On windows, the union
merge code should probably check if it's valid utf-8, and if not,
abort the merge.
* If interoperating with a git-annex repository from a unix system, it's
possible for a key to contain some invalid utf-8, which means its filename
cannot even be represented on Windows, so who knows what will happen in that
case -- probably it will fail in some way when adding the object file
to the Windows repo.
* If data from the git repo does not have a unicode encoding, it will be
mangled in various places on Windows, which can lead to undefined behavior.
## minor problems
* rsync special remotes with a rsyncurl of a local directory are known