Windows: Fix some filename encoding bugs.
http://git-annex.branchable.com/bugs/Unicode_file_names_ignored_on_Windows/ Not a complete fix yet.
This commit is contained in:
parent
2f52f727c0
commit
1052eeface
8 changed files with 86 additions and 8 deletions
|
@ -80,7 +80,7 @@ catKey = catKey' True
|
||||||
catKey' :: Bool -> Ref -> FileMode -> Annex (Maybe Key)
|
catKey' :: Bool -> Ref -> FileMode -> Annex (Maybe Key)
|
||||||
catKey' modeguaranteed ref mode
|
catKey' modeguaranteed ref mode
|
||||||
| isSymLink mode = do
|
| isSymLink mode = do
|
||||||
l <- fromInternalGitPath . encodeW8 . L.unpack <$> get
|
l <- fromInternalGitPath . decodeBS <$> get
|
||||||
return $ if isLinkToAnnex l
|
return $ if isLinkToAnnex l
|
||||||
then fileKey $ takeFileName l
|
then fileKey $ takeFileName l
|
||||||
else Nothing
|
else Nothing
|
||||||
|
|
|
@ -10,7 +10,6 @@
|
||||||
module Command.Unused where
|
module Command.Unused where
|
||||||
|
|
||||||
import qualified Data.Set as S
|
import qualified Data.Set as S
|
||||||
import qualified Data.ByteString.Lazy as L
|
|
||||||
import Data.BloomFilter
|
import Data.BloomFilter
|
||||||
import Data.BloomFilter.Easy
|
import Data.BloomFilter.Easy
|
||||||
import Data.BloomFilter.Hash
|
import Data.BloomFilter.Hash
|
||||||
|
@ -296,7 +295,7 @@ withKeysReferencedInGitRef a ref = do
|
||||||
liftIO $ void clean
|
liftIO $ void clean
|
||||||
where
|
where
|
||||||
tKey True = fmap fst <$$> Backend.lookupFile . getTopFilePath . DiffTree.file
|
tKey True = fmap fst <$$> Backend.lookupFile . getTopFilePath . DiffTree.file
|
||||||
tKey False = fileKey . takeFileName . encodeW8 . L.unpack <$$>
|
tKey False = fileKey . takeFileName . decodeBS <$$>
|
||||||
catFile ref . getTopFilePath . DiffTree.file
|
catFile ref . getTopFilePath . DiffTree.file
|
||||||
|
|
||||||
{- Looks in the specified directory for bad/tmp keys, and returns a list
|
{- Looks in the specified directory for bad/tmp keys, and returns a list
|
||||||
|
|
|
@ -108,6 +108,6 @@ catTree h treeref = go <$> catObjectDetails h treeref
|
||||||
dropsha = L.drop 21
|
dropsha = L.drop 21
|
||||||
|
|
||||||
parsemodefile b =
|
parsemodefile b =
|
||||||
let (modestr, file) = separate (== ' ') (encodeW8 $ L.unpack b)
|
let (modestr, file) = separate (== ' ') (decodeBS b)
|
||||||
in (file, readmode modestr)
|
in (file, readmode modestr)
|
||||||
readmode = fst . fromMaybe (0, undefined) . headMaybe . readOct
|
readmode = fst . fromMaybe (0, undefined) . headMaybe . readOct
|
||||||
|
|
3
Test.hs
3
Test.hs
|
@ -22,7 +22,6 @@ import qualified Data.Map as M
|
||||||
import System.IO.HVFS (SystemFS(..))
|
import System.IO.HVFS (SystemFS(..))
|
||||||
import qualified Text.JSON
|
import qualified Text.JSON
|
||||||
import System.Path
|
import System.Path
|
||||||
import qualified Data.ByteString.Lazy as L
|
|
||||||
|
|
||||||
import Common
|
import Common
|
||||||
|
|
||||||
|
@ -1272,7 +1271,7 @@ test_add_subdirs env = intmpclonerepo env $ do
|
||||||
{- Regression test for Windows bug where symlinks were not
|
{- Regression test for Windows bug where symlinks were not
|
||||||
- calculated correctly for files in subdirs. -}
|
- calculated correctly for files in subdirs. -}
|
||||||
git_annex env "sync" [] @? "sync failed"
|
git_annex env "sync" [] @? "sync failed"
|
||||||
l <- annexeval $ encodeW8 . L.unpack <$> Annex.CatFile.catObject (Git.Types.Ref "HEAD:dir/foo")
|
l <- annexeval $ decodeBS <$> Annex.CatFile.catObject (Git.Types.Ref "HEAD:dir/foo")
|
||||||
"../.git/annex/" `isPrefixOf` l @? ("symlink from subdir to .git/annex is wrong: " ++ l)
|
"../.git/annex/" `isPrefixOf` l @? ("symlink from subdir to .git/annex is wrong: " ++ l)
|
||||||
|
|
||||||
createDirectory "dir2"
|
createDirectory "dir2"
|
||||||
|
|
|
@ -1,14 +1,17 @@
|
||||||
{- GHC File system encoding handling.
|
{- GHC File system encoding handling.
|
||||||
-
|
-
|
||||||
- Copyright 2012-2013 Joey Hess <joey@kitenet.net>
|
- Copyright 2012-2014 Joey Hess <joey@kitenet.net>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU GPL version 3 or higher.
|
- Licensed under the GNU GPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
|
||||||
|
{-# LANGUAGE CPP #-}
|
||||||
|
|
||||||
module Utility.FileSystemEncoding (
|
module Utility.FileSystemEncoding (
|
||||||
fileEncoding,
|
fileEncoding,
|
||||||
withFilePath,
|
withFilePath,
|
||||||
md5FilePath,
|
md5FilePath,
|
||||||
|
decodeBS,
|
||||||
decodeW8,
|
decodeW8,
|
||||||
encodeW8,
|
encodeW8,
|
||||||
truncateFilePath,
|
truncateFilePath,
|
||||||
|
@ -22,13 +25,24 @@ import System.IO.Unsafe
|
||||||
import qualified Data.Hash.MD5 as MD5
|
import qualified Data.Hash.MD5 as MD5
|
||||||
import Data.Word
|
import Data.Word
|
||||||
import Data.Bits.Utils
|
import Data.Bits.Utils
|
||||||
|
import qualified Data.ByteString.Lazy as L
|
||||||
|
#ifdef mingw32_HOST_OS
|
||||||
|
import qualified Data.ByteString.Lazy.UTF8 as L8
|
||||||
|
#endif
|
||||||
|
|
||||||
{- Sets a Handle to use the filesystem encoding. This causes data
|
{- Sets a Handle to use the filesystem encoding. This causes data
|
||||||
- written or read from it to be encoded/decoded the same
|
- written or read from it to be encoded/decoded the same
|
||||||
- as ghc 7.4 does to filenames etc. This special encoding
|
- as ghc 7.4 does to filenames etc. This special encoding
|
||||||
- allows "arbitrary undecodable bytes to be round-tripped through it". -}
|
- allows "arbitrary undecodable bytes to be round-tripped through it".
|
||||||
|
-}
|
||||||
fileEncoding :: Handle -> IO ()
|
fileEncoding :: Handle -> IO ()
|
||||||
|
#ifndef mingw32_HOST_OS
|
||||||
fileEncoding h = hSetEncoding h =<< Encoding.getFileSystemEncoding
|
fileEncoding h = hSetEncoding h =<< Encoding.getFileSystemEncoding
|
||||||
|
#else
|
||||||
|
{- The file system encoding does not work well on Windows,
|
||||||
|
- and Windows only has utf FilePaths anyway. -}
|
||||||
|
fileEncoding h = hSetEncoding h Encoding.utf8
|
||||||
|
#endif
|
||||||
|
|
||||||
{- Marshal a Haskell FilePath into a NUL terminated C string using temporary
|
{- Marshal a Haskell FilePath into a NUL terminated C string using temporary
|
||||||
- storage. The FilePath is encoded using the filesystem encoding,
|
- storage. The FilePath is encoded using the filesystem encoding,
|
||||||
|
@ -60,6 +74,16 @@ _encodeFilePath fp = unsafePerformIO $ do
|
||||||
md5FilePath :: FilePath -> MD5.Str
|
md5FilePath :: FilePath -> MD5.Str
|
||||||
md5FilePath = MD5.Str . _encodeFilePath
|
md5FilePath = MD5.Str . _encodeFilePath
|
||||||
|
|
||||||
|
{- Decodes a ByteString into a FilePath, applying the filesystem encoding. -}
|
||||||
|
decodeBS :: L.ByteString -> FilePath
|
||||||
|
#ifndef mingw32_HOST_OS
|
||||||
|
decodeBS = encodeW8 . L.unpack
|
||||||
|
#else
|
||||||
|
{- On Windows, we assume that the ByteString is utf-8, since Windows
|
||||||
|
- only uses unicode for filenames. -}
|
||||||
|
decodeBS = L8.toString
|
||||||
|
#endif
|
||||||
|
|
||||||
{- Converts a [Word8] to a FilePath, encoding using the filesystem encoding.
|
{- Converts a [Word8] to a FilePath, encoding using the filesystem encoding.
|
||||||
-
|
-
|
||||||
- w82c produces a String, which may contain Chars that are invalid
|
- w82c produces a String, which may contain Chars that are invalid
|
||||||
|
@ -84,6 +108,7 @@ decodeW8 = s2w8 . _encodeFilePath
|
||||||
- cost of efficiency when running on a large FilePath.
|
- cost of efficiency when running on a large FilePath.
|
||||||
-}
|
-}
|
||||||
truncateFilePath :: Int -> FilePath -> FilePath
|
truncateFilePath :: Int -> FilePath -> FilePath
|
||||||
|
#ifndef mingw32_HOST_OS
|
||||||
truncateFilePath n = go . reverse
|
truncateFilePath n = go . reverse
|
||||||
where
|
where
|
||||||
go f =
|
go f =
|
||||||
|
@ -91,3 +116,17 @@ truncateFilePath n = go . reverse
|
||||||
in if length bytes <= n
|
in if length bytes <= n
|
||||||
then reverse f
|
then reverse f
|
||||||
else go (drop 1 f)
|
else go (drop 1 f)
|
||||||
|
#else
|
||||||
|
{- On Windows, count the number of bytes used by each utf8 character. -}
|
||||||
|
truncateFilePath n = reverse . go [] n . L8.fromString
|
||||||
|
where
|
||||||
|
go coll cnt bs
|
||||||
|
| cnt <= 0 = coll
|
||||||
|
| otherwise = case L8.decode bs of
|
||||||
|
Just (c, x) | c /= L8.replacement_char ->
|
||||||
|
let x' = fromIntegral x
|
||||||
|
in if cnt - x' < 0
|
||||||
|
then coll
|
||||||
|
else go (c:coll) (cnt - x') (L8.drop 1 bs)
|
||||||
|
_ -> coll
|
||||||
|
#endif
|
||||||
|
|
1
debian/changelog
vendored
1
debian/changelog
vendored
|
@ -30,6 +30,7 @@ git-annex (5.20140307) UNRELEASED; urgency=medium
|
||||||
* Each for each metadata field, there's now an automatically maintained
|
* Each for each metadata field, there's now an automatically maintained
|
||||||
"$field-lastchanged" that gives the timestamp of the last change to that
|
"$field-lastchanged" that gives the timestamp of the last change to that
|
||||||
field.
|
field.
|
||||||
|
* Windows: Fix some filename encoding bugs.
|
||||||
|
|
||||||
-- Joey Hess <joeyh@debian.org> Thu, 06 Mar 2014 16:17:01 -0400
|
-- Joey Hess <joeyh@debian.org> Thu, 06 Mar 2014 16:17:01 -0400
|
||||||
|
|
||||||
|
|
|
@ -35,3 +35,7 @@ According to https://github.com/msysgit/msysgit/wiki/Git-for-Windows-Unicode-Sup
|
||||||
[2014-03-18 14:28:03 Central Europe Standard Time] read: git ["--git-dir=D:\\anntest\\.git","--work-tree=D:\\anntest","-c","core.bare=false","ls-files","--modified","-z","--","h\225\269ky.txt"]
|
[2014-03-18 14:28:03 Central Europe Standard Time] read: git ["--git-dir=D:\\anntest\\.git","--work-tree=D:\\anntest","-c","core.bare=false","ls-files","--modified","-z","--","h\225\269ky.txt"]
|
||||||
|
|
||||||
I can provide additional information, just tell me what you need.
|
I can provide additional information, just tell me what you need.
|
||||||
|
|
||||||
|
> [[fixed|done]], although this is not the end of encoding issues
|
||||||
|
> on Windows. Updating [[windows_support]] to discuss some other ones.
|
||||||
|
> --[[Joey]]
|
||||||
|
|
|
@ -29,6 +29,42 @@ now! --[[Joey]]
|
||||||
* Deleting a git repository from inside the webapp fails "RemoveDirectory
|
* Deleting a git repository from inside the webapp fails "RemoveDirectory
|
||||||
permision denied ... file is being used by another process"
|
permision denied ... file is being used by another process"
|
||||||
|
|
||||||
|
## potential encoding problems
|
||||||
|
|
||||||
|
[[bugs/Unicode_file_names_ignored_on_Windows]] is fixed, but some potential
|
||||||
|
problems remain, since the FileSystemEncoding that git-annex relies on
|
||||||
|
seems unreliable/broken on Windows.
|
||||||
|
|
||||||
|
* When git-annex displays a filename that it's acting on, there
|
||||||
|
can be mojibake on Windows. For example, "háčky.txt" displays
|
||||||
|
the accented characters as instead the pairs of bytes making
|
||||||
|
up the utf-8. Tried doing various things to the stdout handle
|
||||||
|
to avoid this, but only ended up with encoding crashes, or worse
|
||||||
|
mojibake than this.
|
||||||
|
|
||||||
|
* `md5FilePath` still uses the filesystem encoding, and so may produce the
|
||||||
|
wrong value on Windows. This would impact keys that contain problem characters
|
||||||
|
(probably coming from the filename extension), and might cause
|
||||||
|
interoperability problems when git-annex generates the hash directories of a
|
||||||
|
remote, for example a rsync remote.
|
||||||
|
|
||||||
|
* `encodeW8` is used in Git.UnionMerge, and while I fixed the other calls to
|
||||||
|
encodeW8, which all involved ByteStrings reading from git and so can just
|
||||||
|
treat it as utf-8 on Windows (via `decodeBS`), in the union merge case,
|
||||||
|
the ByteString has no defined encoding. It may have been written on Unix
|
||||||
|
and contain keys with invalid unicode in them. On windows, the union
|
||||||
|
merge code should probably check if it's valid utf-8, and if not,
|
||||||
|
abort the merge.
|
||||||
|
|
||||||
|
* If interoperating with a git-annex repository from a unix system, it's
|
||||||
|
possible for a key to contain some invalid utf-8, which means its filename
|
||||||
|
cannot even be represented on Windows, so who knows what will happen in that
|
||||||
|
case -- probably it will fail in some way when adding the object file
|
||||||
|
to the Windows repo.
|
||||||
|
|
||||||
|
* If data from the git repo does not have a unicode encoding, it will be
|
||||||
|
mangled in various places on Windows, which can lead to undefined behavior.
|
||||||
|
|
||||||
## minor problems
|
## minor problems
|
||||||
|
|
||||||
* rsync special remotes with a rsyncurl of a local directory are known
|
* rsync special remotes with a rsyncurl of a local directory are known
|
||||||
|
|
Loading…
Reference in a new issue