Better sanitization of problem characters when generating URL and WORM keys.

FAT has a lot of characters it does not allow in filenames, like ? and *
It's probably the worst offender, but other filesystems also have
limitiations.

In 2011, I made keyFile escape : to handle FAT, but missed the other
characters. It also turns out that when I did that, I was also living
dangerously; any existing keys that contained a : had their object
location change. Oops.

So, adding new characters to escape to keyFile is out. Well, it would be
possible to make keyFile behave differently on a per-filesystem basis, but
this would be a real nightmare to get right. Consider that a rsync special
remote uses keyFile to determine the filenames to use, and we don't know
the underlying filesystem on the rsync server..

Instead, I have gone for a solution that is backwards compatable and
simple. Its only downside is that already generated URL and WORM keys
might not be able to be stored on FAT or some other filesystem that
dislikes a character used in the key. (In this case, the user can just
migrate the problem keys to a checksumming backend. If this became a big
problem, fsck could be made to detect these and suggest a migration.)

Going forward, new keys that are created will escape all characters that
are likely to cause problems. And if some filesystem comes along that's
even worse than FAT (seems unlikely, but here it is 2013, and people are
still using FAT!), additional characters can be added to the set that are
escaped without difficulty.

(Also, made WORM limit the part of the filename that is embedded in the key,
to deal with filesystem filename length limits. This could have already
been a problem, but is more likely now, since the escaping of the filename
can make it longer.)

This commit was sponsored by Ian Downes
This commit is contained in:
Joey Hess 2013-10-05 15:01:49 -04:00
parent 3dac026598
commit 1be4d281d6
6 changed files with 74 additions and 20 deletions

View file

@ -10,11 +10,10 @@ module Backend.URL (
fromUrl fromUrl
) where ) where
import Data.Hash.MD5
import Common.Annex import Common.Annex
import Types.Backend import Types.Backend
import Types.Key import Types.Key
import Backend.Utilities
backends :: [Backend] backends :: [Backend]
backends = [backend] backends = [backend]
@ -27,18 +26,12 @@ backend = Backend
, canUpgradeKey = Nothing , canUpgradeKey = Nothing
} }
{- When it's not too long, use the full url as the key name. {- Every unique url has a corresponding key. -}
- If the url is too long, it's truncated at half the filename length
- limit, and the md5 of the url is prepended to ensure a unique key. -}
fromUrl :: String -> Maybe Integer -> Annex Key fromUrl :: String -> Maybe Integer -> Annex Key
fromUrl url size = do fromUrl url size = do
limit <- liftIO . fileNameLengthLimit =<< fromRepo gitAnnexDir n <- genKeyName url
let truncurl = truncateFilePath (limit `div` 2) url
let key = if url == truncurl
then url
else truncurl ++ "-" ++ md5s (Str url)
return $ stubKey return $ stubKey
{ keyName = key { keyName = n
, keyBackendName = "URL" , keyBackendName = "URL"
, keySize = size , keySize = size
} }

25
Backend/Utilities.hs Normal file
View file

@ -0,0 +1,25 @@
{- git-annex backend utilities
-
- Copyright 2012 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Backend.Utilities where
import Data.Hash.MD5
import Common.Annex
{- Generates a keyName from an input string. Takes care of sanitizing it.
- If it's not too long, the full string is used as the keyName.
- Otherwise, it's truncated at half the filename length limit, and its
- md5 is prepended to ensure a unique key. -}
genKeyName :: String -> Annex String
genKeyName s = do
limit <- liftIO . fileNameLengthLimit =<< fromRepo gitAnnexDir
let s' = preSanitizeKeyName s
let truncs = truncateFilePath (limit `div` 2) s'
return $ if s' == truncs
then s'
else truncs ++ "-" ++ md5s (Str s)

View file

@ -11,6 +11,7 @@ import Common.Annex
import Types.Backend import Types.Backend
import Types.Key import Types.Key
import Types.KeySource import Types.KeySource
import Backend.Utilities
backends :: [Backend] backends :: [Backend]
backends = [backend] backends = [backend]
@ -33,9 +34,10 @@ backend = Backend
keyValue :: KeySource -> Annex (Maybe Key) keyValue :: KeySource -> Annex (Maybe Key)
keyValue source = do keyValue source = do
stat <- liftIO $ getFileStatus $ contentLocation source stat <- liftIO $ getFileStatus $ contentLocation source
return $ Just Key { n <- genKeyName $ keyFilename source
keyName = takeFileName $ keyFilename source, return $ Just Key
keyBackendName = name backend, { keyName = n
keySize = Just $ fromIntegral $ fileSize stat, , keyBackendName = name backend
keyMtime = Just $ modificationTime stat , keySize = Just $ fromIntegral $ fileSize stat
} , keyMtime = Just $ modificationTime stat
}

View file

@ -51,6 +51,7 @@ module Locations (
annexHashes, annexHashes,
hashDirMixed, hashDirMixed,
hashDirLower, hashDirLower,
preSanitizeKeyName,
prop_idempotent_fileKey prop_idempotent_fileKey
) where ) where
@ -58,6 +59,7 @@ module Locations (
import Data.Bits import Data.Bits
import Data.Word import Data.Word
import Data.Hash.MD5 import Data.Hash.MD5
import Data.Char
import Common import Common
import Types import Types
@ -284,6 +286,32 @@ gitAnnexAssistantDefaultDir = "annex"
isLinkToAnnex :: FilePath -> Bool isLinkToAnnex :: FilePath -> Bool
isLinkToAnnex s = (pathSeparator:objectDir) `isInfixOf` s isLinkToAnnex s = (pathSeparator:objectDir) `isInfixOf` s
{- Sanitizes a String that will be used as part of a Key's keyName,
- dealing with characters that cause problems on substandard filesystems.
-
- This is used when a new Key is initially being generated, eg by getKey.
- Unlike keyFile and fileKey, it does not need to be a reversable
- escaping. Also, it's ok to change this to add more problimatic
- characters later. Unlike changing keyFile, which could result in the
- filenames used for existing keys changing and contents getting lost.
-
- It is, however, important that the input and output of this function
- have a 1:1 mapping, to avoid two different inputs from mapping to the
- same key.
-}
preSanitizeKeyName :: String -> String
preSanitizeKeyName = concatMap escape
where
escape c
| isAsciiUpper c || isAsciiLower c || isDigit c = [c]
| c `elem` ".-_ " = [c] -- common, assumed safe
| c `elem` "/%:" = [c] -- handled by keyFile
-- , is safe and uncommon, so will be used to escape
-- other characters. By itself, it is escaped to
-- doubled form.
| c == ',' = ",,"
| otherwise = ',' : show(ord(c))
{- Converts a key into a filename fragment without any directory. {- Converts a key into a filename fragment without any directory.
- -
- Escape "/" in the key name, to keep a flat tree of files and avoid - Escape "/" in the key name, to keep a flat tree of files and avoid
@ -293,8 +321,10 @@ isLinkToAnnex s = (pathSeparator:objectDir) `isInfixOf` s
- a slash - a slash
- "%" is escaped to "&s", and "&" to "&a"; this ensures that the mapping - "%" is escaped to "&s", and "&" to "&a"; this ensures that the mapping
- is one to one. - is one to one.
- ":" is escaped to "&c", because despite it being 20XX people still care - ":" is escaped to "&c", because it seemed like a good idea at the time.
- about FAT. -
- Changing what this function escapes and how is not a good idea, as it
- can cause existing objects to get lost.
-} -}
keyFile :: Key -> FilePath keyFile :: Key -> FilePath
keyFile key = replace "/" "%" $ replace ":" "&c" $ keyFile key = replace "/" "%" $ replace ":" "&c" $

2
debian/changelog vendored
View file

@ -7,6 +7,8 @@ git-annex (4.20131003) UNRELEASED; urgency=low
and remove it so it does not interfere with the automatic and remove it so it does not interfere with the automatic
commits of changed files. commits of changed files.
* addurl: Better sanitization of generated filenames. * addurl: Better sanitization of generated filenames.
* Better sanitization of problem characters when generating URL and WORM
keys.
-- Joey Hess <joeyh@debian.org> Thu, 03 Oct 2013 15:41:24 -0400 -- Joey Hess <joeyh@debian.org> Thu, 03 Oct 2013 15:41:24 -0400

View file

@ -40,3 +40,5 @@ git-annex: addurl: 1 failed
# End of transcript or log. # End of transcript or log.
"""]] """]]
> [[fixed|done]] --[[Joey]]