diff --git a/Annex/UntrustedFilePath.hs b/Annex/UntrustedFilePath.hs new file mode 100644 index 0000000000..29c02709f0 --- /dev/null +++ b/Annex/UntrustedFilePath.hs @@ -0,0 +1,56 @@ +{- handling untrusted filepaths + - + - Copyright 2010-2020 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. + -} + +module Annex.UntrustedFilePath where + +import Data.Char +import System.FilePath + +{- Given a string that we'd like to use as the basis for FilePath, but that + - was provided by a third party and is not to be trusted, returns the closest + - sane FilePath. + - + - All spaces and punctuation and other wacky stuff are replaced + - with '_', except for '.' + - + - "../" becomes ".._", which is safe. + - "/foo" becomes "_foo", which is safe. + - "c:foo" becomes "c_foo", which is safe even on windows. + - + - Leading '.' is also replaced with '_', so ".git/foo" becomes "_git_foo" + - and so no dotfiles that might control a program are inadvertently created. + -} +sanitizeFilePath :: String -> FilePath +sanitizeFilePath = leadingdot . map sanitize + where + sanitize c + | c == '.' = c + | isSpace c || isPunctuation c || isSymbol c || isControl c || c == '/' = '_' + | otherwise = c + + leadingdot ('.':s) = '_':s + leadingdot s = s + +escapeSequenceInFilePath :: FilePath -> Bool +escapeSequenceInFilePath f = '\ESC' `elem` f + +{- ../ is a path traversal, no matter where it appears. + - + - An absolute path is, of course. + -} +pathTraversalInFilePath :: FilePath -> Bool +pathTraversalInFilePath f + | isAbsolute f = True + | any (== "..") (splitPath f) = True + -- On windows, C:foo with no directory is not considered absolute + | hasDrive f = True + | otherwise = False + +gitDirectoryInFilePath :: FilePath -> Bool +gitDirectoryInFilePath = any (== ".git") + . map dropTrailingPathSeparator + . splitPath diff --git a/CHANGELOG b/CHANGELOG index 4afb03eb69..ad4d9ad3c2 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -16,6 +16,10 @@ git-annex (8.20200502) UNRELEASED; urgency=medium * git-lfs repos that encrypt the annexed content but not the git repo only need --force passed to initremote, allow enableremote and autoenable of such remotes without forcing again. + * addurl, importfeed: Avoid adding filenames with leading '.', instead + it will be replaced with '_'. + * addurl --preserve-filename: New option, uses server-provided filename + without any sanitization, but with some security checking. -- Joey Hess Mon, 04 May 2020 12:46:11 -0400 diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs index 9097b7f94a..73746e8c1a 100644 --- a/Command/AddUrl.hs +++ b/Command/AddUrl.hs @@ -1,6 +1,6 @@ {- git-annex command - - - Copyright 2011-2017 Joey Hess + - Copyright 2011-2020 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -23,6 +23,7 @@ import Annex.CheckIgnore import Annex.Perms import Annex.UUID import Annex.YoutubeDl +import Annex.UntrustedFilePath import Logs.Web import Types.KeySource import Types.UrlContents @@ -52,6 +53,7 @@ data DownloadOptions = DownloadOptions { relaxedOption :: Bool , rawOption :: Bool , fileOption :: Maybe FilePath + , preserveFilenameOption :: Bool } optParser :: CmdParamsDesc -> Parser AddUrlOptions @@ -77,7 +79,7 @@ optParser desc = AddUrlOptions ) parseDownloadOptions :: Bool -> Parser DownloadOptions -parseDownloadOptions withfileoption = DownloadOptions +parseDownloadOptions withfileoptions = DownloadOptions <$> switch ( long "relaxed" <> help "skip size check" @@ -86,12 +88,18 @@ parseDownloadOptions withfileoption = DownloadOptions ( long "raw" <> help "disable special handling for torrents, youtube-dl, etc" ) - <*> if withfileoption + <*> (if withfileoptions then optional (strOption ( long "file" <> metavar paramFile <> help "specify what file the url is added to" )) - else pure Nothing + else pure Nothing) + <*> (if withfileoptions + then switch + ( long "preserve-filename" + <> help "use filename provided by server as-is" + ) + else pure False) seek :: AddUrlOptions -> CommandSeek seek o = startConcurrency commandStages $ do @@ -207,16 +215,35 @@ startWeb addunlockedmatcher o urlstring = go $ fromMaybe bad $ parseURI urlstrin file <- adjustFile o <$> case fileOption (downloadOptions o) of Just f -> pure f Nothing -> case Url.urlSuggestedFile urlinfo of - Nothing -> pure $ url2file url (pathdepthOption o) pathmax - Just sf -> do - let f = truncateFilePath pathmax $ - sanitizeFilePath sf - ifM (liftIO $ doesFileExist f <||> doesDirectoryExist f) - ( pure $ url2file url (pathdepthOption o) pathmax - , pure f - ) + Just sf | not (null sf) -> if preserveFilenameOption (downloadOptions o) + then do + checkPreserveFileNameSecurity sf + return sf + else do + let f = truncateFilePath pathmax $ + sanitizeFilePath sf + ifM (liftIO $ doesFileExist f <||> doesDirectoryExist f) + ( pure $ url2file url (pathdepthOption o) pathmax + , pure f + ) + _ -> pure $ url2file url (pathdepthOption o) pathmax performWeb addunlockedmatcher o urlstring file urlinfo +-- sanitizeFilePath avoids all these security problems +-- (and probably others, but at least this catches the most egrarious ones). +checkPreserveFileNameSecurity :: FilePath -> Annex () +checkPreserveFileNameSecurity f = do + checksecurity escapeSequenceInFilePath False "escape sequence" + checksecurity pathTraversalInFilePath True "path traversal" + checksecurity gitDirectoryInFilePath True "contains a .git directory" + where + checksecurity p canshow d = when (p f) $ + giveup $ concat + [ "--preserve-filename was used, but the filename " + , if canshow then "(" ++ f ++ ") " else "" + , "has a security problem (" ++ d ++ "), not adding." + ] + performWeb :: AddUnlockedMatcher -> AddUrlOptions -> URLString -> FilePath -> Url.UrlInfo -> CommandPerform performWeb addunlockedmatcher o url file urlinfo = ifAnnexed (toRawFilePath file) addurl geturl where diff --git a/Command/ImportFeed.hs b/Command/ImportFeed.hs index 6b670668cc..be03ccb793 100644 --- a/Command/ImportFeed.hs +++ b/Command/ImportFeed.hs @@ -40,6 +40,7 @@ import Logs.MetaData import Annex.MetaData import Annex.FileMatcher import Command.AddUrl (addWorkTree) +import Annex.UntrustedFilePath cmd :: Command cmd = notBareRepo $ diff --git a/Types/UrlContents.hs b/Types/UrlContents.hs index c68efd00a4..7cbfe37f88 100644 --- a/Types/UrlContents.hs +++ b/Types/UrlContents.hs @@ -13,7 +13,7 @@ module Types.UrlContents ( ) where import Utility.Url -import Utility.Path +import Annex.UntrustedFilePath import System.FilePath @@ -35,13 +35,7 @@ newtype SafeFilePath = SafeFilePath FilePath mkSafeFilePath :: FilePath -> SafeFilePath mkSafeFilePath p = SafeFilePath $ if null p' then "file" else p' where - p' = joinPath $ filter safe $ map sanitizeFilePath $ splitDirectories p - safe s - | isDrive s = False - | s == ".." = False - | s == ".git" = False - | null s = False - | otherwise = True + p' = joinPath $ map sanitizeFilePath $ splitDirectories p fromSafeFilePath :: SafeFilePath -> FilePath fromSafeFilePath (SafeFilePath p) = p diff --git a/Utility/Path.hs b/Utility/Path.hs index a8ab918afa..c1137d7924 100644 --- a/Utility/Path.hs +++ b/Utility/Path.hs @@ -1,6 +1,6 @@ {- path manipulation - - - Copyright 2010-2014 Joey Hess + - Copyright 2010-2020 Joey Hess - - License: BSD-2-clause -} @@ -24,7 +24,6 @@ module Utility.Path ( inPath, searchPath, dotfile, - sanitizeFilePath, splitShortExtensions, prop_upFrom_basics, @@ -35,7 +34,6 @@ module Utility.Path ( import System.FilePath import Data.List import Data.Maybe -import Data.Char import Control.Applicative import Prelude @@ -276,22 +274,6 @@ dotfile file where f = takeFileName file -{- Given a string that we'd like to use as the basis for FilePath, but that - - was provided by a third party and is not to be trusted, returns the closest - - sane FilePath. - - - - All spaces and punctuation and other wacky stuff are replaced - - with '_', except for '.' - - "../" will thus turn into ".._", which is safe. - -} -sanitizeFilePath :: String -> FilePath -sanitizeFilePath = map sanitize - where - sanitize c - | c == '.' = c - | isSpace c || isPunctuation c || isSymbol c || isControl c || c == '/' = '_' - | otherwise = c - {- Similar to splitExtensions, but knows that some things in FilePaths - after a dot are too long to be extensions. -} splitShortExtensions :: FilePath -> (FilePath, [String]) diff --git a/doc/bugs/addurl__58___content-disposition_field_should_be_taken_as_is_without_obfuscation_/comment_3_a1aeed36b8317144bc5df6fe7bcfd488._comment b/doc/bugs/addurl__58___content-disposition_field_should_be_taken_as_is_without_obfuscation_/comment_3_a1aeed36b8317144bc5df6fe7bcfd488._comment new file mode 100644 index 0000000000..b09f93c9a4 --- /dev/null +++ b/doc/bugs/addurl__58___content-disposition_field_should_be_taken_as_is_without_obfuscation_/comment_3_a1aeed36b8317144bc5df6fe7bcfd488._comment @@ -0,0 +1,18 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 3""" + date="2020-05-08T19:56:27Z" + content=""" +Implemented git-annex addurl --preserve-filename, which will do what you +want. + +Leaving this bug open because I only implemented it for web urls, not yet +for torrents and other special remotes that have their own url scheme. +The sanitization for those is currently done at a lower level than addurl, +and so that will take a bit more work to implement. + +(importfeed does not, I think, need to implement this option, because +the filenames are based on information from the rss feed, and it's +perfectly fine to sanitize eg a podcast episode title to get a reasonable +filename.) +"""]] diff --git a/doc/git-annex-addurl.mdwn b/doc/git-annex-addurl.mdwn index 224d23078a..68ff68b2b9 100644 --- a/doc/git-annex-addurl.mdwn +++ b/doc/git-annex-addurl.mdwn @@ -57,6 +57,16 @@ be used to get better filenames. If the file already exists, addurl will record that it can be downloaded from the specified url(s). +* `--preserve-filename` + + When the web server (or torrent, etc) provides a filename, use it as-is, + avoiding sanitizing unusual characters, or truncating it to length, or any + other modifications. + + git-annex will still check the filename for safety, and if the filename + has a security problem such as path traversal or an escape sequence, + it will refuse to add it. + * `--pathdepth=N` Rather than basing the filename on the whole url, this causes a path to diff --git a/git-annex.cabal b/git-annex.cabal index 7af56a3d1c..8e76988ea7 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -661,6 +661,7 @@ Executable git-annex Annex.TaggedPush Annex.Tmp Annex.Transfer + Annex.UntrustedFilePath Annex.UpdateInstead Annex.UUID Annex.Url