addurl: When a Content-Disposition header suggests a filename to use, addurl will consider using it, if it's reasonable and doesn't conflict with an existing file. (--file overrides this)

This commit is contained in:
Joey Hess 2015-01-22 14:52:52 -04:00
parent 91f1b2bdcf
commit 587f6a919b
7 changed files with 113 additions and 72 deletions

View file

@ -96,7 +96,7 @@ newAssistantUrl repo = do
- warp-tls listens to http, in order to show an error page, so this works.
-}
assistantListening :: URLString -> IO Bool
assistantListening url = catchBoolIO $ fst <$> exists url' def
assistantListening url = catchBoolIO $ exists url' def
where
url' = case parseURI url of
Nothing -> url

View file

@ -191,7 +191,7 @@ escapeHeader = escapeURIString (\c -> isUnescapedInURI c && c /= ' ')
getRepoInfo :: RemoteConfig -> Widget
getRepoInfo c = do
uo <- liftAnnex Url.getUrlOptions
exists <- liftIO $ catchDefaultIO False $ fst <$> Url.exists url uo
exists <- liftIO $ catchDefaultIO False $ Url.exists url uo
[whamlet|
<a href="#{url}">
Internet Archive item

View file

@ -120,17 +120,16 @@ downloadRemoteFile r relaxed uri file sz = do
loguri = setDownloader uri OtherDownloader
startWeb :: Bool -> Maybe FilePath -> Maybe Int -> String -> CommandStart
startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s
startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI urlstring
where
(s', downloader) = getDownloader s
bad = fromMaybe (error $ "bad url " ++ s') $
parseURI $ escapeURIString isUnescapedInURI s'
choosefile = flip fromMaybe optfile
(urlstring, downloader) = getDownloader s
bad = fromMaybe (error $ "bad url " ++ urlstring) $
parseURI $ escapeURIString isUnescapedInURI urlstring
go url = case downloader of
QuviDownloader -> usequvi
_ ->
#ifdef WITH_QUVI
ifM (quviSupported s')
ifM (quviSupported urlstring)
( usequvi
, regulardownload url
)
@ -139,30 +138,44 @@ startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s
#endif
regulardownload url = do
pathmax <- liftIO $ fileNameLengthLimit "."
let file = choosefile $ url2file url pathdepth pathmax
urlinfo <- if relaxed
then pure $ Url.UrlInfo True Nothing Nothing
else Url.withUrlOptions (Url.getUrlInfo urlstring)
file <- case optfile of
Just f -> pure f
Nothing -> case Url.urlSuggestedFile urlinfo of
Nothing -> pure $ url2file url pathdepth pathmax
Just sf -> do
let f = truncateFilePath pathmax $
sanitizeFilePath sf
ifM (liftIO $ doesFileExist f <||> doesDirectoryExist f)
( pure $ url2file url pathdepth pathmax
, pure f
)
showStart "addurl" file
next $ performWeb relaxed s' file
next $ performWeb relaxed urlstring file urlinfo
#ifdef WITH_QUVI
badquvi = error $ "quvi does not know how to download url " ++ s'
badquvi = error $ "quvi does not know how to download url " ++ urlstring
usequvi = do
page <- fromMaybe badquvi
<$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] s'
<$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] urlstring
let link = fromMaybe badquvi $ headMaybe $ Quvi.pageLinks page
pathmax <- liftIO $ fileNameLengthLimit "."
let file = choosefile $ truncateFilePath pathmax $ sanitizeFilePath $
let file = flip fromMaybe optfile $
truncateFilePath pathmax $ sanitizeFilePath $
Quvi.pageTitle page ++ "." ++ Quvi.linkSuffix link
showStart "addurl" file
next $ performQuvi relaxed s' (Quvi.linkUrl link) file
next $ performQuvi relaxed urlstring (Quvi.linkUrl link) file
#else
usequvi = error "not built with quvi support"
#endif
performWeb :: Bool -> URLString -> FilePath -> CommandPerform
performWeb relaxed url file = ifAnnexed file addurl geturl
performWeb :: Bool -> URLString -> FilePath -> Url.UrlInfo -> CommandPerform
performWeb relaxed url file urlinfo = ifAnnexed file addurl geturl
where
geturl = next $ isJust <$> addUrlFile relaxed url file
addurl = addUrlChecked relaxed url webUUID checkexistssize
checkexistssize = Url.withUrlOptions . Url.check url . keySize
geturl = next $ isJust <$> addUrlFile relaxed url urlinfo file
addurl = addUrlChecked relaxed url webUUID $ \k -> return $
(Url.urlExists urlinfo, Url.urlSize urlinfo == keySize k)
#ifdef WITH_QUVI
performQuvi :: Bool -> URLString -> URLString -> FilePath -> CommandPerform
@ -189,7 +202,8 @@ addUrlFileQuvi relaxed quviurl videourl file = do
- retained, because the size of a video stream
- might change and we want to be able to download
- it later. -}
sizedkey <- addSizeUrlKey videourl key
urlinfo <- Url.withUrlOptions (Url.getUrlInfo videourl)
let sizedkey = addSizeUrlKey urlinfo key
prepGetViaTmpChecked sizedkey Nothing $ do
tmp <- fromRepo $ gitAnnexTmpObjectLocation key
showOutput
@ -225,17 +239,17 @@ addUrlChecked relaxed url u checkexistssize key
stop
)
addUrlFile :: Bool -> URLString -> FilePath -> Annex (Maybe Key)
addUrlFile relaxed url file = do
addUrlFile :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
addUrlFile relaxed url urlinfo file = do
liftIO $ createDirectoryIfMissing True (parentDir file)
ifM (Annex.getState Annex.fast <||> pure relaxed)
( nodownload relaxed url file
, downloadWeb url file
( nodownload relaxed url urlinfo file
, downloadWeb url urlinfo file
)
downloadWeb :: URLString -> FilePath -> Annex (Maybe Key)
downloadWeb url file = do
dummykey <- addSizeUrlKey url =<< Backend.URL.fromUrl url Nothing
downloadWeb :: URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
downloadWeb url urlinfo file = do
dummykey <- addSizeUrlKey urlinfo <$> Backend.URL.fromUrl url Nothing
let downloader f _ = do
showOutput
downloadUrl [url] f
@ -272,15 +286,9 @@ downloadWith downloader dummykey u url file =
liftIO $ createDirectoryIfMissing True (parentDir tmp)
downloader tmp p
{- Hits the url to get the size, if available.
-
- This is needed to avoid exceeding the diskreserve when downloading,
- and so the assistant can display a pretty progress bar.
-}
addSizeUrlKey :: URLString -> Key -> Annex Key
addSizeUrlKey url key = do
size <- snd <$> Url.withUrlOptions (Url.exists url)
return $ key { keySize = size }
{- Adds the url size to the Key. -}
addSizeUrlKey :: Url.UrlInfo -> Key -> Key
addSizeUrlKey urlinfo key = key { keySize = Url.urlSize urlinfo }
cleanup :: UUID -> URLString -> FilePath -> Key -> Maybe FilePath -> Annex ()
cleanup u url file key mtmp = do
@ -295,17 +303,13 @@ cleanup u url file key mtmp = do
Annex.Queue.flush
maybe noop (moveAnnex key) mtmp
nodownload :: Bool -> URLString -> FilePath -> Annex (Maybe Key)
nodownload relaxed url file = do
(exists, size) <- if relaxed
then pure (True, Nothing)
else Url.withUrlOptions (Url.exists url)
if exists
then do
key <- Backend.URL.fromUrl url size
nodownload :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
nodownload relaxed url urlinfo file
| Url.urlExists urlinfo = do
key <- Backend.URL.fromUrl url (Url.urlSize urlinfo)
cleanup webUUID url file key Nothing
return (Just key)
else do
| otherwise = do
warning $ "unable to access url: " ++ url
return Nothing

View file

@ -144,7 +144,9 @@ performDownload relaxed cache todownload = case location todownload of
rundownload url (takeExtension url) $ \f -> do
r <- Remote.claimingUrl url
if Remote.uuid r == webUUID
then maybeToList <$> addUrlFile relaxed url f
then do
urlinfo <- Url.withUrlOptions (Url.getUrlInfo url)
maybeToList <$> addUrlFile relaxed url urlinfo f
else do
res <- tryNonAsync $ maybe
(error $ "unable to checkUrl of " ++ Remote.name r)

View file

@ -17,6 +17,8 @@ module Utility.Url (
check,
checkBoth,
exists,
UrlInfo(..),
getUrlInfo,
download,
downloadQuiet,
parseURIRelaxed
@ -84,18 +86,27 @@ checkBoth url expected_size uo = do
v <- check url expected_size uo
return (fst v && snd v)
check :: URLString -> Maybe Integer -> UrlOptions -> IO (Bool, Bool)
check url expected_size = go <$$> exists url
check url expected_size = go <$$> getUrlInfo url
where
go (False, _) = (False, False)
go (True, Nothing) = (True, True)
go (True, s) = case expected_size of
go (UrlInfo False _ _) = (False, False)
go (UrlInfo True Nothing _) = (True, True)
go (UrlInfo True s _) = case expected_size of
Just _ -> (True, expected_size == s)
Nothing -> (True, True)
exists :: URLString -> UrlOptions -> IO Bool
exists url uo = urlExists <$> getUrlInfo url uo
data UrlInfo = UrlInfo
{ urlExists :: Bool
, urlSize :: Maybe Integer
, urlSuggestedFile :: Maybe FilePath
}
{- Checks that an url exists and could be successfully downloaded,
- also returning its size if available. -}
exists :: URLString -> UrlOptions -> IO (Bool, Maybe Integer)
exists url uo = case parseURIRelaxed url of
- also returning its size and suggested filename if available. -}
getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo
getUrlInfo url uo = case parseURIRelaxed url of
Just u -> case parseUrl (show u) of
Just req -> existsconduit req `catchNonAsync` const dne
-- http-conduit does not support file:, ftp:, etc urls,
@ -107,18 +118,21 @@ exists url uo = case parseURIRelaxed url of
case s of
Just stat -> do
sz <- getFileSize' f stat
return (True, Just sz)
found (Just sz) Nothing
Nothing -> dne
| Build.SysConfig.curl -> do
output <- catchDefaultIO "" $
readProcess "curl" $ toCommand curlparams
case lastMaybe (lines output) of
Just ('2':_:_) -> return (True, extractlencurl output)
Just ('2':_:_) -> found
(extractlencurl output)
Nothing
_ -> dne
| otherwise -> dne
Nothing -> dne
where
dne = return (False, Nothing)
dne = return $ UrlInfo False Nothing Nothing
found sz f = return $ UrlInfo True sz f
curlparams = addUserAgent uo $
[ Param "-s"
@ -133,23 +147,36 @@ exists url uo = case parseURIRelaxed url of
_ -> Nothing
_ -> Nothing
extractlen resp = readish . B8.toString =<< headMaybe lenheaders
where
lenheaders = map snd $
filter (\(h, _) -> h == hContentLength)
(responseHeaders resp)
extractlen = readish . B8.toString <=< firstheader hContentLength
extractfilename = contentDispositionFilename . B8.toString
<=< firstheader hContentDisposition
firstheader h = headMaybe . map snd .
filter (\p -> fst p == h) . responseHeaders
existsconduit req = withManager $ \mgr -> do
let req' = headRequest (applyRequest uo req)
resp <- http req' mgr
-- forces processing the response before the
-- manager is closed
ret <- if responseStatus resp == ok200
then return (True, extractlen resp)
else liftIO dne
ret <- liftIO $ if responseStatus resp == ok200
then found
(extractlen resp)
(extractfilename resp)
else dne
liftIO $ closeManager mgr
return ret
-- Parse eg: attachment; filename="fname.ext"
-- per RFC 2616
contentDispositionFilename :: String -> Maybe FilePath
contentDispositionFilename s
| "attachment; filename=\"" `isPrefixOf` s && "\"" `isSuffixOf` s =
Just $ reverse $ drop 1 $ reverse $
drop 1 $ dropWhile (/= '"') s
| otherwise = Nothing
#if MIN_VERSION_http_conduit(2,0,0)
headRequest :: Request -> Request
#else
@ -229,6 +256,9 @@ parseURIRelaxed = parseURI . escapeURIString isAllowedInURI
hAcceptEncoding :: CI.CI B.ByteString
hAcceptEncoding = "Accept-Encoding"
hContentDisposition :: CI.CI B.ByteString
hContentDisposition = "Content-Disposition"
#if ! MIN_VERSION_http_types(0,7,0)
hContentLength :: CI.CI B.ByteString
hContentLength = "Content-Length"

3
debian/changelog vendored
View file

@ -16,6 +16,9 @@ git-annex (5.20150114) UNRELEASED; urgency=medium
* Avoid using fileSize which maxes out at just 2 gb on Windows.
Instead, use hFileSize, which doesn't have a bounded size.
Fixes support for files > 2 gb on Windows.
* addurl: When a Content-Disposition header suggests a filename to use,
addurl will consider using it, if it's reasonable and doesn't conflict
with an existing file. (--file overrides this)
-- Joey Hess <id@joeyh.name> Tue, 13 Jan 2015 17:03:39 -0400

View file

@ -218,8 +218,10 @@ subdirectories).
is there at a future point, specify `--relaxed`. (Implies `--fast`.)
Normally the filename is based on the full url, so will look like
"www.example.com_dir_subdir_bigfile". For a shorter filename, specify
`--pathdepth=N`. For example, `--pathdepth=1` will use "dir/subdir/bigfile",
"www.example.com_dir_subdir_bigfile". In some cases, addurl is able to
come up with a better filename based on other information. Or, for a
shorter filename, specify `--pathdepth=N`. For example,
`--pathdepth=1` will use "dir/subdir/bigfile",
while `--pathdepth=3` will use "bigfile". It can also be negative;
`--pathdepth=-2` will use the last two parts of the url.