addurl: When a Content-Disposition header suggests a filename to use, addurl will consider using it, if it's reasonable and doesn't conflict with an existing file. (--file overrides this)

Joey Hess 2015-01-22 14:52:52 -04:00
parent 91f1b2bdcf
commit 587f6a919b
7 changed files with 113 additions and 72 deletions
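For example, a response that includes the header

    Content-Disposition: attachment; filename="stats-2015.csv"

suggests "stats-2015.csv" (an illustrative name) as the filename. addurl will now save to that name unless --file was given, or a file or directory by that name already exists, in which case it falls back to the url-derived name.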


@@ -96,7 +96,7 @@ newAssistantUrl repo = do
 - warp-tls listens to http, in order to show an error page, so this works.
 -}
 assistantListening :: URLString -> IO Bool
-assistantListening url = catchBoolIO $ fst <$> exists url' def
+assistantListening url = catchBoolIO $ exists url' def
   where
 	url' = case parseURI url of
 		Nothing -> url


@@ -191,7 +191,7 @@ escapeHeader = escapeURIString (\c -> isUnescapedInURI c && c /= ' ')
 getRepoInfo :: RemoteConfig -> Widget
 getRepoInfo c = do
 	uo <- liftAnnex Url.getUrlOptions
-	exists <- liftIO $ catchDefaultIO False $ fst <$> Url.exists url uo
+	exists <- liftIO $ catchDefaultIO False $ Url.exists url uo
 	[whamlet|
 <a href="#{url}">
   Internet Archive item


@@ -120,17 +120,16 @@ downloadRemoteFile r relaxed uri file sz = do
 	loguri = setDownloader uri OtherDownloader
 
 startWeb :: Bool -> Maybe FilePath -> Maybe Int -> String -> CommandStart
-startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s
+startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI urlstring
   where
-	(s', downloader) = getDownloader s
-	bad = fromMaybe (error $ "bad url " ++ s') $
-		parseURI $ escapeURIString isUnescapedInURI s'
-	choosefile = flip fromMaybe optfile
+	(urlstring, downloader) = getDownloader s
+	bad = fromMaybe (error $ "bad url " ++ urlstring) $
+		parseURI $ escapeURIString isUnescapedInURI urlstring
 	go url = case downloader of
 		QuviDownloader -> usequvi
 		_ ->
 #ifdef WITH_QUVI
-			ifM (quviSupported s')
+			ifM (quviSupported urlstring)
 				( usequvi
 				, regulardownload url
 				)
@@ -139,30 +138,44 @@ startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s
 #endif
 	regulardownload url = do
 		pathmax <- liftIO $ fileNameLengthLimit "."
-		let file = choosefile $ url2file url pathdepth pathmax
+		urlinfo <- if relaxed
+			then pure $ Url.UrlInfo True Nothing Nothing
+			else Url.withUrlOptions (Url.getUrlInfo urlstring)
+		file <- case optfile of
+			Just f -> pure f
+			Nothing -> case Url.urlSuggestedFile urlinfo of
+				Nothing -> pure $ url2file url pathdepth pathmax
+				Just sf -> do
+					let f = truncateFilePath pathmax $
+						sanitizeFilePath sf
+					ifM (liftIO $ doesFileExist f <||> doesDirectoryExist f)
+						( pure $ url2file url pathdepth pathmax
+						, pure f
+						)
 		showStart "addurl" file
-		next $ performWeb relaxed s' file
+		next $ performWeb relaxed urlstring file urlinfo
 #ifdef WITH_QUVI
-	badquvi = error $ "quvi does not know how to download url " ++ s'
+	badquvi = error $ "quvi does not know how to download url " ++ urlstring
 	usequvi = do
 		page <- fromMaybe badquvi
-			<$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] s'
+			<$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] urlstring
 		let link = fromMaybe badquvi $ headMaybe $ Quvi.pageLinks page
 		pathmax <- liftIO $ fileNameLengthLimit "."
-		let file = choosefile $ truncateFilePath pathmax $ sanitizeFilePath $
+		let file = flip fromMaybe optfile $
+			truncateFilePath pathmax $ sanitizeFilePath $
 			Quvi.pageTitle page ++ "." ++ Quvi.linkSuffix link
 		showStart "addurl" file
-		next $ performQuvi relaxed s' (Quvi.linkUrl link) file
+		next $ performQuvi relaxed urlstring (Quvi.linkUrl link) file
 #else
 	usequvi = error "not built with quvi support"
 #endif
 
-performWeb :: Bool -> URLString -> FilePath -> CommandPerform
-performWeb relaxed url file = ifAnnexed file addurl geturl
+performWeb :: Bool -> URLString -> FilePath -> Url.UrlInfo -> CommandPerform
+performWeb relaxed url file urlinfo = ifAnnexed file addurl geturl
   where
-	geturl = next $ isJust <$> addUrlFile relaxed url file
-	addurl = addUrlChecked relaxed url webUUID checkexistssize
-	checkexistssize = Url.withUrlOptions . Url.check url . keySize
+	geturl = next $ isJust <$> addUrlFile relaxed url urlinfo file
+	addurl = addUrlChecked relaxed url webUUID $ \k -> return $
+		(Url.urlExists urlinfo, Url.urlSize urlinfo == keySize k)
 
 #ifdef WITH_QUVI
 performQuvi :: Bool -> URLString -> URLString -> FilePath -> CommandPerform
@@ -189,7 +202,8 @@ addUrlFileQuvi relaxed quviurl videourl file = do
 			- retained, because the size of a video stream
 			- might change and we want to be able to download
 			- it later. -}
-			sizedkey <- addSizeUrlKey videourl key
+			urlinfo <- Url.withUrlOptions (Url.getUrlInfo videourl)
+			let sizedkey = addSizeUrlKey urlinfo key
 			prepGetViaTmpChecked sizedkey Nothing $ do
 				tmp <- fromRepo $ gitAnnexTmpObjectLocation key
 				showOutput
@@ -225,17 +239,17 @@ addUrlChecked relaxed url u checkexistssize key
 			stop
 		)
 
-addUrlFile :: Bool -> URLString -> FilePath -> Annex (Maybe Key)
-addUrlFile relaxed url file = do
+addUrlFile :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
+addUrlFile relaxed url urlinfo file = do
 	liftIO $ createDirectoryIfMissing True (parentDir file)
 	ifM (Annex.getState Annex.fast <||> pure relaxed)
-		( nodownload relaxed url file
-		, downloadWeb url file
+		( nodownload relaxed url urlinfo file
+		, downloadWeb url urlinfo file
 		)
 
-downloadWeb :: URLString -> FilePath -> Annex (Maybe Key)
-downloadWeb url file = do
-	dummykey <- addSizeUrlKey url =<< Backend.URL.fromUrl url Nothing
+downloadWeb :: URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
+downloadWeb url urlinfo file = do
+	dummykey <- addSizeUrlKey urlinfo <$> Backend.URL.fromUrl url Nothing
 	let downloader f _ = do
 		showOutput
 		downloadUrl [url] f
@@ -272,15 +286,9 @@ downloadWith downloader dummykey u url file =
 		liftIO $ createDirectoryIfMissing True (parentDir tmp)
 		downloader tmp p
 
-{- Hits the url to get the size, if available.
- -
- - This is needed to avoid exceeding the diskreserve when downloading,
- - and so the assistant can display a pretty progress bar.
- -}
-addSizeUrlKey :: URLString -> Key -> Annex Key
-addSizeUrlKey url key = do
-	size <- snd <$> Url.withUrlOptions (Url.exists url)
-	return $ key { keySize = size }
+{- Adds the url size to the Key. -}
+addSizeUrlKey :: Url.UrlInfo -> Key -> Key
+addSizeUrlKey urlinfo key = key { keySize = Url.urlSize urlinfo }
 
 cleanup :: UUID -> URLString -> FilePath -> Key -> Maybe FilePath -> Annex ()
 cleanup u url file key mtmp = do
@@ -295,17 +303,13 @@ cleanup u url file key mtmp = do
 	Annex.Queue.flush
 	maybe noop (moveAnnex key) mtmp
 
-nodownload :: Bool -> URLString -> FilePath -> Annex (Maybe Key)
-nodownload relaxed url file = do
-	(exists, size) <- if relaxed
-		then pure (True, Nothing)
-		else Url.withUrlOptions (Url.exists url)
-	if exists
-		then do
-			key <- Backend.URL.fromUrl url size
-			cleanup webUUID url file key Nothing
-			return (Just key)
-		else do
-			warning $ "unable to access url: " ++ url
-			return Nothing
+nodownload :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
+nodownload relaxed url urlinfo file
+	| Url.urlExists urlinfo = do
+		key <- Backend.URL.fromUrl url (Url.urlSize urlinfo)
+		cleanup webUUID url file key Nothing
+		return (Just key)
+	| otherwise = do
+		warning $ "unable to access url: " ++ url
+		return Nothing
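
The filename-selection precedence that the new startWeb implements can be sketched standalone. This is an illustration under assumptions: sanitize is a hypothetical stand-in for git-annex's sanitizeFilePath and truncateFilePath helpers.

    import Control.Monad (liftM2)
    import System.Directory (doesDirectoryExist, doesFileExist)

    -- Hypothetical stand-in for sanitizeFilePath/truncateFilePath.
    sanitize :: FilePath -> FilePath
    sanitize = map (\c -> if c `elem` "/\\" then '_' else c)

    -- Precedence: an explicit --file always wins; a server-suggested
    -- name is used only when nothing by that name already exists;
    -- otherwise the url-derived name is used.
    chooseFile :: Maybe FilePath -> Maybe FilePath -> FilePath -> IO FilePath
    chooseFile optfile suggested urlderived = case optfile of
        Just f -> return f
        Nothing -> case suggested of
            Nothing -> return urlderived
            Just sf -> do
                let f = sanitize sf
                taken <- liftM2 (||) (doesFileExist f) (doesDirectoryExist f)
                return $ if taken then urlderived else f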


@@ -144,7 +144,9 @@ performDownload relaxed cache todownload = case location todownload of
 		rundownload url (takeExtension url) $ \f -> do
 			r <- Remote.claimingUrl url
 			if Remote.uuid r == webUUID
-				then maybeToList <$> addUrlFile relaxed url f
+				then do
+					urlinfo <- Url.withUrlOptions (Url.getUrlInfo url)
+					maybeToList <$> addUrlFile relaxed url urlinfo f
 				else do
 					res <- tryNonAsync $ maybe
 						(error $ "unable to checkUrl of " ++ Remote.name r)


@@ -17,6 +17,8 @@ module Utility.Url (
 	check,
 	checkBoth,
 	exists,
+	UrlInfo(..),
+	getUrlInfo,
 	download,
 	downloadQuiet,
 	parseURIRelaxed
@@ -84,18 +86,27 @@ checkBoth url expected_size uo = do
 	v <- check url expected_size uo
 	return (fst v && snd v)
 
 check :: URLString -> Maybe Integer -> UrlOptions -> IO (Bool, Bool)
-check url expected_size = go <$$> exists url
+check url expected_size = go <$$> getUrlInfo url
   where
-	go (False, _) = (False, False)
-	go (True, Nothing) = (True, True)
-	go (True, s) = case expected_size of
+	go (UrlInfo False _ _) = (False, False)
+	go (UrlInfo True Nothing _) = (True, True)
+	go (UrlInfo True s _) = case expected_size of
 		Just _ -> (True, expected_size == s)
 		Nothing -> (True, True)
 
+exists :: URLString -> UrlOptions -> IO Bool
+exists url uo = urlExists <$> getUrlInfo url uo
+
+data UrlInfo = UrlInfo
+	{ urlExists :: Bool
+	, urlSize :: Maybe Integer
+	, urlSuggestedFile :: Maybe FilePath
+	}
+
 {- Checks that an url exists and could be successfully downloaded,
- - also returning its size if available. -}
-exists :: URLString -> UrlOptions -> IO (Bool, Maybe Integer)
-exists url uo = case parseURIRelaxed url of
+ - also returning its size and suggested filename if available. -}
+getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo
+getUrlInfo url uo = case parseURIRelaxed url of
 	Just u -> case parseUrl (show u) of
 		Just req -> existsconduit req `catchNonAsync` const dne
 		-- http-conduit does not support file:, ftp:, etc urls,
@@ -107,18 +118,21 @@ exists url uo = case parseURIRelaxed url of
 				case s of
 					Just stat -> do
 						sz <- getFileSize' f stat
-						return (True, Just sz)
+						found (Just sz) Nothing
 					Nothing -> dne
 			| Build.SysConfig.curl -> do
 				output <- catchDefaultIO "" $
 					readProcess "curl" $ toCommand curlparams
 				case lastMaybe (lines output) of
-					Just ('2':_:_) -> return (True, extractlencurl output)
+					Just ('2':_:_) -> found
+						(extractlencurl output)
+						Nothing
 					_ -> dne
 			| otherwise -> dne
 		Nothing -> dne
   where
-	dne = return (False, Nothing)
+	dne = return $ UrlInfo False Nothing Nothing
+	found sz f = return $ UrlInfo True sz f
 
 	curlparams = addUserAgent uo $
 		[ Param "-s"
@@ -133,23 +147,36 @@ exists url uo = case parseURIRelaxed url of
 				_ -> Nothing
 		_ -> Nothing
 
-	extractlen resp = readish . B8.toString =<< headMaybe lenheaders
-	  where
-		lenheaders = map snd $
-			filter (\(h, _) -> h == hContentLength)
-				(responseHeaders resp)
+	extractlen = readish . B8.toString <=< firstheader hContentLength
+
+	extractfilename = contentDispositionFilename . B8.toString
+		<=< firstheader hContentDisposition
+
+	firstheader h = headMaybe . map snd .
+		filter (\p -> fst p == h) . responseHeaders
 
 	existsconduit req = withManager $ \mgr -> do
 		let req' = headRequest (applyRequest uo req)
 		resp <- http req' mgr
 		-- forces processing the response before the
 		-- manager is closed
-		ret <- if responseStatus resp == ok200
-			then return (True, extractlen resp)
-			else liftIO dne
+		ret <- liftIO $ if responseStatus resp == ok200
+			then found
+				(extractlen resp)
+				(extractfilename resp)
+			else dne
 		liftIO $ closeManager mgr
 		return ret
 
+-- Parse eg: attachment; filename="fname.ext"
+-- per RFC 2616
+contentDispositionFilename :: String -> Maybe FilePath
+contentDispositionFilename s
+	| "attachment; filename=\"" `isPrefixOf` s && "\"" `isSuffixOf` s =
+		Just $ reverse $ drop 1 $ reverse $
+			drop 1 $ dropWhile (/= '"') s
+	| otherwise = Nothing
+
 #if MIN_VERSION_http_conduit(2,0,0)
 headRequest :: Request -> Request
 #else
@@ -229,6 +256,9 @@ parseURIRelaxed = parseURI . escapeURIString isAllowedInURI
 hAcceptEncoding :: CI.CI B.ByteString
 hAcceptEncoding = "Accept-Encoding"
 
+hContentDisposition :: CI.CI B.ByteString
+hContentDisposition = "Content-Disposition"
+
 #if ! MIN_VERSION_http_types(0,7,0)
 hContentLength :: CI.CI B.ByteString
 hContentLength = "Content-Length"
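
The contentDispositionFilename parser added above is small enough to exercise directly. Extracted into a self-contained module, these are the results it gives:

    import Data.List (isPrefixOf, isSuffixOf)

    -- As added in the hunk above: accepts only the exact
    -- attachment; filename="..." form.
    contentDispositionFilename :: String -> Maybe FilePath
    contentDispositionFilename s
        | "attachment; filename=\"" `isPrefixOf` s && "\"" `isSuffixOf` s =
            Just $ reverse $ drop 1 $ reverse $
                drop 1 $ dropWhile (/= '"') s
        | otherwise = Nothing

    main :: IO ()
    main = do
        -- Just "fname.ext"
        print $ contentDispositionFilename "attachment; filename=\"fname.ext\""
        -- Nothing: the unquoted form is deliberately not parsed.
        print $ contentDispositionFilename "attachment; filename=fname.ext"
        -- Nothing
        print $ contentDispositionFilename "inline"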

debian/changelog

@@ -16,6 +16,9 @@ git-annex (5.20150114) UNRELEASED; urgency=medium
   * Avoid using fileSize which maxes out at just 2 gb on Windows.
     Instead, use hFileSize, which doesn't have a bounded size.
     Fixes support for files > 2 gb on Windows.
+  * addurl: When a Content-Disposition header suggests a filename to use,
+    addurl will consider using it, if it's reasonable and doesn't conflict
+    with an existing file. (--file overrides this)
 
  -- Joey Hess <id@joeyh.name>  Tue, 13 Jan 2015 17:03:39 -0400


@@ -218,8 +218,10 @@ subdirectories).
   is there at a future point, specify `--relaxed`. (Implies `--fast`.)
 
   Normally the filename is based on the full url, so will look like
-  "www.example.com_dir_subdir_bigfile". For a shorter filename, specify
-  `--pathdepth=N`. For example, `--pathdepth=1` will use "dir/subdir/bigfile",
+  "www.example.com_dir_subdir_bigfile". In some cases, addurl is able to
+  come up with a better filename based on other information. Or, for a
+  shorter filename, specify `--pathdepth=N`. For example,
+  `--pathdepth=1` will use "dir/subdir/bigfile",
   while `--pathdepth=3` will use "bigfile". It can also be negative;
   `--pathdepth=-2` will use the last two parts of the url.
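
The --pathdepth arithmetic can be illustrated with a short sketch; pathdepthParts is a hypothetical helper that matches the documented examples, not git-annex's actual url2file implementation.

    import Data.List (intercalate)

    -- Positive N drops the first N path components; negative N keeps
    -- the last N. Illustrative only.
    pathdepthParts :: Int -> [String] -> [String]
    pathdepthParts n parts
        | n > 0 = drop n parts
        | n < 0 = reverse (take (abs n) (reverse parts))
        | otherwise = parts

    main :: IO ()
    main = mapM_ demo [1, 3, -2]
      where
        parts = ["www.example.com", "dir", "subdir", "bigfile"]
        demo n = putStrLn $ show n ++ " -> " ++
            intercalate "/" (pathdepthParts n parts)
    -- 1 -> dir/subdir/bigfile
    -- 3 -> bigfile
    -- -2 -> subdir/bigfile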