addurl: When a Content-Disposition header suggests a filename to use, addurl will consider using it, if it's reasonable and doesn't conflict with an existing file. (--file overrides this)

Joey Hess 2015-01-22 14:52:52 -04:00
parent 91f1b2bdcf
commit 587f6a919b
7 changed files with 113 additions and 72 deletions
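For example, a response that includes the header

    Content-Disposition: attachment; filename="stats-2015.csv"

suggests "stats-2015.csv" (an illustrative name) as the filename. addurl will now save to that name unless --file was given, or a file or directory by that name already exists, in which case it falls back to the url-derived name.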


@@ -96,7 +96,7 @@ newAssistantUrl repo = do
 - warp-tls listens to http, in order to show an error page, so this works.
 -}
 assistantListening :: URLString -> IO Bool
-assistantListening url = catchBoolIO $ fst <$> exists url' def
+assistantListening url = catchBoolIO $ exists url' def
   where
 	url' = case parseURI url of
 		Nothing -> url


@@ -191,7 +191,7 @@ escapeHeader = escapeURIString (\c -> isUnescapedInURI c && c /= ' ')
 getRepoInfo :: RemoteConfig -> Widget
 getRepoInfo c = do
 	uo <- liftAnnex Url.getUrlOptions
-	exists <- liftIO $ catchDefaultIO False $ fst <$> Url.exists url uo
+	exists <- liftIO $ catchDefaultIO False $ Url.exists url uo
 	[whamlet|
 <a href="#{url}">
   Internet Archive item


@@ -120,17 +120,16 @@ downloadRemoteFile r relaxed uri file sz = do
 	loguri = setDownloader uri OtherDownloader
 
 startWeb :: Bool -> Maybe FilePath -> Maybe Int -> String -> CommandStart
-startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s
+startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI urlstring
   where
-	(s', downloader) = getDownloader s
-	bad = fromMaybe (error $ "bad url " ++ s') $
-		parseURI $ escapeURIString isUnescapedInURI s'
-	choosefile = flip fromMaybe optfile
+	(urlstring, downloader) = getDownloader s
+	bad = fromMaybe (error $ "bad url " ++ urlstring) $
+		parseURI $ escapeURIString isUnescapedInURI urlstring
 	go url = case downloader of
 		QuviDownloader -> usequvi
 		_ ->
 #ifdef WITH_QUVI
-			ifM (quviSupported s')
+			ifM (quviSupported urlstring)
 				( usequvi
 				, regulardownload url
 				)
@@ -139,30 +138,44 @@ startWeb relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s
 #endif
 	regulardownload url = do
 		pathmax <- liftIO $ fileNameLengthLimit "."
-		let file = choosefile $ url2file url pathdepth pathmax
+		urlinfo <- if relaxed
+			then pure $ Url.UrlInfo True Nothing Nothing
+			else Url.withUrlOptions (Url.getUrlInfo urlstring)
+		file <- case optfile of
+			Just f -> pure f
+			Nothing -> case Url.urlSuggestedFile urlinfo of
+				Nothing -> pure $ url2file url pathdepth pathmax
+				Just sf -> do
+					let f = truncateFilePath pathmax $
+						sanitizeFilePath sf
+					ifM (liftIO $ doesFileExist f <||> doesDirectoryExist f)
+						( pure $ url2file url pathdepth pathmax
+						, pure f
+						)
 		showStart "addurl" file
-		next $ performWeb relaxed s' file
+		next $ performWeb relaxed urlstring file urlinfo
 #ifdef WITH_QUVI
-	badquvi = error $ "quvi does not know how to download url " ++ s'
+	badquvi = error $ "quvi does not know how to download url " ++ urlstring
 	usequvi = do
 		page <- fromMaybe badquvi
-			<$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] s'
+			<$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] urlstring
 		let link = fromMaybe badquvi $ headMaybe $ Quvi.pageLinks page
 		pathmax <- liftIO $ fileNameLengthLimit "."
-		let file = choosefile $ truncateFilePath pathmax $ sanitizeFilePath $
+		let file = flip fromMaybe optfile $
+			truncateFilePath pathmax $ sanitizeFilePath $
 			Quvi.pageTitle page ++ "." ++ Quvi.linkSuffix link
 		showStart "addurl" file
-		next $ performQuvi relaxed s' (Quvi.linkUrl link) file
+		next $ performQuvi relaxed urlstring (Quvi.linkUrl link) file
 #else
 	usequvi = error "not built with quvi support"
 #endif
 
-performWeb :: Bool -> URLString -> FilePath -> CommandPerform
-performWeb relaxed url file = ifAnnexed file addurl geturl
+performWeb :: Bool -> URLString -> FilePath -> Url.UrlInfo -> CommandPerform
+performWeb relaxed url file urlinfo = ifAnnexed file addurl geturl
   where
-	geturl = next $ isJust <$> addUrlFile relaxed url file
-	addurl = addUrlChecked relaxed url webUUID checkexistssize
-	checkexistssize = Url.withUrlOptions . Url.check url . keySize
+	geturl = next $ isJust <$> addUrlFile relaxed url urlinfo file
+	addurl = addUrlChecked relaxed url webUUID $ \k -> return $
+		(Url.urlExists urlinfo, Url.urlSize urlinfo == keySize k)
 
 #ifdef WITH_QUVI
 performQuvi :: Bool -> URLString -> URLString -> FilePath -> CommandPerform
@@ -189,7 +202,8 @@ addUrlFileQuvi relaxed quviurl videourl file = do
 			- retained, because the size of a video stream
 			- might change and we want to be able to download
 			- it later. -}
-			sizedkey <- addSizeUrlKey videourl key
+			urlinfo <- Url.withUrlOptions (Url.getUrlInfo videourl)
+			let sizedkey = addSizeUrlKey urlinfo key
 			prepGetViaTmpChecked sizedkey Nothing $ do
 				tmp <- fromRepo $ gitAnnexTmpObjectLocation key
 				showOutput
@@ -225,17 +239,17 @@ addUrlChecked relaxed url u checkexistssize key
 			stop
 		)
 
-addUrlFile :: Bool -> URLString -> FilePath -> Annex (Maybe Key)
-addUrlFile relaxed url file = do
+addUrlFile :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
+addUrlFile relaxed url urlinfo file = do
 	liftIO $ createDirectoryIfMissing True (parentDir file)
 	ifM (Annex.getState Annex.fast <||> pure relaxed)
-		( nodownload relaxed url file
-		, downloadWeb url file
+		( nodownload relaxed url urlinfo file
+		, downloadWeb url urlinfo file
 		)
 
-downloadWeb :: URLString -> FilePath -> Annex (Maybe Key)
-downloadWeb url file = do
-	dummykey <- addSizeUrlKey url =<< Backend.URL.fromUrl url Nothing
+downloadWeb :: URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
+downloadWeb url urlinfo file = do
+	dummykey <- addSizeUrlKey urlinfo <$> Backend.URL.fromUrl url Nothing
 	let downloader f _ = do
 		showOutput
 		downloadUrl [url] f
@@ -272,15 +286,9 @@ downloadWith downloader dummykey u url file =
 		liftIO $ createDirectoryIfMissing True (parentDir tmp)
 		downloader tmp p
 
-{- Hits the url to get the size, if available.
- -
- - This is needed to avoid exceeding the diskreserve when downloading,
- - and so the assistant can display a pretty progress bar.
- -}
-addSizeUrlKey :: URLString -> Key -> Annex Key
-addSizeUrlKey url key = do
-	size <- snd <$> Url.withUrlOptions (Url.exists url)
-	return $ key { keySize = size }
+{- Adds the url size to the Key. -}
+addSizeUrlKey :: Url.UrlInfo -> Key -> Key
+addSizeUrlKey urlinfo key = key { keySize = Url.urlSize urlinfo }
 
 cleanup :: UUID -> URLString -> FilePath -> Key -> Maybe FilePath -> Annex ()
 cleanup u url file key mtmp = do
@@ -295,17 +303,13 @@ cleanup u url file key mtmp = do
 	Annex.Queue.flush
 	maybe noop (moveAnnex key) mtmp
 
-nodownload :: Bool -> URLString -> FilePath -> Annex (Maybe Key)
-nodownload relaxed url file = do
-	(exists, size) <- if relaxed
-		then pure (True, Nothing)
-		else Url.withUrlOptions (Url.exists url)
-	if exists
-		then do
-			key <- Backend.URL.fromUrl url size
-			cleanup webUUID url file key Nothing
-			return (Just key)
-		else do
-			warning $ "unable to access url: " ++ url
-			return Nothing
+nodownload :: Bool -> URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
+nodownload relaxed url urlinfo file
+	| Url.urlExists urlinfo = do
+		key <- Backend.URL.fromUrl url (Url.urlSize urlinfo)
+		cleanup webUUID url file key Nothing
+		return (Just key)
+	| otherwise = do
+		warning $ "unable to access url: " ++ url
+		return Nothing
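
The filename-selection precedence that the new startWeb implements can be sketched standalone. This is an illustration under assumptions: sanitize is a hypothetical stand-in for git-annex's sanitizeFilePath and truncateFilePath helpers.

    import Control.Monad (liftM2)
    import System.Directory (doesDirectoryExist, doesFileExist)

    -- Hypothetical stand-in for sanitizeFilePath/truncateFilePath.
    sanitize :: FilePath -> FilePath
    sanitize = map (\c -> if c `elem` "/\\" then '_' else c)

    -- Precedence: an explicit --file always wins; a server-suggested
    -- name is used only when nothing by that name already exists;
    -- otherwise the url-derived name is used.
    chooseFile :: Maybe FilePath -> Maybe FilePath -> FilePath -> IO FilePath
    chooseFile optfile suggested urlderived = case optfile of
        Just f -> return f
        Nothing -> case suggested of
            Nothing -> return urlderived
            Just sf -> do
                let f = sanitize sf
                taken <- liftM2 (||) (doesFileExist f) (doesDirectoryExist f)
                return $ if taken then urlderived else f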


@@ -144,7 +144,9 @@ performDownload relaxed cache todownload = case location todownload of
 		rundownload url (takeExtension url) $ \f -> do
 			r <- Remote.claimingUrl url
 			if Remote.uuid r == webUUID
-				then maybeToList <$> addUrlFile relaxed url f
+				then do
+					urlinfo <- Url.withUrlOptions (Url.getUrlInfo url)
+					maybeToList <$> addUrlFile relaxed url urlinfo f
 				else do
 					res <- tryNonAsync $ maybe
 						(error $ "unable to checkUrl of " ++ Remote.name r)


@@ -17,6 +17,8 @@ module Utility.Url (
 	check,
 	checkBoth,
 	exists,
+	UrlInfo(..),
+	getUrlInfo,
 	download,
 	downloadQuiet,
 	parseURIRelaxed
@@ -84,18 +86,27 @@ checkBoth url expected_size uo = do
 	v <- check url expected_size uo
 	return (fst v && snd v)
 
 check :: URLString -> Maybe Integer -> UrlOptions -> IO (Bool, Bool)
-check url expected_size = go <$$> exists url
+check url expected_size = go <$$> getUrlInfo url
   where
-	go (False, _) = (False, False)
-	go (True, Nothing) = (True, True)
-	go (True, s) = case expected_size of
+	go (UrlInfo False _ _) = (False, False)
+	go (UrlInfo True Nothing _) = (True, True)
+	go (UrlInfo True s _) = case expected_size of
 		Just _ -> (True, expected_size == s)
 		Nothing -> (True, True)
 
+exists :: URLString -> UrlOptions -> IO Bool
+exists url uo = urlExists <$> getUrlInfo url uo
+
+data UrlInfo = UrlInfo
+	{ urlExists :: Bool
+	, urlSize :: Maybe Integer
+	, urlSuggestedFile :: Maybe FilePath
+	}
+
 {- Checks that an url exists and could be successfully downloaded,
- - also returning its size if available. -}
-exists :: URLString -> UrlOptions -> IO (Bool, Maybe Integer)
-exists url uo = case parseURIRelaxed url of
+ - also returning its size and suggested filename if available. -}
+getUrlInfo :: URLString -> UrlOptions -> IO UrlInfo
+getUrlInfo url uo = case parseURIRelaxed url of
 	Just u -> case parseUrl (show u) of
 		Just req -> existsconduit req `catchNonAsync` const dne
 		-- http-conduit does not support file:, ftp:, etc urls,
@@ -107,18 +118,21 @@ exists url uo = case parseURIRelaxed url of
 				case s of
 					Just stat -> do
 						sz <- getFileSize' f stat
-						return (True, Just sz)
+						found (Just sz) Nothing
 					Nothing -> dne
 			| Build.SysConfig.curl -> do
 				output <- catchDefaultIO "" $
 					readProcess "curl" $ toCommand curlparams
 				case lastMaybe (lines output) of
-					Just ('2':_:_) -> return (True, extractlencurl output)
+					Just ('2':_:_) -> found
+						(extractlencurl output)
+						Nothing
 					_ -> dne
 			| otherwise -> dne
 		Nothing -> dne
   where
-	dne = return (False, Nothing)
+	dne = return $ UrlInfo False Nothing Nothing
+	found sz f = return $ UrlInfo True sz f
 
 	curlparams = addUserAgent uo $
 		[ Param "-s"
@@ -133,23 +147,36 @@ exists url uo = case parseURIRelaxed url of
 				_ -> Nothing
 		_ -> Nothing
 
-	extractlen resp = readish . B8.toString =<< headMaybe lenheaders
-	  where
-		lenheaders = map snd $
-			filter (\(h, _) -> h == hContentLength)
-				(responseHeaders resp)
+	extractlen = readish . B8.toString <=< firstheader hContentLength
+
+	extractfilename = contentDispositionFilename . B8.toString
+		<=< firstheader hContentDisposition
+
+	firstheader h = headMaybe . map snd .
+		filter (\p -> fst p == h) . responseHeaders
 
 	existsconduit req = withManager $ \mgr -> do
 		let req' = headRequest (applyRequest uo req)
 		resp <- http req' mgr
 		-- forces processing the response before the
 		-- manager is closed
-		ret <- if responseStatus resp == ok200
-			then return (True, extractlen resp)
-			else liftIO dne
+		ret <- liftIO $ if responseStatus resp == ok200
+			then found
+				(extractlen resp)
+				(extractfilename resp)
+			else dne
 		liftIO $ closeManager mgr
 		return ret
 
+-- Parse eg: attachment; filename="fname.ext"
+-- per RFC 2616
+contentDispositionFilename :: String -> Maybe FilePath
+contentDispositionFilename s
+	| "attachment; filename=\"" `isPrefixOf` s && "\"" `isSuffixOf` s =
+		Just $ reverse $ drop 1 $ reverse $
+			drop 1 $ dropWhile (/= '"') s
+	| otherwise = Nothing
+
 #if MIN_VERSION_http_conduit(2,0,0)
 headRequest :: Request -> Request
 #else
@@ -229,6 +256,9 @@ parseURIRelaxed = parseURI . escapeURIString isAllowedInURI
 hAcceptEncoding :: CI.CI B.ByteString
 hAcceptEncoding = "Accept-Encoding"
 
+hContentDisposition :: CI.CI B.ByteString
+hContentDisposition = "Content-Disposition"
+
 #if ! MIN_VERSION_http_types(0,7,0)
 hContentLength :: CI.CI B.ByteString
 hContentLength = "Content-Length"
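
The contentDispositionFilename parser added above is small enough to exercise directly. Extracted into a self-contained module, these are the results it gives:

    import Data.List (isPrefixOf, isSuffixOf)

    -- As added in the hunk above: accepts only the exact
    -- attachment; filename="..." form.
    contentDispositionFilename :: String -> Maybe FilePath
    contentDispositionFilename s
        | "attachment; filename=\"" `isPrefixOf` s && "\"" `isSuffixOf` s =
            Just $ reverse $ drop 1 $ reverse $
                drop 1 $ dropWhile (/= '"') s
        | otherwise = Nothing

    main :: IO ()
    main = do
        -- Just "fname.ext"
        print $ contentDispositionFilename "attachment; filename=\"fname.ext\""
        -- Nothing: the unquoted form is deliberately not parsed.
        print $ contentDispositionFilename "attachment; filename=fname.ext"
        -- Nothing
        print $ contentDispositionFilename "inline"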

debian/changelog

@@ -16,6 +16,9 @@ git-annex (5.20150114) UNRELEASED; urgency=medium
   * Avoid using fileSize which maxes out at just 2 gb on Windows.
     Instead, use hFileSize, which doesn't have a bounded size.
     Fixes support for files > 2 gb on Windows.
+  * addurl: When a Content-Disposition header suggests a filename to use,
+    addurl will consider using it, if it's reasonable and doesn't conflict
+    with an existing file. (--file overrides this)
 
  -- Joey Hess <id@joeyh.name>  Tue, 13 Jan 2015 17:03:39 -0400


@@ -218,8 +218,10 @@ subdirectories).
   is there at a future point, specify `--relaxed`. (Implies `--fast`.)
 
   Normally the filename is based on the full url, so will look like
-  "www.example.com_dir_subdir_bigfile". For a shorter filename, specify
-  `--pathdepth=N`. For example, `--pathdepth=1` will use "dir/subdir/bigfile",
+  "www.example.com_dir_subdir_bigfile". In some cases, addurl is able to
+  come up with a better filename based on other information. Or, for a
+  shorter filename, specify `--pathdepth=N`. For example,
+  `--pathdepth=1` will use "dir/subdir/bigfile",
   while `--pathdepth=3` will use "bigfile". It can also be negative;
   `--pathdepth=-2` will use the last two parts of the url.
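
The --pathdepth arithmetic can be illustrated with a short sketch; pathdepthParts is a hypothetical helper that matches the documented examples, not git-annex's actual url2file implementation.

    import Data.List (intercalate)

    -- Positive N drops the first N path components; negative N keeps
    -- the last N. Illustrative only.
    pathdepthParts :: Int -> [String] -> [String]
    pathdepthParts n parts
        | n > 0 = drop n parts
        | n < 0 = reverse (take (abs n) (reverse parts))
        | otherwise = parts

    main :: IO ()
    main = mapM_ demo [1, 3, -2]
      where
        parts = ["www.example.com", "dir", "subdir", "bigfile"]
        demo n = putStrLn $ show n ++ " -> " ++
            intercalate "/" (pathdepthParts n parts)
    -- 1 -> dir/subdir/bigfile
    -- 3 -> bigfile
    -- -2 -> subdir/bigfile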