Send a git-annex user-agent when downloading urls.

Overridable with --user-agent option.

Not yet done for S3 or WebDAV due to limitations of libraries used --
nether allows a user-agent header to be specified.

This commit sponsored by Michael Zehrer.
This commit is contained in:
Joey Hess 2013-09-28 14:35:21 -04:00
parent 55362462ae
commit 12f6b9693a
14 changed files with 90 additions and 36 deletions

View file

@ -108,6 +108,7 @@ data AnnexState = AnnexState
, fields :: M.Map String String , fields :: M.Map String String
, cleanup :: M.Map String (Annex ()) , cleanup :: M.Map String (Annex ())
, inodeschanged :: Maybe Bool , inodeschanged :: Maybe Bool
, useragent :: Maybe String
} }
newState :: Git.Repo -> AnnexState newState :: Git.Repo -> AnnexState
@ -141,6 +142,7 @@ newState gitrepo = AnnexState
, fields = M.empty , fields = M.empty
, cleanup = M.empty , cleanup = M.empty
, inodeschanged = Nothing , inodeschanged = Nothing
, useragent = Nothing
} }
{- Makes an Annex state object for the specified git repo. {- Makes an Annex state object for the specified git repo.

View file

@ -43,7 +43,7 @@ import qualified Annex.Queue
import qualified Annex.Branch import qualified Annex.Branch
import Utility.DiskFree import Utility.DiskFree
import Utility.FileMode import Utility.FileMode
import qualified Utility.Url as Url import qualified Annex.Url as Url
import Types.Key import Types.Key
import Utility.DataUnits import Utility.DataUnits
import Utility.CopyFile import Utility.CopyFile
@ -458,7 +458,7 @@ downloadUrl urls file = go =<< annexWebDownloadCommand <$> Annex.getGitConfig
go Nothing = do go Nothing = do
opts <- map Param . annexWebOptions <$> Annex.getGitConfig opts <- map Param . annexWebOptions <$> Annex.getGitConfig
headers <- getHttpHeaders headers <- getHttpHeaders
liftIO $ anyM (\u -> Url.download u headers opts file) urls anyM (\u -> Url.withUserAgent $ Url.download u headers opts file) urls
go (Just basecmd) = liftIO $ anyM (downloadcmd basecmd) urls go (Just basecmd) = liftIO $ anyM (downloadcmd basecmd) urls
downloadcmd basecmd url = downloadcmd basecmd url =
boolSystem "sh" [Param "-c", Param $ gencmd url basecmd] boolSystem "sh" [Param "-c", Param $ gencmd url basecmd]

27
Annex/Url.hs Normal file
View file

@ -0,0 +1,27 @@
{- Url downloading, with git-annex user agent.
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Annex.Url (
module U,
withUserAgent,
getUserAgent,
) where
import Common.Annex
import qualified Annex
import Utility.Url as U
import qualified Build.SysConfig as SysConfig
defaultUserAgent :: U.UserAgent
defaultUserAgent = "git-annex/" ++ SysConfig.packageversion
getUserAgent :: Annex (Maybe U.UserAgent)
getUserAgent = Annex.getState $
Just . fromMaybe defaultUserAgent . Annex.useragent
withUserAgent :: (Maybe U.UserAgent -> IO a) -> Annex a
withUserAgent a = liftIO . a =<< getUserAgent

View file

@ -21,7 +21,7 @@ import qualified Types.Remote as Remote
import Types.StandardGroups import Types.StandardGroups
import Types.Remote (RemoteConfig) import Types.Remote (RemoteConfig)
import Logs.Remote import Logs.Remote
import qualified Utility.Url as Url import qualified Annex.Url as Url
import Creds import Creds
import Assistant.Gpg import Assistant.Gpg
@ -190,7 +190,8 @@ escapeHeader = escapeURIString (\c -> isUnescapedInURI c && c /= ' ')
getRepoInfo :: RemoteConfig -> Widget getRepoInfo :: RemoteConfig -> Widget
getRepoInfo c = do getRepoInfo c = do
exists <- liftIO $ catchDefaultIO False $ fst <$> Url.exists url [] ua <- liftAnnex Url.getUserAgent
exists <- liftIO $ catchDefaultIO False $ fst <$> Url.exists url [] ua
[whamlet| [whamlet|
<a href="#{url}"> <a href="#{url}">
Internet Archive item Internet Archive item

View file

@ -56,7 +56,7 @@ getSwitchToRepositoryR repo = do
( return url ( return url
, delayed $ waiturl urlfile , delayed $ waiturl urlfile
) )
listening url = catchBoolIO $ fst <$> Url.exists url [] listening url = catchBoolIO $ fst <$> Url.exists url [] Nothing
delayed a = do delayed a = do
threadDelay 100000 -- 1/10th of a second threadDelay 100000 -- 1/10th of a second
a a

View file

@ -17,8 +17,8 @@ import Backend
import qualified Command.Add import qualified Command.Add
import qualified Annex import qualified Annex
import qualified Annex.Queue import qualified Annex.Queue
import qualified Annex.Url as Url
import qualified Backend.URL import qualified Backend.URL
import qualified Utility.Url as Url
import Annex.Content import Annex.Content
import Logs.Web import Logs.Web
import qualified Option import qualified Option
@ -123,7 +123,7 @@ perform relaxed url file = ifAnnexed file addurl geturl
next $ return True next $ return True
| otherwise = do | otherwise = do
headers <- getHttpHeaders headers <- getHttpHeaders
ifM (liftIO $ Url.check url headers $ keySize key) ifM (Url.withUserAgent $ Url.check url headers $ keySize key)
( do ( do
setUrlPresent key url setUrlPresent key url
next $ return True next $ return True
@ -174,7 +174,7 @@ download url file = do
size <- ifM (liftIO $ isJust <$> checkDaemon pidfile) size <- ifM (liftIO $ isJust <$> checkDaemon pidfile)
( do ( do
headers <- getHttpHeaders headers <- getHttpHeaders
liftIO $ snd <$> Url.exists url headers snd <$> Url.withUserAgent (Url.exists url headers)
, return Nothing , return Nothing
) )
Backend.URL.fromUrl url size Backend.URL.fromUrl url size
@ -203,7 +203,7 @@ nodownload relaxed url file = do
headers <- getHttpHeaders headers <- getHttpHeaders
(exists, size) <- if relaxed (exists, size) <- if relaxed
then pure (True, Nothing) then pure (True, Nothing)
else liftIO $ Url.exists url headers else Url.withUserAgent $ Url.exists url headers
if exists if exists
then do then do
key <- Backend.URL.fromUrl url size key <- Backend.URL.fromUrl url size

View file

@ -17,7 +17,7 @@ import Data.Time.Clock
import Common.Annex import Common.Annex
import qualified Annex import qualified Annex
import Command import Command
import qualified Utility.Url as Url import qualified Annex.Url as Url
import Logs.Web import Logs.Web
import qualified Option import qualified Option
import qualified Utility.Format import qualified Utility.Format
@ -102,9 +102,10 @@ findEnclosures url = extract <$> downloadFeed url
downloadFeed :: URLString -> Annex (Maybe Feed) downloadFeed :: URLString -> Annex (Maybe Feed)
downloadFeed url = do downloadFeed url = do
showOutput showOutput
ua <- Url.getUserAgent
liftIO $ withTmpFile "feed" $ \f h -> do liftIO $ withTmpFile "feed" $ \f h -> do
fileEncoding h fileEncoding h
ifM (Url.download url [] [] f) ifM (Url.download url [] [] f ua)
( liftIO $ parseFeedString <$> hGetContentsStrict h ( liftIO $ parseFeedString <$> hGetContentsStrict h
, return Nothing , return Nothing
) )

View file

@ -48,6 +48,8 @@ options = Option.common ++
"skip files smaller than a size" "skip files smaller than a size"
, Option ['T'] ["time-limit"] (ReqArg Limit.addTimeLimit paramTime) , Option ['T'] ["time-limit"] (ReqArg Limit.addTimeLimit paramTime)
"stop after the specified amount of time" "stop after the specified amount of time"
, Option [] ["user-agent"] (ReqArg setuseragent paramName)
"override default User-Agent"
, Option [] ["trust-glacier"] (NoArg (Annex.setFlag "trustglacier")) , Option [] ["trust-glacier"] (NoArg (Annex.setFlag "trustglacier"))
"Trust Amazon Glacier inventory" "Trust Amazon Glacier inventory"
] ++ Option.matcher ] ++ Option.matcher
@ -55,6 +57,7 @@ options = Option.common ++
setnumcopies v = maybe noop setnumcopies v = maybe noop
(\n -> Annex.changeState $ \s -> s { Annex.forcenumcopies = Just n }) (\n -> Annex.changeState $ \s -> s { Annex.forcenumcopies = Just n })
(readish v) (readish v)
setuseragent v = Annex.changeState $ \s -> s { Annex.useragent = Just v }
setgitconfig v = Annex.changeGitRepo =<< inRepo (Git.Config.store v) setgitconfig v = Annex.changeGitRepo =<< inRepo (Git.Config.store v)
trustArg t = ReqArg (Remote.forceTrust t) paramRemote trustArg t = ReqArg (Remote.forceTrust t) paramRemote

View file

@ -30,7 +30,7 @@ import Annex.Exception
import qualified Annex.Content import qualified Annex.Content
import qualified Annex.BranchState import qualified Annex.BranchState
import qualified Annex.Branch import qualified Annex.Branch
import qualified Utility.Url as Url import qualified Annex.Url as Url
import Utility.Tmp import Utility.Tmp
import Config import Config
import Config.Cost import Config.Cost
@ -177,9 +177,10 @@ tryGitConfigRead r
Left l -> return $ Left l Left l -> return $ Left l
geturlconfig headers = do geturlconfig headers = do
ua <- Url.getUserAgent
v <- liftIO $ withTmpFile "git-annex.tmp" $ \tmpfile h -> do v <- liftIO $ withTmpFile "git-annex.tmp" $ \tmpfile h -> do
hClose h hClose h
ifM (Url.downloadQuiet (Git.repoLocation r ++ "/config") headers [] tmpfile) ifM (Url.downloadQuiet (Git.repoLocation r ++ "/config") headers [] tmpfile ua)
( pipedconfig "git" [Param "config", Param "--null", Param "--list", Param "--file", File tmpfile] ( pipedconfig "git" [Param "config", Param "--null", Param "--list", Param "--file", File tmpfile]
, return $ Left undefined , return $ Left undefined
) )
@ -240,7 +241,7 @@ inAnnex r key
where where
checkhttp headers = do checkhttp headers = do
showChecking r showChecking r
liftIO $ ifM (anyM (\u -> Url.check u headers (keySize key)) (keyUrls r key)) ifM (anyM (\u -> Url.withUserAgent $ Url.check u headers (keySize key)) (keyUrls r key))
( return $ Right True ( return $ Right True
, return $ Left "not found" , return $ Left "not found"
) )

View file

@ -19,7 +19,7 @@ import Config.Cost
import Logs.Web import Logs.Web
import Types.Key import Types.Key
import Utility.Metered import Utility.Metered
import qualified Utility.Url as Url import qualified Annex.Url as Url
#ifdef WITH_QUVI #ifdef WITH_QUVI
import Annex.Quvi import Annex.Quvi
import qualified Utility.Quvi as Quvi import qualified Utility.Quvi as Quvi
@ -118,7 +118,7 @@ checkKey' key us = firsthit us (Right False) $ \u -> do
#endif #endif
DefaultDownloader -> do DefaultDownloader -> do
headers <- getHttpHeaders headers <- getHttpHeaders
liftIO $ Right <$> Url.check u' headers (keySize key) Right <$> Url.withUserAgent (Url.check u' headers $ keySize key)
where where
firsthit [] miss _ = return miss firsthit [] miss _ = return miss
firsthit (u:rest) _ a = do firsthit (u:rest) _ a = do

View file

@ -9,6 +9,7 @@
module Utility.Url ( module Utility.Url (
URLString, URLString,
UserAgent,
check, check,
exists, exists,
download, download,
@ -27,10 +28,12 @@ type URLString = String
type Headers = [String] type Headers = [String]
type UserAgent = String
{- Checks that an url exists and could be successfully downloaded, {- Checks that an url exists and could be successfully downloaded,
- also checking that its size, if available, matches a specified size. -} - also checking that its size, if available, matches a specified size. -}
check :: URLString -> Headers -> Maybe Integer -> IO Bool check :: URLString -> Headers -> Maybe Integer -> Maybe UserAgent -> IO Bool
check url headers expected_size = handle <$> exists url headers check url headers expected_size = handle <$$> exists url headers
where where
handle (False, _) = False handle (False, _) = False
handle (True, Nothing) = True handle (True, Nothing) = True
@ -44,8 +47,8 @@ check url headers expected_size = handle <$> exists url headers
- Uses curl otherwise, when available, since curl handles https better - Uses curl otherwise, when available, since curl handles https better
- than does Haskell's Network.Browser. - than does Haskell's Network.Browser.
-} -}
exists :: URLString -> Headers -> IO (Bool, Maybe Integer) exists :: URLString -> Headers -> Maybe UserAgent -> IO (Bool, Maybe Integer)
exists url headers = case parseURIRelaxed url of exists url headers ua = case parseURIRelaxed url of
Just u Just u
| uriScheme u == "file:" -> do | uriScheme u == "file:" -> do
s <- catchMaybeIO $ getFileStatus (unEscapeString $ uriPath u) s <- catchMaybeIO $ getFileStatus (unEscapeString $ uriPath u)
@ -54,12 +57,12 @@ exists url headers = case parseURIRelaxed url of
Nothing -> dne Nothing -> dne
| otherwise -> if Build.SysConfig.curl | otherwise -> if Build.SysConfig.curl
then do then do
output <- readProcess "curl" curlparams output <- readProcess "curl" $ toCommand curlparams
case lastMaybe (lines output) of case lastMaybe (lines output) of
Just ('2':_:_) -> return (True, extractsize output) Just ('2':_:_) -> return (True, extractsize output)
_ -> dne _ -> dne
else do else do
r <- request u headers HEAD r <- request u headers HEAD ua
case rspCode r of case rspCode r of
(2,_,_) -> return (True, size r) (2,_,_) -> return (True, size r)
_ -> return (False, Nothing) _ -> return (False, Nothing)
@ -67,13 +70,12 @@ exists url headers = case parseURIRelaxed url of
where where
dne = return (False, Nothing) dne = return (False, Nothing)
curlparams = curlparams = addUserAgent ua $
[ "-s" [ Param "-s"
, "--head" , Param "--head"
, "-L" , Param "-L", Param url
, url , Param "-w", Param "%{http_code}"
, "-w", "%{http_code}" ] ++ concatMap (\h -> [Param "-H", Param h]) headers
] ++ concatMap (\h -> ["-H", h]) headers
extractsize s = case lastMaybe $ filter ("Content-Length:" `isPrefixOf`) (lines s) of extractsize s = case lastMaybe $ filter ("Content-Length:" `isPrefixOf`) (lines s) of
Just l -> case lastMaybe $ words l of Just l -> case lastMaybe $ words l of
@ -83,6 +85,11 @@ exists url headers = case parseURIRelaxed url of
size = liftM Prelude.read . lookupHeader HdrContentLength . rspHeaders size = liftM Prelude.read . lookupHeader HdrContentLength . rspHeaders
-- works for both wget and curl commands
addUserAgent :: Maybe UserAgent -> [CommandParam] -> [CommandParam]
addUserAgent Nothing ps = ps
addUserAgent (Just ua) ps = ps ++ [Param "--user-agent", Param ua]
{- Used to download large files, such as the contents of keys. {- Used to download large files, such as the contents of keys.
- -
- Uses wget or curl program for its progress bar. (Wget has a better one, - Uses wget or curl program for its progress bar. (Wget has a better one,
@ -90,15 +97,15 @@ exists url headers = case parseURIRelaxed url of
- would not be appropriate to test at configure time and build support - would not be appropriate to test at configure time and build support
- for only one in. - for only one in.
-} -}
download :: URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool download :: URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
download = download' False download = download' False
{- No output, even on error. -} {- No output, even on error. -}
downloadQuiet :: URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool downloadQuiet :: URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
downloadQuiet = download' True downloadQuiet = download' True
download' :: Bool -> URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool download' :: Bool -> URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
download' quiet url headers options file = download' quiet url headers options file ua =
case parseURIRelaxed url of case parseURIRelaxed url of
Just u Just u
| uriScheme u == "file:" -> do | uriScheme u == "file:" -> do
@ -119,7 +126,7 @@ download' quiet url headers options file =
curl = go "curl" $ headerparams ++ quietopt "-s" ++ curl = go "curl" $ headerparams ++ quietopt "-s" ++
[Params "-f -L -C - -# -o"] [Params "-f -L -C - -# -o"]
go cmd opts = boolSystem cmd $ go cmd opts = boolSystem cmd $
options++opts++[File file, File url] addUserAgent ua $ options++opts++[File file, File url]
quietopt s quietopt s
| quiet = [Param s] | quiet = [Param s]
| otherwise = [] | otherwise = []
@ -134,13 +141,14 @@ download' quiet url headers options file =
- Unfortunately, does not handle https, so should only be used - Unfortunately, does not handle https, so should only be used
- when curl is not available. - when curl is not available.
-} -}
request :: URI -> Headers -> RequestMethod -> IO (Response String) request :: URI -> Headers -> RequestMethod -> Maybe UserAgent -> IO (Response String)
request url headers requesttype = go 5 url request url headers requesttype ua = go 5 url
where where
go :: Int -> URI -> IO (Response String) go :: Int -> URI -> IO (Response String)
go 0 _ = error "Too many redirects " go 0 _ = error "Too many redirects "
go n u = do go n u = do
rsp <- Browser.browse $ do rsp <- Browser.browse $ do
maybe noop Browser.setUserAgent ua
Browser.setErrHandler ignore Browser.setErrHandler ignore
Browser.setOutHandler ignore Browser.setOutHandler ignore
Browser.setAllowRedirects False Browser.setAllowRedirects False

3
debian/changelog vendored
View file

@ -18,6 +18,9 @@ git-annex (4.20130921) UNRELEASED; urgency=low
* add, import, assistant: Better preserve the mtime of symlinks, * add, import, assistant: Better preserve the mtime of symlinks,
when when adding content that gets deduplicated. when when adding content that gets deduplicated.
* webapp: Support storing encrypted git repositories on rsync.net. * webapp: Support storing encrypted git repositories on rsync.net.
* Send a git-annex user-agent when downloading urls.
Overridable with --user-agent option.
(Not yet done for S3 or WebDAV due to limitations of libraries used.)
-- Joey Hess <joeyh@debian.org> Sun, 22 Sep 2013 19:42:29 -0400 -- Joey Hess <joeyh@debian.org> Sun, 22 Sep 2013 19:42:29 -0400

View file

@ -71,3 +71,7 @@ HTTP request sent, awaiting response... 200 OK
> switch, and/or to make git-annex set a default user agent header > switch, and/or to make git-annex set a default user agent header
> of "git-annex", rather than relying on the curl/wget defaults. > of "git-annex", rather than relying on the curl/wget defaults.
> --[[Joey]] > --[[Joey]]
> I've [[done]] what's discussed above, and verified it fixes
> behavior for this specific server too.
> --[[Joey]]

View file

@ -824,6 +824,10 @@ subdirectories).
Also, '\\n' is a newline, '\\000' is a NULL, etc. Also, '\\n' is a newline, '\\000' is a NULL, etc.
* `--user-agent=value`
Overrides the User-Agent to use when downloading files from the web.
* `-c name=value` * `-c name=value`
Used to override git configuration settings. May be specified multiple times. Used to override git configuration settings. May be specified multiple times.