Send a git-annex user-agent when downloading urls.

Overridable with --user-agent option.

Not yet done for S3 or WebDAV due to limitations of libraries used --
nether allows a user-agent header to be specified.

This commit sponsored by Michael Zehrer.
This commit is contained in:
Joey Hess 2013-09-28 14:35:21 -04:00
parent 55362462ae
commit 12f6b9693a
14 changed files with 90 additions and 36 deletions

View file

@ -108,6 +108,7 @@ data AnnexState = AnnexState
, fields :: M.Map String String
, cleanup :: M.Map String (Annex ())
, inodeschanged :: Maybe Bool
, useragent :: Maybe String
}
newState :: Git.Repo -> AnnexState
@ -141,6 +142,7 @@ newState gitrepo = AnnexState
, fields = M.empty
, cleanup = M.empty
, inodeschanged = Nothing
, useragent = Nothing
}
{- Makes an Annex state object for the specified git repo.

View file

@ -43,7 +43,7 @@ import qualified Annex.Queue
import qualified Annex.Branch
import Utility.DiskFree
import Utility.FileMode
import qualified Utility.Url as Url
import qualified Annex.Url as Url
import Types.Key
import Utility.DataUnits
import Utility.CopyFile
@ -458,7 +458,7 @@ downloadUrl urls file = go =<< annexWebDownloadCommand <$> Annex.getGitConfig
go Nothing = do
opts <- map Param . annexWebOptions <$> Annex.getGitConfig
headers <- getHttpHeaders
liftIO $ anyM (\u -> Url.download u headers opts file) urls
anyM (\u -> Url.withUserAgent $ Url.download u headers opts file) urls
go (Just basecmd) = liftIO $ anyM (downloadcmd basecmd) urls
downloadcmd basecmd url =
boolSystem "sh" [Param "-c", Param $ gencmd url basecmd]

27
Annex/Url.hs Normal file
View file

@ -0,0 +1,27 @@
{- Url downloading, with git-annex user agent.
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Annex.Url (
module U,
withUserAgent,
getUserAgent,
) where
import Common.Annex
import qualified Annex
import Utility.Url as U
import qualified Build.SysConfig as SysConfig
defaultUserAgent :: U.UserAgent
defaultUserAgent = "git-annex/" ++ SysConfig.packageversion
getUserAgent :: Annex (Maybe U.UserAgent)
getUserAgent = Annex.getState $
Just . fromMaybe defaultUserAgent . Annex.useragent
withUserAgent :: (Maybe U.UserAgent -> IO a) -> Annex a
withUserAgent a = liftIO . a =<< getUserAgent

View file

@ -21,7 +21,7 @@ import qualified Types.Remote as Remote
import Types.StandardGroups
import Types.Remote (RemoteConfig)
import Logs.Remote
import qualified Utility.Url as Url
import qualified Annex.Url as Url
import Creds
import Assistant.Gpg
@ -190,7 +190,8 @@ escapeHeader = escapeURIString (\c -> isUnescapedInURI c && c /= ' ')
getRepoInfo :: RemoteConfig -> Widget
getRepoInfo c = do
exists <- liftIO $ catchDefaultIO False $ fst <$> Url.exists url []
ua <- liftAnnex Url.getUserAgent
exists <- liftIO $ catchDefaultIO False $ fst <$> Url.exists url [] ua
[whamlet|
<a href="#{url}">
Internet Archive item

View file

@ -56,7 +56,7 @@ getSwitchToRepositoryR repo = do
( return url
, delayed $ waiturl urlfile
)
listening url = catchBoolIO $ fst <$> Url.exists url []
listening url = catchBoolIO $ fst <$> Url.exists url [] Nothing
delayed a = do
threadDelay 100000 -- 1/10th of a second
a

View file

@ -17,8 +17,8 @@ import Backend
import qualified Command.Add
import qualified Annex
import qualified Annex.Queue
import qualified Annex.Url as Url
import qualified Backend.URL
import qualified Utility.Url as Url
import Annex.Content
import Logs.Web
import qualified Option
@ -123,7 +123,7 @@ perform relaxed url file = ifAnnexed file addurl geturl
next $ return True
| otherwise = do
headers <- getHttpHeaders
ifM (liftIO $ Url.check url headers $ keySize key)
ifM (Url.withUserAgent $ Url.check url headers $ keySize key)
( do
setUrlPresent key url
next $ return True
@ -174,7 +174,7 @@ download url file = do
size <- ifM (liftIO $ isJust <$> checkDaemon pidfile)
( do
headers <- getHttpHeaders
liftIO $ snd <$> Url.exists url headers
snd <$> Url.withUserAgent (Url.exists url headers)
, return Nothing
)
Backend.URL.fromUrl url size
@ -203,7 +203,7 @@ nodownload relaxed url file = do
headers <- getHttpHeaders
(exists, size) <- if relaxed
then pure (True, Nothing)
else liftIO $ Url.exists url headers
else Url.withUserAgent $ Url.exists url headers
if exists
then do
key <- Backend.URL.fromUrl url size

View file

@ -17,7 +17,7 @@ import Data.Time.Clock
import Common.Annex
import qualified Annex
import Command
import qualified Utility.Url as Url
import qualified Annex.Url as Url
import Logs.Web
import qualified Option
import qualified Utility.Format
@ -102,9 +102,10 @@ findEnclosures url = extract <$> downloadFeed url
downloadFeed :: URLString -> Annex (Maybe Feed)
downloadFeed url = do
showOutput
ua <- Url.getUserAgent
liftIO $ withTmpFile "feed" $ \f h -> do
fileEncoding h
ifM (Url.download url [] [] f)
ifM (Url.download url [] [] f ua)
( liftIO $ parseFeedString <$> hGetContentsStrict h
, return Nothing
)

View file

@ -48,6 +48,8 @@ options = Option.common ++
"skip files smaller than a size"
, Option ['T'] ["time-limit"] (ReqArg Limit.addTimeLimit paramTime)
"stop after the specified amount of time"
, Option [] ["user-agent"] (ReqArg setuseragent paramName)
"override default User-Agent"
, Option [] ["trust-glacier"] (NoArg (Annex.setFlag "trustglacier"))
"Trust Amazon Glacier inventory"
] ++ Option.matcher
@ -55,6 +57,7 @@ options = Option.common ++
setnumcopies v = maybe noop
(\n -> Annex.changeState $ \s -> s { Annex.forcenumcopies = Just n })
(readish v)
setuseragent v = Annex.changeState $ \s -> s { Annex.useragent = Just v }
setgitconfig v = Annex.changeGitRepo =<< inRepo (Git.Config.store v)
trustArg t = ReqArg (Remote.forceTrust t) paramRemote

View file

@ -30,7 +30,7 @@ import Annex.Exception
import qualified Annex.Content
import qualified Annex.BranchState
import qualified Annex.Branch
import qualified Utility.Url as Url
import qualified Annex.Url as Url
import Utility.Tmp
import Config
import Config.Cost
@ -177,9 +177,10 @@ tryGitConfigRead r
Left l -> return $ Left l
geturlconfig headers = do
ua <- Url.getUserAgent
v <- liftIO $ withTmpFile "git-annex.tmp" $ \tmpfile h -> do
hClose h
ifM (Url.downloadQuiet (Git.repoLocation r ++ "/config") headers [] tmpfile)
ifM (Url.downloadQuiet (Git.repoLocation r ++ "/config") headers [] tmpfile ua)
( pipedconfig "git" [Param "config", Param "--null", Param "--list", Param "--file", File tmpfile]
, return $ Left undefined
)
@ -240,7 +241,7 @@ inAnnex r key
where
checkhttp headers = do
showChecking r
liftIO $ ifM (anyM (\u -> Url.check u headers (keySize key)) (keyUrls r key))
ifM (anyM (\u -> Url.withUserAgent $ Url.check u headers (keySize key)) (keyUrls r key))
( return $ Right True
, return $ Left "not found"
)

View file

@ -19,7 +19,7 @@ import Config.Cost
import Logs.Web
import Types.Key
import Utility.Metered
import qualified Utility.Url as Url
import qualified Annex.Url as Url
#ifdef WITH_QUVI
import Annex.Quvi
import qualified Utility.Quvi as Quvi
@ -118,7 +118,7 @@ checkKey' key us = firsthit us (Right False) $ \u -> do
#endif
DefaultDownloader -> do
headers <- getHttpHeaders
liftIO $ Right <$> Url.check u' headers (keySize key)
Right <$> Url.withUserAgent (Url.check u' headers $ keySize key)
where
firsthit [] miss _ = return miss
firsthit (u:rest) _ a = do

View file

@ -9,6 +9,7 @@
module Utility.Url (
URLString,
UserAgent,
check,
exists,
download,
@ -27,10 +28,12 @@ type URLString = String
type Headers = [String]
type UserAgent = String
{- Checks that an url exists and could be successfully downloaded,
- also checking that its size, if available, matches a specified size. -}
check :: URLString -> Headers -> Maybe Integer -> IO Bool
check url headers expected_size = handle <$> exists url headers
check :: URLString -> Headers -> Maybe Integer -> Maybe UserAgent -> IO Bool
check url headers expected_size = handle <$$> exists url headers
where
handle (False, _) = False
handle (True, Nothing) = True
@ -44,8 +47,8 @@ check url headers expected_size = handle <$> exists url headers
- Uses curl otherwise, when available, since curl handles https better
- than does Haskell's Network.Browser.
-}
exists :: URLString -> Headers -> IO (Bool, Maybe Integer)
exists url headers = case parseURIRelaxed url of
exists :: URLString -> Headers -> Maybe UserAgent -> IO (Bool, Maybe Integer)
exists url headers ua = case parseURIRelaxed url of
Just u
| uriScheme u == "file:" -> do
s <- catchMaybeIO $ getFileStatus (unEscapeString $ uriPath u)
@ -54,12 +57,12 @@ exists url headers = case parseURIRelaxed url of
Nothing -> dne
| otherwise -> if Build.SysConfig.curl
then do
output <- readProcess "curl" curlparams
output <- readProcess "curl" $ toCommand curlparams
case lastMaybe (lines output) of
Just ('2':_:_) -> return (True, extractsize output)
_ -> dne
else do
r <- request u headers HEAD
r <- request u headers HEAD ua
case rspCode r of
(2,_,_) -> return (True, size r)
_ -> return (False, Nothing)
@ -67,13 +70,12 @@ exists url headers = case parseURIRelaxed url of
where
dne = return (False, Nothing)
curlparams =
[ "-s"
, "--head"
, "-L"
, url
, "-w", "%{http_code}"
] ++ concatMap (\h -> ["-H", h]) headers
curlparams = addUserAgent ua $
[ Param "-s"
, Param "--head"
, Param "-L", Param url
, Param "-w", Param "%{http_code}"
] ++ concatMap (\h -> [Param "-H", Param h]) headers
extractsize s = case lastMaybe $ filter ("Content-Length:" `isPrefixOf`) (lines s) of
Just l -> case lastMaybe $ words l of
@ -83,6 +85,11 @@ exists url headers = case parseURIRelaxed url of
size = liftM Prelude.read . lookupHeader HdrContentLength . rspHeaders
-- works for both wget and curl commands
addUserAgent :: Maybe UserAgent -> [CommandParam] -> [CommandParam]
addUserAgent Nothing ps = ps
addUserAgent (Just ua) ps = ps ++ [Param "--user-agent", Param ua]
{- Used to download large files, such as the contents of keys.
-
- Uses wget or curl program for its progress bar. (Wget has a better one,
@ -90,15 +97,15 @@ exists url headers = case parseURIRelaxed url of
- would not be appropriate to test at configure time and build support
- for only one in.
-}
download :: URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool
download :: URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
download = download' False
{- No output, even on error. -}
downloadQuiet :: URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool
downloadQuiet :: URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
downloadQuiet = download' True
download' :: Bool -> URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool
download' quiet url headers options file =
download' :: Bool -> URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
download' quiet url headers options file ua =
case parseURIRelaxed url of
Just u
| uriScheme u == "file:" -> do
@ -119,7 +126,7 @@ download' quiet url headers options file =
curl = go "curl" $ headerparams ++ quietopt "-s" ++
[Params "-f -L -C - -# -o"]
go cmd opts = boolSystem cmd $
options++opts++[File file, File url]
addUserAgent ua $ options++opts++[File file, File url]
quietopt s
| quiet = [Param s]
| otherwise = []
@ -134,13 +141,14 @@ download' quiet url headers options file =
- Unfortunately, does not handle https, so should only be used
- when curl is not available.
-}
request :: URI -> Headers -> RequestMethod -> IO (Response String)
request url headers requesttype = go 5 url
request :: URI -> Headers -> RequestMethod -> Maybe UserAgent -> IO (Response String)
request url headers requesttype ua = go 5 url
where
go :: Int -> URI -> IO (Response String)
go 0 _ = error "Too many redirects "
go n u = do
rsp <- Browser.browse $ do
maybe noop Browser.setUserAgent ua
Browser.setErrHandler ignore
Browser.setOutHandler ignore
Browser.setAllowRedirects False

3
debian/changelog vendored
View file

@ -18,6 +18,9 @@ git-annex (4.20130921) UNRELEASED; urgency=low
* add, import, assistant: Better preserve the mtime of symlinks,
when when adding content that gets deduplicated.
* webapp: Support storing encrypted git repositories on rsync.net.
* Send a git-annex user-agent when downloading urls.
Overridable with --user-agent option.
(Not yet done for S3 or WebDAV due to limitations of libraries used.)
-- Joey Hess <joeyh@debian.org> Sun, 22 Sep 2013 19:42:29 -0400

View file

@ -71,3 +71,7 @@ HTTP request sent, awaiting response... 200 OK
> switch, and/or to make git-annex set a default user agent header
> of "git-annex", rather than relying on the curl/wget defaults.
> --[[Joey]]
> I've [[done]] what's discussed above, and verified it fixes
> behavior for this specific server too.
> --[[Joey]]

View file

@ -824,6 +824,10 @@ subdirectories).
Also, '\\n' is a newline, '\\000' is a NULL, etc.
* `--user-agent=value`
Overrides the User-Agent to use when downloading files from the web.
* `-c name=value`
Used to override git configuration settings. May be specified multiple times.