Send a git-annex user-agent when downloading urls.
Overridable with --user-agent option. Not yet done for S3 or WebDAV due to limitations of libraries used -- neither allows a user-agent header to be specified. This commit sponsored by Michael Zehrer.
commit 12f6b9693a
parent 55362462ae
14 changed files with 90 additions and 36 deletions
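At a glance, the change threads an optional User-Agent override through every HTTP code path, falling back to a git-annex/<version> string when no override is set. A minimal sketch of that fallback logic, mirroring the new Annex/Url.hs shown below (the version string here is just a placeholder):

import Data.Maybe (fromMaybe)

-- Placeholder for Build.SysConfig.packageversion.
packageversion :: String
packageversion = "4.20130921"

defaultUserAgent :: String
defaultUserAgent = "git-annex/" ++ packageversion

-- Nothing means no --user-agent override was given; the default wins.
effectiveUserAgent :: Maybe String -> String
effectiveUserAgent = fromMaybe defaultUserAgent

main :: IO ()
main = do
  putStrLn $ effectiveUserAgent Nothing              -- git-annex/4.20130921
  putStrLn $ effectiveUserAgent (Just "testbot/1.0") -- testbot/1.0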
Annex.hs (2 lines changed)

@@ -108,6 +108,7 @@ data AnnexState = AnnexState
 	, fields :: M.Map String String
 	, cleanup :: M.Map String (Annex ())
 	, inodeschanged :: Maybe Bool
+	, useragent :: Maybe String
 	}
 
 newState :: Git.Repo -> AnnexState
@@ -141,6 +142,7 @@ newState gitrepo = AnnexState
 	, fields = M.empty
 	, cleanup = M.empty
 	, inodeschanged = Nothing
+	, useragent = Nothing
 	}
 
 {- Makes an Annex state object for the specified git repo.
Annex/Content.hs

@@ -43,7 +43,7 @@ import qualified Annex.Queue
 import qualified Annex.Branch
 import Utility.DiskFree
 import Utility.FileMode
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
 import Types.Key
 import Utility.DataUnits
 import Utility.CopyFile
@@ -458,7 +458,7 @@ downloadUrl urls file = go =<< annexWebDownloadCommand <$> Annex.getGitConfig
 	go Nothing = do
 		opts <- map Param . annexWebOptions <$> Annex.getGitConfig
 		headers <- getHttpHeaders
-		liftIO $ anyM (\u -> Url.download u headers opts file) urls
+		anyM (\u -> Url.withUserAgent $ Url.download u headers opts file) urls
 	go (Just basecmd) = liftIO $ anyM (downloadcmd basecmd) urls
 	downloadcmd basecmd url =
 		boolSystem "sh" [Param "-c", Param $ gencmd url basecmd]
Annex/Url.hs (new file, 27 lines)

@@ -0,0 +1,27 @@
+{- Url downloading, with git-annex user agent.
+ -
+ - Copyright 2013 Joey Hess <joey@kitenet.net>
+ -
+ - Licensed under the GNU GPL version 3 or higher.
+ -}
+
+module Annex.Url (
+	module U,
+	withUserAgent,
+	getUserAgent,
+) where
+
+import Common.Annex
+import qualified Annex
+import Utility.Url as U
+import qualified Build.SysConfig as SysConfig
+
+defaultUserAgent :: U.UserAgent
+defaultUserAgent = "git-annex/" ++ SysConfig.packageversion
+
+getUserAgent :: Annex (Maybe U.UserAgent)
+getUserAgent = Annex.getState $
+	Just . fromMaybe defaultUserAgent . Annex.useragent
+
+withUserAgent :: (Maybe U.UserAgent -> IO a) -> Annex a
+withUserAgent a = liftIO . a =<< getUserAgent
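Note how withUserAgent leans on partial application: a call site builds an IO action whose last missing argument is the Maybe UserAgent, and withUserAgent supplies it from the Annex state, which is also why several liftIO calls disappear in this commit's hunks. A self-contained sketch of the same pattern, substituting ReaderT for the real Annex monad (all names here are illustrative, not git-annex APIs):

import Control.Monad.IO.Class (liftIO)
import Control.Monad.Trans.Reader (ReaderT, asks, runReaderT)
import Data.Maybe (fromMaybe)

type UserAgent = String

-- Toy stand-in for AnnexState: just the optional override.
newtype St = St { useragent :: Maybe UserAgent }

type App = ReaderT St IO

-- "git-annex/dev" is a placeholder default agent string.
getUserAgent :: App (Maybe UserAgent)
getUserAgent = asks (Just . fromMaybe "git-annex/dev" . useragent)

withUserAgent :: (Maybe UserAgent -> IO a) -> App a
withUserAgent a = liftIO . a =<< getUserAgent

-- An IO action still waiting for its final Maybe UserAgent argument,
-- shaped like Url.download url headers opts file.
download :: String -> Maybe UserAgent -> IO Bool
download url ua = do
  putStrLn $ "GET " ++ url ++ " User-Agent: " ++ fromMaybe "(none)" ua
  return True

main :: IO ()
main = do
  _ <- runReaderT (withUserAgent (download "http://example.com/")) (St Nothing)
  _ <- runReaderT (withUserAgent (download "http://example.com/")) (St (Just "testbot/1.0"))
  return ()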
Assistant/WebApp/Configurators/IA.hs

@@ -21,7 +21,7 @@ import qualified Types.Remote as Remote
 import Types.StandardGroups
 import Types.Remote (RemoteConfig)
 import Logs.Remote
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
 import Creds
 import Assistant.Gpg
 
@@ -190,7 +190,8 @@ escapeHeader = escapeURIString (\c -> isUnescapedInURI c && c /= ' ')
 
 getRepoInfo :: RemoteConfig -> Widget
 getRepoInfo c = do
-	exists <- liftIO $ catchDefaultIO False $ fst <$> Url.exists url []
+	ua <- liftAnnex Url.getUserAgent
+	exists <- liftIO $ catchDefaultIO False $ fst <$> Url.exists url [] ua
 	[whamlet|
 		<a href="#{url}">
 			Internet Archive item
Assistant/WebApp/OtherRepos.hs

@@ -56,7 +56,7 @@ getSwitchToRepositoryR repo = do
 		( return url
 		, delayed $ waiturl urlfile
 		)
-	listening url = catchBoolIO $ fst <$> Url.exists url []
+	listening url = catchBoolIO $ fst <$> Url.exists url [] Nothing
 	delayed a = do
 		threadDelay 100000 -- 1/10th of a second
 		a
Command/AddUrl.hs

@@ -17,8 +17,8 @@ import Backend
 import qualified Command.Add
 import qualified Annex
 import qualified Annex.Queue
+import qualified Annex.Url as Url
 import qualified Backend.URL
-import qualified Utility.Url as Url
 import Annex.Content
 import Logs.Web
 import qualified Option
@@ -123,7 +123,7 @@ perform relaxed url file = ifAnnexed file addurl geturl
 		next $ return True
 	| otherwise = do
 		headers <- getHttpHeaders
-		ifM (liftIO $ Url.check url headers $ keySize key)
+		ifM (Url.withUserAgent $ Url.check url headers $ keySize key)
 			( do
 				setUrlPresent key url
 				next $ return True
@@ -174,7 +174,7 @@ download url file = do
 	size <- ifM (liftIO $ isJust <$> checkDaemon pidfile)
 		( do
 			headers <- getHttpHeaders
-			liftIO $ snd <$> Url.exists url headers
+			snd <$> Url.withUserAgent (Url.exists url headers)
 		, return Nothing
 		)
 	Backend.URL.fromUrl url size
@@ -203,7 +203,7 @@ nodownload relaxed url file = do
 	headers <- getHttpHeaders
 	(exists, size) <- if relaxed
 		then pure (True, Nothing)
-		else liftIO $ Url.exists url headers
+		else Url.withUserAgent $ Url.exists url headers
 	if exists
 		then do
 			key <- Backend.URL.fromUrl url size
Command/ImportFeed.hs

@@ -17,7 +17,7 @@ import Data.Time.Clock
 import Common.Annex
 import qualified Annex
 import Command
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
 import Logs.Web
 import qualified Option
 import qualified Utility.Format
@@ -102,9 +102,10 @@ findEnclosures url = extract <$> downloadFeed url
 downloadFeed :: URLString -> Annex (Maybe Feed)
 downloadFeed url = do
 	showOutput
+	ua <- Url.getUserAgent
 	liftIO $ withTmpFile "feed" $ \f h -> do
 		fileEncoding h
-		ifM (Url.download url [] [] f)
+		ifM (Url.download url [] [] f ua)
 			( liftIO $ parseFeedString <$> hGetContentsStrict h
 			, return Nothing
 			)
GitAnnex/Options.hs

@@ -48,6 +48,8 @@ options = Option.common ++
 		"skip files smaller than a size"
 	, Option ['T'] ["time-limit"] (ReqArg Limit.addTimeLimit paramTime)
 		"stop after the specified amount of time"
+	, Option [] ["user-agent"] (ReqArg setuseragent paramName)
+		"override default User-Agent"
 	, Option [] ["trust-glacier"] (NoArg (Annex.setFlag "trustglacier"))
 		"Trust Amazon Glacier inventory"
 	] ++ Option.matcher
@@ -55,6 +57,7 @@
 	setnumcopies v = maybe noop
 		(\n -> Annex.changeState $ \s -> s { Annex.forcenumcopies = Just n })
 		(readish v)
+	setuseragent v = Annex.changeState $ \s -> s { Annex.useragent = Just v }
 	setgitconfig v = Annex.changeGitRepo =<< inRepo (Git.Config.store v)
 
 	trustArg t = ReqArg (Remote.forceTrust t) paramRemote
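The option is wired up through GetOpt's ReqArg, which turns the flag's argument into a state change, just as setnumcopies does for numcopies. A tiny self-contained sketch of that mechanism, with a plain record standing in for AnnexState and Annex.changeState:

import System.Console.GetOpt

-- Plain record standing in for the useragent field of AnnexState.
data Opts = Opts { useragent :: Maybe String }
  deriving Show

-- Each flag parses to a state transformer, as setuseragent does
-- via Annex.changeState in the real code.
options :: [OptDescr (Opts -> Opts)]
options =
  [ Option [] ["user-agent"] (ReqArg (\v o -> o { useragent = Just v }) "NAME")
      "override default User-Agent"
  ]

main :: IO ()
main = do
  let (fs, _, _) = getOpt Permute options ["--user-agent=testbot/1.0"]
  print $ foldr ($) (Opts Nothing) fs
  -- Opts {useragent = Just "testbot/1.0"}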
Remote/Git.hs

@@ -30,7 +30,7 @@ import Annex.Exception
 import qualified Annex.Content
 import qualified Annex.BranchState
 import qualified Annex.Branch
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
 import Utility.Tmp
 import Config
 import Config.Cost
@@ -177,9 +177,10 @@ tryGitConfigRead r
 		Left l -> return $ Left l
 
 	geturlconfig headers = do
+		ua <- Url.getUserAgent
 		v <- liftIO $ withTmpFile "git-annex.tmp" $ \tmpfile h -> do
 			hClose h
-			ifM (Url.downloadQuiet (Git.repoLocation r ++ "/config") headers [] tmpfile)
+			ifM (Url.downloadQuiet (Git.repoLocation r ++ "/config") headers [] tmpfile ua)
 				( pipedconfig "git" [Param "config", Param "--null", Param "--list", Param "--file", File tmpfile]
 				, return $ Left undefined
 				)
@@ -240,7 +241,7 @@ inAnnex r key
   where
 	checkhttp headers = do
 		showChecking r
-		liftIO $ ifM (anyM (\u -> Url.check u headers (keySize key)) (keyUrls r key))
+		ifM (anyM (\u -> Url.withUserAgent $ Url.check u headers (keySize key)) (keyUrls r key))
 			( return $ Right True
 			, return $ Left "not found"
 			)
Remote/Web.hs

@@ -19,7 +19,7 @@ import Config.Cost
 import Logs.Web
 import Types.Key
 import Utility.Metered
-import qualified Utility.Url as Url
+import qualified Annex.Url as Url
 #ifdef WITH_QUVI
 import Annex.Quvi
 import qualified Utility.Quvi as Quvi
@@ -118,7 +118,7 @@ checkKey' key us = firsthit us (Right False) $ \u -> do
 #endif
 		DefaultDownloader -> do
 			headers <- getHttpHeaders
-			liftIO $ Right <$> Url.check u' headers (keySize key)
+			Right <$> Url.withUserAgent (Url.check u' headers $ keySize key)
   where
 	firsthit [] miss _ = return miss
 	firsthit (u:rest) _ a = do
Utility/Url.hs

@@ -9,6 +9,7 @@
 
 module Utility.Url (
 	URLString,
+	UserAgent,
 	check,
 	exists,
 	download,
@@ -27,10 +28,12 @@ type URLString = String
 
 type Headers = [String]
 
+type UserAgent = String
+
 {- Checks that an url exists and could be successfully downloaded,
  - also checking that its size, if available, matches a specified size. -}
-check :: URLString -> Headers -> Maybe Integer -> IO Bool
-check url headers expected_size = handle <$> exists url headers
+check :: URLString -> Headers -> Maybe Integer -> Maybe UserAgent -> IO Bool
+check url headers expected_size = handle <$$> exists url headers
   where
 	handle (False, _) = False
 	handle (True, Nothing) = True
@@ -44,8 +47,8 @@ check url headers expected_size = handle <$> exists url headers
  - Uses curl otherwise, when available, since curl handles https better
  - than does Haskell's Network.Browser.
  -}
-exists :: URLString -> Headers -> IO (Bool, Maybe Integer)
-exists url headers = case parseURIRelaxed url of
+exists :: URLString -> Headers -> Maybe UserAgent -> IO (Bool, Maybe Integer)
+exists url headers ua = case parseURIRelaxed url of
 	Just u
 		| uriScheme u == "file:" -> do
 			s <- catchMaybeIO $ getFileStatus (unEscapeString $ uriPath u)
@@ -54,12 +57,12 @@ exists url headers = case parseURIRelaxed url of
 			Nothing -> dne
 		| otherwise -> if Build.SysConfig.curl
 			then do
-				output <- readProcess "curl" curlparams
+				output <- readProcess "curl" $ toCommand curlparams
 				case lastMaybe (lines output) of
 					Just ('2':_:_) -> return (True, extractsize output)
 					_ -> dne
 			else do
-				r <- request u headers HEAD
+				r <- request u headers HEAD ua
 				case rspCode r of
 					(2,_,_) -> return (True, size r)
 					_ -> return (False, Nothing)
@@ -67,13 +70,12 @@
   where
 	dne = return (False, Nothing)
 
-	curlparams =
-		[ "-s"
-		, "--head"
-		, "-L"
-		, url
-		, "-w", "%{http_code}"
-		] ++ concatMap (\h -> ["-H", h]) headers
+	curlparams = addUserAgent ua $
+		[ Param "-s"
+		, Param "--head"
+		, Param "-L", Param url
+		, Param "-w", Param "%{http_code}"
+		] ++ concatMap (\h -> [Param "-H", Param h]) headers
 
 	extractsize s = case lastMaybe $ filter ("Content-Length:" `isPrefixOf`) (lines s) of
 		Just l -> case lastMaybe $ words l of
@@ -83,6 +85,11 @@
 
 	size = liftM Prelude.read . lookupHeader HdrContentLength . rspHeaders
 
+-- works for both wget and curl commands
+addUserAgent :: Maybe UserAgent -> [CommandParam] -> [CommandParam]
+addUserAgent Nothing ps = ps
+addUserAgent (Just ua) ps = ps ++ [Param "--user-agent", Param ua]
+
 {- Used to download large files, such as the contents of keys.
  -
  - Uses wget or curl program for its progress bar. (Wget has a better one,
@@ -90,15 +97,15 @@ exists url headers = case parseURIRelaxed url of
  - would not be appropriate to test at configure time and build support
  - for only one in.
  -}
-download :: URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool
+download :: URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
 download = download' False
 
 {- No output, even on error. -}
-downloadQuiet :: URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool
+downloadQuiet :: URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
 downloadQuiet = download' True
 
-download' :: Bool -> URLString -> Headers -> [CommandParam] -> FilePath -> IO Bool
-download' quiet url headers options file =
+download' :: Bool -> URLString -> Headers -> [CommandParam] -> FilePath -> Maybe UserAgent -> IO Bool
+download' quiet url headers options file ua =
 	case parseURIRelaxed url of
 		Just u
 			| uriScheme u == "file:" -> do
@@ -119,7 +126,7 @@ download' quiet url headers options file =
 	curl = go "curl" $ headerparams ++ quietopt "-s" ++
 		[Params "-f -L -C - -# -o"]
 	go cmd opts = boolSystem cmd $
-		options++opts++[File file, File url]
+		addUserAgent ua $ options++opts++[File file, File url]
 	quietopt s
 		| quiet = [Param s]
 		| otherwise = []
@@ -134,13 +141,14 @@
  - Unfortunately, does not handle https, so should only be used
  - when curl is not available.
  -}
-request :: URI -> Headers -> RequestMethod -> IO (Response String)
-request url headers requesttype = go 5 url
+request :: URI -> Headers -> RequestMethod -> Maybe UserAgent -> IO (Response String)
+request url headers requesttype ua = go 5 url
   where
 	go :: Int -> URI -> IO (Response String)
	go 0 _ = error "Too many redirects "
 	go n u = do
 		rsp <- Browser.browse $ do
+			maybe noop Browser.setUserAgent ua
 			Browser.setErrHandler ignore
 			Browser.setOutHandler ignore
 			Browser.setAllowRedirects False
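Since wget and curl both spell the flag --user-agent, a single addUserAgent covers both external downloaders. A simplified sketch of the argv it produces, with plain strings in place of git-annex's CommandParam/toCommand wrappers:

type UserAgent = String

-- Simplified model of Utility.Url.addUserAgent over raw argv strings.
addUserAgent :: Maybe UserAgent -> [String] -> [String]
addUserAgent Nothing ps = ps
addUserAgent (Just ua) ps = ps ++ ["--user-agent", ua]

main :: IO ()
main = do
  -- HEAD-style existence check, as in curlparams above:
  print $ addUserAgent (Just "git-annex/4.20130921")
    ["-s", "--head", "-L", "http://example.com/", "-w", "%{http_code}"]
  -- No agent known: argv passes through unchanged.
  print $ addUserAgent Nothing ["-s", "--head", "-L", "http://example.com/"]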
debian/changelog (3 lines changed, vendored)

@@ -18,6 +18,9 @@ git-annex (4.20130921) UNRELEASED; urgency=low
   * add, import, assistant: Better preserve the mtime of symlinks,
     when when adding content that gets deduplicated.
   * webapp: Support storing encrypted git repositories on rsync.net.
+  * Send a git-annex user-agent when downloading urls.
+    Overridable with --user-agent option.
+    (Not yet done for S3 or WebDAV due to limitations of libraries used.)
 
  -- Joey Hess <joeyh@debian.org>  Sun, 22 Sep 2013 19:42:29 -0400
doc/bugs/…

@@ -71,3 +71,7 @@ HTTP request sent, awaiting response... 200 OK
 > switch, and/or to make git-annex set a default user agent header
 > of "git-annex", rather than relying on the curl/wget defaults.
 > --[[Joey]]
+
+> I've [[done]] what's discussed above, and verified it fixes
+> behavior for this specific server too.
+> --[[Joey]]
doc/git-annex.mdwn

@@ -824,6 +824,10 @@ subdirectories).
 
   Also, '\\n' is a newline, '\\000' is a NULL, etc.
 
+* `--user-agent=value`
+
+  Overrides the User-Agent to use when downloading files from the web.
+
 * `-c name=value`
 
   Used to override git configuration settings. May be specified multiple times.
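Because the flag lands in the common options list, it applies to any command that hits the web; for instance, an invocation along the lines of `git annex addurl --user-agent="Mozilla/5.0" http://example.com/file` (an illustrative command, not from the commit) would present a browser-style agent to servers that sniff User-Agent.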