git-annex/Remote/S3.hs

432 lines
13 KiB
Haskell
Raw Normal View History

{- S3 remotes
2011-03-28 02:00:44 +00:00
-
- Copyright 2011-2014 Joey Hess <joey@kitenet.net>
2011-03-28 02:00:44 +00:00
-
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE TypeFamilies #-}
module Remote.S3 (remote, iaHost, isIA, isIAHost, iaItemUrl) where
2011-03-28 02:00:44 +00:00
import qualified Aws as AWS
import qualified Aws.Core as AWS
import qualified Aws.S3 as S3
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString as S
2011-03-29 17:49:54 +00:00
import qualified Data.Map as M
import Data.Char
2013-04-25 17:14:49 +00:00
import Network.Socket (HostName)
import Network.HTTP.Conduit (Manager, newManager, closeManager)
import Network.HTTP.Client (defaultManagerSettings, managerResponseTimeout, responseStatus, responseBody, RequestBody(..))
import Network.HTTP.Types
import Control.Monad.Trans.Resource
import Control.Monad.Catch
import Data.Conduit
2011-03-28 02:00:44 +00:00
2011-10-05 20:02:51 +00:00
import Common.Annex
import Types.Remote
import Types.Key
import qualified Git
2011-03-30 19:15:46 +00:00
import Config
import Config.Cost
2011-08-17 00:49:54 +00:00
import Remote.Helper.Special
import Remote.Helper.Http
import qualified Remote.Helper.AWS as AWS
2012-11-14 23:32:27 +00:00
import Creds
import Annex.UUID
import Logs.Web
import Utility.Metered
2014-08-09 00:29:56 +00:00
type BucketName = String
2011-03-28 02:00:44 +00:00
remote :: RemoteType
remote = RemoteType {
2011-03-29 18:55:59 +00:00
typename = "S3",
2011-03-30 18:00:54 +00:00
enumerate = findSpecialRemotes "s3",
generate = gen,
2011-03-29 18:55:59 +00:00
setup = s3Setup
}
2011-03-29 03:51:07 +00:00
gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remote)
2014-08-10 02:13:03 +00:00
gen r u c gc = do
cst <- remoteCost gc expensiveRemoteCost
info <- extractS3Info c
return $ new cst info
2012-11-30 04:55:59 +00:00
where
2014-08-10 02:13:03 +00:00
new cst info = Just $ specialRemote c
(prepareS3 this info $ store this)
(prepareS3 this info retrieve)
(prepareS3 this info remove)
(prepareS3 this info $ checkKey this)
this
2012-11-30 04:55:59 +00:00
where
this = Remote {
uuid = u,
cost = cst,
name = Git.repoDescribe r,
storeKey = storeKeyDummy,
retrieveKeyFile = retreiveKeyFileDummy,
retrieveKeyFileCheap = retrieveCheap,
removeKey = removeKeyDummy,
checkPresent = checkPresentDummy,
checkPresentCheap = False,
2012-11-30 04:55:59 +00:00
whereisKey = Nothing,
remoteFsck = Nothing,
repairRepo = Nothing,
2012-11-30 04:55:59 +00:00
config = c,
repo = r,
gitconfig = gc,
2012-11-30 04:55:59 +00:00
localpath = Nothing,
readonly = False,
availability = GloballyAvailable,
2012-11-30 04:55:59 +00:00
remotetype = remote
}
s3Setup :: Maybe UUID -> Maybe CredPair -> RemoteConfig -> Annex (RemoteConfig, UUID)
s3Setup mu mcreds c = do
u <- maybe (liftIO genUUID) return mu
c' <- setRemoteCredPair c (AWS.creds u) mcreds
s3Setup' u c'
s3Setup' :: UUID -> RemoteConfig -> Annex (RemoteConfig, UUID)
2014-08-10 02:13:03 +00:00
s3Setup' u c = if configIA c then archiveorg else defaulthost
2012-11-11 04:51:07 +00:00
where
remotename = fromJust (M.lookup "name" c)
defbucket = remotename ++ "-" ++ fromUUID u
defaults = M.fromList
[ ("datacenter", T.unpack $ AWS.defaultRegion AWS.S3)
2012-11-11 04:51:07 +00:00
, ("storageclass", "STANDARD")
, ("host", AWS.s3DefaultHost)
, ("port", "80")
2012-11-11 04:51:07 +00:00
, ("bucket", defbucket)
]
2012-11-11 04:51:07 +00:00
use fullconfig = do
gitConfigSpecialRemote u fullconfig "s3" "true"
return (fullconfig, u)
2012-11-11 04:51:07 +00:00
defaulthost = do
c' <- encryptionSetup c
let fullconfig = c' `M.union` defaults
genBucket fullconfig u
use fullconfig
2012-11-11 04:51:07 +00:00
archiveorg = do
showNote "Internet Archive mode"
-- Ensure user enters a valid bucket name, since
-- this determines the name of the archive.org item.
2014-08-10 02:13:03 +00:00
let validbucket = replace " " "-" $ map toLower $
fromMaybe (error "specify bucket=") $
2014-08-09 00:29:56 +00:00
getBucketName c
let archiveconfig =
2014-08-09 01:42:46 +00:00
-- IA acdepts x-amz-* as an alias for x-archive-*
2012-11-11 04:51:07 +00:00
M.mapKeys (replace "x-archive-" "x-amz-") $
-- encryption does not make sense here
M.insert "encryption" "none" $
2014-08-10 02:13:03 +00:00
M.insert "bucket" validbucket $
2012-11-11 04:51:07 +00:00
M.union c $
-- special constraints on key names
M.insert "mungekeys" "ia" $
-- bucket created only when files are uploaded
M.insert "x-amz-auto-make-bucket" "1" defaults
2014-08-10 02:13:03 +00:00
info <- extractS3Info archiveconfig
withS3Handle archiveconfig u info $
writeUUIDFile archiveconfig u
use archiveconfig
2011-03-29 20:21:21 +00:00
-- Sets up a http connection manager for S3 encdpoint, which allows
-- http connections to be reused across calls to the helper.
2014-08-10 02:13:03 +00:00
prepareS3 :: Remote -> S3Info -> (S3Handle -> helper) -> Preparer helper
prepareS3 r info = resourcePrepare $ const $
withS3Handle (config r) (uuid r) info
store :: Remote -> S3Handle -> Storer
store r h = fileStorer $ \k f p -> do
rbody <- liftIO $ httpBodyStorer f p
2014-08-10 02:13:03 +00:00
void $ sendS3Handle h $ putObject h (bucketObject (hinfo h) k) rbody
-- Store public URL to item in Internet Archive.
2014-08-10 02:13:03 +00:00
when (isIA (hinfo h) && not (isChunkKey k)) $
setUrlPresent k (iaKeyUrl r k)
return True
{- Implemented as a fileRetriever, that uses conduit to stream the chunks
- out to the file. Would be better to implement a byteRetriever, but
- that is difficult. -}
2014-08-09 18:30:28 +00:00
retrieve :: S3Handle -> Retriever
retrieve h = fileRetriever $ \f k p -> liftIO $ runResourceT $ do
(fr, fh) <- allocate (openFile f WriteMode) hClose
2014-08-10 02:13:03 +00:00
let req = S3.getObject (bucket info) (bucketObject info k)
S3.GetObjectResponse { S3.gorResponse = rsp } <- sendS3Handle' h req
responseBody rsp $$+- sinkprogressfile fh p zeroBytesProcessed
release fr
where
2014-08-10 02:13:03 +00:00
info = hinfo h
sinkprogressfile fh meterupdate sofar = do
mbs <- await
case mbs of
Nothing -> return ()
Just bs -> do
2014-08-09 20:49:31 +00:00
let sofar' = addBytesProcessed sofar (S.length bs)
liftIO $ do
void $ meterupdate sofar'
S.hPut fh bs
sinkprogressfile fh meterupdate sofar'
retrieveCheap :: Key -> FilePath -> Annex Bool
retrieveCheap _ _ = return False
{- Internet Archive doesn't easily allow removing content.
- While it may remove the file, there are generally other files
- derived from it that it does not remove. -}
2014-08-09 18:30:28 +00:00
remove :: S3Handle -> Remover
remove h k
2014-08-10 02:13:03 +00:00
| isIA info = do
warning "Cannot remove content from the Internet Archive"
return False
| otherwise = do
res <- tryNonAsync $ sendS3Handle h $
2014-08-10 02:13:03 +00:00
S3.DeleteObject (bucketObject info k) (bucket info)
return $ either (const False) (const True) res
2014-08-10 02:13:03 +00:00
where
info = hinfo h
checkKey :: Remote -> S3Handle -> CheckPresent
checkKey r h k = do
showAction $ "checking " ++ name r
catchMissingException $ do
void $ sendS3Handle h $
2014-08-10 02:13:03 +00:00
S3.headObject (bucket (hinfo h)) (bucketObject (hinfo h) k)
return True
{- Catch exception headObject returns when an object is not present
- in the bucket, and returns False. All other exceptions indicate a
- check error and are let through. -}
catchMissingException :: Annex Bool -> Annex Bool
catchMissingException a = catchJust missing a (const $ return False)
2012-11-11 04:51:07 +00:00
where
-- This is not very good; see
-- https://github.com/aristidb/aws/issues/121
missing :: AWS.HeaderException -> Maybe ()
missing e
| AWS.headerErrorMessage e == "ETag missing" = Just ()
| otherwise = Nothing
{- Generate the bucket if it does not already exist, including creating the
- UUID file within the bucket.
-
- Some ACLs can allow read/write to buckets, but not querying them,
- so first check if the UUID file already exists and we can skip doing
- anything.
-}
genBucket :: RemoteConfig -> UUID -> Annex ()
genBucket c u = do
showAction "checking bucket"
2014-08-10 02:13:03 +00:00
info <- extractS3Info c
withS3Handle c u info $ \h ->
go h =<< checkUUIDFile c u h
2012-11-11 04:51:07 +00:00
where
go _ (Right True) = noop
go h _ = do
2014-08-10 02:13:03 +00:00
v <- tryS3 $ sendS3Handle h (S3.getBucket $ bucket $ hinfo h)
case v of
Right _ -> noop
Left _ -> do
showAction $ "creating bucket in " ++ datacenter
2014-08-09 00:29:56 +00:00
void $ sendS3Handle h $
2014-08-10 02:13:03 +00:00
S3.PutBucket (bucket $ hinfo h) Nothing $
AWS.mkLocationConstraint $
T.pack datacenter
writeUUIDFile c u h
2012-11-11 04:51:07 +00:00
datacenter = fromJust $ M.lookup "datacenter" c
2011-05-16 13:42:54 +00:00
{- Writes the UUID to an annex-uuid file within the bucket.
-
- If the file already exists in the bucket, it must match.
-
- Note that IA buckets can only created by having a file
- stored in them. So this also takes care of that.
-}
writeUUIDFile :: RemoteConfig -> UUID -> S3Handle -> Annex ()
writeUUIDFile c u h = do
v <- checkUUIDFile c u h
case v of
Left e -> throwM e
Right True -> noop
2014-08-09 00:29:56 +00:00
Right False -> void $ sendS3Handle h mkobject
where
file = T.pack $ uuidFile c
uuidb = L.fromChunks [T.encodeUtf8 $ T.pack $ fromUUID u]
mkobject = putObject h file (RequestBodyLBS uuidb)
{- Checks if the UUID file exists in the bucket
- and has the specified UUID already. -}
checkUUIDFile :: RemoteConfig -> UUID -> S3Handle -> Annex (Either SomeException Bool)
checkUUIDFile c u h = tryNonAsync $ check <$> get
where
get = liftIO
. runResourceT
. either (pure . Left) (Right <$$> AWS.loadToMemory)
2014-08-10 02:13:03 +00:00
=<< tryS3 (sendS3Handle h (S3.getObject (bucket (hinfo h)) file))
check (Right (S3.GetObjectMemoryResponse _meta rsp)) =
responseStatus rsp == ok200 && responseBody rsp == uuidb
check (Left _S3Error) = False
file = T.pack $ uuidFile c
uuidb = L.fromChunks [T.encodeUtf8 $ T.pack $ fromUUID u]
uuidFile :: RemoteConfig -> FilePath
2014-08-09 00:29:56 +00:00
uuidFile c = getFilePrefix c ++ "annex-uuid"
-- TODO: auto-create bucket when hIsIA.
putObject :: S3Handle -> T.Text -> RequestBody -> S3.PutObject
2014-08-10 02:13:03 +00:00
putObject h file rbody = (S3.putObject (bucket (hinfo h)) file rbody)
{ S3.poStorageClass = Just (storageClass (hinfo h))
, S3.poMetadata = metaHeaders (hinfo h)
}
2014-08-09 00:51:22 +00:00
data S3Handle = S3Handle
{ hmanager :: Manager
, hawscfg :: AWS.Configuration
, hs3cfg :: S3.S3Configuration AWS.NormalQuery
2014-08-10 02:13:03 +00:00
, hinfo :: S3Info
2014-08-09 00:51:22 +00:00
}
{- Sends a request to S3 and gets back the response.
-
- Note that pureAws's use of ResourceT is bypassed here;
2014-08-09 00:29:56 +00:00
- the response should be fully processed while the S3Handle
- is still open, eg within a call to withS3Handle.
-}
sendS3Handle
:: (AWS.Transaction req res, AWS.ServiceConfiguration req ~ S3.S3Configuration)
=> S3Handle
-> req
2014-08-09 00:29:56 +00:00
-> Annex res
sendS3Handle h r = liftIO $ runResourceT $ sendS3Handle' h r
sendS3Handle'
:: (AWS.Transaction r a, AWS.ServiceConfiguration r ~ S3.S3Configuration)
=> S3Handle
-> r
-> ResourceT IO a
sendS3Handle' h = AWS.pureAws (hawscfg h) (hs3cfg h) (hmanager h)
2014-08-10 02:13:03 +00:00
withS3Handle :: RemoteConfig -> UUID -> S3Info -> (S3Handle -> Annex a) -> Annex a
withS3Handle c u info a = do
creds <- getRemoteCredPairFor "S3" c (AWS.creds u)
awscreds <- liftIO $ AWS.genCredentials $ fromMaybe nocreds creds
let awscfg = AWS.Configuration AWS.Timestamp awscreds (AWS.defaultLog AWS.Error)
bracketIO (newManager httpcfg) closeManager $ \mgr ->
2014-08-10 02:13:03 +00:00
a $ S3Handle mgr awscfg s3cfg info
where
s3cfg = s3Configuration c
httpcfg = defaultManagerSettings
{ managerResponseTimeout = Nothing }
nocreds = error "Cannot use S3 without credentials configured"
s3Configuration :: RemoteConfig -> S3.S3Configuration AWS.NormalQuery
s3Configuration c = (S3.s3 proto endpoint False) { S3.s3Port = port }
where
proto
| port == 443 = AWS.HTTPS
| otherwise = AWS.HTTP
host = fromJust $ M.lookup "host" c
datacenter = fromJust $ M.lookup "datacenter" c
-- When the default S3 host is configured, connect directly to
-- the S3 endpoint for the configured datacenter.
-- When another host is configured, it's used as-is.
endpoint
| host == AWS.s3DefaultHost = AWS.s3HostName $ T.pack datacenter
| otherwise = T.encodeUtf8 $ T.pack host
port = let s = fromJust $ M.lookup "port" c in
case reads s of
[(p, _)] -> p
_ -> error $ "bad S3 port value: " ++ s
2014-08-09 00:29:56 +00:00
tryS3 :: Annex a -> Annex (Either S3.S3Error a)
tryS3 a = (Right <$> a) `catch` (pure . Left)
2014-08-10 02:13:03 +00:00
data S3Info = S3Info
{ bucket :: S3.Bucket
, storageClass :: S3.StorageClass
, bucketObject :: Key -> T.Text
, metaHeaders :: [(T.Text, T.Text)]
, isIA :: Bool
}
extractS3Info :: RemoteConfig -> Annex S3Info
extractS3Info c = do
b <- maybe
(error "S3 bucket not configured")
(return . T.pack)
(getBucketName c)
return $ S3Info
{ bucket = b
, storageClass = getStorageClass c
, bucketObject = T.pack . getBucketObject c
, metaHeaders = getMetaHeaders c
, isIA = configIA c
}
2014-08-09 00:29:56 +00:00
getBucketName :: RemoteConfig -> Maybe BucketName
getBucketName = M.lookup "bucket"
getStorageClass :: RemoteConfig -> S3.StorageClass
2014-08-09 00:51:22 +00:00
getStorageClass c = case M.lookup "storageclass" c of
Just "REDUCED_REDUNDANCY" -> S3.ReducedRedundancy
_ -> S3.Standard
getMetaHeaders :: RemoteConfig -> [(T.Text, T.Text)]
getMetaHeaders = map munge . filter ismetaheader . M.assocs
where
ismetaheader (h, _) = metaprefix `isPrefixOf` h
metaprefix = "x-amz-meta-"
metaprefixlen = length metaprefix
munge (k, v) = (T.pack $ drop metaprefixlen k, T.pack v)
2014-08-09 00:29:56 +00:00
getFilePrefix :: RemoteConfig -> String
getFilePrefix = M.findWithDefault "" "fileprefix"
2014-08-10 02:13:03 +00:00
getBucketObject :: RemoteConfig -> Key -> FilePath
getBucketObject c = munge . key2file
where
munge s = case M.lookup "mungekeys" c of
Just "ia" -> iaMunge $ getFilePrefix c ++ s
_ -> getFilePrefix c ++ s
2014-08-09 00:29:56 +00:00
{- Internet Archive limits filenames to a subset of ascii,
- with no whitespace. Other characters are xml entity
- encoded. -}
iaMunge :: String -> String
iaMunge = (>>= munge)
where
munge c
| isAsciiUpper c || isAsciiLower c || isNumber c = [c]
| c `elem` "_-.\"" = [c]
| isSpace c = []
| otherwise = "&" ++ show (ord c) ++ ";"
2014-08-10 02:13:03 +00:00
configIA :: RemoteConfig -> Bool
configIA = maybe False isIAHost . M.lookup "host"
2013-04-25 17:14:49 +00:00
{- Hostname to use for archive.org S3. -}
iaHost :: HostName
iaHost = "s3.us.archive.org"
isIAHost :: HostName -> Bool
isIAHost h = ".archive.org" `isSuffixOf` map toLower h
2014-08-09 00:29:56 +00:00
iaItemUrl :: BucketName -> URLString
2014-08-10 02:13:03 +00:00
iaItemUrl b = "http://archive.org/details/" ++ b
iaKeyUrl :: Remote -> Key -> URLString
2014-08-10 02:13:03 +00:00
iaKeyUrl r k = "http://archive.org/download/" ++ b ++ "/" ++ getBucketObject (config r) k
where
2014-08-10 02:13:03 +00:00
b = fromMaybe "" $ getBucketName $ config r