From 6fcca2f13eefa140a9bef6626b9e8285e2f2edf7 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 8 Aug 2014 18:54:04 -0400 Subject: [PATCH] WIP converting S3 special remote from hS3 to aws library Currently, initremote works, but not the other operations. They should be fairly easy to add from this base. Also, https://github.com/aristidb/aws/issues/119 blocks internet archive support. Note that since http-conduit is used, this also adds https support to S3. Although git-annex encrypts everything anyway, so that may not be extremely useful. It is not enabled by default, because existing S3 special remotes have port=80 in their config. Setting port=443 will enable it. This commit was sponsored by Daniel Brockman. --- Remote/Helper/AWS.hs | 34 ++++++- Remote/S3.hs | 186 +++++++++++++++++++++++++----------- debian/changelog | 4 + doc/special_remotes/S3.mdwn | 3 +- git-annex.cabal | 2 +- 5 files changed, 165 insertions(+), 64 deletions(-) diff --git a/Remote/Helper/AWS.hs b/Remote/Helper/AWS.hs index 0687a5ee1d..fb8af713a8 100644 --- a/Remote/Helper/AWS.hs +++ b/Remote/Helper/AWS.hs @@ -1,6 +1,6 @@ {- Amazon Web Services common infrastructure. - - - Copyright 2011,2012 Joey Hess + - Copyright 2011-2014 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -12,8 +12,14 @@ module Remote.Helper.AWS where import Common.Annex import Creds +import qualified Aws +import qualified Aws.S3 as S3 import qualified Data.Map as M +import qualified Data.ByteString as B +import qualified Data.Text as T +import Data.Text.Encoding (encodeUtf8) import Data.Text (Text) +import Data.IORef creds :: UUID -> CredPairStorage creds u = CredPairStorage @@ -22,6 +28,13 @@ creds u = CredPairStorage , credPairRemoteKey = Just "s3creds" } +genCredentials :: CredPair -> IO Aws.Credentials +genCredentials (keyid, secret) = Aws.Credentials + <$> pure (encodeUtf8 (T.pack keyid)) + <*> pure (encodeUtf8 (T.pack secret)) + <*> newIORef [] + <*> pure Nothing + data Service = S3 | Glacier deriving (Eq) @@ -33,9 +46,10 @@ regionMap = M.fromList . regionInfo defaultRegion :: Service -> Region defaultRegion = snd . Prelude.head . regionInfo -{- S3 and Glacier use different names for some regions. Ie, "us-east-1" - - cannot be used with S3, while "US" cannot be used with Glacier. Dunno why. - - Also, Glacier is not yet available in all regions. -} +data ServiceRegion = BothRegion Region | S3Region Region | GlacierRegion Region + +{- The "US" and "EU" names are used as location constraints when creating a + - S3 bucket. -} regionInfo :: Service -> [(Text, Region)] regionInfo service = map (\(t, r) -> (t, fromServiceRegion r)) $ filter (matchingService . snd) $ @@ -60,4 +74,14 @@ regionInfo service = map (\(t, r) -> (t, fromServiceRegion r)) $ matchingService (S3Region _) = service == S3 matchingService (GlacierRegion _) = service == Glacier -data ServiceRegion = BothRegion Region | S3Region Region | GlacierRegion Region +s3HostName :: Region -> B.ByteString +s3HostName "US" = "s3.amazonaws.com" +s3HostName "EU" = "s3-eu-west-1.amazonaws.com" +s3HostName r = encodeUtf8 $ T.concat ["s3-", r, ".amazonaws.com"] + +s3DefaultHost :: String +s3DefaultHost = "s3.amazonaws.com" + +mkLocationConstraint :: Region -> S3.LocationConstraint +mkLocationConstraint "US" = S3.locationUsClassic +mkLocationConstraint r = r diff --git a/Remote/S3.hs b/Remote/S3.hs index 1aba392453..2b2dc17239 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -1,15 +1,19 @@ {- S3 remotes - - - Copyright 2011-2013 Joey Hess + - Copyright 2011-2014 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} +{-# LANGUAGE TypeFamilies #-} + module Remote.S3 (remote, iaHost, isIA, isIAHost, iaItemUrl) where +import qualified Aws as AWS +import qualified Aws.Core as AWS +import qualified Aws.S3 as S3 import Network.AWS.AWSConnection import Network.AWS.S3Object hiding (getStorageClass) -import Network.AWS.S3Bucket hiding (size) import Network.AWS.AWSResult import qualified Data.Text as T import qualified Data.Text.Encoding as T @@ -17,6 +21,11 @@ import qualified Data.ByteString.Lazy as L import qualified Data.Map as M import Data.Char import Network.Socket (HostName) +import Network.HTTP.Conduit (Manager, newManager, closeManager) +import Network.HTTP.Client (defaultManagerSettings, managerResponseTimeout, responseStatus, responseBody, RequestBody(..)) +import Network.HTTP.Types +import Control.Monad.Trans.Resource +import Control.Monad.Catch import Common.Annex import Types.Remote @@ -86,8 +95,8 @@ s3Setup' u c = if isIA c then archiveorg else defaulthost defaults = M.fromList [ ("datacenter", T.unpack $ AWS.defaultRegion AWS.S3) , ("storageclass", "STANDARD") - , ("host", defaultAmazonS3Host) - , ("port", show defaultAmazonS3Port) + , ("host", AWS.s3DefaultHost) + , ("port", "80") , ("bucket", defbucket) ] @@ -119,7 +128,8 @@ s3Setup' u c = if isIA c then archiveorg else defaulthost M.insert "mungekeys" "ia" $ -- bucket created only when files are uploaded M.insert "x-amz-auto-make-bucket" "1" defaults - writeUUIDFile archiveconfig u + withS3Handle archiveconfig u $ + writeUUIDFile archiveconfig u use archiveconfig prepareStore :: Remote -> Preparer Storer @@ -135,6 +145,8 @@ prepareStore r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> store :: (AWSConnection, Bucket) -> Remote -> Key -> MeterUpdate -> FilePath -> IO (AWSResult ()) store (conn, bucket) r k p file = do + error "TODO" + {- size <- (fromIntegral . fileSize <$> getFileStatus file) :: IO Integer withMeteredFile file p $ \content -> do -- size is provided to S3 so the whole content @@ -145,12 +157,16 @@ store (conn, bucket) r k p file = do content sendObject conn $ setStorageClass (getStorageClass $ config r) object + -} prepareRetrieve :: Remote -> Preparer Retriever prepareRetrieve r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> + error "TODO" + {- byteRetriever $ \k sink -> liftIO (getObject conn $ bucketKey r bucket k) >>= either s3Error (sink . obj_data) + -} retrieveCheap :: Key -> FilePath -> Annex Bool retrieveCheap _ _ = return False @@ -172,11 +188,14 @@ remove' r k = s3Action r False $ \(conn, bucket) -> checkKey :: Remote -> CheckPresent checkKey r k = s3Action r noconn $ \(conn, bucket) -> do showAction $ "checking " ++ name r + {- res <- liftIO $ getObjectInfo conn $ bucketKey r bucket k case res of Right _ -> return True Left (AWSError _ _) -> return False Left e -> s3Error e + -} + error "TODO" where noconn = error "S3 not configured" @@ -185,9 +204,6 @@ s3Warning e = do warning $ prettyReqError e return False -s3Error :: ReqError -> a -s3Error e = error $ prettyReqError e - s3Bool :: AWSResult () -> Annex Bool s3Bool (Right _) = return True s3Bool (Left e) = s3Warning e @@ -229,76 +245,76 @@ iaMunge = (>>= munge) {- Generate the bucket if it does not already exist, including creating the - UUID file within the bucket. - - - To check if the bucket exists, ask for its location. However, some ACLs - - can allow read/write to buckets, but not querying location, so first - - check if the UUID file already exists and we can skip doing anything. + - Some ACLs can allow read/write to buckets, but not querying them, + - so first check if the UUID file already exists and we can skip doing + - anything. -} genBucket :: RemoteConfig -> UUID -> Annex () genBucket c u = do - conn <- s3ConnectionRequired c u showAction "checking bucket" - unlessM ((== Right True) <$> checkUUIDFile c u conn) $ do - loc <- liftIO $ getBucketLocation conn bucket - case loc of - Right _ -> writeUUIDFile c u - Left err@(NetworkError _) -> s3Error err - Left (AWSError _ _) -> do - showAction $ "creating bucket in " ++ datacenter - res <- liftIO $ createBucketIn conn bucket datacenter - case res of - Right _ -> writeUUIDFile c u - Left err -> s3Error err + withS3Handle c u $ \h -> + go h =<< checkUUIDFile c u h where - bucket = fromJust $ getBucket c + go _ (Right True) = noop + go h _ = do + v <- sendS3Handle h (S3.getBucket bucket) + case v of + Right _ -> noop + Left _ -> do + showAction $ "creating bucket in " ++ datacenter + void $ mustSucceed $ sendS3Handle h $ + S3.PutBucket bucket Nothing $ + AWS.mkLocationConstraint $ + T.pack datacenter + writeUUIDFile c u h + + bucket = T.pack $ fromJust $ getBucket c datacenter = fromJust $ M.lookup "datacenter" c {- Writes the UUID to an annex-uuid file within the bucket. - - If the file already exists in the bucket, it must match. - - - Note that IA items do not get created by createBucketIn. - - Rather, they are created the first time a file is stored in them. - - So this also takes care of that. + - Note that IA buckets can only created by having a file + - stored in them. So this also takes care of that. -} -writeUUIDFile :: RemoteConfig -> UUID -> Annex () -writeUUIDFile c u = do - conn <- s3ConnectionRequired c u - v <- checkUUIDFile c u conn +writeUUIDFile :: RemoteConfig -> UUID -> S3Handle -> Annex () +writeUUIDFile c u h = do + v <- checkUUIDFile c u h case v of - Left e -> error e - Right True -> return () - Right False -> do - let object = setStorageClass (getStorageClass c) (mkobject uuidb) - either s3Error return =<< liftIO (sendObject conn object) + Left e -> throwM e + Right True -> noop + Right False -> void $ mustSucceed $ sendS3Handle h mkobject where - file = uuidFile c + file = T.pack $ uuidFile c uuidb = L.fromChunks [T.encodeUtf8 $ T.pack $ fromUUID u] - bucket = fromJust $ getBucket c + bucket = T.pack $ fromJust $ getBucket c - mkobject = S3Object bucket file "" (getXheaders c) + -- TODO: add headers from getXheaders + -- (See https://github.com/aristidb/aws/issues/119) + mkobject = (S3.putObject bucket file $ RequestBodyLBS uuidb) + { S3.poStorageClass = Just (getStorageClass c) } -{- Checks if the UUID file exists in the bucket and has the specified UUID already. -} -checkUUIDFile :: RemoteConfig -> UUID -> AWSConnection -> Annex (Either String Bool) -checkUUIDFile c u conn = check <$> liftIO (tryNonAsync $ getObject conn $ mkobject L.empty) +{- Checks if the UUID file exists in the bucket + - and has the specified UUID already. -} +checkUUIDFile :: RemoteConfig -> UUID -> S3Handle -> Annex (Either SomeException Bool) +checkUUIDFile c u h = tryNonAsync $ check <$> get where - check (Right (Right o)) - | obj_data o == uuidb = Right True - | otherwise = Left $ "This bucket is already in use by a different S3 special remote, with UUID: " ++ show (obj_data o) - check _ = Right False + get = liftIO + . runResourceT + . either (pure . Left) (Right <$$> AWS.loadToMemory) + =<< sendS3Handle h (S3.getObject bucket file) + check (Right (S3.GetObjectMemoryResponse _meta rsp)) = + responseStatus rsp == ok200 && responseBody rsp == uuidb + check (Left _S3Error) = False + bucket = T.pack $ fromJust $ getBucket c + file = T.pack $ uuidFile c uuidb = L.fromChunks [T.encodeUtf8 $ T.pack $ fromUUID u] - bucket = fromJust $ getBucket c - file = uuidFile c - - mkobject = S3Object bucket file "" (getXheaders c) uuidFile :: RemoteConfig -> FilePath uuidFile c = filePrefix c ++ "annex-uuid" -s3ConnectionRequired :: RemoteConfig -> UUID -> Annex AWSConnection -s3ConnectionRequired c u = - maybe (error "Cannot connect to S3") return =<< s3Connection c u - s3Connection :: RemoteConfig -> UUID -> Annex (Maybe AWSConnection) s3Connection c u = go =<< getRemoteCredPairFor "S3" c (AWS.creds u) where @@ -311,13 +327,69 @@ s3Connection c u = go =<< getRemoteCredPairFor "S3" c (AWS.creds u) [(p, _)] -> p _ -> error $ "bad S3 port value: " ++ s +data S3Handle = S3Handle Manager AWS.Configuration (S3.S3Configuration AWS.NormalQuery) + +{- Sends a request to S3 and gets back the response. + - + - Note that pureAws's use of ResourceT is bypassed here; + - the response should be processed while the S3Handle is still open, + - eg within a call to withS3Handle. + -} +sendS3Handle + :: (AWS.Transaction req res, AWS.ServiceConfiguration req ~ S3.S3Configuration) + => S3Handle + -> req + -> Annex (Either S3.S3Error res) +sendS3Handle (S3Handle manager awscfg s3cfg) req = safely $ liftIO $ + runResourceT $ AWS.pureAws awscfg s3cfg manager req + where + safely a = (Right <$> a) `catch` (pure . Left) + +mustSucceed :: Annex (Either S3.S3Error res) -> Annex res +mustSucceed a = a >>= either s3Error return + +s3Error :: S3.S3Error -> a +s3Error (S3.S3Error { S3.s3ErrorMessage = m }) = error $ "S3 error: " ++ T.unpack m + +withS3Handle :: RemoteConfig -> UUID -> (S3Handle -> Annex a) -> Annex a +withS3Handle c u a = do + creds <- getRemoteCredPairFor "S3" c (AWS.creds u) + awscreds <- liftIO $ AWS.genCredentials $ fromMaybe nocreds creds + let awscfg = AWS.Configuration AWS.Timestamp awscreds (AWS.defaultLog AWS.Error) + bracketIO (newManager httpcfg) closeManager $ \mgr -> + a $ S3Handle mgr awscfg s3cfg + where + s3cfg = s3Configuration c + httpcfg = defaultManagerSettings + { managerResponseTimeout = Nothing } + nocreds = error "Cannot use S3 without credentials configured" + +s3Configuration :: RemoteConfig -> S3.S3Configuration AWS.NormalQuery +s3Configuration c = (S3.s3 proto endpoint False) { S3.s3Port = port } + where + proto + | port == 443 = AWS.HTTPS + | otherwise = AWS.HTTP + host = fromJust $ M.lookup "host" c + datacenter = fromJust $ M.lookup "datacenter" c + -- When the default S3 host is configured, connect directly to + -- the S3 endpoint for the configured datacenter. + -- When another host is configured, it's used as-is. + endpoint + | host == AWS.s3DefaultHost = AWS.s3HostName $ T.pack datacenter + | otherwise = T.encodeUtf8 $ T.pack host + port = let s = fromJust $ M.lookup "port" c in + case reads s of + [(p, _)] -> p + _ -> error $ "bad S3 port value: " ++ s + getBucket :: RemoteConfig -> Maybe Bucket getBucket = M.lookup "bucket" -getStorageClass :: RemoteConfig -> StorageClass +getStorageClass :: RemoteConfig -> S3.StorageClass getStorageClass c = case fromJust $ M.lookup "storageclass" c of - "REDUCED_REDUNDANCY" -> REDUCED_REDUNDANCY - _ -> STANDARD + "REDUCED_REDUNDANCY" -> S3.ReducedRedundancy + _ -> S3.Standard getXheaders :: RemoteConfig -> [(String, String)] getXheaders = filter isxheader . M.assocs diff --git a/debian/changelog b/debian/changelog index 3a8ab302e1..cfc47906d8 100644 --- a/debian/changelog +++ b/debian/changelog @@ -21,6 +21,10 @@ git-annex (5.20140718) UNRELEASED; urgency=medium * WebDAV: Avoid buffering whole file in memory when uploading and downloading. * WebDAV: Dropped support for DAV before 1.0. + * S3: Switched to using the haskell aws library. + * S3: Now supports https. To enable this, configure a S3 special remote to + use port=443. However, with encrypted special remotes, this does not + add any security. * testremote: New command to test uploads/downloads to a remote. * Dropping an object from a bup special remote now deletes the git branch for the object, although of course the object's content cannot be deleted diff --git a/doc/special_remotes/S3.mdwn b/doc/special_remotes/S3.mdwn index fe46948b3b..a8af932983 100644 --- a/doc/special_remotes/S3.mdwn +++ b/doc/special_remotes/S3.mdwn @@ -33,7 +33,8 @@ the S3 remote. embedcreds without gpg encryption. * `datacenter` - Defaults to "US". Other values include "EU", - "us-west-1", and "ap-southeast-1". + "us-west-1", "us-west-2", "ap-southeast-1", "ap-southeast-2", and + "sa-east-1". * `storageclass` - Default is "STANDARD". If you have configured git-annex to preserve multiple [[copies]], consider setting this to "REDUCED_REDUNDANCY" diff --git a/git-annex.cabal b/git-annex.cabal index 5154b27dd8..70bd9c88b1 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -137,7 +137,7 @@ Executable git-annex CPP-Options: -DWITH_CRYPTOHASH if flag(S3) - Build-Depends: hS3 + Build-Depends: hS3, http-conduit, http-client, resourcet, http-types, aws CPP-Options: -DWITH_S3 if flag(WebDAV)