From 1002c9448da339b9eec52f04a9857b464c48157a Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sun, 3 Aug 2014 14:49:37 -0400 Subject: [PATCH 01/51] playing with aws library This is basically all I need to convert git-annex from hS3 to aws. In particular, both GetObject and PutObject stream content w/o buffering it in memory at all. \o/ --- GetObject.hs | 24 ++++++++++++++++++++++++ PutObject.hs | 33 +++++++++++++++++++++++++++++++++ RemoveObject.hs | 19 +++++++++++++++++++ 3 files changed, 76 insertions(+) create mode 100644 GetObject.hs create mode 100644 PutObject.hs create mode 100644 RemoveObject.hs diff --git a/GetObject.hs b/GetObject.hs new file mode 100644 index 0000000000..51764bf726 --- /dev/null +++ b/GetObject.hs @@ -0,0 +1,24 @@ +{-# LANGUAGE OverloadedStrings #-} + +import qualified Aws +import qualified Aws.S3 as S3 +import Data.Conduit (($$+-)) +import Data.Conduit.Binary (sinkFile) +import Network.HTTP.Conduit (withManager, responseBody) + +main :: IO () +main = do + {- Set up AWS credentials and the default configuration. -} + Just creds <- Aws.loadCredentialsFromEnv + let cfg = Aws.Configuration Aws.Timestamp creds (Aws.defaultLog Aws.Debug) + let s3cfg = Aws.defServiceConfig :: S3.S3Configuration Aws.NormalQuery + + {- Set up a ResourceT region with an available HTTP manager. -} + withManager $ \mgr -> do + {- Create a request object with S3.getObject and run the request with pureAws. -} + S3.GetObjectResponse { S3.gorResponse = rsp } <- + Aws.pureAws cfg s3cfg mgr $ + S3.getObject "joeyh-test" "cloud-remote.pdf" + + {- Save the response to a file. -} + responseBody rsp $$+- sinkFile "cloud-remote2.pdf" diff --git a/PutObject.hs b/PutObject.hs new file mode 100644 index 0000000000..68db1685af --- /dev/null +++ b/PutObject.hs @@ -0,0 +1,33 @@ +{-# LANGUAGE OverloadedStrings #-} + +import qualified Aws +import qualified Aws.S3 as S3 +import Data.Conduit (($$+-)) +import Data.Conduit.Binary (sinkFile) +import Network.HTTP.Conduit (withManager, RequestBody(..)) +import qualified Data.ByteString.Lazy as L +import qualified Data.ByteString as S +import Control.Monad.IO.Class +import Control.Concurrent +import System.Posix.Files +import System.IO +import Control.Applicative +import qualified Data.Text as T + +main :: IO () +main = do + {- Set up AWS credentials and the default configuration. -} + Just creds <- Aws.loadCredentialsFromEnv + let cfg = Aws.Configuration Aws.Timestamp creds (Aws.defaultLog Aws.Debug) + let s3cfg = Aws.defServiceConfig :: S3.S3Configuration Aws.NormalQuery + + {- Set up a ResourceT region with an available HTTP manager. -} + withManager $ \mgr -> do + let file ="cloud-remote.pdf" + -- streams file content, without buffering more than 1k in memory! + let streamer sink = withFile file ReadMode $ \h -> sink $ S.hGet h 1024 + b <- liftIO $ L.readFile file + size <- liftIO $ (fromIntegral . 
fileSize <$> getFileStatus file :: IO Integer) + rsp <- Aws.pureAws cfg s3cfg mgr $ + S3.putObject "joeyh-test" (T.pack file) (RequestBodyStream (fromInteger size) streamer) + liftIO $ print rsp diff --git a/RemoveObject.hs b/RemoveObject.hs new file mode 100644 index 0000000000..7d61907cde --- /dev/null +++ b/RemoveObject.hs @@ -0,0 +1,19 @@ +{-# LANGUAGE OverloadedStrings #-} + +import qualified Aws +import qualified Aws.S3 as S3 +import Data.Conduit (($$+-)) +import Data.Conduit.Binary (sinkFile) +import Network.HTTP.Conduit (withManager, responseBody) +import Control.Monad.IO.Class + +main :: IO () +main = do + Just creds <- Aws.loadCredentialsFromEnv + let cfg = Aws.Configuration Aws.Timestamp creds (Aws.defaultLog Aws.Debug) + let s3cfg = Aws.defServiceConfig :: S3.S3Configuration Aws.NormalQuery + + withManager $ \mgr -> do + rsp <- Aws.pureAws cfg s3cfg mgr $ + S3.DeleteObject "cloud-remote.pdf" "joeyh-test" + liftIO $ print "removal done" From 6fcca2f13eefa140a9bef6626b9e8285e2f2edf7 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 8 Aug 2014 18:54:04 -0400 Subject: [PATCH 02/51] WIP converting S3 special remote from hS3 to aws library Currently, initremote works, but not the other operations. They should be fairly easy to add from this base. Also, https://github.com/aristidb/aws/issues/119 blocks internet archive support. Note that since http-conduit is used, this also adds https support to S3. Although git-annex encrypts everything anyway, so that may not be extremely useful. It is not enabled by default, because existing S3 special remotes have port=80 in their config. Setting port=443 will enable it. This commit was sponsored by Daniel Brockman. --- Remote/Helper/AWS.hs | 34 ++++++- Remote/S3.hs | 186 +++++++++++++++++++++++++----------- debian/changelog | 4 + doc/special_remotes/S3.mdwn | 3 +- git-annex.cabal | 2 +- 5 files changed, 165 insertions(+), 64 deletions(-) diff --git a/Remote/Helper/AWS.hs b/Remote/Helper/AWS.hs index 0687a5ee1d..fb8af713a8 100644 --- a/Remote/Helper/AWS.hs +++ b/Remote/Helper/AWS.hs @@ -1,6 +1,6 @@ {- Amazon Web Services common infrastructure. - - - Copyright 2011,2012 Joey Hess + - Copyright 2011-2014 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -12,8 +12,14 @@ module Remote.Helper.AWS where import Common.Annex import Creds +import qualified Aws +import qualified Aws.S3 as S3 import qualified Data.Map as M +import qualified Data.ByteString as B +import qualified Data.Text as T +import Data.Text.Encoding (encodeUtf8) import Data.Text (Text) +import Data.IORef creds :: UUID -> CredPairStorage creds u = CredPairStorage @@ -22,6 +28,13 @@ creds u = CredPairStorage , credPairRemoteKey = Just "s3creds" } +genCredentials :: CredPair -> IO Aws.Credentials +genCredentials (keyid, secret) = Aws.Credentials + <$> pure (encodeUtf8 (T.pack keyid)) + <*> pure (encodeUtf8 (T.pack secret)) + <*> newIORef [] + <*> pure Nothing + data Service = S3 | Glacier deriving (Eq) @@ -33,9 +46,10 @@ regionMap = M.fromList . regionInfo defaultRegion :: Service -> Region defaultRegion = snd . Prelude.head . regionInfo -{- S3 and Glacier use different names for some regions. Ie, "us-east-1" - - cannot be used with S3, while "US" cannot be used with Glacier. Dunno why. - - Also, Glacier is not yet available in all regions. -} +data ServiceRegion = BothRegion Region | S3Region Region | GlacierRegion Region + +{- The "US" and "EU" names are used as location constraints when creating a + - S3 bucket. 
-} regionInfo :: Service -> [(Text, Region)] regionInfo service = map (\(t, r) -> (t, fromServiceRegion r)) $ filter (matchingService . snd) $ @@ -60,4 +74,14 @@ regionInfo service = map (\(t, r) -> (t, fromServiceRegion r)) $ matchingService (S3Region _) = service == S3 matchingService (GlacierRegion _) = service == Glacier -data ServiceRegion = BothRegion Region | S3Region Region | GlacierRegion Region +s3HostName :: Region -> B.ByteString +s3HostName "US" = "s3.amazonaws.com" +s3HostName "EU" = "s3-eu-west-1.amazonaws.com" +s3HostName r = encodeUtf8 $ T.concat ["s3-", r, ".amazonaws.com"] + +s3DefaultHost :: String +s3DefaultHost = "s3.amazonaws.com" + +mkLocationConstraint :: Region -> S3.LocationConstraint +mkLocationConstraint "US" = S3.locationUsClassic +mkLocationConstraint r = r diff --git a/Remote/S3.hs b/Remote/S3.hs index 1aba392453..2b2dc17239 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -1,15 +1,19 @@ {- S3 remotes - - - Copyright 2011-2013 Joey Hess + - Copyright 2011-2014 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} +{-# LANGUAGE TypeFamilies #-} + module Remote.S3 (remote, iaHost, isIA, isIAHost, iaItemUrl) where +import qualified Aws as AWS +import qualified Aws.Core as AWS +import qualified Aws.S3 as S3 import Network.AWS.AWSConnection import Network.AWS.S3Object hiding (getStorageClass) -import Network.AWS.S3Bucket hiding (size) import Network.AWS.AWSResult import qualified Data.Text as T import qualified Data.Text.Encoding as T @@ -17,6 +21,11 @@ import qualified Data.ByteString.Lazy as L import qualified Data.Map as M import Data.Char import Network.Socket (HostName) +import Network.HTTP.Conduit (Manager, newManager, closeManager) +import Network.HTTP.Client (defaultManagerSettings, managerResponseTimeout, responseStatus, responseBody, RequestBody(..)) +import Network.HTTP.Types +import Control.Monad.Trans.Resource +import Control.Monad.Catch import Common.Annex import Types.Remote @@ -86,8 +95,8 @@ s3Setup' u c = if isIA c then archiveorg else defaulthost defaults = M.fromList [ ("datacenter", T.unpack $ AWS.defaultRegion AWS.S3) , ("storageclass", "STANDARD") - , ("host", defaultAmazonS3Host) - , ("port", show defaultAmazonS3Port) + , ("host", AWS.s3DefaultHost) + , ("port", "80") , ("bucket", defbucket) ] @@ -119,7 +128,8 @@ s3Setup' u c = if isIA c then archiveorg else defaulthost M.insert "mungekeys" "ia" $ -- bucket created only when files are uploaded M.insert "x-amz-auto-make-bucket" "1" defaults - writeUUIDFile archiveconfig u + withS3Handle archiveconfig u $ + writeUUIDFile archiveconfig u use archiveconfig prepareStore :: Remote -> Preparer Storer @@ -135,6 +145,8 @@ prepareStore r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> store :: (AWSConnection, Bucket) -> Remote -> Key -> MeterUpdate -> FilePath -> IO (AWSResult ()) store (conn, bucket) r k p file = do + error "TODO" + {- size <- (fromIntegral . fileSize <$> getFileStatus file) :: IO Integer withMeteredFile file p $ \content -> do -- size is provided to S3 so the whole content @@ -145,12 +157,16 @@ store (conn, bucket) r k p file = do content sendObject conn $ setStorageClass (getStorageClass $ config r) object + -} prepareRetrieve :: Remote -> Preparer Retriever prepareRetrieve r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> + error "TODO" + {- byteRetriever $ \k sink -> liftIO (getObject conn $ bucketKey r bucket k) >>= either s3Error (sink . 
obj_data) + -} retrieveCheap :: Key -> FilePath -> Annex Bool retrieveCheap _ _ = return False @@ -172,11 +188,14 @@ remove' r k = s3Action r False $ \(conn, bucket) -> checkKey :: Remote -> CheckPresent checkKey r k = s3Action r noconn $ \(conn, bucket) -> do showAction $ "checking " ++ name r + {- res <- liftIO $ getObjectInfo conn $ bucketKey r bucket k case res of Right _ -> return True Left (AWSError _ _) -> return False Left e -> s3Error e + -} + error "TODO" where noconn = error "S3 not configured" @@ -185,9 +204,6 @@ s3Warning e = do warning $ prettyReqError e return False -s3Error :: ReqError -> a -s3Error e = error $ prettyReqError e - s3Bool :: AWSResult () -> Annex Bool s3Bool (Right _) = return True s3Bool (Left e) = s3Warning e @@ -229,76 +245,76 @@ iaMunge = (>>= munge) {- Generate the bucket if it does not already exist, including creating the - UUID file within the bucket. - - - To check if the bucket exists, ask for its location. However, some ACLs - - can allow read/write to buckets, but not querying location, so first - - check if the UUID file already exists and we can skip doing anything. + - Some ACLs can allow read/write to buckets, but not querying them, + - so first check if the UUID file already exists and we can skip doing + - anything. -} genBucket :: RemoteConfig -> UUID -> Annex () genBucket c u = do - conn <- s3ConnectionRequired c u showAction "checking bucket" - unlessM ((== Right True) <$> checkUUIDFile c u conn) $ do - loc <- liftIO $ getBucketLocation conn bucket - case loc of - Right _ -> writeUUIDFile c u - Left err@(NetworkError _) -> s3Error err - Left (AWSError _ _) -> do - showAction $ "creating bucket in " ++ datacenter - res <- liftIO $ createBucketIn conn bucket datacenter - case res of - Right _ -> writeUUIDFile c u - Left err -> s3Error err + withS3Handle c u $ \h -> + go h =<< checkUUIDFile c u h where - bucket = fromJust $ getBucket c + go _ (Right True) = noop + go h _ = do + v <- sendS3Handle h (S3.getBucket bucket) + case v of + Right _ -> noop + Left _ -> do + showAction $ "creating bucket in " ++ datacenter + void $ mustSucceed $ sendS3Handle h $ + S3.PutBucket bucket Nothing $ + AWS.mkLocationConstraint $ + T.pack datacenter + writeUUIDFile c u h + + bucket = T.pack $ fromJust $ getBucket c datacenter = fromJust $ M.lookup "datacenter" c {- Writes the UUID to an annex-uuid file within the bucket. - - If the file already exists in the bucket, it must match. - - - Note that IA items do not get created by createBucketIn. - - Rather, they are created the first time a file is stored in them. - - So this also takes care of that. + - Note that IA buckets can only created by having a file + - stored in them. So this also takes care of that. 
-} -writeUUIDFile :: RemoteConfig -> UUID -> Annex () -writeUUIDFile c u = do - conn <- s3ConnectionRequired c u - v <- checkUUIDFile c u conn +writeUUIDFile :: RemoteConfig -> UUID -> S3Handle -> Annex () +writeUUIDFile c u h = do + v <- checkUUIDFile c u h case v of - Left e -> error e - Right True -> return () - Right False -> do - let object = setStorageClass (getStorageClass c) (mkobject uuidb) - either s3Error return =<< liftIO (sendObject conn object) + Left e -> throwM e + Right True -> noop + Right False -> void $ mustSucceed $ sendS3Handle h mkobject where - file = uuidFile c + file = T.pack $ uuidFile c uuidb = L.fromChunks [T.encodeUtf8 $ T.pack $ fromUUID u] - bucket = fromJust $ getBucket c + bucket = T.pack $ fromJust $ getBucket c - mkobject = S3Object bucket file "" (getXheaders c) + -- TODO: add headers from getXheaders + -- (See https://github.com/aristidb/aws/issues/119) + mkobject = (S3.putObject bucket file $ RequestBodyLBS uuidb) + { S3.poStorageClass = Just (getStorageClass c) } -{- Checks if the UUID file exists in the bucket and has the specified UUID already. -} -checkUUIDFile :: RemoteConfig -> UUID -> AWSConnection -> Annex (Either String Bool) -checkUUIDFile c u conn = check <$> liftIO (tryNonAsync $ getObject conn $ mkobject L.empty) +{- Checks if the UUID file exists in the bucket + - and has the specified UUID already. -} +checkUUIDFile :: RemoteConfig -> UUID -> S3Handle -> Annex (Either SomeException Bool) +checkUUIDFile c u h = tryNonAsync $ check <$> get where - check (Right (Right o)) - | obj_data o == uuidb = Right True - | otherwise = Left $ "This bucket is already in use by a different S3 special remote, with UUID: " ++ show (obj_data o) - check _ = Right False + get = liftIO + . runResourceT + . either (pure . Left) (Right <$$> AWS.loadToMemory) + =<< sendS3Handle h (S3.getObject bucket file) + check (Right (S3.GetObjectMemoryResponse _meta rsp)) = + responseStatus rsp == ok200 && responseBody rsp == uuidb + check (Left _S3Error) = False + bucket = T.pack $ fromJust $ getBucket c + file = T.pack $ uuidFile c uuidb = L.fromChunks [T.encodeUtf8 $ T.pack $ fromUUID u] - bucket = fromJust $ getBucket c - file = uuidFile c - - mkobject = S3Object bucket file "" (getXheaders c) uuidFile :: RemoteConfig -> FilePath uuidFile c = filePrefix c ++ "annex-uuid" -s3ConnectionRequired :: RemoteConfig -> UUID -> Annex AWSConnection -s3ConnectionRequired c u = - maybe (error "Cannot connect to S3") return =<< s3Connection c u - s3Connection :: RemoteConfig -> UUID -> Annex (Maybe AWSConnection) s3Connection c u = go =<< getRemoteCredPairFor "S3" c (AWS.creds u) where @@ -311,13 +327,69 @@ s3Connection c u = go =<< getRemoteCredPairFor "S3" c (AWS.creds u) [(p, _)] -> p _ -> error $ "bad S3 port value: " ++ s +data S3Handle = S3Handle Manager AWS.Configuration (S3.S3Configuration AWS.NormalQuery) + +{- Sends a request to S3 and gets back the response. + - + - Note that pureAws's use of ResourceT is bypassed here; + - the response should be processed while the S3Handle is still open, + - eg within a call to withS3Handle. + -} +sendS3Handle + :: (AWS.Transaction req res, AWS.ServiceConfiguration req ~ S3.S3Configuration) + => S3Handle + -> req + -> Annex (Either S3.S3Error res) +sendS3Handle (S3Handle manager awscfg s3cfg) req = safely $ liftIO $ + runResourceT $ AWS.pureAws awscfg s3cfg manager req + where + safely a = (Right <$> a) `catch` (pure . 
Left) + +mustSucceed :: Annex (Either S3.S3Error res) -> Annex res +mustSucceed a = a >>= either s3Error return + +s3Error :: S3.S3Error -> a +s3Error (S3.S3Error { S3.s3ErrorMessage = m }) = error $ "S3 error: " ++ T.unpack m + +withS3Handle :: RemoteConfig -> UUID -> (S3Handle -> Annex a) -> Annex a +withS3Handle c u a = do + creds <- getRemoteCredPairFor "S3" c (AWS.creds u) + awscreds <- liftIO $ AWS.genCredentials $ fromMaybe nocreds creds + let awscfg = AWS.Configuration AWS.Timestamp awscreds (AWS.defaultLog AWS.Error) + bracketIO (newManager httpcfg) closeManager $ \mgr -> + a $ S3Handle mgr awscfg s3cfg + where + s3cfg = s3Configuration c + httpcfg = defaultManagerSettings + { managerResponseTimeout = Nothing } + nocreds = error "Cannot use S3 without credentials configured" + +s3Configuration :: RemoteConfig -> S3.S3Configuration AWS.NormalQuery +s3Configuration c = (S3.s3 proto endpoint False) { S3.s3Port = port } + where + proto + | port == 443 = AWS.HTTPS + | otherwise = AWS.HTTP + host = fromJust $ M.lookup "host" c + datacenter = fromJust $ M.lookup "datacenter" c + -- When the default S3 host is configured, connect directly to + -- the S3 endpoint for the configured datacenter. + -- When another host is configured, it's used as-is. + endpoint + | host == AWS.s3DefaultHost = AWS.s3HostName $ T.pack datacenter + | otherwise = T.encodeUtf8 $ T.pack host + port = let s = fromJust $ M.lookup "port" c in + case reads s of + [(p, _)] -> p + _ -> error $ "bad S3 port value: " ++ s + getBucket :: RemoteConfig -> Maybe Bucket getBucket = M.lookup "bucket" -getStorageClass :: RemoteConfig -> StorageClass +getStorageClass :: RemoteConfig -> S3.StorageClass getStorageClass c = case fromJust $ M.lookup "storageclass" c of - "REDUCED_REDUNDANCY" -> REDUCED_REDUNDANCY - _ -> STANDARD + "REDUCED_REDUNDANCY" -> S3.ReducedRedundancy + _ -> S3.Standard getXheaders :: RemoteConfig -> [(String, String)] getXheaders = filter isxheader . M.assocs diff --git a/debian/changelog b/debian/changelog index 3a8ab302e1..cfc47906d8 100644 --- a/debian/changelog +++ b/debian/changelog @@ -21,6 +21,10 @@ git-annex (5.20140718) UNRELEASED; urgency=medium * WebDAV: Avoid buffering whole file in memory when uploading and downloading. * WebDAV: Dropped support for DAV before 1.0. + * S3: Switched to using the haskell aws library. + * S3: Now supports https. To enable this, configure a S3 special remote to + use port=443. However, with encrypted special remotes, this does not + add any security. * testremote: New command to test uploads/downloads to a remote. * Dropping an object from a bup special remote now deletes the git branch for the object, although of course the object's content cannot be deleted diff --git a/doc/special_remotes/S3.mdwn b/doc/special_remotes/S3.mdwn index fe46948b3b..a8af932983 100644 --- a/doc/special_remotes/S3.mdwn +++ b/doc/special_remotes/S3.mdwn @@ -33,7 +33,8 @@ the S3 remote. embedcreds without gpg encryption. * `datacenter` - Defaults to "US". Other values include "EU", - "us-west-1", and "ap-southeast-1". + "us-west-1", "us-west-2", "ap-southeast-1", "ap-southeast-2", and + "sa-east-1". * `storageclass` - Default is "STANDARD". 
If you have configured git-annex to preserve multiple [[copies]], consider setting this to "REDUCED_REDUNDANCY" diff --git a/git-annex.cabal b/git-annex.cabal index 5154b27dd8..70bd9c88b1 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -137,7 +137,7 @@ Executable git-annex CPP-Options: -DWITH_CRYPTOHASH if flag(S3) - Build-Depends: hS3 + Build-Depends: hS3, http-conduit, http-client, resourcet, http-types, aws CPP-Options: -DWITH_S3 if flag(WebDAV) From cf82b0e1ec5674bc3a9c2faae6f00a1bd10e480c Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 8 Aug 2014 20:29:56 -0400 Subject: [PATCH 03/51] cleanup --- Remote/S3.hs | 91 ++++++++++++++++++++++++++-------------------------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 2b2dc17239..b9f03020e5 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -40,7 +40,7 @@ import Utility.Metered import Annex.UUID import Logs.Web -type Bucket = String +type BucketName = String remote :: RemoteType remote = RemoteType { @@ -116,7 +116,7 @@ s3Setup' u c = if isIA c then archiveorg else defaulthost -- this determines the name of the archive.org item. let bucket = replace " " "-" $ map toLower $ fromMaybe (error "specify bucket=") $ - getBucket c + getBucketName c let archiveconfig = -- hS3 does not pass through x-archive-* headers M.mapKeys (replace "x-archive-" "x-amz-") $ @@ -143,7 +143,7 @@ prepareStore r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> return ok -store :: (AWSConnection, Bucket) -> Remote -> Key -> MeterUpdate -> FilePath -> IO (AWSResult ()) +store :: (AWSConnection, BucketName) -> Remote -> Key -> MeterUpdate -> FilePath -> IO (AWSResult ()) store (conn, bucket) r k p file = do error "TODO" {- @@ -208,7 +208,7 @@ s3Bool :: AWSResult () -> Annex Bool s3Bool (Right _) = return True s3Bool (Left e) = s3Warning e -s3Action :: Remote -> a -> ((AWSConnection, Bucket) -> Annex a) -> Annex a +s3Action :: Remote -> a -> ((AWSConnection, BucketName) -> Annex a) -> Annex a s3Action r noconn action = do let bucket = M.lookup "bucket" $ config r conn <- s3Connection (config r) (uuid r) @@ -220,28 +220,13 @@ bucketFile :: Remote -> Key -> FilePath bucketFile r = munge . key2file where munge s = case M.lookup "mungekeys" c of - Just "ia" -> iaMunge $ filePrefix c ++ s - _ -> filePrefix c ++ s + Just "ia" -> iaMunge $ getFilePrefix c ++ s + _ -> getFilePrefix c ++ s c = config r -filePrefix :: RemoteConfig -> String -filePrefix = M.findWithDefault "" "fileprefix" - -bucketKey :: Remote -> Bucket -> Key -> S3Object +bucketKey :: Remote -> BucketName -> Key -> S3Object bucketKey r bucket k = S3Object bucket (bucketFile r k) "" [] L.empty -{- Internet Archive limits filenames to a subset of ascii, - - with no whitespace. Other characters are xml entity - - encoded. -} -iaMunge :: String -> String -iaMunge = (>>= munge) - where - munge c - | isAsciiUpper c || isAsciiLower c || isNumber c = [c] - | c `elem` "_-.\"" = [c] - | isSpace c = [] - | otherwise = "&" ++ show (ord c) ++ ";" - {- Generate the bucket if it does not already exist, including creating the - UUID file within the bucket. 
- @@ -257,18 +242,18 @@ genBucket c u = do where go _ (Right True) = noop go h _ = do - v <- sendS3Handle h (S3.getBucket bucket) + v <- tryS3 $ sendS3Handle h (S3.getBucket bucket) case v of Right _ -> noop Left _ -> do showAction $ "creating bucket in " ++ datacenter - void $ mustSucceed $ sendS3Handle h $ + void $ sendS3Handle h $ S3.PutBucket bucket Nothing $ AWS.mkLocationConstraint $ T.pack datacenter writeUUIDFile c u h - bucket = T.pack $ fromJust $ getBucket c + bucket = T.pack $ fromJust $ getBucketName c datacenter = fromJust $ M.lookup "datacenter" c {- Writes the UUID to an annex-uuid file within the bucket. @@ -284,11 +269,11 @@ writeUUIDFile c u h = do case v of Left e -> throwM e Right True -> noop - Right False -> void $ mustSucceed $ sendS3Handle h mkobject + Right False -> void $ sendS3Handle h mkobject where file = T.pack $ uuidFile c uuidb = L.fromChunks [T.encodeUtf8 $ T.pack $ fromUUID u] - bucket = T.pack $ fromJust $ getBucket c + bucket = T.pack $ fromJust $ getBucketName c -- TODO: add headers from getXheaders -- (See https://github.com/aristidb/aws/issues/119) @@ -303,17 +288,17 @@ checkUUIDFile c u h = tryNonAsync $ check <$> get get = liftIO . runResourceT . either (pure . Left) (Right <$$> AWS.loadToMemory) - =<< sendS3Handle h (S3.getObject bucket file) + =<< tryS3 (sendS3Handle h (S3.getObject bucket file)) check (Right (S3.GetObjectMemoryResponse _meta rsp)) = responseStatus rsp == ok200 && responseBody rsp == uuidb check (Left _S3Error) = False - bucket = T.pack $ fromJust $ getBucket c + bucket = T.pack $ fromJust $ getBucketName c file = T.pack $ uuidFile c uuidb = L.fromChunks [T.encodeUtf8 $ T.pack $ fromUUID u] uuidFile :: RemoteConfig -> FilePath -uuidFile c = filePrefix c ++ "annex-uuid" +uuidFile c = getFilePrefix c ++ "annex-uuid" s3Connection :: RemoteConfig -> UUID -> Annex (Maybe AWSConnection) s3Connection c u = go =<< getRemoteCredPairFor "S3" c (AWS.creds u) @@ -332,24 +317,16 @@ data S3Handle = S3Handle Manager AWS.Configuration (S3.S3Configuration AWS.Norma {- Sends a request to S3 and gets back the response. - - Note that pureAws's use of ResourceT is bypassed here; - - the response should be processed while the S3Handle is still open, - - eg within a call to withS3Handle. + - the response should be fully processed while the S3Handle + - is still open, eg within a call to withS3Handle. -} sendS3Handle :: (AWS.Transaction req res, AWS.ServiceConfiguration req ~ S3.S3Configuration) => S3Handle -> req - -> Annex (Either S3.S3Error res) -sendS3Handle (S3Handle manager awscfg s3cfg) req = safely $ liftIO $ + -> Annex res +sendS3Handle (S3Handle manager awscfg s3cfg) req = liftIO $ runResourceT $ AWS.pureAws awscfg s3cfg manager req - where - safely a = (Right <$> a) `catch` (pure . Left) - -mustSucceed :: Annex (Either S3.S3Error res) -> Annex res -mustSucceed a = a >>= either s3Error return - -s3Error :: S3.S3Error -> a -s3Error (S3.S3Error { S3.s3ErrorMessage = m }) = error $ "S3 error: " ++ T.unpack m withS3Handle :: RemoteConfig -> UUID -> (S3Handle -> Annex a) -> Annex a withS3Handle c u a = do @@ -383,8 +360,15 @@ s3Configuration c = (S3.s3 proto endpoint False) { S3.s3Port = port } [(p, _)] -> p _ -> error $ "bad S3 port value: " ++ s -getBucket :: RemoteConfig -> Maybe Bucket -getBucket = M.lookup "bucket" +tryS3 :: Annex a -> Annex (Either S3.S3Error a) +tryS3 a = (Right <$> a) `catch` (pure . 
Left) + +s3Error :: S3.S3Error -> a +s3Error (S3.S3Error { S3.s3ErrorMessage = m }) = + error $ "S3 error: " ++ T.unpack m + +getBucketName :: RemoteConfig -> Maybe BucketName +getBucketName = M.lookup "bucket" getStorageClass :: RemoteConfig -> S3.StorageClass getStorageClass c = case fromJust $ M.lookup "storageclass" c of @@ -396,6 +380,21 @@ getXheaders = filter isxheader . M.assocs where isxheader (h, _) = "x-amz-" `isPrefixOf` h +getFilePrefix :: RemoteConfig -> String +getFilePrefix = M.findWithDefault "" "fileprefix" + +{- Internet Archive limits filenames to a subset of ascii, + - with no whitespace. Other characters are xml entity + - encoded. -} +iaMunge :: String -> String +iaMunge = (>>= munge) + where + munge c + | isAsciiUpper c || isAsciiLower c || isNumber c = [c] + | c `elem` "_-.\"" = [c] + | isSpace c = [] + | otherwise = "&" ++ show (ord c) ++ ";" + {- Hostname to use for archive.org S3. -} iaHost :: HostName iaHost = "s3.us.archive.org" @@ -406,10 +405,10 @@ isIA c = maybe False isIAHost (M.lookup "host" c) isIAHost :: HostName -> Bool isIAHost h = ".archive.org" `isSuffixOf` map toLower h -iaItemUrl :: Bucket -> URLString +iaItemUrl :: BucketName -> URLString iaItemUrl bucket = "http://archive.org/details/" ++ bucket iaKeyUrl :: Remote -> Key -> URLString iaKeyUrl r k = "http://archive.org/download/" ++ bucket ++ "/" ++ bucketFile r k where - bucket = fromMaybe "" $ getBucket $ config r + bucket = fromMaybe "" $ getBucketName $ config r From ccfb433ab3aee7c5dd7a6643e290845155448f13 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 8 Aug 2014 20:51:22 -0400 Subject: [PATCH 04/51] cleanup --- Remote/S3.hs | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index b9f03020e5..4bc341e412 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -242,18 +242,17 @@ genBucket c u = do where go _ (Right True) = noop go h _ = do - v <- tryS3 $ sendS3Handle h (S3.getBucket bucket) + v <- tryS3 $ sendS3Handle h (S3.getBucket $ hBucket h) case v of Right _ -> noop Left _ -> do showAction $ "creating bucket in " ++ datacenter void $ sendS3Handle h $ - S3.PutBucket bucket Nothing $ + S3.PutBucket (hBucket h) Nothing $ AWS.mkLocationConstraint $ T.pack datacenter writeUUIDFile c u h - bucket = T.pack $ fromJust $ getBucketName c datacenter = fromJust $ M.lookup "datacenter" c {- Writes the UUID to an annex-uuid file within the bucket. @@ -273,12 +272,11 @@ writeUUIDFile c u h = do where file = T.pack $ uuidFile c uuidb = L.fromChunks [T.encodeUtf8 $ T.pack $ fromUUID u] - bucket = T.pack $ fromJust $ getBucketName c -- TODO: add headers from getXheaders -- (See https://github.com/aristidb/aws/issues/119) - mkobject = (S3.putObject bucket file $ RequestBodyLBS uuidb) - { S3.poStorageClass = Just (getStorageClass c) } + mkobject = (S3.putObject (hBucket h) file $ RequestBodyLBS uuidb) + { S3.poStorageClass = Just (hStorageClass h) } {- Checks if the UUID file exists in the bucket - and has the specified UUID already. -} @@ -288,12 +286,11 @@ checkUUIDFile c u h = tryNonAsync $ check <$> get get = liftIO . runResourceT . either (pure . 
Left) (Right <$$> AWS.loadToMemory) - =<< tryS3 (sendS3Handle h (S3.getObject bucket file)) + =<< tryS3 (sendS3Handle h (S3.getObject (hBucket h) file)) check (Right (S3.GetObjectMemoryResponse _meta rsp)) = responseStatus rsp == ok200 && responseBody rsp == uuidb check (Left _S3Error) = False - bucket = T.pack $ fromJust $ getBucketName c file = T.pack $ uuidFile c uuidb = L.fromChunks [T.encodeUtf8 $ T.pack $ fromUUID u] @@ -312,7 +309,13 @@ s3Connection c u = go =<< getRemoteCredPairFor "S3" c (AWS.creds u) [(p, _)] -> p _ -> error $ "bad S3 port value: " ++ s -data S3Handle = S3Handle Manager AWS.Configuration (S3.S3Configuration AWS.NormalQuery) +data S3Handle = S3Handle + { hmanager :: Manager + , hawscfg :: AWS.Configuration + , hs3cfg :: S3.S3Configuration AWS.NormalQuery + , hBucket :: S3.Bucket + , hStorageClass :: S3.StorageClass + } {- Sends a request to S3 and gets back the response. - @@ -325,21 +328,25 @@ sendS3Handle => S3Handle -> req -> Annex res -sendS3Handle (S3Handle manager awscfg s3cfg) req = liftIO $ - runResourceT $ AWS.pureAws awscfg s3cfg manager req +sendS3Handle h = liftIO . runResourceT . call + where + call = AWS.pureAws (hawscfg h) (hs3cfg h) (hmanager h) withS3Handle :: RemoteConfig -> UUID -> (S3Handle -> Annex a) -> Annex a withS3Handle c u a = do creds <- getRemoteCredPairFor "S3" c (AWS.creds u) awscreds <- liftIO $ AWS.genCredentials $ fromMaybe nocreds creds + bucket <- maybe nobucket (return . T.pack) (getBucketName c) let awscfg = AWS.Configuration AWS.Timestamp awscreds (AWS.defaultLog AWS.Error) bracketIO (newManager httpcfg) closeManager $ \mgr -> - a $ S3Handle mgr awscfg s3cfg + a $ S3Handle mgr awscfg s3cfg bucket sc where s3cfg = s3Configuration c httpcfg = defaultManagerSettings { managerResponseTimeout = Nothing } + sc = getStorageClass c nocreds = error "Cannot use S3 without credentials configured" + nobucket = error "S3 bucket not configured" s3Configuration :: RemoteConfig -> S3.S3Configuration AWS.NormalQuery s3Configuration c = (S3.s3 proto endpoint False) { S3.s3Port = port } @@ -371,8 +378,8 @@ getBucketName :: RemoteConfig -> Maybe BucketName getBucketName = M.lookup "bucket" getStorageClass :: RemoteConfig -> S3.StorageClass -getStorageClass c = case fromJust $ M.lookup "storageclass" c of - "REDUCED_REDUNDANCY" -> S3.ReducedRedundancy +getStorageClass c = case M.lookup "storageclass" c of + Just "REDUCED_REDUNDANCY" -> S3.ReducedRedundancy _ -> S3.Standard getXheaders :: RemoteConfig -> [(String, String)] From 809ee40d7630d4b20cc54a3a948f8c089272c6f5 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 8 Aug 2014 21:42:46 -0400 Subject: [PATCH 05/51] wording --- Remote/S3.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 4bc341e412..4f15f988ea 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -118,7 +118,7 @@ s3Setup' u c = if isIA c then archiveorg else defaulthost fromMaybe (error "specify bucket=") $ getBucketName c let archiveconfig = - -- hS3 does not pass through x-archive-* headers + -- IA acdepts x-amz-* as an alias for x-archive-* M.mapKeys (replace "x-archive-" "x-amz-") $ -- encryption does not make sense here M.insert "encryption" "none" $ From 4f007ace87a2f8c7cda6057d11e7b75d152e3cc4 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sat, 9 Aug 2014 14:23:54 -0400 Subject: [PATCH 06/51] S3: convert to aws for store, remove, checkPresent Fixes the memory leak on store.. the second oldest open git-annex bug! Only retrieve remains to be converted. 
This commit was sponsored by Scott Robinson. --- Remote/Helper/Http.hs | 14 +++-- Remote/S3.hs | 120 +++++++++++++++++++++--------------------- 2 files changed, 69 insertions(+), 65 deletions(-) diff --git a/Remote/Helper/Http.hs b/Remote/Helper/Http.hs index f1d576d1c9..4088854ff3 100644 --- a/Remote/Helper/Http.hs +++ b/Remote/Helper/Http.hs @@ -24,14 +24,18 @@ import Control.Concurrent -- Implemented as a fileStorer, so that the content can be streamed -- from the file in constant space. httpStorer :: (Key -> RequestBody -> Annex Bool) -> Storer -httpStorer a = fileStorer $ \k f m -> do - size <- liftIO $ (fromIntegral . fileSize <$> getFileStatus f :: IO Integer) - let streamer sink = withMeteredFile f m $ \b -> do +httpStorer a = fileStorer $ \k f m -> a k =<< liftIO (httpBodyStorer f m) + +-- Reads the file and generates a streaming request body, that will update +-- the meter as it's sent. +httpBodyStorer :: FilePath -> MeterUpdate -> IO RequestBody +httpBodyStorer src m = do + size <- fromIntegral . fileSize <$> getFileStatus src :: IO Integer + let streamer sink = withMeteredFile src m $ \b -> do mvar <- newMVar $ L.toChunks b let getnextchunk = modifyMVar mvar $ pure . pop sink getnextchunk - let body = RequestBodyStream (fromInteger size) streamer - a k body + return $ RequestBodyStream (fromInteger size) streamer where pop [] = ([], S.empty) pop (c:cs) = (cs, c) diff --git a/Remote/S3.hs b/Remote/S3.hs index 4f15f988ea..58b408cd27 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -34,9 +34,9 @@ import qualified Git import Config import Config.Cost import Remote.Helper.Special +import Remote.Helper.Http import qualified Remote.Helper.AWS as AWS import Creds -import Utility.Metered import Annex.UUID import Logs.Web @@ -54,10 +54,10 @@ gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remot gen r u c gc = new <$> remoteCost gc expensiveRemoteCost where new cst = Just $ specialRemote c - (prepareStore this) + (prepareS3 this $ store this) (prepareRetrieve this) - (simplyPrepare $ remove this c) - (simplyPrepare $ checkKey this) + (prepareS3 this $ remove this) + (prepareS3 this $ checkKey this) this where this = Remote { @@ -132,32 +132,22 @@ s3Setup' u c = if isIA c then archiveorg else defaulthost writeUUIDFile archiveconfig u use archiveconfig -prepareStore :: Remote -> Preparer Storer -prepareStore r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> - fileStorer $ \k src p -> do - ok <- s3Bool =<< liftIO (store (conn, bucket) r k p src) +-- Sets up a http connection manager for S3 encdpoint, which allows +-- http connections to be reused across calls to the helper. +prepareS3 :: Remote -> (S3Handle -> helper) -> Preparer helper +prepareS3 r = resourcePrepare $ const $ withS3Handle (config r) (uuid r) - -- Store public URL to item in Internet Archive. - when (ok && isIA (config r) && not (isChunkKey k)) $ - setUrlPresent k (iaKeyUrl r k) - - return ok - -store :: (AWSConnection, BucketName) -> Remote -> Key -> MeterUpdate -> FilePath -> IO (AWSResult ()) -store (conn, bucket) r k p file = do - error "TODO" - {- - size <- (fromIntegral . 
fileSize <$> getFileStatus file) :: IO Integer - withMeteredFile file p $ \content -> do - -- size is provided to S3 so the whole content - -- does not need to be buffered to calculate it - let object = S3Object - bucket (bucketFile r k) "" - (("Content-Length", show size) : getXheaders (config r)) - content - sendObject conn $ - setStorageClass (getStorageClass $ config r) object - -} +store :: Remote -> S3Handle -> Storer +store r h = fileStorer $ \k f p -> do + rbody <- liftIO $ httpBodyStorer f p + void $ sendS3Handle h $ + S3.putObject (hBucket h) (hBucketObject h k) rbody + + -- Store public URL to item in Internet Archive. + when (hIsIA h && not (isChunkKey k)) $ + setUrlPresent k (iaKeyUrl r k) + + return True prepareRetrieve :: Remote -> Preparer Retriever prepareRetrieve r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> @@ -174,31 +164,37 @@ retrieveCheap _ _ = return False {- Internet Archive doesn't easily allow removing content. - While it may remove the file, there are generally other files - derived from it that it does not remove. -} -remove :: Remote -> RemoteConfig -> Remover -remove r c k - | isIA c = do +remove :: Remote -> S3Handle -> Remover +remove r h k + | hIsIA h = do warning "Cannot remove content from the Internet Archive" return False - | otherwise = remove' r k + | otherwise = do + res <- tryNonAsync $ sendS3Handle h $ + S3.DeleteObject (hBucketObject h k) (hBucket h) + return $ either (const False) (const True) res -remove' :: Remote -> Key -> Annex Bool -remove' r k = s3Action r False $ \(conn, bucket) -> - s3Bool =<< liftIO (deleteObject conn $ bucketKey r bucket k) - -checkKey :: Remote -> CheckPresent -checkKey r k = s3Action r noconn $ \(conn, bucket) -> do +checkKey :: Remote -> S3Handle -> CheckPresent +checkKey r h k = do showAction $ "checking " ++ name r - {- - res <- liftIO $ getObjectInfo conn $ bucketKey r bucket k - case res of - Right _ -> return True - Left (AWSError _ _) -> return False - Left e -> s3Error e - -} - error "TODO" + catchMissingException $ do + void $ sendS3Handle h $ + S3.headObject (hBucket h) (hBucketObject h k) + return True + +{- Catch exception headObject returns when an object is not present + - in the bucket, and returns False. All other exceptions indicate a + - check error and are let through. -} +catchMissingException :: Annex Bool -> Annex Bool +catchMissingException a = catchJust missing a (const $ return False) where - noconn = error "S3 not configured" - + -- This is not very good; see + -- https://github.com/aristidb/aws/issues/121 + missing :: AWS.HeaderException -> Maybe () + missing e + | AWS.headerErrorMessage e == "ETag missing" = Just () + | otherwise = Nothing + s3Warning :: ReqError -> Annex Bool s3Warning e = do warning $ prettyReqError e @@ -216,16 +212,8 @@ s3Action r noconn action = do (Just b, Just c) -> action (c, b) _ -> return noconn -bucketFile :: Remote -> Key -> FilePath -bucketFile r = munge . key2file - where - munge s = case M.lookup "mungekeys" c of - Just "ia" -> iaMunge $ getFilePrefix c ++ s - _ -> getFilePrefix c ++ s - c = config r - bucketKey :: Remote -> BucketName -> Key -> S3Object -bucketKey r bucket k = S3Object bucket (bucketFile r k) "" [] L.empty +bucketKey r bucket k = S3Object bucket (bucketObject (config r) k) "" [] L.empty {- Generate the bucket if it does not already exist, including creating the - UUID file within the bucket. 
@@ -313,8 +301,12 @@ data S3Handle = S3Handle { hmanager :: Manager , hawscfg :: AWS.Configuration , hs3cfg :: S3.S3Configuration AWS.NormalQuery + + -- Cached values. , hBucket :: S3.Bucket , hStorageClass :: S3.StorageClass + , hBucketObject :: Key -> S3.Bucket + , hIsIA :: Bool } {- Sends a request to S3 and gets back the response. @@ -339,12 +331,13 @@ withS3Handle c u a = do bucket <- maybe nobucket (return . T.pack) (getBucketName c) let awscfg = AWS.Configuration AWS.Timestamp awscreds (AWS.defaultLog AWS.Error) bracketIO (newManager httpcfg) closeManager $ \mgr -> - a $ S3Handle mgr awscfg s3cfg bucket sc + a $ S3Handle mgr awscfg s3cfg bucket sc bo (isIA c) where s3cfg = s3Configuration c httpcfg = defaultManagerSettings { managerResponseTimeout = Nothing } sc = getStorageClass c + bo = T.pack . bucketObject c -- memoized nocreds = error "Cannot use S3 without credentials configured" nobucket = error "S3 bucket not configured" @@ -390,6 +383,13 @@ getXheaders = filter isxheader . M.assocs getFilePrefix :: RemoteConfig -> String getFilePrefix = M.findWithDefault "" "fileprefix" +bucketObject :: RemoteConfig -> Key -> FilePath +bucketObject c = munge . key2file + where + munge s = case M.lookup "mungekeys" c of + Just "ia" -> iaMunge $ getFilePrefix c ++ s + _ -> getFilePrefix c ++ s + {- Internet Archive limits filenames to a subset of ascii, - with no whitespace. Other characters are xml entity - encoded. -} @@ -416,6 +416,6 @@ iaItemUrl :: BucketName -> URLString iaItemUrl bucket = "http://archive.org/details/" ++ bucket iaKeyUrl :: Remote -> Key -> URLString -iaKeyUrl r k = "http://archive.org/download/" ++ bucket ++ "/" ++ bucketFile r k +iaKeyUrl r k = "http://archive.org/download/" ++ bucket ++ "/" ++ bucketObject (config r) k where bucket = fromMaybe "" $ getBucketName $ config r From 1ba1e37be30ac63a6135d37e20de1342a25d3d5c Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sat, 9 Aug 2014 14:30:28 -0400 Subject: [PATCH 07/51] remove dead code --- Remote/S3.hs | 53 +++++------------------------------ doc/bugs/S3_memory_leaks.mdwn | 2 ++ 2 files changed, 9 insertions(+), 46 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 58b408cd27..790d827a5a 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -12,9 +12,6 @@ module Remote.S3 (remote, iaHost, isIA, isIAHost, iaItemUrl) where import qualified Aws as AWS import qualified Aws.Core as AWS import qualified Aws.S3 as S3 -import Network.AWS.AWSConnection -import Network.AWS.S3Object hiding (getStorageClass) -import Network.AWS.AWSResult import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.ByteString.Lazy as L @@ -55,8 +52,8 @@ gen r u c gc = new <$> remoteCost gc expensiveRemoteCost where new cst = Just $ specialRemote c (prepareS3 this $ store this) - (prepareRetrieve this) - (prepareS3 this $ remove this) + (prepareS3 this retrieve) + (prepareS3 this remove) (prepareS3 this $ checkKey this) this where @@ -149,10 +146,10 @@ store r h = fileStorer $ \k f p -> do return True -prepareRetrieve :: Remote -> Preparer Retriever -prepareRetrieve r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> - error "TODO" +retrieve :: S3Handle -> Retriever +retrieve _h = error "TODO" {- + resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> byteRetriever $ \k sink -> liftIO (getObject conn $ bucketKey r bucket k) >>= either s3Error (sink . obj_data) @@ -164,8 +161,8 @@ retrieveCheap _ _ = return False {- Internet Archive doesn't easily allow removing content. 
- While it may remove the file, there are generally other files - derived from it that it does not remove. -} -remove :: Remote -> S3Handle -> Remover -remove r h k +remove :: S3Handle -> Remover +remove h k | hIsIA h = do warning "Cannot remove content from the Internet Archive" return False @@ -195,26 +192,6 @@ catchMissingException a = catchJust missing a (const $ return False) | AWS.headerErrorMessage e == "ETag missing" = Just () | otherwise = Nothing -s3Warning :: ReqError -> Annex Bool -s3Warning e = do - warning $ prettyReqError e - return False - -s3Bool :: AWSResult () -> Annex Bool -s3Bool (Right _) = return True -s3Bool (Left e) = s3Warning e - -s3Action :: Remote -> a -> ((AWSConnection, BucketName) -> Annex a) -> Annex a -s3Action r noconn action = do - let bucket = M.lookup "bucket" $ config r - conn <- s3Connection (config r) (uuid r) - case (bucket, conn) of - (Just b, Just c) -> action (c, b) - _ -> return noconn - -bucketKey :: Remote -> BucketName -> Key -> S3Object -bucketKey r bucket k = S3Object bucket (bucketObject (config r) k) "" [] L.empty - {- Generate the bucket if it does not already exist, including creating the - UUID file within the bucket. - @@ -285,18 +262,6 @@ checkUUIDFile c u h = tryNonAsync $ check <$> get uuidFile :: RemoteConfig -> FilePath uuidFile c = getFilePrefix c ++ "annex-uuid" -s3Connection :: RemoteConfig -> UUID -> Annex (Maybe AWSConnection) -s3Connection c u = go =<< getRemoteCredPairFor "S3" c (AWS.creds u) - where - go Nothing = return Nothing - go (Just (ak, sk)) = return $ Just $ AWSConnection host port ak sk - - host = fromJust $ M.lookup "host" c - port = let s = fromJust $ M.lookup "port" c in - case reads s of - [(p, _)] -> p - _ -> error $ "bad S3 port value: " ++ s - data S3Handle = S3Handle { hmanager :: Manager , hawscfg :: AWS.Configuration @@ -363,10 +328,6 @@ s3Configuration c = (S3.s3 proto endpoint False) { S3.s3Port = port } tryS3 :: Annex a -> Annex (Either S3.S3Error a) tryS3 a = (Right <$> a) `catch` (pure . Left) -s3Error :: S3.S3Error -> a -s3Error (S3.S3Error { S3.s3ErrorMessage = m }) = - error $ "S3 error: " ++ T.unpack m - getBucketName :: RemoteConfig -> Maybe BucketName getBucketName = M.lookup "bucket" diff --git a/doc/bugs/S3_memory_leaks.mdwn b/doc/bugs/S3_memory_leaks.mdwn index 94bbdc3980..88dd6eaa6c 100644 --- a/doc/bugs/S3_memory_leaks.mdwn +++ b/doc/bugs/S3_memory_leaks.mdwn @@ -2,6 +2,8 @@ S3 has memory leaks Sending a file to S3 causes a slow memory increase toward the file size. +> This is fixed, now that it uses aws. --[[Joey]] + Copying the file back from S3 causes a slow memory increase toward the file size. From 57872b457b913dbd2bb62f2b658adf83cb0f12e3 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sat, 9 Aug 2014 14:44:53 -0400 Subject: [PATCH 08/51] pass metadata headers and storage class to S3 when putting objects --- Remote/S3.hs | 32 ++++++++++++++++++++------------ doc/special_remotes/S3.mdwn | 2 +- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 790d827a5a..885396f98c 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -137,8 +137,7 @@ prepareS3 r = resourcePrepare $ const $ withS3Handle (config r) (uuid r) store :: Remote -> S3Handle -> Storer store r h = fileStorer $ \k f p -> do rbody <- liftIO $ httpBodyStorer f p - void $ sendS3Handle h $ - S3.putObject (hBucket h) (hBucketObject h k) rbody + void $ sendS3Handle h $ putObject h (hBucketObject h k) rbody -- Store public URL to item in Internet Archive. 
when (hIsIA h && not (isChunkKey k)) $ @@ -238,10 +237,7 @@ writeUUIDFile c u h = do file = T.pack $ uuidFile c uuidb = L.fromChunks [T.encodeUtf8 $ T.pack $ fromUUID u] - -- TODO: add headers from getXheaders - -- (See https://github.com/aristidb/aws/issues/119) - mkobject = (S3.putObject (hBucket h) file $ RequestBodyLBS uuidb) - { S3.poStorageClass = Just (hStorageClass h) } + mkobject = putObject h file (RequestBodyLBS uuidb) {- Checks if the UUID file exists in the bucket - and has the specified UUID already. -} @@ -262,6 +258,13 @@ checkUUIDFile c u h = tryNonAsync $ check <$> get uuidFile :: RemoteConfig -> FilePath uuidFile c = getFilePrefix c ++ "annex-uuid" +-- TODO: auto-create bucket when hIsIA. +putObject :: S3Handle -> T.Text -> RequestBody -> S3.PutObject +putObject h file rbody = (S3.putObject (hBucket h) file rbody) + { S3.poStorageClass = Just (hStorageClass h) + , S3.poMetadata = hMetaHeaders h + } + data S3Handle = S3Handle { hmanager :: Manager , hawscfg :: AWS.Configuration @@ -270,7 +273,8 @@ data S3Handle = S3Handle -- Cached values. , hBucket :: S3.Bucket , hStorageClass :: S3.StorageClass - , hBucketObject :: Key -> S3.Bucket + , hBucketObject :: Key -> T.Text + , hMetaHeaders :: [(T.Text, T.Text)] , hIsIA :: Bool } @@ -296,13 +300,14 @@ withS3Handle c u a = do bucket <- maybe nobucket (return . T.pack) (getBucketName c) let awscfg = AWS.Configuration AWS.Timestamp awscreds (AWS.defaultLog AWS.Error) bracketIO (newManager httpcfg) closeManager $ \mgr -> - a $ S3Handle mgr awscfg s3cfg bucket sc bo (isIA c) + a $ S3Handle mgr awscfg s3cfg bucket sc bo mh (isIA c) where s3cfg = s3Configuration c httpcfg = defaultManagerSettings { managerResponseTimeout = Nothing } sc = getStorageClass c - bo = T.pack . bucketObject c -- memoized + bo = T.pack . bucketObject c + mh = getMetaHeaders c nocreds = error "Cannot use S3 without credentials configured" nobucket = error "S3 bucket not configured" @@ -336,10 +341,13 @@ getStorageClass c = case M.lookup "storageclass" c of Just "REDUCED_REDUNDANCY" -> S3.ReducedRedundancy _ -> S3.Standard -getXheaders :: RemoteConfig -> [(String, String)] -getXheaders = filter isxheader . M.assocs +getMetaHeaders :: RemoteConfig -> [(T.Text, T.Text)] +getMetaHeaders = map munge . filter ismetaheader . M.assocs where - isxheader (h, _) = "x-amz-" `isPrefixOf` h + ismetaheader (h, _) = metaprefix `isPrefixOf` h + metaprefix = "x-amz-meta-" + metaprefixlen = length metaprefix + munge (k, v) = (T.pack $ drop metaprefixlen k, T.pack v) getFilePrefix :: RemoteConfig -> String getFilePrefix = M.findWithDefault "" "fileprefix" diff --git a/doc/special_remotes/S3.mdwn b/doc/special_remotes/S3.mdwn index a8af932983..492c247cbe 100644 --- a/doc/special_remotes/S3.mdwn +++ b/doc/special_remotes/S3.mdwn @@ -53,5 +53,5 @@ the S3 remote. and to "bar/" in another special remote, and both special remotes could then use the same bucket. -* `x-amz-*` are passed through as http headers when storing keys +* `x-amz-meta-*` are passed through as http headers when storing keys in S3. 
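
A minimal standalone sketch, in the same style as the PutObject.hs demo from the first patch, of the kind of request the putObject helper above now builds: a put with the aws library's storage class and metadata fields filled in. The bucket name, object name, body, and metadata pair are placeholders for illustration only; the assumption, matching getMetaHeaders above, is that poMetadata takes bare names which the library then sends as x-amz-meta-* request headers.

{-# LANGUAGE OverloadedStrings #-}

import qualified Aws
import qualified Aws.S3 as S3
import Network.HTTP.Conduit (withManager, RequestBody(..))
import qualified Data.ByteString.Lazy as L
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Control.Monad.IO.Class

main :: IO ()
main = do
  {- Credentials and configuration, as in the earlier demos. -}
  Just creds <- Aws.loadCredentialsFromEnv
  let cfg = Aws.Configuration Aws.Timestamp creds (Aws.defaultLog Aws.Debug)
  let s3cfg = Aws.defServiceConfig :: S3.S3Configuration Aws.NormalQuery
  withManager $ \mgr -> do
    {- A tiny in-memory request body; a real remote streams file content instead. -}
    let body = RequestBodyLBS $ L.fromChunks [T.encodeUtf8 (T.pack "hello")]
    {- The record update sets the storage class and the bare metadata names;
       the library is assumed to emit the latter as x-amz-meta-* headers. -}
    rsp <- Aws.pureAws cfg s3cfg mgr $
      (S3.putObject "joeyh-test" "metadata-demo" body)
        { S3.poStorageClass = Just S3.ReducedRedundancy
        , S3.poMetadata = [(T.pack "example", T.pack "1")]
        }
    liftIO $ print rsp

Compared with the removed hS3 code, which built an S3Object carrying an explicit Content-Length and the x-amz headers and then applied setStorageClass, both settings here are made by a record update on the request before it is handed to pureAws.
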
From a6da13c1e921905073ec89d9d30f3539aed630bb Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sat, 9 Aug 2014 15:31:12 -0400 Subject: [PATCH 09/51] deps --- debian/control | 6 +++++- git-annex.cabal | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/debian/control b/debian/control index 522b7c5cce..7defd8a7c0 100644 --- a/debian/control +++ b/debian/control @@ -13,7 +13,11 @@ Build-Depends: libghc-cryptohash-dev, libghc-dataenc-dev, libghc-utf8-string-dev, - libghc-hs3-dev (>= 0.5.6), + libghc-aws-dev, + libghc-conduit-dev, + libghc-resourcet-dev, + libghc-http-conduit-dev, + libghc-http-client-dev, libghc-dav-dev (>= 1.0) [amd64 i386 kfreebsd-amd64 kfreebsd-i386 powerpc], libghc-quickcheck2-dev, libghc-monad-control-dev (>= 0.3), diff --git a/git-annex.cabal b/git-annex.cabal index 70bd9c88b1..b05863b797 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -137,12 +137,12 @@ Executable git-annex CPP-Options: -DWITH_CRYPTOHASH if flag(S3) - Build-Depends: hS3, http-conduit, http-client, resourcet, http-types, aws + Build-Depends: aws, conduit, resourcet, + http-conduit, http-client, http-types CPP-Options: -DWITH_S3 if flag(WebDAV) - Build-Depends: DAV (>= 1.0), - http-client, http-types + Build-Depends: DAV (>= 1.0), http-client, http-types CPP-Options: -DWITH_WEBDAV if flag(Assistant) && ! os(solaris) From 3659cb9efb505b27c39fafb25b060cfe6c14be23 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sat, 9 Aug 2014 15:58:01 -0400 Subject: [PATCH 10/51] S3: finish converting to aws library Implemented the Retriever. Unfortunately, it is a fileRetriever and not a byteRetriever. It should be possible to convert this to a byteRetiever, but I got stuck: The conduit sink needs to process individual chunks, but a byteRetriever needs to pass a single L.ByteString to its callback for processing. I looked into using unsafeInerlaveIO to build up the bytestring lazily, but the sink is already operating under conduit's inversion of control, and does not run directly in IO anyway. On the plus side, no more memory leak.. --- Remote/S3.hs | 41 ++++++++++++++++++++++++++--------- debian/changelog | 4 ++-- doc/bugs/S3_memory_leaks.mdwn | 2 ++ 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 885396f98c..e06a3d6c8c 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -15,6 +15,7 @@ import qualified Aws.S3 as S3 import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.ByteString.Lazy as L +import qualified Data.ByteString as S import qualified Data.Map as M import Data.Char import Network.Socket (HostName) @@ -23,6 +24,7 @@ import Network.HTTP.Client (defaultManagerSettings, managerResponseTimeout, resp import Network.HTTP.Types import Control.Monad.Trans.Resource import Control.Monad.Catch +import Data.Conduit import Common.Annex import Types.Remote @@ -36,6 +38,7 @@ import qualified Remote.Helper.AWS as AWS import Creds import Annex.UUID import Logs.Web +import Utility.Metered type BucketName = String @@ -145,14 +148,27 @@ store r h = fileStorer $ \k f p -> do return True +{- Implemented as a fileRetriever, that uses conduit to stream the chunks + - out to the file. Would be better to implement a byteRetriever, but + - that is difficult. -} retrieve :: S3Handle -> Retriever -retrieve _h = error "TODO" - {- - resourcePrepare (const $ s3Action r False) $ \(conn, bucket) -> - byteRetriever $ \k sink -> - liftIO (getObject conn $ bucketKey r bucket k) - >>= either s3Error (sink . 
obj_data) - -} +retrieve h = fileRetriever $ \f k p -> liftIO $ runResourceT $ do + (fr, fh) <- allocate (openFile f WriteMode) hClose + let req = S3.getObject (hBucket h) (hBucketObject h k) + S3.GetObjectResponse { S3.gorResponse = rsp } <- sendS3Handle' h req + responseBody rsp $$+- sinkprogressfile fh p zeroBytesProcessed + release fr + where + sinkprogressfile fh meterupdate sofar = do + mbs <- await + case mbs of + Nothing -> return () + Just bs -> do + let sofar' = sofar -- addBytesProcessed $ S.length bs + liftIO $ do + void $ meterupdate sofar' + S.hPut fh bs + sinkprogressfile fh meterupdate sofar' retrieveCheap :: Key -> FilePath -> Annex Bool retrieveCheap _ _ = return False @@ -289,9 +305,14 @@ sendS3Handle => S3Handle -> req -> Annex res -sendS3Handle h = liftIO . runResourceT . call - where - call = AWS.pureAws (hawscfg h) (hs3cfg h) (hmanager h) +sendS3Handle h r = liftIO $ runResourceT $ sendS3Handle' h r + +sendS3Handle' + :: (AWS.Transaction r a, AWS.ServiceConfiguration r ~ S3.S3Configuration) + => S3Handle + -> r + -> ResourceT IO a +sendS3Handle' h = AWS.pureAws (hawscfg h) (hs3cfg h) (hmanager h) withS3Handle :: RemoteConfig -> UUID -> (S3Handle -> Annex a) -> Annex a withS3Handle c u a = do diff --git a/debian/changelog b/debian/changelog index fb30e7736c..48d4d9144c 100644 --- a/debian/changelog +++ b/debian/changelog @@ -16,9 +16,9 @@ git-annex (5.20140718) UNRELEASED; urgency=medium were incompletely repaired before. * Fix cost calculation for non-encrypted remotes. * Display exception message when a transfer fails due to an exception. - * WebDAV: Sped up by avoiding making multiple http connections + * WebDAV, S3: Sped up by avoiding making multiple http connections when storing a file. - * WebDAV: Avoid buffering whole file in memory when uploading and + * WebDAV, S3: Avoid buffering whole file in memory when uploading and downloading. * WebDAV: Dropped support for DAV before 1.0. * S3: Switched to using the haskell aws library. diff --git a/doc/bugs/S3_memory_leaks.mdwn b/doc/bugs/S3_memory_leaks.mdwn index 88dd6eaa6c..7dc1e57571 100644 --- a/doc/bugs/S3_memory_leaks.mdwn +++ b/doc/bugs/S3_memory_leaks.mdwn @@ -7,6 +7,8 @@ Sending a file to S3 causes a slow memory increase toward the file size. Copying the file back from S3 causes a slow memory increase toward the file size. +> [[fixed|done]] too! --[[Joey]] + The author of hS3 is aware of the problem, and working on it. I think I have identified the root cause of the buffering; it's done by hS3 so it can resend the data if S3 sends it a 307 redirect. 
--[[Joey]] From 5ee72b1baef09e0473cbb0e6119a98abb51ee34e Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sat, 9 Aug 2014 16:49:31 -0400 Subject: [PATCH 11/51] fix meter update --- Remote/S3.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index e06a3d6c8c..11e681bd8c 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -164,7 +164,7 @@ retrieve h = fileRetriever $ \f k p -> liftIO $ runResourceT $ do case mbs of Nothing -> return () Just bs -> do - let sofar' = sofar -- addBytesProcessed $ S.length bs + let sofar' = addBytesProcessed sofar (S.length bs) liftIO $ do void $ meterupdate sofar' S.hPut fh bs From 445f04472cb26fbe9b7c469e9b0d54cb31d53670 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sat, 9 Aug 2014 22:13:03 -0400 Subject: [PATCH 12/51] better memoization --- Remote/S3.hs | 116 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 48 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 11e681bd8c..9821a045fd 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -51,13 +51,16 @@ remote = RemoteType { } gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remote) -gen r u c gc = new <$> remoteCost gc expensiveRemoteCost +gen r u c gc = do + cst <- remoteCost gc expensiveRemoteCost + info <- extractS3Info c + return $ new cst info where - new cst = Just $ specialRemote c - (prepareS3 this $ store this) - (prepareS3 this retrieve) - (prepareS3 this remove) - (prepareS3 this $ checkKey this) + new cst info = Just $ specialRemote c + (prepareS3 this info $ store this) + (prepareS3 this info retrieve) + (prepareS3 this info remove) + (prepareS3 this info $ checkKey this) this where this = Remote { @@ -88,7 +91,7 @@ s3Setup mu mcreds c = do c' <- setRemoteCredPair c (AWS.creds u) mcreds s3Setup' u c' s3Setup' :: UUID -> RemoteConfig -> Annex (RemoteConfig, UUID) -s3Setup' u c = if isIA c then archiveorg else defaulthost +s3Setup' u c = if configIA c then archiveorg else defaulthost where remotename = fromJust (M.lookup "name" c) defbucket = remotename ++ "-" ++ fromUUID u @@ -114,7 +117,7 @@ s3Setup' u c = if isIA c then archiveorg else defaulthost showNote "Internet Archive mode" -- Ensure user enters a valid bucket name, since -- this determines the name of the archive.org item. - let bucket = replace " " "-" $ map toLower $ + let validbucket = replace " " "-" $ map toLower $ fromMaybe (error "specify bucket=") $ getBucketName c let archiveconfig = @@ -122,28 +125,30 @@ s3Setup' u c = if isIA c then archiveorg else defaulthost M.mapKeys (replace "x-archive-" "x-amz-") $ -- encryption does not make sense here M.insert "encryption" "none" $ - M.insert "bucket" bucket $ + M.insert "bucket" validbucket $ M.union c $ -- special constraints on key names M.insert "mungekeys" "ia" $ -- bucket created only when files are uploaded M.insert "x-amz-auto-make-bucket" "1" defaults - withS3Handle archiveconfig u $ + info <- extractS3Info archiveconfig + withS3Handle archiveconfig u info $ writeUUIDFile archiveconfig u use archiveconfig -- Sets up a http connection manager for S3 encdpoint, which allows -- http connections to be reused across calls to the helper. 
-prepareS3 :: Remote -> (S3Handle -> helper) -> Preparer helper -prepareS3 r = resourcePrepare $ const $ withS3Handle (config r) (uuid r) +prepareS3 :: Remote -> S3Info -> (S3Handle -> helper) -> Preparer helper +prepareS3 r info = resourcePrepare $ const $ + withS3Handle (config r) (uuid r) info store :: Remote -> S3Handle -> Storer store r h = fileStorer $ \k f p -> do rbody <- liftIO $ httpBodyStorer f p - void $ sendS3Handle h $ putObject h (hBucketObject h k) rbody + void $ sendS3Handle h $ putObject h (bucketObject (hinfo h) k) rbody -- Store public URL to item in Internet Archive. - when (hIsIA h && not (isChunkKey k)) $ + when (isIA (hinfo h) && not (isChunkKey k)) $ setUrlPresent k (iaKeyUrl r k) return True @@ -154,11 +159,12 @@ store r h = fileStorer $ \k f p -> do retrieve :: S3Handle -> Retriever retrieve h = fileRetriever $ \f k p -> liftIO $ runResourceT $ do (fr, fh) <- allocate (openFile f WriteMode) hClose - let req = S3.getObject (hBucket h) (hBucketObject h k) + let req = S3.getObject (bucket info) (bucketObject info k) S3.GetObjectResponse { S3.gorResponse = rsp } <- sendS3Handle' h req responseBody rsp $$+- sinkprogressfile fh p zeroBytesProcessed release fr where + info = hinfo h sinkprogressfile fh meterupdate sofar = do mbs <- await case mbs of @@ -178,20 +184,22 @@ retrieveCheap _ _ = return False - derived from it that it does not remove. -} remove :: S3Handle -> Remover remove h k - | hIsIA h = do + | isIA info = do warning "Cannot remove content from the Internet Archive" return False | otherwise = do res <- tryNonAsync $ sendS3Handle h $ - S3.DeleteObject (hBucketObject h k) (hBucket h) + S3.DeleteObject (bucketObject info k) (bucket info) return $ either (const False) (const True) res + where + info = hinfo h checkKey :: Remote -> S3Handle -> CheckPresent checkKey r h k = do showAction $ "checking " ++ name r catchMissingException $ do void $ sendS3Handle h $ - S3.headObject (hBucket h) (hBucketObject h k) + S3.headObject (bucket (hinfo h)) (bucketObject (hinfo h) k) return True {- Catch exception headObject returns when an object is not present @@ -217,18 +225,19 @@ catchMissingException a = catchJust missing a (const $ return False) genBucket :: RemoteConfig -> UUID -> Annex () genBucket c u = do showAction "checking bucket" - withS3Handle c u $ \h -> + info <- extractS3Info c + withS3Handle c u info $ \h -> go h =<< checkUUIDFile c u h where go _ (Right True) = noop go h _ = do - v <- tryS3 $ sendS3Handle h (S3.getBucket $ hBucket h) + v <- tryS3 $ sendS3Handle h (S3.getBucket $ bucket $ hinfo h) case v of Right _ -> noop Left _ -> do showAction $ "creating bucket in " ++ datacenter void $ sendS3Handle h $ - S3.PutBucket (hBucket h) Nothing $ + S3.PutBucket (bucket $ hinfo h) Nothing $ AWS.mkLocationConstraint $ T.pack datacenter writeUUIDFile c u h @@ -263,7 +272,7 @@ checkUUIDFile c u h = tryNonAsync $ check <$> get get = liftIO . runResourceT . either (pure . Left) (Right <$$> AWS.loadToMemory) - =<< tryS3 (sendS3Handle h (S3.getObject (hBucket h) file)) + =<< tryS3 (sendS3Handle h (S3.getObject (bucket (hinfo h)) file)) check (Right (S3.GetObjectMemoryResponse _meta rsp)) = responseStatus rsp == ok200 && responseBody rsp == uuidb check (Left _S3Error) = False @@ -276,22 +285,16 @@ uuidFile c = getFilePrefix c ++ "annex-uuid" -- TODO: auto-create bucket when hIsIA. 
putObject :: S3Handle -> T.Text -> RequestBody -> S3.PutObject -putObject h file rbody = (S3.putObject (hBucket h) file rbody) - { S3.poStorageClass = Just (hStorageClass h) - , S3.poMetadata = hMetaHeaders h +putObject h file rbody = (S3.putObject (bucket (hinfo h)) file rbody) + { S3.poStorageClass = Just (storageClass (hinfo h)) + , S3.poMetadata = metaHeaders (hinfo h) } data S3Handle = S3Handle { hmanager :: Manager , hawscfg :: AWS.Configuration , hs3cfg :: S3.S3Configuration AWS.NormalQuery - - -- Cached values. - , hBucket :: S3.Bucket - , hStorageClass :: S3.StorageClass - , hBucketObject :: Key -> T.Text - , hMetaHeaders :: [(T.Text, T.Text)] - , hIsIA :: Bool + , hinfo :: S3Info } {- Sends a request to S3 and gets back the response. @@ -314,23 +317,18 @@ sendS3Handle' -> ResourceT IO a sendS3Handle' h = AWS.pureAws (hawscfg h) (hs3cfg h) (hmanager h) -withS3Handle :: RemoteConfig -> UUID -> (S3Handle -> Annex a) -> Annex a -withS3Handle c u a = do +withS3Handle :: RemoteConfig -> UUID -> S3Info -> (S3Handle -> Annex a) -> Annex a +withS3Handle c u info a = do creds <- getRemoteCredPairFor "S3" c (AWS.creds u) awscreds <- liftIO $ AWS.genCredentials $ fromMaybe nocreds creds - bucket <- maybe nobucket (return . T.pack) (getBucketName c) let awscfg = AWS.Configuration AWS.Timestamp awscreds (AWS.defaultLog AWS.Error) bracketIO (newManager httpcfg) closeManager $ \mgr -> - a $ S3Handle mgr awscfg s3cfg bucket sc bo mh (isIA c) + a $ S3Handle mgr awscfg s3cfg info where s3cfg = s3Configuration c httpcfg = defaultManagerSettings { managerResponseTimeout = Nothing } - sc = getStorageClass c - bo = T.pack . bucketObject c - mh = getMetaHeaders c nocreds = error "Cannot use S3 without credentials configured" - nobucket = error "S3 bucket not configured" s3Configuration :: RemoteConfig -> S3.S3Configuration AWS.NormalQuery s3Configuration c = (S3.s3 proto endpoint False) { S3.s3Port = port } @@ -354,6 +352,28 @@ s3Configuration c = (S3.s3 proto endpoint False) { S3.s3Port = port } tryS3 :: Annex a -> Annex (Either S3.S3Error a) tryS3 a = (Right <$> a) `catch` (pure . Left) +data S3Info = S3Info + { bucket :: S3.Bucket + , storageClass :: S3.StorageClass + , bucketObject :: Key -> T.Text + , metaHeaders :: [(T.Text, T.Text)] + , isIA :: Bool + } + +extractS3Info :: RemoteConfig -> Annex S3Info +extractS3Info c = do + b <- maybe + (error "S3 bucket not configured") + (return . T.pack) + (getBucketName c) + return $ S3Info + { bucket = b + , storageClass = getStorageClass c + , bucketObject = T.pack . getBucketObject c + , metaHeaders = getMetaHeaders c + , isIA = configIA c + } + getBucketName :: RemoteConfig -> Maybe BucketName getBucketName = M.lookup "bucket" @@ -373,8 +393,8 @@ getMetaHeaders = map munge . filter ismetaheader . M.assocs getFilePrefix :: RemoteConfig -> String getFilePrefix = M.findWithDefault "" "fileprefix" -bucketObject :: RemoteConfig -> Key -> FilePath -bucketObject c = munge . key2file +getBucketObject :: RemoteConfig -> Key -> FilePath +getBucketObject c = munge . key2file where munge s = case M.lookup "mungekeys" c of Just "ia" -> iaMunge $ getFilePrefix c ++ s @@ -392,20 +412,20 @@ iaMunge = (>>= munge) | isSpace c = [] | otherwise = "&" ++ show (ord c) ++ ";" +configIA :: RemoteConfig -> Bool +configIA = maybe False isIAHost . M.lookup "host" + {- Hostname to use for archive.org S3. 
-} iaHost :: HostName iaHost = "s3.us.archive.org" -isIA :: RemoteConfig -> Bool -isIA c = maybe False isIAHost (M.lookup "host" c) - isIAHost :: HostName -> Bool isIAHost h = ".archive.org" `isSuffixOf` map toLower h iaItemUrl :: BucketName -> URLString -iaItemUrl bucket = "http://archive.org/details/" ++ bucket +iaItemUrl b = "http://archive.org/details/" ++ b iaKeyUrl :: Remote -> Key -> URLString -iaKeyUrl r k = "http://archive.org/download/" ++ bucket ++ "/" ++ bucketObject (config r) k +iaKeyUrl r k = "http://archive.org/download/" ++ b ++ "/" ++ getBucketObject (config r) k where - bucket = fromMaybe "" $ getBucketName $ config r + b = fromMaybe "" $ getBucketName $ config r From 5fc54cb182168081ee7043ec51447d7a0f526419 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sat, 9 Aug 2014 22:17:40 -0400 Subject: [PATCH 13/51] auto-create IA buckets Needs my patch to aws which will hopefully be accepted soon. --- Assistant/WebApp/Configurators/Edit.hs | 2 +- Remote/S3.hs | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Assistant/WebApp/Configurators/Edit.hs b/Assistant/WebApp/Configurators/Edit.hs index c8113d18dc..e7b9613b22 100644 --- a/Assistant/WebApp/Configurators/Edit.hs +++ b/Assistant/WebApp/Configurators/Edit.hs @@ -235,7 +235,7 @@ getRepoInfo :: Maybe Remote.Remote -> Maybe Remote.RemoteConfig -> Widget getRepoInfo (Just r) (Just c) = case M.lookup "type" c of Just "S3" #ifdef WITH_S3 - | S3.isIA c -> IA.getRepoInfo c + | S3.configIA c -> IA.getRepoInfo c #endif | otherwise -> AWS.getRepoInfo c Just t diff --git a/Remote/S3.hs b/Remote/S3.hs index 9821a045fd..5c41116904 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -7,7 +7,7 @@ {-# LANGUAGE TypeFamilies #-} -module Remote.S3 (remote, iaHost, isIA, isIAHost, iaItemUrl) where +module Remote.S3 (remote, iaHost, configIA, isIAHost, iaItemUrl) where import qualified Aws as AWS import qualified Aws.Core as AWS @@ -128,9 +128,7 @@ s3Setup' u c = if configIA c then archiveorg else defaulthost M.insert "bucket" validbucket $ M.union c $ -- special constraints on key names - M.insert "mungekeys" "ia" $ - -- bucket created only when files are uploaded - M.insert "x-amz-auto-make-bucket" "1" defaults + M.insert "mungekeys" "ia" defaults info <- extractS3Info archiveconfig withS3Handle archiveconfig u info $ writeUUIDFile archiveconfig u @@ -283,12 +281,14 @@ checkUUIDFile c u h = tryNonAsync $ check <$> get uuidFile :: RemoteConfig -> FilePath uuidFile c = getFilePrefix c ++ "annex-uuid" --- TODO: auto-create bucket when hIsIA. putObject :: S3Handle -> T.Text -> RequestBody -> S3.PutObject -putObject h file rbody = (S3.putObject (bucket (hinfo h)) file rbody) - { S3.poStorageClass = Just (storageClass (hinfo h)) - , S3.poMetadata = metaHeaders (hinfo h) +putObject h file rbody = (S3.putObject (bucket info) file rbody) + { S3.poStorageClass = Just (storageClass info) + , S3.poMetadata = metaHeaders info + , S3.poAutoMakeBucket = isIA info } + where + info = hinfo h data S3Handle = S3Handle { hmanager :: Manager From edac4afc53f14c38e8f62a3bd66753e4ef7280b1 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 15 Aug 2014 17:17:19 -0400 Subject: [PATCH 14/51] Switched from the old haskell HTTP library to http-conduit. The hoary old HTTP library was only used when checking if an url exists, when curl was not available. It had many problems, including not supporting https at all. Now, this is done using http-conduit for all urls that it supports. 
Falls back to curl for any url that http-conduit doesn't like (probably ftp etc, but could also be an url that its parser chokes on for whatever reason). This adds a new dependency on http-conduit, but webdav support already indirectly depended on that, and the s3-aws branch also uses it. --- Utility/Url.hs | 125 +++++++++++++++++++---------------------------- debian/changelog | 1 + debian/control | 2 +- git-annex.cabal | 9 ++-- 4 files changed, 57 insertions(+), 80 deletions(-) diff --git a/Utility/Url.hs b/Utility/Url.hs index 4137a5d8bb..ebcae55cae 100644 --- a/Utility/Url.hs +++ b/Utility/Url.hs @@ -1,6 +1,6 @@ {- Url downloading. - - - Copyright 2011,2013 Joey Hess + - Copyright 2011-2014 Joey Hess - - License: BSD-2-clause -} @@ -21,10 +21,11 @@ module Utility.Url ( import Common import Network.URI -import qualified Network.Browser as Browser -import Network.HTTP -import Data.Either +import Network.HTTP.Conduit +import Network.HTTP.Types import Data.Default +import qualified Data.CaseInsensitive as CI +import qualified Data.ByteString.UTF8 as B8 import qualified Build.SysConfig @@ -60,33 +61,26 @@ check url expected_size = go <$$> exists url Nothing -> (True, True) {- Checks that an url exists and could be successfully downloaded, - - also returning its size if available. - - - - For a file: url, check it directly. - - - - Uses curl otherwise, when available, since curl handles https better - - than does Haskell's Network.Browser. - -} + - also returning its size if available. -} exists :: URLString -> UrlOptions -> IO (Bool, Maybe Integer) exists url uo = case parseURIRelaxed url of - Just u - | uriScheme u == "file:" -> do - s <- catchMaybeIO $ getFileStatus (unEscapeString $ uriPath u) - case s of - Just stat -> return (True, Just $ fromIntegral $ fileSize stat) - Nothing -> dne - | otherwise -> if Build.SysConfig.curl - then do + Just u -> case parseUrl (show u) of + Just req -> existsconduit req `catchNonAsync` const dne + -- http-conduit does not support file:, ftp:, etc urls, + -- so fall back to reading files and using curl. + Nothing + | uriScheme u == "file:" -> do + s <- catchMaybeIO $ getFileStatus (unEscapeString $ uriPath u) + case s of + Just stat -> return (True, Just $ fromIntegral $ fileSize stat) + Nothing -> dne + | Build.SysConfig.curl -> do output <- catchDefaultIO "" $ readProcess "curl" $ toCommand curlparams case lastMaybe (lines output) of - Just ('2':_:_) -> return (True, extractsize output) + Just ('2':_:_) -> return (True, extractlencurl output) _ -> dne - else do - r <- request u HEAD uo - case rspCode r of - (2,_,_) -> return (True, size r) - _ -> return (False, Nothing) + | otherwise -> dne Nothing -> dne where dne = return (False, Nothing) @@ -98,13 +92,28 @@ exists url uo = case parseURIRelaxed url of , Param "-w", Param "%{http_code}" ] ++ concatMap (\h -> [Param "-H", Param h]) (reqHeaders uo) ++ (reqParams uo) - extractsize s = case lastMaybe $ filter ("Content-Length:" `isPrefixOf`) (lines s) of + extractlencurl s = case lastMaybe $ filter ("Content-Length:" `isPrefixOf`) (lines s) of Just l -> case lastMaybe $ words l of Just sz -> readish sz _ -> Nothing _ -> Nothing - - size = liftM Prelude.read . lookupHeader HdrContentLength . rspHeaders + + extractlen resp = readish . 
B8.toString =<< headMaybe lenheaders + where + lenheaders = map snd $ + filter (\(h, _) -> h == hContentLength) + (responseHeaders resp) + + existsconduit req = withManager $ \mgr -> do + let req' = (addUrlOptions uo req) { method = methodHead } + resp <- http req' mgr + -- forces processing the response before the + -- manager is closed + ret <- if responseStatus resp == ok200 + then return (True, extractlen resp) + else liftIO dne + liftIO $ closeManager mgr + return ret -- works for both wget and curl commands addUserAgent :: UrlOptions -> [CommandParam] -> [CommandParam] @@ -112,6 +121,20 @@ addUserAgent uo ps = case userAgent uo of Nothing -> ps Just ua -> ps ++ [Param "--user-agent", Param ua] +addUrlOptions :: UrlOptions -> Request -> Request +addUrlOptions uo r = r { requestHeaders = requestHeaders r ++ uaheader ++ otherheaders} + where + uaheader = case userAgent uo of + Nothing -> [] + Just ua -> [(hUserAgent, B8.fromString ua)] + otherheaders = map toheader (reqHeaders uo) + toheader s = + let (h, v) = separate (== ':') s + h' = CI.mk (B8.fromString h) + in case v of + (' ':v') -> (h', B8.fromString v') + _ -> (h', B8.fromString v) + {- Used to download large files, such as the contents of keys. - - Uses wget or curl program for its progress bar. (Wget has a better one, @@ -161,52 +184,6 @@ download' quiet url file uo = | quiet = [Param s] | otherwise = [] -{- Uses Network.Browser to make a http request of an url. - - For example, HEAD can be used to check if the url exists, - - or GET used to get the url content (best for small urls). - - - - This does its own redirect following because Browser's is buggy for HEAD - - requests. - - - - Unfortunately, does not handle https, so should only be used - - when curl is not available. - -} -request :: URI -> RequestMethod -> UrlOptions -> IO (Response String) -request url requesttype uo = go 5 url - where - go :: Int -> URI -> IO (Response String) - go 0 _ = error "Too many redirects " - go n u = do - rsp <- Browser.browse $ do - maybe noop Browser.setUserAgent (userAgent uo) - Browser.setErrHandler ignore - Browser.setOutHandler ignore - Browser.setAllowRedirects False - let req = mkRequest requesttype u :: Request_String - snd <$> Browser.request (addheaders req) - case rspCode rsp of - (3,0,x) | x /= 5 -> redir (n - 1) u rsp - _ -> return rsp - addheaders req = setHeaders req (rqHeaders req ++ userheaders) - userheaders = rights $ map parseHeader (reqHeaders uo) - ignore = const noop - redir n u rsp = case retrieveHeaders HdrLocation rsp of - [] -> return rsp - (Header _ newu:_) -> - case parseURIReference newu of - Nothing -> return rsp - Just newURI -> go n $ -#if defined VERSION_network -#if ! MIN_VERSION_network(2,4,0) -#define WITH_OLD_URI -#endif -#endif -#ifdef WITH_OLD_URI - fromMaybe newURI (newURI `relativeTo` u) -#else - newURI `relativeTo` u -#endif - {- Allows for spaces and other stuff in urls, properly escaping them. -} parseURIRelaxed :: URLString -> Maybe URI parseURIRelaxed = parseURI . escapeURIString isAllowedInURI diff --git a/debian/changelog b/debian/changelog index c55fbabd32..0d884cd818 100644 --- a/debian/changelog +++ b/debian/changelog @@ -39,6 +39,7 @@ git-annex (5.20140718) UNRELEASED; urgency=medium * git-annex-shell sendkey: Don't fail if a remote asks for a key to be sent that already has a transfer lock file indicating it's being sent to that remote. The remote may have moved between networks, or reconnected. + * Switched from the old haskell HTTP library to http-conduit. 
-- Joey Hess Mon, 21 Jul 2014 14:41:26 -0400 diff --git a/debian/control b/debian/control index 522b7c5cce..1106bc89da 100644 --- a/debian/control +++ b/debian/control @@ -46,6 +46,7 @@ Build-Depends: libghc-dns-dev, libghc-case-insensitive-dev, libghc-http-types-dev, + libghc-http-conduit-dev, libghc-blaze-builder-dev, libghc-crypto-api-dev, libghc-network-multicast-dev, @@ -55,7 +56,6 @@ Build-Depends: libghc-gnutls-dev (>= 0.1.4), libghc-xml-types-dev, libghc-async-dev, - libghc-http-dev, libghc-feed-dev (>= 0.3.9.2), libghc-regex-tdfa-dev [!mipsel !s390], libghc-regex-compat-dev [mipsel s390], diff --git a/git-annex.cabal b/git-annex.cabal index 097fee4cb1..58aac39b31 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -96,11 +96,11 @@ Executable git-annex Main-Is: git-annex.hs Build-Depends: MissingH, hslogger, directory, filepath, containers, utf8-string, network (>= 2.0), mtl (>= 2), - bytestring, old-locale, time, HTTP, dataenc, SHA, process, json, + bytestring, old-locale, time, dataenc, SHA, process, json, base (>= 4.5 && < 4.9), monad-control, exceptions (>= 0.5), transformers, IfElse, text, QuickCheck >= 2.1, bloomfilter, edit-distance, process, SafeSemaphore, uuid, random, dlist, unix-compat, async, stm (>= 2.3), - data-default, case-insensitive + data-default, case-insensitive, http-conduit, http-types CC-Options: -Wall GHC-Options: -Wall Extensions: PackageImports @@ -141,8 +141,7 @@ Executable git-annex CPP-Options: -DWITH_S3 if flag(WebDAV) - Build-Depends: DAV (>= 1.0), - http-client, http-types + Build-Depends: DAV (>= 1.0), http-client CPP-Options: -DWITH_WEBDAV if flag(Assistant) && ! os(solaris) @@ -188,7 +187,7 @@ Executable git-annex if flag(Webapp) Build-Depends: yesod, yesod-default, yesod-static, yesod-form, yesod-core, - http-types, wai, wai-extra, warp, warp-tls, + wai, wai-extra, warp, warp-tls, blaze-builder, crypto-api, hamlet, clientsession, template-haskell, data-default, aeson, path-pieces, shakespeare From 5eb5451021c2ee36bf90e22f991f78915ee280e3 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 18 Aug 2014 15:26:18 -0400 Subject: [PATCH 15/51] update aws version requirements --- debian/control | 2 +- git-annex.cabal | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/debian/control b/debian/control index 89141a7e32..2b430b4dc0 100644 --- a/debian/control +++ b/debian/control @@ -13,7 +13,7 @@ Build-Depends: libghc-cryptohash-dev, libghc-dataenc-dev, libghc-utf8-string-dev, - libghc-aws-dev, + libghc-aws-dev (>= 0.10.2), libghc-conduit-dev, libghc-resourcet-dev, libghc-dav-dev (>= 1.0) [amd64 i386 kfreebsd-amd64 kfreebsd-i386 powerpc], diff --git a/git-annex.cabal b/git-annex.cabal index b0835f50ae..6e57dd1f4f 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -137,7 +137,7 @@ Executable git-annex CPP-Options: -DWITH_CRYPTOHASH if flag(S3) - Build-Depends: aws, conduit, resourcet + Build-Depends: aws (>= 0.10.2), conduit, resourcet CPP-Options: -DWITH_S3 if flag(WebDAV) From 895b4964901b4a988e3c83aa79ab8d94aaa2892c Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 18 Sep 2014 14:37:20 -0400 Subject: [PATCH 16/51] remove scratch programs --- GetObject.hs | 24 ------------------------ PutObject.hs | 33 --------------------------------- RemoveObject.hs | 19 ------------------- 3 files changed, 76 deletions(-) delete mode 100644 GetObject.hs delete mode 100644 PutObject.hs delete mode 100644 RemoveObject.hs diff --git a/GetObject.hs b/GetObject.hs deleted file mode 100644 index 51764bf726..0000000000 --- a/GetObject.hs +++ /dev/null 
@@ -1,24 +0,0 @@ -{-# LANGUAGE OverloadedStrings #-} - -import qualified Aws -import qualified Aws.S3 as S3 -import Data.Conduit (($$+-)) -import Data.Conduit.Binary (sinkFile) -import Network.HTTP.Conduit (withManager, responseBody) - -main :: IO () -main = do - {- Set up AWS credentials and the default configuration. -} - Just creds <- Aws.loadCredentialsFromEnv - let cfg = Aws.Configuration Aws.Timestamp creds (Aws.defaultLog Aws.Debug) - let s3cfg = Aws.defServiceConfig :: S3.S3Configuration Aws.NormalQuery - - {- Set up a ResourceT region with an available HTTP manager. -} - withManager $ \mgr -> do - {- Create a request object with S3.getObject and run the request with pureAws. -} - S3.GetObjectResponse { S3.gorResponse = rsp } <- - Aws.pureAws cfg s3cfg mgr $ - S3.getObject "joeyh-test" "cloud-remote.pdf" - - {- Save the response to a file. -} - responseBody rsp $$+- sinkFile "cloud-remote2.pdf" diff --git a/PutObject.hs b/PutObject.hs deleted file mode 100644 index 68db1685af..0000000000 --- a/PutObject.hs +++ /dev/null @@ -1,33 +0,0 @@ -{-# LANGUAGE OverloadedStrings #-} - -import qualified Aws -import qualified Aws.S3 as S3 -import Data.Conduit (($$+-)) -import Data.Conduit.Binary (sinkFile) -import Network.HTTP.Conduit (withManager, RequestBody(..)) -import qualified Data.ByteString.Lazy as L -import qualified Data.ByteString as S -import Control.Monad.IO.Class -import Control.Concurrent -import System.Posix.Files -import System.IO -import Control.Applicative -import qualified Data.Text as T - -main :: IO () -main = do - {- Set up AWS credentials and the default configuration. -} - Just creds <- Aws.loadCredentialsFromEnv - let cfg = Aws.Configuration Aws.Timestamp creds (Aws.defaultLog Aws.Debug) - let s3cfg = Aws.defServiceConfig :: S3.S3Configuration Aws.NormalQuery - - {- Set up a ResourceT region with an available HTTP manager. -} - withManager $ \mgr -> do - let file ="cloud-remote.pdf" - -- streams file content, without buffering more than 1k in memory! - let streamer sink = withFile file ReadMode $ \h -> sink $ S.hGet h 1024 - b <- liftIO $ L.readFile file - size <- liftIO $ (fromIntegral . 
fileSize <$> getFileStatus file :: IO Integer) - rsp <- Aws.pureAws cfg s3cfg mgr $ - S3.putObject "joeyh-test" (T.pack file) (RequestBodyStream (fromInteger size) streamer) - liftIO $ print rsp diff --git a/RemoveObject.hs b/RemoveObject.hs deleted file mode 100644 index 7d61907cde..0000000000 --- a/RemoveObject.hs +++ /dev/null @@ -1,19 +0,0 @@ -{-# LANGUAGE OverloadedStrings #-} - -import qualified Aws -import qualified Aws.S3 as S3 -import Data.Conduit (($$+-)) -import Data.Conduit.Binary (sinkFile) -import Network.HTTP.Conduit (withManager, responseBody) -import Control.Monad.IO.Class - -main :: IO () -main = do - Just creds <- Aws.loadCredentialsFromEnv - let cfg = Aws.Configuration Aws.Timestamp creds (Aws.defaultLog Aws.Debug) - let s3cfg = Aws.defServiceConfig :: S3.S3Configuration Aws.NormalQuery - - withManager $ \mgr -> do - rsp <- Aws.pureAws cfg s3cfg mgr $ - S3.DeleteObject "cloud-remote.pdf" "joeyh-test" - liftIO $ print "removal done" From 147aaa10128b06d5478dbcc3c3e4905b3bca1d7d Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 18 Sep 2014 14:39:05 -0400 Subject: [PATCH 17/51] remove unused function --- Utility/Url.hs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/Utility/Url.hs b/Utility/Url.hs index 9cbabac399..cb950b8248 100644 --- a/Utility/Url.hs +++ b/Utility/Url.hs @@ -161,20 +161,6 @@ headRequest r = r (requestHeaders r) } -addUrlOptions :: UrlOptions -> Request -> Request -addUrlOptions uo r = r { requestHeaders = requestHeaders r ++ uaheader ++ otherheaders} - where - uaheader = case userAgent uo of - Nothing -> [] - Just ua -> [(hUserAgent, B8.fromString ua)] - otherheaders = map toheader (reqHeaders uo) - toheader s = - let (h, v) = separate (== ':') s - h' = CI.mk (B8.fromString h) - in case v of - (' ':v') -> (h', B8.fromString v') - _ -> (h', B8.fromString v) - {- Used to download large files, such as the contents of keys. - - Uses wget or curl program for its progress bar. (Wget has a better one, From 8b48bdfdc8e2e5bc775935e4f41e1fa5d2be4c9b Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 23 Oct 2014 11:02:24 -0400 Subject: [PATCH 18/51] enable frankfurt The aws library supports the AWS4-HMAC-SHA256 that it requires. --- Remote/Helper/AWS.hs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Remote/Helper/AWS.hs b/Remote/Helper/AWS.hs index 1675e8a50e..d27f2aad17 100644 --- a/Remote/Helper/AWS.hs +++ b/Remote/Helper/AWS.hs @@ -59,9 +59,7 @@ regionInfo service = map (\(t, r) -> (t, fromServiceRegion r)) $ [ ("US East (N. Virginia)", [S3Region "US", GlacierRegion "us-east-1"]) , ("US West (Oregon)", [BothRegion "us-west-2"]) , ("US West (N. California)", [BothRegion "us-west-1"]) - -- Requires AWS4-HMAC-SHA256 which S3 library does not - -- currently support. - -- , ("EU (Frankfurt)", [BothRegion "eu-central-1"]) + , ("EU (Frankfurt)", [BothRegion "eu-central-1"]) , ("EU (Ireland)", [S3Region "EU", GlacierRegion "eu-west-1"]) , ("Asia Pacific (Singapore)", [S3Region "ap-southeast-1"]) , ("Asia Pacific (Tokyo)", [BothRegion "ap-northeast-1"]) From f0989cf0bdcf5c157a133b5ba4134e98310d9fe5 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 23 Oct 2014 15:41:57 -0400 Subject: [PATCH 19/51] fix build --- Remote/S3.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index fe0b4992a7..85392bb062 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -83,9 +83,9 @@ gen r u c gc = do remotetype = remote, mkUnavailable = gen r u (M.insert "host" "!dne!" 
c) gc, getInfo = includeCredsInfo c (AWS.creds u) $ catMaybes - [ Just ("bucket", fromMaybe "unknown" (getBucket c)) + [ Just ("bucket", fromMaybe "unknown" (getBucketName c)) , if isIA c - then Just ("internet archive item", iaItemUrl $ fromMaybe "unknown" $ getBucket c) + then Just ("internet archive item", iaItemUrl $ fromMaybe "unknown" $ getBucketName c) else Nothing ] } From 76ee815e89173d5c175ee756229c18eac80f8ee3 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 23 Oct 2014 15:48:37 -0400 Subject: [PATCH 20/51] needs type families --- Remote/S3.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Remote/S3.hs b/Remote/S3.hs index 85392bb062..e6544a0240 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -5,6 +5,8 @@ - Licensed under the GNU GPL version 3 or higher. -} +{-# LANGUAGE TypeFamilies #-} + module Remote.S3 (remote, iaHost, configIA, isIA, iaItemUrl) where import qualified Aws as AWS From 7489f516bc13091347db19b70514dca50038faee Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 23 Oct 2014 15:50:41 -0400 Subject: [PATCH 21/51] one last build fix, yes it builds now --- Remote/S3.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index e6544a0240..e94612936c 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -86,7 +86,7 @@ gen r u c gc = do mkUnavailable = gen r u (M.insert "host" "!dne!" c) gc, getInfo = includeCredsInfo c (AWS.creds u) $ catMaybes [ Just ("bucket", fromMaybe "unknown" (getBucketName c)) - , if isIA c + , if configIA c then Just ("internet archive item", iaItemUrl $ fromMaybe "unknown" $ getBucketName c) else Nothing ] @@ -97,7 +97,7 @@ s3Setup mu mcreds c = do u <- maybe (liftIO genUUID) return mu s3Setup' u mcreds c s3Setup' :: UUID -> Maybe CredPair -> RemoteConfig -> Annex (RemoteConfig, UUID) -s3Setup' u mcreds c = if isIA c then archiveorg else defaulthost +s3Setup' u mcreds c = if configIA c then archiveorg else defaulthost where remotename = fromJust (M.lookup "name" c) defbucket = remotename ++ "-" ++ fromUUID u From 6acc6863c53bbd51846a7415a1383e144c6d9201 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 23 Oct 2014 15:54:00 -0400 Subject: [PATCH 22/51] fix build --- Assistant/WebApp/Configurators/AWS.hs | 4 ++-- Remote/S3.hs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Assistant/WebApp/Configurators/AWS.hs b/Assistant/WebApp/Configurators/AWS.hs index fa0c23bea1..701c2c8704 100644 --- a/Assistant/WebApp/Configurators/AWS.hs +++ b/Assistant/WebApp/Configurators/AWS.hs @@ -162,7 +162,7 @@ getEnableS3R :: UUID -> Handler Html #ifdef WITH_S3 getEnableS3R uuid = do m <- liftAnnex readRemoteLog - if maybe False S3.isIA (M.lookup uuid m) + if maybe False S3.configIA (M.lookup uuid m) then redirect $ EnableIAR uuid else postEnableS3R uuid #else @@ -224,5 +224,5 @@ previouslyUsedAWSCreds :: Annex (Maybe CredPair) previouslyUsedAWSCreds = getM gettype [S3.remote, Glacier.remote] where gettype t = previouslyUsedCredPair AWS.creds t $ - not . S3.isIA . Remote.config + not . S3.configIA . 
Remote.config #endif diff --git a/Remote/S3.hs b/Remote/S3.hs index e94612936c..6969096d6b 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -7,7 +7,7 @@ {-# LANGUAGE TypeFamilies #-} -module Remote.S3 (remote, iaHost, configIA, isIA, iaItemUrl) where +module Remote.S3 (remote, iaHost, configIA, iaItemUrl) where import qualified Aws as AWS import qualified Aws.Core as AWS From 171e677a3ce31fce83033159aedcd6aa81712e42 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 23 Oct 2014 16:32:18 -0400 Subject: [PATCH 23/51] update for aws 0.10's better handling of DNE for HEAD Kept support for older aws, since Debian has 0.9.2 still. --- Remote/S3.hs | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 6969096d6b..b818e2f9bc 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -203,23 +203,30 @@ remove h k checkKey :: Remote -> S3Handle -> CheckPresent checkKey r h k = do showAction $ "checking " ++ name r +#if MIN_VERSION_aws(0,10,0) + rsp <- go + return (isJust $ S3.horMetadata r) +#else catchMissingException $ do - void $ sendS3Handle h $ - S3.headObject (bucket (hinfo h)) (bucketObject (hinfo h) k) + void go return True - -{- Catch exception headObject returns when an object is not present - - in the bucket, and returns False. All other exceptions indicate a - - check error and are let through. -} -catchMissingException :: Annex Bool -> Annex Bool -catchMissingException a = catchJust missing a (const $ return False) +#endif where - -- This is not very good; see - -- https://github.com/aristidb/aws/issues/121 - missing :: AWS.HeaderException -> Maybe () - missing e - | AWS.headerErrorMessage e == "ETag missing" = Just () - | otherwise = Nothing + go = sendS3Handle h $ + S3.headObject (bucket (hinfo h)) (bucketObject (hinfo h) k) + +#if ! MIN_VERSION_aws(0,10,0) + {- Catch exception headObject returns when an object is not present + - in the bucket, and returns False. All other exceptions indicate a + - check error and are let through. -} + catchMissingException :: Annex Bool -> Annex Bool + catchMissingException a = catchJust missing a (const $ return False) + where + missing :: AWS.HeaderException -> Maybe () + missing e + | AWS.headerErrorMessage e == "ETag missing" = Just () + | otherwise = Nothing +#endif {- Generate the bucket if it does not already exist, including creating the - UUID file within the bucket. From c986bc67e54b0c43e9f17c872a03b3ee9ab8bf4e Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 23 Oct 2014 16:39:55 -0400 Subject: [PATCH 24/51] add cabal flag for use with debian's older version of aws, which is now patched with the necessary stuff --- debian/control | 2 +- debian/rules | 4 ++++ git-annex.cabal | 10 +++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/debian/control b/debian/control index 659ec05faf..55fc17448f 100644 --- a/debian/control +++ b/debian/control @@ -14,7 +14,7 @@ Build-Depends: libghc-cryptohash-dev, libghc-dataenc-dev, libghc-utf8-string-dev, - libghc-aws-dev (>= 0.10.2), + libghc-aws-dev (>= 0.9.2), libghc-conduit-dev, libghc-resourcet-dev, libghc-dav-dev (>= 1.0) [amd64 i386 kfreebsd-amd64 kfreebsd-i386 powerpc], diff --git a/debian/rules b/debian/rules index 7c8f8a560d..22be481952 100755 --- a/debian/rules +++ b/debian/rules @@ -8,6 +8,10 @@ export RELEASE_BUILD=1 %: dh $@ +# Debian currently has a patched aws 0.9.2, rather than the newer 0.10.2. 
+override_dh_auto_configure: + debian/cabal-wrapper configure -fPatchedAWS + # Not intended for use by anyone except the author. announcedir: @echo ${HOME}/src/git-annex/doc/news diff --git a/git-annex.cabal b/git-annex.cabal index 94b1ed3bec..863f35cf88 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -34,6 +34,10 @@ Description: Flag S3 Description: Enable S3 support +Flag PatchedAWS + Description: Building on system, like Debian, with old AWS patched to support git-annex + Default: False + Flag WebDAV Description: Enable WebDAV support @@ -151,7 +155,11 @@ Executable git-annex CPP-Options: -DWITH_CRYPTOHASH if flag(S3) - Build-Depends: aws (>= 0.10.2), conduit, resourcet + Build-Depends: conduit, resourcet + if flag(PatchedAWS) + Build-Depends: aws (>= 0.9.2) + else + Build-Depends: aws (>= 0.10.2) CPP-Options: -DWITH_S3 if flag(WebDAV) From 8edf7a0fc34abae6f02ae6c4f4a032a83f1d14c0 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 23 Oct 2014 16:51:10 -0400 Subject: [PATCH 25/51] fix build --- Remote/S3.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/Remote/S3.hs b/Remote/S3.hs index b818e2f9bc..4815b09884 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -6,6 +6,7 @@ -} {-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE CPP #-} module Remote.S3 (remote, iaHost, configIA, iaItemUrl) where From 8ed1a0afee79699afbc8f08085aacc6cdf47d336 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 23 Oct 2014 16:52:05 -0400 Subject: [PATCH 26/51] fix build --- Remote/S3.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 4815b09884..bf130b7aed 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -206,7 +206,7 @@ checkKey r h k = do showAction $ "checking " ++ name r #if MIN_VERSION_aws(0,10,0) rsp <- go - return (isJust $ S3.horMetadata r) + return (isJust $ S3.horMetadata rsp) #else catchMissingException $ do void go From 6e89d070bc38afbf1fdf30e887dab920785c9fe8 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 28 Oct 2014 14:17:30 -0400 Subject: [PATCH 27/51] WIP multipart S3 upload I'm a little stuck on getting the list of etags of the parts. This seems to require taking the md5 of each part locally, which doesn't get along well with lazily streaming in the part from the file. It would need to read the file twice, or lose laziness and buffer a whole part -- but parts might be quite large. This seems to be a problem with the API provided; S3 is supposed to return an etag, but that is not exposed. I have filed a bug: https://github.com/aristidb/aws/issues/141 --- Remote/S3.hs | 48 +++++++++++++++++++++++++++++++++---- doc/special_remotes/S3.mdwn | 5 ++++ git-annex.cabal | 2 +- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index bf130b7aed..9a618329a5 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -40,6 +40,7 @@ import Creds import Annex.UUID import Logs.Web import Utility.Metered +import Utility.DataUnits type BucketName = String @@ -151,14 +152,46 @@ prepareS3 r info = resourcePrepare $ const $ store :: Remote -> S3Handle -> Storer store r h = fileStorer $ \k f p -> do - rbody <- liftIO $ httpBodyStorer f p - void $ sendS3Handle h $ putObject h (bucketObject (hinfo h) k) rbody - + case partSize (hinfo h) of + Just sz -> do + fsz <- fromIntegral . fileSize <$> liftIO (getFileStatus f) + if fsz > sz + then multipartupload sz k f p + else singlepartupload k f p + Nothing -> singlepartupload k f p -- Store public URL to item in Internet Archive. 
when (isIA (hinfo h) && not (isChunkKey k)) $ setUrlPresent k (iaKeyUrl r k) - return True + where + singlepartupload k f p = do + rbody <- liftIO $ httpBodyStorer f p + void $ sendS3Handle h $ putObject h (bucketObject (hinfo h) k) rbody + multipartupload sz k f p = do +#if MIN_VERSION_aws(0,10,4) + let info = hinfo h + let objects = bucketObject info h + + uploadid <- S3.imurUploadId <$> sendS3Handle' h $ + (S3.postInitiateMultipartUpload (bucket info) object) + { S3.imuStorageClass = Just (storageClass info) + , S3.imuMetadata = metaHeaders info + , S3.imuAutoMakeBucket = isIA info + , S3.imuExpires = Nothing -- TODO set some reasonable expiry + } + + -- TODO open file, read each part of size sz (streaming + -- it); send part to S3, and get a list of etags of all + -- the parts + + + void $ sendS3Handle' h $ + S3.postCompleteMultipartUpload (bucket info) object uploadid $ + zip [1..] (map T.pack etags) +#else + warning $ "Cannot do multipart upload (partsize " ++ show sz ++ "); built with too old a version of the aws library." + singlepartupload k f p +#endif {- Implemented as a fileRetriever, that uses conduit to stream the chunks - out to the file. Would be better to implement a byteRetriever, but @@ -373,6 +406,7 @@ data S3Info = S3Info , storageClass :: S3.StorageClass , bucketObject :: Key -> T.Text , metaHeaders :: [(T.Text, T.Text)] + , partSize :: Maybe Integer , isIA :: Bool } @@ -387,6 +421,7 @@ extractS3Info c = do , storageClass = getStorageClass c , bucketObject = T.pack . getBucketObject c , metaHeaders = getMetaHeaders c + , partSize = getPartSize c , isIA = configIA c } @@ -397,7 +432,10 @@ getStorageClass :: RemoteConfig -> S3.StorageClass getStorageClass c = case M.lookup "storageclass" c of Just "REDUCED_REDUNDANCY" -> S3.ReducedRedundancy _ -> S3.Standard - + +getPartSize :: RemoteConfig -> Maybe Integer +getPartSize c = readSize dataUnits =<< M.lookup "partsize" c + getMetaHeaders :: RemoteConfig -> [(T.Text, T.Text)] getMetaHeaders = map munge . filter ismetaheader . M.assocs where diff --git a/doc/special_remotes/S3.mdwn b/doc/special_remotes/S3.mdwn index 492c247cbe..c7c6f76c57 100644 --- a/doc/special_remotes/S3.mdwn +++ b/doc/special_remotes/S3.mdwn @@ -21,6 +21,11 @@ the S3 remote. * `chunk` - Enables [[chunking]] when storing large files. `chunk=1MiB` is a good starting point for chunking. +* `partsize` - Specifies the largest object to attempt to store in the + bucket. Multipart uploads will be used when storing larger objects. + This is not enabled by default, but can be enabled or changed at any + time. Setting `partsize=1GiB` is reasonable for S3. + * `keyid` - Specifies the gpg key to use for [[encryption]]. * `embedcreds` - Optional. Set to "yes" embed the login credentials inside diff --git a/git-annex.cabal b/git-annex.cabal index 27050d30c6..d746dbf590 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -159,7 +159,7 @@ Executable git-annex if flag(PatchedAWS) Build-Depends: aws (>= 0.9.2) else - Build-Depends: aws (>= 0.10.2) + Build-Depends: aws (>= 0.10.4) CPP-Options: -DWITH_S3 if flag(WebDAV) From 8faeb25076c29b0c65dd2f4125414df6c865123c Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 15:53:22 -0400 Subject: [PATCH 28/51] finish multipart support using unreleased update to aws lib to yield etags Untested and not even compiled yet. Testing should include checks that file content streams through without buffering in memory. Note that CL.consume causes all the etags to be buffered in memory. 
This is probably nearly unavoidable, since a request has to be constructed that contains the list of etags in its body. (While it might be possible to stream generation of the body, that would entail making a http request that dribbles out parts of the body as the multipart uploads complete, which is not likely to work well.. To limit this being a problem, it's best for partsize to be set to some suitably large value, like 1gb. Then a full terabyte file will need only 1024 etags to be stored, which will probably use around 1 mb of memory. --- Remote/S3.hs | 19 +++++++++++-------- doc/bugs/S3_upload_not_using_multipart.mdwn | 8 ++++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 9a618329a5..9c90d4b2c0 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -13,6 +13,10 @@ module Remote.S3 (remote, iaHost, configIA, iaItemUrl) where import qualified Aws as AWS import qualified Aws.Core as AWS import qualified Aws.S3 as S3 +#if MIN_VERSION_aws(0,10,4) +import qualified Aws.S3.Commands.Multipart as Multipart +import qualified Data.Conduit.List as CL +#endif import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.ByteString.Lazy as L @@ -170,7 +174,7 @@ store r h = fileStorer $ \k f p -> do multipartupload sz k f p = do #if MIN_VERSION_aws(0,10,4) let info = hinfo h - let objects = bucketObject info h + let object = bucketObject info h uploadid <- S3.imurUploadId <$> sendS3Handle' h $ (S3.postInitiateMultipartUpload (bucket info) object) @@ -180,14 +184,13 @@ store r h = fileStorer $ \k f p -> do , S3.imuExpires = Nothing -- TODO set some reasonable expiry } - -- TODO open file, read each part of size sz (streaming - -- it); send part to S3, and get a list of etags of all - -- the parts - + etags <- sourceFile f + $= Multipart.chunkedConduit sz + $= Multipart.putConduit (hawscfg h) (hs3cfg h) (hmanager h) (bucket info) object uploadid + $$ CL.consume - void $ sendS3Handle' h $ - S3.postCompleteMultipartUpload (bucket info) object uploadid $ - zip [1..] (map T.pack etags) + void $ sendS3Handle' h $ S3.postCompleteMultipartUpload + (bucket info) object uploadid (zip [1..] etags) #else warning $ "Cannot do multipart upload (partsize " ++ show sz ++ "); built with too old a version of the aws library." singlepartupload k f p diff --git a/doc/bugs/S3_upload_not_using_multipart.mdwn b/doc/bugs/S3_upload_not_using_multipart.mdwn index 5e5d97c6a3..cd40e9d2ba 100644 --- a/doc/bugs/S3_upload_not_using_multipart.mdwn +++ b/doc/bugs/S3_upload_not_using_multipart.mdwn @@ -52,3 +52,11 @@ Please provide any additional information below. upgrade supported from repository versions: 0 1 2 [[!tag confirmed]] + +> [[fixed|done]] This is now supported, when git-annex is built with a new +> enough version of the aws library. 
You need to configure the remote to +> use an appropriate value for multipart, eg: +> +> git annex enableremote cloud multipart=1GiB +> +> --[[Joey]] From 5c3d9d6caa06b57b4ec25ce61e2db4d37fe109e7 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 16:07:41 -0400 Subject: [PATCH 29/51] show multipart configuration in git annex info s3remote --- Remote/S3.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/Remote/S3.hs b/Remote/S3.hs index 9c90d4b2c0..26f7a7a9aa 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -95,6 +95,7 @@ gen r u c gc = do , if configIA c then Just ("internet archive item", iaItemUrl $ fromMaybe "unknown" $ getBucketName c) else Nothing + , Just ("multipart", maybe "disabled" (roughSize storageUnits False) (getPartSize c)) ] } From 6a965cf8d762427af466c251b889630a621a8a0e Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 16:20:51 -0400 Subject: [PATCH 30/51] adjust version check I assume 0.10.6 will have the fix for the bug I reported, which got fixed in master already.. --- Remote/S3.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 26f7a7a9aa..e9879b9f41 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -13,7 +13,7 @@ module Remote.S3 (remote, iaHost, configIA, iaItemUrl) where import qualified Aws as AWS import qualified Aws.Core as AWS import qualified Aws.S3 as S3 -#if MIN_VERSION_aws(0,10,4) +#if MIN_VERSION_aws(0,10,6) import qualified Aws.S3.Commands.Multipart as Multipart import qualified Data.Conduit.List as CL #endif @@ -173,7 +173,7 @@ store r h = fileStorer $ \k f p -> do rbody <- liftIO $ httpBodyStorer f p void $ sendS3Handle h $ putObject h (bucketObject (hinfo h) k) rbody multipartupload sz k f p = do -#if MIN_VERSION_aws(0,10,4) +#if MIN_VERSION_aws(0,10,6) let info = hinfo h let object = bucketObject info h From 2c53f331bdc4a02a4e8e47435715592c57df5c15 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 17:23:46 -0400 Subject: [PATCH 31/51] fix build --- Remote/S3.hs | 26 +++++++++++++++----------- git-annex.cabal | 2 +- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index e9879b9f41..e5ed17c492 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -13,10 +13,6 @@ module Remote.S3 (remote, iaHost, configIA, iaItemUrl) where import qualified Aws as AWS import qualified Aws.Core as AWS import qualified Aws.S3 as S3 -#if MIN_VERSION_aws(0,10,6) -import qualified Aws.S3.Commands.Multipart as Multipart -import qualified Data.Conduit.List as CL -#endif import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.ByteString.Lazy as L @@ -24,12 +20,18 @@ import qualified Data.ByteString as S import qualified Data.Map as M import Data.Char import Network.Socket (HostName) -import Network.HTTP.Conduit (Manager, newManager, closeManager) +import Network.HTTP.Conduit (Manager, newManager, closeManager, withManager) import Network.HTTP.Client (defaultManagerSettings, managerResponseTimeout, responseStatus, responseBody, RequestBody(..)) import Network.HTTP.Types import Control.Monad.Trans.Resource import Control.Monad.Catch import Data.Conduit +#if MIN_VERSION_aws(0,10,6) +import qualified Aws.S3.Commands.Multipart as Multipart +import qualified Data.Conduit.List as CL +import qualified Data.Conduit.Binary as CB +import Network.HTTP.Conduit (withManager) +#endif import Common.Annex import Types.Remote @@ -175,22 +177,24 @@ store r h = fileStorer $ \k f p -> do multipartupload sz k f p = do #if 
MIN_VERSION_aws(0,10,6) let info = hinfo h - let object = bucketObject info h + let object = bucketObject info k - uploadid <- S3.imurUploadId <$> sendS3Handle' h $ - (S3.postInitiateMultipartUpload (bucket info) object) + let req = (S3.postInitiateMultipartUpload (bucket info) object) { S3.imuStorageClass = Just (storageClass info) , S3.imuMetadata = metaHeaders info , S3.imuAutoMakeBucket = isIA info , S3.imuExpires = Nothing -- TODO set some reasonable expiry } + uploadid <- S3.imurUploadId <$> sendS3Handle h req - etags <- sourceFile f + -- TODO: progress display + etags <- liftIO $ withManager $ \mgr -> + CB.sourceFile f $= Multipart.chunkedConduit sz - $= Multipart.putConduit (hawscfg h) (hs3cfg h) (hmanager h) (bucket info) object uploadid + $= Multipart.putConduit (hawscfg h) (hs3cfg h) mgr (bucket info) object uploadid $$ CL.consume - void $ sendS3Handle' h $ S3.postCompleteMultipartUpload + void $ sendS3Handle h $ S3.postCompleteMultipartUpload (bucket info) object uploadid (zip [1..] etags) #else warning $ "Cannot do multipart upload (partsize " ++ show sz ++ "); built with too old a version of the aws library." diff --git a/git-annex.cabal b/git-annex.cabal index d746dbf590..ad50552e40 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -155,7 +155,7 @@ Executable git-annex CPP-Options: -DWITH_CRYPTOHASH if flag(S3) - Build-Depends: conduit, resourcet + Build-Depends: conduit, resourcet, conduit-extra if flag(PatchedAWS) Build-Depends: aws (>= 0.9.2) else From 711b18a6ebdd68e4d5bf00b71938b5597a4cd72a Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 17:24:53 -0400 Subject: [PATCH 32/51] improve info display for multipart --- Remote/S3.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index e5ed17c492..3bf15c4ffb 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -97,7 +97,7 @@ gen r u c gc = do , if configIA c then Just ("internet archive item", iaItemUrl $ fromMaybe "unknown" $ getBucketName c) else Nothing - , Just ("multipart", maybe "disabled" (roughSize storageUnits False) (getPartSize c)) + , Just ("partsize", maybe "unlimited" (roughSize storageUnits False) (getPartSize c)) ] } From 8f61bfad510125e21ecf57baf59420cd0d86cfd2 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 17:55:05 -0400 Subject: [PATCH 33/51] link to memory leak bug --- Remote/S3.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Remote/S3.hs b/Remote/S3.hs index 3bf15c4ffb..60501f2ce2 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -188,6 +188,8 @@ store r h = fileStorer $ \k f p -> do uploadid <- S3.imurUploadId <$> sendS3Handle h req -- TODO: progress display + -- TODO: avoid needing tons of memory + -- https://github.com/aristidb/aws/issues/142 etags <- liftIO $ withManager $ \mgr -> CB.sourceFile f $= Multipart.chunkedConduit sz From 5360417436a13e0671d2165159b48253fb072521 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 19:18:46 -0400 Subject: [PATCH 34/51] WIP try sending using RequestBodyStreamChunked May not work; if it does this is gonna be the simplest way to get good memory size and progress reporting. 
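The gist, as a rough standalone sketch (illustrative only, not the patch
itself; chunkedBody is a made-up helper name, and the only real API used
is http-client's RequestBodyStreamChunked):

    import Control.Concurrent.MVar
    import Network.HTTP.Client (RequestBody(..))
    import qualified Data.ByteString as S
    import qualified Data.ByteString.Lazy as L

    -- Wrap a lazy ByteString as a chunked streaming request body.
    -- The popper hands the consumer one strict chunk at a time out of
    -- an MVar, and an empty ByteString once the chunks run out, so the
    -- body can be streamed without materializing it all at once.
    chunkedBody :: L.ByteString -> RequestBody
    chunkedBody b = RequestBodyStreamChunked $ \sink -> do
        mvar <- newMVar (L.toChunks b)
        sink $ modifyMVar mvar $ \cs -> return $ case cs of
            [] -> ([], S.empty)
            (c:rest) -> (rest, c)

A part read with hGetUntilMetered could then be sent to S3.uploadPart
wrapped this way (or with RequestBodyStream and an explicit length, as
a later revision does).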
--- Remote/S3.hs | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 60501f2ce2..1de1abad6c 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -27,7 +27,6 @@ import Control.Monad.Trans.Resource import Control.Monad.Catch import Data.Conduit #if MIN_VERSION_aws(0,10,6) -import qualified Aws.S3.Commands.Multipart as Multipart import qualified Data.Conduit.List as CL import qualified Data.Conduit.Binary as CB import Network.HTTP.Conduit (withManager) @@ -160,10 +159,10 @@ prepareS3 r info = resourcePrepare $ const $ store :: Remote -> S3Handle -> Storer store r h = fileStorer $ \k f p -> do case partSize (hinfo h) of - Just sz -> do + Just partsz | partsz > 0 -> do fsz <- fromIntegral . fileSize <$> liftIO (getFileStatus f) - if fsz > sz - then multipartupload sz k f p + if fsz > partsz + then multipartupload fsz partsz k f p else singlepartupload k f p Nothing -> singlepartupload k f p -- Store public URL to item in Internet Archive. @@ -174,7 +173,7 @@ store r h = fileStorer $ \k f p -> do singlepartupload k f p = do rbody <- liftIO $ httpBodyStorer f p void $ sendS3Handle h $ putObject h (bucketObject (hinfo h) k) rbody - multipartupload sz k f p = do + multipartupload fsz partsz k f p = do #if MIN_VERSION_aws(0,10,6) let info = hinfo h let object = bucketObject info k @@ -187,19 +186,30 @@ store r h = fileStorer $ \k f p -> do } uploadid <- S3.imurUploadId <$> sendS3Handle h req - -- TODO: progress display - -- TODO: avoid needing tons of memory - -- https://github.com/aristidb/aws/issues/142 - etags <- liftIO $ withManager $ \mgr -> - CB.sourceFile f - $= Multipart.chunkedConduit sz - $= Multipart.putConduit (hawscfg h) (hs3cfg h) mgr (bucket info) object uploadid - $$ CL.consume + -- Send parts of the file, taking care to stream each part + -- w/o buffering in memory, since the parts can be large. + etags <- bracketIO (openBinaryFile f ReadMode) hClose $ \h -> do + let sendparts etags partnum = do + b <- liftIO $ hGetUntilMetered h (< partsz) p + if L.null b + then return (reverse etags) + else do + mvar <- newMVar $ L.toChunks b + let streamer sink = do + let getnextchunk = modifyMVar mvar $ pure . pop + sink getnextchunk + let body = RequestBodyStreamChunked streamer + S3.UploadPartResponse _ etag <- sendS3Handle h $ + S3.uploadPart (bucket info) object partnum uploadid body + sendparts (etag:etags) (partnum + 1) + sendparts [] 0 1 void $ sendS3Handle h $ S3.postCompleteMultipartUpload (bucket info) object uploadid (zip [1..] etags) + pop [] = ([], S.empty) + pop (c:cs) = (cs, c) #else - warning $ "Cannot do multipart upload (partsize " ++ show sz ++ "); built with too old a version of the aws library." + warning $ "Cannot do multipart upload (partsize " ++ show partsz ++ " vs filesize " ++ show fsz ++ "); built with too old a version of the aws library." 
singlepartupload k f p #endif From d16382e99f2c8651c1e850279b8a7b2cfa2ea7ba Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 19:50:33 -0400 Subject: [PATCH 35/51] WIP 2 --- Remote/Helper/Http.hs | 13 ++++++++----- Remote/S3.hs | 37 +++++++++++++++---------------------- 2 files changed, 23 insertions(+), 27 deletions(-) diff --git a/Remote/Helper/Http.hs b/Remote/Helper/Http.hs index 4088854ff3..cb3af335ab 100644 --- a/Remote/Helper/Http.hs +++ b/Remote/Helper/Http.hs @@ -11,7 +11,7 @@ import Common.Annex import Types.StoreRetrieve import Utility.Metered import Remote.Helper.Special -import Network.HTTP.Client (RequestBody(..), Response, responseStatus, responseBody, BodyReader) +import Network.HTTP.Client (RequestBody(..), Response, responseStatus, responseBody, BodyReader, NeedsPopper) import Network.HTTP.Types import qualified Data.ByteString.Lazy as L @@ -31,11 +31,14 @@ httpStorer a = fileStorer $ \k f m -> a k =<< liftIO (httpBodyStorer f m) httpBodyStorer :: FilePath -> MeterUpdate -> IO RequestBody httpBodyStorer src m = do size <- fromIntegral . fileSize <$> getFileStatus src :: IO Integer - let streamer sink = withMeteredFile src m $ \b -> do - mvar <- newMVar $ L.toChunks b - let getnextchunk = modifyMVar mvar $ pure . pop - sink getnextchunk + let streamer sink = withMeteredFile src m $ \b -> mkPopper b sink return $ RequestBodyStream (fromInteger size) streamer + +mkPopper :: L.ByteString -> NeedsPopper () -> IO () +mkPopper b sink = do + mvar <- newMVar $ L.toChunks b + let getnextchunk = modifyMVar mvar $ pure . pop + sink getnextchunk where pop [] = ([], S.empty) pop (c:cs) = (cs, c) diff --git a/Remote/S3.hs b/Remote/S3.hs index 1de1abad6c..685f95bbb8 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -29,7 +29,6 @@ import Data.Conduit #if MIN_VERSION_aws(0,10,6) import qualified Data.Conduit.List as CL import qualified Data.Conduit.Binary as CB -import Network.HTTP.Conduit (withManager) #endif import Common.Annex @@ -162,9 +161,9 @@ store r h = fileStorer $ \k f p -> do Just partsz | partsz > 0 -> do fsz <- fromIntegral . fileSize <$> liftIO (getFileStatus f) if fsz > partsz - then multipartupload fsz partsz k f p + then multipartupload partsz k f p else singlepartupload k f p - Nothing -> singlepartupload k f p + _ -> singlepartupload k f p -- Store public URL to item in Internet Archive. when (isIA (hinfo h) && not (isChunkKey k)) $ setUrlPresent k (iaKeyUrl r k) @@ -173,7 +172,7 @@ store r h = fileStorer $ \k f p -> do singlepartupload k f p = do rbody <- liftIO $ httpBodyStorer f p void $ sendS3Handle h $ putObject h (bucketObject (hinfo h) k) rbody - multipartupload fsz partsz k f p = do + multipartupload partsz k f p = do #if MIN_VERSION_aws(0,10,6) let info = hinfo h let object = bucketObject info k @@ -188,28 +187,22 @@ store r h = fileStorer $ \k f p -> do -- Send parts of the file, taking care to stream each part -- w/o buffering in memory, since the parts can be large. - etags <- bracketIO (openBinaryFile f ReadMode) hClose $ \h -> do - let sendparts etags partnum = do - b <- liftIO $ hGetUntilMetered h (< partsz) p - if L.null b - then return (reverse etags) - else do - mvar <- newMVar $ L.toChunks b - let streamer sink = do - let getnextchunk = modifyMVar mvar $ pure . 
pop - sink getnextchunk - let body = RequestBodyStreamChunked streamer - S3.UploadPartResponse _ etag <- sendS3Handle h $ - S3.uploadPart (bucket info) object partnum uploadid body - sendparts (etag:etags) (partnum + 1) - sendparts [] 0 1 + etags <- bracketIO (openBinaryFile f ReadMode) hClose $ \fh -> do + let sendparts etags partnum = ifM (hIsEOF fh) + ( return (reverse etags) + , do + b <- liftIO $ hGetUntilMetered fh (< partsz) p + let body = RequestBodyStream (L.length b) (mkPopper b) + S3.UploadPartResponse _ etag <- sendS3Handle h $ + S3.uploadPart (bucket info) object partnum uploadid body + sendparts (etag:etags) (partnum + 1) + ) + sendparts [] 1 void $ sendS3Handle h $ S3.postCompleteMultipartUpload (bucket info) object uploadid (zip [1..] etags) - pop [] = ([], S.empty) - pop (c:cs) = (cs, c) #else - warning $ "Cannot do multipart upload (partsize " ++ show partsz ++ " vs filesize " ++ show fsz ++ "); built with too old a version of the aws library." + warning $ "Cannot do multipart upload (partsize " ++ show partsz ++ "); built with too old a version of the aws library." singlepartupload k f p #endif From 62de9a39bfb0cb0bc241ece5f95d0f25f53934c6 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 20:04:42 -0400 Subject: [PATCH 36/51] WIP 3 --- Remote/S3.hs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 685f95bbb8..73518c65c7 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -26,10 +26,6 @@ import Network.HTTP.Types import Control.Monad.Trans.Resource import Control.Monad.Catch import Data.Conduit -#if MIN_VERSION_aws(0,10,6) -import qualified Data.Conduit.List as CL -import qualified Data.Conduit.Binary as CB -#endif import Common.Annex import Types.Remote @@ -188,16 +184,17 @@ store r h = fileStorer $ \k f p -> do -- Send parts of the file, taking care to stream each part -- w/o buffering in memory, since the parts can be large. etags <- bracketIO (openBinaryFile f ReadMode) hClose $ \fh -> do - let sendparts etags partnum = ifM (hIsEOF fh) + let sendparts meter etags partnum = ifM (liftIO $ hIsOpen fh) ( return (reverse etags) , do - b <- liftIO $ hGetUntilMetered fh (< partsz) p - let body = RequestBodyStream (L.length b) (mkPopper b) + b <- liftIO $ hGetUntilMetered fh (< partsz) meter + let sz = L.length b + let body = RequestBodyStream sz (mkPopper b) S3.UploadPartResponse _ etag <- sendS3Handle h $ S3.uploadPart (bucket info) object partnum uploadid body - sendparts (etag:etags) (partnum + 1) + sendparts (offsetMeterUpdate meter (toBytesProcessed sz)) (etag:etags) (partnum + 1) ) - sendparts [] 1 + sendparts p [] 1 void $ sendS3Handle h $ S3.postCompleteMultipartUpload (bucket info) object uploadid (zip [1..] 
etags) From 4230b56b797b759e53ac0cf3cb7ef2765a004854 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 20:15:33 -0400 Subject: [PATCH 37/51] logic error --- Remote/S3.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 73518c65c7..3e87407c5b 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -20,7 +20,7 @@ import qualified Data.ByteString as S import qualified Data.Map as M import Data.Char import Network.Socket (HostName) -import Network.HTTP.Conduit (Manager, newManager, closeManager, withManager) +import Network.HTTP.Conduit (Manager, newManager, closeManager) import Network.HTTP.Client (defaultManagerSettings, managerResponseTimeout, responseStatus, responseBody, RequestBody(..)) import Network.HTTP.Types import Control.Monad.Trans.Resource @@ -184,7 +184,7 @@ store r h = fileStorer $ \k f p -> do -- Send parts of the file, taking care to stream each part -- w/o buffering in memory, since the parts can be large. etags <- bracketIO (openBinaryFile f ReadMode) hClose $ \fh -> do - let sendparts meter etags partnum = ifM (liftIO $ hIsOpen fh) + let sendparts meter etags partnum = ifM (liftIO $ hIsClosed fh) ( return (reverse etags) , do b <- liftIO $ hGetUntilMetered fh (< partsz) meter From bd09046291a5fde26afc02f5838c6594c1fa9c00 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 20:36:11 -0400 Subject: [PATCH 38/51] switch from hGetSome to hGet This should be essentially no-op change for hGetContentsMetered, since it always gets the entire contents. So the only difference is that each chunk of the lazy bytestring will always be the full chunk size. So, I'm pretty sure this is safe. Also, the only current users of hGetContentsMetered are reading files, so the stream won't block for long in the middle. The improvement is that hGetUntilMetered will always get some multiple of the defaultChunkSize. This will allow the S3 multipart code to pick a fixed size and know that hGetUntilMetered will really get that size. --- Utility/Metered.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Utility/Metered.hs b/Utility/Metered.hs index 447eab2e89..f27eee26db 100644 --- a/Utility/Metered.hs +++ b/Utility/Metered.hs @@ -120,7 +120,7 @@ hGetUntilMetered h keepgoing meterupdate = lazyRead zeroBytesProcessed lazyRead sofar = unsafeInterleaveIO $ loop sofar loop sofar = do - c <- S.hGetSome h defaultChunkSize + c <- S.hGet h defaultChunkSize if S.null c then do hClose h From f0551578d6082c457e1609c23d4560f0684fbfe9 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 20:49:30 -0400 Subject: [PATCH 39/51] this should avoid leaking memory --- Remote/S3.hs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 3e87407c5b..9184e0698c 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -157,7 +157,7 @@ store r h = fileStorer $ \k f p -> do Just partsz | partsz > 0 -> do fsz <- fromIntegral . fileSize <$> liftIO (getFileStatus f) if fsz > partsz - then multipartupload partsz k f p + then multipartupload fsz partsz k f p else singlepartupload k f p _ -> singlepartupload k f p -- Store public URL to item in Internet Archive. 
@@ -168,7 +168,7 @@ store r h = fileStorer $ \k f p -> do singlepartupload k f p = do rbody <- liftIO $ httpBodyStorer f p void $ sendS3Handle h $ putObject h (bucketObject (hinfo h) k) rbody - multipartupload partsz k f p = do + multipartupload fsz partsz k f p = do #if MIN_VERSION_aws(0,10,6) let info = hinfo h let object = bucketObject info k @@ -181,14 +181,23 @@ store r h = fileStorer $ \k f p -> do } uploadid <- S3.imurUploadId <$> sendS3Handle h req + -- The actual part size will be a even multiple of the + -- 32k chunk size that hGetUntilMetered uses. + let partsz' = (partsz `div` defaultChunkSize) * defaultChunkSize + -- Send parts of the file, taking care to stream each part -- w/o buffering in memory, since the parts can be large. etags <- bracketIO (openBinaryFile f ReadMode) hClose $ \fh -> do let sendparts meter etags partnum = ifM (liftIO $ hIsClosed fh) ( return (reverse etags) , do - b <- liftIO $ hGetUntilMetered fh (< partsz) meter - let sz = L.length b + pos <- liftIO $ hTell fh + -- Calculate size of part that will + -- be read. + let sz = if fsz - pos < partsz' + then fsz - pos + else partsz' + b <- liftIO $ hGetUntilMetered fh (< partsz') meter let body = RequestBodyStream sz (mkPopper b) S3.UploadPartResponse _ etag <- sendS3Handle h $ S3.uploadPart (bucket info) object partnum uploadid body @@ -199,7 +208,7 @@ store r h = fileStorer $ \k f p -> do void $ sendS3Handle h $ S3.postCompleteMultipartUpload (bucket info) object uploadid (zip [1..] etags) #else - warning $ "Cannot do multipart upload (partsize " ++ show partsz ++ "); built with too old a version of the aws library." + warning $ "Cannot do multipart upload (partsize " ++ show partsz ++ ") of large file (" ++ show fsz ++ "); built with too old a version of the aws library." singlepartupload k f p #endif From 0f78f197eb87482aa25196e64eacf132570f9dc6 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Nov 2014 21:02:44 -0400 Subject: [PATCH 40/51] casts; now fully working.. but still leaking Still seems to buffer the whole partsize in memory, but I'm pretty sure my code is not what's doing it. See https://github.com/aristidb/aws/issues/142 --- Remote/S3.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 9184e0698c..b105855151 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -183,7 +183,7 @@ store r h = fileStorer $ \k f p -> do -- The actual part size will be a even multiple of the -- 32k chunk size that hGetUntilMetered uses. - let partsz' = (partsz `div` defaultChunkSize) * defaultChunkSize + let partsz' = (partsz `div` toInteger defaultChunkSize) * toInteger defaultChunkSize -- Send parts of the file, taking care to stream each part -- w/o buffering in memory, since the parts can be large. 
@@ -198,7 +198,7 @@ store r h = fileStorer $ \k f p -> do then fsz - pos else partsz' b <- liftIO $ hGetUntilMetered fh (< partsz') meter - let body = RequestBodyStream sz (mkPopper b) + let body = RequestBodyStream (fromIntegral sz) (mkPopper b) S3.UploadPartResponse _ etag <- sendS3Handle h $ S3.uploadPart (bucket info) object partnum uploadid body sendparts (offsetMeterUpdate meter (toBytesProcessed sz)) (etag:etags) (partnum + 1) From 29871e320cfad071f179072d14872d8f6e1b08ce Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 4 Nov 2014 14:47:18 -0400 Subject: [PATCH 41/51] combine 2 checks --- Remote/S3.hs | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index b105855151..f19c428429 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -188,21 +188,21 @@ store r h = fileStorer $ \k f p -> do -- Send parts of the file, taking care to stream each part -- w/o buffering in memory, since the parts can be large. etags <- bracketIO (openBinaryFile f ReadMode) hClose $ \fh -> do - let sendparts meter etags partnum = ifM (liftIO $ hIsClosed fh) - ( return (reverse etags) - , do - pos <- liftIO $ hTell fh - -- Calculate size of part that will - -- be read. - let sz = if fsz - pos < partsz' - then fsz - pos - else partsz' - b <- liftIO $ hGetUntilMetered fh (< partsz') meter - let body = RequestBodyStream (fromIntegral sz) (mkPopper b) - S3.UploadPartResponse _ etag <- sendS3Handle h $ - S3.uploadPart (bucket info) object partnum uploadid body - sendparts (offsetMeterUpdate meter (toBytesProcessed sz)) (etag:etags) (partnum + 1) - ) + let sendparts meter etags partnum = do + pos <- liftIO $ hTell fh + if pos >= fsz + then return (reverse etags) + else do + -- Calculate size of part that will + -- be read. + let sz = if fsz - pos < partsz' + then fsz - pos + else partsz' + b <- liftIO $ hGetUntilMetered fh (< partsz') meter + let body = RequestBodyStream (fromIntegral sz) (mkPopper b) + S3.UploadPartResponse _ etag <- sendS3Handle h $ + S3.uploadPart (bucket info) object partnum uploadid body + sendparts (offsetMeterUpdate meter (toBytesProcessed sz)) (etag:etags) (partnum + 1) sendparts p [] 1 void $ sendS3Handle h $ S3.postCompleteMultipartUpload From fccdd61eeccc7a2559bdbf5b9ff2940d362d9917 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 4 Nov 2014 15:22:08 -0400 Subject: [PATCH 42/51] fix memory leak Unfortunately, I don't fully understand why it was leaking using the old method of a lazy bytestring. I just know that it was leaking, despite neither hGetUntilMetered nor byteStringPopper seeming to leak by themselves. The new method avoids the lazy bytestring, and simply reads chunks from the handle and streams them out to the http socket. --- Remote/Helper/Http.hs | 35 +++++++++++++++++++++++++++++------ Remote/S3.hs | 9 +++++---- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/Remote/Helper/Http.hs b/Remote/Helper/Http.hs index cb3af335ab..6ce5bacb82 100644 --- a/Remote/Helper/Http.hs +++ b/Remote/Helper/Http.hs @@ -5,6 +5,8 @@ - Licensed under the GNU GPL version 3 or higher. -} +{-# LANGUAGE BangPatterns #-} + module Remote.Helper.Http where import Common.Annex @@ -31,17 +33,38 @@ httpStorer a = fileStorer $ \k f m -> a k =<< liftIO (httpBodyStorer f m) httpBodyStorer :: FilePath -> MeterUpdate -> IO RequestBody httpBodyStorer src m = do size <- fromIntegral . 
fileSize <$> getFileStatus src :: IO Integer - let streamer sink = withMeteredFile src m $ \b -> mkPopper b sink + let streamer sink = withMeteredFile src m $ \b -> byteStringPopper b sink return $ RequestBodyStream (fromInteger size) streamer -mkPopper :: L.ByteString -> NeedsPopper () -> IO () -mkPopper b sink = do +byteStringPopper :: L.ByteString -> NeedsPopper () -> IO () +byteStringPopper b sink = do mvar <- newMVar $ L.toChunks b - let getnextchunk = modifyMVar mvar $ pure . pop + let getnextchunk = modifyMVar mvar $ \v -> + case v of + [] -> return ([], S.empty) + (c:cs) -> return (cs, c) + sink getnextchunk + +{- Makes a Popper that streams a given number of chunks of a given + - size from the handle, updating the meter as the chunks are read. -} +handlePopper :: Integer -> Int -> MeterUpdate -> Handle -> NeedsPopper () -> IO () +handlePopper numchunks chunksize meterupdate h sink = do + mvar <- newMVar zeroBytesProcessed + let getnextchunk = do + sent <- takeMVar mvar + if sent >= target + then do + putMVar mvar sent + return S.empty + else do + b <- S.hGet h chunksize + let !sent' = addBytesProcessed sent chunksize + putMVar mvar sent' + meterupdate sent' + return b sink getnextchunk where - pop [] = ([], S.empty) - pop (c:cs) = (cs, c) + target = toBytesProcessed (numchunks * fromIntegral chunksize) -- Reads the http body and stores it to the specified file, updating the -- meter as it goes. diff --git a/Remote/S3.hs b/Remote/S3.hs index f19c428429..a1f5bf75d8 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -198,10 +198,11 @@ store r h = fileStorer $ \k f p -> do let sz = if fsz - pos < partsz' then fsz - pos else partsz' - b <- liftIO $ hGetUntilMetered fh (< partsz') meter - let body = RequestBodyStream (fromIntegral sz) (mkPopper b) - S3.UploadPartResponse _ etag <- sendS3Handle h $ - S3.uploadPart (bucket info) object partnum uploadid body + let numchunks = ceiling (fromIntegral sz / defaultChunkSize) + let popper = handlePopper numchunks defaultChunkSize p fh + let req = S3.uploadPart (bucket info) object partnum uploadid $ + RequestBodyStream (fromIntegral sz) popper + S3.UploadPartResponse _ etag <- sendS3Handle h req sendparts (offsetMeterUpdate meter (toBytesProcessed sz)) (etag:etags) (partnum + 1) sendparts p [] 1 From ad2125e24a450118a525b5fd6998afdbe21b4fae Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 4 Nov 2014 15:39:48 -0400 Subject: [PATCH 43/51] fix a couple type errors and the progress bar --- Remote/S3.hs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index a1f5bf75d8..e0ff93bb37 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -173,13 +173,13 @@ store r h = fileStorer $ \k f p -> do let info = hinfo h let object = bucketObject info k - let req = (S3.postInitiateMultipartUpload (bucket info) object) + let startreq = (S3.postInitiateMultipartUpload (bucket info) object) { S3.imuStorageClass = Just (storageClass info) , S3.imuMetadata = metaHeaders info , S3.imuAutoMakeBucket = isIA info , S3.imuExpires = Nothing -- TODO set some reasonable expiry } - uploadid <- S3.imurUploadId <$> sendS3Handle h req + uploadid <- S3.imurUploadId <$> sendS3Handle h startreq -- The actual part size will be a even multiple of the -- 32k chunk size that hGetUntilMetered uses. 
@@ -198,8 +198,9 @@ store r h = fileStorer $ \k f p -> do let sz = if fsz - pos < partsz' then fsz - pos else partsz' - let numchunks = ceiling (fromIntegral sz / defaultChunkSize) - let popper = handlePopper numchunks defaultChunkSize p fh + let p' = offsetMeterUpdate p (toBytesProcessed pos) + let numchunks = ceiling (fromIntegral sz / fromIntegral defaultChunkSize :: Double) + let popper = handlePopper numchunks defaultChunkSize p' fh let req = S3.uploadPart (bucket info) object partnum uploadid $ RequestBodyStream (fromIntegral sz) popper S3.UploadPartResponse _ etag <- sendS3Handle h req From a42022d8ffe2697e56c00b121473dfa92d8cf1b8 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 4 Nov 2014 16:06:13 -0400 Subject: [PATCH 44/51] work around minimum part size problem When uploading the last part of a file, which was 640229 bytes, S3 rejected that part: "Your proposed upload is smaller than the minimum allowed size" I don't know what the minimum is, but the fix is just to include the last part into the previous part. Since this can result in a part that's double-sized, use half-sized parts normally. --- Remote/S3.hs | 15 +++++++++++---- doc/special_remotes/S3.mdwn | 12 ++++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index e0ff93bb37..8d30c7c9b2 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -181,9 +181,16 @@ store r h = fileStorer $ \k f p -> do } uploadid <- S3.imurUploadId <$> sendS3Handle h startreq - -- The actual part size will be a even multiple of the - -- 32k chunk size that hGetUntilMetered uses. - let partsz' = (partsz `div` toInteger defaultChunkSize) * toInteger defaultChunkSize + {- The actual part size will be a even multiple of the + - 32k chunk size that hGetUntilMetered uses. + - + - Also, half-size parts are used. This is so that + - the final part of a file can be rolled into the + - last full-size part, which avoids a problem when the + - final part could otherwise be too small for S3 to accept + - it. + -} + let partsz' = (partsz `div` toInteger defaultChunkSize `div` 2) * toInteger defaultChunkSize -- Send parts of the file, taking care to stream each part -- w/o buffering in memory, since the parts can be large. @@ -195,7 +202,7 @@ store r h = fileStorer $ \k f p -> do else do -- Calculate size of part that will -- be read. - let sz = if fsz - pos < partsz' + let sz = if fsz - pos < partsz' * 2 then fsz - pos else partsz' let p' = offsetMeterUpdate p (toBytesProcessed pos) diff --git a/doc/special_remotes/S3.mdwn b/doc/special_remotes/S3.mdwn index c7c6f76c57..59c1abed73 100644 --- a/doc/special_remotes/S3.mdwn +++ b/doc/special_remotes/S3.mdwn @@ -21,10 +21,14 @@ the S3 remote. * `chunk` - Enables [[chunking]] when storing large files. `chunk=1MiB` is a good starting point for chunking. -* `partsize` - Specifies the largest object to attempt to store in the - bucket. Multipart uploads will be used when storing larger objects. - This is not enabled by default, but can be enabled or changed at any - time. Setting `partsize=1GiB` is reasonable for S3. +* `partsize` - Amazon S3 only accepts uploads up to a certian file size, + and storing larger files requires a multipart upload process. + Setting `partsize=1GiB` is recommended for Amazon S3; this will + cause multipart uploads to be done using parts up to 1GiB in size. + + This is not enabled by default, since other S3 implementations may + not support multipart uploads, but can be enabled or changed at any + time. 
* `keyid` - Specifies the gpg key to use for [[encryption]]. From 93feefae053070b7b99f92c932cddecae9238f47 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 4 Nov 2014 16:21:55 -0400 Subject: [PATCH 45/51] Revert "work around minimum part size problem" This reverts commit a42022d8ffe2697e56c00b121473dfa92d8cf1b8. I misunderstood the cause of the problem. --- Remote/S3.hs | 15 ++++----------- doc/special_remotes/S3.mdwn | 12 ++++-------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 8d30c7c9b2..e0ff93bb37 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -181,16 +181,9 @@ store r h = fileStorer $ \k f p -> do } uploadid <- S3.imurUploadId <$> sendS3Handle h startreq - {- The actual part size will be a even multiple of the - - 32k chunk size that hGetUntilMetered uses. - - - - Also, half-size parts are used. This is so that - - the final part of a file can be rolled into the - - last full-size part, which avoids a problem when the - - final part could otherwise be too small for S3 to accept - - it. - -} - let partsz' = (partsz `div` toInteger defaultChunkSize `div` 2) * toInteger defaultChunkSize + -- The actual part size will be a even multiple of the + -- 32k chunk size that hGetUntilMetered uses. + let partsz' = (partsz `div` toInteger defaultChunkSize) * toInteger defaultChunkSize -- Send parts of the file, taking care to stream each part -- w/o buffering in memory, since the parts can be large. @@ -202,7 +195,7 @@ store r h = fileStorer $ \k f p -> do else do -- Calculate size of part that will -- be read. - let sz = if fsz - pos < partsz' * 2 + let sz = if fsz - pos < partsz' then fsz - pos else partsz' let p' = offsetMeterUpdate p (toBytesProcessed pos) diff --git a/doc/special_remotes/S3.mdwn b/doc/special_remotes/S3.mdwn index 59c1abed73..c7c6f76c57 100644 --- a/doc/special_remotes/S3.mdwn +++ b/doc/special_remotes/S3.mdwn @@ -21,14 +21,10 @@ the S3 remote. * `chunk` - Enables [[chunking]] when storing large files. `chunk=1MiB` is a good starting point for chunking. -* `partsize` - Amazon S3 only accepts uploads up to a certian file size, - and storing larger files requires a multipart upload process. - Setting `partsize=1GiB` is recommended for Amazon S3; this will - cause multipart uploads to be done using parts up to 1GiB in size. - - This is not enabled by default, since other S3 implementations may - not support multipart uploads, but can be enabled or changed at any - time. +* `partsize` - Specifies the largest object to attempt to store in the + bucket. Multipart uploads will be used when storing larger objects. + This is not enabled by default, but can be enabled or changed at any + time. Setting `partsize=1GiB` is reasonable for S3. * `keyid` - Specifies the gpg key to use for [[encryption]]. From 83901c6c17175c3fa85309f70409bf15a6b07e61 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 4 Nov 2014 16:22:29 -0400 Subject: [PATCH 46/51] better partsize docs The minimum allowsed size actually refers to the part size! --- doc/special_remotes/S3.mdwn | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/doc/special_remotes/S3.mdwn b/doc/special_remotes/S3.mdwn index c7c6f76c57..aac66abebe 100644 --- a/doc/special_remotes/S3.mdwn +++ b/doc/special_remotes/S3.mdwn @@ -21,10 +21,18 @@ the S3 remote. * `chunk` - Enables [[chunking]] when storing large files. `chunk=1MiB` is a good starting point for chunking. -* `partsize` - Specifies the largest object to attempt to store in the - bucket. 
Multipart uploads will be used when storing larger objects. - This is not enabled by default, but can be enabled or changed at any - time. Setting `partsize=1GiB` is reasonable for S3. +* `partsize` - Amazon S3 only accepts uploads up to a certian file size, + and storing larger files requires a multipart upload process. + + Setting `partsize=1GiB` is recommended for Amazon S3; this will + cause multipart uploads to be done using parts up to 1GiB in size. + Note that setting partsize to less than 100MiB will cause Amazon S3 to + reject uploads. + + This is not enabled by default, since other S3 implementations may + not support multipart uploads or have different limits, + but can be enabled or changed at any time. + time. * `keyid` - Specifies the gpg key to use for [[encryption]]. From ca68beaa64cb051bd2d8c2e78c8f63bf8fc4cf0a Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 4 Nov 2014 17:18:20 -0400 Subject: [PATCH 47/51] add todo item so I don't forget; it will only come into effect when this branch is merged --- doc/todo/S3_multipart_interruption_cleanup.mdwn | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 doc/todo/S3_multipart_interruption_cleanup.mdwn diff --git a/doc/todo/S3_multipart_interruption_cleanup.mdwn b/doc/todo/S3_multipart_interruption_cleanup.mdwn new file mode 100644 index 0000000000..adb5fd2cb0 --- /dev/null +++ b/doc/todo/S3_multipart_interruption_cleanup.mdwn @@ -0,0 +1,14 @@ +When a multipart S3 upload is being made, and gets interrupted, +the parts remain in the bucket, and S3 may charge for them. + +I am not sure what happens if the same object gets uploaded again. Is S3 +nice enough to remove the old parts? I need to find out.. + +If not, this needs to be dealt with somehow. One way would be to configure an +expiry of the uploaded parts, but this is tricky as a huge upload could +take arbitrarily long. Another way would be to record the uploadid and the +etags of the parts, and then resume where it left off the next time the +object is sent to S3. (Or at least cancel the old upload; resume isn't +practical when uploading an encrypted object.) + +It could store that info in either the local FS or the git-annex branch. From 4eca1a05dea05d2ef3daaf1225a15a12dcbcf54a Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 4 Nov 2014 18:05:53 -0400 Subject: [PATCH 48/51] close bug --- doc/bugs/new_AWS_region___40__eu-central-1__41__.mdwn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/bugs/new_AWS_region___40__eu-central-1__41__.mdwn b/doc/bugs/new_AWS_region___40__eu-central-1__41__.mdwn index 80f89b243d..177f7e1388 100644 --- a/doc/bugs/new_AWS_region___40__eu-central-1__41__.mdwn +++ b/doc/bugs/new_AWS_region___40__eu-central-1__41__.mdwn @@ -6,3 +6,5 @@ Amazon has opened up a new region in AWS with a datacenter in Frankfurt/Germany. * Region: eu-central-1 This should be added to the "Adding an Amazon S3 repository" page in the Datacenter dropdown of the webapp. + +> [[fixed|done]] --[[Joey]] From 0a891fcfc5226fd0b34247488f44b59068a6aefe Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 5 Nov 2014 12:42:12 -0400 Subject: [PATCH 49/51] support S3 front-end used by globalways.net This threw an unusual exception w/o an error message when probing to see if the bucket exists yet. So rather than relying on tryS3, catch all exceptions. This does mean that it might get an exception for some transient network error, think this means the bucket DNE yet, and try to create it, and then fail when it already exists. 
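The probing pattern being described is roughly the following sketch (plain Control.Exception is used here purely for illustration; the actual change below uses git-annex's tryNonAsync, which also takes care not to swallow asynchronous exceptions):

    {-# LANGUAGE ScopedTypeVariables #-}

    import Control.Exception (SomeException, try)

    -- Map any failure of a probe action to "not present". This is broader
    -- than catching only the S3 library's own error type, which is why a
    -- transient network error can also be read as "bucket does not exist
    -- yet" and lead to a doomed attempt to create it.
    probeExists :: IO a -> IO Bool
    probeExists a = do
        r <- try a
        return $ case r of
            Left (_ :: SomeException) -> False
            Right _ -> True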
--- Remote/S3.hs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index e0ff93bb37..844d87902d 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -299,7 +299,7 @@ genBucket c u = do where go _ (Right True) = noop go h _ = do - v <- tryS3 $ sendS3Handle h (S3.getBucket $ bucket $ hinfo h) + v <- tryNonAsync $ sendS3Handle h (S3.getBucket $ bucket $ hinfo h) case v of Right _ -> noop Left _ -> do @@ -323,9 +323,8 @@ writeUUIDFile :: RemoteConfig -> UUID -> S3Handle -> Annex () writeUUIDFile c u h = do v <- checkUUIDFile c u h case v of - Left e -> throwM e Right True -> noop - Right False -> void $ sendS3Handle h mkobject + _ -> void $ sendS3Handle h mkobject where file = T.pack $ uuidFile c uuidb = L.fromChunks [T.encodeUtf8 $ T.pack $ fromUUID u] From 2fbaf6d89c94ceaee7ce9b3f5f36af68762ef147 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 6 Nov 2014 14:26:01 -0400 Subject: [PATCH 50/51] reorder --- doc/special_remotes/S3.mdwn | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/doc/special_remotes/S3.mdwn b/doc/special_remotes/S3.mdwn index aac66abebe..5d161c3b87 100644 --- a/doc/special_remotes/S3.mdwn +++ b/doc/special_remotes/S3.mdwn @@ -18,24 +18,11 @@ the S3 remote. * `encryption` - One of "none", "hybrid", "shared", or "pubkey". See [[encryption]]. +* `keyid` - Specifies the gpg key to use for [[encryption]]. + * `chunk` - Enables [[chunking]] when storing large files. `chunk=1MiB` is a good starting point for chunking. -* `partsize` - Amazon S3 only accepts uploads up to a certian file size, - and storing larger files requires a multipart upload process. - - Setting `partsize=1GiB` is recommended for Amazon S3; this will - cause multipart uploads to be done using parts up to 1GiB in size. - Note that setting partsize to less than 100MiB will cause Amazon S3 to - reject uploads. - - This is not enabled by default, since other S3 implementations may - not support multipart uploads or have different limits, - but can be enabled or changed at any time. - time. - -* `keyid` - Specifies the gpg key to use for [[encryption]]. - * `embedcreds` - Optional. Set to "yes" embed the login credentials inside the git repository, which allows other clones to also access them. This is the default when gpg encryption is enabled; the credentials are stored @@ -60,6 +47,19 @@ the S3 remote. so by default, a bucket name is chosen based on the remote name and UUID. This can be specified to pick a bucket name. +* `partsize` - Amazon S3 only accepts uploads up to a certian file size, + and storing larger files requires a multipart upload process. + + Setting `partsize=1GiB` is recommended for Amazon S3 when not using + chunking; this will cause multipart uploads to be done using parts + up to 1GiB in size. Note that setting partsize to less than 100MiB + will cause Amazon S3 to reject uploads. + + This is not enabled by default, since other S3 implementations may + not support multipart uploads or have different limits, + but can be enabled or changed at any time. + time. + * `fileprefix` - By default, git-annex places files in a tree rooted at the top of the S3 bucket. When this is set, it's prefixed to the filenames used. 
For example, you could set it to "foo/" in one special remote,

From 748e403fedf6e1c1f27114149e1cce61725ef36e Mon Sep 17 00:00:00 2001
From: Joey Hess
Date: Thu, 6 Nov 2014 14:38:51 -0400
Subject: [PATCH 51/51] add changelog entries for when this branch gets merged

---
 debian/changelog | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/debian/changelog b/debian/changelog
index efaba2d950..b57ca66789 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+  * S3: Switched to using the haskell aws library.
+  * S3: No longer buffers entire files in memory when uploading without
+    chunking.
+  * S3: When built with a new enough version of the haskell aws library,
+    supports doing multipart uploads, in order to store extremely large
+    files in S3 when not using chunking.
+
 git-annex (5.20141025) UNRELEASED; urgency=medium
 
   * Windows: Fix crash when user.name is not set in git config.
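The multipart flow these patches build up — initiate an upload, send numbered parts, then complete with the collected etags — can be summarized in a small standalone sketch. This is an illustration only, not code from the branch: it assumes the same aws API that Remote/S3.hs uses above (S3.postInitiateMultipartUpload, S3.uploadPart, S3.postCompleteMultipartUpload), reads each part into memory with a plain S.hGet instead of the streaming popper, and omits metering, error handling, and cleanup of interrupted uploads. The partsize argument must satisfy S3's minimum part size; only the final part may be smaller.

    import qualified Aws
    import qualified Aws.S3 as S3
    import qualified Data.ByteString as S
    import qualified Data.Text as T
    import Control.Monad (void)
    import Control.Monad.IO.Class (liftIO)
    import Control.Monad.Trans.Resource (ResourceT)
    import Network.HTTP.Client (Manager, RequestBody(..))
    import System.IO

    -- Upload a file as a multipart S3 object: initiate the upload, send
    -- numbered parts, then complete the upload with the collected etags.
    multipartUpload
        :: Aws.Configuration
        -> S3.S3Configuration Aws.NormalQuery
        -> Manager
        -> S3.Bucket
        -> T.Text     -- object name
        -> FilePath   -- local file to upload
        -> Int        -- part size in bytes
        -> ResourceT IO ()
    multipartUpload cfg s3cfg mgr b object file partsize = do
        -- Initiate, obtaining the upload id that ties the parts together.
        r <- Aws.pureAws cfg s3cfg mgr $
            S3.postInitiateMultipartUpload b object
        let uploadid = S3.imurUploadId r
        fh <- liftIO $ openBinaryFile file ReadMode
        etags <- sendparts uploadid fh [] 1
        liftIO $ hClose fh
        -- Ask S3 to assemble the numbered parts into a single object.
        void $ Aws.pureAws cfg s3cfg mgr $
            S3.postCompleteMultipartUpload b object uploadid (zip [1..] etags)
      where
        sendparts uploadid fh etags partnum = do
            -- For simplicity, each part is buffered in memory here.
            chunk <- liftIO $ S.hGet fh partsize
            if S.null chunk
                then return (reverse etags)
                else do
                    S3.UploadPartResponse _ etag <-
                        Aws.pureAws cfg s3cfg mgr $
                            S3.uploadPart b object partnum uploadid
                                (RequestBodyBS chunk)
                    sendparts uploadid fh (etag : etags) (partnum + 1)

The store code in Remote/S3.hs differs mainly in how each part body is produced: it streams a fixed number of 32k chunks per part via handlePopper, so no part is ever buffered in memory.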