fix S3 upload buffering problem

Provide file size to new version of hS3.
Joey Hess 2011-04-21 10:31:54 -04:00
parent d8329731c6
commit 6fcd3e1ef7
4 changed files with 43 additions and 30 deletions

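In essence: hS3 0.5.6 accepts the object's size out of band, as a Content-Length header, so the lazy ByteString produced by L.readFile can be streamed in chunks rather than forced into memory just to measure its length. A minimal standalone sketch of the technique against the hS3 API (uploadFile and its arguments are illustrative, not part of this patch):

    import qualified Data.ByteString.Lazy.Char8 as L
    import Network.AWS.AWSConnection (AWSConnection)
    import Network.AWS.AWSResult (AWSResult)
    import Network.AWS.S3Object (S3Object(..), sendObject)
    import System.Posix.Files (fileSize, getFileStatus)

    -- Upload a file, telling S3 its size up front. L.readFile is lazy,
    -- so the content is streamed; without the Content-Length header,
    -- hS3 would have to force the whole ByteString to learn its length.
    uploadFile :: AWSConnection -> String -> String -> FilePath -> IO (AWSResult ())
    uploadFile conn bucket object file = do
            stat <- getFileStatus file
            content <- L.readFile file
            sendObject conn $ S3Object bucket object ""
                    [("Content-Length", show (fileSize stat))] content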
Remote/S3.hs

@@ -10,7 +10,7 @@ module Remote.S3 (remote) where
 import Control.Exception.Extensible (IOException)
 import Network.AWS.AWSConnection
 import Network.AWS.S3Object
-import Network.AWS.S3Bucket
+import Network.AWS.S3Bucket hiding (size)
 import Network.AWS.AWSResult
 import qualified Data.ByteString.Lazy.Char8 as L
 import qualified Data.Map as M
@@ -18,6 +18,8 @@ import Data.Maybe
 import Control.Monad (when)
 import Control.Monad.State (liftIO)
 import System.Environment
+import System.Posix.Files
+import System.Directory
 
 import RemoteClass
 import Types
@@ -30,6 +32,7 @@ import Config
 import Remote.Special
 import Remote.Encryptable
 import Crypto
+import Key
 
 remote :: RemoteType Annex
 remote = RemoteType {
@@ -100,21 +103,35 @@ s3Setup u c = do
 store :: Remote Annex -> Key -> Annex Bool
 store r k = s3Action r False $ \(conn, bucket) -> do
 	g <- Annex.gitRepo
-	content <- liftIO $ L.readFile $ gitAnnexLocation g k
-	res <- liftIO $ storeHelper (conn, bucket) r k content
+	res <- liftIO $ storeHelper (conn, bucket) r k $ gitAnnexLocation g k
 	s3Bool res
 
 storeEncrypted :: Remote Annex -> (Cipher, Key) -> Key -> Annex Bool
 storeEncrypted r (cipher, enck) k = s3Action r False $ \(conn, bucket) -> do
 	g <- Annex.gitRepo
 	let f = gitAnnexLocation g k
-	res <- liftIO $ withEncryptedContent cipher (L.readFile f) $ \s -> do
-		storeHelper (conn, bucket) r enck s
+	-- To get the file size of the encrypted content, have to use a temp file.
+	-- (An alternative would be chunking to a constant size.)
+	let tmp = gitAnnexTmpLocation g enck
+	liftIO $ withEncryptedContent cipher (L.readFile f) $ \s -> L.writeFile tmp s
+	res <- liftIO $ storeHelper (conn, bucket) r enck tmp
+	tmp_exists <- liftIO $ doesFileExist tmp
+	when tmp_exists $ liftIO $ removeFile tmp
 	s3Bool res
 
-storeHelper :: (AWSConnection, String) -> Remote Annex -> Key -> L.ByteString -> IO (AWSResult ())
-storeHelper (conn, bucket) r k content = do
-	let object = setStorageClass storageclass $ bucketKey bucket k content
+storeHelper :: (AWSConnection, String) -> Remote Annex -> Key -> FilePath -> IO (AWSResult ())
+storeHelper (conn, bucket) r k file = do
+	content <- liftIO $ L.readFile file
+	-- size is provided to S3 so the whole content does not need to be
+	-- buffered to calculate it
+	size <- case keySize k of
+		Just s -> return $ fromIntegral s
+		Nothing -> do
+			s <- liftIO $ getFileStatus file
+			return $ fileSize s
+	let object = setStorageClass storageclass $
+		S3Object bucket (show k) ""
+			[("Content-Length", show size)] content
 	sendObject conn object
 	where
 		storageclass =
@@ -124,7 +141,7 @@ storeHelper (conn, bucket) r k content = do
 
 retrieve :: Remote Annex -> Key -> FilePath -> Annex Bool
 retrieve r k f = s3Action r False $ \(conn, bucket) -> do
-	res <- liftIO $ getObject conn $ bucketKey bucket k L.empty
+	res <- liftIO $ getObject conn $ bucketKey bucket k
 	case res of
 		Right o -> do
 			liftIO $ L.writeFile f $ obj_data o
@@ -133,7 +150,7 @@ retrieve r k f = s3Action r False $ \(conn, bucket) -> do
 
 retrieveEncrypted :: Remote Annex -> (Cipher, Key) -> FilePath -> Annex Bool
 retrieveEncrypted r (cipher, enck) f = s3Action r False $ \(conn, bucket) -> do
-	res <- liftIO $ getObject conn $ bucketKey bucket enck L.empty
+	res <- liftIO $ getObject conn $ bucketKey bucket enck
 	case res of
 		Right o -> liftIO $
 			withDecryptedContent cipher (return $ obj_data o) $ \content -> do
@@ -143,13 +160,13 @@ retrieveEncrypted r (cipher, enck) f = s3Action r False $ \(conn, bucket) -> do
 
 remove :: Remote Annex -> Key -> Annex Bool
 remove r k = s3Action r False $ \(conn, bucket) -> do
-	res <- liftIO $ deleteObject conn $ bucketKey bucket k L.empty
+	res <- liftIO $ deleteObject conn $ bucketKey bucket k
 	s3Bool res
 
 checkPresent :: Remote Annex -> Key -> Annex (Either IOException Bool)
 checkPresent r k = s3Action r noconn $ \(conn, bucket) -> do
 	showNote ("checking " ++ name r ++ "...")
-	res <- liftIO $ getObjectInfo conn $ bucketKey bucket k L.empty
+	res <- liftIO $ getObjectInfo conn $ bucketKey bucket k
 	case res of
 		Right _ -> return $ Right True
 		Left (AWSError _ _) -> return $ Right False
@@ -205,5 +222,5 @@ s3Action r noconn action = do
 		(Just b, Just c) -> action (c, b)
 		_ -> return noconn
 
-bucketKey :: String -> Key -> L.ByteString -> S3Object
-bucketKey bucket k content = S3Object bucket (show k) "" [] content
+bucketKey :: String -> Key -> S3Object
+bucketKey bucket k = S3Object bucket (show k) "" [] L.empty
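One wrinkle the patch handles in storeEncrypted above: the size recorded in a key describes the plaintext, so it cannot stand in for the ciphertext's size. The encrypted stream is therefore spooled to a temp file, which is then stat'ed. A condensed sketch of that idea (spooledSize is a hypothetical helper, not a function in the patch):

    import qualified Data.ByteString.Lazy.Char8 as L
    import System.Posix.Files (fileSize, getFileStatus)
    import System.Posix.Types (FileOffset)

    -- A key's recorded size is the plaintext size, so write the
    -- ciphertext to a temp file and measure that instead.
    spooledSize :: FilePath -> L.ByteString -> IO FileOffset
    spooledSize tmp ciphertext = do
            L.writeFile tmp ciphertext
            s <- getFileStatus tmp
            return $ fileSize s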

debian/changelog

@@ -1,9 +1,12 @@
 git-annex (0.20110420) UNRELEASED; urgency=low
 
   * Update Debian build dependencies for ghc 7.
-  * Debian package is now built with S3 support. Thanks Joachim Breitner for
-    making this possible, also thanks Greg Heartsfield for working to improve
-    the hS3 library for git-annex.
+  * Debian package is now built with S3 support.
+    Thanks Joachim Breitner for making this possible.
+  * No longer needs to buffer entire files when sending them to S3.
+    (However, getting files from S3 still requires buffering.)
+    Thanks Greg Heartsfield for ongoing work to improve the hS3 library
+    for git-annex.
 
  -- Joey Hess <joeyh@debian.org>  Thu, 21 Apr 2011 02:00:00 -0400

debian/control

@@ -43,4 +43,3 @@ Description: manage files with git, without checking their contents into git
  versioned files, which is convenient for maintaining documents, Makefiles,
  etc that are associated with annexed files but that benefit from full
  revision control.
-

doc/bugs/S3_memory_leaks.mdwn

@@ -1,4 +1,4 @@
-S3 has two memory leaks.
+S3 has memory leaks
 
 ## with encryption
 
@@ -8,16 +8,10 @@ not yet for S3, in 5985acdfad8a6791f0b2fc54a1e116cee9c12479.
 
 ## always
 
-The other occurs independent of encryption use. Copying a 100 mb
-file to S3 causes an immediate sharp memory spike to 119 mb.
 Copying the file back from S3 causes a slow memory increase toward 119 mb.
 
-It's likely that this memory is used by the hS3 library, if it does not
-construct the message to Amazon lazily. (And it may not be possible to
-construct it lazily, if it includes checksum headers.)
+The author of hS3 is aware of the problem, and working on it.
 
-I have emailed the hS3 author about this. He wrote back quickly; it seems
-only getting the size of the file is causing it to be buffered, and a quick
-fix should be forthcoming. Update: 0.5.6 has been released which will
-allow providing the file size out of band to avoid buffering when uploading.
-Downloading will take further work in hS3.
+## fixed
+
+memory leak while uploading content to S3
 
 --[[Joey]]
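By contrast, the remaining download leak is inside hS3 itself: getObject returns the entire body in obj_data, so there is no size a caller could supply to help. A minimal sketch of the still-buffering download shape (downloadFile is illustrative):

    import qualified Data.ByteString.Lazy.Char8 as L
    import Network.AWS.AWSConnection (AWSConnection)
    import Network.AWS.S3Object (S3Object(..), getObject)

    -- Downloads still buffer: the whole object body arrives in
    -- obj_data before any of it can be written to disk.
    downloadFile :: AWSConnection -> String -> String -> FilePath -> IO Bool
    downloadFile conn bucket object file = do
            res <- getObject conn $ S3Object bucket object "" [] L.empty
            case res of
                    Right o -> do
                            L.writeFile file (obj_data o)
                            return True
                    Left _ -> return False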