S3: support chunking

The assistant defaults to a 1MiB chunk size for new S3 special remotes.
This works around a couple of bugs:
  http://git-annex.branchable.com/bugs/S3_memory_leaks/
  http://git-annex.branchable.com/bugs/S3_upload_not_using_multipart/
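
For intuition, here is a minimal, self-contained sketch (an illustration, not code from this commit) of why a fixed chunk size sidesteps both problems: each 1MiB piece is sent as its own small S3 object, so memory stays bounded at one chunk no matter how large the file is, and multipart upload support is never needed.

import qualified Data.ByteString as B
import System.IO

chunkSize :: Int
chunkSize = 1024 * 1024  -- 1MiB

-- Feed a file to an uploader action one chunk at a time;
-- at most one chunk is ever held in memory.
uploadChunked :: FilePath -> (B.ByteString -> IO ()) -> IO ()
uploadChunked file send = withFile file ReadMode go
  where
    go h = do
        chunk <- B.hGet h chunkSize
        if B.null chunk
            then return ()
            else send chunk >> go h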
Author: Joey Hess
Date:   2014-08-02 15:51:58 -04:00
commit 32e4368377 (parent c3750901d8)

6 changed files with 38 additions and 60 deletions

@@ -129,6 +129,7 @@ postAddS3R = awsConfigurator $ do
 		, ("type", "S3")
 		, ("datacenter", T.unpack $ datacenter input)
 		, ("storageclass", show $ storageClass input)
+		, ("chunk", "1MiB")
 		]
 	_ -> $(widgetFile "configurators/adds3")
 #else

@@ -25,12 +25,10 @@ import qualified Git
 import Config
 import Config.Cost
 import Remote.Helper.Special
-import Remote.Helper.Encryptable
+import Remote.Helper.ChunkedEncryptable
 import qualified Remote.Helper.AWS as AWS
-import Crypto
 import Creds
 import Utility.Metered
-import Annex.Content
 import Annex.UUID
 import Logs.Web
@@ -47,17 +45,17 @@ remote = RemoteType {
 gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remote)
 gen r u c gc = new <$> remoteCost gc expensiveRemoteCost
   where
-	new cst = Just $ encryptableRemote c
-		(storeEncrypted this)
-		(retrieveEncrypted this)
+	new cst = Just $ chunkedEncryptableRemote c
+		(prepareStore this)
+		(prepareRetrieve this)
 		this
 	  where
 		this = Remote {
 			uuid = u,
 			cost = cst,
 			name = Git.repoDescribe r,
-			storeKey = store this,
-			retrieveKeyFile = retrieve this,
+			storeKey = storeKeyDummy,
+			retrieveKeyFile = retreiveKeyFileDummy,
 			retrieveKeyFileCheap = retrieveCheap this,
 			removeKey = remove this c,
 			hasKey = checkPresent this,
@@ -123,67 +121,39 @@ s3Setup' u c = if isIA c then archiveorg else defaulthost
 		writeUUIDFile archiveconfig u
 		use archiveconfig

-store :: Remote -> Key -> AssociatedFile -> MeterUpdate -> Annex Bool
-store r k _f p = s3Action r False $ \(conn, bucket) ->
-	sendAnnex k (void $ remove' r k) $ \src -> do
-		ok <- s3Bool =<< storeHelper (conn, bucket) r k p src
+prepareStore :: Remote -> Preparer Storer
+prepareStore r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) ->
+	fileStorer $ \k src p -> do
+		ok <- s3Bool =<< liftIO (store (conn, bucket) r k p src)

 		-- Store public URL to item in Internet Archive.
-		when (ok && isIA (config r)) $
+		when (ok && isIA (config r) && not (isChunkKey k)) $
 			setUrlPresent k (iaKeyUrl r k)

 		return ok

-storeEncrypted :: Remote -> (Cipher, Key) -> Key -> MeterUpdate -> Annex Bool
-storeEncrypted r (cipher, enck) k p = s3Action r False $ \(conn, bucket) ->
-	-- To get file size of the encrypted content, have to use a temp file.
-	-- (An alternative would be chunking to to a constant size.)
-	withTmp enck $ \tmp -> sendAnnex k (void $ remove' r enck) $ \src -> do
-		liftIO $ encrypt (getGpgEncParams r) cipher (feedFile src) $
-			readBytes $ L.writeFile tmp
-		s3Bool =<< storeHelper (conn, bucket) r enck p tmp
-
-storeHelper :: (AWSConnection, Bucket) -> Remote -> Key -> MeterUpdate -> FilePath -> Annex (AWSResult ())
-storeHelper (conn, bucket) r k p file = do
-	size <- maybe getsize (return . fromIntegral) $ keySize k
-	meteredBytes (Just p) size $ \meterupdate ->
-		liftIO $ withMeteredFile file meterupdate $ \content -> do
-			-- size is provided to S3 so the whole content
-			-- does not need to be buffered to calculate it
-			let object = S3Object
-				bucket (bucketFile r k) ""
-				(("Content-Length", show size) : getXheaders (config r))
-				content
-			sendObject conn $
-				setStorageClass (getStorageClass $ config r) object
-  where
-	getsize = liftIO $ fromIntegral . fileSize <$> getFileStatus file
+store :: (AWSConnection, Bucket) -> Remote -> Key -> MeterUpdate -> FilePath -> IO (AWSResult ())
+store (conn, bucket) r k p file = do
+	size <- (fromIntegral . fileSize <$> getFileStatus file) :: IO Integer
+	withMeteredFile file p $ \content -> do
+		-- size is provided to S3 so the whole content
+		-- does not need to be buffered to calculate it
+		let object = S3Object
+			bucket (bucketFile r k) ""
+			(("Content-Length", show size) : getXheaders (config r))
+			content
+		sendObject conn $
+			setStorageClass (getStorageClass $ config r) object

-retrieve :: Remote -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex Bool
-retrieve r k _f d p = s3Action r False $ \(conn, bucket) ->
-	metered (Just p) k $ \meterupdate -> do
-		res <- liftIO $ getObject conn $ bucketKey r bucket k
-		case res of
-			Right o -> do
-				liftIO $ meteredWriteFile meterupdate d $
-					obj_data o
-				return True
-			Left e -> s3Warning e
+prepareRetrieve :: Remote -> Preparer Retriever
+prepareRetrieve r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) ->
+	byteRetriever $ \k ->
+		liftIO (getObject conn $ bucketKey r bucket k)
+			>>= either s3Error (return . obj_data)

 retrieveCheap :: Remote -> Key -> FilePath -> Annex Bool
 retrieveCheap _ _ _ = return False

-retrieveEncrypted :: Remote -> (Cipher, Key) -> Key -> FilePath -> MeterUpdate -> Annex Bool
-retrieveEncrypted r (cipher, enck) k d p = s3Action r False $ \(conn, bucket) ->
-	metered (Just p) k $ \meterupdate -> do
-		res <- liftIO $ getObject conn $ bucketKey r bucket enck
-		case res of
-			Right o -> liftIO $ decrypt cipher (\h -> meteredWrite meterupdate h $ obj_data o) $
-				readBytes $ \content -> do
-					L.writeFile d content
-					return True
-			Left e -> s3Warning e
-
 {- Internet Archive doesn't easily allow removing content.
  - While it may remove the file, there are generally other files
  - derived from it that it does not remove. -}
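The net effect of the hunk above: Remote/S3.hs no longer hand-rolls encryption and metering. chunkedEncryptableRemote now supplies storeKey and retrieveKeyFile (hence the storeKeyDummy and retreiveKeyFileDummy placeholders), while the remote provides only a Preparer that opens the S3 connection plus a plain Storer/Retriever. A rough self-contained sketch of that prepare-once, store-per-chunk shape (illustrative names only, not the real ChunkedEncryptable API):

import Control.Exception (bracket)

data Conn = Conn  -- stand-in for AWSConnection

-- Acquire the connection once; the chunking layer then calls the
-- storer repeatedly, once per chunk, without reconnecting.
withConn :: (Conn -> IO a) -> IO a
withConn = bracket connect disconnect
  where
    connect = putStrLn "open S3 connection" >> return Conn
    disconnect _ = putStrLn "close S3 connection"

storeChunks :: [FilePath] -> IO ()
storeChunks chunks = withConn $ \conn -> mapM_ (storeOne conn) chunks
  where
    storeOne _conn f = putStrLn ("sendObject: " ++ f)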

@@ -15,6 +15,7 @@ module Types.Key (
 	file2key,
 	nonChunkKey,
 	chunkKeyOffset,
+	isChunkKey,

 	prop_idempotent_key_encode,
 	prop_idempotent_key_decode
@@ -62,6 +63,9 @@ chunkKeyOffset k = (*)
 	<$> keyChunkSize k
 	<*> (pred <$> keyChunkNum k)

+isChunkKey :: Key -> Bool
+isChunkKey k = isJust (keyChunkSize k) && isJust (keyChunkNum k)
+
 fieldSep :: Char
 fieldSep = '-'

debian/changelog

@@ -1,7 +1,7 @@
 git-annex (5.20140718) UNRELEASED; urgency=medium

   * New chunk= option to chunk files stored in special remotes.
-    Currently supported by: directory, and all external special remotes.
+    Currently supported by: directory, S3, and all external special remotes.
   * Partially transferred files are automatically resumed when using
     chunked remotes!
   * The old chunksize= option is deprecated. Do not use for new remotes.

@@ -18,6 +18,9 @@ the S3 remote.
 * `encryption` - One of "none", "hybrid", "shared", or "pubkey".
   See [[encryption]].

+* `chunk` - Enables [[chunking]] when storing large files.
+  `chunk=1MiB` is a good starting point for chunking.
+
 * `keyid` - Specifies the gpg key to use for [[encryption]].

 * `embedcreds` - Optional. Set to "yes" embed the login credentials inside

@@ -14,7 +14,7 @@ like "2512E3C7"

 Next, create the S3 remote, and describe it.

-	# git annex initremote cloud type=S3 keyid=2512E3C7
+	# git annex initremote cloud type=S3 chunk=1MiB keyid=2512E3C7
 	initremote cloud (encryption setup with gpg key C910D9222512E3C7) (checking bucket) (creating bucket in US) (gpg) ok
 	# git annex describe cloud "at Amazon's US datacenter"
 	describe cloud ok