S3: support chunking

The assistant now defaults to a 1MiB chunk size for new S3 special remotes.
This works around a couple of bugs:
  http://git-annex.branchable.com/bugs/S3_memory_leaks/
  http://git-annex.branchable.com/bugs/S3_upload_not_using_multipart/
Joey Hess 2014-08-02 15:51:58 -04:00
parent c3750901d8
commit 32e4368377
6 changed files with 38 additions and 60 deletions
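Why a small chunk size helps with the memory leak: each chunk is read and sent separately, so only one chunk ever needs to be buffered. A rough, self-contained sketch of the idea (illustration only, not code from this commit; the names are made up):

    import qualified Data.ByteString.Lazy as L
    import Data.Int (Int64)

    chunkSize :: Int64
    chunkSize = 1024 * 1024  -- the new 1MiB default

    -- Lazily split file content into 1MiB pieces; consuming one piece
    -- at a time keeps memory use bounded regardless of file size.
    toChunks :: L.ByteString -> [L.ByteString]
    toChunks b
        | L.null b = []
        | otherwise = let (c, rest) = L.splitAt chunkSize b in c : toChunks rest

Chunking also sidesteps the multipart-upload bug, since each request then carries at most one chunk rather than the whole file.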


@@ -129,6 +129,7 @@ postAddS3R = awsConfigurator $ do
             , ("type", "S3")
             , ("datacenter", T.unpack $ datacenter input)
             , ("storageclass", show $ storageClass input)
+            , ("chunk", "1MiB")
             ]
         _ -> $(widgetFile "configurators/adds3")
 #else


@@ -25,12 +25,10 @@ import qualified Git
 import Config
 import Config.Cost
 import Remote.Helper.Special
-import Remote.Helper.Encryptable
+import Remote.Helper.ChunkedEncryptable
 import qualified Remote.Helper.AWS as AWS
-import Crypto
 import Creds
 import Utility.Metered
-import Annex.Content
 import Annex.UUID
 import Logs.Web
@@ -47,17 +45,17 @@ remote = RemoteType {
 gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remote)
 gen r u c gc = new <$> remoteCost gc expensiveRemoteCost
   where
-    new cst = Just $ encryptableRemote c
-        (storeEncrypted this)
-        (retrieveEncrypted this)
+    new cst = Just $ chunkedEncryptableRemote c
+        (prepareStore this)
+        (prepareRetrieve this)
         this
       where
         this = Remote {
             uuid = u,
             cost = cst,
             name = Git.repoDescribe r,
-            storeKey = store this,
-            retrieveKeyFile = retrieve this,
+            storeKey = storeKeyDummy,
+            retrieveKeyFile = retreiveKeyFileDummy,
             retrieveKeyFileCheap = retrieveCheap this,
             removeKey = remove this c,
             hasKey = checkPresent this,
@@ -123,67 +121,39 @@ s3Setup' u c = if isIA c then archiveorg else defaulthost
         writeUUIDFile archiveconfig u
         use archiveconfig
-store :: Remote -> Key -> AssociatedFile -> MeterUpdate -> Annex Bool
-store r k _f p = s3Action r False $ \(conn, bucket) ->
-    sendAnnex k (void $ remove' r k) $ \src -> do
-        ok <- s3Bool =<< storeHelper (conn, bucket) r k p src
+prepareStore :: Remote -> Preparer Storer
+prepareStore r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) ->
+    fileStorer $ \k src p -> do
+        ok <- s3Bool =<< liftIO (store (conn, bucket) r k p src)
         -- Store public URL to item in Internet Archive.
-        when (ok && isIA (config r)) $
+        when (ok && isIA (config r) && not (isChunkKey k)) $
             setUrlPresent k (iaKeyUrl r k)
         return ok
-storeEncrypted :: Remote -> (Cipher, Key) -> Key -> MeterUpdate -> Annex Bool
-storeEncrypted r (cipher, enck) k p = s3Action r False $ \(conn, bucket) ->
-    -- To get file size of the encrypted content, have to use a temp file.
-    -- (An alternative would be chunking to a constant size.)
-    withTmp enck $ \tmp -> sendAnnex k (void $ remove' r enck) $ \src -> do
-        liftIO $ encrypt (getGpgEncParams r) cipher (feedFile src) $
-            readBytes $ L.writeFile tmp
-        s3Bool =<< storeHelper (conn, bucket) r enck p tmp
+store :: (AWSConnection, Bucket) -> Remote -> Key -> MeterUpdate -> FilePath -> IO (AWSResult ())
+store (conn, bucket) r k p file = do
+    size <- (fromIntegral . fileSize <$> getFileStatus file) :: IO Integer
+    withMeteredFile file p $ \content -> do
+        -- size is provided to S3 so the whole content
+        -- does not need to be buffered to calculate it
+        let object = S3Object
+                bucket (bucketFile r k) ""
+                (("Content-Length", show size) : getXheaders (config r))
+                content
+        sendObject conn $
+            setStorageClass (getStorageClass $ config r) object
-storeHelper :: (AWSConnection, Bucket) -> Remote -> Key -> MeterUpdate -> FilePath -> Annex (AWSResult ())
-storeHelper (conn, bucket) r k p file = do
-    size <- maybe getsize (return . fromIntegral) $ keySize k
-    meteredBytes (Just p) size $ \meterupdate ->
-        liftIO $ withMeteredFile file meterupdate $ \content -> do
-            -- size is provided to S3 so the whole content
-            -- does not need to be buffered to calculate it
-            let object = S3Object
-                    bucket (bucketFile r k) ""
-                    (("Content-Length", show size) : getXheaders (config r))
-                    content
-            sendObject conn $
-                setStorageClass (getStorageClass $ config r) object
-  where
-    getsize = liftIO $ fromIntegral . fileSize <$> getFileStatus file
-retrieve :: Remote -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex Bool
-retrieve r k _f d p = s3Action r False $ \(conn, bucket) ->
-    metered (Just p) k $ \meterupdate -> do
-        res <- liftIO $ getObject conn $ bucketKey r bucket k
-        case res of
-            Right o -> do
-                liftIO $ meteredWriteFile meterupdate d $
-                    obj_data o
-                return True
-            Left e -> s3Warning e
+prepareRetrieve :: Remote -> Preparer Retriever
+prepareRetrieve r = resourcePrepare (const $ s3Action r False) $ \(conn, bucket) ->
+    byteRetriever $ \k ->
+        liftIO (getObject conn $ bucketKey r bucket k)
+            >>= either s3Error (return . obj_data)
 retrieveCheap :: Remote -> Key -> FilePath -> Annex Bool
 retrieveCheap _ _ _ = return False
-retrieveEncrypted :: Remote -> (Cipher, Key) -> Key -> FilePath -> MeterUpdate -> Annex Bool
-retrieveEncrypted r (cipher, enck) k d p = s3Action r False $ \(conn, bucket) ->
-    metered (Just p) k $ \meterupdate -> do
-        res <- liftIO $ getObject conn $ bucketKey r bucket enck
-        case res of
-            Right o -> liftIO $ decrypt cipher (\h -> meteredWrite meterupdate h $ obj_data o) $
-                readBytes $ \content -> do
-                    L.writeFile d content
-                    return True
-            Left e -> s3Warning e
 {- Internet Archive doesn't easily allow removing content.
  - While it may remove the file, there are generally other files
  - derived from it that it does not remove. -}
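All storing and retrieving now goes through Remote.Helper.ChunkedEncryptable: the connection is prepared once per transfer and then reused for each chunk. A minimal sketch of that shape (hypothetical stand-in names, not the real Preparer/Storer types):

    -- Stand-ins for illustration; S3Handle would hold (AWSConnection, Bucket).
    data S3Handle = S3Handle
    type Chunk = String

    -- Prepare the handle once, then reuse it for every chunk,
    -- rather than setting up a connection per chunk.
    withPrepared :: (S3Handle -> IO a) -> IO a
    withPrepared use = use S3Handle  -- real code would connect here

    storeChunks :: [Chunk] -> (S3Handle -> Chunk -> IO Bool) -> IO Bool
    storeChunks chunks storer = withPrepared $ \h -> go h chunks
      where
        go _ [] = return True
        go h (c:cs) = do
            ok <- storer h c
            if ok then go h cs else return False

Note also the new guard in prepareStore: public Internet Archive URLs are recorded only for full keys, not for individual chunk keys.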


@@ -15,6 +15,7 @@ module Types.Key (
     file2key,
     nonChunkKey,
     chunkKeyOffset,
+    isChunkKey,
     prop_idempotent_key_encode,
     prop_idempotent_key_decode
@@ -62,6 +63,9 @@ chunkKeyOffset k = (*)
     <$> keyChunkSize k
     <*> (pred <$> keyChunkNum k)
+isChunkKey :: Key -> Bool
+isChunkKey k = isJust (keyChunkSize k) && isJust (keyChunkNum k)
 fieldSep :: Char
 fieldSep = '-'
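Given the definitions above, a chunk key carries both a chunk size and a 1-based chunk number, and chunkKeyOffset is the size times the predecessor of the number. A worked example with hypothetical values:

    -- Assuming a key k for chunk 3 of a file stored with 1MiB chunks:
    --   keyChunkSize k   == Just 1048576
    --   keyChunkNum  k   == Just 3
    -- then:
    --   isChunkKey k     == True
    --   chunkKeyOffset k == Just 2097152  -- 1048576 * (3 - 1)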

debian/changelog

@@ -1,7 +1,7 @@
 git-annex (5.20140718) UNRELEASED; urgency=medium

   * New chunk= option to chunk files stored in special remotes.
-    Currently supported by: directory, and all external special remotes.
+    Currently supported by: directory, S3, and all external special remotes.
   * Partially transferred files are automatically resumed when using
     chunked remotes!
   * The old chunksize= option is deprecated. Do not use for new remotes.


@@ -18,6 +18,9 @@ the S3 remote.
 * `encryption` - One of "none", "hybrid", "shared", or "pubkey".
   See [[encryption]].
+* `chunk` - Enables [[chunking]] when storing large files.
+  `chunk=1MiB` is a good starting point for chunking.
 * `keyid` - Specifies the gpg key to use for [[encryption]].
 * `embedcreds` - Optional. Set to "yes" embed the login credentials inside


@@ -14,7 +14,7 @@ like "2512E3C7"
 Next, create the S3 remote, and describe it.

-    # git annex initremote cloud type=S3 keyid=2512E3C7
+    # git annex initremote cloud type=S3 chunk=1MiB keyid=2512E3C7
     initremote cloud (encryption setup with gpg key C910D9222512E3C7) (checking bucket) (creating bucket in US) (gpg) ok
     # git annex describe cloud "at Amazon's US datacenter"
     describe cloud ok