add a few tweaks to make it easy to use the Internet Archive's variant of S3

In particular, munge key filenames to comply with the IA's filename limits,
disable encryption, support their nonstandard way of creating buckets, and
allow x-amz-* headers to be specified in initremote to set item metadata.

Still TODO: initremote does not handle multiword metadata headers right.
This commit is contained in:
Joey Hess 2011-05-16 11:20:30 -04:00
parent 79c74bf27d
commit 1d2984441c
4 changed files with 108 additions and 40 deletions

View file

@ -15,6 +15,9 @@ import Network.AWS.AWSResult
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.Map as M
import Data.Maybe
import Data.List
import Data.Char
import Data.String.Utils
import Control.Monad (when)
import Control.Monad.State (liftIO)
import System.Environment
@ -68,24 +71,53 @@ gen' r u c cst = do
}
s3Setup :: UUID -> RemoteConfig -> Annex RemoteConfig
s3Setup u c = do
-- verify configuration is sane
c' <- encryptionSetup c
let fullconfig = M.union c' defaults
genBucket fullconfig
gitConfigSpecialRemote u fullconfig "s3" "true"
s3SetCreds fullconfig
s3Setup u c = handlehost $ M.lookup "host" c
where
remotename = fromJust (M.lookup "name" c)
bucket = remotename ++ "-" ++ u
defbucket = remotename ++ "-" ++ u
defaults = M.fromList
[ ("datacenter", "US")
, ("storageclass", "STANDARD")
, ("host", defaultAmazonS3Host)
, ("port", show defaultAmazonS3Port)
, ("bucket", bucket)
, ("bucket", defbucket)
]
handlehost Nothing = defaulthost
handlehost (Just h)
| ".archive.org" `isSuffixOf` (map toLower h) = archiveorg
| otherwise = defaulthost
use fullconfig = do
genBucket fullconfig
gitConfigSpecialRemote u fullconfig "s3" "true"
s3SetCreds fullconfig
defaulthost = do
c' <- encryptionSetup c
use $ M.union c' defaults
archiveorg = do
showNote $ "Internet Archive mode"
maybe (error "specify bucket=") (const $ return ()) $
M.lookup "bucket" archiveconfig
use archiveconfig
where
archiveconfig =
-- hS3 does not pass through
-- x-archive-* headers
M.mapKeys (replace "x-archive-" "x-amz-") $
-- encryption does not make sense here
M.insert "encryption" "none" $
M.union c $
-- special constraints on key names
M.insert "mungekeys" "ia" $
-- buckets created only as files
-- are uploaded
M.insert "x-amz-auto-make-bucket" "1" $
-- no default bucket name; should
-- be human-readable
M.delete "bucket" defaults
store :: Remote Annex -> Key -> Annex Bool
store r k = s3Action r False $ \(conn, bucket) -> do
@ -111,8 +143,8 @@ storeHelper (conn, bucket) r k file = do
-- buffered to calculate it
size <- maybe getsize (return . fromIntegral) $ keySize k
let object = setStorageClass storageclass $
S3Object bucket (show k) ""
[("Content-Length",(show size)), ("x-amz-auto-make-bucket","1")] content
S3Object bucket (bucketFile r k) ""
(("Content-Length", show size) : xheaders) content
sendObject conn object
where
storageclass =
@ -122,10 +154,13 @@ storeHelper (conn, bucket) r k file = do
getsize = do
s <- liftIO $ getFileStatus file
return $ fileSize s
xheaders = filter isxheader $ M.assocs $ fromJust $ config r
isxheader (h, _) = "x-amz-" `isPrefixOf` h
retrieve :: Remote Annex -> Key -> FilePath -> Annex Bool
retrieve r k f = s3Action r False $ \(conn, bucket) -> do
res <- liftIO $ getObject conn $ bucketKey bucket k
res <- liftIO $ getObject conn $ bucketKey r bucket k
case res of
Right o -> do
liftIO $ L.writeFile f $ obj_data o
@ -134,7 +169,7 @@ retrieve r k f = s3Action r False $ \(conn, bucket) -> do
retrieveEncrypted :: Remote Annex -> (Cipher, Key) -> FilePath -> Annex Bool
retrieveEncrypted r (cipher, enck) f = s3Action r False $ \(conn, bucket) -> do
res <- liftIO $ getObject conn $ bucketKey bucket enck
res <- liftIO $ getObject conn $ bucketKey r bucket enck
case res of
Right o -> liftIO $
withDecryptedContent cipher (return $ obj_data o) $ \content -> do
@ -144,13 +179,13 @@ retrieveEncrypted r (cipher, enck) f = s3Action r False $ \(conn, bucket) -> do
remove :: Remote Annex -> Key -> Annex Bool
remove r k = s3Action r False $ \(conn, bucket) -> do
res <- liftIO $ deleteObject conn $ bucketKey bucket k
res <- liftIO $ deleteObject conn $ bucketKey r bucket k
s3Bool res
checkPresent :: Remote Annex -> Key -> Annex (Either IOException Bool)
checkPresent r k = s3Action r noconn $ \(conn, bucket) -> do
showNote ("checking " ++ name r ++ "...")
res <- liftIO $ getObjectInfo conn $ bucketKey bucket k
res <- liftIO $ getObjectInfo conn $ bucketKey r bucket k
case res of
Right _ -> return $ Right True
Left (AWSError _ _) -> return $ Right False
@ -182,8 +217,27 @@ s3Action r noconn action = do
(Just b, Just c) -> action (c, b)
_ -> return noconn
bucketKey :: String -> Key -> S3Object
bucketKey bucket k = S3Object bucket (show k) "" [] L.empty
bucketFile :: Remote Annex -> Key -> FilePath
bucketFile r k = (munge $ show k)
where
munge s = case M.lookup "mungekeys" $ fromJust $ config r of
Just "ia" -> iaMunge s
_ -> s
bucketKey :: Remote Annex -> String -> Key -> S3Object
bucketKey r bucket k = S3Object bucket (bucketFile r k) "" [] L.empty
{- Internet Archive limits filenames to a subset of ascii,
- with no whitespace. Other characters are xml entity
- encoded. -}
iaMunge :: String -> String
iaMunge = concat . (map munge)
where
munge c
| isAsciiUpper c || isAsciiLower c || isNumber c = [c]
| c `elem` "_-.\"" = [c]
| isSpace c = []
| otherwise = "&" ++ show (ord c) ++ ";"
genBucket :: RemoteConfig -> Annex ()
genBucket c = do

5
debian/changelog vendored
View file

@ -1,6 +1,11 @@
git-annex (0.20110504) UNRELEASED; urgency=low
* Work around a bug in Network.URI's handling of bracketed ipv6 addresses.
* Add a few tweaks to make it easy to use the Internet Archive's variant
of S3. In particular, munge key filenames to comply with the IA's filename
limits, disable encryption, support their nonstandard way of creating
buckets, and allow x-amz-* headers to be specified in initremote to set
item metadata.
-- Joey Hess <joeyh@debian.org> Fri, 06 May 2011 15:20:38 -0400

View file

@ -5,6 +5,12 @@ See [[walkthrough/using_Amazon_S3]] for usage examples.
## configuration
The standard environment variables `ANNEX_S3_ACCESS_KEY_ID` and
`ANNEX_S3_SECRET_ACCESS_KEY` are used to supply login credentials
for Amazon. When encryption is enabled, they are stored in encrypted form
by `git annex initremote`, so you do not need to keep the environment
variables set after the initial initalization of the remote.
A number of parameters can be passed to `git annex initremote` to configure
the S3 remote.
@ -29,8 +35,5 @@ the S3 remote.
so by default, a bucket name is chosen based on the remote name
and UUID. This can be specified to pick a bucket name.
The standard environment variables `ANNEX_S3_ACCESS_KEY_ID` and
`ANNEX_S3_SECRET_ACCESS_KEY` can be used to supply login credentials
for Amazon. When encryption is enabled, they are stored in encrypted form
by `git annex initremote`, so you do not need to keep the environment
variables set after the initial initalization of the remote.
* `x-amz-*` are passed through as http headers when storing keys
in S3.

View file

@ -15,20 +15,18 @@ Sign up for an account, and get your access keys here:
# export AWS_ACCESS_KEY_ID=blahblah
# export AWS_SECRET_ACCESS_KEY=xxxxxxx
Now go to <http://www.archive.org/create/> and create the item.
This allows you to fill in metadata which git-annex cannot provide to the
Internet Archive. (It also works around a bug with bucket creation.)
Specify `host=s3.us.archive.org` when doing `initremote` to set up
a remote at the Archive. This will enable a special Internet Archive mode:
Encryption is not allowed; you are required to specify a bucket name
rather than letting git-annex pick a random one; and you can optionally
specify `x-archive-meta*` headers to add metadata as explained in their
[documentation](http://www.archive.org/help/abouts3.txt).
(Note that there seems to be a bug in either hS3 or the archive that
breaks authentication when the item name contains spaces or upper-case
letters.. use all lowercase and no spaces.)
Specify `host=s3.us.archive.org` when doing initremote to set up
a remote at the Archive. It does not make sense to use encryption.
For the bucket name, specify the item name you created earlier.
# git annex initremote panama type=S3 encryption=none host=s3.us.archive.org bucket=panama-canal-lock-blueprints
initremote archive-panama (checking bucket) (creating bucket in US) ok
# git annex initremote archive-panama type=S3
# host=s3.us.archive.org bucket=panama-canal-lock-blueprints \
x-archive-meta-mediatype=texts x-archive-meta-language=eng \
x-archive-meta-title="original Panama Canal lock design blueprints"
initremote archive-panama (Internet Archive mode) (checking bucket) (creating bucket in US) ok
# git annex describe archive-panama "Internet Archive item for my grandfather's Panama Canal lock design blueprints"
describe archive-panama ok
@ -36,11 +34,19 @@ Then you can annex files and copy them to the remote as usual:
# git annex add photo1.jpeg
add photo1.jpeg ok
# git annex copy photo1.jpeg --to archive-panama
copy (checking archive-panama...) (to archive-panama...) ok
# git annex copy photo1.jpeg --fast --to archive-panama
copy (to archive-panama...) ok
-----
Note that it probably makes the most sense to use the WORM backend
for files, since that exposes the original filename in the key stored
in the Archive, which allows its special processing for sound files,
movies, etc to be done. Also, the Internet Archive has restrictions
on what is allowed in a filename; particularly no spaces are allowed.
movies, etc to be done.
Also, the Internet Archive has restrictions on what is allowed in a
filename; particularly no spaces are allowed.
There seems to be a bug in either hS3 or the archive that breaks
authentication when the bucket name contains spaces or upper-case letters..
use all lowercase and no spaces when making the bucket with `initremote`.