add a few tweaks to make it easy to use the Internet Archive's variant of S3
In particular, munge key filenames to comply with the IA's filename limits, disable encryption, support their nonstandard way of creating buckets, and allow x-amz-* headers to be specified in initremote to set item metadata. Still TODO: initremote does not handle multiword metadata headers right.
This commit is contained in:
parent
79c74bf27d
commit
1d2984441c
4 changed files with 108 additions and 40 deletions
|
@ -15,6 +15,9 @@ import Network.AWS.AWSResult
|
||||||
import qualified Data.ByteString.Lazy.Char8 as L
|
import qualified Data.ByteString.Lazy.Char8 as L
|
||||||
import qualified Data.Map as M
|
import qualified Data.Map as M
|
||||||
import Data.Maybe
|
import Data.Maybe
|
||||||
|
import Data.List
|
||||||
|
import Data.Char
|
||||||
|
import Data.String.Utils
|
||||||
import Control.Monad (when)
|
import Control.Monad (when)
|
||||||
import Control.Monad.State (liftIO)
|
import Control.Monad.State (liftIO)
|
||||||
import System.Environment
|
import System.Environment
|
||||||
|
@ -68,24 +71,53 @@ gen' r u c cst = do
|
||||||
}
|
}
|
||||||
|
|
||||||
s3Setup :: UUID -> RemoteConfig -> Annex RemoteConfig
|
s3Setup :: UUID -> RemoteConfig -> Annex RemoteConfig
|
||||||
s3Setup u c = do
|
s3Setup u c = handlehost $ M.lookup "host" c
|
||||||
-- verify configuration is sane
|
|
||||||
c' <- encryptionSetup c
|
|
||||||
let fullconfig = M.union c' defaults
|
|
||||||
|
|
||||||
genBucket fullconfig
|
|
||||||
gitConfigSpecialRemote u fullconfig "s3" "true"
|
|
||||||
s3SetCreds fullconfig
|
|
||||||
where
|
where
|
||||||
remotename = fromJust (M.lookup "name" c)
|
remotename = fromJust (M.lookup "name" c)
|
||||||
bucket = remotename ++ "-" ++ u
|
defbucket = remotename ++ "-" ++ u
|
||||||
defaults = M.fromList
|
defaults = M.fromList
|
||||||
[ ("datacenter", "US")
|
[ ("datacenter", "US")
|
||||||
, ("storageclass", "STANDARD")
|
, ("storageclass", "STANDARD")
|
||||||
, ("host", defaultAmazonS3Host)
|
, ("host", defaultAmazonS3Host)
|
||||||
, ("port", show defaultAmazonS3Port)
|
, ("port", show defaultAmazonS3Port)
|
||||||
, ("bucket", bucket)
|
, ("bucket", defbucket)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
handlehost Nothing = defaulthost
|
||||||
|
handlehost (Just h)
|
||||||
|
| ".archive.org" `isSuffixOf` (map toLower h) = archiveorg
|
||||||
|
| otherwise = defaulthost
|
||||||
|
|
||||||
|
use fullconfig = do
|
||||||
|
genBucket fullconfig
|
||||||
|
gitConfigSpecialRemote u fullconfig "s3" "true"
|
||||||
|
s3SetCreds fullconfig
|
||||||
|
|
||||||
|
defaulthost = do
|
||||||
|
c' <- encryptionSetup c
|
||||||
|
use $ M.union c' defaults
|
||||||
|
|
||||||
|
archiveorg = do
|
||||||
|
showNote $ "Internet Archive mode"
|
||||||
|
maybe (error "specify bucket=") (const $ return ()) $
|
||||||
|
M.lookup "bucket" archiveconfig
|
||||||
|
use archiveconfig
|
||||||
|
where
|
||||||
|
archiveconfig =
|
||||||
|
-- hS3 does not pass through
|
||||||
|
-- x-archive-* headers
|
||||||
|
M.mapKeys (replace "x-archive-" "x-amz-") $
|
||||||
|
-- encryption does not make sense here
|
||||||
|
M.insert "encryption" "none" $
|
||||||
|
M.union c $
|
||||||
|
-- special constraints on key names
|
||||||
|
M.insert "mungekeys" "ia" $
|
||||||
|
-- buckets created only as files
|
||||||
|
-- are uploaded
|
||||||
|
M.insert "x-amz-auto-make-bucket" "1" $
|
||||||
|
-- no default bucket name; should
|
||||||
|
-- be human-readable
|
||||||
|
M.delete "bucket" defaults
|
||||||
|
|
||||||
store :: Remote Annex -> Key -> Annex Bool
|
store :: Remote Annex -> Key -> Annex Bool
|
||||||
store r k = s3Action r False $ \(conn, bucket) -> do
|
store r k = s3Action r False $ \(conn, bucket) -> do
|
||||||
|
@ -111,8 +143,8 @@ storeHelper (conn, bucket) r k file = do
|
||||||
-- buffered to calculate it
|
-- buffered to calculate it
|
||||||
size <- maybe getsize (return . fromIntegral) $ keySize k
|
size <- maybe getsize (return . fromIntegral) $ keySize k
|
||||||
let object = setStorageClass storageclass $
|
let object = setStorageClass storageclass $
|
||||||
S3Object bucket (show k) ""
|
S3Object bucket (bucketFile r k) ""
|
||||||
[("Content-Length",(show size)), ("x-amz-auto-make-bucket","1")] content
|
(("Content-Length", show size) : xheaders) content
|
||||||
sendObject conn object
|
sendObject conn object
|
||||||
where
|
where
|
||||||
storageclass =
|
storageclass =
|
||||||
|
@ -122,10 +154,13 @@ storeHelper (conn, bucket) r k file = do
|
||||||
getsize = do
|
getsize = do
|
||||||
s <- liftIO $ getFileStatus file
|
s <- liftIO $ getFileStatus file
|
||||||
return $ fileSize s
|
return $ fileSize s
|
||||||
|
|
||||||
|
xheaders = filter isxheader $ M.assocs $ fromJust $ config r
|
||||||
|
isxheader (h, _) = "x-amz-" `isPrefixOf` h
|
||||||
|
|
||||||
retrieve :: Remote Annex -> Key -> FilePath -> Annex Bool
|
retrieve :: Remote Annex -> Key -> FilePath -> Annex Bool
|
||||||
retrieve r k f = s3Action r False $ \(conn, bucket) -> do
|
retrieve r k f = s3Action r False $ \(conn, bucket) -> do
|
||||||
res <- liftIO $ getObject conn $ bucketKey bucket k
|
res <- liftIO $ getObject conn $ bucketKey r bucket k
|
||||||
case res of
|
case res of
|
||||||
Right o -> do
|
Right o -> do
|
||||||
liftIO $ L.writeFile f $ obj_data o
|
liftIO $ L.writeFile f $ obj_data o
|
||||||
|
@ -134,7 +169,7 @@ retrieve r k f = s3Action r False $ \(conn, bucket) -> do
|
||||||
|
|
||||||
retrieveEncrypted :: Remote Annex -> (Cipher, Key) -> FilePath -> Annex Bool
|
retrieveEncrypted :: Remote Annex -> (Cipher, Key) -> FilePath -> Annex Bool
|
||||||
retrieveEncrypted r (cipher, enck) f = s3Action r False $ \(conn, bucket) -> do
|
retrieveEncrypted r (cipher, enck) f = s3Action r False $ \(conn, bucket) -> do
|
||||||
res <- liftIO $ getObject conn $ bucketKey bucket enck
|
res <- liftIO $ getObject conn $ bucketKey r bucket enck
|
||||||
case res of
|
case res of
|
||||||
Right o -> liftIO $
|
Right o -> liftIO $
|
||||||
withDecryptedContent cipher (return $ obj_data o) $ \content -> do
|
withDecryptedContent cipher (return $ obj_data o) $ \content -> do
|
||||||
|
@ -144,13 +179,13 @@ retrieveEncrypted r (cipher, enck) f = s3Action r False $ \(conn, bucket) -> do
|
||||||
|
|
||||||
remove :: Remote Annex -> Key -> Annex Bool
|
remove :: Remote Annex -> Key -> Annex Bool
|
||||||
remove r k = s3Action r False $ \(conn, bucket) -> do
|
remove r k = s3Action r False $ \(conn, bucket) -> do
|
||||||
res <- liftIO $ deleteObject conn $ bucketKey bucket k
|
res <- liftIO $ deleteObject conn $ bucketKey r bucket k
|
||||||
s3Bool res
|
s3Bool res
|
||||||
|
|
||||||
checkPresent :: Remote Annex -> Key -> Annex (Either IOException Bool)
|
checkPresent :: Remote Annex -> Key -> Annex (Either IOException Bool)
|
||||||
checkPresent r k = s3Action r noconn $ \(conn, bucket) -> do
|
checkPresent r k = s3Action r noconn $ \(conn, bucket) -> do
|
||||||
showNote ("checking " ++ name r ++ "...")
|
showNote ("checking " ++ name r ++ "...")
|
||||||
res <- liftIO $ getObjectInfo conn $ bucketKey bucket k
|
res <- liftIO $ getObjectInfo conn $ bucketKey r bucket k
|
||||||
case res of
|
case res of
|
||||||
Right _ -> return $ Right True
|
Right _ -> return $ Right True
|
||||||
Left (AWSError _ _) -> return $ Right False
|
Left (AWSError _ _) -> return $ Right False
|
||||||
|
@ -182,8 +217,27 @@ s3Action r noconn action = do
|
||||||
(Just b, Just c) -> action (c, b)
|
(Just b, Just c) -> action (c, b)
|
||||||
_ -> return noconn
|
_ -> return noconn
|
||||||
|
|
||||||
bucketKey :: String -> Key -> S3Object
|
bucketFile :: Remote Annex -> Key -> FilePath
|
||||||
bucketKey bucket k = S3Object bucket (show k) "" [] L.empty
|
bucketFile r k = (munge $ show k)
|
||||||
|
where
|
||||||
|
munge s = case M.lookup "mungekeys" $ fromJust $ config r of
|
||||||
|
Just "ia" -> iaMunge s
|
||||||
|
_ -> s
|
||||||
|
|
||||||
|
bucketKey :: Remote Annex -> String -> Key -> S3Object
|
||||||
|
bucketKey r bucket k = S3Object bucket (bucketFile r k) "" [] L.empty
|
||||||
|
|
||||||
|
{- Internet Archive limits filenames to a subset of ascii,
|
||||||
|
- with no whitespace. Other characters are xml entity
|
||||||
|
- encoded. -}
|
||||||
|
iaMunge :: String -> String
|
||||||
|
iaMunge = concat . (map munge)
|
||||||
|
where
|
||||||
|
munge c
|
||||||
|
| isAsciiUpper c || isAsciiLower c || isNumber c = [c]
|
||||||
|
| c `elem` "_-.\"" = [c]
|
||||||
|
| isSpace c = []
|
||||||
|
| otherwise = "&" ++ show (ord c) ++ ";"
|
||||||
|
|
||||||
genBucket :: RemoteConfig -> Annex ()
|
genBucket :: RemoteConfig -> Annex ()
|
||||||
genBucket c = do
|
genBucket c = do
|
||||||
|
|
5
debian/changelog
vendored
5
debian/changelog
vendored
|
@ -1,6 +1,11 @@
|
||||||
git-annex (0.20110504) UNRELEASED; urgency=low
|
git-annex (0.20110504) UNRELEASED; urgency=low
|
||||||
|
|
||||||
* Work around a bug in Network.URI's handling of bracketed ipv6 addresses.
|
* Work around a bug in Network.URI's handling of bracketed ipv6 addresses.
|
||||||
|
* Add a few tweaks to make it easy to use the Internet Archive's variant
|
||||||
|
of S3. In particular, munge key filenames to comply with the IA's filename
|
||||||
|
limits, disable encryption, support their nonstandard way of creating
|
||||||
|
buckets, and allow x-amz-* headers to be specified in initremote to set
|
||||||
|
item metadata.
|
||||||
|
|
||||||
-- Joey Hess <joeyh@debian.org> Fri, 06 May 2011 15:20:38 -0400
|
-- Joey Hess <joeyh@debian.org> Fri, 06 May 2011 15:20:38 -0400
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,12 @@ See [[walkthrough/using_Amazon_S3]] for usage examples.
|
||||||
|
|
||||||
## configuration
|
## configuration
|
||||||
|
|
||||||
|
The standard environment variables `ANNEX_S3_ACCESS_KEY_ID` and
|
||||||
|
`ANNEX_S3_SECRET_ACCESS_KEY` are used to supply login credentials
|
||||||
|
for Amazon. When encryption is enabled, they are stored in encrypted form
|
||||||
|
by `git annex initremote`, so you do not need to keep the environment
|
||||||
|
variables set after the initial initalization of the remote.
|
||||||
|
|
||||||
A number of parameters can be passed to `git annex initremote` to configure
|
A number of parameters can be passed to `git annex initremote` to configure
|
||||||
the S3 remote.
|
the S3 remote.
|
||||||
|
|
||||||
|
@ -29,8 +35,5 @@ the S3 remote.
|
||||||
so by default, a bucket name is chosen based on the remote name
|
so by default, a bucket name is chosen based on the remote name
|
||||||
and UUID. This can be specified to pick a bucket name.
|
and UUID. This can be specified to pick a bucket name.
|
||||||
|
|
||||||
The standard environment variables `ANNEX_S3_ACCESS_KEY_ID` and
|
* `x-amz-*` are passed through as http headers when storing keys
|
||||||
`ANNEX_S3_SECRET_ACCESS_KEY` can be used to supply login credentials
|
in S3.
|
||||||
for Amazon. When encryption is enabled, they are stored in encrypted form
|
|
||||||
by `git annex initremote`, so you do not need to keep the environment
|
|
||||||
variables set after the initial initalization of the remote.
|
|
||||||
|
|
|
@ -15,20 +15,18 @@ Sign up for an account, and get your access keys here:
|
||||||
# export AWS_ACCESS_KEY_ID=blahblah
|
# export AWS_ACCESS_KEY_ID=blahblah
|
||||||
# export AWS_SECRET_ACCESS_KEY=xxxxxxx
|
# export AWS_SECRET_ACCESS_KEY=xxxxxxx
|
||||||
|
|
||||||
Now go to <http://www.archive.org/create/> and create the item.
|
Specify `host=s3.us.archive.org` when doing `initremote` to set up
|
||||||
This allows you to fill in metadata which git-annex cannot provide to the
|
a remote at the Archive. This will enable a special Internet Archive mode:
|
||||||
Internet Archive. (It also works around a bug with bucket creation.)
|
Encryption is not allowed; you are required to specify a bucket name
|
||||||
|
rather than letting git-annex pick a random one; and you can optionally
|
||||||
|
specify `x-archive-meta*` headers to add metadata as explained in their
|
||||||
|
[documentation](http://www.archive.org/help/abouts3.txt).
|
||||||
|
|
||||||
(Note that there seems to be a bug in either hS3 or the archive that
|
# git annex initremote archive-panama type=S3
|
||||||
breaks authentication when the item name contains spaces or upper-case
|
# host=s3.us.archive.org bucket=panama-canal-lock-blueprints \
|
||||||
letters.. use all lowercase and no spaces.)
|
x-archive-meta-mediatype=texts x-archive-meta-language=eng \
|
||||||
|
x-archive-meta-title="original Panama Canal lock design blueprints"
|
||||||
Specify `host=s3.us.archive.org` when doing initremote to set up
|
initremote archive-panama (Internet Archive mode) (checking bucket) (creating bucket in US) ok
|
||||||
a remote at the Archive. It does not make sense to use encryption.
|
|
||||||
For the bucket name, specify the item name you created earlier.
|
|
||||||
|
|
||||||
# git annex initremote panama type=S3 encryption=none host=s3.us.archive.org bucket=panama-canal-lock-blueprints
|
|
||||||
initremote archive-panama (checking bucket) (creating bucket in US) ok
|
|
||||||
# git annex describe archive-panama "Internet Archive item for my grandfather's Panama Canal lock design blueprints"
|
# git annex describe archive-panama "Internet Archive item for my grandfather's Panama Canal lock design blueprints"
|
||||||
describe archive-panama ok
|
describe archive-panama ok
|
||||||
|
|
||||||
|
@ -36,11 +34,19 @@ Then you can annex files and copy them to the remote as usual:
|
||||||
|
|
||||||
# git annex add photo1.jpeg
|
# git annex add photo1.jpeg
|
||||||
add photo1.jpeg ok
|
add photo1.jpeg ok
|
||||||
# git annex copy photo1.jpeg --to archive-panama
|
# git annex copy photo1.jpeg --fast --to archive-panama
|
||||||
copy (checking archive-panama...) (to archive-panama...) ok
|
copy (to archive-panama...) ok
|
||||||
|
|
||||||
|
-----
|
||||||
|
|
||||||
Note that it probably makes the most sense to use the WORM backend
|
Note that it probably makes the most sense to use the WORM backend
|
||||||
for files, since that exposes the original filename in the key stored
|
for files, since that exposes the original filename in the key stored
|
||||||
in the Archive, which allows its special processing for sound files,
|
in the Archive, which allows its special processing for sound files,
|
||||||
movies, etc to be done. Also, the Internet Archive has restrictions
|
movies, etc to be done.
|
||||||
on what is allowed in a filename; particularly no spaces are allowed.
|
|
||||||
|
Also, the Internet Archive has restrictions on what is allowed in a
|
||||||
|
filename; particularly no spaces are allowed.
|
||||||
|
|
||||||
|
There seems to be a bug in either hS3 or the archive that breaks
|
||||||
|
authentication when the bucket name contains spaces or upper-case letters..
|
||||||
|
use all lowercase and no spaces when making the bucket with `initremote`.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue