Amazon Glacier special remote; 100% working

This commit is contained in:
Joey Hess 2012-11-20 16:43:58 -04:00
parent d093587abf
commit a5111a6d85
16 changed files with 429 additions and 33 deletions

View file

@ -17,6 +17,7 @@ import Assistant.WebApp.Types
import Assistant.WebApp.SideBar
import Utility.Yesod
import qualified Remote.S3 as S3
import qualified Remote.Helper.AWS as AWS
import Logs.Remote
import qualified Remote
import Types.Remote (RemoteConfig)
@ -113,7 +114,7 @@ getEnableS3R uuid = s3Configurator $ do
makeS3Remote :: S3Creds -> String -> (Remote -> Handler ()) -> RemoteConfig -> Handler ()
makeS3Remote (S3Creds ak sk) name setup config = do
remotename <- runAnnex name $ fromRepo $ uniqueRemoteName name 0
liftIO $ S3.setCredsEnv (T.unpack ak, T.unpack sk)
liftIO $ AWS.setCredsEnv (T.unpack ak, T.unpack sk)
r <- liftAssistant $ liftAnnex $ addRemote $ do
makeSpecialRemote name S3.remote config
return remotename

View file

@ -34,7 +34,7 @@ data CredPairStorage = CredPairStorage
{- Stores creds in a remote's configuration, if the remote allows
- that. Otherwise, caches them locally. -}
setRemoteCredPair :: RemoteConfig -> CredPairStorage -> Annex RemoteConfig
setRemoteCredPair c storage = go =<< getRemoteCredPair c storage
setRemoteCredPair c storage = go =<< getRemoteCredPair' c storage
where
go (Just creds)
| embedCreds c = case credPairRemoteKey storage of
@ -58,8 +58,20 @@ setRemoteCredPair c storage = go =<< getRemoteCredPair c storage
{- Gets a remote's credpair, from the environment if set, otherwise
- from the cache in gitAnnexCredsDir, or failing that, from the
- value in RemoteConfig. -}
getRemoteCredPair :: RemoteConfig -> CredPairStorage -> Annex (Maybe CredPair)
getRemoteCredPair c storage = maybe fromcache (return . Just) =<< fromenv
getRemoteCredPair :: String -> RemoteConfig -> CredPairStorage -> Annex (Maybe CredPair)
getRemoteCredPair this c storage = maybe missing (return . Just) =<< getRemoteCredPair' c storage
where
(loginvar, passwordvar) = credPairEnvironment storage
missing = do
warning $ unwords
[ "Set both", loginvar
, "and", passwordvar
, "to use", this
]
return Nothing
getRemoteCredPair' :: RemoteConfig -> CredPairStorage -> Annex (Maybe CredPair)
getRemoteCredPair' c storage = maybe fromcache (return . Just) =<< fromenv
where
fromenv = liftIO $ getEnvCredPair storage
fromcache = maybe fromconfig (return . Just) =<< readCacheCredPair storage
@ -91,6 +103,8 @@ getEnvCredPair storage = liftM2 (,)
(uenv, penv) = credPairEnvironment storage
get = catchMaybeIO . getEnv
{- Stores a CredPair in the environment. -}
setEnvCredPair :: CredPair -> CredPairStorage -> IO ()
setEnvCredPair (l, p) storage = do

View file

@ -163,6 +163,7 @@ options = Option.common ++
"skip files smaller than a size"
, Option ['T'] ["time-limit"] (ReqArg Limit.addTimeLimit paramTime)
"stop after the specified amount of time"
, Option [] ["trust-glacier"] (NoArg (Annex.setFlag "trustglacier")) "Trust Amazon Glacier inventory"
] ++ Option.matcher
where
setnumcopies v = Annex.changeState $

235
Remote/Glacier.hs Normal file
View file

@ -0,0 +1,235 @@
{- Amazon Glacier remotes.
-
- Copyright 2012 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Remote.Glacier (remote) where
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.Map as M
import System.Environment
import Common.Annex
import Types.Remote
import Types.Key
import qualified Git
import Config
import Remote.Helper.Special
import Remote.Helper.Encryptable
import qualified Remote.Helper.AWS as AWS
import Crypto
import Creds
import Annex.Content
import qualified Annex
type Vault = String
type Archive = FilePath
remote :: RemoteType
remote = RemoteType {
typename = "glacier",
enumerate = findSpecialRemotes "glacier",
generate = gen,
setup = glacierSetup
}
gen :: Git.Repo -> UUID -> Maybe RemoteConfig -> Annex Remote
gen r u c = do
cst <- remoteCost r expensiveRemoteCost
return $ gen' r u c cst
gen' :: Git.Repo -> UUID -> Maybe RemoteConfig -> Int -> Remote
gen' r u c cst =
encryptableRemote c
(storeEncrypted this)
(retrieveEncrypted this)
this
where
this = Remote {
uuid = u,
cost = cst,
name = Git.repoDescribe r,
storeKey = store this,
retrieveKeyFile = retrieve this,
retrieveKeyFileCheap = retrieveCheap this,
removeKey = remove this,
hasKey = checkPresent this,
hasKeyCheap = False,
whereisKey = Nothing,
config = c,
repo = r,
localpath = Nothing,
readonly = False,
remotetype = remote
}
glacierSetup :: UUID -> RemoteConfig -> Annex RemoteConfig
glacierSetup u c = do
c' <- encryptionSetup c
let fullconfig = c' `M.union` defaults
genVault fullconfig u
gitConfigSpecialRemote u fullconfig "glacier" "true"
setRemoteCredPair fullconfig (AWS.creds u)
where
remotename = fromJust (M.lookup "name" c)
defvault = remotename ++ "-" ++ fromUUID u
defaults = M.fromList
[ ("datacenter", "us-east-1")
, ("vault", defvault)
]
store :: Remote -> Key -> AssociatedFile -> MeterUpdate -> Annex Bool
store r k _f _p
| keySize k == Just 0 = do
warning "Cannot store empty files in Glacier."
return False
| otherwise = do
src <- inRepo $ gitAnnexLocation k
storeHelper r k src
storeEncrypted :: Remote -> (Cipher, Key) -> Key -> MeterUpdate -> Annex Bool
storeEncrypted r (cipher, enck) k _p =
-- With current glacier-cli UI, have to encrypt to a temp file.
withTmp enck $ \tmp -> do
f <- inRepo $ gitAnnexLocation k
liftIO $ encrypt cipher (feedFile f) $
readBytes $ L.writeFile tmp
storeHelper r enck tmp
{- Glacier cannot store empty files. So empty keys are handled by
- doing nothing on storage, and re-creating the empty file on retrieve. -}
storeHelper :: Remote -> Key -> FilePath -> Annex Bool
storeHelper r k file = do
showOutput
glacierAction r
[ Param "archive"
, Param "upload"
, Param "--name", Param $ archive r k
, Param $ remoteVault r
, File file
]
retrieve :: Remote -> Key -> AssociatedFile -> FilePath -> Annex Bool
retrieve r k _f d = retrieveHelper r k d
retrieveCheap :: Remote -> Key -> FilePath -> Annex Bool
retrieveCheap _ _ _ = return False
retrieveEncrypted :: Remote -> (Cipher, Key) -> Key -> FilePath -> Annex Bool
retrieveEncrypted r (cipher, enck) _k d = do
withTmp enck $ \tmp -> do
ok <- retrieveHelper r enck tmp
if ok
then liftIO $ decrypt cipher (feedFile tmp) $
readBytes $ \content -> do
L.writeFile d content
return True
else return False
retrieveHelper :: Remote -> Key -> FilePath -> Annex Bool
retrieveHelper r k file = do
showOutput
ok <- glacierAction r
[ Param "archive"
, Param "retrieve"
, Param "-o", File file
, Param $ remoteVault r
, Param $ archive r k
]
unless ok $
showLongNote "Recommend you wait up to 4 hours, and then run this command again."
return ok
remove :: Remote -> Key -> Annex Bool
remove r k = glacierAction r
[ Param "archive"
, Param "delete"
, Param $ remoteVault r
, Param $ archive r k
]
checkPresent :: Remote -> Key -> Annex (Either String Bool)
checkPresent r k = do
showAction $ "checking " ++ name r
go =<< glacierEnv (fromJust $ config r) (uuid r)
where
go Nothing = return $ Left "cannot check glacier"
go (Just env) = do
{- glacier checkpresent outputs the archive name to stdout if
- it's present. -}
v <- liftIO $ catchMsgIO $
readProcessEnv "glacier" (toCommand params) (Just env)
case v of
Right s -> do
let probablypresent = key2file k `elem` lines s
if probablypresent
then ifM (Annex.getFlag "trustglacier")
( return $ Right True, untrusted )
else return $ Right False
Left e -> return $ Left e
params =
[ Param "archive"
, Param "checkpresent"
, Param $ remoteVault r
, Param $ archive r k
]
untrusted = do
showLongNote $ unlines
[ "Glacier's inventory says it has a copy."
, "However, the inventory could be out of date, if it was recently removed."
, "(Use --trust-glacier if you're sure it's still in Glacier.)"
, ""
]
return $ Right False
glacierAction :: Remote -> [CommandParam] -> Annex Bool
glacierAction r params = do
when (isNothing $ config r) $
error $ "Missing configuration for special remote " ++ name r
runGlacier (fromJust $ config r) (uuid r) params
runGlacier :: RemoteConfig -> UUID -> [CommandParam] -> Annex Bool
runGlacier c u params = go =<< glacierEnv c u
where
go Nothing = return False
go (Just env) = liftIO $
boolSystemEnv "glacier" (datacenter:params) (Just env)
datacenter = Param $ "--region=" ++
(fromJust $ M.lookup "datacenter" c)
glacierEnv :: RemoteConfig -> UUID -> Annex (Maybe [(String, String)])
glacierEnv c u = go =<< getRemoteCredPair "glacier" c creds
where
go Nothing = return Nothing
go (Just (user, pass)) = do
env <- liftIO getEnvironment
return $ Just $ (uk, user):(pk, pass):env
creds = AWS.creds u
(uk, pk) = credPairEnvironment creds
remoteVault :: Remote -> Vault
remoteVault = vault . fromJust . config
vault :: RemoteConfig -> Vault
vault = fromJust . M.lookup "vault"
archive :: Remote -> Key -> Archive
archive r k = fileprefix ++ key2file k
where
fileprefix = M.findWithDefault "" "fileprefix" $ fromJust $ config r
-- glacier vault create will succeed even if the vault already exists.
genVault :: RemoteConfig -> UUID -> Annex ()
genVault c u = unlessM (runGlacier c u params) $
error "Failed creating glacier vault."
where
params =
[ Param "vault"
, Param "create"
, Param $ vault c
]

21
Remote/Helper/AWS.hs Normal file
View file

@ -0,0 +1,21 @@
{- Amazon Web Services common infrastructure.
-
- Copyright 2011,2012 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Remote.Helper.AWS where
import Common.Annex
import Creds
creds :: UUID -> CredPairStorage
creds u = CredPairStorage
{ credPairFile = fromUUID u
, credPairEnvironment = ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
, credPairRemoteKey = Just "s3creds"
}
setCredsEnv :: CredPair -> IO ()
setCredsEnv p = setEnvCredPair p $ creds undefined

View file

@ -32,6 +32,7 @@ import qualified Remote.Web
#ifdef WITH_WEBDAV
import qualified Remote.WebDAV
#endif
import qualified Remote.Glacier
import qualified Remote.Hook
remoteTypes :: [RemoteType]
@ -47,6 +48,7 @@ remoteTypes =
#ifdef WITH_WEBDAV
, Remote.WebDAV.remote
#endif
, Remote.Glacier.remote
, Remote.Hook.remote
]

View file

@ -5,7 +5,7 @@
- Licensed under the GNU GPL version 3 or higher.
-}
module Remote.S3 (remote, setCredsEnv) where
module Remote.S3 (remote) where
import Network.AWS.AWSConnection
import Network.AWS.S3Object
@ -22,6 +22,7 @@ import qualified Git
import Config
import Remote.Helper.Special
import Remote.Helper.Encryptable
import qualified Remote.Helper.AWS as AWS
import Crypto
import Creds
import Meters
@ -84,7 +85,7 @@ s3Setup u c = handlehost $ M.lookup "host" c
use fullconfig = do
gitConfigSpecialRemote u fullconfig "s3" "true"
setRemoteCredPair fullconfig (s3Creds u)
setRemoteCredPair fullconfig (AWS.creds u)
defaulthost = do
c' <- encryptionSetup c
@ -261,28 +262,13 @@ s3ConnectionRequired c u =
maybe (error "Cannot connect to S3") return =<< s3Connection c u
s3Connection :: RemoteConfig -> UUID -> Annex (Maybe AWSConnection)
s3Connection c u = go =<< getRemoteCredPair c creds
s3Connection c u = go =<< getRemoteCredPair "S3" c (AWS.creds u)
where
go Nothing = do
warning $ "Set both " ++ s3AccessKey ++ " and " ++ s3SecretKey ++ " to use S3"
return Nothing
go Nothing = return Nothing
go (Just (ak, sk)) = return $ Just $ AWSConnection host port ak sk
creds = s3Creds u
(s3AccessKey, s3SecretKey) = credPairEnvironment creds
host = fromJust $ M.lookup "host" c
port = let s = fromJust $ M.lookup "port" c in
case reads s of
[(p, _)] -> p
_ -> error $ "bad S3 port value: " ++ s
s3Creds :: UUID -> CredPairStorage
s3Creds u = CredPairStorage
{ credPairFile = fromUUID u
, credPairEnvironment = ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
, credPairRemoteKey = Just "s3creds"
}
setCredsEnv :: (String, String) -> IO ()
setCredsEnv creds = setEnvCredPair creds $ s3Creds undefined

View file

@ -321,13 +321,7 @@ noProps :: XML.Document
noProps = XML.parseText_ XML.def $ LT.pack "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<propertyupdate/>"
getCreds :: RemoteConfig -> UUID -> Annex (Maybe CredPair)
getCreds c u = maybe missing (return . Just) =<< getRemoteCredPair c creds
where
creds = davCreds u
(loginvar, passwordvar) = credPairEnvironment creds
missing = do
warning $ "Set both " ++ loginvar ++ " and " ++ passwordvar ++ " to use webdav"
return Nothing
getCreds c u = getRemoteCredPair "webdav" c (davCreds u)
davCreds :: UUID -> CredPairStorage
davCreds u = CredPairStorage

2
debian/changelog vendored
View file

@ -1,6 +1,6 @@
git-annex (3.20121113) UNRELEASED; urgency=low
* New webdav special remotes.
* New webdav and Amazon glacier special remotes.
* Show error message to user when testing XMPP creds.
* Fix build of assistant without yesod.
* webapp: The list of repositiories refreshes when new repositories are

View file

@ -15,7 +15,7 @@ More should be added, such as:
* Box.com (it's free, and current method is hard to set up and a sorta
shakey; a better method would be to use its API) **done**
* Dropbox? That would be ironic.. Via its API, presumably.
* [[Amazon Glacier|todo/special_remote_for_amazon_glacier]]
* [[Amazon Glacier|todo/special_remote_for_amazon_glacier]] **done**
* [nimbus.io](https://nimbus.io/) Fairly low prices ($0.06/GB);
REST API; free software

View file

@ -568,6 +568,17 @@ subdirectories).
The repository should be specified using the name of a configured remote,
or the UUID or description of a repository.
* --trust-glacier-inventory
Amazon Glacier inventories take hours to retrieve, and may not represent
the current state of a repository. So git-annex does not trust that
files that the inventory claims are in Glacier are really there.
This switch can be used to allow it to trust the inventory.
Be careful using this, especially if you or someone else might have recently
removed a file from Glacier. If you try to drop the only other copy of the
file, and this switch is enabled, you could lose data!
* --backend=name
Specifies which key-value backend to use. This can be used when
@ -885,6 +896,11 @@ Here are all the supported configuration settings.
Used to identify Amazon S3 special remotes.
Normally this is automatically set up by `git annex initremote`.
* `remote.<name>.glacier`
Used to identify Amazon Glacier special remotes.
Normally this is automatically set up by `git annex initremote`.
* `remote.<name>.webdav`
Used to identify webdav special remotes.

View file

@ -19,6 +19,7 @@ into many cloud services. Here are specific instructions
for various cloud things:
* [[tips/using_Amazon_S3]]
* [[tips/using_Amazon_Glacier]]
* [[tips/Internet_Archive_via_S3]]
* [[tahoe-lafs|forum/tips:_special__95__remotes__47__hook_with_tahoe-lafs]]
* [[tips/using_box.com_as_a_special_remote]]

View file

@ -0,0 +1,51 @@
This special remote type stores file contents in Amazon Glacier.
To use it, you need to have [glacier-cli](http://github.com/basak/glacier-cli)
installed.
The unusual thing about Amazon Glacier is the multiple-hour delay it takes
to retrieve information out of Glacier. To deal with this, commands like
"git-annex get" request Glacier start the retrieval process, and will fail
due to the data not yet being available. You can then wait appriximately
four hours, re-run the same command, and this time, it will actually
download the data.
## configuration
The standard environment variables `AWS_ACCESS_KEY_ID` and
`AWS_SECRET_ACCESS_KEY` are used to supply login credentials
for Amazon. You need to set these only when running
`git annex initremote`, as they will be cached in a file only you
can read inside the local git repository.
A number of parameters can be passed to `git annex initremote` to configure
the Glacier remote.
* `encryption` - Required. Either "none" to disable encryption (not recommended),
or a value that can be looked up (using gpg -k) to find a gpg encryption
key that will be given access to the remote, or "shared" which allows
every clone of the repository to access the encrypted data (use with caution).
Note that additional gpg keys can be given access to a remote by
rerunning initremote with the new key id. See [[encryption]].
* `embedcreds` - Optional. Set to "yes" embed the login credentials inside
the git repository, which allows other clones to also access them. This is
the default when gpg encryption is enabled; the credentials are stored
encrypted and only those with the repository's keys can access them.
It is not the default when using shared encryption, or no encryption.
Think carefully about who can access your repository before using
embedcreds without gpg encryption.
* `datacenter` - Defaults to "us-east-1".
* `vault` - Glacier requires that vaults have a globally unique name,
so by default, a vault name is chosen based on the remote name
and UUID. This can be specified to pick a valult name.
* `fileprefix` - By default, git-annex places files in a tree rooted at the
top of the Glacier vault. When this is set, it's prefixed to the filenames
used. For example, you could set it to "foo/" in one special remote,
and to "bar/" in another special remote, and both special remotes could
then use the same vault.

View file

@ -0,0 +1,69 @@
Amazon Glacier provides low-cost storage, well suited for archiving and
backup. But it takes around 4 hours to get content out of Glacier.
Recent versions of git-annex support Glacier. To use it, you need to have
[glacier-cli](http://github.com/basak/glacier-cli) installed.
First, export your Amazon AWS credentials:
# export AWS_ACCESS_KEY_ID="08TJMT99S3511WOZEP91"
# export AWS_SECRET_ACCESS_KEY="s3kr1t"
Now, create a gpg key, if you don't already have one. This will be used
to encrypt everything stored in Glacier, for your privacy. Once you have
a gpg key, run `gpg --list-secret-keys` to look up its key id, something
like "2512E3C7"
Next, create the Glacier remote.
# git annex initremote glacier type=glacier encryption=2512E3C7
initremote glacier (encryption setup with gpg key C910D9222512E3C7) (gpg) ok
The configuration for the Glacier remote is stored in git. So to make another
repository use the same Glacier remote is easy:
# cd /media/usb/annex
# git pull laptop
# git annex initremote glacier
initremote glacier (gpg) ok
Now the remote can be used like any other remote.
# git annex move my_cool_big_file --to glacier
copy my_cool_big_file (gpg) (checking glacier...) (to glacier...) ok
But, when you try to get a file out of Glacier, it'll queue a retrieval
job:
# git annex get my_cool_big_file
get my_cool_big_file (from glacier...) (gpg)
glacier: queued retrieval job for archive 'GPGHMACSHA1--862afd4e67e3946587a9ef7fa5beb4e8f1aeb6b8'
Recommend you wait up to 4 hours, and then run this command again.
failed
Like it says, you'll need to run the command again later. Let's remember to
do that:
# at now + 4 hours
at> git annex get my_cool_big_file
Another oddity of Glacier is that git-annex is never entirely sure
if a file is still in Glacier. Glacier inventories take hours to retrieve,
and even when retrieved do not necessarily represent the current state.
So, git-annex plays it safe, and avoids trusting the inventory:
# git annex copy important_file --to glacier
copy important_file (gpg) (checking glacier...) (to glacier...) ok
# git annex drop important_file
drop important_file (gpg) (checking glacier...)
However, the inventory could be out of date, if it was recently removed.
(Use --trust-glacier-inventory if you're sure it's still in Glacier.)
(unsafe)
Could only verify the existence of 0 out of 1 necessary copies
Like it says, you can use `--trust-glacier-inventory` if you're sure
Glacier's inventory is correct and up-to-date.
See [[special_remotes/Glacier]] for details.

View file

@ -2,7 +2,7 @@ git-annex extends git's usual remotes with some [[special_remotes]], that
are not git repositories. This way you can set up a remote using say,
Amazon S3, and use git-annex to transfer files into the cloud.
First, export your S3 credentials:
First, export your Amazon AWS credentials:
# export AWS_ACCESS_KEY_ID="08TJMT99S3511WOZEP91"
# export AWS_SECRET_ACCESS_KEY="s3kr1t"

View file

@ -18,8 +18,13 @@ run, or files to transfer, at that point.
--[[Joey]]
> [[done]]! --[[Joey]]
-----
> In the coming months, Amazon S3 will introduce an option that will allow customers to seamlessly move data between Amazon S3 and Amazon Glacier based on data lifecycle policies.
-- <http://aws.amazon.com/glacier/faqs/#How_should_I_choose_between_Amazon_Glacier_and_Amazon_S3>
>> They did, but it's IMHO not very useful for git-annex. It's rather
>> intended to allow aging S3 storage out to Glacier. --[[Joey]]