git-annex/Remote/Bup.hs

289 lines
8.9 KiB
Haskell
Raw Normal View History

{- Using bup as a remote.
-
- Copyright 2011 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Remote.Bup (remote) where
import qualified Data.ByteString.Lazy as L
import qualified Data.Map as M
import System.Process
Use cryptohash rather than SHA for hashing. This is a massive win on OSX, which doesn't have a sha256sum normally. Only use external hash commands when the file is > 1 mb, since cryptohash is quite close to them in speed. SHA is still used to calculate HMACs. I don't quite understand cryptohash's API for those. Used the following benchmark to arrive at the 1 mb number. 1 mb file: benchmarking sha256/internal mean: 13.86696 ms, lb 13.83010 ms, ub 13.93453 ms, ci 0.950 std dev: 249.3235 us, lb 162.0448 us, ub 458.1744 us, ci 0.950 found 5 outliers among 100 samples (5.0%) 4 (4.0%) high mild 1 (1.0%) high severe variance introduced by outliers: 10.415% variance is moderately inflated by outliers benchmarking sha256/external mean: 14.20670 ms, lb 14.17237 ms, ub 14.27004 ms, ci 0.950 std dev: 230.5448 us, lb 150.7310 us, ub 427.6068 us, ci 0.950 found 3 outliers among 100 samples (3.0%) 2 (2.0%) high mild 1 (1.0%) high severe 2 mb file: benchmarking sha256/internal mean: 26.44270 ms, lb 26.23701 ms, ub 26.63414 ms, ci 0.950 std dev: 1.012303 ms, lb 925.8921 us, ub 1.122267 ms, ci 0.950 variance introduced by outliers: 35.540% variance is moderately inflated by outliers benchmarking sha256/external mean: 26.84521 ms, lb 26.77644 ms, ub 26.91433 ms, ci 0.950 std dev: 347.7867 us, lb 210.6283 us, ub 571.3351 us, ci 0.950 found 6 outliers among 100 samples (6.0%) import Crypto.Hash import Data.ByteString.Lazy as L import Criterion.Main import Common testfile :: FilePath testfile = "/run/shm/data" -- on ram disk main = defaultMain [ bgroup "sha256" [ bench "internal" $ whnfIO internal , bench "external" $ whnfIO external ] ] sha256 :: L.ByteString -> Digest SHA256 sha256 = hashlazy internal :: IO String internal = show . sha256 <$> L.readFile testfile external :: IO String external = do s <- readProcess "sha256sum" [testfile] return $ fst $ separate (== ' ') s
2013-09-22 23:45:08 +00:00
import Data.ByteString.Lazy.UTF8 (fromString)
2011-10-05 20:02:51 +00:00
import Common.Annex
import Types.Remote
import Types.Key
import qualified Git
2011-12-14 19:56:11 +00:00
import qualified Git.Command
import qualified Git.Config
import qualified Git.Construct
import qualified Git.Ref
import Config
import Config.Cost
import qualified Remote.Helper.Ssh as Ssh
2011-08-17 00:49:54 +00:00
import Remote.Helper.Special
import Remote.Helper.Encryptable
import Remote.Helper.Messages
import Crypto
Use cryptohash rather than SHA for hashing. This is a massive win on OSX, which doesn't have a sha256sum normally. Only use external hash commands when the file is > 1 mb, since cryptohash is quite close to them in speed. SHA is still used to calculate HMACs. I don't quite understand cryptohash's API for those. Used the following benchmark to arrive at the 1 mb number. 1 mb file: benchmarking sha256/internal mean: 13.86696 ms, lb 13.83010 ms, ub 13.93453 ms, ci 0.950 std dev: 249.3235 us, lb 162.0448 us, ub 458.1744 us, ci 0.950 found 5 outliers among 100 samples (5.0%) 4 (4.0%) high mild 1 (1.0%) high severe variance introduced by outliers: 10.415% variance is moderately inflated by outliers benchmarking sha256/external mean: 14.20670 ms, lb 14.17237 ms, ub 14.27004 ms, ci 0.950 std dev: 230.5448 us, lb 150.7310 us, ub 427.6068 us, ci 0.950 found 3 outliers among 100 samples (3.0%) 2 (2.0%) high mild 1 (1.0%) high severe 2 mb file: benchmarking sha256/internal mean: 26.44270 ms, lb 26.23701 ms, ub 26.63414 ms, ci 0.950 std dev: 1.012303 ms, lb 925.8921 us, ub 1.122267 ms, ci 0.950 variance introduced by outliers: 35.540% variance is moderately inflated by outliers benchmarking sha256/external mean: 26.84521 ms, lb 26.77644 ms, ub 26.91433 ms, ci 0.950 std dev: 347.7867 us, lb 210.6283 us, ub 571.3351 us, ci 0.950 found 6 outliers among 100 samples (6.0%) import Crypto.Hash import Data.ByteString.Lazy as L import Criterion.Main import Common testfile :: FilePath testfile = "/run/shm/data" -- on ram disk main = defaultMain [ bgroup "sha256" [ bench "internal" $ whnfIO internal , bench "external" $ whnfIO external ] ] sha256 :: L.ByteString -> Digest SHA256 sha256 = hashlazy internal :: IO String internal = show . sha256 <$> L.readFile testfile external :: IO String external = do s <- readProcess "sha256sum" [testfile] return $ fst $ separate (== ' ') s
2013-09-22 23:45:08 +00:00
import Utility.Hash
import Utility.UserInfo
import Annex.Content
import Annex.UUID
import Utility.Metered
type BupRepo = String
2011-12-31 08:11:39 +00:00
remote :: RemoteType
remote = RemoteType {
typename = "bup",
enumerate = findSpecialRemotes "buprepo",
generate = gen,
setup = bupSetup
}
gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remote)
gen r u c gc = do
bupr <- liftIO $ bup2GitRemote buprepo
cst <- remoteCost gc $
if bupLocal buprepo
then nearlyCheapRemoteCost
else expensiveRemoteCost
(u', bupr') <- getBupUUID bupr u
2011-04-09 01:37:59 +00:00
let new = Remote
{ uuid = u'
, cost = cst
, name = Git.repoDescribe r
, storeKey = store new buprepo
, retrieveKeyFile = retrieve buprepo
, retrieveKeyFileCheap = retrieveCheap buprepo
, removeKey = remove
, hasKey = checkPresent r bupr'
, hasKeyCheap = bupLocal buprepo
, whereisKey = Nothing
, remoteFsck = Nothing
, repairRepo = Nothing
, config = c
, repo = r
, gitconfig = gc
, localpath = if bupLocal buprepo && not (null buprepo)
then Just buprepo
else Nothing
, remotetype = remote
2013-03-15 23:16:13 +00:00
, globallyAvailable = not $ bupLocal buprepo
, readonly = False
}
return $ Just $ encryptableRemote c
(storeEncrypted new buprepo)
(retrieveEncrypted buprepo)
new
where
buprepo = fromMaybe (error "missing buprepo") $ remoteAnnexBupRepo gc
bupSetup :: Maybe UUID -> RemoteConfig -> Annex (RemoteConfig, UUID)
bupSetup mu c = do
u <- maybe (liftIO genUUID) return mu
-- verify configuration is sane
2011-07-15 16:47:14 +00:00
let buprepo = fromMaybe (error "Specify buprepo=") $
2011-05-15 06:49:43 +00:00
M.lookup "buprepo" c
c' <- encryptionSetup c
-- bup init will create the repository.
-- (If the repository already exists, bup init again appears safe.)
showAction "bup init"
unlessM (bup "init" buprepo []) $ error "bup init failed"
storeBupUUID u buprepo
2011-04-09 16:34:49 +00:00
-- The buprepo is stored in git config, as well as this repo's
-- persistant state, so it can vary between hosts.
gitConfigSpecialRemote u c' "buprepo" buprepo
return (c', u)
bupParams :: String -> BupRepo -> [CommandParam] -> [CommandParam]
bupParams command buprepo params =
2011-07-15 16:47:14 +00:00
Param command : [Param "-r", Param buprepo] ++ params
bup :: String -> BupRepo -> [CommandParam] -> Annex Bool
bup command buprepo params = do
showOutput -- make way for bup output
liftIO $ boolSystem "bup" $ bupParams command buprepo params
pipeBup :: [CommandParam] -> Maybe Handle -> Maybe Handle -> IO Bool
pipeBup params inh outh = do
p <- runProcess "bup" (toCommand params)
Nothing Nothing inh outh Nothing
ok <- waitForProcess p
case ok of
ExitSuccess -> return True
_ -> return False
bupSplitParams :: Remote -> BupRepo -> Key -> [CommandParam] -> Annex [CommandParam]
bupSplitParams r buprepo k src = do
let os = map Param $ remoteAnnexBupSplitOptions $ gitconfig r
showOutput -- make way for bup output
return $ bupParams "split" buprepo
2012-10-24 14:54:58 +00:00
(os ++ [Param "-n", Param (bupRef k)] ++ src)
store :: Remote -> BupRepo -> Key -> AssociatedFile -> MeterUpdate -> Annex Bool
store r buprepo k _f _p = sendAnnex k (rollback k buprepo) $ \src -> do
params <- bupSplitParams r buprepo k [File src]
liftIO $ boolSystem "bup" params
storeEncrypted :: Remote -> BupRepo -> (Cipher, Key) -> Key -> MeterUpdate -> Annex Bool
storeEncrypted r buprepo (cipher, enck) k _p =
sendAnnex k (rollback enck buprepo) $ \src -> do
params <- bupSplitParams r buprepo enck []
liftIO $ catchBoolIO $
encrypt (getGpgEncParams r) cipher (feedFile src) $ \h ->
pipeBup params (Just h) Nothing
retrieve :: BupRepo -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex Bool
retrieve buprepo k _f d _p = do
let params = bupParams "join" buprepo [Param $ bupRef k]
liftIO $ catchBoolIO $ do
tofile <- openFile d WriteMode
pipeBup params Nothing (Just tofile)
retrieveCheap :: BupRepo -> Key -> FilePath -> Annex Bool
retrieveCheap _ _ _ = return False
retrieveEncrypted :: BupRepo -> (Cipher, Key) -> Key -> FilePath -> MeterUpdate -> Annex Bool
retrieveEncrypted buprepo (cipher, enck) _ f _p = liftIO $ catchBoolIO $
withHandle StdoutHandle createProcessSuccess p $ \h -> do
decrypt cipher (\toh -> L.hPut toh =<< L.hGetContents h) $
readBytes $ L.writeFile f
2011-04-17 04:57:11 +00:00
return True
2012-11-11 04:51:07 +00:00
where
params = bupParams "join" buprepo [Param $ bupRef enck]
p = proc "bup" $ toCommand params
remove :: Key -> Annex Bool
remove _ = do
warning "content cannot be removed from bup remote"
return False
{- Cannot revert having stored a key in bup, but at least the data for the
- key will be used for deltaing data of other keys stored later.
-
- We can, however, remove the git branch that bup created for the key.
-}
rollback :: Key -> BupRepo -> Annex ()
rollback k bupr = go =<< liftIO (bup2GitRemote bupr)
where
go r
| Git.repoIsUrl r = void $ onBupRemote r boolSystem "git" params
| otherwise = void $ liftIO $ catchMaybeIO $
boolSystem "git" $ Git.Command.gitCommandLine params r
params = [ Params "branch -D", Param (bupRef k) ]
{- Bup does not provide a way to tell if a given dataset is present
- in a bup repository. One way it to check if the git repository has
- a branch matching the name (as created by bup split -n).
-}
checkPresent :: Git.Repo -> Git.Repo -> Key -> Annex (Either String Bool)
checkPresent r bupr k
| Git.repoIsUrl bupr = do
showChecking r
ok <- onBupRemote bupr boolSystem "git" params
return $ Right ok
| otherwise = liftIO $ catchMsgIO $
2011-12-14 19:56:11 +00:00
boolSystem "git" $ Git.Command.gitCommandLine params bupr
2012-11-11 04:51:07 +00:00
where
params =
[ Params "show-ref --quiet --verify"
, Param $ "refs/heads/" ++ bupRef k
]
2011-04-09 16:34:49 +00:00
{- Store UUID in the annex.uuid setting of the bup repository. -}
storeBupUUID :: UUID -> BupRepo -> Annex ()
storeBupUUID u buprepo = do
r <- liftIO $ bup2GitRemote buprepo
2011-04-09 16:34:49 +00:00
if Git.repoIsUrl r
then do
showAction "storing uuid"
unlessM (onBupRemote r boolSystem "git"
[Params $ "config annex.uuid " ++ v]) $
error "ssh failed"
2011-04-09 16:34:49 +00:00
else liftIO $ do
r' <- Git.Config.read r
let olduuid = Git.Config.get "annex.uuid" "" r'
when (olduuid == "") $
Git.Command.run
[ Param "config"
, Param "annex.uuid"
, Param v
] r'
2012-11-11 04:51:07 +00:00
where
v = fromUUID u
2011-04-09 16:34:49 +00:00
onBupRemote :: Git.Repo -> (FilePath -> [CommandParam] -> IO a) -> FilePath -> [CommandParam] -> Annex a
onBupRemote r a command params = do
sshparams <- Ssh.toRepo r [Param $
2011-07-15 16:47:14 +00:00
"cd " ++ dir ++ " && " ++ unwords (command : toCommand params)]
liftIO $ a "ssh" sshparams
where
path = Git.repoPath r
base = fromMaybe path (stripPrefix "/~/" path)
dir = shellEscape base
{- Allow for bup repositories on removable media by checking
- local bup repositories to see if they are available, and getting their
- uuid (which may be different from the stored uuid for the bup remote).
-
2011-12-10 22:51:01 +00:00
- If a bup repository is not available, returns NoUUID.
- This will cause checkPresent to indicate nothing from the bup remote
- is known to be present.
-
- Also, returns a version of the repo with config read, if it is local.
-}
getBupUUID :: Git.Repo -> UUID -> Annex (UUID, Git.Repo)
getBupUUID r u
| Git.repoIsUrl r = return (u, r)
| otherwise = liftIO $ do
ret <- tryIO $ Git.Config.read r
case ret of
Right r' -> return (toUUID $ Git.Config.get "annex.uuid" "" r', r')
Left _ -> return (NoUUID, r)
2011-04-09 16:34:49 +00:00
{- Converts a bup remote path spec into a Git.Repo. There are some
- differences in path representation between git and bup. -}
bup2GitRemote :: BupRepo -> IO Git.Repo
2011-04-09 16:34:49 +00:00
bup2GitRemote "" = do
-- bup -r "" operates on ~/.bup
h <- myHomeDir
Git.Construct.fromAbsPath $ h </> ".bup"
2011-04-09 16:34:49 +00:00
bup2GitRemote r
| bupLocal r =
if "/" `isPrefixOf` r
then Git.Construct.fromAbsPath r
2011-04-09 16:34:49 +00:00
else error "please specify an absolute path"
| otherwise = Git.Construct.fromUrl $ "ssh://" ++ host ++ slash dir
2012-11-11 04:51:07 +00:00
where
bits = split ":" r
host = Prelude.head bits
dir = intercalate ":" $ drop 1 bits
2012-11-11 04:51:07 +00:00
-- "host:~user/dir" is not supported specially by bup;
-- "host:dir" is relative to the home directory;
-- "host:" goes in ~/.bup
slash d
| null d = "/~/.bup"
| "/" `isPrefixOf` d = d
| otherwise = "/~/" ++ d
2011-04-09 16:34:49 +00:00
{- Converts a key into a git ref name, which bup-split -n will use to point
- to it. -}
bupRef :: Key -> String
bupRef k
| Git.Ref.legal True shown = shown
Use cryptohash rather than SHA for hashing. This is a massive win on OSX, which doesn't have a sha256sum normally. Only use external hash commands when the file is > 1 mb, since cryptohash is quite close to them in speed. SHA is still used to calculate HMACs. I don't quite understand cryptohash's API for those. Used the following benchmark to arrive at the 1 mb number. 1 mb file: benchmarking sha256/internal mean: 13.86696 ms, lb 13.83010 ms, ub 13.93453 ms, ci 0.950 std dev: 249.3235 us, lb 162.0448 us, ub 458.1744 us, ci 0.950 found 5 outliers among 100 samples (5.0%) 4 (4.0%) high mild 1 (1.0%) high severe variance introduced by outliers: 10.415% variance is moderately inflated by outliers benchmarking sha256/external mean: 14.20670 ms, lb 14.17237 ms, ub 14.27004 ms, ci 0.950 std dev: 230.5448 us, lb 150.7310 us, ub 427.6068 us, ci 0.950 found 3 outliers among 100 samples (3.0%) 2 (2.0%) high mild 1 (1.0%) high severe 2 mb file: benchmarking sha256/internal mean: 26.44270 ms, lb 26.23701 ms, ub 26.63414 ms, ci 0.950 std dev: 1.012303 ms, lb 925.8921 us, ub 1.122267 ms, ci 0.950 variance introduced by outliers: 35.540% variance is moderately inflated by outliers benchmarking sha256/external mean: 26.84521 ms, lb 26.77644 ms, ub 26.91433 ms, ci 0.950 std dev: 347.7867 us, lb 210.6283 us, ub 571.3351 us, ci 0.950 found 6 outliers among 100 samples (6.0%) import Crypto.Hash import Data.ByteString.Lazy as L import Criterion.Main import Common testfile :: FilePath testfile = "/run/shm/data" -- on ram disk main = defaultMain [ bgroup "sha256" [ bench "internal" $ whnfIO internal , bench "external" $ whnfIO external ] ] sha256 :: L.ByteString -> Digest SHA256 sha256 = hashlazy internal :: IO String internal = show . sha256 <$> L.readFile testfile external :: IO String external = do s <- readProcess "sha256sum" [testfile] return $ fst $ separate (== ' ') s
2013-09-22 23:45:08 +00:00
| otherwise = "git-annex-" ++ show (sha256 (fromString shown))
2012-11-11 04:51:07 +00:00
where
shown = key2file k
bupLocal :: BupRepo -> Bool
2011-04-09 16:34:49 +00:00
bupLocal = notElem ':'