git-annex/Remote/Bup.hs

318 lines
9.7 KiB
Haskell
Raw Normal View History

{- Using bup as a remote.
-
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Remote.Bup (remote) where
import qualified Data.Map as M
2019-12-02 16:26:33 +00:00
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
Use cryptohash rather than SHA for hashing. This is a massive win on OSX, which doesn't have a sha256sum normally. Only use external hash commands when the file is > 1 mb, since cryptohash is quite close to them in speed. SHA is still used to calculate HMACs. I don't quite understand cryptohash's API for those. Used the following benchmark to arrive at the 1 mb number. 1 mb file: benchmarking sha256/internal mean: 13.86696 ms, lb 13.83010 ms, ub 13.93453 ms, ci 0.950 std dev: 249.3235 us, lb 162.0448 us, ub 458.1744 us, ci 0.950 found 5 outliers among 100 samples (5.0%) 4 (4.0%) high mild 1 (1.0%) high severe variance introduced by outliers: 10.415% variance is moderately inflated by outliers benchmarking sha256/external mean: 14.20670 ms, lb 14.17237 ms, ub 14.27004 ms, ci 0.950 std dev: 230.5448 us, lb 150.7310 us, ub 427.6068 us, ci 0.950 found 3 outliers among 100 samples (3.0%) 2 (2.0%) high mild 1 (1.0%) high severe 2 mb file: benchmarking sha256/internal mean: 26.44270 ms, lb 26.23701 ms, ub 26.63414 ms, ci 0.950 std dev: 1.012303 ms, lb 925.8921 us, ub 1.122267 ms, ci 0.950 variance introduced by outliers: 35.540% variance is moderately inflated by outliers benchmarking sha256/external mean: 26.84521 ms, lb 26.77644 ms, ub 26.91433 ms, ci 0.950 std dev: 347.7867 us, lb 210.6283 us, ub 571.3351 us, ci 0.950 found 6 outliers among 100 samples (6.0%) import Crypto.Hash import Data.ByteString.Lazy as L import Criterion.Main import Common testfile :: FilePath testfile = "/run/shm/data" -- on ram disk main = defaultMain [ bgroup "sha256" [ bench "internal" $ whnfIO internal , bench "external" $ whnfIO external ] ] sha256 :: L.ByteString -> Digest SHA256 sha256 = hashlazy internal :: IO String internal = show . sha256 <$> L.readFile testfile external :: IO String external = do s <- readProcess "sha256sum" [testfile] return $ fst $ separate (== ' ') s
2013-09-22 23:45:08 +00:00
import Data.ByteString.Lazy.UTF8 (fromString)
import Annex.Common
2014-05-16 20:08:20 +00:00
import qualified Annex
import Types.Remote
import Types.Creds
import Git.Types (ConfigValue(..), fromConfigKey)
import qualified Git
2011-12-14 19:56:11 +00:00
import qualified Git.Command
import qualified Git.Config
import qualified Git.Construct
import qualified Git.Ref
import Config
import Config.Cost
import qualified Remote.Helper.Ssh as Ssh
import Annex.SpecialRemote.Config
2011-08-17 00:49:54 +00:00
import Remote.Helper.Special
import Remote.Helper.Messages
2019-02-20 19:55:01 +00:00
import Remote.Helper.ExportImport
Use cryptohash rather than SHA for hashing. This is a massive win on OSX, which doesn't have a sha256sum normally. Only use external hash commands when the file is > 1 mb, since cryptohash is quite close to them in speed. SHA is still used to calculate HMACs. I don't quite understand cryptohash's API for those. Used the following benchmark to arrive at the 1 mb number. 1 mb file: benchmarking sha256/internal mean: 13.86696 ms, lb 13.83010 ms, ub 13.93453 ms, ci 0.950 std dev: 249.3235 us, lb 162.0448 us, ub 458.1744 us, ci 0.950 found 5 outliers among 100 samples (5.0%) 4 (4.0%) high mild 1 (1.0%) high severe variance introduced by outliers: 10.415% variance is moderately inflated by outliers benchmarking sha256/external mean: 14.20670 ms, lb 14.17237 ms, ub 14.27004 ms, ci 0.950 std dev: 230.5448 us, lb 150.7310 us, ub 427.6068 us, ci 0.950 found 3 outliers among 100 samples (3.0%) 2 (2.0%) high mild 1 (1.0%) high severe 2 mb file: benchmarking sha256/internal mean: 26.44270 ms, lb 26.23701 ms, ub 26.63414 ms, ci 0.950 std dev: 1.012303 ms, lb 925.8921 us, ub 1.122267 ms, ci 0.950 variance introduced by outliers: 35.540% variance is moderately inflated by outliers benchmarking sha256/external mean: 26.84521 ms, lb 26.77644 ms, ub 26.91433 ms, ci 0.950 std dev: 347.7867 us, lb 210.6283 us, ub 571.3351 us, ci 0.950 found 6 outliers among 100 samples (6.0%) import Crypto.Hash import Data.ByteString.Lazy as L import Criterion.Main import Common testfile :: FilePath testfile = "/run/shm/data" -- on ram disk main = defaultMain [ bgroup "sha256" [ bench "internal" $ whnfIO internal , bench "external" $ whnfIO external ] ] sha256 :: L.ByteString -> Digest SHA256 sha256 = hashlazy internal :: IO String internal = show . sha256 <$> L.readFile testfile external :: IO String external = do s <- readProcess "sha256sum" [testfile] return $ fst $ separate (== ' ') s
2013-09-22 23:45:08 +00:00
import Utility.Hash
import Utility.UserInfo
import Annex.UUID
import Annex.Ssh
import Utility.Metered
import Types.ProposedAccepted
type BupRepo = String
2011-12-31 08:11:39 +00:00
remote :: RemoteType
remote = specialRemoteType $ RemoteType
{ typename = "bup"
, enumerate = const (findSpecialRemotes "buprepo")
, generate = gen
, configParser = mkRemoteConfigParser
[ optionalStringParser buprepoField
(FieldDesc "(required) bup repository to use")
]
, setup = bupSetup
, exportSupported = exportUnsupported
2019-02-20 19:55:01 +00:00
, importSupported = importUnsupported
}
buprepoField :: RemoteConfigField
buprepoField = Accepted "buprepo"
gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle -> Annex (Maybe Remote)
gen r u rc gc rs = do
c <- parsedRemoteConfig remote rc
bupr <- liftIO $ bup2GitRemote buprepo
cst <- remoteCost gc $
if bupLocal buprepo
then nearlyCheapRemoteCost
else expensiveRemoteCost
(u', bupr') <- getBupUUID bupr u
2011-04-09 01:37:59 +00:00
let this = Remote
{ uuid = u'
, cost = cst
, name = Git.repoDescribe r
, storeKey = storeKeyDummy
, retrieveKeyFile = retrieveKeyFileDummy
, retrieveKeyFileCheap = Nothing
-- Bup uses git, which cryptographically verifies content
-- (with SHA1, but sufficiently for this).
, retrievalSecurityPolicy = RetrievalAllKeysSecure
, removeKey = removeKeyDummy
, lockContent = Nothing
, checkPresent = checkPresentDummy
, checkPresentCheap = bupLocal buprepo
, exportActions = exportUnsupported
2019-02-20 19:55:01 +00:00
, importActions = importUnsupported
, whereisKey = Nothing
, remoteFsck = Nothing
, repairRepo = Nothing
, config = c
, getRepo = return r
, gitconfig = gc
, localpath = if bupLocal buprepo && not (null buprepo)
then Just buprepo
else Nothing
, remotetype = remote
, availability = if bupLocal buprepo then LocallyAvailable else GloballyAvailable
, readonly = False
, appendonly = False
, mkUnavailable = return Nothing
, getInfo = return [("repo", buprepo)]
2014-12-08 17:40:15 +00:00
, claimUrl = Nothing
, checkUrl = Nothing
, remoteStateHandle = rs
}
let specialcfg = (specialRemoteCfg c)
-- chunking would not improve bup
{ chunkConfig = NoChunks
}
return $ Just $ specialRemote' specialcfg c
(store this buprepo)
(retrieve buprepo)
(remove buprepo)
(checkKey r bupr')
this
where
buprepo = fromMaybe (giveup "missing buprepo") $ remoteAnnexBupRepo gc
bupSetup :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
bupSetup _ mu _ c gc = do
u <- maybe (liftIO genUUID) return mu
-- verify configuration is sane
let buprepo = maybe (giveup "Specify buprepo=") fromProposedAccepted $
M.lookup buprepoField c
(c', _encsetup) <- encryptionSetup c gc
-- bup init will create the repository.
-- (If the repository already exists, bup init again appears safe.)
showAction "bup init"
unlessM (bup "init" buprepo []) $ giveup "bup init failed"
storeBupUUID u buprepo
2011-04-09 16:34:49 +00:00
-- The buprepo is stored in git config, as well as this repo's
-- persistant state, so it can vary between hosts.
gitConfigSpecialRemote u c' [("buprepo", buprepo)]
return (c', u)
bupParams :: String -> BupRepo -> [CommandParam] -> [CommandParam]
bupParams command buprepo params =
2011-07-15 16:47:14 +00:00
Param command : [Param "-r", Param buprepo] ++ params
bup :: String -> BupRepo -> [CommandParam] -> Annex Bool
bup command buprepo params = do
showOutput -- make way for bup output
liftIO $ boolSystem "bup" $ bupParams command buprepo params
bupSplitParams :: Remote -> BupRepo -> Key -> [CommandParam] -> [CommandParam]
bupSplitParams r buprepo k src =
let os = map Param $ remoteAnnexBupSplitOptions $ gitconfig r
in bupParams "split" buprepo
(os ++ [Param "-q", Param "-n", Param (bupRef k)] ++ src)
store :: Remote -> BupRepo -> Storer
store r buprepo = byteStorer $ \k b p -> do
showOutput -- make way for bup output
quiet <- commandProgressDisabled
liftIO $ withNullHandle $ \nullh ->
let params = bupSplitParams r buprepo k []
cmd = (proc "bup" (toCommand params))
{ std_in = CreatePipe }
cmd' = if quiet
then cmd
{ std_out = UseHandle nullh
, std_err = UseHandle nullh
}
else cmd
feeder = \h -> do
meteredWrite p h b
hClose h
in withCreateProcess cmd' (go feeder cmd')
where
go feeder p (Just h) _ _ pid =
forceSuccessProcess p pid
`after`
2020-06-05 20:45:12 +00:00
feeder h
go _ _ _ _ _ _ = error "internal"
retrieve :: BupRepo -> Retriever
retrieve buprepo = byteRetriever $ \k sink -> do
let params = bupParams "join" buprepo [Param $ bupRef k]
let p = (proc "bup" (toCommand params))
{ std_out = CreatePipe }
bracketIO (createProcess p) cleanupProcess (go sink p)
where
go sink p (_, Just h, _, pid) = do
() <- sink =<< liftIO (L.hGetContents h)
liftIO $ do
hClose h
forceSuccessProcess p pid
go _ _ _ = error "internal"
{- Cannot revert having stored a key in bup, but at least the data for the
- key will be used for deltaing data of other keys stored later.
-
- We can, however, remove the git branch that bup created for the key.
-}
remove :: BupRepo -> Remover
remove buprepo k = do
go =<< liftIO (bup2GitRemote buprepo)
warning "content cannot be completely removed from bup remote"
where
go r
| Git.repoIsUrl r = void $ onBupRemote r boolSystem "git" params
| otherwise = void $ liftIO $ catchMaybeIO $ do
r' <- Git.Config.read r
boolSystem "git" $ Git.Command.gitCommandLine params r'
params = [ Param "branch", Param "-q", Param "-D", Param (bupRef k) ]
{- Bup does not provide a way to tell if a given dataset is present
- in a bup repository. One way it to check if the git repository has
- a branch matching the name (as created by bup split -n).
-}
checkKey :: Git.Repo -> Git.Repo -> CheckPresent
checkKey r bupr k
| Git.repoIsUrl bupr = do
showChecking r
onBupRemote bupr boolSystem "git" params
| otherwise = liftIO $ boolSystem "git" $
Git.Command.gitCommandLine params bupr
2012-11-11 04:51:07 +00:00
where
params =
[ Param "show-ref"
, Param "--quiet"
, Param "--verify"
2012-11-11 04:51:07 +00:00
, Param $ "refs/heads/" ++ bupRef k
]
2011-04-09 16:34:49 +00:00
{- Store UUID in the annex.uuid setting of the bup repository. -}
storeBupUUID :: UUID -> BupRepo -> Annex ()
storeBupUUID u buprepo = do
r <- liftIO $ bup2GitRemote buprepo
2011-04-09 16:34:49 +00:00
if Git.repoIsUrl r
then do
showAction "storing uuid"
unlessM (onBupRemote r boolSystem "git"
2019-12-02 16:26:33 +00:00
[Param "config", Param (fromConfigKey configkeyUUID), Param v]) $
giveup "ssh failed"
2011-04-09 16:34:49 +00:00
else liftIO $ do
r' <- Git.Config.read r
let noolduuid = case Git.Config.get configkeyUUID mempty r' of
ConfigValue olduuid -> S.null olduuid
NoConfigValue -> True
when noolduuid $
Git.Command.run
[ Param "config"
, Param "annex.uuid"
, Param v
] r'
2012-11-11 04:51:07 +00:00
where
v = fromUUID u
2011-04-09 16:34:49 +00:00
onBupRemote :: Git.Repo -> (FilePath -> [CommandParam] -> IO a) -> FilePath -> [CommandParam] -> Annex a
onBupRemote r runner command params = do
2014-05-16 20:08:20 +00:00
c <- Annex.getRemoteGitConfig r
let remotecmd = "cd " ++ dir ++ " && " ++ unwords (command : toCommand params)
(sshcmd, sshparams) <- Ssh.toRepo NoConsumeStdin r c remotecmd
liftIO $ runner sshcmd sshparams
where
path = fromRawFilePath $ Git.repoPath r
base = fromMaybe path (stripPrefix "/~/" path)
dir = shellEscape base
{- Allow for bup repositories on removable media by checking
- local bup repositories to see if they are available, and getting their
- uuid (which may be different from the stored uuid for the bup remote).
-
2011-12-10 22:51:01 +00:00
- If a bup repository is not available, returns NoUUID.
- This will cause checkPresent to indicate nothing from the bup remote
- is known to be present.
-
- Also, returns a version of the repo with config read, if it is local.
-}
getBupUUID :: Git.Repo -> UUID -> Annex (UUID, Git.Repo)
getBupUUID r u
| Git.repoIsUrl r = return (u, r)
| otherwise = liftIO $ do
ret <- tryIO $ Git.Config.read r
case ret of
2019-12-02 16:26:33 +00:00
Right r' -> return (toUUID $ Git.Config.get configkeyUUID mempty r', r')
Left _ -> return (NoUUID, r)
2011-04-09 16:34:49 +00:00
{- Converts a bup remote path spec into a Git.Repo. There are some
- differences in path representation between git and bup. -}
bup2GitRemote :: BupRepo -> IO Git.Repo
2011-04-09 16:34:49 +00:00
bup2GitRemote "" = do
-- bup -r "" operates on ~/.bup
h <- myHomeDir
Git.Construct.fromAbsPath $ toRawFilePath $ h </> ".bup"
2011-04-09 16:34:49 +00:00
bup2GitRemote r
| bupLocal r =
if "/" `isPrefixOf` r
then Git.Construct.fromAbsPath (toRawFilePath r)
else giveup "please specify an absolute path"
| otherwise = Git.Construct.fromUrl $ "ssh://" ++ host ++ slash dir
2012-11-11 04:51:07 +00:00
where
bits = splitc ':' r
2012-11-11 04:51:07 +00:00
host = Prelude.head bits
dir = intercalate ":" $ drop 1 bits
2012-11-11 04:51:07 +00:00
-- "host:~user/dir" is not supported specially by bup;
-- "host:dir" is relative to the home directory;
-- "host:" goes in ~/.bup
slash d
| null d = "/~/.bup"
| "/" `isPrefixOf` d = d
| otherwise = "/~/" ++ d
2011-04-09 16:34:49 +00:00
{- Converts a key into a git ref name, which bup-split -n will use to point
- to it. -}
bupRef :: Key -> String
bupRef k
| Git.Ref.legal True shown = shown
| otherwise = "git-annex-" ++ show (sha2_256 (fromString shown))
2012-11-11 04:51:07 +00:00
where
shown = serializeKey k
bupLocal :: BupRepo -> Bool
2011-04-09 16:34:49 +00:00
bupLocal = notElem ':'