diff --git a/Remote/Ddar.hs b/Remote/Ddar.hs new file mode 100644 index 0000000000..01d4833094 --- /dev/null +++ b/Remote/Ddar.hs @@ -0,0 +1,229 @@ +{- Using ddar as a remote. Based on bup and rsync remotes. + - + - Copyright 2011 Joey Hess + - Copyright 2014 Robie Basak + - + - Licensed under the GNU GPL version 3 or higher. + -} + +module Remote.Ddar (remote) where + +import Control.Exception +import qualified Data.ByteString.Lazy as L +import qualified Data.Map as M +import System.IO.Error +import System.Process + +import Data.String.Utils +import Common.Annex +import Types.Remote +import Types.Key +import Types.Creds +import qualified Git +import Config +import Config.Cost +import Remote.Helper.Special +import Remote.Helper.Encryptable +import Crypto +import Annex.Content +import Annex.Ssh +import Annex.UUID +import Utility.Metered + +type DdarRepo = String + +remote :: RemoteType +remote = RemoteType { + typename = "ddar", + enumerate = findSpecialRemotes "ddarrepo", + generate = gen, + setup = ddarSetup +} + +gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remote) +gen r u c gc = do + cst <- remoteCost gc $ + if ddarLocal ddarrepo + then nearlyCheapRemoteCost + else expensiveRemoteCost + + let new = Remote + { uuid = u + , cost = cst + , name = Git.repoDescribe r + , storeKey = store ddarrepo + , retrieveKeyFile = retrieve ddarrepo + , retrieveKeyFileCheap = retrieveCheap + , removeKey = remove ddarrepo + , hasKey = checkPresent ddarrepo + , hasKeyCheap = ddarLocal ddarrepo + , whereisKey = Nothing + , remoteFsck = Nothing + , repairRepo = Nothing + , config = c + , repo = r + , gitconfig = gc + , localpath = if ddarLocal ddarrepo && not (null ddarrepo) + then Just ddarrepo + else Nothing + , remotetype = remote + , availability = if ddarLocal ddarrepo then LocallyAvailable else GloballyAvailable + , readonly = False + } + return $ Just $ encryptableRemote c + (storeEncrypted new ddarrepo) + (retrieveEncrypted ddarrepo) + new + where + ddarrepo = fromMaybe (error "missing ddarrepo") $ remoteAnnexDdarRepo gc + +ddarSetup :: Maybe UUID -> Maybe CredPair -> RemoteConfig -> Annex (RemoteConfig, UUID) +ddarSetup mu _ c = do + u <- maybe (liftIO genUUID) return mu + + -- verify configuration is sane + let ddarrepo = fromMaybe (error "Specify ddarrepo=") $ + M.lookup "ddarrepo" c + c' <- encryptionSetup c + + -- The ddarrepo is stored in git config, as well as this repo's + -- persistant state, so it can vary between hosts. + gitConfigSpecialRemote u c' "ddarrepo" ddarrepo + + return (c', u) + +pipeDdar :: [CommandParam] -> Maybe Handle -> Maybe Handle -> IO Bool +pipeDdar params inh outh = do + p <- runProcess "ddar" (toCommand params) + Nothing Nothing inh outh Nothing + ok <- waitForProcess p + case ok of + ExitSuccess -> return True + _ -> return False + +store :: DdarRepo -> Key -> AssociatedFile -> MeterUpdate -> Annex Bool +store ddarrepo k _f _p = sendAnnex k (void $ remove ddarrepo k) $ \src -> do + let params = + [ Param "c" + , Param "-N" + , Param $ key2file k + , Param ddarrepo + , File src + ] + liftIO $ boolSystem "ddar" params + +storeEncrypted :: Remote -> DdarRepo -> (Cipher, Key) -> Key -> MeterUpdate -> Annex Bool +storeEncrypted r ddarrepo (cipher, enck) k _p = + sendAnnex k (void $ remove ddarrepo k) $ \src -> + liftIO $ catchBoolIO $ + encrypt (getGpgEncParams r) cipher (feedFile src) $ \h -> + pipeDdar params (Just h) Nothing + where + params = + [ Param "c" + , Param "-N" + , Param $ key2file enck + , Param ddarrepo + , Param "-" + ] + +{- Convert remote DdarRepo to host and path on remote end -} +splitRemoteDdarRepo :: DdarRepo -> (String, String) +splitRemoteDdarRepo ddarrepo = + (host, ddarrepo') + where + (host, remainder) = span (/= ':') ddarrepo + ddarrepo' = drop 1 remainder + +{- Return the command and parameters to use for a ddar call that may need to be + - made on a remote repository. This will call ssh if needed. -} + +ddarRemoteCall :: DdarRepo -> Char -> [CommandParam] -> Annex (String, [CommandParam]) +ddarRemoteCall ddarrepo cmd params + | ddarLocal ddarrepo = return ("ddar", localParams) + | otherwise = do + remoteCachingParams <- sshCachingOptions (host, Nothing) [] + return ("ssh", remoteCachingParams ++ remoteParams) + where + (host, ddarrepo') = splitRemoteDdarRepo ddarrepo + localParams = Param [cmd] : Param ddarrepo : params + remoteParams = Param host : Param "ddar" : Param [cmd] : Param ddarrepo' : params + +{- Specialized ddarRemoteCall that includes extraction command and flags -} + +ddarExtractRemoteCall :: DdarRepo -> Key -> Annex (String, [CommandParam]) +ddarExtractRemoteCall ddarrepo k = + ddarRemoteCall ddarrepo 'x' [Param "--force-stdout", Param $ key2file k] + +retrieve :: DdarRepo -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex Bool +retrieve ddarrepo k _f d _p = do + (cmd, params) <- ddarExtractRemoteCall ddarrepo k + liftIO $ catchBoolIO $ withFile d WriteMode $ \h -> do + let p = (proc cmd $ toCommand params){ std_out = UseHandle h } + (_, _, _, pid) <- Common.Annex.createProcess p + forceSuccessProcess p pid + return True + +retrieveCheap :: Key -> FilePath -> Annex Bool +retrieveCheap _ _ = return False + +retrieveEncrypted :: DdarRepo -> (Cipher, Key) -> Key -> FilePath -> MeterUpdate -> Annex Bool +retrieveEncrypted ddarrepo (cipher, enck) _ f _p = do + (cmd, params) <- ddarExtractRemoteCall ddarrepo enck + let p = proc cmd $ toCommand params + liftIO $ catchBoolIO $ withHandle StdoutHandle createProcessSuccess p $ \h -> do + decrypt cipher (\toh -> L.hPut toh =<< L.hGetContents h) $ + readBytes $ L.writeFile f + return True + +remove :: DdarRepo -> Key -> Annex Bool +remove ddarrepo key = do + (cmd, params) <- ddarRemoteCall ddarrepo 'd' [Param $ key2file key] + liftIO $ boolSystem cmd params + +ddarDirectoryExists :: DdarRepo -> Annex (Either String Bool) +ddarDirectoryExists ddarrepo + | ddarLocal ddarrepo = do + maybeStatus <- liftIO $ tryJust (guard . isDoesNotExistError) $ getFileStatus ddarrepo + return $ case maybeStatus of + Left _ -> Right False + Right status -> Right $ isDirectory status + | otherwise = do + sshCachingParams <- sshCachingOptions (host, Nothing) [] + exitCode <- liftIO $ safeSystem "ssh" $ sshCachingParams ++ params + case exitCode of + ExitSuccess -> return $ Right True + ExitFailure 1 -> return $ Right False + ExitFailure code -> return $ Left $ "ssh call " ++ + show (Data.String.Utils.join " " $ toCommand params) ++ + " failed with status " ++ show code + where + (host, ddarrepo') = splitRemoteDdarRepo ddarrepo + params = + [ Param host + , Param "test" + , Param "-d" + , Param ddarrepo' + ] + +{- Use "ddar t" to determine if a given key is present in a ddar archive -} +inDdarManifest :: DdarRepo -> Key -> Annex (Either String Bool) +inDdarManifest ddarrepo k = do + (cmd, params) <- ddarRemoteCall ddarrepo 't' [] + let p = proc cmd $ toCommand params + liftIO $ catchMsgIO $ withHandle StdoutHandle createProcessSuccess p $ \h -> do + contents <- hGetContents h + return $ elem k' $ lines contents + where + k' = key2file k + +checkPresent :: DdarRepo -> Key -> Annex (Either String Bool) +checkPresent ddarrepo key = do + directoryExists <- ddarDirectoryExists ddarrepo + case directoryExists of + Left e -> return $ Left e + Right True -> inDdarManifest ddarrepo key + Right False -> return $ Right False + +ddarLocal :: DdarRepo -> Bool +ddarLocal = notElem ':' diff --git a/Remote/List.hs b/Remote/List.hs index e3afc939ca..221ab9a544 100644 --- a/Remote/List.hs +++ b/Remote/List.hs @@ -38,6 +38,7 @@ import qualified Remote.WebDAV import qualified Remote.Tahoe #endif import qualified Remote.Glacier +import qualified Remote.Ddar import qualified Remote.Hook import qualified Remote.External @@ -59,6 +60,7 @@ remoteTypes = , Remote.Tahoe.remote #endif , Remote.Glacier.remote + , Remote.Ddar.remote , Remote.Hook.remote , Remote.External.remote ] diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs index b49f3d762d..71f06ff45e 100644 --- a/Types/GitConfig.hs +++ b/Types/GitConfig.hs @@ -131,6 +131,7 @@ data RemoteGitConfig = RemoteGitConfig , remoteAnnexBupSplitOptions :: [String] , remoteAnnexDirectory :: Maybe FilePath , remoteAnnexGCrypt :: Maybe String + , remoteAnnexDdarRepo :: Maybe String , remoteAnnexHookType :: Maybe String , remoteAnnexExternalType :: Maybe String {- A regular git remote's git repository config. -} @@ -162,6 +163,7 @@ extractRemoteGitConfig r remotename = RemoteGitConfig , remoteAnnexBupSplitOptions = getoptions "bup-split-options" , remoteAnnexDirectory = notempty $ getmaybe "directory" , remoteAnnexGCrypt = notempty $ getmaybe "gcrypt" + , remoteAnnexDdarRepo = getmaybe "ddarrepo" , remoteAnnexHookType = notempty $ getmaybe "hooktype" , remoteAnnexExternalType = notempty $ getmaybe "externaltype" , remoteGitConfig = Nothing diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index d5408a2ae9..bb741810ce 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -1647,6 +1647,12 @@ Here are all the supported configuration settings. the location of the bup repository to use. Normally this is automatically set up by `git annex initremote`, but you can change it if needed. +* `remote..ddarrepo` + + Used by ddar special remotes, this configures + the location of the ddar repository to use. Normally this is automatically + set up by `git annex initremote`, but you can change it if needed. + * `remote..directory` Used by directory special remotes, this configures diff --git a/doc/special_remotes.mdwn b/doc/special_remotes.mdwn index e3004d2cf1..cc52dfc553 100644 --- a/doc/special_remotes.mdwn +++ b/doc/special_remotes.mdwn @@ -10,6 +10,7 @@ They cannot be used by other git commands though. * [[S3]] (Amazon S3, and other compatible services) * [[Amazon_Glacier|glacier]] * [[bup]] +* [[ddar]] * [[gcrypt]] (encrypted git repositories!) * [[directory]] * [[rsync]] diff --git a/doc/special_remotes/ddar.mdwn b/doc/special_remotes/ddar.mdwn new file mode 100644 index 0000000000..8de86e2c79 --- /dev/null +++ b/doc/special_remotes/ddar.mdwn @@ -0,0 +1,40 @@ +This special remote type stores file contents in a +[ddar](https://github.com/basak/ddar) repository. This provides easy +de-duplication when you use git-annex to manage many files that are similar. + +Unlike bup, ddar uses its own storage format, which allows for both creation +and deletion of de-deduplicated files. In addition to using local storage, ddar +archives can be remote, providing that ddar is installed on the remote machine +and ssh is available to it. + +See [[walkthrough/using_ddar]] for usage examples. + +## encryption + +Encryption is nominally supported, but is not useful. Since effective +encryption necessarily obfuscates file contents, similar areas across different +files are no longer visible to ddar and cannot be de-duplicated. + +## compression + +The same caveat with encryption also generally applies to compression, since +file compression changes file contents such that similar regions across files +no longer appear similar. An exception is `gzip --rsyncable`, which is +specifically designed to work around this issue. This is the only compression +mechanism with which de-duplication remains effective. + +## configuration + +These parameters can be passed to `git annex initremote` to configure ddar: + +* `encryption` - One of "none", "hybrid", "shared", or "pubkey". + See [[encryption]]. However, note that encryption renders all de-duplication +ineffective. + +* `keyid` - Specifies the gpg key to use for [[encryption]]. + +* `ddarrepo` - Required. This is passed to `ddar` as the path to the ddar + archive to use. If it doesn't exist, the ddar repository will be created + automatically when a file is first copied to it. To use a remote ddar + repository, use a colon (`:`) to separate the hostname from the remote path. + Example: "ddarrepo=example.com:/big/myddar" or "ddarrepo=/big/myddar" diff --git a/doc/walkthrough/using_ddar.mdwn b/doc/walkthrough/using_ddar.mdwn new file mode 100644 index 0000000000..1d7cdd7497 --- /dev/null +++ b/doc/walkthrough/using_ddar.mdwn @@ -0,0 +1,32 @@ +Another [[special_remote|special_remotes]] that git-annex can use is +a [[special_remotes/ddar]] repository. ddar stores large file contents +in a directory structure of its own, with deduplication. For remote +repositories, ddar requires that ssh is available on the remote, with ddar also +installed remotely. When copying files to the remote, ddar only needs to send +over the network the parts of the files that are not already present remotely. + +Unlike bup, ddar uses its own storage format, which allows for both creation +and deletion of de-deduplicated files. + +Here's how to create a ddar remote, and describe it. + +[[!template id=note text=""" +Instead of specifying a remote system, you could choose to make a bup +remote that is only accessible on the current system, by passing +"ddarrepo=/big/myddar". +"""]] + + # git annex initremote myddar type=ddar encryption=none ddarrepo=example.com:/big/myddar + initremote ddar (bup init) + Initialized empty Git repository in /big/myddar/ + ok + # git annex describe myddar "my bup repository at example.com" + describe myddar ok + +Now the remote can be used like any other remote. + + # git annex move my_cool_big_file --to myddar + move my_cool_big_file (to myddar...) + ok + +See [[special_remotes/bup]] for details.