diff --git a/Backend/Hash.hs b/Backend/Hash.hs index 08bc6b1a3d..6cac6e3718 100644 --- a/Backend/Hash.hs +++ b/Backend/Hash.hs @@ -10,6 +10,7 @@ module Backend.Hash ( backends, testKeyBackend, + keyHash, ) where import Annex.Common diff --git a/CHANGELOG b/CHANGELOG index ae4c2e96ad..2aa6630d00 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,8 +1,14 @@ git-annex (7.20190731) UNRELEASED; urgency=medium + * New git-lfs special remote, which can be used to store data on any git-lfs + server, including github, gitlab, and gogs. + * Support fully encrypting all data sent to a git-lfs special remote, + using a combination of gcrypt to encrypt the git data, and git-annex's + encryption of its data. * Use the same optimisation for --in=here as has always been used for --in=. rather than the slow code path that unncessarily queries the git-annex branch. + * Allow setting up a gcrypt special remote with encryption=shared. -- Joey Hess Thu, 01 Aug 2019 00:11:56 -0400 diff --git a/COPYRIGHT b/COPYRIGHT index fd94655fbf..7fe4c9d946 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -25,6 +25,10 @@ Copyright: 2018 Joey Hess 2013 Michael Snoyman License: Expat +Files: Utility/GitLFS.hs +Copyright: © 2019 Joey Hess +License: AGPL-3+ + Files: Utility/* Copyright: 2012-2019 Joey Hess License: BSD-2-clause diff --git a/Git/Url.hs b/Git/Url.hs index f9cc575a63..8430655758 100644 --- a/Git/Url.hs +++ b/Git/Url.hs @@ -11,9 +11,10 @@ module Git.Url ( port, hostuser, authority, + path, ) where -import Network.URI hiding (scheme, authority) +import Network.URI hiding (scheme, authority, path) import Common import Git.Types @@ -66,6 +67,11 @@ authpart :: (URIAuth -> a) -> Repo -> Maybe a authpart a Repo { location = Url u } = a <$> uriAuthority u authpart _ repo = notUrl repo +{- Path part of an URL repo. 
-} +path :: Repo -> FilePath +path Repo { location = Url u } = uriPath u +path repo = notUrl repo + notUrl :: Repo -> a notUrl repo = error $ "acting on local git repo " ++ repoDescribe repo ++ " not supported" diff --git a/Remote/GCrypt.hs b/Remote/GCrypt.hs index 22a88cd6b9..931a1491f3 100644 --- a/Remote/GCrypt.hs +++ b/Remote/GCrypt.hs @@ -12,6 +12,7 @@ module Remote.GCrypt ( coreGCryptId, setupRepo, accessShellConfig, + setGcryptEncryption, ) where import qualified Data.Map as M @@ -318,16 +319,18 @@ shellOrRsync r ashell arsync setGcryptEncryption :: RemoteConfig -> String -> Annex () setGcryptEncryption c remotename = do let participants = remoteconfig Git.GCrypt.remoteParticipantConfigKey - case cipherKeyIds =<< extractCipher c of + case extractCipher c of Nothing -> noCrypto - Just (KeyIds { keyIds = ks}) -> do - setConfig participants (unwords ks) - let signingkey = ConfigKey $ Git.GCrypt.remoteSigningKey remotename - cmd <- gpgCmd <$> Annex.getGitConfig - skeys <- M.keys <$> liftIO (secretKeys cmd) - case filter (`elem` ks) skeys of - [] -> noop - (k:_) -> setConfig signingkey k + Just cip -> case cipherKeyIds cip of + Nothing -> noop + Just (KeyIds { keyIds = ks}) -> do + setConfig participants (unwords ks) + let signingkey = ConfigKey $ Git.GCrypt.remoteSigningKey remotename + cmd <- gpgCmd <$> Annex.getGitConfig + skeys <- M.keys <$> liftIO (secretKeys cmd) + case filter (`elem` ks) skeys of + [] -> noop + (k:_) -> setConfig signingkey k setConfig (remoteconfig Git.GCrypt.remotePublishParticipantConfigKey) (Git.Config.boolConfig True) where diff --git a/Remote/Git.hs b/Remote/Git.hs index 61b58a4890..e7ed224047 100644 --- a/Remote/Git.hs +++ b/Remote/Git.hs @@ -51,6 +51,7 @@ import Remote.Helper.Messages import Remote.Helper.ExportImport import qualified Remote.Helper.Ssh as Ssh import qualified Remote.GCrypt +import qualified Remote.GitLFS import qualified Remote.P2P import qualified Remote.Helper.P2P as P2PHelper import P2P.Address @@ -143,6 +144,9 @@ configRead autoinit r = do gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remote) gen r u c gc + -- Remote.GitLFS may be used with a repo that is also encrypted + -- with gcrypt so is checked first. + | remoteAnnexGitLFS gc = Remote.GitLFS.gen r u c gc | Git.GCrypt.isEncrypted r = Remote.GCrypt.chainGen r u c gc | otherwise = case repoP2PAddress r of Nothing -> do diff --git a/Remote/GitLFS.hs b/Remote/GitLFS.hs new file mode 100644 index 0000000000..4765d2fddb --- /dev/null +++ b/Remote/GitLFS.hs @@ -0,0 +1,449 @@ +{- Using git-lfs as a remote. + - + - Copyright 2019 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. 
+ -} + +module Remote.GitLFS (remote, gen) where + +import Annex.Common +import Types.Remote +import Annex.Url +import Types.Key +import Types.Creds +import qualified Annex +import qualified Git +import qualified Git.Types as Git +import qualified Git.Url +import qualified Git.GCrypt +import Config +import Config.Cost +import Remote.Helper.Special +import Remote.Helper.ExportImport +import Remote.Helper.Git +import Remote.Helper.Http +import qualified Remote.GCrypt +import Annex.Ssh +import Annex.UUID +import Crypto +import Backend.Hash +import Utility.Hash +import Utility.SshHost +import Logs.RemoteState +import qualified Utility.GitLFS as LFS + +import Control.Concurrent.STM +import Data.String +import Network.HTTP.Types +import Network.HTTP.Client hiding (port) +import System.Log.Logger +import qualified Data.Map as M +import qualified Data.ByteString.Lazy as L +import qualified Data.Text as T +import qualified Data.Text.Encoding as E + +remote :: RemoteType +remote = RemoteType + { typename = "git-lfs" + -- Remote.Git takes care of enumerating git-lfs remotes too, + -- and will call our gen on them. + , enumerate = const (return []) + , generate = gen + , setup = mySetup + , exportSupported = exportUnsupported + , importSupported = importUnsupported + } + +gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remote) +gen r u c gc = do + -- If the repo uses gcrypt, get the underlaying repo without the + -- gcrypt url, to do LFS endpoint discovery on. + r' <- if Git.GCrypt.isEncrypted r + then do + g <- Annex.gitRepo + liftIO $ Git.GCrypt.encryptedRemote g r + else pure r + h <- liftIO $ newTVarIO $ LFSHandle Nothing Nothing r' gc + cst <- remoteCost gc expensiveRemoteCost + return $ Just $ specialRemote' specialcfg c + (simplyPrepare $ store u h) + (simplyPrepare $ retrieve u h) + (simplyPrepare $ remove h) + (simplyPrepare $ checkKey u h) + (this cst) + where + this cst = Remote + { uuid = u + , cost = cst + , name = Git.repoDescribe r + , storeKey = storeKeyDummy + , retrieveKeyFile = retreiveKeyFileDummy + , retrieveKeyFileCheap = retrieveCheap + -- content stored on git-lfs is hashed with SHA256 + -- no matter what git-annex key it's for, and the hash + -- is checked on download + , retrievalSecurityPolicy = RetrievalAllKeysSecure + , removeKey = removeKeyDummy + , lockContent = Nothing + , checkPresent = checkPresentDummy + , checkPresentCheap = False + , exportActions = exportUnsupported + , importActions = importUnsupported + , whereisKey = Nothing + , remoteFsck = Nothing + , repairRepo = Nothing + , config = c + , getRepo = return r + , gitconfig = gc + , localpath = Nothing + , remotetype = remote + , availability = GloballyAvailable + , readonly = False + -- content cannot be removed from a git-lfs repo + , appendonly = True + , mkUnavailable = return Nothing + , getInfo = gitRepoInfo (this cst) + , claimUrl = Nothing + , checkUrl = Nothing + } + specialcfg = (specialRemoteCfg c) + -- chunking would not improve git-lfs + { chunkConfig = NoChunks + } + +mySetup :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID) +mySetup _ mu _ c gc = do + u <- maybe (liftIO genUUID) return mu + + (c', _encsetup) <- encryptionSetup c gc + case (isEncrypted c', Git.GCrypt.urlPrefix `isPrefixOf` url) of + (False, False) -> noop + (True, True) -> Remote.GCrypt.setGcryptEncryption c' remotename + (True, False) -> unlessM (Annex.getState Annex.force) $ + giveup $ unwords $ + [ "Encryption is enabled for this remote," + 
, "but only the files that git-annex stores on" + , "it would be encrypted; " + , "anything that git push sends to it would" + , "not be encrypted. Recommend prefixing the" + , "url with \"gcrypt::\" to also encrypt" + , "git pushes." + , "(Use --force if you want to use this" + , "likely insecure configuration.)" + ] + (False, True) -> unlessM (Annex.getState Annex.force) $ + giveup $ unwords $ + [ "You used a \"gcrypt::\" url for this remote," + , "but encryption=none prevents git-annex" + , "from encrypting files it stores there." + , "(Use --force if you want to use this" + , "likely insecure configuration.)" + ] + + -- The url is not stored in the remote log, because the same + -- git-lfs repo can be accessed using different urls by different + -- people (eg over ssh or http). + -- + -- Instead, set up remote.name.url to point to the repo, + -- (so it's also usable by git as a non-special remote), + -- and set remote.name.git-lfs = true + let c'' = M.delete "url" c' + gitConfigSpecialRemote u c'' [("git-lfs", "true")] + setConfig (ConfigKey ("remote." ++ getRemoteName c ++ ".url")) url + return (c'', u) + where + url = fromMaybe (giveup "Specify url=") (M.lookup "url" c) + remotename = fromJust (M.lookup "name" c) + +data LFSHandle = LFSHandle + { downloadEndpoint :: Maybe LFS.Endpoint + , uploadEndpoint :: Maybe LFS.Endpoint + , remoteRepo :: Git.Repo + , remoteGitConfig :: RemoteGitConfig + } + +discoverLFSEndpoint :: LFS.TransferRequestOperation -> LFSHandle -> Annex (Maybe LFS.Endpoint) +discoverLFSEndpoint tro h + | Git.repoIsSsh r = gossh + | Git.repoIsHttp r = gohttp + | otherwise = do + warning "git-lfs endpoint has unsupported URI scheme" + return Nothing + where + r = remoteRepo h + lfsrepouri = case Git.location r of + Git.Url u -> u + _ -> giveup $ "unsupported git-lfs remote location " ++ Git.repoLocation r + gohttp = case tro of + LFS.RequestDownload -> return $ LFS.guessEndpoint lfsrepouri + LFS.RequestUpload -> do + -- git-lfs does support storing over http, + -- but it would need prompting for http basic + -- authentication each time git-annex discovered + -- the endpoint. + warning "Storing content in git-lfs currently needs a ssh repository url, not http." + return Nothing + gossh = case mkSshHost <$> Git.Url.hostuser r of + Nothing -> do + warning "Unable to parse ssh url for git-lfs remote." + return Nothing + Just (Left err) -> do + warning err + return Nothing + Just (Right hostuser) -> do + let port = Git.Url.port r + -- Remove leading /~/ from path. That is added when + -- converting a scp-style repository location with + -- a relative path into an url, and is legal + -- according to git-clone(1), but github does not + -- support it. + let remotepath = if "/~/" `isPrefixOf` Git.Url.path r + then drop 3 (Git.Url.path r) + else Git.Url.path r + let ps = LFS.sshDiscoverEndpointCommand remotepath tro + -- Note that no shellEscape is done here, because + -- at least github's git-lfs implementation does + -- not allow for shell quoting. 
+ let remotecmd = unwords ps + (sshcommand, sshparams) <- sshCommand NoConsumeStdin (hostuser, port) (remoteGitConfig h) remotecmd + liftIO (tryIO (readProcess sshcommand (toCommand sshparams))) >>= \case + Left err -> do + warning $ "ssh connection to git-lfs remote failed: " ++ show err + return Nothing + Right resp -> case LFS.parseSshDiscoverEndpointResponse (fromString resp) of + Nothing -> do + warning $ "unexpected response from git-lfs remote when doing ssh endpoint discovery" + return Nothing + Just endpoint -> return (Just endpoint) + +-- The endpoint is cached for later use. +getLFSEndpoint :: LFS.TransferRequestOperation -> TVar LFSHandle -> Annex (Maybe LFS.Endpoint) +getLFSEndpoint tro hv = do + h <- liftIO $ atomically $ readTVar hv + case f h of + Just endpoint -> return (Just endpoint) + Nothing -> discoverLFSEndpoint tro h >>= \case + Just endpoint -> do + liftIO $ atomically $ writeTVar hv $ + case tro of + LFS.RequestDownload -> + h { downloadEndpoint = Just endpoint } + LFS.RequestUpload -> + h { uploadEndpoint = Just endpoint } + return (Just endpoint) + Nothing -> return Nothing + where + f = case tro of + LFS.RequestDownload -> downloadEndpoint + LFS.RequestUpload -> uploadEndpoint + +-- Make an API request that is expected to have a small response body. +-- Not for use in downloading an object. +makeSmallAPIRequest :: Request -> Annex (Response L.ByteString) +makeSmallAPIRequest req = do + uo <- getUrlOptions + let req' = applyRequest uo req + liftIO $ debugM "git-lfs" (show req') + resp <- liftIO $ httpLbs req' (httpManager uo) + -- Only debug the http status code, not the json + -- which may include an authentication token. + liftIO $ debugM "git-lfs" (show $ responseStatus resp) + return resp + +sendTransferRequest + :: LFS.IsTransferResponseOperation op + => LFS.TransferRequest + -> LFS.Endpoint + -> Annex (Either String (LFS.TransferResponse op)) +sendTransferRequest req endpoint = + case LFS.startTransferRequest endpoint req of + Just httpreq -> do + httpresp <- makeSmallAPIRequest $ setRequestCheckStatus httpreq + return $ case LFS.parseTransferResponse (responseBody httpresp) of + LFS.ParsedTransferResponse resp -> Right resp + LFS.ParsedTransferResponseError tro -> Left $ + T.unpack $ LFS.resperr_message tro + LFS.ParseFailed err -> Left err + Nothing -> return $ Left "unable to parse git-lfs endpoint url" + +extractKeySha256 :: Key -> Maybe LFS.SHA256 +extractKeySha256 k = case keyVariety k of + SHA2Key (HashSize 256) (HasExt hasext) + | hasext -> eitherToMaybe $ E.decodeUtf8' (keyHash k) + | otherwise -> eitherToMaybe $ E.decodeUtf8' (keyName k) + _ -> Nothing + +-- The size of an encrypted key is the size of the input data, but we need +-- the actual object size. +extractKeySize :: Key -> Maybe Integer +extractKeySize k + | isEncKey k = Nothing + | otherwise = keySize k + +mkUploadRequest :: UUID -> Key -> FilePath -> Annex (LFS.TransferRequest, LFS.SHA256, Integer) +mkUploadRequest u k content = case (extractKeySha256 k, extractKeySize k) of + (Just sha256, Just size) -> + ret sha256 size + (_, Just size) -> do + sha256 <- calcsha256 + remembersha256 sha256 + ret sha256 size + _ -> do + sha256 <- calcsha256 + size <- liftIO $ getFileSize content + rememberboth sha256 size + ret sha256 size + where + calcsha256 = liftIO $ T.pack . show . 
sha2_256 <$> L.readFile content + ret sha256 size = do + let obj = LFS.TransferRequestObject + { LFS.req_oid = sha256 + , LFS.req_size = size + } + let req = LFS.TransferRequest + { LFS.req_operation = LFS.RequestUpload + , LFS.req_transfers = [LFS.Basic] + , LFS.req_ref = Nothing + , LFS.req_objects = [obj] + } + return (req, sha256, size) + + remembersha256 sha256 = setRemoteState u k (T.unpack sha256) + rememberboth sha256 size = setRemoteState u k $ + show size ++ " " ++ T.unpack sha256 + +mkDownloadRequest :: UUID -> Key -> Annex (Maybe (LFS.TransferRequest, LFS.SHA256, Integer)) +mkDownloadRequest u k = case (extractKeySha256 k, extractKeySize k) of + (Just sha256, Just size) -> ret sha256 size + (_, Just size) -> + remembersha256 >>= \case + Just sha256 -> ret sha256 size + Nothing -> return Nothing + _ -> do + rememberboth >>= \case + Just (sha256, size) -> ret sha256 size + Nothing -> return Nothing + where + ret sha256 size = do + let obj = LFS.TransferRequestObject + { LFS.req_oid = sha256 + , LFS.req_size = size + } + let req = LFS.TransferRequest + { LFS.req_operation = LFS.RequestDownload + , LFS.req_transfers = [LFS.Basic] + , LFS.req_ref = Nothing + , LFS.req_objects = [obj] + } + return $ Just (req, sha256, size) + remembersha256 = fmap T.pack <$> getRemoteState u k + rememberboth = maybe Nothing parse <$> getRemoteState u k + where + parse s = case words s of + [ssize, ssha256] -> do + size <- readish ssize + return (T.pack ssha256, size) + _ -> Nothing + +store :: UUID -> TVar LFSHandle -> Storer +store u h = fileStorer $ \k src p -> getLFSEndpoint LFS.RequestUpload h >>= \case + Nothing -> return False + Just endpoint -> flip catchNonAsync failederr $ do + (req, sha256, size) <- mkUploadRequest u k src + sendTransferRequest req endpoint >>= \case + Left err -> do + warning err + return False + Right resp -> do + body <- liftIO $ httpBodyStorer src p + forM_ (LFS.objects resp) $ + send body sha256 size + return True + where + send body sha256 size tro + | LFS.resp_oid tro /= sha256 || LFS.resp_size tro /= size = + giveup "git-lfs server requested other object than the one we asked to send" + | otherwise = case LFS.resp_error tro of + Just err -> giveup $ + T.unpack $ LFS.respobjerr_message err + Nothing -> case LFS.resp_actions tro of + Nothing -> noop + Just op -> case LFS.uploadOperationRequests op body sha256 size of + Nothing -> giveup "unable to parse git-lfs server upload url" + Just [] -> noop -- server already has it + Just reqs -> forM_ reqs $ + makeSmallAPIRequest . 
setRequestCheckStatus + failederr e = do + warning (show e) + return False + +retrieve :: UUID -> TVar LFSHandle -> Retriever +retrieve u h = fileRetriever $ \dest k p -> getLFSEndpoint LFS.RequestDownload h >>= \case + Nothing -> giveup "unable to connect to git-lfs endpoint" + Just endpoint -> mkDownloadRequest u k >>= \case + Nothing -> giveup "unable to download this object from git-lfs" + Just (req, sha256, size) -> sendTransferRequest req endpoint >>= \case + Left err -> giveup (show err) + Right resp -> case LFS.objects resp of + [] -> giveup "git-lfs server did not provide a way to download this object" + (tro:_) + | LFS.resp_oid tro /= sha256 || LFS.resp_size tro /= size -> + giveup "git-lfs server replied with other object than the one we requested" + | otherwise -> go dest p tro + where + go dest p tro = case LFS.resp_error tro of + Just err -> giveup $ T.unpack $ LFS.respobjerr_message err + Nothing -> case LFS.resp_actions tro of + Nothing -> giveup "git-lfs server did not provide a way to download this object" + Just op -> case LFS.downloadOperationRequest op of + Nothing -> giveup "unable to parse git-lfs server download url" + Just req -> do + uo <- getUrlOptions + liftIO $ downloadConduit p req dest uo + +checkKey :: UUID -> TVar LFSHandle -> CheckPresent +checkKey u h key = getLFSEndpoint LFS.RequestDownload h >>= \case + Nothing -> giveup "unable to connect to git-lfs endpoint" + Just endpoint -> mkDownloadRequest u key >>= \case + -- Unable to find enough information to request the key + -- from git-lfs, so it's not present there. + Nothing -> return False + Just (req, sha256, size) -> case LFS.startTransferRequest endpoint req of + Nothing -> giveup "unable to parse git-lfs endpoint url" + Just httpreq -> go sha256 size =<< makeSmallAPIRequest httpreq + where + go sha256 size httpresp + | responseStatus httpresp == status200 = go' sha256 size $ + LFS.parseTransferResponse (responseBody httpresp) + | otherwise = giveup $ + "git-lfs server refused request: " ++ show (responseStatus httpresp) + + go' :: LFS.SHA256 -> Integer -> LFS.ParsedTransferResponse LFS.DownloadOperation -> Annex Bool + go' _ _ (LFS.ParseFailed err) = + giveup $ "unable to parse response from git-lfs server: " ++ err + -- If the server responds with a json error message, + -- the content is presumably not present. + go' _ _ (LFS.ParsedTransferResponseError _) = return False + -- If the server responds with at least one download operation, + -- we will assume the content is present. We could also try to HEAD + -- that download, but there's no guarantee HEAD is supported, and + -- at most that would detect breakage where the server is confused + -- about what objects it has. 
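The presence check and download code above both reduce to one round trip against the LFS batch API: build a TransferRequest, turn it into an http Request with startTransferRequest, and inspect the parsed response. A standalone sketch of that flow, assuming a hypothetical repository url and object id (a real server and a real sha256 are needed for a meaningful answer):

    {-# LANGUAGE OverloadedStrings #-}

    import qualified Utility.GitLFS as LFS
    import Network.HTTP.Client (newManager, httpLbs, responseBody)
    import Network.HTTP.Client.TLS (tlsManagerSettings)
    import qualified Network.URI as URI
    import Data.Maybe (isJust, isNothing)

    main :: IO ()
    main = do
        let Just uri = URI.parseURI "https://example.com/user/repo.git"
            Just endpoint = LFS.guessEndpoint uri
            obj = LFS.TransferRequestObject
                { LFS.req_oid = "1111111111111111111111111111111111111111111111111111111111111111"
                , LFS.req_size = 12345
                }
            treq = LFS.TransferRequest
                { LFS.req_operation = LFS.RequestDownload
                , LFS.req_transfers = [LFS.Basic]
                , LFS.req_ref = Nothing
                , LFS.req_objects = [obj]
                }
            Just httpreq = LFS.startTransferRequest endpoint treq
        mgr <- newManager tlsManagerSettings
        resp <- httpLbs httpreq mgr
        case (LFS.parseTransferResponse (responseBody resp)
                :: LFS.ParsedTransferResponse LFS.DownloadOperation) of
            LFS.ParseFailed err -> putStrLn ("unparseable response: " ++ err)
            LFS.ParsedTransferResponseError e -> print (LFS.resperr_message e)
            LFS.ParsedTransferResponse tr ->
                -- treat an object as present when the server offers a
                -- download action for it and reports no per-object error
                print [ LFS.resp_oid o
                      | o <- LFS.objects tr
                      , isJust (LFS.resp_actions o)
                      , isNothing (LFS.resp_error o)
                      ]
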
+ go' sha256 size (LFS.ParsedTransferResponse resp) = + case LFS.objects resp of + [] -> return False + (tro:_) + | isNothing (LFS.resp_actions tro) -> return False + | isJust (LFS.resp_error tro) -> return False + | LFS.resp_oid tro /= sha256 || LFS.resp_size tro /= size -> + giveup "git-lfs server replied with other object than the one we requested" + | otherwise -> return True + +retrieveCheap :: Key -> AssociatedFile -> FilePath -> Annex Bool +retrieveCheap _ _ _ = return False + +remove :: TVar LFSHandle -> Remover +remove _h _key = do + warning "git-lfs does not support removing content" + return False diff --git a/Remote/List.hs b/Remote/List.hs index b1cd8ff6ac..d4ed4dfe28 100644 --- a/Remote/List.hs +++ b/Remote/List.hs @@ -40,6 +40,7 @@ import qualified Remote.Adb import qualified Remote.Tahoe import qualified Remote.Glacier import qualified Remote.Ddar +import qualified Remote.GitLFS import qualified Remote.Hook import qualified Remote.External @@ -63,6 +64,7 @@ remoteTypes = map adjustExportImportRemoteType , Remote.Tahoe.remote , Remote.Glacier.remote , Remote.Ddar.remote + , Remote.GitLFS.remote , Remote.Hook.remote , Remote.External.remote ] diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs index 0bc72d4023..7976f08e9f 100644 --- a/Types/GitConfig.hs +++ b/Types/GitConfig.hs @@ -263,6 +263,7 @@ data RemoteGitConfig = RemoteGitConfig , remoteAnnexAndroidDirectory :: Maybe FilePath , remoteAnnexAndroidSerial :: Maybe String , remoteAnnexGCrypt :: Maybe String + , remoteAnnexGitLFS :: Bool , remoteAnnexDdarRepo :: Maybe String , remoteAnnexHookType :: Maybe String , remoteAnnexExternalType :: Maybe String @@ -321,6 +322,7 @@ extractRemoteGitConfig r remotename = do , remoteAnnexAndroidDirectory = notempty $ getmaybe "androiddirectory" , remoteAnnexAndroidSerial = notempty $ getmaybe "androidserial" , remoteAnnexGCrypt = notempty $ getmaybe "gcrypt" + , remoteAnnexGitLFS = getbool "git-lfs" False , remoteAnnexDdarRepo = getmaybe "ddarrepo" , remoteAnnexHookType = notempty $ getmaybe "hooktype" , remoteAnnexExternalType = notempty $ getmaybe "externaltype" diff --git a/Utility/GitLFS.hs b/Utility/GitLFS.hs new file mode 100644 index 0000000000..00fac8b1ad --- /dev/null +++ b/Utility/GitLFS.hs @@ -0,0 +1,438 @@ +{- git-lfs API + - + - https://github.com/git-lfs/git-lfs/blob/master/docs/api + - + - Copyright 2019 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. + -} + +{-# LANGUAGE DeriveGeneric, FlexibleInstances, FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE LambdaCase #-} + +module Utility.GitLFS ( + -- * transfer requests + TransferRequest(..), + TransferRequestOperation(..), + TransferAdapter(..), + TransferRequestObject(..), + startTransferRequest, + -- * responses to transfer requests + TransferResponse(..), + TransferResponseOperation(..), + IsTransferResponseOperation, + DownloadOperation, + UploadOperation, + ParsedTransferResponse(..), + parseTransferResponse, + -- * making transfers + downloadOperationRequest, + uploadOperationRequests, + -- * endpoint discovery + Endpoint, + guessEndpoint, + HostUser, + sshDiscoverEndpointCommand, + parseSshDiscoverEndpointResponse, + -- * errors + TransferResponseError(..), + TransferResponseObjectError(..), + -- * additional data types + Url, + SHA256, + GitRef(..), + NumSeconds, + HTTPHeader, + HTTPHeaderValue, +) where + +-- | This implementation of the git-lfs API uses http Request and Response, +-- but leaves actually connecting up the http client to the user. 
+-- +-- You'll want to use a Manager that supports https, since the protocol +-- uses http basic auth. +-- +-- Some LFS servers, notably Github's, may require a User-Agent header +-- in some of the requests, in order to allow eg, uploads. No such header +-- is added by dedault, so be sure to add your own. + +import Data.Aeson +import Data.Aeson.Types +import GHC.Generics +import Network.HTTP.Client +import Data.List +import qualified Data.Map as M +import qualified Data.Text as T +import qualified Data.Text.Encoding as E +import qualified Data.ByteString.Lazy as L +import qualified Data.CaseInsensitive as CI +import qualified Network.URI as URI + +data TransferRequest = TransferRequest + { req_operation :: TransferRequestOperation + , req_transfers :: [TransferAdapter] + , req_ref :: Maybe GitRef + , req_objects :: [TransferRequestObject] + } + deriving (Generic, Show) + +instance ToJSON TransferRequest where + toJSON = genericToJSON transferRequestOptions + toEncoding = genericToEncoding transferRequestOptions + +instance FromJSON TransferRequest where + parseJSON = genericParseJSON transferRequestOptions + +transferRequestOptions :: Options +transferRequestOptions = stripFieldPrefix nonNullOptions + +data TransferRequestObject = TransferRequestObject + { req_oid :: SHA256 + , req_size :: Integer + } + deriving (Generic, Show) + +instance ToJSON TransferRequestObject where + toJSON = genericToJSON transferRequestObjectOptions + toEncoding = genericToEncoding transferRequestObjectOptions + +instance FromJSON TransferRequestObject where + parseJSON = genericParseJSON transferRequestObjectOptions + +transferRequestObjectOptions :: Options +transferRequestObjectOptions = stripFieldPrefix defaultOptions + +data TransferRequestOperation = RequestDownload | RequestUpload + deriving (Show) + +instance ToJSON TransferRequestOperation where + toJSON RequestDownload = "download" + toJSON RequestUpload = "upload" + +instance FromJSON TransferRequestOperation where + parseJSON (String "download") = pure RequestDownload + parseJSON (String "upload") = pure RequestUpload + parseJSON invalid = typeMismatch "TransferRequestOperation" invalid + +data TransferResponse op = TransferResponse + { transfer :: Maybe TransferAdapter + , objects :: [TransferResponseOperation op] + } + deriving (Generic, Show) + +instance IsTransferResponseOperation op => ToJSON (TransferResponse op) where + toJSON = genericToJSON nonNullOptions + toEncoding = genericToEncoding nonNullOptions + +instance IsTransferResponseOperation op => FromJSON (TransferResponse op) + +-- | This is an error with a TransferRequest as a whole. It's also possible +-- for a TransferRequest to overall succeed, but fail for some +-- objects; such failures use TransferResponseObjectError. +data TransferResponseError = TransferResponseError + { resperr_message :: T.Text + , resperr_request_id :: Maybe T.Text + , resperr_documentation_url :: Maybe Url + } + deriving (Generic, Show) + +instance ToJSON TransferResponseError where + toJSON = genericToJSON transferResponseErrorOptions + toEncoding = genericToEncoding transferResponseErrorOptions + +instance FromJSON TransferResponseError where + parseJSON = genericParseJSON transferResponseErrorOptions + +transferResponseErrorOptions :: Options +transferResponseErrorOptions = stripFieldPrefix nonNullOptions + +-- | An error with a single object within a TransferRequest. 
+data TransferResponseObjectError = TransferResponseObjectError + { respobjerr_code :: Int + , respobjerr_message :: T.Text + } + deriving (Generic, Show) + +instance ToJSON TransferResponseObjectError where + toJSON = genericToJSON transferResponseObjectErrorOptions + toEncoding = genericToEncoding transferResponseObjectErrorOptions + +instance FromJSON TransferResponseObjectError where + parseJSON = genericParseJSON transferResponseObjectErrorOptions + +transferResponseObjectErrorOptions :: Options +transferResponseObjectErrorOptions = stripFieldPrefix nonNullOptions + +data TransferAdapter = Basic + deriving (Show) + +instance ToJSON TransferAdapter where + toJSON Basic = "basic" + +instance FromJSON TransferAdapter where + parseJSON (String "basic") = pure Basic + parseJSON invalid = typeMismatch "basic" invalid + +data TransferResponseOperation op = TransferResponseOperation + { resp_oid :: SHA256 + , resp_size :: Integer + , resp_authenticated :: Maybe Bool + , resp_actions :: Maybe op + , resp_error :: Maybe TransferResponseObjectError + } + deriving (Generic, Show) + +instance ToJSON op => ToJSON (TransferResponseOperation op) where + toJSON = genericToJSON transferResponseOperationOptions + toEncoding = genericToEncoding transferResponseOperationOptions + +instance FromJSON op => FromJSON (TransferResponseOperation op) where + parseJSON = genericParseJSON transferResponseOperationOptions + +transferResponseOperationOptions :: Options +transferResponseOperationOptions = stripFieldPrefix nonNullOptions + +-- | Class of types that can be responses to a transfer request, +-- that contain an operation to use to make the transfer. +class (FromJSON op, ToJSON op) => IsTransferResponseOperation op + +data DownloadOperation = DownloadOperation + { download :: OperationParams } + deriving (Generic, Show) + +instance IsTransferResponseOperation DownloadOperation +instance ToJSON DownloadOperation +instance FromJSON DownloadOperation + +data UploadOperation = UploadOperation + { upload :: OperationParams + , verify :: Maybe OperationParams + } + deriving (Generic, Show) + +instance IsTransferResponseOperation UploadOperation + +instance ToJSON UploadOperation where + toJSON = genericToJSON nonNullOptions + toEncoding = genericToEncoding nonNullOptions + +instance FromJSON UploadOperation + +data OperationParams = OperationParams + { href :: Url + , header :: Maybe (M.Map HTTPHeader HTTPHeaderValue) + , expires_in :: Maybe NumSeconds + , expires_at :: Maybe T.Text + } + deriving (Generic, Show) + +instance ToJSON OperationParams where + toJSON = genericToJSON nonNullOptions + toEncoding = genericToEncoding nonNullOptions + +instance FromJSON OperationParams + +data Verification = Verification + { verification_oid :: SHA256 + , verification_size :: Integer + } + deriving (Generic, Show) + +instance ToJSON Verification where + toJSON = genericToJSON verificationOptions + toEncoding = genericToEncoding verificationOptions + +instance FromJSON Verification where + parseJSON = genericParseJSON verificationOptions + +verificationOptions :: Options +verificationOptions = stripFieldPrefix defaultOptions + +-- | Sent over ssh connection when using that to find the endpoint. 
+data SshDiscoveryResponse = SshDiscoveryResponse + { endpoint_href :: Url + , endpoint_header :: Maybe (M.Map HTTPHeader HTTPHeaderValue) + , endpoint_expires_in :: Maybe NumSeconds + , endpoint_expires_at :: Maybe T.Text + } deriving (Generic, Show) + +instance ToJSON SshDiscoveryResponse where + toJSON = genericToJSON sshDiscoveryResponseOptions + toEncoding = genericToEncoding sshDiscoveryResponseOptions + +instance FromJSON SshDiscoveryResponse where + parseJSON = genericParseJSON sshDiscoveryResponseOptions + +sshDiscoveryResponseOptions :: Options +sshDiscoveryResponseOptions = stripFieldPrefix nonNullOptions + +data GitRef = GitRef + { name :: T.Text } + deriving (Generic, Show) + +instance FromJSON GitRef +instance ToJSON GitRef + +type SHA256 = T.Text + +-- | The endpoint of a git-lfs server. +data Endpoint + = EndpointURI URI.URI + | EndpointDiscovered SshDiscoveryResponse + deriving (Show) + +-- | Command to run via ssh with to discover an endpoint. The FilePath is +-- the location of the git repository on the ssh server. +-- +-- Note that, when sshing to the server, you should take care that the +-- hostname you pass to ssh is really a hostname and not something that ssh +-- will parse an an option, such as -oProxyCommand=". +sshDiscoverEndpointCommand :: FilePath -> TransferRequestOperation -> [String] +sshDiscoverEndpointCommand remotepath tro = + [ "git-lfs-authenticate" + , remotepath + , case tro of + RequestDownload -> "download" + RequestUpload -> "upload" + ] + +-- | Parse the json output when doing ssh endpoint discovery. +parseSshDiscoverEndpointResponse :: L.ByteString -> Maybe Endpoint +parseSshDiscoverEndpointResponse resp = EndpointDiscovered <$> decode resp + +-- | Guesses the LFS endpoint from the http url of a git remote. +-- +-- https://github.com/git-lfs/git-lfs/blob/master/docs/api/server-discovery.md +guessEndpoint :: URI.URI -> Maybe Endpoint +guessEndpoint uri = case URI.uriScheme uri of + "https:" -> Just endpoint + "http:" -> Just endpoint + _ -> Nothing + where + endpoint = EndpointURI $ uri + -- force https because the git-lfs protocol uses http + -- basic auth tokens, which should not be exposed + { URI.uriScheme = "https:" + , URI.uriPath = guessedpath + } + + guessedpath + | ".git" `isSuffixOf` URI.uriPath uri = + URI.uriPath uri ++ "/info/lfs" + | ".git/" `isSuffixOf` URI.uriPath uri = + URI.uriPath uri ++ "info/lfs" + | otherwise = (droptrailing '/' (URI.uriPath uri)) ++ ".git/info/lfs" + + droptrailing c = reverse . dropWhile (== c) . reverse + +-- | Makes a Request that will start the process of making a transfer to or +-- from the LFS endpoint. +startTransferRequest :: Endpoint -> TransferRequest -> Maybe Request +startTransferRequest (EndpointURI uri) tr = do + r <- requestFromURI uri + return $ addLfsJsonHeaders $ r + -- Since this uses the LFS batch API, it adds /objects/batch + -- to the endpoint url. + { path = path r <> "/objects/batch" + , method = "POST" + , requestBody = RequestBodyLBS (encode tr) + } +startTransferRequest (EndpointDiscovered sr) tr = do + uri <- URI.parseURI (T.unpack (endpoint_href sr)) + req <- startTransferRequest (EndpointURI uri) tr + let headers = map convheader $ maybe [] M.toList $ endpoint_header sr + return $ req { requestHeaders = requestHeaders req ++ headers } + where + convheader (k, v) = (CI.mk (E.encodeUtf8 k), E.encodeUtf8 v) + +-- | "user@host" or just the hostname. 
+type HostUser = String + +addLfsJsonHeaders :: Request -> Request +addLfsJsonHeaders r = r + { requestHeaders = requestHeaders r ++ + [ ("Accept", lfsjson) + , ("Content-Type", lfsjson) + ] + } + where + lfsjson = "application/vnd.git-lfs+json" + +data ParsedTransferResponse op + = ParsedTransferResponse (TransferResponse op) + | ParsedTransferResponseError TransferResponseError + | ParseFailed String + +-- | Parse the body of a response to a transfer request. +parseTransferResponse + :: IsTransferResponseOperation op + => L.ByteString + -> ParsedTransferResponse op +parseTransferResponse resp = case eitherDecode resp of + Right tr -> ParsedTransferResponse tr + -- If unable to decode as a TransferResponse, try to decode + -- as a TransferResponseError instead, in case the LFS server + -- sent an error message. + Left err -> + either (const $ ParseFailed err) ParsedTransferResponseError $ + eitherDecode resp + +-- | Builds a http request to perform a download. +downloadOperationRequest :: DownloadOperation -> Maybe Request +downloadOperationRequest = operationParamsRequest . download + +-- | Builds http request to perform an upload. The content to upload is +-- provided in the RequestBody, along with its SHA256 and size. +-- +-- When the LFS server requested verification, there will be a second +-- Request that does that; it should be run only after the upload has +-- succeeded. +-- +-- When the LFS server already contains the object, an empty list may be +-- returned. +uploadOperationRequests :: UploadOperation -> RequestBody -> SHA256 -> Integer -> Maybe [Request] +uploadOperationRequests op content oid size = + case (mkdlreq, mkverifyreq) of + (Nothing, _) -> Nothing + (Just dlreq, Nothing) -> Just [dlreq] + (Just dlreq, Just verifyreq) -> Just [dlreq, verifyreq] + where + mkdlreq = mkdlreq' + <$> operationParamsRequest (upload op) + mkdlreq' r = r + { method = "PUT" + , requestBody = content + } + mkverifyreq = mkverifyreq' + <$> (operationParamsRequest =<< verify op) + mkverifyreq' r = addLfsJsonHeaders $ r + { method = "POST" + , requestBody = RequestBodyLBS $ encode $ + Verification oid size + } + +operationParamsRequest :: OperationParams -> Maybe Request +operationParamsRequest ps = do + r <- parseRequest (T.unpack (href ps)) + let headers = map convheader $ maybe [] M.toList (header ps) + return $ r { requestHeaders = headers } + where + convheader (k, v) = (CI.mk (E.encodeUtf8 k), E.encodeUtf8 v) + +type Url = T.Text + +type NumSeconds = Integer + +type HTTPHeader = T.Text + +type HTTPHeaderValue = T.Text + +-- Prevent Nothing from serializing to null. +nonNullOptions :: Options +nonNullOptions = defaultOptions { omitNothingFields = True } + +-- Remove prefix from field names. +stripFieldPrefix :: Options -> Options +stripFieldPrefix o = + o { fieldLabelModifier = drop 1 . dropWhile (/= '_') } diff --git a/Utility/Url.hs b/Utility/Url.hs index 15bc0239f3..4ab79e5a76 100644 --- a/Utility/Url.hs +++ b/Utility/Url.hs @@ -29,6 +29,7 @@ module Utility.Url ( assumeUrlExists, download, downloadQuiet, + downloadConduit, sinkResponseFile, downloadPartial, parseURIRelaxed, @@ -335,8 +336,9 @@ download' noerror meterupdate url file uo = case (urlDownloader uo, parseUrlRequest (show u)) of (DownloadWithConduit (DownloadWithCurlRestricted r), Just req) -> catchJust (matchStatusCodeException (== found302)) - (downloadconduit req) + (downloadConduit meterupdate req file uo >> return True) (followredir r) + `catchNonAsync` (dlfailed . 
show) (DownloadWithConduit (DownloadWithCurlRestricted r), Nothing) | isfileurl u -> downloadfile u | isftpurl u -> downloadcurlrestricted r u url ftpport @@ -354,58 +356,6 @@ download' noerror meterupdate url file uo = ftpport = 21 - downloadconduit req = catchMaybeIO (getFileSize file) >>= \case - Just sz | sz > 0 -> resumeconduit req' sz - _ -> runResourceT $ do - liftIO $ debugM "url" (show req') - resp <- http req' (httpManager uo) - if responseStatus resp == ok200 - then store zeroBytesProcessed WriteMode resp - else showrespfailure resp - where - req' = applyRequest uo $ req - -- Override http-client's default decompression of gzip - -- compressed files. We want the unmodified file content. - { requestHeaders = (hAcceptEncoding, "identity") : - filter ((/= hAcceptEncoding) . fst) - (requestHeaders req) - , decompress = const False - } - - alreadydownloaded sz s h = s == requestedRangeNotSatisfiable416 - && case lookup hContentRange h of - -- This could be improved by fixing - -- https://github.com/aristidb/http-types/issues/87 - Just crh -> crh == B8.fromString ("bytes */" ++ show sz) - -- Some http servers send no Content-Range header when - -- the range extends beyond the end of the file. - -- There is no way to distinguish between the file - -- being the same size on the http server, vs - -- it being shorter than the file we already have. - -- So assume we have the whole content of the file - -- already, the same as wget and curl do. - Nothing -> True - - -- Resume download from where a previous download was interrupted, - -- when supported by the http server. The server may also opt to - -- send the whole file rather than resuming. - resumeconduit req sz = catchJust - (matchStatusCodeHeadersException (alreadydownloaded sz)) - dl - (const $ return True) - where - dl = runResourceT $ do - let req' = req { requestHeaders = resumeFromHeader sz : requestHeaders req } - liftIO $ debugM "url" (show req') - resp <- http req' (httpManager uo) - if responseStatus resp == partialContent206 - then store (BytesProcessed sz) AppendMode resp - else if responseStatus resp == ok200 - then store zeroBytesProcessed WriteMode resp - else showrespfailure resp - - showrespfailure = liftIO . dlfailed . B8.toString - . statusMessage . responseStatus showhttpexception he = do let msg = case he of HttpExceptionRequest _ (StatusCodeException r _) -> @@ -417,6 +367,7 @@ download' noerror meterupdate url file uo = HttpExceptionRequest _ other -> show other _ -> show he dlfailed msg + dlfailed msg | noerror = return False | otherwise = do @@ -424,10 +375,6 @@ download' noerror meterupdate url file uo = hFlush stderr return False - store initialp mode resp = do - sinkResponseFile meterupdate initialp file mode resp - return True - basecurlparams = curlParams uo [ if noerror then Param "-S" @@ -453,6 +400,8 @@ download' noerror meterupdate url file uo = L.writeFile file return True + -- Conduit does not support ftp, so will throw an exception on a + -- redirect to a ftp url; fall back to curl. followredir r ex@(HttpExceptionRequest _ (StatusCodeException resp _)) = case headMaybe $ map decodeBS $ getResponseHeader hLocation resp of Just url' -> case parseURIRelaxed url' of @@ -463,6 +412,68 @@ download' noerror meterupdate url file uo = Nothing -> throwIO ex followredir _ ex = throwIO ex +{- Download a perhaps large file using conduit, with auto-resume + - of incomplete downloads. + - + - Does not catch exceptions. 
+ -} +downloadConduit :: MeterUpdate -> Request -> FilePath -> UrlOptions -> IO () +downloadConduit meterupdate req file uo = + catchMaybeIO (getFileSize file) >>= \case + Just sz | sz > 0 -> resumedownload sz + _ -> runResourceT $ do + liftIO $ debugM "url" (show req') + resp <- http req' (httpManager uo) + if responseStatus resp == ok200 + then store zeroBytesProcessed WriteMode resp + else respfailure resp + where + req' = applyRequest uo $ req + -- Override http-client's default decompression of gzip + -- compressed files. We want the unmodified file content. + { requestHeaders = (hAcceptEncoding, "identity") : + filter ((/= hAcceptEncoding) . fst) + (requestHeaders req) + , decompress = const False + } + + -- Resume download from where a previous download was interrupted, + -- when supported by the http server. The server may also opt to + -- send the whole file rather than resuming. + resumedownload sz = catchJust + (matchStatusCodeHeadersException (alreadydownloaded sz)) + dl + (const noop) + where + dl = runResourceT $ do + let req'' = req' { requestHeaders = resumeFromHeader sz : requestHeaders req } + liftIO $ debugM "url" (show req'') + resp <- http req'' (httpManager uo) + if responseStatus resp == partialContent206 + then store (BytesProcessed sz) AppendMode resp + else if responseStatus resp == ok200 + then store zeroBytesProcessed WriteMode resp + else respfailure resp + + alreadydownloaded sz s h = s == requestedRangeNotSatisfiable416 + && case lookup hContentRange h of + -- This could be improved by fixing + -- https://github.com/aristidb/http-types/issues/87 + Just crh -> crh == B8.fromString ("bytes */" ++ show sz) + -- Some http servers send no Content-Range header when + -- the range extends beyond the end of the file. + -- There is no way to distinguish between the file + -- being the same size on the http server, vs + -- it being shorter than the file we already have. + -- So assume we have the whole content of the file + -- already, the same as wget and curl do. + Nothing -> True + + store initialp mode resp = + sinkResponseFile meterupdate initialp file mode resp + + respfailure = giveup . B8.toString . statusMessage . responseStatus + {- Sinks a Response's body to a file. The file can either be opened in - WriteMode or AppendMode. Updates the meter as data is received. - diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 212c78985b..dc9868e91d 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -1546,71 +1546,71 @@ Here are all the supported configuration settings. For example, to use the wipe command, set it to `wipe -f %file`. -* `remote..rsyncurl` +* `remote..annex-rsyncurl` Used by rsync special remotes, this configures the location of the rsync repository to use. Normally this is automatically set up by `git annex initremote`, but you can change it if needed. -* `remote..buprepo` +* `remote..annex-buprepo` Used by bup special remotes, this configures the location of the bup repository to use. Normally this is automatically set up by `git annex initremote`, but you can change it if needed. -* `remote..ddarrepo` +* `remote..annex-ddarrepo` Used by ddar special remotes, this configures the location of the ddar repository to use. Normally this is automatically set up by `git annex initremote`, but you can change it if needed. -* `remote..directory` +* `remote..annex-directory` Used by directory special remotes, this configures the location of the directory where annexed files are stored for this remote. 
Normally this is automatically set up by `git annex initremote`, but you can change it if needed. -* `remote..adb` +* `remote..annex-adb` Used to identify remotes on Android devices accessed via adb. Normally this is automatically set up by `git annex initremote`. -* `remote..androiddirectory` +* `remote..annex-androiddirectory` Used by adb special remotes, this is the directory on the Android device where files are stored for this remote. Normally this is automatically set up by `git annex initremote`, but you can change it if needed. -* `remote..androidserial` +* `remote..annex-androidserial` Used by adb special remotes, this is the serial number of the Android device used by the remote. Normally this is automatically set up by `git annex initremote`, but you can change it if needed, eg when upgrading to a new Android device. -* `remote..s3` +* `remote..annex-s3` Used to identify Amazon S3 special remotes. Normally this is automatically set up by `git annex initremote`. -* `remote..glacier` +* `remote..annex-glacier` Used to identify Amazon Glacier special remotes. Normally this is automatically set up by `git annex initremote`. -* `remote..webdav` +* `remote..annex-webdav` Used to identify webdav special remotes. Normally this is automatically set up by `git annex initremote`. -* `remote..tahoe` +* `remote..annex-tahoe` Used to identify tahoe special remotes. Points to the configuration directory for tahoe. -* `remote..gcrypt` +* `remote..annex-gcrypt` Used to identify gcrypt special remotes. Normally this is automatically set up by `git annex initremote`. @@ -1619,7 +1619,14 @@ Here are all the supported configuration settings. If the gcrypt remote is accessible over ssh and has git-annex-shell available to manage it, it's set to "shell". -* `remote..hooktype`, `remote..externaltype` +* `remote..annex-git-lfs` + + Used to identify git-lfs special remotes. + Normally this is automatically set up by `git annex initremote`. + + It is set to "true" if this is a git-lfs remote. + +* `remote..annex-hooktype`, `remote..annex-externaltype` Used by hook special remotes and external special remotes to record the type of the remote. diff --git a/doc/special_remotes.mdwn b/doc/special_remotes.mdwn index 0ae4346ed0..0203f5778f 100644 --- a/doc/special_remotes.mdwn +++ b/doc/special_remotes.mdwn @@ -15,6 +15,7 @@ the git history is not stored in them. * [[ddar]] * [[directory]] * [[gcrypt]] (encrypted git repositories!) +* [[git-lfs]] * [[hook]] * [[rclone]] * [[rsync]] diff --git a/doc/special_remotes/gcrypt.mdwn b/doc/special_remotes/gcrypt.mdwn index 5807c9e5f2..2842e43303 100644 --- a/doc/special_remotes/gcrypt.mdwn +++ b/doc/special_remotes/gcrypt.mdwn @@ -4,6 +4,12 @@ remote allows git-annex to also store its files in such repositories. Naturally, git-annex encrypts the files it stores too, so everything stored on the remote is encrypted. +This special remote needs the server hosting the remote repository +to either have git-annex-shell or rsync accessible via ssh. git-annex +uses those to store its content in the remote. If the remote repository +is instead hosted on a server using git-lfs, you can use the [[git-lfs]] +special remote instead of this one; it also supports using gcrypt. + See [[tips/fully_encrypted_git_repositories_with_gcrypt]] for some examples of using gcrypt. @@ -35,11 +41,12 @@ shell access, and `rsync` must be installed. Those are the minimum requirements, but it's also recommended to install git-annex on the remote server, so that [[git-annex-shell]] can be used. 
-While you can use git-remote-gcrypt with servers like github, git-annex -can't store files on them. In such a case, you can just use -git-remote-gcrypt directly. +If you can't run `rsync` or `git-annex-shell` on the remote server, +you can't use this special remote. Other options are the [[git-lfs]] +special remote, which can also be combined with gcrypt, or +using git-remote-gcrypt to encrypt a remote that git-annex cannot use. -If you use encryption=hybrid, you can add more gpg keys that can access +If you use encryption=hybrid, you can later add more gpg keys that can access the files git-annex stored in the gcrypt repository. However, due to the way git-remote-gcrypt encrypts the git repository, you will need to somehow force it to re-push everything again, so that the encrypted repository can diff --git a/doc/special_remotes/git-lfs.mdwn b/doc/special_remotes/git-lfs.mdwn new file mode 100644 index 0000000000..e48a76cf4f --- /dev/null +++ b/doc/special_remotes/git-lfs.mdwn @@ -0,0 +1,101 @@ +git-annex has a special remote that lets it store content in git-lfs +repositories. + +See [[tips/storing_data_in_git-lfs]] for some examples of how to use this. + +## configuration + +These parameters can be passed to `git annex initremote` to configure +the git-lfs special remote: + +* `url` - Required. The url to the git-lfs repository to use. + Can be either a ssh url (scp-style is also accepted) or a http url. + But currently, a http url accesses the git-lfs repository without + authentication. To authenticate, you will need to use a ssh url. + + This parameter needs to be specified in the initial `git annex + initremote` but also each time you `git annex enableremote` + an existing git-lfs special remote. It's fine to use different urls + at different times as long as they point to the same git-lfs repository. + +* `encryption` - One of "none", "hybrid", "shared", or "pubkey". + Required. See [[encryption]]. Also see the encryption notes below. + +* `keyid` - Specifies the gpg key to use for encryption of both the files + git-annex stores in the repository, as well as to encrypt the git + repository itself when using gcrypt. May be repeated when + multiple participants should have access to the repository. + +## efficiency note + +Since git-lfs uses SHA256 checksums, git-annex needs to keep track of the +SHA256 of content stored in it, in order to be able to retrieve that +content. When a git-annex key uses a [[backend|backends]] +of SHA256 or SHA256E, that's easy. But, if a git-annex key uses some +other backend, git-annex has to additionally store the SHA256 checksum +into the git-annex branch when storing content in git-lfs. That adds a +small bit of size overhead to using this remote. + +When encrypting data sent to the git-lfs remote, git-annex always has to +store its SHA256 checksum in the git-annex branch. + +## encryption notes + +To encrypt a git-lfs repository, there are two separate things that +have to be encrypted: the data git-annex stores there, and the content +of the git repository itself. After all, a git-lfs remote is a git remote +and git push doesn't encrypt data by default. + +To encrypt your git pushes, you can use +[git-remote-gcrypt](https://spwhitton.name/tech/code/git-remote-gcrypt/) +and prefix the repository url with "gcrypt::" + +To make git-annex encrypt the data it stores, you can use the encrption= +configuration. 
+ +An example of combining the two: + + git annex initremote lfstest type=git-lfs url=gcrypt::git@github.com:username/somerepo.git encryption=shared + +In that example, the git-annex shared encryption key is stored in +git, but that's ok because git push will encrypt it, along with all +the other git data, using your gpg key. You could instead use +"encryption=shared keyid=" to make git-annex and gcrypt both encrypt +to a specified gpg key. + +git-annex will detect if one part of the repository is encrypted, +but you forgot to encrypt the other part, and will refuse to set up +such an insecure half-encrypted repository. + +If you use encryption=hybrid, you can later add more gpg keys that can access +the files git-annex stored in the git-lfs repository. However, due to the +way git-remote-gcrypt encrypts the git repository, you will need to somehow +force it to re-push everything again, so that the encrypted repository can +be decrypted by the added keys. Probably this can be done by setting +`GCRYPT_FULL_REPACK` and doing a forced push of branches. + +git-annex will set `remote.`gcrypt-publish-participants` when setting +up a repository that uses gcrypt. This is done to avoid unncessary gpg +passphrase prompts, but it does publish the gpg keyids that can decrypt the +repository. Unset it if you need to obscure that. + +## limitations + +The git-lfs protocol does not support deleting content, so git-annex +**cannot delete anything** from a git-lfs special remote. + +The git-lfs protocol does not support resuming uploads, and so an +interrupted upload will have to restart from the beginning. Interrupted +downloads will resume. + +git-lfs has a concept of git ref based access control, so a user may only +be able to send content associated with a particular git ref. git-annex +does not currently provide any git ref, so won't work with a git-lfs server +that uses that. + +git-annex only supports the "basic" git-lfs transfer adapter, but that's +the one used by most git-lfs servers. + +The git-lfs protocol is designed around batching of transfers, but +git-annex doesn't do batching. This may cause it to fall afoul of +rate limiting of git-lfs servers when transferring a lot of files. diff --git a/doc/thanks/list b/doc/thanks/list index 6b58c7273b..f7ce6507eb 100644 --- a/doc/thanks/list +++ b/doc/thanks/list @@ -59,4 +59,9 @@ Walltime, Caleb Allen, TD, Pedro Araújo, - +Ryan Newton, +David W, +L N D, +EVAN HENSHAWPLATH, +James Read, +Luke Shumaker, diff --git a/doc/tips/fully_encrypted_git_repositories_with_gcrypt.mdwn b/doc/tips/fully_encrypted_git_repositories_with_gcrypt.mdwn index 2df15f193f..1847a6fb8c 100644 --- a/doc/tips/fully_encrypted_git_repositories_with_gcrypt.mdwn +++ b/doc/tips/fully_encrypted_git_repositories_with_gcrypt.mdwn @@ -1,8 +1,7 @@ [git-remote-gcrypt](https://spwhitton.name/tech/code/git-remote-gcrypt/) -adds support for encrypted remotes to git. The git-annex -[[gcrypt special remote|special_remotes/gcrypt]] allows git-annex to -also store its files in such repositories. Naturally, git-annex encrypts -the files it stores too, so everything stored on the remote is encrypted. +adds support for encrypted remotes to git. Combine this with git-annex +encrypting the files it stores in a remote, and you can fully encrypt +all the data stored on a remote. Here are some ways you can use this awesome stuff.. @@ -15,7 +14,12 @@ repositories. 
## prerequisites * Install [git-remote-gcrypt](https://spwhitton.name/tech/code/git-remote-gcrypt/) -* Install git-annex version 4.20130909 or newer. + +* Set up a gpg key. You might consider generating a special purpose key + just for this use case, since you may end up wanting to put the key + on multiple machines that you would not trust with your main gpg key. + + The examples below use "$mykey" where you should put your gpg keyid. ## encrypted backup drive @@ -24,18 +28,18 @@ both the full contents of your git repository, and all the files you instruct git-annex to store on it, and everything will be encrypted so that only you can see it. -First, you need to set up a gpg key. You might consider generating a -special purpose key just for this use case, since you may end up wanting to -put the key on multiple machines that you would not trust with your -main gpg key. - -You need to tell git-annex the keyid of the key when setting up the -encrypted repository: +Here's how to set up the encrypted repository: git init --bare /mnt/encryptedbackup git annex initremote encryptedbackup type=gcrypt gitrepo=/mnt/encryptedbackup keyid=$mykey git annex sync encryptedbackup +(Remember to replace "$mykey" with the keyid of your gpg key.) + +This uses the [[gcrypt special remote|special_remotes/gcrypt]] to encrypt +pushes to the git remote, and git-annex will also encrypt the files it +stores there. + Now you can copy (or even move) files to the repository. After sending files to it, you'll probably want to do a sync, which pushes the git repository changes to it as well. @@ -62,23 +66,25 @@ the gpg key used to encrypt it, and then: ## encrypted git-annex repository on a ssh server -If you have a ssh server that has rsync installed, you can set up an -encrypted repository there. Works just like the encrypted drive except -without the cable. +If you have a ssh server that has git-annex or rsync installed on it, you +can set up an encrypted repository there. Works just like the encrypted +drive except without the cable. First, on the server, run: git init --bare encryptedrepo -(Also, install git-annex on the server if it's possible & easy to do so. -While this will work without git-annex being installed on the server, it -is recommended to have it installed.) - Now, in your existing git-annex repository, set up the encrypted remote: git annex initremote encryptedrepo type=gcrypt gitrepo=ssh://my.server/home/me/encryptedrepo keyid=$mykey git annex sync encryptedrepo +(Remember to replace "$mykey" with the keyid of your gpg key.) + +This uses the [[gcrypt special remote|special_remotes/gcrypt]] to encrypt +pushes to the git remote, and git-annex will also encrypt the files it +stores there. + If you're going to be sharing this repository with others, be sure to also include their keyids, by specifying keyid= repeatedly. @@ -97,11 +103,31 @@ used to encrypt it can check it out: git annex enableremote encryptedrepo gitrepo=ssh://my.server/home/me/encryptedrepo git annex get --from encryptedrepo -## private encrypted git remote on hosting site +## private encrypted git remote on a git-lfs hosting site + +Some git repository hosting sites do not support git-annex, but do support +the similar git-lfs for storing large files alongside a git repository. +git-annex can use the git-lfs protocol to store files in such repositories, +and with gcrypt, everything stored in the remote can be encrypted. + +First, make a new, empty git repository on the hosting site. 
+Get the ssh clone url for the repository, which might look +like "git@github.com:username/somerepo.git" + +Then, in your git-annex repository, set up the encrypted remote: + + git annex initremote lfstest type=git-lfs url=gcrypt::git@github.com:username/somerepo.git keyid=$mykey + +(Remember to replace "$mykey" with the keyid of your gpg key.) + +This uses the [[git-lfs special remote|special_remotes/git-lfs]], and the +`gcrypt::` prefix on the url makes pushes be encrypted with gcrypt. + +## private encrypted git remote on a git hosting site You can use gcrypt to store your git repository in encrypted form on any -hosting site that supports git. Only you can decrypt its contents. -Using it this way, git-annex does not store large files on the hosting site; it's +hosting site that supports git. Only you can decrypt its contents. Using it +this way, git-annex does not store large files on the hosting site; it's only used to store your git repository itself. git remote add encrypted gcrypt::ssh://hostingsite/myrepo.git @@ -115,7 +141,7 @@ url you used when setting it up: git clone gcrypt::ssh://hostingsite/myrepo.git -## multiuser encrypted git remote on hosting site +## multiuser encrypted git remote on a git hosting site Suppose two users want to share an encrypted git remote. Both of you need to set up the remote, and configure gcrypt to encrypt it so that both diff --git a/doc/tips/storing_data_in_git-lfs.mdwn b/doc/tips/storing_data_in_git-lfs.mdwn new file mode 100644 index 0000000000..38779cfc64 --- /dev/null +++ b/doc/tips/storing_data_in_git-lfs.mdwn @@ -0,0 +1,34 @@ +git-annex can store data in [git-lfs](https://git-lfs.github.com/) +repositories, using the [[git-lfs special remote|special_remotes/git-lfs]]. + +You do not need the git-lfs program installed to use it, just a recent +enough version of git-annex. + +Here's how to initialize a git-lfs special remote on Github. + + git annex initremote lfs type=git-lfs encryption=none url=git@github.com:yourname/yourrepo.git + +In this example, the remote will not be encrypted, so anyone who can access +it can see its contents. It is possible to encrypt everything stored in a +git-lfs remote, see [[fully_encrypted_git_repositories_with_gcrypt]]. + +Once the git-lfs remote is set up, git-annex can store and retrieve +content in the usual ways: + + git annex copy * --to lfs + git annex get --from lfs + +But, git-annex **cannot delete anything** from a git-lfs special remote, +because the protocol does not support deletion. + +A git-lfs special remote also functions as a regular git remote. You can +use things like `git push` and `git pull` with it. + +To enable an existing git-lgs remote in another clone of the repository, +you'll need to provide an url to it again. It's ok to provide a different +url as long as it points to the same git-lfs repository. + + git annex enableremote lfs url=https://github.com/yourname/yourrepo.git + +Note that http urls currently only allow read access to the git-lfs +repository. diff --git a/git-annex.cabal b/git-annex.cabal index c4ef5c4f65..a2e5743069 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -931,6 +931,7 @@ Executable git-annex Remote.External.Types Remote.GCrypt Remote.Git + Remote.GitLFS Remote.Glacier Remote.Helper.AWS Remote.Helper.Chunked @@ -1039,6 +1040,7 @@ Executable git-annex Utility.FileSystemEncoding Utility.Format Utility.FreeDesktop + Utility.GitLFS Utility.Glob Utility.Gpg Utility.Hash
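
The upload side of the Utility.GitLFS API added above mirrors the download side used in Remote/GitLFS.hs: a RequestUpload transfer request yields UploadOperations, and uploadOperationRequests turns each one into a PUT of the content plus an optional verification POST that must only run after the PUT succeeds. A minimal sketch, assuming a hypothetical file name, an UploadOperation already obtained from a parsed transfer response, and that sha2_256 from git-annex's Utility.Hash is available as it is in Remote/GitLFS.hs:

    import qualified Utility.GitLFS as LFS
    import Utility.Hash (sha2_256)
    import Network.HTTP.Client
    import Network.HTTP.Client.TLS (tlsManagerSettings)
    import qualified Data.ByteString.Lazy as L
    import qualified Data.Text as T

    -- Perform the PUT (and verification POST, if requested) for one object.
    -- The UploadOperation would come from a parsed RequestUpload response.
    uploadOne :: LFS.UploadOperation -> IO ()
    uploadOne op = do
        content <- L.readFile "big.dat"  -- hypothetical file
        let sha256 = T.pack (show (sha2_256 content))
            size = fromIntegral (L.length content)
            body = RequestBodyLBS content
        case LFS.uploadOperationRequests op body sha256 size of
            Nothing -> putStrLn "could not parse upload url"
            Just [] -> putStrLn "server already has the object"
            Just reqs -> do
                mgr <- newManager tlsManagerSettings
                -- requests are run in order, so the verification request
                -- (when present) only runs after the PUT has succeeded
                mapM_ (\r -> httpNoBody r mgr >>= print . responseStatus) reqs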