import from borg is basically working

Still some issues to deal with, see TODO and XXX.

Here's what gets logged, for each key:

cid log:
1608582045.832799227s 6720ebad-b20e-4460-a8f2-2477361aea75 !MjAyMC0xMi0yMVQxMTozMzoxNw==:!MjAyMC0xMi0yMVQxMzowNzoyNg==

The "!Mj" are base64 encoded borg archive names, since mine were
dates and contained some characters not allowed in cid logs unescaped.
There were archives that each contained the key. This list will grow as
more borg backups are done and learned about.

tree generated:
120000 blob 5ef6a4615c084819b44cd4e3a31657664ddf643b	x/dotgit/annex/objects/06/mv/SHA256E-s30--a5d8532e64ec28f5491e25e7a6c1cb68f80507c1be6c1b35f8ec53d25413e5da/SHA256E-s30--a5d8532e64ec28f5491e25e7a6c1cb68f80507c1be6c1b35f8ec53d25413e5da
120000 blob 063a139d3021c8db60f5c576d29fada2b824d91c	x/dotgit/annex/objects/72/PP/SHA256E-s30--e80b09a854b4e4d99a76caaa6983b34272480e0b4fdb95d04234a54b4849b893/SHA256E-s30--e80b09a854b4e4d99a76caaa6983b34272480e0b4fdb95d04234a54b4849b893
120000 blob b53b54916fd6abf21fedf796deca08d5ac7a75af	x/dotgit/annex/objects/Ww/pk/SHA256E-s30--6aac072a8ebf02a5807c4f15e77ed585a6c87b3b333ba625a3c8d6b4dc50a9f2/SHA256E-s30--6aac072a8ebf02a5807c4f15e77ed585a6c87b3b333ba625a3c8d6b4dc50a9f2

This commit was sponsored by Denis Dzyubenko on Patreon.
This commit is contained in:
Joey Hess 2020-12-21 16:20:58 -04:00
parent 15000dee07
commit bcd55b365c
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 95 additions and 8 deletions

View file

@ -5,8 +5,6 @@
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Remote.Borg (remote) where
import Annex.Common
@ -22,9 +20,13 @@ import Remote.Helper.ExportImport
import Annex.UUID
import Types.ProposedAccepted
import Utility.Metered
import qualified Remote.Helper.ThirdParty as ThirdParty
import qualified Remote.Helper.ThirdPartyPopulated as ThirdPartyPopulated
import Text.Read
import Control.Exception (evaluate)
import Control.DeepSeq
import qualified Data.Map as M
import qualified Data.ByteString.Lazy as L
type BorgRepo = String
@ -69,7 +71,7 @@ gen r u rc gc rs = do
, exportActions = exportUnsupported
, importActions = ImportActions
{ listImportableContents = listImportableContentsM borgrepo
, importKey = Just ThirdParty.importKey
, importKey = Just ThirdPartyPopulated.importKey
, retrieveExportWithContentIdentifier = retrieveExportWithContentIdentifierM borgrepo
, checkPresentExportWithContentIdentifier = checkPresentExportWithContentIdentifierM borgrepo
-- This remote is thirdPartyPopulated, so these
@ -119,8 +121,63 @@ borgSetup _ mu _ c _gc = do
borgLocal :: BorgRepo -> Bool
borgLocal = notElem ':'
-- TODO avoid rescanning archives that have already been scanned
--
-- XXX importableHistory should probably not be populated. git-annex
-- only stores and uses the most recent imported tree, not the whole history,
-- I think. So a key that's only in a previous archive would not have
-- a known ImportLocation when retrieving it.
-- Instead, maybe need to include the archive names at the top of the
-- importlocation? (Then would not need them in the ContentIdentifier.)
--
-- XXX the tree generated by using this does not seem to get grafted into
-- the git-annex branch, so would be subject to being lost to GC>
-- Is this a general problem affecting importtree too?
listImportableContentsM :: BorgRepo -> Annex (Maybe (ImportableContents (ContentIdentifier, ByteSize)))
listImportableContentsM borgrepo = error "TODO"
listImportableContentsM borgrepo = prompt $ do
ls <- withborglist borgrepo "{barchive}{NUL}" $ \as ->
forM as $ \archive ->
let archive' = borgrepo ++ "::" ++ decodeBS' archive
in withborglist archive' "{size}{NUL}{path}{NUL}" $
liftIO . evaluate . force . parsefilelist archive
return (Just (mkimportablecontents (reverse ls)))
where
withborglist what format a = do
let p = (proc "borg" ["list", what, "--format", format])
{ std_out = CreatePipe }
(Nothing, Just h, Nothing, pid) <- liftIO $ createProcess p
l <- liftIO $ map L.toStrict
. filter (not . L.null)
. L.split 0
<$> L.hGetContents h
let cleanup = liftIO $ do
hClose h
forceSuccessProcess p pid
a l `finally` cleanup
parsefilelist archive (bsz:f:rest) = case readMaybe (fromRawFilePath bsz) of
Nothing -> parsefilelist archive rest
Just sz ->
let loc = ThirdPartyPopulated.mkThirdPartyImportLocation f
-- This does a little unncessary work to parse the
-- key, which is then thrown away. But, it lets the
-- file list be shrank down to only the ones that are
-- importable keys, so avoids needing to buffer all
-- the rest of the files in memory.
in case ThirdPartyPopulated.importKey' loc sz of
Just k -> (loc, (ContentIdentifier archive, sz))
: parsefilelist archive rest
Nothing -> parsefilelist archive rest
parsefilelist _ _ = []
mkimportablecontents [] = ImportableContents
{ importableContents = []
, importableHistory = []
}
mkimportablecontents (v:vs) = ImportableContents
{ importableContents = v
, importableHistory = [mkimportablecontents vs]
}
retrieveExportWithContentIdentifierM :: BorgRepo -> ImportLocation -> ContentIdentifier -> FilePath -> Annex Key -> MeterUpdate -> Annex Key
retrieveExportWithContentIdentifierM borgrepo loc cid dest k p = error "TODO"

View file

@ -5,7 +5,9 @@
- Licensed under the GNU AGPL version 3 or higher.
-}
module Remote.Helper.ThirdParty where
{-# LANGUAGE OverloadedStrings #-}
module Remote.Helper.ThirdPartyPopulated where
import Annex.Common
import Types.Remote
@ -14,13 +16,41 @@ import Crypto (isEncKey)
import Utility.Metered
import qualified System.FilePath.ByteString as P
import qualified Data.ByteString as S
-- When a remote is thirdPartyPopulated, the files we want are probably
-- in the .git directory. But, git does not really support .git in paths
-- in a git tree. (Such a tree can be built, but it will lead to problems.)
-- And so anything in .git is prevented from being imported.
-- To work around that, this renames that directory when generating an
-- ImportLocation.
mkThirdPartyImportLocation :: RawFilePath -> ImportLocation
mkThirdPartyImportLocation =
mkImportLocation . P.joinPath . map esc . P.splitDirectories
where
esc ".git" = "dotgit"
esc x
| "dotgit" `S.isSuffixOf` x = "dot" <> x
| otherwise = x
fromThirdPartyImportLocation :: ImportLocation -> RawFilePath
fromThirdPartyImportLocation =
P.joinPath . map unesc . P.splitDirectories . fromImportLocation
where
unesc "dotgit" = ".git"
unesc x
| "dotgit" `S.isSuffixOf` x = S.drop 3 x
| otherwise = x
-- When a remote is thirdPartyPopulated, and contains a backup of a
-- git-annex repository or some special remotes, this can be used to
-- find only those ImportLocations that are annex object files.
-- All other ImportLocations are ignored.
importKey :: ImportLocation -> ContentIdentifier -> ByteSize -> MeterUpdate -> Annex (Maybe Key)
importKey loc _cid sz _ = return $ case deserializeKey' f of
importKey loc _cid sz _ = return $ importKey' loc sz
importKey' :: ImportLocation -> ByteSize -> Maybe Key
importKey' loc sz = case deserializeKey' f of
Just k
-- Annex objects always are in a subdirectory with the same
-- name as the filename. If this is not the case for the file

View file

@ -963,7 +963,7 @@ Executable git-annex
Remote.Helper.Messages
Remote.Helper.P2P
Remote.Helper.ReadOnly
Remote.Helper.ThirdParty
Remote.Helper.ThirdPartyPopulated
Remote.Helper.Special
Remote.Helper.Ssh
Remote.HttpAlso