implement updating the ContentIdentifier db with info from the git-annex branch

untested

This won't be super slow, but it does need to diff two likely large
trees, and since the git-annex branch rarely sits still, it will most
likely be run at the beginning of every import.

A possible speed improvement would be to only run this when the database
does not contain a ContentIdentifier. But that would only speed up
imports when there is no new version of any file on the special remote,
with at most renames of existing files being imported.
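
For illustration, a minimal sketch of how that check could look, reusing
the database accessor the import code already calls; updateIfUnknown is a
hypothetical helper name, not something this commit adds:

    -- Sketch only: updateIfUnknown is hypothetical. It consults the
    -- local database first and only falls back to the branch diff when
    -- a listed ContentIdentifier is not already known.
    updateIfUnknown :: CIDDb.ContentIdentifierHandle -> Remote -> ContentIdentifier -> Annex ()
    updateIfUnknown db remote cid = do
        known <- liftIO $ CIDDb.getContentIdentifierKeys db (Remote.uuid remote) cid
        when (null known) $
            updateContentIdentifierDbFromBranch db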

A better speed improvement would be to record something in the git-annex
branch that indicates when an import has been run, and only do the diff
if the git-annex branch has a record of a newer import than we've seen
before. Then, it would only run when there is in fact new
ContentIdentifier information available from a remote. Certainly doable,
but didn't want to complicate things yet.
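
A rough sketch of that idea, purely illustrative: readLatestImport and
CIDDb.getLastImport below are made-up names, standing in for an import
record kept in the git-annex branch and the last value folded into the
database; neither exists in this commit.

    -- Sketch only; both helpers below are hypothetical.
    updateWhenNewerImport :: CIDDb.ContentIdentifierHandle -> Remote -> Annex ()
    updateWhenNewerImport db remote = do
        inbranch <- readLatestImport (Remote.uuid remote)
        indb <- liftIO $ CIDDb.getLastImport db (Remote.uuid remote)
        -- Only pay for the tree diff when the branch records an import
        -- newer than the one this database has already processed.
        when (inbranch > indb) $
            updateContentIdentifierDbFromBranch db
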
Joey Hess 2019-03-06 18:04:30 -04:00
parent 12e4906657
commit ee251b2e2e
4 changed files with 72 additions and 24 deletions


@@ -25,7 +25,9 @@ import Git.Sha
import Git.FilePath
import qualified Git.Ref
import qualified Git.Branch
import qualified Git.DiffTree as DiffTree
import qualified Annex
import qualified Annex.Branch
import Annex.Link
import Annex.LockFile
import Annex.Content
@@ -36,11 +38,12 @@ import Types.Key
import Types.KeySource
import Utility.Metered
import Utility.DataUnits
import Logs
import Logs.Export
import Logs.ContentIdentifier
import Logs.Location
import qualified Database.Export as Export
import qualified Database.ContentIdentifier as CID
import qualified Database.ContentIdentifier as CIDDb
import qualified Logs.ContentIdentifier as CIDLog
import Control.Concurrent.STM
import qualified Data.Map.Strict as M
@@ -203,7 +206,8 @@ buildImportTrees basetree msubdir importable = History
linksha <- hashSymlink symlink
return $ TreeItem treepath (fromTreeItemType TreeSymlink) linksha
{- Downloads all new ContentIdentifiers. Supports concurrency when enabled.
{- Downloads all new ContentIdentifiers as needed to generate Keys.
- Supports concurrency when enabled.
-
- If any download fails, the whole thing fails, but it will resume where
- it left off.
@@ -218,7 +222,9 @@ downloadImport remote importtreeconfig importablecontents = do
-- they will only be downloaded once.
cidmap <- liftIO $ newTVarIO M.empty
withExclusiveLock gitAnnexContentIdentifierLock $
bracket CID.openDb CID.closeDb (go cidmap importablecontents)
bracket CIDDb.openDb CIDDb.closeDb $ \db -> do
updateContentIdentifierDbFromBranch db
go cidmap importablecontents db
-- TODO really support concurrency; avoid downloading the same
-- ContentIdentifier twice.
where
@@ -270,7 +276,7 @@ downloadImport remote importtreeconfig importablecontents = do
getTopFilePath subdir </> fromImportLocation loc
getcidkey cidmap db cid = liftIO $
CID.getContentIdentifierKeys db (Remote.uuid remote) cid >>= \case
CIDDb.getContentIdentifierKeys db (Remote.uuid remote) cid >>= \case
[] -> atomically $
maybeToList . M.lookup cid <$> readTVar cidmap
l -> return l
@@ -278,8 +284,8 @@ downloadImport remote importtreeconfig importablecontents = do
recordcidkey cidmap db cid k = do
liftIO $ atomically $ modifyTVar' cidmap $
M.insert cid k
liftIO $ CID.recordContentIdentifier db (Remote.uuid remote) cid k
recordContentIdentifier (Remote.uuid remote) cid k
liftIO $ CIDDb.recordContentIdentifier db (Remote.uuid remote) cid k
CIDLog.recordContentIdentifier (Remote.uuid remote) cid k
{- Temporary key used for import of a ContentIdentifier while downloading
- content, before generating its real key. -}
@@ -289,3 +295,31 @@ importKey (ContentIdentifier cid) size = stubKey
, keyVariety = OtherKey "CID"
, keySize = Just size
}
{- Updates the ContentIdentifier database with information from the
- git-annex branch. This way, ContentIdentifiers that have been imported
- in other clones of the repository will be known, and not unnecessarily
- downloaded again.
-
- The database should already be locked for write.
-}
updateContentIdentifierDbFromBranch :: CIDDb.ContentIdentifierHandle -> Annex ()
updateContentIdentifierDbFromBranch db = do
oldtree <- liftIO $ CIDDb.getAnnexBranchTree db
inRepo (Git.Ref.tree Annex.Branch.fullname) >>= \case
Just t | t /= oldtree -> do
(l, cleanup) <- inRepo $ DiffTree.diffTree oldtree t
mapM_ go l
void $ liftIO $ cleanup
liftIO $ do
CIDDb.recordAnnexBranchTree db t
CIDDb.flushDbQueue db
_ -> return ()
where
go ti = case extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti)) of
Nothing -> return ()
Just k -> do
l <- CIDLog.getContentIdentifiers k
liftIO $ forM_ l $ \(u, cids) ->
forM_ cids $ \cid ->
CIDDb.recordContentIdentifier db u cid k


@@ -22,7 +22,10 @@ module Database.ContentIdentifier (
recordContentIdentifier,
getContentIdentifiers,
getContentIdentifierKeys,
recordAnnexBranchTree,
getAnnexBranchTree,
ContentIdentifiersId,
AnnexBranchId,
) where
import Database.Types
@@ -31,6 +34,8 @@ import Database.Init
import Annex.Locations
import Annex.Common hiding (delete)
import Types.Import
import Git.Types
import Git.Sha
import Database.Persist.Sql hiding (Key)
import Database.Persist.TH
@@ -45,6 +50,11 @@ ContentIdentifiers
ContentIdentifiersIndexRemoteKey remote key
ContentIdentifiersIndexRemoteCID remote cid
UniqueRemoteCidKey remote cid key
-- The last git-annex branch tree sha that was used to update
-- ContentIdentifiers
AnnexBranch
tree SRef
UniqueTree tree
|]
{- Opens the database, creating it if it doesn't exist yet.
@@ -97,3 +107,15 @@ getContentIdentifierKeys (ContentIdentifierHandle h) u cid =
, ContentIdentifiersRemote ==. u
] []
return $ map (fromIKey . contentIdentifiersKey . entityVal) l
recordAnnexBranchTree :: ContentIdentifierHandle -> Sha -> IO ()
recordAnnexBranchTree h s = queueDb h $ do
deleteWhere ([] :: [Filter AnnexBranch])
void $ insertUnique $ AnnexBranch $ toSRef s
getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha
getAnnexBranchTree (ContentIdentifierHandle h) = H.queryDbQueue h $ do
l <- selectList ([] :: [Filter AnnexBranch]) []
case l of
(s:[]) -> return $ fromSRef $ annexBranchTree $ entityVal s
_ -> return emptyTree


@@ -21,6 +21,7 @@ import qualified Annex
import qualified Data.Map as M
import Data.List.NonEmpty (NonEmpty(..))
import qualified Data.List.NonEmpty as NonEmpty
-- | Records a remote's content identifier and the key that it corresponds to.
--
@@ -37,9 +38,10 @@ recordContentIdentifier u cid k = do
where
m = simpleMap l
-- | Get all content identifiers that a remote is known to use for a key.
getContentIdentifiers :: UUID -> Key -> Annex [ContentIdentifier]
getContentIdentifiers u k = do
-- | Get all known content identifiers for a key.
getContentIdentifiers :: Key -> Annex [(UUID, [ContentIdentifier])]
getContentIdentifiers k = do
config <- Annex.getGitConfig
contentIdentifierList . M.lookup u . simpleMap . parseLog
map (\(u, l) -> (u, NonEmpty.toList l) )
. M.toList . simpleMap . parseLog
<$> Annex.Branch.get (remoteContentIdentifierLogFile config k)


@@ -17,19 +17,9 @@ this.
* Need to support annex-tracking-branch configuration, which documentation
says makes git-annex sync and assistant do imports.
* Database.ContentIdentifier needs a way to update the database with
information coming from the git-annex branch. This will allow multiple
clones to import from the same remote, and share content identifier
information among them.
It will only need to be updated when listContents returns a
ContentIdentifier that is not already known in the database.
How to do the update: Stash the ref of the last git-annex branch it's
updated from in the database. Diff between that ref and the current
git-annex branch. For each file in the diff that's a .cid file, read
the file from the branch, and store into the database.
Update the stashed ref.
* Test behavior when multiple repos import from same special remote;
the second importer should not re-download as long as it has pulled
from the first importer.
* When on an adjusted unlocked branch, need to import the files unlocked.
Also, the tracking branch code needs to know about such branches,