implement updating the ContentIdentifier db with info from the git-annex branch

untested

This won't be super slow, but it does need to diff two likely large
trees, and since the git-annex branch rarely sits still, it will most
likely be run at the beginning of every import.

A possible speed improvement would be to only run this when the database
does not already contain a given ContentIdentifier. But that would only
speed up imports when there is no new version of any file on the special
remote, with at most renames of existing files being imported.
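
To make that rejected check concrete, here is a rough sketch of it as a
where-binding inside downloadImport, building on the
updateContentIdentifierDbFromBranch function added further down; the
getcidkey' name and its retried flag are illustrative only and not part
of this patch:

    -- Only pay for the branch diff on a database miss, then retry the
    -- lookup once; otherwise behave like the existing getcidkey.
    getcidkey' retried cidmap db cid =
        liftIO (CIDDb.getContentIdentifierKeys db (Remote.uuid remote) cid) >>= \case
            [] | not retried -> do
                updateContentIdentifierDbFromBranch db
                getcidkey' True cidmap db cid
            [] -> liftIO $ atomically $
                maybeToList . M.lookup cid <$> readTVar cidmap
            l -> return l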

A better speed improvement would be to record something in the git-annex
branch that indicates when an import has been run, and only do the diff
when the git-annex branch has a record of a newer import than we've seen
before. Then the diff would only run when there is in fact new
ContentIdentifier information available from a remote. Certainly doable,
but I didn't want to complicate things yet.
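
For illustration only, the guard that better improvement might grow into;
every helper here besides updateContentIdentifierDbFromBranch is
hypothetical, since nothing records an import marker in the git-annex
branch yet:

    updateIfNewerImport :: CIDDb.ContentIdentifierHandle -> Annex ()
    updateIfNewerImport db = do
        -- hypothetical: an import generation recorded in the git-annex branch
        latest <- getImportGeneration
        -- hypothetical: the last generation folded into this database
        seen <- liftIO $ getSeenImportGeneration db
        when (latest > seen) $ do
            updateContentIdentifierDbFromBranch db
            liftIO $ setSeenImportGeneration db latest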
Joey Hess 2019-03-06 18:04:30 -04:00
parent 12e4906657
commit ee251b2e2e
4 changed files with 72 additions and 24 deletions


@@ -25,7 +25,9 @@ import Git.Sha
import Git.FilePath
import qualified Git.Ref
import qualified Git.Branch
import qualified Git.DiffTree as DiffTree
import qualified Annex
import qualified Annex.Branch
import Annex.Link
import Annex.LockFile
import Annex.Content
@@ -36,11 +38,12 @@ import Types.Key
import Types.KeySource
import Utility.Metered
import Utility.DataUnits
import Logs
import Logs.Export
import Logs.ContentIdentifier
import Logs.Location
import qualified Database.Export as Export
import qualified Database.ContentIdentifier as CID
import qualified Database.ContentIdentifier as CIDDb
import qualified Logs.ContentIdentifier as CIDLog
import Control.Concurrent.STM
import qualified Data.Map.Strict as M
@@ -203,7 +206,8 @@ buildImportTrees basetree msubdir importable = History
        linksha <- hashSymlink symlink
        return $ TreeItem treepath (fromTreeItemType TreeSymlink) linksha

{- Downloads all new ContentIdentifiers. Supports concurrency when enabled.
{- Downloads all new ContentIdentifiers as needed to generate Keys.
 - Supports concurrency when enabled.
 -
 - If any download fails, the whole thing fails, but it will resume where
 - it left off.
@@ -218,7 +222,9 @@ downloadImport remote importtreeconfig importablecontents = do
    -- they will only be downloaded once.
    cidmap <- liftIO $ newTVarIO M.empty
    withExclusiveLock gitAnnexContentIdentifierLock $
        bracket CID.openDb CID.closeDb (go cidmap importablecontents)
        bracket CIDDb.openDb CIDDb.closeDb $ \db -> do
            updateContentIdentifierDbFromBranch db
            go cidmap importablecontents db
    -- TODO really support concurrency; avoid downloading the same
    -- ContentIdentifier twice.
  where
@@ -270,7 +276,7 @@ downloadImport remote importtreeconfig importablecontents = do
        getTopFilePath subdir </> fromImportLocation loc
    getcidkey cidmap db cid = liftIO $
        CID.getContentIdentifierKeys db (Remote.uuid remote) cid >>= \case
        CIDDb.getContentIdentifierKeys db (Remote.uuid remote) cid >>= \case
            [] -> atomically $
                maybeToList . M.lookup cid <$> readTVar cidmap
            l -> return l
@@ -278,8 +284,8 @@ downloadImport remote importtreeconfig importablecontents = do
    recordcidkey cidmap db cid k = do
        liftIO $ atomically $ modifyTVar' cidmap $
            M.insert cid k
        liftIO $ CID.recordContentIdentifier db (Remote.uuid remote) cid k
        recordContentIdentifier (Remote.uuid remote) cid k
        liftIO $ CIDDb.recordContentIdentifier db (Remote.uuid remote) cid k
        CIDLog.recordContentIdentifier (Remote.uuid remote) cid k

{- Temporary key used for import of a ContentIdentifier while downloading
 - content, before generating its real key. -}
@@ -289,3 +295,31 @@ importKey (ContentIdentifier cid) size = stubKey
    , keyVariety = OtherKey "CID"
    , keySize = Just size
    }

{- Updates the ContentIdentifier database with information from the
 - git-annex branch. This way, ContentIdentifiers that have been imported
 - in other clones of the repository will be known, and not unnecessarily
 - downloaded again.
 -
 - The database should already be locked for write.
 -}
updateContentIdentifierDbFromBranch :: CIDDb.ContentIdentifierHandle -> Annex ()
updateContentIdentifierDbFromBranch db = do
    oldtree <- liftIO $ CIDDb.getAnnexBranchTree db
    inRepo (Git.Ref.tree Annex.Branch.fullname) >>= \case
        Just t | t /= oldtree -> do
            (l, cleanup) <- inRepo $ DiffTree.diffTree oldtree t
            mapM_ go l
            void $ liftIO $ cleanup
            liftIO $ do
                CIDDb.recordAnnexBranchTree db t
                CIDDb.flushDbQueue db
        _ -> return ()
  where
    go ti = case extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti)) of
        Nothing -> return ()
        Just k -> do
            l <- CIDLog.getContentIdentifiers k
            liftIO $ forM_ l $ \(u, cids) ->
                forM_ cids $ \cid ->
                    CIDDb.recordContentIdentifier db u cid k