implement updating the ContentIdentifier db with info from the git-annex branch

untested

This won't be super slow, but it does need to diff two likely large
trees, and since the git-annex branch rarely sits still, it will most
likely be run at the beginning of every import.

A possible speed improvement would be to only run this when the database
did not contain a ContentIdentifier. But that would only speed up
imports when there is no new version of a file on the special remote,
at most renames of existing files being imported.

A better speed improvement would be to record something in the git-annex
branch that indicates when an import has been run, and only do the diff
if the git-annex branch has record of a newer import than we've seen
before. Then, it would only run when there is in fact new
ContentIdentifier information available from a remote. Certianly doable,
but didn't want to complicate things yet.
This commit is contained in:
Joey Hess 2019-03-06 18:04:30 -04:00
parent 12e4906657
commit ee251b2e2e
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 72 additions and 24 deletions

View file

@ -25,7 +25,9 @@ import Git.Sha
import Git.FilePath import Git.FilePath
import qualified Git.Ref import qualified Git.Ref
import qualified Git.Branch import qualified Git.Branch
import qualified Git.DiffTree as DiffTree
import qualified Annex import qualified Annex
import qualified Annex.Branch
import Annex.Link import Annex.Link
import Annex.LockFile import Annex.LockFile
import Annex.Content import Annex.Content
@ -36,11 +38,12 @@ import Types.Key
import Types.KeySource import Types.KeySource
import Utility.Metered import Utility.Metered
import Utility.DataUnits import Utility.DataUnits
import Logs
import Logs.Export import Logs.Export
import Logs.ContentIdentifier
import Logs.Location import Logs.Location
import qualified Database.Export as Export import qualified Database.Export as Export
import qualified Database.ContentIdentifier as CID import qualified Database.ContentIdentifier as CIDDb
import qualified Logs.ContentIdentifier as CIDLog
import Control.Concurrent.STM import Control.Concurrent.STM
import qualified Data.Map.Strict as M import qualified Data.Map.Strict as M
@ -203,7 +206,8 @@ buildImportTrees basetree msubdir importable = History
linksha <- hashSymlink symlink linksha <- hashSymlink symlink
return $ TreeItem treepath (fromTreeItemType TreeSymlink) linksha return $ TreeItem treepath (fromTreeItemType TreeSymlink) linksha
{- Downloads all new ContentIdentifiers. Supports concurrency when enabled. {- Downloads all new ContentIdentifiers as needed to generate Keys.
- Supports concurrency when enabled.
- -
- If any download fails, the whole thing fails, but it will resume where - If any download fails, the whole thing fails, but it will resume where
- it left off. - it left off.
@ -218,7 +222,9 @@ downloadImport remote importtreeconfig importablecontents = do
-- they will only be downloaded once. -- they will only be downloaded once.
cidmap <- liftIO $ newTVarIO M.empty cidmap <- liftIO $ newTVarIO M.empty
withExclusiveLock gitAnnexContentIdentifierLock $ withExclusiveLock gitAnnexContentIdentifierLock $
bracket CID.openDb CID.closeDb (go cidmap importablecontents) bracket CIDDb.openDb CIDDb.closeDb $ \db -> do
updateContentIdentifierDbFromBranch db
go cidmap importablecontents db
-- TODO really support concurrency; avoid donwloading the same -- TODO really support concurrency; avoid donwloading the same
-- ContentIdentifier twice. -- ContentIdentifier twice.
where where
@ -270,7 +276,7 @@ downloadImport remote importtreeconfig importablecontents = do
getTopFilePath subdir </> fromImportLocation loc getTopFilePath subdir </> fromImportLocation loc
getcidkey cidmap db cid = liftIO $ getcidkey cidmap db cid = liftIO $
CID.getContentIdentifierKeys db (Remote.uuid remote) cid >>= \case CIDDb.getContentIdentifierKeys db (Remote.uuid remote) cid >>= \case
[] -> atomically $ [] -> atomically $
maybeToList . M.lookup cid <$> readTVar cidmap maybeToList . M.lookup cid <$> readTVar cidmap
l -> return l l -> return l
@ -278,8 +284,8 @@ downloadImport remote importtreeconfig importablecontents = do
recordcidkey cidmap db cid k = do recordcidkey cidmap db cid k = do
liftIO $ atomically $ modifyTVar' cidmap $ liftIO $ atomically $ modifyTVar' cidmap $
M.insert cid k M.insert cid k
liftIO $ CID.recordContentIdentifier db (Remote.uuid remote) cid k liftIO $ CIDDb.recordContentIdentifier db (Remote.uuid remote) cid k
recordContentIdentifier (Remote.uuid remote) cid k CIDLog.recordContentIdentifier (Remote.uuid remote) cid k
{- Temporary key used for import of a ContentIdentifier while downloading {- Temporary key used for import of a ContentIdentifier while downloading
- content, before generating its real key. -} - content, before generating its real key. -}
@ -289,3 +295,31 @@ importKey (ContentIdentifier cid) size = stubKey
, keyVariety = OtherKey "CID" , keyVariety = OtherKey "CID"
, keySize = Just size , keySize = Just size
} }
{- Updates the ContentIdentifier database with information from the
- git-annex branch. This way, ContentIdentifiers that have been imported
- in other clones of the repository will be known, and not unncessarily
- downloaded again.
-
- The database should already be locked for write.
-}
updateContentIdentifierDbFromBranch :: CIDDb.ContentIdentifierHandle -> Annex ()
updateContentIdentifierDbFromBranch db = do
oldtree <- liftIO $ CIDDb.getAnnexBranchTree db
inRepo (Git.Ref.tree Annex.Branch.fullname) >>= \case
Just t | t /= oldtree -> do
(l, cleanup) <- inRepo $ DiffTree.diffTree oldtree t
mapM_ go l
void $ liftIO $ cleanup
liftIO $ do
CIDDb.recordAnnexBranchTree db t
CIDDb.flushDbQueue db
_ -> return ()
where
go ti = case extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti)) of
Nothing -> return ()
Just k -> do
l <- CIDLog.getContentIdentifiers k
liftIO $ forM_ l $ \(u, cids) ->
forM_ cids $ \cid ->
CIDDb.recordContentIdentifier db u cid k

View file

@ -22,7 +22,10 @@ module Database.ContentIdentifier (
recordContentIdentifier, recordContentIdentifier,
getContentIdentifiers, getContentIdentifiers,
getContentIdentifierKeys, getContentIdentifierKeys,
recordAnnexBranchTree,
getAnnexBranchTree,
ContentIdentifiersId, ContentIdentifiersId,
AnnexBranchId,
) where ) where
import Database.Types import Database.Types
@ -31,6 +34,8 @@ import Database.Init
import Annex.Locations import Annex.Locations
import Annex.Common hiding (delete) import Annex.Common hiding (delete)
import Types.Import import Types.Import
import Git.Types
import Git.Sha
import Database.Persist.Sql hiding (Key) import Database.Persist.Sql hiding (Key)
import Database.Persist.TH import Database.Persist.TH
@ -45,6 +50,11 @@ ContentIdentifiers
ContentIdentifiersIndexRemoteKey remote key ContentIdentifiersIndexRemoteKey remote key
ContentIdentifiersIndexRemoteCID remote cid ContentIdentifiersIndexRemoteCID remote cid
UniqueRemoteCidKey remote cid key UniqueRemoteCidKey remote cid key
-- The last git-annex branch tree sha that was used to update
-- ContentIdentifiers
AnnexBranch
tree SRef
UniqueTree tree
|] |]
{- Opens the database, creating it if it doesn't exist yet. {- Opens the database, creating it if it doesn't exist yet.
@ -97,3 +107,15 @@ getContentIdentifierKeys (ContentIdentifierHandle h) u cid =
, ContentIdentifiersRemote ==. u , ContentIdentifiersRemote ==. u
] [] ] []
return $ map (fromIKey . contentIdentifiersKey . entityVal) l return $ map (fromIKey . contentIdentifiersKey . entityVal) l
recordAnnexBranchTree :: ContentIdentifierHandle -> Sha -> IO ()
recordAnnexBranchTree h s = queueDb h $ do
deleteWhere ([] :: [Filter AnnexBranch])
void $ insertUnique $ AnnexBranch $ toSRef s
getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha
getAnnexBranchTree (ContentIdentifierHandle h) = H.queryDbQueue h $ do
l <- selectList ([] :: [Filter AnnexBranch]) []
case l of
(s:[]) -> return $ fromSRef $ annexBranchTree $ entityVal s
_ -> return emptyTree

View file

@ -21,6 +21,7 @@ import qualified Annex
import qualified Data.Map as M import qualified Data.Map as M
import Data.List.NonEmpty (NonEmpty(..)) import Data.List.NonEmpty (NonEmpty(..))
import qualified Data.List.NonEmpty as NonEmpty
-- | Records a remote's content identifier and the key that it corresponds to. -- | Records a remote's content identifier and the key that it corresponds to.
-- --
@ -37,9 +38,10 @@ recordContentIdentifier u cid k = do
where where
m = simpleMap l m = simpleMap l
-- | Get all content identifiers that a remote is known to use for a key. -- | Get all known content identifiers for a key.
getContentIdentifiers :: UUID -> Key -> Annex [ContentIdentifier] getContentIdentifiers :: Key -> Annex [(UUID, [ContentIdentifier])]
getContentIdentifiers u k = do getContentIdentifiers k = do
config <- Annex.getGitConfig config <- Annex.getGitConfig
contentIdentifierList . M.lookup u . simpleMap . parseLog map (\(u, l) -> (u, NonEmpty.toList l) )
. M.toList . simpleMap . parseLog
<$> Annex.Branch.get (remoteContentIdentifierLogFile config k) <$> Annex.Branch.get (remoteContentIdentifierLogFile config k)

View file

@ -17,19 +17,9 @@ this.
* Need to support annex-tracking-branch configuration, which documentation * Need to support annex-tracking-branch configuration, which documentation
says makes git-annex sync and assistant do imports. says makes git-annex sync and assistant do imports.
* Database.ContentIdentifier needs a way to update the database with * Test behavior when multiple repos import from same special remote;
information coming from the git-annex branch. This will allow multiple the second importer should not re-download as long as it has pulled
clones to import from the same remote, and share content identifier from the first importer.
information amoung them.
It will only need to be updated when listContents returns a
ContentIdentifier that is not already known in the database.
How to do the update: Stash the ref of the last git-annex branch it's
updated from in the database. Diff between that ref and the current
git-annex branch. For each file in the diff that's a .cid file, read
the file from the branch, and store into the database.
Update the stashed ref.
* When on an adjusted unlocked branch, need to import the files unlocked. * When on an adjusted unlocked branch, need to import the files unlocked.
Also, the tracking branch code needs to know about such branches, Also, the tracking branch code needs to know about such branches,