implement updating the ContentIdentifier db with info from the git-annex branch

untested

This won't be super slow, but it does need to diff two likely large
trees, and since the git-annex branch rarely sits still, it will most
likely be run at the beginning of every import.

A possible speed improvement would be to only run this when the database
does not contain a ContentIdentifier. But that would only speed up
imports when there is no new version of any file on the special remote,
with at most renames of existing files being imported.
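
For illustration, a minimal sketch of how that check could look, reusing
the database accessor the import code already calls; updateIfUnknown is a
hypothetical helper name, not something this commit adds:

    -- Sketch only: updateIfUnknown is hypothetical. It consults the
    -- local database first and only falls back to the branch diff when
    -- a listed ContentIdentifier is not already known.
    updateIfUnknown :: CIDDb.ContentIdentifierHandle -> Remote -> ContentIdentifier -> Annex ()
    updateIfUnknown db remote cid = do
        known <- liftIO $ CIDDb.getContentIdentifierKeys db (Remote.uuid remote) cid
        when (null known) $
            updateContentIdentifierDbFromBranch db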

A better speed improvement would be to record something in the git-annex
branch that indicates when an import has been run, and only do the diff
if the git-annex branch has a record of a newer import than we've seen
before. Then, it would only run when there is in fact new
ContentIdentifier information available from a remote. Certainly doable,
but didn't want to complicate things yet.
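
A rough sketch of that idea, purely illustrative: readLatestImport and
CIDDb.getLastImport below are made-up names, standing in for an import
record kept in the git-annex branch and the last value folded into the
database; neither exists in this commit.

    -- Sketch only; both helpers below are hypothetical.
    updateWhenNewerImport :: CIDDb.ContentIdentifierHandle -> Remote -> Annex ()
    updateWhenNewerImport db remote = do
        inbranch <- readLatestImport (Remote.uuid remote)
        indb <- liftIO $ CIDDb.getLastImport db (Remote.uuid remote)
        -- Only pay for the tree diff when the branch records an import
        -- newer than the one this database has already processed.
        when (inbranch > indb) $
            updateContentIdentifierDbFromBranch db
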
Joey Hess 2019-03-06 18:04:30 -04:00
parent 12e4906657
commit ee251b2e2e
4 changed files with 72 additions and 24 deletions


@@ -25,7 +25,9 @@ import Git.Sha
import Git.FilePath
import qualified Git.Ref
import qualified Git.Branch
import qualified Git.DiffTree as DiffTree
import qualified Annex
import qualified Annex.Branch
import Annex.Link
import Annex.LockFile
import Annex.Content
@@ -36,11 +38,12 @@ import Types.Key
import Types.KeySource
import Utility.Metered
import Utility.DataUnits
import Logs
import Logs.Export
import Logs.ContentIdentifier
import Logs.Location
import qualified Database.Export as Export
import qualified Database.ContentIdentifier as CID
import qualified Database.ContentIdentifier as CIDDb
import qualified Logs.ContentIdentifier as CIDLog
import Control.Concurrent.STM
import qualified Data.Map.Strict as M
@@ -203,7 +206,8 @@ buildImportTrees basetree msubdir importable = History
linksha <- hashSymlink symlink
return $ TreeItem treepath (fromTreeItemType TreeSymlink) linksha
{- Downloads all new ContentIdentifiers. Supports concurrency when enabled.
{- Downloads all new ContentIdentifiers as needed to generate Keys.
- Supports concurrency when enabled.
-
- If any download fails, the whole thing fails, but it will resume where
- it left off.
@@ -218,7 +222,9 @@ downloadImport remote importtreeconfig importablecontents = do
-- they will only be downloaded once.
cidmap <- liftIO $ newTVarIO M.empty
withExclusiveLock gitAnnexContentIdentifierLock $
bracket CID.openDb CID.closeDb (go cidmap importablecontents)
bracket CIDDb.openDb CIDDb.closeDb $ \db -> do
updateContentIdentifierDbFromBranch db
go cidmap importablecontents db
-- TODO really support concurrency; avoid downloading the same
-- ContentIdentifier twice.
where
@@ -270,7 +276,7 @@ downloadImport remote importtreeconfig importablecontents = do
getTopFilePath subdir </> fromImportLocation loc
getcidkey cidmap db cid = liftIO $
CID.getContentIdentifierKeys db (Remote.uuid remote) cid >>= \case
CIDDb.getContentIdentifierKeys db (Remote.uuid remote) cid >>= \case
[] -> atomically $
maybeToList . M.lookup cid <$> readTVar cidmap
l -> return l
@@ -278,8 +284,8 @@ downloadImport remote importtreeconfig importablecontents = do
recordcidkey cidmap db cid k = do
liftIO $ atomically $ modifyTVar' cidmap $
M.insert cid k
liftIO $ CID.recordContentIdentifier db (Remote.uuid remote) cid k
recordContentIdentifier (Remote.uuid remote) cid k
liftIO $ CIDDb.recordContentIdentifier db (Remote.uuid remote) cid k
CIDLog.recordContentIdentifier (Remote.uuid remote) cid k
{- Temporary key used for import of a ContentIdentifier while downloading
- content, before generating its real key. -}
@@ -289,3 +295,31 @@ importKey (ContentIdentifier cid) size = stubKey
, keyVariety = OtherKey "CID"
, keySize = Just size
}
{- Updates the ContentIdentifier database with information from the
- git-annex branch. This way, ContentIdentifiers that have been imported
- in other clones of the repository will be known, and not unnecessarily
- downloaded again.
-
- The database should already be locked for write.
-}
updateContentIdentifierDbFromBranch :: CIDDb.ContentIdentifierHandle -> Annex ()
updateContentIdentifierDbFromBranch db = do
oldtree <- liftIO $ CIDDb.getAnnexBranchTree db
inRepo (Git.Ref.tree Annex.Branch.fullname) >>= \case
Just t | t /= oldtree -> do
(l, cleanup) <- inRepo $ DiffTree.diffTree oldtree t
mapM_ go l
void $ liftIO $ cleanup
liftIO $ do
CIDDb.recordAnnexBranchTree db t
CIDDb.flushDbQueue db
_ -> return ()
where
go ti = case extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti)) of
Nothing -> return ()
Just k -> do
l <- CIDLog.getContentIdentifiers k
liftIO $ forM_ l $ \(u, cids) ->
forM_ cids $ \cid ->
CIDDb.recordContentIdentifier db u cid k


@@ -22,7 +22,10 @@ module Database.ContentIdentifier (
recordContentIdentifier,
getContentIdentifiers,
getContentIdentifierKeys,
recordAnnexBranchTree,
getAnnexBranchTree,
ContentIdentifiersId,
AnnexBranchId,
) where
import Database.Types
@@ -31,6 +34,8 @@ import Database.Init
import Annex.Locations
import Annex.Common hiding (delete)
import Types.Import
import Git.Types
import Git.Sha
import Database.Persist.Sql hiding (Key)
import Database.Persist.TH
@@ -45,6 +50,11 @@ ContentIdentifiers
ContentIdentifiersIndexRemoteKey remote key
ContentIdentifiersIndexRemoteCID remote cid
UniqueRemoteCidKey remote cid key
-- The last git-annex branch tree sha that was used to update
-- ContentIdentifiers
AnnexBranch
tree SRef
UniqueTree tree
|]
{- Opens the database, creating it if it doesn't exist yet.
@@ -97,3 +107,15 @@ getContentIdentifierKeys (ContentIdentifierHandle h) u cid =
, ContentIdentifiersRemote ==. u
] []
return $ map (fromIKey . contentIdentifiersKey . entityVal) l
recordAnnexBranchTree :: ContentIdentifierHandle -> Sha -> IO ()
recordAnnexBranchTree h s = queueDb h $ do
deleteWhere ([] :: [Filter AnnexBranch])
void $ insertUnique $ AnnexBranch $ toSRef s
getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha
getAnnexBranchTree (ContentIdentifierHandle h) = H.queryDbQueue h $ do
l <- selectList ([] :: [Filter AnnexBranch]) []
case l of
(s:[]) -> return $ fromSRef $ annexBranchTree $ entityVal s
_ -> return emptyTree


@@ -21,6 +21,7 @@ import qualified Annex
import qualified Data.Map as M
import Data.List.NonEmpty (NonEmpty(..))
import qualified Data.List.NonEmpty as NonEmpty
-- | Records a remote's content identifier and the key that it corresponds to.
--
@@ -37,9 +38,10 @@ recordContentIdentifier u cid k = do
where
m = simpleMap l
-- | Get all content identifiers that a remote is known to use for a key.
getContentIdentifiers :: UUID -> Key -> Annex [ContentIdentifier]
getContentIdentifiers u k = do
-- | Get all known content identifiers for a key.
getContentIdentifiers :: Key -> Annex [(UUID, [ContentIdentifier])]
getContentIdentifiers k = do
config <- Annex.getGitConfig
contentIdentifierList . M.lookup u . simpleMap . parseLog
map (\(u, l) -> (u, NonEmpty.toList l) )
. M.toList . simpleMap . parseLog
<$> Annex.Branch.get (remoteContentIdentifierLogFile config k)


@@ -17,19 +17,9 @@ this.
* Need to support annex-tracking-branch configuration, which documentation
says makes git-annex sync and assistant do imports.
* Database.ContentIdentifier needs a way to update the database with
information coming from the git-annex branch. This will allow multiple
clones to import from the same remote, and share content identifier
information among them.
It will only need to be updated when listContents returns a
ContentIdentifier that is not already known in the database.
How to do the update: Stash the ref of the last git-annex branch it's
updated from in the database. Diff between that ref and the current
git-annex branch. For each file in the diff that's a .cid file, read
the file from the branch, and store into the database.
Update the stashed ref.
* Test behavior when multiple repos import from same special remote;
the second importer should not re-download as long as it has pulled
from the first importer.
* When on an adjusted unlocked branch, need to import the files unlocked.
Also, the tracking branch code needs to know about such branches,