From ee251b2e2e215700b3aba7b4a242da60011f0516 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 6 Mar 2019 18:04:30 -0400 Subject: [PATCH] implement updating the ContentIdentifier db with info from the git-annex branch untested This won't be super slow, but it does need to diff two likely large trees, and since the git-annex branch rarely sits still, it will most likely be run at the beginning of every import. A possible speed improvement would be to only run this when the database did not contain a ContentIdentifier. But that would only speed up imports when there is no new version of a file on the special remote, at most renames of existing files being imported. A better speed improvement would be to record something in the git-annex branch that indicates when an import has been run, and only do the diff if the git-annex branch has record of a newer import than we've seen before. Then, it would only run when there is in fact new ContentIdentifier information available from a remote. Certianly doable, but didn't want to complicate things yet. --- Annex/Import.hs | 48 ++++++++++++++++++++++++++++++----- Database/ContentIdentifier.hs | 22 ++++++++++++++++ Logs/ContentIdentifier.hs | 10 +++++--- doc/todo/import_tree.mdwn | 16 +++--------- 4 files changed, 72 insertions(+), 24 deletions(-) diff --git a/Annex/Import.hs b/Annex/Import.hs index d9c85d88cc..2630d12d85 100644 --- a/Annex/Import.hs +++ b/Annex/Import.hs @@ -25,7 +25,9 @@ import Git.Sha import Git.FilePath import qualified Git.Ref import qualified Git.Branch +import qualified Git.DiffTree as DiffTree import qualified Annex +import qualified Annex.Branch import Annex.Link import Annex.LockFile import Annex.Content @@ -36,11 +38,12 @@ import Types.Key import Types.KeySource import Utility.Metered import Utility.DataUnits +import Logs import Logs.Export -import Logs.ContentIdentifier import Logs.Location import qualified Database.Export as Export -import qualified Database.ContentIdentifier as CID +import qualified Database.ContentIdentifier as CIDDb +import qualified Logs.ContentIdentifier as CIDLog import Control.Concurrent.STM import qualified Data.Map.Strict as M @@ -203,7 +206,8 @@ buildImportTrees basetree msubdir importable = History linksha <- hashSymlink symlink return $ TreeItem treepath (fromTreeItemType TreeSymlink) linksha -{- Downloads all new ContentIdentifiers. Supports concurrency when enabled. +{- Downloads all new ContentIdentifiers as needed to generate Keys. + - Supports concurrency when enabled. - - If any download fails, the whole thing fails, but it will resume where - it left off. @@ -218,7 +222,9 @@ downloadImport remote importtreeconfig importablecontents = do -- they will only be downloaded once. cidmap <- liftIO $ newTVarIO M.empty withExclusiveLock gitAnnexContentIdentifierLock $ - bracket CID.openDb CID.closeDb (go cidmap importablecontents) + bracket CIDDb.openDb CIDDb.closeDb $ \db -> do + updateContentIdentifierDbFromBranch db + go cidmap importablecontents db -- TODO really support concurrency; avoid donwloading the same -- ContentIdentifier twice. where @@ -270,7 +276,7 @@ downloadImport remote importtreeconfig importablecontents = do getTopFilePath subdir fromImportLocation loc getcidkey cidmap db cid = liftIO $ - CID.getContentIdentifierKeys db (Remote.uuid remote) cid >>= \case + CIDDb.getContentIdentifierKeys db (Remote.uuid remote) cid >>= \case [] -> atomically $ maybeToList . M.lookup cid <$> readTVar cidmap l -> return l @@ -278,8 +284,8 @@ downloadImport remote importtreeconfig importablecontents = do recordcidkey cidmap db cid k = do liftIO $ atomically $ modifyTVar' cidmap $ M.insert cid k - liftIO $ CID.recordContentIdentifier db (Remote.uuid remote) cid k - recordContentIdentifier (Remote.uuid remote) cid k + liftIO $ CIDDb.recordContentIdentifier db (Remote.uuid remote) cid k + CIDLog.recordContentIdentifier (Remote.uuid remote) cid k {- Temporary key used for import of a ContentIdentifier while downloading - content, before generating its real key. -} @@ -289,3 +295,31 @@ importKey (ContentIdentifier cid) size = stubKey , keyVariety = OtherKey "CID" , keySize = Just size } + +{- Updates the ContentIdentifier database with information from the + - git-annex branch. This way, ContentIdentifiers that have been imported + - in other clones of the repository will be known, and not unncessarily + - downloaded again. + - + - The database should already be locked for write. + -} +updateContentIdentifierDbFromBranch :: CIDDb.ContentIdentifierHandle -> Annex () +updateContentIdentifierDbFromBranch db = do + oldtree <- liftIO $ CIDDb.getAnnexBranchTree db + inRepo (Git.Ref.tree Annex.Branch.fullname) >>= \case + Just t | t /= oldtree -> do + (l, cleanup) <- inRepo $ DiffTree.diffTree oldtree t + mapM_ go l + void $ liftIO $ cleanup + liftIO $ do + CIDDb.recordAnnexBranchTree db t + CIDDb.flushDbQueue db + _ -> return () + where + go ti = case extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti)) of + Nothing -> return () + Just k -> do + l <- CIDLog.getContentIdentifiers k + liftIO $ forM_ l $ \(u, cids) -> + forM_ cids $ \cid -> + CIDDb.recordContentIdentifier db u cid k diff --git a/Database/ContentIdentifier.hs b/Database/ContentIdentifier.hs index 24cdd00e23..0209fbfc7e 100644 --- a/Database/ContentIdentifier.hs +++ b/Database/ContentIdentifier.hs @@ -22,7 +22,10 @@ module Database.ContentIdentifier ( recordContentIdentifier, getContentIdentifiers, getContentIdentifierKeys, + recordAnnexBranchTree, + getAnnexBranchTree, ContentIdentifiersId, + AnnexBranchId, ) where import Database.Types @@ -31,6 +34,8 @@ import Database.Init import Annex.Locations import Annex.Common hiding (delete) import Types.Import +import Git.Types +import Git.Sha import Database.Persist.Sql hiding (Key) import Database.Persist.TH @@ -45,6 +50,11 @@ ContentIdentifiers ContentIdentifiersIndexRemoteKey remote key ContentIdentifiersIndexRemoteCID remote cid UniqueRemoteCidKey remote cid key +-- The last git-annex branch tree sha that was used to update +-- ContentIdentifiers +AnnexBranch + tree SRef + UniqueTree tree |] {- Opens the database, creating it if it doesn't exist yet. @@ -97,3 +107,15 @@ getContentIdentifierKeys (ContentIdentifierHandle h) u cid = , ContentIdentifiersRemote ==. u ] [] return $ map (fromIKey . contentIdentifiersKey . entityVal) l + +recordAnnexBranchTree :: ContentIdentifierHandle -> Sha -> IO () +recordAnnexBranchTree h s = queueDb h $ do + deleteWhere ([] :: [Filter AnnexBranch]) + void $ insertUnique $ AnnexBranch $ toSRef s + +getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha +getAnnexBranchTree (ContentIdentifierHandle h) = H.queryDbQueue h $ do + l <- selectList ([] :: [Filter AnnexBranch]) [] + case l of + (s:[]) -> return $ fromSRef $ annexBranchTree $ entityVal s + _ -> return emptyTree diff --git a/Logs/ContentIdentifier.hs b/Logs/ContentIdentifier.hs index b59f7a962e..d3e6a50b9b 100644 --- a/Logs/ContentIdentifier.hs +++ b/Logs/ContentIdentifier.hs @@ -21,6 +21,7 @@ import qualified Annex import qualified Data.Map as M import Data.List.NonEmpty (NonEmpty(..)) +import qualified Data.List.NonEmpty as NonEmpty -- | Records a remote's content identifier and the key that it corresponds to. -- @@ -37,9 +38,10 @@ recordContentIdentifier u cid k = do where m = simpleMap l --- | Get all content identifiers that a remote is known to use for a key. -getContentIdentifiers :: UUID -> Key -> Annex [ContentIdentifier] -getContentIdentifiers u k = do +-- | Get all known content identifiers for a key. +getContentIdentifiers :: Key -> Annex [(UUID, [ContentIdentifier])] +getContentIdentifiers k = do config <- Annex.getGitConfig - contentIdentifierList . M.lookup u . simpleMap . parseLog + map (\(u, l) -> (u, NonEmpty.toList l) ) + . M.toList . simpleMap . parseLog <$> Annex.Branch.get (remoteContentIdentifierLogFile config k) diff --git a/doc/todo/import_tree.mdwn b/doc/todo/import_tree.mdwn index 32cfeebcb8..52fe4e4935 100644 --- a/doc/todo/import_tree.mdwn +++ b/doc/todo/import_tree.mdwn @@ -17,19 +17,9 @@ this. * Need to support annex-tracking-branch configuration, which documentation says makes git-annex sync and assistant do imports. -* Database.ContentIdentifier needs a way to update the database with - information coming from the git-annex branch. This will allow multiple - clones to import from the same remote, and share content identifier - information amoung them. - - It will only need to be updated when listContents returns a - ContentIdentifier that is not already known in the database. - - How to do the update: Stash the ref of the last git-annex branch it's - updated from in the database. Diff between that ref and the current - git-annex branch. For each file in the diff that's a .cid file, read - the file from the branch, and store into the database. - Update the stashed ref. +* Test behavior when multiple repos import from same special remote; + the second importer should not re-download as long as it has pulled + from the first importer. * When on an adjusted unlocked branch, need to import the files unlocked. Also, the tracking branch code needs to know about such branches,