implement updating the ContentIdentifier db with info from the git-annex branch
untested This won't be super slow, but it does need to diff two likely large trees, and since the git-annex branch rarely sits still, it will most likely be run at the beginning of every import. A possible speed improvement would be to only run this when the database did not contain a ContentIdentifier. But that would only speed up imports when there is no new version of a file on the special remote, at most renames of existing files being imported. A better speed improvement would be to record something in the git-annex branch that indicates when an import has been run, and only do the diff if the git-annex branch has record of a newer import than we've seen before. Then, it would only run when there is in fact new ContentIdentifier information available from a remote. Certianly doable, but didn't want to complicate things yet.
This commit is contained in:
parent
12e4906657
commit
ee251b2e2e
4 changed files with 72 additions and 24 deletions
|
@ -25,7 +25,9 @@ import Git.Sha
|
|||
import Git.FilePath
|
||||
import qualified Git.Ref
|
||||
import qualified Git.Branch
|
||||
import qualified Git.DiffTree as DiffTree
|
||||
import qualified Annex
|
||||
import qualified Annex.Branch
|
||||
import Annex.Link
|
||||
import Annex.LockFile
|
||||
import Annex.Content
|
||||
|
@ -36,11 +38,12 @@ import Types.Key
|
|||
import Types.KeySource
|
||||
import Utility.Metered
|
||||
import Utility.DataUnits
|
||||
import Logs
|
||||
import Logs.Export
|
||||
import Logs.ContentIdentifier
|
||||
import Logs.Location
|
||||
import qualified Database.Export as Export
|
||||
import qualified Database.ContentIdentifier as CID
|
||||
import qualified Database.ContentIdentifier as CIDDb
|
||||
import qualified Logs.ContentIdentifier as CIDLog
|
||||
|
||||
import Control.Concurrent.STM
|
||||
import qualified Data.Map.Strict as M
|
||||
|
@ -203,7 +206,8 @@ buildImportTrees basetree msubdir importable = History
|
|||
linksha <- hashSymlink symlink
|
||||
return $ TreeItem treepath (fromTreeItemType TreeSymlink) linksha
|
||||
|
||||
{- Downloads all new ContentIdentifiers. Supports concurrency when enabled.
|
||||
{- Downloads all new ContentIdentifiers as needed to generate Keys.
|
||||
- Supports concurrency when enabled.
|
||||
-
|
||||
- If any download fails, the whole thing fails, but it will resume where
|
||||
- it left off.
|
||||
|
@ -218,7 +222,9 @@ downloadImport remote importtreeconfig importablecontents = do
|
|||
-- they will only be downloaded once.
|
||||
cidmap <- liftIO $ newTVarIO M.empty
|
||||
withExclusiveLock gitAnnexContentIdentifierLock $
|
||||
bracket CID.openDb CID.closeDb (go cidmap importablecontents)
|
||||
bracket CIDDb.openDb CIDDb.closeDb $ \db -> do
|
||||
updateContentIdentifierDbFromBranch db
|
||||
go cidmap importablecontents db
|
||||
-- TODO really support concurrency; avoid donwloading the same
|
||||
-- ContentIdentifier twice.
|
||||
where
|
||||
|
@ -270,7 +276,7 @@ downloadImport remote importtreeconfig importablecontents = do
|
|||
getTopFilePath subdir </> fromImportLocation loc
|
||||
|
||||
getcidkey cidmap db cid = liftIO $
|
||||
CID.getContentIdentifierKeys db (Remote.uuid remote) cid >>= \case
|
||||
CIDDb.getContentIdentifierKeys db (Remote.uuid remote) cid >>= \case
|
||||
[] -> atomically $
|
||||
maybeToList . M.lookup cid <$> readTVar cidmap
|
||||
l -> return l
|
||||
|
@ -278,8 +284,8 @@ downloadImport remote importtreeconfig importablecontents = do
|
|||
recordcidkey cidmap db cid k = do
|
||||
liftIO $ atomically $ modifyTVar' cidmap $
|
||||
M.insert cid k
|
||||
liftIO $ CID.recordContentIdentifier db (Remote.uuid remote) cid k
|
||||
recordContentIdentifier (Remote.uuid remote) cid k
|
||||
liftIO $ CIDDb.recordContentIdentifier db (Remote.uuid remote) cid k
|
||||
CIDLog.recordContentIdentifier (Remote.uuid remote) cid k
|
||||
|
||||
{- Temporary key used for import of a ContentIdentifier while downloading
|
||||
- content, before generating its real key. -}
|
||||
|
@ -289,3 +295,31 @@ importKey (ContentIdentifier cid) size = stubKey
|
|||
, keyVariety = OtherKey "CID"
|
||||
, keySize = Just size
|
||||
}
|
||||
|
||||
{- Updates the ContentIdentifier database with information from the
|
||||
- git-annex branch. This way, ContentIdentifiers that have been imported
|
||||
- in other clones of the repository will be known, and not unncessarily
|
||||
- downloaded again.
|
||||
-
|
||||
- The database should already be locked for write.
|
||||
-}
|
||||
updateContentIdentifierDbFromBranch :: CIDDb.ContentIdentifierHandle -> Annex ()
|
||||
updateContentIdentifierDbFromBranch db = do
|
||||
oldtree <- liftIO $ CIDDb.getAnnexBranchTree db
|
||||
inRepo (Git.Ref.tree Annex.Branch.fullname) >>= \case
|
||||
Just t | t /= oldtree -> do
|
||||
(l, cleanup) <- inRepo $ DiffTree.diffTree oldtree t
|
||||
mapM_ go l
|
||||
void $ liftIO $ cleanup
|
||||
liftIO $ do
|
||||
CIDDb.recordAnnexBranchTree db t
|
||||
CIDDb.flushDbQueue db
|
||||
_ -> return ()
|
||||
where
|
||||
go ti = case extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti)) of
|
||||
Nothing -> return ()
|
||||
Just k -> do
|
||||
l <- CIDLog.getContentIdentifiers k
|
||||
liftIO $ forM_ l $ \(u, cids) ->
|
||||
forM_ cids $ \cid ->
|
||||
CIDDb.recordContentIdentifier db u cid k
|
||||
|
|
|
@ -22,7 +22,10 @@ module Database.ContentIdentifier (
|
|||
recordContentIdentifier,
|
||||
getContentIdentifiers,
|
||||
getContentIdentifierKeys,
|
||||
recordAnnexBranchTree,
|
||||
getAnnexBranchTree,
|
||||
ContentIdentifiersId,
|
||||
AnnexBranchId,
|
||||
) where
|
||||
|
||||
import Database.Types
|
||||
|
@ -31,6 +34,8 @@ import Database.Init
|
|||
import Annex.Locations
|
||||
import Annex.Common hiding (delete)
|
||||
import Types.Import
|
||||
import Git.Types
|
||||
import Git.Sha
|
||||
|
||||
import Database.Persist.Sql hiding (Key)
|
||||
import Database.Persist.TH
|
||||
|
@ -45,6 +50,11 @@ ContentIdentifiers
|
|||
ContentIdentifiersIndexRemoteKey remote key
|
||||
ContentIdentifiersIndexRemoteCID remote cid
|
||||
UniqueRemoteCidKey remote cid key
|
||||
-- The last git-annex branch tree sha that was used to update
|
||||
-- ContentIdentifiers
|
||||
AnnexBranch
|
||||
tree SRef
|
||||
UniqueTree tree
|
||||
|]
|
||||
|
||||
{- Opens the database, creating it if it doesn't exist yet.
|
||||
|
@ -97,3 +107,15 @@ getContentIdentifierKeys (ContentIdentifierHandle h) u cid =
|
|||
, ContentIdentifiersRemote ==. u
|
||||
] []
|
||||
return $ map (fromIKey . contentIdentifiersKey . entityVal) l
|
||||
|
||||
recordAnnexBranchTree :: ContentIdentifierHandle -> Sha -> IO ()
|
||||
recordAnnexBranchTree h s = queueDb h $ do
|
||||
deleteWhere ([] :: [Filter AnnexBranch])
|
||||
void $ insertUnique $ AnnexBranch $ toSRef s
|
||||
|
||||
getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha
|
||||
getAnnexBranchTree (ContentIdentifierHandle h) = H.queryDbQueue h $ do
|
||||
l <- selectList ([] :: [Filter AnnexBranch]) []
|
||||
case l of
|
||||
(s:[]) -> return $ fromSRef $ annexBranchTree $ entityVal s
|
||||
_ -> return emptyTree
|
||||
|
|
|
@ -21,6 +21,7 @@ import qualified Annex
|
|||
|
||||
import qualified Data.Map as M
|
||||
import Data.List.NonEmpty (NonEmpty(..))
|
||||
import qualified Data.List.NonEmpty as NonEmpty
|
||||
|
||||
-- | Records a remote's content identifier and the key that it corresponds to.
|
||||
--
|
||||
|
@ -37,9 +38,10 @@ recordContentIdentifier u cid k = do
|
|||
where
|
||||
m = simpleMap l
|
||||
|
||||
-- | Get all content identifiers that a remote is known to use for a key.
|
||||
getContentIdentifiers :: UUID -> Key -> Annex [ContentIdentifier]
|
||||
getContentIdentifiers u k = do
|
||||
-- | Get all known content identifiers for a key.
|
||||
getContentIdentifiers :: Key -> Annex [(UUID, [ContentIdentifier])]
|
||||
getContentIdentifiers k = do
|
||||
config <- Annex.getGitConfig
|
||||
contentIdentifierList . M.lookup u . simpleMap . parseLog
|
||||
map (\(u, l) -> (u, NonEmpty.toList l) )
|
||||
. M.toList . simpleMap . parseLog
|
||||
<$> Annex.Branch.get (remoteContentIdentifierLogFile config k)
|
||||
|
|
|
@ -17,19 +17,9 @@ this.
|
|||
* Need to support annex-tracking-branch configuration, which documentation
|
||||
says makes git-annex sync and assistant do imports.
|
||||
|
||||
* Database.ContentIdentifier needs a way to update the database with
|
||||
information coming from the git-annex branch. This will allow multiple
|
||||
clones to import from the same remote, and share content identifier
|
||||
information amoung them.
|
||||
|
||||
It will only need to be updated when listContents returns a
|
||||
ContentIdentifier that is not already known in the database.
|
||||
|
||||
How to do the update: Stash the ref of the last git-annex branch it's
|
||||
updated from in the database. Diff between that ref and the current
|
||||
git-annex branch. For each file in the diff that's a .cid file, read
|
||||
the file from the branch, and store into the database.
|
||||
Update the stashed ref.
|
||||
* Test behavior when multiple repos import from same special remote;
|
||||
the second importer should not re-download as long as it has pulled
|
||||
from the first importer.
|
||||
|
||||
* When on an adjusted unlocked branch, need to import the files unlocked.
|
||||
Also, the tracking branch code needs to know about such branches,
|
||||
|
|
Loading…
Reference in a new issue