diff --git a/Annex/Import.hs b/Annex/Import.hs index a3f6c4a56d..9f9a2d6f50 100644 --- a/Annex/Import.hs +++ b/Annex/Import.hs @@ -508,7 +508,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec let importaction = starting ("import " ++ Remote.name remote) ai si $ do when oldversion $ showNote "old version" - tryNonAsync (importordownload cidmap db i largematcher) >>= \case + tryNonAsync (importordownload cidmap i largematcher) >>= \case Left e -> next $ do warning (UnquotedString (show e)) liftIO $ atomically $ @@ -530,7 +530,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec Just importkey -> tryNonAsync (importkey loc cid sz nullMeterUpdate) >>= \case Right (Just k) -> do - recordcidkey' db cid k + recordcidkeyindb db cid k logChange k (Remote.uuid remote) InfoPresent return $ Just (loc, Right k) Right Nothing -> return Nothing @@ -538,7 +538,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec warning (UnquotedString (show e)) return Nothing - importordownload cidmap db (loc, (cid, sz)) largematcher= do + importordownload cidmap (loc, (cid, sz)) largematcher= do f <- locworktreefile loc matcher <- largematcher f -- When importing a key is supported, always use it rather @@ -551,9 +551,9 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec then dodownload else doimport else doimport - act cidmap db (loc, (cid, sz)) f matcher + act cidmap (loc, (cid, sz)) f matcher - doimport cidmap db (loc, (cid, sz)) f matcher = + doimport cidmap (loc, (cid, sz)) f matcher = case Remote.importKey ia of Nothing -> error "internal" -- checked earlier Just importkey -> do @@ -570,10 +570,10 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec let bwlimit = remoteAnnexBwLimit (Remote.gitconfig remote) islargefile <- checkMatcher' matcher mi mempty metered Nothing sz bwlimit $ const $ if islargefile - then doimportlarge importkey cidmap db loc cid sz f - else doimportsmall cidmap db loc cid sz + then doimportlarge importkey cidmap loc cid sz f + else doimportsmall cidmap loc cid sz - doimportlarge importkey cidmap db loc cid sz f p = + doimportlarge importkey cidmap loc cid sz f p = tryNonAsync importer >>= \case Right (Just (k, True)) -> return $ Just (loc, Right k) Right _ -> return Nothing @@ -591,7 +591,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec Nothing -> return Nothing Just k -> checkSecureHashes k >>= \case Nothing -> do - recordcidkey cidmap db cid k + recordcidkey cidmap cid k logChange k (Remote.uuid remote) InfoPresent if importcontent then getcontent k @@ -618,7 +618,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec -- The file is small, so is added to git, so while importing -- without content does not retrieve annexed files, it does -- need to retrieve this file. - doimportsmall cidmap db loc cid sz p = do + doimportsmall cidmap loc cid sz p = do let downloader tmpfile = do (k, _) <- Remote.retrieveExportWithContentIdentifier ia loc [cid] (fromRawFilePath tmpfile) @@ -626,7 +626,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec p case keyGitSha k of Just sha -> do - recordcidkey cidmap db cid k + recordcidkey cidmap cid k return sha Nothing -> error "internal" checkDiskSpaceToGet tmpkey Nothing $ @@ -640,7 +640,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec tmpkey = importKey cid sz mkkey tmpfile = gitShaKey <$> hashFile tmpfile - dodownload cidmap db (loc, (cid, sz)) f matcher = do + dodownload cidmap (loc, (cid, sz)) f matcher = do let af = AssociatedFile (Just f) let downloader tmpfile p = do (k, _) <- Remote.retrieveExportWithContentIdentifier @@ -651,12 +651,12 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec Nothing -> do ok <- moveAnnex k af tmpfile when ok $ do - recordcidkey cidmap db cid k + recordcidkey cidmap cid k logStatus k InfoPresent logChange k (Remote.uuid remote) InfoPresent return (Right k, ok) Just sha -> do - recordcidkey cidmap db cid k + recordcidkey cidmap cid k return (Left sha, True) let rundownload tmpfile p = tryNonAsync (downloader tmpfile p) >>= \case Right (v, True) -> return $ Just (loc, v) @@ -706,12 +706,18 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec maybeToList . M.lookup cid <$> readTVar cidmap l -> return l - recordcidkey cidmap db cid k = do + recordcidkey cidmap cid k = do liftIO $ atomically $ modifyTVar' cidmap $ M.insert cid k - recordcidkey' db cid k - recordcidkey' db cid k = do + -- Only record in log now; the database will be updated + -- later from the log, and the cidmap will be used for now. + recordcidkeyinlog cid k + + recordcidkeyindb db cid k = do liftIO $ CIDDb.recordContentIdentifier db rs cid k + recordcidkeyinlog cid k + + recordcidkeyinlog cid k = CIDLog.recordContentIdentifier rs cid k rs = Remote.remoteStateHandle remote diff --git a/CHANGELOG b/CHANGELOG index 02051f3c8a..726e6b5169 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -75,6 +75,8 @@ git-annex (10.20230408) UNRELEASED; urgency=medium youtube-dl. Using youtube-dl is now deprecated, and git-annex no longer tries to parse its output to display download progress * repair: Fix handling of git ref names on Windows. + * Speed up importing trees from special remotes somewhat by avoiding + redundant writes to sqlite database. -- Joey Hess Sat, 08 Apr 2023 13:57:18 -0400 diff --git a/doc/bugs/importtree_spends_hours_reading_cidsdb/comment_2_8f609e231fdf315d2b658d30d708d471._comment b/doc/bugs/importtree_spends_hours_reading_cidsdb/comment_2_8f609e231fdf315d2b658d30d708d471._comment index 874017423c..133b943ac0 100644 --- a/doc/bugs/importtree_spends_hours_reading_cidsdb/comment_2_8f609e231fdf315d2b658d30d708d471._comment +++ b/doc/bugs/importtree_spends_hours_reading_cidsdb/comment_2_8f609e231fdf315d2b658d30d708d471._comment @@ -3,36 +3,14 @@ subject="""comment 2""" date="2023-05-30T18:49:34Z" content=""" -I think I see why a second sync re-writes the cidsdb with information from -the git-annex branch. +> The other hit is recordContentIdentifier, which happens for +> each recorded cid, due to updateFromLog. That seems unnecessary, because +> the previous sync already recorded all the cids. -The first sync does write to the cidsdb at the same time it writes to the -git-annex branch. So, it seems that it could at the end call -recordAnnexBranchTree to indicate that the information in the git-annex -branch has all been written to the cidsdb. That would avoid the second sync -doing extra work. +I was able to eliminate that extra work. Now the first sync does not write +to the cidsdb but only to the git-annex branch, and the second sync does +the necessary work of updating the cidsdb from the git-annex branch. -But, there could be other processes running at the same time, and one of -them may update the git-annex branch, eg merging a remote git-annex branch -into it. Any cids logs on that merged git-annex branch would not be -reflected in the cidsdb yet. If the sync then called -recordAnnexBranchTree, the cidsdb would never get updated with that merged -information. - -I don't think there's a good way to prevent, or to detect that situation. -So, it can't call recordAnnexBranchTree at the end, and has to do extra -work in updateFromLog at the beginning. - -What it could do is, only record a cid to the git-annex branch, not to the -cidsdb. Then the updateFromLog would not be doing extra work, but necessary -work. But, it also needs to read cids from the db during import, and if it -doesn't record a cid there, it won't know when a later file has the same -cid. So it will re-import it. Which for other special remotes than -directory, means an expensive second download of the content. - -Anyway, the extra work of re-writing the cidsdb is only done on the sync -immediately after the one that did import some new files. And it only -re-writes cids for the new files, not for unchanged files. - -I'm not sure that this extra work is what the bug reporter was complaining about though. +I'm not sure that extra work is what the bug reporter was complaining +about though. """]]