fix exponential blowup when adding lots of identical files
This was an old problem when the files were being added unlocked,
so the changelog mentions that being fixed. However, recently it's also
affected locked files.
The fix for locked files is kind of stupidly simple. moveAnnex already
handles populating unlocked files, and only does it when the object file
was not already present. So remove the redundant populateUnlockedFiles
call. (That call was added all the way back in
cfaac52b88, and has always been
unnecessary.)
Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
parent
e147ae07f4
commit
3af4c9a29a
4 changed files with 40 additions and 20 deletions
|
@ -178,9 +178,7 @@ ingest' preferredbackend meterupdate (Just (LockedDown cfg source)) mk restage =
|
||||||
|
|
||||||
golocked key mcache s =
|
golocked key mcache s =
|
||||||
tryNonAsync (moveAnnex key naf (contentLocation source)) >>= \case
|
tryNonAsync (moveAnnex key naf (contentLocation source)) >>= \case
|
||||||
Right True -> do
|
Right True -> success key mcache s
|
||||||
populateUnlockedFiles key source restage
|
|
||||||
success key mcache s
|
|
||||||
Right False -> giveup "failed to add content to annex"
|
Right False -> giveup "failed to add content to annex"
|
||||||
Left e -> restoreFile (keyFilename source) key e
|
Left e -> restoreFile (keyFilename source) key e
|
||||||
|
|
||||||
|
@ -198,8 +196,8 @@ ingest' preferredbackend meterupdate (Just (LockedDown cfg source)) mk restage =
|
||||||
cleanOldKeys (keyFilename source) key
|
cleanOldKeys (keyFilename source) key
|
||||||
linkToAnnex key (keyFilename source) (Just cache) >>= \case
|
linkToAnnex key (keyFilename source) (Just cache) >>= \case
|
||||||
LinkAnnexFailed -> failure "failed to link to annex"
|
LinkAnnexFailed -> failure "failed to link to annex"
|
||||||
_ -> do
|
lar -> do
|
||||||
finishIngestUnlocked' key source restage
|
finishIngestUnlocked' key source restage (Just lar)
|
||||||
success key (Just cache) s
|
success key (Just cache) s
|
||||||
gounlocked _ _ _ = failure "failed statting file"
|
gounlocked _ _ _ = failure "failed statting file"
|
||||||
|
|
||||||
|
@ -215,25 +213,30 @@ ingest' preferredbackend meterupdate (Just (LockedDown cfg source)) mk restage =
|
||||||
finishIngestUnlocked :: Key -> KeySource -> Annex ()
|
finishIngestUnlocked :: Key -> KeySource -> Annex ()
|
||||||
finishIngestUnlocked key source = do
|
finishIngestUnlocked key source = do
|
||||||
cleanCruft source
|
cleanCruft source
|
||||||
finishIngestUnlocked' key source (Restage True)
|
finishIngestUnlocked' key source (Restage True) Nothing
|
||||||
|
|
||||||
finishIngestUnlocked' :: Key -> KeySource -> Restage -> Annex ()
|
finishIngestUnlocked' :: Key -> KeySource -> Restage -> Maybe LinkAnnexResult -> Annex ()
|
||||||
finishIngestUnlocked' key source restage = do
|
finishIngestUnlocked' key source restage lar = do
|
||||||
Database.Keys.addAssociatedFile key
|
Database.Keys.addAssociatedFile key
|
||||||
=<< inRepo (toTopFilePath (keyFilename source))
|
=<< inRepo (toTopFilePath (keyFilename source))
|
||||||
populateUnlockedFiles key source restage
|
populateUnlockedFiles key source restage lar
|
||||||
|
|
||||||
{- Copy to any unlocked files using the same key. -}
|
{- Copy to any other unlocked files using the same key.
|
||||||
populateUnlockedFiles :: Key -> KeySource -> Restage -> Annex ()
|
-
|
||||||
populateUnlockedFiles key source restage =
|
- When linkToAnnex did not have to do anything, the object file
|
||||||
whenM (annexSupportUnlocked <$> Annex.getGitConfig) $ do
|
- was already present, and so other unlocked files are already populated,
|
||||||
obj <- calcRepo (gitAnnexLocation key)
|
- and nothing needs to be done here.
|
||||||
g <- Annex.gitRepo
|
-}
|
||||||
ingestedf <- flip fromTopFilePath g
|
populateUnlockedFiles :: Key -> KeySource -> Restage -> Maybe LinkAnnexResult -> Annex ()
|
||||||
<$> inRepo (toTopFilePath (keyFilename source))
|
populateUnlockedFiles _ _ _ (Just LinkAnnexNoop) = return ()
|
||||||
afs <- map (`fromTopFilePath` g) <$> Database.Keys.getAssociatedFiles key
|
populateUnlockedFiles key source restage lar = do
|
||||||
forM_ (filter (/= ingestedf) afs) $
|
obj <- calcRepo (gitAnnexLocation key)
|
||||||
populatePointerFile restage key obj
|
g <- Annex.gitRepo
|
||||||
|
ingestedf <- flip fromTopFilePath g
|
||||||
|
<$> inRepo (toTopFilePath (keyFilename source))
|
||||||
|
afs <- map (`fromTopFilePath` g) <$> Database.Keys.getAssociatedFiles key
|
||||||
|
forM_ (filter (/= ingestedf) afs) $
|
||||||
|
populatePointerFile restage key obj
|
||||||
|
|
||||||
cleanCruft :: KeySource -> Annex ()
|
cleanCruft :: KeySource -> Annex ()
|
||||||
cleanCruft source = when (contentLocation source /= keyFilename source) $
|
cleanCruft source = when (contentLocation source /= keyFilename source) $
|
||||||
|
|
|
@ -30,6 +30,8 @@ git-annex (8.20210429) UNRELEASED; urgency=medium
|
||||||
* Added annex.adviceNoSshCaching config.
|
* Added annex.adviceNoSshCaching config.
|
||||||
* Added --size-limit option.
|
* Added --size-limit option.
|
||||||
* Future proof activity log parsing.
|
* Future proof activity log parsing.
|
||||||
|
* Fix an exponential slowdown when large numbers of duplicate files are
|
||||||
|
being added in unlocked form.
|
||||||
|
|
||||||
-- Joey Hess <id@joeyh.name> Mon, 03 May 2021 10:33:10 -0400
|
-- Joey Hess <id@joeyh.name> Mon, 03 May 2021 10:33:10 -0400
|
||||||
|
|
||||||
|
|
|
@ -15,3 +15,5 @@ Currently 8.20210428+git282-gd39dfed2a and first got slow with
|
||||||
8.20210428+git228-g13a6bfff4 and was ok with 8.20210428+git202-g9a5981a15
|
8.20210428+git228-g13a6bfff4 and was ok with 8.20210428+git202-g9a5981a15
|
||||||
|
|
||||||
[[!meta title="performance edge case when adding large numbers of identical files"]]
|
[[!meta title="performance edge case when adding large numbers of identical files"]]
|
||||||
|
|
||||||
|
> [[fixed|done]] --[[Joey]]
|
||||||
|
|
|
@ -7,4 +7,17 @@ Oh, there's a much better solution: If the annex object file already exists
|
||||||
when ingesting a new file, skip populating other associated files. They
|
when ingesting a new file, skip populating other associated files. They
|
||||||
will have already been populated. moveAnnex has to check if the annex object
|
will have already been populated. moveAnnex has to check if the annex object
|
||||||
file already exists anyway, so this will have zero overhead.
|
file already exists anyway, so this will have zero overhead.
|
||||||
|
|
||||||
|
(Maybe that's what yarik was getting at in comment #30)
|
||||||
|
|
||||||
|
Implemented that, and here's the results, re-running my prior benchmark:
|
||||||
|
|
||||||
|
run 1: 0:03.14
|
||||||
|
run 2: 0:03.24
|
||||||
|
run 3: 0.03.35
|
||||||
|
run 4: 0.03.45
|
||||||
|
run 9: 0:03.65
|
||||||
|
|
||||||
|
That also shows the actual overhead of the diffing of the index,
|
||||||
|
as its size grows, is quite small.
|
||||||
"""]]
|
"""]]
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue