fix exponential blowup when adding lots of identical files
This was an old problem when the files were being added unlocked,
so the changelog mentions that being fixed. However, recently it's also
affected locked files.
The fix for locked files is kind of stupidly simple. moveAnnex already
handles populating unlocked files, and only does it when the object file
was not already present. So remove the redundant populateUnlockedFiles
call. (That call was added all the way back in
cfaac52b88
, and has always been
unnecessary.)
Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
parent
e147ae07f4
commit
3af4c9a29a
4 changed files with 40 additions and 20 deletions
|
@ -178,9 +178,7 @@ ingest' preferredbackend meterupdate (Just (LockedDown cfg source)) mk restage =
|
|||
|
||||
golocked key mcache s =
|
||||
tryNonAsync (moveAnnex key naf (contentLocation source)) >>= \case
|
||||
Right True -> do
|
||||
populateUnlockedFiles key source restage
|
||||
success key mcache s
|
||||
Right True -> success key mcache s
|
||||
Right False -> giveup "failed to add content to annex"
|
||||
Left e -> restoreFile (keyFilename source) key e
|
||||
|
||||
|
@ -198,8 +196,8 @@ ingest' preferredbackend meterupdate (Just (LockedDown cfg source)) mk restage =
|
|||
cleanOldKeys (keyFilename source) key
|
||||
linkToAnnex key (keyFilename source) (Just cache) >>= \case
|
||||
LinkAnnexFailed -> failure "failed to link to annex"
|
||||
_ -> do
|
||||
finishIngestUnlocked' key source restage
|
||||
lar -> do
|
||||
finishIngestUnlocked' key source restage (Just lar)
|
||||
success key (Just cache) s
|
||||
gounlocked _ _ _ = failure "failed statting file"
|
||||
|
||||
|
@ -215,25 +213,30 @@ ingest' preferredbackend meterupdate (Just (LockedDown cfg source)) mk restage =
|
|||
finishIngestUnlocked :: Key -> KeySource -> Annex ()
|
||||
finishIngestUnlocked key source = do
|
||||
cleanCruft source
|
||||
finishIngestUnlocked' key source (Restage True)
|
||||
finishIngestUnlocked' key source (Restage True) Nothing
|
||||
|
||||
finishIngestUnlocked' :: Key -> KeySource -> Restage -> Annex ()
|
||||
finishIngestUnlocked' key source restage = do
|
||||
finishIngestUnlocked' :: Key -> KeySource -> Restage -> Maybe LinkAnnexResult -> Annex ()
|
||||
finishIngestUnlocked' key source restage lar = do
|
||||
Database.Keys.addAssociatedFile key
|
||||
=<< inRepo (toTopFilePath (keyFilename source))
|
||||
populateUnlockedFiles key source restage
|
||||
populateUnlockedFiles key source restage lar
|
||||
|
||||
{- Copy to any unlocked files using the same key. -}
|
||||
populateUnlockedFiles :: Key -> KeySource -> Restage -> Annex ()
|
||||
populateUnlockedFiles key source restage =
|
||||
whenM (annexSupportUnlocked <$> Annex.getGitConfig) $ do
|
||||
obj <- calcRepo (gitAnnexLocation key)
|
||||
g <- Annex.gitRepo
|
||||
ingestedf <- flip fromTopFilePath g
|
||||
<$> inRepo (toTopFilePath (keyFilename source))
|
||||
afs <- map (`fromTopFilePath` g) <$> Database.Keys.getAssociatedFiles key
|
||||
forM_ (filter (/= ingestedf) afs) $
|
||||
populatePointerFile restage key obj
|
||||
{- Copy to any other unlocked files using the same key.
|
||||
-
|
||||
- When linkToAnnex did not have to do anything, the object file
|
||||
- was already present, and so other unlocked files are already populated,
|
||||
- and nothing needs to be done here.
|
||||
-}
|
||||
populateUnlockedFiles :: Key -> KeySource -> Restage -> Maybe LinkAnnexResult -> Annex ()
|
||||
populateUnlockedFiles _ _ _ (Just LinkAnnexNoop) = return ()
|
||||
populateUnlockedFiles key source restage lar = do
|
||||
obj <- calcRepo (gitAnnexLocation key)
|
||||
g <- Annex.gitRepo
|
||||
ingestedf <- flip fromTopFilePath g
|
||||
<$> inRepo (toTopFilePath (keyFilename source))
|
||||
afs <- map (`fromTopFilePath` g) <$> Database.Keys.getAssociatedFiles key
|
||||
forM_ (filter (/= ingestedf) afs) $
|
||||
populatePointerFile restage key obj
|
||||
|
||||
cleanCruft :: KeySource -> Annex ()
|
||||
cleanCruft source = when (contentLocation source /= keyFilename source) $
|
||||
|
|
|
@ -30,6 +30,8 @@ git-annex (8.20210429) UNRELEASED; urgency=medium
|
|||
* Added annex.adviceNoSshCaching config.
|
||||
* Added --size-limit option.
|
||||
* Future proof activity log parsing.
|
||||
* Fix an exponential slowdown when large numbers of duplicate files are
|
||||
being added in unlocked form.
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Mon, 03 May 2021 10:33:10 -0400
|
||||
|
||||
|
|
|
@ -15,3 +15,5 @@ Currently 8.20210428+git282-gd39dfed2a and first got slow with
|
|||
8.20210428+git228-g13a6bfff4 and was ok with 8.20210428+git202-g9a5981a15
|
||||
|
||||
[[!meta title="performance edge case when adding large numbers of identical files"]]
|
||||
|
||||
> [[fixed|done]] --[[Joey]]
|
||||
|
|
|
@ -7,4 +7,17 @@ Oh, there's a much better solution: If the annex object file already exists
|
|||
when ingesting a new file, skip populating other associated files. They
|
||||
will have already been populated. moveAnnex has to check if the annex object
|
||||
file already exists anyway, so this will have zero overhead.
|
||||
|
||||
(Maybe that's what yarik was getting at in comment #30)
|
||||
|
||||
Implemented that, and here's the results, re-running my prior benchmark:
|
||||
|
||||
run 1: 0:03.14
|
||||
run 2: 0:03.24
|
||||
run 3: 0:03.35
|
||||
run 4: 0:03.45
|
||||
run 9: 0:03.65
|
||||
|
||||
That also shows the actual overhead of the diffing of the index,
|
||||
as its size grows, is quite small.
|
||||
"""]]
|
||||
|
|
Loading…
Reference in a new issue