From 535cdc8d48b103a000c8f847dd4e8b85a3fbe7e3 Mon Sep 17 00:00:00 2001
From: Joey Hess
Date: Tue, 14 Jul 2020 12:44:35 -0400
Subject: [PATCH] importfeed: Made checking known urls step around 10% faster.

This was a bit disappointing; I was hoping for a 2x speedup. But I think
the metadata lookup is wasting a lot of time and also needs to be made to
stream.

The changes to catObjectStreamLsTree were benchmarked to also speed up
--all by around 3% more. Seems I managed to make it polymorphic after all.
---
 CHANGELOG                                        |  1 +
 CmdLine/Seek.hs                                  | 15 ++++---
 Git/CatFile.hs                                   | 28 ++++++-------
 Logs/Web.hs                                      | 42 ++++++++++---------
 ...s_more_memory_the_more_urls_there_are.mdwn    | 11 +++++
 ...logs_for_speed_with_cat-file_--buffer.mdwn    |  3 ++
 6 files changed, 58 insertions(+), 42 deletions(-)
 create mode 100644 doc/todo/importfeed_needs_more_memory_the_more_urls_there_are.mdwn

diff --git a/CHANGELOG b/CHANGELOG
index 045f446b25..293a146e2c 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -32,6 +32,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
   * Sped up seeking for annexed files to operate on by a factor of nearly 2x.
   * Sped up sync --content by 2x and other commands like fsck --fast and
     whereis by around 50%, by using git cat-file --buffer.
+  * importfeed: Made checking known urls step around 10% faster.
   * fsck: Detect if WORM keys contain a carriage return, and recommend
     upgrading the key. (git-annex could have maybe created such keys back
     in 2013).
diff --git a/CmdLine/Seek.hs b/CmdLine/Seek.hs
index b43f2675c9..2ccc53521f 100644
--- a/CmdLine/Seek.hs
+++ b/CmdLine/Seek.hs
@@ -9,6 +9,8 @@
  - Licensed under the GNU AGPL version 3 or higher.
  -}
 
+{-# LANGUAGE TupleSections #-}
+
 module CmdLine.Seek where
 
 import Annex.Common
@@ -227,17 +229,14 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
 		(l, cleanup) <- inRepo $ LsTree.lsTree
 			LsTree.LsTreeRecursive
 			Annex.Branch.fullname
-		let getk = locationLogFileKey config . getTopFilePath
+		let getk f = fmap (,f) (locationLogFileKey config f)
 		let go reader = liftIO reader >>= \case
 			Nothing -> return ()
-			Just (f, content) -> do
-				case getk f of
-					Just k -> do
-						maybe noop (Annex.BranchState.setCache (getTopFilePath f)) content
-						keyaction (k, mkActionItem k)
-					Nothing -> return ()
+			Just ((k, f), content) -> do
+				maybe noop (Annex.BranchState.setCache f) content
+				keyaction (k, mkActionItem k)
 				go reader
-		catObjectStreamLsTree l (isJust . getk . LsTree.file) g go
+		catObjectStreamLsTree l (getk . getTopFilePath . LsTree.file) g go
 		liftIO $ void cleanup
 
 	runkeyaction getks = do
diff --git a/Git/CatFile.hs b/Git/CatFile.hs
index 0b0327bddc..bcf5119578 100644
--- a/Git/CatFile.hs
+++ b/Git/CatFile.hs
@@ -286,17 +286,14 @@ parseCommit b = Commit
 
 {- Uses cat-file to stream the contents of the files as efficiently
  - as possible. This is much faster than querying it repeatedly per file.
- -
- - While this could be made more polymorhpic, specialization is important
- - to its performance.
  -}
 catObjectStreamLsTree
 	:: (MonadMask m, MonadIO m)
 	=> [LsTree.TreeItem]
-	-> (LsTree.TreeItem -> Bool)
+	-> (LsTree.TreeItem -> Maybe v)
 	-> Repo
-	-> (IO (Maybe (TopFilePath, Maybe L.ByteString)) -> m ())
-	-> m ()
+	-> (IO (Maybe (v, Maybe L.ByteString)) -> m a)
+	-> m a
 catObjectStreamLsTree l want repo reader = withCatFileStream False repo $
 	\c hin hout -> bracketIO
 		(async $ feeder c hin)
@@ -304,11 +301,12 @@ catObjectStreamLsTree l want repo reader = withCatFileStream False repo $
 		(const (reader (catObjectReader readObjectContent c hout)))
   where
 	feeder c h = do
-		forM_ l $ \ti ->
-			when (want ti) $ do
+		forM_ l $ \ti -> case want ti of
+			Nothing -> return ()
+			Just v -> do
 				let f = LsTree.file ti
 				let sha = LsTree.sha ti
-				liftIO $ writeChan c (sha, f)
+				liftIO $ writeChan c (sha, v)
 				S8.hPutStrLn h (fromRef' sha)
 		hClose h
 
@@ -319,9 +317,9 @@ catObjectStream
 	((v, Ref) -> IO ()) -- ^ call to feed values in
 	-> IO () -- call once all values are fed in
 	-> IO (Maybe (v, Maybe L.ByteString)) -- call to read results
-	-> m ()
+	-> m a
 	)
-	-> m ()
+	-> m a
 catObjectStream repo a = withCatFileStream False repo go
   where
 	go c hin hout = a
@@ -339,9 +337,9 @@ catObjectMetaDataStream
 	((v, Ref) -> IO ()) -- ^ call to feed values in
 	-> IO () -- call once all values are fed in
 	-> IO (Maybe (v, Maybe (Sha, FileSize, ObjectType))) -- call to read results
-	-> m ()
+	-> m a
 	)
-	-> m ()
+	-> m a
 catObjectMetaDataStream repo a = withCatFileStream True repo go
   where
 	go c hin hout = a
@@ -378,8 +376,8 @@ withCatFileStream
 	:: (MonadMask m, MonadIO m)
 	=> Bool
 	-> Repo
-	-> (Chan a -> Handle -> Handle -> m ())
-	-> m ()
+	-> (Chan v -> Handle -> Handle -> m a)
+	-> m a
 withCatFileStream check repo reader = assertLocal repo $
 	bracketIO start stop $ \(c, hin, hout, _) -> reader c hin hout
   where
diff --git a/Logs/Web.hs b/Logs/Web.hs
index 5c0ee9a269..a73f18186b 100644
--- a/Logs/Web.hs
+++ b/Logs/Web.hs
@@ -1,10 +1,12 @@
 {- Web url logs.
  -
- - Copyright 2011-2014 Joey Hess
+ - Copyright 2011-2020 Joey Hess
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
 
+{-# LANGUAGE BangPatterns #-}
+
 module Logs.Web (
 	URLString,
 	getUrls,
@@ -28,9 +30,9 @@ import Logs
 import Logs.Presence
 import Logs.Location
 import qualified Annex.Branch
-import Annex.CatFile
-import qualified Git
-import qualified Git.LsFiles
+import qualified Git.LsTree
+import Git.CatFile (catObjectStreamLsTree)
+import Git.FilePath
 import Utility.Url
 import Annex.UUID
 import qualified Types.Remote as Remote
@@ -87,24 +89,26 @@ setUrlMissing key url = do
 {- Finds all known urls. -}
 knownUrls :: Annex [(Key, URLString)]
 knownUrls = do
-	{- Ensure the git-annex branch's index file is up-to-date and
-	 - any journaled changes are reflected in it, since we're going
-	 - to query its index directly. -}
+	{- Ensure any journalled changes are committed to the git-annex
+	 - branch, since we're going to look at its tree. -}
 	_ <- Annex.Branch.update
 	Annex.Branch.commit =<< Annex.Branch.commitMessage
-	Annex.Branch.withIndex $ do
-		top <- fromRepo Git.repoPath
-		(l, cleanup) <- inRepo $ Git.LsFiles.inRepoDetails [] [top]
-		r <- mapM getkeyurls l
-		void $ liftIO cleanup
-		return $ concat r
+	(l, cleanup) <- inRepo $ Git.LsTree.lsTree
+		Git.LsTree.LsTreeRecursive
+		Annex.Branch.fullname
+	g <- Annex.gitRepo
+	let want = urlLogFileKey . getTopFilePath . Git.LsTree.file
+	catObjectStreamLsTree l want g (go [])
+		`finally` void (liftIO cleanup)
   where
-	getkeyurls (f, s, _) = case urlLogFileKey f of
-		Just k -> zip (repeat k) <$> geturls s
-		Nothing -> return []
-	geturls logsha =
-		map (decodeBS . fromLogInfo) . getLog
-			<$> catObject logsha
+	go c reader = liftIO reader >>= \case
+		Just (k, Just content) ->
+			let !c' = zip (repeat k) (geturls content) ++ c
+			in go c' reader
+		Just (_, Nothing) -> go c reader
+		Nothing -> return c
+
+	geturls = map (decodeBS . fromLogInfo) . getLog
 
 setTempUrl :: Key -> URLString -> Annex ()
 setTempUrl key url = Annex.changeState $ \s ->
diff --git a/doc/todo/importfeed_needs_more_memory_the_more_urls_there_are.mdwn b/doc/todo/importfeed_needs_more_memory_the_more_urls_there_are.mdwn
new file mode 100644
index 0000000000..13b9614bff
--- /dev/null
+++ b/doc/todo/importfeed_needs_more_memory_the_more_urls_there_are.mdwn
@@ -0,0 +1,11 @@
+git-annex tries to run in a constant amount of memory; however, `knownUrls`
+loads all urls ever seen into a list, so the more urls there are, the more
+memory `git annex importfeed` will need.
+
+This is probably not a big problem in practice, but seems worth doing
+something about if somehow possible.
+
+Unfortunately, a bloom filter can't be used, because a false positive would
+prevent importing an url that has not been imported before. A sqlite
+database would work, but would need to be updated whenever the git-annex
+branch is changed. --[[Joey]]
diff --git a/doc/todo/precache_logs_for_speed_with_cat-file_--buffer.mdwn b/doc/todo/precache_logs_for_speed_with_cat-file_--buffer.mdwn
index 0d84c8ba9b..ea2976b3a6 100644
--- a/doc/todo/precache_logs_for_speed_with_cat-file_--buffer.mdwn
+++ b/doc/todo/precache_logs_for_speed_with_cat-file_--buffer.mdwn
@@ -61,3 +61,6 @@ looked up efficiently. (Before these changes, the same key lookup was done
 speedup when such limits are used. What that optimisation needs is a way
 to tell if the current limit needs the key or not. If it does, then match
 on it after getting the key, otherwise before getting the key.
+
+Also, importfeed could probably be sped up more if knownItems
+streamed through cat-file --buffer.
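
As an aside for readers following the type change above: here is a minimal
sketch, not taken from git-annex, of the consumer pattern that generalizing
catObjectStreamLsTree from `m ()` to `m a` enables. The `drain` helper and
its `extract` parameter are invented for illustration; only the shape of the
reader action, `IO (Maybe (v, Maybe L.ByteString))`, comes from the patch.

    {-# LANGUAGE BangPatterns, LambdaCase #-}

    import qualified Data.ByteString.Lazy as L

    -- Illustrative helper (not in git-annex): drain a reader action of
    -- the shape catObjectStreamLsTree hands to its callback.
    --   Nothing           -> stream exhausted
    --   Just (v, Nothing) -> object could not be catted; skip it
    --   Just (v, Just c)  -> content c for the value v that the
    --                        (TreeItem -> Maybe v) filter produced
    -- Returning an accumulated list is what the change from `m ()` to
    -- `m a` allows; knownUrls in the patch uses exactly this pattern.
    drain
        :: (v -> L.ByteString -> [r]) -- extract results from one object
        -> IO (Maybe (v, Maybe L.ByteString))
        -> IO [r]
    drain extract reader = go []
      where
        go acc = reader >>= \case
            Just (v, Just content) ->
                -- Force the accumulator as it grows, mirroring the
                -- bang pattern the patch adds in knownUrls.
                let !acc' = extract v content ++ acc
                in go acc'
            Just (_, Nothing) -> go acc
            Nothing -> return acc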
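The new todo above mentions a sqlite database as one way to avoid holding
every known url in memory. Purely as a hypothetical sketch of that idea —
no such table exists in git-annex, and the schema and the `urlKnown` name
are invented here — a constant-memory membership test via sqlite-simple
could look like the following; the hard part the todo points out is not
this query, but keeping the table in sync whenever the git-annex branch
changes:

    {-# LANGUAGE OverloadedStrings #-}

    import Database.SQLite.Simple

    -- Hypothetical: look up a single url in an on-disk table instead of
    -- loading all known urls into a list, so memory use stays constant
    -- no matter how many urls have been seen.
    urlKnown :: Connection -> String -> IO Bool
    urlKnown conn url = do
        r <- query conn "SELECT 1 FROM urls WHERE url = ? LIMIT 1" (Only url)
        return (not (null (r :: [Only Int])))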