importfeed: Made checking known urls step around 10% faster.

This was a bit disappointing, I was hoping for a 2x speedup. But, I think the metadata lookup is wasting a lot of time and also needs to be made to stream. The changes to catObjectStreamLsTree were benchmarked to not also speed up --all around 3% more. Seems I managed to make it polymorphic after all.
2020-07-14 12:44:35 -04:00 · 2020-07-14 12:44:35 -04:00 · 535cdc8d48
commit 535cdc8d48
parent a6afa62a60
6 changed files with 58 additions and 42 deletions
--- a/1
+++ b/1
@ -32,6 +32,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
  * Sped up seeking for annexed files to operate on by a factor of nearly 2x.
  * Sped up sync --content by 2x and other commands like fsck --fast and
    whereis by around 50%, by using git cat-file --buffer.
+  * importfeed: Made checking known urls step around 10% faster.
  * fsck: Detect if WORM keys contain a carriage return, and recommend
    upgrading the key. (git-annex could have maybe created such keys back
    in 2013).
--- a/CmdLine/Seek.hs
+++ b/CmdLine/Seek.hs
@ -9,6 +9,8 @@
 - Licensed under the GNU AGPL version 3 or higher.
 -}

+{-# LANGUAGE TupleSections #-}
+
 module CmdLine.Seek where

 import Annex.Common
@ -227,17 +229,14 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
 		(l, cleanup) <- inRepo $ LsTree.lsTree
 			LsTree.LsTreeRecursive
 			Annex.Branch.fullname
-		let getk = locationLogFileKey config . getTopFilePath
+		let getk f = fmap (,f) (locationLogFileKey config f)
 		let go reader = liftIO reader >>= \case
 			Nothing -> return ()
-			Just (f, content) -> do
-				case getk f of
-					Just k -> do
-						maybe noop (Annex.BranchState.setCache (getTopFilePath f)) content
-						keyaction (k, mkActionItem k)
-					Nothing -> return ()
+			Just ((k, f), content) -> do
+				maybe noop (Annex.BranchState.setCache f) content
+				keyaction (k, mkActionItem k)
 				go reader
-		catObjectStreamLsTree l (isJust . getk . LsTree.file) g go
+		catObjectStreamLsTree l (getk . getTopFilePath . LsTree.file) g go
 		liftIO $ void cleanup

 	runkeyaction getks = do
--- a/Git/CatFile.hs
+++ b/Git/CatFile.hs
@ -286,17 +286,14 @@ parseCommit b = Commit

 {- Uses cat-file to stream the contents of the files as efficiently
 - as possible. This is much faster than querying it repeatedly per file.
- - 
- - While this could be made more polymorhpic, specialization is important
- - to its performance.
 -}
 catObjectStreamLsTree
 	:: (MonadMask m, MonadIO m)
 	=> [LsTree.TreeItem]
-	-> (LsTree.TreeItem -> Bool)
+	-> (LsTree.TreeItem -> Maybe v)
 	-> Repo
-	-> (IO (Maybe (TopFilePath, Maybe L.ByteString)) -> m ())
-	-> m ()
+	-> (IO (Maybe (v, Maybe L.ByteString)) -> m a)
+	-> m a
 catObjectStreamLsTree l want repo reader = withCatFileStream False repo $
 	\c hin hout -> bracketIO
 		(async $ feeder c hin)
@ -304,11 +301,12 @@ catObjectStreamLsTree l want repo reader = withCatFileStream False repo $
 		(const (reader (catObjectReader readObjectContent c hout)))
  where
 	feeder c h = do
-		forM_ l $ \ti ->
-			when (want ti) $ do
+		forM_ l $ \ti -> case want ti of
+			Nothing -> return ()
+			Just v -> do
 				let f = LsTree.file ti
 				let sha = LsTree.sha ti
-				liftIO $ writeChan c (sha, f)
+				liftIO $ writeChan c (sha, v)
 				S8.hPutStrLn h (fromRef' sha)
 		hClose h

@ -319,9 +317,9 @@ catObjectStream
 	    ((v, Ref) -> IO ()) -- ^ call to feed values in
 	    -> IO () -- call once all values are fed in
 	    -> IO (Maybe (v, Maybe L.ByteString)) -- call to read results
-	    -> m ()
+	    -> m a
 	   )
-	-> m ()
+	-> m a
 catObjectStream repo a = withCatFileStream False repo go
  where
 	go c hin hout = a
@ -339,9 +337,9 @@ catObjectMetaDataStream
 	    ((v, Ref) -> IO ()) -- ^ call to feed values in
 	    -> IO () -- call once all values are fed in
 	    -> IO (Maybe (v, Maybe (Sha, FileSize, ObjectType))) -- call to read results
-	    -> m ()
+	    -> m a
 	   )
-	-> m ()
+	-> m a
 catObjectMetaDataStream repo a = withCatFileStream True repo go
  where
 	go c hin hout = a
@ -378,8 +376,8 @@ withCatFileStream
 	:: (MonadMask m, MonadIO m)
 	=> Bool
 	-> Repo
-	-> (Chan a -> Handle -> Handle -> m ())
-	-> m ()
+	-> (Chan v -> Handle -> Handle -> m a)
+	-> m a
 withCatFileStream check repo reader = assertLocal repo $
 	bracketIO start stop $ \(c, hin, hout, _) -> reader c hin hout
  where
--- a/Logs/Web.hs
+++ b/Logs/Web.hs
@ -1,10 +1,12 @@
 {- Web url logs.
 -
- - Copyright 2011-2014 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2020 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}

+{-# LANGUAGE BangPatterns #-}
+
 module Logs.Web (
 	URLString,
 	getUrls,
@ -28,9 +30,9 @@ import Logs
 import Logs.Presence
 import Logs.Location
 import qualified Annex.Branch
-import Annex.CatFile
-import qualified Git
-import qualified Git.LsFiles
+import qualified Git.LsTree
+import Git.CatFile (catObjectStreamLsTree)
+import Git.FilePath
 import Utility.Url
 import Annex.UUID
 import qualified Types.Remote as Remote
@ -87,24 +89,26 @@ setUrlMissing key url = do
 {- Finds all known urls. -}
 knownUrls :: Annex [(Key, URLString)]
 knownUrls = do
-	{- Ensure the git-annex branch's index file is up-to-date and
-	 - any journaled changes are reflected in it, since we're going
-	 - to query its index directly. -}
+	{- Ensure any journalled changes are committed to the git-annex
+	 - branch, since we're going to look at its tree. -}
 	_ <- Annex.Branch.update
 	Annex.Branch.commit =<< Annex.Branch.commitMessage
-	Annex.Branch.withIndex $ do
-		top <- fromRepo Git.repoPath
-		(l, cleanup) <- inRepo $ Git.LsFiles.inRepoDetails [] [top]
-		r <- mapM getkeyurls l
-		void $ liftIO cleanup
-		return $ concat r
+	(l, cleanup) <- inRepo $ Git.LsTree.lsTree
+		Git.LsTree.LsTreeRecursive
+		Annex.Branch.fullname
+	g <- Annex.gitRepo
+	let want = urlLogFileKey . getTopFilePath . Git.LsTree.file
+	catObjectStreamLsTree l want g (go [])
+		`finally` void (liftIO cleanup)
  where
-	getkeyurls (f, s, _) = case urlLogFileKey f of
-		Just k -> zip (repeat k) <$> geturls s
-		Nothing -> return []
-	geturls logsha =
-		map (decodeBS . fromLogInfo) . getLog 
-			<$> catObject logsha
+	go c reader = liftIO reader >>= \case
+		Just (k, Just content) -> 
+			let !c' = zip (repeat k) (geturls content) ++ c
+			in go c' reader
+		Just (_, Nothing) -> go c reader
+		Nothing -> return c
+	
+	geturls = map (decodeBS . fromLogInfo) . getLog

 setTempUrl :: Key -> URLString -> Annex ()
 setTempUrl key url = Annex.changeState $ \s ->
--- a/doc/todo/importfeed_needs_more_memory_the_more_urls_there_are.mdwn
+++ b/doc/todo/importfeed_needs_more_memory_the_more_urls_there_are.mdwn
@ -0,0 +1,11 @@
+git-annex tries to run in a constant amount of memory, however `knownUrls`
+loads all urls ever seen into a list, so the more urls there are, the more
+memory `git annex importfeed` will need.
+
+This is probably not a big problem in practice, but seems worth doing
+something about if somehow possible.
+
+Unfortunately, can't use a bloom filter, because a false positive would
+prevent importing an url that has not been imported before. A sqlite
+database would work, but would need to be updated whenever the git-annex
+branch is changed. --[[Joey]]
--- a/doc/todo/precache_logs_for_speed_with_cat-file_--buffer.mdwn
+++ b/doc/todo/precache_logs_for_speed_with_cat-file_--buffer.mdwn
@ -61,3 +61,6 @@ looked up efficiently. (Before these changes, the same key lookup was done
 speedup when such limits are used. What that optimisation needs is a way to
 tell if the current limit needs the key or not. If it does, then match on
 it after getting the key, otherwise before getting the key.
+
+Also, importfeed could be sped up more, probably, if knownItems
+streamed through cat-file --buffer.