importfeed: Made checking known urls step around 10% faster.

This was a bit disappointing, I was hoping for a 2x speedup. But, I think the metadata lookup is wasting a lot of time and also needs to be made to stream. The changes to catObjectStreamLsTree were benchmarked to not also speed up --all around 3% more. Seems I managed to make it polymorphic after all.
2020-07-14 12:44:35 -04:00 · 2020-07-14 12:44:35 -04:00 · 535cdc8d48
commit 535cdc8d48
parent a6afa62a60
6 changed files with 58 additions and 42 deletions
--- a/1
+++ b/1
@ -32,6 +32,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
  * Sped up seeking for annexed files to operate on by a factor of nearly 2x.
  * Sped up sync --content by 2x and other commands like fsck --fast and
    whereis by around 50%, by using git cat-file --buffer.
  * importfeed: Made checking known urls step around 10% faster.
  * fsck: Detect if WORM keys contain a carriage return, and recommend
    upgrading the key. (git-annex could have maybe created such keys back
    in 2013).
--- a/CmdLine/Seek.hs
+++ b/CmdLine/Seek.hs
@ -9,6 +9,8 @@
 - Licensed under the GNU AGPL version 3 or higher.
 -}
 {-# LANGUAGE TupleSections #-}
 module CmdLine.Seek where
 import Annex.Common
@ -227,17 +229,14 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
 		(l, cleanup) <- inRepo $ LsTree.lsTree
 			LsTree.LsTreeRecursive
 			Annex.Branch.fullname
-		let getk = locationLogFileKey config . getTopFilePath
+		let getk f = fmap (,f) (locationLogFileKey config f)
 		let go reader = liftIO reader >>= \case
 			Nothing -> return ()
-			Just (f, content) -> do
+			Just ((k, f), content) -> do
-				case getk f of
+				maybe noop (Annex.BranchState.setCache f) content
-					Just k -> do
+				keyaction (k, mkActionItem k)
 						maybe noop (Annex.BranchState.setCache (getTopFilePath f)) content
 						keyaction (k, mkActionItem k)
 					Nothing -> return ()
 				go reader
-		catObjectStreamLsTree l (isJust . getk . LsTree.file) g go
+		catObjectStreamLsTree l (getk . getTopFilePath . LsTree.file) g go
 		liftIO $ void cleanup
 	runkeyaction getks = do
--- a/Git/CatFile.hs
+++ b/Git/CatFile.hs
@ -286,17 +286,14 @@ parseCommit b = Commit
 {- Uses cat-file to stream the contents of the files as efficiently
 - as possible. This is much faster than querying it repeatedly per file.
 - 
 - While this could be made more polymorhpic, specialization is important
 - to its performance.
 -}
 catObjectStreamLsTree
 	:: (MonadMask m, MonadIO m)
 	=> [LsTree.TreeItem]
-	-> (LsTree.TreeItem -> Bool)
+	-> (LsTree.TreeItem -> Maybe v)
 	-> Repo
-	-> (IO (Maybe (TopFilePath, Maybe L.ByteString)) -> m ())
+	-> (IO (Maybe (v, Maybe L.ByteString)) -> m a)
-	-> m ()
+	-> m a
 catObjectStreamLsTree l want repo reader = withCatFileStream False repo $
 	\c hin hout -> bracketIO
 		(async $ feeder c hin)
@ -304,11 +301,12 @@ catObjectStreamLsTree l want repo reader = withCatFileStream False repo $
 		(const (reader (catObjectReader readObjectContent c hout)))
  where
 	feeder c h = do
-		forM_ l $ \ti ->
+		forM_ l $ \ti -> case want ti of
-			when (want ti) $ do
+			Nothing -> return ()
 			Just v -> do
 				let f = LsTree.file ti
 				let sha = LsTree.sha ti
-				liftIO $ writeChan c (sha, f)
+				liftIO $ writeChan c (sha, v)
 				S8.hPutStrLn h (fromRef' sha)
 		hClose h
@ -319,9 +317,9 @@ catObjectStream
 	    ((v, Ref) -> IO ()) -- ^ call to feed values in
 	    -> IO () -- call once all values are fed in
 	    -> IO (Maybe (v, Maybe L.ByteString)) -- call to read results
-	    -> m ()
+	    -> m a
 	   )
-	-> m ()
+	-> m a
 catObjectStream repo a = withCatFileStream False repo go
  where
 	go c hin hout = a
@ -339,9 +337,9 @@ catObjectMetaDataStream
 	    ((v, Ref) -> IO ()) -- ^ call to feed values in
 	    -> IO () -- call once all values are fed in
 	    -> IO (Maybe (v, Maybe (Sha, FileSize, ObjectType))) -- call to read results
-	    -> m ()
+	    -> m a
 	   )
-	-> m ()
+	-> m a
 catObjectMetaDataStream repo a = withCatFileStream True repo go
  where
 	go c hin hout = a
@ -378,8 +376,8 @@ withCatFileStream
 	:: (MonadMask m, MonadIO m)
 	=> Bool
 	-> Repo
-	-> (Chan a -> Handle -> Handle -> m ())
+	-> (Chan v -> Handle -> Handle -> m a)
-	-> m ()
+	-> m a
 withCatFileStream check repo reader = assertLocal repo $
 	bracketIO start stop $ \(c, hin, hout, _) -> reader c hin hout
  where
--- a/Logs/Web.hs
+++ b/Logs/Web.hs
@ -1,10 +1,12 @@
 {- Web url logs.
 -
- - Copyright 2011-2014 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2020 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}
 {-# LANGUAGE BangPatterns #-}
 module Logs.Web (
 	URLString,
 	getUrls,
@ -28,9 +30,9 @@ import Logs
 import Logs.Presence
 import Logs.Location
 import qualified Annex.Branch
-import Annex.CatFile
+import qualified Git.LsTree
-import qualified Git
+import Git.CatFile (catObjectStreamLsTree)
-import qualified Git.LsFiles
+import Git.FilePath
 import Utility.Url
 import Annex.UUID
 import qualified Types.Remote as Remote
@ -87,24 +89,26 @@ setUrlMissing key url = do
 {- Finds all known urls. -}
 knownUrls :: Annex [(Key, URLString)]
 knownUrls = do
-	{- Ensure the git-annex branch's index file is up-to-date and
+	{- Ensure any journalled changes are committed to the git-annex
-	 - any journaled changes are reflected in it, since we're going
+	 - branch, since we're going to look at its tree. -}
 	 - to query its index directly. -}
 	_ <- Annex.Branch.update
 	Annex.Branch.commit =<< Annex.Branch.commitMessage
-	Annex.Branch.withIndex $ do
+	(l, cleanup) <- inRepo $ Git.LsTree.lsTree
-		top <- fromRepo Git.repoPath
+		Git.LsTree.LsTreeRecursive
-		(l, cleanup) <- inRepo $ Git.LsFiles.inRepoDetails [] [top]
+		Annex.Branch.fullname
-		r <- mapM getkeyurls l
+	g <- Annex.gitRepo
-		void $ liftIO cleanup
+	let want = urlLogFileKey . getTopFilePath . Git.LsTree.file
-		return $ concat r
+	catObjectStreamLsTree l want g (go [])
 		`finally` void (liftIO cleanup)
  where
-	getkeyurls (f, s, _) = case urlLogFileKey f of
+	go c reader = liftIO reader >>= \case
-		Just k -> zip (repeat k) <$> geturls s
+		Just (k, Just content) -> 
-		Nothing -> return []
+			let !c' = zip (repeat k) (geturls content) ++ c
-	geturls logsha =
+			in go c' reader
-		map (decodeBS . fromLogInfo) . getLog 
+		Just (_, Nothing) -> go c reader
-			<$> catObject logsha
+		Nothing -> return c
 	geturls = map (decodeBS . fromLogInfo) . getLog
 setTempUrl :: Key -> URLString -> Annex ()
 setTempUrl key url = Annex.changeState $ \s ->
--- a/doc/todo/importfeed_needs_more_memory_the_more_urls_there_are.mdwn
+++ b/doc/todo/importfeed_needs_more_memory_the_more_urls_there_are.mdwn
@ -0,0 +1,11 @@
 git-annex tries to run in a constant amount of memory, however `knownUrls`
 loads all urls ever seen into a list, so the more urls there are, the more
 memory `git annex importfeed` will need.
 This is probably not a big problem in practice, but seems worth doing
 something about if somehow possible.
 Unfortunately, can't use a bloom filter, because a false positive would
 prevent importing an url that has not been imported before. A sqlite
 database would work, but would need to be updated whenever the git-annex
 branch is changed. --[[Joey]]
--- a/doc/todo/precache_logs_for_speed_with_cat-file_--buffer.mdwn
+++ b/doc/todo/precache_logs_for_speed_with_cat-file_--buffer.mdwn
@ -61,3 +61,6 @@ looked up efficiently. (Before these changes, the same key lookup was done
 speedup when such limits are used. What that optimisation needs is a way to
 tell if the current limit needs the key or not. If it does, then match on
 it after getting the key, otherwise before getting the key.
 Also, importfeed could be sped up more, probably, if knownItems
 streamed through cat-file --buffer.