importfeed: Made checking known urls step around 10% faster.
This was a bit disappointing, I was hoping for a 2x speedup. But, I think the metadata lookup is wasting a lot of time and also needs to be made to stream. The changes to catObjectStreamLsTree were benchmarked to not also speed up --all around 3% more. Seems I managed to make it polymorphic after all.
This commit is contained in:
parent
a6afa62a60
commit
535cdc8d48
6 changed files with 58 additions and 42 deletions
|
@ -32,6 +32,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
|
|||
* Sped up seeking for annexed files to operate on by a factor of nearly 2x.
|
||||
* Sped up sync --content by 2x and other commands like fsck --fast and
|
||||
whereis by around 50%, by using git cat-file --buffer.
|
||||
* importfeed: Made checking known urls step around 10% faster.
|
||||
* fsck: Detect if WORM keys contain a carriage return, and recommend
|
||||
upgrading the key. (git-annex could have maybe created such keys back
|
||||
in 2013).
|
||||
|
|
|
@ -9,6 +9,8 @@
|
|||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
{-# LANGUAGE TupleSections #-}
|
||||
|
||||
module CmdLine.Seek where
|
||||
|
||||
import Annex.Common
|
||||
|
@ -227,17 +229,14 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
|
|||
(l, cleanup) <- inRepo $ LsTree.lsTree
|
||||
LsTree.LsTreeRecursive
|
||||
Annex.Branch.fullname
|
||||
let getk = locationLogFileKey config . getTopFilePath
|
||||
let getk f = fmap (,f) (locationLogFileKey config f)
|
||||
let go reader = liftIO reader >>= \case
|
||||
Nothing -> return ()
|
||||
Just (f, content) -> do
|
||||
case getk f of
|
||||
Just k -> do
|
||||
maybe noop (Annex.BranchState.setCache (getTopFilePath f)) content
|
||||
Just ((k, f), content) -> do
|
||||
maybe noop (Annex.BranchState.setCache f) content
|
||||
keyaction (k, mkActionItem k)
|
||||
Nothing -> return ()
|
||||
go reader
|
||||
catObjectStreamLsTree l (isJust . getk . LsTree.file) g go
|
||||
catObjectStreamLsTree l (getk . getTopFilePath . LsTree.file) g go
|
||||
liftIO $ void cleanup
|
||||
|
||||
runkeyaction getks = do
|
||||
|
|
|
@ -286,17 +286,14 @@ parseCommit b = Commit
|
|||
|
||||
{- Uses cat-file to stream the contents of the files as efficiently
|
||||
- as possible. This is much faster than querying it repeatedly per file.
|
||||
-
|
||||
- While this could be made more polymorhpic, specialization is important
|
||||
- to its performance.
|
||||
-}
|
||||
catObjectStreamLsTree
|
||||
:: (MonadMask m, MonadIO m)
|
||||
=> [LsTree.TreeItem]
|
||||
-> (LsTree.TreeItem -> Bool)
|
||||
-> (LsTree.TreeItem -> Maybe v)
|
||||
-> Repo
|
||||
-> (IO (Maybe (TopFilePath, Maybe L.ByteString)) -> m ())
|
||||
-> m ()
|
||||
-> (IO (Maybe (v, Maybe L.ByteString)) -> m a)
|
||||
-> m a
|
||||
catObjectStreamLsTree l want repo reader = withCatFileStream False repo $
|
||||
\c hin hout -> bracketIO
|
||||
(async $ feeder c hin)
|
||||
|
@ -304,11 +301,12 @@ catObjectStreamLsTree l want repo reader = withCatFileStream False repo $
|
|||
(const (reader (catObjectReader readObjectContent c hout)))
|
||||
where
|
||||
feeder c h = do
|
||||
forM_ l $ \ti ->
|
||||
when (want ti) $ do
|
||||
forM_ l $ \ti -> case want ti of
|
||||
Nothing -> return ()
|
||||
Just v -> do
|
||||
let f = LsTree.file ti
|
||||
let sha = LsTree.sha ti
|
||||
liftIO $ writeChan c (sha, f)
|
||||
liftIO $ writeChan c (sha, v)
|
||||
S8.hPutStrLn h (fromRef' sha)
|
||||
hClose h
|
||||
|
||||
|
@ -319,9 +317,9 @@ catObjectStream
|
|||
((v, Ref) -> IO ()) -- ^ call to feed values in
|
||||
-> IO () -- call once all values are fed in
|
||||
-> IO (Maybe (v, Maybe L.ByteString)) -- call to read results
|
||||
-> m ()
|
||||
-> m a
|
||||
)
|
||||
-> m ()
|
||||
-> m a
|
||||
catObjectStream repo a = withCatFileStream False repo go
|
||||
where
|
||||
go c hin hout = a
|
||||
|
@ -339,9 +337,9 @@ catObjectMetaDataStream
|
|||
((v, Ref) -> IO ()) -- ^ call to feed values in
|
||||
-> IO () -- call once all values are fed in
|
||||
-> IO (Maybe (v, Maybe (Sha, FileSize, ObjectType))) -- call to read results
|
||||
-> m ()
|
||||
-> m a
|
||||
)
|
||||
-> m ()
|
||||
-> m a
|
||||
catObjectMetaDataStream repo a = withCatFileStream True repo go
|
||||
where
|
||||
go c hin hout = a
|
||||
|
@ -378,8 +376,8 @@ withCatFileStream
|
|||
:: (MonadMask m, MonadIO m)
|
||||
=> Bool
|
||||
-> Repo
|
||||
-> (Chan a -> Handle -> Handle -> m ())
|
||||
-> m ()
|
||||
-> (Chan v -> Handle -> Handle -> m a)
|
||||
-> m a
|
||||
withCatFileStream check repo reader = assertLocal repo $
|
||||
bracketIO start stop $ \(c, hin, hout, _) -> reader c hin hout
|
||||
where
|
||||
|
|
42
Logs/Web.hs
42
Logs/Web.hs
|
@ -1,10 +1,12 @@
|
|||
{- Web url logs.
|
||||
-
|
||||
- Copyright 2011-2014 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
|
||||
module Logs.Web (
|
||||
URLString,
|
||||
getUrls,
|
||||
|
@ -28,9 +30,9 @@ import Logs
|
|||
import Logs.Presence
|
||||
import Logs.Location
|
||||
import qualified Annex.Branch
|
||||
import Annex.CatFile
|
||||
import qualified Git
|
||||
import qualified Git.LsFiles
|
||||
import qualified Git.LsTree
|
||||
import Git.CatFile (catObjectStreamLsTree)
|
||||
import Git.FilePath
|
||||
import Utility.Url
|
||||
import Annex.UUID
|
||||
import qualified Types.Remote as Remote
|
||||
|
@ -87,24 +89,26 @@ setUrlMissing key url = do
|
|||
{- Finds all known urls. -}
|
||||
knownUrls :: Annex [(Key, URLString)]
|
||||
knownUrls = do
|
||||
{- Ensure the git-annex branch's index file is up-to-date and
|
||||
- any journaled changes are reflected in it, since we're going
|
||||
- to query its index directly. -}
|
||||
{- Ensure any journalled changes are committed to the git-annex
|
||||
- branch, since we're going to look at its tree. -}
|
||||
_ <- Annex.Branch.update
|
||||
Annex.Branch.commit =<< Annex.Branch.commitMessage
|
||||
Annex.Branch.withIndex $ do
|
||||
top <- fromRepo Git.repoPath
|
||||
(l, cleanup) <- inRepo $ Git.LsFiles.inRepoDetails [] [top]
|
||||
r <- mapM getkeyurls l
|
||||
void $ liftIO cleanup
|
||||
return $ concat r
|
||||
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
|
||||
Git.LsTree.LsTreeRecursive
|
||||
Annex.Branch.fullname
|
||||
g <- Annex.gitRepo
|
||||
let want = urlLogFileKey . getTopFilePath . Git.LsTree.file
|
||||
catObjectStreamLsTree l want g (go [])
|
||||
`finally` void (liftIO cleanup)
|
||||
where
|
||||
getkeyurls (f, s, _) = case urlLogFileKey f of
|
||||
Just k -> zip (repeat k) <$> geturls s
|
||||
Nothing -> return []
|
||||
geturls logsha =
|
||||
map (decodeBS . fromLogInfo) . getLog
|
||||
<$> catObject logsha
|
||||
go c reader = liftIO reader >>= \case
|
||||
Just (k, Just content) ->
|
||||
let !c' = zip (repeat k) (geturls content) ++ c
|
||||
in go c' reader
|
||||
Just (_, Nothing) -> go c reader
|
||||
Nothing -> return c
|
||||
|
||||
geturls = map (decodeBS . fromLogInfo) . getLog
|
||||
|
||||
setTempUrl :: Key -> URLString -> Annex ()
|
||||
setTempUrl key url = Annex.changeState $ \s ->
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
git-annex tries to run in a constant amount of memory, however `knownUrls`
|
||||
loads all urls ever seen into a list, so the more urls there are, the more
|
||||
memory `git annex importfeed` will need.
|
||||
|
||||
This is probably not a big problem in practice, but seems worth doing
|
||||
something about if somehow possible.
|
||||
|
||||
Unfortunately, can't use a bloom filter, because a false positive would
|
||||
prevent importing an url that has not been imported before. A sqlite
|
||||
database would work, but would need to be updated whenever the git-annex
|
||||
branch is changed. --[[Joey]]
|
|
@ -61,3 +61,6 @@ looked up efficiently. (Before these changes, the same key lookup was done
|
|||
speedup when such limits are used. What that optimisation needs is a way to
|
||||
tell if the current limit needs the key or not. If it does, then match on
|
||||
it after getting the key, otherwise before getting the key.
|
||||
|
||||
Also, importfeed could be sped up more, probably, if knownItems
|
||||
streamed through cat-file --buffer.
|
||||
|
|
Loading…
Reference in a new issue