importfeed: Made checking known urls step around 10% faster.
This was a bit disappointing, I was hoping for a 2x speedup. But, I think the metadata lookup is wasting a lot of time and also needs to be made to stream. The changes to catObjectStreamLsTree were benchmarked to not also speed up --all around 3% more. Seems I managed to make it polymorphic after all.
This commit is contained in:
parent
a6afa62a60
commit
535cdc8d48
6 changed files with 58 additions and 42 deletions
|
@ -32,6 +32,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
|
||||||
* Sped up seeking for annexed files to operate on by a factor of nearly 2x.
|
* Sped up seeking for annexed files to operate on by a factor of nearly 2x.
|
||||||
* Sped up sync --content by 2x and other commands like fsck --fast and
|
* Sped up sync --content by 2x and other commands like fsck --fast and
|
||||||
whereis by around 50%, by using git cat-file --buffer.
|
whereis by around 50%, by using git cat-file --buffer.
|
||||||
|
* importfeed: Made checking known urls step around 10% faster.
|
||||||
* fsck: Detect if WORM keys contain a carriage return, and recommend
|
* fsck: Detect if WORM keys contain a carriage return, and recommend
|
||||||
upgrading the key. (git-annex could have maybe created such keys back
|
upgrading the key. (git-annex could have maybe created such keys back
|
||||||
in 2013).
|
in 2013).
|
||||||
|
|
|
@ -9,6 +9,8 @@
|
||||||
- Licensed under the GNU AGPL version 3 or higher.
|
- Licensed under the GNU AGPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
|
||||||
|
{-# LANGUAGE TupleSections #-}
|
||||||
|
|
||||||
module CmdLine.Seek where
|
module CmdLine.Seek where
|
||||||
|
|
||||||
import Annex.Common
|
import Annex.Common
|
||||||
|
@ -227,17 +229,14 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
|
||||||
(l, cleanup) <- inRepo $ LsTree.lsTree
|
(l, cleanup) <- inRepo $ LsTree.lsTree
|
||||||
LsTree.LsTreeRecursive
|
LsTree.LsTreeRecursive
|
||||||
Annex.Branch.fullname
|
Annex.Branch.fullname
|
||||||
let getk = locationLogFileKey config . getTopFilePath
|
let getk f = fmap (,f) (locationLogFileKey config f)
|
||||||
let go reader = liftIO reader >>= \case
|
let go reader = liftIO reader >>= \case
|
||||||
Nothing -> return ()
|
Nothing -> return ()
|
||||||
Just (f, content) -> do
|
Just ((k, f), content) -> do
|
||||||
case getk f of
|
maybe noop (Annex.BranchState.setCache f) content
|
||||||
Just k -> do
|
keyaction (k, mkActionItem k)
|
||||||
maybe noop (Annex.BranchState.setCache (getTopFilePath f)) content
|
|
||||||
keyaction (k, mkActionItem k)
|
|
||||||
Nothing -> return ()
|
|
||||||
go reader
|
go reader
|
||||||
catObjectStreamLsTree l (isJust . getk . LsTree.file) g go
|
catObjectStreamLsTree l (getk . getTopFilePath . LsTree.file) g go
|
||||||
liftIO $ void cleanup
|
liftIO $ void cleanup
|
||||||
|
|
||||||
runkeyaction getks = do
|
runkeyaction getks = do
|
||||||
|
|
|
@ -286,17 +286,14 @@ parseCommit b = Commit
|
||||||
|
|
||||||
{- Uses cat-file to stream the contents of the files as efficiently
|
{- Uses cat-file to stream the contents of the files as efficiently
|
||||||
- as possible. This is much faster than querying it repeatedly per file.
|
- as possible. This is much faster than querying it repeatedly per file.
|
||||||
-
|
|
||||||
- While this could be made more polymorhpic, specialization is important
|
|
||||||
- to its performance.
|
|
||||||
-}
|
-}
|
||||||
catObjectStreamLsTree
|
catObjectStreamLsTree
|
||||||
:: (MonadMask m, MonadIO m)
|
:: (MonadMask m, MonadIO m)
|
||||||
=> [LsTree.TreeItem]
|
=> [LsTree.TreeItem]
|
||||||
-> (LsTree.TreeItem -> Bool)
|
-> (LsTree.TreeItem -> Maybe v)
|
||||||
-> Repo
|
-> Repo
|
||||||
-> (IO (Maybe (TopFilePath, Maybe L.ByteString)) -> m ())
|
-> (IO (Maybe (v, Maybe L.ByteString)) -> m a)
|
||||||
-> m ()
|
-> m a
|
||||||
catObjectStreamLsTree l want repo reader = withCatFileStream False repo $
|
catObjectStreamLsTree l want repo reader = withCatFileStream False repo $
|
||||||
\c hin hout -> bracketIO
|
\c hin hout -> bracketIO
|
||||||
(async $ feeder c hin)
|
(async $ feeder c hin)
|
||||||
|
@ -304,11 +301,12 @@ catObjectStreamLsTree l want repo reader = withCatFileStream False repo $
|
||||||
(const (reader (catObjectReader readObjectContent c hout)))
|
(const (reader (catObjectReader readObjectContent c hout)))
|
||||||
where
|
where
|
||||||
feeder c h = do
|
feeder c h = do
|
||||||
forM_ l $ \ti ->
|
forM_ l $ \ti -> case want ti of
|
||||||
when (want ti) $ do
|
Nothing -> return ()
|
||||||
|
Just v -> do
|
||||||
let f = LsTree.file ti
|
let f = LsTree.file ti
|
||||||
let sha = LsTree.sha ti
|
let sha = LsTree.sha ti
|
||||||
liftIO $ writeChan c (sha, f)
|
liftIO $ writeChan c (sha, v)
|
||||||
S8.hPutStrLn h (fromRef' sha)
|
S8.hPutStrLn h (fromRef' sha)
|
||||||
hClose h
|
hClose h
|
||||||
|
|
||||||
|
@ -319,9 +317,9 @@ catObjectStream
|
||||||
((v, Ref) -> IO ()) -- ^ call to feed values in
|
((v, Ref) -> IO ()) -- ^ call to feed values in
|
||||||
-> IO () -- call once all values are fed in
|
-> IO () -- call once all values are fed in
|
||||||
-> IO (Maybe (v, Maybe L.ByteString)) -- call to read results
|
-> IO (Maybe (v, Maybe L.ByteString)) -- call to read results
|
||||||
-> m ()
|
-> m a
|
||||||
)
|
)
|
||||||
-> m ()
|
-> m a
|
||||||
catObjectStream repo a = withCatFileStream False repo go
|
catObjectStream repo a = withCatFileStream False repo go
|
||||||
where
|
where
|
||||||
go c hin hout = a
|
go c hin hout = a
|
||||||
|
@ -339,9 +337,9 @@ catObjectMetaDataStream
|
||||||
((v, Ref) -> IO ()) -- ^ call to feed values in
|
((v, Ref) -> IO ()) -- ^ call to feed values in
|
||||||
-> IO () -- call once all values are fed in
|
-> IO () -- call once all values are fed in
|
||||||
-> IO (Maybe (v, Maybe (Sha, FileSize, ObjectType))) -- call to read results
|
-> IO (Maybe (v, Maybe (Sha, FileSize, ObjectType))) -- call to read results
|
||||||
-> m ()
|
-> m a
|
||||||
)
|
)
|
||||||
-> m ()
|
-> m a
|
||||||
catObjectMetaDataStream repo a = withCatFileStream True repo go
|
catObjectMetaDataStream repo a = withCatFileStream True repo go
|
||||||
where
|
where
|
||||||
go c hin hout = a
|
go c hin hout = a
|
||||||
|
@ -378,8 +376,8 @@ withCatFileStream
|
||||||
:: (MonadMask m, MonadIO m)
|
:: (MonadMask m, MonadIO m)
|
||||||
=> Bool
|
=> Bool
|
||||||
-> Repo
|
-> Repo
|
||||||
-> (Chan a -> Handle -> Handle -> m ())
|
-> (Chan v -> Handle -> Handle -> m a)
|
||||||
-> m ()
|
-> m a
|
||||||
withCatFileStream check repo reader = assertLocal repo $
|
withCatFileStream check repo reader = assertLocal repo $
|
||||||
bracketIO start stop $ \(c, hin, hout, _) -> reader c hin hout
|
bracketIO start stop $ \(c, hin, hout, _) -> reader c hin hout
|
||||||
where
|
where
|
||||||
|
|
42
Logs/Web.hs
42
Logs/Web.hs
|
@ -1,10 +1,12 @@
|
||||||
{- Web url logs.
|
{- Web url logs.
|
||||||
-
|
-
|
||||||
- Copyright 2011-2014 Joey Hess <id@joeyh.name>
|
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU AGPL version 3 or higher.
|
- Licensed under the GNU AGPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
|
||||||
|
{-# LANGUAGE BangPatterns #-}
|
||||||
|
|
||||||
module Logs.Web (
|
module Logs.Web (
|
||||||
URLString,
|
URLString,
|
||||||
getUrls,
|
getUrls,
|
||||||
|
@ -28,9 +30,9 @@ import Logs
|
||||||
import Logs.Presence
|
import Logs.Presence
|
||||||
import Logs.Location
|
import Logs.Location
|
||||||
import qualified Annex.Branch
|
import qualified Annex.Branch
|
||||||
import Annex.CatFile
|
import qualified Git.LsTree
|
||||||
import qualified Git
|
import Git.CatFile (catObjectStreamLsTree)
|
||||||
import qualified Git.LsFiles
|
import Git.FilePath
|
||||||
import Utility.Url
|
import Utility.Url
|
||||||
import Annex.UUID
|
import Annex.UUID
|
||||||
import qualified Types.Remote as Remote
|
import qualified Types.Remote as Remote
|
||||||
|
@ -87,24 +89,26 @@ setUrlMissing key url = do
|
||||||
{- Finds all known urls. -}
|
{- Finds all known urls. -}
|
||||||
knownUrls :: Annex [(Key, URLString)]
|
knownUrls :: Annex [(Key, URLString)]
|
||||||
knownUrls = do
|
knownUrls = do
|
||||||
{- Ensure the git-annex branch's index file is up-to-date and
|
{- Ensure any journalled changes are committed to the git-annex
|
||||||
- any journaled changes are reflected in it, since we're going
|
- branch, since we're going to look at its tree. -}
|
||||||
- to query its index directly. -}
|
|
||||||
_ <- Annex.Branch.update
|
_ <- Annex.Branch.update
|
||||||
Annex.Branch.commit =<< Annex.Branch.commitMessage
|
Annex.Branch.commit =<< Annex.Branch.commitMessage
|
||||||
Annex.Branch.withIndex $ do
|
(l, cleanup) <- inRepo $ Git.LsTree.lsTree
|
||||||
top <- fromRepo Git.repoPath
|
Git.LsTree.LsTreeRecursive
|
||||||
(l, cleanup) <- inRepo $ Git.LsFiles.inRepoDetails [] [top]
|
Annex.Branch.fullname
|
||||||
r <- mapM getkeyurls l
|
g <- Annex.gitRepo
|
||||||
void $ liftIO cleanup
|
let want = urlLogFileKey . getTopFilePath . Git.LsTree.file
|
||||||
return $ concat r
|
catObjectStreamLsTree l want g (go [])
|
||||||
|
`finally` void (liftIO cleanup)
|
||||||
where
|
where
|
||||||
getkeyurls (f, s, _) = case urlLogFileKey f of
|
go c reader = liftIO reader >>= \case
|
||||||
Just k -> zip (repeat k) <$> geturls s
|
Just (k, Just content) ->
|
||||||
Nothing -> return []
|
let !c' = zip (repeat k) (geturls content) ++ c
|
||||||
geturls logsha =
|
in go c' reader
|
||||||
map (decodeBS . fromLogInfo) . getLog
|
Just (_, Nothing) -> go c reader
|
||||||
<$> catObject logsha
|
Nothing -> return c
|
||||||
|
|
||||||
|
geturls = map (decodeBS . fromLogInfo) . getLog
|
||||||
|
|
||||||
setTempUrl :: Key -> URLString -> Annex ()
|
setTempUrl :: Key -> URLString -> Annex ()
|
||||||
setTempUrl key url = Annex.changeState $ \s ->
|
setTempUrl key url = Annex.changeState $ \s ->
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
git-annex tries to run in a constant amount of memory, however `knownUrls`
|
||||||
|
loads all urls ever seen into a list, so the more urls there are, the more
|
||||||
|
memory `git annex importfeed` will need.
|
||||||
|
|
||||||
|
This is probably not a big problem in practice, but seems worth doing
|
||||||
|
something about if somehow possible.
|
||||||
|
|
||||||
|
Unfortunately, can't use a bloom filter, because a false positive would
|
||||||
|
prevent importing an url that has not been imported before. A sqlite
|
||||||
|
database would work, but would need to be updated whenever the git-annex
|
||||||
|
branch is changed. --[[Joey]]
|
|
@ -61,3 +61,6 @@ looked up efficiently. (Before these changes, the same key lookup was done
|
||||||
speedup when such limits are used. What that optimisation needs is a way to
|
speedup when such limits are used. What that optimisation needs is a way to
|
||||||
tell if the current limit needs the key or not. If it does, then match on
|
tell if the current limit needs the key or not. If it does, then match on
|
||||||
it after getting the key, otherwise before getting the key.
|
it after getting the key, otherwise before getting the key.
|
||||||
|
|
||||||
|
Also, importfeed could be sped up more, probably, if knownItems
|
||||||
|
streamed through cat-file --buffer.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue