importfeed: stream metadata for 5% speedup

On top of the 10% speedup from streaming url logs.
This commit is contained in:
Joey Hess 2020-07-14 14:35:26 -04:00
parent 535cdc8d48
commit 7b2d236556
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
5 changed files with 58 additions and 28 deletions

View file

@ -32,7 +32,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
* Sped up seeking for annexed files to operate on by a factor of nearly 2x.
* Sped up sync --content by 2x and other commands like fsck --fast and
whereis by around 50%, by using git cat-file --buffer.
* importfeed: Made checking known urls step around 10% faster.
* importfeed: Made checking known urls step around 15% faster.
* fsck: Detect if WORM keys contain a carriage return, and recommend
upgrading the key. (git-annex could have maybe created such keys back
in 2013).

View file

@ -6,6 +6,7 @@
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-}
module Command.ImportFeed where
@ -20,6 +21,7 @@ import Data.Time.Calendar
import Data.Time.LocalTime
import qualified Data.Text as T
import System.Log.Logger
import Control.Concurrent.Async
import Command
import qualified Annex
@ -43,6 +45,10 @@ import Annex.MetaData
import Annex.FileMatcher
import Command.AddUrl (addWorkTree)
import Annex.UntrustedFilePath
import qualified Git.Ref
import qualified Annex.Branch
import Logs
import Git.CatFile (catObjectStream)
cmd :: Command
cmd = notBareRepo $
@ -119,7 +125,7 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
( ret S.empty S.empty
, do
showStart "importfeed" "checking known urls"
(is, us) <- unzip <$> (mapM knownItems =<< knownUrls)
(is, us) <- unzip <$> knownItems
showEndOk
ret (S.fromList us) (S.fromList (concat is))
)
@ -127,13 +133,33 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
ret us is = return $ Cache us is tmpl
knownItems :: (Key, URLString) -> Annex ([ItemId], URLString)
knownItems (k, u) = do
itemids <- S.toList . S.filter (/= noneValue)
. S.map (decodeBS . fromMetaValue)
. currentMetaDataValues itemIdField
<$> getCurrentMetaData k
return (itemids, u)
knownItems :: Annex [([ItemId], URLString)]
knownItems = do
g <- Annex.gitRepo
config <- Annex.getGitConfig
catObjectStream g $ \catfeeder catcloser catreader -> do
rt <- liftIO $ async $ reader catreader []
withKnownUrls (feeder config catfeeder catcloser)
liftIO (wait rt)
where
feeder config catfeeder catcloser urlreader = urlreader >>= \case
Just (k, us) -> do
forM_ us $ \u ->
let logf = metaDataLogFile config k
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
in liftIO $ catfeeder (u, ref)
feeder config catfeeder catcloser urlreader
Nothing -> liftIO catcloser
reader catreader c = catreader >>= \case
Just (u, Just mdc) ->
let !itemids = S.toList $ S.filter (/= noneValue) $
S.map (decodeBS . fromMetaValue) $
currentMetaDataValues itemIdField $
parseCurrentMetaData mdc
in reader catreader ((itemids,u):c)
Just (u, Nothing) -> reader catreader (([],u):c)
Nothing -> return c
findDownloads :: URLString -> Feed -> [ToDownload]
findDownloads u f = catMaybes $ map mk (feedItems f)
@ -143,8 +169,8 @@ findDownloads u f = catMaybes $ map mk (feedItems f)
Just $ ToDownload f u i $ Enclosure $
T.unpack enclosureurl
Nothing -> case getItemLink i of
Just link -> Just $ ToDownload f u i $
MediaLink $ T.unpack link
Just l -> Just $ ToDownload f u i $
MediaLink $ T.unpack l
Nothing -> Nothing
{- Feeds change, so a feed download cannot be resumed. -}

View file

@ -19,13 +19,14 @@
- after the other remote redundantly set foo +x, it was unset,
- and so foo currently has no value.
-
- Copyright 2014-2019 Joey Hess <id@joeyh.name>
- Copyright 2014-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
module Logs.MetaData (
getCurrentMetaData,
parseCurrentMetaData,
getCurrentRemoteMetaData,
addMetaData,
addRemoteMetaData,
@ -47,6 +48,7 @@ import Logs.MetaData.Pure
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.ByteString.Lazy as L
{- Go through the log from oldest to newest, and combine it all
- into a single MetaData representing the current state.
@ -60,9 +62,13 @@ getCurrentMetaData = getCurrentMetaData' metaDataLogFile
getCurrentMetaData' :: (GitConfig -> Key -> RawFilePath) -> Key -> Annex MetaData
getCurrentMetaData' getlogfile k = do
config <- Annex.getGitConfig
ls <- S.toAscList <$> readLog (getlogfile config k)
let loggedmeta = logToCurrentMetaData ls
return $ currentMetaData $ unionMetaData loggedmeta
parseCurrentMetaData <$> Annex.Branch.get (getlogfile config k)
parseCurrentMetaData :: L.ByteString -> MetaData
parseCurrentMetaData content =
let ls = S.toAscList $ parseLog content
loggedmeta = logToCurrentMetaData ls
in currentMetaData $ unionMetaData loggedmeta
(lastchanged ls loggedmeta)
where
lastchanged [] _ = emptyMetaData

View file

@ -13,7 +13,7 @@ module Logs.Web (
getUrlsWithPrefix,
setUrlPresent,
setUrlMissing,
knownUrls,
withKnownUrls,
Downloader(..),
getDownloader,
setDownloader,
@ -87,8 +87,8 @@ setUrlMissing key url = do
_ -> True
{- Finds all known urls. -}
knownUrls :: Annex [(Key, URLString)]
knownUrls = do
withKnownUrls :: (Annex (Maybe (Key, [URLString])) -> Annex a) -> Annex a
withKnownUrls a = do
{- Ensure any journalled changes are committed to the git-annex
- branch, since we're going to look at its tree. -}
_ <- Annex.Branch.update
@ -98,15 +98,16 @@ knownUrls = do
Annex.Branch.fullname
g <- Annex.gitRepo
let want = urlLogFileKey . getTopFilePath . Git.LsTree.file
catObjectStreamLsTree l want g (go [])
catObjectStreamLsTree l want g (\reader -> a (go reader))
`finally` void (liftIO cleanup)
where
go c reader = liftIO reader >>= \case
go reader = liftIO reader >>= \case
Just (k, Just content) ->
let !c' = zip (repeat k) (geturls content) ++ c
in go c' reader
Just (_, Nothing) -> go c reader
Nothing -> return c
case geturls content of
[] -> go reader
us -> return (Just (k, us))
Just (_, Nothing) -> go reader
Nothing -> return Nothing
geturls = map (decodeBS . fromLogInfo) . getLog

View file

@ -61,6 +61,3 @@ looked up efficiently. (Before these changes, the same key lookup was done
speedup when such limits are used. What that optimisation needs is a way to
tell if the current limit needs the key or not. If it does, then match on
it after getting the key, otherwise before getting the key.
Also, importfeed could be sped up more, probably, if knownItems
streamed through cat-file --buffer.