importfeed: stream metadata for 5% speedup

On top of the 10% speedup from streaming url logs.
This commit is contained in:
Joey Hess 2020-07-14 14:35:26 -04:00
parent 535cdc8d48
commit 7b2d236556
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
5 changed files with 58 additions and 28 deletions

View file

@ -32,7 +32,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
* Sped up seeking for annexed files to operate on by a factor of nearly 2x. * Sped up seeking for annexed files to operate on by a factor of nearly 2x.
* Sped up sync --content by 2x and other commands like fsck --fast and * Sped up sync --content by 2x and other commands like fsck --fast and
whereis by around 50%, by using git cat-file --buffer. whereis by around 50%, by using git cat-file --buffer.
* importfeed: Made checking known urls step around 10% faster. * importfeed: Made checking known urls step around 15% faster.
* fsck: Detect if WORM keys contain a carriage return, and recommend * fsck: Detect if WORM keys contain a carriage return, and recommend
upgrading the key. (git-annex could have maybe created such keys back upgrading the key. (git-annex could have maybe created such keys back
in 2013). in 2013).

View file

@ -6,6 +6,7 @@
-} -}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-}
module Command.ImportFeed where module Command.ImportFeed where
@ -20,6 +21,7 @@ import Data.Time.Calendar
import Data.Time.LocalTime import Data.Time.LocalTime
import qualified Data.Text as T import qualified Data.Text as T
import System.Log.Logger import System.Log.Logger
import Control.Concurrent.Async
import Command import Command
import qualified Annex import qualified Annex
@ -43,6 +45,10 @@ import Annex.MetaData
import Annex.FileMatcher import Annex.FileMatcher
import Command.AddUrl (addWorkTree) import Command.AddUrl (addWorkTree)
import Annex.UntrustedFilePath import Annex.UntrustedFilePath
import qualified Git.Ref
import qualified Annex.Branch
import Logs
import Git.CatFile (catObjectStream)
cmd :: Command cmd :: Command
cmd = notBareRepo $ cmd = notBareRepo $
@ -119,7 +125,7 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
( ret S.empty S.empty ( ret S.empty S.empty
, do , do
showStart "importfeed" "checking known urls" showStart "importfeed" "checking known urls"
(is, us) <- unzip <$> (mapM knownItems =<< knownUrls) (is, us) <- unzip <$> knownItems
showEndOk showEndOk
ret (S.fromList us) (S.fromList (concat is)) ret (S.fromList us) (S.fromList (concat is))
) )
@ -127,13 +133,33 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
ret us is = return $ Cache us is tmpl ret us is = return $ Cache us is tmpl
knownItems :: (Key, URLString) -> Annex ([ItemId], URLString) knownItems :: Annex [([ItemId], URLString)]
knownItems (k, u) = do knownItems = do
itemids <- S.toList . S.filter (/= noneValue) g <- Annex.gitRepo
. S.map (decodeBS . fromMetaValue) config <- Annex.getGitConfig
. currentMetaDataValues itemIdField catObjectStream g $ \catfeeder catcloser catreader -> do
<$> getCurrentMetaData k rt <- liftIO $ async $ reader catreader []
return (itemids, u) withKnownUrls (feeder config catfeeder catcloser)
liftIO (wait rt)
where
feeder config catfeeder catcloser urlreader = urlreader >>= \case
Just (k, us) -> do
forM_ us $ \u ->
let logf = metaDataLogFile config k
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
in liftIO $ catfeeder (u, ref)
feeder config catfeeder catcloser urlreader
Nothing -> liftIO catcloser
reader catreader c = catreader >>= \case
Just (u, Just mdc) ->
let !itemids = S.toList $ S.filter (/= noneValue) $
S.map (decodeBS . fromMetaValue) $
currentMetaDataValues itemIdField $
parseCurrentMetaData mdc
in reader catreader ((itemids,u):c)
Just (u, Nothing) -> reader catreader (([],u):c)
Nothing -> return c
findDownloads :: URLString -> Feed -> [ToDownload] findDownloads :: URLString -> Feed -> [ToDownload]
findDownloads u f = catMaybes $ map mk (feedItems f) findDownloads u f = catMaybes $ map mk (feedItems f)
@ -143,8 +169,8 @@ findDownloads u f = catMaybes $ map mk (feedItems f)
Just $ ToDownload f u i $ Enclosure $ Just $ ToDownload f u i $ Enclosure $
T.unpack enclosureurl T.unpack enclosureurl
Nothing -> case getItemLink i of Nothing -> case getItemLink i of
Just link -> Just $ ToDownload f u i $ Just l -> Just $ ToDownload f u i $
MediaLink $ T.unpack link MediaLink $ T.unpack l
Nothing -> Nothing Nothing -> Nothing
{- Feeds change, so a feed download cannot be resumed. -} {- Feeds change, so a feed download cannot be resumed. -}

View file

@ -19,13 +19,14 @@
- after the other remote redundantly set foo +x, it was unset, - after the other remote redundantly set foo +x, it was unset,
- and so foo currently has no value. - and so foo currently has no value.
- -
- Copyright 2014-2019 Joey Hess <id@joeyh.name> - Copyright 2014-2020 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
module Logs.MetaData ( module Logs.MetaData (
getCurrentMetaData, getCurrentMetaData,
parseCurrentMetaData,
getCurrentRemoteMetaData, getCurrentRemoteMetaData,
addMetaData, addMetaData,
addRemoteMetaData, addRemoteMetaData,
@ -47,6 +48,7 @@ import Logs.MetaData.Pure
import qualified Data.Set as S import qualified Data.Set as S
import qualified Data.Map as M import qualified Data.Map as M
import qualified Data.ByteString.Lazy as L
{- Go through the log from oldest to newest, and combine it all {- Go through the log from oldest to newest, and combine it all
- into a single MetaData representing the current state. - into a single MetaData representing the current state.
@ -60,9 +62,13 @@ getCurrentMetaData = getCurrentMetaData' metaDataLogFile
getCurrentMetaData' :: (GitConfig -> Key -> RawFilePath) -> Key -> Annex MetaData getCurrentMetaData' :: (GitConfig -> Key -> RawFilePath) -> Key -> Annex MetaData
getCurrentMetaData' getlogfile k = do getCurrentMetaData' getlogfile k = do
config <- Annex.getGitConfig config <- Annex.getGitConfig
ls <- S.toAscList <$> readLog (getlogfile config k) parseCurrentMetaData <$> Annex.Branch.get (getlogfile config k)
let loggedmeta = logToCurrentMetaData ls
return $ currentMetaData $ unionMetaData loggedmeta parseCurrentMetaData :: L.ByteString -> MetaData
parseCurrentMetaData content =
let ls = S.toAscList $ parseLog content
loggedmeta = logToCurrentMetaData ls
in currentMetaData $ unionMetaData loggedmeta
(lastchanged ls loggedmeta) (lastchanged ls loggedmeta)
where where
lastchanged [] _ = emptyMetaData lastchanged [] _ = emptyMetaData

View file

@ -13,7 +13,7 @@ module Logs.Web (
getUrlsWithPrefix, getUrlsWithPrefix,
setUrlPresent, setUrlPresent,
setUrlMissing, setUrlMissing,
knownUrls, withKnownUrls,
Downloader(..), Downloader(..),
getDownloader, getDownloader,
setDownloader, setDownloader,
@ -87,8 +87,8 @@ setUrlMissing key url = do
_ -> True _ -> True
{- Finds all known urls. -} {- Finds all known urls. -}
knownUrls :: Annex [(Key, URLString)] withKnownUrls :: (Annex (Maybe (Key, [URLString])) -> Annex a) -> Annex a
knownUrls = do withKnownUrls a = do
{- Ensure any journalled changes are committed to the git-annex {- Ensure any journalled changes are committed to the git-annex
- branch, since we're going to look at its tree. -} - branch, since we're going to look at its tree. -}
_ <- Annex.Branch.update _ <- Annex.Branch.update
@ -98,15 +98,16 @@ knownUrls = do
Annex.Branch.fullname Annex.Branch.fullname
g <- Annex.gitRepo g <- Annex.gitRepo
let want = urlLogFileKey . getTopFilePath . Git.LsTree.file let want = urlLogFileKey . getTopFilePath . Git.LsTree.file
catObjectStreamLsTree l want g (go []) catObjectStreamLsTree l want g (\reader -> a (go reader))
`finally` void (liftIO cleanup) `finally` void (liftIO cleanup)
where where
go c reader = liftIO reader >>= \case go reader = liftIO reader >>= \case
Just (k, Just content) -> Just (k, Just content) ->
let !c' = zip (repeat k) (geturls content) ++ c case geturls content of
in go c' reader [] -> go reader
Just (_, Nothing) -> go c reader us -> return (Just (k, us))
Nothing -> return c Just (_, Nothing) -> go reader
Nothing -> return Nothing
geturls = map (decodeBS . fromLogInfo) . getLog geturls = map (decodeBS . fromLogInfo) . getLog

View file

@ -61,6 +61,3 @@ looked up efficiently. (Before these changes, the same key lookup was done
speedup when such limits are used. What that optimisation needs is a way to speedup when such limits are used. What that optimisation needs is a way to
tell if the current limit needs the key or not. If it does, then match on tell if the current limit needs the key or not. If it does, then match on
it after getting the key, otherwise before getting the key. it after getting the key, otherwise before getting the key.
Also, importfeed could be sped up more, probably, if knownItems
streamed through cat-file --buffer.