importfeed: stream metadata for 5% speedup
On top of the 10% speedup from streaming url logs.
This commit is contained in:
parent
535cdc8d48
commit
7b2d236556
5 changed files with 58 additions and 28 deletions
|
@ -32,7 +32,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
|
|||
* Sped up seeking for annexed files to operate on by a factor of nearly 2x.
|
||||
* Sped up sync --content by 2x and other commands like fsck --fast and
|
||||
whereis by around 50%, by using git cat-file --buffer.
|
||||
* importfeed: Made checking known urls step around 10% faster.
|
||||
* importfeed: Made checking known urls step around 15% faster.
|
||||
* fsck: Detect if WORM keys contain a carriage return, and recommend
|
||||
upgrading the key. (git-annex could have maybe created such keys back
|
||||
in 2013).
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
-}
|
||||
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
|
||||
module Command.ImportFeed where
|
||||
|
||||
|
@ -20,6 +21,7 @@ import Data.Time.Calendar
|
|||
import Data.Time.LocalTime
|
||||
import qualified Data.Text as T
|
||||
import System.Log.Logger
|
||||
import Control.Concurrent.Async
|
||||
|
||||
import Command
|
||||
import qualified Annex
|
||||
|
@ -43,6 +45,10 @@ import Annex.MetaData
|
|||
import Annex.FileMatcher
|
||||
import Command.AddUrl (addWorkTree)
|
||||
import Annex.UntrustedFilePath
|
||||
import qualified Git.Ref
|
||||
import qualified Annex.Branch
|
||||
import Logs
|
||||
import Git.CatFile (catObjectStream)
|
||||
|
||||
cmd :: Command
|
||||
cmd = notBareRepo $
|
||||
|
@ -119,7 +125,7 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
|
|||
( ret S.empty S.empty
|
||||
, do
|
||||
showStart "importfeed" "checking known urls"
|
||||
(is, us) <- unzip <$> (mapM knownItems =<< knownUrls)
|
||||
(is, us) <- unzip <$> knownItems
|
||||
showEndOk
|
||||
ret (S.fromList us) (S.fromList (concat is))
|
||||
)
|
||||
|
@ -127,13 +133,33 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
|
|||
tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
|
||||
ret us is = return $ Cache us is tmpl
|
||||
|
||||
knownItems :: (Key, URLString) -> Annex ([ItemId], URLString)
|
||||
knownItems (k, u) = do
|
||||
itemids <- S.toList . S.filter (/= noneValue)
|
||||
. S.map (decodeBS . fromMetaValue)
|
||||
. currentMetaDataValues itemIdField
|
||||
<$> getCurrentMetaData k
|
||||
return (itemids, u)
|
||||
knownItems :: Annex [([ItemId], URLString)]
|
||||
knownItems = do
|
||||
g <- Annex.gitRepo
|
||||
config <- Annex.getGitConfig
|
||||
catObjectStream g $ \catfeeder catcloser catreader -> do
|
||||
rt <- liftIO $ async $ reader catreader []
|
||||
withKnownUrls (feeder config catfeeder catcloser)
|
||||
liftIO (wait rt)
|
||||
where
|
||||
feeder config catfeeder catcloser urlreader = urlreader >>= \case
|
||||
Just (k, us) -> do
|
||||
forM_ us $ \u ->
|
||||
let logf = metaDataLogFile config k
|
||||
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
|
||||
in liftIO $ catfeeder (u, ref)
|
||||
feeder config catfeeder catcloser urlreader
|
||||
Nothing -> liftIO catcloser
|
||||
|
||||
reader catreader c = catreader >>= \case
|
||||
Just (u, Just mdc) ->
|
||||
let !itemids = S.toList $ S.filter (/= noneValue) $
|
||||
S.map (decodeBS . fromMetaValue) $
|
||||
currentMetaDataValues itemIdField $
|
||||
parseCurrentMetaData mdc
|
||||
in reader catreader ((itemids,u):c)
|
||||
Just (u, Nothing) -> reader catreader (([],u):c)
|
||||
Nothing -> return c
|
||||
|
||||
findDownloads :: URLString -> Feed -> [ToDownload]
|
||||
findDownloads u f = catMaybes $ map mk (feedItems f)
|
||||
|
@ -143,8 +169,8 @@ findDownloads u f = catMaybes $ map mk (feedItems f)
|
|||
Just $ ToDownload f u i $ Enclosure $
|
||||
T.unpack enclosureurl
|
||||
Nothing -> case getItemLink i of
|
||||
Just link -> Just $ ToDownload f u i $
|
||||
MediaLink $ T.unpack link
|
||||
Just l -> Just $ ToDownload f u i $
|
||||
MediaLink $ T.unpack l
|
||||
Nothing -> Nothing
|
||||
|
||||
{- Feeds change, so a feed download cannot be resumed. -}
|
||||
|
|
|
@ -19,13 +19,14 @@
|
|||
- after the other remote redundantly set foo +x, it was unset,
|
||||
- and so foo currently has no value.
|
||||
-
|
||||
- Copyright 2014-2019 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2014-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
||||
module Logs.MetaData (
|
||||
getCurrentMetaData,
|
||||
parseCurrentMetaData,
|
||||
getCurrentRemoteMetaData,
|
||||
addMetaData,
|
||||
addRemoteMetaData,
|
||||
|
@ -47,6 +48,7 @@ import Logs.MetaData.Pure
|
|||
|
||||
import qualified Data.Set as S
|
||||
import qualified Data.Map as M
|
||||
import qualified Data.ByteString.Lazy as L
|
||||
|
||||
{- Go through the log from oldest to newest, and combine it all
|
||||
- into a single MetaData representing the current state.
|
||||
|
@ -60,9 +62,13 @@ getCurrentMetaData = getCurrentMetaData' metaDataLogFile
|
|||
getCurrentMetaData' :: (GitConfig -> Key -> RawFilePath) -> Key -> Annex MetaData
|
||||
getCurrentMetaData' getlogfile k = do
|
||||
config <- Annex.getGitConfig
|
||||
ls <- S.toAscList <$> readLog (getlogfile config k)
|
||||
let loggedmeta = logToCurrentMetaData ls
|
||||
return $ currentMetaData $ unionMetaData loggedmeta
|
||||
parseCurrentMetaData <$> Annex.Branch.get (getlogfile config k)
|
||||
|
||||
parseCurrentMetaData :: L.ByteString -> MetaData
|
||||
parseCurrentMetaData content =
|
||||
let ls = S.toAscList $ parseLog content
|
||||
loggedmeta = logToCurrentMetaData ls
|
||||
in currentMetaData $ unionMetaData loggedmeta
|
||||
(lastchanged ls loggedmeta)
|
||||
where
|
||||
lastchanged [] _ = emptyMetaData
|
||||
|
|
19
Logs/Web.hs
19
Logs/Web.hs
|
@ -13,7 +13,7 @@ module Logs.Web (
|
|||
getUrlsWithPrefix,
|
||||
setUrlPresent,
|
||||
setUrlMissing,
|
||||
knownUrls,
|
||||
withKnownUrls,
|
||||
Downloader(..),
|
||||
getDownloader,
|
||||
setDownloader,
|
||||
|
@ -87,8 +87,8 @@ setUrlMissing key url = do
|
|||
_ -> True
|
||||
|
||||
{- Finds all known urls. -}
|
||||
knownUrls :: Annex [(Key, URLString)]
|
||||
knownUrls = do
|
||||
withKnownUrls :: (Annex (Maybe (Key, [URLString])) -> Annex a) -> Annex a
|
||||
withKnownUrls a = do
|
||||
{- Ensure any journalled changes are committed to the git-annex
|
||||
- branch, since we're going to look at its tree. -}
|
||||
_ <- Annex.Branch.update
|
||||
|
@ -98,15 +98,16 @@ knownUrls = do
|
|||
Annex.Branch.fullname
|
||||
g <- Annex.gitRepo
|
||||
let want = urlLogFileKey . getTopFilePath . Git.LsTree.file
|
||||
catObjectStreamLsTree l want g (go [])
|
||||
catObjectStreamLsTree l want g (\reader -> a (go reader))
|
||||
`finally` void (liftIO cleanup)
|
||||
where
|
||||
go c reader = liftIO reader >>= \case
|
||||
go reader = liftIO reader >>= \case
|
||||
Just (k, Just content) ->
|
||||
let !c' = zip (repeat k) (geturls content) ++ c
|
||||
in go c' reader
|
||||
Just (_, Nothing) -> go c reader
|
||||
Nothing -> return c
|
||||
case geturls content of
|
||||
[] -> go reader
|
||||
us -> return (Just (k, us))
|
||||
Just (_, Nothing) -> go reader
|
||||
Nothing -> return Nothing
|
||||
|
||||
geturls = map (decodeBS . fromLogInfo) . getLog
|
||||
|
||||
|
|
|
@ -61,6 +61,3 @@ looked up efficiently. (Before these changes, the same key lookup was done
|
|||
speedup when such limits are used. What that optimisation needs is a way to
|
||||
tell if the current limit needs the key or not. If it does, then match on
|
||||
it after getting the key, otherwise before getting the key.
|
||||
|
||||
Also, importfeed could be sped up more, probably, if knownItems
|
||||
streamed through cat-file --buffer.
|
||||
|
|
Loading…
Reference in a new issue