importfeed: stream metadata for 5% speedup
On top of the 10% speedup from streaming url logs.
parent 535cdc8d48
commit 7b2d236556
5 changed files with 58 additions and 28 deletions
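
As background for the diff below: the speedup comes from the same feeder/reader technique as the earlier url-log streaming, pumping requests into a long-running `git cat-file` process while a concurrent reader drains the replies. A minimal standalone sketch of that pattern follows; it uses only the standard process and async libraries, not git-annex's internal catObjectStream API, and the object names in it are placeholders.

{-# LANGUAGE LambdaCase #-}

import Control.Concurrent.Async (async, wait)
import System.IO
import System.Process

-- Run from inside a git repository; the object names below are
-- placeholders and assume text-valued objects.
main :: IO ()
main = do
    (Just hin, Just hout, _, p) <- createProcess
        (proc "git" ["cat-file", "--batch", "--buffer"])
            { std_in = CreatePipe, std_out = CreatePipe }
    hSetBuffering hin LineBuffering
    -- Reader runs concurrently with the feeder below, so neither side
    -- blocks on a full pipe buffer.
    rt <- async (collect hout [])
    -- Feeder: stream requests without waiting for replies.
    mapM_ (hPutStrLn hin) ["HEAD:CHANGELOG", "HEAD:README"]
    -- Closing stdin lets cat-file flush and exit, so the reader sees EOF.
    hClose hin
    out <- wait rt
    mapM_ putStrLn (reverse out)
    _ <- waitForProcess p
    return ()
  where
    collect h acc = hIsEOF h >>= \case
        True -> return acc
        False -> do
            l <- hGetLine h
            collect h (l : acc)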

CHANGELOG

@@ -32,7 +32,7 @@ git-annex (8.20200618) UNRELEASED; urgency=medium
 * Sped up seeking for annexed files to operate on by a factor of nearly 2x.
 * Sped up sync --content by 2x and other commands like fsck --fast and
   whereis by around 50%, by using git cat-file --buffer.
-* importfeed: Made checking known urls step around 10% faster.
+* importfeed: Made checking known urls step around 15% faster.
 * fsck: Detect if WORM keys contain a carriage return, and recommend
   upgrading the key. (git-annex could have maybe created such keys back
   in 2013).

Command/ImportFeed.hs

@@ -6,6 +6,7 @@
 -}
 
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE BangPatterns #-}
 
 module Command.ImportFeed where
 
@@ -20,6 +21,7 @@ import Data.Time.Calendar
 import Data.Time.LocalTime
 import qualified Data.Text as T
 import System.Log.Logger
+import Control.Concurrent.Async
 
 import Command
 import qualified Annex
@@ -43,6 +45,10 @@ import Annex.MetaData
 import Annex.FileMatcher
 import Command.AddUrl (addWorkTree)
 import Annex.UntrustedFilePath
+import qualified Git.Ref
+import qualified Annex.Branch
+import Logs
+import Git.CatFile (catObjectStream)
 
 cmd :: Command
 cmd = notBareRepo $
@@ -119,7 +125,7 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
 	( ret S.empty S.empty
 	, do
 		showStart "importfeed" "checking known urls"
-		(is, us) <- unzip <$> (mapM knownItems =<< knownUrls)
+		(is, us) <- unzip <$> knownItems
 		showEndOk
 		ret (S.fromList us) (S.fromList (concat is))
 	)
@@ -127,13 +133,33 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
 	tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
 	ret us is = return $ Cache us is tmpl
 
-knownItems :: (Key, URLString) -> Annex ([ItemId], URLString)
-knownItems (k, u) = do
-	itemids <- S.toList . S.filter (/= noneValue)
-		. S.map (decodeBS . fromMetaValue)
-		. currentMetaDataValues itemIdField
-		<$> getCurrentMetaData k
-	return (itemids, u)
+knownItems :: Annex [([ItemId], URLString)]
+knownItems = do
+	g <- Annex.gitRepo
+	config <- Annex.getGitConfig
+	catObjectStream g $ \catfeeder catcloser catreader -> do
+		rt <- liftIO $ async $ reader catreader []
+		withKnownUrls (feeder config catfeeder catcloser)
+		liftIO (wait rt)
+  where
+	feeder config catfeeder catcloser urlreader = urlreader >>= \case
+		Just (k, us) -> do
+			forM_ us $ \u ->
+				let logf = metaDataLogFile config k
+				    ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
+				in liftIO $ catfeeder (u, ref)
+			feeder config catfeeder catcloser urlreader
+		Nothing -> liftIO catcloser
+
+	reader catreader c = catreader >>= \case
+		Just (u, Just mdc) ->
+			let !itemids = S.toList $ S.filter (/= noneValue) $
+				S.map (decodeBS . fromMetaValue) $
+				currentMetaDataValues itemIdField $
+				parseCurrentMetaData mdc
+			in reader catreader ((itemids,u):c)
+		Just (u, Nothing) -> reader catreader (([],u):c)
+		Nothing -> return c
 
 findDownloads :: URLString -> Feed -> [ToDownload]
 findDownloads u f = catMaybes $ map mk (feedItems f)
@@ -143,8 +169,8 @@ findDownloads u f = catMaybes $ map mk (feedItems f)
 		Just $ ToDownload f u i $ Enclosure $
 			T.unpack enclosureurl
 		Nothing -> case getItemLink i of
-			Just link -> Just $ ToDownload f u i $
-				MediaLink $ T.unpack link
+			Just l -> Just $ ToDownload f u i $
+				MediaLink $ T.unpack l
 			Nothing -> Nothing
 
 {- Feeds change, so a feed download cannot be resumed. -}
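
A note on the new BangPatterns pragma and the `!itemids` binding above: forcing the parsed item ids inside the reader thread means the accumulated list holds evaluated values rather than thunks that would retain each metadata ByteString read from the stream. A toy sketch of that strictness idea (not git-annex code):

{-# LANGUAGE BangPatterns #-}

-- Toy strictness demo: the strict accumulator is forced on every step,
-- so no chain of (+) thunks builds up while the list is consumed.
sumStrict :: [Int] -> Int
sumStrict = go 0
  where
    go !acc [] = acc
    go !acc (x:xs) = go (acc + x) xs

main :: IO ()
main = print (sumStrict [1 .. 1000000])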

Logs/MetaData.hs

@@ -19,13 +19,14 @@
  - after the other remote redundantly set foo +x, it was unset,
  - and so foo currently has no value.
  -
- - Copyright 2014-2019 Joey Hess <id@joeyh.name>
+ - Copyright 2014-2020 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
 -}
 
 module Logs.MetaData (
 	getCurrentMetaData,
+	parseCurrentMetaData,
 	getCurrentRemoteMetaData,
 	addMetaData,
 	addRemoteMetaData,
@@ -47,6 +48,7 @@ import Logs.MetaData.Pure
 
 import qualified Data.Set as S
 import qualified Data.Map as M
+import qualified Data.ByteString.Lazy as L
 
 {- Go through the log from oldest to newest, and combine it all
  - into a single MetaData representing the current state.
@@ -60,9 +62,13 @@ getCurrentMetaData = getCurrentMetaData' metaDataLogFile
 getCurrentMetaData' :: (GitConfig -> Key -> RawFilePath) -> Key -> Annex MetaData
 getCurrentMetaData' getlogfile k = do
 	config <- Annex.getGitConfig
-	ls <- S.toAscList <$> readLog (getlogfile config k)
-	let loggedmeta = logToCurrentMetaData ls
-	return $ currentMetaData $ unionMetaData loggedmeta
+	parseCurrentMetaData <$> Annex.Branch.get (getlogfile config k)
+
+parseCurrentMetaData :: L.ByteString -> MetaData
+parseCurrentMetaData content =
+	let ls = S.toAscList $ parseLog content
+	    loggedmeta = logToCurrentMetaData ls
+	in currentMetaData $ unionMetaData loggedmeta
 		(lastchanged ls loggedmeta)
   where
 	lastchanged [] _ = emptyMetaData
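
The Logs.MetaData change is a common refactoring shape: the pure parsing step is hoisted out of the IO action, so the same parser can run on log content delivered by the cat-file stream instead of each key's log being read separately. A schematic sketch with assumed names (parseThing stands in for the real metadata log parser, and metadata.log is a hypothetical input file):

import qualified Data.ByteString.Lazy as L

-- Pure parser, reusable on streamed content.
parseThing :: L.ByteString -> Int
parseThing = fromIntegral . L.length

-- The old entry point becomes a thin IO wrapper around the pure core.
readThing :: FilePath -> IO Int
readThing f = parseThing <$> L.readFile f

main :: IO ()
main = readThing "metadata.log" >>= print  -- assumes this file exists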

Logs/Web.hs (21 lines changed)

@@ -13,7 +13,7 @@ module Logs.Web (
 	getUrlsWithPrefix,
 	setUrlPresent,
 	setUrlMissing,
-	knownUrls,
+	withKnownUrls,
 	Downloader(..),
 	getDownloader,
 	setDownloader,
@@ -87,8 +87,8 @@ setUrlMissing key url = do
 		_ -> True
 
 {- Finds all known urls. -}
-knownUrls :: Annex [(Key, URLString)]
-knownUrls = do
+withKnownUrls :: (Annex (Maybe (Key, [URLString])) -> Annex a) -> Annex a
+withKnownUrls a = do
 	{- Ensure any journalled changes are committed to the git-annex
 	 - branch, since we're going to look at its tree. -}
 	_ <- Annex.Branch.update
@@ -98,15 +98,16 @@
 		Annex.Branch.fullname
 	g <- Annex.gitRepo
 	let want = urlLogFileKey . getTopFilePath . Git.LsTree.file
-	catObjectStreamLsTree l want g (go [])
+	catObjectStreamLsTree l want g (\reader -> a (go reader))
 		`finally` void (liftIO cleanup)
   where
-	go c reader = liftIO reader >>= \case
+	go reader = liftIO reader >>= \case
 		Just (k, Just content) ->
-			let !c' = zip (repeat k) (geturls content) ++ c
-			in go c' reader
-		Just (_, Nothing) -> go c reader
-		Nothing -> return c
+			case geturls content of
+				[] -> go reader
+				us -> return (Just (k, us))
+		Just (_, Nothing) -> go reader
+		Nothing -> return Nothing
 
 	geturls = map (decodeBS . fromLogInfo) . getLog
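
The knownUrls-to-withKnownUrls conversion is the classic with-style resource pattern: the reader is only valid while the underlying cat-file process is alive, so the consumer runs inside the bracket instead of receiving a fully materialized list. A minimal sketch of that shape, with a toy in-memory Source standing in for the real stream:

{-# LANGUAGE LambdaCase #-}

import Control.Exception (bracket)
import Data.IORef

-- Toy streaming source: each call to the returned action yields the
-- next item, or Nothing when exhausted.
newSource :: [a] -> IO (IO (Maybe a))
newSource xs = do
    r <- newIORef xs
    return $ atomicModifyIORef' r $ \case
        [] -> ([], Nothing)
        (y:ys) -> (ys, Just y)

-- With-style API: the reader is only usable inside the callback,
-- mirroring withKnownUrls. The release action is a no-op here; in the
-- real code it tears down the cat-file process.
withSource :: [a] -> (IO (Maybe a) -> IO b) -> IO b
withSource xs = bracket (newSource xs) (\_ -> return ())

main :: IO ()
main = withSource ["https://example.com/a", "https://example.com/b"] $ \reader ->
    let loop = reader >>= \case
            Nothing -> return ()
            Just u -> putStrLn u >> loop
    in loop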

@@ -61,6 +61,3 @@ looked up efficiently. (Before these changes, the same key lookup was done
 speedup when such limits are used. What that optimisation needs is a way to
 tell if the current limit needs the key or not. If it does, then match on
 it after getting the key, otherwise before getting the key.
-
-Also, importfeed could be sped up more, probably, if knownItems
-streamed through cat-file --buffer.