speed up populating the importfeed database

Avoid conversion from ByteString to String for urls that will just be
converted right back to ByteString to go into the database.
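
As a standalone illustration of the round-trip this avoids (using
Data.ByteString.Char8 pack/unpack as stand-ins for git-annex's
decodeBS/encodeBS, which also deal with the filesystem encoding):

import qualified Data.ByteString.Char8 as B8

-- Before: a url read from the url log as a ByteString was decoded
-- to String, only to be re-encoded on insertion into the database.
viaString :: B8.ByteString -> B8.ByteString
viaString = B8.pack . B8.unpack

-- After: the ByteString is passed through to the database unchanged.
direct :: B8.ByteString -> B8.ByteString
direct = id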

Also setTempUrl is not used by importfeed, so avoid checking for temp
urls in this code path.
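
A simplified standalone sketch of the resulting split (the types and
bodies here are assumptions modelling the real code, which runs in the
Annex monad):

import qualified Data.Map as M
import qualified Data.ByteString.Char8 as B8

type Key = String

data State = State
	{ tempurls :: M.Map Key [String]      -- populated by setTempUrl
	, urllog :: M.Map Key [B8.ByteString] -- the url log
	}

-- Layers temp urls over the log, paying for the lookup and decoding.
getUrls :: State -> Key -> [String]
getUrls st k = M.findWithDefault [] k (tempurls st)
	++ map B8.unpack (getUrls' st k)

-- Log only: what importfeed uses, since it never calls setTempUrl.
getUrls' :: State -> Key -> [B8.ByteString]
getUrls' st k = M.findWithDefault [] k (urllog st)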

This benchmarks as only a small improvement: from 2.99s to 2.78s
when populating a database with 33k urls.
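
A rough way to reproduce that kind of measurement standalone (this is
not the benchmark used above; the url list and timing harness are made
up for illustration):

import Control.DeepSeq (force)
import Control.Exception (evaluate)
import Data.Time.Clock (diffUTCTime, getCurrentTime)
import qualified Data.ByteString.Char8 as B8

main :: IO ()
main = do
	let urls = [ B8.pack ("https://example.com/item/" ++ show n)
	           | n <- [1 :: Int .. 33000] ]
	_ <- evaluate (force urls)
	t0 <- getCurrentTime
	_ <- evaluate (force (map (B8.pack . B8.unpack) urls))
	t1 <- getCurrentTime
	_ <- evaluate (force (map id urls))
	t2 <- getCurrentTime
	putStrLn $ "via String: " ++ show (diffUTCTime t1 t0)
	putStrLn $ "ByteString: " ++ show (diffUTCTime t2 t1)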

Note that it does not seem worth replacing URLString with URLByteString
generally, because all the ways urls are used entail either parseURI,
which takes a String, or passing a parameter to e.g. curl, which is
also currently a String.
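
For example, checking an url from the log still needs a String at the
parseURI boundary (Network.URI from the network-uri package; B8.unpack
here stands in for git-annex's decodeBS):

import Network.URI (parseURI, uriScheme)
import qualified Data.ByteString.Char8 as B8

isWebUrl :: B8.ByteString -> Bool
isWebUrl u = case parseURI (B8.unpack u) of
	Just uri -> uriScheme uri `elem` ["http:", "https:"]
	Nothing -> False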

Sponsored-by: Leon Schuermann on Patreon
Joey Hess 2023-10-25 13:00:17 -04:00
parent aaeadc422a
commit c9866d2164
2 changed files with 19 additions and 12 deletions

Database/ImportFeed.hs

@@ -112,9 +112,9 @@ isKnownItemId (ImportFeedDbHandle h) i =
 			] []
 		return $ not (null l)
 
-recordKnownUrl :: ImportFeedDbHandle -> URLString -> IO ()
+recordKnownUrl :: ImportFeedDbHandle -> URLByteString -> IO ()
 recordKnownUrl h u = queueDb h $
-	void $ insertUniqueFast $ KnownUrls $ SByteString $ encodeBS u
+	void $ insertUniqueFast $ KnownUrls $ SByteString u
 
 recordKnownItemId :: ImportFeedDbHandle -> SByteString -> IO ()
 recordKnownItemId h i = queueDb h $
@@ -177,7 +177,7 @@ updateFromLog db@(ImportFeedDbHandle h) (oldtree, currtree)
 		let f = getTopFilePath (DiffTree.file ti)
 		case extLogFileKey urlLogExt f of
 			Just k -> do
-				knownurls =<< getUrls k
+				knownurls =<< getUrls' k
 			Nothing -> case extLogFileKey metaDataLogExt f of
 				Just k -> do
 					m <- getCurrentMetaData k

Logs/Web.hs

@@ -1,6 +1,6 @@
 {- Web url logs.
  -
- - Copyright 2011-2021 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2023 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
@@ -9,7 +9,9 @@
 
 module Logs.Web (
 	URLString,
+	URLByteString,
 	getUrls,
+	getUrls',
 	getUrlsWithPrefix,
 	setUrlPresent,
 	setUrlMissing,
@@ -23,6 +25,7 @@ module Logs.Web (
 ) where
 
 import qualified Data.Map as M
+import qualified Data.ByteString as S
 import qualified Data.ByteString.Lazy as L
 
 import Annex.Common
@@ -35,20 +38,27 @@ import Annex.UUID
 import qualified Annex.Branch
 import qualified Types.Remote as Remote
 
+type URLByteString = S.ByteString
+
 {- Gets all urls that a key might be available from. -}
 getUrls :: Key -> Annex [URLString]
 getUrls key = do
-	config <- Annex.getGitConfig
-	l <- go $ urlLogFile config key : oldurlLogs config key
+	l <- map decodeBS <$> getUrls' key
 	tmpl <- Annex.getState (maybeToList . M.lookup key . Annex.tempurls)
 	return (tmpl ++ l)
+
+{- Note that this does not include temporary urls set with setTempUrl. -}
+getUrls' :: Key -> Annex [URLByteString]
+getUrls' key = do
+	config <- Annex.getGitConfig
+	go $ urlLogFile config key : oldurlLogs config key
   where
 	go [] = return []
 	go (l:ls) = do
 		us <- currentLogInfo l
 		if null us
 			then go ls
-			else return $ map decodeUrlLogInfo us
+			else return $ map fromLogInfo us
 
 getUrlsWithPrefix :: Key -> String -> Annex [URLString]
 getUrlsWithPrefix key prefix = filter (prefix `isPrefixOf`)
@@ -123,10 +133,7 @@ getDownloader u = case separate (== ':') u of
 	("", u') -> (u', OtherDownloader)
 	_ -> (u, WebDownloader)
 
-decodeUrlLogInfo :: LogInfo -> URLString
-decodeUrlLogInfo = decodeBS . fromLogInfo
-
 {- Parses the content of an url log file, returning the urls that are
  - currently recorded. -}
-parseUrlLog :: L.ByteString -> [URLString]
-parseUrlLog = map decodeUrlLogInfo . getLog
+parseUrlLog :: L.ByteString -> [URLByteString]
+parseUrlLog = map fromLogInfo . getLog