importfeed: fix bug while also speeding up 12x!

* Fix bug that could make git-annex importfeed not see recently recorded
  state when configured with annex.alwayscommit=false.
* importfeed: Made "checking known urls" phase run 12 times faster.

The massive speedup comes from no longer querying the metadata of each
url's key individually. Instead, it makes a single pass over the whole
git-annex branch, collecting urls from the url logs and feed item ids from
every metadata log, and uses any it finds.

This could result in a behavior change, in an unlikely situation: if a feed
item id is recorded in a key's metadata, but the url gets removed, the old
code would not see that item id and would re-download the item if it found a
url for it in a feed, while the new code will see the item id and will not
re-download it. I don't think the old behavior was intentional, and the new
behavior may well be better, so I am not going to worry about this.
This commit is contained in:
Joey Hess 2021-04-23 12:36:56 -04:00
parent b689f17062
commit 0547884eb2
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 29 additions and 43 deletions

View file

@ -15,6 +15,9 @@ git-annex (8.20210331) UNRELEASED; urgency=medium
* Fix bug caused by recent optimisations that could make git-annex not * Fix bug caused by recent optimisations that could make git-annex not
see recently recorded status information when configured with see recently recorded status information when configured with
annex.alwayscommit=false. annex.alwayscommit=false.
* Fix bug that could make git-annex importfeed not see recently recorded
state when configured with annex.alwayscommit=false.
* importfeed: Made "checking known urls" phase run 12 times faster.
-- Joey Hess <id@joeyh.name> Thu, 01 Apr 2021 12:17:26 -0400 -- Joey Hess <id@joeyh.name> Thu, 01 Apr 2021 12:17:26 -0400

View file

@ -20,7 +20,6 @@ import Data.Time.Format
import Data.Time.Calendar import Data.Time.Calendar
import Data.Time.LocalTime import Data.Time.LocalTime
import qualified Data.Text as T import qualified Data.Text as T
import Control.Concurrent.Async
import qualified System.FilePath.ByteString as P import qualified System.FilePath.ByteString as P
import Command import Command
@ -45,10 +44,8 @@ import Annex.MetaData
import Annex.FileMatcher import Annex.FileMatcher
import Command.AddUrl (addWorkTree) import Command.AddUrl (addWorkTree)
import Annex.UntrustedFilePath import Annex.UntrustedFilePath
import qualified Git.Ref
import qualified Annex.Branch import qualified Annex.Branch
import Logs import Logs
import Git.CatFile (catObjectStream)
cmd :: Command cmd :: Command
cmd = notBareRepo $ cmd = notBareRepo $
@ -125,52 +122,38 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
( ret S.empty S.empty ( ret S.empty S.empty
, do , do
showStart "importfeed" "checking known urls" (SeekInput []) showStart "importfeed" "checking known urls" (SeekInput [])
(is, us) <- unzip <$> knownItems (us, is) <- knownItems
showEndOk showEndOk
ret (S.fromList us) (S.fromList (concat is)) ret (S.fromList us) (S.fromList is)
) )
where where
tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
ret us is = return $ Cache us is tmpl ret us is = return $ Cache us is tmpl
knownItems :: Annex [([ItemId], URLString)] {- Scan all url logs and metadata logs in the branch and find urls
knownItems = do - and ItemIds that are already known. -}
g <- Annex.gitRepo knownItems :: Annex ([URLString], [ItemId])
config <- Annex.getGitConfig knownItems = Annex.Branch.overBranchFileContents select (go [] [])
catObjectStream g $ \catfeeder catcloser catreader -> do
rt <- liftIO $ async $ reader catreader []
overKnownUrls (feeder config catfeeder catcloser)
liftIO (wait rt)
where where
feeder config catfeeder catcloser urlreader = urlreader >>= \case select f
Just (k, us) -> do | isUrlLog f = Just ()
forM_ us $ \u -> | isMetaDataLog f = Just ()
let logf = metaDataLogFile config k | otherwise = Nothing
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
in liftIO $ catfeeder (u, ref)
feeder config catfeeder catcloser urlreader
Nothing -> liftIO catcloser
reader catreader c = catreader >>= \case
Just (u, Just mdc) ->
let !itemids = S.toList $ S.filter (/= noneValue) $
S.map (decodeBS . fromMetaValue) $
currentMetaDataValues itemIdField $
parseCurrentMetaData mdc
in reader catreader ((itemids,u):c)
Just (u, Nothing) -> reader catreader (([],u):c)
Nothing -> return c
overKnownUrls :: (Annex (Maybe (Key, [URLString])) -> Annex a) -> Annex a go uc ic reader = reader >>= \case
overKnownUrls a = Annex.Branch.overBranchFileContents urlLogFileKey (a . go) Just ((), f, Just content)
where | isUrlLog f -> case parseUrlLog content of
go reader = reader >>= \case [] -> go uc ic reader
Just (k, _, Just content) -> us -> go (us++uc) ic reader
case parseUrlLog content of | isMetaDataLog f ->
[] -> go reader let s = currentMetaDataValues itemIdField $
us -> return (Just (k, us)) parseCurrentMetaData content
Just (_, _, Nothing) -> go reader in if S.null s
Nothing -> return Nothing then go uc ic reader
else go uc (map (decodeBS . fromMetaValue) (S.toList s)++ic) reader
| otherwise -> go uc ic reader
Just ((), _, Nothing) -> go uc ic reader
Nothing -> return (uc, ic)
findDownloads :: URLString -> Feed -> [ToDownload] findDownloads :: URLString -> Feed -> [ToDownload]
findDownloads u f = catMaybes $ map mk (feedItems f) findDownloads u f = catMaybes $ map mk (feedItems f)

View file

@ -155,8 +155,8 @@ later write.
> * [[bugs/git-annex_branch_caching_bug]] was a problem, now fixed. > * [[bugs/git-annex_branch_caching_bug]] was a problem, now fixed.
> * Any other similar direct accesses of the branch, not going through > * Any other similar direct accesses of the branch, not going through
> Annex.Branch, also need to be fixed (and may be missing journal files > Annex.Branch, also need to be fixed (and may be missing journal files
> already?) Command.ImportFeed.knownItems is one. Command.Log behavior > already?) Most fixed now. Command.Log behavior needs to be
> needs to be investigated, may be ok. > investigated still.
> >
> * Need to implement regardingPrivateUUID and privateUUIDsKnown, > * Need to implement regardingPrivateUUID and privateUUIDsKnown,
> which need to look at the git config to find the private uuids. > which need to look at the git config to find the private uuids.