importfeed: fix bug while also speeding up 12x!

* Fix bug that could make git-annex importfeed not see recently recorded
  state when configured with annex.alwayscommit=false.
* importfeed: Made "checking known urls" phase run 12 times faster.

The massive speedup is because it no longer queries for metadata
accompanying each url. Instead it processes the whole git-annex branch and
checks all metadata files for feed item ids, and uses any it finds.

This could result in a behavior change, in an unlikely situation: If a feed
item id is recorded in a key's metadata, but the url gets removed, the old
code would not see that item id and would re-download the item if it found a
url for it in a feed, while the new code will see the item id and so will
not re-download it. I don't think the old behavior was intentional, and it
may be that the new behavior is better. Not gonna worry about this.
This commit is contained in:
Joey Hess 2021-04-23 12:36:56 -04:00
parent b689f17062
commit 0547884eb2
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 29 additions and 43 deletions

View file

@ -15,6 +15,9 @@ git-annex (8.20210331) UNRELEASED; urgency=medium
* Fix bug caused by recent optimisations that could make git-annex not
see recently recorded status information when configured with
annex.alwayscommit=false.
* Fix bug that could make git-annex importfeed not see recently recorded
state when configured with annex.alwayscommit=false.
* importfeed: Made "checking known urls" phase run 12 times faster.
-- Joey Hess <id@joeyh.name> Thu, 01 Apr 2021 12:17:26 -0400

View file

@ -20,7 +20,6 @@ import Data.Time.Format
import Data.Time.Calendar
import Data.Time.LocalTime
import qualified Data.Text as T
import Control.Concurrent.Async
import qualified System.FilePath.ByteString as P
import Command
@ -45,10 +44,8 @@ import Annex.MetaData
import Annex.FileMatcher
import Command.AddUrl (addWorkTree)
import Annex.UntrustedFilePath
import qualified Git.Ref
import qualified Annex.Branch
import Logs
import Git.CatFile (catObjectStream)
cmd :: Command
cmd = notBareRepo $
@ -125,52 +122,38 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
( ret S.empty S.empty
, do
showStart "importfeed" "checking known urls" (SeekInput [])
(is, us) <- unzip <$> knownItems
(us, is) <- knownItems
showEndOk
ret (S.fromList us) (S.fromList (concat is))
ret (S.fromList us) (S.fromList is)
)
where
tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
ret us is = return $ Cache us is tmpl
knownItems :: Annex [([ItemId], URLString)]
knownItems = do
g <- Annex.gitRepo
config <- Annex.getGitConfig
catObjectStream g $ \catfeeder catcloser catreader -> do
rt <- liftIO $ async $ reader catreader []
overKnownUrls (feeder config catfeeder catcloser)
liftIO (wait rt)
{- Scan all url logs and metadata logs in the branch and find urls
- and ItemIds that are already known. -}
knownItems :: Annex ([URLString], [ItemId])
knownItems = Annex.Branch.overBranchFileContents select (go [] [])
where
feeder config catfeeder catcloser urlreader = urlreader >>= \case
Just (k, us) -> do
forM_ us $ \u ->
let logf = metaDataLogFile config k
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
in liftIO $ catfeeder (u, ref)
feeder config catfeeder catcloser urlreader
Nothing -> liftIO catcloser
reader catreader c = catreader >>= \case
Just (u, Just mdc) ->
let !itemids = S.toList $ S.filter (/= noneValue) $
S.map (decodeBS . fromMetaValue) $
currentMetaDataValues itemIdField $
parseCurrentMetaData mdc
in reader catreader ((itemids,u):c)
Just (u, Nothing) -> reader catreader (([],u):c)
Nothing -> return c
-- Filter handed to Annex.Branch.overBranchFileContents: only url logs and
-- metadata logs are selected. The unit value is a placeholder; the file
-- kind is re-checked from the file name when the content is processed.
select f
| isUrlLog f = Just ()
| isMetaDataLog f = Just ()
| otherwise = Nothing
overKnownUrls :: (Annex (Maybe (Key, [URLString])) -> Annex a) -> Annex a
overKnownUrls a = Annex.Branch.overBranchFileContents urlLogFileKey (a . go)
where
go reader = reader >>= \case
Just (k, _, Just content) ->
case parseUrlLog content of
[] -> go reader
us -> return (Just (k, us))
Just (_, _, Nothing) -> go reader
Nothing -> return Nothing
-- Repeatedly pulls the next selected branch file from reader, accumulating
-- urls in uc and feed item ids in ic, and returns both lists once the
-- reader yields Nothing.
go uc ic reader = reader >>= \case
Just ((), f, Just content)
-- Url log: prepend every url parsed from it; a log that parses to no
-- urls contributes nothing.
| isUrlLog f -> case parseUrlLog content of
[] -> go uc ic reader
us -> go (us++uc) ic reader
-- Metadata log: collect the current values of the item id field, if any.
| isMetaDataLog f ->
let s = currentMetaDataValues itemIdField $
parseCurrentMetaData content
in if S.null s
then go uc ic reader
else go uc (map (decodeBS . fromMetaValue) (S.toList s)++ic) reader
-- NOTE(review): select only admits url and metadata logs, so this
-- branch is presumably unreachable -- confirm.
| otherwise -> go uc ic reader
-- A selected file for which no content was provided is skipped.
Just ((), _, Nothing) -> go uc ic reader
-- Presumably signals that the reader has no more files.
Nothing -> return (uc, ic)
findDownloads :: URLString -> Feed -> [ToDownload]
findDownloads u f = catMaybes $ map mk (feedItems f)

View file

@ -155,8 +155,8 @@ later write.
> * [[bugs/git-annex_branch_caching_bug]] was a problem, now fixed.
> * Any other similar direct accesses of the branch, not going through
> Annex.Branch, also need to be fixed (and may be missing journal files
> already?) Command.ImportFeed.knownItems is one. Command.Log behavior
> needs to be investigated, may be ok.
> already?) Most fixed now. Command.Log behavior needs to be
> investigated still.
>
> * Need to implement regardingPrivateUUID and privateUUIDsKnown,
> which need to look at the git config to find the private uuids.