importfeed: fix bug while also speeding up 12x!
* Fix bug that could make git-annex importfeed not see recently recorded state when configured with annex.alwayscommit=false. * importfeed: Made "checking known urls" phase run 12 times faster. The massive speedup is because it no longer queries for the metadata accompanying each url. Instead, it processes the whole git-annex branch, checks all metadata files for feed item ids, and uses any it finds. This could result in a behavior change in an unlikely situation: if a feed item id is recorded in a key's metadata, but the url gets removed, the old code would not see that item id and would re-download the item if it found a url for it in a feed, while the new code will see the item id. I don't think the old behavior was intentional, and it may be that the new behavior is better, so I am not going to worry about this.
This commit is contained in:
parent
b689f17062
commit
0547884eb2
3 changed files with 29 additions and 43 deletions
|
@ -15,6 +15,9 @@ git-annex (8.20210331) UNRELEASED; urgency=medium
|
|||
* Fix bug caused by recent optimisations that could make git-annex not
|
||||
see recently recorded status information when configured with
|
||||
annex.alwayscommit=false.
|
||||
* Fix bug that could make git-annex importfeed not see recently recorded
|
||||
state when configured with annex.alwayscommit=false.
|
||||
* importfeed: Made "checking known urls" phase run 12 times faster.
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Thu, 01 Apr 2021 12:17:26 -0400
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@ import Data.Time.Format
|
|||
import Data.Time.Calendar
|
||||
import Data.Time.LocalTime
|
||||
import qualified Data.Text as T
|
||||
import Control.Concurrent.Async
|
||||
import qualified System.FilePath.ByteString as P
|
||||
|
||||
import Command
|
||||
|
@ -45,10 +44,8 @@ import Annex.MetaData
|
|||
import Annex.FileMatcher
|
||||
import Command.AddUrl (addWorkTree)
|
||||
import Annex.UntrustedFilePath
|
||||
import qualified Git.Ref
|
||||
import qualified Annex.Branch
|
||||
import Logs
|
||||
import Git.CatFile (catObjectStream)
|
||||
|
||||
cmd :: Command
|
||||
cmd = notBareRepo $
|
||||
|
@ -125,52 +122,38 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
|
|||
( ret S.empty S.empty
|
||||
, do
|
||||
showStart "importfeed" "checking known urls" (SeekInput [])
|
||||
(is, us) <- unzip <$> knownItems
|
||||
(us, is) <- knownItems
|
||||
showEndOk
|
||||
ret (S.fromList us) (S.fromList (concat is))
|
||||
ret (S.fromList us) (S.fromList is)
|
||||
)
|
||||
where
|
||||
tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
|
||||
ret us is = return $ Cache us is tmpl
|
||||
|
||||
knownItems :: Annex [([ItemId], URLString)]
|
||||
knownItems = do
|
||||
g <- Annex.gitRepo
|
||||
config <- Annex.getGitConfig
|
||||
catObjectStream g $ \catfeeder catcloser catreader -> do
|
||||
rt <- liftIO $ async $ reader catreader []
|
||||
overKnownUrls (feeder config catfeeder catcloser)
|
||||
liftIO (wait rt)
|
||||
{- Scan all url logs and metadata logs in the branch and find urls
|
||||
- and ItemIds that are already known. -}
|
||||
knownItems :: Annex ([URLString], [ItemId])
|
||||
knownItems = Annex.Branch.overBranchFileContents select (go [] [])
|
||||
where
|
||||
feeder config catfeeder catcloser urlreader = urlreader >>= \case
|
||||
Just (k, us) -> do
|
||||
forM_ us $ \u ->
|
||||
let logf = metaDataLogFile config k
|
||||
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
|
||||
in liftIO $ catfeeder (u, ref)
|
||||
feeder config catfeeder catcloser urlreader
|
||||
Nothing -> liftIO catcloser
|
||||
|
||||
reader catreader c = catreader >>= \case
|
||||
Just (u, Just mdc) ->
|
||||
let !itemids = S.toList $ S.filter (/= noneValue) $
|
||||
S.map (decodeBS . fromMetaValue) $
|
||||
currentMetaDataValues itemIdField $
|
||||
parseCurrentMetaData mdc
|
||||
in reader catreader ((itemids,u):c)
|
||||
Just (u, Nothing) -> reader catreader (([],u):c)
|
||||
Nothing -> return c
|
||||
select f
|
||||
| isUrlLog f = Just ()
|
||||
| isMetaDataLog f = Just ()
|
||||
| otherwise = Nothing
|
||||
|
||||
overKnownUrls :: (Annex (Maybe (Key, [URLString])) -> Annex a) -> Annex a
|
||||
overKnownUrls a = Annex.Branch.overBranchFileContents urlLogFileKey (a . go)
|
||||
where
|
||||
go reader = reader >>= \case
|
||||
Just (k, _, Just content) ->
|
||||
case parseUrlLog content of
|
||||
[] -> go reader
|
||||
us -> return (Just (k, us))
|
||||
Just (_, _, Nothing) -> go reader
|
||||
Nothing -> return Nothing
|
||||
go uc ic reader = reader >>= \case
|
||||
Just ((), f, Just content)
|
||||
| isUrlLog f -> case parseUrlLog content of
|
||||
[] -> go uc ic reader
|
||||
us -> go (us++uc) ic reader
|
||||
| isMetaDataLog f ->
|
||||
let s = currentMetaDataValues itemIdField $
|
||||
parseCurrentMetaData content
|
||||
in if S.null s
|
||||
then go uc ic reader
|
||||
else go uc (map (decodeBS . fromMetaValue) (S.toList s)++ic) reader
|
||||
| otherwise -> go uc ic reader
|
||||
Just ((), _, Nothing) -> go uc ic reader
|
||||
Nothing -> return (uc, ic)
|
||||
|
||||
findDownloads :: URLString -> Feed -> [ToDownload]
|
||||
findDownloads u f = catMaybes $ map mk (feedItems f)
|
||||
|
|
|
@ -155,8 +155,8 @@ later write.
|
|||
> * [[bugs/git-annex_branch_caching_bug]] was a problem, now fixed.
|
||||
> * Any other similar direct accesses of the branch, not going through
|
||||
> Annex.Branch, also need to be fixed (and may be missing journal files
|
||||
> already?) Command.ImportFeed.knownItems is one. Command.Log behavior
|
||||
> needs to be investigated, may be ok.
|
||||
> already?) Most fixed now. Command.Log behavior needs to be
|
||||
> investigated still.
|
||||
>
|
||||
> * Need to implement regardingPrivateUUID and privateUUIDsKnown,
|
||||
> which need to look at the git config to find the private uuids.
|
||||
|
|
Loading…
Add table
Reference in a new issue