importfeed: fix bug while also speeding up 12x!
* Fix bug that could make git-annex importfeed not see recently recorded state when configured with annex.alwayscommit=false.
* importfeed: Made "checking known urls" phase run 12 times faster.

The massive speedup is because it no longer queries for the metadata accompanying each url. Instead, it processes the whole git-annex branch, checks all metadata files for feed item ids, and uses any it finds.

This could result in a behavior change in an unlikely situation: if a feed item id is recorded in a key's metadata, but the url gets removed, the old code would not see that item id and would re-download the item if it found a url for it in a feed, while the new code will see the item id. I don't think the old behavior was intentional, and it may be that the new behavior is better. I am not going to worry about this.
This commit is contained in:
parent
b689f17062
commit
0547884eb2
3 changed files with 29 additions and 43 deletions
|
@ -15,6 +15,9 @@ git-annex (8.20210331) UNRELEASED; urgency=medium
|
||||||
* Fix bug caused by recent optimisations that could make git-annex not
|
* Fix bug caused by recent optimisations that could make git-annex not
|
||||||
see recently recorded status information when configured with
|
see recently recorded status information when configured with
|
||||||
annex.alwayscommit=false.
|
annex.alwayscommit=false.
|
||||||
|
* Fix bug that could make git-annex importfeed not see recently recorded
|
||||||
|
state when configured with annex.alwayscommit=false.
|
||||||
|
* importfeed: Made "checking known urls" phase run 12 times faster.
|
||||||
|
|
||||||
-- Joey Hess <id@joeyh.name> Thu, 01 Apr 2021 12:17:26 -0400
|
-- Joey Hess <id@joeyh.name> Thu, 01 Apr 2021 12:17:26 -0400
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,6 @@ import Data.Time.Format
|
||||||
import Data.Time.Calendar
|
import Data.Time.Calendar
|
||||||
import Data.Time.LocalTime
|
import Data.Time.LocalTime
|
||||||
import qualified Data.Text as T
|
import qualified Data.Text as T
|
||||||
import Control.Concurrent.Async
|
|
||||||
import qualified System.FilePath.ByteString as P
|
import qualified System.FilePath.ByteString as P
|
||||||
|
|
||||||
import Command
|
import Command
|
||||||
|
@ -45,10 +44,8 @@ import Annex.MetaData
|
||||||
import Annex.FileMatcher
|
import Annex.FileMatcher
|
||||||
import Command.AddUrl (addWorkTree)
|
import Command.AddUrl (addWorkTree)
|
||||||
import Annex.UntrustedFilePath
|
import Annex.UntrustedFilePath
|
||||||
import qualified Git.Ref
|
|
||||||
import qualified Annex.Branch
|
import qualified Annex.Branch
|
||||||
import Logs
|
import Logs
|
||||||
import Git.CatFile (catObjectStream)
|
|
||||||
|
|
||||||
cmd :: Command
|
cmd :: Command
|
||||||
cmd = notBareRepo $
|
cmd = notBareRepo $
|
||||||
|
@ -125,52 +122,38 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
|
||||||
( ret S.empty S.empty
|
( ret S.empty S.empty
|
||||||
, do
|
, do
|
||||||
showStart "importfeed" "checking known urls" (SeekInput [])
|
showStart "importfeed" "checking known urls" (SeekInput [])
|
||||||
(is, us) <- unzip <$> knownItems
|
(us, is) <- knownItems
|
||||||
showEndOk
|
showEndOk
|
||||||
ret (S.fromList us) (S.fromList (concat is))
|
ret (S.fromList us) (S.fromList is)
|
||||||
)
|
)
|
||||||
where
|
where
|
||||||
tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
|
tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
|
||||||
ret us is = return $ Cache us is tmpl
|
ret us is = return $ Cache us is tmpl
|
||||||
|
|
||||||
knownItems :: Annex [([ItemId], URLString)]
|
{- Scan all url logs and metadata logs in the branch and find urls
|
||||||
knownItems = do
|
- and ItemIds that are already known. -}
|
||||||
g <- Annex.gitRepo
|
knownItems :: Annex ([URLString], [ItemId])
|
||||||
config <- Annex.getGitConfig
|
knownItems = Annex.Branch.overBranchFileContents select (go [] [])
|
||||||
catObjectStream g $ \catfeeder catcloser catreader -> do
|
|
||||||
rt <- liftIO $ async $ reader catreader []
|
|
||||||
overKnownUrls (feeder config catfeeder catcloser)
|
|
||||||
liftIO (wait rt)
|
|
||||||
where
|
where
|
||||||
feeder config catfeeder catcloser urlreader = urlreader >>= \case
|
select f
|
||||||
Just (k, us) -> do
|
| isUrlLog f = Just ()
|
||||||
forM_ us $ \u ->
|
| isMetaDataLog f = Just ()
|
||||||
let logf = metaDataLogFile config k
|
| otherwise = Nothing
|
||||||
ref = Git.Ref.branchFileRef Annex.Branch.fullname logf
|
|
||||||
in liftIO $ catfeeder (u, ref)
|
|
||||||
feeder config catfeeder catcloser urlreader
|
|
||||||
Nothing -> liftIO catcloser
|
|
||||||
|
|
||||||
reader catreader c = catreader >>= \case
|
go uc ic reader = reader >>= \case
|
||||||
Just (u, Just mdc) ->
|
Just ((), f, Just content)
|
||||||
let !itemids = S.toList $ S.filter (/= noneValue) $
|
| isUrlLog f -> case parseUrlLog content of
|
||||||
S.map (decodeBS . fromMetaValue) $
|
[] -> go uc ic reader
|
||||||
currentMetaDataValues itemIdField $
|
us -> go (us++uc) ic reader
|
||||||
parseCurrentMetaData mdc
|
| isMetaDataLog f ->
|
||||||
in reader catreader ((itemids,u):c)
|
let s = currentMetaDataValues itemIdField $
|
||||||
Just (u, Nothing) -> reader catreader (([],u):c)
|
parseCurrentMetaData content
|
||||||
Nothing -> return c
|
in if S.null s
|
||||||
|
then go uc ic reader
|
||||||
overKnownUrls :: (Annex (Maybe (Key, [URLString])) -> Annex a) -> Annex a
|
else go uc (map (decodeBS . fromMetaValue) (S.toList s)++ic) reader
|
||||||
overKnownUrls a = Annex.Branch.overBranchFileContents urlLogFileKey (a . go)
|
| otherwise -> go uc ic reader
|
||||||
where
|
Just ((), _, Nothing) -> go uc ic reader
|
||||||
go reader = reader >>= \case
|
Nothing -> return (uc, ic)
|
||||||
Just (k, _, Just content) ->
|
|
||||||
case parseUrlLog content of
|
|
||||||
[] -> go reader
|
|
||||||
us -> return (Just (k, us))
|
|
||||||
Just (_, _, Nothing) -> go reader
|
|
||||||
Nothing -> return Nothing
|
|
||||||
|
|
||||||
findDownloads :: URLString -> Feed -> [ToDownload]
|
findDownloads :: URLString -> Feed -> [ToDownload]
|
||||||
findDownloads u f = catMaybes $ map mk (feedItems f)
|
findDownloads u f = catMaybes $ map mk (feedItems f)
|
||||||
|
|
|
@ -155,8 +155,8 @@ later write.
|
||||||
> * [[bugs/git-annex_branch_caching_bug]] was a problem, now fixed.
|
> * [[bugs/git-annex_branch_caching_bug]] was a problem, now fixed.
|
||||||
> * Any other similar direct accesses of the branch, not going through
|
> * Any other similar direct accesses of the branch, not going through
|
||||||
> Annex.Branch, also need to be fixed (and may be missing journal files
|
> Annex.Branch, also need to be fixed (and may be missing journal files
|
||||||
> already?) Command.ImportFeed.knownItems is one. Command.Log behavior
|
> already?) Most fixed now. Command.Log behavior needs to be
|
||||||
> needs to be investigated, may be ok.
|
> investigated still.
|
||||||
>
|
>
|
||||||
> * Need to implement regardingPrivateUUID and privateUUIDsKnown,
|
> * Need to implement regardingPrivateUUID and privateUUIDsKnown,
|
||||||
> which need to look at the git config to find the private uuids.
|
> which need to look at the git config to find the private uuids.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue