importfeed: Avoid downloading a redundant item from a feed whose guid has been downloaded before, even when the url has changed.

To support this, always store itemid in metadata; before this was only done
when annex.genmetadata was set.
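
In outline: importfeed already skipped urls it had downloaded before; this commit additionally skips any item whose guid is already recorded in the metadata of a known key, so an item republished at a new url is not fetched again. A minimal standalone sketch of the combined check (simplified, hypothetical Cache type; the real code below threads this through the Annex monad):

    import qualified Data.Set as S

    type URLString = String
    type ItemId = String

    -- Simplified stand-in for the Cache that getCache builds below.
    data Cache = Cache
        { knownurls :: S.Set URLString
        , knownitems :: S.Set ItemId
        }

    -- An item is redundant if its guid was seen before (even at a
    -- new url), or failing that, if its url was seen before.
    alreadyDownloaded :: Cache -> Maybe ItemId -> URLString -> Bool
    alreadyDownloaded cache mguid url =
        maybe False (`S.member` knownitems cache) mguid
            || S.member url (knownurls cache)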
Joey Hess 2015-03-31 13:29:51 -04:00
parent 86b66758c2
commit 9e25cbde20
6 changed files with 54 additions and 14 deletions

View file

@@ -73,7 +73,7 @@ perform opts cache url = do
 	v <- findDownloads url
 	case v of
 		[] -> do
-			feedProblem url "bad feed content"
+			feedProblem url "bad feed content; no enclosures to download"
 			next $ return True
 		l -> do
 			ok <- and <$> mapM (performDownload opts cache) l
@@ -96,21 +96,32 @@ data ToDownload = ToDownload
 
 data DownloadLocation = Enclosure URLString | QuviLink URLString
 
+type ItemId = String
+
 data Cache = Cache
 	{ knownurls :: S.Set URLString
+	, knownitems :: S.Set ItemId
 	, template :: Utility.Format.Format
 	}
 
 getCache :: Maybe String -> Annex Cache
 getCache opttemplate = ifM (Annex.getState Annex.force)
-	( ret S.empty
+	( ret S.empty S.empty
 	, do
 		showSideAction "checking known urls"
-		ret =<< S.fromList <$> knownUrls
+		(is, us) <- unzip <$> (mapM knownItems =<< knownUrls)
+		ret (S.fromList us) (S.fromList (concat is))
 	)
   where
	tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
-	ret s = return $ Cache s tmpl
+	ret us is = return $ Cache us is tmpl
+
+knownItems :: (Key, URLString) -> Annex ([ItemId], URLString)
+knownItems (k, u) = do
+	itemids <- S.toList . S.filter (/= noneValue) . S.map fromMetaValue
+		. currentMetaDataValues itemIdField
+		<$> getCurrentMetaData k
+	return (itemids, u)
 
 findDownloads :: URLString -> Annex [ToDownload]
 findDownloads u = go =<< downloadFeed u
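
knownItems reads itemids back out of each key's metadata, filtering out the "none" placeholder that extractField (later in this file) records when a feed item lacks a value, so the placeholder never enters the known-items set. A pure sketch of just that filter step, with a plain Set of strings standing in for the real MetaData values:

    import qualified Data.Set as S

    noneValue :: String
    noneValue = "none"

    -- Values stored under the "itemid" metadata field for one key;
    -- only real guids count, never the "none" placeholder.
    itemIdsOf :: S.Set String -> [String]
    itemIdsOf = S.toList . S.filter (/= noneValue)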
@@ -191,12 +202,18 @@ performDownload opts cache todownload = case location todownload of
   where
 	forced = Annex.getState Annex.force
 
-	{- Avoids downloading any urls that are already known to be
+	{- Avoids downloading any items that are already known to be
 	 - associated with a file in the annex, unless forced. -}
 	checkknown url a
-		| S.member url (knownurls cache) = ifM forced (a, return True)
+		| knownitemid || S.member url (knownurls cache)
+			= ifM forced (a, return True)
 		| otherwise = a
 
+	knownitemid = case getItemId (item todownload) of
+		-- only when it's a permalink
+		Just (True, itemid) -> S.member itemid (knownitems cache)
+		_ -> False
+
 	rundownload url extension getter = do
 		dest <- makeunique url (1 :: Integer) $
 			feedFile (template cache) todownload extension
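
Note the knownitemid guard only trusts the guid when the feed flags it as a permalink: getItemId (the feed library's Text.Feed.Query.getItemId) pairs the guid with a Bool that, for RSS, reflects the guid's isPermaLink attribute. The same guard in isolation (hypothetical helper, simplified types):

    import qualified Data.Set as S

    -- A cached itemid suppresses the download only when the feed
    -- marked the guid as a permalink, i.e. getItemId returned
    -- Just (True, itemid).
    knownItemId :: S.Set String -> Maybe (Bool, String) -> Bool
    knownItemId known (Just (True, itemid)) = itemid `S.member` known
    knownItemId _ _ = False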
@@ -211,8 +228,10 @@ performDownload opts cache todownload = case location todownload of
 					checkFeedBroken (feedurl todownload)
 				else do
 					forM_ ks $ \key ->
-						whenM (annexGenMetaData <$> Annex.getGitConfig) $
-							addMetaData key $ extractMetaData todownload
+						ifM (annexGenMetaData <$> Annex.getGitConfig)
+							( addMetaData key $ extractMetaData todownload
+							, addMetaData key $ minimalMetaData todownload
+							)
 					showEndOk
 					return True
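
With this hunk, every imported key gets at least some metadata: the full extractMetaData when annex.genmetadata is set, minimalMetaData (just the itemid) otherwise. A sketch of the selection with MetaData modeled as a plain Map (a hypothetical simplification of the real types):

    import qualified Data.Map as M

    type SimpleMetaData = M.Map String String

    -- genmetadata on: store everything extracted from the feed;
    -- off: only the itemid, which the guid dedup needs on later runs.
    metaDataToAdd :: Bool -> SimpleMetaData -> Maybe String -> SimpleMetaData
    metaDataToAdd genMetaData extracted mitemid
        | genMetaData = extracted
        | otherwise = maybe M.empty (M.singleton "itemid") mitemid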
@@ -275,6 +294,12 @@ extractMetaData i = meta
 	tometa (k, v) = (mkMetaFieldUnchecked k, S.singleton (toMetaValue v))
 	meta = MetaData $ M.fromList $ map tometa $ extractFields i
 
+minimalMetaData :: ToDownload -> MetaData
+minimalMetaData i = case getItemId (item i) of
+	(Nothing) -> emptyMetaData
+	(Just (_, itemid)) -> MetaData $ M.singleton itemIdField
+		(S.singleton $ toMetaValue itemid)
+
 {- Extract fields from the feed and item, that are both used as metadata,
  - and to generate the filename. -}
 extractFields :: ToDownload -> [(String, String)]
@@ -296,12 +321,18 @@ extractFields i = map (uncurry extractField)
 	feedauthor = getFeedAuthor $ feed i
 	itemauthor = getItemAuthor $ item i
 
+itemIdField :: MetaField
+itemIdField = mkMetaFieldUnchecked "itemid"
+
 extractField :: String -> [Maybe String] -> (String, String)
-extractField k [] = (k, "none")
+extractField k [] = (k, noneValue)
 extractField k (Just v:_)
 	| not (null v) = (k, v)
 extractField k (_:rest) = extractField k rest
 
+noneValue :: String
+noneValue = "none"
+
 {- Called when there is a problem with a feed.
 - Throws an error if the feed is broken, otherwise shows a warning. -}
 feedProblem :: URLString -> String -> Annex ()

View file

@@ -21,7 +21,6 @@ module Logs.Web (
 import qualified Data.ByteString.Lazy.Char8 as L
 import qualified Data.Map as M
-import Data.Tuple.Utils
 
 import Common.Annex
 import qualified Annex
@@ -70,7 +69,7 @@ setUrlMissing uuid key url = do
 	logChange key uuid InfoMissing
 
 {- Finds all known urls. -}
-knownUrls :: Annex [URLString]
+knownUrls :: Annex [(Key, URLString)]
 knownUrls = do
 	{- Ensure the git-annex branch's index file is up-to-date and
 	 - any journaled changes are reflected in it, since we're going
@@ -80,10 +79,13 @@ knownUrls = do
 	Annex.Branch.withIndex $ do
 		top <- fromRepo Git.repoPath
 		(l, cleanup) <- inRepo $ Git.LsFiles.stagedDetails [top]
-		r <- mapM (geturls . snd3) $ filter (isUrlLog . fst3) l
+		r <- mapM getkeyurls l
 		void $ liftIO cleanup
 		return $ concat r
   where
+	getkeyurls (f, s, _) = case urlLogFileKey f of
+		Just k -> zip (repeat k) <$> geturls s
+		Nothing -> return []
 	geturls Nothing = return []
 	geturls (Just logsha) = getLog . L.unpack <$> catObject logsha
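
knownUrls now tags each logged url with the key whose url log it came from, which is what lets importfeed's knownItems look up per-key metadata. A pure model of that shape, with the git plumbing replaced by plain lists and a caller-supplied path-to-key function (both hypothetical stand-ins for stagedDetails and urlLogFileKey):

    -- For each staged url-log file, map its path back to a key and
    -- pair that key with every url recorded in the log.
    collectKeyUrls :: (FilePath -> Maybe key) -> [(FilePath, [url])] -> [(key, url)]
    collectKeyUrls fileToKey staged = concatMap go staged
      where
        go (f, urls) = case fileToKey f of
            Just k -> zip (repeat k) urls
            Nothing -> []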

debian/changelog
View file

@@ -8,6 +8,10 @@ git-annex (5.20150328) UNRELEASED; urgency=medium
   * Fix GETURLS in external special remote protocol to strip
     downloader prefix from logged url info before checking for the
     specified prefix.
+  * importfeed: Avoid downloading a redundant item from a feed whose
+    guid has been downloaded before, even when the url has changed.
+  * importfeed: Always store itemid in metadata; before this was only
+    done when annex.genmetadata was set.
 
  -- Joey Hess <id@joeyh.name>  Fri, 27 Mar 2015 16:04:43 -0400

View file

@@ -21,7 +21,7 @@ importing e.g., youtube playlists.
 * `--force`
 
-  Force downoading urls it's seen before.
+  Force downloading items it's seen before.
 
 * `--template`

View file

@@ -792,7 +792,8 @@ Here are all the supported configuration settings.
   In particular, it stores year and month metadata, from the file's
   modification date.
 
-  When importfeed is used, it stores additional metadata from the feed.
+  When importfeed is used, it stores additional metadata from the feed,
+  such as the author, title, etc.
 
 * `annex.queuesize`

View file

@@ -11,6 +11,8 @@ known items, it could instead build a `Map (Either URlString GUID) Key`.
 This would at least prevent the duplication, when the feed has guids.
 
+> [[done]] --[[Joey]]
+
 It would be even nicer if the old file could be updated with the new
 content. But, since files can be moved around, deleted, tagged, etc,
 that only seems practical at all if the file is still in the directory