importfeed: Avoid downloading a redundant item whose guid has already been downloaded from a feed, even when its url has changed.

To support this, always store itemid in metadata; before this was only done
when annex.genmetadata was set.
Joey Hess 2015-03-31 13:29:51 -04:00
parent 86b66758c2
commit 9e25cbde20
6 changed files with 54 additions and 14 deletions
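
In outline, the commit gives importfeed two sources of known items instead of one: the urls already logged for annexed keys, and now also the feed guids recorded as itemid metadata. A minimal sketch of the resulting skip test, with a hypothetical name (seenBefore) and simplified types standing in for the real ones in the ImportFeed code below:

    import qualified Data.Set as S

    type URLString = String
    type ItemId = String

    -- Sketch only: an item is redundant if its guid (when the feed
    -- provides one) or its url has been seen before. The real check
    -- also honors --force and only trusts permalink guids.
    seenBefore :: S.Set URLString -> S.Set ItemId -> URLString -> Maybe ItemId -> Bool
    seenBefore knownurls knownitems url mguid =
        url `S.member` knownurls || maybe False (`S.member` knownitems) mguid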

View file

@@ -73,7 +73,7 @@ perform opts cache url = do
 	v <- findDownloads url
 	case v of
 		[] -> do
-			feedProblem url "bad feed content"
+			feedProblem url "bad feed content; no enclosures to download"
 			next $ return True
 		l -> do
 			ok <- and <$> mapM (performDownload opts cache) l
@@ -96,21 +96,32 @@ data ToDownload = ToDownload
 data DownloadLocation = Enclosure URLString | QuviLink URLString
 
+type ItemId = String
+
 data Cache = Cache
 	{ knownurls :: S.Set URLString
+	, knownitems :: S.Set ItemId
 	, template :: Utility.Format.Format
 	}
 
 getCache :: Maybe String -> Annex Cache
 getCache opttemplate = ifM (Annex.getState Annex.force)
-	( ret S.empty
+	( ret S.empty S.empty
 	, do
 		showSideAction "checking known urls"
-		ret =<< S.fromList <$> knownUrls
+		(is, us) <- unzip <$> (mapM knownItems =<< knownUrls)
+		ret (S.fromList us) (S.fromList (concat is))
 	)
   where
 	tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
-	ret s = return $ Cache s tmpl
+	ret us is = return $ Cache us is tmpl
+
+knownItems :: (Key, URLString) -> Annex ([ItemId], URLString)
+knownItems (k, u) = do
+	itemids <- S.toList . S.filter (/= noneValue) . S.map fromMetaValue
+		. currentMetaDataValues itemIdField
+		<$> getCurrentMetaData k
+	return (itemids, u)
 
 findDownloads :: URLString -> Annex [ToDownload]
 findDownloads u = go =<< downloadFeed u
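
The plumbing in getCache pairs each known url with the itemids recorded in its key's metadata, then splits the pairs into the two cache sets. (knownItems filters out noneValue because, as extractField below shows, an item that lacked a guid gets "none" recorded for its itemid field.) A pure restatement of that set-building step, with a hypothetical name (buildCacheSets) and a plain list standing in for the Annex actions:

    import qualified Data.Set as S

    -- Pure restatement of the set building in getCache: one
    -- (itemids, url) pair per known url, collapsed into a url set
    -- and an itemid set.
    buildCacheSets :: [([String], String)] -> (S.Set String, S.Set String)
    buildCacheSets pairs = (S.fromList us, S.fromList (concat is))
      where
        (is, us) = unzip pairs

    -- buildCacheSets [(["guid-1"], "http://a"), ([], "http://b")]
    --   == (fromList ["http://a","http://b"], fromList ["guid-1"])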
@@ -191,12 +202,18 @@ performDownload opts cache todownload = case location todownload of
   where
 	forced = Annex.getState Annex.force
 
-	{- Avoids downloading any urls that are already known to be
+	{- Avoids downloading any items that are already known to be
 	 - associated with a file in the annex, unless forced. -}
 	checkknown url a
-		| S.member url (knownurls cache) = ifM forced (a, return True)
+		| knownitemid || S.member url (knownurls cache)
+			= ifM forced (a, return True)
 		| otherwise = a
+
+	knownitemid = case getItemId (item todownload) of
+		-- only when it's a permalink
+		Just (True, itemid) -> S.member itemid (knownitems cache)
+		_ -> False
 
 	rundownload url extension getter = do
 		dest <- makeunique url (1 :: Integer) $
 			feedFile (template cache) todownload extension
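
getItemId comes from the feed library and returns Maybe (Bool, String): the item's id, plus a flag for whether the feed marks that id as a permalink (for RSS, the guid's isPermaLink attribute). The guard above only trusts permalink ids as stable identifiers; everything else falls through to the plain url check. Restated on its own, with a hypothetical name (trustedItemId):

    -- Restatement of knownitemid: only an id the feed flags as a
    -- permalink is used for dedup; any other result means "no usable
    -- guid" and the url check alone decides.
    trustedItemId :: Maybe (Bool, String) -> Maybe String
    trustedItemId (Just (True, itemid)) = Just itemid
    trustedItemId _ = Nothing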
@@ -211,8 +228,10 @@ performDownload opts cache todownload = case location todownload of
 				checkFeedBroken (feedurl todownload)
 			else do
 				forM_ ks $ \key ->
-					whenM (annexGenMetaData <$> Annex.getGitConfig) $
-						addMetaData key $ extractMetaData todownload
+					ifM (annexGenMetaData <$> Annex.getGitConfig)
+						( addMetaData key $ extractMetaData todownload
+						, addMetaData key $ minimalMetaData todownload
+						)
 				showEndOk
 				return True
@@ -275,6 +294,12 @@ extractMetaData i = meta
 	tometa (k, v) = (mkMetaFieldUnchecked k, S.singleton (toMetaValue v))
 	meta = MetaData $ M.fromList $ map tometa $ extractFields i
 
+minimalMetaData :: ToDownload -> MetaData
+minimalMetaData i = case getItemId (item i) of
+	Nothing -> emptyMetaData
+	Just (_, itemid) -> MetaData $ M.singleton itemIdField
+		(S.singleton $ toMetaValue itemid)
+
 {- Extract fields from the feed and item, that are both used as metadata,
  - and to generate the filename. -}
 extractFields :: ToDownload -> [(String, String)]
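
So with annex.genmetadata unset, a downloaded item's key now carries exactly one metadata field, itemid, which is all the dedup in knownItems needs; genmetadata adds the full field set from extractFields on top. A simplified model of the minimal path, using a plain map in place of git-annex's MetaData type and a hypothetical name (minimalMeta):

    import qualified Data.Map as M
    import qualified Data.Set as S

    -- Simplified model of minimalMetaData: no guid, no metadata;
    -- otherwise just the one itemid field.
    minimalMeta :: Maybe String -> M.Map String (S.Set String)
    minimalMeta Nothing = M.empty
    minimalMeta (Just itemid) = M.singleton "itemid" (S.singleton itemid)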
@@ -296,12 +321,18 @@ extractFields i = map (uncurry extractField)
 	feedauthor = getFeedAuthor $ feed i
 	itemauthor = getItemAuthor $ item i
 
+itemIdField :: MetaField
+itemIdField = mkMetaFieldUnchecked "itemid"
+
 extractField :: String -> [Maybe String] -> (String, String)
-extractField k [] = (k, "none")
+extractField k [] = (k, noneValue)
 extractField k (Just v:_)
 	| not (null v) = (k, v)
 extractField k (_:rest) = extractField k rest
 
+noneValue :: String
+noneValue = "none"
+
 {- Called when there is a problem with a feed.
  - Throws an error if the feed is broken, otherwise shows a warning. -}
 feedProblem :: URLString -> String -> Annex ()
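
extractField's fallback is what ties noneValue to the filtering in knownItems: it walks the candidate values in order and records "none" when every one is missing or empty. A quick check of that behavior, assuming the definitions above are in scope:

    -- Worked examples for extractField as defined above:
    main :: IO ()
    main = mapM_ print
        [ extractField "itemid" [Just "guid-1"]          -- ("itemid","guid-1")
        , extractField "itemid" [Just "", Just "guid-1"] -- ("itemid","guid-1")
        , extractField "itemid" [Nothing]                -- ("itemid","none")
        ]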

View file

@@ -21,7 +21,6 @@ module Logs.Web (
 import qualified Data.ByteString.Lazy.Char8 as L
 import qualified Data.Map as M
-import Data.Tuple.Utils
 
 import Common.Annex
 import qualified Annex
@@ -70,7 +69,7 @@ setUrlMissing uuid key url = do
 	logChange key uuid InfoMissing
 
 {- Finds all known urls. -}
-knownUrls :: Annex [URLString]
+knownUrls :: Annex [(Key, URLString)]
 knownUrls = do
 	{- Ensure the git-annex branch's index file is up-to-date and
 	 - any journaled changes are reflected in it, since we're going
@@ -80,10 +79,13 @@ knownUrls = do
 	Annex.Branch.withIndex $ do
 		top <- fromRepo Git.repoPath
 		(l, cleanup) <- inRepo $ Git.LsFiles.stagedDetails [top]
-		r <- mapM (geturls . snd3) $ filter (isUrlLog . fst3) l
+		r <- mapM getkeyurls l
 		void $ liftIO cleanup
 		return $ concat r
   where
+	getkeyurls (f, s, _) = case urlLogFileKey f of
+		Just k -> zip (repeat k) <$> geturls s
+		Nothing -> return []
 	geturls Nothing = return []
 	geturls (Just logsha) = getLog . L.unpack <$> catObject logsha
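
Callers of knownUrls now get the Key alongside each url, which is what lets importfeed's knownItems query that key's metadata; urlLogFileKey recovers the Key from a url log's path in the git-annex branch. The shape of the getkeyurls step, restated purely with stand-in type variables (the real lookups are effectful):

    -- Pure restatement of getkeyurls: a file that is not a url log
    -- contributes nothing; otherwise every url it logs is paired with
    -- the key derived from its path.
    keyUrls :: (path -> Maybe key) -> (path -> [url]) -> path -> [(key, url)]
    keyUrls toKey urlsOf f = case toKey f of
        Just k -> zip (repeat k) (urlsOf f)
        Nothing -> []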

debian/changelog
View file

@@ -8,6 +8,10 @@ git-annex (5.20150328) UNRELEASED; urgency=medium
   * Fix GETURLS in external special remote protocol to strip
     downloader prefix from logged url info before checking for the
     specified prefix.
+  * importfeed: Avoid downloading a redundant item from a feed whose
+    guid has been downloaded before, even when the url has changed.
+  * importfeed: Always store itemid in metadata; before this was only
+    done when annex.genmetadata was set.
 
  -- Joey Hess <id@joeyh.name>  Fri, 27 Mar 2015 16:04:43 -0400

View file

@@ -21,7 +21,7 @@ importing e.g., youtube playlists.
 * `--force`
 
-  Force downloading urls it's seen before.
+  Force downloading items it's seen before.
 
 * `--template`

View file

@@ -792,7 +792,8 @@ Here are all the supported configuration settings.
   In particular, it stores year and month metadata, from the file's
   modification date.
 
-  When importfeed is used, it stores additional metadata from the feed.
+  When importfeed is used, it stores additional metadata from the feed,
+  such as the author, title, etc.
 
 * `annex.queuesize`

View file

@@ -11,6 +11,8 @@ known items, it could instead build a `Map (Either URLString GUID) Key`.
 This would at least prevent the duplication, when the feed has guids.
 
+> [[done]] --[[Joey]]
+
 It would be even nicer if the old file could be updated with the new
 content. But, since files can be moved around, deleted, tagged, etc,
 that only seems practical at all if the file is still in the directory
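
For reference, the index type the todo proposes (which the commit approximates with its two separate sets) might look like the following sketch, with stand-in types and a hypothetical lookupKnown:

    import Control.Applicative ((<|>))
    import qualified Data.Map as M

    type URLString = String
    type GUID = String
    type Key = String  -- stand-in for git-annex's Key type

    -- One map covering both identifiers, as the todo suggests: look up
    -- by guid first when the feed provides one, else fall back to the
    -- url, so an item is only ever recorded once.
    type KnownIndex = M.Map (Either URLString GUID) Key

    lookupKnown :: KnownIndex -> URLString -> Maybe GUID -> Maybe Key
    lookupKnown m url mguid =
        (mguid >>= \g -> M.lookup (Right g) m) <|> M.lookup (Left url) m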