importfeed: Avoid downloading a redundant item from a feed whose guid has been downloaded before, even when the url has changed.
To support this, always store itemid in metadata; previously this was only done when annex.genmetadata was set.
commit 9e25cbde20
parent 86b66758c2
6 changed files with 54 additions and 14 deletions
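For orientation before the diff, here is a minimal sketch of the decision this commit adds, pulled out of the Annex monad. All names are illustrative rather than part of the patch; the two sets stand in for the Cache fields below, and the Maybe (Bool, ItemId) argument mirrors the shape of the feed library's getItemId, whose Bool flags a permalink guid.

    -- Sketch of the new dedup check; illustrative names only.
    import qualified Data.Set as S

    type URLString = String
    type ItemId = String

    -- An item is redundant when its permalink guid has been seen
    -- before, even if the enclosure url is new.
    isRedundant
        :: S.Set URLString      -- knownurls
        -> S.Set ItemId         -- knownitems
        -> URLString            -- this item's enclosure url
        -> Maybe (Bool, ItemId) -- getItemId; the Bool marks a permalink
        -> Bool
    isRedundant knownurls knownitems url guid =
        knownitemid || S.member url knownurls
      where
        knownitemid = case guid of
            -- only trust the guid when it is a permalink
            Just (True, itemid) -> S.member itemid knownitems
            _ -> False

    main :: IO ()
    main = print $ isRedundant
        (S.fromList ["http://example.com/old/ep1.mp3"])
        (S.fromList ["guid-123"])
        "http://example.com/new/ep1.mp3"
        (Just (True, "guid-123"))
    -- prints True: same guid, changed url, so the item is skipped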
@@ -73,7 +73,7 @@ perform opts cache url = do
     v <- findDownloads url
     case v of
         [] -> do
-            feedProblem url "bad feed content"
+            feedProblem url "bad feed content; no enclosures to download"
             next $ return True
         l -> do
             ok <- and <$> mapM (performDownload opts cache) l

@@ -96,21 +96,32 @@ data ToDownload = ToDownload
 
 data DownloadLocation = Enclosure URLString | QuviLink URLString
 
+type ItemId = String
+
 data Cache = Cache
     { knownurls :: S.Set URLString
+    , knownitems :: S.Set ItemId
     , template :: Utility.Format.Format
     }
 
 getCache :: Maybe String -> Annex Cache
 getCache opttemplate = ifM (Annex.getState Annex.force)
-    ( ret S.empty
+    ( ret S.empty S.empty
     , do
         showSideAction "checking known urls"
-        ret =<< S.fromList <$> knownUrls
+        (is, us) <- unzip <$> (mapM knownItems =<< knownUrls)
+        ret (S.fromList us) (S.fromList (concat is))
     )
   where
     tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
-    ret s = return $ Cache s tmpl
+    ret us is = return $ Cache us is tmpl
+
+knownItems :: (Key, URLString) -> Annex ([ItemId], URLString)
+knownItems (k, u) = do
+    itemids <- S.toList . S.filter (/= noneValue) . S.map fromMetaValue
+        . currentMetaDataValues itemIdField
+        <$> getCurrentMetaData k
+    return (itemids, u)
 
 findDownloads :: URLString -> Annex [ToDownload]
 findDownloads u = go =<< downloadFeed u

@@ -191,12 +202,18 @@ performDownload opts cache todownload = case location todownload of
   where
     forced = Annex.getState Annex.force
 
-    {- Avoids downloading any urls that are already known to be
+    {- Avoids downloading any items that are already known to be
      - associated with a file in the annex, unless forced. -}
     checkknown url a
-        | S.member url (knownurls cache) = ifM forced (a, return True)
+        | knownitemid || S.member url (knownurls cache)
+            = ifM forced (a, return True)
         | otherwise = a
+
+    knownitemid = case getItemId (item todownload) of
+        -- only when it's a permalink
+        Just (True, itemid) -> S.member itemid (knownitems cache)
+        _ -> False
 
     rundownload url extension getter = do
         dest <- makeunique url (1 :: Integer) $
             feedFile (template cache) todownload extension

@@ -211,8 +228,10 @@
                     checkFeedBroken (feedurl todownload)
                 else do
                     forM_ ks $ \key ->
-                        whenM (annexGenMetaData <$> Annex.getGitConfig) $
-                            addMetaData key $ extractMetaData todownload
+                        ifM (annexGenMetaData <$> Annex.getGitConfig)
+                            ( addMetaData key $ extractMetaData todownload
+                            , addMetaData key $ minimalMetaData todownload
+                            )
                     showEndOk
                     return True
 
@@ -275,6 +294,12 @@ extractMetaData i = meta
     tometa (k, v) = (mkMetaFieldUnchecked k, S.singleton (toMetaValue v))
     meta = MetaData $ M.fromList $ map tometa $ extractFields i
 
+minimalMetaData :: ToDownload -> MetaData
+minimalMetaData i = case getItemId (item i) of
+    (Nothing) -> emptyMetaData
+    (Just (_, itemid)) -> MetaData $ M.singleton itemIdField
+        (S.singleton $ toMetaValue itemid)
+
 {- Extract fields from the feed and item, that are both used as metadata,
  - and to generate the filename. -}
 extractFields :: ToDownload -> [(String, String)]

@@ -296,12 +321,18 @@ extractFields i = map (uncurry extractField)
     feedauthor = getFeedAuthor $ feed i
     itemauthor = getItemAuthor $ item i
 
+itemIdField :: MetaField
+itemIdField = mkMetaFieldUnchecked "itemid"
+
 extractField :: String -> [Maybe String] -> (String, String)
-extractField k [] = (k, "none")
+extractField k [] = (k, noneValue)
 extractField k (Just v:_)
     | not (null v) = (k, v)
 extractField k (_:rest) = extractField k rest
 
+noneValue :: String
+noneValue = "none"
+
 {- Called when there is a problem with a feed.
  - Throws an error if the feed is broken, otherwise shows a warning. -}
 feedProblem :: URLString -> String -> Annex ()

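The dedup depends on the itemid metadata surviving a round trip: minimalMetaData writes the guid, extractField maps missing fields to noneValue, and knownItems filters that placeholder back out when reading. A toy model of that round trip follows, using a plain Map in place of git-annex's MetaData type; storeItemId and recordedItemIds are hypothetical helpers, not patch code.

    -- Toy model of the itemid round trip; a Map stands in for MetaData.
    import qualified Data.Map as M
    import qualified Data.Set as S

    noneValue :: String
    noneValue = "none"

    type Meta = M.Map String (S.Set String)

    -- writing: record an item's guid under the "itemid" field
    storeItemId :: String -> Meta -> Meta
    storeItemId itemid = M.insertWith S.union "itemid" (S.singleton itemid)

    -- reading: every recorded itemid except the "none" placeholder,
    -- mirroring the S.filter (/= noneValue) in knownItems
    recordedItemIds :: Meta -> [String]
    recordedItemIds = S.toList . S.filter (/= noneValue)
        . M.findWithDefault S.empty "itemid"

    main :: IO ()
    main = print $ recordedItemIds $ storeItemId "guid-123" M.empty
    -- prints ["guid-123"]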
@@ -21,7 +21,6 @@ module Logs.Web (
 
 import qualified Data.ByteString.Lazy.Char8 as L
 import qualified Data.Map as M
-import Data.Tuple.Utils
 
 import Common.Annex
 import qualified Annex

@@ -70,7 +69,7 @@ setUrlMissing uuid key url = do
     logChange key uuid InfoMissing
 
 {- Finds all known urls. -}
-knownUrls :: Annex [URLString]
+knownUrls :: Annex [(Key, URLString)]
 knownUrls = do
     {- Ensure the git-annex branch's index file is up-to-date and
      - any journaled changes are reflected in it, since we're going

@@ -80,10 +79,13 @@ knownUrls = do
     Annex.Branch.withIndex $ do
         top <- fromRepo Git.repoPath
         (l, cleanup) <- inRepo $ Git.LsFiles.stagedDetails [top]
-        r <- mapM (geturls . snd3) $ filter (isUrlLog . fst3) l
+        r <- mapM getkeyurls l
         void $ liftIO cleanup
         return $ concat r
   where
+    getkeyurls (f, s, _) = case urlLogFileKey f of
+        Just k -> zip (repeat k) <$> geturls s
+        Nothing -> return []
     geturls Nothing = return []
     geturls (Just logsha) = getLog . L.unpack <$> catObject logsha

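With knownUrls now returning (Key, URLString) pairs, getCache (in the hunk above at @@ -96,21) unzips per-key itemid lists and urls into its two sets. A pure sketch of that dataflow, where itemidsOf stands in for the metadata query that knownItems performs in the Annex monad:

    -- Pure sketch of how getCache builds its two sets from knownUrls'
    -- (Key, URLString) pairs; itemidsOf is a stand-in lookup function.
    import qualified Data.Set as S

    buildCache
        :: (key -> [String])    -- itemids recorded for a key
        -> [(key, String)]      -- knownUrls' (Key, URLString) pairs
        -> (S.Set String, S.Set String) -- (knownurls, knownitems)
    buildCache itemidsOf kus = (S.fromList us, S.fromList (concat is))
      where
        (is, us) = unzip [ (itemidsOf k, u) | (k, u) <- kus ]

    main :: IO ()
    main = print $ buildCache
        (const ["guid-123"])
        [("key1", "http://example.com/ep1.mp3")]
    -- prints (fromList ["http://example.com/ep1.mp3"],
    --         fromList ["guid-123"])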
debian/changelog (4 changes)

@@ -8,6 +8,10 @@ git-annex (5.20150328) UNRELEASED; urgency=medium
   * Fix GETURLS in external special remote protocol to strip
     downloader prefix from logged url info before checking for the
     specified prefix.
+  * importfeed: Avoid downloading a redundant item from a feed whose
+    guid has been downloaded before, even when the url has changed.
+  * importfeed: Always store itemid in metadata; before this was only
+    done when annex.genmetadata was set.
 
  -- Joey Hess <id@joeyh.name>  Fri, 27 Mar 2015 16:04:43 -0400

@@ -21,7 +21,7 @@ importing e.g., youtube playlists.
 
 * `--force`
 
-  Force downoading urls it's seen before.
+  Force downloading items it's seen before.
 
 * `--template`
 

@@ -792,7 +792,8 @@ Here are all the supported configuration settings.
   In particular, it stores year and month metadata, from the file's
   modification date.
 
-  When importfeed is used, it stores additional metadata from the feed.
+  When importfeed is used, it stores additional metadata from the feed,
+  such as the author, title, etc.
 
 * `annex.queuesize`

@@ -11,6 +11,8 @@ known items, it could instead build a `Map (Either URLString GUID) Key`.
 
 This would at least prevent the duplication, when the feed has guids.
 
+> [[done]] --[[Joey]]
+
 It would be even nicer if the old file could be updated with the new
 content. But, since files can be moved around, deleted, tagged, etc,
 that only seems practical at all if the file is still in the directory
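For comparison, the `Map (Either URLString GUID) Key` shape proposed in that todo could look like the sketch below. It is not what the patch implements (the patch keeps two sets in its Cache), but the lookup semantics are the same: a guid match wins, with the url as fallback. Key is a String stand-in and lookupKnown a hypothetical helper.

    -- Sketch of the todo's proposed structure; not the patch's code.
    import Control.Applicative ((<|>))
    import qualified Data.Map as M

    type URLString = String
    type ItemId = String    -- the feed item's guid
    type Key = String       -- stand-in for git-annex's Key type

    type Known = M.Map (Either URLString ItemId) Key

    -- guid match takes precedence; fall back to matching the url
    lookupKnown :: URLString -> Maybe ItemId -> Known -> Maybe Key
    lookupKnown url mguid m =
        (mguid >>= \g -> M.lookup (Right g) m) <|> M.lookup (Left url) m

    main :: IO ()
    main = print $ lookupKnown
        "http://new.example.com/ep1.mp3"
        (Just "guid-123")
        (M.fromList [(Right "guid-123", "SHA256E-s100--abc")])
    -- prints Just "SHA256E-s100--abc": the guid matches
    -- even though the url is new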