diff --git a/Command/ImportFeed.hs b/Command/ImportFeed.hs
index 231c921c36..b60627cfed 100644
--- a/Command/ImportFeed.hs
+++ b/Command/ImportFeed.hs
@@ -73,7 +73,7 @@ perform opts cache url = do
 	v <- findDownloads url
 	case v of
 		[] -> do
-			feedProblem url "bad feed content"
+			feedProblem url "bad feed content; no enclosures to download"
 			next $ return True
 		l -> do
 			ok <- and <$> mapM (performDownload opts cache) l
@@ -96,21 +96,32 @@ data ToDownload = ToDownload
 data DownloadLocation = Enclosure URLString
 	| QuviLink URLString
 
+type ItemId = String
+
 data Cache = Cache
 	{ knownurls :: S.Set URLString
+	, knownitems :: S.Set ItemId
 	, template :: Utility.Format.Format
 	}
 
 getCache :: Maybe String -> Annex Cache
 getCache opttemplate = ifM (Annex.getState Annex.force)
-	( ret S.empty
+	( ret S.empty S.empty
 	, do
 		showSideAction "checking known urls"
-		ret =<< S.fromList <$> knownUrls
+		(is, us) <- unzip <$> (mapM knownItems =<< knownUrls)
+		ret (S.fromList us) (S.fromList (concat is))
 	)
   where
 	tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
-	ret s = return $ Cache s tmpl
+	ret us is = return $ Cache us is tmpl
+
+knownItems :: (Key, URLString) -> Annex ([ItemId], URLString)
+knownItems (k, u) = do
+	itemids <- S.toList . S.filter (/= noneValue) . S.map fromMetaValue
+		. currentMetaDataValues itemIdField
+		<$> getCurrentMetaData k
+	return (itemids, u)
 
 findDownloads :: URLString -> Annex [ToDownload]
 findDownloads u = go =<< downloadFeed u
@@ -191,12 +202,18 @@ performDownload opts cache todownload = case location todownload of
   where
 	forced = Annex.getState Annex.force
 
-	{- Avoids downloading any urls that are already known to be
+	{- Avoids downloading any items that are already known to be
 	 - associated with a file in the annex, unless forced. -}
 	checkknown url a
-		| S.member url (knownurls cache) = ifM forced (a, return True)
+		| knownitemid || S.member url (knownurls cache)
+			= ifM forced (a, return True)
 		| otherwise = a
 
+	knownitemid = case getItemId (item todownload) of
+		-- only when it's a permalink
+		Just (True, itemid) -> S.member itemid (knownitems cache)
+		_ -> False
+
 	rundownload url extension getter = do
 		dest <- makeunique url (1 :: Integer) $
 			feedFile (template cache) todownload extension
@@ -211,8 +228,10 @@ performDownload opts cache todownload = case location todownload of
 					checkFeedBroken (feedurl todownload)
 				else do
 					forM_ ks $ \key ->
-						whenM (annexGenMetaData <$> Annex.getGitConfig) $
-							addMetaData key $ extractMetaData todownload
+						ifM (annexGenMetaData <$> Annex.getGitConfig)
+							( addMetaData key $ extractMetaData todownload
+							, addMetaData key $ minimalMetaData todownload
+							)
 					showEndOk
 					return True
 
@@ -275,6 +294,12 @@ extractMetaData i = meta
 	tometa (k, v) = (mkMetaFieldUnchecked k, S.singleton (toMetaValue v))
 	meta = MetaData $ M.fromList $ map tometa $ extractFields i
 
+minimalMetaData :: ToDownload -> MetaData
+minimalMetaData i = case getItemId (item i) of
+	(Nothing) -> emptyMetaData
+	(Just (_, itemid)) -> MetaData $ M.singleton itemIdField
+		(S.singleton $ toMetaValue itemid)
+
 {- Extract fields from the feed and item, that are both used as metadata,
  - and to generate the filename. -}
 extractFields :: ToDownload -> [(String, String)]
@@ -296,12 +321,18 @@ extractFields i = map (uncurry extractField)
 	feedauthor = getFeedAuthor $ feed i
 	itemauthor = getItemAuthor $ item i
 
+itemIdField :: MetaField
+itemIdField = mkMetaFieldUnchecked "itemid"
+
 extractField :: String -> [Maybe String] -> (String, String)
-extractField k [] = (k, "none")
+extractField k [] = (k, noneValue)
 extractField k (Just v:_)
 	| not (null v) = (k, v)
 extractField k (_:rest) = extractField k rest
 
+noneValue :: String
+noneValue = "none"
+
 {- Called when there is a problem with a feed.
  - Throws an error if the feed is broken, otherwise shows a warning. -}
 feedProblem :: URLString -> String -> Annex ()
diff --git a/Logs/Web.hs b/Logs/Web.hs
index ed2f575bd1..02d60170fa 100644
--- a/Logs/Web.hs
+++ b/Logs/Web.hs
@@ -21,7 +21,6 @@ module Logs.Web (
 
 import qualified Data.ByteString.Lazy.Char8 as L
 import qualified Data.Map as M
-import Data.Tuple.Utils
 
 import Common.Annex
 import qualified Annex
@@ -70,7 +69,7 @@ setUrlMissing uuid key url = do
 	logChange key uuid InfoMissing
 
 {- Finds all known urls. -}
-knownUrls :: Annex [URLString]
+knownUrls :: Annex [(Key, URLString)]
 knownUrls = do
 	{- Ensure the git-annex branch's index file is up-to-date and
 	 - any journaled changes are reflected in it, since we're going
@@ -80,10 +79,13 @@ knownUrls = do
 	Annex.Branch.withIndex $ do
 		top <- fromRepo Git.repoPath
 		(l, cleanup) <- inRepo $ Git.LsFiles.stagedDetails [top]
-		r <- mapM (geturls . snd3) $ filter (isUrlLog . fst3) l
+		r <- mapM getkeyurls l
 		void $ liftIO cleanup
 		return $ concat r
   where
+	getkeyurls (f, s, _) = case urlLogFileKey f of
+		Just k -> zip (repeat k) <$> geturls s
+		Nothing -> return []
 	geturls Nothing = return []
 	geturls (Just logsha) = getLog . L.unpack <$> catObject logsha
 
diff --git a/debian/changelog b/debian/changelog
index e9d2558c12..196743bd76 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -8,6 +8,10 @@ git-annex (5.20150328) UNRELEASED; urgency=medium
   * Fix GETURLS in external special remote protocol to strip downloader
     prefix from logged url info before checking for the specified
     prefix.
+  * importfeed: Avoid downloading a redundant item from a feed whose
+    guid has been downloaded before, even when the url has changed.
+  * importfeed: Always store itemid in metadata; before this was only
+    done when annex.genmetadata was set.
 
  -- Joey Hess <id@joeyh.name>  Fri, 27 Mar 2015 16:04:43 -0400
diff --git a/doc/git-annex-importfeed.mdwn b/doc/git-annex-importfeed.mdwn
index 69c8416670..3bb19d25bb 100644
--- a/doc/git-annex-importfeed.mdwn
+++ b/doc/git-annex-importfeed.mdwn
@@ -21,7 +21,7 @@ importing e.g., youtube playlists.
 
 * `--force`
 
-  Force downoading urls it's seen before.
+  Force downloading items it's seen before.
 
 * `--template`
 
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index 316d335377..d8f3fb6fb1 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -792,7 +792,8 @@ Here are all the supported configuration settings.
 
   In particular, it stores year and month metadata, from the file's
   modification date.
 
-  When importfeed is used, it stores additional metadata from the feed.
+  When importfeed is used, it stores additional metadata from the feed,
+  such as the author, title, etc.
 
 * `annex.queuesize`
diff --git a/doc/todo/podcatching_handling_updated_files.mdwn b/doc/todo/podcatching_handling_updated_files.mdwn
index 998b084a68..2d98002835 100644
--- a/doc/todo/podcatching_handling_updated_files.mdwn
+++ b/doc/todo/podcatching_handling_updated_files.mdwn
@@ -11,6 +11,8 @@ known items, it could instead build a `Map (Either URlString GUID) Key`.
 
 This would at least prevent the duplication, when the feed has guids.
 
+> [[done]] --[[Joey]]
+
 It would be even nicer if the old file could be updated with the new
 content. But, since files can be moved around, deleted, tagged, etc, that
 only seems practical at all if the file is still in the directory
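
The heart of the patch is the new guard in `checkknown`. Here is a minimal
standalone sketch of that check; the `alreadyKnown` wrapper is a hypothetical
name introduced for this note, while `knownurls`, `knownitems`, and the
`Maybe (Bool, ItemId)` shape returned by the feed library's `getItemId` come
from the patch itself. An item is skipped when its url has been downloaded
before, or when it carries a permalink guid that has, even if the enclosure
url changed.

    import qualified Data.Set as S

    type URLString = String
    type ItemId = String

    -- Simplified stand-in for the Cache that getCache builds from the
    -- known urls and the itemid metadata of existing keys.
    data Cache = Cache
        { knownurls :: S.Set URLString
        , knownitems :: S.Set ItemId
        }

    -- Decide whether a feed item can be skipped. The Maybe (Bool, ItemId)
    -- parameter mirrors getItemId: the Bool is True when the guid is a
    -- permalink, and only permalink guids are stable enough to
    -- deduplicate on.
    alreadyKnown :: Cache -> Maybe (Bool, ItemId) -> URLString -> Bool
    alreadyKnown cache guid url =
        knownitemid || S.member url (knownurls cache)
      where
        knownitemid = case guid of
            Just (True, itemid) -> S.member itemid (knownitems cache)
            _ -> False

As in the patch, `--force` bypasses this check entirely (the
`ifM forced (a, return True)` branch), and non-permalink guids are ignored
because they need not be globally unique across feeds.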