importfeed: Ignores transient problems with feeds. Only exits nonzero when a feed has repeatedly had problems for at least 1 day.

Joey Hess 2013-08-03 01:40:21 -04:00
parent 17f05d6572
commit 24c8a6042b
3 changed files with 79 additions and 20 deletions
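The policy the diff below implements, in outline: the first time a feed has a problem, importfeed records a timestamp in a per-feed state file and the run still succeeds; a later problem only makes it exit nonzero once that timestamp is at least 23 hours old (just under a day, so a daily cron run counts). A minimal sketch of that policy in plain IO, outside the Annex monad; the file name and function names here are illustrative, not git-annex's API:

import Control.Exception (SomeException, try)
import Control.Monad (void)
import Data.Time.Clock (diffUTCTime, getCurrentTime)
import System.Directory (removeFile)
import Text.Read (readMaybe)

-- Illustrative stand-in for git-annex's per-feed state file.
stateFile :: FilePath
stateFile = "feedstate"

-- Record a problem. Returns True (feed is broken) only when a
-- problem was first recorded at least 23 hours ago.
recordProblem :: IO Bool
recordProblem = do
	r <- try (strictReadFile stateFile) :: IO (Either SomeException String)
	now <- getCurrentTime
	case either (const Nothing) readMaybe r of
		-- First problem (or unreadable state): start the clock.
		Nothing -> do
			writeFile stateFile (show now)
			return False
		-- Repeated problem: broken once 23 hours have passed.
		Just prevtime -> return (diffUTCTime now prevtime > 60 * 60 * 23)

-- Read strictly so the file handle is closed before any rewrite.
strictReadFile :: FilePath -> IO String
strictReadFile f = do
	s <- readFile f
	length s `seq` return s

-- On success, forget any recorded problem, resetting the clock.
clearProblem :: IO ()
clearProblem = void (try (removeFile stateFile) :: IO (Either SomeException ()))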

Command/ImportFeed.hs

@@ -13,6 +13,7 @@ import Text.Feed.Types
 import qualified Data.Set as S
 import qualified Data.Map as M
 import Data.Char
+import Data.Time.Clock
 
 import Common.Annex
 import qualified Annex
@@ -23,6 +24,8 @@ import qualified Option
 import qualified Utility.Format
 import Utility.Tmp
 import Command.AddUrl (addUrlFile, relaxedOption)
+import Annex.Perms
+import Backend.URL (fromUrl)
 
 def :: [Command]
 def = [notBareRepo $ withOptions [templateOption, relaxedOption] $
@@ -48,20 +51,30 @@ perform relaxed cache url = do
 	v <- findEnclosures url
 	case v of
 		Just l | not (null l) -> do
-			mapM_ (downloadEnclosure relaxed cache) l
+			ok <- all id
+				<$> mapM (downloadEnclosure relaxed cache) l
+			next $ cleanup url ok
+		_ -> do
+			feedProblem url "bad feed content"
 			next $ return True
-		_ -> stop
+
+cleanup :: URLString -> Bool -> CommandCleanup
+cleanup url ok = do
+	when ok $
+		clearFeedProblem url
+	return ok
 
 data ToDownload = ToDownload
 	{ feed :: Feed
+	, feedurl :: URLString
 	, item :: Item
 	, location :: URLString
 	}
 
-mkToDownload :: Feed -> Item -> Maybe ToDownload
-mkToDownload f i = case getItemEnclosure i of
+mkToDownload :: Feed -> URLString -> Item -> Maybe ToDownload
+mkToDownload f u i = case getItemEnclosure i of
 	Nothing -> Nothing
-	Just (enclosureurl, _, _) -> Just $ ToDownload f i enclosureurl
+	Just (enclosureurl, _, _) -> Just $ ToDownload f u i enclosureurl
 
 data Cache = Cache
 	{ knownurls :: S.Set URLString
@@ -80,13 +93,10 @@ getCache opttemplate = ifM (Annex.getState Annex.force)
 	ret s = return $ Cache s tmpl
 
 findEnclosures :: URLString -> Annex (Maybe [ToDownload])
-findEnclosures url = go =<< downloadFeed url
+findEnclosures url = extract <$> downloadFeed url
 	where
-	go Nothing = do
-		warning $ "failed to parse feed " ++ url
-		return Nothing
-	go (Just f) = return $ Just $
-		mapMaybe (mkToDownload f) (feedItems f)
+	extract Nothing = Nothing
+	extract (Just f) = Just $ mapMaybe (mkToDownload f url) (feedItems f)
 
 {- Feeds change, so a feed download cannot be resumed. -}
 downloadFeed :: URLString -> Annex (Maybe Feed)
@@ -95,16 +105,15 @@ downloadFeed url = do
 	liftIO $ withTmpFile "feed" $ \f h -> do
 		fileEncoding h
 		ifM (Url.download url [] [] f)
-			( parseFeedString <$> hGetContentsStrict h
+			( liftIO $ parseFeedString <$> hGetContentsStrict h
 			, return Nothing
 			)
 
 {- Avoids downloading any urls that are already known to be associated
  - with a file in the annex, unless forced. -}
-downloadEnclosure :: Bool -> Cache -> ToDownload -> Annex ()
+downloadEnclosure :: Bool -> Cache -> ToDownload -> Annex Bool
 downloadEnclosure relaxed cache enclosure
-	| S.member url (knownurls cache) =
-		whenM forced go
+	| S.member url (knownurls cache) = ifM forced (go, return True)
 	| otherwise = go
 	where
 	forced = Annex.getState Annex.force
@@ -112,13 +121,17 @@ downloadEnclosure relaxed cache enclosure
 	go = do
 		dest <- makeunique (1 :: Integer) $ feedFile (template cache) enclosure
 		case dest of
-			Nothing -> noop
+			Nothing -> return True
 			Just f -> do
 				showStart "addurl" f
-				ifM (addUrlFile relaxed url f)
-					( showEndOk
-					, showEndFail
-					)
+				ok <- addUrlFile relaxed url f
+				if ok
+					then do
+						showEndOk
+						return True
+					else do
+						showEndFail
+						checkFeedBroken (feedurl enclosure)
 
 {- Find a unique filename to save the url to.
  - If the file exists, prefixes it with a number.
  - When forced, the file may already exist and have the same
@@ -171,3 +184,38 @@ feedFile tmpl i = Utility.Format.format tmpl $ M.fromList
 	sanitize c
 		| isSpace c || isPunctuation c || c == '/' = '_'
 		| otherwise = c
+
+{- Called when there is a problem with a feed.
+ - Throws an error if the feed is broken, otherwise shows a warning. -}
+feedProblem :: URLString -> String -> Annex ()
+feedProblem url message = ifM (checkFeedBroken url)
+	( error $ message ++ " (having repeated problems with this feed!)"
+	, warning $ "warning: " ++ message
+	)
+
+{- A feed is only broken if problems have occurred repeatedly, for at
+ - least 23 hours. -}
+checkFeedBroken :: URLString -> Annex Bool
+checkFeedBroken url = checkFeedBroken' url =<< feedState url
+
+checkFeedBroken' :: URLString -> FilePath -> Annex Bool
+checkFeedBroken' url f = do
+	prev <- maybe Nothing readish <$> liftIO (catchMaybeIO $ readFile f)
+	now <- liftIO getCurrentTime
+	case prev of
+		Nothing -> do
+			createAnnexDirectory (parentDir f)
+			liftIO $ writeFile f $ show now
+			return False
+		Just prevtime -> do
+			let broken = diffUTCTime now prevtime > 60 * 60 * 23
+			when broken $
+				-- Avoid repeatedly complaining about
+				-- broken feed.
+				clearFeedProblem url
+			return broken
+
+clearFeedProblem :: URLString -> Annex ()
+clearFeedProblem url = void $ liftIO . tryIO . removeFile =<< feedState url
+
+feedState :: URLString -> Annex FilePath
+feedState url = fromRepo . gitAnnexFeedState =<< fromUrl url Nothing
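One detail of the perform change above is easy to miss: mapM runs every download action before the results are folded with all id, so one failed enclosure does not stop the remaining enclosures from being tried; only the overall result reflects the failure. A self-contained illustration (not git-annex code):

import Control.Applicative ((<$>))

main :: IO ()
main = do
	ok <- all id <$> sequence
		[ return True
		, return False                             -- a failed download
		, putStrLn "later downloads still run" >> return True
		]
	print ok  -- False overall, yet all three actions ran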

Locations.hs

@@ -28,6 +28,8 @@ module Locations (
 	gitAnnexFsckState,
 	gitAnnexTransferDir,
 	gitAnnexCredsDir,
+	gitAnnexFeedStateDir,
+	gitAnnexFeedState,
 	gitAnnexMergeDir,
 	gitAnnexJournalDir,
 	gitAnnexJournalLock,
@@ -190,6 +192,13 @@ gitAnnexFsckState r = gitAnnexDir r </> "fsckstate"
 gitAnnexCredsDir :: Git.Repo -> FilePath
 gitAnnexCredsDir r = addTrailingPathSeparator $ gitAnnexDir r </> "creds"
 
+{- .git/annex/feedstate/ is used to record per-key (url) state by importfeed -}
+gitAnnexFeedStateDir :: Git.Repo -> FilePath
+gitAnnexFeedStateDir r = addTrailingPathSeparator $ gitAnnexDir r </> "feedstate"
+
+gitAnnexFeedState :: Key -> Git.Repo -> FilePath
+gitAnnexFeedState k r = gitAnnexFeedStateDir r </> keyFile k
+
 {- .git/annex/merge/ is used for direct mode merges. -}
 gitAnnexMergeDir :: Git.Repo -> FilePath
 gitAnnexMergeDir r = addTrailingPathSeparator $ gitAnnexDir r </> "merge"
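Concretely, the state file for a feed is named after the url key built from the feed's url and lives under .git/annex/feedstate/. A small sketch of how the path composes, with the git-annex plumbing stubbed out; the repository path and the escaped key name shown are illustrative assumptions (keyFile escapes characters such as ':' and '/' before a key name is used on disk):

import System.FilePath ((</>))

-- Hypothetical stand-in for gitAnnexFeedState, taking the annex dir
-- and an already-escaped key name instead of a Repo and a Key.
feedStatePath :: FilePath -> String -> FilePath
feedStatePath annexdir escapedkey = annexdir </> "feedstate" </> escapedkey

-- feedStatePath "/repo/.git/annex" "URL--http&c%%example.com%feed"
--   == "/repo/.git/annex/feedstate/URL--http&c%%example.com%feed"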

debian/changelog

@@ -3,6 +3,8 @@ git-annex (4.20130803) UNRELEASED; urgency=low
   * assistant, watcher: .gitignore files and other git ignores are now
     honored, when git 1.8.4 or newer is installed.
     (Thanks, Adam Spiers, for getting the necessary support into git for this.)
+  * importfeed: Ignores transient problems with feeds. Only exits nonzero
+    when a feed has repeatedly had problems for at least 1 day.
 
  -- Joey Hess <joeyh@debian.org>  Fri, 02 Aug 2013 19:26:20 -0400