2013-07-28 15:27:36 -04:00
{- git-annex command
 -
 - Copyright 2013 Joey Hess <joey@kitenet.net>
 -
 - Licensed under the GNU GPL version 3 or higher.
 -}
module Command.ImportFeed where
import Text.Feed.Import
import Text.Feed.Query
import Text.Feed.Types
import qualified Data.Set as S
import qualified Data.Map as M
2013-08-03 01:40:21 -04:00
import Data.Time.Clock
2013-07-28 15:27:36 -04:00
import Common.Annex
2013-07-31 12:19:00 -04:00
import qualified Annex
2013-07-28 15:27:36 -04:00
import Command
2013-09-28 14:35:21 -04:00
import qualified Annex.Url as Url
2013-07-28 15:27:36 -04:00
import Logs.Web
import qualified Option
import qualified Utility.Format
import Utility.Tmp
import Command.AddUrl (addUrlFile, relaxedOption)
2013-08-03 01:40:21 -04:00
import Annex.Perms
import Backend.URL (fromUrl)
2013-07-28 15:27:36 -04:00
{- Definition of the "importfeed" command. It is wrapped in notBareRepo,
 - accepts the template and relaxed options, and hands its parameters
 - to seek. -}
def :: [Command]
def = [notBareRepo (withOptions opts importfeed)]
  where
    opts = [templateOption, relaxedOption]
    importfeed = command "importfeed" (paramRepeating paramUrl) seek
        SectionCommon "import files from podcast feeds"
{- The "template" option. seek reads its value and getCache turns it
 - into the format that is used to generate filenames. -}
templateOption :: Option
templateOption = Option.field [] "template" paramFormat desc
  where
    desc = "template for filenames"
{- Reads the template option and the relaxed flag, builds the Cache with
 - getCache, and then runs start on the command's string parameters. -}
seek :: [CommandSeek]
seek = [withField templateOption return withtemplate]
  where
    withtemplate tmpl = withFlag relaxedOption (withrelaxed tmpl)
    withrelaxed tmpl relaxed = withValue (getCache tmpl) (withcache relaxed)
    withcache relaxed cache = withStrings (start relaxed cache)
2013-07-28 15:27:36 -04:00
2013-07-28 18:16:24 -04:00
{- Announces the feed url that is being processed, then continues
 - with perform. -}
start :: Bool -> Cache -> URLString -> CommandStart
start relaxed cache url =
    showStart "importfeed" url >> next (perform relaxed cache url)
{- Finds the enclosures of the feed and runs downloadEnclosure on each
 - of them. If any of those returns False, a "problem downloading item"
 - feed problem is reported.
 -
 - When findEnclosures yields Nothing, or an empty list, the feed is
 - reported as having bad content.
 -
 - NOTE(review): cleanup is passed True even when a download failed, so
 - the feed's problem record is cleared in that case too. Confirm that
 - this is intended. -}
perform :: Bool -> Cache -> URLString -> CommandPerform
perform relaxed cache url = go =<< findEnclosures url
  where
    go (Just enclosures@(_:_)) = do
        results <- mapM (downloadEnclosure relaxed cache) enclosures
        unless (and results) $
            feedProblem url "problem downloading item"
        next $ cleanup url True
    go _ = do
        feedProblem url "bad feed content"
        next $ return True
2013-08-03 01:40:21 -04:00
{- Clears the feed's problem record when ok is True. The value of ok is
 - passed through as the result either way. -}
cleanup :: URLString -> Bool -> CommandCleanup
cleanup url ok
    | ok = clearFeedProblem url >> return True
    | otherwise = return False
2013-07-28 18:16:24 -04:00
2013-07-28 19:08:50 -04:00
{- An enclosure found in a feed, along with the feed and item it
 - came from. -}
data ToDownload = ToDownload
    { feed :: Feed
    -- ^ the parsed feed
    , feedurl :: URLString
    -- ^ the url the feed was downloaded from
    , item :: Item
    -- ^ the feed item that has the enclosure
    , location :: URLString
    -- ^ the url of the item's enclosure
    }
2013-08-03 01:40:21 -04:00
{- Makes a ToDownload for a feed item. Yields Nothing when
 - getItemEnclosure finds no enclosure for the item. Only the first
 - component of the enclosure (its url) is used. -}
mkToDownload :: Feed -> URLString -> Item -> Maybe ToDownload
mkToDownload f u i = do
    (enclosureurl, _, _) <- getItemEnclosure i
    return $ ToDownload f u i enclosureurl
2013-07-28 19:08:50 -04:00
2013-07-28 18:16:24 -04:00
{- Values that getCache computes and that are then shared by all the
 - downloads. -}
data Cache = Cache
    { knownurls :: S.Set URLString
    -- ^ urls from knownUrls; left empty when forced
    , template :: Utility.Format.Format
    -- ^ format used by feedFile to generate filenames
    }
{- Builds the Cache. The filename format is generated from the given
 - template, or from defaultTemplate when there is none.
 -
 - When forced, the set of known urls is left empty, and so the known
 - urls are not looked up at all. -}
getCache :: Maybe String -> Annex Cache
getCache opttemplate = ifM (Annex.getState Annex.force)
    ( ret S.empty
    , do
        showSideAction "checking known urls"
        ret =<< S.fromList <$> knownUrls
    )
  where
    tmpl = Utility.Format.gen $ fromMaybe defaultTemplate opttemplate
    ret s = return $ Cache s tmpl
2013-07-28 15:27:36 -04:00
{- Downloads the feed and collects a ToDownload for each of its items
 - that has an enclosure. Items without an enclosure are skipped.
 -
 - Yields Nothing when downloadFeed yields Nothing. -}
findEnclosures :: URLString -> Annex (Maybe [ToDownload])
findEnclosures url = extract <$> downloadFeed url
  where
    extract Nothing = Nothing
    extract (Just f) = Just $ mapMaybe (mkToDownload f url) (feedItems f)
2013-07-28 15:27:36 -04:00
{- Feeds change, so a feed download cannot be resumed.
 -
 - The feed is downloaded to a temporary file, and the content of that
 - file is passed to parseFeedString. Yields Nothing when the download
 - fails; otherwise the result is whatever parseFeedString returns. -}
downloadFeed :: URLString -> Annex (Maybe Feed)
downloadFeed url = do
    ua <- Url.getUserAgent
    liftIO $ withTmpFile "feed" $ \f h -> do
        fileEncoding h
        ifM (Url.download url [] [] f ua)
            ( liftIO $ parseFeedString <$> hGetContentsStrict h
            , return Nothing
            )
{- Avoids downloading any urls that are already known to be associated
 - with a file in the annex, unless forced.
 -
 - The result is True when the url was skipped, when no download was
 - needed, or when addUrlFile succeeded. When addUrlFile fails, the
 - result is whatever checkFeedBroken returns for the feed's url. -}
downloadEnclosure :: Bool -> Cache -> ToDownload -> Annex Bool
downloadEnclosure relaxed cache enclosure
    | S.member url (knownurls cache) = ifM forced (go, return True)
    | otherwise = go
  where
    forced = Annex.getState Annex.force
    url = location enclosure
    go = do
        dest <- makeunique (1 :: Integer) $ feedFile (template cache) enclosure
        case dest of
            -- Nothing to do; the file already has this url.
            Nothing -> return True
            Just f -> do
                showStart "addurl" f
                ok <- addUrlFile relaxed url f
                if ok
                    then return True
                    else checkFeedBroken (feedurl enclosure)
    {- Find a unique filename to save the url to.
     - If the file exists, prefixes it with a number.
     - When forced, the file may already exist and have the same
     - url, in which case Nothing is returned as it does not need
     - to be re-downloaded. -}
    makeunique n file = ifM alreadyexists
        ( ifM forced
            ( ifAnnexed f checksameurl tryanother
            , tryanother
            )
        , return $ Just f
        )
      where
        -- The first candidate is the plain filename; later ones get
        -- the counter prepended to the file's base name.
        f = if n < 2
            then file
            else
                let (d, base) = splitFileName file
                in d </> show n ++ "_" ++ base
        tryanother = makeunique (n + 1) file
        -- getSymbolicLinkStatus is used, so a symlink counts as
        -- existing whether or not its target exists.
        alreadyexists = liftIO $ isJust <$> catchMaybeIO (getSymbolicLinkStatus f)
        checksameurl (k, _) = ifM (elem url <$> getUrls k)
            ( return Nothing
            , tryanother
            )
2013-07-28 15:27:36 -04:00
2013-07-28 18:16:24 -04:00
{- The filename template that getCache falls back to when no template
 - was specified. The variables it uses are filled in by feedFile. -}
defaultTemplate :: String
defaultTemplate = "${feedtitle}/${itemtitle}${extension}"
2013-07-28 15:27:36 -04:00
2013-08-01 11:57:05 -04:00
{- Generates a filename to use for a feed item by filling out the template.
 - The filename may not be unique.
 -
 - Every value is passed through sanitizeFilePath. A value that is
 - missing, or that is empty once sanitized, is replaced with "none".
 - The extension is taken from the enclosure's url, and is not replaced
 - with "none" when it is empty. -}
feedFile :: Utility.Format.Format -> ToDownload -> FilePath
feedFile tmpl i = Utility.Format.format tmpl $ M.fromList
    [ field "feedtitle" $ getFeedTitle $ feed i
    , fieldMaybe "itemtitle" $ getItemTitle $ item i
    , fieldMaybe "feedauthor" $ getFeedAuthor $ feed i
    , fieldMaybe "itemauthor" $ getItemAuthor $ item i
    , fieldMaybe "itemsummary" $ getItemSummary $ item i
    , fieldMaybe "itemdescription" $ getItemDescription $ item i
    , fieldMaybe "itemrights" $ getItemRights $ item i
    , fieldMaybe "itemid" $ snd <$> getItemId (item i)
    , ("extension", sanitizeFilePath $ takeExtension $ location i)
    ]
  where
    field k v =
        let s = sanitizeFilePath v in
        if null s then (k, "none") else (k, s)
    fieldMaybe k Nothing = (k, "none")
    fieldMaybe k (Just v) = field k v
2013-08-03 01:40:21 -04:00
{- Called when there is a problem with a feed.
 - Throws an error if the feed is broken, otherwise shows a warning.
 -
 - Whether the feed is broken is decided by checkFeedBroken. -}
feedProblem :: URLString -> String -> Annex ()
feedProblem url message = ifM (checkFeedBroken url)
    ( error $ message ++ " (having repeated problems with this feed!)"
    , warning $ "warning: " ++ message
    )
{- A feed is only considered to be broken once it has been having
 - problems for more than 23 hours. The check is made against the
 - feed's state file, as located by feedState. -}
checkFeedBroken :: URLString -> Annex Bool
checkFeedBroken url = feedState url >>= checkFeedBroken' url
{- Checks the feed's state file, which holds the time at which a problem
 - with the feed was recorded.
 -
 - When the file is missing, cannot be read, or does not parse, the
 - current time is written to it, and the feed is not considered broken.
 -
 - Otherwise, the feed is broken when the recorded time is more than 23
 - hours ago. -}
checkFeedBroken' :: URLString -> FilePath -> Annex Bool
checkFeedBroken' url f = do
    prev <- (readish =<<) <$> liftIO (catchMaybeIO readstrict)
    now <- liftIO getCurrentTime
    case prev of
        Nothing -> do
            createAnnexDirectory (parentDir f)
            liftIO $ writeFile f $ show now
            return False
        Just prevtime -> do
            let broken = diffUTCTime now prevtime > 60 * 60 * 23
            when broken $
                -- Avoid repeatedly complaining about
                -- broken feed.
                clearFeedProblem url
            return broken
  where
    -- The whole file is forced before this returns. A lazy read can
    -- leave the file's handle open when its content does not parse,
    -- which can make the writeFile above fail because the file is
    -- still locked. Forcing it here also means that errors that occur
    -- while reading are caught by catchMaybeIO.
    readstrict = do
        s <- readFile f
        length s `seq` return s
{- Removes the feed's state file. Failure to remove it (for example
 - because it does not exist) is ignored. -}
clearFeedProblem :: URLString -> Annex ()
clearFeedProblem url = do
    f <- feedState url
    void $ liftIO $ tryIO $ removeFile f
{- The file that holds the state of a feed. Its location is derived by
 - gitAnnexFeedState from the value that fromUrl produces for the
 - feed's url (presumably a key; confirm against Backend.URL). -}
feedState :: URLString -> Annex FilePath
feedState url = do
    k <- fromUrl url Nothing
    fromRepo (gitAnnexFeedState k)