importfeed: Added --scrape option

Which uses yt-dlp to screen scrape the equivalent of an RSS feed.
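For example, `git annex importfeed --scrape url` asks yt-dlp what it can
download from the url and imports that, rather than downloading and parsing
the url as a feed.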

Note that youtubedlscraped is a speed optimisation. Since yt-dlp found
the urls, we know it can download them. That avoids calling
youtubeDlSupported on each url, which makes --fast a lot faster.

Almost all the same metadata fields and file formatting fields are
populated, when yt-dlp is able to get the data. Note that yt-dlp has some
additional useful metadata that could be exposed. But, much of it is
specific to particular websites, and it would be hard to document on the
git-annex importfeed man page.

Sponsored-by: unqueued on Patreon
Joey Hess 2024-01-30 15:37:29 -04:00
parent d7949f8202
commit 90db97d9a2
4 changed files with 165 additions and 18 deletions

@@ -1,10 +1,12 @@
 {- yt-dlp (and deprecated youtube-dl) integration for git-annex
  -
- - Copyright 2017-2023 Joey Hess <id@joeyh.name>
+ - Copyright 2017-2024 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}

+{-# LANGUAGE DeriveGeneric #-}
+
 module Annex.YoutubeDl (
 	youtubeDl,
 	youtubeDlTo,
@@ -13,6 +15,8 @@ module Annex.YoutubeDl (
 	youtubeDlFileName,
 	youtubeDlFileNameHtmlOnly,
 	youtubeDlCommand,
+	youtubePlaylist,
+	YoutubePlaylistItem(..),
 	) where

 import Annex.Common
@@ -23,12 +27,18 @@ import Utility.DiskFree
 import Utility.HtmlDetect
 import Utility.Process.Transcript
 import Utility.Metered
+import Utility.Tmp
 import Messages.Progress
 import Logs.Transfer
 import Network.URI
 import Control.Concurrent.Async
 import Text.Read
+import Data.Either
+import qualified Data.Aeson as Aeson
+import GHC.Generics
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Char8 as B8

 -- youtube-dl can follow redirects to anywhere, including potentially
 -- localhost or a private address. So, it's only allowed to download
@@ -324,3 +334,73 @@ parseYtdlpProgress = go [] . reverse . progresschunks
  - was buggy and is no longer done. -}
 parseYoutubeDlProgress :: ProgressParser
 parseYoutubeDlProgress _ = (Nothing, Nothing, "")
+
+{- List the items that yt-dlp can download from an url.
+ -
+ - Note that this does not check youtubeDlAllowed because it does not
+ - download content.
+ -}
+youtubePlaylist :: URLString -> Annex (Either String [YoutubePlaylistItem])
+youtubePlaylist url = do
+	cmd <- youtubeDlCommand
+	if cmd == "yt-dlp"
+		then liftIO $ youtubePlaylist' url cmd
+		else return $ Left $ "Scraping needs yt-dlp, but git-annex has been configured to use " ++ cmd
+
+youtubePlaylist' :: URLString -> String -> IO (Either String [YoutubePlaylistItem])
+youtubePlaylist' url cmd = withTmpFile "yt-dlp" $ \tmpfile h -> do
+	hClose h
+	(outerr, ok) <- processTranscript cmd
+		[ "--simulate"
+		, "--flat-playlist"
+		-- Skip live videos in progress
+		, "--match-filter", "!is_live"
+		, "--print-to-file"
+		-- Write json with selected fields.
+		, "%(.{" ++ intercalate "," youtubePlaylistItemFields ++ "})j"
+		, tmpfile
+		, url
+		]
+		Nothing
+	if ok
+		then flip catchIO (pure . Left . show) $ do
+			v <- map Aeson.eitherDecodeStrict . B8.lines
+				<$> B.readFile tmpfile
+			return $ case partitionEithers v of
+				((parserr:_), _) ->
+					Left $ "yt-dlp json parse error: " ++ parserr
+				([], r) -> Right r
+		else return $ Left $ if null outerr
+			then "yt-dlp failed"
+			else "yt-dlp failed: " ++ outerr
+
+-- There are other fields that yt-dlp can extract, but these are similar to
+-- the information from an RSS feed.
+youtubePlaylistItemFields :: [String]
+youtubePlaylistItemFields =
+	[ "playlist_title"
+	, "playlist_uploader"
+	, "title"
+	, "description"
+	, "license"
+	, "url"
+	, "timestamp"
+	]
+
+-- Parse JSON generated by yt-dlp for playlist. Note that any field
+-- may be omitted when that information is not supported for a given website.
+data YoutubePlaylistItem = YoutubePlaylistItem
+	{ youtube_playlist_title :: Maybe String
+	, youtube_playlist_uploader :: Maybe String
+	, youtube_title :: Maybe String
+	, youtube_description :: Maybe String
+	, youtube_license :: Maybe String
+	, youtube_url :: Maybe String
+	, youtube_timestamp :: Maybe Integer -- ^ unix timestamp
+	} deriving (Generic, Show)
+
+instance Aeson.FromJSON YoutubePlaylistItem
+  where
+	parseJSON = Aeson.genericParseJSON Aeson.defaultOptions
+		{ Aeson.fieldLabelModifier = drop (length "youtube_") }
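
The JSON written by --print-to-file is one object per item, containing only
the selected fields, and the generically derived FromJSON instance above maps
those keys onto the youtube_-prefixed record fields by dropping the prefix.
Here is a minimal standalone sketch of that decoding step, not part of the
commit; the trimmed-down Item type and the sample JSON line are made up for
illustration:

{-# LANGUAGE DeriveGeneric, OverloadedStrings #-}

import qualified Data.Aeson as Aeson
import GHC.Generics

-- Stand-in for YoutubePlaylistItem with just two fields; the same
-- fieldLabelModifier drops the "youtube_" prefix so the record fields
-- line up with the keys that yt-dlp emits.
data Item = Item
	{ youtube_title :: Maybe String
	, youtube_timestamp :: Maybe Integer
	} deriving (Generic, Show)

instance Aeson.FromJSON Item where
	parseJSON = Aeson.genericParseJSON Aeson.defaultOptions
		{ Aeson.fieldLabelModifier = drop (length "youtube_") }

main :: IO ()
main = print (Aeson.eitherDecode "{\"title\":\"an episode\",\"timestamp\":1706634000}" :: Either String Item)
-- Right (Item {youtube_title = Just "an episode", youtube_timestamp = Just 1706634000})

Fields that a particular website does not support are simply absent from the
JSON and decode to Nothing, which is why every field of YoutubePlaylistItem
is a Maybe.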

@@ -1,3 +1,10 @@
+git-annex (10.20240130) UNRELEASED; urgency=medium
+
+  * importfeed: Added --scrape option, which uses yt-dlp to screen scrape
+    the equivalent of an RSS feed.
+
+ -- Joey Hess <id@joeyh.name>  Mon, 29 Jan 2024 15:59:33 -0400
+
 git-annex (10.20240129) upstream; urgency=medium

   * info: Added "annex sizes of repositories" table to the overall display.

@@ -1,6 +1,6 @@
 {- git-annex command
  -
- - Copyright 2013-2023 Joey Hess <id@joeyh.name>
+ - Copyright 2013-2024 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
@@ -17,6 +17,7 @@ import Text.Feed.Types
 import qualified Data.Set as S
 import qualified Data.Map as M
 import Data.Time.Clock
+import Data.Time.Clock.POSIX
 import Data.Time.Format
 import Data.Time.Calendar
 import Data.Time.LocalTime
@@ -61,6 +62,7 @@ cmd = notBareRepo $ withAnnexOptions os $
 data ImportFeedOptions = ImportFeedOptions
 	{ feedUrls :: CmdParams
 	, templateOption :: Maybe String
+	, scrapeOption :: Bool
 	, downloadOptions :: DownloadOptions
 	}

@@ -71,6 +73,10 @@ optParser desc = ImportFeedOptions
 		( long "template" <> metavar paramFormat
 		<> help "template for filenames"
 		))
+	<*> switch
+		( long "scrape"
+		<> help "scrape website for content to import"
+		)
 	<*> parseDownloadOptions False

 seek :: ImportFeedOptions -> CommandSeek
@@ -84,7 +90,7 @@ seek o = startConcurrency commandStages $ do
 		liftIO $ atomically $ do
 			m <- takeTMVar dlst
 			putTMVar dlst (M.insert url Nothing m)
-		commandAction $ getFeed url dlst
+		commandAction $ getFeed o url dlst
 		startpendingdownloads addunlockedmatcher cache dlst checkst False

 	startpendingdownloads addunlockedmatcher cache dlst checkst True
@@ -135,18 +141,23 @@ seek o = startConcurrency commandStages $ do
 		clearFeedProblem url

 getFeed
-	:: URLString
+	:: ImportFeedOptions
+	-> URLString
 	-> TMVar (M.Map URLString (Maybe (Maybe [ToDownload])))
 	-> CommandStart
-getFeed url st =
+getFeed o url st =
 	starting "importfeed" (ActionItemOther (Just (UnquotedString url))) (SeekInput [url]) $
-		get `onException` recordfail
+		go `onException` recordfail
   where
 	record v = liftIO $ atomically $ do
 		m <- takeTMVar st
 		putTMVar st (M.insert url v m)
 	recordfail = record (Just Nothing)
+
+	go
+		| scrapeOption o = scrape
+		| otherwise = get

 	get = withTmpFile "feed" $ \tmpf h -> do
 		liftIO $ hClose h
 		ifM (downloadFeed url tmpf)
@@ -182,6 +193,14 @@ getFeed url st =
 				next $ feedProblem url
 					(msg ++ " (use --debug --debugfilter=ImportFeed to see the feed content that was downloaded)")

+	scrape = youtubePlaylist url >>= \case
+		Left err -> do
+			recordfail
+			next $ feedProblem url err
+		Right playlist -> do
+			record (Just (Just (playlistDownloads url playlist)))
+			next $ return True
+
 parseFeedFromFile' :: FilePath -> IO (Maybe Feed)
 #if MIN_VERSION_feed(1,1,0)
 parseFeedFromFile' = parseFeedFromFile
@@ -197,6 +216,9 @@ data ToDownload = ToDownload
 	, itempubdate :: Maybe (Either String UTCTime)
 	-- Fields that are used as metadata and to generate the filename.
 	, itemfields :: [(String, String)]
+	-- True when youtube-dl found this by scraping, so certainly
+	-- supports downloading it.
+	, youtubedlscraped :: Bool
 	}

 data DownloadLocation = Enclosure URLString | MediaLink URLString
@@ -246,6 +268,7 @@ findDownloads u f = catMaybes $ map mk (feedItems f)
 			_ -> Left . decodeBS . fromFeedText
 				<$> getItemPublishDateString i
 		, itemfields = extractFeedItemFields f i u
+		, youtubedlscraped = False
 		}

 {- Feeds change, so a feed download cannot be resumed. -}
@@ -326,7 +349,7 @@ startDownload addunlockedmatcher opts cache cv todownload = case location todown
 	addmediafast linkurl mediaurl mediakey =
 		ifM (pure (not (rawOption (downloadOptions opts)))
-			<&&> youtubeDlSupported linkurl)
+			<&&> (pure (youtubedlscraped todownload) <||> youtubeDlSupported linkurl))
 			( startUrlDownload cv todownload linkurl $ do
 				runDownload todownload linkurl ".m" cache cv $ \f ->
 					checkCanAdd (downloadOptions opts) f $ \canadd -> do
@@ -515,6 +538,15 @@ minimalMetaData i = case itemid i of
 	Just iid -> MetaData $ M.singleton itemIdField
 		(S.singleton $ toMetaValue iid)

+noneValue :: String
+noneValue = "none"
+
+extractField :: String -> [Maybe String] -> (String, String)
+extractField k [] = (k, noneValue)
+extractField k (Just v:_)
+	| not (null v) = (k, v)
+extractField k (_:rest) = extractField k rest
+
 extractFeedItemFields :: Feed -> Item -> URLString -> [(String, String)]
 extractFeedItemFields f i u = map (uncurry extractField)
 	[ ("feedurl", [Just u])
@@ -535,14 +567,36 @@ extractFeedItemFields f i u = map (uncurry extractField)
 	feedauthor = decodeBS . fromFeedText <$> getFeedAuthor f
 	itemauthor = decodeBS . fromFeedText <$> getItemAuthor i

-extractField :: String -> [Maybe String] -> (String, String)
-extractField k [] = (k, noneValue)
-extractField k (Just v:_)
-	| not (null v) = (k, v)
-extractField k (_:rest) = extractField k rest
+playlistFields :: URLString -> YoutubePlaylistItem -> [(String, String)]
+playlistFields u i = map (uncurry extractField)
+	[ ("feedurl", [Just u])
+	, ("feedtitle", [youtube_playlist_title i])
+	, ("itemtitle", [youtube_title i])
+	, ("feedauthor", [youtube_playlist_uploader i])
+	, ("itemauthor", [youtube_playlist_uploader i])
+	-- itemsummary omitted, no equivalent in yt-dlp data
+	, ("itemdescription", [youtube_description i])
+	, ("itemrights", [youtube_license i])
+	, ("itemid", [youtube_url i])
+	, ("title", [youtube_title i, youtube_playlist_title i])
+	, ("author", [youtube_playlist_uploader i])
+	]

-noneValue :: String
-noneValue = "none"
+playlistDownloads :: URLString -> [YoutubePlaylistItem] -> [ToDownload]
+playlistDownloads url = mapMaybe go
+  where
+	go i = do
+		iurl <- youtube_url i
+		return $ ToDownload
+			{ feedurl = url
+			, location = MediaLink iurl
+			, itemid = Just (encodeBS iurl)
+			, itempubdate =
+				Right . posixSecondsToUTCTime . fromIntegral
+					<$> youtube_timestamp i
+			, itemfields = playlistFields url i
+			, youtubedlscraped = True
+			}

 {- Called when there is a problem with a feed.
  -
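
An aside on the metadata fallback that both extractFeedItemFields and the new
playlistFields go through: extractField takes the first candidate value that
is present and non-empty, and otherwise records the "none" placeholder. A
standalone restatement for illustration; the main driver and the sample
inputs are made up, and noneValue is inlined:

-- Same fallback logic as extractField above, restated so it runs on its own.
extractField :: String -> [Maybe String] -> (String, String)
extractField k [] = (k, "none")
extractField k (Just v:_)
	| not (null v) = (k, v)
extractField k (_:rest) = extractField k rest

main :: IO ()
main = mapM_ print
	[ extractField "title" [Nothing, Just "an episode", Just "a playlist"]
	, extractField "itemrights" [Just ""]
	]
-- prints ("title","an episode") and ("itemrights","none")

So a scraped item with, say, no license information still gets an itemrights
field, set to "none", rather than being skipped.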

@@ -8,10 +8,10 @@ git annex importfeed `[url ...]`

 # DESCRIPTION

-Imports the contents of podcasts and other feeds. Only downloads files whose
-content has not already been added to the repository before, so you can
-delete, rename, etc the resulting files and repeated runs won't duplicate
-them.
+Imports the contents of podcasts and other rss and atom feeds. Only
+downloads files whose content has not already been added to the repository
+before, so you can delete, rename, etc the resulting files and repeated
+runs won't duplicate them.

 When `yt-dlp` is installed, it can be used to download links in the feed.
 This allows importing e.g., YouTube playlists.
@@ -65,6 +65,12 @@ resulting in the new url being downloaded to such a filename.
   cannot be done, the import will fail, and the next import of the feed
   will retry.

+* `--scrape`
+
+  Rather than downloading the url and parsing it as an rss/atom feed to
+  find files to import, uses yt-dlp to screen scrape the equivalent of a
+  feed, and imports what it finds.
+
 * `--template`

   Controls where the files are stored.