importfeed: Added --scrape option

Which uses yt-dlp to screen scrape the equivalent of an RSS feed.

Note that youtubedlscraped is a speed optimisation. Since yt-dlp found
the urls, we know it can download them. That avoids calling
youtubeDlSupported on each url, which makes --fast a lot faster.
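In the code below, that optimization boils down to a short-circuit guard;
a minimal sketch of the idea, using a hypothetical helper (the actual
change inlines the same check with <||>):

    -- If yt-dlp itself produced the url, skip the per-url support probe.
    urlSupported :: ToDownload -> URLString -> Annex Bool
    urlSupported todownload linkurl
        | youtubedlscraped todownload = pure True
        | otherwise = youtubeDlSupported linkurl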

Almost all the same metadata fields and file formatting fields are
populated when yt-dlp is able to get the data. Note that yt-dlp has some
additional useful metadata that could be exposed. But much of it is
specific to particular websites, and it would be hard to document in the
git-annex importfeed man page.

Sponsored-by: unqueued on Patreon
Joey Hess 2024-01-30 15:37:29 -04:00
parent d7949f8202
commit 90db97d9a2
4 changed files with 165 additions and 18 deletions

Annex/YoutubeDl.hs

@@ -1,10 +1,12 @@
{- yt-dlp (and deprecated youtube-dl) integration for git-annex
-
- Copyright 2017-2023 Joey Hess <id@joeyh.name>
- Copyright 2017-2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE DeriveGeneric #-}
module Annex.YoutubeDl (
youtubeDl,
youtubeDlTo,
@@ -13,6 +15,8 @@ module Annex.YoutubeDl (
youtubeDlFileName,
youtubeDlFileNameHtmlOnly,
youtubeDlCommand,
youtubePlaylist,
YoutubePlaylistItem(..),
) where
import Annex.Common
@@ -23,12 +27,18 @@ import Utility.DiskFree
import Utility.HtmlDetect
import Utility.Process.Transcript
import Utility.Metered
import Utility.Tmp
import Messages.Progress
import Logs.Transfer
import Network.URI
import Control.Concurrent.Async
import Text.Read
import Data.Either
import qualified Data.Aeson as Aeson
import GHC.Generics
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
-- youtube-dl can follow redirects to anywhere, including potentially
-- localhost or a private address. So, it's only allowed to download
@@ -324,3 +334,73 @@ parseYtdlpProgress = go [] . reverse . progresschunks
- was buggy and is no longer done. -}
parseYoutubeDlProgress :: ProgressParser
parseYoutubeDlProgress _ = (Nothing, Nothing, "")
{- List the items that yt-dlp can download from an url.
-
- Note that this does not check youtubeDlAllowed because it does not
- download content.
-}
youtubePlaylist :: URLString -> Annex (Either String [YoutubePlaylistItem])
youtubePlaylist url = do
cmd <- youtubeDlCommand
if cmd == "yt-dlp"
then liftIO $ youtubePlaylist' url cmd
else return $ Left $ "Scraping needs yt-dlp, but git-annex has been configured to use " ++ cmd
youtubePlaylist' :: URLString -> String -> IO (Either String [YoutubePlaylistItem])
youtubePlaylist' url cmd = withTmpFile "yt-dlp" $ \tmpfile h -> do
hClose h
(outerr, ok) <- processTranscript cmd
[ "--simulate"
, "--flat-playlist"
-- Skip live videos in progress
, "--match-filter", "!is_live"
, "--print-to-file"
-- Write json with selected fields.
, "%(.{" ++ intercalate "," youtubePlaylistItemFields ++ "})j"
, tmpfile
, url
]
Nothing
if ok
then flip catchIO (pure . Left . show) $ do
v <- map Aeson.eitherDecodeStrict . B8.lines
<$> B.readFile tmpfile
return $ case partitionEithers v of
((parserr:_), _) ->
Left $ "yt-dlp json parse errror: " ++ parserr
([], r) -> Right r
else return $ Left $ if null outerr
then "yt-dlp failed"
else "yt-dlp failed: " ++ outerr
-- There are other fields that yt-dlp can extract, but these are similar to
-- the information from an RSS feed.
youtubePlaylistItemFields :: [String]
youtubePlaylistItemFields =
[ "playlist_title"
, "playlist_uploader"
, "title"
, "description"
, "license"
, "url"
, "timestamp"
]
-- Parse JSON generated by yt-dlp for a playlist. Note that any field
-- may be omitted when that information is not supported for a given website.
data YoutubePlaylistItem = YoutubePlaylistItem
{ youtube_playlist_title :: Maybe String
, youtube_playlist_uploader :: Maybe String
, youtube_title :: Maybe String
, youtube_description :: Maybe String
, youtube_license :: Maybe String
, youtube_url :: Maybe String
, youtube_timestamp :: Maybe Integer -- ^ unix timestamp
} deriving (Generic, Show)
instance Aeson.FromJSON YoutubePlaylistItem
where
parseJSON = Aeson.genericParseJSON Aeson.defaultOptions
{ Aeson.fieldLabelModifier = drop (length "youtube_") }
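-- An illustrative line of the JSON that yt-dlp emits for the fields above
-- (values made up; fields yt-dlp cannot determine come through as null,
-- which the Maybe fields parse to Nothing):
--
--   {"playlist_title":"Example Channel","playlist_uploader":"Example",
--    "title":"Episode 1","description":"...","license":null,
--    "url":"https://www.youtube.com/watch?v=abcdefghijk","timestamp":1706640000}
--
-- The fieldLabelModifier strips the "youtube_" prefix, so the json key
-- "title" populates youtube_title, and so on.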

CHANGELOG

@@ -1,3 +1,10 @@
git-annex (10.20240130) UNRELEASED; urgency=medium
* importfeed: Added --scrape option, which uses yt-dlp to screen scrape
the equivalent of an RSS feed.
-- Joey Hess <id@joeyh.name> Mon, 29 Jan 2024 15:59:33 -0400
git-annex (10.20240129) upstream; urgency=medium
* info: Added "annex sizes of repositories" table to the overall display.

Command/ImportFeed.hs

@@ -1,6 +1,6 @@
{- git-annex command
-
- Copyright 2013-2023 Joey Hess <id@joeyh.name>
- Copyright 2013-2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@@ -17,6 +17,7 @@ import Text.Feed.Types
import qualified Data.Set as S
import qualified Data.Map as M
import Data.Time.Clock
import Data.Time.Clock.POSIX
import Data.Time.Format
import Data.Time.Calendar
import Data.Time.LocalTime
@@ -61,6 +62,7 @@ cmd = notBareRepo $ withAnnexOptions os $
data ImportFeedOptions = ImportFeedOptions
{ feedUrls :: CmdParams
, templateOption :: Maybe String
, scrapeOption :: Bool
, downloadOptions :: DownloadOptions
}
@@ -71,6 +73,10 @@ optParser desc = ImportFeedOptions
( long "template" <> metavar paramFormat
<> help "template for filenames"
))
<*> switch
( long "scrape"
<> help "scrape website for content to import"
)
<*> parseDownloadOptions False
seek :: ImportFeedOptions -> CommandSeek
@@ -84,7 +90,7 @@ seek o = startConcurrency commandStages $ do
liftIO $ atomically $ do
m <- takeTMVar dlst
putTMVar dlst (M.insert url Nothing m)
commandAction $ getFeed url dlst
commandAction $ getFeed o url dlst
startpendingdownloads addunlockedmatcher cache dlst checkst False
startpendingdownloads addunlockedmatcher cache dlst checkst True
@@ -135,18 +141,23 @@ seek o = startConcurrency commandStages $ do
clearFeedProblem url
getFeed
:: URLString
:: ImportFeedOptions
-> URLString
-> TMVar (M.Map URLString (Maybe (Maybe [ToDownload])))
-> CommandStart
getFeed url st =
getFeed o url st =
starting "importfeed" (ActionItemOther (Just (UnquotedString url))) (SeekInput [url]) $
get `onException` recordfail
go `onException` recordfail
where
record v = liftIO $ atomically $ do
m <- takeTMVar st
putTMVar st (M.insert url v m)
recordfail = record (Just Nothing)
go
| scrapeOption o = scrape
| otherwise = get
get = withTmpFile "feed" $ \tmpf h -> do
liftIO $ hClose h
ifM (downloadFeed url tmpf)
@@ -181,6 +192,14 @@ getFeed url st =
recordfail
next $ feedProblem url
(msg ++ " (use --debug --debugfilter=ImportFeed to see the feed content that was downloaded)")
scrape = youtubePlaylist url >>= \case
Left err -> do
recordfail
next $ feedProblem url err
Right playlist -> do
record (Just (Just (playlistDownloads url playlist)))
next $ return True
parseFeedFromFile' :: FilePath -> IO (Maybe Feed)
#if MIN_VERSION_feed(1,1,0)
@@ -197,6 +216,9 @@ data ToDownload = ToDownload
, itempubdate :: Maybe (Either String UTCTime)
-- Fields that are used as metadata and to generate the filename.
, itemfields :: [(String, String)]
-- True when youtube-dl found this by scraping, so it can certainly
-- be downloaded.
, youtubedlscraped :: Bool
}
data DownloadLocation = Enclosure URLString | MediaLink URLString
@@ -246,6 +268,7 @@ findDownloads u f = catMaybes $ map mk (feedItems f)
_ -> Left . decodeBS . fromFeedText
<$> getItemPublishDateString i
, itemfields = extractFeedItemFields f i u
, youtubedlscraped = False
}
{- Feeds change, so a feed download cannot be resumed. -}
@@ -326,7 +349,7 @@ startDownload addunlockedmatcher opts cache cv todownload = case location todown
addmediafast linkurl mediaurl mediakey =
ifM (pure (not (rawOption (downloadOptions opts)))
<&&> youtubeDlSupported linkurl)
<&&> (pure (youtubedlscraped todownload) <||> youtubeDlSupported linkurl))
( startUrlDownload cv todownload linkurl $ do
runDownload todownload linkurl ".m" cache cv $ \f ->
checkCanAdd (downloadOptions opts) f $ \canadd -> do
@@ -515,6 +538,15 @@ minimalMetaData i = case itemid i of
Just iid -> MetaData $ M.singleton itemIdField
(S.singleton $ toMetaValue iid)
noneValue :: String
noneValue = "none"
extractField :: String -> [Maybe String] -> (String, String)
extractField k [] = (k, noneValue)
extractField k (Just v:_)
| not (null v) = (k, v)
extractField k (_:rest) = extractField k rest
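-- For example (values illustrative):
--   extractField "itemtitle" [Nothing, Just "Episode 1"] == ("itemtitle", "Episode 1")
--   extractField "itemtitle" [Just ""] == ("itemtitle", "none")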
extractFeedItemFields :: Feed -> Item -> URLString -> [(String, String)]
extractFeedItemFields f i u = map (uncurry extractField)
[ ("feedurl", [Just u])
@@ -535,14 +567,36 @@ extractFeedItemFields f i u = map (uncurry extractField)
feedauthor = decodeBS . fromFeedText <$> getFeedAuthor f
itemauthor = decodeBS . fromFeedText <$> getItemAuthor i
extractField :: String -> [Maybe String] -> (String, String)
extractField k [] = (k, noneValue)
extractField k (Just v:_)
| not (null v) = (k, v)
extractField k (_:rest) = extractField k rest
playlistFields :: URLString -> YoutubePlaylistItem -> [(String, String)]
playlistFields u i = map (uncurry extractField)
[ ("feedurl", [Just u])
, ("feedtitle", [youtube_playlist_title i])
, ("itemtitle", [youtube_title i])
, ("feedauthor", [youtube_playlist_uploader i])
, ("itemauthor", [youtube_playlist_uploader i])
-- itemsummary omitted, no equivalent in yt-dlp data
, ("itemdescription", [youtube_description i])
, ("itemrights", [youtube_license i])
, ("itemid", [youtube_url i])
, ("title", [youtube_title i, youtube_playlist_title i])
, ("author", [youtube_playlist_uploader i])
]
noneValue :: String
noneValue = "none"
playlistDownloads :: URLString -> [YoutubePlaylistItem] -> [ToDownload]
playlistDownloads url = mapMaybe go
where
go i = do
iurl <- youtube_url i
return $ ToDownload
{ feedurl = url
, location = MediaLink iurl
, itemid = Just (encodeBS iurl)
, itempubdate =
Right . posixSecondsToUTCTime . fromIntegral
<$> youtube_timestamp i
, itemfields = playlistFields url i
, youtubedlscraped = True
}
{- Called when there is a problem with a feed.
-

doc/git-annex-importfeed.mdwn

@@ -8,10 +8,10 @@ git annex importfeed `[url ...]`
# DESCRIPTION
Imports the contents of podcasts and other feeds. Only downloads files whose
content has not already been added to the repository before, so you can
delete, rename, etc the resulting files and repeated runs won't duplicate
them.
Imports the contents of podcasts and other rss and atom feeds. Only
downloads files whose content has not already been added to the repository
before, so you can delete, rename, etc the resulting files and repeated
runs won't duplicate them.
When `yt-dlp` is installed, it can be used to download links in the feed.
This allows importing, for example, YouTube playlists.
@@ -65,6 +65,12 @@ resulting in the new url being downloaded to such a filename.
cannot be done, the import will fail, and the next import of the feed
will retry.
* `--scrape`
Rather than downloading the url and parsing it as an rss/atom feed
to find files to import, this uses yt-dlp to screen scrape the
equivalent of a feed, and imports what it finds.
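For example, this imports the videos of a YouTube channel (the url
here is illustrative) without needing any RSS feed:

	git annex importfeed --scrape https://www.youtube.com/@examplechannel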
* `--template`
Controls where the files are stored.