importfeed: Added --scrape option

Which uses yt-dlp to screen scrape the equivalent of an RSS feed.
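For example, `git annex importfeed --scrape url` asks yt-dlp what it can
download from the url and imports that, rather than downloading and parsing
the url as a feed.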

Note that youtubedlscraped is a speed optimisation. Since yt-dlp found
the urls, we know it can download them. That avoids calling
youtubeDlSupported on each url, which makes --fast a lot faster.

Almost all the same metadata fields and file formatting fields are
populated, when yt-dlp is able to get the data. Note that yt-dlp has some
additional useful metadata that could be exposed. But, much of it is
specific to particular websites, and it would be hard to document on the
git-annex importfeed man page.

Sponsored-by: unqueued on Patreon
Joey Hess 2024-01-30 15:37:29 -04:00
parent d7949f8202
commit 90db97d9a2
4 changed files with 165 additions and 18 deletions

@@ -1,10 +1,12 @@
 {- yt-dlp (and deprecated youtube-dl) integration for git-annex
  -
- - Copyright 2017-2023 Joey Hess <id@joeyh.name>
+ - Copyright 2017-2024 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}

+{-# LANGUAGE DeriveGeneric #-}
+
 module Annex.YoutubeDl (
 	youtubeDl,
 	youtubeDlTo,
@@ -13,6 +15,8 @@ module Annex.YoutubeDl (
 	youtubeDlFileName,
 	youtubeDlFileNameHtmlOnly,
 	youtubeDlCommand,
+	youtubePlaylist,
+	YoutubePlaylistItem(..),
 	) where

 import Annex.Common
@@ -23,12 +27,18 @@ import Utility.DiskFree
 import Utility.HtmlDetect
 import Utility.Process.Transcript
 import Utility.Metered
+import Utility.Tmp
 import Messages.Progress
 import Logs.Transfer
 import Network.URI
 import Control.Concurrent.Async
 import Text.Read
+import Data.Either
+import qualified Data.Aeson as Aeson
+import GHC.Generics
+import qualified Data.ByteString as B
+import qualified Data.ByteString.Char8 as B8

 -- youtube-dl can follow redirects to anywhere, including potentially
 -- localhost or a private address. So, it's only allowed to download
@@ -324,3 +334,73 @@ parseYtdlpProgress = go [] . reverse . progresschunks
  - was buggy and is no longer done. -}
 parseYoutubeDlProgress :: ProgressParser
 parseYoutubeDlProgress _ = (Nothing, Nothing, "")
+
+{- List the items that yt-dlp can download from an url.
+ -
+ - Note that this does not check youtubeDlAllowed because it does not
+ - download content.
+ -}
+youtubePlaylist :: URLString -> Annex (Either String [YoutubePlaylistItem])
+youtubePlaylist url = do
+	cmd <- youtubeDlCommand
+	if cmd == "yt-dlp"
+		then liftIO $ youtubePlaylist' url cmd
+		else return $ Left $ "Scraping needs yt-dlp, but git-annex has been configured to use " ++ cmd
+
+youtubePlaylist' :: URLString -> String -> IO (Either String [YoutubePlaylistItem])
+youtubePlaylist' url cmd = withTmpFile "yt-dlp" $ \tmpfile h -> do
+	hClose h
+	(outerr, ok) <- processTranscript cmd
+		[ "--simulate"
+		, "--flat-playlist"
+		-- Skip live videos in progress
+		, "--match-filter", "!is_live"
+		, "--print-to-file"
+		-- Write json with selected fields.
+		, "%(.{" ++ intercalate "," youtubePlaylistItemFields ++ "})j"
+		, tmpfile
+		, url
+		]
+		Nothing
+	if ok
+		then flip catchIO (pure . Left . show) $ do
+			v <- map Aeson.eitherDecodeStrict . B8.lines
+				<$> B.readFile tmpfile
+			return $ case partitionEithers v of
+				((parserr:_), _) ->
+					Left $ "yt-dlp json parse error: " ++ parserr
+				([], r) -> Right r
+		else return $ Left $ if null outerr
+			then "yt-dlp failed"
+			else "yt-dlp failed: " ++ outerr
+
+-- There are other fields that yt-dlp can extract, but these are similar to
+-- the information from an RSS feed.
+youtubePlaylistItemFields :: [String]
+youtubePlaylistItemFields =
+	[ "playlist_title"
+	, "playlist_uploader"
+	, "title"
+	, "description"
+	, "license"
+	, "url"
+	, "timestamp"
+	]
+
+-- Parse JSON generated by yt-dlp for playlist. Note that any field
+-- may be omitted when that information is not supported for a given website.
+data YoutubePlaylistItem = YoutubePlaylistItem
+	{ youtube_playlist_title :: Maybe String
+	, youtube_playlist_uploader :: Maybe String
+	, youtube_title :: Maybe String
+	, youtube_description :: Maybe String
+	, youtube_license :: Maybe String
+	, youtube_url :: Maybe String
+	, youtube_timestamp :: Maybe Integer -- ^ unix timestamp
+	} deriving (Generic, Show)
+
+instance Aeson.FromJSON YoutubePlaylistItem
+  where
+	parseJSON = Aeson.genericParseJSON Aeson.defaultOptions
+		{ Aeson.fieldLabelModifier = drop (length "youtube_") }
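
The JSON written by --print-to-file is one object per item, containing only
the selected fields, and the generically derived FromJSON instance above maps
those keys onto the youtube_-prefixed record fields by dropping the prefix.
Here is a minimal standalone sketch of that decoding step, not part of the
commit; the trimmed-down Item type and the sample JSON line are made up for
illustration:

{-# LANGUAGE DeriveGeneric, OverloadedStrings #-}

import qualified Data.Aeson as Aeson
import GHC.Generics

-- Stand-in for YoutubePlaylistItem with just two fields; the same
-- fieldLabelModifier drops the "youtube_" prefix so the record fields
-- line up with the keys that yt-dlp emits.
data Item = Item
	{ youtube_title :: Maybe String
	, youtube_timestamp :: Maybe Integer
	} deriving (Generic, Show)

instance Aeson.FromJSON Item where
	parseJSON = Aeson.genericParseJSON Aeson.defaultOptions
		{ Aeson.fieldLabelModifier = drop (length "youtube_") }

main :: IO ()
main = print (Aeson.eitherDecode "{\"title\":\"an episode\",\"timestamp\":1706634000}" :: Either String Item)
-- Right (Item {youtube_title = Just "an episode", youtube_timestamp = Just 1706634000})

Fields that a particular website does not support are simply absent from the
JSON and decode to Nothing, which is why every field of YoutubePlaylistItem
is a Maybe.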

@@ -1,3 +1,10 @@
+git-annex (10.20240130) UNRELEASED; urgency=medium
+
+  * importfeed: Added --scrape option, which uses yt-dlp to screen scrape
+    the equivalent of an RSS feed.
+
+ -- Joey Hess <id@joeyh.name>  Mon, 29 Jan 2024 15:59:33 -0400
+
 git-annex (10.20240129) upstream; urgency=medium

   * info: Added "annex sizes of repositories" table to the overall display.

@@ -1,6 +1,6 @@
 {- git-annex command
  -
- - Copyright 2013-2023 Joey Hess <id@joeyh.name>
+ - Copyright 2013-2024 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU AGPL version 3 or higher.
  -}
@@ -17,6 +17,7 @@ import Text.Feed.Types
 import qualified Data.Set as S
 import qualified Data.Map as M
 import Data.Time.Clock
+import Data.Time.Clock.POSIX
 import Data.Time.Format
 import Data.Time.Calendar
 import Data.Time.LocalTime
@@ -61,6 +62,7 @@ cmd = notBareRepo $ withAnnexOptions os $
 data ImportFeedOptions = ImportFeedOptions
 	{ feedUrls :: CmdParams
 	, templateOption :: Maybe String
+	, scrapeOption :: Bool
 	, downloadOptions :: DownloadOptions
 	}

@@ -71,6 +73,10 @@ optParser desc = ImportFeedOptions
 		( long "template" <> metavar paramFormat
 		<> help "template for filenames"
 		))
+	<*> switch
+		( long "scrape"
+		<> help "scrape website for content to import"
+		)
 	<*> parseDownloadOptions False

 seek :: ImportFeedOptions -> CommandSeek
@@ -84,7 +90,7 @@ seek o = startConcurrency commandStages $ do
 		liftIO $ atomically $ do
 			m <- takeTMVar dlst
 			putTMVar dlst (M.insert url Nothing m)
-		commandAction $ getFeed url dlst
+		commandAction $ getFeed o url dlst
 		startpendingdownloads addunlockedmatcher cache dlst checkst False

 	startpendingdownloads addunlockedmatcher cache dlst checkst True
@@ -135,18 +141,23 @@ seek o = startConcurrency commandStages $ do
 		clearFeedProblem url

 getFeed
-	:: URLString
+	:: ImportFeedOptions
+	-> URLString
 	-> TMVar (M.Map URLString (Maybe (Maybe [ToDownload])))
 	-> CommandStart
-getFeed url st =
+getFeed o url st =
 	starting "importfeed" (ActionItemOther (Just (UnquotedString url))) (SeekInput [url]) $
-		get `onException` recordfail
+		go `onException` recordfail
   where
 	record v = liftIO $ atomically $ do
 		m <- takeTMVar st
 		putTMVar st (M.insert url v m)
 	recordfail = record (Just Nothing)
+
+	go
+		| scrapeOption o = scrape
+		| otherwise = get

 	get = withTmpFile "feed" $ \tmpf h -> do
 		liftIO $ hClose h
 		ifM (downloadFeed url tmpf)
@@ -182,6 +193,14 @@ getFeed url st =
 				next $ feedProblem url
 					(msg ++ " (use --debug --debugfilter=ImportFeed to see the feed content that was downloaded)")

+	scrape = youtubePlaylist url >>= \case
+		Left err -> do
+			recordfail
+			next $ feedProblem url err
+		Right playlist -> do
+			record (Just (Just (playlistDownloads url playlist)))
+			next $ return True
+
 parseFeedFromFile' :: FilePath -> IO (Maybe Feed)
 #if MIN_VERSION_feed(1,1,0)
 parseFeedFromFile' = parseFeedFromFile
@@ -197,6 +216,9 @@ data ToDownload = ToDownload
 	, itempubdate :: Maybe (Either String UTCTime)
 	-- Fields that are used as metadata and to generate the filename.
 	, itemfields :: [(String, String)]
+	-- True when youtube-dl found this by scraping, so certainly
+	-- supports downloading it.
+	, youtubedlscraped :: Bool
 	}

 data DownloadLocation = Enclosure URLString | MediaLink URLString
@@ -246,6 +268,7 @@ findDownloads u f = catMaybes $ map mk (feedItems f)
 			_ -> Left . decodeBS . fromFeedText
 				<$> getItemPublishDateString i
 		, itemfields = extractFeedItemFields f i u
+		, youtubedlscraped = False
 		}

 {- Feeds change, so a feed download cannot be resumed. -}
@@ -326,7 +349,7 @@ startDownload addunlockedmatcher opts cache cv todownload = case location todown
 	addmediafast linkurl mediaurl mediakey =
 		ifM (pure (not (rawOption (downloadOptions opts)))
-			<&&> youtubeDlSupported linkurl)
+			<&&> (pure (youtubedlscraped todownload) <||> youtubeDlSupported linkurl))
 			( startUrlDownload cv todownload linkurl $ do
 				runDownload todownload linkurl ".m" cache cv $ \f ->
 					checkCanAdd (downloadOptions opts) f $ \canadd -> do
@@ -515,6 +538,15 @@ minimalMetaData i = case itemid i of
 	Just iid -> MetaData $ M.singleton itemIdField
 		(S.singleton $ toMetaValue iid)

+noneValue :: String
+noneValue = "none"
+
+extractField :: String -> [Maybe String] -> (String, String)
+extractField k [] = (k, noneValue)
+extractField k (Just v:_)
+	| not (null v) = (k, v)
+extractField k (_:rest) = extractField k rest
+
 extractFeedItemFields :: Feed -> Item -> URLString -> [(String, String)]
 extractFeedItemFields f i u = map (uncurry extractField)
 	[ ("feedurl", [Just u])
@@ -535,14 +567,36 @@ extractFeedItemFields f i u = map (uncurry extractField)
 	feedauthor = decodeBS . fromFeedText <$> getFeedAuthor f
 	itemauthor = decodeBS . fromFeedText <$> getItemAuthor i

-extractField :: String -> [Maybe String] -> (String, String)
-extractField k [] = (k, noneValue)
-extractField k (Just v:_)
-	| not (null v) = (k, v)
-extractField k (_:rest) = extractField k rest
+playlistFields :: URLString -> YoutubePlaylistItem -> [(String, String)]
+playlistFields u i = map (uncurry extractField)
+	[ ("feedurl", [Just u])
+	, ("feedtitle", [youtube_playlist_title i])
+	, ("itemtitle", [youtube_title i])
+	, ("feedauthor", [youtube_playlist_uploader i])
+	, ("itemauthor", [youtube_playlist_uploader i])
+	-- itemsummary omitted, no equivalent in yt-dlp data
+	, ("itemdescription", [youtube_description i])
+	, ("itemrights", [youtube_license i])
+	, ("itemid", [youtube_url i])
+	, ("title", [youtube_title i, youtube_playlist_title i])
+	, ("author", [youtube_playlist_uploader i])
+	]

-noneValue :: String
-noneValue = "none"
+playlistDownloads :: URLString -> [YoutubePlaylistItem] -> [ToDownload]
+playlistDownloads url = mapMaybe go
+  where
+	go i = do
+		iurl <- youtube_url i
+		return $ ToDownload
+			{ feedurl = url
+			, location = MediaLink iurl
+			, itemid = Just (encodeBS iurl)
+			, itempubdate =
+				Right . posixSecondsToUTCTime . fromIntegral
+					<$> youtube_timestamp i
+			, itemfields = playlistFields url i
+			, youtubedlscraped = True
+			}

 {- Called when there is a problem with a feed.
  -
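
An aside on the metadata fallback that both extractFeedItemFields and the new
playlistFields go through: extractField takes the first candidate value that
is present and non-empty, and otherwise records the "none" placeholder. A
standalone restatement for illustration; the main driver and the sample
inputs are made up, and noneValue is inlined:

-- Same fallback logic as extractField above, restated so it runs on its own.
extractField :: String -> [Maybe String] -> (String, String)
extractField k [] = (k, "none")
extractField k (Just v:_)
	| not (null v) = (k, v)
extractField k (_:rest) = extractField k rest

main :: IO ()
main = mapM_ print
	[ extractField "title" [Nothing, Just "an episode", Just "a playlist"]
	, extractField "itemrights" [Just ""]
	]
-- prints ("title","an episode") and ("itemrights","none")

So a scraped item with, say, no license information still gets an itemrights
field, set to "none", rather than being skipped.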

@@ -8,10 +8,10 @@ git annex importfeed `[url ...]`

 # DESCRIPTION

-Imports the contents of podcasts and other feeds. Only downloads files whose
-content has not already been added to the repository before, so you can
-delete, rename, etc the resulting files and repeated runs won't duplicate
-them.
+Imports the contents of podcasts and other rss and atom feeds. Only
+downloads files whose content has not already been added to the repository
+before, so you can delete, rename, etc the resulting files and repeated
+runs won't duplicate them.

 When `yt-dlp` is installed, it can be used to download links in the feed.
 This allows importing e.g., YouTube playlists.
@@ -65,6 +65,12 @@ resulting in the new url being downloaded to such a filename.
   cannot be done, the import will fail, and the next import of the feed
   will retry.

+* `--scrape`
+
+  Rather than downloading the url and parsing it as an rss/atom feed to
+  find files to import, uses yt-dlp to screen scrape the equivalent of a
+  feed, and imports what it finds.
+
 * `--template`

   Controls where the files are stored.