importfeed: Added --scrape option

Which uses yt-dlp to screen scrape the equivalent of an RSS feed.

Note that youtubedlscraped is a speed optimisation. Since yt-dlp found
the urls, we know it can download them. That avoids calling
youtubeDlSupported on each url, which makes --fast a lot faster.
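In the code below, that optimization boils down to a short-circuit guard;
a minimal sketch of the idea, using a hypothetical helper (the actual
change inlines the same check with <||>):

    -- If yt-dlp itself produced the url, skip the per-url support probe.
    urlSupported :: ToDownload -> URLString -> Annex Bool
    urlSupported todownload linkurl
        | youtubedlscraped todownload = pure True
        | otherwise = youtubeDlSupported linkurl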

Almost all the same metadata fields and file formatting fields are
populated when yt-dlp is able to get the data. Note that yt-dlp has some
additional useful metadata that could be exposed. But much of it is
specific to particular websites, and it would be hard to document in the
git-annex importfeed man page.

Sponsored-by: unqueued on Patreon
Joey Hess 2024-01-30 15:37:29 -04:00
parent d7949f8202
commit 90db97d9a2
4 changed files with 165 additions and 18 deletions

Annex/YoutubeDl.hs

@@ -1,10 +1,12 @@
{- yt-dlp (and deprecated youtube-dl) integration for git-annex
-
- Copyright 2017-2023 Joey Hess <id@joeyh.name>
- Copyright 2017-2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE DeriveGeneric #-}
module Annex.YoutubeDl (
youtubeDl,
youtubeDlTo,
@@ -13,6 +15,8 @@ module Annex.YoutubeDl (
youtubeDlFileName,
youtubeDlFileNameHtmlOnly,
youtubeDlCommand,
youtubePlaylist,
YoutubePlaylistItem(..),
) where
import Annex.Common
@@ -23,12 +27,18 @@ import Utility.DiskFree
import Utility.HtmlDetect
import Utility.Process.Transcript
import Utility.Metered
import Utility.Tmp
import Messages.Progress
import Logs.Transfer
import Network.URI
import Control.Concurrent.Async
import Text.Read
import Data.Either
import qualified Data.Aeson as Aeson
import GHC.Generics
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
-- youtube-dl can follow redirects to anywhere, including potentially
-- localhost or a private address. So, it's only allowed to download
@@ -324,3 +334,73 @@ parseYtdlpProgress = go [] . reverse . progresschunks
- was buggy and is no longer done. -}
parseYoutubeDlProgress :: ProgressParser
parseYoutubeDlProgress _ = (Nothing, Nothing, "")
{- List the items that yt-dlp can download from an url.
-
- Note that this does not check youtubeDlAllowed because it does not
- download content.
-}
youtubePlaylist :: URLString -> Annex (Either String [YoutubePlaylistItem])
youtubePlaylist url = do
cmd <- youtubeDlCommand
if cmd == "yt-dlp"
then liftIO $ youtubePlaylist' url cmd
else return $ Left $ "Scraping needs yt-dlp, but git-annex has been configured to use " ++ cmd
youtubePlaylist' :: URLString -> String -> IO (Either String [YoutubePlaylistItem])
youtubePlaylist' url cmd = withTmpFile "yt-dlp" $ \tmpfile h -> do
hClose h
(outerr, ok) <- processTranscript cmd
[ "--simulate"
, "--flat-playlist"
-- Skip live videos in progress
, "--match-filter", "!is_live"
, "--print-to-file"
-- Write json with selected fields.
, "%(.{" ++ intercalate "," youtubePlaylistItemFields ++ "})j"
, tmpfile
, url
]
Nothing
if ok
then flip catchIO (pure . Left . show) $ do
v <- map Aeson.eitherDecodeStrict . B8.lines
<$> B.readFile tmpfile
return $ case partitionEithers v of
((parserr:_), _) ->
Left $ "yt-dlp json parse errror: " ++ parserr
([], r) -> Right r
else return $ Left $ if null outerr
then "yt-dlp failed"
else "yt-dlp failed: " ++ outerr
-- There are other fields that yt-dlp can extract, but these are similar to
-- the information from an RSS feed.
youtubePlaylistItemFields :: [String]
youtubePlaylistItemFields =
[ "playlist_title"
, "playlist_uploader"
, "title"
, "description"
, "license"
, "url"
, "timestamp"
]
-- Parse JSON generated by yt-dlp for a playlist. Note that any field
-- may be omitted when that information is not supported for a given website.
data YoutubePlaylistItem = YoutubePlaylistItem
{ youtube_playlist_title :: Maybe String
, youtube_playlist_uploader :: Maybe String
, youtube_title :: Maybe String
, youtube_description :: Maybe String
, youtube_license :: Maybe String
, youtube_url :: Maybe String
, youtube_timestamp :: Maybe Integer -- ^ unix timestamp
} deriving (Generic, Show)
instance Aeson.FromJSON YoutubePlaylistItem
where
parseJSON = Aeson.genericParseJSON Aeson.defaultOptions
{ Aeson.fieldLabelModifier = drop (length "youtube_") }
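-- An illustrative line of the JSON that yt-dlp emits for the fields above
-- (values made up; fields yt-dlp cannot determine come through as null,
-- which the Maybe fields parse to Nothing):
--
--   {"playlist_title":"Example Channel","playlist_uploader":"Example",
--    "title":"Episode 1","description":"...","license":null,
--    "url":"https://www.youtube.com/watch?v=abcdefghijk","timestamp":1706640000}
--
-- The fieldLabelModifier strips the "youtube_" prefix, so the json key
-- "title" populates youtube_title, and so on.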

CHANGELOG

@@ -1,3 +1,10 @@
git-annex (10.20240130) UNRELEASED; urgency=medium
* importfeed: Added --scrape option, which uses yt-dlp to screen scrape
the equivalent of an RSS feed.
-- Joey Hess <id@joeyh.name> Mon, 29 Jan 2024 15:59:33 -0400
git-annex (10.20240129) upstream; urgency=medium
* info: Added "annex sizes of repositories" table to the overall display.

Command/ImportFeed.hs

@@ -1,6 +1,6 @@
{- git-annex command
-
- Copyright 2013-2023 Joey Hess <id@joeyh.name>
- Copyright 2013-2024 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@@ -17,6 +17,7 @@ import Text.Feed.Types
import qualified Data.Set as S
import qualified Data.Map as M
import Data.Time.Clock
import Data.Time.Clock.POSIX
import Data.Time.Format
import Data.Time.Calendar
import Data.Time.LocalTime
@@ -61,6 +62,7 @@ cmd = notBareRepo $ withAnnexOptions os $
data ImportFeedOptions = ImportFeedOptions
{ feedUrls :: CmdParams
, templateOption :: Maybe String
, scrapeOption :: Bool
, downloadOptions :: DownloadOptions
}
@@ -71,6 +73,10 @@ optParser desc = ImportFeedOptions
( long "template" <> metavar paramFormat
<> help "template for filenames"
))
<*> switch
( long "scrape"
<> help "scrape website for content to import"
)
<*> parseDownloadOptions False
seek :: ImportFeedOptions -> CommandSeek
@@ -84,7 +90,7 @@ seek o = startConcurrency commandStages $ do
liftIO $ atomically $ do
m <- takeTMVar dlst
putTMVar dlst (M.insert url Nothing m)
commandAction $ getFeed url dlst
commandAction $ getFeed o url dlst
startpendingdownloads addunlockedmatcher cache dlst checkst False
startpendingdownloads addunlockedmatcher cache dlst checkst True
@@ -135,18 +141,23 @@ seek o = startConcurrency commandStages $ do
clearFeedProblem url
getFeed
:: URLString
:: ImportFeedOptions
-> URLString
-> TMVar (M.Map URLString (Maybe (Maybe [ToDownload])))
-> CommandStart
getFeed url st =
getFeed o url st =
starting "importfeed" (ActionItemOther (Just (UnquotedString url))) (SeekInput [url]) $
get `onException` recordfail
go `onException` recordfail
where
record v = liftIO $ atomically $ do
m <- takeTMVar st
putTMVar st (M.insert url v m)
recordfail = record (Just Nothing)
go
| scrapeOption o = scrape
| otherwise = get
get = withTmpFile "feed" $ \tmpf h -> do
liftIO $ hClose h
ifM (downloadFeed url tmpf)
@@ -181,6 +192,14 @@ getFeed url st =
recordfail
next $ feedProblem url
(msg ++ " (use --debug --debugfilter=ImportFeed to see the feed content that was downloaded)")
scrape = youtubePlaylist url >>= \case
Left err -> do
recordfail
next $ feedProblem url err
Right playlist -> do
record (Just (Just (playlistDownloads url playlist)))
next $ return True
parseFeedFromFile' :: FilePath -> IO (Maybe Feed)
#if MIN_VERSION_feed(1,1,0)
@@ -197,6 +216,9 @@ data ToDownload = ToDownload
, itempubdate :: Maybe (Either String UTCTime)
-- Fields that are used as metadata and to generate the filename.
, itemfields :: [(String, String)]
-- True when youtube-dl found this by scraping, so it can certainly
-- be downloaded.
, youtubedlscraped :: Bool
}
data DownloadLocation = Enclosure URLString | MediaLink URLString
@@ -246,6 +268,7 @@ findDownloads u f = catMaybes $ map mk (feedItems f)
_ -> Left . decodeBS . fromFeedText
<$> getItemPublishDateString i
, itemfields = extractFeedItemFields f i u
, youtubedlscraped = False
}
{- Feeds change, so a feed download cannot be resumed. -}
@@ -326,7 +349,7 @@ startDownload addunlockedmatcher opts cache cv todownload = case location todown
addmediafast linkurl mediaurl mediakey =
ifM (pure (not (rawOption (downloadOptions opts)))
<&&> youtubeDlSupported linkurl)
<&&> (pure (youtubedlscraped todownload) <||> youtubeDlSupported linkurl))
( startUrlDownload cv todownload linkurl $ do
runDownload todownload linkurl ".m" cache cv $ \f ->
checkCanAdd (downloadOptions opts) f $ \canadd -> do
@@ -515,6 +538,15 @@ minimalMetaData i = case itemid i of
Just iid -> MetaData $ M.singleton itemIdField
(S.singleton $ toMetaValue iid)
noneValue :: String
noneValue = "none"
extractField :: String -> [Maybe String] -> (String, String)
extractField k [] = (k, noneValue)
extractField k (Just v:_)
| not (null v) = (k, v)
extractField k (_:rest) = extractField k rest
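-- For example (values illustrative):
--   extractField "itemtitle" [Nothing, Just "Episode 1"] == ("itemtitle", "Episode 1")
--   extractField "itemtitle" [Just ""] == ("itemtitle", "none")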
extractFeedItemFields :: Feed -> Item -> URLString -> [(String, String)]
extractFeedItemFields f i u = map (uncurry extractField)
[ ("feedurl", [Just u])
@@ -535,14 +567,36 @@ extractFeedItemFields f i u = map (uncurry extractField)
feedauthor = decodeBS . fromFeedText <$> getFeedAuthor f
itemauthor = decodeBS . fromFeedText <$> getItemAuthor i
extractField :: String -> [Maybe String] -> (String, String)
extractField k [] = (k, noneValue)
extractField k (Just v:_)
| not (null v) = (k, v)
extractField k (_:rest) = extractField k rest
playlistFields :: URLString -> YoutubePlaylistItem -> [(String, String)]
playlistFields u i = map (uncurry extractField)
[ ("feedurl", [Just u])
, ("feedtitle", [youtube_playlist_title i])
, ("itemtitle", [youtube_title i])
, ("feedauthor", [youtube_playlist_uploader i])
, ("itemauthor", [youtube_playlist_uploader i])
-- itemsummary omitted, no equivalent in yt-dlp data
, ("itemdescription", [youtube_description i])
, ("itemrights", [youtube_license i])
, ("itemid", [youtube_url i])
, ("title", [youtube_title i, youtube_playlist_title i])
, ("author", [youtube_playlist_uploader i])
]
noneValue :: String
noneValue = "none"
playlistDownloads :: URLString -> [YoutubePlaylistItem] -> [ToDownload]
playlistDownloads url = mapMaybe go
where
go i = do
iurl <- youtube_url i
return $ ToDownload
{ feedurl = url
, location = MediaLink iurl
, itemid = Just (encodeBS iurl)
, itempubdate =
Right . posixSecondsToUTCTime . fromIntegral
<$> youtube_timestamp i
, itemfields = playlistFields url i
, youtubedlscraped = True
}
{- Called when there is a problem with a feed.
-

doc/git-annex-importfeed.mdwn

@@ -8,10 +8,10 @@ git annex importfeed `[url ...]`
# DESCRIPTION
Imports the contents of podcasts and other feeds. Only downloads files whose
content has not already been added to the repository before, so you can
delete, rename, etc the resulting files and repeated runs won't duplicate
them.
Imports the contents of podcasts and other rss and atom feeds. Only
downloads files whose content has not already been added to the repository
before, so you can delete, rename, etc the resulting files and repeated
runs won't duplicate them.
When `yt-dlp` is installed, it can be used to download links in the feed.
This allows importing, for example, YouTube playlists.
@@ -65,6 +65,12 @@ resulting in the new url being downloaded to such a filename.
cannot be done, the import will fail, and the next import of the feed
will retry.
* `--scrape`
Rather than downloading the url and parsing it as an rss/atom feed
to find files to import, this uses yt-dlp to screen scrape the
equivalent of a feed, and imports what it finds.
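For example, this imports the videos of a YouTube channel (the url
here is illustrative) without needing any RSS feed:

	git annex importfeed --scrape https://www.youtube.com/@examplechannel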
* `--template`
Controls where the files are stored.