importfeed: Added --scrape option
This uses yt-dlp to screen scrape the equivalent of an RSS feed. Note that youtubedlscraped is a speed optimisation. Since yt-dlp found the urls, we know it can download them. That avoids calling youtubeDlSupported on each url, which makes --fast a lot faster. Almost all the same metadata fields and file formatting fields are populated, when yt-dlp is able to get the data. Note that yt-dlp has some additional useful metadata that could be exposed. But much of it is specific to particular websites, and it would be hard to document on the git-annex importfeed man page.
Sponsored-by: unqueued on Patreon
parent d7949f8202
commit 90db97d9a2
4 changed files with 165 additions and 18 deletions
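Roughly, the new --scrape option has yt-dlp do the scraping along these lines (a sketch; the flags match Annex/YoutubeDl.hs below, while the output file name and url are illustrative):

	yt-dlp --simulate --flat-playlist --match-filter '!is_live' \
		--print-to-file '%(.{playlist_title,playlist_uploader,title,description,license,url,timestamp})j' items.json \
		https://www.youtube.com/@examplechannel

Each line of items.json then holds one JSON object describing a video, which git-annex decodes as a YoutubePlaylistItem.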
@@ -1,10 +1,12 @@
{- yt-dlp (and deprecated youtube-dl) integration for git-annex
 -
 - Copyright 2017-2023 Joey Hess <id@joeyh.name>
 - Copyright 2017-2024 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}

{-# LANGUAGE DeriveGeneric #-}

module Annex.YoutubeDl (
	youtubeDl,
	youtubeDlTo,
@@ -13,6 +15,8 @@ module Annex.YoutubeDl (
	youtubeDlFileName,
	youtubeDlFileNameHtmlOnly,
	youtubeDlCommand,
	youtubePlaylist,
	YoutubePlaylistItem(..),
) where

import Annex.Common
@@ -23,12 +27,18 @@ import Utility.DiskFree
import Utility.HtmlDetect
import Utility.Process.Transcript
import Utility.Metered
import Utility.Tmp
import Messages.Progress
import Logs.Transfer

import Network.URI
import Control.Concurrent.Async
import Text.Read
import Data.Either
import qualified Data.Aeson as Aeson
import GHC.Generics
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8

-- youtube-dl can follow redirects to anywhere, including potentially
-- localhost or a private address. So, it's only allowed to download
@@ -324,3 +334,73 @@ parseYtdlpProgress = go [] . reverse . progresschunks
 - was buggy and is no longer done. -}
parseYoutubeDlProgress :: ProgressParser
parseYoutubeDlProgress _ = (Nothing, Nothing, "")

{- List the items that yt-dlp can download from an url.
 -
 - Note that this does not check youtubeDlAllowed because it does not
 - download content.
 -}
youtubePlaylist :: URLString -> Annex (Either String [YoutubePlaylistItem])
youtubePlaylist url = do
	cmd <- youtubeDlCommand
	if cmd == "yt-dlp"
		then liftIO $ youtubePlaylist' url cmd
		else return $ Left $ "Scraping needs yt-dlp, but git-annex has been configured to use " ++ cmd

youtubePlaylist' :: URLString -> String -> IO (Either String [YoutubePlaylistItem])
youtubePlaylist' url cmd = withTmpFile "yt-dlp" $ \tmpfile h -> do
	hClose h
	(outerr, ok) <- processTranscript cmd
		[ "--simulate"
		, "--flat-playlist"
		-- Skip live videos in progress
		, "--match-filter", "!is_live"
		, "--print-to-file"
		-- Write json with selected fields.
		, "%(.{" ++ intercalate "," youtubePlaylistItemFields ++ "})j"
		, tmpfile
		, url
		]
		Nothing
	if ok
		then flip catchIO (pure . Left . show) $ do
			v <- map Aeson.eitherDecodeStrict . B8.lines
				<$> B.readFile tmpfile
			return $ case partitionEithers v of
				((parserr:_), _) ->
Left $ "yt-dlp json parse errror: " ++ parserr
				([], r) -> Right r
		else return $ Left $ if null outerr
			then "yt-dlp failed"
			else "yt-dlp failed: " ++ outerr

-- There are other fields that yt-dlp can extract, but these are similar to
-- the information from an RSS feed.
youtubePlaylistItemFields :: [String]
youtubePlaylistItemFields =
	[ "playlist_title"
	, "playlist_uploader"
	, "title"
	, "description"
	, "license"
	, "url"
	, "timestamp"
	]

-- Parse JSON generated by yt-dlp for playlist. Note that any field
-- may be omitted when that information is not supported for a given website.
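-- A sample line, with illustrative values:
-- {"playlist_title":"Example","title":"First video","url":"https://example.com/v1","timestamp":1706000000}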
data YoutubePlaylistItem = YoutubePlaylistItem
	{ youtube_playlist_title :: Maybe String
	, youtube_playlist_uploader :: Maybe String
	, youtube_title :: Maybe String
	, youtube_description :: Maybe String
	, youtube_license :: Maybe String
	, youtube_url :: Maybe String
	, youtube_timestamp :: Maybe Integer -- ^ unix timestamp
	} deriving (Generic, Show)

instance Aeson.FromJSON YoutubePlaylistItem
  where
	parseJSON = Aeson.genericParseJSON Aeson.defaultOptions
		{ Aeson.fieldLabelModifier = drop (length "youtube_") }
@@ -1,3 +1,10 @@
git-annex (10.20240130) UNRELEASED; urgency=medium

  * importfeed: Added --scrape option, which uses yt-dlp to screen scrape
    the equivalent of an RSS feed.

 -- Joey Hess <id@joeyh.name>  Mon, 29 Jan 2024 15:59:33 -0400

git-annex (10.20240129) upstream; urgency=medium

  * info: Added "annex sizes of repositories" table to the overall display.
@@ -1,6 +1,6 @@
{- git-annex command
 -
 - Copyright 2013-2023 Joey Hess <id@joeyh.name>
 - Copyright 2013-2024 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}
@@ -17,6 +17,7 @@ import Text.Feed.Types
import qualified Data.Set as S
import qualified Data.Map as M
import Data.Time.Clock
import Data.Time.Clock.POSIX
import Data.Time.Format
import Data.Time.Calendar
import Data.Time.LocalTime
@@ -61,6 +62,7 @@ cmd = notBareRepo $ withAnnexOptions os $
data ImportFeedOptions = ImportFeedOptions
	{ feedUrls :: CmdParams
	, templateOption :: Maybe String
	, scrapeOption :: Bool
	, downloadOptions :: DownloadOptions
	}

@@ -71,6 +73,10 @@ optParser desc = ImportFeedOptions
		( long "template" <> metavar paramFormat
		<> help "template for filenames"
		))
	<*> switch
		( long "scrape"
		<> help "scrape website for content to import"
		)
	<*> parseDownloadOptions False

seek :: ImportFeedOptions -> CommandSeek
@@ -84,7 +90,7 @@ seek o = startConcurrency commandStages $ do
		liftIO $ atomically $ do
			m <- takeTMVar dlst
			putTMVar dlst (M.insert url Nothing m)
		commandAction $ getFeed url dlst
		commandAction $ getFeed o url dlst
		startpendingdownloads addunlockedmatcher cache dlst checkst False

	startpendingdownloads addunlockedmatcher cache dlst checkst True
@@ -135,18 +141,23 @@ seek o = startConcurrency commandStages $ do
	clearFeedProblem url

getFeed
	:: URLString
	:: ImportFeedOptions
	-> URLString
	-> TMVar (M.Map URLString (Maybe (Maybe [ToDownload])))
	-> CommandStart
getFeed url st =
getFeed o url st =
	starting "importfeed" (ActionItemOther (Just (UnquotedString url))) (SeekInput [url]) $
		get `onException` recordfail
		go `onException` recordfail
  where
	record v = liftIO $ atomically $ do
		m <- takeTMVar st
		putTMVar st (M.insert url v m)
	recordfail = record (Just Nothing)

	go
		| scrapeOption o = scrape
		| otherwise = get

	get = withTmpFile "feed" $ \tmpf h -> do
		liftIO $ hClose h
		ifM (downloadFeed url tmpf)
@@ -181,6 +192,14 @@ getFeed url st =
			recordfail
			next $ feedProblem url
				(msg ++ " (use --debug --debugfilter=ImportFeed to see the feed content that was downloaded)")

	scrape = youtubePlaylist url >>= \case
		Left err -> do
			recordfail
			next $ feedProblem url err
		Right playlist -> do
			record (Just (Just (playlistDownloads url playlist)))
			next $ return True

parseFeedFromFile' :: FilePath -> IO (Maybe Feed)
#if MIN_VERSION_feed(1,1,0)
@@ -197,6 +216,9 @@ data ToDownload = ToDownload
	, itempubdate :: Maybe (Either String UTCTime)
	-- Fields that are used as metadata and to generate the filename.
	, itemfields :: [(String, String)]
	-- True when youtube-dl found this by scraping, so certainly
	-- supports downloading it.
	, youtubedlscraped :: Bool
	}

data DownloadLocation = Enclosure URLString | MediaLink URLString
@@ -246,6 +268,7 @@ findDownloads u f = catMaybes $ map mk (feedItems f)
		_ -> Left . decodeBS . fromFeedText
			<$> getItemPublishDateString i
	, itemfields = extractFeedItemFields f i u
	, youtubedlscraped = False
	}

{- Feeds change, so a feed download cannot be resumed. -}
@@ -326,7 +349,7 @@ startDownload addunlockedmatcher opts cache cv todownload = case location todown

	addmediafast linkurl mediaurl mediakey =
		ifM (pure (not (rawOption (downloadOptions opts)))
			<&&> youtubeDlSupported linkurl)
			<&&> (pure (youtubedlscraped todownload) <||> youtubeDlSupported linkurl))
			( startUrlDownload cv todownload linkurl $ do
				runDownload todownload linkurl ".m" cache cv $ \f ->
					checkCanAdd (downloadOptions opts) f $ \canadd -> do
@@ -515,6 +538,15 @@ minimalMetaData i = case itemid i of
	Just iid -> MetaData $ M.singleton itemIdField
		(S.singleton $ toMetaValue iid)

noneValue :: String
noneValue = "none"

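-- Pick the first available non-empty value for a field, falling back
-- to noneValue when there is none.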
extractField :: String -> [Maybe String] -> (String, String)
extractField k [] = (k, noneValue)
extractField k (Just v:_)
	| not (null v) = (k, v)
extractField k (_:rest) = extractField k rest

extractFeedItemFields :: Feed -> Item -> URLString -> [(String, String)]
extractFeedItemFields f i u = map (uncurry extractField)
	[ ("feedurl", [Just u])
@@ -535,14 +567,36 @@ extractFeedItemFields f i u = map (uncurry extractField)
	feedauthor = decodeBS . fromFeedText <$> getFeedAuthor f
	itemauthor = decodeBS . fromFeedText <$> getItemAuthor i

extractField :: String -> [Maybe String] -> (String, String)
extractField k [] = (k, noneValue)
extractField k (Just v:_)
	| not (null v) = (k, v)
extractField k (_:rest) = extractField k rest
playlistFields :: URLString -> YoutubePlaylistItem -> [(String, String)]
playlistFields u i = map (uncurry extractField)
	[ ("feedurl", [Just u])
	, ("feedtitle", [youtube_playlist_title i])
	, ("itemtitle", [youtube_title i])
	, ("feedauthor", [youtube_playlist_uploader i])
	, ("itemauthor", [youtube_playlist_uploader i])
	-- itemsummary omitted, no equivalent in yt-dlp data
	, ("itemdescription", [youtube_description i])
	, ("itemrights", [youtube_license i])
	, ("itemid", [youtube_url i])
	, ("title", [youtube_title i, youtube_playlist_title i])
	, ("author", [youtube_playlist_uploader i])
	]

noneValue :: String
noneValue = "none"
playlistDownloads :: URLString -> [YoutubePlaylistItem] -> [ToDownload]
playlistDownloads url = mapMaybe go
  where
	go i = do
		iurl <- youtube_url i
		return $ ToDownload
			{ feedurl = url
			, location = MediaLink iurl
			, itemid = Just (encodeBS iurl)
			, itempubdate =
				Right . posixSecondsToUTCTime . fromIntegral
					<$> youtube_timestamp i
			, itemfields = playlistFields url i
			, youtubedlscraped = True
			}

{- Called when there is a problem with a feed.
 -
@@ -8,10 +8,10 @@ git annex importfeed `[url ...]`

# DESCRIPTION

Imports the contents of podcasts and other feeds. Only downloads files whose
content has not already been added to the repository before, so you can
delete, rename, etc the resulting files and repeated runs won't duplicate
them.
Imports the contents of podcasts and other rss and atom feeds. Only
downloads files whose content has not already been added to the repository
before, so you can delete, rename, etc the resulting files and repeated
runs won't duplicate them.

When `yt-dlp` is installed, it can be used to download links in the feed.
This allows importing, e.g., YouTube playlists.
@@ -65,6 +65,12 @@ resulting in the new url being downloaded to such a filename.
  cannot be done, the import will fail, and the next import of the feed
  will retry.

* `--scrape`

  Rather than downloading the url and parsing it as an rss/atom feed
  to find files to import, uses yt-dlp to screen scrape the equivalent
  of a feed, and imports what it finds.
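
  For example, to import the videos of a channel (illustrative url):

	git annex importfeed --scrape https://www.youtube.com/@examplechannel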

* `--template`

  Controls where the files are stored.