convert importfeed to youtube-dl

Fully working, including --fast/--relaxed.

Note that, while git-annex addurl --relaxed is not going to check
youtube-dl, I kept git annex importfeed --relaxed checking it.
Thinking is that, let's not break people's importfeed cron jobs, and
importfeed does not typically have to check a large number of new items,
so it's ok if it's a little bit slower when used with youtube playlist
feeds.

importfeed's behavior is also improved (?) when a feed has links in it
to non-media files. Before, those were skipped. Now, the content of the
link is downloaded. This had to be done, because trying to use
youtube-dl is slow, and if those were skipped, it would have to check
every time importfeed was run. While this behavior change may not be
desirable for some feeds, that intersperse links to web pages with
enclosures, it will be desirable for other feeds, that have
non-enclosure directy links to media files.

Remove old quvi modules.

This commit was sponsored by Øyvind Andersen Holm.
This commit is contained in:
Joey Hess 2017-11-29 17:05:27 -04:00
parent 99bebdface
commit 24f27ec39d
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
9 changed files with 85 additions and 273 deletions

View file

@ -64,7 +64,6 @@ import Types.LockCache
import Types.DesktopNotify import Types.DesktopNotify
import Types.CleanupActions import Types.CleanupActions
import qualified Database.Keys.Handle as Keys import qualified Database.Keys.Handle as Keys
import Utility.Quvi (QuviVersion)
import Utility.InodeCache import Utility.InodeCache
import Utility.Url import Utility.Url
@ -134,7 +133,6 @@ data AnnexState = AnnexState
, errcounter :: Integer , errcounter :: Integer
, unusedkeys :: Maybe (S.Set Key) , unusedkeys :: Maybe (S.Set Key)
, tempurls :: M.Map Key URLString , tempurls :: M.Map Key URLString
, quviversion :: Maybe QuviVersion
, existinghooks :: M.Map Git.Hook.Hook Bool , existinghooks :: M.Map Git.Hook.Hook Bool
, desktopnotify :: DesktopNotify , desktopnotify :: DesktopNotify
, workers :: [Either AnnexState (Async AnnexState)] , workers :: [Either AnnexState (Async AnnexState)]
@ -190,7 +188,6 @@ newState c r = do
, errcounter = 0 , errcounter = 0
, unusedkeys = Nothing , unusedkeys = Nothing
, tempurls = M.empty , tempurls = M.empty
, quviversion = Nothing
, existinghooks = M.empty , existinghooks = M.empty
, desktopnotify = mempty , desktopnotify = mempty
, workers = [] , workers = []

View file

@ -1,33 +0,0 @@
{- quvi options for git-annex
-
- Copyright 2013 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE Rank2Types #-}
module Annex.Quvi where
import Annex.Common
import qualified Annex
import Utility.Quvi
import Utility.Url
withQuviOptions :: forall a. Query a -> [QuviParams] -> URLString -> Annex a
withQuviOptions a ps url = do
v <- quviVersion
opts <- return []
liftIO $ a v (concatMap (\mkp -> mkp v) ps ++ opts) url
quviSupported :: URLString -> Annex Bool
quviSupported u = liftIO . flip supported u =<< quviVersion
quviVersion :: Annex QuviVersion
quviVersion = go =<< Annex.getState Annex.quviversion
where
go (Just v) = return v
go Nothing = do
v <- liftIO probeVersion
Annex.changeState $ \s -> s { Annex.quviversion = Just v }
return v

View file

@ -81,7 +81,6 @@ buildFlags = filter (not . null)
-- Always enabled now, but users may be used to seeing these flags -- Always enabled now, but users may be used to seeing these flags
-- listed. -- listed.
, "Feeds" , "Feeds"
, "Quvi"
] ]
-- Not a complete list, let alone a listing transitive deps, but only -- Not a complete list, let alone a listing transitive deps, but only

View file

@ -7,6 +7,8 @@ git-annex (6.20171125) UNRELEASED; urgency=medium
does not allow doing so without hitting the network, which would make does not allow doing so without hitting the network, which would make
this no faster than addurl --fast. Use addurl --fast instead if you this no faster than addurl --fast. Use addurl --fast instead if you
want embedded media to be downloaded. want embedded media to be downloaded.
* importfeed now downloads things linked to by feeds, even when they are
not media files.
-- Joey Hess <id@joeyh.name> Tue, 28 Nov 2017 13:48:44 -0400 -- Joey Hess <id@joeyh.name> Tue, 28 Nov 2017 13:48:44 -0400

View file

@ -165,7 +165,7 @@ downloadRemoteFile r relaxed uri file sz = checkCanAdd file $ do
liftIO $ createDirectoryIfMissing True (parentDir file) liftIO $ createDirectoryIfMissing True (parentDir file)
ifM (Annex.getState Annex.fast <||> pure relaxed) ifM (Annex.getState Annex.fast <||> pure relaxed)
( do ( do
cleanup (Remote.uuid r) loguri file urlkey Nothing addWorkTree (Remote.uuid r) loguri file urlkey Nothing
return (Just urlkey) return (Just urlkey)
, do , do
-- Set temporary url for the urlkey -- Set temporary url for the urlkey
@ -214,46 +214,6 @@ performWeb o url file urlinfo = ifAnnexed file addurl geturl
addurl = addUrlChecked (relaxedOption o) url webUUID $ \k -> return $ addurl = addUrlChecked (relaxedOption o) url webUUID $ \k -> return $
(Url.urlExists urlinfo, Url.urlSize urlinfo == keySize k) (Url.urlExists urlinfo, Url.urlSize urlinfo == keySize k)
performQuvi :: Bool -> URLString -> URLString -> FilePath -> CommandPerform
performQuvi relaxed pageurl videourl file = ifAnnexed file addurl geturl
where
quviurl = setDownloader pageurl QuviDownloader
addurl key = next $ do
cleanup webUUID quviurl file key Nothing
return True
geturl = next $ isJust <$> addUrlFileQuvi relaxed quviurl videourl file
addUrlFileQuvi :: Bool -> URLString -> URLString -> FilePath -> Annex (Maybe Key)
addUrlFileQuvi relaxed quviurl videourl file = checkCanAdd file $ do
let key = Backend.URL.fromUrl quviurl Nothing
ifM (pure relaxed <||> Annex.getState Annex.fast)
( do
cleanup webUUID quviurl file key Nothing
return (Just key)
, do
{- Get the size, and use that to check
- disk space. However, the size info is not
- retained, because the size of a video stream
- might change and we want to be able to download
- it later. -}
urlinfo <- Url.withUrlOptions (Url.getUrlInfo videourl)
let sizedkey = addSizeUrlKey urlinfo key
checkDiskSpaceToGet sizedkey Nothing $ do
tmp <- fromRepo $ gitAnnexTmpObjectLocation key
showOutput
ok <- Transfer.notifyTransfer Transfer.Download afile $
Transfer.download webUUID key afile Transfer.forwardRetry $ \p -> do
liftIO $ createDirectoryIfMissing True (parentDir tmp)
downloadUrl key p [videourl] tmp
if ok
then do
cleanup webUUID quviurl file key (Just tmp)
return (Just key)
else return Nothing
)
where
afile = AssociatedFile (Just file)
addUrlChecked :: Bool -> URLString -> UUID -> (Key -> Annex (Bool, Bool)) -> Key -> CommandPerform addUrlChecked :: Bool -> URLString -> UUID -> (Key -> Annex (Bool, Bool)) -> Key -> CommandPerform
addUrlChecked relaxed url u checkexistssize key addUrlChecked relaxed url u checkexistssize key
| relaxed = do | relaxed = do
@ -321,7 +281,7 @@ downloadWeb url urlinfo file =
pruneTmpWorkDirBefore tmp (liftIO . nukeFile) pruneTmpWorkDirBefore tmp (liftIO . nukeFile)
let dest = takeFileName mediafile let dest = takeFileName mediafile
showDestinationFile dest showDestinationFile dest
cleanup webUUID mediaurl dest mediakey (Just mediafile) addWorkTree webUUID mediaurl dest mediakey (Just mediafile)
return $ Right $ Just mediakey return $ Right $ Just mediakey
Right Nothing -> Right <$> normalfinish tmp Right Nothing -> Right <$> normalfinish tmp
Left msg -> return $ Left msg Left msg -> return $ Left msg
@ -379,15 +339,16 @@ finishDownloadWith tmp u url file = do
case k of case k of
Nothing -> return Nothing Nothing -> return Nothing
Just (key, _) -> do Just (key, _) -> do
cleanup u url file key (Just tmp) addWorkTree u url file key (Just tmp)
return (Just key) return (Just key)
{- Adds the url size to the Key. -} {- Adds the url size to the Key. -}
addSizeUrlKey :: Url.UrlInfo -> Key -> Key addSizeUrlKey :: Url.UrlInfo -> Key -> Key
addSizeUrlKey urlinfo key = key { keySize = Url.urlSize urlinfo } addSizeUrlKey urlinfo key = key { keySize = Url.urlSize urlinfo }
cleanup :: UUID -> URLString -> FilePath -> Key -> Maybe FilePath -> Annex () {- Adds worktree file to the repository. -}
cleanup u url file key mtmp = case mtmp of addWorkTree :: UUID -> URLString -> FilePath -> Key -> Maybe FilePath -> Annex ()
addWorkTree u url file key mtmp = case mtmp of
Nothing -> go Nothing -> go
Just tmp -> do Just tmp -> do
-- Move to final location for large file check. -- Move to final location for large file check.
@ -418,7 +379,7 @@ nodownload :: URLString -> Url.UrlInfo -> FilePath -> Annex (Maybe Key)
nodownload url urlinfo file nodownload url urlinfo file
| Url.urlExists urlinfo = do | Url.urlExists urlinfo = do
let key = Backend.URL.fromUrl url (Url.urlSize urlinfo) let key = Backend.URL.fromUrl url (Url.urlSize urlinfo)
cleanup webUUID url file key Nothing addWorkTree webUUID url file key Nothing
return (Just key) return (Just key)
| otherwise = do | otherwise = do
warning $ "unable to access url: " ++ url warning $ "unable to access url: " ++ url

View file

@ -36,12 +36,12 @@ import Command.AddUrl (addUrlFile, downloadRemoteFile, parseRelaxedOption, parse
import Annex.Perms import Annex.Perms
import Annex.UUID import Annex.UUID
import Backend.URL (fromUrl) import Backend.URL (fromUrl)
import Annex.Quvi import Annex.Content
import qualified Utility.Quvi as Quvi import Annex.YoutubeDl
import Command.AddUrl (addUrlFileQuvi)
import Types.MetaData import Types.MetaData
import Logs.MetaData import Logs.MetaData
import Annex.MetaData import Annex.MetaData
import Command.AddUrl (addWorkTree)
cmd :: Command cmd :: Command
cmd = notBareRepo $ cmd = notBareRepo $
@ -101,7 +101,7 @@ data ToDownload = ToDownload
, location :: DownloadLocation , location :: DownloadLocation
} }
data DownloadLocation = Enclosure URLString | QuviLink URLString data DownloadLocation = Enclosure URLString | MediaLink URLString
type ItemId = String type ItemId = String
@ -141,14 +141,10 @@ findDownloads u = go =<< downloadFeed u
Just (enclosureurl, _, _) -> return $ Just (enclosureurl, _, _) -> return $
Just $ ToDownload f u i $ Enclosure $ Just $ ToDownload f u i $ Enclosure $
fromFeed enclosureurl fromFeed enclosureurl
Nothing -> mkquvi f i Nothing -> case getItemLink i of
mkquvi f i = case getItemLink i of Just link -> return $ Just $ ToDownload f u i $
Just link -> ifM (quviSupported $ fromFeed link) MediaLink $ fromFeed link
( return $ Just $ ToDownload f u i $ QuviLink $ Nothing -> return Nothing
fromFeed link
, return Nothing
)
Nothing -> return Nothing
{- Feeds change, so a feed download cannot be resumed. -} {- Feeds change, so a feed download cannot be resumed. -}
downloadFeed :: URLString -> Annex (Maybe Feed) downloadFeed :: URLString -> Annex (Maybe Feed)
@ -192,19 +188,18 @@ performDownload opts cache todownload = case location todownload of
then catMaybes kl then catMaybes kl
else [] else []
QuviLink pageurl -> do MediaLink linkurl -> do
let quviurl = setDownloader pageurl QuviDownloader let mediaurl = setDownloader linkurl YoutubeDownloader
checkknown quviurl $ do let mediakey = Backend.URL.fromUrl mediaurl Nothing
mp <- withQuviOptions Quvi.query [Quvi.quiet, Quvi.httponly] pageurl -- Old versions of git-annex that used quvi might have
case mp of -- used the quviurl for this, so check i/f it's known
Nothing -> return False -- to avoid adding it a second time.
Just page -> case headMaybe $ Quvi.pageLinks page of let quviurl = setDownloader linkurl QuviDownloader
Nothing -> return False checkknown mediaurl $ checkknown quviurl $
Just link -> do ifM (Annex.getState Annex.fast <||> pure (relaxedOption opts))
let videourl = Quvi.linkUrl link ( addmediafast linkurl mediaurl mediakey
checkknown videourl $ , downloadmedia linkurl mediaurl mediakey
rundownload videourl ("." ++ fromMaybe "m" (Quvi.linkSuffix link)) $ \f -> )
maybeToList <$> addUrlFileQuvi (relaxedOption opts) quviurl videourl f
where where
forced = Annex.getState Annex.force forced = Annex.getState Annex.force
@ -265,6 +260,44 @@ performDownload opts cache todownload = case location todownload of
( return Nothing ( return Nothing
, tryanother , tryanother
) )
downloadmedia linkurl mediaurl mediakey = do
r <- withTmpWorkDir mediakey $ \workdir -> do
dl <- youtubeDl linkurl workdir
case dl of
Right (Just mediafile) -> do
let ext = case takeExtension mediafile of
[] -> ".m"
s -> s
ok <- rundownload linkurl ext $ \f -> do
addWorkTree webUUID mediaurl f mediakey (Just mediafile)
return [mediakey]
return (Right ok)
-- youtude-dl didn't support it, so
-- download it as if the link were
-- an enclosure.
Right Nothing -> Right <$>
performDownload opts cache todownload
{ location = Enclosure linkurl }
Left msg -> return (Left msg)
case r of
Left msg -> do
warning msg
return False
Right b -> return b
addmediafast linkurl mediaurl mediakey =
youtubeDlSupported linkurl >>= \case
Right True ->
rundownload linkurl ".m" $ \f -> do
addWorkTree webUUID mediaurl f mediakey Nothing
return [mediakey]
Right False ->
performDownload opts cache todownload
{ location = Enclosure linkurl }
Left msg -> do
warning msg
return False
defaultTemplate :: String defaultTemplate :: String
defaultTemplate = "${feedtitle}/${itemtitle}${extension}" defaultTemplate = "${feedtitle}/${itemtitle}${extension}"

View file

@ -1,162 +0,0 @@
{- querying quvi (import qualified)
-
- Copyright 2013 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE FlexibleContexts #-}
module Utility.Quvi where
import Common
import Utility.Url
import Data.Aeson
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.Map as M
import Network.URI (uriAuthority, uriRegName)
import Data.Char
data QuviVersion
= Quvi04
| Quvi09
| NoQuvi
deriving (Show)
data Page = Page
{ pageTitle :: String
, pageLinks :: [Link]
} deriving (Show)
data Link = Link
{ linkSuffix :: Maybe String
, linkUrl :: URLString
} deriving (Show)
{- JSON instances for quvi 0.4. -}
instance FromJSON Page where
parseJSON (Object v) = Page
<$> v .: "page_title"
<*> v .: "link"
parseJSON _ = mzero
instance FromJSON Link where
parseJSON (Object v) = Link
<$> v .:? "file_suffix"
<*> v .: "url"
parseJSON _ = mzero
{- "enum" format used by quvi 0.9 -}
parseEnum :: String -> Maybe Page
parseEnum s = Page
<$> get "QUVI_MEDIA_PROPERTY_TITLE"
<*> ((:[]) <$>
( Link
<$> Just <$> (get "QUVI_MEDIA_STREAM_PROPERTY_CONTAINER")
<*> get "QUVI_MEDIA_STREAM_PROPERTY_URL"
)
)
where
get = flip M.lookup m
m = M.fromList $ map (separate (== '=')) $ lines s
probeVersion :: IO QuviVersion
probeVersion = catchDefaultIO NoQuvi $
examine <$> processTranscript "quvi" ["--version"] Nothing
where
examine (s, True)
| "quvi v0.4" `isInfixOf` s = Quvi04
| otherwise = Quvi09
examine _ = NoQuvi
type Query a = QuviVersion -> [CommandParam] -> URLString -> IO a
{- Throws an error when quvi is not installed. -}
forceQuery :: Query (Maybe Page)
forceQuery v ps url = query' v ps url `catchNonAsync` onerr
where
onerr e = ifM (inPath "quvi")
( giveup ("quvi failed: " ++ show e)
, giveup "quvi is not installed"
)
{- Returns Nothing if the page is not a video page, or quvi is not
- installed. -}
query :: Query (Maybe Page)
query v ps url = flip catchNonAsync (const $ return Nothing) (query' v ps url)
query' :: Query (Maybe Page)
query' Quvi09 ps url = parseEnum
<$> readQuvi (toCommand $ [Param "dump", Param "-p", Param "enum"] ++ ps ++ [Param url])
query' Quvi04 ps url = do
let p = proc "quvi" (toCommand $ ps ++ [Param url])
decode . BL.fromStrict
<$> withHandle StdoutHandle createProcessSuccess p B.hGetContents
query' NoQuvi _ _ = return Nothing
queryLinks :: Query [URLString]
queryLinks v ps url = maybe [] (map linkUrl . pageLinks) <$> query v ps url
{- Checks if quvi can still find a download link for an url.
- If quvi is not installed, returns False. -}
check :: Query Bool
check v ps url = maybe False (not . null . pageLinks) <$> query v ps url
{- Checks if an url is supported by quvi, as quickly as possible
- (without hitting it if possible), and without outputting
- anything. Also returns False if quvi is not installed. -}
supported :: QuviVersion -> URLString -> IO Bool
supported NoQuvi _ = return False
supported Quvi04 url = boolSystem "quvi"
[ Param "--verbosity", Param "mute"
, Param "--support"
, Param url
]
{- Use quvi-info to see if the url's domain is supported.
- If so, have to do a online verification of the url. -}
supported Quvi09 url = (firstlevel <&&> secondlevel)
`catchNonAsync` (\_ -> return False)
where
firstlevel = case uriAuthority =<< parseURIRelaxed url of
Nothing -> return False
Just auth -> do
let domain = map toLower $ uriRegName auth
let basedomain = intercalate "." $ reverse $ take 2 $ reverse $ splitc '.' domain
any (\h -> domain `isSuffixOf` h || basedomain `isSuffixOf` h)
. map (map toLower) <$> listdomains Quvi09
secondlevel = snd <$> processTranscript "quvi"
(toCommand [Param "dump", Param "-o", Param url]) Nothing
listdomains :: QuviVersion -> IO [String]
listdomains Quvi09 = concatMap (splitc ',')
. concatMap (drop 1 . words)
. filter ("domains: " `isPrefixOf`) . lines
<$> readQuvi (toCommand [Param "info", Param "-p", Param "domains"])
listdomains _ = return []
type QuviParams = QuviVersion -> [CommandParam]
{- Disables progress, but not information output. -}
quiet :: QuviParams
-- Cannot use quiet as it now disables informational output.
-- No way to disable progress.
quiet Quvi09 = [Param "--verbosity", Param "verbose"]
quiet Quvi04 = [Param "--verbosity", Param "quiet"]
quiet NoQuvi = []
{- Only return http results, not streaming protocols. -}
httponly :: QuviParams
-- No way to do it with 0.9?
httponly Quvi04 = [Param "-c", Param "http"]
httponly _ = [] -- No way to do it with 0.9?
readQuvi :: [String] -> IO String
readQuvi ps = withHandle StdoutHandle createProcessSuccess p $ \h -> do
r <- hGetContentsStrict h
hClose h
return r
where
p = proc "quvi" ps

View file

@ -8,7 +8,7 @@ git annex importfeed `[url ...]`
# DESCRIPTION # DESCRIPTION
Imports the contents of podcast feeds. Only downloads files whose Imports the contents of podcasts and other feeds. Only downloads files whose
content has not already been added to the repository before, so you can content has not already been added to the repository before, so you can
delete, rename, etc the resulting files and repeated runs won't duplicate delete, rename, etc the resulting files and repeated runs won't duplicate
them. them.
@ -37,6 +37,23 @@ To make the import process add metadata to the imported files from the feed,
These options behave the same as when using [[git-annex-addurl]](1). These options behave the same as when using [[git-annex-addurl]](1).
* `--fast`
Avoid immediately downloading urls. The url is still checked
(via HEAD) to verify that it exists, and to get its size if possible.
* `--relaxed`
Don't immediately download urls, and avoid storing the size of the
url's content. This makes git-annex accept whatever content is there
at a future point.
* `--raw`
Prevent special handling of urls by youtube-dl, bittorrent, and other
special remotes. This will for example, make importfeed
download a .torrent file and not the contents it points to.
# SEE ALSO # SEE ALSO
[[git-annex]](1) [[git-annex]](1)

View file

@ -531,7 +531,6 @@ Executable git-annex
Annex.Path Annex.Path
Annex.Perms Annex.Perms
Annex.Queue Annex.Queue
Annex.Quvi
Annex.ReplaceFile Annex.ReplaceFile
Annex.SpecialRemote Annex.SpecialRemote
Annex.Ssh Annex.Ssh
@ -1035,7 +1034,6 @@ Executable git-annex
Utility.Process Utility.Process
Utility.Process.Shim Utility.Process.Shim
Utility.QuickCheck Utility.QuickCheck
Utility.Quvi
Utility.Rsync Utility.Rsync
Utility.SRV Utility.SRV
Utility.SafeCommand Utility.SafeCommand