Youtube support! (And 53 other video hosts)

When quvi is installed, git-annex addurl automatically uses it to detect
when an page is a video, and downloads the video file.

web special remote: Also support using quvi, for getting files,
or checking if files exist in the web.

This commit was sponsored by Mark Hepburn. Thanks!
This commit is contained in:
Joey Hess 2013-08-22 18:25:21 -04:00
parent 96fc3a63ac
commit 46b6d75274
16 changed files with 278 additions and 33 deletions

20
Annex/Quvi.hs Normal file
View file

@ -0,0 +1,20 @@
{- quvi options for git-annex
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE Rank2Types #-}
module Annex.Quvi where
import Common.Annex
import qualified Annex
import Utility.Quvi
import Utility.Url
withQuviOptions :: forall a. (Query a) -> [CommandParam] -> URLString -> Annex a
withQuviOptions a ps url = do
opts <- map Param . annexQuviOptions <$> Annex.getGitConfig
liftIO $ a (ps++opts) url

View file

@ -32,6 +32,7 @@ tests =
, TestCase "curl" $ testCmd "curl" "curl --version >/dev/null"
, TestCase "wget" $ testCmd "wget" "wget --version >/dev/null"
, TestCase "bup" $ testCmd "bup" "bup --version >/dev/null"
, TestCase "quvi" $ testCmd "quvi" "quvi --version >/dev/null"
, TestCase "ionice" $ testCmd "ionice" "ionice -c3 true >/dev/null"
, TestCase "gpg" $ maybeSelectCmd "gpg"
[ ("gpg", "--version >/dev/null")

View file

@ -27,6 +27,8 @@ import Annex.Content.Direct
import Logs.Location
import qualified Logs.Transfer as Transfer
import Utility.Daemon (checkDaemon)
import Annex.Quvi
import qualified Utility.Quvi as Quvi
def :: [Command]
def = [notBareRepo $ withOptions [fileOption, pathdepthOption, relaxedOption] $
@ -51,15 +53,51 @@ seek = [withField fileOption return $ \f ->
start :: Bool -> Maybe FilePath -> Maybe Int -> String -> CommandStart
start relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s
where
bad = fromMaybe (error $ "bad url " ++ s) $
parseURI $ escapeURIString isUnescapedInURI s
go url = do
pathmax <- liftIO $ fileNameLengthLimit "."
let file = fromMaybe (url2file url pathdepth pathmax) optfile
(s', downloader) = getDownloader s
bad = fromMaybe (error $ "bad url " ++ s') $
parseURI $ escapeURIString isUnescapedInURI s'
badquvi = error $ "quvi does not know how to download url " ++ s'
choosefile = flip fromMaybe optfile
go url
| downloader == QuviDownloader = usequvi
| otherwise = ifM (liftIO $ Quvi.supported s')
( usequvi
, do
pathmax <- liftIO $ fileNameLengthLimit "."
let file = choosefile $ url2file url pathdepth pathmax
showStart "addurl" file
next $ perform relaxed s' file
)
usequvi = do
page <- fromMaybe badquvi
<$> withQuviOptions Quvi.forceQuery [Quvi.quiet, Quvi.httponly] s'
let link = fromMaybe badquvi $ headMaybe $ Quvi.pageLinks page
let file = choosefile $ sanitizeFilePath $
Quvi.pageTitle page ++ "." ++ Quvi.linkSuffix link
showStart "addurl" file
next $ perform relaxed s file
next $ performQuvi relaxed s' (Quvi.linkUrl link) file
perform :: Bool -> String -> FilePath -> CommandPerform
performQuvi :: Bool -> URLString -> URLString -> FilePath -> CommandPerform
performQuvi relaxed pageurl videourl file = ifAnnexed file addurl geturl
where
quviurl = setDownloader pageurl QuviDownloader
addurl (key, _backend) = next $ cleanup quviurl file key Nothing
geturl = do
key <- Backend.URL.fromUrl quviurl Nothing
ifM (pure relaxed <||> Annex.getState Annex.fast)
( next $ cleanup quviurl file key Nothing
, do
tmp <- fromRepo $ gitAnnexTmpLocation key
showOutput
ok <- Transfer.download webUUID key (Just file) Transfer.forwardRetry $ const $ do
liftIO $ createDirectoryIfMissing True (parentDir tmp)
downloadUrl [videourl] tmp
if ok
then next $ cleanup quviurl file key (Just tmp)
else stop
)
perform :: Bool -> URLString -> FilePath -> CommandPerform
perform relaxed url file = ifAnnexed file addurl geturl
where
geturl = next $ addUrlFile relaxed url file
@ -78,7 +116,7 @@ perform relaxed url file = ifAnnexed file addurl geturl
stop
)
addUrlFile :: Bool -> String -> FilePath -> Annex Bool
addUrlFile :: Bool -> URLString -> FilePath -> Annex Bool
addUrlFile relaxed url file = do
liftIO $ createDirectoryIfMissing True (parentDir file)
ifM (Annex.getState Annex.fast <||> pure relaxed)
@ -88,7 +126,7 @@ addUrlFile relaxed url file = do
download url file
)
download :: String -> FilePath -> Annex Bool
download :: URLString -> FilePath -> Annex Bool
download url file = do
dummykey <- genkey
tmp <- fromRepo $ gitAnnexTmpLocation dummykey
@ -130,7 +168,7 @@ download url file = do
downloadUrl [url] tmp
cleanup :: String -> FilePath -> Key -> Maybe FilePath -> Annex Bool
cleanup :: URLString -> FilePath -> Key -> Maybe FilePath -> Annex Bool
cleanup url file key mtmp = do
when (isJust mtmp) $
logStatus key InfoPresent
@ -144,7 +182,7 @@ cleanup url file key mtmp = do
maybe noop (moveAnnex key) mtmp
return True
nodownload :: Bool -> String -> FilePath -> Annex Bool
nodownload :: Bool -> URLString -> FilePath -> Annex Bool
nodownload relaxed url file = do
headers <- getHttpHeaders
(exists, size) <- if relaxed

View file

@ -12,7 +12,6 @@ import Text.Feed.Query
import Text.Feed.Types
import qualified Data.Set as S
import qualified Data.Map as M
import Data.Char
import Data.Time.Clock
import Common.Annex
@ -172,20 +171,15 @@ feedFile tmpl i = Utility.Format.format tmpl $ M.fromList
, fieldMaybe "itemdescription" $ getItemDescription $ item i
, fieldMaybe "itemrights" $ getItemRights $ item i
, fieldMaybe "itemid" $ snd <$> getItemId (item i)
, ("extension", map sanitize $ takeExtension $ location i)
, ("extension", sanitizeFilePath $ takeExtension $ location i)
]
where
field k v =
let s = map sanitize v in
let s = sanitizeFilePath v in
if null s then (k, "none") else (k, s)
fieldMaybe k Nothing = (k, "none")
fieldMaybe k (Just v) = field k v
sanitize c
| c == '.' = c
| isSpace c || isPunctuation c || c == '/' = '_'
| otherwise = c
{- Called when there is a problem with a feed.
- Throws an error if the feed is broken, otherwise shows a warning. -}
feedProblem :: URLString -> String -> Annex ()

View file

@ -13,7 +13,10 @@ module Logs.Web (
setUrlMissing,
urlLog,
urlLogKey,
knownUrls
knownUrls,
Downloader(..),
getDownloader,
setDownloader,
) where
import qualified Data.ByteString.Lazy.Char8 as L
@ -101,3 +104,20 @@ knownUrls = do
where
geturls Nothing = return []
geturls (Just logsha) = getLog . L.unpack <$> catObject logsha
data Downloader = DefaultDownloader | QuviDownloader
deriving (Eq)
{- Determines the downloader for an URL.
-
- Some URLs are not downloaded by normal means, and this is indicated
- by prefixing them with downloader: when they are recorded in the url
- logs. -}
getDownloader :: URLString -> (URLString, Downloader)
getDownloader u = case separate (== ':') u of
("quvi", u') -> (u', QuviDownloader)
_ -> (u, DefaultDownloader)
setDownloader :: URLString -> Downloader -> URLString
setDownloader u DefaultDownloader = u
setDownloader u QuviDownloader = "quvi:" ++ u

View file

@ -15,9 +15,11 @@ import Annex.Content
import Config
import Config.Cost
import Logs.Web
import qualified Utility.Url as Url
import Types.Key
import Utility.Metered
import qualified Utility.Url as Url
import Annex.Quvi
import qualified Utility.Quvi as Quvi
import qualified Data.Map as M
@ -67,7 +69,12 @@ downloadKey key _file dest _p = get =<< getUrls key
return False
get urls = do
showOutput -- make way for download progress bar
downloadUrl urls dest
untilTrue urls $ \u -> do
let (u', downloader) = getDownloader u
case downloader of
QuviDownloader -> flip downloadUrl dest
=<< withQuviOptions Quvi.queryLinks [Quvi.httponly, Quvi.quiet] u'
_ -> downloadUrl [u] dest
downloadKeyCheap :: Key -> FilePath -> Annex Bool
downloadKeyCheap _ _ = return False
@ -90,6 +97,11 @@ checkKey key = do
else return . Right =<< checkKey' key us
checkKey' :: Key -> [URLString] -> Annex Bool
checkKey' key us = untilTrue us $ \u -> do
showAction $ "checking " ++ u
headers <- getHttpHeaders
liftIO $ Url.check u headers (keySize key)
let (u', downloader) = getDownloader u
showAction $ "checking " ++ u'
case downloader of
QuviDownloader ->
withQuviOptions Quvi.check [Quvi.httponly, Quvi.quiet] u'
_ -> do
headers <- getHttpHeaders
liftIO $ Url.check u' headers (keySize key)

View file

@ -37,6 +37,7 @@ data GitConfig = GitConfig
, annexAutoCommit :: Bool
, annexDebug :: Bool
, annexWebOptions :: [String]
, annexQuviOptions :: [String]
, annexWebDownloadCommand :: Maybe String
, annexCrippledFileSystem :: Bool
, annexLargeFiles :: Maybe String
@ -62,6 +63,7 @@ extractGitConfig r = GitConfig
, annexAutoCommit = getbool (annex "autocommit") True
, annexDebug = getbool (annex "debug") False
, annexWebOptions = getwords (annex "web-options")
, annexQuviOptions = getwords (annex "quvi-options")
, annexWebDownloadCommand = getmaybe (annex "web-download-command")
, annexCrippledFileSystem = getbool (annex "crippledfilesystem") False
, annexLargeFiles = getmaybe (annex "largefiles")

View file

@ -14,6 +14,7 @@ import System.FilePath
import System.Directory
import Data.List
import Data.Maybe
import Data.Char
import Control.Applicative
#ifdef mingw32_HOST_OS
@ -236,3 +237,18 @@ fileNameLengthLimit dir = do
else return $ minimum [l, 255]
where
#endif
{- Given a string that we'd like to use as the basis for FilePath, but that
- was provided by a third party and is not to be trusted, returns the closest
- sane FilePath.
-
- All spaces and punctuation are replaced with '_', except for '.'
- "../" will thus turn into ".._", which is safe.
-}
sanitizeFilePath :: String -> FilePath
sanitizeFilePath = map sanitize
where
sanitize c
| c == '.' = c
| isSpace c || isPunctuation c || c == '/' = '_'
| otherwise = c

83
Utility/Quvi.hs Normal file
View file

@ -0,0 +1,83 @@
{- querying quvi (import qualified)
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Utility.Quvi where
import Common
import Utility.Url
import Data.Aeson
import Data.ByteString.Lazy.UTF8 (fromString)
data Page = Page
{ pageTitle :: String
, pageLinks :: [Link]
} deriving (Show)
data Link = Link
{ linkSuffix :: String
, linkUrl :: URLString
} deriving (Show)
instance FromJSON Page where
parseJSON (Object v) = Page
<$> v .: "page_title"
<*> v .: "link"
parseJSON _ = mzero
instance FromJSON Link where
parseJSON (Object v) = Link
<$> v .: "file_suffix"
<*> v .: "url"
parseJSON _ = mzero
type Query a = [CommandParam] -> URLString -> IO a
{- Throws an error when quvi is not installed. -}
forceQuery :: Query (Maybe Page)
forceQuery ps url = flip catchNonAsync (const notinstalled) (query' ps url)
where
notinstalled = error "quvi failed, or is not installed"
{- Returns Nothing if the page is not a video page, or quvi is not
- installed. -}
query :: Query (Maybe Page)
query ps url = flip catchNonAsync (const $ return Nothing) (query' ps url)
query' :: Query (Maybe Page)
query' ps url = decode . fromString
<$> readProcess "quvi" (toCommand $ ps ++ [Param url])
queryLinks :: Query [URLString]
queryLinks ps url = maybe [] (map linkUrl . pageLinks) <$> query ps url
{- Checks if quvi can still find a download link for an url.
- If quvi is not installed, returns False. -}
check :: Query Bool
check ps url = maybe False (not . null . pageLinks) <$> query ps url
{- Checks if an url is supported by quvi, without hitting it, or outputting
- anything. Also returns False if quvi is not installed. -}
supported :: URLString -> IO Bool
supported url = boolSystem "quvi" [Params "-v mute --support", Param url]
quiet :: CommandParam
quiet = Params "-v quiet"
noredir :: CommandParam
noredir = Params "-e -resolve"
{- Only return http results, not streaming protocols. -}
httponly :: CommandParam
httponly = Params "-c http"
{- Avoids error messages being printed to stderr, instead they are
- put in the JSON. -}
hideerrors :: CommandParam
hideerrors = Params "-l +errors"

5
debian/changelog vendored
View file

@ -1,5 +1,10 @@
git-annex (4.20130816) UNRELEASED; urgency=low
* Youtube support! (And 53 other video hosts). When quvi is installed,
git-annex addurl automatically uses it to detect when an page is
a video, and downloads the video file.
* web special remote: Also support using quvi, for getting files,
or checking if files exist in the web.
* Debian: Run the builtin test suite as an autopkgtest.
* Debian: Recommend ssh-askpass, which ssh will use when the assistant
is run w/o a tty. Closes: #719832

3
debian/control vendored
View file

@ -21,6 +21,7 @@ Build-Depends:
libghc-dlist-dev,
libghc-uuid-dev,
libghc-json-dev,
libghc-aeson-dev,
libghc-ifelse-dev,
libghc-bloomfilter-dev,
libghc-edit-distance-dev,
@ -71,7 +72,7 @@ Depends: ${misc:Depends}, ${shlibs:Depends},
wget,
curl,
openssh-client (>= 1:5.6p1)
Recommends: lsof, gnupg, bind9-host, ssh-askpass
Recommends: lsof, gnupg, bind9-host, ssh-askpass, quvi
Suggests: graphviz, bup, libnss-mdns
Description: manage files with git, without checking their contents into git
git-annex allows managing files with git, without checking the file

View file

@ -195,6 +195,9 @@ subdirectories).
alternate locations from which the file can be downloaded. In this mode,
addurl can be used both to add new files, or to add urls to existing files.
When quvi is installed, urls are automatically tested to see if they
are on a video hosting site, and the video is downloaded instead.
* rmurl file url
Record that the file is no longer available at the url.
@ -1112,6 +1115,11 @@ Here are all the supported configuration settings.
(wget is always used in preference to curl if available.)
For example, to force ipv4 only, set it to "-4"
* `annex.quvi-options`
Options to pass to quvi when using it to find the url to download for a
video.
* `annex.http-headers`
HTTP headers to send when downloading from the web. Multiple lines of

View file

@ -11,6 +11,7 @@ quite a lot.
* [monad-control](http://hackage.haskell.org/package/monad-control)
* [QuickCheck 2](http://hackage.haskell.org/package/QuickCheck)
* [json](http://hackage.haskell.org/package/json)
* [aeson](http://hackage.haskell.org/package/aeson)
* [IfElse](http://hackage.haskell.org/package/IfElse)
* [dlist](http://hackage.haskell.org/package/dlist)
* [bloomfilter](http://hackage.haskell.org/package/bloomfilter)

View file

@ -8,10 +8,16 @@ The web can be used as a [[special_remote|special_remotes]] too.
Now the file is downloaded, and has been added to the annex like any other
file. So it can be renamed, copied to other repositories, and so on.
To add a lot of urls at once, just list them all as parameters to
`git annex addurl`.
## trust issues
Note that git-annex assumes that, if the web site does not 404, and has the
right file size, the file is still present on the web, and this counts as
one [[copy|copies]] of the file. So it will let you remove your last copy,
trusting it can be downloaded again:
one [[copy|copies]] of the file. If the file still seems to be present
on the web, it will let you remove your last copy, trusting it can be
downloaded again:
# git annex drop example.com_video.mpeg
drop example.com_video.mpeg (checking http://example.com/video.mpeg) ok
@ -31,7 +37,9 @@ With the result that it will hang onto files:
(Use --force to override this check, or adjust annex.numcopies.)
failed
You can also add urls to any file already in the annex:
## attaching urls to existing files
You can also attach urls to any file already in the annex:
# git annex addurl --file my_cool_big_file http://example.com/cool_big_file
addurl my_cool_big_file ok
@ -40,8 +48,10 @@ You can also add urls to any file already in the annex:
00000000-0000-0000-0000-000000000001 -- web
27a9510c-760a-11e1-b9a0-c731d2b77df9 -- here
To add a lot of urls at once, just list them all as parameters to
`git annex addurl`.
## configuring filenames
By default, `addurl` will generate a filename for you. You can use
`--file=` to specify the filename to use.
If you're adding a bunch of related files to a directory, or just don't
like the default filenames generated by `addurl`, you can use `--pathdepth`
@ -55,3 +65,35 @@ number takes that many paths from the end.
addurl 2012_01_video.mpeg (downloading http://example.com/videos/2012/01/video.mpeg)
# git annex addurl http://example.com/videos/2012/01/video.mpeg --pathdepth=-2
addurl 01_video.mpeg (downloading http://example.com/videos/2012/01/video.mpeg)
## videos
There's support for downloading videos from sites like YouTube, Vimeo,
and many more. This relies on [quvi](http://quvi.sourceforge.net/) to find
urls to the actual videos files.
When you have quvi installed, you can just
`git annex addurl http://youtube.com/foo` and it will detect that
it is a video and download the video content for offline viewing.
Later, in another clone of the repository, you can run `git annex get` on
the file and it will also be downloaded with the help of quvi. This works
even if the video host has transcoded or otherwise changed the video
in the meantime; the assumption is that these video files are equivilant.
There is an `annex.quvi-options` configuration setting that can be used
to pass parameters to quvi. For example, you could set `git config
annex.quvi-options "--format low"` to configure it to download low
quality videos from YouTube.
Note that for performance reasons, the url is not checked for redirects,
so shortened urls to sites like youtu.be will not be detected. You can
either load the short url in a browser to get the full url, or you
can force use of quvi with redirect detection, by prepending "quvi:" to the
url. For example, `git annex addurl quvi:http://youtu.be/foo`
Downloading whole YouTube playlists is not currently supported by quvi.
## podcasts
This is done using `git annex importfeed`. See [[downloading podcasts]].

View file

@ -18,3 +18,5 @@ The [[Web special remote|special remotes/web]] could possibly be improved by det
> > URL may yield different file contents depending on the quality
> > chosen. Also, it seems that the URLs guessed by quvi may be
> > ephemeral. --[[anarcat]]
> [[done]]!!! --[[Joey]]

View file

@ -76,7 +76,7 @@ Executable git-annex
extensible-exceptions, dataenc, SHA, process, json,
base (>= 4.5 && < 4.8), monad-control, MonadCatchIO-transformers,
IfElse, text, QuickCheck >= 2.1, bloomfilter, edit-distance, process,
SafeSemaphore, uuid, random, dlist, unix-compat
SafeSemaphore, uuid, random, dlist, unix-compat, aeson
-- Need to list these because they're generated from .hsc files.
Other-Modules: Utility.Touch Utility.Mounts
Include-Dirs: Utility
@ -141,7 +141,7 @@ Executable git-annex
Build-Depends:
yesod, yesod-default, yesod-static, yesod-form, yesod-core,
case-insensitive, http-types, transformers, wai, wai-logger, warp,
blaze-builder, crypto-api, hamlet, clientsession, aeson,
blaze-builder, crypto-api, hamlet, clientsession,
template-haskell, data-default
CPP-Options: -DWITH_WEBAPP