importfeed: git-annex becomes a podcatcher in 150 LOC

Joey Hess 2013-07-28 15:27:36 -04:00
parent 55bd5a81ad
commit 7e66d260ea
15 changed files with 319 additions and 32 deletions

View file

@@ -21,6 +21,7 @@ module Annex.Branch (
change,
commit,
files,
withIndex,
) where
import qualified Data.ByteString.Lazy.Char8 as L

View file

@@ -35,7 +35,7 @@ stageDirect :: Annex Bool
stageDirect = do
Annex.Queue.flush
top <- fromRepo Git.repoPath
(l, cleanup) <- inRepo $ Git.LsFiles.stagedDetails [top]
(l, cleanup) <- inRepo $ Git.LsFiles.stagedOthersDetails [top]
forM_ l go
void $ liftIO cleanup
staged <- Annex.Queue.size

View file

@@ -1,6 +1,6 @@
{- git-annex command
-
- Copyright 2011 Joey Hess <joey@kitenet.net>
- Copyright 2011-2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@@ -61,10 +61,7 @@ start relaxed optfile pathdepth s = go $ fromMaybe bad $ parseURI s
perform :: Bool -> String -> FilePath -> CommandPerform
perform relaxed url file = ifAnnexed file addurl geturl
where
geturl = do
liftIO $ createDirectoryIfMissing True (parentDir file)
ifM (Annex.getState Annex.fast <||> pure relaxed)
( nodownload relaxed url file , download url file )
geturl = next $ addUrlFile relaxed url file
addurl (key, _backend)
| relaxed = do
setUrlPresent key url
@@ -80,12 +77,23 @@ perform relaxed url file = ifAnnexed file addurl geturl
stop
)
download :: String -> FilePath -> CommandPerform
download url file = do
addUrlFile :: Bool -> String -> FilePath -> Annex Bool
addUrlFile relaxed url file = do
liftIO $ createDirectoryIfMissing True (parentDir file)
ifM (Annex.getState Annex.fast <||> pure relaxed)
( nodownload relaxed url file
, do
showAction $ "downloading " ++ url ++ " "
download url file
)
download :: String -> FilePath -> Annex Bool
download url file = do
dummykey <- genkey
tmp <- fromRepo $ gitAnnexTmpLocation dummykey
stopUnless (runtransfer dummykey tmp) $ do
showOutput
ifM (runtransfer dummykey tmp)
( do
backend <- chooseBackend file
let source = KeySource
{ keyFilename = file
@@ -94,8 +102,10 @@ download url file = do
}
k <- genKey source backend
case k of
Nothing -> stop
Just (key, _) -> next $ cleanup url file key (Just tmp)
Nothing -> return False
Just (key, _) -> cleanup url file key (Just tmp)
, return False
)
where
{- Generate a dummy key to use for this download, before we can
- examine the file and find its real key. This allows resuming
@@ -119,7 +129,7 @@ download url file = do
downloadUrl [url] tmp
cleanup :: String -> FilePath -> Key -> Maybe FilePath -> CommandCleanup
cleanup :: String -> FilePath -> Key -> Maybe FilePath -> Annex Bool
cleanup url file key mtmp = do
when (isJust mtmp) $
logStatus key InfoPresent
@@ -133,7 +143,7 @@ cleanup url file key mtmp = do
maybe noop (moveAnnex key) mtmp
return True
nodownload :: Bool -> String -> FilePath -> CommandPerform
nodownload :: Bool -> String -> FilePath -> Annex Bool
nodownload relaxed url file = do
headers <- getHttpHeaders
(exists, size) <- if relaxed
@@ -142,10 +152,10 @@ nodownload relaxed url file = do
if exists
then do
let key = Backend.URL.fromUrl url size
next $ cleanup url file key Nothing
cleanup url file key Nothing
else do
warning $ "unable to access url: " ++ url
stop
return False
url2file :: URI -> Maybe Int -> FilePath
url2file url pathdepth = case pathdepth of

148 Command/ImportFeed.hs Normal file
View file

@@ -0,0 +1,148 @@
{- git-annex command
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Command.ImportFeed where
import Text.Feed.Import
import Text.Feed.Query
import Text.Feed.Types
import qualified Data.Set as S
import qualified Data.Map as M
import Data.Char
import Common.Annex
import Command
import qualified Utility.Url as Url
import Logs.Web
import qualified Option
import qualified Utility.Format
import Utility.Tmp
import Command.AddUrl (addUrlFile, relaxedOption)
data ToDownload = ToDownload
{ feed :: Feed
, item :: Item
, location :: URLString
}
mkToDownload :: Feed -> Item -> Maybe ToDownload
mkToDownload f i = case getItemEnclosure i of
Nothing -> Nothing
Just (enclosureurl, _, _) -> Just $ ToDownload f i enclosureurl
def :: [Command]
def = [notBareRepo $ withOptions [templateOption, relaxedOption] $
command "importfeed" (paramRepeating paramUrl) seek
SectionCommon "import files from podcast feeds"]
templateOption :: Option
templateOption = Option.field [] "template" paramFormat "template for filenames"
seek :: [CommandSeek]
seek = [withField templateOption return $ \tmpl ->
withFlag relaxedOption $ \relaxed ->
withWords $ start relaxed tmpl]
start :: Bool -> Maybe String -> [URLString] -> CommandStart
start relaxed opttemplate = go Nothing
where
go _ [] = stop
go cache (url:urls) = do
showStart "importfeed" url
v <- findEnclosures url
if isJust v then showEndOk else showEndFail
case v of
Just l | not (null l) -> do
knownurls <- getknownurls cache
mapM_ (downloadEnclosure relaxed template knownurls) l
go (Just knownurls) urls
_ -> go cache urls
defaulttemplate = "${feedtitle}/${itemtitle}.${extension}"
template = Utility.Format.gen $ fromMaybe defaulttemplate opttemplate
{- This is expensive, so avoid running it more than once. -}
getknownurls (Just cached) = return cached
getknownurls Nothing = S.fromList <$> knownUrls
findEnclosures :: URLString -> Annex (Maybe [ToDownload])
findEnclosures url = go =<< downloadFeed url
where
go Nothing = do
warning $ "failed to parse feed " ++ url
return Nothing
go (Just f) = return $ Just $
mapMaybe (mkToDownload f) (feedItems f)
{- Feeds change, so a feed download cannot be resumed. -}
downloadFeed :: URLString -> Annex (Maybe Feed)
downloadFeed url = do
showOutput
liftIO $ withTmpFile "feed" $ \f h -> do
ifM (Url.download url [] [] f)
( parseFeedString <$> hGetContentsStrict h
, return Nothing
)
{- Avoids downloading any urls that are already known to be associated
- with a file in the annex. -}
downloadEnclosure :: Bool -> Utility.Format.Format -> S.Set URLString -> ToDownload -> Annex ()
downloadEnclosure relaxed template knownurls enclosure
| S.member url knownurls = noop
| otherwise = do
dest <- liftIO $ feedFile template enclosure
showStart "addurl" dest
ifM (addUrlFile relaxed url dest)
( showEndOk
, showEndFail
)
where
url = location enclosure
{- Generate a unique filename for the feed item by filling
- out the template.
-
- Since each feed url is only downloaded once,
- if the file already exists, two items with different urls
- have the same title. A number is added to disambiguate.
-}
feedFile :: Utility.Format.Format -> ToDownload -> IO FilePath
feedFile template i = makeUnique 0 $
Utility.Format.format template $ M.fromList
[ field "feedtitle" $ getFeedTitle $ feed i
, fieldMaybe "itemtitle" $ getItemTitle $ item i
, fieldMaybe "feedauthor" $ getFeedAuthor $ feed i
, fieldMaybe "itemauthor" $ getItemAuthor $ item i
, fieldMaybe "itemsummary" $ getItemSummary $ item i
, fieldMaybe "itemdescription" $ getItemDescription $ item i
, fieldMaybe "itemrights" $ getItemRights $ item i
, fieldMaybe "itemid" $ snd <$> getItemId (item i)
, field "extension" $ takeExtension $ location i
]
where
field k v =
let s = map sanitize v in
if null s then (k, "none") else (k, s)
fieldMaybe k Nothing = (k, "none")
fieldMaybe k (Just v) = field k v
sanitize c
| isSpace c || isPunctuation c || c == '/' = '_'
| otherwise = c
makeUnique :: Integer -> FilePath -> IO FilePath
makeUnique n file =
ifM (isJust <$> catchMaybeIO (getSymbolicLinkStatus f))
( makeUnique (n + 1) file
, return file
)
where
f = if n == 0
then file
else
let (d, base) = splitFileName file
in d </> show n ++ "_" ++ base

View file

@@ -59,7 +59,7 @@ perform = do
setDirect False
top <- fromRepo Git.repoPath
(l, clean) <- inRepo $ Git.LsFiles.stagedDetails [top]
(l, clean) <- inRepo $ Git.LsFiles.stagedOthersDetails [top]
forM_ l go
void $ liftIO clean
next cleanup

View file

@@ -12,6 +12,7 @@ module Git.LsFiles (
modified,
staged,
stagedNotDeleted,
stagedOthersDetails,
stagedDetails,
typeChanged,
typeChangedStaged,
@@ -69,16 +70,24 @@ staged' ps l = pipeNullSplit $ prefix ++ ps ++ suffix
prefix = [Params "diff --cached --name-only -z"]
suffix = Param "--" : map File l
{- Returns details about files that are staged in the index
- (including the Sha of their staged contents),
- as well as files not yet in git. -}
{- Returns details about files that are staged in the index,
- as well as files not yet in git. Skips ignored files. -}
stagedOthersDetails :: [FilePath] -> Repo -> IO ([(FilePath, Maybe Sha)], IO Bool)
stagedOthersDetails = stagedDetails' [Params "--others --exclude-standard"]
{- Returns details about all files that are staged in the index. -}
stagedDetails :: [FilePath] -> Repo -> IO ([(FilePath, Maybe Sha)], IO Bool)
stagedDetails l repo = do
stagedDetails = stagedDetails' []
{- Gets details about staged files, including the Sha of their staged
- contents. -}
stagedDetails' :: [CommandParam] -> [FilePath] -> Repo -> IO ([(FilePath, Maybe Sha)], IO Bool)
stagedDetails' ps l repo = do
(ls, cleanup) <- pipeNullSplit params repo
return (map parse ls, cleanup)
where
params = [Params "ls-files --others --exclude-standard --stage -z --"] ++
map File l
params = Params "ls-files --stage -z" : ps ++
Param "--" : map File l
parse s
| null file = (s, Nothing)
| otherwise = (file, extractSha $ take shaSize $ drop 7 metadata)

View file

@@ -57,6 +57,9 @@ import qualified Command.Ungroup
import qualified Command.Vicfg
import qualified Command.Sync
import qualified Command.AddUrl
#ifdef WITH_FEED
import qualified Command.ImportFeed
#endif
import qualified Command.RmUrl
import qualified Command.Import
import qualified Command.Map
@@ -91,6 +94,9 @@ cmds = concat
, Command.Lock.def
, Command.Sync.def
, Command.AddUrl.def
#ifdef WITH_FEED
, Command.ImportFeed.def
#endif
, Command.RmUrl.def
, Command.Import.def
, Command.Init.def

View file

@@ -1,6 +1,6 @@
{- Web url logs.
-
- Copyright 2011 Joey Hess <joey@kitenet.net>
- Copyright 2011, 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@@ -11,12 +11,21 @@ module Logs.Web (
getUrls,
setUrlPresent,
setUrlMissing,
urlLog,
urlLogKey,
knownUrls
) where
import qualified Data.ByteString.Lazy.Char8 as L
import Common.Annex
import Logs.Presence
import Logs.Location
import Types.Key
import qualified Annex.Branch
import Annex.CatFile
import qualified Git
import qualified Git.LsFiles
type URLString = String
@@ -24,8 +33,24 @@ type URLString = String
webUUID :: UUID
webUUID = UUID "00000000-0000-0000-0000-000000000001"
urlLogExt :: String
urlLogExt = ".log.web"
urlLog :: Key -> FilePath
urlLog key = hashDirLower key </> keyFile key ++ ".log.web"
urlLog key = hashDirLower key </> keyFile key ++ urlLogExt
{- Converts a url log file into a key.
- (Does not work on oldurlLogs.) -}
urlLogKey :: FilePath -> Maybe Key
urlLogKey file
| ext == urlLogExt = fileKey base
| otherwise = Nothing
where
(base, ext) = splitAt (length file - extlen) file
extlen = length urlLogExt
isUrlLog :: FilePath -> Bool
isUrlLog file = urlLogExt `isSuffixOf` file
{- Used to store the urls elsewhere. -}
oldurlLogs :: Key -> [FilePath]
@@ -58,3 +83,21 @@ setUrlMissing key url = do
addLog (urlLog key) =<< logNow InfoMissing url
whenM (null <$> getUrls key) $
logChange key webUUID InfoMissing
{- Finds all known urls. -}
knownUrls :: Annex [URLString]
knownUrls = do
{- Ensure the git-annex branch's index file is up-to-date and
- any journaled changes are reflected in it, since we're going
- to query its index directly. -}
Annex.Branch.update
Annex.Branch.commit "update"
Annex.Branch.withIndex $ do
top <- fromRepo Git.repoPath
(l, cleanup) <- inRepo $ Git.LsFiles.stagedDetails [top]
r <- mapM (geturls . snd) $ filter (isUrlLog . fst) l
void $ liftIO cleanup
return $ concat r
where
geturls Nothing = return []
geturls (Just logsha) = getLog . L.unpack <$> catObject logsha

View file

@@ -130,6 +130,9 @@ download' quiet url headers options file =
-
- This does its own redirect following because Browser's is buggy for HEAD
- requests.
-
- Unfortunately, this does not handle https, so it should only be used
- when curl is not available.
-}
request :: URI -> Headers -> RequestMethod -> IO (Response String)
request url headers requesttype = go 5 url

1 debian/changelog vendored
View file

@@ -7,6 +7,7 @@ git-annex (4.20130724) UNRELEASED; urgency=low
Like drop, dropunused checks remotes, and honors the global
annex.numcopies setting. (However, .gitattributes settings cannot
apply to unused files.)
* importfeed can be used to import files from podcast feeds.
* Add status message to XMPP presence tag, to identify to others that
the client is a git-annex client. Closes: #717652
* webapp: When creating a repository on a removable drive, set

1 debian/control vendored
View file

@@ -48,6 +48,7 @@ Build-Depends:
libghc-xml-types-dev,
libghc-async-dev,
libghc-http-dev,
libghc-feed-dev,
ikiwiki,
perlmagick,
git,

View file

@@ -190,6 +190,19 @@ subdirectories).
git annex import /media/camera/DCIM/
* importfeed [url ...]
Imports the contents of podcast feeds. Only downloads files whose
urls have not been added to the repository before, so you can
delete, rename, etc. the resulting files, and repeated runs won't
duplicate them.
Use --template to control where the files are stored.
The default template is '${feedtitle}/${itemtitle}${extension}'
(Other available variables: feedauthor, itemauthor, itemsummary, itemdescription, itemrights, itemid)
The --relaxed and --fast options behave the same as they do in addurl.
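For example, to use a custom filename layout (the feed url here is a
placeholder; a template variable missing from a feed expands to "none"):

    git annex importfeed --template='${feedtitle}/${itemauthor}/${itemtitle}${extension}' http://example.com/podcast.rss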
* watch
Watches for changes to files in the current directory and its subdirectories,

View file

@@ -21,6 +21,7 @@ quite a lot.
* [UUID](http://hackage.haskell.org/package/uuid)
* [regex-tdfa](http://hackage.haskell.org/package/regex-tdfa)
* [extensible-exceptions](http://hackage.haskell.org/package/extensible-exceptions)
* [feed](http://hackage.haskell.org/package/feed)
* Optional haskell stuff, used by the [[assistant]] and its webapp
* [stm](http://hackage.haskell.org/package/stm)
(version 2.3 or newer)

View file

@@ -0,0 +1,44 @@
You can use git-annex as a podcatcher, to download podcast contents.
No additional software is required, but your git-annex must be built
with the Feeds feature (run `git annex version` to check).
All you need to do is put something like this in a cron job:
`cd somerepo && git annex importfeed http://url/to/podcast http://other/podcast/url`
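For instance, a crontab entry along these lines (the schedule and
repository path are placeholders) would check the feeds every morning:

    0 7 * * * cd somerepo && git annex importfeed http://url/to/podcast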
This downloads the urls, and parses them as RSS, Atom, or RDF feeds.
All enclosures are downloaded and added to the repository, the same as if you
had manually run `git annex addurl` on each of them.
git-annex will avoid downloading a file from a feed if its url has already
been stored in the repository before. So once a file is downloaded,
you can move it around, delete it, `git annex drop` its content, etc.,
and it will not be downloaded again by repeated runs of
`git annex importfeed`. Just how a podcatcher should behave.
## templates
To control the filenames used for items downloaded from a feed,
there's a --template option. The default is
`--template='${feedtitle}/${itemtitle}${extension}'`
Other available template variables:
feedauthor, itemauthor, itemsummary, itemdescription, itemrights, itemid
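For instance, to group items by author instead (a hypothetical layout;
any variable not present in a feed expands to "none"):

    git annex importfeed --template='${itemauthor}/${itemtitle}${extension}' http://url/to/podcast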
## catching up
To catch up on a feed without downloading its contents,
use `git annex importfeed --relaxed`, and delete the symlinks it creates.
Next time you run `git annex importfeed` it will only fetch any new items.
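A sketch of that workflow, using the placeholder url from above:

    git annex importfeed --relaxed http://url/to/podcast  # record existing items, download nothing
    # ... delete the symlinks for any items you don't want ...
    git annex importfeed http://url/to/podcast            # later runs fetch only new items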
## fast mode
To add a feed without downloading its contents right now,
use `git annex importfeed --fast`. Then you can use `git annex get` as
usual to download the content of an item.
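For instance (the resulting file path is hypothetical):

    git annex importfeed --fast http://url/to/podcast
    git annex get somefeed/someepisode.mp3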
## distributed podcatching
A nice benefit of using git-annex as a podcatcher is that you can
run `git annex importfeed` on the same url in different clones
of a repository, and `git annex sync` will sync it all up.
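For instance, running the same commands in each clone (remote setup
not shown) keeps them all up to date:

    git annex importfeed http://url/to/podcast
    git annex sync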

View file

@@ -65,6 +65,9 @@ Flag TestSuite
Flag TDFA
Description: Use regex-tdfa for wildcards
Flag Feed
Description: Enable podcast feed support
Executable git-annex
Main-Is: git-annex.hs
Build-Depends: MissingH, hslogger, directory, filepath,
@@ -154,6 +157,10 @@ Executable git-annex
Build-Depends: dns
CPP-Options: -DWITH_DNS
if flag(Feed)
Build-Depends: feed
CPP-Options: -DWITH_FEED
source-repository head
type: git
location: git://git-annex.branchable.com/