import metadata from feeds

When annex.genmetadata is set, metadata from the feed is added to files
that are imported from it.

Reused the same field names (feedtitle, itemtitle, feedauthor, itemauthor,
etc.) that are used in --template.

Also added title and author, which are the item title/author if available,
falling back to the feed title/author. These are more likely to be common
metadata fields.

(There is a small bit of duplication here, but once git gets
around to packing the object, it will compress it away.)

The itempubdate field is not included in the metadata as a string; instead
it is used to generate year and month fields, same as is done when adding
files with annex.genmetadata set.

This commit was sponsored by Amitai Schlair, who coincidentally
is responsible for ikiwiki generating nice feed metadata!
This commit is contained in:
Joey Hess 2014-07-03 13:46:09 -04:00
parent faf50a0a2f
commit d0c1a22e7c
7 changed files with 106 additions and 61 deletions

View file

@ -218,7 +218,7 @@ getViaTmpUnchecked = finishGetViaTmp (return True)
getViaTmpChecked :: Annex Bool -> Key -> (FilePath -> Annex Bool) -> Annex Bool getViaTmpChecked :: Annex Bool -> Key -> (FilePath -> Annex Bool) -> Annex Bool
getViaTmpChecked check key action = getViaTmpChecked check key action =
prepGetViaTmpChecked key $ prepGetViaTmpChecked key False $
finishGetViaTmp check key action finishGetViaTmp check key action
{- Prepares to download a key via a tmp file, and checks that there is {- Prepares to download a key via a tmp file, and checks that there is
@ -229,8 +229,8 @@ getViaTmpChecked check key action =
- -
- When there's enough free space, runs the download action. - When there's enough free space, runs the download action.
-} -}
prepGetViaTmpChecked :: Key -> Annex Bool -> Annex Bool prepGetViaTmpChecked :: Key -> a -> Annex a -> Annex a
prepGetViaTmpChecked key getkey = do prepGetViaTmpChecked key unabletoget getkey = do
tmp <- fromRepo $ gitAnnexTmpObjectLocation key tmp <- fromRepo $ gitAnnexTmpObjectLocation key
e <- liftIO $ doesFileExist tmp e <- liftIO $ doesFileExist tmp
@ -242,7 +242,7 @@ prepGetViaTmpChecked key getkey = do
-- The tmp file may not have been left writable -- The tmp file may not have been left writable
when e $ thawContent tmp when e $ thawContent tmp
getkey getkey
, return False , return unabletoget
) )
finishGetViaTmp :: Annex Bool -> Key -> (FilePath -> Annex Bool) -> Annex Bool finishGetViaTmp :: Annex Bool -> Key -> (FilePath -> Annex Bool) -> Annex Bool

View file

@ -7,6 +7,7 @@
module Annex.MetaData ( module Annex.MetaData (
genMetaData, genMetaData,
addDateMetaData,
module X module X
) where ) where
@ -37,20 +38,18 @@ genMetaData :: Key -> FilePath -> FileStatus -> Annex ()
genMetaData key file status = do genMetaData key file status = do
maybe noop (flip copyMetaData key) =<< catKeyFileHEAD file maybe noop (flip copyMetaData key) =<< catKeyFileHEAD file
whenM (annexGenMetaData <$> Annex.getGitConfig) $ do whenM (annexGenMetaData <$> Annex.getGitConfig) $ do
metadata <- getCurrentMetaData key curr <- getCurrentMetaData key
let metadata' = genMetaData' status metadata addMetaData key (addDateMetaData mtime curr)
unless (metadata' == emptyMetaData) $ where
addMetaData key metadata' mtime = posixSecondsToUTCTime $ realToFrac $ modificationTime status
{- Generates metadata from the FileStatus. {- Generates metadata for a file's date stamp.
- Does not overwrite any existing metadata values. -} - Does not overwrite any existing metadata values. -}
genMetaData' :: FileStatus -> MetaData -> MetaData addDateMetaData :: UTCTime -> MetaData -> MetaData
genMetaData' status old = MetaData $ M.fromList $ filter isnew addDateMetaData mtime old = MetaData $ M.fromList $ filter isnew
[ (yearMetaField, S.singleton $ toMetaValue $ show y) [ (yearMetaField, S.singleton $ toMetaValue $ show y)
, (monthMetaField, S.singleton $ toMetaValue $ show m) , (monthMetaField, S.singleton $ toMetaValue $ show m)
] ]
where where
isnew (f, _) = S.null (currentMetaDataValues f old) isnew (f, _) = S.null (currentMetaDataValues f old)
(y, m, _d) = toGregorian $ utctDay $ (y, m, _d) = toGregorian $ utctDay $ mtime
posixSecondsToUTCTime $ realToFrac $
modificationTime status

View file

@ -97,15 +97,17 @@ performQuvi relaxed pageurl videourl file = ifAnnexed file addurl geturl
where where
quviurl = setDownloader pageurl QuviDownloader quviurl = setDownloader pageurl QuviDownloader
addurl key = next $ cleanup quviurl file key Nothing addurl key = next $ cleanup quviurl file key Nothing
geturl = next $ addUrlFileQuvi relaxed quviurl videourl file geturl = next $ isJust <$> addUrlFileQuvi relaxed quviurl videourl file
#endif #endif
#ifdef WITH_QUVI #ifdef WITH_QUVI
addUrlFileQuvi :: Bool -> URLString -> URLString -> FilePath -> Annex Bool addUrlFileQuvi :: Bool -> URLString -> URLString -> FilePath -> Annex (Maybe Key)
addUrlFileQuvi relaxed quviurl videourl file = do addUrlFileQuvi relaxed quviurl videourl file = do
key <- Backend.URL.fromUrl quviurl Nothing key <- Backend.URL.fromUrl quviurl Nothing
ifM (pure relaxed <||> Annex.getState Annex.fast) ifM (pure relaxed <||> Annex.getState Annex.fast)
( cleanup quviurl file key Nothing ( do
cleanup' quviurl file key Nothing
return (Just key)
, do , do
{- Get the size, and use that to check {- Get the size, and use that to check
- disk space. However, the size info is not - disk space. However, the size info is not
@ -113,7 +115,7 @@ addUrlFileQuvi relaxed quviurl videourl file = do
- might change and we want to be able to download - might change and we want to be able to download
- it later. -} - it later. -}
sizedkey <- addSizeUrlKey videourl key sizedkey <- addSizeUrlKey videourl key
prepGetViaTmpChecked sizedkey $ do prepGetViaTmpChecked sizedkey Nothing $ do
tmp <- fromRepo $ gitAnnexTmpObjectLocation key tmp <- fromRepo $ gitAnnexTmpObjectLocation key
showOutput showOutput
ok <- Transfer.notifyTransfer Transfer.Download (Just file) $ ok <- Transfer.notifyTransfer Transfer.Download (Just file) $
@ -121,15 +123,17 @@ addUrlFileQuvi relaxed quviurl videourl file = do
liftIO $ createDirectoryIfMissing True (parentDir tmp) liftIO $ createDirectoryIfMissing True (parentDir tmp)
downloadUrl [videourl] tmp downloadUrl [videourl] tmp
if ok if ok
then cleanup quviurl file key (Just tmp) then do
else return False cleanup' quviurl file key (Just tmp)
return (Just key)
else return Nothing
) )
#endif #endif
perform :: Bool -> URLString -> FilePath -> CommandPerform perform :: Bool -> URLString -> FilePath -> CommandPerform
perform relaxed url file = ifAnnexed file addurl geturl perform relaxed url file = ifAnnexed file addurl geturl
where where
geturl = next $ addUrlFile relaxed url file geturl = next $ isJust <$> addUrlFile relaxed url file
addurl key addurl key
| relaxed = do | relaxed = do
setUrlPresent key url setUrlPresent key url
@ -149,7 +153,7 @@ perform relaxed url file = ifAnnexed file addurl geturl
stop stop
) )
addUrlFile :: Bool -> URLString -> FilePath -> Annex Bool addUrlFile :: Bool -> URLString -> FilePath -> Annex (Maybe Key)
addUrlFile relaxed url file = do addUrlFile relaxed url file = do
liftIO $ createDirectoryIfMissing True (parentDir file) liftIO $ createDirectoryIfMissing True (parentDir file)
ifM (Annex.getState Annex.fast <||> pure relaxed) ifM (Annex.getState Annex.fast <||> pure relaxed)
@ -159,13 +163,13 @@ addUrlFile relaxed url file = do
download url file download url file
) )
download :: URLString -> FilePath -> Annex Bool download :: URLString -> FilePath -> Annex (Maybe Key)
download url file = do download url file = do
{- Generate a dummy key to use for this download, before we can {- Generate a dummy key to use for this download, before we can
- examine the file and find its real key. This allows resuming - examine the file and find its real key. This allows resuming
- downloads, as the dummy key for a given url is stable. -} - downloads, as the dummy key for a given url is stable. -}
dummykey <- addSizeUrlKey url =<< Backend.URL.fromUrl url Nothing dummykey <- addSizeUrlKey url =<< Backend.URL.fromUrl url Nothing
prepGetViaTmpChecked dummykey $ do prepGetViaTmpChecked dummykey Nothing $ do
tmp <- fromRepo $ gitAnnexTmpObjectLocation dummykey tmp <- fromRepo $ gitAnnexTmpObjectLocation dummykey
showOutput showOutput
ifM (runtransfer dummykey tmp) ifM (runtransfer dummykey tmp)
@ -178,9 +182,11 @@ download url file = do
} }
k <- genKey source backend k <- genKey source backend
case k of case k of
Nothing -> return False Nothing -> return Nothing
Just (key, _) -> cleanup url file key (Just tmp) Just (key, _) -> do
, return False cleanup' url file key (Just tmp)
return (Just key)
, return Nothing
) )
where where
runtransfer dummykey tmp = Transfer.notifyTransfer Transfer.Download (Just file) $ runtransfer dummykey tmp = Transfer.notifyTransfer Transfer.Download (Just file) $
@ -200,6 +206,11 @@ addSizeUrlKey url key = do
cleanup :: URLString -> FilePath -> Key -> Maybe FilePath -> Annex Bool cleanup :: URLString -> FilePath -> Key -> Maybe FilePath -> Annex Bool
cleanup url file key mtmp = do cleanup url file key mtmp = do
cleanup' url file key mtmp
return True
cleanup' :: URLString -> FilePath -> Key -> Maybe FilePath -> Annex ()
cleanup' url file key mtmp = do
when (isJust mtmp) $ when (isJust mtmp) $
logStatus key InfoPresent logStatus key InfoPresent
setUrlPresent key url setUrlPresent key url
@ -210,9 +221,8 @@ cleanup url file key mtmp = do
- must already exist, so flush the queue. -} - must already exist, so flush the queue. -}
Annex.Queue.flush Annex.Queue.flush
maybe noop (moveAnnex key) mtmp maybe noop (moveAnnex key) mtmp
return True
nodownload :: Bool -> URLString -> FilePath -> Annex Bool nodownload :: Bool -> URLString -> FilePath -> Annex (Maybe Key)
nodownload relaxed url file = do nodownload relaxed url file = do
(exists, size) <- if relaxed (exists, size) <- if relaxed
then pure (True, Nothing) then pure (True, Nothing)
@ -220,10 +230,11 @@ nodownload relaxed url file = do
if exists if exists
then do then do
key <- Backend.URL.fromUrl url size key <- Backend.URL.fromUrl url size
cleanup url file key Nothing cleanup' url file key Nothing
return (Just key)
else do else do
warning $ "unable to access url: " ++ url warning $ "unable to access url: " ++ url
return False return Nothing
url2file :: URI -> Maybe Int -> Int -> FilePath url2file :: URI -> Maybe Int -> Int -> FilePath
url2file url pathdepth pathmax = case pathdepth of url2file url pathdepth pathmax = case pathdepth of

View file

@ -33,6 +33,9 @@ import Annex.Quvi
import qualified Utility.Quvi as Quvi import qualified Utility.Quvi as Quvi
import Command.AddUrl (addUrlFileQuvi) import Command.AddUrl (addUrlFileQuvi)
#endif #endif
import Types.MetaData
import Logs.MetaData
import Annex.MetaData
def :: [Command] def :: [Command]
def = [notBareRepo $ withOptions [templateOption, relaxedOption] $ def = [notBareRepo $ withOptions [templateOption, relaxedOption] $
@ -165,12 +168,14 @@ performDownload relaxed cache todownload = case location todownload of
Nothing -> return True Nothing -> return True
Just f -> do Just f -> do
showStart "addurl" f showStart "addurl" f
ok <- getter f mk <- getter f
if ok case mk of
then do Just key -> do
whenM (annexGenMetaData <$> Annex.getGitConfig) $
addMetaData key $ extractMetaData todownload
showEndOk showEndOk
return True return True
else do Nothing -> do
showEndFail showEndFail
checkFeedBroken (feedurl todownload) checkFeedBroken (feedurl todownload)
@ -198,32 +203,19 @@ performDownload relaxed cache todownload = case location todownload of
( return Nothing ( return Nothing
, tryanother , tryanother
) )
defaultTemplate :: String defaultTemplate :: String
defaultTemplate = "${feedtitle}/${itemtitle}${extension}" defaultTemplate = "${feedtitle}/${itemtitle}${extension}"
{- Generates a filename to use for a feed item by filling out the template. {- Generates a filename to use for a feed item by filling out the template.
- The filename may not be unique. -} - The filename may not be unique. -}
feedFile :: Utility.Format.Format -> ToDownload -> String -> FilePath feedFile :: Utility.Format.Format -> ToDownload -> String -> FilePath
feedFile tmpl i extension = Utility.Format.format tmpl $ M.fromList feedFile tmpl i extension = Utility.Format.format tmpl $
[ field "feedtitle" $ getFeedTitle $ feed i M.map sanitizeFilePath $ M.fromList $ extractFields i ++
, fieldMaybe "itemtitle" $ getItemTitle $ item i [ ("extension", extension)
, fieldMaybe "feedauthor" $ getFeedAuthor $ feed i , extractField "itempubdate" [pubdate $ item i]
, fieldMaybe "itemauthor" $ getItemAuthor $ item i ]
, fieldMaybe "itemsummary" $ getItemSummary $ item i
, fieldMaybe "itemdescription" $ getItemDescription $ item i
, fieldMaybe "itemrights" $ getItemRights $ item i
, fieldMaybe "itemid" $ snd <$> getItemId (item i)
, fieldMaybe "itempubdate" $ pubdate $ item i
, ("extension", sanitizeFilePath extension)
]
where where
field k v =
let s = sanitizeFilePath v in
if null s then (k, "none") else (k, s)
fieldMaybe k Nothing = (k, "none")
fieldMaybe k (Just v) = field k v
#if MIN_VERSION_feed(0,3,9) #if MIN_VERSION_feed(0,3,9)
pubdate itm = case getItemPublishDate itm :: Maybe (Maybe UTCTime) of pubdate itm = case getItemPublishDate itm :: Maybe (Maybe UTCTime) of
Just (Just d) -> Just $ Just (Just d) -> Just $
@ -234,6 +226,41 @@ feedFile tmpl i extension = Utility.Format.format tmpl $ M.fromList
pubdate _ = Nothing pubdate _ = Nothing
#endif #endif
extractMetaData :: ToDownload -> MetaData
extractMetaData i = case getItemPublishDate (item i) :: Maybe (Maybe UTCTime) of
Just (Just d) -> addDateMetaData d meta
_ -> meta
where
tometa (k, v) = (mkMetaFieldUnchecked k, S.singleton (toMetaValue v))
meta = MetaData $ M.fromList $ map tometa $ extractFields i
{- Extract fields from the feed and item, that are both used as metadata,
- and to generate the filename. -}
extractFields :: ToDownload -> [(String, String)]
extractFields i = map (uncurry extractField)
[ ("feedtitle", [feedtitle])
, ("itemtitle", [itemtitle])
, ("feedauthor", [feedauthor])
, ("itemauthor", [itemauthor])
, ("itemsummary", [getItemSummary $ item i])
, ("itemdescription", [getItemDescription $ item i])
, ("itemrights", [getItemRights $ item i])
, ("itemid", [snd <$> getItemId (item i)])
, ("title", [itemtitle, feedtitle])
, ("author", [itemauthor, feedauthor])
]
where
feedtitle = Just $ getFeedTitle $ feed i
itemtitle = getItemTitle $ item i
feedauthor = getFeedAuthor $ feed i
itemauthor = getItemAuthor $ item i
extractField :: String -> [Maybe String] -> (String, String)
extractField k [] = (k, "none")
extractField k (Just v:_)
| not (null v) = (k, v)
extractField k (_:rest) = extractField k rest
{- Called when there is a problem with a feed. {- Called when there is a problem with a feed.
- Throws an error if the feed is broken, otherwise shows a warning. -} - Throws an error if the feed is broken, otherwise shows a warning. -}
feedProblem :: URLString -> String -> Annex () feedProblem :: URLString -> String -> Annex ()

View file

@ -95,10 +95,12 @@ addMetaData k metadata = addMetaData' k metadata =<< liftIO getPOSIXTime
- will tend to be generated across the different log files, and so - will tend to be generated across the different log files, and so
- git will be able to pack the data more efficiently. -} - git will be able to pack the data more efficiently. -}
addMetaData' :: Key -> MetaData -> POSIXTime -> Annex () addMetaData' :: Key -> MetaData -> POSIXTime -> Annex ()
addMetaData' k (MetaData m) now = Annex.Branch.change (metaDataLogFile k) $ addMetaData' k d@(MetaData m) now
showLog . simplifyLog | d == emptyMetaData = noop
. S.insert (LogEntry now metadata) | otherwise = Annex.Branch.change (metaDataLogFile k) $
. parseLog showLog . simplifyLog
. S.insert (LogEntry now metadata)
. parseLog
where where
metadata = MetaData $ M.filterWithKey (\f _ -> not (isLastChangedField f)) m metadata = MetaData $ M.filterWithKey (\f _ -> not (isLastChangedField f)) m

2
debian/changelog vendored
View file

@ -11,6 +11,8 @@ git-annex (5.20140614) UNRELEASED; urgency=medium
queue flushing than necessary. queue flushing than necessary.
* Run standalone install process when the assistant is started * Run standalone install process when the assistant is started
(was only being run when the webapp was opened). (was only being run when the webapp was opened).
* importfeed: When annex.genmetadata is set, metadata from the feed
is added to files that are imported from it.
-- Joey Hess <joeyh@debian.org> Mon, 16 Jun 2014 11:28:42 -0400 -- Joey Hess <joeyh@debian.org> Mon, 16 Jun 2014 11:28:42 -0400

View file

@ -268,7 +268,7 @@ subdirectories).
Use `--template` to control where the files are stored. Use `--template` to control where the files are stored.
The default template is '${feedtitle}/${itemtitle}${extension}' The default template is '${feedtitle}/${itemtitle}${extension}'
(Other available variables: feedauthor, itemauthor, itemsummary, itemdescription, itemrights, itemid, itempubdate) (Other available variables: feedauthor, itemauthor, itemsummary, itemdescription, itemrights, itemid, itempubdate, title, author)
The `--relaxed` and `--fast` options behave the same as they do in addurl. The `--relaxed` and `--fast` options behave the same as they do in addurl.
@ -1346,8 +1346,12 @@ Here are all the supported configuration settings.
* `annex.genmetadata` * `annex.genmetadata`
Set this to `true` to make git-annex automatically generate some metadata Set this to `true` to make git-annex automatically generate some metadata
when adding files to the repository. In particular, it stores when adding files to the repository.
year and month metadata, from the file's modification date.
In particular, it stores year and month metadata, from the file's
modification date.
When importfeed is used, it stores additional metadata from the feed.
* `annex.queuesize` * `annex.queuesize`