sqlite database for importfeed

importfeed: Use a caching database to avoid needing to list urls on every
run, and to avoid using too much memory.

Benchmarking in my podcasts repo, importfeed got 1.42 seconds faster,
and memory use dropped from 203000k to 59408k.

Database.ImportFeed is Database.ContentIdentifier with the serial number
filed off. There is a bit of code duplication I would like to avoid,
particularly recordAnnexBranchTree and getAnnexBranchTree. But these use
the persistent sqlite tables, so despite the code being the same, they
cannot be factored out.

Since this database includes the contentidentifier metadata, it will be
slightly redundant if a sqlite database is ever added for metadata. I
did consider making such a generic database and using it for this. But
that would then need importfeed to update both the url database and the
metadata database, which is twice as much work diffing the git-annex
branch trees. Or it would entangle updating two databases in a complex way.
So instead it seems better to optimise the database that importfeed needs,
and if the metadata database is used by another command, use a little more
disk space and do a little bit of redundant work to update it.

Sponsored-by: unqueued on Patreon

{- Sqlite database of known urls, and another of known itemids,
 - for use by git-annex importfeed.
 -
 - Copyright 2023 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU AGPL version 3 or higher.
 -}

{-# LANGUAGE CPP #-}
{-# LANGUAGE QuasiQuotes, TypeFamilies, TypeOperators, TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts, EmptyDataDecls #-}
{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE DataKinds, FlexibleInstances #-}
{-# LANGUAGE UndecidableInstances #-}
#if MIN_VERSION_persistent_template(2,8,0)
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE StandaloneDeriving #-}
#endif

module Database.ImportFeed (
	ImportFeedDbHandle,
	openDb,
	closeDb,
	isKnownUrl,
	isKnownItemId,
) where

import Database.Types
import qualified Database.Queue as H
import Database.Init
import Database.Utility
import Annex.Locations
import Annex.Common hiding (delete)
import qualified Annex.Branch
import Git.Types
import Git.Sha
import Git.FilePath
import qualified Git.DiffTree as DiffTree
import Logs
import Logs.Web
import Logs.MetaData
import Types.MetaData
import Annex.MetaData.StandardFields
import Annex.LockFile
import qualified Utility.RawFilePath as R

import Database.Persist.Sql hiding (Key)
import Database.Persist.TH
import qualified System.FilePath.ByteString as P
import qualified Data.ByteString as B
import qualified Data.Set as S

data ImportFeedDbHandle = ImportFeedDbHandle H.DbQueue

-- Note on indexes: UniqueUrl etc below are really uniqueness constraints,
-- which cause sqlite to automatically add indexes. So when adding indexes,
-- have to take care to only add ones that work as uniqueness constraints.
-- (Unfortunately persistent does not support indexes that are not
-- uniqueness constraints; https://github.com/yesodweb/persistent/issues/109)

share [mkPersist sqlSettings, mkMigrate "migrateImportFeed"] [persistLowerCase|
KnownUrls
  url SByteString
  UniqueUrl url
KnownItemIds
  itemid SByteString
  UniqueItemId itemid
-- The last git-annex branch tree sha that was used to update
-- KnownUrls and KnownItemIds
AnnexBranch
  tree SSha
  UniqueTree tree
|]
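
{- For orientation only: the schema above corresponds roughly to sqlite
 - tables like the following. This is a sketch; the actual table and index
 - names are generated by persistent's lowercase naming conventions and
 - may differ in detail.
 -
 -   CREATE TABLE known_urls (id INTEGER PRIMARY KEY, url BLOB NOT NULL, UNIQUE (url));
 -   CREATE TABLE known_item_ids (id INTEGER PRIMARY KEY, itemid BLOB NOT NULL, UNIQUE (itemid));
 -   CREATE TABLE annex_branch (id INTEGER PRIMARY KEY, tree BLOB NOT NULL, UNIQUE (tree));
 -}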

{- Opens the database, creating it if it doesn't exist yet.
 - Updates the database from the git-annex branch. -}
openDb :: Annex ImportFeedDbHandle
openDb = do
	dbdir <- calcRepo' gitAnnexImportFeedDbDir
	let db = dbdir P.</> "db"
	isnew <- liftIO $ not <$> R.doesPathExist db
	when isnew $
		initDb db $ void $
			runMigrationSilent migrateImportFeed
	dbh <- liftIO $ H.openDbQueue db "known_urls"
	let h = ImportFeedDbHandle dbh
	needsUpdateFromLog h >>= \case
		Nothing -> return ()
		Just v -> do
			lck <- calcRepo' gitAnnexImportFeedDbLock
			withExclusiveLock lck $
				updateFromLog h v
	return h

closeDb :: ImportFeedDbHandle -> Annex ()
closeDb (ImportFeedDbHandle h) = liftIO $ H.closeDbQueue h

isKnownUrl :: ImportFeedDbHandle -> URLString -> IO Bool
isKnownUrl (ImportFeedDbHandle h) u =
	H.queryDbQueue h $ do
		l <- selectList
			[ KnownUrlsUrl ==. SByteString (encodeBS u)
			] []
		return $ not (null l)

isKnownItemId :: ImportFeedDbHandle -> B.ByteString -> IO Bool
isKnownItemId (ImportFeedDbHandle h) i =
	H.queryDbQueue h $ do
		l <- selectList
			[ KnownItemIdsItemid ==. SByteString i
			] []
		return $ not (null l)
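
{- A rough usage sketch of the interface above (assumed caller code, not
 - part of this module; enclosureurl and downloadNewItem are hypothetical):
 -
 -	h <- openDb
 -	known <- liftIO $ isKnownUrl h enclosureurl
 -	unless known $ downloadNewItem enclosureurl
 -	closeDb h
 -
 - The point is that importfeed can query this database rather than
 - listing every known url from the git-annex branch on each run.
 -}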

recordKnownUrl :: ImportFeedDbHandle -> URLByteString -> IO ()
recordKnownUrl h u = queueDb h $
	void $ insertUniqueFast $ KnownUrls $ SByteString u

recordKnownItemId :: ImportFeedDbHandle -> SByteString -> IO ()
recordKnownItemId h i = queueDb h $
	void $ insertUniqueFast $ KnownItemIds i

recordAnnexBranchTree :: ImportFeedDbHandle -> Sha -> IO ()
recordAnnexBranchTree h s = queueDb h $ do
	deleteWhere ([] :: [Filter AnnexBranch])
	void $ insertUniqueFast $ AnnexBranch $ toSSha s
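
{- The AnnexBranch table is expected to hold at most one row. When it is
 - empty (the database has never been updated from the git-annex branch),
 - emptyTree is returned, which makes updateFromLog fall back to a full
 - branch scan. -}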

getAnnexBranchTree :: ImportFeedDbHandle -> IO Sha
getAnnexBranchTree (ImportFeedDbHandle h) = H.queryDbQueue h $ do
	l <- selectList ([] :: [Filter AnnexBranch]) []
	case l of
		(s:[]) -> return $ fromSSha $ annexBranchTree $ entityVal s
		_ -> return emptyTree

queueDb :: ImportFeedDbHandle -> SqlPersistM () -> IO ()
queueDb (ImportFeedDbHandle h) = H.queueDb h checkcommit
  where
	-- commit queue after 10000 changes
	checkcommit sz _lastcommittime
		| sz > 10000 = return True
		| otherwise = return False

{- Check if the git-annex branch has been updated and the database needs
 - to be updated with any new information from it. -}
needsUpdateFromLog :: ImportFeedDbHandle -> Annex (Maybe (Sha, Sha))
needsUpdateFromLog db = do
	oldtree <- liftIO $ getAnnexBranchTree db
	Annex.Branch.updatedFromTree oldtree
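
-- The pair produced by needsUpdateFromLog is (previously recorded
-- git-annex branch tree, current git-annex branch tree); updateFromLog
-- below either diffs between the two, or scans the whole branch when
-- starting from an empty database.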

{- The database should be locked for write when calling this. -}
updateFromLog :: ImportFeedDbHandle -> (Sha, Sha) -> Annex ()
updateFromLog db@(ImportFeedDbHandle h) (oldtree, currtree)
	| oldtree == emptyTree = do
		scanbranch
		out
	| otherwise = do
		scandiff
		out
  where
	out = liftIO $ do
		recordAnnexBranchTree db currtree
		H.flushDbQueue h

	knownitemids s = liftIO $ forM_ (S.toList s) $
		recordKnownItemId db . SByteString . fromMetaValue

	knownurls us = liftIO $ forM_ us $
		recordKnownUrl db

	scandiff = do
		(l, cleanup) <- inRepo $
			DiffTree.diffTreeRecursive oldtree currtree
		mapM_ godiff l
		void $ liftIO $ cleanup

	godiff ti = do
		let f = getTopFilePath (DiffTree.file ti)
		case extLogFileKey urlLogExt f of
			Just k -> do
				knownurls =<< getUrls' k
			Nothing -> case extLogFileKey metaDataLogExt f of
				Just k -> do
					m <- getCurrentMetaData k
					knownitemids (currentMetaDataValues itemIdField m)
				Nothing -> return ()

	-- When initially populating the database, this
	-- is faster than diffing from the empty tree
	-- and looking up every log file.
	scanbranch = Annex.Branch.overBranchFileContents toscan goscan >>= \case
		Just () -> return ()
		Nothing -> scandiff

	toscan f
		| isUrlLog f = Just ()
		| isMetaDataLog f = Just ()
		| otherwise = Nothing

	goscan reader = reader >>= \case
		Just ((), f, Just content)
			| isUrlLog f -> do
				knownurls (parseUrlLog content)
				goscan reader
			| isMetaDataLog f -> do
				knownitemids $
					currentMetaDataValues itemIdField $
					parseCurrentMetaData content
				goscan reader
			| otherwise -> goscan reader
		Just ((), _, Nothing) -> goscan reader
		Nothing -> return ()