2019-02-20 20:59:10 +00:00
|
|
|
{- Sqlite database of ContentIdentifiers imported from special remotes.
|
|
|
|
-
|
2023-06-02 17:30:30 +00:00
|
|
|
- Copyright 2019-2023 Joey Hess <id@joeyh.name>
|
2019-02-20 20:59:10 +00:00
|
|
|
-
|
|
|
|
- Licensed under the GNU AGPL version 3 or higher.
|
|
|
|
-}
|
|
|
|
|
2020-02-04 17:53:00 +00:00
|
|
|
{-# LANGUAGE CPP #-}
|
2023-08-02 13:47:42 +00:00
|
|
|
{-# LANGUAGE QuasiQuotes, TypeFamilies, TypeOperators, TemplateHaskell #-}
|
2019-04-09 23:58:24 +00:00
|
|
|
{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts, EmptyDataDecls #-}
|
2019-02-20 20:59:10 +00:00
|
|
|
{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-}
|
|
|
|
{-# LANGUAGE RankNTypes #-}
|
2020-11-07 18:09:17 +00:00
|
|
|
{-# LANGUAGE DataKinds, FlexibleInstances #-}
|
2019-07-30 16:49:37 +00:00
|
|
|
{-# LANGUAGE UndecidableInstances #-}
|
2020-02-04 17:53:00 +00:00
|
|
|
#if MIN_VERSION_persistent_template(2,8,0)
|
2020-02-04 16:03:30 +00:00
|
|
|
{-# LANGUAGE DerivingStrategies #-}
|
|
|
|
{-# LANGUAGE StandaloneDeriving #-}
|
2020-02-04 17:53:00 +00:00
|
|
|
#endif
|
2019-02-20 20:59:10 +00:00
|
|
|
|
|
|
|
module Database.ContentIdentifier (
|
|
|
|
ContentIdentifierHandle,
|
2023-06-02 17:30:30 +00:00
|
|
|
databaseIsEmpty,
|
2019-02-20 20:59:10 +00:00
|
|
|
openDb,
|
|
|
|
closeDb,
|
|
|
|
flushDbQueue,
|
|
|
|
recordContentIdentifier,
|
2019-03-04 21:50:41 +00:00
|
|
|
getContentIdentifiers,
|
2019-02-20 20:59:10 +00:00
|
|
|
getContentIdentifierKeys,
|
2019-03-06 22:04:30 +00:00
|
|
|
recordAnnexBranchTree,
|
|
|
|
getAnnexBranchTree,
|
2019-03-07 16:56:40 +00:00
|
|
|
needsUpdateFromLog,
|
|
|
|
updateFromLog,
|
2019-02-20 20:59:10 +00:00
|
|
|
ContentIdentifiersId,
|
2019-03-06 22:04:30 +00:00
|
|
|
AnnexBranchId,
|
2019-02-20 20:59:10 +00:00
|
|
|
) where
|
|
|
|
|
|
|
|
import Database.Types
|
|
|
|
import qualified Database.Queue as H
|
|
|
|
import Database.Init
|
2023-03-31 18:34:18 +00:00
|
|
|
import Database.Utility
|
2019-02-20 20:59:10 +00:00
|
|
|
import Annex.Locations
|
|
|
|
import Annex.Common hiding (delete)
|
2019-03-07 16:56:40 +00:00
|
|
|
import qualified Annex.Branch
|
2019-02-21 17:38:27 +00:00
|
|
|
import Types.Import
|
add RemoteStateHandle
This solves the problem of sameas remotes trampling over per-remote
state. Used for:
* per-remote state, of course
* per-remote metadata, also of course
* per-remote content identifiers, because two remote implementations
could in theory generate the same content identifier for two different
pieces of content
While chunk logs are per-remote data, they don't use this, because the
number and size of chunks stored is a common property across sameas
remotes.
External special remote had a complication, where it was theoretically
possible for a remote to send SETSTATE or GETSTATE during INITREMOTE or
EXPORTSUPPORTED. Since the uuid of the remote is typically generated in
Remote.setup, it would only be possible to pass a Maybe
RemoteStateHandle into it, and it would otherwise have to construct its
own. Rather than go that route, I decided to send an ERROR in this case.
It seems unlikely that any existing external special remote will be
affected. They would have to make up a git-annex key, and set state for
some reason during INITREMOTE. I can imagine such a hack, but it doesn't
seem worth complicating the code in such an ugly way to support it.
Unfortunately, both TestRemote and Annex.Import needed the Remote
to have a new field added that holds its RemoteStateHandle.
2019-10-14 16:33:27 +00:00
|
|
|
import Types.RemoteState
|
2019-03-06 22:04:30 +00:00
|
|
|
import Git.Types
|
|
|
|
import Git.Sha
|
2019-03-07 16:56:40 +00:00
|
|
|
import Git.FilePath
|
|
|
|
import qualified Git.Ref
|
|
|
|
import qualified Git.DiffTree as DiffTree
|
|
|
|
import Logs
|
|
|
|
import qualified Logs.ContentIdentifier as Log
|
2020-11-05 22:45:37 +00:00
|
|
|
import qualified Utility.RawFilePath as R
|
2019-02-20 20:59:10 +00:00
|
|
|
|
|
|
|
import Database.Persist.Sql hiding (Key)
|
|
|
|
import Database.Persist.TH
|
2020-12-23 17:58:01 +00:00
|
|
|
import Database.Persist.Sqlite (runSqlite)
|
2020-11-05 22:45:37 +00:00
|
|
|
import qualified System.FilePath.ByteString as P
|
2020-12-23 17:58:01 +00:00
|
|
|
import qualified Data.Text as T
|
2019-02-20 20:59:10 +00:00
|
|
|
|
2023-06-02 17:30:30 +00:00
|
|
|
{- Handle to the content identifier database.
 -
 - Holds the database queue, and a Bool that openDb sets to True when the
 - database file did not exist yet. updateFromLog returns a handle with
 - the Bool set to False. databaseIsEmpty reads it back.
 -}
data ContentIdentifierHandle = ContentIdentifierHandle H.DbQueue Bool
|
|
|
|
|
|
|
|
{- Reports the flag carried in the handle. openDb sets it when the
 - database file did not exist yet, and updateFromLog returns a handle
 - with it cleared. -}
databaseIsEmpty :: ContentIdentifierHandle -> Bool
databaseIsEmpty (ContentIdentifierHandle _ isempty) = isempty
|
2019-02-20 20:59:10 +00:00
|
|
|
|
2023-06-09 19:12:33 +00:00
|
|
|
-- Note on indexes: ContentIndentifiersKeyRemoteCidIndex etc are really
|
|
|
|
-- uniqueness constraints, which cause sqlite to automatically add indexes.
|
|
|
|
-- So when adding indexes, have to take care to only add ones that work as
|
|
|
|
-- uniqueness constraints. (Unfortunately persistent does not support indexes
|
|
|
|
-- that are not uniqueness constraints;
|
|
|
|
-- https://github.com/yesodweb/persistent/issues/109)
|
|
|
|
--
|
|
|
|
-- ContentIndentifiersKeyRemoteCidIndex speeds up queries like
|
|
|
|
-- getContentIdentifiers, but it is not used for
|
|
|
|
-- getContentIdentifierKeys. ContentIndentifiersCidRemoteKeyIndex was
|
|
|
|
-- added to speed that up.
|
2019-02-20 20:59:10 +00:00
|
|
|
-- Schema of the database. Entity names start in the first column and
-- their fields and uniqueness constraints are indented beneath them.
share [mkPersist sqlSettings, mkMigrate "migrateContentIdentifier"] [persistLowerCase|
ContentIdentifiers
  remote UUID
  cid ContentIdentifier
  key Key
  ContentIndentifiersKeyRemoteCidIndex key remote cid
  ContentIndentifiersCidRemoteKeyIndex cid remote key
-- The sha of the git-annex branch tree that was most recently used
-- to update ContentIdentifiers
AnnexBranch
  tree SSha
  UniqueTree tree
|]
|
|
|
|
|
|
|
|
{- Opens the database, creating it if it doesn't exist yet.
 -
 - Only a single process should write to the database at a time, so guard
 - any writes with the gitAnnexContentIdentifierLock.
 -
 - The returned handle records whether the database file was missing
 - when this was called.
 -}
openDb :: Annex ContentIdentifierHandle
openDb = do
  dbfile <- (P.</> "db") <$> calcRepo' gitAnnexContentIdentifierDbDir
  fresh <- liftIO $ not <$> R.doesPathExist dbfile
  if fresh
    then initDb dbfile migrate
    -- Migrate from old versions of database, which had buggy
    -- and suboptimal uniqueness constraints.
    else liftIO $ runSqlite (T.pack (fromRawFilePath dbfile)) migrate
  queue <- liftIO $ H.openDbQueue dbfile "content_identifiers"
  return $ ContentIdentifierHandle queue fresh
  where
    -- The same migration is run whether or not the database is new.
    migrate :: SqlPersistM ()
    migrate = void $ runMigrationSilent migrateContentIdentifier
|
2019-02-20 20:59:10 +00:00
|
|
|
|
|
|
|
{- Closes the database queue held by the handle. -}
closeDb :: ContentIdentifierHandle -> Annex ()
closeDb (ContentIdentifierHandle queue _) = liftIO (H.closeDbQueue queue)
|
2019-02-20 20:59:10 +00:00
|
|
|
|
|
|
|
{- Queues a database action on the handle's queue. -}
queueDb :: ContentIdentifierHandle -> SqlPersistM () -> IO ()
queueDb (ContentIdentifierHandle queue _) = H.queueDb queue shouldcommit
  where
    -- commit queue after 1000 changes
    -- (the first argument is presumably the queue size; the second,
    -- named for the last commit time, is ignored)
    shouldcommit queuesize _lastcommittime = return (queuesize > 1000)
|
|
|
|
|
|
|
|
{- Flushes the database queue held by the handle. -}
flushDbQueue :: ContentIdentifierHandle -> IO ()
flushDbQueue (ContentIdentifierHandle queue _) = H.flushDbQueue queue
|
2019-02-20 20:59:10 +00:00
|
|
|
|
|
|
|
{- Queues insertion of a row associating a content identifier with a key
 - for the remote whose UUID is wrapped in the RemoteStateHandle.
 -
 - Be sure to also update the git-annex branch when using this.
 -}
recordContentIdentifier :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> Key -> IO ()
recordContentIdentifier h (RemoteStateHandle u) cid k =
  queueDb h $ void $ insertUniqueFast (ContentIdentifiers u cid k)
|
2019-02-20 20:59:10 +00:00
|
|
|
|
add RemoteStateHandle
This solves the problem of sameas remotes trampling over per-remote
state. Used for:
* per-remote state, of course
* per-remote metadata, also of course
* per-remote content identifiers, because two remote implementations
could in theory generate the same content identifier for two different
pieces of content
While chunk logs are per-remote data, they don't use this, because the
number and size of chunks stored is a common property across sameas
remotes.
External special remote had a complication, where it was theoretically
possible for a remote to send SETSTATE or GETSTATE during INITREMOTE or
EXPORTSUPPORTED. Since the uuid of the remote is typically generated in
Remote.setup, it would only be possible to pass a Maybe
RemoteStateHandle into it, and it would otherwise have to construct its
own. Rather than go that route, I decided to send an ERROR in this case.
It seems unlikely that any existing external special remote will be
affected. They would have to make up a git-annex key, and set state for
some reason during INITREMOTE. I can imagine such a hack, but it doesn't
seem worth complicating the code in such an ugly way to support it.
Unfortunately, both TestRemote and Annex.Import needed the Remote
to have a new field added that holds its RemoteStateHandle.
2019-10-14 16:33:27 +00:00
|
|
|
{- Looks up the content identifiers recorded for a key on the remote
 - whose UUID is wrapped in the RemoteStateHandle. -}
getContentIdentifiers :: ContentIdentifierHandle -> RemoteStateHandle -> Key -> IO [ContentIdentifier]
getContentIdentifiers (ContentIdentifierHandle queue _) (RemoteStateHandle u) k =
  H.queryDbQueue queue $
    map (contentIdentifiersCid . entityVal) <$> selectList matching []
  where
    -- Rows must match both the key and the remote.
    matching =
      [ ContentIdentifiersKey ==. k
      , ContentIdentifiersRemote ==. u
      ]
|
2019-03-04 20:48:07 +00:00
|
|
|
|
add RemoteStateHandle
This solves the problem of sameas remotes trampling over per-remote
state. Used for:
* per-remote state, of course
* per-remote metadata, also of course
* per-remote content identifiers, because two remote implementations
could in theory generate the same content identifier for two different
pieces of content
While chunk logs are per-remote data, they don't use this, because the
number and size of chunks stored is a common property across sameas
remotes.
External special remote had a complication, where it was theoretically
possible for a remote to send SETSTATE or GETSTATE during INITREMOTE or
EXPORTSUPPORTED. Since the uuid of the remote is typically generated in
Remote.setup, it would only be possible to pass a Maybe
RemoteStateHandle into it, and it would otherwise have to construct its
own. Rather than go that route, I decided to send an ERROR in this case.
It seems unlikely that any existing external special remote will be
affected. They would have to make up a git-annex key, and set state for
some reason during INITREMOTE. I can imagine such a hack, but it doesn't
seem worth complicating the code in such an ugly way to support it.
Unfortunately, both TestRemote and Annex.Import needed the Remote
to have a new field added that holds its RemoteStateHandle.
2019-10-14 16:33:27 +00:00
|
|
|
{- Looks up the keys recorded for a content identifier on the remote
 - whose UUID is wrapped in the RemoteStateHandle. -}
getContentIdentifierKeys :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> IO [Key]
getContentIdentifierKeys (ContentIdentifierHandle queue _) (RemoteStateHandle u) cid =
  H.queryDbQueue queue $
    map (contentIdentifiersKey . entityVal) <$> selectList matching []
  where
    -- Rows must match both the content identifier and the remote.
    matching =
      [ ContentIdentifiersCid ==. cid
      , ContentIdentifiersRemote ==. u
      ]
|
2019-03-06 22:04:30 +00:00
|
|
|
|
|
|
|
{- Queues replacement of the recorded git-annex branch tree sha: every
 - existing AnnexBranch row is deleted, then the new sha is inserted. -}
recordAnnexBranchTree :: ContentIdentifierHandle -> Sha -> IO ()
recordAnnexBranchTree h s = queueDb h $
  deleteWhere everyrow >> void (insertUniqueFast (AnnexBranch (toSSha s)))
  where
    -- An empty filter list selects all rows of the table.
    everyrow :: [Filter AnnexBranch]
    everyrow = []
|
2019-03-06 22:04:30 +00:00
|
|
|
|
|
|
|
{- Gets the recorded git-annex branch tree sha. When the AnnexBranch
 - table does not hold exactly one row, emptyTree is returned instead. -}
getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha
getAnnexBranchTree (ContentIdentifierHandle queue _) = H.queryDbQueue queue $
  recordedtree <$> selectList ([] :: [Filter AnnexBranch]) []
  where
    recordedtree [row] = fromSSha $ annexBranchTree $ entityVal row
    recordedtree _ = emptyTree
|
2019-03-07 16:56:40 +00:00
|
|
|
|
|
|
|
{- Check if the git-annex branch has been updated and the database needs
 - to be updated with any new content identifiers in it.
 -
 - The tree sha recorded in the database is passed to
 - Annex.Branch.updatedFromTree, whose result is returned as-is and has
 - the shape that updateFromLog accepts.
 -}
needsUpdateFromLog :: ContentIdentifierHandle -> Annex (Maybe (Sha, Sha))
needsUpdateFromLog db =
  liftIO (getAnnexBranchTree db) >>= Annex.Branch.updatedFromTree
|
2019-03-07 16:56:40 +00:00
|
|
|
|
|
|
|
{- Updates the database from the differences between two git-annex
 - branch trees, then records the newer tree and flushes the queue.
 -
 - The database should be locked for write when calling this.
 -
 - The returned handle has its flag set to False, so databaseIsEmpty no
 - longer reports the database as empty.
 -}
updateFromLog :: ContentIdentifierHandle -> (Sha, Sha) -> Annex ContentIdentifierHandle
updateFromLog db@(ContentIdentifierHandle h _) (oldtree, currtree) = do
  (changed, cleanup) <- inRepo $ DiffTree.diffTreeRecursive oldtree currtree
  forM_ changed importchange
  void $ liftIO $ cleanup
  liftIO $ recordAnnexBranchTree db currtree >> flushDbQueue db
  return (ContentIdentifierHandle h False)
  where
    -- Only changed files that are content identifier logs of some
    -- key are of interest; anything else is skipped.
    importchange ti = maybe (return ()) importkey $
      extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti))

    -- Records every content identifier that the log lists for the key.
    importkey k = do
      logged <- Log.getContentIdentifiers k
      liftIO $ forM_ logged $ \(rs, cids) ->
        forM_ cids $ \cid ->
          recordContentIdentifier db rs cid k
|