2019-02-20 20:59:10 +00:00
|
|
|
{- Sqlite database of ContentIdentifiers imported from special remotes.
|
|
|
|
-
|
2023-06-02 17:30:30 +00:00
|
|
|
- Copyright 2019-2023 Joey Hess <id@joeyh.name>
|
2019-02-20 20:59:10 +00:00
|
|
|
-
|
|
|
|
- Licensed under the GNU AGPL version 3 or higher.
|
|
|
|
-}
|
|
|
|
|
2020-02-04 17:53:00 +00:00
|
|
|
{-# LANGUAGE CPP #-}
|
2023-08-02 13:47:42 +00:00
|
|
|
{-# LANGUAGE QuasiQuotes, TypeFamilies, TypeOperators, TemplateHaskell #-}
|
2019-04-09 23:58:24 +00:00
|
|
|
{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts, EmptyDataDecls #-}
|
2019-02-20 20:59:10 +00:00
|
|
|
{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-}
|
|
|
|
{-# LANGUAGE RankNTypes #-}
|
2020-11-07 18:09:17 +00:00
|
|
|
{-# LANGUAGE DataKinds, FlexibleInstances #-}
|
2019-07-30 16:49:37 +00:00
|
|
|
{-# LANGUAGE UndecidableInstances #-}
|
2020-02-04 17:53:00 +00:00
|
|
|
#if MIN_VERSION_persistent_template(2,8,0)
|
2020-02-04 16:03:30 +00:00
|
|
|
{-# LANGUAGE DerivingStrategies #-}
|
|
|
|
{-# LANGUAGE StandaloneDeriving #-}
|
2020-02-04 17:53:00 +00:00
|
|
|
#endif
|
2019-02-20 20:59:10 +00:00
|
|
|
|
|
|
|
module Database.ContentIdentifier (
|
|
|
|
ContentIdentifierHandle,
|
2023-06-02 17:30:30 +00:00
|
|
|
databaseIsEmpty,
|
2019-02-20 20:59:10 +00:00
|
|
|
openDb,
|
|
|
|
closeDb,
|
|
|
|
flushDbQueue,
|
|
|
|
recordContentIdentifier,
|
2019-03-04 21:50:41 +00:00
|
|
|
getContentIdentifiers,
|
2019-02-20 20:59:10 +00:00
|
|
|
getContentIdentifierKeys,
|
2019-03-06 22:04:30 +00:00
|
|
|
recordAnnexBranchTree,
|
|
|
|
getAnnexBranchTree,
|
2019-03-07 16:56:40 +00:00
|
|
|
needsUpdateFromLog,
|
|
|
|
updateFromLog,
|
2019-02-20 20:59:10 +00:00
|
|
|
ContentIdentifiersId,
|
2019-03-06 22:04:30 +00:00
|
|
|
AnnexBranchId,
|
2019-02-20 20:59:10 +00:00
|
|
|
) where
|
|
|
|
|
|
|
|
import Database.Types
|
|
|
|
import qualified Database.Queue as H
|
|
|
|
import Database.Init
|
2023-03-31 18:34:18 +00:00
|
|
|
import Database.Utility
|
2019-02-20 20:59:10 +00:00
|
|
|
import Annex.Locations
|
|
|
|
import Annex.Common hiding (delete)
|
2019-03-07 16:56:40 +00:00
|
|
|
import qualified Annex.Branch
|
2019-02-21 17:38:27 +00:00
|
|
|
import Types.Import
|
add RemoteStateHandle
This solves the problem of sameas remotes trampling over per-remote
state. Used for:
* per-remote state, of course
* per-remote metadata, also of course
* per-remote content identifiers, because two remote implementations
could in theory generate the same content identifier for two different
peices of content
While chunk logs are per-remote data, they don't use this, because the
number and size of chunks stored is a common property across sameas
remotes.
External special remote had a complication, where it was theoretically
possible for a remote to send SETSTATE or GETSTATE during INITREMOTE or
EXPORTSUPPORTED. Since the uuid of the remote is typically generate in
Remote.setup, it would only be possible to pass a Maybe
RemoteStateHandle into it, and it would otherwise have to construct its
own. Rather than go that route, I decided to send an ERROR in this case.
It seems unlikely that any existing external special remote will be
affected. They would have to make up a git-annex key, and set state for
some reason during INITREMOTE. I can imagine such a hack, but it doesn't
seem worth complicating the code in such an ugly way to support it.
Unfortunately, both TestRemote and Annex.Import needed the Remote
to have a new field added that holds its RemoteStateHandle.
2019-10-14 16:33:27 +00:00
|
|
|
import Types.RemoteState
|
2019-03-06 22:04:30 +00:00
|
|
|
import Git.Types
|
|
|
|
import Git.Sha
|
2019-03-07 16:56:40 +00:00
|
|
|
import Git.FilePath
|
|
|
|
import qualified Git.Ref
|
|
|
|
import qualified Git.DiffTree as DiffTree
|
|
|
|
import Logs
|
|
|
|
import qualified Logs.ContentIdentifier as Log
|
2020-11-05 22:45:37 +00:00
|
|
|
import qualified Utility.RawFilePath as R
|
2019-02-20 20:59:10 +00:00
|
|
|
|
|
|
|
import Database.Persist.Sql hiding (Key)
|
|
|
|
import Database.Persist.TH
|
2020-12-23 17:58:01 +00:00
|
|
|
import Database.Persist.Sqlite (runSqlite)
|
2020-11-05 22:45:37 +00:00
|
|
|
import qualified System.FilePath.ByteString as P
|
2020-12-23 17:58:01 +00:00
|
|
|
import qualified Data.Text as T
|
2019-02-20 20:59:10 +00:00
|
|
|
|
2023-06-02 17:30:30 +00:00
|
|
|
{- Handle to the content identifier database.
 -
 - Holds the database queue, and a Bool that openDb sets to True when
 - the database did not exist yet and had to be created. The Bool is what
 - databaseIsEmpty reports.
 -}
data ContentIdentifierHandle = ContentIdentifierHandle H.DbQueue Bool
|
|
|
|
|
|
|
|
{- Reports the emptiness flag stored in the handle.
 -
 - openDb sets the flag when it had to create the database, and
 - updateFromLog returns a handle with the flag cleared.
 -}
databaseIsEmpty :: ContentIdentifierHandle -> Bool
databaseIsEmpty (ContentIdentifierHandle _ isempty) = isempty
|
2019-02-20 20:59:10 +00:00
|
|
|
|
2023-06-09 19:12:33 +00:00
|
|
|
-- Note on indexes: ContentIndentifiersKeyRemoteCidIndex etc are really
|
|
|
|
-- uniqueness constraints, which cause sqlite to automatically add indexes.
|
|
|
|
-- So when adding indexes, have to take care to only add ones that work as
|
|
|
|
-- uniqueness constraints. (Unfortunately persistent does not support indexes
|
|
|
|
-- that are not uniqueness constraints;
|
|
|
|
-- https://github.com/yesodweb/persistent/issues/109)
|
|
|
|
--
|
|
|
|
-- ContentIndentifiersKeyRemoteCidIndex speeds up queries like
|
|
|
|
-- getContentIdentifiers, but it is not used for
|
|
|
|
-- getContentIdentifierKeys. ContentIndentifiersCidRemoteKeyIndex was
|
|
|
|
-- added to speed that up.
|
2019-02-20 20:59:10 +00:00
|
|
|
-- Database schema. The ContentIdentifiers table maps (remote, cid, key)
-- triples; its two named constraints are uniqueness constraints that
-- double as indexes. The AnnexBranch table holds a tree sha.
share
  [ mkPersist sqlSettings
  , mkMigrate "migrateContentIdentifier"
  ]
  [persistLowerCase|
ContentIdentifiers
  remote UUID
  cid ContentIdentifier
  key Key
  ContentIndentifiersKeyRemoteCidIndex key remote cid
  ContentIndentifiersCidRemoteKeyIndex cid remote key
-- The git-annex branch tree sha that ContentIdentifiers was most
-- recently updated from.
AnnexBranch
  tree SSha
  UniqueTree tree
|]
|
|
|
|
|
|
|
|
{- Opens the database, creating it when it does not exist yet.
 -
 - The returned handle remembers whether the database had to be created.
 -
 - Only a single process should write to the database at a time, so guard
 - any writes with the gitAnnexContentIdentifierLock.
 -}
openDb :: Annex ContentIdentifierHandle
openDb = do
  dbfile <- (P.</> "db") <$> calcRepo' gitAnnexContentIdentifierDbDir
  fresh <- liftIO (not <$> R.doesPathExist dbfile)
  if fresh
    then initDb dbfile migrate
    -- An existing database may come from an old version, which had
    -- buggy and suboptimal uniqueness constraints, so migrate it.
    else liftIO (runSqlite (T.pack (fromRawFilePath dbfile)) migrate)
  queue <- liftIO (H.openDbQueue dbfile "content_identifiers")
  return (ContentIdentifierHandle queue fresh)
  where
    -- The same migration is run for new and pre-existing databases.
    migrate = void (runMigrationSilent migrateContentIdentifier)
|
2019-02-20 20:59:10 +00:00
|
|
|
|
|
|
|
{- Closes the database queue held by the handle. -}
closeDb :: ContentIdentifierHandle -> Annex ()
closeDb (ContentIdentifierHandle queue _) = liftIO (H.closeDbQueue queue)
|
2019-02-20 20:59:10 +00:00
|
|
|
|
|
|
|
{- Queues a database action on the handle's queue.
 -
 - The commit check passed to H.queueDb asks for a commit once the
 - reported queue size is above 1000.
 -}
queueDb :: ContentIdentifierHandle -> SqlPersistM () -> IO ()
queueDb (ContentIdentifierHandle queue _) action =
  H.queueDb queue commitWhenLarge action
  where
    -- The last commit time is not consulted.
    commitWhenLarge queuesize _lastcommittime = return (queuesize > 1000)
|
|
|
|
|
|
|
|
{- Flushes the database queue held by the handle. -}
flushDbQueue :: ContentIdentifierHandle -> IO ()
flushDbQueue (ContentIdentifierHandle queue _) = H.flushDbQueue queue
|
2019-02-20 20:59:10 +00:00
|
|
|
|
|
|
|
{- Queues insertion of a (remote, content identifier, key) row.
 -
 - The remote is the UUID wrapped by the RemoteStateHandle.
 -
 - Be sure to also update the git-annex branch when using this.
 -}
recordContentIdentifier :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> Key -> IO ()
recordContentIdentifier h (RemoteStateHandle u) cid k =
  queueDb h (void (insertUniqueFast row))
  where
    row = ContentIdentifiers u cid k
|
2019-02-20 20:59:10 +00:00
|
|
|
|
add RemoteStateHandle
This solves the problem of sameas remotes trampling over per-remote
state. Used for:
* per-remote state, of course
* per-remote metadata, also of course
* per-remote content identifiers, because two remote implementations
could in theory generate the same content identifier for two different
peices of content
While chunk logs are per-remote data, they don't use this, because the
number and size of chunks stored is a common property across sameas
remotes.
External special remote had a complication, where it was theoretically
possible for a remote to send SETSTATE or GETSTATE during INITREMOTE or
EXPORTSUPPORTED. Since the uuid of the remote is typically generate in
Remote.setup, it would only be possible to pass a Maybe
RemoteStateHandle into it, and it would otherwise have to construct its
own. Rather than go that route, I decided to send an ERROR in this case.
It seems unlikely that any existing external special remote will be
affected. They would have to make up a git-annex key, and set state for
some reason during INITREMOTE. I can imagine such a hack, but it doesn't
seem worth complicating the code in such an ugly way to support it.
Unfortunately, both TestRemote and Annex.Import needed the Remote
to have a new field added that holds its RemoteStateHandle.
2019-10-14 16:33:27 +00:00
|
|
|
{- Looks up the content identifiers recorded for a key on the remote
 - whose UUID is wrapped by the RemoteStateHandle. -}
getContentIdentifiers :: ContentIdentifierHandle -> RemoteStateHandle -> Key -> IO [ContentIdentifier]
getContentIdentifiers (ContentIdentifierHandle queue _) (RemoteStateHandle u) k =
  H.queryDbQueue queue $
    map (contentIdentifiersCid . entityVal) <$> selectList matching []
  where
    matching =
      [ ContentIdentifiersKey ==. k
      , ContentIdentifiersRemote ==. u
      ]
|
2019-03-04 20:48:07 +00:00
|
|
|
|
add RemoteStateHandle
This solves the problem of sameas remotes trampling over per-remote
state. Used for:
* per-remote state, of course
* per-remote metadata, also of course
* per-remote content identifiers, because two remote implementations
could in theory generate the same content identifier for two different
peices of content
While chunk logs are per-remote data, they don't use this, because the
number and size of chunks stored is a common property across sameas
remotes.
External special remote had a complication, where it was theoretically
possible for a remote to send SETSTATE or GETSTATE during INITREMOTE or
EXPORTSUPPORTED. Since the uuid of the remote is typically generate in
Remote.setup, it would only be possible to pass a Maybe
RemoteStateHandle into it, and it would otherwise have to construct its
own. Rather than go that route, I decided to send an ERROR in this case.
It seems unlikely that any existing external special remote will be
affected. They would have to make up a git-annex key, and set state for
some reason during INITREMOTE. I can imagine such a hack, but it doesn't
seem worth complicating the code in such an ugly way to support it.
Unfortunately, both TestRemote and Annex.Import needed the Remote
to have a new field added that holds its RemoteStateHandle.
2019-10-14 16:33:27 +00:00
|
|
|
{- Looks up the keys recorded for a content identifier on the remote
 - whose UUID is wrapped by the RemoteStateHandle. -}
getContentIdentifierKeys :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> IO [Key]
getContentIdentifierKeys (ContentIdentifierHandle queue _) (RemoteStateHandle u) cid =
  H.queryDbQueue queue $
    map (contentIdentifiersKey . entityVal) <$> selectList matching []
  where
    matching =
      [ ContentIdentifiersCid ==. cid
      , ContentIdentifiersRemote ==. u
      ]
|
2019-03-06 22:04:30 +00:00
|
|
|
|
|
|
|
{- Queues replacement of the recorded git-annex branch tree sha:
 - every existing AnnexBranch row is deleted before the new one is
 - inserted. -}
recordAnnexBranchTree :: ContentIdentifierHandle -> Sha -> IO ()
recordAnnexBranchTree h s = queueDb h $
  deleteWhere everyRow >> void (insertUniqueFast newRow)
  where
    -- An empty filter list matches all rows of the table.
    everyRow :: [Filter AnnexBranch]
    everyRow = []
    newRow = AnnexBranch (toSSha s)
|
2019-03-06 22:04:30 +00:00
|
|
|
|
|
|
|
{- Gets the recorded git-annex branch tree sha.
 -
 - When the AnnexBranch table does not contain exactly one row,
 - emptyTree is returned instead.
 -}
getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha
getAnnexBranchTree (ContentIdentifierHandle queue _) = H.queryDbQueue queue $
  recordedTree <$> selectList ([] :: [Filter AnnexBranch]) []
  where
    recordedTree [row] = fromSSha (annexBranchTree (entityVal row))
    recordedTree _ = emptyTree
|
2019-03-07 16:56:40 +00:00
|
|
|
|
|
|
|
{- Checks whether the git-annex branch tree differs from the tree
 - recorded in the database, in which case the database needs to be
 - updated with any new content identifiers from the branch.
 -
 - Returns Just (recorded tree, current tree) when they differ, and
 - Nothing when they match or the current tree cannot be determined.
 -}
needsUpdateFromLog :: ContentIdentifierHandle -> Annex (Maybe (Sha, Sha))
needsUpdateFromLog db = do
  recorded <- liftIO (getAnnexBranchTree db)
  current <- inRepo (Git.Ref.tree Annex.Branch.fullname)
  return $ case current of
    Just currtree
      | currtree /= recorded -> Just (recorded, currtree)
    _ -> Nothing
|
|
|
|
|
|
|
|
{- Updates the database from the changes between two git-annex branch
 - trees, then records the new tree and flushes the queue.
 -
 - For each changed file that extLogFileKey recognises as a content
 - identifier log, every logged content identifier is recorded for its
 - key.
 -
 - The returned handle has its emptiness flag cleared.
 -
 - The database should be locked for write when calling this.
 -}
updateFromLog :: ContentIdentifierHandle -> (Sha, Sha) -> Annex ContentIdentifierHandle
updateFromLog db@(ContentIdentifierHandle queue _) (oldtree, currtree) = do
  (changes, cleanup) <- inRepo (DiffTree.diffTreeRecursive oldtree currtree)
  forM_ changes importChange
  -- NOTE(review): cleanup is skipped if importing a change throws.
  void (liftIO cleanup)
  liftIO $ recordAnnexBranchTree db currtree >> flushDbQueue db
  return (ContentIdentifierHandle queue False)
  where
    -- Changed files that are not content identifier logs are ignored.
    importChange change = maybe (return ()) importKey (changedKey change)

    changedKey =
      extLogFileKey remoteContentIdentifierExt
        . getTopFilePath
        . DiffTree.file

    importKey k = do
      logged <- Log.getContentIdentifiers k
      liftIO $ forM_ logged $ \(rs, cids) ->
        mapM_ (\cid -> recordContentIdentifier db rs cid k) cids
|