git-annex/Database/ContentIdentifier.hs
Joey Hess 9828f45d85
add RemoteStateHandle
This solves the problem of sameas remotes trampling over per-remote
state. Used for:

* per-remote state, of course
* per-remote metadata, also of course
* per-remote content identifiers, because two remote implementations
  could in theory generate the same content identifier for two different
  peices of content

While chunk logs are per-remote data, they don't use this, because the
number and size of chunks stored is a common property across sameas
remotes.

External special remote had a complication, where it was theoretically
possible for a remote to send SETSTATE or GETSTATE during INITREMOTE or
EXPORTSUPPORTED. Since the uuid of the remote is typically generate in
Remote.setup, it would only be possible to pass a Maybe
RemoteStateHandle into it, and it would otherwise have to construct its
own. Rather than go that route, I decided to send an ERROR in this case.
It seems unlikely that any existing external special remote will be
affected. They would have to make up a git-annex key, and set state for
some reason during INITREMOTE. I can imagine such a hack, but it doesn't
seem worth complicating the code in such an ugly way to support it.

Unfortunately, both TestRemote and Annex.Import needed the Remote
to have a new field added that holds its RemoteStateHandle.
2019-10-14 13:51:42 -04:00

154 lines
5.1 KiB
Haskell

{- Sqlite database of ContentIdentifiers imported from special remotes.
-
- Copyright 2019 Joey Hess <id@joeyh.name>
-:
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE QuasiQuotes, TypeFamilies, TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts, EmptyDataDecls #-}
{-# LANGUAGE MultiParamTypeClasses, GeneralizedNewtypeDeriving #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE UndecidableInstances #-}
module Database.ContentIdentifier (
ContentIdentifierHandle,
openDb,
closeDb,
flushDbQueue,
recordContentIdentifier,
getContentIdentifiers,
getContentIdentifierKeys,
recordAnnexBranchTree,
getAnnexBranchTree,
needsUpdateFromLog,
updateFromLog,
ContentIdentifiersId,
AnnexBranchId,
) where
import Database.Types
import qualified Database.Queue as H
import Database.Init
import Annex.Locations
import Annex.Common hiding (delete)
import qualified Annex.Branch
import Types.Import
import Types.RemoteState
import Git.Types
import Git.Sha
import Git.FilePath
import qualified Git.Ref
import qualified Git.DiffTree as DiffTree
import Logs
import qualified Logs.ContentIdentifier as Log
import Database.Persist.Sql hiding (Key)
import Database.Persist.TH
data ContentIdentifierHandle = ContentIdentifierHandle H.DbQueue
share [mkPersist sqlSettings, mkMigrate "migrateContentIdentifier"] [persistLowerCase|
ContentIdentifiers
remote UUID
cid ContentIdentifier
key IKey
-- The last git-annex branch tree sha that was used to update
-- ContentIdentifiers
AnnexBranch
tree SRef
UniqueTree tree
|]
{- Opens the database, creating it if it doesn't exist yet.
-
- Only a single process should write to the database at a time, so guard
- any writes with the gitAnnexContentIdentifierLock.
-}
openDb :: Annex ContentIdentifierHandle
openDb = do
dbdir <- fromRepo gitAnnexContentIdentifierDbDir
let db = dbdir </> "db"
unlessM (liftIO $ doesFileExist db) $ do
initDb db $ void $
runMigrationSilent migrateContentIdentifier
h <- liftIO $ H.openDbQueue H.SingleWriter db "content_identifiers"
return $ ContentIdentifierHandle h
closeDb :: ContentIdentifierHandle -> Annex ()
closeDb (ContentIdentifierHandle h) = liftIO $ H.closeDbQueue h
queueDb :: ContentIdentifierHandle -> SqlPersistM () -> IO ()
queueDb (ContentIdentifierHandle h) = H.queueDb h checkcommit
where
-- commit queue after 1000 changes
checkcommit sz _lastcommittime
| sz > 1000 = return True
| otherwise = return False
flushDbQueue :: ContentIdentifierHandle -> IO ()
flushDbQueue (ContentIdentifierHandle h) = H.flushDbQueue h
-- Be sure to also update the git-annex branch when using this.
recordContentIdentifier :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> Key -> IO ()
recordContentIdentifier h (RemoteStateHandle u) cid k = queueDb h $ do
void $ insert_ $ ContentIdentifiers u cid (toIKey k)
getContentIdentifiers :: ContentIdentifierHandle -> RemoteStateHandle -> Key -> IO [ContentIdentifier]
getContentIdentifiers (ContentIdentifierHandle h) (RemoteStateHandle u) k =
H.queryDbQueue h $ do
l <- selectList
[ ContentIdentifiersKey ==. toIKey k
, ContentIdentifiersRemote ==. u
] []
return $ map (contentIdentifiersCid . entityVal) l
getContentIdentifierKeys :: ContentIdentifierHandle -> RemoteStateHandle -> ContentIdentifier -> IO [Key]
getContentIdentifierKeys (ContentIdentifierHandle h) (RemoteStateHandle u) cid =
H.queryDbQueue h $ do
l <- selectList
[ ContentIdentifiersCid ==. cid
, ContentIdentifiersRemote ==. u
] []
return $ map (fromIKey . contentIdentifiersKey . entityVal) l
recordAnnexBranchTree :: ContentIdentifierHandle -> Sha -> IO ()
recordAnnexBranchTree h s = queueDb h $ do
deleteWhere ([] :: [Filter AnnexBranch])
void $ insertUnique $ AnnexBranch $ toSRef s
getAnnexBranchTree :: ContentIdentifierHandle -> IO Sha
getAnnexBranchTree (ContentIdentifierHandle h) = H.queryDbQueue h $ do
l <- selectList ([] :: [Filter AnnexBranch]) []
case l of
(s:[]) -> return $ fromSRef $ annexBranchTree $ entityVal s
_ -> return emptyTree
{- Check if the git-annex branch has been updated and the database needs
- to be updated with any new content identifiers in it. -}
needsUpdateFromLog :: ContentIdentifierHandle -> Annex (Maybe (Sha, Sha))
needsUpdateFromLog db = do
oldtree <- liftIO $ getAnnexBranchTree db
inRepo (Git.Ref.tree Annex.Branch.fullname) >>= \case
Just currtree | currtree /= oldtree ->
return $ Just (oldtree, currtree)
_ -> return Nothing
{- The database should be locked for write when calling this. -}
updateFromLog :: ContentIdentifierHandle -> (Sha, Sha) -> Annex ()
updateFromLog db (oldtree, currtree) = do
(l, cleanup) <- inRepo $
DiffTree.diffTreeRecursive oldtree currtree
mapM_ go l
void $ liftIO $ cleanup
liftIO $ do
recordAnnexBranchTree db currtree
flushDbQueue db
where
go ti = case extLogFileKey remoteContentIdentifierExt (getTopFilePath (DiffTree.file ti)) of
Nothing -> return ()
Just k -> do
l <- Log.getContentIdentifiers k
liftIO $ forM_ l $ \(rs, cids) ->
forM_ cids $ \cid ->
recordContentIdentifier db rs cid k