improve sqlite retrying behavior
Avoid hanging when a suspended git-annex process is keeping a sqlite database locked. Sponsored-by: Dartmouth College's Datalad project
This commit is contained in:
parent
3149a1e2fe
commit
cde2e61105
3 changed files with 155 additions and 110 deletions
|
@ -13,6 +13,8 @@ git-annex (10.20221004) UNRELEASED; urgency=medium
|
||||||
* When importing from versioned remotes, fix tracking of the content
|
* When importing from versioned remotes, fix tracking of the content
|
||||||
of deleted files.
|
of deleted files.
|
||||||
* More robust handling of ErrorBusy when writing to sqlite databases.
|
* More robust handling of ErrorBusy when writing to sqlite databases.
|
||||||
|
* Avoid hanging when a suspended git-annex process is keeping a sqlite
|
||||||
|
database locked.
|
||||||
|
|
||||||
-- Joey Hess <id@joeyh.name> Mon, 03 Oct 2022 13:36:42 -0400
|
-- Joey Hess <id@joeyh.name> Mon, 03 Oct 2022 13:36:42 -0400
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
{- Persistent sqlite database handles.
|
{- Persistent sqlite database handles.
|
||||||
-
|
-
|
||||||
- Copyright 2015-2019 Joey Hess <id@joeyh.name>
|
- Copyright 2015-2022 Joey Hess <id@joeyh.name>
|
||||||
-
|
-
|
||||||
- Licensed under the GNU AGPL version 3 or higher.
|
- Licensed under the GNU AGPL version 3 or higher.
|
||||||
-}
|
-}
|
||||||
|
@ -21,6 +21,7 @@ import Utility.Exception
|
||||||
import Utility.FileSystemEncoding
|
import Utility.FileSystemEncoding
|
||||||
import Utility.Debug
|
import Utility.Debug
|
||||||
import Utility.DebugLocks
|
import Utility.DebugLocks
|
||||||
|
import Utility.InodeCache
|
||||||
|
|
||||||
import Database.Persist.Sqlite
|
import Database.Persist.Sqlite
|
||||||
import qualified Database.Sqlite as Sqlite
|
import qualified Database.Sqlite as Sqlite
|
||||||
|
@ -38,7 +39,7 @@ import System.IO
|
||||||
|
|
||||||
{- A DbHandle is a reference to a worker thread that communicates with
|
{- A DbHandle is a reference to a worker thread that communicates with
|
||||||
- the database. It has a MVar which Jobs are submitted to. -}
|
- the database. It has a MVar which Jobs are submitted to. -}
|
||||||
data DbHandle = DbHandle (Async ()) (MVar Job)
|
data DbHandle = DbHandle RawFilePath (Async ()) (MVar Job)
|
||||||
|
|
||||||
{- Name of a table that should exist once the database is initialized. -}
|
{- Name of a table that should exist once the database is initialized. -}
|
||||||
type TableName = String
|
type TableName = String
|
||||||
|
@ -48,17 +49,17 @@ type TableName = String
|
||||||
openDb :: RawFilePath -> TableName -> IO DbHandle
|
openDb :: RawFilePath -> TableName -> IO DbHandle
|
||||||
openDb db tablename = do
|
openDb db tablename = do
|
||||||
jobs <- newEmptyMVar
|
jobs <- newEmptyMVar
|
||||||
worker <- async (workerThread (T.pack (fromRawFilePath db)) tablename jobs)
|
worker <- async (workerThread db tablename jobs)
|
||||||
|
|
||||||
-- work around https://github.com/yesodweb/persistent/issues/474
|
-- work around https://github.com/yesodweb/persistent/issues/474
|
||||||
liftIO $ fileEncoding stderr
|
liftIO $ fileEncoding stderr
|
||||||
|
|
||||||
return $ DbHandle worker jobs
|
return $ DbHandle db worker jobs
|
||||||
|
|
||||||
{- This is optional; when the DbHandle gets garbage collected it will
|
{- This is optional; when the DbHandle gets garbage collected it will
|
||||||
- auto-close. -}
|
- auto-close. -}
|
||||||
closeDb :: DbHandle -> IO ()
|
closeDb :: DbHandle -> IO ()
|
||||||
closeDb (DbHandle worker jobs) = do
|
closeDb (DbHandle _db worker jobs) = do
|
||||||
debugLocks $ putMVar jobs CloseJob
|
debugLocks $ putMVar jobs CloseJob
|
||||||
wait worker
|
wait worker
|
||||||
|
|
||||||
|
@ -73,7 +74,7 @@ closeDb (DbHandle worker jobs) = do
|
||||||
- it is able to run.
|
- it is able to run.
|
||||||
-}
|
-}
|
||||||
queryDb :: DbHandle -> SqlPersistM a -> IO a
|
queryDb :: DbHandle -> SqlPersistM a -> IO a
|
||||||
queryDb (DbHandle _ jobs) a = do
|
queryDb (DbHandle _db _ jobs) a = do
|
||||||
res <- newEmptyMVar
|
res <- newEmptyMVar
|
||||||
putMVar jobs $ QueryJob $
|
putMVar jobs $ QueryJob $
|
||||||
debugLocks $ liftIO . putMVar res =<< tryNonAsync a
|
debugLocks $ liftIO . putMVar res =<< tryNonAsync a
|
||||||
|
@ -83,22 +84,31 @@ queryDb (DbHandle _ jobs) a = do
|
||||||
{- Writes a change to the database.
|
{- Writes a change to the database.
|
||||||
-
|
-
|
||||||
- Writes can fail when another write is happening concurrently.
|
- Writes can fail when another write is happening concurrently.
|
||||||
- So write failures are caught and retried repeatedly.
|
- So write failures are caught and retried.
|
||||||
|
-
|
||||||
|
- Retries repeatedly for up to 60 seconds. Part that point, it continues
|
||||||
|
- retrying only if the database shows signs of being modified by another
|
||||||
|
- process at least once each 30 seconds.
|
||||||
-}
|
-}
|
||||||
commitDb :: DbHandle -> SqlPersistM () -> IO ()
|
commitDb :: DbHandle -> SqlPersistM () -> IO ()
|
||||||
commitDb h wa = robustly (commitDb' h wa)
|
commitDb h@(DbHandle db _ _) wa =
|
||||||
|
robustly (commitDb' h wa) maxretries emptyDatabaseInodeCache
|
||||||
where
|
where
|
||||||
robustly :: IO (Either SomeException ()) -> IO ()
|
robustly a retries ic = do
|
||||||
robustly a = do
|
|
||||||
r <- a
|
r <- a
|
||||||
case r of
|
case r of
|
||||||
Right _ -> return ()
|
Right _ -> return ()
|
||||||
Left _ -> do
|
Left err -> do
|
||||||
threadDelay 100000 -- 1/10th second
|
threadDelay briefdelay
|
||||||
|
retryHelper "write to" err maxretries db retries ic $
|
||||||
robustly a
|
robustly a
|
||||||
|
|
||||||
|
briefdelay = 100000 -- 1/10th second
|
||||||
|
|
||||||
|
maxretries = 300 :: Int -- 30 seconds of briefdelay
|
||||||
|
|
||||||
commitDb' :: DbHandle -> SqlPersistM () -> IO (Either SomeException ())
|
commitDb' :: DbHandle -> SqlPersistM () -> IO (Either SomeException ())
|
||||||
commitDb' (DbHandle _ jobs) a = do
|
commitDb' (DbHandle _ _ jobs) a = do
|
||||||
debug "Database.Handle" "commitDb start"
|
debug "Database.Handle" "commitDb start"
|
||||||
res <- newEmptyMVar
|
res <- newEmptyMVar
|
||||||
putMVar jobs $ ChangeJob $
|
putMVar jobs $ ChangeJob $
|
||||||
|
@ -115,7 +125,7 @@ data Job
|
||||||
| ChangeJob (SqlPersistM ())
|
| ChangeJob (SqlPersistM ())
|
||||||
| CloseJob
|
| CloseJob
|
||||||
|
|
||||||
workerThread :: T.Text -> TableName -> MVar Job -> IO ()
|
workerThread :: RawFilePath -> TableName -> MVar Job -> IO ()
|
||||||
workerThread db tablename jobs = newconn
|
workerThread db tablename jobs = newconn
|
||||||
where
|
where
|
||||||
newconn = do
|
newconn = do
|
||||||
|
@ -143,44 +153,46 @@ workerThread db tablename jobs = newconn
|
||||||
getjob :: IO (Either BlockedIndefinitelyOnMVar Job)
|
getjob :: IO (Either BlockedIndefinitelyOnMVar Job)
|
||||||
getjob = try $ takeMVar jobs
|
getjob = try $ takeMVar jobs
|
||||||
|
|
||||||
-- Like runSqlite, but more robust.
|
{- Like runSqlite, but more robust.
|
||||||
--
|
-
|
||||||
-- New database connections can sometimes take a while to become usable.
|
- New database connections can sometimes take a while to become usable,
|
||||||
-- This may be due to WAL mode recovering after a crash, or perhaps a
|
- and selects will fail with ErrorBusy in the meantime. This may be due to
|
||||||
-- situation like described in blob 500f777a6ab6c45ca5f9790e0a63575f8e3cb88f.
|
- WAL mode recovering after a crash, or a concurrent writer.
|
||||||
-- So, loop until a select succeeds; once one succeeds the connection will
|
- So, wait until a select succeeds; once one succeeds the connection will
|
||||||
-- stay usable.
|
- stay usable.
|
||||||
--
|
-
|
||||||
-- And sqlite sometimes throws ErrorIO when there's not really an IO problem,
|
- Also sqlite sometimes throws ErrorIO when there's not really an IO
|
||||||
-- but perhaps just a short read(). That's caught and retried several times.
|
- problem, but perhaps just a short read(). So also retry on ErrorIO.
|
||||||
runSqliteRobustly :: TableName -> T.Text -> (SqlPersistM a) -> IO a
|
-
|
||||||
|
- Retries repeatedly for up to 60 seconds. Part that point, it continues
|
||||||
|
- retrying only if the database shows signs of being modified by another
|
||||||
|
- process at least once each 30 seconds.
|
||||||
|
-}
|
||||||
|
runSqliteRobustly :: TableName -> RawFilePath -> (SqlPersistM a) -> IO a
|
||||||
runSqliteRobustly tablename db a = do
|
runSqliteRobustly tablename db a = do
|
||||||
conn <- opensettle maxretries
|
conn <- opensettle maxretries emptyDatabaseInodeCache
|
||||||
go conn maxretries
|
go conn maxretries emptyDatabaseInodeCache
|
||||||
where
|
where
|
||||||
maxretries = 100 :: Int
|
go conn retries ic = do
|
||||||
|
|
||||||
rethrow msg e = throwIO $ userError $ show e ++ "(" ++ msg ++ ")"
|
|
||||||
|
|
||||||
go conn retries = do
|
|
||||||
r <- try $ runResourceT $ runNoLoggingT $
|
r <- try $ runResourceT $ runNoLoggingT $
|
||||||
withSqlConnRobustly (wrapConnection conn) $
|
withSqlConnRobustly db (wrapConnection conn) $
|
||||||
runSqlConn a
|
runSqlConn a
|
||||||
case r of
|
case r of
|
||||||
Right v -> return v
|
Right v -> return v
|
||||||
Left ex@(Sqlite.SqliteException { Sqlite.seError = e })
|
Left ex@(Sqlite.SqliteException { Sqlite.seError = e })
|
||||||
| e == Sqlite.ErrorIO ->
|
| e == Sqlite.ErrorIO -> do
|
||||||
let retries' = retries - 1
|
briefdelay
|
||||||
in if retries' < 1
|
retryHelper "access" ex maxretries db retries ic $
|
||||||
then rethrow "after successful open" ex
|
go conn
|
||||||
else go conn retries'
|
| otherwise -> rethrow $ errmsg "after successful open" ex
|
||||||
| otherwise -> rethrow "after successful open" ex
|
|
||||||
|
|
||||||
opensettle retries = do
|
opensettle retries ic = do
|
||||||
conn <- Sqlite.open db
|
conn <- Sqlite.open tdb
|
||||||
settle conn retries
|
settle conn retries ic
|
||||||
|
|
||||||
settle conn retries = do
|
tdb = T.pack (fromRawFilePath db)
|
||||||
|
|
||||||
|
settle conn retries ic = do
|
||||||
r <- try $ do
|
r <- try $ do
|
||||||
stmt <- Sqlite.prepare conn nullselect
|
stmt <- Sqlite.prepare conn nullselect
|
||||||
void $ Sqlite.step stmt
|
void $ Sqlite.step stmt
|
||||||
|
@ -188,27 +200,27 @@ runSqliteRobustly tablename db a = do
|
||||||
case r of
|
case r of
|
||||||
Right _ -> return conn
|
Right _ -> return conn
|
||||||
Left ex@(Sqlite.SqliteException { Sqlite.seError = e })
|
Left ex@(Sqlite.SqliteException { Sqlite.seError = e })
|
||||||
| e == Sqlite.ErrorBusy -> do
|
| e == Sqlite.ErrorBusy || e == Sqlite.ErrorIO -> do
|
||||||
-- Wait and retry any number of times; it
|
when (e == Sqlite.ErrorIO) $
|
||||||
-- will stop being busy eventually.
|
|
||||||
briefdelay
|
|
||||||
settle conn retries
|
|
||||||
| e == Sqlite.ErrorIO -> do
|
|
||||||
-- Could be a real IO error,
|
|
||||||
-- so don't retry indefinitely.
|
|
||||||
Sqlite.close conn
|
Sqlite.close conn
|
||||||
briefdelay
|
briefdelay
|
||||||
let retries' = retries - 1
|
retryHelper "open" ex maxretries db retries ic $
|
||||||
if retries' < 1
|
if e == Sqlite.ErrorIO
|
||||||
then rethrow "while opening database connection" ex
|
then opensettle
|
||||||
else opensettle retries'
|
else settle conn
|
||||||
| otherwise -> rethrow "while opening database connection" ex
|
| otherwise -> rethrow $ errmsg "while opening database connection" ex
|
||||||
|
|
||||||
-- This should succeed for any table.
|
-- This should succeed for any table.
|
||||||
nullselect = T.pack $ "SELECT null from " ++ tablename ++ " limit 1"
|
nullselect = T.pack $ "SELECT null from " ++ tablename ++ " limit 1"
|
||||||
|
|
||||||
briefdelay = threadDelay 1000 -- 1/1000th second
|
briefdelay = threadDelay 1000 -- 1/1000th second
|
||||||
|
|
||||||
|
maxretries = 30000 :: Int -- 30 seconds of briefdelays
|
||||||
|
|
||||||
|
rethrow = throwIO . userError
|
||||||
|
|
||||||
|
errmsg msg e = show e ++ "(" ++ msg ++ ")"
|
||||||
|
|
||||||
-- Like withSqlConn, but more robust.
|
-- Like withSqlConn, but more robust.
|
||||||
withSqlConnRobustly
|
withSqlConnRobustly
|
||||||
:: (MonadUnliftIO m
|
:: (MonadUnliftIO m
|
||||||
|
@ -217,45 +229,99 @@ withSqlConnRobustly
|
||||||
, BaseBackend backend ~ SqlBackend
|
, BaseBackend backend ~ SqlBackend
|
||||||
, BackendCompatible SqlBackend backend
|
, BackendCompatible SqlBackend backend
|
||||||
)
|
)
|
||||||
=> (LogFunc -> IO backend)
|
=> RawFilePath
|
||||||
|
-> (LogFunc -> IO backend)
|
||||||
-> (backend -> m a)
|
-> (backend -> m a)
|
||||||
-> m a
|
-> m a
|
||||||
withSqlConnRobustly open f = do
|
withSqlConnRobustly db open f = do
|
||||||
logFunc <- askLoggerIO
|
logFunc <- askLoggerIO
|
||||||
withRunInIO $ \run -> bracket
|
withRunInIO $ \run -> bracket
|
||||||
(open logFunc)
|
(open logFunc)
|
||||||
closeRobustly
|
(closeRobustly db)
|
||||||
(run . f)
|
(run . f)
|
||||||
|
|
||||||
-- Sqlite can throw ErrorBusy while closing a database; this catches
|
{- Sqlite can throw ErrorBusy while closing a database; this catches
|
||||||
-- the exception and retries.
|
- the exception and retries.
|
||||||
|
-
|
||||||
|
- Retries repeatedly for up to 60 seconds. Part that point, it continues
|
||||||
|
- retrying only if the database shows signs of being modified by another
|
||||||
|
- process at least once each 30 seconds.
|
||||||
|
-}
|
||||||
closeRobustly
|
closeRobustly
|
||||||
:: (IsPersistBackend backend
|
:: (IsPersistBackend backend
|
||||||
, BaseBackend backend ~ SqlBackend
|
, BaseBackend backend ~ SqlBackend
|
||||||
, BackendCompatible SqlBackend backend
|
, BackendCompatible SqlBackend backend
|
||||||
)
|
)
|
||||||
=> backend
|
=> RawFilePath
|
||||||
|
-> backend
|
||||||
-> IO ()
|
-> IO ()
|
||||||
closeRobustly conn = go maxretries briefdelay
|
closeRobustly db conn = go maxretries emptyDatabaseInodeCache
|
||||||
where
|
where
|
||||||
briefdelay = 1000 -- 1/1000th second
|
go retries ic = do
|
||||||
|
|
||||||
-- Try up to 14 times; with the delay doubling each time,
|
|
||||||
-- the maximum delay before giving up is 16 seconds.
|
|
||||||
maxretries = 14 :: Int
|
|
||||||
|
|
||||||
go retries delay = do
|
|
||||||
r <- try $ close' conn
|
r <- try $ close' conn
|
||||||
case r of
|
case r of
|
||||||
Right () -> return ()
|
Right () -> return ()
|
||||||
Left ex@(Sqlite.SqliteException { Sqlite.seError = e })
|
Left ex@(Sqlite.SqliteException { Sqlite.seError = e })
|
||||||
| e == Sqlite.ErrorBusy -> do
|
| e == Sqlite.ErrorBusy -> do
|
||||||
threadDelay delay
|
threadDelay briefdelay
|
||||||
let delay' = delay * 2
|
retryHelper "close" ex maxretries db retries ic go
|
||||||
|
| otherwise -> rethrow $ errmsg "while closing database connection" ex
|
||||||
|
|
||||||
|
briefdelay = 1000 -- 1/1000th second
|
||||||
|
|
||||||
|
maxretries = 30000 :: Int -- 30 seconds of briefdelays
|
||||||
|
|
||||||
|
rethrow = throwIO . userError
|
||||||
|
|
||||||
|
errmsg msg e = show e ++ "(" ++ msg ++ ")"
|
||||||
|
|
||||||
|
{- Retries a sqlite action repeatedly, but not forever. Detects situations
|
||||||
|
- when another git-annex process is suspended and has the database locked,
|
||||||
|
- and eventually gives up. The retries is the current number of retries
|
||||||
|
- that are left. The maxretries is how many retries to make each time
|
||||||
|
- the database is seen to have been modified by some other process.
|
||||||
|
-}
|
||||||
|
retryHelper
|
||||||
|
:: Show err
|
||||||
|
=> String
|
||||||
|
-> err
|
||||||
|
-> Int
|
||||||
|
-> RawFilePath
|
||||||
|
-> Int
|
||||||
|
-> DatabaseInodeCache
|
||||||
|
-> (Int -> DatabaseInodeCache -> IO a)
|
||||||
|
-> IO a
|
||||||
|
retryHelper action err maxretries db retries ic a = do
|
||||||
let retries' = retries - 1
|
let retries' = retries - 1
|
||||||
if retries' < 1
|
if retries' < 1
|
||||||
then rethrow "while closing database connection" ex
|
then do
|
||||||
else go retries' delay'
|
ic' <- getDatabaseInodeCache db
|
||||||
| otherwise -> rethrow "while closing database connection" ex
|
if isDatabaseModified ic ic'
|
||||||
|
then a maxretries ic'
|
||||||
|
else giveup (databaseAccessStalledMsg action db err)
|
||||||
|
else a retries' ic
|
||||||
|
|
||||||
rethrow msg e = throwIO $ userError $ show e ++ "(" ++ msg ++ ")"
|
databaseAccessStalledMsg :: Show err => String -> RawFilePath -> err -> String
|
||||||
|
databaseAccessStalledMsg action db err =
|
||||||
|
"Repeatedly unable to " ++ action ++ " sqlite database " ++ fromRawFilePath db
|
||||||
|
++ ": " ++ show err ++ ". "
|
||||||
|
++ "Perhaps another git-annex process is suspended and is "
|
||||||
|
++ "keeping this database locked?"
|
||||||
|
|
||||||
|
data DatabaseInodeCache = DatabaseInodeCache (Maybe InodeCache) (Maybe InodeCache)
|
||||||
|
|
||||||
|
emptyDatabaseInodeCache :: DatabaseInodeCache
|
||||||
|
emptyDatabaseInodeCache = DatabaseInodeCache Nothing Nothing
|
||||||
|
|
||||||
|
getDatabaseInodeCache :: RawFilePath -> IO DatabaseInodeCache
|
||||||
|
getDatabaseInodeCache db = DatabaseInodeCache
|
||||||
|
<$> genInodeCache db noTSDelta
|
||||||
|
<*> genInodeCache (db <> "-wal") noTSDelta
|
||||||
|
|
||||||
|
isDatabaseModified :: DatabaseInodeCache -> DatabaseInodeCache -> Bool
|
||||||
|
isDatabaseModified (DatabaseInodeCache a1 b1) (DatabaseInodeCache a2 b2) =
|
||||||
|
ismodified a1 a2 || ismodified b1 b2
|
||||||
|
where
|
||||||
|
ismodified (Just a) (Just b) = not (compareStrong a b)
|
||||||
|
ismodified Nothing Nothing = False
|
||||||
|
ismodified _ _ = True
|
||||||
|
|
|
@ -3,34 +3,11 @@
|
||||||
subject="""comment 27"""
|
subject="""comment 27"""
|
||||||
date="2022-10-17T18:49:47Z"
|
date="2022-10-17T18:49:47Z"
|
||||||
content="""
|
content="""
|
||||||
[[todo/withExclusiveLock_blocking_issue]] does not have to be solved for
|
I've made it retry as long as necessary on ErrorBusy, while also noticing
|
||||||
every other lock in git-annex first. Since the sqlite database lock would
|
when another process is suspended and has the sqlite database locked,
|
||||||
be a new lock file, it could use the mtime update method described in there
|
and avoiding retrying forever in that situation.
|
||||||
without backwards compatibility issues.
|
|
||||||
|
|
||||||
ErrorBusy can also occur when opening a new database connection for read,
|
This seems to be as far as I can take this bug report, I don't know
|
||||||
but it retries that as often as necessary. Which does mean that suspending
|
100% for sure if I've fixed it, but git-annex's behavior should certainly
|
||||||
git-annex at just the wrong time can already cause other git-annex
|
be improved.
|
||||||
processes to stall forever waiting to read from the database.
|
|
||||||
|
|
||||||
So, in a way, it would be ok for write to also retry each time it gets
|
|
||||||
ErrorBusy, rather than the current limited number of retries. If that does
|
|
||||||
cause git-annex to block when another git-annex process is suspended, it
|
|
||||||
would not be a new behavior.
|
|
||||||
|
|
||||||
Also, the mtime file method described in
|
|
||||||
[[todo/withExclusiveLock_blocking_issue]] could be used without a lock file
|
|
||||||
in order to detect when a suspended process is causing ErrorBusy. And can
|
|
||||||
avoid that situation for both writes and reads.
|
|
||||||
|
|
||||||
So, plan:
|
|
||||||
|
|
||||||
1. Retry forever on ErrorBusy when writing to sqlite database.
|
|
||||||
(I've made this change now... So I think probably this bug can't
|
|
||||||
occur any longer.)
|
|
||||||
2. While running opensettle and ChangeJob, have a background thread that
|
|
||||||
periodically updates a mtime file.
|
|
||||||
3. If ErrorBusy is received repeatedly for some amount of time,
|
|
||||||
check the mtime file. If it's not being updated, give up, since
|
|
||||||
a suspended git-annex process apparently has the sqlite database locked.
|
|
||||||
"""]]
|
"""]]
|
||||||
|
|
Loading…
Reference in a new issue