From 1428568554d61db26d5891d99eb6eff637404a5f Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 30 Oct 2018 18:03:03 -0400 Subject: [PATCH] retry when sqlite throws ErrorIO I suspect this may be due to SQLITE_IOERR_SHORT_READ, but have not verified. I was able to reproduce it on Linux after running the test suite in a loop for 1-3 hours until it failed. The WAL mode entry change in 3963c5fcf5b568408719663cb51e870704e8634a may have hidden the problem I was seeing; I have not seen an ErrorIO since then. --- Database/Handle.hs | 79 +++++++++++++------ ...ssive_git_add_produces_sqlite_crashes.mdwn | 2 +- ..._86875c51c5ddad715a3691a95113a48a._comment | 26 ++++++ 3 files changed, 82 insertions(+), 25 deletions(-) create mode 100644 doc/bugs/Massive_git_add_produces_sqlite_crashes/comment_6_86875c51c5ddad715a3691a95113a48a._comment diff --git a/Database/Handle.hs b/Database/Handle.hs index 9f230ad651..4b810f1f87 100644 --- a/Database/Handle.hs +++ b/Database/Handle.hs @@ -31,7 +31,6 @@ import Control.Exception (throwIO, BlockedIndefinitelyOnMVar(..)) import qualified Data.Text as T import Control.Monad.Trans.Resource (runResourceT) import Control.Monad.Logger (runNoLoggingT) -import Data.List import System.IO {- A DbHandle is a reference to a worker thread that communicates with @@ -175,36 +174,68 @@ workerThread db tablename jobs = go liftIO (a (runSqliteRobustly tablename db)) loop --- like runSqlite, but calls settle on the raw sql Connection. +-- Like runSqlite, but more robust. +-- +-- New database connections can sometimes take a while to become usable. +-- This may be due to WAL mode recovering after a crash, or perhaps a bug +-- like described in blob 500f777a6ab6c45ca5f9790e0a63575f8e3cb88f. +-- So, loop until a select succeeds; once one succeeds the connection will +-- stay usable. +-- +-- And sqlite sometimes throws ErrorIO when there's not really an IO problem, +-- but perhaps just a short read(). That's caught and retried several times. runSqliteRobustly :: TableName -> T.Text -> (SqlPersistM a) -> IO a runSqliteRobustly tablename db a = do - conn <- Sqlite.open db - settle conn - runResourceT $ runNoLoggingT $ - withSqlConn (wrapConnection conn) $ - runSqlConn a + conn <- opensettle maxretries + go conn maxretries where - -- Work around a bug in sqlite: New database connections can - -- sometimes take a while to become usable; select statements will - -- fail with ErrorBusy for some time. So, loop until a select - -- succeeds; once one succeeds the connection will stay usable. - -- - -- I reported this bug, but the sqlite developers did not respond. - -- Bug report is archived in blob 500f777a6ab6c45ca5f9790e0a63575f8e3cb88f - -- in git-annex git repo. - settle conn = do - r <- tryNonAsync $ do + maxretries = 100 :: Int + + rethrow msg e = throwIO $ userError $ show e ++ "(" ++ msg ++ ")" + + go conn retries = do + r <- try $ runResourceT $ runNoLoggingT $ + withSqlConn (wrapConnection conn) $ + runSqlConn a + case r of + Right v -> return v + Left ex@(Sqlite.SqliteException { Sqlite.seError = e }) + | e == Sqlite.ErrorIO -> + let retries' = retries - 1 + in if retries' < 1 + then rethrow "after successful open" ex + else go conn retries' + | otherwise -> rethrow "after successful open" ex + + opensettle retries = do + conn <- Sqlite.open db + settle conn retries + + settle conn retries = do + r <- try $ do stmt <- Sqlite.prepare conn nullselect void $ Sqlite.step stmt void $ Sqlite.finalize stmt case r of - Right _ -> return () - Left e -> do - if "ErrorBusy" `isInfixOf` show e - then do - threadDelay 1000 -- 1/1000th second - settle conn - else throwIO e + Right _ -> return conn + Left ex@(Sqlite.SqliteException { Sqlite.seError = e }) + | e == Sqlite.ErrorBusy -> do + -- Wait and retry any number of times; it + -- will stop being busy eventually. + briefdelay + settle conn retries + | e == Sqlite.ErrorIO -> do + -- Could be a real IO error, + -- so don't retry indefinitely. + Sqlite.close conn + briefdelay + let retries' = retries - 1 + if retries' < 1 + then rethrow "while opening database connection" ex + else opensettle retries' + | otherwise -> rethrow "while opening database connection" ex -- This should succeed for any table. nullselect = T.pack $ "SELECT null from " ++ tablename ++ " limit 1" + + briefdelay = threadDelay 1000 -- 1/1000th second diff --git a/doc/bugs/Massive_git_add_produces_sqlite_crashes.mdwn b/doc/bugs/Massive_git_add_produces_sqlite_crashes.mdwn index 3e3d749a19..9e343c251c 100644 --- a/doc/bugs/Massive_git_add_produces_sqlite_crashes.mdwn +++ b/doc/bugs/Massive_git_add_produces_sqlite_crashes.mdwn @@ -59,4 +59,4 @@ Oh yeah, I am still discovering this powerfull git annex tool. In fact, collegues and I are forming a group during the process to exchange about different use cases, encountered problems and help each other. -[[!meta title="sqlite crash on Windows running linux binary via WSL"]] +[[!meta title="v7: intermittent sqlite ErrorIO crash (especially in WSL)"]] diff --git a/doc/bugs/Massive_git_add_produces_sqlite_crashes/comment_6_86875c51c5ddad715a3691a95113a48a._comment b/doc/bugs/Massive_git_add_produces_sqlite_crashes/comment_6_86875c51c5ddad715a3691a95113a48a._comment new file mode 100644 index 0000000000..8b397d86f7 --- /dev/null +++ b/doc/bugs/Massive_git_add_produces_sqlite_crashes/comment_6_86875c51c5ddad715a3691a95113a48a._comment @@ -0,0 +1,26 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 6""" + date="2018-10-30T16:23:21Z" + content=""" +I saw a different "SQLite3 returned ErrorIO while attempting to perform step", +which seems likely to be closely related to this problem. +(The "step" action happens right after the "prepare" action.) + + failed to commit changes to sqlite database: Just SQLite3 returned ErrorIO while attempting to perform step. + CallStack (from HasCallStack): + error, called at ./Database/Handle.hs:116:26 in main:Database.Handle + +The test suite occasionally fails due to this, and it seems always in +`test_lock_v7_force`. Which notably deletes the sqlite database just before +the failure, and so causes it to be re-created. + +Dumping the keys database after such a failure, it is freshly created, +contains the tables but no data has been written to it. + +I've made git-annex catch ErrorIO and retry. Not sure if that fixes the bug, +but it may. + +Please try, if you can, the new git-annex version 7, and see if this bug +still happens. +"""]]