avoid STM deadlock

When all worker threads are running and enteringStage is called,
it waits for an idle slot. If all off the other threads then call it in
turn, a deadlock occurrs.

This is the same problem I didn't actually fix in
5a9842d7ed.

Fixed by doing two separate STM transactions, the first replaces its
active thread with an idle thread, and the second waits for another idle
thread. That guarantees there will eventually be an idle thread to find.

The changes to WorkerPool were necessary because it can't add an idle
thread containing the Annex state and go on to run an action using that
same state, so I had to remove the Annex state from IdleWorker.
This commit is contained in:
Joey Hess 2019-06-19 18:07:30 -04:00
parent a0d3a699e2
commit 37d505dd6b
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 91 additions and 59 deletions

View file

@ -84,11 +84,6 @@ stopCoProcesses = do
- -
- Also a noop if the stage is not one of the stages that the worker pool - Also a noop if the stage is not one of the stages that the worker pool
- uses. - uses.
-
- The pool needs to continue to contain the same number of worker threads
- for each stage. So, an idle worker with the desired stage is found in
- the pool (waiting if necessary for one to become idle), and the stages
- of it and the current thread are swapped.
-} -}
enteringStage :: WorkerStage -> Annex a -> Annex a enteringStage :: WorkerStage -> Annex a -> Annex a
enteringStage newstage a = Annex.getState Annex.workers >>= \case enteringStage newstage a = Annex.getState Annex.workers >>= \case
@ -99,26 +94,49 @@ enteringStage newstage a = Annex.getState Annex.workers >>= \case
let restore = maybe noop (void . changeStageTo mytid tv) let restore = maybe noop (void . changeStageTo mytid tv)
bracket set restore (const a) bracket set restore (const a)
{- This needs to leave the WorkerPool with the same number of
- idle and active threads, and with the same number of threads for each
- WorkerStage. So, all it can do is swap the WorkerStage of our thread's
- ActiveWorker with an IdleWorker.
-
- Must avoid a deadlock if all worker threads end up here at the same
- time, or if there are no suitable IdleWorkers left. So we first
- replace our ActiveWorker with an IdleWorker in the pool, to allow
- some other thread to use it, before waiting for a suitable IdleWorker
- for us to use.
-
- Note that the spareVals in the WorkerPool does not get anything added to
- it when adding the IdleWorker, so there will for a while be more IdleWorkers
- in the pool than spareVals. That does not prevent other threads that call
- this from using them though, so it's fine.
-}
changeStageTo :: ThreadId -> TMVar (WorkerPool AnnexState) -> WorkerStage -> Annex (Maybe WorkerStage) changeStageTo :: ThreadId -> TMVar (WorkerPool AnnexState) -> WorkerStage -> Annex (Maybe WorkerStage)
changeStageTo mytid tv newstage = liftIO $ atomically $ do changeStageTo mytid tv newstage = liftIO $
pool <- takeTMVar tv replaceidle >>= maybe (return Nothing) waitidle
case pool of where
WorkerPool usedstages _ replaceidle = atomically $ do
| memberStage newstage usedstages -> pool <- takeTMVar tv
case removeThreadIdWorkerPool mytid pool of if memberStage newstage (usedStages pool)
Just ((myaid, oldstage), WorkerPool usedstages' l) then case removeThreadIdWorkerPool mytid pool of
| oldstage /= newstage -> do Just ((myaid, oldstage), pool')
(idlest, restpool) <- waitWorkerSlot usedstages' newstage l | oldstage /= newstage -> do
let pool' = addWorkerPool (IdleWorker idlest oldstage) $ putTMVar tv $
addWorkerPool (ActiveWorker myaid newstage) restpool addWorkerPool (IdleWorker oldstage) pool'
putTMVar tv pool' return $ Just (myaid, oldstage)
return (Just oldstage) | otherwise -> do
_ -> do
putTMVar tv pool putTMVar tv pool
return Nothing return Nothing
_ -> do _ -> do
putTMVar tv pool putTMVar tv pool
return Nothing return Nothing
else do
putTMVar tv pool
return Nothing
waitidle (myaid, oldstage) = atomically $ do
pool <- waitIdleWorkerSlot newstage =<< takeTMVar tv
putTMVar tv $ addWorkerPool (ActiveWorker myaid newstage) pool
return (Just oldstage)
-- | Waits until there's an idle worker in the worker pool -- | Waits until there's an idle worker in the worker pool
-- for its initial stage, removes it from the pool, and returns its state. -- for its initial stage, removes it from the pool, and returns its state.
@ -126,18 +144,24 @@ changeStageTo mytid tv newstage = liftIO $ atomically $ do
-- If the worker pool is not already allocated, returns Nothing. -- If the worker pool is not already allocated, returns Nothing.
waitInitialWorkerSlot :: TMVar (WorkerPool Annex.AnnexState) -> STM (Maybe (Annex.AnnexState, WorkerStage)) waitInitialWorkerSlot :: TMVar (WorkerPool Annex.AnnexState) -> STM (Maybe (Annex.AnnexState, WorkerStage))
waitInitialWorkerSlot tv = do waitInitialWorkerSlot tv = do
WorkerPool usedstages l <- takeTMVar tv pool <- takeTMVar tv
let stage = initialStage usedstages let stage = initialStage (usedStages pool)
(st, pool') <- waitWorkerSlot usedstages stage l st <- go stage pool
putTMVar tv pool'
return $ Just (st, stage) return $ Just (st, stage)
where
go wantstage pool = case spareVals pool of
[] -> retry
(v:vs) -> do
let pool' = pool { spareVals = vs }
putTMVar tv =<< waitIdleWorkerSlot wantstage pool'
return v
-- | Waits until there's an idle worker for the specified stage, and returns waitIdleWorkerSlot :: WorkerStage -> WorkerPool Annex.AnnexState -> STM (WorkerPool Annex.AnnexState)
-- its state and a WorkerPool containing all the other workers. waitIdleWorkerSlot wantstage pool = do
waitWorkerSlot :: UsedStages -> WorkerStage -> [Worker Annex.AnnexState] -> STM (Annex.AnnexState, WorkerPool Annex.AnnexState) l <- findidle [] (workerList pool)
waitWorkerSlot usedstages wantstage = findidle [] return $ pool { workerList = l }
where where
findidle _ [] = retry findidle _ [] = retry
findidle c ((IdleWorker st stage):rest) findidle c ((IdleWorker stage):rest) | stage == wantstage =
| stage == wantstage = return (st, WorkerPool usedstages (c ++ rest)) return (c ++ rest)
findidle c (w:rest) = findidle (w:c) rest findidle c (w:rest) = findidle (w:c) rest

View file

@ -129,8 +129,8 @@ finishCommandActions = Annex.getState Annex.workers >>= \case
Nothing -> noop Nothing -> noop
Just tv -> do Just tv -> do
Annex.changeState $ \s -> s { Annex.workers = Nothing } Annex.changeState $ \s -> s { Annex.workers = Nothing }
WorkerPool _ l <- liftIO $ atomically $ takeTMVar tv pool <- liftIO $ atomically $ takeTMVar tv
forM_ (mapMaybe workerAsync l) $ \aid -> forM_ (mapMaybe workerAsync $ workerList pool) $ \aid ->
liftIO (waitCatch aid) >>= \case liftIO (waitCatch aid) >>= \case
Left _ -> noop Left _ -> noop
Right st -> mergeState st Right st -> mergeState st

View file

@ -12,17 +12,23 @@ import Control.Concurrent.Async
import qualified Data.Set as S import qualified Data.Set as S
-- | Pool of worker threads. -- | Pool of worker threads.
data WorkerPool t = WorkerPool UsedStages [Worker t] data WorkerPool t = WorkerPool
{ usedStages :: UsedStages
, workerList :: [Worker t]
, spareVals :: [t]
-- ^ Normally there is one value for each IdleWorker,
-- but there can temporarily be fewer.
}
deriving (Show) deriving (Show)
-- | A worker can either be idle or running an Async action. -- | A worker can either be idle or running an Async action.
-- And it is used for some stage. -- And it is used for some stage.
data Worker t data Worker t
= IdleWorker t WorkerStage = IdleWorker WorkerStage
| ActiveWorker (Async t) WorkerStage | ActiveWorker (Async t) WorkerStage
instance Show (Worker t) where instance Show (Worker t) where
show (IdleWorker _ s) = "IdleWorker " ++ show s show (IdleWorker s) = "IdleWorker " ++ show s
show (ActiveWorker _ s) = "ActiveWorker " ++ show s show (ActiveWorker _ s) = "ActiveWorker " ++ show s
data WorkerStage data WorkerStage
@ -46,12 +52,12 @@ data WorkerStage
-- stage, and so there will be no blocking before starting them. -- stage, and so there will be no blocking before starting them.
data UsedStages = UsedStages data UsedStages = UsedStages
{ initialStage :: WorkerStage { initialStage :: WorkerStage
, usedStages :: S.Set WorkerStage , stageSet :: S.Set WorkerStage
} }
deriving (Show) deriving (Show)
memberStage :: WorkerStage -> UsedStages -> Bool memberStage :: WorkerStage -> UsedStages -> Bool
memberStage s u = S.member s (usedStages u) memberStage s u = S.member s (stageSet u)
-- | The default is to use only the CommandPerform and CommandCleanup -- | The default is to use only the CommandPerform and CommandCleanup
-- stages. Since cleanup actions often don't contend much with -- stages. Since cleanup actions often don't contend much with
@ -60,7 +66,7 @@ memberStage s u = S.member s (usedStages u)
commandStages :: UsedStages commandStages :: UsedStages
commandStages = UsedStages commandStages = UsedStages
{ initialStage = PerformStage { initialStage = PerformStage
, usedStages = S.fromList [PerformStage, CleanupStage] , stageSet = S.fromList [PerformStage, CleanupStage]
} }
-- | When a command is transferring content, it can use this instead. -- | When a command is transferring content, it can use this instead.
@ -70,15 +76,15 @@ commandStages = UsedStages
transferStages :: UsedStages transferStages :: UsedStages
transferStages = UsedStages transferStages = UsedStages
{ initialStage = TransferStage { initialStage = TransferStage
, usedStages = S.fromList [TransferStage, VerifyStage] , stageSet = S.fromList [TransferStage, VerifyStage]
} }
workerStage :: Worker t -> WorkerStage workerStage :: Worker t -> WorkerStage
workerStage (IdleWorker _ s) = s workerStage (IdleWorker s) = s
workerStage (ActiveWorker _ s) = s workerStage (ActiveWorker _ s) = s
workerAsync :: Worker t -> Maybe (Async t) workerAsync :: Worker t -> Maybe (Async t)
workerAsync (IdleWorker _ _) = Nothing workerAsync (IdleWorker _) = Nothing
workerAsync (ActiveWorker aid _) = Just aid workerAsync (ActiveWorker aid _) = Just aid
-- | Allocates a WorkerPool that has the specified number of workers -- | Allocates a WorkerPool that has the specified number of workers
@ -86,39 +92,41 @@ workerAsync (ActiveWorker aid _) = Just aid
-- --
-- The stages are distributed evenly throughout. -- The stages are distributed evenly throughout.
allocateWorkerPool :: t -> Int -> UsedStages -> WorkerPool t allocateWorkerPool :: t -> Int -> UsedStages -> WorkerPool t
allocateWorkerPool t n u = WorkerPool u $ take (n+n) $ allocateWorkerPool t n u = WorkerPool
map (uncurry IdleWorker) $ zip (repeat t) stages { usedStages = u
, workerList = take totalthreads $ map IdleWorker stages
, spareVals = replicate totalthreads t
}
where where
stages = concat $ repeat $ S.toList $ usedStages u stages = concat $ repeat $ S.toList $ stageSet u
totalthreads = n * S.size (stageSet u)
addWorkerPool :: Worker t -> WorkerPool t -> WorkerPool t addWorkerPool :: Worker t -> WorkerPool t -> WorkerPool t
addWorkerPool w (WorkerPool u l) = WorkerPool u (w:l) addWorkerPool w pool = pool { workerList = w : workerList pool }
idleWorkers :: WorkerPool t -> [t]
idleWorkers (WorkerPool _ l) = go l
where
go [] = []
go (IdleWorker t _ : rest) = t : go rest
go (ActiveWorker _ _ : rest) = go rest
-- | Removes a worker from the pool whose Async uses the ThreadId. -- | Removes a worker from the pool whose Async uses the ThreadId.
-- --
-- Each Async has its own ThreadId, so this stops once it finds -- Each Async has its own ThreadId, so this stops once it finds
-- a match. -- a match.
removeThreadIdWorkerPool :: ThreadId -> WorkerPool t -> Maybe ((Async t, WorkerStage), WorkerPool t) removeThreadIdWorkerPool :: ThreadId -> WorkerPool t -> Maybe ((Async t, WorkerStage), WorkerPool t)
removeThreadIdWorkerPool tid (WorkerPool u l) = go [] l removeThreadIdWorkerPool tid pool = go [] (workerList pool)
where where
go _ [] = Nothing go _ [] = Nothing
go c (ActiveWorker a stage : rest) go c (ActiveWorker a stage : rest)
| asyncThreadId a == tid = Just ((a, stage), WorkerPool u (c++rest)) | asyncThreadId a == tid =
let pool' = pool { workerList = (c++rest) }
in Just ((a, stage), pool')
go c (v : rest) = go (v:c) rest go c (v : rest) = go (v:c) rest
deactivateWorker :: WorkerPool t -> Async t -> t -> WorkerPool t deactivateWorker :: WorkerPool t -> Async t -> t -> WorkerPool t
deactivateWorker (WorkerPool u l) aid t = WorkerPool u $ go l deactivateWorker pool aid t = pool
{ workerList = go (workerList pool)
, spareVals = t : spareVals pool
}
where where
go [] = [] go [] = []
go (w@(IdleWorker _ _) : rest) = w : go rest go (w@(IdleWorker _) : rest) = w : go rest
go (w@(ActiveWorker a st) : rest) go (w@(ActiveWorker a st) : rest)
| a == aid = IdleWorker t st : rest | a == aid = IdleWorker st : rest
| otherwise = w : go rest | otherwise = w : go rest