avoid concurrent threads trying to take pid lock at same time

Seem there are several races that happen when 2 threads run PidLock.tryLock at the same time. One involves checkSaneLock of the side lock file, which may be deleted by another process that is dropping the lock, causing checkSaneLock to fail. And even with the deletion disabled, it can still fail, Probably due to linkToLock failing when a second thread overwrites the lock file. The same can happen when 2 processes do, but then one process just fails to take the lock, which is fine. But with 2 threads, some actions where failing even though the process as a whole had the pid lock held. Utility.LockPool.PidLock already maintains a STM lock, and since it uses LockShared, 2 threads can hold the pidlock at the same time, and when the first thread drops the lock, it will remain held by the second thread, and so the pid lock file should not get deleted until the last thread to hold it drops the lock. Which is the right behavior, and why a LockShared STM lock is used in the first place. The problem is that each time it takes the STM lock, it then also calls PidLock.tryLock. So that was getting called repeatedly and concurrently. Fixed by noticing when the shared lock is already held, and stop calling PidLock.tryLock again, just use the pid lock that already exists then. Also, LockFile.PidLock.tryLock was deleting the pid lock when it failed to take the lock, which was entirely wrong. It should only drop the side lock. Sponsored-by: Dartmouth College's Datalad project
2021-12-01 15:22:31 -04:00 · 2021-12-01 15:22:31 -04:00 · ed0afbc36b
commit ed0afbc36b
parent 66b2536ea0
7 changed files with 113 additions and 42 deletions
--- a/2
+++ b/2
@ -7,6 +7,8 @@ git-annex (8.20211124) UNRELEASED; urgency=medium
  * export: Avoid unncessarily re-exporting non-annexed files that were
    already exported.
  * Fix locking bug introduced in version 8.20200814.
+  * Fix locking problems when annex.pidlock is set and concurrency is
+    enabled eg with -J.

 -- Joey Hess <id@joeyh.name>  Tue, 23 Nov 2021 15:58:27 -0400

--- a/Utility/LockFile/PidLock.hs
+++ b/Utility/LockFile/PidLock.hs
@ -1,6 +1,6 @@
 {- pid-based lock files
 -
- - Copyright 2015-2020 Joey Hess <id@joeyh.name>
+ - Copyright 2015-2021 Joey Hess <id@joeyh.name>
 -
 - License: BSD-2-clause
 -}
@ -11,6 +11,8 @@ module Utility.LockFile.PidLock (
 	LockHandle,
 	tryLock,
 	waitLock,
+	waitedLock,
+	alreadyLocked,
 	dropLock,
 	LockStatus(..),
 	getLockStatus,
@ -149,8 +151,8 @@ tryLock lockfile = do
 		setFileMode tmp' (combineModes readModes)
 		hPutStr h . show =<< mkPidLock
 		hClose h
-		let failedlock st = do
-			dropLock $ LockHandle tmp' st sidelock
+		let failedlock = do
+			dropSideLock sidelock
 			removeWhenExistsWith removeLink tmp'
 			return Nothing
 		let tooklock st = return $ Just $ LockHandle abslockfile st sidelock
@ -171,7 +173,7 @@ tryLock lockfile = do
 						-- stale, and can take it over.
 						rename tmp' abslockfile
 						tooklock tmpst
-					_ -> failedlock tmpst
+					_ -> failedlock

 -- Linux's open(2) man page recommends linking a pid lock into place,
 -- as the most portable atomic operation that will fail if
@ -254,8 +256,8 @@ checkInsaneLustre dest = do
 --
 -- After the first second waiting, runs the callback to display a message,
 -- so the user knows why it's stalled.
-waitLock :: MonadIO m => Seconds -> LockFile -> (String -> m ()) -> m LockHandle
-waitLock (Seconds timeout) lockfile displaymessage = go timeout
+waitLock :: MonadIO m => Seconds -> LockFile -> (String -> m ()) -> (Bool -> IO ()) -> m LockHandle
+waitLock (Seconds timeout) lockfile displaymessage sem = go timeout
  where
 	go n
 		| n > 0 = liftIO (tryLock lockfile) >>= \case
@ -264,10 +266,25 @@ waitLock (Seconds timeout) lockfile displaymessage = go timeout
 					displaymessage $ "waiting for pid lock file " ++ fromRawFilePath lockfile ++ " which is held by another process (or may be stale)"
 				liftIO $ threadDelaySeconds (Seconds 1)
 				go (pred n)
-			Just lckh -> return lckh
+			Just lckh -> do
+				liftIO $ sem True
+				return lckh
 		| otherwise = do
-			displaymessage $ show timeout ++ " second timeout exceeded while waiting for pid lock file " ++ fromRawFilePath lockfile
-			giveup $ "Gave up waiting for pid lock file " ++ fromRawFilePath lockfile
+			liftIO $ sem False
+			waitedLock (Seconds timeout) lockfile displaymessage
+
+waitedLock :: MonadIO m => Seconds -> LockFile -> (String -> m ()) -> m LockHandle
+waitedLock (Seconds timeout) lockfile displaymessage = do
+	displaymessage $ show timeout ++ " second timeout exceeded while waiting for pid lock file " ++ fromRawFilePath lockfile
+	giveup $ "Gave up waiting for pid lock file " ++ fromRawFilePath lockfile
+
+-- | Use when the pid lock has already been taken by another thread of the
+-- same process, or perhaps is in the process of being taken.
+alreadyLocked :: MonadIO m => LockFile -> m LockHandle
+alreadyLocked lockfile = liftIO $ do
+	abslockfile <- absPath lockfile
+	st <- getFileStatus abslockfile
+	return $ LockHandle abslockfile st Nothing

 dropLock :: LockHandle -> IO ()
 dropLock (LockHandle lockfile _ sidelock) = do
--- a/Utility/LockPool/LockHandle.hs
+++ b/Utility/LockPool/LockHandle.hs
@ -52,33 +52,33 @@ makeLockHandle
 	:: (MonadIO m, MonadMask m)
 	=> P.LockPool
 	-> LockFile
-	-> (P.LockPool -> LockFile -> STM P.LockHandle)
-	-> (LockFile -> m FileLockOps)
+	-> (P.LockPool -> LockFile -> STM (P.LockHandle, P.FirstLock))
+	-> (LockFile -> P.FirstLock -> m FileLockOps)
 	-> m LockHandle
 makeLockHandle pool file pa fa = bracketOnError setup cleanup go
  where
 	setup = debugLocks $ liftIO $ atomically (pa pool file)
-	cleanup ph = debugLocks $ liftIO $ P.releaseLock ph
-	go ph = liftIO . mkLockHandle ph =<< fa file
+	cleanup (ph, _) = debugLocks $ liftIO $ P.releaseLock ph
+	go (ph, firstlock) = liftIO . mkLockHandle ph =<< fa file firstlock

 tryMakeLockHandle
 	:: (MonadIO m, MonadMask m)
 	=> P.LockPool
 	-> LockFile
-	-> (P.LockPool -> LockFile -> STM (Maybe P.LockHandle))
-	-> (LockFile -> m (Maybe FileLockOps))
+	-> (P.LockPool -> LockFile -> STM (Maybe (P.LockHandle, P.FirstLock)))
+	-> (LockFile -> P.FirstLock -> m (Maybe FileLockOps))
 	-> m (Maybe LockHandle)
 tryMakeLockHandle pool file pa fa = bracketOnError setup cleanup go
  where
 	setup = liftIO $ atomically (pa pool file)
 	cleanup Nothing = return ()
-	cleanup (Just ph) = liftIO $ P.releaseLock ph
+	cleanup (Just (ph, _)) = liftIO $ P.releaseLock ph
 	go Nothing = return Nothing
-	go (Just ph) = do
-		mfo <- fa file
+	go (Just (ph, firstlock)) = do
+		mfo <- fa file firstlock
 		case mfo of
 			Nothing -> do
-				liftIO $ cleanup (Just ph)
+				liftIO $ cleanup (Just (ph, firstlock))
 				return Nothing
 			Just fo -> liftIO $ Just <$> mkLockHandle ph fo

@ -86,4 +86,4 @@ mkLockHandle :: P.LockHandle -> FileLockOps -> IO LockHandle
 mkLockHandle ph fo = do
 	atomically $ P.registerCloseLockFile ph (fDropLock fo)
 	return $ LockHandle ph fo
-	
+
--- a/Utility/LockPool/PidLock.hs
+++ b/Utility/LockPool/PidLock.hs
@ -1,6 +1,6 @@
 {- Pid locks, using lock pools.
 -
- - Copyright 2015-2020 Joey Hess <id@joeyh.name>
+ - Copyright 2015-2021 Joey Hess <id@joeyh.name>
 -
 - License: BSD-2-clause
 -}
@ -26,7 +26,9 @@ import Utility.ThreadScheduler

 import System.IO
 import System.Posix
+import Control.Concurrent.STM
 import Data.Maybe
+import Control.Monad
 import Control.Monad.Catch
 import Control.Monad.IO.Class
 import Control.Applicative
@ -43,13 +45,35 @@ waitLock timeout file displaymessage = makeLockHandle P.lockPool file
 	-- LockShared for STM lock, because a pid lock can be the top-level
 	-- lock with various other STM level locks gated behind it.
 	(\p f -> P.waitTakeLock p f LockShared)
-	(\f -> mk <$> F.waitLock timeout f displaymessage)
+	(\f (P.FirstLock firstlock firstlocksem) -> mk 
+		<$> if firstlock
+			then F.waitLock timeout f displaymessage $
+				void . atomically . tryPutTMVar firstlocksem . P.FirstLockSemWaited
+			else liftIO (atomically $ readTMVar firstlocksem) >>= \case
+				P.FirstLockSemWaited True -> F.alreadyLocked f
+				P.FirstLockSemTried True -> F.alreadyLocked f
+				P.FirstLockSemWaited False -> F.waitedLock timeout f displaymessage
+				P.FirstLockSemTried False -> F.waitLock timeout f displaymessage $
+					void . atomically . tryPutTMVar firstlocksem . P.FirstLockSemWaited
+	)

 -- Tries to take a pid lock, but does not block.
 tryLock :: LockFile -> IO (Maybe LockHandle)
 tryLock file = tryMakeLockHandle P.lockPool file
 	(\p f -> P.tryTakeLock p f LockShared)
-	(\f -> fmap mk <$> F.tryLock f)
+	(\f (P.FirstLock firstlock firstlocksem) -> fmap mk
+		<$> if firstlock
+			then do
+				lh <- F.tryLock f
+				void $ atomically $ tryPutTMVar firstlocksem 
+					(P.FirstLockSemTried (isJust lh))
+				return lh
+			else liftIO (atomically $ readTMVar firstlocksem) >>= \case
+					P.FirstLockSemWaited True -> Just <$> F.alreadyLocked f
+					P.FirstLockSemTried True -> Just <$> F.alreadyLocked f
+					P.FirstLockSemWaited False -> return Nothing
+					P.FirstLockSemTried False -> return Nothing
+	)

 checkLocked :: LockFile -> IO (Maybe Bool)
 checkLocked file = P.getLockStatus P.lockPool file
--- a/Utility/LockPool/Posix.hs
+++ b/Utility/LockPool/Posix.hs
@ -35,25 +35,25 @@ import Prelude
 lockShared :: Maybe FileMode -> LockFile -> IO LockHandle
 lockShared mode file = makeLockHandle P.lockPool file
 	(\p f -> P.waitTakeLock p f LockShared)
-	(\f -> mk <$> F.lockShared mode f)
+	(\f _ -> mk <$> F.lockShared mode f)

 -- Takes an exclusive lock, blocking until the lock is available.
 lockExclusive :: Maybe FileMode -> LockFile -> IO LockHandle
 lockExclusive mode file = makeLockHandle P.lockPool file
 	(\p f -> P.waitTakeLock p f LockExclusive)
-	(\f -> mk <$> F.lockExclusive mode f)
+	(\f _ -> mk <$> F.lockExclusive mode f)

 -- Tries to take a shared lock, but does not block.
 tryLockShared :: Maybe FileMode -> LockFile -> IO (Maybe LockHandle)
 tryLockShared mode file = tryMakeLockHandle P.lockPool file
 	(\p f -> P.tryTakeLock p f LockShared)
-	(\f -> fmap mk <$> F.tryLockShared mode f)
+	(\f _ -> fmap mk <$> F.tryLockShared mode f)

 -- Tries to take an exclusive lock, but does not block.
 tryLockExclusive :: Maybe FileMode -> LockFile -> IO (Maybe LockHandle)
 tryLockExclusive mode file = tryMakeLockHandle P.lockPool file
 	(\p f -> P.tryTakeLock p f LockExclusive)
-	(\f -> fmap mk <$> F.tryLockExclusive mode f)
+	(\f _ -> fmap mk <$> F.tryLockExclusive mode f)

 -- Returns Nothing when the file doesn't exist, for cases where
 -- that is different from it not being locked.
--- a/Utility/LockPool/STM.hs
+++ b/Utility/LockPool/STM.hs
@ -1,6 +1,6 @@
 {- STM implementation of lock pools.
 -
- - Copyright 2015-2020 Joey Hess <id@joeyh.name>
+ - Copyright 2015-2021 Joey Hess <id@joeyh.name>
 -
 - License: BSD-2-clause
 -}
@ -11,6 +11,8 @@ module Utility.LockPool.STM (
 	LockFile,
 	LockMode(..),
 	LockHandle,
+	FirstLock(..),
+	FirstLockSemVal(..),
 	waitTakeLock,
 	tryTakeLock,
 	getLockStatus,
@ -27,8 +29,6 @@ import qualified Data.Map.Strict as M
 import Control.Concurrent.STM
 import Control.Exception
 import Control.Monad
-import Control.Applicative
-import Prelude

 type LockFile = RawFilePath

@ -39,9 +39,20 @@ data LockMode = LockExclusive | LockShared
 -- closed.
 type LockHandle = TMVar (LockPool, LockFile, CloseLockFile)

+-- When a shared lock is taken, this will only be true for the first
+-- process, not subsequent processes. The first process should
+-- fill the FirstLockSem after doing any IO actions to finish lock setup
+-- and subsequent processes can block on that getting filled to know
+-- when the lock is fully set up.
+data FirstLock = FirstLock Bool FirstLockSem
+
+type FirstLockSem = TMVar FirstLockSemVal
+
+data FirstLockSemVal = FirstLockSemWaited Bool | FirstLockSemTried Bool
+
 type LockCount = Integer

-data LockStatus = LockStatus LockMode LockCount
+data LockStatus = LockStatus LockMode LockCount FirstLockSem

 type CloseLockFile = IO ()

@ -62,24 +73,30 @@ lockPool = unsafePerformIO (newTMVarIO M.empty)
 -- the same shared lock should not be blocked on the exclusive lock.
 -- Keeping the whole Map in a TMVar accomplishes this, at the expense of
 -- sometimes retrying after unrelated changes in the map.
-waitTakeLock :: LockPool -> LockFile -> LockMode -> STM LockHandle
+waitTakeLock :: LockPool -> LockFile -> LockMode -> STM (LockHandle, FirstLock)
 waitTakeLock pool file mode = maybe retry return =<< tryTakeLock pool file mode

 -- Avoids blocking if another thread is holding a conflicting lock.
-tryTakeLock :: LockPool -> LockFile -> LockMode -> STM (Maybe LockHandle)
+tryTakeLock :: LockPool -> LockFile -> LockMode -> STM (Maybe (LockHandle, FirstLock))
 tryTakeLock pool file mode = do
 	m <- takeTMVar pool
-	let success v = do
+	let success firstlock v = do
 		putTMVar pool (M.insert file v m)
-		Just <$> newTMVar (pool, file, noop)
+		tmv <- newTMVar (pool, file, noop)
+		return (Just (tmv, firstlock))
 	case M.lookup file m of
-		Just (LockStatus mode' n)
-			| mode == LockShared && mode' == LockShared ->
-				success $ LockStatus mode (succ n)
+		Just (LockStatus mode' n firstlocksem)
+			| mode == LockShared && mode' == LockShared -> do
+				fl@(FirstLock _ firstlocksem') <- if n == 0
+					then FirstLock True <$> newEmptyTMVar
+					else pure (FirstLock False firstlocksem)
+				success fl $ LockStatus mode (succ n) firstlocksem'
 			| n > 0 -> do
 				putTMVar pool m
 				return Nothing
-		_ -> success $ LockStatus mode 1
+		_ -> do
+			firstlocksem <- newEmptyTMVar
+			success (FirstLock True firstlocksem) $ LockStatus mode 1 firstlocksem

 -- Call after waitTakeLock or tryTakeLock, to register a CloseLockFile
 -- action to run when releasing the lock.
@ -101,7 +118,7 @@ getLockStatus pool file getdefault checker = do
 	v <- atomically $ do
 		m <- takeTMVar pool
 		let threadlocked = case M.lookup file m of
-			Just (LockStatus _ n) | n > 0 -> True
+			Just (LockStatus _ n _) | n > 0 -> True
 			_ -> False
 		if threadlocked
 			then do
@ -125,10 +142,10 @@ releaseLock h = go =<< atomically (tryTakeTMVar h)
 		(m, lastuser) <- atomically $ do
 			m <- takeTMVar pool
 			return $ case M.lookup file m of
-				Just (LockStatus mode n)
+				Just (LockStatus mode n firstlocksem)
 					| n == 1 -> (M.delete file m, True)
 					| otherwise ->
-						(M.insert file (LockStatus mode (pred n)) m, False)
+						(M.insert file (LockStatus mode (pred n) firstlocksem) m, False)
 				Nothing -> (m, True)
 		() <- when lastuser closelockfile
 		atomically $ putTMVar pool m
--- a/doc/bugs/annex_get_should_retry_failed_downloads_from_S3/comment_3_b279da2a2db1ba5a24a76a9112e691aa._comment
+++ b/doc/bugs/annex_get_should_retry_failed_downloads_from_S3/comment_3_b279da2a2db1ba5a24a76a9112e691aa._comment
@ -0,0 +1,11 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 3"""
+ date="2021-12-01T20:25:18Z"
+ content="""
+I've fixed the transfer locking problem. Also drop -J did sometimes fail
+due to a similar locking problem and should be fixed.
+
+Still want to add the fine-grained STM locking when doing pid locking,
+as discussed above.
+"""]]