2015-11-12 19:38:02 +00:00
|
|
|
{- pid-based lock files
|
|
|
|
-
|
avoid concurrent threads trying to take pid lock at same time
Seem there are several races that happen when 2 threads run PidLock.tryLock
at the same time. One involves checkSaneLock of the side lock file, which may
be deleted by another process that is dropping the lock, causing checkSaneLock
to fail. And even with the deletion disabled, it can still fail, Probably due
to linkToLock failing when a second thread overwrites the lock file.
The same can happen when 2 processes do, but then one process just fails
to take the lock, which is fine. But with 2 threads, some actions where failing
even though the process as a whole had the pid lock held.
Utility.LockPool.PidLock already maintains a STM lock, and since it uses
LockShared, 2 threads can hold the pidlock at the same time, and when
the first thread drops the lock, it will remain held by the second
thread, and so the pid lock file should not get deleted until the last
thread to hold it drops the lock. Which is the right behavior, and why a
LockShared STM lock is used in the first place.
The problem is that each time it takes the STM lock, it then also calls
PidLock.tryLock. So that was getting called repeatedly and concurrently.
Fixed by noticing when the shared lock is already held, and stop calling
PidLock.tryLock again, just use the pid lock that already exists then.
Also, LockFile.PidLock.tryLock was deleting the pid lock when it failed
to take the lock, which was entirely wrong. It should only drop the side
lock.
Sponsored-by: Dartmouth College's Datalad project
2021-12-01 19:22:31 +00:00
|
|
|
- Copyright 2015-2021 Joey Hess <id@joeyh.name>
|
2015-11-12 19:38:02 +00:00
|
|
|
-
|
|
|
|
- License: BSD-2-clause
|
|
|
|
-}
|
|
|
|
|
2020-10-29 14:33:12 +00:00
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
|
|
|
2015-11-12 19:38:02 +00:00
|
|
|
module Utility.LockFile.PidLock (
|
2021-12-03 21:20:21 +00:00
|
|
|
PidLockFile,
|
2015-11-12 19:38:02 +00:00
|
|
|
LockHandle,
|
|
|
|
tryLock,
|
|
|
|
waitLock,
|
avoid concurrent threads trying to take pid lock at same time
Seem there are several races that happen when 2 threads run PidLock.tryLock
at the same time. One involves checkSaneLock of the side lock file, which may
be deleted by another process that is dropping the lock, causing checkSaneLock
to fail. And even with the deletion disabled, it can still fail, Probably due
to linkToLock failing when a second thread overwrites the lock file.
The same can happen when 2 processes do, but then one process just fails
to take the lock, which is fine. But with 2 threads, some actions where failing
even though the process as a whole had the pid lock held.
Utility.LockPool.PidLock already maintains a STM lock, and since it uses
LockShared, 2 threads can hold the pidlock at the same time, and when
the first thread drops the lock, it will remain held by the second
thread, and so the pid lock file should not get deleted until the last
thread to hold it drops the lock. Which is the right behavior, and why a
LockShared STM lock is used in the first place.
The problem is that each time it takes the STM lock, it then also calls
PidLock.tryLock. So that was getting called repeatedly and concurrently.
Fixed by noticing when the shared lock is already held, and stop calling
PidLock.tryLock again, just use the pid lock that already exists then.
Also, LockFile.PidLock.tryLock was deleting the pid lock when it failed
to take the lock, which was entirely wrong. It should only drop the side
lock.
Sponsored-by: Dartmouth College's Datalad project
2021-12-01 19:22:31 +00:00
|
|
|
waitedLock,
|
|
|
|
alreadyLocked,
|
2015-11-12 20:31:34 +00:00
|
|
|
dropLock,
|
2015-11-12 19:38:02 +00:00
|
|
|
LockStatus(..),
|
|
|
|
getLockStatus,
|
2015-11-12 20:31:34 +00:00
|
|
|
checkLocked,
|
|
|
|
checkSaneLock,
|
2020-06-17 19:13:52 +00:00
|
|
|
pidLockEnv,
|
2020-08-25 18:57:25 +00:00
|
|
|
pidLockEnvValue,
|
2015-11-12 19:38:02 +00:00
|
|
|
) where
|
|
|
|
|
|
|
|
import Utility.PartialPrelude
|
|
|
|
import Utility.Exception
|
|
|
|
import Utility.Applicative
|
|
|
|
import Utility.Directory
|
|
|
|
import Utility.Monad
|
2020-10-28 19:40:50 +00:00
|
|
|
import Utility.Path.AbsRel
|
2015-11-12 19:38:02 +00:00
|
|
|
import Utility.FileMode
|
|
|
|
import Utility.LockFile.LockStatus
|
2015-11-12 21:12:54 +00:00
|
|
|
import Utility.ThreadScheduler
|
2017-05-15 22:10:13 +00:00
|
|
|
import Utility.Hash
|
|
|
|
import Utility.FileSystemEncoding
|
2020-06-17 19:13:52 +00:00
|
|
|
import Utility.Env
|
|
|
|
import Utility.Env.Set
|
2021-08-30 17:05:02 +00:00
|
|
|
import Utility.Tmp
|
2015-11-12 19:38:02 +00:00
|
|
|
import qualified Utility.LockFile.Posix as Posix
|
|
|
|
|
|
|
|
import System.IO
|
2020-06-17 19:13:52 +00:00
|
|
|
import System.Posix.Types
|
2020-10-29 14:33:12 +00:00
|
|
|
import System.Posix.IO.ByteString
|
|
|
|
import System.Posix.Files.ByteString
|
2020-06-17 19:13:52 +00:00
|
|
|
import System.Posix.Process
|
2020-08-26 17:05:34 +00:00
|
|
|
import Control.Monad
|
|
|
|
import Control.Monad.IO.Class (liftIO, MonadIO)
|
2020-10-29 14:33:12 +00:00
|
|
|
import qualified System.FilePath.ByteString as P
|
2015-11-12 19:38:02 +00:00
|
|
|
import Data.Maybe
|
|
|
|
import Data.List
|
|
|
|
import Network.BSD
|
|
|
|
import System.FilePath
|
2015-12-19 21:42:45 +00:00
|
|
|
import Control.Applicative
|
|
|
|
import Prelude
|
2015-11-12 19:38:02 +00:00
|
|
|
|
2021-12-03 21:20:21 +00:00
|
|
|
type PidLockFile = RawFilePath
|
2015-11-12 19:38:02 +00:00
|
|
|
|
2020-06-17 19:13:52 +00:00
|
|
|
data LockHandle
|
2021-12-03 21:20:21 +00:00
|
|
|
= LockHandle PidLockFile FileStatus SideLockHandle
|
2020-06-17 19:13:52 +00:00
|
|
|
| ParentLocked
|
2015-11-13 18:44:53 +00:00
|
|
|
|
2021-12-03 21:20:21 +00:00
|
|
|
type SideLockHandle = Maybe (RawFilePath, Posix.LockHandle)
|
2015-11-12 19:38:02 +00:00
|
|
|
|
|
|
|
data PidLock = PidLock
|
|
|
|
{ lockingPid :: ProcessID
|
|
|
|
, lockingHost :: HostName
|
|
|
|
}
|
|
|
|
deriving (Eq, Read, Show)
|
|
|
|
|
|
|
|
mkPidLock :: IO PidLock
|
|
|
|
mkPidLock = PidLock
|
|
|
|
<$> getProcessID
|
|
|
|
<*> getHostName
|
|
|
|
|
2021-12-03 21:20:21 +00:00
|
|
|
readPidLock :: PidLockFile -> IO (Maybe PidLock)
|
2020-10-29 14:33:12 +00:00
|
|
|
readPidLock lockfile = (readish =<<)
|
|
|
|
<$> catchMaybeIO (readFile (fromRawFilePath lockfile))
|
2015-11-12 19:38:02 +00:00
|
|
|
|
|
|
|
-- To avoid races when taking over a stale pid lock, a side lock is used.
|
2015-11-16 15:36:11 +00:00
|
|
|
-- This is a regular posix exclusive lock.
|
2021-12-03 21:20:21 +00:00
|
|
|
trySideLock :: PidLockFile -> (SideLockHandle -> IO a) -> IO a
|
2015-11-12 19:38:02 +00:00
|
|
|
trySideLock lockfile a = do
|
2015-11-13 18:04:29 +00:00
|
|
|
sidelock <- sideLockFile lockfile
|
2015-11-12 19:38:02 +00:00
|
|
|
mlck <- catchDefaultIO Nothing $
|
|
|
|
withUmask nullFileMode $
|
|
|
|
Posix.tryLockExclusive (Just mode) sidelock
|
2015-11-16 15:36:11 +00:00
|
|
|
-- Check the lock we just took, in case we opened a side lock file
|
|
|
|
-- belonging to another process that will have since deleted it.
|
|
|
|
case mlck of
|
|
|
|
Nothing -> a Nothing
|
|
|
|
Just lck -> ifM (Posix.checkSaneLock sidelock lck)
|
|
|
|
( a (Just (sidelock, lck))
|
|
|
|
, a Nothing
|
|
|
|
)
|
2015-11-12 19:38:02 +00:00
|
|
|
where
|
2015-11-13 18:49:30 +00:00
|
|
|
-- Let all users write to the lock file in /dev/shm or /tmp,
|
2015-11-12 19:38:02 +00:00
|
|
|
-- so that other users can reuse it to take the lock.
|
2015-11-13 18:49:30 +00:00
|
|
|
-- Since /dev/shm and /tmp are sticky dirs, a user cannot
|
|
|
|
-- delete another user's lock file there, so could not
|
|
|
|
-- delete a stale lock.
|
2015-11-12 19:38:02 +00:00
|
|
|
mode = combineModes (readModes ++ writeModes)
|
|
|
|
|
2015-11-16 15:36:11 +00:00
|
|
|
dropSideLock :: SideLockHandle -> IO ()
|
|
|
|
dropSideLock Nothing = return ()
|
|
|
|
dropSideLock (Just (f, h)) = do
|
|
|
|
-- Delete the file first, to ensure that any process that is trying
|
|
|
|
-- to take the side lock will only succeed once the file is
|
|
|
|
-- deleted, and so will be able to immediately see that it's taken
|
|
|
|
-- a stale lock.
|
2020-10-29 14:33:12 +00:00
|
|
|
_ <- tryIO $ removeFile (fromRawFilePath f)
|
2015-11-16 15:36:11 +00:00
|
|
|
Posix.dropLock h
|
|
|
|
|
|
|
|
-- The side lock is put in /dev/shm. This will work on most any
|
|
|
|
-- Linux system, even if its whole root filesystem doesn't support posix
|
|
|
|
-- locks. /tmp is used as a fallback.
|
2021-12-03 21:20:21 +00:00
|
|
|
sideLockFile :: PidLockFile -> IO RawFilePath
|
2015-11-13 18:04:29 +00:00
|
|
|
sideLockFile lockfile = do
|
2020-10-29 14:33:12 +00:00
|
|
|
f <- fromRawFilePath <$> absPath lockfile
|
2015-11-13 18:04:29 +00:00
|
|
|
let base = intercalate "_" (splitDirectories (makeRelative "/" f))
|
|
|
|
let shortbase = reverse $ take 32 $ reverse base
|
2017-05-15 22:10:13 +00:00
|
|
|
let md5sum = if base == shortbase
|
|
|
|
then ""
|
2020-10-29 14:33:12 +00:00
|
|
|
else toRawFilePath $ show (md5 (encodeBL base))
|
2015-11-13 18:49:30 +00:00
|
|
|
dir <- ifM (doesDirectoryExist "/dev/shm")
|
|
|
|
( return "/dev/shm"
|
|
|
|
, return "/tmp"
|
|
|
|
)
|
2020-10-29 14:33:12 +00:00
|
|
|
return $ dir P.</> md5sum <> toRawFilePath shortbase <> ".lck"
|
2015-11-13 18:04:29 +00:00
|
|
|
|
2015-11-12 19:38:02 +00:00
|
|
|
-- | Tries to take a lock; does not block when the lock is already held.
|
|
|
|
--
|
|
|
|
-- Note that stale locks are automatically detected and broken.
|
|
|
|
-- However, if the lock file is on a networked file system, and was
|
|
|
|
-- created on a different host than the current host (determined by hostname),
|
|
|
|
-- this can't be done and stale locks may persist.
|
2020-06-17 19:13:52 +00:00
|
|
|
--
|
|
|
|
-- If a parent process is holding the lock, determined by a
|
|
|
|
-- "PIDLOCK_lockfile" environment variable, does not block either.
|
2021-12-03 21:20:21 +00:00
|
|
|
tryLock :: PidLockFile -> IO (Maybe LockHandle)
|
2020-06-17 19:13:52 +00:00
|
|
|
tryLock lockfile = do
|
2020-10-29 14:33:12 +00:00
|
|
|
abslockfile <- absPath lockfile
|
2020-06-17 19:13:52 +00:00
|
|
|
lockenv <- pidLockEnv abslockfile
|
|
|
|
getEnv lockenv >>= \case
|
|
|
|
Nothing -> trySideLock lockfile (go abslockfile)
|
|
|
|
_ -> return (Just ParentLocked)
|
|
|
|
where
|
|
|
|
go abslockfile sidelock = do
|
2020-10-29 14:33:12 +00:00
|
|
|
let abslockfile' = fromRawFilePath abslockfile
|
2021-08-30 17:05:02 +00:00
|
|
|
(tmp, h) <- openTmpFileIn (takeDirectory abslockfile') "locktmp"
|
2020-10-29 14:33:12 +00:00
|
|
|
let tmp' = toRawFilePath tmp
|
|
|
|
setFileMode tmp' (combineModes readModes)
|
2020-06-17 19:13:52 +00:00
|
|
|
hPutStr h . show =<< mkPidLock
|
|
|
|
hClose h
|
avoid concurrent threads trying to take pid lock at same time
Seem there are several races that happen when 2 threads run PidLock.tryLock
at the same time. One involves checkSaneLock of the side lock file, which may
be deleted by another process that is dropping the lock, causing checkSaneLock
to fail. And even with the deletion disabled, it can still fail, Probably due
to linkToLock failing when a second thread overwrites the lock file.
The same can happen when 2 processes do, but then one process just fails
to take the lock, which is fine. But with 2 threads, some actions where failing
even though the process as a whole had the pid lock held.
Utility.LockPool.PidLock already maintains a STM lock, and since it uses
LockShared, 2 threads can hold the pidlock at the same time, and when
the first thread drops the lock, it will remain held by the second
thread, and so the pid lock file should not get deleted until the last
thread to hold it drops the lock. Which is the right behavior, and why a
LockShared STM lock is used in the first place.
The problem is that each time it takes the STM lock, it then also calls
PidLock.tryLock. So that was getting called repeatedly and concurrently.
Fixed by noticing when the shared lock is already held, and stop calling
PidLock.tryLock again, just use the pid lock that already exists then.
Also, LockFile.PidLock.tryLock was deleting the pid lock when it failed
to take the lock, which was entirely wrong. It should only drop the side
lock.
Sponsored-by: Dartmouth College's Datalad project
2021-12-01 19:22:31 +00:00
|
|
|
let failedlock = do
|
|
|
|
dropSideLock sidelock
|
2020-10-29 14:33:12 +00:00
|
|
|
removeWhenExistsWith removeLink tmp'
|
2020-06-17 19:13:52 +00:00
|
|
|
return Nothing
|
|
|
|
let tooklock st = return $ Just $ LockHandle abslockfile st sidelock
|
2021-11-29 18:51:28 +00:00
|
|
|
linkToLock sidelock tmp' abslockfile >>= \case
|
|
|
|
Just lckst -> do
|
2020-10-29 14:33:12 +00:00
|
|
|
removeWhenExistsWith removeLink tmp'
|
2020-06-17 19:13:52 +00:00
|
|
|
tooklock lckst
|
2021-11-29 18:51:28 +00:00
|
|
|
Nothing -> do
|
2020-06-17 19:13:52 +00:00
|
|
|
v <- readPidLock abslockfile
|
|
|
|
hn <- getHostName
|
2020-10-29 14:33:12 +00:00
|
|
|
tmpst <- getFileStatus tmp'
|
2020-06-17 19:13:52 +00:00
|
|
|
case v of
|
|
|
|
Just pl | isJust sidelock && hn == lockingHost pl -> do
|
|
|
|
-- Since we have the sidelock,
|
|
|
|
-- and are on the same host that
|
|
|
|
-- the pidlock was taken on,
|
|
|
|
-- we know that the pidlock is
|
|
|
|
-- stale, and can take it over.
|
2020-10-29 14:33:12 +00:00
|
|
|
rename tmp' abslockfile
|
2020-06-17 19:13:52 +00:00
|
|
|
tooklock tmpst
|
avoid concurrent threads trying to take pid lock at same time
Seem there are several races that happen when 2 threads run PidLock.tryLock
at the same time. One involves checkSaneLock of the side lock file, which may
be deleted by another process that is dropping the lock, causing checkSaneLock
to fail. And even with the deletion disabled, it can still fail, Probably due
to linkToLock failing when a second thread overwrites the lock file.
The same can happen when 2 processes do, but then one process just fails
to take the lock, which is fine. But with 2 threads, some actions where failing
even though the process as a whole had the pid lock held.
Utility.LockPool.PidLock already maintains a STM lock, and since it uses
LockShared, 2 threads can hold the pidlock at the same time, and when
the first thread drops the lock, it will remain held by the second
thread, and so the pid lock file should not get deleted until the last
thread to hold it drops the lock. Which is the right behavior, and why a
LockShared STM lock is used in the first place.
The problem is that each time it takes the STM lock, it then also calls
PidLock.tryLock. So that was getting called repeatedly and concurrently.
Fixed by noticing when the shared lock is already held, and stop calling
PidLock.tryLock again, just use the pid lock that already exists then.
Also, LockFile.PidLock.tryLock was deleting the pid lock when it failed
to take the lock, which was entirely wrong. It should only drop the side
lock.
Sponsored-by: Dartmouth College's Datalad project
2021-12-01 19:22:31 +00:00
|
|
|
_ -> failedlock
|
2015-11-12 19:38:02 +00:00
|
|
|
|
2015-11-13 18:44:53 +00:00
|
|
|
-- Linux's open(2) man page recommends linking a pid lock into place,
|
2015-11-13 17:22:45 +00:00
|
|
|
-- as the most portable atomic operation that will fail if
|
2015-11-13 18:44:53 +00:00
|
|
|
-- it already exists.
|
|
|
|
--
|
|
|
|
-- open(2) suggests that link can sometimes appear to fail
|
|
|
|
-- on NFS but have actually succeeded, and the way to find out is to stat
|
|
|
|
-- the file and check its link count etc.
|
2017-02-10 19:21:58 +00:00
|
|
|
--
|
|
|
|
-- However, not all filesystems support hard links. So, first probe
|
|
|
|
-- to see if they are supported. If not, use open with O_EXCL.
|
2021-11-29 18:51:28 +00:00
|
|
|
linkToLock :: SideLockHandle -> RawFilePath -> RawFilePath -> IO (Maybe FileStatus)
|
|
|
|
linkToLock Nothing _ _ = return Nothing
|
2015-11-13 18:44:53 +00:00
|
|
|
linkToLock (Just _) src dest = do
|
2020-10-29 14:33:12 +00:00
|
|
|
let probe = src <> ".lnk"
|
2017-02-10 19:21:58 +00:00
|
|
|
v <- tryIO $ createLink src probe
|
2020-10-29 14:33:12 +00:00
|
|
|
removeWhenExistsWith removeLink probe
|
2017-02-10 19:21:58 +00:00
|
|
|
case v of
|
|
|
|
Right _ -> do
|
|
|
|
_ <- tryIO $ createLink src dest
|
|
|
|
ifM (catchBoolIO checklinked)
|
2021-11-29 18:51:28 +00:00
|
|
|
( ifM (catchBoolIO $ not <$> checkInsaneLustre dest)
|
|
|
|
( catchMaybeIO $ getFileStatus dest
|
|
|
|
, return Nothing
|
|
|
|
)
|
|
|
|
, return Nothing
|
2017-02-10 19:21:58 +00:00
|
|
|
)
|
2021-11-29 18:51:28 +00:00
|
|
|
Left _ -> catchMaybeIO $ do
|
2020-06-05 19:46:01 +00:00
|
|
|
let setup = do
|
|
|
|
fd <- openFd dest WriteOnly
|
|
|
|
(Just $ combineModes readModes)
|
|
|
|
(defaultFileFlags {exclusive = True})
|
|
|
|
fdToHandle fd
|
|
|
|
let cleanup = hClose
|
2020-10-29 14:33:12 +00:00
|
|
|
let go h = readFile (fromRawFilePath src) >>= hPutStr h
|
|
|
|
bracket setup cleanup go
|
2021-11-29 18:51:28 +00:00
|
|
|
getFileStatus dest
|
2015-11-13 17:22:45 +00:00
|
|
|
where
|
2015-11-13 19:51:45 +00:00
|
|
|
checklinked = do
|
2015-11-13 17:22:45 +00:00
|
|
|
x <- getSymbolicLinkStatus src
|
|
|
|
y <- getSymbolicLinkStatus dest
|
|
|
|
return $ and
|
|
|
|
[ deviceID x == deviceID y
|
|
|
|
, fileID x == fileID y
|
|
|
|
, fileMode x == fileMode y
|
|
|
|
, fileOwner x == fileOwner y
|
|
|
|
, fileGroup x == fileGroup y
|
|
|
|
, fileSize x == fileSize y
|
|
|
|
, modificationTime x == modificationTime y
|
|
|
|
, isRegularFile x == isRegularFile y
|
2015-11-13 18:44:53 +00:00
|
|
|
, linkCount x == linkCount y
|
|
|
|
, linkCount x == 2
|
2015-11-13 17:22:45 +00:00
|
|
|
]
|
|
|
|
|
2015-11-13 20:13:43 +00:00
|
|
|
-- On a Lustre filesystem, link has been observed to incorrectly *succeed*,
|
|
|
|
-- despite the dest already existing. A subsequent stat of the dest
|
|
|
|
-- looked like it had been replaced with the src. The process proceeded to
|
|
|
|
-- run and then deleted the dest, and after the process was done, the
|
|
|
|
-- original file was observed to still be in place.
|
|
|
|
--
|
|
|
|
-- We can detect this insanity by getting the directory contents after
|
|
|
|
-- making the link, and checking to see if 2 copies of the dest file,
|
|
|
|
-- with the SAME FILENAME exist.
|
2020-10-29 14:33:12 +00:00
|
|
|
checkInsaneLustre :: RawFilePath -> IO Bool
|
2015-11-13 20:13:43 +00:00
|
|
|
checkInsaneLustre dest = do
|
2020-10-29 14:33:12 +00:00
|
|
|
let dest' = fromRawFilePath dest
|
|
|
|
fs <- dirContents (takeDirectory dest')
|
|
|
|
case length (filter (== dest') fs) of
|
2015-11-13 20:13:43 +00:00
|
|
|
1 -> return False -- whew!
|
|
|
|
0 -> return True -- wtf?
|
|
|
|
_ -> do
|
|
|
|
-- Try to clean up the extra copy we made
|
|
|
|
-- that has the same name. Egads.
|
2020-10-29 14:33:12 +00:00
|
|
|
_ <- tryIO $ removeFile dest'
|
2015-11-13 20:13:43 +00:00
|
|
|
return True
|
|
|
|
|
2015-11-12 19:38:02 +00:00
|
|
|
-- | Waits as necessary to take a lock.
|
|
|
|
--
|
2020-08-26 17:05:34 +00:00
|
|
|
-- Uses a 1 second wait-loop, retrying until a timeout.
|
2015-11-12 19:38:02 +00:00
|
|
|
--
|
2020-08-26 17:05:34 +00:00
|
|
|
-- After the first second waiting, runs the callback to display a message,
|
|
|
|
-- so the user knows why it's stalled.
|
2021-12-03 21:20:21 +00:00
|
|
|
waitLock :: MonadIO m => Seconds -> PidLockFile -> (String -> m ()) -> (Bool -> IO ()) -> m LockHandle
|
avoid concurrent threads trying to take pid lock at same time
Seem there are several races that happen when 2 threads run PidLock.tryLock
at the same time. One involves checkSaneLock of the side lock file, which may
be deleted by another process that is dropping the lock, causing checkSaneLock
to fail. And even with the deletion disabled, it can still fail, Probably due
to linkToLock failing when a second thread overwrites the lock file.
The same can happen when 2 processes do, but then one process just fails
to take the lock, which is fine. But with 2 threads, some actions where failing
even though the process as a whole had the pid lock held.
Utility.LockPool.PidLock already maintains a STM lock, and since it uses
LockShared, 2 threads can hold the pidlock at the same time, and when
the first thread drops the lock, it will remain held by the second
thread, and so the pid lock file should not get deleted until the last
thread to hold it drops the lock. Which is the right behavior, and why a
LockShared STM lock is used in the first place.
The problem is that each time it takes the STM lock, it then also calls
PidLock.tryLock. So that was getting called repeatedly and concurrently.
Fixed by noticing when the shared lock is already held, and stop calling
PidLock.tryLock again, just use the pid lock that already exists then.
Also, LockFile.PidLock.tryLock was deleting the pid lock when it failed
to take the lock, which was entirely wrong. It should only drop the side
lock.
Sponsored-by: Dartmouth College's Datalad project
2021-12-01 19:22:31 +00:00
|
|
|
waitLock (Seconds timeout) lockfile displaymessage sem = go timeout
|
2015-11-12 19:38:02 +00:00
|
|
|
where
|
2015-11-12 21:12:54 +00:00
|
|
|
go n
|
2020-08-26 17:05:34 +00:00
|
|
|
| n > 0 = liftIO (tryLock lockfile) >>= \case
|
|
|
|
Nothing -> do
|
|
|
|
when (n == pred timeout) $
|
2020-10-29 14:33:12 +00:00
|
|
|
displaymessage $ "waiting for pid lock file " ++ fromRawFilePath lockfile ++ " which is held by another process (or may be stale)"
|
2020-08-26 17:05:34 +00:00
|
|
|
liftIO $ threadDelaySeconds (Seconds 1)
|
|
|
|
go (pred n)
|
avoid concurrent threads trying to take pid lock at same time
Seem there are several races that happen when 2 threads run PidLock.tryLock
at the same time. One involves checkSaneLock of the side lock file, which may
be deleted by another process that is dropping the lock, causing checkSaneLock
to fail. And even with the deletion disabled, it can still fail, Probably due
to linkToLock failing when a second thread overwrites the lock file.
The same can happen when 2 processes do, but then one process just fails
to take the lock, which is fine. But with 2 threads, some actions where failing
even though the process as a whole had the pid lock held.
Utility.LockPool.PidLock already maintains a STM lock, and since it uses
LockShared, 2 threads can hold the pidlock at the same time, and when
the first thread drops the lock, it will remain held by the second
thread, and so the pid lock file should not get deleted until the last
thread to hold it drops the lock. Which is the right behavior, and why a
LockShared STM lock is used in the first place.
The problem is that each time it takes the STM lock, it then also calls
PidLock.tryLock. So that was getting called repeatedly and concurrently.
Fixed by noticing when the shared lock is already held, and stop calling
PidLock.tryLock again, just use the pid lock that already exists then.
Also, LockFile.PidLock.tryLock was deleting the pid lock when it failed
to take the lock, which was entirely wrong. It should only drop the side
lock.
Sponsored-by: Dartmouth College's Datalad project
2021-12-01 19:22:31 +00:00
|
|
|
Just lckh -> do
|
|
|
|
liftIO $ sem True
|
|
|
|
return lckh
|
2015-11-12 21:12:54 +00:00
|
|
|
| otherwise = do
|
avoid concurrent threads trying to take pid lock at same time
Seem there are several races that happen when 2 threads run PidLock.tryLock
at the same time. One involves checkSaneLock of the side lock file, which may
be deleted by another process that is dropping the lock, causing checkSaneLock
to fail. And even with the deletion disabled, it can still fail, Probably due
to linkToLock failing when a second thread overwrites the lock file.
The same can happen when 2 processes do, but then one process just fails
to take the lock, which is fine. But with 2 threads, some actions where failing
even though the process as a whole had the pid lock held.
Utility.LockPool.PidLock already maintains a STM lock, and since it uses
LockShared, 2 threads can hold the pidlock at the same time, and when
the first thread drops the lock, it will remain held by the second
thread, and so the pid lock file should not get deleted until the last
thread to hold it drops the lock. Which is the right behavior, and why a
LockShared STM lock is used in the first place.
The problem is that each time it takes the STM lock, it then also calls
PidLock.tryLock. So that was getting called repeatedly and concurrently.
Fixed by noticing when the shared lock is already held, and stop calling
PidLock.tryLock again, just use the pid lock that already exists then.
Also, LockFile.PidLock.tryLock was deleting the pid lock when it failed
to take the lock, which was entirely wrong. It should only drop the side
lock.
Sponsored-by: Dartmouth College's Datalad project
2021-12-01 19:22:31 +00:00
|
|
|
liftIO $ sem False
|
|
|
|
waitedLock (Seconds timeout) lockfile displaymessage
|
|
|
|
|
close pid lock only once no threads use it
This fixes a FD leak when annex.pidlock is set and -J is used. Also, it
fixes bugs where the pid lock file got deleted because one thread was
done with it, while another thread was still holding it open.
The LockPool now has two distinct types of resources,
one is per-LockHandle and is used for file Handles, which get closed
when the associated LockHandle is closed. The other one is per lock
file, and gets closed when no more LockHandles use that lock file,
including other shared locks of the same file.
That latter kind is used for the pid lock file, so it's opened by the
first thread to use a lock, and closed when the last thread closes a lock.
In practice, this means that eg git-annex get of several files opens and
closes the pidlock file a few times per file. While with -J5 it will open
the pidlock file, process a number of files, until all the threads happen to
finish together, at which point the pidlock file gets closed, and then
that repeats. So in either case, another process still gets a chance to
take the pidlock.
registerPostRelease has a rather intricate dance, there are fine-grained
STM locks, a STM lock of the pidfile itself, and the actual pidlock file
on disk that are all resolved in stages by it.
Sponsored-by: Dartmouth College's Datalad project
2021-12-06 19:01:39 +00:00
|
|
|
waitedLock :: MonadIO m => Seconds -> PidLockFile -> (String -> m ()) -> m a
|
avoid concurrent threads trying to take pid lock at same time
Seem there are several races that happen when 2 threads run PidLock.tryLock
at the same time. One involves checkSaneLock of the side lock file, which may
be deleted by another process that is dropping the lock, causing checkSaneLock
to fail. And even with the deletion disabled, it can still fail, Probably due
to linkToLock failing when a second thread overwrites the lock file.
The same can happen when 2 processes do, but then one process just fails
to take the lock, which is fine. But with 2 threads, some actions where failing
even though the process as a whole had the pid lock held.
Utility.LockPool.PidLock already maintains a STM lock, and since it uses
LockShared, 2 threads can hold the pidlock at the same time, and when
the first thread drops the lock, it will remain held by the second
thread, and so the pid lock file should not get deleted until the last
thread to hold it drops the lock. Which is the right behavior, and why a
LockShared STM lock is used in the first place.
The problem is that each time it takes the STM lock, it then also calls
PidLock.tryLock. So that was getting called repeatedly and concurrently.
Fixed by noticing when the shared lock is already held, and stop calling
PidLock.tryLock again, just use the pid lock that already exists then.
Also, LockFile.PidLock.tryLock was deleting the pid lock when it failed
to take the lock, which was entirely wrong. It should only drop the side
lock.
Sponsored-by: Dartmouth College's Datalad project
2021-12-01 19:22:31 +00:00
|
|
|
waitedLock (Seconds timeout) lockfile displaymessage = do
|
|
|
|
displaymessage $ show timeout ++ " second timeout exceeded while waiting for pid lock file " ++ fromRawFilePath lockfile
|
|
|
|
giveup $ "Gave up waiting for pid lock file " ++ fromRawFilePath lockfile
|
|
|
|
|
|
|
|
-- | Use when the pid lock has already been taken by another thread of the
|
2021-12-03 22:41:51 +00:00
|
|
|
-- same process.
|
2021-12-03 21:20:21 +00:00
|
|
|
alreadyLocked :: MonadIO m => PidLockFile -> m LockHandle
|
avoid concurrent threads trying to take pid lock at same time
Seem there are several races that happen when 2 threads run PidLock.tryLock
at the same time. One involves checkSaneLock of the side lock file, which may
be deleted by another process that is dropping the lock, causing checkSaneLock
to fail. And even with the deletion disabled, it can still fail, Probably due
to linkToLock failing when a second thread overwrites the lock file.
The same can happen when 2 processes do, but then one process just fails
to take the lock, which is fine. But with 2 threads, some actions where failing
even though the process as a whole had the pid lock held.
Utility.LockPool.PidLock already maintains a STM lock, and since it uses
LockShared, 2 threads can hold the pidlock at the same time, and when
the first thread drops the lock, it will remain held by the second
thread, and so the pid lock file should not get deleted until the last
thread to hold it drops the lock. Which is the right behavior, and why a
LockShared STM lock is used in the first place.
The problem is that each time it takes the STM lock, it then also calls
PidLock.tryLock. So that was getting called repeatedly and concurrently.
Fixed by noticing when the shared lock is already held, and stop calling
PidLock.tryLock again, just use the pid lock that already exists then.
Also, LockFile.PidLock.tryLock was deleting the pid lock when it failed
to take the lock, which was entirely wrong. It should only drop the side
lock.
Sponsored-by: Dartmouth College's Datalad project
2021-12-01 19:22:31 +00:00
|
|
|
alreadyLocked lockfile = liftIO $ do
|
|
|
|
abslockfile <- absPath lockfile
|
|
|
|
st <- getFileStatus abslockfile
|
|
|
|
return $ LockHandle abslockfile st Nothing
|
2015-11-12 19:38:02 +00:00
|
|
|
|
|
|
|
dropLock :: LockHandle -> IO ()
|
2015-11-13 19:43:09 +00:00
|
|
|
dropLock (LockHandle lockfile _ sidelock) = do
|
2015-11-13 16:36:37 +00:00
|
|
|
-- Drop side lock first, at which point the pid lock will be
|
|
|
|
-- considered stale.
|
2015-11-16 15:36:11 +00:00
|
|
|
dropSideLock sidelock
|
2020-10-29 14:33:12 +00:00
|
|
|
removeWhenExistsWith removeLink lockfile
|
2020-06-17 19:13:52 +00:00
|
|
|
dropLock ParentLocked = return ()
|
2015-11-12 20:31:34 +00:00
|
|
|
|
2021-12-03 21:20:21 +00:00
|
|
|
getLockStatus :: PidLockFile -> IO LockStatus
|
2015-11-12 20:31:34 +00:00
|
|
|
getLockStatus = maybe StatusUnLocked (StatusLockedBy . lockingPid) <$$> readPidLock
|
|
|
|
|
2021-12-03 21:20:21 +00:00
|
|
|
checkLocked :: PidLockFile -> IO (Maybe Bool)
|
2015-11-12 20:31:34 +00:00
|
|
|
checkLocked lockfile = conv <$> getLockStatus lockfile
|
|
|
|
where
|
|
|
|
conv (StatusLockedBy _) = Just True
|
|
|
|
conv _ = Just False
|
|
|
|
|
|
|
|
-- Checks that the lock file still exists, and is the same file that was
|
|
|
|
-- locked to get the LockHandle.
|
2021-12-03 21:20:21 +00:00
|
|
|
checkSaneLock :: PidLockFile -> LockHandle -> IO Bool
|
2015-11-16 19:37:27 +00:00
|
|
|
checkSaneLock lockfile (LockHandle _ st _) =
|
2015-11-12 20:31:34 +00:00
|
|
|
go =<< catchMaybeIO (getFileStatus lockfile)
|
|
|
|
where
|
|
|
|
go Nothing = return False
|
2015-11-16 19:25:04 +00:00
|
|
|
go (Just st') = return $
|
|
|
|
deviceID st == deviceID st' && fileID st == fileID st'
|
2020-06-17 19:13:52 +00:00
|
|
|
checkSaneLock _ ParentLocked = return True
|
|
|
|
|
|
|
|
-- | A parent process that is using pid locking can set this to 1 before
|
|
|
|
-- starting a child, to communicate to the child that it's holding the pid
|
|
|
|
-- lock and that the child can skip trying to take it, and not block
|
|
|
|
-- on the pid lock its parent is holding.
|
|
|
|
--
|
|
|
|
-- The parent process should keep running as long as the child
|
|
|
|
-- process is running, since the child inherits the environment and will
|
|
|
|
-- not see unsetLockEnv.
|
2020-10-29 14:33:12 +00:00
|
|
|
pidLockEnv :: RawFilePath -> IO String
|
2020-06-17 19:13:52 +00:00
|
|
|
pidLockEnv lockfile = do
|
2020-10-29 14:33:12 +00:00
|
|
|
abslockfile <- fromRawFilePath <$> absPath lockfile
|
2020-06-17 19:13:52 +00:00
|
|
|
return $ "PIDLOCK_" ++ filter legalInEnvVar abslockfile
|
2020-08-25 18:57:25 +00:00
|
|
|
|
|
|
|
pidLockEnvValue :: String
|
|
|
|
pidLockEnvValue = "1"
|