82448bdf39
That made eg git-annex get of an unlocked file hang until the annex.pidlocktimeout and then fail. This fix should be fully thread safe no matter what else git-annex is doing. Only using runsGitAnnexChildProcess in the one place it's known to be a problem. Could audit for all places where git-annex runs itself as a child and add it to all of them, later.
295 lines
9.3 KiB
Haskell
295 lines
9.3 KiB
Haskell
{- pid-based lock files
|
|
-
|
|
- Copyright 2015-2020 Joey Hess <id@joeyh.name>
|
|
-
|
|
- License: BSD-2-clause
|
|
-}
|
|
|
|
module Utility.LockFile.PidLock (
|
|
LockHandle,
|
|
tryLock,
|
|
waitLock,
|
|
dropLock,
|
|
LockStatus(..),
|
|
getLockStatus,
|
|
checkLocked,
|
|
checkSaneLock,
|
|
pidLockEnv,
|
|
) where
|
|
|
|
import Utility.PartialPrelude
|
|
import Utility.Exception
|
|
import Utility.Applicative
|
|
import Utility.Directory
|
|
import Utility.Monad
|
|
import Utility.Path
|
|
import Utility.FileMode
|
|
import Utility.LockFile.LockStatus
|
|
import Utility.ThreadScheduler
|
|
import Utility.Hash
|
|
import Utility.FileSystemEncoding
|
|
import Utility.Env
|
|
import Utility.Env.Set
|
|
import qualified Utility.LockFile.Posix as Posix
|
|
|
|
import System.IO
|
|
import System.Posix.IO
|
|
import System.Posix.Types
|
|
import System.Posix.Files
|
|
import System.Posix.Process
|
|
import Data.Maybe
|
|
import Data.List
|
|
import Network.BSD
|
|
import System.FilePath
|
|
import Control.Applicative
|
|
import Prelude
|
|
|
|
type LockFile = FilePath
|
|
|
|
data LockHandle
|
|
= LockHandle LockFile FileStatus SideLockHandle
|
|
| ParentLocked
|
|
|
|
type SideLockHandle = Maybe (LockFile, Posix.LockHandle)
|
|
|
|
data PidLock = PidLock
|
|
{ lockingPid :: ProcessID
|
|
, lockingHost :: HostName
|
|
}
|
|
deriving (Eq, Read, Show)
|
|
|
|
mkPidLock :: IO PidLock
|
|
mkPidLock = PidLock
|
|
<$> getProcessID
|
|
<*> getHostName
|
|
|
|
readPidLock :: LockFile -> IO (Maybe PidLock)
|
|
readPidLock lockfile = (readish =<<) <$> catchMaybeIO (readFile lockfile)
|
|
|
|
-- To avoid races when taking over a stale pid lock, a side lock is used.
|
|
-- This is a regular posix exclusive lock.
|
|
trySideLock :: LockFile -> (SideLockHandle -> IO a) -> IO a
|
|
trySideLock lockfile a = do
|
|
sidelock <- sideLockFile lockfile
|
|
mlck <- catchDefaultIO Nothing $
|
|
withUmask nullFileMode $
|
|
Posix.tryLockExclusive (Just mode) sidelock
|
|
-- Check the lock we just took, in case we opened a side lock file
|
|
-- belonging to another process that will have since deleted it.
|
|
case mlck of
|
|
Nothing -> a Nothing
|
|
Just lck -> ifM (Posix.checkSaneLock sidelock lck)
|
|
( a (Just (sidelock, lck))
|
|
, a Nothing
|
|
)
|
|
where
|
|
-- Let all users write to the lock file in /dev/shm or /tmp,
|
|
-- so that other users can reuse it to take the lock.
|
|
-- Since /dev/shm and /tmp are sticky dirs, a user cannot
|
|
-- delete another user's lock file there, so could not
|
|
-- delete a stale lock.
|
|
mode = combineModes (readModes ++ writeModes)
|
|
|
|
dropSideLock :: SideLockHandle -> IO ()
|
|
dropSideLock Nothing = return ()
|
|
dropSideLock (Just (f, h)) = do
|
|
-- Delete the file first, to ensure that any process that is trying
|
|
-- to take the side lock will only succeed once the file is
|
|
-- deleted, and so will be able to immediately see that it's taken
|
|
-- a stale lock.
|
|
_ <- tryIO $ removeFile f
|
|
Posix.dropLock h
|
|
|
|
-- The side lock is put in /dev/shm. This will work on most any
|
|
-- Linux system, even if its whole root filesystem doesn't support posix
|
|
-- locks. /tmp is used as a fallback.
|
|
sideLockFile :: LockFile -> IO LockFile
|
|
sideLockFile lockfile = do
|
|
f <- absPath lockfile
|
|
let base = intercalate "_" (splitDirectories (makeRelative "/" f))
|
|
let shortbase = reverse $ take 32 $ reverse base
|
|
let md5sum = if base == shortbase
|
|
then ""
|
|
else show (md5 (encodeBL base))
|
|
dir <- ifM (doesDirectoryExist "/dev/shm")
|
|
( return "/dev/shm"
|
|
, return "/tmp"
|
|
)
|
|
return $ dir </> md5sum ++ shortbase ++ ".lck"
|
|
|
|
-- | Tries to take a lock; does not block when the lock is already held.
|
|
--
|
|
-- Note that stale locks are automatically detected and broken.
|
|
-- However, if the lock file is on a networked file system, and was
|
|
-- created on a different host than the current host (determined by hostname),
|
|
-- this can't be done and stale locks may persist.
|
|
--
|
|
-- If a parent process is holding the lock, determined by a
|
|
-- "PIDLOCK_lockfile" environment variable, does not block either.
|
|
tryLock :: LockFile -> IO (Maybe LockHandle)
|
|
tryLock lockfile = do
|
|
abslockfile <- absPath lockfile
|
|
lockenv <- pidLockEnv abslockfile
|
|
getEnv lockenv >>= \case
|
|
Nothing -> trySideLock lockfile (go abslockfile)
|
|
_ -> return (Just ParentLocked)
|
|
where
|
|
go abslockfile sidelock = do
|
|
(tmp, h) <- openTempFile (takeDirectory abslockfile) "locktmp"
|
|
setFileMode tmp (combineModes readModes)
|
|
hPutStr h . show =<< mkPidLock
|
|
hClose h
|
|
let failedlock st = do
|
|
dropLock $ LockHandle tmp st sidelock
|
|
nukeFile tmp
|
|
return Nothing
|
|
let tooklock st = return $ Just $ LockHandle abslockfile st sidelock
|
|
ifM (linkToLock sidelock tmp abslockfile)
|
|
( do
|
|
nukeFile tmp
|
|
-- May not have made a hard link, so stat
|
|
-- the lockfile
|
|
lckst <- getFileStatus abslockfile
|
|
tooklock lckst
|
|
, do
|
|
v <- readPidLock abslockfile
|
|
hn <- getHostName
|
|
tmpst <- getFileStatus tmp
|
|
case v of
|
|
Just pl | isJust sidelock && hn == lockingHost pl -> do
|
|
-- Since we have the sidelock,
|
|
-- and are on the same host that
|
|
-- the pidlock was taken on,
|
|
-- we know that the pidlock is
|
|
-- stale, and can take it over.
|
|
rename tmp abslockfile
|
|
tooklock tmpst
|
|
_ -> failedlock tmpst
|
|
)
|
|
|
|
-- Linux's open(2) man page recommends linking a pid lock into place,
|
|
-- as the most portable atomic operation that will fail if
|
|
-- it already exists.
|
|
--
|
|
-- open(2) suggests that link can sometimes appear to fail
|
|
-- on NFS but have actually succeeded, and the way to find out is to stat
|
|
-- the file and check its link count etc.
|
|
--
|
|
-- However, not all filesystems support hard links. So, first probe
|
|
-- to see if they are supported. If not, use open with O_EXCL.
|
|
linkToLock :: SideLockHandle -> FilePath -> FilePath -> IO Bool
|
|
linkToLock Nothing _ _ = return False
|
|
linkToLock (Just _) src dest = do
|
|
let probe = src ++ ".lnk"
|
|
v <- tryIO $ createLink src probe
|
|
nukeFile probe
|
|
case v of
|
|
Right _ -> do
|
|
_ <- tryIO $ createLink src dest
|
|
ifM (catchBoolIO checklinked)
|
|
( catchBoolIO $ not <$> checkInsaneLustre dest
|
|
, return False
|
|
)
|
|
Left _ -> catchBoolIO $ do
|
|
let setup = do
|
|
fd <- openFd dest WriteOnly
|
|
(Just $ combineModes readModes)
|
|
(defaultFileFlags {exclusive = True})
|
|
fdToHandle fd
|
|
let cleanup = hClose
|
|
bracket setup cleanup (\h -> readFile src >>= hPutStr h)
|
|
return True
|
|
where
|
|
checklinked = do
|
|
x <- getSymbolicLinkStatus src
|
|
y <- getSymbolicLinkStatus dest
|
|
return $ and
|
|
[ deviceID x == deviceID y
|
|
, fileID x == fileID y
|
|
, fileMode x == fileMode y
|
|
, fileOwner x == fileOwner y
|
|
, fileGroup x == fileGroup y
|
|
, fileSize x == fileSize y
|
|
, modificationTime x == modificationTime y
|
|
, isRegularFile x == isRegularFile y
|
|
, linkCount x == linkCount y
|
|
, linkCount x == 2
|
|
]
|
|
|
|
-- On a Lustre filesystem, link has been observed to incorrectly *succeed*,
|
|
-- despite the dest already existing. A subsequent stat of the dest
|
|
-- looked like it had been replaced with the src. The process proceeded to
|
|
-- run and then deleted the dest, and after the process was done, the
|
|
-- original file was observed to still be in place.
|
|
--
|
|
-- We can detect this insanity by getting the directory contents after
|
|
-- making the link, and checking to see if 2 copies of the dest file,
|
|
-- with the SAME FILENAME exist.
|
|
checkInsaneLustre :: FilePath -> IO Bool
|
|
checkInsaneLustre dest = do
|
|
fs <- dirContents (takeDirectory dest)
|
|
case length (filter (== dest) fs) of
|
|
1 -> return False -- whew!
|
|
0 -> return True -- wtf?
|
|
_ -> do
|
|
-- Try to clean up the extra copy we made
|
|
-- that has the same name. Egads.
|
|
_ <- tryIO $ removeFile dest
|
|
return True
|
|
|
|
-- | Waits as necessary to take a lock.
|
|
--
|
|
-- Uses a 1 second wait-loop.
|
|
--
|
|
-- May wait until timeout if the lock file is stale and is on a network file
|
|
-- system, or on a system where the side lock cannot be taken.
|
|
waitLock :: Seconds -> LockFile -> IO LockHandle
|
|
waitLock (Seconds timeout) lockfile = go timeout
|
|
where
|
|
go n
|
|
| n > 0 = maybe (threadDelaySeconds (Seconds 1) >> go (pred n)) return
|
|
=<< tryLock lockfile
|
|
| otherwise = do
|
|
hPutStrLn stderr $ show timeout ++ " second timeout exceeded while waiting for pid lock file " ++ lockfile
|
|
giveup $ "Gave up waiting for possibly stale pid lock file " ++ lockfile
|
|
|
|
dropLock :: LockHandle -> IO ()
|
|
dropLock (LockHandle lockfile _ sidelock) = do
|
|
-- Drop side lock first, at which point the pid lock will be
|
|
-- considered stale.
|
|
dropSideLock sidelock
|
|
nukeFile lockfile
|
|
dropLock ParentLocked = return ()
|
|
|
|
getLockStatus :: LockFile -> IO LockStatus
|
|
getLockStatus = maybe StatusUnLocked (StatusLockedBy . lockingPid) <$$> readPidLock
|
|
|
|
checkLocked :: LockFile -> IO (Maybe Bool)
|
|
checkLocked lockfile = conv <$> getLockStatus lockfile
|
|
where
|
|
conv (StatusLockedBy _) = Just True
|
|
conv _ = Just False
|
|
|
|
-- Checks that the lock file still exists, and is the same file that was
|
|
-- locked to get the LockHandle.
|
|
checkSaneLock :: LockFile -> LockHandle -> IO Bool
|
|
checkSaneLock lockfile (LockHandle _ st _) =
|
|
go =<< catchMaybeIO (getFileStatus lockfile)
|
|
where
|
|
go Nothing = return False
|
|
go (Just st') = return $
|
|
deviceID st == deviceID st' && fileID st == fileID st'
|
|
checkSaneLock _ ParentLocked = return True
|
|
|
|
-- | A parent process that is using pid locking can set this to 1 before
|
|
-- starting a child, to communicate to the child that it's holding the pid
|
|
-- lock and that the child can skip trying to take it, and not block
|
|
-- on the pid lock its parent is holding.
|
|
--
|
|
-- The parent process should keep running as long as the child
|
|
-- process is running, since the child inherits the environment and will
|
|
-- not see unsetLockEnv.
|
|
pidLockEnv :: FilePath -> IO String
|
|
pidLockEnv lockfile = do
|
|
abslockfile <- absPath lockfile
|
|
return $ "PIDLOCK_" ++ filter legalInEnvVar abslockfile
|