fix a annex.pidlock issue
That made eg git-annex get of an unlocked file hang until the annex.pidlocktimeout and then fail. This fix should be fully thread safe no matter what else git-annex is doing. Only using runsGitAnnexChildProcess in the one place it's known to be a problem. Could audit for all places where git-annex runs itself as a child and add it to all of them, later.
This commit is contained in:
parent
9583b267f5
commit
82448bdf39
10 changed files with 247 additions and 43 deletions
|
@ -20,6 +20,11 @@ import Git.Index
|
|||
import Git.Env
|
||||
import qualified Annex
|
||||
import qualified Annex.Queue
|
||||
import qualified Utility.LockFile.PidLock as PidF
|
||||
import qualified Utility.LockPool.PidLock as PidP
|
||||
import Utility.LockPool (dropLock)
|
||||
import Utility.Env
|
||||
import Annex.LockPool.PosixOrPid (pidLockFile)
|
||||
|
||||
{- Runs an action using a different git index file. -}
|
||||
withIndexFile :: AltIndexFile -> (FilePath -> Annex a) -> Annex a
|
||||
|
@ -125,3 +130,57 @@ withAltRepo modrepo unmodrepo a = do
|
|||
, Annex.repoqueue = Just q
|
||||
}
|
||||
either E.throw return v
|
||||
|
||||
{- Wrap around actions that may run a git-annex child process.
|
||||
-
|
||||
- When pid locking is in use, this tries to take the pid lock, and if
|
||||
- successful, holds it while running the child process. The action
|
||||
- is run with the Annex monad modified so git commands are run with
|
||||
- an env var set, which prevents child git annex processes from t
|
||||
- rying to take the pid lock themselves.
|
||||
-
|
||||
- This way, any locking the parent does will not get in the way of
|
||||
- the child. The child is assumed to not do any locking that conflicts
|
||||
- with the parent, but if it did happen to do that, it would be noticed
|
||||
- when git-annex is used without pid locking.
|
||||
-}
|
||||
runsGitAnnexChildProcess :: Annex a -> Annex a
|
||||
runsGitAnnexChildProcess a = pidLockFile >>= \case
|
||||
Nothing -> a
|
||||
Just pidlock -> bracket (setup pidlock) cleanup (go pidlock)
|
||||
where
|
||||
setup pidlock = liftIO $ PidP.tryLock pidlock
|
||||
|
||||
cleanup (Just h) = liftIO $ dropLock h
|
||||
cleanup Nothing = return ()
|
||||
|
||||
go _ Nothing = a
|
||||
go pidlock (Just _h) = do
|
||||
v <- liftIO $ PidF.pidLockEnv pidlock
|
||||
let addenv g = do
|
||||
g' <- liftIO $ addGitEnv g v "1"
|
||||
return (g', ())
|
||||
let rmenv oldg g
|
||||
| any (\(k, _) -> k == v) (fromMaybe [] (Git.gitEnv oldg)) = g
|
||||
| otherwise =
|
||||
let e' = case Git.gitEnv g of
|
||||
Just e -> Just (delEntry v e)
|
||||
Nothing -> Nothing
|
||||
in g { Git.gitEnv = e' }
|
||||
withAltRepo addenv rmenv (const a)
|
||||
|
||||
runsGitAnnexChildProcess' :: Git.Repo -> (Git.Repo -> IO a) -> Annex a
|
||||
runsGitAnnexChildProcess' r a = pidLockFile >>= \case
|
||||
Nothing -> liftIO $ a r
|
||||
Just pidlock -> liftIO $ bracket (setup pidlock) cleanup (go pidlock)
|
||||
where
|
||||
setup pidlock = PidP.tryLock pidlock
|
||||
|
||||
cleanup (Just h) = dropLock h
|
||||
cleanup Nothing = return ()
|
||||
|
||||
go _ Nothing = a r
|
||||
go pidlock (Just _h) = do
|
||||
v <- PidF.pidLockEnv pidlock
|
||||
r' <- addGitEnv r v "1"
|
||||
a r'
|
||||
|
|
|
@ -30,6 +30,7 @@ import Git.FilePath
|
|||
import Git.Config
|
||||
import Annex.HashObject
|
||||
import Annex.InodeSentinal
|
||||
import Annex.GitOverlay
|
||||
import Utility.FileMode
|
||||
import Utility.InodeCache
|
||||
import Utility.Tmp.Dir
|
||||
|
@ -204,23 +205,24 @@ restagePointerFile (Restage True) f orig = withTSDelta $ \tsd -> do
|
|||
go (Just _) = withTmpDirIn (fromRawFilePath $ Git.localGitDir r) "annexindex" $ \tmpdir -> do
|
||||
let tmpindex = tmpdir </> "index"
|
||||
let updatetmpindex = do
|
||||
r' <- Git.Env.addGitEnv r Git.Index.indexEnv
|
||||
r' <- liftIO $ Git.Env.addGitEnv r Git.Index.indexEnv
|
||||
=<< Git.Index.indexEnvVal tmpindex
|
||||
-- Avoid git warning about CRLF munging.
|
||||
let r'' = r' { gitGlobalOpts = gitGlobalOpts r' ++
|
||||
[ Param "-c"
|
||||
, Param $ "core.safecrlf=" ++ boolConfig False
|
||||
] }
|
||||
Git.UpdateIndex.refreshIndex r'' $ \feed ->
|
||||
forM_ l $ \(f', checkunmodified) ->
|
||||
whenM checkunmodified $
|
||||
feed f'
|
||||
runsGitAnnexChildProcess' r'' $ \r''' ->
|
||||
liftIO $ Git.UpdateIndex.refreshIndex r''' $ \feed ->
|
||||
forM_ l $ \(f', checkunmodified) ->
|
||||
whenM checkunmodified $
|
||||
feed f'
|
||||
let replaceindex = catchBoolIO $ do
|
||||
moveFile tmpindex realindex
|
||||
return True
|
||||
ok <- liftIO $ createLinkOrCopy realindex tmpindex
|
||||
ok <- liftIO (createLinkOrCopy realindex tmpindex)
|
||||
<&&> updatetmpindex
|
||||
<&&> replaceindex
|
||||
<&&> liftIO replaceindex
|
||||
unless ok showwarning
|
||||
bracket lockindex unlockindex go
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{- git-annex lock files.
|
||||
-
|
||||
- Copyright 2012-2019 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2012-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
|
|
@ -18,6 +18,7 @@ module Annex.LockPool.PosixOrPid (
|
|||
LockStatus(..),
|
||||
getLockStatus,
|
||||
checkSaneLock,
|
||||
pidLockFile,
|
||||
) where
|
||||
|
||||
import Common
|
||||
|
|
|
@ -56,6 +56,8 @@ git-annex (8.20200523) UNRELEASED; urgency=medium
|
|||
was initialized with gpg encryption, but without an explicit
|
||||
embedcreds=yes.
|
||||
(Regression introduced in version 7.20200202.7)
|
||||
* Fix a annex.pidlock issue that made eg git-annex get of an unlocked
|
||||
file hang until the annex.pidlocktimeout and then fail.
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Tue, 26 May 2020 10:20:52 -0400
|
||||
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
module Utility.Env.Set (
|
||||
setEnv,
|
||||
unsetEnv,
|
||||
legalInEnvVar,
|
||||
) where
|
||||
|
||||
#ifdef mingw32_HOST_OS
|
||||
|
@ -18,6 +19,7 @@ import Utility.Env
|
|||
#else
|
||||
import qualified System.Posix.Env as PE
|
||||
#endif
|
||||
import Data.Char
|
||||
|
||||
{- Sets an environment variable. To overwrite an existing variable,
|
||||
- overwrite must be True.
|
||||
|
@ -41,3 +43,7 @@ unsetEnv = PE.unsetEnv
|
|||
#else
|
||||
unsetEnv = System.SetEnv.unsetEnv
|
||||
#endif
|
||||
|
||||
legalInEnvVar :: Char -> Bool
|
||||
legalInEnvVar '_' = True
|
||||
legalInEnvVar c = isAsciiLower c || isAsciiUpper c || (isNumber c && isAscii c)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{- pid-based lock files
|
||||
-
|
||||
- Copyright 2015 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2015-2020 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- License: BSD-2-clause
|
||||
-}
|
||||
|
@ -14,6 +14,7 @@ module Utility.LockFile.PidLock (
|
|||
getLockStatus,
|
||||
checkLocked,
|
||||
checkSaneLock,
|
||||
pidLockEnv,
|
||||
) where
|
||||
|
||||
import Utility.PartialPrelude
|
||||
|
@ -27,10 +28,15 @@ import Utility.LockFile.LockStatus
|
|||
import Utility.ThreadScheduler
|
||||
import Utility.Hash
|
||||
import Utility.FileSystemEncoding
|
||||
import Utility.Env
|
||||
import Utility.Env.Set
|
||||
import qualified Utility.LockFile.Posix as Posix
|
||||
|
||||
import System.IO
|
||||
import System.Posix
|
||||
import System.Posix.IO
|
||||
import System.Posix.Types
|
||||
import System.Posix.Files
|
||||
import System.Posix.Process
|
||||
import Data.Maybe
|
||||
import Data.List
|
||||
import Network.BSD
|
||||
|
@ -40,7 +46,9 @@ import Prelude
|
|||
|
||||
type LockFile = FilePath
|
||||
|
||||
data LockHandle = LockHandle LockFile FileStatus SideLockHandle
|
||||
data LockHandle
|
||||
= LockHandle LockFile FileStatus SideLockHandle
|
||||
| ParentLocked
|
||||
|
||||
type SideLockHandle = Maybe (LockFile, Posix.LockHandle)
|
||||
|
||||
|
@ -115,40 +123,49 @@ sideLockFile lockfile = do
|
|||
-- However, if the lock file is on a networked file system, and was
|
||||
-- created on a different host than the current host (determined by hostname),
|
||||
-- this can't be done and stale locks may persist.
|
||||
--
|
||||
-- If a parent process is holding the lock, determined by a
|
||||
-- "PIDLOCK_lockfile" environment variable, does not block either.
|
||||
tryLock :: LockFile -> IO (Maybe LockHandle)
|
||||
tryLock lockfile = trySideLock lockfile $ \sidelock -> do
|
||||
lockfile' <- absPath lockfile
|
||||
(tmp, h) <- openTempFile (takeDirectory lockfile') "locktmp"
|
||||
setFileMode tmp (combineModes readModes)
|
||||
hPutStr h . show =<< mkPidLock
|
||||
hClose h
|
||||
let failedlock st = do
|
||||
dropLock $ LockHandle tmp st sidelock
|
||||
nukeFile tmp
|
||||
return Nothing
|
||||
let tooklock st = return $ Just $ LockHandle lockfile' st sidelock
|
||||
ifM (linkToLock sidelock tmp lockfile')
|
||||
( do
|
||||
tryLock lockfile = do
|
||||
abslockfile <- absPath lockfile
|
||||
lockenv <- pidLockEnv abslockfile
|
||||
getEnv lockenv >>= \case
|
||||
Nothing -> trySideLock lockfile (go abslockfile)
|
||||
_ -> return (Just ParentLocked)
|
||||
where
|
||||
go abslockfile sidelock = do
|
||||
(tmp, h) <- openTempFile (takeDirectory abslockfile) "locktmp"
|
||||
setFileMode tmp (combineModes readModes)
|
||||
hPutStr h . show =<< mkPidLock
|
||||
hClose h
|
||||
let failedlock st = do
|
||||
dropLock $ LockHandle tmp st sidelock
|
||||
nukeFile tmp
|
||||
-- May not have made a hard link, so stat
|
||||
-- the lockfile
|
||||
lckst <- getFileStatus lockfile'
|
||||
tooklock lckst
|
||||
, do
|
||||
v <- readPidLock lockfile'
|
||||
hn <- getHostName
|
||||
tmpst <- getFileStatus tmp
|
||||
case v of
|
||||
Just pl | isJust sidelock && hn == lockingHost pl -> do
|
||||
-- Since we have the sidelock,
|
||||
-- and are on the same host that
|
||||
-- the pidlock was taken on,
|
||||
-- we know that the pidlock is
|
||||
-- stale, and can take it over.
|
||||
rename tmp lockfile'
|
||||
tooklock tmpst
|
||||
_ -> failedlock tmpst
|
||||
)
|
||||
return Nothing
|
||||
let tooklock st = return $ Just $ LockHandle abslockfile st sidelock
|
||||
ifM (linkToLock sidelock tmp abslockfile)
|
||||
( do
|
||||
nukeFile tmp
|
||||
-- May not have made a hard link, so stat
|
||||
-- the lockfile
|
||||
lckst <- getFileStatus abslockfile
|
||||
tooklock lckst
|
||||
, do
|
||||
v <- readPidLock abslockfile
|
||||
hn <- getHostName
|
||||
tmpst <- getFileStatus tmp
|
||||
case v of
|
||||
Just pl | isJust sidelock && hn == lockingHost pl -> do
|
||||
-- Since we have the sidelock,
|
||||
-- and are on the same host that
|
||||
-- the pidlock was taken on,
|
||||
-- we know that the pidlock is
|
||||
-- stale, and can take it over.
|
||||
rename tmp abslockfile
|
||||
tooklock tmpst
|
||||
_ -> failedlock tmpst
|
||||
)
|
||||
|
||||
-- Linux's open(2) man page recommends linking a pid lock into place,
|
||||
-- as the most portable atomic operation that will fail if
|
||||
|
@ -242,6 +259,7 @@ dropLock (LockHandle lockfile _ sidelock) = do
|
|||
-- considered stale.
|
||||
dropSideLock sidelock
|
||||
nukeFile lockfile
|
||||
dropLock ParentLocked = return ()
|
||||
|
||||
getLockStatus :: LockFile -> IO LockStatus
|
||||
getLockStatus = maybe StatusUnLocked (StatusLockedBy . lockingPid) <$$> readPidLock
|
||||
|
@ -261,3 +279,17 @@ checkSaneLock lockfile (LockHandle _ st _) =
|
|||
go Nothing = return False
|
||||
go (Just st') = return $
|
||||
deviceID st == deviceID st' && fileID st == fileID st'
|
||||
checkSaneLock _ ParentLocked = return True
|
||||
|
||||
-- | A parent process that is using pid locking can set this to 1 before
|
||||
-- starting a child, to communicate to the child that it's holding the pid
|
||||
-- lock and that the child can skip trying to take it, and not block
|
||||
-- on the pid lock its parent is holding.
|
||||
--
|
||||
-- The parent process should keep running as long as the child
|
||||
-- process is running, since the child inherits the environment and will
|
||||
-- not see unsetLockEnv.
|
||||
pidLockEnv :: FilePath -> IO String
|
||||
pidLockEnv lockfile = do
|
||||
abslockfile <- absPath lockfile
|
||||
return $ "PIDLOCK_" ++ filter legalInEnvVar abslockfile
|
||||
|
|
|
@ -56,3 +56,5 @@ details of the setup are [in that PR](https://github.com/datalad/datalad-extensi
|
|||
PS determining the boundaries and names of the tests git annex had ran is a tricky business on its own -- I wondered if tests output formatting and annotation could have been improved as well. E.g. unlikely there is a point to print all output if test passes. With `nose` in Python / datalad we get a summary of all failed tests (and what was output when they were ran) at the end of the full sweep. That helps to avoid needing to search the entire long list
|
||||
|
||||
|
||||
> I've fixed two tests now, so [[done]]. (Also git-annex test succeeded
|
||||
> on vfat.) --[[Joey]]
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""comment 8"""
|
||||
date="2020-06-17T16:16:19Z"
|
||||
content="""
|
||||
Minimal test case for the hang:
|
||||
|
||||
git init a
|
||||
cd a
|
||||
git annex init
|
||||
git annex adjust --unlock
|
||||
git config annex.pidlock true
|
||||
date > foo
|
||||
git annex add --force
|
||||
git commit -m add
|
||||
cd ..
|
||||
git clone a b
|
||||
cd b
|
||||
git annex init
|
||||
git annex adjust --unlock
|
||||
git config annex.pidlock true
|
||||
git annex get foo --force
|
||||
|
||||
That does not need vfat to hang.
|
||||
|
||||
364479 pts/2 Sl+ 0:00 | \_ /home/joey/bin/git-annex get foo
|
||||
364504 pts/2 Sl+ 0:00 | \_ git --git-dir=.git --work-tree=. --literal-pathspecs -c core.safecrlf=false update-index -q --refresh -z --stdin
|
||||
364506 pts/2 S+ 0:00 | \_ /bin/sh -c git-annex smudge --clean -- 'foo' git-annex smudge --clean -- 'foo'
|
||||
364507 pts/2 Sl+ 0:00 | \_ git-annex smudge --clean -- foo
|
||||
|
||||
So is git-annex smudge waiting on the pidlock its parent has?
|
||||
|
||||
Yes: After setting annex.pidlocktimeout 2:
|
||||
|
||||
2 second timeout exceeded while waiting for pid lock file .git/annex/pidlock
|
||||
git-annex: Gave up waiting for possibly stale pid lock file .git/annex/pidlock
|
||||
error: external filter 'git-annex smudge --clean -- %f' failed 1
|
||||
|
||||
What I'm not sure about: annex.pidlock is not set by default on vfat,
|
||||
so why would the test suite have failed there, and intermittently?
|
||||
Maybe annex.pidlock does get set in some circumstances?
|
||||
|
||||
Anyway, there's a clear problem that annex.pidlock prevents more than 1 git-annex
|
||||
process that uses locking from running, and here we have a parent git-annex
|
||||
that necessarily runs a child git-annex that does locking.
|
||||
|
||||
Could the child process check if a parent/grandparent has the pid lock held
|
||||
and so safely skip taking it? Or do all places git-annex runs itself
|
||||
have to shut down pid locking?
|
||||
"""]]
|
|
@ -0,0 +1,50 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""comment 9"""
|
||||
date="2020-06-17T17:04:57Z"
|
||||
content="""
|
||||
Oh this is tricky.. git-annex is taking the gitAnnexGitQueueLock
|
||||
while running the queued git update-index command. Which is the command
|
||||
that then runs git-annex.
|
||||
|
||||
So dropping the pid lock before running the command won't work.
|
||||
|
||||
If pid locks were fine grained, this would not be a problem because the
|
||||
child process is really locking a different resource than its grandparent.
|
||||
|
||||
But, I think the reasons for not making them fine grained do make sense:
|
||||
Since git-annex sometimes takes a posix lock on a content file, it would
|
||||
need to use some other lock file for the pid lock. So every place that
|
||||
uses a lock file would have to specifiy the filename to use for pid
|
||||
locking. Which makes pid locking complicate the rest of the code base
|
||||
quite a lot, and every code path involving locking would have to be tested
|
||||
twice, in order to check that the pid lock file used by that lock works.
|
||||
Doubling the complexity of your file locking is a good thing to avoid.
|
||||
|
||||
Hmm.. I said "the child process is really locking a different resource than
|
||||
its grandparent". And that generally has to be the case, or people using
|
||||
git-annex w/o pid locking would notice that hey, these child processes
|
||||
fail to take a lock and crash.
|
||||
|
||||
So.. If we assume that is the case, and that there are plenty of git-annex
|
||||
users not using pid locking, then there's no need for a child process
|
||||
to take the pid lock, if its parent currently has the pid lock held,
|
||||
and will keep it held.
|
||||
|
||||
And this could be communicated via an env var. When the pid lock is taken
|
||||
set `ANNEX_PIDLOCKED`, and unset when it's dropped. Then, so long
|
||||
as childen inherit that env variable, they can skip taking the pid lock when
|
||||
it's set.
|
||||
|
||||
To make sure that's safe, any time git-annex runs a child process
|
||||
(or a git command that runs git-annex), it ought to hold the pid lock
|
||||
while doing it. Holding any lock will do. The risk is, if one thread
|
||||
has some lock held for whatever reason, and another thread runs the child
|
||||
process, then the child process will rely on the unrelated thread keeping
|
||||
the lock held. Explicitly holding some lock avoids such a scenario.
|
||||
|
||||
So, let's make it more explicit, add a runsGitAnnex that, when pid locking
|
||||
is enabled, holds the pid lock while running the action. Then that has to
|
||||
be wrapped around any places where a git-annex child process is run,
|
||||
which can be done broadly, or just as these issues come up.
|
||||
"""]]
|
Loading…
Add table
Add a link
Reference in a new issue