From 82448bdf39bbe2b4fb6e0bac0735b845d52e189a Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 17 Jun 2020 15:13:52 -0400 Subject: [PATCH] fix a annex.pidlock issue That made eg git-annex get of an unlocked file hang until the annex.pidlocktimeout and then fail. This fix should be fully thread safe no matter what else git-annex is doing. Only using runsGitAnnexChildProcess in the one place it's known to be a problem. Could audit for all places where git-annex runs itself as a child and add it to all of them, later. --- Annex/GitOverlay.hs | 59 ++++++++++ Annex/Link.hs | 16 +-- Annex/LockFile.hs | 2 +- Annex/LockPool/PosixOrPid.hs | 1 + CHANGELOG | 2 + Utility/Env/Set.hs | 6 ++ Utility/LockFile/PidLock.hs | 102 ++++++++++++------ ...test_FAILs_when_HOME_is_a_crippled_fs.mdwn | 2 + ..._1be9990d0cab0e69936de356072ea890._comment | 50 +++++++++ ..._cf253f0ad3f9857b9f0746e678d8dbd8._comment | 50 +++++++++ 10 files changed, 247 insertions(+), 43 deletions(-) create mode 100644 doc/bugs/one_annex_test_FAILs_when_HOME_is_a_crippled_fs/comment_8_1be9990d0cab0e69936de356072ea890._comment create mode 100644 doc/bugs/one_annex_test_FAILs_when_HOME_is_a_crippled_fs/comment_9_cf253f0ad3f9857b9f0746e678d8dbd8._comment diff --git a/Annex/GitOverlay.hs b/Annex/GitOverlay.hs index 6597ac1900..7370116d78 100644 --- a/Annex/GitOverlay.hs +++ b/Annex/GitOverlay.hs @@ -20,6 +20,11 @@ import Git.Index import Git.Env import qualified Annex import qualified Annex.Queue +import qualified Utility.LockFile.PidLock as PidF +import qualified Utility.LockPool.PidLock as PidP +import Utility.LockPool (dropLock) +import Utility.Env +import Annex.LockPool.PosixOrPid (pidLockFile) {- Runs an action using a different git index file. -} withIndexFile :: AltIndexFile -> (FilePath -> Annex a) -> Annex a @@ -125,3 +130,57 @@ withAltRepo modrepo unmodrepo a = do , Annex.repoqueue = Just q } either E.throw return v + +{- Wrap around actions that may run a git-annex child process. + - + - When pid locking is in use, this tries to take the pid lock, and if + - successful, holds it while running the child process. The action + - is run with the Annex monad modified so git commands are run with + - an env var set, which prevents child git annex processes from t + - rying to take the pid lock themselves. + - + - This way, any locking the parent does will not get in the way of + - the child. The child is assumed to not do any locking that conflicts + - with the parent, but if it did happen to do that, it would be noticed + - when git-annex is used without pid locking. + -} +runsGitAnnexChildProcess :: Annex a -> Annex a +runsGitAnnexChildProcess a = pidLockFile >>= \case + Nothing -> a + Just pidlock -> bracket (setup pidlock) cleanup (go pidlock) + where + setup pidlock = liftIO $ PidP.tryLock pidlock + + cleanup (Just h) = liftIO $ dropLock h + cleanup Nothing = return () + + go _ Nothing = a + go pidlock (Just _h) = do + v <- liftIO $ PidF.pidLockEnv pidlock + let addenv g = do + g' <- liftIO $ addGitEnv g v "1" + return (g', ()) + let rmenv oldg g + | any (\(k, _) -> k == v) (fromMaybe [] (Git.gitEnv oldg)) = g + | otherwise = + let e' = case Git.gitEnv g of + Just e -> Just (delEntry v e) + Nothing -> Nothing + in g { Git.gitEnv = e' } + withAltRepo addenv rmenv (const a) + +runsGitAnnexChildProcess' :: Git.Repo -> (Git.Repo -> IO a) -> Annex a +runsGitAnnexChildProcess' r a = pidLockFile >>= \case + Nothing -> liftIO $ a r + Just pidlock -> liftIO $ bracket (setup pidlock) cleanup (go pidlock) + where + setup pidlock = PidP.tryLock pidlock + + cleanup (Just h) = dropLock h + cleanup Nothing = return () + + go _ Nothing = a r + go pidlock (Just _h) = do + v <- PidF.pidLockEnv pidlock + r' <- addGitEnv r v "1" + a r' diff --git a/Annex/Link.hs b/Annex/Link.hs index faed59f192..59f79e838b 100644 --- a/Annex/Link.hs +++ b/Annex/Link.hs @@ -30,6 +30,7 @@ import Git.FilePath import Git.Config import Annex.HashObject import Annex.InodeSentinal +import Annex.GitOverlay import Utility.FileMode import Utility.InodeCache import Utility.Tmp.Dir @@ -204,23 +205,24 @@ restagePointerFile (Restage True) f orig = withTSDelta $ \tsd -> do go (Just _) = withTmpDirIn (fromRawFilePath $ Git.localGitDir r) "annexindex" $ \tmpdir -> do let tmpindex = tmpdir "index" let updatetmpindex = do - r' <- Git.Env.addGitEnv r Git.Index.indexEnv + r' <- liftIO $ Git.Env.addGitEnv r Git.Index.indexEnv =<< Git.Index.indexEnvVal tmpindex -- Avoid git warning about CRLF munging. let r'' = r' { gitGlobalOpts = gitGlobalOpts r' ++ [ Param "-c" , Param $ "core.safecrlf=" ++ boolConfig False ] } - Git.UpdateIndex.refreshIndex r'' $ \feed -> - forM_ l $ \(f', checkunmodified) -> - whenM checkunmodified $ - feed f' + runsGitAnnexChildProcess' r'' $ \r''' -> + liftIO $ Git.UpdateIndex.refreshIndex r''' $ \feed -> + forM_ l $ \(f', checkunmodified) -> + whenM checkunmodified $ + feed f' let replaceindex = catchBoolIO $ do moveFile tmpindex realindex return True - ok <- liftIO $ createLinkOrCopy realindex tmpindex + ok <- liftIO (createLinkOrCopy realindex tmpindex) <&&> updatetmpindex - <&&> replaceindex + <&&> liftIO replaceindex unless ok showwarning bracket lockindex unlockindex go diff --git a/Annex/LockFile.hs b/Annex/LockFile.hs index 45a2c39913..3ad3e709c8 100644 --- a/Annex/LockFile.hs +++ b/Annex/LockFile.hs @@ -1,6 +1,6 @@ {- git-annex lock files. - - - Copyright 2012-2019 Joey Hess + - Copyright 2012-2020 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} diff --git a/Annex/LockPool/PosixOrPid.hs b/Annex/LockPool/PosixOrPid.hs index 6230544117..87e88c4a5d 100644 --- a/Annex/LockPool/PosixOrPid.hs +++ b/Annex/LockPool/PosixOrPid.hs @@ -18,6 +18,7 @@ module Annex.LockPool.PosixOrPid ( LockStatus(..), getLockStatus, checkSaneLock, + pidLockFile, ) where import Common diff --git a/CHANGELOG b/CHANGELOG index 10500ccf2a..bd50fff7a3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -56,6 +56,8 @@ git-annex (8.20200523) UNRELEASED; urgency=medium was initialized with gpg encryption, but without an explicit embedcreds=yes. (Regression introduced in version 7.20200202.7) + * Fix a annex.pidlock issue that made eg git-annex get of an unlocked + file hang until the annex.pidlocktimeout and then fail. -- Joey Hess Tue, 26 May 2020 10:20:52 -0400 diff --git a/Utility/Env/Set.hs b/Utility/Env/Set.hs index f14674ca67..45d2e7f9a7 100644 --- a/Utility/Env/Set.hs +++ b/Utility/Env/Set.hs @@ -10,6 +10,7 @@ module Utility.Env.Set ( setEnv, unsetEnv, + legalInEnvVar, ) where #ifdef mingw32_HOST_OS @@ -18,6 +19,7 @@ import Utility.Env #else import qualified System.Posix.Env as PE #endif +import Data.Char {- Sets an environment variable. To overwrite an existing variable, - overwrite must be True. @@ -41,3 +43,7 @@ unsetEnv = PE.unsetEnv #else unsetEnv = System.SetEnv.unsetEnv #endif + +legalInEnvVar :: Char -> Bool +legalInEnvVar '_' = True +legalInEnvVar c = isAsciiLower c || isAsciiUpper c || (isNumber c && isAscii c) diff --git a/Utility/LockFile/PidLock.hs b/Utility/LockFile/PidLock.hs index c08ae06cb0..ae6bcc6a5d 100644 --- a/Utility/LockFile/PidLock.hs +++ b/Utility/LockFile/PidLock.hs @@ -1,6 +1,6 @@ {- pid-based lock files - - - Copyright 2015 Joey Hess + - Copyright 2015-2020 Joey Hess - - License: BSD-2-clause -} @@ -14,6 +14,7 @@ module Utility.LockFile.PidLock ( getLockStatus, checkLocked, checkSaneLock, + pidLockEnv, ) where import Utility.PartialPrelude @@ -27,10 +28,15 @@ import Utility.LockFile.LockStatus import Utility.ThreadScheduler import Utility.Hash import Utility.FileSystemEncoding +import Utility.Env +import Utility.Env.Set import qualified Utility.LockFile.Posix as Posix import System.IO -import System.Posix +import System.Posix.IO +import System.Posix.Types +import System.Posix.Files +import System.Posix.Process import Data.Maybe import Data.List import Network.BSD @@ -40,7 +46,9 @@ import Prelude type LockFile = FilePath -data LockHandle = LockHandle LockFile FileStatus SideLockHandle +data LockHandle + = LockHandle LockFile FileStatus SideLockHandle + | ParentLocked type SideLockHandle = Maybe (LockFile, Posix.LockHandle) @@ -115,40 +123,49 @@ sideLockFile lockfile = do -- However, if the lock file is on a networked file system, and was -- created on a different host than the current host (determined by hostname), -- this can't be done and stale locks may persist. +-- +-- If a parent process is holding the lock, determined by a +-- "PIDLOCK_lockfile" environment variable, does not block either. tryLock :: LockFile -> IO (Maybe LockHandle) -tryLock lockfile = trySideLock lockfile $ \sidelock -> do - lockfile' <- absPath lockfile - (tmp, h) <- openTempFile (takeDirectory lockfile') "locktmp" - setFileMode tmp (combineModes readModes) - hPutStr h . show =<< mkPidLock - hClose h - let failedlock st = do - dropLock $ LockHandle tmp st sidelock - nukeFile tmp - return Nothing - let tooklock st = return $ Just $ LockHandle lockfile' st sidelock - ifM (linkToLock sidelock tmp lockfile') - ( do +tryLock lockfile = do + abslockfile <- absPath lockfile + lockenv <- pidLockEnv abslockfile + getEnv lockenv >>= \case + Nothing -> trySideLock lockfile (go abslockfile) + _ -> return (Just ParentLocked) + where + go abslockfile sidelock = do + (tmp, h) <- openTempFile (takeDirectory abslockfile) "locktmp" + setFileMode tmp (combineModes readModes) + hPutStr h . show =<< mkPidLock + hClose h + let failedlock st = do + dropLock $ LockHandle tmp st sidelock nukeFile tmp - -- May not have made a hard link, so stat - -- the lockfile - lckst <- getFileStatus lockfile' - tooklock lckst - , do - v <- readPidLock lockfile' - hn <- getHostName - tmpst <- getFileStatus tmp - case v of - Just pl | isJust sidelock && hn == lockingHost pl -> do - -- Since we have the sidelock, - -- and are on the same host that - -- the pidlock was taken on, - -- we know that the pidlock is - -- stale, and can take it over. - rename tmp lockfile' - tooklock tmpst - _ -> failedlock tmpst - ) + return Nothing + let tooklock st = return $ Just $ LockHandle abslockfile st sidelock + ifM (linkToLock sidelock tmp abslockfile) + ( do + nukeFile tmp + -- May not have made a hard link, so stat + -- the lockfile + lckst <- getFileStatus abslockfile + tooklock lckst + , do + v <- readPidLock abslockfile + hn <- getHostName + tmpst <- getFileStatus tmp + case v of + Just pl | isJust sidelock && hn == lockingHost pl -> do + -- Since we have the sidelock, + -- and are on the same host that + -- the pidlock was taken on, + -- we know that the pidlock is + -- stale, and can take it over. + rename tmp abslockfile + tooklock tmpst + _ -> failedlock tmpst + ) -- Linux's open(2) man page recommends linking a pid lock into place, -- as the most portable atomic operation that will fail if @@ -242,6 +259,7 @@ dropLock (LockHandle lockfile _ sidelock) = do -- considered stale. dropSideLock sidelock nukeFile lockfile +dropLock ParentLocked = return () getLockStatus :: LockFile -> IO LockStatus getLockStatus = maybe StatusUnLocked (StatusLockedBy . lockingPid) <$$> readPidLock @@ -261,3 +279,17 @@ checkSaneLock lockfile (LockHandle _ st _) = go Nothing = return False go (Just st') = return $ deviceID st == deviceID st' && fileID st == fileID st' +checkSaneLock _ ParentLocked = return True + +-- | A parent process that is using pid locking can set this to 1 before +-- starting a child, to communicate to the child that it's holding the pid +-- lock and that the child can skip trying to take it, and not block +-- on the pid lock its parent is holding. +-- +-- The parent process should keep running as long as the child +-- process is running, since the child inherits the environment and will +-- not see unsetLockEnv. +pidLockEnv :: FilePath -> IO String +pidLockEnv lockfile = do + abslockfile <- absPath lockfile + return $ "PIDLOCK_" ++ filter legalInEnvVar abslockfile diff --git a/doc/bugs/one_annex_test_FAILs_when_HOME_is_a_crippled_fs.mdwn b/doc/bugs/one_annex_test_FAILs_when_HOME_is_a_crippled_fs.mdwn index 388ea9f5e6..83c82435bf 100644 --- a/doc/bugs/one_annex_test_FAILs_when_HOME_is_a_crippled_fs.mdwn +++ b/doc/bugs/one_annex_test_FAILs_when_HOME_is_a_crippled_fs.mdwn @@ -56,3 +56,5 @@ details of the setup are [in that PR](https://github.com/datalad/datalad-extensi PS determining the boundaries and names of the tests git annex had ran is a tricky business on its own -- I wondered if tests output formatting and annotation could have been improved as well. E.g. unlikely there is a point to print all output if test passes. With `nose` in Python / datalad we get a summary of all failed tests (and what was output when they were ran) at the end of the full sweep. That helps to avoid needing to search the entire long list +> I've fixed two tests now, so [[done]]. (Also git-annex test succeeded +> on vfat.) --[[Joey]] diff --git a/doc/bugs/one_annex_test_FAILs_when_HOME_is_a_crippled_fs/comment_8_1be9990d0cab0e69936de356072ea890._comment b/doc/bugs/one_annex_test_FAILs_when_HOME_is_a_crippled_fs/comment_8_1be9990d0cab0e69936de356072ea890._comment new file mode 100644 index 0000000000..65ddb75f24 --- /dev/null +++ b/doc/bugs/one_annex_test_FAILs_when_HOME_is_a_crippled_fs/comment_8_1be9990d0cab0e69936de356072ea890._comment @@ -0,0 +1,50 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 8""" + date="2020-06-17T16:16:19Z" + content=""" +Minimal test case for the hang: + + git init a + cd a + git annex init + git annex adjust --unlock + git config annex.pidlock true + date > foo + git annex add --force + git commit -m add + cd .. + git clone a b + cd b + git annex init + git annex adjust --unlock + git config annex.pidlock true + git annex get foo --force + +That does not need vfat to hang. + + 364479 pts/2 Sl+ 0:00 | \_ /home/joey/bin/git-annex get foo + 364504 pts/2 Sl+ 0:00 | \_ git --git-dir=.git --work-tree=. --literal-pathspecs -c core.safecrlf=false update-index -q --refresh -z --stdin + 364506 pts/2 S+ 0:00 | \_ /bin/sh -c git-annex smudge --clean -- 'foo' git-annex smudge --clean -- 'foo' + 364507 pts/2 Sl+ 0:00 | \_ git-annex smudge --clean -- foo + +So is git-annex smudge waiting on the pidlock its parent has? + +Yes: After setting annex.pidlocktimeout 2: + + 2 second timeout exceeded while waiting for pid lock file .git/annex/pidlock + git-annex: Gave up waiting for possibly stale pid lock file .git/annex/pidlock + error: external filter 'git-annex smudge --clean -- %f' failed 1 + +What I'm not sure about: annex.pidlock is not set by default on vfat, +so why would the test suite have failed there, and intermittently? +Maybe annex.pidlock does get set in some circumstances? + +Anyway, there's a clear problem that annex.pidlock prevents more than 1 git-annex +process that uses locking from running, and here we have a parent git-annex +that necessarily runs a child git-annex that does locking. + +Could the child process check if a parent/grandparent has the pid lock held +and so safely skip taking it? Or do all places git-annex runs itself +have to shut down pid locking? +"""]] diff --git a/doc/bugs/one_annex_test_FAILs_when_HOME_is_a_crippled_fs/comment_9_cf253f0ad3f9857b9f0746e678d8dbd8._comment b/doc/bugs/one_annex_test_FAILs_when_HOME_is_a_crippled_fs/comment_9_cf253f0ad3f9857b9f0746e678d8dbd8._comment new file mode 100644 index 0000000000..f29b477fc7 --- /dev/null +++ b/doc/bugs/one_annex_test_FAILs_when_HOME_is_a_crippled_fs/comment_9_cf253f0ad3f9857b9f0746e678d8dbd8._comment @@ -0,0 +1,50 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 9""" + date="2020-06-17T17:04:57Z" + content=""" +Oh this is tricky.. git-annex is taking the gitAnnexGitQueueLock +while running the queued git update-index command. Which is the command +that then runs git-annex. + +So dropping the pid lock before running the command won't work. + +If pid locks were fine grained, this would not be a problem because the +child process is really locking a different resource than its grandparent. + +But, I think the reasons for not making them fine grained do make sense: +Since git-annex sometimes takes a posix lock on a content file, it would +need to use some other lock file for the pid lock. So every place that +uses a lock file would have to specifiy the filename to use for pid +locking. Which makes pid locking complicate the rest of the code base +quite a lot, and every code path involving locking would have to be tested +twice, in order to check that the pid lock file used by that lock works. +Doubling the complexity of your file locking is a good thing to avoid. + +Hmm.. I said "the child process is really locking a different resource than +its grandparent". And that generally has to be the case, or people using +git-annex w/o pid locking would notice that hey, these child processes +fail to take a lock and crash. + +So.. If we assume that is the case, and that there are plenty of git-annex +users not using pid locking, then there's no need for a child process +to take the pid lock, if its parent currently has the pid lock held, +and will keep it held. + +And this could be communicated via an env var. When the pid lock is taken +set `ANNEX_PIDLOCKED`, and unset when it's dropped. Then, so long +as childen inherit that env variable, they can skip taking the pid lock when +it's set. + +To make sure that's safe, any time git-annex runs a child process +(or a git command that runs git-annex), it ought to hold the pid lock +while doing it. Holding any lock will do. The risk is, if one thread +has some lock held for whatever reason, and another thread runs the child +process, then the child process will rely on the unrelated thread keeping +the lock held. Explicitly holding some lock avoids such a scenario. + +So, let's make it more explicit, add a runsGitAnnex that, when pid locking +is enabled, holds the pid lock while running the action. Then that has to +be wrapped around any places where a git-annex child process is run, +which can be done broadly, or just as these issues come up. +"""]]