From 7db37ddde012dd1b44eface0fb29f6dc55bd85f1 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 25 May 2017 16:02:17 -0400 Subject: [PATCH] Fix transfer log file locking problem when running concurrent transfers. orElse is great, but was not the right thing to use here because waitTakeLock could retry for other reasons than the lock being held, which made tryTakeLock fail when it shouldn't. Instead, move the code to tryTakeLock and implement waitTakeLock using tryTakeLock and retry. (Also, in runTransfer, when checkSaneLock fails, dropLock to avoid leaking a lock handle.) This commit was supported by the NSF-funded DataLad project. --- Annex/Transfer.hs | 4 +- CHANGELOG | 2 + Utility/LockPool/STM.hs | 30 +++++++------- ..._6674e4dbc7437ce941bcef6272c3433b._comment | 40 +++++++++++++++++++ 4 files changed, 60 insertions(+), 16 deletions(-) create mode 100644 doc/bugs/parallel_get_can_fail_some_downloads_and_require_re-getting_/comment_3_6674e4dbc7437ce941bcef6272c3433b._comment diff --git a/Annex/Transfer.hs b/Annex/Transfer.hs index 87480b2f1d..3fcf1a1b97 100644 --- a/Annex/Transfer.hs +++ b/Annex/Transfer.hs @@ -118,7 +118,9 @@ runTransfer' ignorelock t afile shouldretry transferaction = checkSecureHashes t void $ liftIO $ tryIO $ writeTransferInfoFile info tfile return (Just lockhandle, False) - , return (Nothing, True) + , do + liftIO $ dropLock lockhandle + return (Nothing, True) ) #else prep tfile _mode info = catchPermissionDenied (const prepfailed) $ do diff --git a/CHANGELOG b/CHANGELOG index dd408a0b51..bf91667226 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,8 @@ git-annex (6.20170520) UNRELEASED; urgency=medium exclamation mark, which forces gpg to use a specific subkey. * Improve progress display when watching file size, in cases where a transfer does not resume. + * Fix transfer log file locking problem when running concurrent + transfers. -- Joey Hess Wed, 24 May 2017 14:03:40 -0400 diff --git a/Utility/LockPool/STM.hs b/Utility/LockPool/STM.hs index d1ee0dbaf1..5cb7b88340 100644 --- a/Utility/LockPool/STM.hs +++ b/Utility/LockPool/STM.hs @@ -49,9 +49,9 @@ type LockPool = TMVar (M.Map LockFile LockStatus) -- A shared global variable for the lockPool. Avoids callers needing to -- maintain state for this implementation detail. +{-# NOINLINE lockPool #-} lockPool :: LockPool lockPool = unsafePerformIO (newTMVarIO M.empty) -{-# NOINLINE lockPool #-} -- Updates the LockPool, blocking as necessary if another thread is holding -- a conflicting lock. @@ -62,23 +62,23 @@ lockPool = unsafePerformIO (newTMVarIO M.empty) -- Keeping the whole Map in a TMVar accomplishes this, at the expense of -- sometimes retrying after unrelated changes in the map. waitTakeLock :: LockPool -> LockFile -> LockMode -> STM LockHandle -waitTakeLock pool file mode = do - m <- takeTMVar pool - v <- case M.lookup file m of - Just (LockStatus mode' n closelockfile) - | mode == LockShared && mode' == LockShared -> - return $ LockStatus mode (succ n) closelockfile - | n > 0 -> retry -- wait for lock - _ -> return $ LockStatus mode 1 noop - putTMVar pool (M.insert file v m) - newTMVar (pool, file) +waitTakeLock pool file mode = maybe retry return =<< tryTakeLock pool file mode -- Avoids blocking if another thread is holding a conflicting lock. tryTakeLock :: LockPool -> LockFile -> LockMode -> STM (Maybe LockHandle) -tryTakeLock pool file mode = - (Just <$> waitTakeLock pool file mode) - `orElse` - return Nothing +tryTakeLock pool file mode = do + m <- takeTMVar pool + let success v = do + putTMVar pool (M.insert file v m) + Just <$> newTMVar (pool, file) + case M.lookup file m of + Just (LockStatus mode' n closelockfile) + | mode == LockShared && mode' == LockShared -> + success $ LockStatus mode (succ n) closelockfile + | n > 0 -> do + putTMVar pool m + return Nothing + _ -> success $ LockStatus mode 1 noop -- Call after waitTakeLock or tryTakeLock, to register a CloseLockFile -- action to run when releasing the lock. diff --git a/doc/bugs/parallel_get_can_fail_some_downloads_and_require_re-getting_/comment_3_6674e4dbc7437ce941bcef6272c3433b._comment b/doc/bugs/parallel_get_can_fail_some_downloads_and_require_re-getting_/comment_3_6674e4dbc7437ce941bcef6272c3433b._comment new file mode 100644 index 0000000000..a004b75b90 --- /dev/null +++ b/doc/bugs/parallel_get_can_fail_some_downloads_and_require_re-getting_/comment_3_6674e4dbc7437ce941bcef6272c3433b._comment @@ -0,0 +1,40 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 3""" + date="2017-05-25T19:08:59Z" + content=""" +That looks like concurrent `git config` setting remote.origin.annex-uuid +are failing. + +I have not reproduced the `.git/config` error, but with a local +clone of a repository, I have been able to reproduce some intermittent +"transfer already in progress, or unable to take transfer lock" failures +with `git annex get -J5`, happening after remote.origin.annex-uuid has been +cached. + +So, two distinct bugs I think.. + +--- + +Debugging, the lock it fails to take always seems to be the lock on +the remote side, which points to the local clone being involved somehow. + +Debugging further, Utility.LockPool.STM.tryTakeLock is what's failing. +That's supposed to only fail when another thread holds a conflicting lock, +but as it's implemented with `orElse`, if the main STM +transaction retries due to other STM activity on the same TVar, +it will give up when it shouldn't. + +That's probably why this is happening under heavier concurrency loads; +it makes that failure case much more likely. And with a local clone, +twice as much locking is done. + +I've fixed this part of it! + +--- + +The concurrent `git config` part remains. +Since git-annex can potentially have multiple threads doing different `git +config` for their own reasons concurrently, it seems it will need to add +its own locking around that. +"""]]