pve-kernel-thunderx/debian/patches/zfs/0012-Fix-zvol_open-lock-inversion.patch
Fabian Mastenbroek b8de2740ea
Update ZFS patches
This change adds new patches from Proxmox for the ZFS project.
2022-02-04 11:46:40 +01:00

212 lines
5.8 KiB
Diff

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 17 Dec 2021 09:52:13 -0800
Subject: [PATCH] Fix zvol_open() lock inversion
When restructuring the zvol_open() logic for the Linux 5.13 kernel
a lock inversion was accidentally introduced. In the updated code
the spa_namespace_lock is now taken before the zv_suspend_lock
allowing the following scenario to occur:
down_read <=== waiting for zv_suspend_lock
zvol_open <=== holds spa_namespace_lock
__blkdev_get
blkdev_get_by_dev
blkdev_open
...
mutex_lock <== waiting for spa_namespace_lock
spa_open_common
spa_open
dsl_pool_hold
dmu_objset_hold_flags
dmu_objset_hold
dsl_prop_get
dsl_prop_get_integer
zvol_create_minor
dmu_recv_end
zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend()
zfs_ioc_recv
...
This commit resolves the issue by moving the acquisition of the
spa_namespace_lock back to after the zv_suspend_lock which restores
the original ordering.
Additionally, as part of this change the error exit paths were
simplified where possible.
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #12863
(cherry picked from commit 8a02d01e85556bbe3a1c6947bc11b8ef028d4023)
Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
module/os/linux/zfs/zvol_os.c | 121 ++++++++++++++++------------------
1 file changed, 58 insertions(+), 63 deletions(-)
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 44caadd58..69479b3f7 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
{
zvol_state_t *zv;
int error = 0;
- boolean_t drop_suspend = B_TRUE;
- boolean_t drop_namespace = B_FALSE;
+ boolean_t drop_suspend = B_FALSE;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
hrtime_t start = gethrtime();
@@ -517,7 +516,36 @@ retry:
return (SET_ERROR(-ENXIO));
}
- if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+ mutex_enter(&zv->zv_state_lock);
+ /*
+ * Make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ } else {
+ drop_suspend = B_TRUE;
+ }
+ } else {
+ drop_suspend = B_TRUE;
+ }
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ if (zv->zv_open_count == 0) {
+ boolean_t drop_namespace = B_FALSE;
+
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+
/*
* In all other call paths the spa_namespace_lock is taken
* before the bdev->bd_mutex lock. However, on open(2)
@@ -542,84 +570,51 @@ retry:
* the kernel so the only option is to return the error for
* the caller to handle it.
*/
- if (!mutex_tryenter(&spa_namespace_lock)) {
- rw_exit(&zvol_state_lock);
+ if (!mutex_owned(&spa_namespace_lock)) {
+ if (!mutex_tryenter(&spa_namespace_lock)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
- schedule();
- return (SET_ERROR(-ERESTARTSYS));
-#else
- if ((gethrtime() - start) > timeout)
+ schedule();
return (SET_ERROR(-ERESTARTSYS));
+#else
+ if ((gethrtime() - start) > timeout)
+ return (SET_ERROR(-ERESTARTSYS));
- schedule_timeout(MSEC_TO_TICK(10));
- goto retry;
+ schedule_timeout(MSEC_TO_TICK(10));
+ goto retry;
#endif
- } else {
- drop_namespace = B_TRUE;
- }
- }
-
- mutex_enter(&zv->zv_state_lock);
- /*
- * make sure zvol is not suspended during first open
- * (hold zv_suspend_lock) and respect proper lock acquisition
- * ordering - zv_suspend_lock before zv_state_lock
- */
- if (zv->zv_open_count == 0) {
- if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
- mutex_exit(&zv->zv_state_lock);
- rw_enter(&zv->zv_suspend_lock, RW_READER);
- mutex_enter(&zv->zv_state_lock);
- /* check to see if zv_suspend_lock is needed */
- if (zv->zv_open_count != 0) {
- rw_exit(&zv->zv_suspend_lock);
- drop_suspend = B_FALSE;
+ } else {
+ drop_namespace = B_TRUE;
}
}
- } else {
- drop_suspend = B_FALSE;
- }
- rw_exit(&zvol_state_lock);
-
- ASSERT(MUTEX_HELD(&zv->zv_state_lock));
- if (zv->zv_open_count == 0) {
- ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
- if (error)
- goto out_mutex;
- }
- if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
- error = -EROFS;
- goto out_open_count;
+ if (drop_namespace)
+ mutex_exit(&spa_namespace_lock);
}
- zv->zv_open_count++;
-
- mutex_exit(&zv->zv_state_lock);
- if (drop_namespace)
- mutex_exit(&spa_namespace_lock);
- if (drop_suspend)
- rw_exit(&zv->zv_suspend_lock);
-
- zfs_check_media_change(bdev);
-
- return (0);
+ if (error == 0) {
+ if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ if (zv->zv_open_count == 0)
+ zvol_last_close(zv);
-out_open_count:
- if (zv->zv_open_count == 0)
- zvol_last_close(zv);
+ error = SET_ERROR(-EROFS);
+ } else {
+ zv->zv_open_count++;
+ }
+ }
-out_mutex:
mutex_exit(&zv->zv_state_lock);
- if (drop_namespace)
- mutex_exit(&spa_namespace_lock);
if (drop_suspend)
rw_exit(&zv->zv_suspend_lock);
- return (SET_ERROR(error));
+ if (error == 0)
+ zfs_check_media_change(bdev);
+
+ return (error);
}
static void