From a6f2f0fdc73aacc6e10ae48ae78634dba26702d4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 Jun 2020 14:00:20 +0300 Subject: [PATCH 001/497] soc: xilinx: Fix error code in zynqmp_pm_probe() This should be returning PTR_ERR() but it returns IS_ERR() instead. Fixes: ffdbae28d9d1 ("drivers: soc: xilinx: Use mailbox IPI callback") Signed-off-by: Dan Carpenter Reviewed-by: Michal Simek Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/20200605110020.GA978434@mwanda --- drivers/soc/xilinx/zynqmp_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/xilinx/zynqmp_power.c b/drivers/soc/xilinx/zynqmp_power.c index 31ff49fcd078..c556623dae02 100644 --- a/drivers/soc/xilinx/zynqmp_power.c +++ b/drivers/soc/xilinx/zynqmp_power.c @@ -205,7 +205,7 @@ static int zynqmp_pm_probe(struct platform_device *pdev) rx_chan = mbox_request_channel_byname(client, "rx"); if (IS_ERR(rx_chan)) { dev_err(&pdev->dev, "Failed to request rx channel\n"); - return IS_ERR(rx_chan); + return PTR_ERR(rx_chan); } } else if (of_find_property(pdev->dev.of_node, "interrupts", NULL)) { irq = platform_get_irq(pdev, 0); From d1876f3596a57b114ae8615e28a935d996ac5464 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 10 Aug 2020 11:21:15 +0100 Subject: [PATCH 002/497] cpupowerutils: fix spelling mistake "dependant" -> "dependent" There is a spelling mistake in a message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Shuah Khan --- tools/power/cpupower/debug/i386/intel_gsic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/cpupower/debug/i386/intel_gsic.c b/tools/power/cpupower/debug/i386/intel_gsic.c index e5e926f46d6b..befd837f07f8 100644 --- a/tools/power/cpupower/debug/i386/intel_gsic.c +++ b/tools/power/cpupower/debug/i386/intel_gsic.c @@ -71,7 +71,7 @@ int main (void) printf("\tsmi_cmd=0x?? smi_port=0x?? smi_sig=1\n"); printf("\nUnfortunately, you have to know what exactly are " "smi_cmd and smi_port, and this\nis system " - "dependant.\n"); + "dependent.\n"); } return 1; } From 527b7779e5ecabb057089b760140309bdcacc16a Mon Sep 17 00:00:00 2001 From: Martin Kaistra Date: Wed, 12 Aug 2020 11:49:12 +0200 Subject: [PATCH 003/497] cpupower: speed up generating git version string The variable VERSION is expanded for every use of CFLAGS. This causes "git describe" to get called multiple times on the kernel tree, which can be quite slow. The git revision does not change during build, so we can use simple variable expansion to set VERSION. Signed-off-by: Martin Kaistra Acked-by: Thomas Renninger Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index c8622497ef23..c7bcddbd486d 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -51,7 +51,7 @@ DESTDIR ?= # Package-related definitions. Distributions can modify the version # and _should_ modify the PACKAGE_BUGREPORT definition -VERSION= $(shell ./utils/version-gen.sh) +VERSION:= $(shell ./utils/version-gen.sh) LIB_MAJ= 0.0.1 LIB_MIN= 0 From 1716420e8de2705cd16905134d1eaeda94272a17 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 22 Aug 2020 08:08:51 -0700 Subject: [PATCH 004/497] iio: cros_ec: Accept -EOPNOTSUPP as 'not supported' error code A follow-up patch will extend the number of errors reported by cros_ec_cmd_xfer_status(). Specifically, the function will return -EOPNOTSUPP if a command is not supported by the EC. 
Prepare for it. Cc: Gwendal Grignou Cc: Yu-Hsuan Hsu Cc: Prashant Malani Cc: Brian Norris Acked-by: Jonathan Cameron Reviewed-by: Brian Norris Signed-off-by: Guenter Roeck Signed-off-by: Enric Balletbo i Serra --- drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c index 130ab8ce0269..d71e9064c789 100644 --- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c +++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c @@ -73,7 +73,7 @@ static int cros_ec_sensors_read(struct iio_dev *indio_dev, st->core.param.sensor_offset.flags = 0; ret = cros_ec_motion_send_host_cmd(&st->core, 0); - if (ret == -EPROTO) { + if (ret == -EPROTO || ret == -EOPNOTSUPP) { /* Reading calibscale is not supported on older EC. */ *val = 1; *val2 = 0; From 064df8851f4ac25f64c0323fc93c3f9705e1702d Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 22 Aug 2020 08:08:52 -0700 Subject: [PATCH 005/497] cros_ec_lightbar: Accept more error codes from cros_ec_cmd_xfer_status Since commit c5cd2b47b203 ("platform/chrome: cros_ec_proto: Report command not supported") we can no longer assume that cros_ec_cmd_xfer_status() reports -EPROTO for all errors returned by the EC itself. A follow-up patch will change cros_ec_cmd_xfer_status() to report additional errors reported by the EC as distinguished Linux error codes. Handle this change by no longer assuming that -EPROTO is used to report all errors returned by the EC itself. Since errors reported by the EC are already reported in text form through sysfs attributes, extend this form of error reporting to all errors reported by cros_ec_cmd_xfer_status(). Cc: Gwendal Grignou Cc: Yu-Hsuan Hsu Cc: Prashant Malani Cc: Brian Norris Reviewed-by: Brian Norris Signed-off-by: Guenter Roeck Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_lightbar.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_lightbar.c b/drivers/platform/chrome/cros_ec_lightbar.c index b59180bff5a3..8445cda57927 100644 --- a/drivers/platform/chrome/cros_ec_lightbar.c +++ b/drivers/platform/chrome/cros_ec_lightbar.c @@ -117,7 +117,7 @@ static int get_lightbar_version(struct cros_ec_dev *ec, param = (struct ec_params_lightbar *)msg->data; param->cmd = LIGHTBAR_CMD_VERSION; ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg); - if (ret < 0) { + if (ret < 0 && ret != -EINVAL) { ret = 0; goto exit; } @@ -298,11 +298,9 @@ static ssize_t sequence_show(struct device *dev, goto exit; ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg); - if (ret == -EPROTO) { - ret = scnprintf(buf, PAGE_SIZE, - "ERROR: EC returned %d\n", msg->result); - goto exit; - } else if (ret < 0) { + if (ret < 0) { + ret = scnprintf(buf, PAGE_SIZE, "XFER / EC ERROR %d / %d\n", + ret, msg->result); goto exit; } From b646e7db1ce85e24bdb3afd16e542a348a53d2ab Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 22 Aug 2020 08:08:53 -0700 Subject: [PATCH 006/497] platform/chrome: cros_ec_sysfs: Report range of error codes from EC Since commit c5cd2b47b203 ("platform/chrome: cros_ec_proto: Report command not supported") we can no longer assume that cros_ec_cmd_xfer_status() reports -EPROTO for all errors returned by the EC itself. A follow-up patch will change cros_ec_cmd_xfer_status() to report additional errors reported by the EC as distinguished Linux error codes. 
Prepare for this change by always reporting both the Linux error code and the EC error code in sysfs attributes. Cc: Gwendal Grignou Cc: Yu-Hsuan Hsu Cc: Prashant Malani Cc: Brian Norris Reviewed-by: Brian Norris Signed-off-by: Guenter Roeck Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_sysfs.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_sysfs.c b/drivers/platform/chrome/cros_ec_sysfs.c index d45ea5d5bfa4..9c1e0998a721 100644 --- a/drivers/platform/chrome/cros_ec_sysfs.c +++ b/drivers/platform/chrome/cros_ec_sysfs.c @@ -150,12 +150,10 @@ static ssize_t version_show(struct device *dev, msg->command = EC_CMD_GET_BUILD_INFO + ec->cmd_offset; msg->insize = EC_HOST_PARAM_SIZE; ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg); - if (ret == -EPROTO) { + if (ret < 0) { count += scnprintf(buf + count, PAGE_SIZE - count, - "Build info: EC error %d\n", msg->result); - } else if (ret < 0) { - count += scnprintf(buf + count, PAGE_SIZE - count, - "Build info: XFER ERROR %d\n", ret); + "Build info: XFER / EC ERROR %d / %d\n", + ret, msg->result); } else { msg->data[EC_HOST_PARAM_SIZE - 1] = '\0'; count += scnprintf(buf + count, PAGE_SIZE - count, @@ -166,12 +164,10 @@ static ssize_t version_show(struct device *dev, msg->command = EC_CMD_GET_CHIP_INFO + ec->cmd_offset; msg->insize = sizeof(*r_chip); ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg); - if (ret == -EPROTO) { + if (ret < 0) { count += scnprintf(buf + count, PAGE_SIZE - count, - "Chip info: EC error %d\n", msg->result); - } else if (ret < 0) { - count += scnprintf(buf + count, PAGE_SIZE - count, - "Chip info: XFER ERROR %d\n", ret); + "Chip info: XFER / EC ERROR %d / %d\n", + ret, msg->result); } else { r_chip = (struct ec_response_get_chip_info *)msg->data; @@ -190,12 +186,10 @@ static ssize_t version_show(struct device *dev, msg->command = EC_CMD_GET_BOARD_VERSION + ec->cmd_offset; msg->insize = sizeof(*r_board); ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg); - if (ret == -EPROTO) { + if (ret < 0) { count += scnprintf(buf + count, PAGE_SIZE - count, - "Board version: EC error %d\n", msg->result); - } else if (ret < 0) { - count += scnprintf(buf + count, PAGE_SIZE - count, - "Board version: XFER ERROR %d\n", ret); + "Board version: XFER / EC ERROR %d / %d\n", + ret, msg->result); } else { r_board = (struct ec_response_board_version *)msg->data; From d509f8a71aa0a5e6d92409d9f0b4eaa94615727d Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 22 Aug 2020 08:08:54 -0700 Subject: [PATCH 007/497] pwm: cros-ec: Accept more error codes from cros_ec_cmd_xfer_status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit c5cd2b47b203 ("platform/chrome: cros_ec_proto: Report command not supported") we can no longer assume that cros_ec_cmd_xfer_status() reports -EPROTO for all errors returned by the EC itself. A follow-up patch will change cros_ec_cmd_xfer_status() to report additional errors reported by the EC as distinguished Linux error codes. Handle this change by no longer assuming that only -EPROTO is used to report all errors returned by the EC itself. Instead, support both the old and the new error codes. Add a comment describing cros_ec_num_pwms() to explain its functionality. 
Cc: Gwendal Grignou Cc: Yu-Hsuan Hsu Cc: Prashant Malani Cc: Brian Norris Acked-by: Thierry Reding Acked-by: Uwe Kleine-König Signed-off-by: Guenter Roeck Signed-off-by: Enric Balletbo i Serra --- drivers/pwm/pwm-cros-ec.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c index 09c08dee099e..94d3dff9b0e5 100644 --- a/drivers/pwm/pwm-cros-ec.c +++ b/drivers/pwm/pwm-cros-ec.c @@ -204,6 +204,11 @@ static const struct pwm_ops cros_ec_pwm_ops = { .owner = THIS_MODULE, }; +/* + * Determine the number of supported PWMs. The EC does not return the number + * of PWMs it supports directly, so we have to read the pwm duty cycle for + * subsequent channels until we get an error. + */ static int cros_ec_num_pwms(struct cros_ec_device *ec) { int i, ret; @@ -213,20 +218,30 @@ static int cros_ec_num_pwms(struct cros_ec_device *ec) u32 result = 0; ret = __cros_ec_pwm_get_duty(ec, i, &result); - /* We want to parse EC protocol errors */ - if (ret < 0 && !(ret == -EPROTO && result)) - return ret; - /* * We look for SUCCESS, INVALID_COMMAND, or INVALID_PARAM * responses; everything else is treated as an error. + * The EC error codes either map to -EOPNOTSUPP / -EINVAL, + * or -EPROTO is returned and the EC error is in the result + * field. Check for both. */ - if (result == EC_RES_INVALID_COMMAND) + switch (ret) { + case -EOPNOTSUPP: /* invalid command */ return -ENODEV; - else if (result == EC_RES_INVALID_PARAM) + case -EINVAL: /* invalid parameter */ return i; - else if (result) + case -EPROTO: + /* Old or new error return code: Handle both */ + if (result == EC_RES_INVALID_COMMAND) + return -ENODEV; + else if (result == EC_RES_INVALID_PARAM) + return i; return -EPROTO; + default: + if (ret < 0) + return ret; + break; + } } return U8_MAX; From b4e452b5e97dc70d529475168a37ad7394994b8b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 22 Aug 2020 08:08:55 -0700 Subject: [PATCH 008/497] platform/input: cros_ec: Replace -ENOTSUPP with -ENOPROTOOPT -ENOTSUPP is not a SUSV4 error code and should not be used. Use -ENOPROTOOPT instead to report EC_RES_INVALID_VERSION responses from the EC. This matches match the NFS response for unsupported protocol versions. 
Cc: Gwendal Grignou Cc: Yu-Hsuan Hsu Cc: Prashant Malani Cc: Brian Norris Acked-by: Dmitry Torokhov Signed-off-by: Guenter Roeck Signed-off-by: Enric Balletbo i Serra --- drivers/input/keyboard/cros_ec_keyb.c | 2 +- drivers/platform/chrome/cros_ec_proto.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c index fc1793ca2f17..15d17c717081 100644 --- a/drivers/input/keyboard/cros_ec_keyb.c +++ b/drivers/input/keyboard/cros_ec_keyb.c @@ -348,7 +348,7 @@ static int cros_ec_keyb_info(struct cros_ec_device *ec_dev, params->event_type = event_type; ret = cros_ec_cmd_xfer_status(ec_dev, msg); - if (ret == -ENOTSUPP) { + if (ret == -ENOPROTOOPT) { /* With older ECs we just return 0 for everything */ memset(result, 0, result_size); ret = 0; diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index 8d52b3b4bd4e..f4727511fb41 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -579,7 +579,7 @@ static int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, * * Return: * >=0 - The number of bytes transferred - * -ENOTSUPP - Operation not supported + * -ENOPROTOOPT - Operation not supported * -EPROTO - Protocol error */ int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev, @@ -593,7 +593,7 @@ int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev, } else if (msg->result == EC_RES_INVALID_VERSION) { dev_dbg(ec_dev->dev, "Command invalid version (err:%d)\n", msg->result); - return -ENOTSUPP; + return -ENOPROTOOPT; } else if (msg->result != EC_RES_SUCCESS) { dev_dbg(ec_dev->dev, "Command result (err: %d)\n", msg->result); return -EPROTO; From 0d080459e813ce8076f183cc73a4c9b64a39a4d8 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 22 Aug 2020 08:08:56 -0700 Subject: [PATCH 009/497] platform/chrome: cros_ec_proto: Convert EC error codes to Linux error codes The EC reports a variety of error codes. Most of those, with the exception of EC_RES_INVALID_VERSION, are converted to -EPROTO. As result, the actual EC error code gets lost. Introduce cros_ec_map_error() to map EC error codes to Linux error codes, and use it in cros_ec_cmd_xfer_status() to report more meaningful errors to the caller. With this change, callers of cros_ec_cmd_xfer_status() can implement a more distinguished action without having to rely on the EC error code. At the same time, debugging is improved in situations where the Linux error code is reported to userspace and/or in the kernel log. 
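As a rough, purely illustrative sketch (not part of this patch; the helper name and the chosen return values are made up), a caller probing an optional EC feature can now key off the mapped Linux error codes directly instead of parsing msg->result by hand:

    /*
     * Hypothetical example: probe an optional EC feature using the
     * Linux error codes mapped by cros_ec_map_error().
     */
    static int ec_probe_optional_feature(struct cros_ec_device *ec_dev,
                                         struct cros_ec_command *msg)
    {
            int ret = cros_ec_cmd_xfer_status(ec_dev, msg);

            if (ret == -EOPNOTSUPP)     /* EC_RES_INVALID_COMMAND */
                    return 0;           /* old EC, feature not present */
            if (ret == -EINVAL)         /* EC_RES_INVALID_PARAM */
                    return -ENODEV;     /* argument rejected by the EC */
            return ret;                 /* <0: other error, >=0: bytes transferred */
    }

A later patch in this series ("pwm: cros-ec: Simplify EC error handling") reworks cros_ec_num_pwms() along these lines, dropping the old -EPROTO/result parsing in favour of the mapped codes.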
Cc: Gwendal Grignou Cc: Yu-Hsuan Hsu Cc: Prashant Malani Cc: Brian Norris Signed-off-by: Guenter Roeck Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_proto.c | 57 ++++++++++++++++++++----- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index f4727511fb41..dda182132a6a 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -15,6 +15,43 @@ #define EC_COMMAND_RETRIES 50 +static const int cros_ec_error_map[] = { + [EC_RES_INVALID_COMMAND] = -EOPNOTSUPP, + [EC_RES_ERROR] = -EIO, + [EC_RES_INVALID_PARAM] = -EINVAL, + [EC_RES_ACCESS_DENIED] = -EACCES, + [EC_RES_INVALID_RESPONSE] = -EPROTO, + [EC_RES_INVALID_VERSION] = -ENOPROTOOPT, + [EC_RES_INVALID_CHECKSUM] = -EBADMSG, + [EC_RES_IN_PROGRESS] = -EINPROGRESS, + [EC_RES_UNAVAILABLE] = -ENODATA, + [EC_RES_TIMEOUT] = -ETIMEDOUT, + [EC_RES_OVERFLOW] = -EOVERFLOW, + [EC_RES_INVALID_HEADER] = -EBADR, + [EC_RES_REQUEST_TRUNCATED] = -EBADR, + [EC_RES_RESPONSE_TOO_BIG] = -EFBIG, + [EC_RES_BUS_ERROR] = -EFAULT, + [EC_RES_BUSY] = -EBUSY, + [EC_RES_INVALID_HEADER_VERSION] = -EBADMSG, + [EC_RES_INVALID_HEADER_CRC] = -EBADMSG, + [EC_RES_INVALID_DATA_CRC] = -EBADMSG, + [EC_RES_DUP_UNAVAILABLE] = -ENODATA, +}; + +static int cros_ec_map_error(uint32_t result) +{ + int ret = 0; + + if (result != EC_RES_SUCCESS) { + if (result < ARRAY_SIZE(cros_ec_error_map) && cros_ec_error_map[result]) + ret = cros_ec_error_map[result]; + else + ret = -EPROTO; + } + + return ret; +} + static int prepare_packet(struct cros_ec_device *ec_dev, struct cros_ec_command *msg) { @@ -579,26 +616,24 @@ static int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, * * Return: * >=0 - The number of bytes transferred - * -ENOPROTOOPT - Operation not supported - * -EPROTO - Protocol error + * <0 - Linux error code */ int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev, struct cros_ec_command *msg) { - int ret; + int ret, mapped; ret = cros_ec_cmd_xfer(ec_dev, msg); if (ret < 0) { dev_err(ec_dev->dev, "Command xfer error (err:%d)\n", ret); - } else if (msg->result == EC_RES_INVALID_VERSION) { - dev_dbg(ec_dev->dev, "Command invalid version (err:%d)\n", - msg->result); - return -ENOPROTOOPT; - } else if (msg->result != EC_RES_SUCCESS) { - dev_dbg(ec_dev->dev, "Command result (err: %d)\n", msg->result); - return -EPROTO; + return ret; + } + mapped = cros_ec_map_error(msg->result); + if (mapped) { + dev_dbg(ec_dev->dev, "Command result (err: %d [%d])\n", + msg->result, mapped); + ret = mapped; } - return ret; } EXPORT_SYMBOL(cros_ec_cmd_xfer_status); From be020f0df5a92d90af7506e96cdf86105c396965 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 22 Aug 2020 08:08:57 -0700 Subject: [PATCH 010/497] pwm: cros-ec: Simplify EC error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With enhanced error reporting from cros_ec_cmd_xfer_status() in place, we can fully use it and no longer rely on EC error codes. 
Acked-by: Uwe Kleine-König Reviewed-by: Brian Norris Signed-off-by: Guenter Roeck Signed-off-by: Enric Balletbo i Serra --- drivers/pwm/pwm-cros-ec.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c index 94d3dff9b0e5..c1c337969e4e 100644 --- a/drivers/pwm/pwm-cros-ec.c +++ b/drivers/pwm/pwm-cros-ec.c @@ -81,8 +81,7 @@ static int cros_ec_pwm_set_duty(struct cros_ec_device *ec, u8 index, u16 duty) return cros_ec_cmd_xfer_status(ec, msg); } -static int __cros_ec_pwm_get_duty(struct cros_ec_device *ec, u8 index, - u32 *result) +static int cros_ec_pwm_get_duty(struct cros_ec_device *ec, u8 index) { struct { struct cros_ec_command msg; @@ -107,19 +106,12 @@ static int __cros_ec_pwm_get_duty(struct cros_ec_device *ec, u8 index, params->index = index; ret = cros_ec_cmd_xfer_status(ec, msg); - if (result) - *result = msg->result; if (ret < 0) return ret; return resp->duty; } -static int cros_ec_pwm_get_duty(struct cros_ec_device *ec, u8 index) -{ - return __cros_ec_pwm_get_duty(ec, index, NULL); -} - static int cros_ec_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, const struct pwm_state *state) { @@ -215,28 +207,18 @@ static int cros_ec_num_pwms(struct cros_ec_device *ec) /* The index field is only 8 bits */ for (i = 0; i <= U8_MAX; i++) { - u32 result = 0; - - ret = __cros_ec_pwm_get_duty(ec, i, &result); + ret = cros_ec_pwm_get_duty(ec, i); /* * We look for SUCCESS, INVALID_COMMAND, or INVALID_PARAM * responses; everything else is treated as an error. - * The EC error codes either map to -EOPNOTSUPP / -EINVAL, - * or -EPROTO is returned and the EC error is in the result - * field. Check for both. + * The EC error codes map to -EOPNOTSUPP and -EINVAL, + * so check for those. */ switch (ret) { case -EOPNOTSUPP: /* invalid command */ return -ENODEV; case -EINVAL: /* invalid parameter */ return i; - case -EPROTO: - /* Old or new error return code: Handle both */ - if (result == EC_RES_INVALID_COMMAND) - return -ENODEV; - else if (result == EC_RES_INVALID_PARAM) - return i; - return -EPROTO; default: if (ret < 0) return ret; From 46c5bbd2df4a8b7eed427db866a5bce7234744bf Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Fri, 10 Jul 2020 12:40:17 -0700 Subject: [PATCH 011/497] platform/chrome: cros_ec_typec: USB4 support With USB4 mode the mux driver needs the Enter_USB Data Object (EUDO) that was used when the USB mode was entered. Though the object is not available in the driver, it is possible to construct it from the information we have. 
Signed-off-by: Heikki Krogerus Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_typec.c | 33 ++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index 3fcd27ec9ad8..585f27e2b362 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -496,6 +497,34 @@ static int cros_typec_enable_dp(struct cros_typec_data *typec, return typec_mux_set(port->mux, &port->state); } +static int cros_typec_enable_usb4(struct cros_typec_data *typec, + int port_num, + struct ec_response_usb_pd_control_v2 *pd_ctrl) +{ + struct cros_typec_port *port = typec->ports[port_num]; + struct enter_usb_data data; + + data.eudo = EUDO_USB_MODE_USB4 << EUDO_USB_MODE_SHIFT; + + /* Cable Speed */ + data.eudo |= pd_ctrl->cable_speed << EUDO_CABLE_SPEED_SHIFT; + + /* Cable Type */ + if (pd_ctrl->control_flags & USB_PD_CTRL_OPTICAL_CABLE) + data.eudo |= EUDO_CABLE_TYPE_OPTICAL << EUDO_CABLE_TYPE_SHIFT; + else if (pd_ctrl->control_flags & USB_PD_CTRL_ACTIVE_CABLE) + data.eudo |= EUDO_CABLE_TYPE_RE_TIMER << EUDO_CABLE_TYPE_SHIFT; + + data.active_link_training = !!(pd_ctrl->control_flags & + USB_PD_CTRL_ACTIVE_LINK_UNIDIR); + + port->state.alt = NULL; + port->state.data = &data; + port->state.mode = TYPEC_MODE_USB4; + + return typec_mux_set(port->mux, &port->state); +} + static int cros_typec_configure_mux(struct cros_typec_data *typec, int port_num, uint8_t mux_flags, struct ec_response_usb_pd_control_v2 *pd_ctrl) @@ -516,7 +545,9 @@ static int cros_typec_configure_mux(struct cros_typec_data *typec, int port_num, if (ret) return ret; - if (mux_flags & USB_PD_MUX_TBT_COMPAT_ENABLED) { + if (mux_flags & USB_PD_MUX_USB4_ENABLED) { + ret = cros_typec_enable_usb4(typec, port_num, pd_ctrl); + } else if (mux_flags & USB_PD_MUX_TBT_COMPAT_ENABLED) { ret = cros_typec_enable_tbt(typec, port_num, pd_ctrl); } else if (mux_flags & USB_PD_MUX_DP_ENABLED) { ret = cros_typec_enable_dp(typec, port_num, pd_ctrl); From 5381b0ed54b6af3c0e8184b43e34154e17904848 Mon Sep 17 00:00:00 2001 From: Azhar Shaikh Date: Fri, 21 Aug 2020 14:47:22 -0700 Subject: [PATCH 012/497] platform/chrome: cros_ec_typec: Send enum values to usb_role_switch_set_role() usb_role_switch_set_role() has the second argument as enum for usb_role. Currently depending upon the data role i.e. UFP(0) or DFP(1) is sent. This eventually translates to USB_ROLE_NONE in case of UFP and USB_ROLE_DEVICE in case of DFP. Correct this by sending correct enum values as USB_ROLE_DEVICE in case of UFP and USB_ROLE_HOST in case of DFP. 
Fixes: 7e7def15fa4b ("platform/chrome: cros_ec_typec: Add USB mux control") Signed-off-by: Azhar Shaikh Cc: Prashant Malani Reviewed-by: Prashant Malani Reviewed-by: Heikki Krogerus Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_typec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index 585f27e2b362..739a1c846949 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -622,7 +622,8 @@ static int cros_typec_port_update(struct cros_typec_data *typec, int port_num) dev_warn(typec->dev, "Configure muxes failed, err = %d\n", ret); return usb_role_switch_set_role(typec->ports[port_num]->role_sw, - !!(resp.role & PD_CTRL_RESP_ROLE_DATA)); + resp.role & PD_CTRL_RESP_ROLE_DATA + ? USB_ROLE_HOST : USB_ROLE_DEVICE); } static int cros_typec_get_cmd_version(struct cros_typec_data *typec) From b12e4fd5f3e4852cdb1fa11d1a48498bea9e92cf Mon Sep 17 00:00:00 2001 From: Azhar Shaikh Date: Fri, 21 Aug 2020 14:47:23 -0700 Subject: [PATCH 013/497] platform/chrome: cros_ec_typec: Avoid setting usb role twice during disconnect On disconnect port partner is removed and usb role is set to NONE. But then in cros_typec_port_update() the role is set again. Avoid this by moving usb_role_switch_set_role() to cros_typec_configure_mux(). Suggested-by: Prashant Malani Signed-off-by: Azhar Shaikh Reviewed-by: Heikki Krogerus Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_typec.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index 739a1c846949..39989069decf 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -564,7 +564,12 @@ static int cros_typec_configure_mux(struct cros_typec_data *typec, int port_num, ret = -ENOTSUPP; } - return ret; + if (ret) + return ret; + + return usb_role_switch_set_role(typec->ports[port_num]->role_sw, + pd_ctrl->role & PD_CTRL_RESP_ROLE_DATA + ? USB_ROLE_HOST : USB_ROLE_DEVICE); } static int cros_typec_port_update(struct cros_typec_data *typec, int port_num) @@ -621,9 +626,7 @@ static int cros_typec_port_update(struct cros_typec_data *typec, int port_num) if (ret) dev_warn(typec->dev, "Configure muxes failed, err = %d\n", ret); - return usb_role_switch_set_role(typec->ports[port_num]->role_sw, - resp.role & PD_CTRL_RESP_ROLE_DATA - ? USB_ROLE_HOST : USB_ROLE_DEVICE); + return ret; } static int cros_typec_get_cmd_version(struct cros_typec_data *typec) From a772336596dfadbbe22db24d4c4b9516c4b2dd57 Mon Sep 17 00:00:00 2001 From: Azhar Shaikh Date: Fri, 21 Aug 2020 14:47:24 -0700 Subject: [PATCH 014/497] platform/chrome: cros_ec_typec: Re-order connector configuration steps As per USB Type-C Spec R2.0 section 4.5.1.2 (Connecting Sources and Sinks) and section 4.5.2.2 (Connection State Machine Requirements), the typical flow for configuring a device connected to a typeC port is as below: 1. Source/sink detection 2. Orientation 3. Data role 4. VCONN 5. VBUS (USB Type-C currents) 6. The connector is now configured. We can start the PD communication that should lead into configuration of the mux if we enter a mode. But in existing code data role was set after the connector and mux are already configured. So fix this by following the spec to set the data role before the connector and mux are configured. 
Signed-off-by: Azhar Shaikh Reviewed-by: Prashant Malani Reviewed-by: Heikki Krogerus Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_typec.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index 39989069decf..31be31161350 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -545,6 +545,12 @@ static int cros_typec_configure_mux(struct cros_typec_data *typec, int port_num, if (ret) return ret; + ret = usb_role_switch_set_role(typec->ports[port_num]->role_sw, + pd_ctrl->role & PD_CTRL_RESP_ROLE_DATA + ? USB_ROLE_HOST : USB_ROLE_DEVICE); + if (ret) + return ret; + if (mux_flags & USB_PD_MUX_USB4_ENABLED) { ret = cros_typec_enable_usb4(typec, port_num, pd_ctrl); } else if (mux_flags & USB_PD_MUX_TBT_COMPAT_ENABLED) { @@ -564,12 +570,7 @@ static int cros_typec_configure_mux(struct cros_typec_data *typec, int port_num, ret = -ENOTSUPP; } - if (ret) - return ret; - - return usb_role_switch_set_role(typec->ports[port_num]->role_sw, - pd_ctrl->role & PD_CTRL_RESP_ROLE_DATA - ? USB_ROLE_HOST : USB_ROLE_DEVICE); + return ret; } static int cros_typec_port_update(struct cros_typec_data *typec, int port_num) From 3658a2b7f3e16c7053eb8d70657b94bb62c5a0f4 Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Mon, 24 Aug 2020 21:36:49 +0200 Subject: [PATCH 015/497] ARM: dts: sun8i: r40: bananapi-m2-ultra: Fix dcdc1 regulator DCDC1 regulator powers many different subsystems. While some of them can work at 3.0 V, some of them can not. For example, VCC-HDMI can only work between 3.24 V and 3.36 V. According to OS images provided by the board manufacturer this regulator should be set to 3.3 V. Set DCDC1 and DCDC1SW to 3.3 V in order to fix this. Fixes: da7ac948fa93 ("ARM: dts: sun8i: Add board dts file for Banana Pi M2 Ultra") Signed-off-by: Jernej Skrabec Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20200824193649.978197-1-jernej.skrabec@siol.net --- arch/arm/boot/dts/sun8i-r40-bananapi-m2-ultra.dts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm/boot/dts/sun8i-r40-bananapi-m2-ultra.dts b/arch/arm/boot/dts/sun8i-r40-bananapi-m2-ultra.dts index 42d62d1ba1dc..ea15073f0c79 100644 --- a/arch/arm/boot/dts/sun8i-r40-bananapi-m2-ultra.dts +++ b/arch/arm/boot/dts/sun8i-r40-bananapi-m2-ultra.dts @@ -223,16 +223,16 @@ }; ®_dc1sw { - regulator-min-microvolt = <3000000>; - regulator-max-microvolt = <3000000>; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; regulator-name = "vcc-gmac-phy"; }; ®_dcdc1 { regulator-always-on; - regulator-min-microvolt = <3000000>; - regulator-max-microvolt = <3000000>; - regulator-name = "vcc-3v0"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-name = "vcc-3v3"; }; ®_dcdc2 { From 2933bf3528007f834fb7f5eab033f9c5b0683f91 Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Sat, 22 Aug 2020 14:27:55 +0800 Subject: [PATCH 016/497] arm64: dts: allwinner: h5: remove Mali GPU PMU module H5's Mali GPU PMU is not present or working corretly although H5 datasheet record its interrupt vector. Adding this module will miss lead lima driver try to shutdown it and get waiting timeout. This problem is not exposed before lima runtime PM support is added. 
Fixes: bb39ed07e55b ("arm64: dts: allwinner: h5: Add device node for Mali-450 GPU") Signed-off-by: Qiang Yu Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20200822062755.534761-1-yuq825@gmail.com --- arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi index 6735e316a39c..6c6053a18413 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h5.dtsi @@ -139,8 +139,7 @@ , , , - , - ; + ; interrupt-names = "gp", "gpmmu", "pp", @@ -151,8 +150,7 @@ "pp2", "ppmmu2", "pp3", - "ppmmu3", - "pmu"; + "ppmmu3"; clocks = <&ccu CLK_BUS_GPU>, <&ccu CLK_GPU>; clock-names = "bus", "core"; resets = <&ccu RST_BUS_GPU>; From 9b5fbad1dcee0607300d08f888c71c8ef97a06a0 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 25 Aug 2020 10:06:59 -0700 Subject: [PATCH 017/497] Input: MT - avoid comma separated statements Use semicolons and braces. Signed-off-by: Joe Perches Link: https://lore.kernel.org/r/02cb394f8c305473c1a783a5ea8425de79fe0ec1.1598331149.git.joe@perches.com Signed-off-by: Dmitry Torokhov --- drivers/input/input-mt.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/input/input-mt.c b/drivers/input/input-mt.c index f699538bdac4..44fe6f2f063c 100644 --- a/drivers/input/input-mt.c +++ b/drivers/input/input-mt.c @@ -323,11 +323,14 @@ static int adjust_dual(int *begin, int step, int *end, int eq, int mu) p = begin + step; s = p == end ? f + 1 : *p; - for (; p != end; p += step) - if (*p < f) - s = f, f = *p; - else if (*p < s) + for (; p != end; p += step) { + if (*p < f) { + s = f; + f = *p; + } else if (*p < s) { s = *p; + } + } c = (f + s + 1) / 2; if (c == 0 || (c > mu && (!eq || mu > 0))) From 8143182426874a509f65481c7b740115df22f7f3 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Fri, 14 Aug 2020 20:39:08 -0700 Subject: [PATCH 018/497] platform/chrome: cros_ec_trace: Add fields to command traces In ftrace, add more fields to the cros_ec command event: - Add size of commands to check if they are properly set. - Add offset (in case an EC is cascaded being another EC), to allow proper command output With: echo 1 > events/cros_ec/cros_ec_cmd/enable We now have (on samus) Invalid command for the sensor stack: ectool-6942 [002] .... 3082.783116: cros_ec_request_done: version: 3, offset: 0, command: EC_CMD_MOTION_SENSE_CMD, outsize: 2, insize: 19, ec result: EC_RES_INVALID_PARAM, retval: 0 Powerd accessing PD EC being the main EC: powerd-1272 [002] .... 
40.644026: cros_ec_request_done: version: 0, offset: 1, command: EC_CMD_USB_PD_POWER_INFO, outsize: 1, insize: 16, ec result: EC_RES_SUCCESS, retval: 16 Signed-off-by: Gwendal Grignou Acked-by: Raul E Rangel Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_trace.h | 27 +++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_trace.h b/drivers/platform/chrome/cros_ec_trace.h index e9fb05f89ef0..f744b21bc655 100644 --- a/drivers/platform/chrome/cros_ec_trace.h +++ b/drivers/platform/chrome/cros_ec_trace.h @@ -23,14 +23,22 @@ TRACE_EVENT(cros_ec_request_start, TP_ARGS(cmd), TP_STRUCT__entry( __field(uint32_t, version) + __field(uint32_t, offset) __field(uint32_t, command) + __field(uint32_t, outsize) + __field(uint32_t, insize) ), TP_fast_assign( __entry->version = cmd->version; - __entry->command = cmd->command; + __entry->offset = cmd->command / EC_CMD_PASSTHRU_OFFSET(1); + __entry->command = cmd->command % EC_CMD_PASSTHRU_OFFSET(1); + __entry->outsize = cmd->outsize; + __entry->insize = cmd->insize; ), - TP_printk("version: %u, command: %s", __entry->version, - __print_symbolic(__entry->command, EC_CMDS)) + TP_printk("version: %u, offset: %d, command: %s, outsize: %u, insize: %u", + __entry->version, __entry->offset, + __print_symbolic(__entry->command, EC_CMDS), + __entry->outsize, __entry->insize) ); TRACE_EVENT(cros_ec_request_done, @@ -38,19 +46,26 @@ TRACE_EVENT(cros_ec_request_done, TP_ARGS(cmd, retval), TP_STRUCT__entry( __field(uint32_t, version) + __field(uint32_t, offset) __field(uint32_t, command) + __field(uint32_t, outsize) + __field(uint32_t, insize) __field(uint32_t, result) __field(int, retval) ), TP_fast_assign( __entry->version = cmd->version; - __entry->command = cmd->command; + __entry->offset = cmd->command / EC_CMD_PASSTHRU_OFFSET(1); + __entry->command = cmd->command % EC_CMD_PASSTHRU_OFFSET(1); + __entry->outsize = cmd->outsize; + __entry->insize = cmd->insize; __entry->result = cmd->result; __entry->retval = retval; ), - TP_printk("version: %u, command: %s, ec result: %s, retval: %d", - __entry->version, + TP_printk("version: %u, offset: %d, command: %s, outsize: %u, insize: %u, ec result: %s, retval: %u", + __entry->version, __entry->offset, __print_symbolic(__entry->command, EC_CMDS), + __entry->outsize, __entry->insize, __print_symbolic(__entry->result, EC_RESULT), __entry->retval) ); From 5706d14d2a9472aae6d6e3023a9f37b1eee718ea Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 10 Aug 2020 12:08:05 +0200 Subject: [PATCH 019/497] KVM: PPC: Book3S HV: XICS: Replace the 'destroy' method by a 'release' method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similarly to what was done with XICS-on-XIVE and XIVE native KVM devices with commit 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy' method by a 'release' method"), convert the historical XICS KVM device to implement the 'release' method. This is needed to run nested guests with an in-kernel IRQ chip. A typical POWER9 guest can select XICS or XIVE during boot, which requires to be able to destroy and to re-create the KVM device. Only the historical XICS KVM device is available under pseries at the current time and it still uses the legacy 'destroy' method. Switching to 'release' means that vCPUs might still be running when the device is destroyed. 
In order to avoid potential use-after-free, the kvmppc_xics structure is allocated on first usage and kept around until the VM exits. The same pointer is used each time a KVM XICS device is being created, but this is okay since we only have one per VM. Clear the ICP of each vCPU with vcpu->mutex held. This ensures that the next time the vCPU resumes execution, it won't be going into the XICS code anymore. Signed-off-by: Greg Kurz Reviewed-by: Cédric Le Goater Tested-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s.c | 4 +- arch/powerpc/kvm/book3s_xics.c | 86 +++++++++++++++++++++++------ 3 files changed, 72 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 10ded83414de..d67a470e95a3 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -326,6 +326,7 @@ struct kvm_arch { #endif #ifdef CONFIG_KVM_XICS struct kvmppc_xics *xics; + struct kvmppc_xics *xics_device; struct kvmppc_xive *xive; /* Current XIVE device in use */ struct { struct kvmppc_xive *native; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 49db50d1db04..1fce9777af1c 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -879,13 +879,15 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) #ifdef CONFIG_KVM_XICS /* - * Free the XIVE devices which are not directly freed by the + * Free the XIVE and XICS devices which are not directly freed by the * device 'release' method */ kfree(kvm->arch.xive_devices.native); kvm->arch.xive_devices.native = NULL; kfree(kvm->arch.xive_devices.xics_on_xive); kvm->arch.xive_devices.xics_on_xive = NULL; + kfree(kvm->arch.xics_device); + kvm->arch.xics_device = NULL; #endif /* CONFIG_KVM_XICS */ } diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 381bf8dea193..5fee5a11550d 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -1334,47 +1334,97 @@ static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) return -ENXIO; } -static void kvmppc_xics_free(struct kvm_device *dev) +/* + * Called when device fd is closed. kvm->lock is held. + */ +static void kvmppc_xics_release(struct kvm_device *dev) { struct kvmppc_xics *xics = dev->private; int i; struct kvm *kvm = xics->kvm; + struct kvm_vcpu *vcpu; + + pr_devel("Releasing xics device\n"); + + /* + * Since this is the device release function, we know that + * userspace does not have any open fd referring to the + * device. Therefore there can not be any of the device + * attribute set/get functions being executed concurrently, + * and similarly, the connect_vcpu and set/clr_mapped + * functions also cannot be being executed. + */ debugfs_remove(xics->dentry); + /* + * We should clean up the vCPU interrupt presenters first. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + /* + * Take vcpu->mutex to ensure that no one_reg get/set ioctl + * (i.e. kvmppc_xics_[gs]et_icp) can be done concurrently. + * Holding the vcpu->mutex also means that execution is + * excluded for the vcpu until the ICP was freed. When the vcpu + * can execute again, vcpu->arch.icp and vcpu->arch.irq_type + * have been cleared and the vcpu will not be going into the + * XICS code anymore. 
+ */ + mutex_lock(&vcpu->mutex); + kvmppc_xics_free_icp(vcpu); + mutex_unlock(&vcpu->mutex); + } + if (kvm) kvm->arch.xics = NULL; - for (i = 0; i <= xics->max_icsid; i++) + for (i = 0; i <= xics->max_icsid; i++) { kfree(xics->ics[i]); - kfree(xics); + xics->ics[i] = NULL; + } + /* + * A reference of the kvmppc_xics pointer is now kept under + * the xics_device pointer of the machine for reuse. It is + * freed when the VM is destroyed for now until we fix all the + * execution paths. + */ kfree(dev); } +static struct kvmppc_xics *kvmppc_xics_get_device(struct kvm *kvm) +{ + struct kvmppc_xics **kvm_xics_device = &kvm->arch.xics_device; + struct kvmppc_xics *xics = *kvm_xics_device; + + if (!xics) { + xics = kzalloc(sizeof(*xics), GFP_KERNEL); + *kvm_xics_device = xics; + } else { + memset(xics, 0, sizeof(*xics)); + } + + return xics; +} + static int kvmppc_xics_create(struct kvm_device *dev, u32 type) { struct kvmppc_xics *xics; struct kvm *kvm = dev->kvm; - int ret = 0; - xics = kzalloc(sizeof(*xics), GFP_KERNEL); + pr_devel("Creating xics for partition\n"); + + /* Already there ? */ + if (kvm->arch.xics) + return -EEXIST; + + xics = kvmppc_xics_get_device(kvm); if (!xics) return -ENOMEM; dev->private = xics; xics->dev = dev; xics->kvm = kvm; - - /* Already there ? */ - if (kvm->arch.xics) - ret = -EEXIST; - else - kvm->arch.xics = xics; - - if (ret) { - kfree(xics); - return ret; - } + kvm->arch.xics = xics; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE if (cpu_has_feature(CPU_FTR_ARCH_206) && @@ -1399,7 +1449,7 @@ struct kvm_device_ops kvm_xics_ops = { .name = "kvm-xics", .create = kvmppc_xics_create, .init = kvmppc_xics_init, - .destroy = kvmppc_xics_free, + .release = kvmppc_xics_release, .set_attr = xics_set_attr, .get_attr = xics_get_attr, .has_attr = xics_has_attr, @@ -1415,7 +1465,7 @@ int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, return -EPERM; if (xics->kvm != vcpu->kvm) return -EPERM; - if (vcpu->arch.irq_type) + if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; r = kvmppc_xics_create_icp(vcpu, xcpu); From 1e7913ff5f9f1b73146ad8522958bd266f22a510 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Sat, 29 Aug 2020 23:59:37 -0700 Subject: [PATCH 020/497] platform/chrome: cros_ec_lightbar: Reduce ligthbar get version command By default, the lightbar commands are set to the biggest lightbar command and response. That length is greater than 128 bytes and may not work on all machines. But all EC are probed for lightbar by sending a get version request. Set that request size precisely. 
Before the command would be: cros_ec_cmd: version: 0, command: EC_CMD_LIGHTBAR_CMD, outsize: 194, insize: 128, result: 0 Afer: cros_ec_cmd: version: 0, command: EC_CMD_LIGHTBAR_CMD, outsize: 1, insize: 8, result: 0 Fixes: a841178445bb7 ("mfd: cros_ec: Use a zero-length array for command data") Signed-off-by: Gwendal Grignou Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_lightbar.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/chrome/cros_ec_lightbar.c b/drivers/platform/chrome/cros_ec_lightbar.c index 8445cda57927..de8dfb12e486 100644 --- a/drivers/platform/chrome/cros_ec_lightbar.c +++ b/drivers/platform/chrome/cros_ec_lightbar.c @@ -116,6 +116,8 @@ static int get_lightbar_version(struct cros_ec_dev *ec, param = (struct ec_params_lightbar *)msg->data; param->cmd = LIGHTBAR_CMD_VERSION; + msg->outsize = sizeof(param->cmd); + msg->result = sizeof(resp->version); ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg); if (ret < 0 && ret != -EINVAL) { ret = 0; From dd92f7dfe1ba3bad556c8a529edacce7c6fd0195 Mon Sep 17 00:00:00 2001 From: Enric Balletbo i Serra Date: Thu, 3 Sep 2020 16:17:46 +0200 Subject: [PATCH 021/497] platform/chrome: Kconfig: Remove the transitional MFD_CROS_EC config The MFD_CROS_EC config was a transitional Kconfig option to not break current defconfigs in the kernel. Now, this is not required anymore because all the defconfigs have been removed this option and migrated to enable the CrOS EC parts individually. Signed-off-by: Enric Balletbo i Serra Reviewed-by: Guenter Roeck Tested-by: Gwendal Grignou --- drivers/platform/chrome/Kconfig | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/platform/chrome/Kconfig b/drivers/platform/chrome/Kconfig index a056031dee81..ccc23d8686e8 100644 --- a/drivers/platform/chrome/Kconfig +++ b/drivers/platform/chrome/Kconfig @@ -3,16 +3,6 @@ # Platform support for Chrome OS hardware (Chromebooks and Chromeboxes) # -config MFD_CROS_EC - tristate "Platform support for Chrome hardware (transitional)" - select CHROME_PLATFORMS - select CROS_EC - select MFD_CROS_EC_DEV - depends on X86 || ARM || ARM64 || COMPILE_TEST - help - This is a transitional Kconfig option and will be removed after - everyone enables the parts individually. - menuconfig CHROME_PLATFORMS bool "Platform support for Chrome hardware" depends on X86 || ARM || ARM64 || COMPILE_TEST From cd80ec795156346236e9b1cd9f5cbff5a9bbd212 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 8 Sep 2020 09:33:56 -0700 Subject: [PATCH 022/497] Input: allocate keycodes for notification-center, pickup-phone and hangup-phone New Lenovo Thinkpad models, e.g. the X1 Carbon 8th gen and the new T14 gen1 models have 3 new symbols / shortcuts on their F9-F11 keys (and the thinkpad_acpi driver receives 3 new "scancodes" for these): F9: Has a symbol resembling a rectangular speech balloon, the manual says the hotkey functions shows or hides the notification center F10: Has a symbol of a telephone horn which has been picked up from the receiver, the manual says: "Answer incoming calls" F11: Has a symbol of a telephone horn which is resting on the receiver, the manual says: "Decline incoming calls" We have no existing keycodes which are a good match for these, so add 3 new keycodes for these. I noticed that we have a hole in our keycodes between 0x1ba and 0x1c0 which does not seem to be reserved for any specific purpose, so these new 3 codes use 0x1bc - 0x1be, instead of starting at 0x27b. 
Acked-by: Henrique de Moraes Holschuh Acked-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 0c2e27d28e0a..b74821d09145 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -515,6 +515,9 @@ #define KEY_10CHANNELSUP 0x1b8 /* 10 channels up (10+) */ #define KEY_10CHANNELSDOWN 0x1b9 /* 10 channels down (10-) */ #define KEY_IMAGES 0x1ba /* AL Image Browser */ +#define KEY_NOTIFICATION_CENTER 0x1bc /* Show/hide the notification center */ +#define KEY_PICKUP_PHONE 0x1bd /* Answer incoming call */ +#define KEY_HANGUP_PHONE 0x1be /* Decline incoming call */ #define KEY_DEL_EOL 0x1c0 #define KEY_DEL_EOS 0x1c1 From bba013e1ca5e7150b42a1a1a1e852010d772edad Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 8 Sep 2020 09:58:09 -0700 Subject: [PATCH 023/497] Input: allocate keycode for Fn + right shift The last 2 generations of Lenovo Thinkpads send an acpi_thinkpad event when Fn + right shift is pressed. This is intended for use with "Lenovo Quick Clean" software, which disables the touchpad + kbd for 2 minutes on this key-combo so that healthcare workes can disinfect it. But there is no silkscreen print on the right-keyboard to indicate this, so add a KEY_FN_RIGHT_SHIFT keycode define to use for this key-combo. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20200908135147.4044-3-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index b74821d09145..ee93428ced9a 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -545,6 +545,7 @@ #define KEY_FN_F 0x1e2 #define KEY_FN_S 0x1e3 #define KEY_FN_B 0x1e4 +#define KEY_FN_RIGHT_SHIFT 0x1e5 #define KEY_BRL_DOT1 0x1f1 #define KEY_BRL_DOT2 0x1f2 From 7ed7748d2c9cb8c26f5fcbfe4f4fa87655bb9a21 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 8 Sep 2020 12:21:11 -0700 Subject: [PATCH 024/497] platform/x86: thinkpad_acpi: Add support for new hotkeys found on X1C8 / T14 New Lenovo Thinkpad models, e.g. the X1 Carbon 8th gen and the new T14 gen1 models have 3 new symbols / shortcuts on their F9-F11 keys (and the thinkpad_acpi driver receives 3 new hkey events for these): F9: Has a symbol resembling a rectangular speech balloon, the manual says the hotkey functions shows or hides the notification center F10: Has a symbol of a telephone horn which has been picked up from the receiver, the manual says: "Answer incoming calls" F11: Has a symbol of a telephone horn which is resting on the receiver, the manual says: "Decline incoming calls" And these Thinkpad models also send a new 0x1316 hkey events when the Fn + right Shift key-combo is pressed. This commit adds support for these 4 new hkey events. 
Acked-by: Henrique de Moraes Holschuh Acked-by: Andy Shevchenko Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20200908135147.4044-4-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/platform/x86/thinkpad_acpi.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 0f704484ae1d..c09933a86ee2 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -1930,6 +1930,10 @@ enum { /* hot key scan codes (derived from ACPI DSDT) */ TP_ACPI_HOTKEYSCAN_CALCULATOR, TP_ACPI_HOTKEYSCAN_BLUETOOTH, TP_ACPI_HOTKEYSCAN_KEYBOARD, + TP_ACPI_HOTKEYSCAN_FN_RIGHT_SHIFT, /* Used by "Lenovo Quick Clean" */ + TP_ACPI_HOTKEYSCAN_NOTIFICATION_CENTER, + TP_ACPI_HOTKEYSCAN_PICKUP_PHONE, + TP_ACPI_HOTKEYSCAN_HANGUP_PHONE, /* Hotkey keymap size */ TPACPI_HOTKEY_MAP_LEN @@ -3446,11 +3450,15 @@ static int __init hotkey_init(struct ibm_init_struct *iibm) KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, KEY_UNKNOWN, - KEY_BOOKMARKS, /* Favorite app, 0x311 */ - KEY_RESERVED, /* Clipping tool */ - KEY_CALC, /* Calculator (above numpad, P52) */ - KEY_BLUETOOTH, /* Bluetooth */ - KEY_KEYBOARD /* Keyboard, 0x315 */ + KEY_BOOKMARKS, /* Favorite app, 0x311 */ + KEY_RESERVED, /* Clipping tool */ + KEY_CALC, /* Calculator (above numpad, P52) */ + KEY_BLUETOOTH, /* Bluetooth */ + KEY_KEYBOARD, /* Keyboard, 0x315 */ + KEY_FN_RIGHT_SHIFT, /* Fn + right Shift */ + KEY_NOTIFICATION_CENTER, /* Notification Center */ + KEY_PICKUP_PHONE, /* Answer incoming call */ + KEY_HANGUP_PHONE, /* Decline incoming call */ }, }; From e2c8c4ec48b5cbd4b61edc24d884dfd5ec35ef9d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 8 Sep 2020 12:21:50 -0700 Subject: [PATCH 025/497] platform/x86: thinkpad_acpi: Map Clipping tool hotkey to KEY_SELECTIVE_SCREENSHOT Commit 696c6523ec8f ("platform/x86: thinkpad_acpi: add mapping for new hotkeys") added support for a bunch of new hotkeys, but the clipping/snipping tool hotkey got ignored because there was no good key-code to map it to. Recently a new KEY_SELECTIVE_SCREENSHOT keycode was added by commit 3b059da9835c ("Input: allocate keycode for "Selective Screenshot" key") quoting from the commit message: "New Chrome OS keyboards have a "snip" key that is basically a selective screenshot (allows a user to select an area of screen to be copied). Allocate a keycode for it." Support for this "snip" key seems like it is also a good match for the clipping/snipping tool hotkey, so map this hotkey to the new KEY_SELECTIVE_SCREENSHOT key-code. 
Reviewed-by: Bastien Nocera Acked-by: Henrique de Moraes Holschuh Acked-by: Andy Shevchenko Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20200908135147.4044-5-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/platform/x86/thinkpad_acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index c09933a86ee2..fd93df4ac293 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -3451,7 +3451,7 @@ static int __init hotkey_init(struct ibm_init_struct *iibm) KEY_UNKNOWN, KEY_BOOKMARKS, /* Favorite app, 0x311 */ - KEY_RESERVED, /* Clipping tool */ + KEY_SELECTIVE_SCREENSHOT, /* Clipping tool */ KEY_CALC, /* Calculator (above numpad, P52) */ KEY_BLUETOOTH, /* Bluetooth */ KEY_KEYBOARD, /* Keyboard, 0x315 */ From 9af3e08baa7c20ee69b7fc18e720e08a091493b9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:09 +0100 Subject: [PATCH 026/497] KVM: arm64: Remove kvm_mmu_free_memory_caches() kvm_mmu_free_memory_caches() is only called by kvm_arch_vcpu_destroy(), so inline the implementation and get rid of the extra function. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-2-will@kernel.org --- arch/arm64/include/asm/kvm_mmu.h | 2 -- arch/arm64/kvm/arm.c | 2 +- arch/arm64/kvm/mmu.c | 5 ----- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 189839c3706a..0f078b1920ff 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -141,8 +141,6 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); -void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); - phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); int kvm_mmu_init(void); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 46dc3d75cf13..262a0afbcc27 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -283,7 +283,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) static_branch_dec(&userspace_irqchip_in_use); - kvm_mmu_free_memory_caches(vcpu); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index ba00bcc0c884..935f8f689433 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -2324,11 +2324,6 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) kvm_test_age_hva_handler, NULL); } -void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) -{ - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); -} - phys_addr_t kvm_mmu_get_httbr(void) { if (__kvm_cpu_uses_extended_idmap()) From b1e57de62cfb4d05f45ab848bb893fbcff9557d3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:10 +0100 Subject: [PATCH 027/497] KVM: arm64: Add stand-alone page-table walker infrastructure The KVM page-table code is intricately tied into the kernel page-table code and re-uses the pte/pmd/pud/p4d/pgd macros directly in an attempt to reduce code duplication. 
Unfortunately, the reality is that there is an awful lot of code required to make this work, and at the end of the day you're limited to creating page-tables with the same configuration as the host kernel. Furthermore, lifting the page-table code to run directly at EL2 on a non-VHE system (as we plan to to do in future patches) is practically impossible due to the number of dependencies it has on the core kernel. Introduce a framework for walking Armv8 page-tables configured independently from the host kernel. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-3-will@kernel.org --- arch/arm64/include/asm/kvm_pgtable.h | 104 ++++++++++ arch/arm64/kvm/hyp/Makefile | 2 +- arch/arm64/kvm/hyp/pgtable.c | 285 +++++++++++++++++++++++++++ 3 files changed, 390 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/kvm_pgtable.h create mode 100644 arch/arm64/kvm/hyp/pgtable.c diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h new file mode 100644 index 000000000000..9423e5d609f1 --- /dev/null +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google LLC + * Author: Will Deacon + */ + +#ifndef __ARM64_KVM_PGTABLE_H__ +#define __ARM64_KVM_PGTABLE_H__ + +#include +#include +#include + +typedef u64 kvm_pte_t; + +/** + * struct kvm_pgtable - KVM page-table. + * @ia_bits: Maximum input address size, in bits. + * @start_level: Level at which the page-table walk starts. + * @pgd: Pointer to the first top-level entry of the page-table. + * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. + */ +struct kvm_pgtable { + u32 ia_bits; + u32 start_level; + kvm_pte_t *pgd; + + /* Stage-2 only */ + struct kvm_s2_mmu *mmu; +}; + +/** + * enum kvm_pgtable_prot - Page-table permissions and attributes. + * @KVM_PGTABLE_PROT_X: Execute permission. + * @KVM_PGTABLE_PROT_W: Write permission. + * @KVM_PGTABLE_PROT_R: Read permission. + * @KVM_PGTABLE_PROT_DEVICE: Device attributes. + */ +enum kvm_pgtable_prot { + KVM_PGTABLE_PROT_X = BIT(0), + KVM_PGTABLE_PROT_W = BIT(1), + KVM_PGTABLE_PROT_R = BIT(2), + + KVM_PGTABLE_PROT_DEVICE = BIT(3), +}; + +/** + * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk. + * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid + * entries. + * @KVM_PGTABLE_WALK_TABLE_PRE: Visit table entries before their + * children. + * @KVM_PGTABLE_WALK_TABLE_POST: Visit table entries after their + * children. + */ +enum kvm_pgtable_walk_flags { + KVM_PGTABLE_WALK_LEAF = BIT(0), + KVM_PGTABLE_WALK_TABLE_PRE = BIT(1), + KVM_PGTABLE_WALK_TABLE_POST = BIT(2), +}; + +typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg); + +/** + * struct kvm_pgtable_walker - Hook into a page-table walk. + * @cb: Callback function to invoke during the walk. + * @arg: Argument passed to the callback function. + * @flags: Bitwise-OR of flags to identify the entry types on which to + * invoke the callback function. + */ +struct kvm_pgtable_walker { + const kvm_pgtable_visitor_fn_t cb; + void * const arg; + const enum kvm_pgtable_walk_flags flags; +}; + +/** + * kvm_pgtable_walk() - Walk a page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_*_init(). + * @addr: Input address for the start of the walk. 
+ * @size: Size of the range to walk. + * @walker: Walker callback description. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * The walker will walk the page-table entries corresponding to the input + * address range specified, visiting entries according to the walker flags. + * Invalid entries are treated as leaf entries. Leaf entries are reloaded + * after invoking the walker callback, allowing the walker to descend into + * a newly installed table. + * + * Returning a negative error code from the walker callback function will + * terminate the walk immediately with the same error code. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_pgtable_walker *walker); + +#endif /* __ARM64_KVM_PGTABLE_H__ */ diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index f54f0e89a71c..607b8a898826 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -10,5 +10,5 @@ subdir-ccflags-y := -I$(incdir) \ -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) -obj-$(CONFIG_KVM) += vhe/ nvhe/ +obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o obj-$(CONFIG_KVM_INDIRECT_VECTORS) += smccc_wa.o diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c new file mode 100644 index 000000000000..3fb9d1949a3f --- /dev/null +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Stand-alone page-table allocator for hyp stage-1 and guest stage-2. + * No bombay mix was harmed in the writing of this file. + * + * Copyright (C) 2020 Google LLC + * Author: Will Deacon + */ + +#include +#include + +#define KVM_PGTABLE_MAX_LEVELS 4U + +#define KVM_PTE_VALID BIT(0) + +#define KVM_PTE_TYPE BIT(1) +#define KVM_PTE_TYPE_BLOCK 0 +#define KVM_PTE_TYPE_PAGE 1 +#define KVM_PTE_TYPE_TABLE 1 + +#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) +#define KVM_PTE_ADDR_51_48 GENMASK(15, 12) + +#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) + +#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) + +struct kvm_pgtable_walk_data { + struct kvm_pgtable *pgt; + struct kvm_pgtable_walker *walker; + + u64 addr; + u64 end; +}; + +static u64 kvm_granule_shift(u32 level) +{ + /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ + return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); +} + +static u64 kvm_granule_size(u32 level) +{ + return BIT(kvm_granule_shift(level)); +} + +static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) +{ + u64 granule = kvm_granule_size(level); + + /* + * Reject invalid block mappings and don't bother with 4TB mappings for + * 52-bit PAs. 
+ */ + if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1)) + return false; + + if (granule > (end - addr)) + return false; + + return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule); +} + +static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) +{ + u64 shift = kvm_granule_shift(level); + u64 mask = BIT(PAGE_SHIFT - 3) - 1; + + return (data->addr >> shift) & mask; +} + +static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) +{ + u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */ + u64 mask = BIT(pgt->ia_bits) - 1; + + return (addr & mask) >> shift; +} + +static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data) +{ + return __kvm_pgd_page_idx(data->pgt, data->addr); +} + +static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) +{ + struct kvm_pgtable pgt = { + .ia_bits = ia_bits, + .start_level = start_level, + }; + + return __kvm_pgd_page_idx(&pgt, -1ULL) + 1; +} + +static bool kvm_pte_valid(kvm_pte_t pte) +{ + return pte & KVM_PTE_VALID; +} + +static bool kvm_pte_table(kvm_pte_t pte, u32 level) +{ + if (level == KVM_PGTABLE_MAX_LEVELS - 1) + return false; + + if (!kvm_pte_valid(pte)) + return false; + + return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; +} + +static u64 kvm_pte_to_phys(kvm_pte_t pte) +{ + u64 pa = pte & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) + pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + + return pa; +} + +static kvm_pte_t kvm_phys_to_pte(u64 pa) +{ + kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) + pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + + return pte; +} + +static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte) +{ + return __va(kvm_pte_to_phys(pte)); +} + +static void kvm_set_invalid_pte(kvm_pte_t *ptep) +{ + kvm_pte_t pte = *ptep; + WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID); +} + +static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp) +{ + kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp)); + + pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE); + pte |= KVM_PTE_VALID; + + WARN_ON(kvm_pte_valid(old)); + smp_store_release(ptep, pte); +} + +static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr, + u32 level) +{ + kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa); + u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE : + KVM_PTE_TYPE_BLOCK; + + pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI); + pte |= FIELD_PREP(KVM_PTE_TYPE, type); + pte |= KVM_PTE_VALID; + + /* Tolerate KVM recreating the exact same mapping. 
*/ + if (kvm_pte_valid(old)) + return old == pte; + + smp_store_release(ptep, pte); + return true; +} + +static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr, + u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag) +{ + struct kvm_pgtable_walker *walker = data->walker; + return walker->cb(addr, data->end, level, ptep, flag, walker->arg); +} + +static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, + kvm_pte_t *pgtable, u32 level); + +static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, + kvm_pte_t *ptep, u32 level) +{ + int ret = 0; + u64 addr = data->addr; + kvm_pte_t *childp, pte = *ptep; + bool table = kvm_pte_table(pte, level); + enum kvm_pgtable_walk_flags flags = data->walker->flags; + + if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) { + ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, + KVM_PGTABLE_WALK_TABLE_PRE); + } + + if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) { + ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, + KVM_PGTABLE_WALK_LEAF); + pte = *ptep; + table = kvm_pte_table(pte, level); + } + + if (ret) + goto out; + + if (!table) { + data->addr += kvm_granule_size(level); + goto out; + } + + childp = kvm_pte_follow(pte); + ret = __kvm_pgtable_walk(data, childp, level + 1); + if (ret) + goto out; + + if (flags & KVM_PGTABLE_WALK_TABLE_POST) { + ret = kvm_pgtable_visitor_cb(data, addr, level, ptep, + KVM_PGTABLE_WALK_TABLE_POST); + } + +out: + return ret; +} + +static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, + kvm_pte_t *pgtable, u32 level) +{ + u32 idx; + int ret = 0; + + if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS)) + return -EINVAL; + + for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { + kvm_pte_t *ptep = &pgtable[idx]; + + if (data->addr >= data->end) + break; + + ret = __kvm_pgtable_visit(data, ptep, level); + if (ret) + break; + } + + return ret; +} + +static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data) +{ + u32 idx; + int ret = 0; + struct kvm_pgtable *pgt = data->pgt; + u64 limit = BIT(pgt->ia_bits); + + if (data->addr > limit || data->end > limit) + return -ERANGE; + + if (!pgt->pgd) + return -EINVAL; + + for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) { + kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE]; + + ret = __kvm_pgtable_walk(data, ptep, pgt->start_level); + if (ret) + break; + } + + return ret; +} + +int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_pgtable_walker *walker) +{ + struct kvm_pgtable_walk_data walk_data = { + .pgt = pgt, + .addr = ALIGN_DOWN(addr, PAGE_SIZE), + .end = PAGE_ALIGN(walk_data.addr + size), + .walker = walker, + }; + + return _kvm_pgtable_walk(&walk_data); +} From bb0e92cbbcd1f91411d0a7241f8febe2c8c5b078 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:11 +0100 Subject: [PATCH 028/497] KVM: arm64: Add support for creating kernel-agnostic stage-1 page tables The generic page-table walker is pretty useless as it stands, because it doesn't understand enough to allocate anything. Teach it about stage-1 page-tables, and hook up an API for allocating these for the hypervisor at EL2. 
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-4-will@kernel.org --- arch/arm64/include/asm/kvm_pgtable.h | 40 ++++++++ arch/arm64/kvm/hyp/pgtable.c | 133 +++++++++++++++++++++++++++ 2 files changed, 173 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 9423e5d609f1..d0a207d095e6 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -77,6 +77,46 @@ struct kvm_pgtable_walker { const enum kvm_pgtable_walk_flags flags; }; +/** + * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table. + * @pgt: Uninitialised page-table structure to initialise. + * @va_bits: Maximum virtual address bits. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits); + +/** + * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). + * + * The page-table is assumed to be unreachable by any hardware walkers prior + * to freeing and therefore no TLB invalidation is performed. + */ +void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt); + +/** + * kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_hyp_init(). + * @addr: Virtual address at which to place the mapping. + * @size: Size of the mapping. + * @phys: Physical address of the memory to map. + * @prot: Permissions and attributes for the mapping. + * + * The offset of @addr within a page is ignored, @size is rounded-up to + * the next page boundary and @phys is rounded-down to the previous page + * boundary. + * + * If device attributes are not explicitly requested in @prot, then the + * mapping will be normal, cacheable. Attempts to install a new mapping + * for a virtual address that is already mapped will be rejected with an + * error and a WARN(). + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, + enum kvm_pgtable_prot prot); + /** * kvm_pgtable_walk() - Walk a page-table. * @pgt: Page-table structure initialised by kvm_pgtable_*_init(). diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 3fb9d1949a3f..23a1006aa4ef 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -24,8 +24,18 @@ #define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) +#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 +#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) + #define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) +#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) + struct kvm_pgtable_walk_data { struct kvm_pgtable *pgt; struct kvm_pgtable_walker *walker; @@ -283,3 +293,126 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, return _kvm_pgtable_walk(&walk_data); } + +struct hyp_map_data { + u64 phys; + kvm_pte_t attr; +}; + +static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot, + struct hyp_map_data *data) +{ + bool device = prot & KVM_PGTABLE_PROT_DEVICE; + u32 mtype = device ? 
MT_DEVICE_nGnRE : MT_NORMAL; + kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); + u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; + u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW : + KVM_PTE_LEAF_ATTR_LO_S1_AP_RO; + + if (!(prot & KVM_PGTABLE_PROT_R)) + return -EINVAL; + + if (prot & KVM_PGTABLE_PROT_X) { + if (prot & KVM_PGTABLE_PROT_W) + return -EINVAL; + + if (device) + return -EINVAL; + } else { + attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; + } + + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); + attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; + data->attr = attr; + return 0; +} + +static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, struct hyp_map_data *data) +{ + u64 granule = kvm_granule_size(level), phys = data->phys; + + if (!kvm_block_mapping_supported(addr, end, phys, level)) + return false; + + WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level)); + data->phys += granule; + return true; +} + +static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + kvm_pte_t *childp; + + if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg)) + return 0; + + if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) + return -EINVAL; + + childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); + if (!childp) + return -ENOMEM; + + kvm_set_table_pte(ptep, childp); + return 0; +} + +int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, + enum kvm_pgtable_prot prot) +{ + int ret; + struct hyp_map_data map_data = { + .phys = ALIGN_DOWN(phys, PAGE_SIZE), + }; + struct kvm_pgtable_walker walker = { + .cb = hyp_map_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &map_data, + }; + + ret = hyp_map_set_prot_attr(prot, &map_data); + if (ret) + return ret; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + isb(); + return ret; +} + +int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits) +{ + u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); + + pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); + if (!pgt->pgd) + return -ENOMEM; + + pgt->ia_bits = va_bits; + pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; + pgt->mmu = NULL; + return 0; +} + +static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + free_page((unsigned long)kvm_pte_follow(*ptep)); + return 0; +} + +void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) +{ + struct kvm_pgtable_walker walker = { + .cb = hyp_free_walker, + .flags = KVM_PGTABLE_WALK_TABLE_POST, + }; + + WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); + free_page((unsigned long)pgt->pgd); + pgt->pgd = NULL; +} From 0f9d09b8e29bc8166f6584279aedc4a7a4038f68 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:12 +0100 Subject: [PATCH 029/497] KVM: arm64: Use generic allocator for hyp stage-1 page-tables Now that we have a shiny new page-table allocator, replace the hyp page-table code with calls into the new API. This also allows us to remove the extended idmap code, as we can now simply ensure that the VA size is large enough to map everything we need. 
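As a concrete illustration of the "VA size is large enough" point (the numbers here are examples, not values taken from this patch): the EL2 VA width is now derived directly from the idmap's T0SZ as hyp_va_bits = 64 - T0SZ. A kernel whose idmap fits below the 39-bit boundary has T0SZ = 25 and gets 39-bit, three-level EL2 tables (with 4KiB pages), while an idmap that needs 48-bit addressing has T0SZ = 16 and gets 48-bit, four-level EL2 tables; the latter is precisely the configuration the extended idmap previously had to construct by hand.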
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-5-will@kernel.org --- arch/arm64/include/asm/kvm_mmu.h | 78 +---- arch/arm64/include/asm/kvm_pgtable.h | 5 + arch/arm64/include/asm/pgtable-hwdef.h | 6 - arch/arm64/include/asm/pgtable-prot.h | 6 - arch/arm64/kvm/mmu.c | 414 +++---------------------- 5 files changed, 45 insertions(+), 464 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 0f078b1920ff..42fb50cfe0d8 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -43,16 +43,6 @@ * HYP_VA_MIN = 1 << (VA_BITS - 1) * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1 * - * This of course assumes that the trampoline page exists within the - * VA_BITS range. If it doesn't, then it means we're in the odd case - * where the kernel idmap (as well as HYP) uses more levels than the - * kernel runtime page tables (as seen when the kernel is configured - * for 4k pages, 39bits VA, and yet memory lives just above that - * limit, forcing the idmap to use 4 levels of page tables while the - * kernel itself only uses 3). In this particular case, it doesn't - * matter which side of VA_BITS we use, as we're guaranteed not to - * conflict with anything. - * * When using VHE, there are no separate hyp mappings and all KVM * functionality is already mapped as part of the main kernel * mappings, and none of this applies in that case. @@ -123,9 +113,10 @@ static inline bool kvm_page_empty(void *ptr) return page_count(ptr_page) == 1; } +#include #include -int create_hyp_mappings(void *from, void *to, pgprot_t prot); +int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot); int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, void __iomem **haddr); @@ -144,8 +135,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); int kvm_mmu_init(void); -void kvm_clear_hyp_idmap(void); - #define kvm_mk_pmd(ptep) \ __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE) #define kvm_mk_pud(pmdp) \ @@ -263,25 +252,6 @@ static inline bool kvm_s2pud_young(pud_t pud) return pud_young(pud); } -#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) - -#ifdef __PAGETABLE_PMD_FOLDED -#define hyp_pmd_table_empty(pmdp) (0) -#else -#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp) -#endif - -#ifdef __PAGETABLE_PUD_FOLDED -#define hyp_pud_table_empty(pudp) (0) -#else -#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp) -#endif - -#ifdef __PAGETABLE_P4D_FOLDED -#define hyp_p4d_table_empty(p4dp) (0) -#else -#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp) -#endif struct kvm; @@ -350,50 +320,6 @@ static inline void __kvm_flush_dcache_pud(pud_t pud) void kvm_set_way_flush(struct kvm_vcpu *vcpu); void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled); -static inline bool __kvm_cpu_uses_extended_idmap(void) -{ - return __cpu_uses_extended_idmap_level(); -} - -static inline unsigned long __kvm_idmap_ptrs_per_pgd(void) -{ - return idmap_ptrs_per_pgd; -} - -/* - * Can't use pgd_populate here, because the extended idmap adds an extra level - * above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended - * idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4. 
- */ -static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd, - pgd_t *hyp_pgd, - pgd_t *merged_hyp_pgd, - unsigned long hyp_idmap_start) -{ - int idmap_idx; - u64 pgd_addr; - - /* - * Use the first entry to access the HYP mappings. It is - * guaranteed to be free, otherwise we wouldn't use an - * extended idmap. - */ - VM_BUG_ON(pgd_val(merged_hyp_pgd[0])); - pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd)); - merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE); - - /* - * Create another extended level entry that points to the boot HYP map, - * which contains an ID mapping of the HYP init code. We essentially - * merge the boot and runtime HYP maps by doing so, but they don't - * overlap anyway, so this is fine. - */ - idmap_idx = hyp_idmap_start >> VA_BITS; - VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx])); - pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd)); - merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE); -} - static inline unsigned int kvm_get_vmid_bits(void) { int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index d0a207d095e6..ff5d7d27eb39 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -44,6 +44,11 @@ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_DEVICE = BIT(3), }; +#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) +#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) +#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) +#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) + /** * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk. * @KVM_PGTABLE_WALK_LEAF: Visit leaf entries, including invalid diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index d400a4d9aee2..1a989353144e 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -194,12 +194,6 @@ */ #define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2) -/* - * EL2/HYP PTE/PMD definitions - */ -#define PMD_HYP PMD_SECT_USER -#define PTE_HYP PTE_USER - /* * Highest possible physical address supported. 
*/ diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 4d867c6446c4..88acd7e1cd05 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -56,7 +56,6 @@ extern bool arm64_use_ng_mappings; #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) #define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) -#define _HYP_PAGE_DEFAULT _PAGE_DEFAULT #define PAGE_KERNEL __pgprot(PROT_NORMAL) #define PAGE_KERNEL_RO __pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY) @@ -64,11 +63,6 @@ extern bool arm64_use_ng_mappings; #define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN) #define PAGE_KERNEL_EXEC_CONT __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT) -#define PAGE_HYP __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) -#define PAGE_HYP_EXEC __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) -#define PAGE_HYP_RO __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) -#define PAGE_HYP_DEVICE __pgprot(_PROT_DEFAULT | PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_HYP | PTE_HYP_XN) - #define PAGE_S2_MEMATTR(attr) \ ({ \ u64 __val; \ diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 935f8f689433..fabd72b0c8a4 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -21,9 +22,7 @@ #include "trace.h" -static pgd_t *boot_hyp_pgd; -static pgd_t *hyp_pgd; -static pgd_t *merged_hyp_pgd; +static struct kvm_pgtable *hyp_pgtable; static DEFINE_MUTEX(kvm_hyp_pgd_mutex); static unsigned long hyp_idmap_start; @@ -32,8 +31,6 @@ static phys_addr_t hyp_idmap_vector; static unsigned long io_map_base; -#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) - #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) #define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) @@ -489,338 +486,28 @@ static void stage2_flush_vm(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); } -static void clear_hyp_pgd_entry(pgd_t *pgd) -{ - p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL); - pgd_clear(pgd); - p4d_free(NULL, p4d_table); - put_page(virt_to_page(pgd)); -} - -static void clear_hyp_p4d_entry(p4d_t *p4d) -{ - pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL); - VM_BUG_ON(p4d_huge(*p4d)); - p4d_clear(p4d); - pud_free(NULL, pud_table); - put_page(virt_to_page(p4d)); -} - -static void clear_hyp_pud_entry(pud_t *pud) -{ - pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0); - VM_BUG_ON(pud_huge(*pud)); - pud_clear(pud); - pmd_free(NULL, pmd_table); - put_page(virt_to_page(pud)); -} - -static void clear_hyp_pmd_entry(pmd_t *pmd) -{ - pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(pmd_thp_or_huge(*pmd)); - pmd_clear(pmd); - pte_free_kernel(NULL, pte_table); - put_page(virt_to_page(pmd)); -} - -static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte, *start_pte; - - start_pte = pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - kvm_set_pte(pte, __pte(0)); - put_page(virt_to_page(pte)); - } - } while (pte++, addr += PAGE_SIZE, addr != end); - - if (hyp_pte_table_empty(start_pte)) - clear_hyp_pmd_entry(pmd); -} - -static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - pmd_t *pmd, *start_pmd; - - start_pmd = pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - /* Hyp doesn't use huge pmds */ - if (!pmd_none(*pmd)) - unmap_hyp_ptes(pmd, addr, next); - } while (pmd++, addr = next, addr != end); - - if 
(hyp_pmd_table_empty(start_pmd)) - clear_hyp_pud_entry(pud); -} - -static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - pud_t *pud, *start_pud; - - start_pud = pud = pud_offset(p4d, addr); - do { - next = pud_addr_end(addr, end); - /* Hyp doesn't use huge puds */ - if (!pud_none(*pud)) - unmap_hyp_pmds(pud, addr, next); - } while (pud++, addr = next, addr != end); - - if (hyp_pud_table_empty(start_pud)) - clear_hyp_p4d_entry(p4d); -} - -static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t next; - p4d_t *p4d, *start_p4d; - - start_p4d = p4d = p4d_offset(pgd, addr); - do { - next = p4d_addr_end(addr, end); - /* Hyp doesn't use huge p4ds */ - if (!p4d_none(*p4d)) - unmap_hyp_puds(p4d, addr, next); - } while (p4d++, addr = next, addr != end); - - if (hyp_p4d_table_empty(start_p4d)) - clear_hyp_pgd_entry(pgd); -} - -static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd) -{ - return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1); -} - -static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd, - phys_addr_t start, u64 size) -{ - pgd_t *pgd; - phys_addr_t addr = start, end = start + size; - phys_addr_t next; - - /* - * We don't unmap anything from HYP, except at the hyp tear down. - * Hence, we don't have to invalidate the TLBs here. - */ - pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); - do { - next = pgd_addr_end(addr, end); - if (!pgd_none(*pgd)) - unmap_hyp_p4ds(pgd, addr, next); - } while (pgd++, addr = next, addr != end); -} - -static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) -{ - __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size); -} - -static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size) -{ - __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size); -} - /** * free_hyp_pgds - free Hyp-mode page tables - * - * Assumes hyp_pgd is a page table used strictly in Hyp-mode and - * therefore contains either mappings in the kernel memory area (above - * PAGE_OFFSET), or device mappings in the idmap range. - * - * boot_hyp_pgd should only map the idmap range, and is only used in - * the extended idmap case. */ void free_hyp_pgds(void) { - pgd_t *id_pgd; - mutex_lock(&kvm_hyp_pgd_mutex); - - id_pgd = boot_hyp_pgd ? 
boot_hyp_pgd : hyp_pgd; - - if (id_pgd) { - /* In case we never called hyp_mmu_init() */ - if (!io_map_base) - io_map_base = hyp_idmap_start; - unmap_hyp_idmap_range(id_pgd, io_map_base, - hyp_idmap_start + PAGE_SIZE - io_map_base); + if (hyp_pgtable) { + kvm_pgtable_hyp_destroy(hyp_pgtable); + kfree(hyp_pgtable); } - - if (boot_hyp_pgd) { - free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); - boot_hyp_pgd = NULL; - } - - if (hyp_pgd) { - unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), - (uintptr_t)high_memory - PAGE_OFFSET); - - free_pages((unsigned long)hyp_pgd, hyp_pgd_order); - hyp_pgd = NULL; - } - if (merged_hyp_pgd) { - clear_page(merged_hyp_pgd); - free_page((unsigned long)merged_hyp_pgd); - merged_hyp_pgd = NULL; - } - mutex_unlock(&kvm_hyp_pgd_mutex); } -static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) +static int __create_hyp_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot) { - pte_t *pte; - unsigned long addr; - - addr = start; - do { - pte = pte_offset_kernel(pmd, addr); - kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); - get_page(virt_to_page(pte)); - pfn++; - } while (addr += PAGE_SIZE, addr != end); -} - -static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pmd_t *pmd; - pte_t *pte; - unsigned long addr, next; - - addr = start; - do { - pmd = pmd_offset(pud, addr); - - BUG_ON(pmd_sect(*pmd)); - - if (pmd_none(*pmd)) { - pte = pte_alloc_one_kernel(NULL); - if (!pte) { - kvm_err("Cannot allocate Hyp pte\n"); - return -ENOMEM; - } - kvm_pmd_populate(pmd, pte); - get_page(virt_to_page(pmd)); - } - - next = pmd_addr_end(addr, end); - - create_hyp_pte_mappings(pmd, addr, next, pfn, prot); - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - pud_t *pud; - pmd_t *pmd; - unsigned long addr, next; - int ret; - - addr = start; - do { - pud = pud_offset(p4d, addr); - - if (pud_none_or_clear_bad(pud)) { - pmd = pmd_alloc_one(NULL, addr); - if (!pmd) { - kvm_err("Cannot allocate Hyp pmd\n"); - return -ENOMEM; - } - kvm_pud_populate(pud, pmd); - get_page(virt_to_page(pud)); - } - - next = pud_addr_end(addr, end); - ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot); - if (ret) - return ret; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start, - unsigned long end, unsigned long pfn, - pgprot_t prot) -{ - p4d_t *p4d; - pud_t *pud; - unsigned long addr, next; - int ret; - - addr = start; - do { - p4d = p4d_offset(pgd, addr); - - if (p4d_none(*p4d)) { - pud = pud_alloc_one(NULL, addr); - if (!pud) { - kvm_err("Cannot allocate Hyp pud\n"); - return -ENOMEM; - } - kvm_p4d_populate(p4d, pud); - get_page(virt_to_page(p4d)); - } - - next = p4d_addr_end(addr, end); - ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot); - if (ret) - return ret; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); - - return 0; -} - -static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, - unsigned long start, unsigned long end, - unsigned long pfn, pgprot_t prot) -{ - pgd_t *pgd; - p4d_t *p4d; - unsigned long addr, next; - int err = 0; + int err; mutex_lock(&kvm_hyp_pgd_mutex); - 
addr = start & PAGE_MASK; - end = PAGE_ALIGN(end); - do { - pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd); - - if (pgd_none(*pgd)) { - p4d = p4d_alloc_one(NULL, addr); - if (!p4d) { - kvm_err("Cannot allocate Hyp p4d\n"); - err = -ENOMEM; - goto out; - } - kvm_pgd_populate(pgd, p4d); - get_page(virt_to_page(pgd)); - } - - next = pgd_addr_end(addr, end); - err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot); - if (err) - goto out; - pfn += (next - addr) >> PAGE_SHIFT; - } while (addr = next, addr != end); -out: + err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); mutex_unlock(&kvm_hyp_pgd_mutex); + return err; } @@ -845,7 +532,7 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr) * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying * physical pages. */ -int create_hyp_mappings(void *from, void *to, pgprot_t prot) +int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) { phys_addr_t phys_addr; unsigned long virt_addr; @@ -862,9 +549,7 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot) int err; phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); - err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, - virt_addr, virt_addr + PAGE_SIZE, - __phys_to_pfn(phys_addr), + err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr, prot); if (err) return err; @@ -874,9 +559,9 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot) } static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, - unsigned long *haddr, pgprot_t prot) + unsigned long *haddr, + enum kvm_pgtable_prot prot) { - pgd_t *pgd = hyp_pgd; unsigned long base; int ret = 0; @@ -908,17 +593,11 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size, if (ret) goto out; - if (__kvm_cpu_uses_extended_idmap()) - pgd = boot_hyp_pgd; - - ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), - base, base + size, - __phys_to_pfn(phys_addr), prot); + ret = __create_hyp_mappings(base, size, phys_addr, prot); if (ret) goto out; *haddr = base + offset_in_page(phys_addr); - out: return ret; } @@ -2326,10 +2005,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) phys_addr_t kvm_mmu_get_httbr(void) { - if (__kvm_cpu_uses_extended_idmap()) - return virt_to_phys(merged_hyp_pgd); - else - return virt_to_phys(hyp_pgd); + return __pa(hyp_pgtable->pgd); } phys_addr_t kvm_get_idmap_vector(void) @@ -2337,15 +2013,11 @@ phys_addr_t kvm_get_idmap_vector(void) return hyp_idmap_vector; } -static int kvm_map_idmap_text(pgd_t *pgd) +static int kvm_map_idmap_text(void) { - int err; - - /* Create the idmap in the boot page tables */ - err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), - hyp_idmap_start, hyp_idmap_end, - __phys_to_pfn(hyp_idmap_start), - PAGE_HYP_EXEC); + unsigned long size = hyp_idmap_end - hyp_idmap_start; + int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start, + PAGE_HYP_EXEC); if (err) kvm_err("Failed to idmap %lx-%lx\n", hyp_idmap_start, hyp_idmap_end); @@ -2356,6 +2028,7 @@ static int kvm_map_idmap_text(pgd_t *pgd) int kvm_mmu_init(void) { int err; + u32 hyp_va_bits; hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); @@ -2369,6 +2042,8 @@ int kvm_mmu_init(void) */ BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); + hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); + kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits); kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); 
kvm_debug("HYP VA range: %lx:%lx\n", kern_hyp_va(PAGE_OFFSET), @@ -2386,43 +2061,30 @@ int kvm_mmu_init(void) goto out; } - hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); - if (!hyp_pgd) { - kvm_err("Hyp mode PGD not allocated\n"); + hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL); + if (!hyp_pgtable) { + kvm_err("Hyp mode page-table not allocated\n"); err = -ENOMEM; goto out; } - if (__kvm_cpu_uses_extended_idmap()) { - boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - hyp_pgd_order); - if (!boot_hyp_pgd) { - kvm_err("Hyp boot PGD not allocated\n"); - err = -ENOMEM; - goto out; - } + err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits); + if (err) + goto out_free_pgtable; - err = kvm_map_idmap_text(boot_hyp_pgd); - if (err) - goto out; - - merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - if (!merged_hyp_pgd) { - kvm_err("Failed to allocate extra HYP pgd\n"); - goto out; - } - __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, - hyp_idmap_start); - } else { - err = kvm_map_idmap_text(hyp_pgd); - if (err) - goto out; - } + err = kvm_map_idmap_text(); + if (err) + goto out_destroy_pgtable; io_map_base = hyp_idmap_start; return 0; + +out_destroy_pgtable: + kvm_pgtable_hyp_destroy(hyp_pgtable); +out_free_pgtable: + kfree(hyp_pgtable); + hyp_pgtable = NULL; out: - free_hyp_pgds(); return err; } From 71233d05f4b5e3560f0d3d5607d01e9beff8dcbd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:13 +0100 Subject: [PATCH 030/497] KVM: arm64: Add support for creating kernel-agnostic stage-2 page tables Introduce alloc() and free() functions to the generic page-table code for guest stage-2 page-tables and plumb these into the existing KVM page-table allocator. Subsequent patches will convert other operations within the KVM allocator over to the generic code. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-6-will@kernel.org --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_pgtable.h | 18 +++++++++ arch/arm64/kvm/hyp/pgtable.c | 54 +++++++++++++++++++++++++++ arch/arm64/kvm/mmu.c | 55 +++++++++++++++------------- 4 files changed, 102 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e52c927aade5..0b7c702b2151 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -81,6 +81,7 @@ struct kvm_s2_mmu { */ pgd_t *pgd; phys_addr_t pgd_phys; + struct kvm_pgtable *pgt; /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index ff5d7d27eb39..21d71395a377 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -122,6 +122,24 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt); int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot); +/** + * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. + * @pgt: Uninitialised page-table structure to initialise. + * @kvm: KVM structure representing the guest virtual machine. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm); + +/** + * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. 
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * + * The page-table is assumed to be unreachable by any hardware walkers prior + * to freeing and therefore no TLB invalidation is performed. + */ +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); + /** * kvm_pgtable_walk() - Walk a page-table. * @pgt: Page-table structure initialised by kvm_pgtable_*_init(). diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 23a1006aa4ef..16b34d11e7cf 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -416,3 +416,57 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) free_page((unsigned long)pgt->pgd); pgt->pgd = NULL; } + +int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm) +{ + size_t pgd_sz; + u64 vtcr = kvm->arch.vtcr; + u32 ia_bits = VTCR_EL2_IPA(vtcr); + u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + + pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; + pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO); + if (!pgt->pgd) + return -ENOMEM; + + pgt->ia_bits = ia_bits; + pgt->start_level = start_level; + pgt->mmu = &kvm->arch.mmu; + + /* Ensure zeroed PGD pages are visible to the hardware walker */ + dsb(ishst); + return 0; +} + +static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + kvm_pte_t pte = *ptep; + + if (!kvm_pte_valid(pte)) + return 0; + + put_page(virt_to_page(ptep)); + + if (kvm_pte_table(pte, level)) + free_page((unsigned long)kvm_pte_follow(pte)); + + return 0; +} + +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +{ + size_t pgd_sz; + struct kvm_pgtable_walker walker = { + .cb = stage2_free_walker, + .flags = KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_TABLE_POST, + }; + + WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); + pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; + free_pages_exact(pgt->pgd, pgd_sz); + pgt->pgd = NULL; +} diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index fabd72b0c8a4..4607e9ca60a2 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -668,47 +668,49 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, * @kvm: The pointer to the KVM structure * @mmu: The pointer to the s2 MMU structure * - * Allocates only the stage-2 HW PGD level table(s) of size defined by - * stage2_pgd_size(mmu->kvm). - * + * Allocates only the stage-2 HW PGD level table(s). * Note we don't need locking here as this is only called when the VM is * created, which can only be done once. 
*/ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) { - phys_addr_t pgd_phys; - pgd_t *pgd; - int cpu; + int cpu, err; + struct kvm_pgtable *pgt; - if (mmu->pgd != NULL) { + if (mmu->pgt != NULL) { kvm_err("kvm_arch already initialized?\n"); return -EINVAL; } - /* Allocate the HW PGD, making sure that each page gets its own refcount */ - pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO); - if (!pgd) + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL); + if (!pgt) return -ENOMEM; - pgd_phys = virt_to_phys(pgd); - if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm))) - return -EINVAL; + err = kvm_pgtable_stage2_init(pgt, kvm); + if (err) + goto out_free_pgtable; mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); if (!mmu->last_vcpu_ran) { - free_pages_exact(pgd, stage2_pgd_size(kvm)); - return -ENOMEM; + err = -ENOMEM; + goto out_destroy_pgtable; } for_each_possible_cpu(cpu) *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; mmu->kvm = kvm; - mmu->pgd = pgd; - mmu->pgd_phys = pgd_phys; + mmu->pgt = pgt; + mmu->pgd_phys = __pa(pgt->pgd); + mmu->pgd = (void *)pgt->pgd; mmu->vmid.vmid_gen = 0; - return 0; + +out_destroy_pgtable: + kvm_pgtable_stage2_destroy(pgt); +out_free_pgtable: + kfree(pgt); + return err; } static void stage2_unmap_memslot(struct kvm *kvm, @@ -781,20 +783,21 @@ void stage2_unmap_vm(struct kvm *kvm) void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) { struct kvm *kvm = mmu->kvm; - void *pgd = NULL; + struct kvm_pgtable *pgt = NULL; spin_lock(&kvm->mmu_lock); - if (mmu->pgd) { - unmap_stage2_range(mmu, 0, kvm_phys_size(kvm)); - pgd = READ_ONCE(mmu->pgd); + pgt = mmu->pgt; + if (pgt) { mmu->pgd = NULL; + mmu->pgd_phys = 0; + mmu->pgt = NULL; + free_percpu(mmu->last_vcpu_ran); } spin_unlock(&kvm->mmu_lock); - /* Free the HW pgd, one page at a time */ - if (pgd) { - free_pages_exact(pgd, stage2_pgd_size(kvm)); - free_percpu(mmu->last_vcpu_ran); + if (pgt) { + kvm_pgtable_stage2_destroy(pgt); + kfree(pgt); } } From 6d9d2115c48027132a69477c98d812be669f5b9b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:14 +0100 Subject: [PATCH 031/497] KVM: arm64: Add support for stage-2 map()/unmap() in generic page-table Add stage-2 map() and unmap() operations to the generic page-table code. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-7-will@kernel.org --- arch/arm64/include/asm/kvm_pgtable.h | 46 +++++ arch/arm64/kvm/hyp/pgtable.c | 273 +++++++++++++++++++++++++++ 2 files changed, 319 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 21d71395a377..895b2238062b 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -140,6 +140,52 @@ int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm); */ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); +/** + * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address at which to place the mapping. + * @size: Size of the mapping. + * @phys: Physical address of the memory to map. + * @prot: Permissions and attributes for the mapping. + * @mc: Cache of pre-allocated GFP_PGTABLE_USER memory from which to + * allocate page-table pages. 
+ * + * The offset of @addr within a page is ignored, @size is rounded-up to + * the next page boundary and @phys is rounded-down to the previous page + * boundary. + * + * If device attributes are not explicitly requested in @prot, then the + * mapping will be normal, cacheable. + * + * Note that this function will both coalesce existing table entries and split + * existing block mappings, relying on page-faults to fault back areas outside + * of the new mapping lazily. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, + u64 phys, enum kvm_pgtable_prot prot, + struct kvm_mmu_memory_cache *mc); + +/** + * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to remove the mapping. + * @size: Size of the mapping. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * TLB invalidation is performed for each page-table entry cleared during the + * unmapping operation and the reference count for the page-table page + * containing the cleared entry is decremented, with unreferenced pages being + * freed. Unmapping a cacheable page will ensure that it is clean to the PoC if + * FWB is not supported by the CPU. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); + /** * kvm_pgtable_walk() - Walk a page-table. * @pgt: Page-table structure initialised by kvm_pgtable_*_init(). diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 16b34d11e7cf..2f9b872f5355 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -32,10 +32,19 @@ #define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 #define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) +#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) + #define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) +#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) + struct kvm_pgtable_walk_data { struct kvm_pgtable *pgt; struct kvm_pgtable_walker *walker; @@ -417,6 +426,270 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) pgt->pgd = NULL; } +struct stage2_map_data { + u64 phys; + kvm_pte_t attr; + + kvm_pte_t *anchor; + + struct kvm_s2_mmu *mmu; + struct kvm_mmu_memory_cache *memcache; +}; + +static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot, + struct stage2_map_data *data) +{ + bool device = prot & KVM_PGTABLE_PROT_DEVICE; + kvm_pte_t attr = device ? 
PAGE_S2_MEMATTR(DEVICE_nGnRE) : + PAGE_S2_MEMATTR(NORMAL); + u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + + if (!(prot & KVM_PGTABLE_PROT_X)) + attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + else if (device) + return -EINVAL; + + if (prot & KVM_PGTABLE_PROT_R) + attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + + if (prot & KVM_PGTABLE_PROT_W) + attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; + + attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); + attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + data->attr = attr; + return 0; +} + +static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + u64 granule = kvm_granule_size(level), phys = data->phys; + + if (!kvm_block_mapping_supported(addr, end, phys, level)) + return false; + + if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level)) + goto out; + + /* There's an existing valid leaf entry, so perform break-before-make */ + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); + kvm_set_valid_leaf_pte(ptep, phys, data->attr, level); +out: + data->phys += granule; + return true; +} + +static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + if (data->anchor) + return 0; + + if (!kvm_block_mapping_supported(addr, end, data->phys, level)) + return 0; + + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0); + data->anchor = ptep; + return 0; +} + +static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + kvm_pte_t *childp, pte = *ptep; + struct page *page = virt_to_page(ptep); + + if (data->anchor) { + if (kvm_pte_valid(pte)) + put_page(page); + + return 0; + } + + if (stage2_map_walker_try_leaf(addr, end, level, ptep, data)) + goto out_get_page; + + if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) + return -EINVAL; + + if (!data->memcache) + return -ENOMEM; + + childp = kvm_mmu_memory_cache_alloc(data->memcache); + if (!childp) + return -ENOMEM; + + /* + * If we've run into an existing block mapping then replace it with + * a table. Accesses beyond 'end' that fall within the new table + * will be mapped lazily. + */ + if (kvm_pte_valid(pte)) { + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); + put_page(page); + } + + kvm_set_table_pte(ptep, childp); + +out_get_page: + get_page(page); + return 0; +} + +static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + int ret = 0; + + if (!data->anchor) + return 0; + + free_page((unsigned long)kvm_pte_follow(*ptep)); + put_page(virt_to_page(ptep)); + + if (data->anchor == ptep) { + data->anchor = NULL; + ret = stage2_map_walk_leaf(addr, end, level, ptep, data); + } + + return ret; +} + +/* + * This is a little fiddly, as we use all three of the walk flags. The idea + * is that the TABLE_PRE callback runs for table entries on the way down, + * looking for table entries which we could conceivably replace with a + * block entry for this mapping. If it finds one, then it sets the 'anchor' + * field in 'struct stage2_map_data' to point at the table entry, before + * clearing the entry to zero and descending into the now detached table. + * + * The behaviour of the LEAF callback then depends on whether or not the + * anchor has been set. If not, then we're not using a block mapping higher + * up the table and we perform the mapping at the existing leaves instead. 
+ * If, on the other hand, the anchor _is_ set, then we drop references to + * all valid leaves so that the pages beneath the anchor can be freed. + * + * Finally, the TABLE_POST callback does nothing if the anchor has not + * been set, but otherwise frees the page-table pages while walking back up + * the page-table, installing the block entry when it revisits the anchor + * pointer and clearing the anchor to NULL. + */ +static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + struct stage2_map_data *data = arg; + + switch (flag) { + case KVM_PGTABLE_WALK_TABLE_PRE: + return stage2_map_walk_table_pre(addr, end, level, ptep, data); + case KVM_PGTABLE_WALK_LEAF: + return stage2_map_walk_leaf(addr, end, level, ptep, data); + case KVM_PGTABLE_WALK_TABLE_POST: + return stage2_map_walk_table_post(addr, end, level, ptep, data); + } + + return -EINVAL; +} + +int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, + u64 phys, enum kvm_pgtable_prot prot, + struct kvm_mmu_memory_cache *mc) +{ + int ret; + struct stage2_map_data map_data = { + .phys = ALIGN_DOWN(phys, PAGE_SIZE), + .mmu = pgt->mmu, + .memcache = mc, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_map_walker, + .flags = KVM_PGTABLE_WALK_TABLE_PRE | + KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_TABLE_POST, + .arg = &map_data, + }; + + ret = stage2_map_set_prot_attr(prot, &map_data); + if (ret) + return ret; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + dsb(ishst); + return ret; +} + +static void stage2_flush_dcache(void *addr, u64 size) +{ + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + return; + + __flush_dcache_area(addr, size); +} + +static bool stage2_pte_cacheable(kvm_pte_t pte) +{ + u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte); + return memattr == PAGE_S2_MEMATTR(NORMAL); +} + +static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + struct kvm_s2_mmu *mmu = arg; + kvm_pte_t pte = *ptep, *childp = NULL; + bool need_flush = false; + + if (!kvm_pte_valid(pte)) + return 0; + + if (kvm_pte_table(pte, level)) { + childp = kvm_pte_follow(pte); + + if (page_count(virt_to_page(childp)) != 1) + return 0; + } else if (stage2_pte_cacheable(pte)) { + need_flush = true; + } + + /* + * This is similar to the map() path in that we unmap the entire + * block entry and rely on the remaining portions being faulted + * back lazily. + */ + kvm_set_invalid_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); + put_page(virt_to_page(ptep)); + + if (need_flush) { + stage2_flush_dcache(kvm_pte_follow(pte), + kvm_granule_size(level)); + } + + if (childp) + free_page((unsigned long)childp); + + return 0; +} + +int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_unmap_walker, + .arg = pgt->mmu, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + }; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} + int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm) { size_t pgd_sz; From 02bbd374ce4a8aad36c022fd1203cb9f11b52ee3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:15 +0100 Subject: [PATCH 032/497] KVM: arm64: Convert kvm_phys_addr_ioremap() to generic page-table API Convert kvm_phys_addr_ioremap() to use kvm_pgtable_stage2_map() instead of stage2_set_pte(). 
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-8-will@kernel.org --- arch/arm64/kvm/mmu.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 4607e9ca60a2..32e93f42aed0 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1154,35 +1154,33 @@ static int stage2_pudp_test_and_clear_young(pud_t *pud) int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable) { - phys_addr_t addr, end; + phys_addr_t addr; int ret = 0; - unsigned long pfn; struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, }; + struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | + KVM_PGTABLE_PROT_R | + (writable ? KVM_PGTABLE_PROT_W : 0); - end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; - pfn = __phys_to_pfn(pa); - - for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { - pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); - - if (writable) - pte = kvm_s2pte_mkwrite(pte); + size += offset_in_page(guest_ipa); + guest_ipa &= PAGE_MASK; + for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) { ret = kvm_mmu_topup_memory_cache(&cache, kvm_mmu_cache_min_pages(kvm)); if (ret) - goto out; + break; + spin_lock(&kvm->mmu_lock); - ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte, - KVM_S2PTE_FLAG_IS_IOMAP); + ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot, + &cache); spin_unlock(&kvm->mmu_lock); if (ret) - goto out; + break; - pfn++; + pa += PAGE_SIZE; } -out: kvm_mmu_free_memory_cache(&cache); return ret; } From e9edb17ae0a2634bfa5dac529229c06f97f2b669 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:16 +0100 Subject: [PATCH 033/497] KVM: arm64: Convert kvm_set_spte_hva() to generic page-table API Convert kvm_set_spte_hva() to use kvm_pgtable_stage2_map() instead of stage2_set_pte(). Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-9-will@kernel.org --- arch/arm64/kvm/mmu.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 32e93f42aed0..d8cfdd25eac5 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1914,28 +1914,27 @@ int kvm_unmap_hva_range(struct kvm *kvm, static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { - pte_t *pte = (pte_t *)data; + kvm_pfn_t *pfn = (kvm_pfn_t *)data; WARN_ON(size != PAGE_SIZE); + /* - * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE - * flag clear because MMU notifiers will have unmapped a huge PMD before - * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and - * therefore stage2_set_pte() never needs to clear out a huge PMD - * through this calling path. + * The MMU notifiers will have unmapped a huge PMD before calling + * ->change_pte() (which in turn calls kvm_set_spte_hva()) and + * therefore we never need to clear out a huge PMD through this + * calling path and a memcache is not required. 
*/ - stage2_set_pte(&kvm->arch.mmu, NULL, gpa, pte, 0); + kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE, + __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL); return 0; } - int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { unsigned long end = hva + PAGE_SIZE; kvm_pfn_t pfn = pte_pfn(pte); - pte_t stage2_pte; - if (!kvm->arch.mmu.pgd) + if (!kvm->arch.mmu.pgt) return 0; trace_kvm_set_spte_hva(hva); @@ -1945,9 +1944,7 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) * just like a translation fault and clean the cache to the PoC. */ clean_dcache_guest_page(pfn, PAGE_SIZE); - stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); - handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); - + handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn); return 0; } From 52bae936f0e7be4b4c088505aecb8f04121f1dee Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:17 +0100 Subject: [PATCH 034/497] KVM: arm64: Convert unmap_stage2_range() to generic page-table API Convert unmap_stage2_range() to use kvm_pgtable_stage2_unmap() instead of walking the page-table directly. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-10-will@kernel.org --- arch/arm64/kvm/mmu.c | 62 ++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d8cfdd25eac5..f03d3d0af10d 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -39,6 +39,38 @@ static bool is_iomap(unsigned long flags) return flags & KVM_S2PTE_FLAG_IS_IOMAP; } +/* + * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, + * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, + * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too + * long will also starve other vCPUs. We have to also make sure that the page + * tables are not freed while we released the lock. + */ +static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr, + phys_addr_t end, + int (*fn)(struct kvm_pgtable *, u64, u64), + bool resched) +{ + int ret; + u64 next; + + do { + struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; + if (!pgt) + return -EINVAL; + + next = stage2_pgd_addr_end(kvm, addr, end); + ret = fn(pgt, addr, next - addr); + if (ret) + break; + + if (resched && next != end) + cond_resched_lock(&kvm->mmu_lock); + } while (addr = next, addr != end); + + return ret; +} + static bool memslot_is_logging(struct kvm_memory_slot *memslot) { return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); @@ -220,8 +252,8 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp) * end up writing old data to disk. * * This is why right after unmapping a page/section and invalidating - * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure - * the IO subsystem will never hit in the cache. + * the corresponding TLBs, we flush to make sure the IO subsystem will + * never hit in the cache. 
* * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as * we then fully enforce cacheability of RAM, no matter what the guest @@ -344,32 +376,12 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 bool may_block) { struct kvm *kvm = mmu->kvm; - pgd_t *pgd; - phys_addr_t addr = start, end = start + size; - phys_addr_t next; + phys_addr_t end = start + size; assert_spin_locked(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - do { - /* - * Make sure the page table is still active, as another thread - * could have possibly freed the page table, while we released - * the lock. - */ - if (!READ_ONCE(mmu->pgd)) - break; - next = stage2_pgd_addr_end(kvm, addr, end); - if (!stage2_pgd_none(kvm, *pgd)) - unmap_stage2_p4ds(mmu, pgd, addr, next); - /* - * If the range is too large, release the kvm->mmu_lock - * to prevent starvation and lockup detector warnings. - */ - if (may_block && next != end) - cond_resched_lock(&kvm->mmu_lock); - } while (pgd++, addr = next, addr != end); + WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap, + may_block)); } static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) From e0e5a07f3f5adca259ffa8ad3f4153ab448a4115 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:18 +0100 Subject: [PATCH 035/497] KVM: arm64: Add support for stage-2 page-aging in generic page-table Add stage-2 mkyoung(), mkold() and is_young() operations to the generic page-table code. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-11-will@kernel.org --- arch/arm64/include/asm/kvm_pgtable.h | 44 ++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 86 ++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 895b2238062b..50782128c861 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -186,6 +186,50 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, */ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); +/** + * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address to identify the page-table entry. + * + * The offset of @addr within a page is ignored. + * + * If there is a valid, leaf page-table entry used to translate @addr, then + * set the access flag in that entry. + * + * Return: The old page-table entry prior to setting the flag, 0 on failure. + */ +kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr); + +/** + * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address to identify the page-table entry. + * + * The offset of @addr within a page is ignored. + * + * If there is a valid, leaf page-table entry used to translate @addr, then + * clear the access flag in that entry. + * + * Note that it is the caller's responsibility to invalidate the TLB after + * calling this function to ensure that the updated permissions are visible + * to the CPUs. + * + * Return: The old page-table entry prior to clearing the flag, 0 on failure. 
+ */ +kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr); + +/** + * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the + * access flag set. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address to identify the page-table entry. + * + * The offset of @addr within a page is ignored. + * + * Return: True if the page-table entry has the access flag set, false otherwise. + */ +bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr); + /** * kvm_pgtable_walk() - Walk a page-table. * @pgt: Page-table structure initialised by kvm_pgtable_*_init(). diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 2f9b872f5355..af60ea8ee29d 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -690,6 +690,92 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) return kvm_pgtable_walk(pgt, addr, size, &walker); } +struct stage2_attr_data { + kvm_pte_t attr_set; + kvm_pte_t attr_clr; + kvm_pte_t pte; +}; + +static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + kvm_pte_t pte = *ptep; + struct stage2_attr_data *data = arg; + + if (!kvm_pte_valid(pte)) + return 0; + + data->pte = pte; + pte &= ~data->attr_clr; + pte |= data->attr_set; + + /* + * We may race with the CPU trying to set the access flag here, + * but worst-case the access flag update gets lost and will be + * set on the next access instead. + */ + if (data->pte != pte) + WRITE_ONCE(*ptep, pte); + + return 0; +} + +static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, + u64 size, kvm_pte_t attr_set, + kvm_pte_t attr_clr, kvm_pte_t *orig_pte) +{ + int ret; + kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; + struct stage2_attr_data data = { + .attr_set = attr_set & attr_mask, + .attr_clr = attr_clr & attr_mask, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_attr_walker, + .arg = &data, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + ret = kvm_pgtable_walk(pgt, addr, size, &walker); + if (ret) + return ret; + + if (orig_pte) + *orig_pte = data.pte; + return 0; +} + +kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) +{ + kvm_pte_t pte = 0; + stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, + &pte); + dsb(ishst); + return pte; +} + +kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr) +{ + kvm_pte_t pte = 0; + stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF, + &pte); + /* + * "But where's the TLBI?!", you scream. + * "Over in the core code", I sigh. + * + * See the '->clear_flush_young()' callback on the KVM mmu notifier. + */ + return pte; +} + +bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr) +{ + kvm_pte_t pte = 0; + stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte); + return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF; +} + int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm) { size_t pgd_sz; From ee8efad7994ce204806260bc0261c7e103f704c1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:19 +0100 Subject: [PATCH 036/497] KVM: arm64: Convert page-aging and access faults to generic page-table API Convert the page-aging functions and access fault handler to use the generic page-table code instead of walking the page-table directly. 
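For illustration, the converted aging handlers end up as thin wrappers around the new mkold()/is_young() accessors. A condensed sketch of the result (simplified from the diff below; the size WARN_ON and tracing are omitted):

  static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size,
                                 void *data)
  {
        /* Clear the access flag and report whether it was previously set. */
        pte_t pte = __pte(kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa));

        return pte_valid(pte) && pte_young(pte);
  }

  static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size,
                                      void *data)
  {
        /* Read-only test of the access flag; no TLB maintenance required. */
        return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
  }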
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Alexandru Elisei Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-12-will@kernel.org --- arch/arm64/kvm/mmu.c | 74 ++++++++++---------------------------------- 1 file changed, 16 insertions(+), 58 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index f03d3d0af10d..36407dcdf01e 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1706,46 +1706,23 @@ out_unlock: return ret; } -/* - * Resolve the access fault by making the page young again. - * Note that because the faulting entry is guaranteed not to be - * cached in the TLB, we don't need to invalidate anything. - * Only the HW Access Flag updates are supported for Stage 2 (no DBM), - * so there is no need for atomic (pte|pmd)_mkyoung operations. - */ +/* Resolve the access fault by making the page young again. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - kvm_pfn_t pfn; - bool pfn_valid = false; + pte_t pte; + kvm_pte_t kpte; + struct kvm_s2_mmu *mmu; trace_kvm_access_fault(fault_ipa); spin_lock(&vcpu->kvm->mmu_lock); - - if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte)) - goto out; - - if (pud) { /* HugeTLB */ - *pud = kvm_s2pud_mkyoung(*pud); - pfn = kvm_pud_pfn(*pud); - pfn_valid = true; - } else if (pmd) { /* THP, HugeTLB */ - *pmd = pmd_mkyoung(*pmd); - pfn = pmd_pfn(*pmd); - pfn_valid = true; - } else { - *pte = pte_mkyoung(*pte); /* Just a page... */ - pfn = pte_pfn(*pte); - pfn_valid = true; - } - -out: + mmu = vcpu->arch.hw_mmu; + kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); spin_unlock(&vcpu->kvm->mmu_lock); - if (pfn_valid) - kvm_set_pfn_accessed(pfn); + + pte = __pte(kpte); + if (pte_valid(pte)) + kvm_set_pfn_accessed(pte_pfn(pte)); } /** @@ -1962,38 +1939,19 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pte_t pte; + kvm_pte_t kpte; WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte)) - return 0; - - if (pud) - return stage2_pudp_test_and_clear_young(pud); - else if (pmd) - return stage2_pmdp_test_and_clear_young(pmd); - else - return stage2_ptep_test_and_clear_young(pte); + kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa); + pte = __pte(kpte); + return pte_valid(pte) && pte_young(pte); } static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte)) - return 0; - - if (pud) - return kvm_s2pud_young(*pud); - else if (pmd) - return pmd_young(*pmd); - else - return pte_young(*pte); + return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa); } int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) From 73d49df2c3e7cad5bf64c92ec03ce9e3823898b9 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 11 Sep 2020 14:25:20 +0100 Subject: [PATCH 037/497] KVM: arm64: Add support for stage-2 write-protect in generic page-table Add a stage-2 wrprotect() operation to the generic page-table code. 
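Callers are expected to handle TLB invalidation themselves, along the lines of the following sketch (hypothetical caller, assuming the usual mmu_lock serialisation of the stage-2 tables):

  spin_lock(&kvm->mmu_lock);
  ret = kvm_pgtable_stage2_wrprotect(kvm->arch.mmu.pgt, addr, size);
  spin_unlock(&kvm->mmu_lock);
  if (!ret)
        kvm_flush_remote_tlbs(kvm);     /* TLBI is left to the caller */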
Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Link: https://lore.kernel.org/r/20200911132529.19844-13-will@kernel.org --- arch/arm64/include/asm/kvm_pgtable.h | 18 ++++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 6 ++++++ 2 files changed, 24 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 50782128c861..5ae6006a6098 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -186,6 +186,24 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, */ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); +/** + * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range + * without TLB invalidation. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to write-protect, + * @size: Size of the range. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * Note that it is the caller's responsibility to invalidate the TLB after + * calling this function to ensure that the updated permissions are visible + * to the CPUs. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size); + /** * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index af60ea8ee29d..480b95030f54 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -746,6 +746,12 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, return 0; } +int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + return stage2_update_leaf_attrs(pgt, addr, size, 0, + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, NULL); +} + kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) { kvm_pte_t pte = 0; From cc38d61cace392a5f064f2981ce6009bb158de21 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 11 Sep 2020 14:25:21 +0100 Subject: [PATCH 038/497] KVM: arm64: Convert write-protect operation to generic page-table API Convert stage2_wp_range() to call the kvm_pgtable_stage2_wrprotect() function of the generic page-table code instead of walking the page-table directly. 
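The range is driven by the stage2_apply_range() helper added earlier in the series; with rescheduling enabled it effectively boils down to the loop sketched here (simplified: the NULL-pgt check and the fn indirection are dropped for clarity):

  do {
        next = stage2_pgd_addr_end(kvm, addr, end);
        ret = kvm_pgtable_stage2_wrprotect(pgt, addr, next - addr);
        if (ret)
                break;
        /* Briefly drop mmu_lock between blocks on large regions. */
        if (next != end)
                cond_resched_lock(&kvm->mmu_lock);
  } while (addr = next, addr != end);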
Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Link: https://lore.kernel.org/r/20200911132529.19844-14-will@kernel.org --- arch/arm64/kvm/mmu.c | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 36407dcdf01e..a37da87eb7eb 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -71,6 +71,9 @@ static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr, return ret; } +#define stage2_apply_range_resched(kvm, addr, end, fn) \ + stage2_apply_range(kvm, addr, end, fn, true) + static bool memslot_is_logging(struct kvm_memory_slot *memslot) { return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); @@ -1302,27 +1305,7 @@ static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { struct kvm *kvm = mmu->kvm; - pgd_t *pgd; - phys_addr_t next; - - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - do { - /* - * Release kvm_mmu_lock periodically if the memory region is - * large. Otherwise, we may see kernel panics with - * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, - * CONFIG_LOCKDEP. Additionally, holding the lock too long - * will also starve other vCPUs. We have to also make sure - * that the page tables are not freed while we released - * the lock. - */ - cond_resched_lock(&kvm->mmu_lock); - if (!READ_ONCE(mmu->pgd)) - break; - next = stage2_pgd_addr_end(kvm, addr, end); - if (stage2_pgd_present(kvm, *pgd)) - stage2_wp_p4ds(mmu, pgd, addr, next); - } while (pgd++, addr = next, addr != end); + stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect); } /** From 93c66b40d728178722706a306df15ce92c819cd6 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 11 Sep 2020 14:25:22 +0100 Subject: [PATCH 039/497] KVM: arm64: Add support for stage-2 cache flushing in generic page-table Add support for cache flushing a range of the stage-2 address space to the generic page-table code. Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Link: https://lore.kernel.org/r/20200911132529.19844-15-will@kernel.org --- arch/arm64/include/asm/kvm_pgtable.h | 15 +++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 26 ++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 5ae6006a6098..77c027456c61 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -248,6 +248,21 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr); */ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr); +/** + * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point + * of Coherency for guest stage-2 address + * range. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to flush. + * @size: Size of the range. + * + * The offset of @addr within a page is ignored and @size is rounded-up to + * the next page boundary. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); + /** * kvm_pgtable_walk() - Walk a page-table. * @pgt: Page-table structure initialised by kvm_pgtable_*_init(). 
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 480b95030f54..d382756a527c 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -782,6 +782,32 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr) return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF; } +static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + kvm_pte_t pte = *ptep; + + if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte)) + return 0; + + stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level)); + return 0; +} + +int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_flush_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + return 0; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} + int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm) { size_t pgd_sz; From 8d5207bef6913b5fac91cb58e509711e406c160d Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 11 Sep 2020 14:25:23 +0100 Subject: [PATCH 040/497] KVM: arm64: Convert memslot cache-flushing code to generic page-table API Convert stage2_flush_memslot() to call the kvm_pgtable_stage2_flush() function of the generic page-table code instead of walking the page-table directly. Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Link: https://lore.kernel.org/r/20200911132529.19844-16-will@kernel.org --- arch/arm64/kvm/mmu.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a37da87eb7eb..d5813996c0e4 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -460,21 +460,10 @@ static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, static void stage2_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { - struct kvm_s2_mmu *mmu = &kvm->arch.mmu; phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = addr + PAGE_SIZE * memslot->npages; - phys_addr_t next; - pgd_t *pgd; - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - do { - next = stage2_pgd_addr_end(kvm, addr, end); - if (!stage2_pgd_none(kvm, *pgd)) - stage2_flush_p4ds(mmu, pgd, addr, next); - - if (next != end) - cond_resched_lock(&kvm->mmu_lock); - } while (pgd++, addr = next, addr != end); + stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush); } /** From adcd4e23291e67575be0c8b08eed04fbe7fd9e77 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:24 +0100 Subject: [PATCH 041/497] KVM: arm64: Add support for relaxing stage-2 perms in generic page-table code Add support for relaxing the permissions of a stage-2 mapping (i.e. adding additional permissions) to the generic page-table code. 
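A hypothetical caller only passes the additional permissions it wants; for instance, granting write and exec permission on an existing leaf entry (pgt and fault_ipa are stand-in names) is simply:

  enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_W | KVM_PGTABLE_PROT_X;

  /* Permissions are only ever added; the TLBI by IPA is done internally. */
  ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);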
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-17-will@kernel.org --- arch/arm64/include/asm/kvm_pgtable.h | 19 +++++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 20 ++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 77c027456c61..52ab38db04c7 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -236,6 +236,25 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr); */ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr); +/** + * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a + * page-table entry. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address to identify the page-table entry. + * @prot: Additional permissions to grant for the mapping. + * + * The offset of @addr within a page is ignored. + * + * If there is a valid, leaf page-table entry used to translate @addr, then + * relax the permissions in that entry according to the read, write and + * execute permissions specified by @prot. No permissions are removed, and + * TLB invalidation is performed after updating the entry. + * + * Return: 0 on success, negative error code on failure. + */ +int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_prot prot); + /** * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the * access flag set. diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index d382756a527c..603d6b415337 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -782,6 +782,26 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr) return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF; } +int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_prot prot) +{ + int ret; + kvm_pte_t set = 0, clr = 0; + + if (prot & KVM_PGTABLE_PROT_R) + set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; + + if (prot & KVM_PGTABLE_PROT_W) + set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; + + if (prot & KVM_PGTABLE_PROT_X) + clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; + + ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, 0); + return ret; +} + static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag, void * const arg) From 6f745f1bb5bf2914be2344b0dd7781d0528b3e42 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:25 +0100 Subject: [PATCH 042/497] KVM: arm64: Convert user_mem_abort() to generic page-table API Convert user_mem_abort() to call kvm_pgtable_stage2_relax_perms() when handling a stage-2 permission fault and kvm_pgtable_stage2_map() when handling a stage-2 translation fault, rather than walking the page-table manually. 
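The core of the converted fault path reduces to building a kvm_pgtable_prot mask and then choosing between the two generic operations. A condensed sketch (heavily simplified from the hunk below; memcache top-up, dirty logging and cache maintenance are omitted):

  enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;

  if (writable)
        prot |= KVM_PGTABLE_PROT_W;
  if (exec_fault)
        prot |= KVM_PGTABLE_PROT_X;
  if (device)
        prot |= KVM_PGTABLE_PROT_DEVICE;

  if (fault_status == FSC_PERM && !(logging_active && writable))
        /* Permission fault: widen the existing leaf entry in place. */
        ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
  else
        /* Translation fault: install a mapping at the chosen granule. */
        ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
                                     __pfn_to_phys(pfn), prot, memcache);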
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Reviewed-by: Alexandru Elisei Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-18-will@kernel.org --- arch/arm64/kvm/mmu.c | 124 +++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 80 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d5813996c0e4..e980e614dd81 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1499,18 +1499,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, { int ret; bool write_fault, writable, force_pte = false; - bool exec_fault, needs_exec; + bool exec_fault; + bool device = false; unsigned long mmu_seq; - gfn_t gfn = fault_ipa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; short vma_shift; + gfn_t gfn; kvm_pfn_t pfn; - pgprot_t mem_type = PAGE_S2; bool logging_active = memslot_is_logging(memslot); - unsigned long vma_pagesize, flags = 0; - struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu; + unsigned long vma_pagesize; + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; + struct kvm_pgtable *pgt; write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_iabt(vcpu); @@ -1543,22 +1544,24 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_pagesize = PAGE_SIZE; } - /* - * The stage2 has a minimum of 2 level table (For arm64 see - * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can - * use PMD_SIZE huge mappings (even when the PMD is folded into PGD). - * As for PUD huge maps, we must make sure that we have at least - * 3 levels, i.e, PMD is not folded. - */ - if (vma_pagesize == PMD_SIZE || - (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) - gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; + if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) + fault_ipa &= huge_page_mask(hstate_vma(vma)); + + gfn = fault_ipa >> PAGE_SHIFT; mmap_read_unlock(current->mm); - /* We need minimum second+third level pages */ - ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm)); - if (ret) - return ret; + /* + * Permission faults just need to update the existing leaf entry, + * and so normally don't require allocations from the memcache. The + * only exception to this is when dirty logging is enabled at runtime + * and a write fault needs to collapse a block entry into a table. + */ + if (fault_status != FSC_PERM || (logging_active && write_fault)) { + ret = kvm_mmu_topup_memory_cache(memcache, + kvm_mmu_cache_min_pages(kvm)); + if (ret) + return ret; + } mmu_seq = vcpu->kvm->mmu_notifier_seq; /* @@ -1581,28 +1584,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; if (kvm_is_device_pfn(pfn)) { - mem_type = PAGE_S2_DEVICE; - flags |= KVM_S2PTE_FLAG_IS_IOMAP; - } else if (logging_active) { - /* - * Faults on pages in a memslot with logging enabled - * should not be mapped with huge pages (it introduces churn - * and performance degradation), so force a pte mapping. - */ - flags |= KVM_S2_FLAG_LOGGING_ACTIVE; - + device = true; + } else if (logging_active && !write_fault) { /* * Only actually map the page as writable if this was a write * fault. 
*/ - if (!write_fault) - writable = false; + writable = false; } - if (exec_fault && is_iomap(flags)) + if (exec_fault && device) return -ENOEXEC; spin_lock(&kvm->mmu_lock); + pgt = vcpu->arch.hw_mmu->pgt; if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; @@ -1613,62 +1608,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (vma_pagesize == PAGE_SIZE && !force_pte) vma_pagesize = transparent_hugepage_adjust(memslot, hva, &pfn, &fault_ipa); - if (writable) + if (writable) { + prot |= KVM_PGTABLE_PROT_W; kvm_set_pfn_dirty(pfn); + mark_page_dirty(kvm, gfn); + } - if (fault_status != FSC_PERM && !is_iomap(flags)) + if (fault_status != FSC_PERM && !device) clean_dcache_guest_page(pfn, vma_pagesize); - if (exec_fault) + if (exec_fault) { + prot |= KVM_PGTABLE_PROT_X; invalidate_icache_guest_page(pfn, vma_pagesize); + } - /* - * If we took an execution fault we have made the - * icache/dcache coherent above and should now let the s2 - * mapping be executable. - * - * Write faults (!exec_fault && FSC_PERM) are orthogonal to - * execute permissions, and we preserve whatever we have. - */ - needs_exec = exec_fault || - (fault_status == FSC_PERM && - stage2_is_exec(mmu, fault_ipa, vma_pagesize)); + if (device) + prot |= KVM_PGTABLE_PROT_DEVICE; + else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) + prot |= KVM_PGTABLE_PROT_X; - if (vma_pagesize == PUD_SIZE) { - pud_t new_pud = kvm_pfn_pud(pfn, mem_type); - - new_pud = kvm_pud_mkhuge(new_pud); - if (writable) - new_pud = kvm_s2pud_mkwrite(new_pud); - - if (needs_exec) - new_pud = kvm_s2pud_mkexec(new_pud); - - ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud); - } else if (vma_pagesize == PMD_SIZE) { - pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); - - new_pmd = kvm_pmd_mkhuge(new_pmd); - - if (writable) - new_pmd = kvm_s2pmd_mkwrite(new_pmd); - - if (needs_exec) - new_pmd = kvm_s2pmd_mkexec(new_pmd); - - ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd); + if (fault_status == FSC_PERM && !(logging_active && writable)) { + ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); } else { - pte_t new_pte = kvm_pfn_pte(pfn, mem_type); - - if (writable) { - new_pte = kvm_s2pte_mkwrite(new_pte); - mark_page_dirty(kvm, gfn); - } - - if (needs_exec) - new_pte = kvm_s2pte_mkexec(new_pte); - - ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags); + ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, + __pfn_to_phys(pfn), prot, + memcache); } out_unlock: From 063deeb1f2c234ca5e9d156c7a86825fc5377477 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:26 +0100 Subject: [PATCH 043/497] KVM: arm64: Check the pgt instead of the pgd when modifying page-table In preparation for removing the 'pgd' field of 'struct kvm_s2_mmu', update the few remaining users to check the 'pgt' field instead. 
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-19-will@kernel.org --- arch/arm64/kvm/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index e980e614dd81..ed5aefd5d0a7 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1829,7 +1829,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { - if (!kvm->arch.mmu.pgd) + if (!kvm->arch.mmu.pgt) return 0; trace_kvm_unmap_hva_range(start, end); @@ -1892,7 +1892,7 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void * int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - if (!kvm->arch.mmu.pgd) + if (!kvm->arch.mmu.pgt) return 0; trace_kvm_age_hva(start, end); return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); @@ -1900,7 +1900,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) { - if (!kvm->arch.mmu.pgd) + if (!kvm->arch.mmu.pgt) return 0; trace_kvm_test_age_hva(hva); return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, From 3f26ab58e3272ceada11279c2e90894d4995cf79 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:27 +0100 Subject: [PATCH 044/497] KVM: arm64: Remove unused page-table code Now that KVM is using the generic page-table code to manage the guest stage-2 page-tables, we can remove a bunch of unused macros, #defines and static inline functions from the old implementation. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-20-will@kernel.org --- arch/arm64/include/asm/kvm_mmu.h | 171 ------ arch/arm64/include/asm/pgtable-hwdef.h | 18 - arch/arm64/include/asm/pgtable-prot.h | 13 - arch/arm64/include/asm/stage2_pgtable.h | 215 ------- arch/arm64/kvm/mmu.c | 755 ------------------------ 5 files changed, 1172 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 42fb50cfe0d8..c490fe8089b3 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -107,12 +107,6 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v) #define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm)) #define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) -static inline bool kvm_page_empty(void *ptr) -{ - struct page *ptr_page = virt_to_page(ptr); - return page_count(ptr_page) == 1; -} - #include #include @@ -135,123 +129,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); int kvm_mmu_init(void); -#define kvm_mk_pmd(ptep) \ - __pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE) -#define kvm_mk_pud(pmdp) \ - __pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE) -#define kvm_mk_p4d(pmdp) \ - __p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE) - -#define kvm_set_pud(pudp, pud) set_pud(pudp, pud) - -#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) -#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) -#define kvm_pfn_pud(pfn, prot) pfn_pud(pfn, prot) - -#define kvm_pud_pfn(pud) pud_pfn(pud) - -#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) -#define kvm_pud_mkhuge(pud) 
pud_mkhuge(pud) - -static inline pte_t kvm_s2pte_mkwrite(pte_t pte) -{ - pte_val(pte) |= PTE_S2_RDWR; - return pte; -} - -static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) -{ - pmd_val(pmd) |= PMD_S2_RDWR; - return pmd; -} - -static inline pud_t kvm_s2pud_mkwrite(pud_t pud) -{ - pud_val(pud) |= PUD_S2_RDWR; - return pud; -} - -static inline pte_t kvm_s2pte_mkexec(pte_t pte) -{ - pte_val(pte) &= ~PTE_S2_XN; - return pte; -} - -static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd) -{ - pmd_val(pmd) &= ~PMD_S2_XN; - return pmd; -} - -static inline pud_t kvm_s2pud_mkexec(pud_t pud) -{ - pud_val(pud) &= ~PUD_S2_XN; - return pud; -} - -static inline void kvm_set_s2pte_readonly(pte_t *ptep) -{ - pteval_t old_pteval, pteval; - - pteval = READ_ONCE(pte_val(*ptep)); - do { - old_pteval = pteval; - pteval &= ~PTE_S2_RDWR; - pteval |= PTE_S2_RDONLY; - pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); - } while (pteval != old_pteval); -} - -static inline bool kvm_s2pte_readonly(pte_t *ptep) -{ - return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY; -} - -static inline bool kvm_s2pte_exec(pte_t *ptep) -{ - return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN); -} - -static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp) -{ - kvm_set_s2pte_readonly((pte_t *)pmdp); -} - -static inline bool kvm_s2pmd_readonly(pmd_t *pmdp) -{ - return kvm_s2pte_readonly((pte_t *)pmdp); -} - -static inline bool kvm_s2pmd_exec(pmd_t *pmdp) -{ - return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); -} - -static inline void kvm_set_s2pud_readonly(pud_t *pudp) -{ - kvm_set_s2pte_readonly((pte_t *)pudp); -} - -static inline bool kvm_s2pud_readonly(pud_t *pudp) -{ - return kvm_s2pte_readonly((pte_t *)pudp); -} - -static inline bool kvm_s2pud_exec(pud_t *pudp) -{ - return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN); -} - -static inline pud_t kvm_s2pud_mkyoung(pud_t pud) -{ - return pud_mkyoung(pud); -} - -static inline bool kvm_s2pud_young(pud_t pud) -{ - return pud_young(pud); -} - struct kvm; @@ -293,30 +170,6 @@ static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, } } -static inline void __kvm_flush_dcache_pte(pte_t pte) -{ - if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { - struct page *page = pte_page(pte); - kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE); - } -} - -static inline void __kvm_flush_dcache_pmd(pmd_t pmd) -{ - if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { - struct page *page = pmd_page(pmd); - kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE); - } -} - -static inline void __kvm_flush_dcache_pud(pud_t pud) -{ - if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) { - struct page *page = pud_page(pud); - kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE); - } -} - void kvm_set_way_flush(struct kvm_vcpu *vcpu); void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled); @@ -477,30 +330,6 @@ static inline int hyp_map_aux_data(void) #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) -/* - * Get the magic number 'x' for VTTBR:BADDR of this KVM instance. - * With v8.2 LVA extensions, 'x' should be a minimum of 6 with - * 52bit IPS. - */ -static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels) -{ - int x = ARM64_VTTBR_X(ipa_shift, levels); - - return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 
6 : x; -} - -static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels) -{ - unsigned int x = arm64_vttbr_x(ipa_shift, levels); - - return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x); -} - -static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm) -{ - return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); -} - static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) { struct kvm_vmid *vmid = &mmu->vmid; diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 1a989353144e..2b5f822fb033 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -156,7 +156,6 @@ #define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */ #define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */ #define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */ -#define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */ #define PTE_ADDR_LOW (((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT) #ifdef CONFIG_ARM64_PA_BITS_52 @@ -172,23 +171,6 @@ #define PTE_ATTRINDX(t) (_AT(pteval_t, (t)) << 2) #define PTE_ATTRINDX_MASK (_AT(pteval_t, 7) << 2) -/* - * 2nd stage PTE definitions - */ -#define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */ -#define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ -#define PTE_S2_XN (_AT(pteval_t, 2) << 53) /* XN[1:0] */ -#define PTE_S2_SW_RESVD (_AT(pteval_t, 15) << 55) /* Reserved for SW */ - -#define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */ -#define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ -#define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ -#define PMD_S2_SW_RESVD (_AT(pmdval_t, 15) << 55) /* Reserved for SW */ - -#define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */ -#define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */ -#define PUD_S2_XN (_AT(pudval_t, 2) << 53) /* XN[1:0] */ - /* * Memory Attribute override for Stage-2 (MemAttr[3:0]) */ diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 88acd7e1cd05..8f094c43072a 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -73,19 +73,6 @@ extern bool arm64_use_ng_mappings; __val; \ }) -#define PAGE_S2_XN \ - ({ \ - u64 __val; \ - if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) \ - __val = 0; \ - else \ - __val = PTE_S2_XN; \ - __val; \ - }) - -#define PAGE_S2 __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN) -#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN) - #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */ #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 996bf98f0cab..fe341a6578c3 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -8,7 +8,6 @@ #ifndef __ARM64_S2_PGTABLE_H_ #define __ARM64_S2_PGTABLE_H_ -#include #include /* @@ -36,21 +35,6 @@ #define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm)) #define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1) -/* - * The number of PTRS across all concatenated stage2 tables given by the - * number of bits resolved at the initial level. 
- * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA), - * in which case, stage2_pgd_ptrs will have one entry. - */ -#define pgd_ptrs_shift(ipa, pgdir_shift) \ - ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0) -#define __s2_pgd_ptrs(ipa, lvls) \ - (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls)))) -#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t)) - -#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) -#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) - /* * kvm_mmmu_cache_min_pages() is the number of pages required to install * a stage-2 translation. We pre-allocate the entry level page table at @@ -58,196 +42,6 @@ */ #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) -/* Stage2 PUD definitions when the level is present */ -static inline bool kvm_stage2_has_pud(struct kvm *kvm) -{ - return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3); -} - -#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) -#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) -#define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) - -#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) -#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) -#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) -#define stage2_pgd_populate(kvm, pgd, p4d) pgd_populate(NULL, pgd, p4d) - -static inline p4d_t *stage2_p4d_offset(struct kvm *kvm, - pgd_t *pgd, unsigned long address) -{ - return p4d_offset(pgd, address); -} - -static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d) -{ -} - -static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp) -{ - return false; -} - -static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm, - phys_addr_t addr, phys_addr_t end) -{ - return end; -} - -static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d) -{ - if (kvm_stage2_has_pud(kvm)) - return p4d_none(p4d); - else - return 0; -} - -static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp) -{ - if (kvm_stage2_has_pud(kvm)) - p4d_clear(p4dp); -} - -static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d) -{ - if (kvm_stage2_has_pud(kvm)) - return p4d_present(p4d); - else - return 1; -} - -static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud) -{ - if (kvm_stage2_has_pud(kvm)) - p4d_populate(NULL, p4d, pud); -} - -static inline pud_t *stage2_pud_offset(struct kvm *kvm, - p4d_t *p4d, unsigned long address) -{ - if (kvm_stage2_has_pud(kvm)) - return pud_offset(p4d, address); - else - return (pud_t *)p4d; -} - -static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) -{ - if (kvm_stage2_has_pud(kvm)) - free_page((unsigned long)pud); -} - -static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) -{ - if (kvm_stage2_has_pud(kvm)) - return kvm_page_empty(pudp); - else - return false; -} - -static inline phys_addr_t -stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - if (kvm_stage2_has_pud(kvm)) { - phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; - - return (boundary - 1 < end - 1) ? 
boundary : end; - } else { - return end; - } -} - -/* Stage2 PMD definitions when the level is present */ -static inline bool kvm_stage2_has_pmd(struct kvm *kvm) -{ - return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2); -} - -#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) -#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) -#define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) - -static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud) -{ - if (kvm_stage2_has_pmd(kvm)) - return pud_none(pud); - else - return 0; -} - -static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud) -{ - if (kvm_stage2_has_pmd(kvm)) - pud_clear(pud); -} - -static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud) -{ - if (kvm_stage2_has_pmd(kvm)) - return pud_present(pud); - else - return 1; -} - -static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd) -{ - if (kvm_stage2_has_pmd(kvm)) - pud_populate(NULL, pud, pmd); -} - -static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, - pud_t *pud, unsigned long address) -{ - if (kvm_stage2_has_pmd(kvm)) - return pmd_offset(pud, address); - else - return (pmd_t *)pud; -} - -static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd) -{ - if (kvm_stage2_has_pmd(kvm)) - free_page((unsigned long)pmd); -} - -static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud) -{ - if (kvm_stage2_has_pmd(kvm)) - return pud_huge(pud); - else - return 0; -} - -static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp) -{ - if (kvm_stage2_has_pmd(kvm)) - return kvm_page_empty(pmdp); - else - return 0; -} - -static inline phys_addr_t -stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - if (kvm_stage2_has_pmd(kvm)) { - phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; - - return (boundary - 1 < end - 1) ? boundary : end; - } else { - return end; - } -} - -static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep) -{ - return kvm_page_empty(ptep); -} - -static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr) -{ - return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1)); -} - static inline phys_addr_t stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { @@ -256,13 +50,4 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) return (boundary - 1 < end - 1) ? boundary : end; } -/* - * Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and - * the architectural page-table level. - */ -#define S2_NO_LEVEL_HINT 0 -#define S2_PUD_LEVEL 1 -#define S2_PMD_LEVEL 2 -#define S2_PTE_LEVEL 3 - #endif /* __ARM64_S2_PGTABLE_H_ */ diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index ed5aefd5d0a7..dd0ea7d8065a 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -31,13 +31,6 @@ static phys_addr_t hyp_idmap_vector; static unsigned long io_map_base; -#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) -#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1) - -static bool is_iomap(unsigned long flags) -{ - return flags & KVM_S2PTE_FLAG_IS_IOMAP; -} /* * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, @@ -90,154 +83,11 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); } -static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, - int level) -{ - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level); -} - -/* - * D-Cache management functions. 
They take the page table entries by - * value, as they are flushing the cache using the kernel mapping (or - * kmap on 32bit). - */ -static void kvm_flush_dcache_pte(pte_t pte) -{ - __kvm_flush_dcache_pte(pte); -} - -static void kvm_flush_dcache_pmd(pmd_t pmd) -{ - __kvm_flush_dcache_pmd(pmd); -} - -static void kvm_flush_dcache_pud(pud_t pud) -{ - __kvm_flush_dcache_pud(pud); -} - static bool kvm_is_device_pfn(unsigned long pfn) { return !pfn_valid(pfn); } -/** - * stage2_dissolve_pmd() - clear and flush huge PMD entry - * @mmu: pointer to mmu structure to operate on - * @addr: IPA - * @pmd: pmd pointer for IPA - * - * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. - */ -static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd) -{ - if (!pmd_thp_or_huge(*pmd)) - return; - - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL); - put_page(virt_to_page(pmd)); -} - -/** - * stage2_dissolve_pud() - clear and flush huge PUD entry - * @mmu: pointer to mmu structure to operate on - * @addr: IPA - * @pud: pud pointer for IPA - * - * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. - */ -static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp) -{ - struct kvm *kvm = mmu->kvm; - - if (!stage2_pud_huge(kvm, *pudp)) - return; - - stage2_pud_clear(kvm, pudp); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL); - put_page(virt_to_page(pudp)); -} - -static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL); - stage2_pgd_clear(kvm, pgd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT); - stage2_p4d_free(kvm, p4d_table); - put_page(virt_to_page(pgd)); -} - -static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0); - stage2_p4d_clear(kvm, p4d); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT); - stage2_pud_free(kvm, pud_table); - put_page(virt_to_page(p4d)); -} - -static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); - - VM_BUG_ON(stage2_pud_huge(kvm, *pud)); - stage2_pud_clear(kvm, pud); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT); - stage2_pmd_free(kvm, pmd_table); - put_page(virt_to_page(pud)); -} - -static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr) -{ - pte_t *pte_table = pte_offset_kernel(pmd, 0); - VM_BUG_ON(pmd_thp_or_huge(*pmd)); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT); - free_page((unsigned long)pte_table); - put_page(virt_to_page(pmd)); -} - -static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte) -{ - WRITE_ONCE(*ptep, new_pte); - dsb(ishst); -} - -static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd) -{ - WRITE_ONCE(*pmdp, new_pmd); - dsb(ishst); -} - -static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep) -{ - kvm_set_pmd(pmdp, kvm_mk_pmd(ptep)); -} - -static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp) -{ - WRITE_ONCE(*pudp, kvm_mk_pud(pmdp)); - dsb(ishst); -} - -static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp) -{ - WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp)); - dsb(ishst); -} - -static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp) -{ -#ifndef __PAGETABLE_P4D_FOLDED - 
WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp)); - dsb(ishst); -#endif -} - /* * Unmapping vs dcache management: * @@ -262,108 +112,6 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp) * we then fully enforce cacheability of RAM, no matter what the guest * does. */ -static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd, - phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t start_addr = addr; - pte_t *pte, *start_pte; - - start_pte = pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - pte_t old_pte = *pte; - - kvm_set_pte(pte, __pte(0)); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL); - - /* No need to invalidate the cache for device mappings */ - if (!kvm_is_device_pfn(pte_pfn(old_pte))) - kvm_flush_dcache_pte(old_pte); - - put_page(virt_to_page(pte)); - } - } while (pte++, addr += PAGE_SIZE, addr != end); - - if (stage2_pte_table_empty(mmu->kvm, start_pte)) - clear_stage2_pmd_entry(mmu, pmd, start_addr); -} - -static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - phys_addr_t next, start_addr = addr; - pmd_t *pmd, *start_pmd; - - start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr); - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) { - pmd_t old_pmd = *pmd; - - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL); - - kvm_flush_dcache_pmd(old_pmd); - - put_page(virt_to_page(pmd)); - } else { - unmap_stage2_ptes(mmu, pmd, addr, next); - } - } - } while (pmd++, addr = next, addr != end); - - if (stage2_pmd_table_empty(kvm, start_pmd)) - clear_stage2_pud_entry(mmu, pud, start_addr); -} - -static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - phys_addr_t next, start_addr = addr; - pud_t *pud, *start_pud; - - start_pud = pud = stage2_pud_offset(kvm, p4d, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) { - pud_t old_pud = *pud; - - stage2_pud_clear(kvm, pud); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL); - kvm_flush_dcache_pud(old_pud); - put_page(virt_to_page(pud)); - } else { - unmap_stage2_pmds(mmu, pud, addr, next); - } - } - } while (pud++, addr = next, addr != end); - - if (stage2_pud_table_empty(kvm, start_pud)) - clear_stage2_p4d_entry(mmu, p4d, start_addr); -} - -static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - phys_addr_t next, start_addr = addr; - p4d_t *p4d, *start_p4d; - - start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr); - do { - next = stage2_p4d_addr_end(kvm, addr, end); - if (!stage2_p4d_none(kvm, *p4d)) - unmap_stage2_puds(mmu, p4d, addr, next); - } while (p4d++, addr = next, addr != end); - - if (stage2_p4d_table_empty(kvm, start_p4d)) - clear_stage2_pgd_entry(mmu, pgd, start_addr); -} - /** * unmap_stage2_range -- Clear stage2 page table entries to unmap a range * @kvm: The VM pointer @@ -392,71 +140,6 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si __unmap_stage2_range(mmu, start, size, true); } -static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd, - phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte; - - pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte))) - kvm_flush_dcache_pte(*pte); - } while (pte++, addr += PAGE_SIZE, addr != 
end); -} - -static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - pmd_t *pmd; - phys_addr_t next; - - pmd = stage2_pmd_offset(kvm, pud, addr); - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) - kvm_flush_dcache_pmd(*pmd); - else - stage2_flush_ptes(mmu, pmd, addr, next); - } - } while (pmd++, addr = next, addr != end); -} - -static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud; - phys_addr_t next; - - pud = stage2_pud_offset(kvm, p4d, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) - kvm_flush_dcache_pud(*pud); - else - stage2_flush_pmds(mmu, pud, addr, next); - } - } while (pud++, addr = next, addr != end); -} - -static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - p4d_t *p4d; - phys_addr_t next; - - p4d = stage2_p4d_offset(kvm, pgd, addr); - do { - next = stage2_p4d_addr_end(kvm, addr, end); - if (!stage2_p4d_none(kvm, *p4d)) - stage2_flush_puds(mmu, p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - static void stage2_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -805,348 +488,6 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) } } -static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - pgd_t *pgd; - p4d_t *p4d; - - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - if (stage2_pgd_none(kvm, *pgd)) { - if (!cache) - return NULL; - p4d = kvm_mmu_memory_cache_alloc(cache); - stage2_pgd_populate(kvm, pgd, p4d); - get_page(virt_to_page(pgd)); - } - - return stage2_p4d_offset(kvm, pgd, addr); -} - -static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - p4d_t *p4d; - pud_t *pud; - - p4d = stage2_get_p4d(mmu, cache, addr); - if (stage2_p4d_none(kvm, *p4d)) { - if (!cache) - return NULL; - pud = kvm_mmu_memory_cache_alloc(cache); - stage2_p4d_populate(kvm, p4d, pud); - get_page(virt_to_page(p4d)); - } - - return stage2_pud_offset(kvm, p4d, addr); -} - -static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache, - phys_addr_t addr) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud; - pmd_t *pmd; - - pud = stage2_get_pud(mmu, cache, addr); - if (!pud || stage2_pud_huge(kvm, *pud)) - return NULL; - - if (stage2_pud_none(kvm, *pud)) { - if (!cache) - return NULL; - pmd = kvm_mmu_memory_cache_alloc(cache); - stage2_pud_populate(kvm, pud, pmd); - get_page(virt_to_page(pud)); - } - - return stage2_pmd_offset(kvm, pud, addr); -} - -static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu, - struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pmd_t *new_pmd) -{ - pmd_t *pmd, old_pmd; - -retry: - pmd = stage2_get_pmd(mmu, cache, addr); - VM_BUG_ON(!pmd); - - old_pmd = *pmd; - /* - * Multiple vcpus faulting on the same PMD entry, can - * lead to them sequentially updating the PMD with the - * same value. Following the break-before-make - * (pmd_clear() followed by tlb_flush()) process can - * hinder forward progress due to refaults generated - * on missing translations. - * - * Skip updating the page table if the entry is - * unchanged. 
- */ - if (pmd_val(old_pmd) == pmd_val(*new_pmd)) - return 0; - - if (pmd_present(old_pmd)) { - /* - * If we already have PTE level mapping for this block, - * we must unmap it to avoid inconsistent TLB state and - * leaking the table page. We could end up in this situation - * if the memory slot was marked for dirty logging and was - * reverted, leaving PTE level mappings for the pages accessed - * during the period. So, unmap the PTE level mapping for this - * block and retry, as we could have released the upper level - * table in the process. - * - * Normal THP split/merge follows mmu_notifier callbacks and do - * get handled accordingly. - */ - if (!pmd_thp_or_huge(old_pmd)) { - unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE); - goto retry; - } - /* - * Mapping in huge pages should only happen through a - * fault. If a page is merged into a transparent huge - * page, the individual subpages of that huge page - * should be unmapped through MMU notifiers before we - * get here. - * - * Merging of CompoundPages is not supported; they - * should become splitting first, unmapped, merged, - * and mapped back in on-demand. - */ - WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); - pmd_clear(pmd); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL); - } else { - get_page(virt_to_page(pmd)); - } - - kvm_set_pmd(pmd, *new_pmd); - return 0; -} - -static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu, - struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pud_t *new_pudp) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pudp, old_pud; - -retry: - pudp = stage2_get_pud(mmu, cache, addr); - VM_BUG_ON(!pudp); - - old_pud = *pudp; - - /* - * A large number of vcpus faulting on the same stage 2 entry, - * can lead to a refault due to the stage2_pud_clear()/tlb_flush(). - * Skip updating the page tables if there is no change. - */ - if (pud_val(old_pud) == pud_val(*new_pudp)) - return 0; - - if (stage2_pud_present(kvm, old_pud)) { - /* - * If we already have table level mapping for this block, unmap - * the range for this block and retry. - */ - if (!stage2_pud_huge(kvm, old_pud)) { - unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE); - goto retry; - } - - WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp)); - stage2_pud_clear(kvm, pudp); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL); - } else { - get_page(virt_to_page(pudp)); - } - - kvm_set_pud(pudp, *new_pudp); - return 0; -} - -/* - * stage2_get_leaf_entry - walk the stage2 VM page tables and return - * true if a valid and present leaf-entry is found. A pointer to the - * leaf-entry is returned in the appropriate level variable - pudpp, - * pmdpp, ptepp. 
- */ -static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr, - pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - *pudpp = NULL; - *pmdpp = NULL; - *ptepp = NULL; - - pudp = stage2_get_pud(mmu, NULL, addr); - if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) - return false; - - if (stage2_pud_huge(kvm, *pudp)) { - *pudpp = pudp; - return true; - } - - pmdp = stage2_pmd_offset(kvm, pudp, addr); - if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) - return false; - - if (pmd_thp_or_huge(*pmdp)) { - *pmdpp = pmdp; - return true; - } - - ptep = pte_offset_kernel(pmdp, addr); - if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) - return false; - - *ptepp = ptep; - return true; -} - -static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz) -{ - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - bool found; - - found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep); - if (!found) - return false; - - if (pudp) - return sz <= PUD_SIZE && kvm_s2pud_exec(pudp); - else if (pmdp) - return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp); - else - return sz == PAGE_SIZE && kvm_s2pte_exec(ptep); -} - -static int stage2_set_pte(struct kvm_s2_mmu *mmu, - struct kvm_mmu_memory_cache *cache, - phys_addr_t addr, const pte_t *new_pte, - unsigned long flags) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud; - pmd_t *pmd; - pte_t *pte, old_pte; - bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; - bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE; - - VM_BUG_ON(logging_active && !cache); - - /* Create stage-2 page table mapping - Levels 0 and 1 */ - pud = stage2_get_pud(mmu, cache, addr); - if (!pud) { - /* - * Ignore calls from kvm_set_spte_hva for unallocated - * address ranges. - */ - return 0; - } - - /* - * While dirty page logging - dissolve huge PUD, then continue - * on to allocate page. - */ - if (logging_active) - stage2_dissolve_pud(mmu, addr, pud); - - if (stage2_pud_none(kvm, *pud)) { - if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ - pmd = kvm_mmu_memory_cache_alloc(cache); - stage2_pud_populate(kvm, pud, pmd); - get_page(virt_to_page(pud)); - } - - pmd = stage2_pmd_offset(kvm, pud, addr); - if (!pmd) { - /* - * Ignore calls from kvm_set_spte_hva for unallocated - * address ranges. - */ - return 0; - } - - /* - * While dirty page logging - dissolve huge PMD, then continue on to - * allocate page. 
- */ - if (logging_active) - stage2_dissolve_pmd(mmu, addr, pmd); - - /* Create stage-2 page mappings - Level 2 */ - if (pmd_none(*pmd)) { - if (!cache) - return 0; /* ignore calls from kvm_set_spte_hva */ - pte = kvm_mmu_memory_cache_alloc(cache); - kvm_pmd_populate(pmd, pte); - get_page(virt_to_page(pmd)); - } - - pte = pte_offset_kernel(pmd, addr); - - if (iomap && pte_present(*pte)) - return -EFAULT; - - /* Create 2nd stage page table mapping - Level 3 */ - old_pte = *pte; - if (pte_present(old_pte)) { - /* Skip page table update if there is no change */ - if (pte_val(old_pte) == pte_val(*new_pte)) - return 0; - - kvm_set_pte(pte, __pte(0)); - kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL); - } else { - get_page(virt_to_page(pte)); - } - - kvm_set_pte(pte, *new_pte); - return 0; -} - -#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static int stage2_ptep_test_and_clear_young(pte_t *pte) -{ - if (pte_young(*pte)) { - *pte = pte_mkold(*pte); - return 1; - } - return 0; -} -#else -static int stage2_ptep_test_and_clear_young(pte_t *pte) -{ - return __ptep_test_and_clear_young(pte); -} -#endif - -static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) -{ - return stage2_ptep_test_and_clear_young((pte_t *)pmd); -} - -static int stage2_pudp_test_and_clear_young(pud_t *pud) -{ - return stage2_ptep_test_and_clear_young((pte_t *)pud); -} - /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -1189,102 +530,6 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, return ret; } -/** - * stage2_wp_ptes - write protect PMD range - * @pmd: pointer to pmd entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) -{ - pte_t *pte; - - pte = pte_offset_kernel(pmd, addr); - do { - if (!pte_none(*pte)) { - if (!kvm_s2pte_readonly(pte)) - kvm_set_s2pte_readonly(pte); - } - } while (pte++, addr += PAGE_SIZE, addr != end); -} - -/** - * stage2_wp_pmds - write protect PUD range - * kvm: kvm instance for the VM - * @pud: pointer to pud entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - pmd_t *pmd; - phys_addr_t next; - - pmd = stage2_pmd_offset(kvm, pud, addr); - - do { - next = stage2_pmd_addr_end(kvm, addr, end); - if (!pmd_none(*pmd)) { - if (pmd_thp_or_huge(*pmd)) { - if (!kvm_s2pmd_readonly(pmd)) - kvm_set_s2pmd_readonly(pmd); - } else { - stage2_wp_ptes(pmd, addr, next); - } - } - } while (pmd++, addr = next, addr != end); -} - -/** - * stage2_wp_puds - write protect P4D range - * @p4d: pointer to p4d entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = mmu->kvm; - pud_t *pud; - phys_addr_t next; - - pud = stage2_pud_offset(kvm, p4d, addr); - do { - next = stage2_pud_addr_end(kvm, addr, end); - if (!stage2_pud_none(kvm, *pud)) { - if (stage2_pud_huge(kvm, *pud)) { - if (!kvm_s2pud_readonly(pud)) - kvm_set_s2pud_readonly(pud); - } else { - stage2_wp_pmds(mmu, pud, addr, next); - } - } - } while (pud++, addr = next, addr != end); -} - -/** - * stage2_wp_p4ds - write protect PGD range - * @pgd: pointer to pgd entry - * @addr: range start address - * @end: range end address - */ -static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, - phys_addr_t addr, phys_addr_t end) -{ - struct kvm *kvm = 
mmu->kvm; - p4d_t *p4d; - phys_addr_t next; - - p4d = stage2_p4d_offset(kvm, pgd, addr); - do { - next = stage2_p4d_addr_end(kvm, addr, end); - if (!stage2_p4d_none(kvm, *p4d)) - stage2_wp_puds(mmu, p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - /** * stage2_wp_range() - write protect stage2 memory region range * @kvm: The KVM pointer From 74cfa7ea66a7e54a5ea1d8ad1fb74a5e832ed867 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:28 +0100 Subject: [PATCH 045/497] KVM: arm64: Remove unused 'pgd' field from 'struct kvm_s2_mmu' The stage-2 page-tables are entirely encapsulated by the 'pgt' field of 'struct kvm_s2_mmu', so remove the unused 'pgd' field. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-21-will@kernel.org --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/mmu.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0b7c702b2151..41caf29bd93c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -79,7 +79,6 @@ struct kvm_s2_mmu { * for vEL1/EL0 with vHCR_EL2.VM == 0. In that case, we use the * canonical stage-2 page tables. */ - pgd_t *pgd; phys_addr_t pgd_phys; struct kvm_pgtable *pgt; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index dd0ea7d8065a..21b70abf65a7 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -389,7 +389,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) mmu->kvm = kvm; mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); - mmu->pgd = (void *)pgt->pgd; mmu->vmid.vmid_gen = 0; return 0; @@ -475,7 +474,6 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) spin_lock(&kvm->mmu_lock); pgt = mmu->pgt; if (pgt) { - mmu->pgd = NULL; mmu->pgd_phys = 0; mmu->pgt = NULL; free_percpu(mmu->last_vcpu_ran); From c9b69a0cf0b4336fe7d2e35c46273debc68910fd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 11 Sep 2020 14:25:29 +0100 Subject: [PATCH 046/497] KVM: arm64: Don't constrain maximum IPA size based on host configuration Now that the guest stage-2 page-tables are managed independently from the host stage-1 page-tables, we can avoid constraining the IPA size based on the host and instead limit it only based on the PARange field of the ID_AA64MMFR0 register. Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Gavin Shan Cc: Marc Zyngier Cc: Quentin Perret Link: https://lore.kernel.org/r/20200911132529.19844-22-will@kernel.org --- arch/arm64/kvm/reset.c | 40 ++++++---------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index ee33875c5c2a..2202b710d44c 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -339,7 +339,7 @@ u32 get_kvm_ipa_limit(void) int kvm_set_ipa_limit(void) { - unsigned int ipa_max, pa_max, va_max, parange, tgran_2; + unsigned int parange, tgran_2; u64 mmfr0; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); @@ -376,39 +376,11 @@ int kvm_set_ipa_limit(void) break; } - pa_max = id_aa64mmfr0_parange_to_phys_shift(parange); - - /* Clamp the IPA limit to the PA size supported by the kernel */ - ipa_max = (pa_max > PHYS_MASK_SHIFT) ? 
PHYS_MASK_SHIFT : pa_max;
-	/*
-	 * Since our stage2 table is dependent on the stage1 page table code,
-	 * we must always honor the following condition:
-	 *
-	 * Number of levels in Stage1 >= Number of levels in Stage2.
-	 *
-	 * So clamp the ipa limit further down to limit the number of levels.
-	 * Since we can concatenate upto 16 tables at entry level, we could
-	 * go upto 4bits above the maximum VA addressable with the current
-	 * number of levels.
-	 */
-	va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
-	va_max += 4;
-
-	if (va_max < ipa_max)
-		ipa_max = va_max;
-
-	/*
-	 * If the final limit is lower than the real physical address
-	 * limit of the CPUs, report the reason.
-	 */
-	if (ipa_max < pa_max)
-		pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
-			(va_max < pa_max) ? "Virtual" : "Physical");
-
-	WARN(ipa_max < KVM_PHYS_SHIFT,
-	     "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
-	kvm_ipa_limit = ipa_max;
-	kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+	kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);
+	WARN(kvm_ipa_limit < KVM_PHYS_SHIFT,
+	     "KVM IPA Size Limit (%d bits) is smaller than default size\n",
+	     kvm_ipa_limit);
+	kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit);

 	return 0;
 }

From 838fc8083b6241c50b0be105b47006b3ae9c9ed8 Mon Sep 17 00:00:00 2001
From: Hans de Goede
Date: Sun, 13 Sep 2020 23:04:28 -0700
Subject: [PATCH 047/497] Input: soc_button_array - add active_low setting to soc_button_info

This is a preparation patch for adding support for Intel INT33D3 ACPI
devices. These INT33D3 devices follow yet another Intel-defined (but not
documented) ACPI GPIO button standard. Unlike the ACPI GPIO button
devices supported so far, the GPIO used in the INT33D3 devices is
active-high, rather than active-low.

This commit makes setting the gpio_keys_button.active_low flag
configurable through the soc_button_info struct and enables it for all
currently supported devices.
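As a minimal illustrative sketch (not taken from this patch), a button-info
entry now spells out its polarity in its last initializer; the field order
assumed here is the one used by this driver (name, acpi_index, event_type,
event_code, autorepeat, wakeup, active_low), and the second entry is a
purely hypothetical active-high button:

    static const struct soc_button_info example_buttons[] = {
            /* classic active-low button, matching the existing tables */
            { "power", 0, EV_KEY, KEY_POWER, false, true, true },
            /* hypothetical active-high button, as INT33D3 devices need */
            { "example", 1, EV_KEY, KEY_PROG1, false, false, false },
            { }
    };

With the polarity carried in the table, existing PNP0C40/MSHW0040 entries
keep their historical active-low behaviour while new device types can
describe active-high lines without driver special-casing.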
Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20200826150601.12137-2-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/soc_button_array.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/input/misc/soc_button_array.c b/drivers/input/misc/soc_button_array.c index 08520b3a18b8..e3a22a61f5d9 100644 --- a/drivers/input/misc/soc_button_array.c +++ b/drivers/input/misc/soc_button_array.c @@ -23,6 +23,7 @@ struct soc_button_info { unsigned int event_code; bool autorepeat; bool wakeup; + bool active_low; }; struct soc_device_data { @@ -110,7 +111,7 @@ soc_button_device_create(struct platform_device *pdev, gpio_keys[n_buttons].type = info->event_type; gpio_keys[n_buttons].code = info->event_code; gpio_keys[n_buttons].gpio = gpio; - gpio_keys[n_buttons].active_low = 1; + gpio_keys[n_buttons].active_low = info->active_low; gpio_keys[n_buttons].desc = info->name; gpio_keys[n_buttons].wakeup = info->wakeup; /* These devices often use cheap buttons, use 50 ms debounce */ @@ -173,6 +174,7 @@ static int soc_button_parse_btn_desc(struct device *dev, } info->event_type = EV_KEY; + info->active_low = true; info->acpi_index = soc_button_get_acpi_object_int(&desc->package.elements[1]); upage = soc_button_get_acpi_object_int(&desc->package.elements[3]); @@ -383,11 +385,11 @@ static int soc_button_probe(struct platform_device *pdev) * Platforms" */ static const struct soc_button_info soc_button_PNP0C40[] = { - { "power", 0, EV_KEY, KEY_POWER, false, true }, - { "home", 1, EV_KEY, KEY_LEFTMETA, false, true }, - { "volume_up", 2, EV_KEY, KEY_VOLUMEUP, true, false }, - { "volume_down", 3, EV_KEY, KEY_VOLUMEDOWN, true, false }, - { "rotation_lock", 4, EV_KEY, KEY_ROTATE_LOCK_TOGGLE, false, false }, + { "power", 0, EV_KEY, KEY_POWER, false, true, true }, + { "home", 1, EV_KEY, KEY_LEFTMETA, false, true, true }, + { "volume_up", 2, EV_KEY, KEY_VOLUMEUP, true, false, true }, + { "volume_down", 3, EV_KEY, KEY_VOLUMEDOWN, true, false, true }, + { "rotation_lock", 4, EV_KEY, KEY_ROTATE_LOCK_TOGGLE, false, false, true }, { } }; @@ -444,9 +446,9 @@ static int soc_device_check_MSHW0040(struct device *dev) * Obtained from DSDT/testing. */ static const struct soc_button_info soc_button_MSHW0040[] = { - { "power", 0, EV_KEY, KEY_POWER, false, true }, - { "volume_up", 2, EV_KEY, KEY_VOLUMEUP, true, false }, - { "volume_down", 4, EV_KEY, KEY_VOLUMEDOWN, true, false }, + { "power", 0, EV_KEY, KEY_POWER, false, true, true }, + { "volume_up", 2, EV_KEY, KEY_VOLUMEUP, true, false, true }, + { "volume_down", 4, EV_KEY, KEY_VOLUMEDOWN, true, false, true }, { } }; From 4e5d9c198349233b2ba9eb41597a8fc9a662d608 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 13 Sep 2020 23:05:19 -0700 Subject: [PATCH 048/497] Input: soc_button_array - add support for INT33D3 tablet-mode switch devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the Microsoft documentation for Windows 8 convertible devices, these devices should implement a PNP0C60 "laptop/slate mode state indicator" ACPI device. This device can work in 2 ways, if there is a GPIO which directly indicates the device is in tablet-mode or not then the direct-gpio mode should be used. If there is no such GPIO, but instead the events are coming from e.g. the embedded-controller, then there should still be a PNP0C60 ACPI device and event-injection should be used to send the events. 
The drivers/platform/x86/intel-vbtn.c code is an example of a
standardized manner of doing the latter.

On various 2-in-1s with either a detachable keyboard, or with 360°
hinges, the direct GPIO mode is indicated by an ACPI device with a HID
of INT33D3, which contains a single GpioInt in its ACPI resource table,
which directly indicates if the device is in tablet-mode or not.

This commit adds support for this to the soc_button_array code, as well
as for the alternative ID9001 HID which some devices use instead of the
INT33D3 HID.

Signed-off-by: Hans de Goede
Link: https://lore.kernel.org/r/20200826150601.12137-3-hdegoede@redhat.com
Signed-off-by: Dmitry Torokhov
---
 drivers/input/misc/soc_button_array.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/input/misc/soc_button_array.c b/drivers/input/misc/soc_button_array.c
index e3a22a61f5d9..837c787e9c4b 100644
--- a/drivers/input/misc/soc_button_array.c
+++ b/drivers/input/misc/soc_button_array.c
@@ -397,6 +397,15 @@ static const struct soc_device_data soc_device_PNP0C40 = {
 	.button_info = soc_button_PNP0C40,
 };
 
+static const struct soc_button_info soc_button_INT33D3[] = {
+	{ "tablet_mode", 0, EV_SW, SW_TABLET_MODE, false, false, false },
+	{ }
+};
+
+static const struct soc_device_data soc_device_INT33D3 = {
+	.button_info = soc_button_INT33D3,
+};
+
 /*
  * Special device check for Surface Book 2 and Surface Pro (2017).
  * Both, the Surface Pro 4 (surfacepro3_button.c) and the above mentioned
@@ -459,6 +468,8 @@ static const struct soc_device_data soc_device_MSHW0040 = {
 
 static const struct acpi_device_id soc_button_acpi_match[] = {
 	{ "PNP0C40", (unsigned long)&soc_device_PNP0C40 },
+	{ "INT33D3", (unsigned long)&soc_device_INT33D3 },
+	{ "ID9001", (unsigned long)&soc_device_INT33D3 },
 	{ "ACPI0011", 0 },
 
 	/* Microsoft Surface Devices (5th and 6th generation) */

From 78a5b53e9fb4d9a4437b6262b79278d2cd4669c9 Mon Sep 17 00:00:00 2001
From: Hans de Goede
Date: Sun, 13 Sep 2020 23:07:27 -0700
Subject: [PATCH 049/497] Input: soc_button_array - work around DSDTs which modify the irqflags

Some 2-in-1s which use the soc_button_array driver have this ugly issue
in their DSDT where the _LID method modifies the irq-type settings of
the GPIOs used for the power and home buttons. The intent of this AML
code is to disable these buttons when the lid is closed.

The AML does this by directly poking the GPIO controller's registers.
This is problematic because when re-enabling the irq, which happens
whenever _LID gets called with the lid open (e.g. on boot and on
resume), it sets the irq-type to IRQ_TYPE_LEVEL_LOW, whereas the
gpio-keys driver programs the type to, and expects it to be,
IRQ_TYPE_EDGE_BOTH.

This commit adds a workaround for this which (on affected devices) does
not set gpio_keys_button.gpio on these 2-in-1s; instead it gets the irq
for the GPIO, configures it as IRQ_TYPE_LEVEL_LOW (to match how the
_LID AML code configures it) and passes the irq in gpio_keys_button.irq.
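Condensed from the hunk below into a rough sketch (names simplified,
error handling dropped), the workaround boils down to choosing, per
button, between handing gpio-keys a GPIO and handing it a pre-configured
IRQ:

    if (!autorepeat && dmi_check_system(dmi_use_low_level_irq)) {
            /* Match the level the _LID AML code programs and give
             * gpio-keys a bare IRQ instead of a GPIO number. */
            irq_set_irq_type(irq, IRQ_TYPE_LEVEL_LOW);
            gpio_keys[n].irq = irq;
            gpio_keys[n].gpio = -ENOENT;
    } else {
            gpio_keys[n].gpio = gpio;
    }

gpio-keys then uses the supplied IRQ as-is instead of reprogramming the
line as IRQ_TYPE_EDGE_BOTH behind the DSDT's back.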
Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20200906122016.4628-2-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/soc_button_array.c | 69 +++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/drivers/input/misc/soc_button_array.c b/drivers/input/misc/soc_button_array.c index 837c787e9c4b..cae1a3fae83a 100644 --- a/drivers/input/misc/soc_button_array.c +++ b/drivers/input/misc/soc_button_array.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -42,23 +43,66 @@ struct soc_button_data { struct platform_device *children[BUTTON_TYPES]; }; +/* + * Some 2-in-1s which use the soc_button_array driver have this ugly issue in + * their DSDT where the _LID method modifies the irq-type settings of the GPIOs + * used for the power and home buttons. The intend of this AML code is to + * disable these buttons when the lid is closed. + * The AML does this by directly poking the GPIO controllers registers. This is + * problematic because when re-enabling the irq, which happens whenever _LID + * gets called with the lid open (e.g. on boot and on resume), it sets the + * irq-type to IRQ_TYPE_LEVEL_LOW. Where as the gpio-keys driver programs the + * type to, and expects it to be, IRQ_TYPE_EDGE_BOTH. + * To work around this we don't set gpio_keys_button.gpio on these 2-in-1s, + * instead we get the irq for the GPIO ourselves, configure it as + * IRQ_TYPE_LEVEL_LOW (to match how the _LID AML code configures it) and pass + * the irq in gpio_keys_button.irq. Below is a list of affected devices. + */ +static const struct dmi_system_id dmi_use_low_level_irq[] = { + { + /* + * Acer Switch 10 SW5-012. _LID method messes with home- and + * power-button GPIO IRQ settings. When (re-)enabling the irq + * it ors in its own flags without clearing the previous set + * ones, leading to an irq-type of IRQ_TYPE_LEVEL_LOW | + * IRQ_TYPE_LEVEL_HIGH causing a continuous interrupt storm. + */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "Aspire SW5-012"), + }, + }, + { + /* + * Acer One S1003. _LID method messes with power-button GPIO + * IRQ settings, leading to a non working power-button. + */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "One S1003"), + }, + }, + {} /* Terminating entry */ +}; + /* * Get the Nth GPIO number from the ACPI object. 
*/ -static int soc_button_lookup_gpio(struct device *dev, int acpi_index) +static int soc_button_lookup_gpio(struct device *dev, int acpi_index, + int *gpio_ret, int *irq_ret) { struct gpio_desc *desc; - int gpio; desc = gpiod_get_index(dev, NULL, acpi_index, GPIOD_ASIS); if (IS_ERR(desc)) return PTR_ERR(desc); - gpio = desc_to_gpio(desc); + *gpio_ret = desc_to_gpio(desc); + *irq_ret = gpiod_to_irq(desc); gpiod_put(desc); - return gpio; + return 0; } static struct platform_device * @@ -70,9 +114,8 @@ soc_button_device_create(struct platform_device *pdev, struct platform_device *pd; struct gpio_keys_button *gpio_keys; struct gpio_keys_platform_data *gpio_keys_pdata; + int error, gpio, irq; int n_buttons = 0; - int gpio; - int error; for (info = button_info; info->name; info++) if (info->autorepeat == autorepeat) @@ -92,8 +135,8 @@ soc_button_device_create(struct platform_device *pdev, if (info->autorepeat != autorepeat) continue; - gpio = soc_button_lookup_gpio(&pdev->dev, info->acpi_index); - if (!gpio_is_valid(gpio)) { + error = soc_button_lookup_gpio(&pdev->dev, info->acpi_index, &gpio, &irq); + if (error || irq < 0) { /* * Skip GPIO if not present. Note we deliberately * ignore -EPROBE_DEFER errors here. On some devices @@ -108,9 +151,17 @@ soc_button_device_create(struct platform_device *pdev, continue; } + /* See dmi_use_low_level_irq[] comment */ + if (!autorepeat && dmi_check_system(dmi_use_low_level_irq)) { + irq_set_irq_type(irq, IRQ_TYPE_LEVEL_LOW); + gpio_keys[n_buttons].irq = irq; + gpio_keys[n_buttons].gpio = -ENOENT; + } else { + gpio_keys[n_buttons].gpio = gpio; + } + gpio_keys[n_buttons].type = info->event_type; gpio_keys[n_buttons].code = info->event_code; - gpio_keys[n_buttons].gpio = gpio; gpio_keys[n_buttons].active_low = info->active_low; gpio_keys[n_buttons].desc = info->name; gpio_keys[n_buttons].wakeup = info->wakeup; From f492ffe414a7cddc4fd7351563300bc8711ca187 Mon Sep 17 00:00:00 2001 From: Furquan Shaikh Date: Tue, 8 Sep 2020 17:32:16 -0700 Subject: [PATCH 050/497] Input: raydium_i2c_ts - use single i2c_transfer transaction when using RM_CMD_BANK_SWITCH On an AMD chromebook, where the same I2C bus is shared by both Raydium touchscreen and a trackpad device, it is observed that interleaving of I2C messages when `raydium_i2c_read_message()` is called leads to the Raydium touch IC reporting incorrect information. This is the sequence that was observed to result in the above issue: * I2C write to Raydium device for RM_CMD_BANK_SWITCH * I2C write to trackpad device * I2C read from trackpad device * I2C write to Raydium device for setting address * I2C read from Raydium device >>>> This provides incorrect information This change adds a new helper function `raydium_i2c_xfer()` that performs I2C transactions to the Raydium device. It uses the register address to decide if RM_CMD_BANK_SWITCH header needs to be sent to the device (i.e. if register address is greater than 255, then bank switch header is sent before the rest of the transaction). Additionally, it ensures that all the I2C operations performed as part of `raydium_i2c_xfer()` are done as a single i2c_transfer. This guarantees that no other transactions are initiated to any other device on the same bus in between. Additionally, `raydium_i2c_{send|read}*` functions are refactored to use this new helper function. Verified with the patch across multiple reboots (>100) that the information reported by the Raydium touchscreen device during probe is correct. 
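The essence of the fix, condensed from the diff below: the optional
bank-switch header, the register address and the data phase are chained
into a single i2c_transfer() call, so the I2C core's adapter locking
keeps other clients on the same bus from interleaving their own
messages:

    struct i2c_msg xfer[] = {
            { .addr = client->addr, .len = sizeof(header), .buf = (u8 *)&header },
            { .addr = client->addr, .len = 1, .buf = &reg_addr },
            { .addr = client->addr, .len = len, .buf = data,
              .flags = is_read ? I2C_M_RD : 0 },
    };
    /* Send the RM_CMD_BANK_SWITCH header only for addresses above 0xff. */
    int start = (addr > 0xff) ? 0 : 1;

    ret = i2c_transfer(client->adapter, &xfer[start], ARRAY_SIZE(xfer) - start);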
Signed-off-by: Furquan Shaikh Link: https://lore.kernel.org/r/20200821024006.3399663-1-furquan@google.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/raydium_i2c_ts.c | 133 +++++++++------------ 1 file changed, 59 insertions(+), 74 deletions(-) diff --git a/drivers/input/touchscreen/raydium_i2c_ts.c b/drivers/input/touchscreen/raydium_i2c_ts.c index fe245439adee..e694a9b2b1e5 100644 --- a/drivers/input/touchscreen/raydium_i2c_ts.c +++ b/drivers/input/touchscreen/raydium_i2c_ts.c @@ -51,6 +51,7 @@ /* Touch relative info */ #define RM_MAX_RETRIES 3 +#define RM_RETRY_DELAY_MS 20 #define RM_MAX_TOUCH_NUM 10 #define RM_BOOT_DELAY_MS 100 @@ -136,83 +137,82 @@ struct raydium_data { bool wake_irq_enabled; }; -static int raydium_i2c_send(struct i2c_client *client, - u8 addr, const void *data, size_t len) +static int raydium_i2c_xfer(struct i2c_client *client, + u32 addr, void *data, size_t len, bool is_read) { - u8 *buf; - int tries = 0; - int ret; + struct raydium_bank_switch_header { + u8 cmd; + __be32 be_addr; + } __packed header = { + .cmd = RM_CMD_BANK_SWITCH, + .be_addr = cpu_to_be32(addr), + }; - buf = kmalloc(len + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; + u8 reg_addr = addr & 0xff; - buf[0] = addr; - memcpy(buf + 1, data, len); - - do { - ret = i2c_master_send(client, buf, len + 1); - if (likely(ret == len + 1)) - break; - - msleep(20); - } while (++tries < RM_MAX_RETRIES); - - kfree(buf); - - if (unlikely(ret != len + 1)) { - if (ret >= 0) - ret = -EIO; - dev_err(&client->dev, "%s failed: %d\n", __func__, ret); - return ret; - } - - return 0; -} - -static int raydium_i2c_read(struct i2c_client *client, - u8 addr, void *data, size_t len) -{ struct i2c_msg xfer[] = { { .addr = client->addr, - .len = 1, - .buf = &addr, + .len = sizeof(header), + .buf = (u8 *)&header, + }, + { + .addr = client->addr, + .len = 1, + .buf = ®_addr, }, { .addr = client->addr, - .flags = I2C_M_RD, .len = len, .buf = data, + .flags = is_read ? I2C_M_RD : 0, } }; + + /* + * If address is greater than 255, then RM_CMD_BANK_SWITCH needs to be + * sent first. Else, skip the header i.e. xfer[0]. + */ + int xfer_start_idx = (addr > 0xff) ? 0 : 1; + size_t xfer_count = ARRAY_SIZE(xfer) - xfer_start_idx; int ret; - ret = i2c_transfer(client->adapter, xfer, ARRAY_SIZE(xfer)); - if (unlikely(ret != ARRAY_SIZE(xfer))) - return ret < 0 ? ret : -EIO; + ret = i2c_transfer(client->adapter, &xfer[xfer_start_idx], xfer_count); + if (likely(ret == xfer_count)) + return 0; - return 0; + return ret < 0 ? 
ret : -EIO; } -static int raydium_i2c_read_message(struct i2c_client *client, - u32 addr, void *data, size_t len) +static int raydium_i2c_send(struct i2c_client *client, + u32 addr, const void *data, size_t len) +{ + int tries = 0; + int error; + + do { + error = raydium_i2c_xfer(client, addr, (void *)data, len, + false); + if (likely(!error)) + return 0; + + msleep(RM_RETRY_DELAY_MS); + } while (++tries < RM_MAX_RETRIES); + + dev_err(&client->dev, "%s failed: %d\n", __func__, error); + return error; +} + +static int raydium_i2c_read(struct i2c_client *client, + u32 addr, void *data, size_t len) { - __be32 be_addr; size_t xfer_len; int error; while (len) { xfer_len = min_t(size_t, len, RM_MAX_READ_SIZE); - - be_addr = cpu_to_be32(addr); - - error = raydium_i2c_send(client, RM_CMD_BANK_SWITCH, - &be_addr, sizeof(be_addr)); - if (!error) - error = raydium_i2c_read(client, addr & 0xff, - data, xfer_len); - if (error) + error = raydium_i2c_xfer(client, addr, data, xfer_len, true); + if (unlikely(error)) return error; len -= xfer_len; @@ -223,27 +223,13 @@ static int raydium_i2c_read_message(struct i2c_client *client, return 0; } -static int raydium_i2c_send_message(struct i2c_client *client, - u32 addr, const void *data, size_t len) -{ - __be32 be_addr = cpu_to_be32(addr); - int error; - - error = raydium_i2c_send(client, RM_CMD_BANK_SWITCH, - &be_addr, sizeof(be_addr)); - if (!error) - error = raydium_i2c_send(client, addr & 0xff, data, len); - - return error; -} - static int raydium_i2c_sw_reset(struct i2c_client *client) { const u8 soft_rst_cmd = 0x01; int error; - error = raydium_i2c_send_message(client, RM_RESET_MSG_ADDR, - &soft_rst_cmd, sizeof(soft_rst_cmd)); + error = raydium_i2c_send(client, RM_RESET_MSG_ADDR, &soft_rst_cmd, + sizeof(soft_rst_cmd)); if (error) { dev_err(&client->dev, "software reset failed: %d\n", error); return error; @@ -295,9 +281,8 @@ static int raydium_i2c_query_ts_info(struct raydium_data *ts) if (error) continue; - error = raydium_i2c_read_message(client, - le32_to_cpu(query_bank_addr), - &ts->info, sizeof(ts->info)); + error = raydium_i2c_read(client, le32_to_cpu(query_bank_addr), + &ts->info, sizeof(ts->info)); if (error) continue; @@ -834,8 +819,8 @@ static irqreturn_t raydium_i2c_irq(int irq, void *_dev) if (ts->boot_mode != RAYDIUM_TS_MAIN) goto out; - error = raydium_i2c_read_message(ts->client, ts->data_bank_addr, - ts->report_data, ts->pkg_size); + error = raydium_i2c_read(ts->client, ts->data_bank_addr, + ts->report_data, ts->pkg_size); if (error) goto out; From 4238e52cc351e893ebd05e8d7ea97edb3f81d978 Mon Sep 17 00:00:00 2001 From: Johnny Chuang Date: Sun, 13 Sep 2020 23:35:27 -0700 Subject: [PATCH 051/497] Input: elants_i2c - report resolution of ABS_MT_TOUCH_MAJOR by FW information. This patch adds a new behavior to report touch major resolution based on information provided by firmware. In initial process, driver acquires touch information from touch ic. It contains one byte about the resolution value of ABS_MT_TOUCH_MAJOR. Touch driver will report touch major resolution by this information. 
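In short (a sketch of the two spots touched by the diff below), the
extra byte from the firmware information block is stashed while the
touch info is parsed and, when non-zero, handed to the input core
during probe:

    /* in elants_i2c_query_ts_info(): byte 16 of the firmware response */
    ts->major_res = resp[16];

    /* in elants_i2c_probe(), once the other axes have been configured */
    if (ts->major_res > 0)
            input_abs_set_res(ts->input, ABS_MT_TOUCH_MAJOR, ts->major_res);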
Signed-off-by: Johnny Chuang Reviewed-by: Harry Cutts Link: https://lore.kernel.org/r/1598581195-9874-1-git-send-email-johnny.chuang.emc@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/elants_i2c.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/input/touchscreen/elants_i2c.c b/drivers/input/touchscreen/elants_i2c.c index b0bd5bb079be..c8d7bdd34c7a 100644 --- a/drivers/input/touchscreen/elants_i2c.c +++ b/drivers/input/touchscreen/elants_i2c.c @@ -134,6 +134,7 @@ struct elants_data { u8 bc_version; u8 iap_version; u16 hw_version; + u8 major_res; unsigned int x_res; /* resolution in units/mm */ unsigned int y_res; unsigned int x_max; @@ -459,6 +460,9 @@ static int elants_i2c_query_ts_info(struct elants_data *ts) rows = resp[2] + resp[6] + resp[10]; cols = resp[3] + resp[7] + resp[11]; + /* Get report resolution value of ABS_MT_TOUCH_MAJOR */ + ts->major_res = resp[16]; + /* Process mm_to_pixel information */ error = elants_i2c_execute_command(client, get_osr_cmd, sizeof(get_osr_cmd), @@ -1325,6 +1329,8 @@ static int elants_i2c_probe(struct i2c_client *client, 0, MT_TOOL_PALM, 0, 0); input_abs_set_res(ts->input, ABS_MT_POSITION_X, ts->x_res); input_abs_set_res(ts->input, ABS_MT_POSITION_Y, ts->y_res); + if (ts->major_res > 0) + input_abs_set_res(ts->input, ABS_MT_TOUCH_MAJOR, ts->major_res); touchscreen_parse_properties(ts->input, true, &ts->prop); From e0b760a5f6c9e54db8bd22b1d6b19223e6b92264 Mon Sep 17 00:00:00 2001 From: Rajendra Nayak Date: Wed, 12 Aug 2020 15:52:10 +0530 Subject: [PATCH 052/497] arm64: dts: sdm845: Fixup OPP table for all qup devices This OPP table was based on the clock VDD-FMAX tables seen in downstream code, however it turns out the downstream clock driver does update these tables based on later/production rev of the chip and whats seen in the tables belongs to an early engineering rev of the SoC. Fix up the OPP tables such that it now matches with the production rev of sdm845 SoC. Tested-by: Amit Pundir Tested-by: John Stultz Tested-by: Steev Klimaszewski Fixes: 13cadb34e593 ("arm64: dts: sdm845: Add OPP table for all qup devices") Reported-by: John Stultz Signed-off-by: Rajendra Nayak Link: https://lore.kernel.org/r/1597227730-16477-1-git-send-email-rnayak@codeaurora.org Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/sdm845.dtsi | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi index 2884577dcb77..eca81cffd2c1 100644 --- a/arch/arm64/boot/dts/qcom/sdm845.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi @@ -1093,8 +1093,8 @@ qup_opp_table: qup-opp-table { compatible = "operating-points-v2"; - opp-19200000 { - opp-hz = /bits/ 64 <19200000>; + opp-50000000 { + opp-hz = /bits/ 64 <50000000>; required-opps = <&rpmhpd_opp_min_svs>; }; @@ -1107,6 +1107,11 @@ opp-hz = /bits/ 64 <100000000>; required-opps = <&rpmhpd_opp_svs>; }; + + opp-128000000 { + opp-hz = /bits/ 64 <128000000>; + required-opps = <&rpmhpd_opp_nom>; + }; }; qupv3_id_0: geniqup@8c0000 { From 326407d2c576995464fda64e92b5e37f3589e5ee Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Tue, 30 Jun 2020 00:26:10 +0200 Subject: [PATCH 053/497] arm64: dts: sdm630: Temporarily disable SMMUs by default There happens to be an issue between how kernel handles qcom-smmuv2 and how the hypervisor would like it to be handled. That results in the platform hanging completely after the SMMUs are probed. Hence, disable the SMMU nodes temporarily, until the issue is rectified. 
This has been overlooked by me in the initial porting stage, as my defconfig has SMMU disabled. Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20200629222610.168511-1-konradybcio@gmail.com Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/sdm630.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/boot/dts/qcom/sdm630.dtsi b/arch/arm64/boot/dts/qcom/sdm630.dtsi index 88efe8200c80..deb928d303c2 100644 --- a/arch/arm64/boot/dts/qcom/sdm630.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm630.dtsi @@ -518,6 +518,8 @@ , , ; + + status = "disabled"; }; tcsr_mutex_regs: syscon@1f40000 { @@ -749,6 +751,8 @@ , , ; + + status = "disabled"; }; lpass_smmu: iommu@5100000 { @@ -778,6 +782,8 @@ , , ; + + status = "disabled"; }; spmi_bus: spmi@800f000 { @@ -1074,6 +1080,8 @@ , , ; + + status = "disabled"; }; apcs_glb: mailbox@17911000 { From e884fb6cc89dce1debeae33704edd7735a3d6d9c Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Fri, 14 Aug 2020 17:47:49 +0200 Subject: [PATCH 054/497] arm64: dts: qcom: kitakami: Temporarily disable SDHCI1 There is an issue with Kitakami eMMCs dying when a quirk isn't addressed. Until that happens, disable it. Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20200814154749.257837-1-konradybcio@gmail.com Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/msm8994-sony-xperia-kitakami.dtsi | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/msm8994-sony-xperia-kitakami.dtsi b/arch/arm64/boot/dts/qcom/msm8994-sony-xperia-kitakami.dtsi index 4032b7478f04..791f254ac3f8 100644 --- a/arch/arm64/boot/dts/qcom/msm8994-sony-xperia-kitakami.dtsi +++ b/arch/arm64/boot/dts/qcom/msm8994-sony-xperia-kitakami.dtsi @@ -221,7 +221,12 @@ }; &sdhc1 { - status = "okay"; + /* There is an issue with the eMMC causing permanent + * damage to the card if a quirk isn't addressed. + * Until it's fixed, disable the MMC so as not to brick + * devices. + */ + status = "disabled"; /* Downstream pushes 2.95V to the sdhci device, * but upstream driver REALLY wants to make vmmc 1.8v From 22f5adc75a8d60080e489b0f90f0a55104488464 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Patron?= Date: Sat, 25 Jul 2020 10:24:17 +0200 Subject: [PATCH 055/497] arm64: dts: qcom: pm660: Fix missing pound sign in interrupt-cells MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also add a space after '=' while at it. Tested-by: Konrad Dybcio Signed-off-by: Łukasz Patron Link: https://lore.kernel.org/r/20200725082417.8507-1-priv.luk@gmail.com Signed-off-by: Bjorn Andersson --- arch/arm64/boot/dts/qcom/pm660.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/pm660.dtsi b/arch/arm64/boot/dts/qcom/pm660.dtsi index ea0e9558d0f2..2e6a6f6c3b66 100644 --- a/arch/arm64/boot/dts/qcom/pm660.dtsi +++ b/arch/arm64/boot/dts/qcom/pm660.dtsi @@ -44,7 +44,7 @@ gpio-ranges = <&pm660_gpios 0 0 13>; #gpio-cells = <2>; interrupt-controller; - interrupt-cells =<2>; + #interrupt-cells = <2>; }; }; }; From 7a366707bb6a93baeb1a9ef46c4b9c875e0132d6 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Mon, 14 Sep 2020 20:28:07 +0530 Subject: [PATCH 056/497] soc: qcom: pdr: Fixup array type of get_domain_list_resp message The array type of get_domain_list_resp is incorrectly marked as NO_ARRAY. Due to which the following error was observed when using pdr helpers with the downstream proprietary pd-mapper. Fix this up by marking it as VAR_LEN_ARRAY instead. 
Err logs: qmi_decode_struct_elem: Fault in decoding: dl(2), db(27), tl(160), i(1), el(1) failed to decode incoming message PDR: tms/servreg get domain list txn wait failed: -14 PDR: service lookup for tms/servreg failed: -14 Tested-by: Rishabh Bhatnagar Fixes: fbe639b44a82 ("soc: qcom: Introduce Protection Domain Restart helpers") Reported-by: Rishabh Bhatnagar Signed-off-by: Sibi Sankar Link: https://lore.kernel.org/r/20200914145807.1224-1-sibis@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/pdr_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/qcom/pdr_internal.h b/drivers/soc/qcom/pdr_internal.h index 15b5002e4127..ab9ae8cdfa54 100644 --- a/drivers/soc/qcom/pdr_internal.h +++ b/drivers/soc/qcom/pdr_internal.h @@ -185,7 +185,7 @@ struct qmi_elem_info servreg_get_domain_list_resp_ei[] = { .data_type = QMI_STRUCT, .elem_len = SERVREG_DOMAIN_LIST_LENGTH, .elem_size = sizeof(struct servreg_location_entry), - .array_type = NO_ARRAY, + .array_type = VAR_LEN_ARRAY, .tlv_type = 0x12, .offset = offsetof(struct servreg_get_domain_list_resp, domain_list), From 501a67a25dd459a7d16b35ecdbe8a1ca5463e2bd Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:25 +0100 Subject: [PATCH 057/497] KVM: arm64: Remove __activate_vm wrapper The __activate_vm wrapper serves no useful function and has a misleading name as it simply calls __load_guest_stage2 and does not touch HCR_EL2.VM so remove it. Also rename __deactivate_vm to __load_host_stage2 to match naming pattern. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-2-ascull@google.com --- arch/arm64/kvm/hyp/include/hyp/switch.h | 5 ----- arch/arm64/kvm/hyp/nvhe/switch.c | 8 ++++---- arch/arm64/kvm/hyp/vhe/switch.c | 10 +++++----- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 5b6b8fa00f0a..0864f88bc840 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -126,11 +126,6 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu) } } -static inline void __activate_vm(struct kvm_s2_mmu *mmu) -{ - __load_guest_stage2(mmu); -} - static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) { u64 par, tmp; diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 0970442d2dbc..3c9c065b3264 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -93,7 +93,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(CPTR_EL2_DEFAULT, cptr_el2); } -static void __deactivate_vm(struct kvm_vcpu *vcpu) +static void __load_host_stage2(void) { write_sysreg(0, vttbr_el2); } @@ -194,7 +194,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg32_restore_state(vcpu); __sysreg_restore_state_nvhe(guest_ctxt); - __activate_vm(kern_hyp_va(vcpu->arch.hw_mmu)); + __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu)); __activate_traps(vcpu); __hyp_vgic_restore_state(vcpu); @@ -219,7 +219,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __hyp_vgic_save_state(vcpu); __deactivate_traps(vcpu); - __deactivate_vm(vcpu); + __load_host_stage2(); __sysreg_restore_state_nvhe(host_ctxt); @@ -253,7 +253,7 @@ void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt) if (read_sysreg(vttbr_el2)) { __timer_disable_traps(vcpu); __deactivate_traps(vcpu); - __deactivate_vm(vcpu); + __load_host_stage2(); 
__sysreg_restore_state_nvhe(host_ctxt); } diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index c1da4f86ccac..6636522a8529 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -120,12 +120,12 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) * HCR_EL2.TGE. * * We have already configured the guest's stage 1 translation in - * kvm_vcpu_load_sysregs_vhe above. We must now call __activate_vm - * before __activate_traps, because __activate_vm configures - * stage 2 translation, and __activate_traps clear HCR_EL2.TGE - * (among other things). + * kvm_vcpu_load_sysregs_vhe above. We must now call + * __load_guest_stage2 before __activate_traps, because + * __load_guest_stage2 configures stage 2 translation, and + * __activate_traps clear HCR_EL2.TGE (among other things). */ - __activate_vm(vcpu->arch.hw_mmu); + __load_guest_stage2(vcpu->arch.hw_mmu); __activate_traps(vcpu); sysreg_restore_guest_state_vhe(guest_ctxt); From 6a0259ed29bba83653a36fabcdf6b06aecd78596 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:26 +0100 Subject: [PATCH 058/497] KVM: arm64: Remove hyp_panic arguments hyp_panic is able to find all the context it needs from within itself so remove the argument. The __hyp_panic wrapper becomes redundant so is also removed. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-3-ascull@google.com --- arch/arm64/include/asm/kvm_hyp.h | 2 +- arch/arm64/kvm/hyp/hyp-entry.S | 7 +------ arch/arm64/kvm/hyp/include/hyp/switch.h | 4 +--- arch/arm64/kvm/hyp/nvhe/switch.c | 8 ++++++-- arch/arm64/kvm/hyp/vhe/switch.c | 10 ++++++---- 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 46689e7db46c..3de99b323061 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -89,7 +89,7 @@ void deactivate_traps_vhe_put(void); u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); -void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt); +void __noreturn hyp_panic(void); #ifdef __KVM_NVHE_HYPERVISOR__ void __noreturn __hyp_do_panic(unsigned long, ...); #endif diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 46b4dab933d0..9cb3fbca5d79 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -210,12 +210,7 @@ SYM_FUNC_START(__hyp_do_panic) SYM_FUNC_END(__hyp_do_panic) #endif -SYM_CODE_START(__hyp_panic) - get_host_ctxt x0, x1 - b hyp_panic -SYM_CODE_END(__hyp_panic) - -.macro invalid_vector label, target = __hyp_panic +.macro invalid_vector label, target = hyp_panic .align 2 SYM_CODE_START(\label) b \target diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 0864f88bc840..96ea3fdd0c20 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -510,13 +510,11 @@ static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu) static inline void __kvm_unexpected_el2_exception(void) { unsigned long addr, fixup; - struct kvm_cpu_context *host_ctxt; struct exception_table_entry *entry, *end; unsigned long elr_el2 = read_sysreg(elr_el2); entry = hyp_symbol_addr(__start___kvm_ex_table); end = hyp_symbol_addr(__stop___kvm_ex_table); - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; while (entry < end) { addr = (unsigned long)&entry->insn + entry->insn; @@ 
-531,7 +529,7 @@ static inline void __kvm_unexpected_el2_exception(void) return; } - hyp_panic(host_ctxt); + hyp_panic(); } #endif /* __ARM64_KVM_HYP_SWITCH_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 3c9c065b3264..26d6fd4b6f4d 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -242,14 +242,18 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) return exit_code; } -void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt) +void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg(par_el1); - struct kvm_vcpu *vcpu = host_ctxt->__hyp_running_vcpu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; unsigned long str_va; + host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + if (read_sysreg(vttbr_el2)) { __timer_disable_traps(vcpu); __deactivate_traps(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 6636522a8529..c1fd47557713 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -192,10 +192,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) return ret; } -static void __hyp_call_panic(u64 spsr, u64 elr, u64 par, - struct kvm_cpu_context *host_ctxt) +static void __hyp_call_panic(u64 spsr, u64 elr, u64 par) { + struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; + + host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; vcpu = host_ctxt->__hyp_running_vcpu; __deactivate_traps(vcpu); @@ -208,13 +210,13 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par, } NOKPROBE_SYMBOL(__hyp_call_panic); -void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt) +void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg(par_el1); - __hyp_call_panic(spsr, elr, par, host_ctxt); + __hyp_call_panic(spsr, elr, par); unreachable(); } From d7ca1079d8ea897a8c45c9f78693972e28859056 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:27 +0100 Subject: [PATCH 059/497] KVM: arm64: Remove kvm_host_data_t typedef The kvm_host_data_t typedef is used inconsistently and goes against the kernel's coding style. Remove it in favour of the full struct specifier. 
Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-4-ascull@google.com --- arch/arm64/include/asm/kvm_host.h | 4 +--- arch/arm64/kvm/arm.c | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e52c927aade5..16adbefde1cc 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -262,8 +262,6 @@ struct kvm_host_data { struct kvm_pmu_events pmu_events; }; -typedef struct kvm_host_data kvm_host_data_t; - struct vcpu_reset_state { unsigned long pc; unsigned long r0; @@ -565,7 +563,7 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome); struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); -DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data); +DECLARE_PER_CPU(struct kvm_host_data, kvm_host_data); static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 46dc3d75cf13..1af4c77feda2 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -46,7 +46,7 @@ __asm__(".arch_extension virt"); #endif -DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data); +DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); /* The VMID used in the VTTBR */ @@ -1538,7 +1538,7 @@ static int init_hyp_mode(void) } for_each_possible_cpu(cpu) { - kvm_host_data_t *cpu_data; + struct kvm_host_data *cpu_data; cpu_data = per_cpu_ptr(&kvm_host_data, cpu); err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP); From ceee2fe4ba1f0b9dd388fd1030bea0162ca66a40 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:28 +0100 Subject: [PATCH 060/497] KVM: arm64: Choose hyp symbol based on context Make CHOOSE_HYP_SYM select the symbol of the active hypervisor for the host, the nVHE symbol for nVHE and the VHE symbol for VHE. The nVHE and VHE hypervisors see their own symbols without prefixes and trigger a link error when trying to use a symbol of the other hypervisor. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Cc: David Brazdil Link: https://lore.kernel.org/r/20200915104643.2543892-5-ascull@google.com --- arch/arm64/include/asm/kvm_asm.h | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 6f98fbd0ac81..a952859117b2 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -60,10 +60,24 @@ DECLARE_KVM_VHE_SYM(sym); \ DECLARE_KVM_NVHE_SYM(sym) -#define CHOOSE_VHE_SYM(sym) sym -#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym) +#if defined(__KVM_NVHE_HYPERVISOR__) + +#define CHOOSE_HYP_SYM(sym) CHOOSE_NVHE_SYM(sym) +#define CHOOSE_NVHE_SYM(sym) sym +/* The nVHE hypervisor shouldn't even try to access VHE symbols */ +extern void *__nvhe_undefined_symbol; +#define CHOOSE_VHE_SYM(sym) __nvhe_undefined_symbol + +#elif defined(__KVM_VHE_HYPERVISOR) + +#define CHOOSE_HYP_SYM(sym) CHOOSE_VHE_SYM(sym) +#define CHOOSE_VHE_SYM(sym) sym +/* The VHE hypervisor shouldn't even try to access nVHE symbols */ +extern void *__vhe_undefined_symbol; +#define CHOOSE_NVHE_SYM(sym) __vhe_undefined_symbol + +#else -#ifndef __KVM_NVHE_HYPERVISOR__ /* * BIG FAT WARNINGS: * @@ -77,10 +91,9 @@ */ #define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() ? 
CHOOSE_VHE_SYM(sym) \ : CHOOSE_NVHE_SYM(sym)) -#else -/* The nVHE hypervisor shouldn't even try to access anything */ -extern void *__nvhe_undefined_symbol; -#define CHOOSE_HYP_SYM(sym) __nvhe_undefined_symbol +#define CHOOSE_VHE_SYM(sym) sym +#define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym) + #endif /* Translate a kernel address @ptr into its equivalent linear mapping */ From a0e479523e3fb4fa52c351cc4906f38097de789a Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:29 +0100 Subject: [PATCH 061/497] KVM: arm64: Save chosen hyp vector to a percpu variable Introduce a percpu variable to hold the address of the selected hyp vector that will be used with guests. This avoids the selection process each time a guest is being entered and can be used by nVHE when a separate vector is introduced for the host. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-6-ascull@google.com --- arch/arm64/include/asm/kvm_hyp.h | 2 ++ arch/arm64/kvm/arm.c | 5 ++++- arch/arm64/kvm/hyp/vhe/switch.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 3de99b323061..1e2491da324e 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -12,6 +12,8 @@ #include #include +DECLARE_PER_CPU(unsigned long, kvm_hyp_vector); + #define read_sysreg_elx(r,nvh,vh) \ ({ \ u64 reg; \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1af4c77feda2..77fc856ea513 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -47,6 +47,7 @@ __asm__(".arch_extension virt"); #endif DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); +DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); /* The VMID used in the VTTBR */ @@ -1276,7 +1277,7 @@ static void cpu_init_hyp_mode(void) pgd_ptr = kvm_mmu_get_httbr(); hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE; - vector_ptr = (unsigned long)kvm_get_hyp_vector(); + vector_ptr = __this_cpu_read(kvm_hyp_vector); /* * Call initialization code, and switch to the full blown HYP code. @@ -1309,6 +1310,8 @@ static void cpu_hyp_reinit(void) cpu_hyp_reset(); + __this_cpu_write(kvm_hyp_vector, (unsigned long)kvm_get_hyp_vector()); + if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); else diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index c1fd47557713..b49cf53c11f0 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -59,7 +59,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) write_sysreg(val, cpacr_el1); - write_sysreg(kvm_get_hyp_vector(), vbar_el1); + write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1); } NOKPROBE_SYMBOL(__activate_traps); From 6e3bfbb22c51bd1f121ca5c4bce6b51dc6cdceae Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:30 +0100 Subject: [PATCH 062/497] KVM: arm64: nVHE: Use separate vector for the host The host is treated differently from the guests when an exception is taken so introduce a separate vector that is specialized for the host. This also allows the nVHE specific code to move out of hyp-entry.S and into nvhe/host.S. The host is only expected to make HVC calls and anything else is considered invalid and results in a panic. Hyp initialization is now passed the vector that is used for the host and it is swapped for the guest vector during the context switch. 
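Sketching the swap described above (lifted from the nvhe/switch.c hunks
further down): the per-cpu guest vector is installed when traps are
activated on guest entry, and the host's own vector is restored when
they are deactivated on the way back:

    /* __activate_traps(): entering the guest */
    write_sysreg(__hyp_this_cpu_read(kvm_hyp_vector), vbar_el2);

    /* __deactivate_traps(): returning to the host */
    write_sysreg(__kvm_hyp_host_vector, vbar_el2);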
Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-7-ascull@google.com --- arch/arm64/include/asm/kvm_asm.h | 2 + arch/arm64/kernel/image-vars.h | 1 + arch/arm64/kvm/arm.c | 11 +++- arch/arm64/kvm/hyp/hyp-entry.S | 66 ------------------- arch/arm64/kvm/hyp/nvhe/Makefile | 2 +- arch/arm64/kvm/hyp/nvhe/host.S | 108 +++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/switch.c | 3 + 7 files changed, 125 insertions(+), 68 deletions(-) create mode 100644 arch/arm64/kvm/hyp/nvhe/host.S diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index a952859117b2..fe51c06d480d 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -111,8 +111,10 @@ struct kvm_vcpu; struct kvm_s2_mmu; DECLARE_KVM_NVHE_SYM(__kvm_hyp_init); +DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector); DECLARE_KVM_HYP_SYM(__kvm_hyp_vector); #define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) +#define __kvm_hyp_host_vector CHOOSE_NVHE_SYM(__kvm_hyp_host_vector) #define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) #ifdef CONFIG_KVM_INDIRECT_VECTORS diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 8982b68289b7..54bb0eb34b0f 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -71,6 +71,7 @@ KVM_NVHE_ALIAS(kvm_update_va_mask); /* Global kernel state accessed by nVHE hyp code. */ KVM_NVHE_ALIAS(arm64_ssbd_callback_required); KVM_NVHE_ALIAS(kvm_host_data); +KVM_NVHE_ALIAS(kvm_hyp_vector); KVM_NVHE_ALIAS(kvm_vgic_global_state); /* Kernel constant needed to compute idmap addresses. */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 77fc856ea513..b6442c6be5ad 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1277,7 +1277,7 @@ static void cpu_init_hyp_mode(void) pgd_ptr = kvm_mmu_get_httbr(); hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE; - vector_ptr = __this_cpu_read(kvm_hyp_vector); + vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector)); /* * Call initialization code, and switch to the full blown HYP code. @@ -1542,6 +1542,7 @@ static int init_hyp_mode(void) for_each_possible_cpu(cpu) { struct kvm_host_data *cpu_data; + unsigned long *vector; cpu_data = per_cpu_ptr(&kvm_host_data, cpu); err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP); @@ -1550,6 +1551,14 @@ static int init_hyp_mode(void) kvm_err("Cannot map host CPU state: %d\n", err); goto out_err; } + + vector = per_cpu_ptr(&kvm_hyp_vector, cpu); + err = create_hyp_mappings(vector, vector + 1, PAGE_HYP); + + if (err) { + kvm_err("Cannot map hyp guest vector address\n"); + goto out_err; + } } err = hyp_map_aux_data(); diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 9cb3fbca5d79..f92489250dfc 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -12,7 +12,6 @@ #include #include #include -#include #include .macro save_caller_saved_regs_vect @@ -41,20 +40,6 @@ .text -.macro do_el2_call - /* - * Shuffle the parameters before calling the function - * pointed to in x0. Assumes parameters in x[1,2,3]. - */ - str lr, [sp, #-16]! 
- mov lr, x0 - mov x0, x1 - mov x1, x2 - mov x2, x3 - blr lr - ldr lr, [sp], #16 -.endm - el1_sync: // Guest trapped into EL2 mrs x0, esr_el2 @@ -63,44 +48,6 @@ el1_sync: // Guest trapped into EL2 ccmp x0, #ESR_ELx_EC_HVC32, #4, ne b.ne el1_trap -#ifdef __KVM_NVHE_HYPERVISOR__ - mrs x1, vttbr_el2 // If vttbr is valid, the guest - cbnz x1, el1_hvc_guest // called HVC - - /* Here, we're pretty sure the host called HVC. */ - ldp x0, x1, [sp], #16 - - /* Check for a stub HVC call */ - cmp x0, #HVC_STUB_HCALL_NR - b.hs 1f - - /* - * Compute the idmap address of __kvm_handle_stub_hvc and - * jump there. Since we use kimage_voffset, do not use the - * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead - * (by loading it from the constant pool). - * - * Preserve x0-x4, which may contain stub parameters. - */ - ldr x5, =__kvm_handle_stub_hvc - ldr_l x6, kimage_voffset - - /* x5 = __pa(x5) */ - sub x5, x5, x6 - br x5 - -1: - /* - * Perform the EL2 call - */ - kern_hyp_va x0 - do_el2_call - - eret - sb -#endif /* __KVM_NVHE_HYPERVISOR__ */ - -el1_hvc_guest: /* * Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1. * The workaround has already been applied on the host, @@ -198,18 +145,6 @@ el2_error: eret sb -#ifdef __KVM_NVHE_HYPERVISOR__ -SYM_FUNC_START(__hyp_do_panic) - mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ - PSR_MODE_EL1h) - msr spsr_el2, lr - ldr lr, =panic - msr elr_el2, lr - eret - sb -SYM_FUNC_END(__hyp_do_panic) -#endif - .macro invalid_vector label, target = hyp_panic .align 2 SYM_CODE_START(\label) @@ -222,7 +157,6 @@ SYM_CODE_END(\label) invalid_vector el2t_irq_invalid invalid_vector el2t_fiq_invalid invalid_vector el2t_error_invalid - invalid_vector el2h_sync_invalid invalid_vector el2h_irq_invalid invalid_vector el2h_fiq_invalid invalid_vector el1_fiq_invalid diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index aef76487edc2..ddf98eb07b9d 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -6,7 +6,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o +obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S new file mode 100644 index 000000000000..128af58d342d --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 - Google Inc + * Author: Andrew Scull + */ + +#include + +#include +#include +#include + + .text + +SYM_FUNC_START(__hyp_do_panic) + mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ + PSR_MODE_EL1h) + msr spsr_el2, lr + ldr lr, =panic + msr elr_el2, lr + eret + sb +SYM_FUNC_END(__hyp_do_panic) + +.macro host_el1_sync_vect + .align 7 +.L__vect_start\@: + esb + stp x0, x1, [sp, #-16]! + mrs x0, esr_el2 + lsr x0, x0, #ESR_ELx_EC_SHIFT + cmp x0, #ESR_ELx_EC_HVC64 + ldp x0, x1, [sp], #16 + b.ne hyp_panic + + /* Check for a stub HVC call */ + cmp x0, #HVC_STUB_HCALL_NR + b.hs 1f + + /* + * Compute the idmap address of __kvm_handle_stub_hvc and + * jump there. Since we use kimage_voffset, do not use the + * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead + * (by loading it from the constant pool). + * + * Preserve x0-x4, which may contain stub parameters. 
+ */ + ldr x5, =__kvm_handle_stub_hvc + ldr_l x6, kimage_voffset + + /* x5 = __pa(x5) */ + sub x5, x5, x6 + br x5 + +1: + /* + * Shuffle the parameters before calling the function + * pointed to in x0. Assumes parameters in x[1,2,3]. + */ + kern_hyp_va x0 + str lr, [sp, #-16]! + mov lr, x0 + mov x0, x1 + mov x1, x2 + mov x2, x3 + blr lr + ldr lr, [sp], #16 + + eret + sb +.L__vect_end\@: +.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80) + .error "host_el1_sync_vect larger than vector entry" +.endif +.endm + +.macro invalid_host_vect + .align 7 + b hyp_panic +.endm + +/* + * CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the + * host knows about the EL2 vectors already, and there is no point in hiding + * them. + */ + .align 11 +SYM_CODE_START(__kvm_hyp_host_vector) + invalid_host_vect // Synchronous EL2t + invalid_host_vect // IRQ EL2t + invalid_host_vect // FIQ EL2t + invalid_host_vect // Error EL2t + + invalid_host_vect // Synchronous EL2h + invalid_host_vect // IRQ EL2h + invalid_host_vect // FIQ EL2h + invalid_host_vect // Error EL2h + + host_el1_sync_vect // Synchronous 64-bit EL1 + invalid_host_vect // IRQ 64-bit EL1 + invalid_host_vect // FIQ 64-bit EL1 + invalid_host_vect // Error 64-bit EL1 + + invalid_host_vect // Synchronous 32-bit EL1 + invalid_host_vect // IRQ 32-bit EL1 + invalid_host_vect // FIQ 32-bit EL1 + invalid_host_vect // Error 32-bit EL1 +SYM_CODE_END(__kvm_hyp_host_vector) diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 26d6fd4b6f4d..075384084e08 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -42,6 +42,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) } write_sysreg(val, cptr_el2); + write_sysreg(__hyp_this_cpu_read(kvm_hyp_vector), vbar_el2); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt; @@ -60,6 +61,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) static void __deactivate_traps(struct kvm_vcpu *vcpu) { + extern char __kvm_hyp_host_vector[]; u64 mdcr_el2; ___deactivate_traps(vcpu); @@ -91,6 +93,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(mdcr_el2, mdcr_el2); write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2); write_sysreg(CPTR_EL2_DEFAULT, cptr_el2); + write_sysreg(__kvm_hyp_host_vector, vbar_el2); } static void __load_host_stage2(void) From 472fc011ccd30f05e0b39d71064777d39dd11cac Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:31 +0100 Subject: [PATCH 063/497] KVM: arm64: nVHE: Don't consume host SErrors with ESB The ESB at the start of the host vector may cause SErrors to be consumed to DISR_EL1. However, this is not checked for the host so the SError could go unhandled. Remove the ESB so that SErrors are not consumed but are instead left pending for the host to consume. __guest_enter already defers entry into a guest if there are any SErrors pending. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Cc: James Morse Link: https://lore.kernel.org/r/20200915104643.2543892-8-ascull@google.com --- arch/arm64/kvm/hyp/nvhe/host.S | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 128af58d342d..da21fddcef75 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -25,7 +25,6 @@ SYM_FUNC_END(__hyp_do_panic) .macro host_el1_sync_vect .align 7 .L__vect_start\@: - esb stp x0, x1, [sp, #-16]! 
mrs x0, esr_el2 lsr x0, x0, #ESR_ELx_EC_SHIFT @@ -80,6 +79,11 @@ SYM_FUNC_END(__hyp_do_panic) .endm /* + * The host vector does not use an ESB instruction in order to avoid consuming + * SErrors that should only be consumed by the host. Guest entry is deferred by + * __guest_enter if there are any pending asynchronous exceptions so hyp will + * always return to the host without having consumerd host SErrors. + * * CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the * host knows about the EL2 vectors already, and there is no point in hiding * them. From b619d9aa8b385bdab747df596961d8fdddde4d52 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:32 +0100 Subject: [PATCH 064/497] KVM: arm64: Introduce hyp context During __guest_enter, save and restore from a new hyp context rather than the host context. This is preparation for separation of the hyp and host context in nVHE. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-9-ascull@google.com --- arch/arm64/include/asm/kvm_hyp.h | 3 ++- arch/arm64/kernel/image-vars.h | 1 + arch/arm64/kvm/arm.c | 10 ++++++++++ arch/arm64/kvm/hyp/entry.S | 10 +++++----- arch/arm64/kvm/hyp/include/hyp/switch.h | 2 +- arch/arm64/kvm/hyp/nvhe/switch.c | 2 +- arch/arm64/kvm/hyp/vhe/switch.c | 2 +- 7 files changed, 21 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 1e2491da324e..0b525e05e5bf 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -12,6 +12,7 @@ #include #include +DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DECLARE_PER_CPU(unsigned long, kvm_hyp_vector); #define read_sysreg_elx(r,nvh,vh) \ @@ -89,7 +90,7 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu); void deactivate_traps_vhe_put(void); #endif -u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); +u64 __guest_enter(struct kvm_vcpu *vcpu); void __noreturn hyp_panic(void); #ifdef __KVM_NVHE_HYPERVISOR__ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 54bb0eb34b0f..9f419e4fc66b 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -71,6 +71,7 @@ KVM_NVHE_ALIAS(kvm_update_va_mask); /* Global kernel state accessed by nVHE hyp code. 
*/ KVM_NVHE_ALIAS(arm64_ssbd_callback_required); KVM_NVHE_ALIAS(kvm_host_data); +KVM_NVHE_ALIAS(kvm_hyp_ctxt); KVM_NVHE_ALIAS(kvm_hyp_vector); KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b6442c6be5ad..ae4b34f91e94 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -47,6 +47,7 @@ __asm__(".arch_extension virt"); #endif DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); +DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); @@ -1542,6 +1543,7 @@ static int init_hyp_mode(void) for_each_possible_cpu(cpu) { struct kvm_host_data *cpu_data; + struct kvm_cpu_context *hyp_ctxt; unsigned long *vector; cpu_data = per_cpu_ptr(&kvm_host_data, cpu); @@ -1552,6 +1554,14 @@ static int init_hyp_mode(void) goto out_err; } + hyp_ctxt = per_cpu_ptr(&kvm_hyp_ctxt, cpu); + err = create_hyp_mappings(hyp_ctxt, hyp_ctxt + 1, PAGE_HYP); + + if (err) { + kvm_err("Cannot map hyp context: %d\n", err); + goto out_err; + } + vector = per_cpu_ptr(&kvm_hyp_vector, cpu); err = create_hyp_mappings(vector, vector + 1, PAGE_HYP); diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 76e7eaf4675e..9551d7f186da 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -57,15 +57,15 @@ .endm /* - * u64 __guest_enter(struct kvm_vcpu *vcpu, - * struct kvm_cpu_context *host_ctxt); + * u64 __guest_enter(struct kvm_vcpu *vcpu); */ SYM_FUNC_START(__guest_enter) // x0: vcpu - // x1: host context - // x2-x17: clobbered by macros + // x1-x17: clobbered by macros // x29: guest context + hyp_adr_this_cpu x1, kvm_hyp_ctxt, x2 + // Store the host regs save_callee_saved_regs x1 @@ -148,7 +148,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // Store the guest's sp_el0 save_sp_el0 x1, x2 - get_host_ctxt x2, x3 + hyp_adr_this_cpu x2, kvm_hyp_ctxt, x3 // Macro ptrauth_switch_to_guest format: // ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 96ea3fdd0c20..afe714056b97 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -381,7 +381,7 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) !esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) return false; - ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + ctxt = __hyp_this_cpu_ptr(kvm_hyp_ctxt); __ptrauth_save_key(ctxt, APIA); __ptrauth_save_key(ctxt, APIB); __ptrauth_save_key(ctxt, APDA); diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 075384084e08..c99945cda779 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -209,7 +209,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) do { /* Jump in the fire! */ - exit_code = __guest_enter(vcpu, host_ctxt); + exit_code = __guest_enter(vcpu); /* And we're baaack! */ } while (fixup_guest_exit(vcpu, &exit_code)); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index b49cf53c11f0..cf477f856e51 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -135,7 +135,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) do { /* Jump in the fire! */ - exit_code = __guest_enter(vcpu, host_ctxt); + exit_code = __guest_enter(vcpu); /* And we're baaack! 
*/ } while (fixup_guest_exit(vcpu, &exit_code)); From 7c2e76d87f9ce7af47a07ca803343fd5f4aa4ab5 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:33 +0100 Subject: [PATCH 065/497] KVM: arm64: Update context references from host to hyp Hyp now has its own nominal context for saving and restoring its state when switching to and from a guest. Update the related comments and utilities to match the new name. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-10-ascull@google.com --- arch/arm64/include/asm/kvm_ptrauth.h | 6 +++--- arch/arm64/kvm/hyp/entry.S | 22 +++++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h index 0ddf98c3ba9f..0cd0965255d2 100644 --- a/arch/arm64/include/asm/kvm_ptrauth.h +++ b/arch/arm64/include/asm/kvm_ptrauth.h @@ -60,7 +60,7 @@ .endm /* - * Both ptrauth_switch_to_guest and ptrauth_switch_to_host macros will + * Both ptrauth_switch_to_guest and ptrauth_switch_to_hyp macros will * check for the presence ARM64_HAS_ADDRESS_AUTH, which is defined as * (ARM64_HAS_ADDRESS_AUTH_ARCH || ARM64_HAS_ADDRESS_AUTH_IMP_DEF) and * then proceed ahead with the save/restore of Pointer Authentication @@ -78,7 +78,7 @@ alternative_else_nop_endif .L__skip_switch\@: .endm -.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3 +.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3 alternative_if_not ARM64_HAS_ADDRESS_AUTH b .L__skip_switch\@ alternative_else_nop_endif @@ -96,7 +96,7 @@ alternative_else_nop_endif #else /* !CONFIG_ARM64_PTR_AUTH */ .macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3 .endm -.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3 +.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3 .endm #endif /* CONFIG_ARM64_PTR_AUTH */ #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 9551d7f186da..38cca690a6ff 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -66,16 +66,16 @@ SYM_FUNC_START(__guest_enter) hyp_adr_this_cpu x1, kvm_hyp_ctxt, x2 - // Store the host regs + // Store the hyp regs save_callee_saved_regs x1 - // Save the host's sp_el0 + // Save hyp's sp_el0 save_sp_el0 x1, x2 - // Now the host state is stored if we have a pending RAS SError it must - // affect the host. If any asynchronous exception is pending we defer - // the guest entry. The DSB isn't necessary before v8.2 as any SError - // would be fatal. + // Now the hyp state is stored if we have a pending RAS SError it must + // affect the host or hyp. If any asynchronous exception is pending we + // defer the guest entry. The DSB isn't necessary before v8.2 as any + // SError would be fatal. alternative_if ARM64_HAS_RAS_EXTN dsb nshst isb @@ -150,17 +150,17 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) hyp_adr_this_cpu x2, kvm_hyp_ctxt, x3 - // Macro ptrauth_switch_to_guest format: - // ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3) + // Macro ptrauth_switch_to_hyp format: + // ptrauth_switch_to_hyp(guest cxt, host cxt, tmp1, tmp2, tmp3) // The below macro to save/restore keys is not implemented in C code // as it may cause Pointer Authentication key signing mismatch errors // when this feature is enabled for kernel code. 
- ptrauth_switch_to_host x1, x2, x3, x4, x5 + ptrauth_switch_to_hyp x1, x2, x3, x4, x5 - // Restore the hosts's sp_el0 + // Restore hyp's sp_el0 restore_sp_el0 x2, x3 - // Now restore the host regs + // Now restore the hyp regs restore_callee_saved_regs x2 alternative_if ARM64_HAS_RAS_EXTN From 7db21530479f071ee0e0a4d5fcf5e6bc6c0352ba Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:34 +0100 Subject: [PATCH 066/497] KVM: arm64: Restore hyp when panicking in guest context If the guest context is loaded when a panic is triggered, restore the hyp context so e.g. the shadow call stack works when hyp_panic() is called and SP_EL0 is valid when the host's panic() is called. Use the hyp context's __hyp_running_vcpu field to track when hyp transitions to and from the guest vcpu so the exception handlers know whether the context needs to be restored. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-11-ascull@google.com --- arch/arm64/include/asm/kvm_asm.h | 10 ++++++++++ arch/arm64/kvm/hyp/entry.S | 24 ++++++++++++++++++++++++ arch/arm64/kvm/hyp/hyp-entry.S | 5 ++--- arch/arm64/kvm/hyp/include/hyp/switch.h | 4 +++- arch/arm64/kvm/hyp/nvhe/host.S | 5 +++++ 5 files changed, 44 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index fe51c06d480d..4df2bd8882bc 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -236,6 +236,16 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU] .endm +.macro get_loaded_vcpu vcpu, ctxt + hyp_adr_this_cpu \ctxt, kvm_hyp_ctxt, \vcpu + ldr \vcpu, [\ctxt, #HOST_CONTEXT_VCPU] +.endm + +.macro set_loaded_vcpu vcpu, ctxt, tmp + hyp_adr_this_cpu \ctxt, kvm_hyp_ctxt, \tmp + str \vcpu, [\ctxt, #HOST_CONTEXT_VCPU] +.endm + /* * KVM extable for unexpected exceptions. * In the same format _asm_extable, but output to a different section so that diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 38cca690a6ff..4787fc82790c 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -86,6 +86,8 @@ alternative_else_nop_endif ret 1: + set_loaded_vcpu x0, x1, x2 + add x29, x0, #VCPU_CONTEXT // Macro ptrauth_switch_to_guest format: @@ -116,6 +118,26 @@ alternative_else_nop_endif eret sb +SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL) + // x2-x29,lr: vcpu regs + // vcpu x0-x1 on the stack + + // If the hyp context is loaded, go straight to hyp_panic + get_loaded_vcpu x0, x1 + cbz x0, hyp_panic + + // The hyp context is saved so make sure it is restored to allow + // hyp_panic to run at hyp and, subsequently, panic to run in the host. + // This makes use of __guest_exit to avoid duplication but sets the + // return address to tail call into hyp_panic. As a side effect, the + // current state is saved to the guest context but it will only be + // accurate if the guest had been completely restored. + hyp_adr_this_cpu x0, kvm_hyp_ctxt, x1 + adr x1, hyp_panic + str x1, [x0, #CPU_XREG_OFFSET(30)] + + get_vcpu_ptr x1, x0 + SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // x0: return code // x1: vcpu @@ -163,6 +185,8 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // Now restore the hyp regs restore_callee_saved_regs x2 + set_loaded_vcpu xzr, x1, x2 + alternative_if ARM64_HAS_RAS_EXTN // If we have the RAS extensions we can consume a pending error // without an unmask-SError and isb. 
The ESB-instruction consumed any diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index f92489250dfc..bc9f53df46f5 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -145,7 +145,7 @@ el2_error: eret sb -.macro invalid_vector label, target = hyp_panic +.macro invalid_vector label, target = __guest_exit_panic .align 2 SYM_CODE_START(\label) b \target @@ -186,10 +186,9 @@ check_preamble_length 661b, 662b .macro invalid_vect target .align 7 661: - b \target nop + stp x0, x1, [sp, #-16]! 662: - ldp x0, x1, [sp], #16 b \target check_preamble_length 661b, 662b diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index afe714056b97..821721b78ad9 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -509,6 +509,7 @@ static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu) static inline void __kvm_unexpected_el2_exception(void) { + extern char __guest_exit_panic[]; unsigned long addr, fixup; struct exception_table_entry *entry, *end; unsigned long elr_el2 = read_sysreg(elr_el2); @@ -529,7 +530,8 @@ static inline void __kvm_unexpected_el2_exception(void) return; } - hyp_panic(); + /* Trigger a panic after restoring the hyp context. */ + write_sysreg(__guest_exit_panic, elr_el2); } #endif /* __ARM64_KVM_HYP_SWITCH_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index da21fddcef75..9ab7814e6114 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -75,6 +75,11 @@ SYM_FUNC_END(__hyp_do_panic) .macro invalid_host_vect .align 7 + /* If a guest is loaded, panic out of it. */ + stp x0, x1, [sp, #-16]! + get_loaded_vcpu x0, x1 + cbnz x0, __guest_exit_panic + add sp, sp, #16 b hyp_panic .endm From 603d2bdaa57e48a33d57eeb88971b88fbe5875d6 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:35 +0100 Subject: [PATCH 067/497] KVM: arm64: Share context save and restore macros To avoid duplicating the context save and restore macros, move them into a shareable header. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-12-ascull@google.com --- arch/arm64/include/asm/kvm_asm.h | 39 ++++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/entry.S | 39 -------------------------------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 4df2bd8882bc..4e3073858346 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -261,6 +261,45 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; .popsection .endm +#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x) +#define CPU_LR_OFFSET CPU_XREG_OFFSET(30) +#define CPU_SP_EL0_OFFSET (CPU_LR_OFFSET + 8) + +/* + * We treat x18 as callee-saved as the host may use it as a platform + * register (e.g. for shadow call stack). 
+ */ +.macro save_callee_saved_regs ctxt + str x18, [\ctxt, #CPU_XREG_OFFSET(18)] + stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] + stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] + stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] + stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)] + stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)] + stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] +.endm + +.macro restore_callee_saved_regs ctxt + // We require \ctxt is not x18-x28 + ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)] + ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] + ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] + ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] + ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)] + ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)] + ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] +.endm + +.macro save_sp_el0 ctxt, tmp + mrs \tmp, sp_el0 + str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] +.endm + +.macro restore_sp_el0 ctxt, tmp + ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] + msr sp_el0, \tmp +.endm + #endif #endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 4787fc82790c..afaa8d1f2485 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -7,7 +7,6 @@ #include #include -#include #include #include #include @@ -16,46 +15,8 @@ #include #include -#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x) -#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8) - .text -/* - * We treat x18 as callee-saved as the host may use it as a platform - * register (e.g. for shadow call stack). - */ -.macro save_callee_saved_regs ctxt - str x18, [\ctxt, #CPU_XREG_OFFSET(18)] - stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] - stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] - stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] - stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)] - stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)] - stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] -.endm - -.macro restore_callee_saved_regs ctxt - // We require \ctxt is not x18-x28 - ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)] - ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)] - ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)] - ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)] - ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)] - ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)] - ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] -.endm - -.macro save_sp_el0 ctxt, tmp - mrs \tmp, sp_el0 - str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] -.endm - -.macro restore_sp_el0 ctxt, tmp - ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] - msr sp_el0, \tmp -.endm - /* * u64 __guest_enter(struct kvm_vcpu *vcpu); */ From 4e3393a969a0bdaae83263918b6824b2d08c3996 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:36 +0100 Subject: [PATCH 068/497] KVM: arm64: nVHE: Switch to hyp context for EL2 Save and restore the host context when switching to and from hyp. This gives hyp its own context that the host will not see as a step towards a full trust boundary between the two. SP_EL0 and pointer authentication keys are currently shared between the host and hyp so don't need to be switched yet. 
Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-13-ascull@google.com --- arch/arm64/kvm/hyp/include/hyp/switch.h | 2 + arch/arm64/kvm/hyp/nvhe/Makefile | 2 +- arch/arm64/kvm/hyp/nvhe/host.S | 68 ++++++++++++++++++------- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 36 +++++++++++++ 4 files changed, 89 insertions(+), 19 deletions(-) create mode 100644 arch/arm64/kvm/hyp/nvhe/hyp-main.c diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 821721b78ad9..4536b50ddc06 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -372,6 +372,8 @@ static inline bool esr_is_ptrauth_trap(u32 esr) ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \ } while(0) +DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); + static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *ctxt; diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index ddf98eb07b9d..46c89e8c30bc 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -6,7 +6,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o +obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 9ab7814e6114..d26e41773dc4 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -12,6 +12,55 @@ .text +SYM_FUNC_START(__host_exit) + stp x0, x1, [sp, #-16]! + + get_host_ctxt x0, x1 + + ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) + + /* Store the host regs x2 and x3 */ + stp x2, x3, [x0, #CPU_XREG_OFFSET(2)] + + /* Retrieve the host regs x0-x1 from the stack */ + ldp x2, x3, [sp], #16 // x0, x1 + + /* Store the host regs x0-x1 and x4-x17 */ + stp x2, x3, [x0, #CPU_XREG_OFFSET(0)] + stp x4, x5, [x0, #CPU_XREG_OFFSET(4)] + stp x6, x7, [x0, #CPU_XREG_OFFSET(6)] + stp x8, x9, [x0, #CPU_XREG_OFFSET(8)] + stp x10, x11, [x0, #CPU_XREG_OFFSET(10)] + stp x12, x13, [x0, #CPU_XREG_OFFSET(12)] + stp x14, x15, [x0, #CPU_XREG_OFFSET(14)] + stp x16, x17, [x0, #CPU_XREG_OFFSET(16)] + + /* Store the host regs x18-x29, lr */ + save_callee_saved_regs x0 + + /* Save the host context pointer in x29 across the function call */ + mov x29, x0 + bl handle_trap + + /* Restore host regs x0-x17 */ + ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)] + ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] + ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)] + ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)] + ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)] + ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)] + ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)] + ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)] + ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)] + + /* Restore host regs x18-x29, lr */ + restore_callee_saved_regs x29 + + /* Do not touch any register after this! 
*/ + eret + sb +SYM_FUNC_END(__host_exit) + SYM_FUNC_START(__hyp_do_panic) mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ PSR_MODE_EL1h) @@ -34,7 +83,7 @@ SYM_FUNC_END(__hyp_do_panic) /* Check for a stub HVC call */ cmp x0, #HVC_STUB_HCALL_NR - b.hs 1f + b.hs __host_exit /* * Compute the idmap address of __kvm_handle_stub_hvc and @@ -50,23 +99,6 @@ SYM_FUNC_END(__hyp_do_panic) /* x5 = __pa(x5) */ sub x5, x5, x6 br x5 - -1: - /* - * Shuffle the parameters before calling the function - * pointed to in x0. Assumes parameters in x[1,2,3]. - */ - kern_hyp_va x0 - str lr, [sp, #-16]! - mov lr, x0 - mov x0, x1 - mov x1, x2 - mov x2, x3 - blr lr - ldr lr, [sp], #16 - - eret - sb .L__vect_end\@: .if ((.L__vect_end\@ - .L__vect_start\@) > 0x80) .error "host_el1_sync_vect larger than vector entry" diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c new file mode 100644 index 000000000000..570c3896f42e --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - Google Inc + * Author: Andrew Scull + */ + +#include + +#include +#include +#include +#include +#include + +typedef unsigned long (*hypcall_fn_t) + (unsigned long, unsigned long, unsigned long); + +void handle_trap(struct kvm_cpu_context *host_ctxt) +{ + u64 esr = read_sysreg_el2(SYS_ESR); + hypcall_fn_t func; + unsigned long ret; + + if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64) + hyp_panic(); + + /* + * __kvm_call_hyp takes a pointer in the host address space and + * up to three arguments. + */ + func = (hypcall_fn_t)kern_hyp_va(host_ctxt->regs.regs[0]); + ret = func(host_ctxt->regs.regs[1], + host_ctxt->regs.regs[2], + host_ctxt->regs.regs[3]); + host_ctxt->regs.regs[0] = ret; +} From a2e102e20fd69ce26ea9fe7bd0df26d25dafff59 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:37 +0100 Subject: [PATCH 069/497] KVM: arm64: nVHE: Handle hyp panics Restore the host context when panicking from hyp to give the best chance of the panic being clean. The host requires that registers be preserved such as x18 for the shadow callstack. If the panic is caused by an exception from EL1, the host context is still valid so the panic can return straight back to the host. If the panic comes from EL2 then it's most likely that the hyp context is active and the host context needs to be restored. There are windows before and after the host context is saved and restored that restoration is attempted incorrectly and the panic won't be clean. 
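A minimal sketch of that decision in plain C follows; the names are invented, and restore_host corresponds to the new first argument of __hyp_do_panic() in the diff below:

        #include <stdbool.h>
        #include <stdio.h>

        /* Illustrative model only, not the kernel code. */
        static void do_panic(bool restore_host)
        {
                if (restore_host)
                        printf("restoring the host context before calling panic()\n");
                else
                        printf("host context still loaded, panicking directly\n");
        }

        static void panic_from_exception(bool taken_from_el1)
        {
                /*
                 * An exception taken from EL1 means hyp had not yet switched away
                 * from the host context, so nothing needs to be restored. An
                 * exception from EL2 most likely interrupted hyp with its own
                 * context loaded, so the host context must be put back first.
                 */
                do_panic(/* restore_host = */ !taken_from_el1);
        }

        int main(void)
        {
                panic_from_exception(true);     /* e.g. an unexpected IRQ from the host */
                panic_from_exception(false);    /* e.g. a fault inside hyp itself */
                return 0;
        }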
Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-14-ascull@google.com --- arch/arm64/include/asm/kvm_hyp.h | 2 +- arch/arm64/kvm/hyp/nvhe/host.S | 76 ++++++++++++++++++++++++-------- arch/arm64/kvm/hyp/nvhe/switch.c | 18 +++----- 3 files changed, 63 insertions(+), 33 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 0b525e05e5bf..6b664de5ec1f 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -94,7 +94,7 @@ u64 __guest_enter(struct kvm_vcpu *vcpu); void __noreturn hyp_panic(void); #ifdef __KVM_NVHE_HYPERVISOR__ -void __noreturn __hyp_do_panic(unsigned long, ...); +void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); #endif #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index d26e41773dc4..ff9a0f547b9f 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -47,6 +47,9 @@ SYM_FUNC_START(__host_exit) ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)] ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)] + + /* x0-7 are use for panic arguments */ +__host_enter_for_panic: ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)] ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)] ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)] @@ -57,18 +60,38 @@ SYM_FUNC_START(__host_exit) restore_callee_saved_regs x29 /* Do not touch any register after this! */ +__host_enter_without_restoring: eret sb SYM_FUNC_END(__host_exit) +/* + * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par); + */ SYM_FUNC_START(__hyp_do_panic) + /* Load the format arguments into x1-7 */ + mov x6, x3 + get_vcpu_ptr x7, x3 + + mrs x3, esr_el2 + mrs x4, far_el2 + mrs x5, hpfar_el2 + + /* Prepare and exit to the host's panic funciton. */ mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ PSR_MODE_EL1h) msr spsr_el2, lr ldr lr, =panic msr elr_el2, lr - eret - sb + + /* + * Set the panic format string and enter the host, conditionally + * restoring the host context. + */ + cmp x0, xzr + ldr x0, =__hyp_panic_string + b.eq __host_enter_without_restoring + b __host_enter_for_panic SYM_FUNC_END(__hyp_do_panic) .macro host_el1_sync_vect @@ -79,7 +102,7 @@ SYM_FUNC_END(__hyp_do_panic) lsr x0, x0, #ESR_ELx_EC_SHIFT cmp x0, #ESR_ELx_EC_HVC64 ldp x0, x1, [sp], #16 - b.ne hyp_panic + b.ne __host_exit /* Check for a stub HVC call */ cmp x0, #HVC_STUB_HCALL_NR @@ -105,16 +128,31 @@ SYM_FUNC_END(__hyp_do_panic) .endif .endm -.macro invalid_host_vect +.macro invalid_host_el2_vect .align 7 /* If a guest is loaded, panic out of it. */ stp x0, x1, [sp, #-16]! get_loaded_vcpu x0, x1 cbnz x0, __guest_exit_panic add sp, sp, #16 + + /* + * The panic may not be clean if the exception is taken before the host + * context has been saved by __host_exit or after the hyp context has + * been partially clobbered by __host_enter. + */ b hyp_panic .endm +.macro invalid_host_el1_vect + .align 7 + mov x0, xzr /* restore_host = false */ + mrs x1, spsr_el2 + mrs x2, elr_el2 + mrs x3, par_el1 + b __hyp_do_panic +.endm + /* * The host vector does not use an ESB instruction in order to avoid consuming * SErrors that should only be consumed by the host. 
Guest entry is deferred by @@ -127,23 +165,23 @@ SYM_FUNC_END(__hyp_do_panic) */ .align 11 SYM_CODE_START(__kvm_hyp_host_vector) - invalid_host_vect // Synchronous EL2t - invalid_host_vect // IRQ EL2t - invalid_host_vect // FIQ EL2t - invalid_host_vect // Error EL2t + invalid_host_el2_vect // Synchronous EL2t + invalid_host_el2_vect // IRQ EL2t + invalid_host_el2_vect // FIQ EL2t + invalid_host_el2_vect // Error EL2t - invalid_host_vect // Synchronous EL2h - invalid_host_vect // IRQ EL2h - invalid_host_vect // FIQ EL2h - invalid_host_vect // Error EL2h + invalid_host_el2_vect // Synchronous EL2h + invalid_host_el2_vect // IRQ EL2h + invalid_host_el2_vect // FIQ EL2h + invalid_host_el2_vect // Error EL2h host_el1_sync_vect // Synchronous 64-bit EL1 - invalid_host_vect // IRQ 64-bit EL1 - invalid_host_vect // FIQ 64-bit EL1 - invalid_host_vect // Error 64-bit EL1 + invalid_host_el1_vect // IRQ 64-bit EL1 + invalid_host_el1_vect // FIQ 64-bit EL1 + invalid_host_el1_vect // Error 64-bit EL1 - invalid_host_vect // Synchronous 32-bit EL1 - invalid_host_vect // IRQ 32-bit EL1 - invalid_host_vect // FIQ 32-bit EL1 - invalid_host_vect // Error 32-bit EL1 + invalid_host_el1_vect // Synchronous 32-bit EL1 + invalid_host_el1_vect // IRQ 32-bit EL1 + invalid_host_el1_vect // FIQ 32-bit EL1 + invalid_host_el1_vect // Error 32-bit EL1 SYM_CODE_END(__kvm_hyp_host_vector) diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index c99945cda779..29febf9a93f2 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -242,6 +242,8 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) if (system_uses_irq_prio_masking()) gic_write_pmr(GIC_PRIO_IRQOFF); + host_ctxt->__hyp_running_vcpu = NULL; + return exit_code; } @@ -250,31 +252,21 @@ void __noreturn hyp_panic(void) u64 spsr = read_sysreg_el2(SYS_SPSR); u64 elr = read_sysreg_el2(SYS_ELR); u64 par = read_sysreg(par_el1); + bool restore_host = true; struct kvm_cpu_context *host_ctxt; struct kvm_vcpu *vcpu; - unsigned long str_va; host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; vcpu = host_ctxt->__hyp_running_vcpu; - if (read_sysreg(vttbr_el2)) { + if (vcpu) { __timer_disable_traps(vcpu); __deactivate_traps(vcpu); __load_host_stage2(); __sysreg_restore_state_nvhe(host_ctxt); } - /* - * Force the panic string to be loaded from the literal pool, - * making sure it is a kernel address and not a PC-relative - * reference. - */ - asm volatile("ldr %0, =%1" : "=r" (str_va) : "S" (__hyp_panic_string)); - - __hyp_do_panic(str_va, - spsr, elr, - read_sysreg(esr_el2), read_sysreg_el2(SYS_FAR), - read_sysreg(hpfar_el2), par, vcpu); + __hyp_do_panic(restore_host, spsr, elr, par); unreachable(); } From 5dc33bd199ca3b9f53c28144fa9a20b43c908972 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:38 +0100 Subject: [PATCH 070/497] KVM: arm64: nVHE: Pass pointers consistently to hyp-init Rather than some being kernel pointer and others being hyp pointers, standardize on all pointers being hyp pointers. 
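The conversion the host now performs up front is kern_hyp_va(), which rewrites a kernel virtual address into the address at which the same memory is mapped at EL2. A toy, userspace-only model of that kind of translation is sketched below; the mask and tag values are purely invented (the real implementation patches them in at boot and may randomize the tag):

        #include <stdint.h>
        #include <stdio.h>

        #define HYP_VA_MASK     0x0000ffffffffffffULL   /* keep the low bits of the kernel VA */
        #define HYP_VA_TAG      0x0000800000000000ULL   /* invented tag marking the hyp mapping */

        static uint64_t kern_to_hyp_va(uint64_t kern_va)
        {
                return (kern_va & HYP_VA_MASK) | HYP_VA_TAG;
        }

        int main(void)
        {
                uint64_t stack_kern_va = 0xffff00001234f000ULL;
                uint64_t stack_hyp_va  = kern_to_hyp_va(stack_kern_va);

                /*
                 * With the change described above, it is the translated value that
                 * gets handed to the EL2 init code, so hyp never converts it itself.
                 */
                printf("kernel VA %#llx -> hyp VA %#llx\n",
                       (unsigned long long)stack_kern_va,
                       (unsigned long long)stack_hyp_va);
                return 0;
        }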
Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-15-ascull@google.com --- arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/hyp/nvhe/hyp-init.S | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ae4b34f91e94..6b7180072c8d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1278,6 +1278,7 @@ static void cpu_init_hyp_mode(void) pgd_ptr = kvm_mmu_get_httbr(); hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE; + hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr); vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector)); /* diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index d9434e90c06d..abe885e26fe2 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -114,7 +114,6 @@ alternative_else_nop_endif isb /* Set the stack and new vectors */ - kern_hyp_va x1 mov sp, x1 msr vbar_el2, x2 From cf6501689012ef346cc835a38cbebb1e398d0834 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:39 +0100 Subject: [PATCH 071/497] smccc: Define vendor hyp owned service call region Vendor specific hypervisor services have their own region of function identifiers reserved by SMCCC. Extend the list of owners to include this case. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Cc: Sudeep Holla Link: https://lore.kernel.org/r/20200915104643.2543892-16-ascull@google.com --- include/linux/arm-smccc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 15c706fb0a37..ee286f5de239 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -49,6 +49,7 @@ #define ARM_SMCCC_OWNER_OEM 3 #define ARM_SMCCC_OWNER_STANDARD 4 #define ARM_SMCCC_OWNER_STANDARD_HYP 5 +#define ARM_SMCCC_OWNER_VENDOR_HYP 6 #define ARM_SMCCC_OWNER_TRUSTED_APP 48 #define ARM_SMCCC_OWNER_TRUSTED_APP_END 49 #define ARM_SMCCC_OWNER_TRUSTED_OS 50 From 0794a974d74dc777a212a3c58dd236f507360348 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:40 +0100 Subject: [PATCH 072/497] smccc: Use separate variables for args and results Using the same register-bound variable for both arguments and results means these values share a type. That type must allow the arguments to be assigned to it and must also be assignable to the unsigned long fields of struct arm_smccc_res. This restriction on types causes compiler warnings when the argument cannot be implicitly assigned to an unsigned long, for example the pointers that are used in the KVM hyp interface. By separating the arguments and results into their own variables, the type constraint is lifted allowing the arguments to avoid the need for any type conversion. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Cc: Sudeep Holla Link: https://lore.kernel.org/r/20200915104643.2543892-17-ascull@google.com --- include/linux/arm-smccc.h | 73 ++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 44 deletions(-) diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index ee286f5de239..885c9ffc835c 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -228,87 +228,67 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, #define __count_args(...) 
\ ___count_args(__VA_ARGS__, 7, 6, 5, 4, 3, 2, 1, 0) -#define __constraint_write_0 \ - "+r" (r0), "=&r" (r1), "=&r" (r2), "=&r" (r3) -#define __constraint_write_1 \ - "+r" (r0), "+r" (r1), "=&r" (r2), "=&r" (r3) -#define __constraint_write_2 \ - "+r" (r0), "+r" (r1), "+r" (r2), "=&r" (r3) -#define __constraint_write_3 \ - "+r" (r0), "+r" (r1), "+r" (r2), "+r" (r3) -#define __constraint_write_4 __constraint_write_3 -#define __constraint_write_5 __constraint_write_4 -#define __constraint_write_6 __constraint_write_5 -#define __constraint_write_7 __constraint_write_6 - -#define __constraint_read_0 -#define __constraint_read_1 -#define __constraint_read_2 -#define __constraint_read_3 -#define __constraint_read_4 "r" (r4) -#define __constraint_read_5 __constraint_read_4, "r" (r5) -#define __constraint_read_6 __constraint_read_5, "r" (r6) -#define __constraint_read_7 __constraint_read_6, "r" (r7) +#define __constraint_read_0 "r" (arg0) +#define __constraint_read_1 __constraint_read_0, "r" (arg1) +#define __constraint_read_2 __constraint_read_1, "r" (arg2) +#define __constraint_read_3 __constraint_read_2, "r" (arg3) +#define __constraint_read_4 __constraint_read_3, "r" (arg4) +#define __constraint_read_5 __constraint_read_4, "r" (arg5) +#define __constraint_read_6 __constraint_read_5, "r" (arg6) +#define __constraint_read_7 __constraint_read_6, "r" (arg7) #define __declare_arg_0(a0, res) \ struct arm_smccc_res *___res = res; \ - register unsigned long r0 asm("r0") = (u32)a0; \ - register unsigned long r1 asm("r1"); \ - register unsigned long r2 asm("r2"); \ - register unsigned long r3 asm("r3") + register unsigned long arg0 asm("r0") = (u32)a0 #define __declare_arg_1(a0, a1, res) \ typeof(a1) __a1 = a1; \ struct arm_smccc_res *___res = res; \ - register unsigned long r0 asm("r0") = (u32)a0; \ - register unsigned long r1 asm("r1") = __a1; \ - register unsigned long r2 asm("r2"); \ - register unsigned long r3 asm("r3") + register unsigned long arg0 asm("r0") = (u32)a0; \ + register typeof(a1) arg1 asm("r1") = __a1 #define __declare_arg_2(a0, a1, a2, res) \ typeof(a1) __a1 = a1; \ typeof(a2) __a2 = a2; \ struct arm_smccc_res *___res = res; \ - register unsigned long r0 asm("r0") = (u32)a0; \ - register unsigned long r1 asm("r1") = __a1; \ - register unsigned long r2 asm("r2") = __a2; \ - register unsigned long r3 asm("r3") + register unsigned long arg0 asm("r0") = (u32)a0; \ + register typeof(a1) arg1 asm("r1") = __a1; \ + register typeof(a2) arg2 asm("r2") = __a2 #define __declare_arg_3(a0, a1, a2, a3, res) \ typeof(a1) __a1 = a1; \ typeof(a2) __a2 = a2; \ typeof(a3) __a3 = a3; \ struct arm_smccc_res *___res = res; \ - register unsigned long r0 asm("r0") = (u32)a0; \ - register unsigned long r1 asm("r1") = __a1; \ - register unsigned long r2 asm("r2") = __a2; \ - register unsigned long r3 asm("r3") = __a3 + register unsigned long arg0 asm("r0") = (u32)a0; \ + register typeof(a1) arg1 asm("r1") = __a1; \ + register typeof(a2) arg2 asm("r2") = __a2; \ + register typeof(a3) arg3 asm("r3") = __a3 #define __declare_arg_4(a0, a1, a2, a3, a4, res) \ typeof(a4) __a4 = a4; \ __declare_arg_3(a0, a1, a2, a3, res); \ - register unsigned long r4 asm("r4") = __a4 + register typeof(a4) arg4 asm("r4") = __a4 #define __declare_arg_5(a0, a1, a2, a3, a4, a5, res) \ typeof(a5) __a5 = a5; \ __declare_arg_4(a0, a1, a2, a3, a4, res); \ - register unsigned long r5 asm("r5") = __a5 + register typeof(a5) arg5 asm("r5") = __a5 #define __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res) \ typeof(a6) __a6 = a6; \ 
__declare_arg_5(a0, a1, a2, a3, a4, a5, res); \ - register unsigned long r6 asm("r6") = __a6 + register typeof(a6) arg6 asm("r6") = __a6 #define __declare_arg_7(a0, a1, a2, a3, a4, a5, a6, a7, res) \ typeof(a7) __a7 = a7; \ __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res); \ - register unsigned long r7 asm("r7") = __a7 + register typeof(a7) arg7 asm("r7") = __a7 #define ___declare_args(count, ...) __declare_arg_ ## count(__VA_ARGS__) #define __declare_args(count, ...) ___declare_args(count, __VA_ARGS__) #define ___constraints(count) \ - : __constraint_write_ ## count \ : __constraint_read_ ## count \ : "memory" #define __constraints(count) ___constraints(count) @@ -320,8 +300,13 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, */ #define __arm_smccc_1_1(inst, ...) \ do { \ + register unsigned long r0 asm("r0"); \ + register unsigned long r1 asm("r1"); \ + register unsigned long r2 asm("r2"); \ + register unsigned long r3 asm("r3"); \ __declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \ - asm volatile(inst "\n" \ + asm volatile(inst "\n" : \ + "=r" (r0), "=r" (r1), "=r" (r2), "=r" (r3) \ __constraints(__count_args(__VA_ARGS__))); \ if (___res) \ *___res = (typeof(*___res)){r0, r1, r2, r3}; \ @@ -367,7 +352,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, #define __fail_smccc_1_1(...) \ do { \ __declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \ - asm ("" __constraints(__count_args(__VA_ARGS__))); \ + asm ("" : __constraints(__count_args(__VA_ARGS__))); \ if (___res) \ ___res->a0 = SMCCC_RET_NOT_SUPPORTED; \ } while (0) From 054698316d87a13e4c0dc8f5ecdd77b12e5eb676 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:41 +0100 Subject: [PATCH 073/497] KVM: arm64: nVHE: Migrate hyp interface to SMCCC Rather than passing arbitrary function pointers to run at hyp, define and equivalent set of SMCCC functions. Since the SMCCC functions are strongly tied to the original function prototypes, it is not expected for the host to ever call an invalid ID but a warning is raised if this does ever occur. As __kvm_vcpu_run is used for every switch between the host and a guest, it is explicitly singled out to be identified before the other function IDs to improve the performance of the hot path. 
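The hyp side of this interface boils down to reading the function ID out of the saved host context and switching on it, as handle_host_hcall() does in the diff below. The following self-contained sketch shows the same shape; the ID values, context layout and return codes here are invented for the example and are not the kernel's:

        #include <stdio.h>

        #define RET_SUCCESS             0
        #define RET_NOT_SUPPORTED       (-1L)

        enum host_func {
                FUNC_VCPU_RUN = 1,              /* hot path, checked first */
                FUNC_FLUSH_VM_CONTEXT = 2,
                FUNC_GET_MDCR_EL2 = 12,
        };

        struct cpu_context {
                unsigned long regs[31];
        };

        static unsigned long vcpu_run(void *vcpu)       { (void)vcpu; return 0; /* exit code */ }
        static void flush_vm_context(void)              { }
        static unsigned long get_mdcr_el2(void)         { return 0x6; /* made-up value */ }

        static void handle_host_hcall(unsigned long func_id, struct cpu_context *host_ctxt)
        {
                unsigned long ret = 0;

                switch (func_id) {
                case FUNC_VCPU_RUN:
                        ret = vcpu_run((void *)host_ctxt->regs[1]);
                        break;
                case FUNC_FLUSH_VM_CONTEXT:
                        flush_vm_context();
                        break;
                case FUNC_GET_MDCR_EL2:
                        ret = get_mdcr_el2();
                        break;
                default:
                        host_ctxt->regs[0] = RET_NOT_SUPPORTED;
                        return;
                }

                host_ctxt->regs[0] = RET_SUCCESS;       /* status seen by the caller */
                host_ctxt->regs[1] = ret;               /* function result, if any */
        }

        int main(void)
        {
                struct cpu_context ctxt = { .regs = { FUNC_GET_MDCR_EL2 } };

                handle_host_hcall(ctxt.regs[0], &ctxt);
                printf("status=%ld result=%#lx\n", (long)ctxt.regs[0], ctxt.regs[1]);
                return 0;
        }

On the host side, the reworked kvm_call_hyp_nvhe() wrapper in the diff issues the HVC with the function ID as the first argument and warns if the returned status is not SMCCC_RET_SUCCESS.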
Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-18-ascull@google.com --- arch/arm64/include/asm/kvm_asm.h | 24 +++++++ arch/arm64/include/asm/kvm_host.h | 25 ++++--- arch/arm64/kvm/arm.c | 2 +- arch/arm64/kvm/hyp.S | 24 ++----- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 107 +++++++++++++++++++++++++---- 5 files changed, 139 insertions(+), 43 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 4e3073858346..3e4577013d33 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -38,6 +38,30 @@ #define __SMCCC_WORKAROUND_1_SMC_SZ 36 +#define KVM_HOST_SMCCC_ID(id) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + (id)) + +#define KVM_HOST_SMCCC_FUNC(name) KVM_HOST_SMCCC_ID(__KVM_HOST_SMCCC_FUNC_##name) + +#define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init 0 +#define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run 1 +#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2 +#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3 +#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4 +#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid 5 +#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6 +#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2 8 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11 +#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13 +#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14 + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 16adbefde1cc..82c941cf8890 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -11,6 +11,7 @@ #ifndef __ARM64_KVM_HOST_H__ #define __ARM64_KVM_HOST_H__ +#include #include #include #include @@ -479,18 +480,20 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); -u64 __kvm_call_hyp(void *hypfn, ...); +u64 __kvm_call_hyp_init(phys_addr_t pgd_ptr, + unsigned long hyp_stack_ptr, + unsigned long vector_ptr, + unsigned long tpidr_el2); -#define kvm_call_hyp_nvhe(f, ...) \ - do { \ - DECLARE_KVM_NVHE_SYM(f); \ - __kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \ - } while(0) - -#define kvm_call_hyp_nvhe_ret(f, ...) \ +#define kvm_call_hyp_nvhe(f, ...) \ ({ \ - DECLARE_KVM_NVHE_SYM(f); \ - __kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__); \ + struct arm_smccc_res res; \ + \ + arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f), \ + ##__VA_ARGS__, &res); \ + WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \ + \ + res.a1; \ }) /* @@ -516,7 +519,7 @@ u64 __kvm_call_hyp(void *hypfn, ...); ret = f(__VA_ARGS__); \ isb(); \ } else { \ - ret = kvm_call_hyp_nvhe_ret(f, ##__VA_ARGS__); \ + ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \ } \ \ ret; \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 6b7180072c8d..49aa08bd26de 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1288,7 +1288,7 @@ static void cpu_init_hyp_mode(void) * cpus_have_const_cap() wrapper. 
*/ BUG_ON(!system_capabilities_finalized()); - __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2); + __kvm_call_hyp_init(pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2); /* * Disabling SSBD on a non-VHE system requires us to enable SSBS diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 3c79a1124af2..12aa426f7559 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -11,24 +11,12 @@ #include /* - * u64 __kvm_call_hyp(void *hypfn, ...); - * - * This is not really a variadic function in the classic C-way and care must - * be taken when calling this to ensure parameters are passed in registers - * only, since the stack will change between the caller and the callee. - * - * Call the function with the first argument containing a pointer to the - * function you wish to call in Hyp mode, and subsequent arguments will be - * passed as x0, x1, and x2 (a maximum of 3 arguments in addition to the - * function pointer can be passed). The function being called must be mapped - * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are - * passed in x0. - * - * A function pointer with a value less than 0xfff has a special meaning, - * and is used to implement hyp stubs in the same way as in - * arch/arm64/kernel/hyp_stub.S. + * u64 __kvm_call_hyp_init(phys_addr_t pgd_ptr, + * unsigned long hyp_stack_ptr, + * unsigned long vector_ptr, + * unsigned long tpidr_el2); */ -SYM_FUNC_START(__kvm_call_hyp) +SYM_FUNC_START(__kvm_call_hyp_init) hvc #0 ret -SYM_FUNC_END(__kvm_call_hyp) +SYM_FUNC_END(__kvm_call_hyp_init) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 570c3896f42e..41aaf038599a 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -12,25 +12,106 @@ #include #include -typedef unsigned long (*hypcall_fn_t) - (unsigned long, unsigned long, unsigned long); +#include + +static void handle_host_hcall(unsigned long func_id, + struct kvm_cpu_context *host_ctxt) +{ + unsigned long ret = 0; + + switch (func_id) { + case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1; + + ret = __kvm_vcpu_run(vcpu); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context): + __kvm_flush_vm_context(); + break; + case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid_ipa): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; + phys_addr_t ipa = host_ctxt->regs.regs[2]; + int level = host_ctxt->regs.regs[3]; + + __kvm_tlb_flush_vmid_ipa(mmu, ipa, level); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; + + __kvm_tlb_flush_vmid(mmu); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; + + __kvm_tlb_flush_local_vmid(mmu); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): { + u64 cntvoff = host_ctxt->regs.regs[1]; + + __kvm_timer_set_cntvoff(cntvoff); + break; + } + case KVM_HOST_SMCCC_FUNC(__kvm_enable_ssbs): + __kvm_enable_ssbs(); + break; + case KVM_HOST_SMCCC_FUNC(__vgic_v3_get_ich_vtr_el2): + ret = __vgic_v3_get_ich_vtr_el2(); + break; + case KVM_HOST_SMCCC_FUNC(__vgic_v3_read_vmcr): + ret = __vgic_v3_read_vmcr(); + break; + case KVM_HOST_SMCCC_FUNC(__vgic_v3_write_vmcr): { + u32 vmcr = host_ctxt->regs.regs[1]; + + 
__vgic_v3_write_vmcr(vmcr); + break; + } + case KVM_HOST_SMCCC_FUNC(__vgic_v3_init_lrs): + __vgic_v3_init_lrs(); + break; + case KVM_HOST_SMCCC_FUNC(__kvm_get_mdcr_el2): + ret = __kvm_get_mdcr_el2(); + break; + case KVM_HOST_SMCCC_FUNC(__vgic_v3_save_aprs): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1; + + __vgic_v3_save_aprs(cpu_if); + break; + } + case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): { + unsigned long r1 = host_ctxt->regs.regs[1]; + struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1; + + __vgic_v3_restore_aprs(cpu_if); + break; + } + default: + /* Invalid host HVC. */ + host_ctxt->regs.regs[0] = SMCCC_RET_NOT_SUPPORTED; + return; + } + + host_ctxt->regs.regs[0] = SMCCC_RET_SUCCESS; + host_ctxt->regs.regs[1] = ret; +} void handle_trap(struct kvm_cpu_context *host_ctxt) { u64 esr = read_sysreg_el2(SYS_ESR); - hypcall_fn_t func; - unsigned long ret; + unsigned long func_id; if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64) hyp_panic(); - /* - * __kvm_call_hyp takes a pointer in the host address space and - * up to three arguments. - */ - func = (hypcall_fn_t)kern_hyp_va(host_ctxt->regs.regs[0]); - ret = func(host_ctxt->regs.regs[1], - host_ctxt->regs.regs[2], - host_ctxt->regs.regs[3]); - host_ctxt->regs.regs[0] = ret; + func_id = host_ctxt->regs.regs[0]; + handle_host_hcall(func_id, host_ctxt); } From 04e4caa8d355d19fd7d26734bd2b9e3b563bb22a Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:42 +0100 Subject: [PATCH 074/497] KVM: arm64: nVHE: Migrate hyp-init to SMCCC To complete the transition to SMCCC, the hyp initialization is given a function ID. This looks neater than comparing the hyp stub function IDs to the page table physical address. Some care is taken to only clobber x0-3 before the host context is saved as only those registers can be clobbered accoring to SMCCC. Fortunately, only a few acrobatics are needed. The possible new tpidr_el2 is moved to the argument in x2 so that it can be stashed in tpidr_el2 early to free up a scratch register. The page table configuration then makes use of x0-2. Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-19-ascull@google.com --- arch/arm64/include/asm/kvm_host.h | 5 --- arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/arm.c | 5 ++- arch/arm64/kvm/hyp.S | 22 ---------- arch/arm64/kvm/hyp/nvhe/hyp-init.S | 70 +++++++++++++++++------------- 5 files changed, 45 insertions(+), 59 deletions(-) delete mode 100644 arch/arm64/kvm/hyp.S diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 82c941cf8890..ef0325c42ca0 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -480,11 +480,6 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); -u64 __kvm_call_hyp_init(phys_addr_t pgd_ptr, - unsigned long hyp_stack_ptr, - unsigned long vector_ptr, - unsigned long tpidr_el2); - #define kvm_call_hyp_nvhe(f, ...) 
\ ({ \ struct arm_smccc_res res; \ diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 99977c1972cc..1504c81fbf5d 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ $(KVM)/vfio.o $(KVM)/irqchip.o \ arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \ - inject_fault.o regmap.o va_layout.o hyp.o handle_exit.o \ + inject_fault.o regmap.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o \ vgic-sys-reg-v3.o fpsimd.o pmu.o \ aarch32.o arch_timer.o \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 49aa08bd26de..c074d9824d54 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1264,6 +1264,7 @@ static void cpu_init_hyp_mode(void) unsigned long hyp_stack_ptr; unsigned long vector_ptr; unsigned long tpidr_el2; + struct arm_smccc_res res; /* Switch from the HYP stub to our own HYP init vector */ __hyp_set_vectors(kvm_get_idmap_vector()); @@ -1288,7 +1289,9 @@ static void cpu_init_hyp_mode(void) * cpus_have_const_cap() wrapper. */ BUG_ON(!system_capabilities_finalized()); - __kvm_call_hyp_init(pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2); + arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), + pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res); + WARN_ON(res.a0 != SMCCC_RET_SUCCESS); /* * Disabling SSBD on a non-VHE system requires us to enable SSBS diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S deleted file mode 100644 index 12aa426f7559..000000000000 --- a/arch/arm64/kvm/hyp.S +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012,2013 - ARM Ltd - * Author: Marc Zyngier - */ - -#include - -#include -#include -#include - -/* - * u64 __kvm_call_hyp_init(phys_addr_t pgd_ptr, - * unsigned long hyp_stack_ptr, - * unsigned long vector_ptr, - * unsigned long tpidr_el2); - */ -SYM_FUNC_START(__kvm_call_hyp_init) - hvc #0 - ret -SYM_FUNC_END(__kvm_call_hyp_init) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index abe885e26fe2..47224dc62c51 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -4,11 +4,13 @@ * Author: Marc Zyngier */ +#include #include #include #include #include +#include #include #include #include @@ -44,27 +46,37 @@ __invalid: b . /* - * x0: HYP pgd - * x1: HYP stack - * x2: HYP vectors - * x3: per-CPU offset + * x0: SMCCC function ID + * x1: HYP pgd + * x2: per-CPU offset + * x3: HYP stack + * x4: HYP vectors */ __do_hyp_init: /* Check for a stub HVC call */ cmp x0, #HVC_STUB_HCALL_NR b.lo __kvm_handle_stub_hvc - phys_to_ttbr x4, x0 -alternative_if ARM64_HAS_CNP - orr x4, x4, #TTBR_CNP_BIT -alternative_else_nop_endif - msr ttbr0_el2, x4 + /* Set tpidr_el2 for use by HYP to free a register */ + msr tpidr_el2, x2 - mrs x4, tcr_el1 - mov_q x5, TCR_EL2_MASK - and x4, x4, x5 - mov x5, #TCR_EL2_RES1 - orr x4, x4, x5 + mov x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) + cmp x0, x2 + b.eq 1f + mov x0, #SMCCC_RET_NOT_SUPPORTED + eret + +1: phys_to_ttbr x0, x1 +alternative_if ARM64_HAS_CNP + orr x0, x0, #TTBR_CNP_BIT +alternative_else_nop_endif + msr ttbr0_el2, x0 + + mrs x0, tcr_el1 + mov_q x1, TCR_EL2_MASK + and x0, x0, x1 + mov x1, #TCR_EL2_RES1 + orr x0, x0, x1 /* * The ID map may be configured to use an extended virtual address @@ -80,18 +92,18 @@ alternative_else_nop_endif * * So use the same T0SZ value we use for the ID map. 
*/ - ldr_l x5, idmap_t0sz - bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH + ldr_l x1, idmap_t0sz + bfi x0, x1, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH /* * Set the PS bits in TCR_EL2. */ - tcr_compute_pa_size x4, #TCR_EL2_PS_SHIFT, x5, x6 + tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2 - msr tcr_el2, x4 + msr tcr_el2, x0 - mrs x4, mair_el1 - msr mair_el2, x4 + mrs x0, mair_el1 + msr mair_el2, x0 isb /* Invalidate the stale TLBs from Bootloader */ @@ -103,24 +115,22 @@ alternative_else_nop_endif * as well as the EE bit on BE. Drop the A flag since the compiler * is allowed to generate unaligned accesses. */ - mov_q x4, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A)) -CPU_BE( orr x4, x4, #SCTLR_ELx_EE) + mov_q x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A)) +CPU_BE( orr x0, x0, #SCTLR_ELx_EE) alternative_if ARM64_HAS_ADDRESS_AUTH - mov_q x5, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ + mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) - orr x4, x4, x5 + orr x0, x0, x1 alternative_else_nop_endif - msr sctlr_el2, x4 + msr sctlr_el2, x0 isb /* Set the stack and new vectors */ - mov sp, x1 - msr vbar_el2, x2 - - /* Set tpidr_el2 for use by HYP */ - msr tpidr_el2, x3 + mov sp, x3 + msr vbar_el2, x4 /* Hello, World! */ + mov x0, #SMCCC_RET_SUCCESS eret SYM_CODE_END(__kvm_hyp_init) From a071261d93184f43b7dede7516b6c449c4ed5770 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Tue, 15 Sep 2020 11:46:43 +0100 Subject: [PATCH 075/497] KVM: arm64: nVHE: Fix pointers during SMCCC convertion The host need not concern itself with the pointer differences for the hyp interfaces that are shared between VHE and nVHE so leave it to the hyp to handle. As the SMCCC function IDs are converted into function calls, it is a suitable place to also convert any pointer arguments into hyp pointers. This, additionally, eases the reuse of the handlers in different contexts. 
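Concretely, the translation is hoisted out of the shared handlers and into the nVHE dispatcher. A condensed sketch of the resulting pattern (the complete change is in the diff below):

  case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): {
          struct kvm_vcpu *vcpu = (struct kvm_vcpu *)host_ctxt->regs.regs[1];

          ret = __kvm_vcpu_run(kern_hyp_va(vcpu));
          break;
  }

The host-to-hyp address conversion happens exactly once, at the SMCCC boundary, so __kvm_vcpu_run() and the TLB/vgic helpers can be reused with already-translated pointers in other contexts.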
Signed-off-by: Andrew Scull Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915104643.2543892-20-ascull@google.com --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 12 ++++++------ arch/arm64/kvm/hyp/nvhe/switch.c | 2 -- arch/arm64/kvm/hyp/nvhe/tlb.c | 2 -- arch/arm64/kvm/vgic/vgic-v3.c | 4 ++-- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 41aaf038599a..e2eafe2c93af 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -24,7 +24,7 @@ static void handle_host_hcall(unsigned long func_id, unsigned long r1 = host_ctxt->regs.regs[1]; struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1; - ret = __kvm_vcpu_run(vcpu); + ret = __kvm_vcpu_run(kern_hyp_va(vcpu)); break; } case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context): @@ -36,21 +36,21 @@ static void handle_host_hcall(unsigned long func_id, phys_addr_t ipa = host_ctxt->regs.regs[2]; int level = host_ctxt->regs.regs[3]; - __kvm_tlb_flush_vmid_ipa(mmu, ipa, level); + __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level); break; } case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): { unsigned long r1 = host_ctxt->regs.regs[1]; struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; - __kvm_tlb_flush_vmid(mmu); + __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); break; } case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): { unsigned long r1 = host_ctxt->regs.regs[1]; struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1; - __kvm_tlb_flush_local_vmid(mmu); + __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu)); break; } case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): { @@ -84,14 +84,14 @@ static void handle_host_hcall(unsigned long func_id, unsigned long r1 = host_ctxt->regs.regs[1]; struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1; - __vgic_v3_save_aprs(cpu_if); + __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); break; } case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): { unsigned long r1 = host_ctxt->regs.regs[1]; struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1; - __vgic_v3_restore_aprs(cpu_if); + __vgic_v3_restore_aprs(kern_hyp_va(cpu_if)); break; } default: diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 29febf9a93f2..a29f247f35e3 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -176,8 +176,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) pmr_sync(); } - vcpu = kern_hyp_va(vcpu); - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 69eae608d670..544bca3072b7 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -54,7 +54,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ishst); /* Switch to requested VMID */ - mmu = kern_hyp_va(mmu); __tlb_switch_to_guest(mmu, &cxt); /* @@ -108,7 +107,6 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) dsb(ishst); /* Switch to requested VMID */ - mmu = kern_hyp_va(mmu); __tlb_switch_to_guest(mmu, &cxt); __tlbi(vmalls12e1is); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 76e2d85789ed..9cdf39a94a63 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -662,7 +662,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) if (likely(cpu_if->vgic_sre)) kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); - kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if)); + 
kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); @@ -686,7 +686,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) vgic_v3_vmcr_sync(vcpu); - kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if)); + kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); if (has_vhe()) __vgic_v3_deactivate_traps(cpu_if); From 93f634069707cfe562c38739f5062feccbe9a834 Mon Sep 17 00:00:00 2001 From: Johnny Chuang Date: Wed, 16 Sep 2020 10:26:57 -0700 Subject: [PATCH 076/497] Input: elants_i2c - fix typo for an attribute to show calibration count Fixed typo for command from 0xE0 to 0xD0. Fixes: cf520c643012 ("Input: elants_i2c - provide an attribute to show calibration count") Signed-off-by: Johnny Chuang Link: https://lore.kernel.org/r/1600238783-32303-1-git-send-email-johnny.chuang.emc@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/elants_i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/elants_i2c.c b/drivers/input/touchscreen/elants_i2c.c index c8d7bdd34c7a..50c348297e38 100644 --- a/drivers/input/touchscreen/elants_i2c.c +++ b/drivers/input/touchscreen/elants_i2c.c @@ -90,7 +90,7 @@ /* FW read command, 0x53 0x?? 0x0, 0x01 */ #define E_ELAN_INFO_FW_VER 0x00 #define E_ELAN_INFO_BC_VER 0x10 -#define E_ELAN_INFO_REK 0xE0 +#define E_ELAN_INFO_REK 0xD0 #define E_ELAN_INFO_TEST_VER 0xE0 #define E_ELAN_INFO_FW_ID 0xF0 #define E_INFO_OSR 0xD6 From 30df23c5ecdfb8da5b0bc17ceef67eff9e1b0957 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 14 Sep 2020 10:17:01 -0700 Subject: [PATCH 077/497] Input: imx6ul_tsc - clean up some errors in imx6ul_tsc_resume() If imx6ul_tsc_init() fails then we need to clean up the clocks. I reversed the "if (input_dev->users) {" condition to make the code a bit simpler. Fixes: 6cc527b05847 ("Input: imx6ul_tsc - propagate the errors") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20200905124942.GC183976@mwanda Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/imx6ul_tsc.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/input/touchscreen/imx6ul_tsc.c b/drivers/input/touchscreen/imx6ul_tsc.c index 9ed258854349..5e6ba5c4eca2 100644 --- a/drivers/input/touchscreen/imx6ul_tsc.c +++ b/drivers/input/touchscreen/imx6ul_tsc.c @@ -530,20 +530,25 @@ static int __maybe_unused imx6ul_tsc_resume(struct device *dev) mutex_lock(&input_dev->mutex); - if (input_dev->users) { - retval = clk_prepare_enable(tsc->adc_clk); - if (retval) - goto out; + if (!input_dev->users) + goto out; - retval = clk_prepare_enable(tsc->tsc_clk); - if (retval) { - clk_disable_unprepare(tsc->adc_clk); - goto out; - } + retval = clk_prepare_enable(tsc->adc_clk); + if (retval) + goto out; - retval = imx6ul_tsc_init(tsc); + retval = clk_prepare_enable(tsc->tsc_clk); + if (retval) { + clk_disable_unprepare(tsc->adc_clk); + goto out; } + retval = imx6ul_tsc_init(tsc); + if (retval) { + clk_disable_unprepare(tsc->tsc_clk); + clk_disable_unprepare(tsc->adc_clk); + goto out; + } out: mutex_unlock(&input_dev->mutex); return retval; From 925145f9e9eee8ac3b7d9eb53e0a9585ca671e1e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 14 Sep 2020 10:31:07 -0700 Subject: [PATCH 078/497] Input: imx6ul_tsc - unify open/close and PM paths Open/close and resume/suspend paths are very similar, let's factor out common parts. 
Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/imx6ul_tsc.c | 52 +++++++++++--------------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/drivers/input/touchscreen/imx6ul_tsc.c b/drivers/input/touchscreen/imx6ul_tsc.c index 5e6ba5c4eca2..cd369f9ac5e6 100644 --- a/drivers/input/touchscreen/imx6ul_tsc.c +++ b/drivers/input/touchscreen/imx6ul_tsc.c @@ -315,9 +315,8 @@ static irqreturn_t adc_irq_fn(int irq, void *dev_id) return IRQ_HANDLED; } -static int imx6ul_tsc_open(struct input_dev *input_dev) +static int imx6ul_tsc_start(struct imx6ul_tsc *tsc) { - struct imx6ul_tsc *tsc = input_get_drvdata(input_dev); int err; err = clk_prepare_enable(tsc->adc_clk); @@ -349,16 +348,29 @@ disable_adc_clk: return err; } -static void imx6ul_tsc_close(struct input_dev *input_dev) +static void imx6ul_tsc_stop(struct imx6ul_tsc *tsc) { - struct imx6ul_tsc *tsc = input_get_drvdata(input_dev); - imx6ul_tsc_disable(tsc); clk_disable_unprepare(tsc->tsc_clk); clk_disable_unprepare(tsc->adc_clk); } + +static int imx6ul_tsc_open(struct input_dev *input_dev) +{ + struct imx6ul_tsc *tsc = input_get_drvdata(input_dev); + + return imx6ul_tsc_start(tsc); +} + +static void imx6ul_tsc_close(struct input_dev *input_dev) +{ + struct imx6ul_tsc *tsc = input_get_drvdata(input_dev); + + imx6ul_tsc_stop(tsc); +} + static int imx6ul_tsc_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; @@ -509,12 +521,8 @@ static int __maybe_unused imx6ul_tsc_suspend(struct device *dev) mutex_lock(&input_dev->mutex); - if (input_dev->users) { - imx6ul_tsc_disable(tsc); - - clk_disable_unprepare(tsc->tsc_clk); - clk_disable_unprepare(tsc->adc_clk); - } + if (input_dev->users) + imx6ul_tsc_stop(tsc); mutex_unlock(&input_dev->mutex); @@ -530,27 +538,11 @@ static int __maybe_unused imx6ul_tsc_resume(struct device *dev) mutex_lock(&input_dev->mutex); - if (!input_dev->users) - goto out; + if (input_dev->users) + retval = imx6ul_tsc_start(tsc); - retval = clk_prepare_enable(tsc->adc_clk); - if (retval) - goto out; - - retval = clk_prepare_enable(tsc->tsc_clk); - if (retval) { - clk_disable_unprepare(tsc->adc_clk); - goto out; - } - - retval = imx6ul_tsc_init(tsc); - if (retval) { - clk_disable_unprepare(tsc->tsc_clk); - clk_disable_unprepare(tsc->adc_clk); - goto out; - } -out: mutex_unlock(&input_dev->mutex); + return retval; } From d04afe14b23651e7a8bc89727a759e982a8458e4 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 16 Sep 2020 10:26:09 -0700 Subject: [PATCH 079/497] Input: stmfts - fix a & vs && typo In stmfts_sysfs_hover_enable_write(), we should check value and sdata->hover_enabled is all true. 
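The difference matters whenever the written value is not exactly 1. A stand-alone illustration with hypothetical values (not taken from the driver):

  #include <stdio.h>
  #include <stdbool.h>

  int main(void)
  {
          unsigned long value = 2;        /* e.g. user wrote "2" to the attribute */
          bool hover_enabled = true;      /* stored flag, promotes to 1 */

          if (value & hover_enabled)      /* bitwise: 2 & 1 == 0, branch NOT taken */
                  printf("bitwise &: taken\n");

          if (value && hover_enabled)     /* logical: both non-zero, branch IS taken */
                  printf("logical &&: taken\n");

          return 0;
  }

With '&' the test only passes when the two operands happen to share a set bit, which is not the same as "value and sdata->hover_enabled is all true".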
Fixes: 78bcac7b2ae1 ("Input: add support for the STMicroelectronics FingerTip touchscreen") Signed-off-by: YueHaibing Link: https://lore.kernel.org/r/20200916141941.16684-1-yuehaibing@huawei.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/stmfts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/stmfts.c b/drivers/input/touchscreen/stmfts.c index df946869d4cd..9a64e1dbc04a 100644 --- a/drivers/input/touchscreen/stmfts.c +++ b/drivers/input/touchscreen/stmfts.c @@ -479,7 +479,7 @@ static ssize_t stmfts_sysfs_hover_enable_write(struct device *dev, mutex_lock(&sdata->mutex); - if (value & sdata->hover_enabled) + if (value && sdata->hover_enabled) goto out; if (sdata->running) From 7d50f6656dacf085a00beeedbc48b19a37d17881 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 15 Sep 2020 17:51:05 -0700 Subject: [PATCH 080/497] Input: ep93xx_keypad - fix handling of platform_get_irq() error platform_get_irq() returns -ERRNO on error. In such case comparison to 0 would pass the check. Fixes: 60214f058f44 ("Input: ep93xx_keypad - update driver to new core support") Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200828145744.3636-1-krzk@kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/ep93xx_keypad.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/input/keyboard/ep93xx_keypad.c b/drivers/input/keyboard/ep93xx_keypad.c index 7c70492d9d6b..f831f01501d5 100644 --- a/drivers/input/keyboard/ep93xx_keypad.c +++ b/drivers/input/keyboard/ep93xx_keypad.c @@ -250,8 +250,8 @@ static int ep93xx_keypad_probe(struct platform_device *pdev) } keypad->irq = platform_get_irq(pdev, 0); - if (!keypad->irq) { - err = -ENXIO; + if (keypad->irq < 0) { + err = keypad->irq; goto failed_free; } From 4738dd1992fa13acfbbd71800c71c612f466fa44 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 15 Sep 2020 17:52:15 -0700 Subject: [PATCH 081/497] Input: omap4-keypad - fix handling of platform_get_irq() error platform_get_irq() returns -ERRNO on error. In such case comparison to 0 would pass the check. Fixes: f3a1ba60dbdb ("Input: omap4-keypad - use platform device helpers") Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200828145744.3636-2-krzk@kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/omap4-keypad.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/input/keyboard/omap4-keypad.c b/drivers/input/keyboard/omap4-keypad.c index 94c94d7f5155..d6c924032aaa 100644 --- a/drivers/input/keyboard/omap4-keypad.c +++ b/drivers/input/keyboard/omap4-keypad.c @@ -240,10 +240,8 @@ static int omap4_keypad_probe(struct platform_device *pdev) } irq = platform_get_irq(pdev, 0); - if (!irq) { - dev_err(&pdev->dev, "no keyboard irq assigned\n"); - return -EINVAL; - } + if (irq < 0) + return irq; keypad_data = kzalloc(sizeof(struct omap4_keypad), GFP_KERNEL); if (!keypad_data) { From c277e1f0dc3c7d7b5b028e20dd414df241642036 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 15 Sep 2020 17:56:19 -0700 Subject: [PATCH 082/497] Input: twl4030_keypad - fix handling of platform_get_irq() error platform_get_irq() returns -ERRNO on error. In such case casting to unsigned and comparing to 0 would pass the check. 
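The intended pattern, common to the platform_get_irq() fixes in this series, is roughly the following sketch (the driver name and IRQ handler are placeholders, not code from these drivers): keep the result in a signed int and propagate the negative errno instead of testing against zero.

  #include <linux/interrupt.h>
  #include <linux/platform_device.h>

  static irqreturn_t foo_irq_handler(int irq, void *dev_id)
  {
          return IRQ_HANDLED;
  }

  static int foo_probe(struct platform_device *pdev)
  {
          int irq;

          irq = platform_get_irq(pdev, 0);
          if (irq < 0)
                  return irq;     /* propagates -EPROBE_DEFER, -ENXIO, ... */

          /*
           * Storing the value in an unsigned variable and checking "!irq"
           * would let a negative errno slip through as a "valid" IRQ number.
           */
          return devm_request_irq(&pdev->dev, irq, foo_irq_handler, 0,
                                  dev_name(&pdev->dev), NULL);
  }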
Fixes: 7abf38d6d13c ("Input: twl4030-keypad - add device tree support") Reported-by: kernel test robot Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200828145744.3636-3-krzk@kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/twl4030_keypad.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/input/keyboard/twl4030_keypad.c b/drivers/input/keyboard/twl4030_keypad.c index af3a6824f1a4..77e0743a3cf8 100644 --- a/drivers/input/keyboard/twl4030_keypad.c +++ b/drivers/input/keyboard/twl4030_keypad.c @@ -50,7 +50,7 @@ struct twl4030_keypad { bool autorepeat; unsigned int n_rows; unsigned int n_cols; - unsigned int irq; + int irq; struct device *dbg_dev; struct input_dev *input; @@ -376,10 +376,8 @@ static int twl4030_kp_probe(struct platform_device *pdev) } kp->irq = platform_get_irq(pdev, 0); - if (!kp->irq) { - dev_err(&pdev->dev, "no keyboard irq assigned\n"); - return -EINVAL; - } + if (kp->irq < 0) + return kp->irq; error = matrix_keypad_build_keymap(keymap_data, NULL, TWL4030_MAX_ROWS, From cafb3abea6136e59ea534004e5773361e196bb94 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 15 Sep 2020 17:56:40 -0700 Subject: [PATCH 083/497] Input: sun4i-ps2 - fix handling of platform_get_irq() error platform_get_irq() returns -ERRNO on error. In such case comparison to 0 would pass the check. Fixes: e443631d20f5 ("Input: serio - add support for Alwinner A10/A20 PS/2 controller") Signed-off-by: Krzysztof Kozlowski Acked-by: Chen-Yu Tsai Link: https://lore.kernel.org/r/20200828145744.3636-4-krzk@kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/serio/sun4i-ps2.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/input/serio/sun4i-ps2.c b/drivers/input/serio/sun4i-ps2.c index a681a2c04e39..f15ed3dcdb9b 100644 --- a/drivers/input/serio/sun4i-ps2.c +++ b/drivers/input/serio/sun4i-ps2.c @@ -211,7 +211,6 @@ static int sun4i_ps2_probe(struct platform_device *pdev) struct sun4i_ps2data *drvdata; struct serio *serio; struct device *dev = &pdev->dev; - unsigned int irq; int error; drvdata = kzalloc(sizeof(struct sun4i_ps2data), GFP_KERNEL); @@ -264,14 +263,12 @@ static int sun4i_ps2_probe(struct platform_device *pdev) writel(0, drvdata->reg_base + PS2_REG_GCTL); /* Get IRQ for the device */ - irq = platform_get_irq(pdev, 0); - if (!irq) { - dev_err(dev, "no IRQ found\n"); - error = -ENXIO; + drvdata->irq = platform_get_irq(pdev, 0); + if (drvdata->irq < 0) { + error = drvdata->irq; goto err_disable_clk; } - drvdata->irq = irq; drvdata->serio = serio; drvdata->dev = dev; From 4e1b2ab7e6299937219097fc581de7d29d79aade Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 11 Sep 2020 12:53:45 +0200 Subject: [PATCH 084/497] KVM: PPC: Don't return -ENOTSUPP to userspace in ioctls ENOTSUPP is a linux only thingy, the value of which is unknown to userspace, not to be confused with ENOTSUP which linux maps to EOPNOTSUPP, as permitted by POSIX [1]: [EOPNOTSUPP] Operation not supported on socket. The type of socket (address family or protocol) does not support the requested operation. A conforming implementation may assign the same values for [EOPNOTSUPP] and [ENOTSUP]. 
Return -EOPNOTSUPP instead of -ENOTSUPP for the following ioctls: - KVM_GET_FPU for Book3s and BookE - KVM_SET_FPU for Book3s and BookE - KVM_GET_DIRTY_LOG for BookE This doesn't affect QEMU which doesn't call the KVM_GET_FPU and KVM_SET_FPU ioctls on POWER anyway since they are not supported, and _buggily_ ignores anything but -EPERM for KVM_GET_DIRTY_LOG. [1] https://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html Signed-off-by: Greg Kurz Acked-by: Thadeu Lima de Souza Cascardo Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s.c | 4 ++-- arch/powerpc/kvm/booke.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 1fce9777af1c..44bf567b6589 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -558,12 +558,12 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - return -ENOTSUPP; + return -EOPNOTSUPP; } int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - return -ENOTSUPP; + return -EOPNOTSUPP; } int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 3e1c9f08e302..b1abcb816439 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1747,12 +1747,12 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - return -ENOTSUPP; + return -EOPNOTSUPP; } int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - return -ENOTSUPP; + return -EOPNOTSUPP; } int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, @@ -1773,7 +1773,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - return -ENOTSUPP; + return -EOPNOTSUPP; } void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) From 05e6295dc7de859c9d56334805485c4d20bebf25 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Fri, 11 Sep 2020 01:16:07 -0300 Subject: [PATCH 085/497] KVM: PPC: Book3S HV: Do not allocate HPT for a nested guest The current nested KVM code does not support HPT guests. This is informed/enforced in some ways: - Hosts < P9 will not be able to enable the nested HV feature; - The nested hypervisor MMU capabilities will not contain KVM_CAP_PPC_MMU_HASH_V3; - QEMU reflects the MMU capabilities in the 'ibm,arch-vec-5-platform-support' device-tree property; - The nested guest, at 'prom_parse_mmu_model' ignores the 'disable_radix' kernel command line option if HPT is not supported; - The KVM_PPC_CONFIGURE_V3_MMU ioctl will fail if trying to use HPT. There is, however, still a way to start a HPT guest by using max-compat-cpu=power8 at the QEMU machine options. This leads to the guest being set to use hash after QEMU calls the KVM_PPC_ALLOCATE_HTAB ioctl. With the guest set to hash, the nested hypervisor goes through the entry path that has no knowledge of nesting (kvmppc_run_vcpu) and crashes when it tries to execute an hypervisor-privileged (mtspr HDEC) instruction at __kvmppc_vcore_entry: root@L1:~ $ qemu-system-ppc64 -machine pseries,max-cpu-compat=power8 ... 
[ 538.543303] CPU: 83 PID: 25185 Comm: CPU 0/KVM Not tainted 5.9.0-rc4 #1 [ 538.543355] NIP: c00800000753f388 LR: c00800000753f368 CTR: c0000000001e5ec0 [ 538.543417] REGS: c0000013e91e33b0 TRAP: 0700 Not tainted (5.9.0-rc4) [ 538.543470] MSR: 8000000002843033 CR: 22422882 XER: 20040000 [ 538.543546] CFAR: c00800000753f4b0 IRQMASK: 3 GPR00: c0080000075397a0 c0000013e91e3640 c00800000755e600 0000000080000000 GPR04: 0000000000000000 c0000013eab19800 c000001394de0000 00000043a054db72 GPR08: 00000000003b1652 0000000000000000 0000000000000000 c0080000075502e0 GPR12: c0000000001e5ec0 c0000007ffa74200 c0000013eab19800 0000000000000008 GPR16: 0000000000000000 c00000139676c6c0 c000000001d23948 c0000013e91e38b8 GPR20: 0000000000000053 0000000000000000 0000000000000001 0000000000000000 GPR24: 0000000000000001 0000000000000001 0000000000000000 0000000000000001 GPR28: 0000000000000001 0000000000000053 c0000013eab19800 0000000000000001 [ 538.544067] NIP [c00800000753f388] __kvmppc_vcore_entry+0x90/0x104 [kvm_hv] [ 538.544121] LR [c00800000753f368] __kvmppc_vcore_entry+0x70/0x104 [kvm_hv] [ 538.544173] Call Trace: [ 538.544196] [c0000013e91e3640] [c0000013e91e3680] 0xc0000013e91e3680 (unreliable) [ 538.544260] [c0000013e91e3820] [c0080000075397a0] kvmppc_run_core+0xbc8/0x19d0 [kvm_hv] [ 538.544325] [c0000013e91e39e0] [c00800000753d99c] kvmppc_vcpu_run_hv+0x404/0xc00 [kvm_hv] [ 538.544394] [c0000013e91e3ad0] [c0080000072da4fc] kvmppc_vcpu_run+0x34/0x48 [kvm] [ 538.544472] [c0000013e91e3af0] [c0080000072d61b8] kvm_arch_vcpu_ioctl_run+0x310/0x420 [kvm] [ 538.544539] [c0000013e91e3b80] [c0080000072c7450] kvm_vcpu_ioctl+0x298/0x778 [kvm] [ 538.544605] [c0000013e91e3ce0] [c0000000004b8c2c] sys_ioctl+0x1dc/0xc90 [ 538.544662] [c0000013e91e3dc0] [c00000000002f9a4] system_call_exception+0xe4/0x1c0 [ 538.544726] [c0000013e91e3e20] [c00000000000d140] system_call_common+0xf0/0x27c [ 538.544787] Instruction dump: [ 538.544821] f86d1098 60000000 60000000 48000099 e8ad0fe8 e8c500a0 e9264140 75290002 [ 538.544886] 7d1602a6 7cec42a6 40820008 7d0807b4 <7d164ba6> 7d083a14 f90d10a0 480104fd [ 538.544953] ---[ end trace 74423e2b948c2e0c ]--- This patch makes the KVM_PPC_ALLOCATE_HTAB ioctl fail when running in the nested hypervisor, causing QEMU to abort. Reported-by: Satheesh Rajendran Signed-off-by: Fabiano Rosas Reviewed-by: Greg Kurz Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 4ba06a2a306c..764b6239ef72 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5250,6 +5250,12 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, case KVM_PPC_ALLOCATE_HTAB: { u32 htab_order; + /* If we're a nested hypervisor, we currently only support radix */ + if (kvmhv_on_pseries()) { + r = -EOPNOTSUPP; + break; + } + r = -EFAULT; if (get_user(htab_order, (u32 __user *)argp)) break; From 35dfb43c243b28658ef757a2b9c6454f99cfc43c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 3 Sep 2020 17:55:40 +1000 Subject: [PATCH 086/497] KVM: PPC: Book3S HV: Set LPCR[HDICE] before writing HDEC POWER8 and POWER9 machines have a hardware deviation where generation of a hypervisor decrementer exception is suppressed if the HDICE bit in the LPCR register is 0 at the time when the HDEC register decrements from 0 to -1. 
When entering a guest, KVM first writes the HDEC register with the time until it wants the CPU to exit the guest, and then writes the LPCR with the guest value, which includes HDICE = 1. If HDEC decrements from 0 to -1 during the interval between those two events, it is possible that we can enter the guest with HDEC already negative but no HDEC exception pending, meaning that no HDEC interrupt will occur while the CPU is in the guest, or at least not until HDEC wraps around. Thus it is possible for the CPU to keep executing in the guest for a long time; up to about 4 seconds on POWER8, or about 4.46 years on POWER9 (except that the host kernel hard lockup detector will fire first). To fix this, we set the LPCR[HDICE] bit before writing HDEC on guest entry. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 14 ++++++++++++-- arch/powerpc/kvm/book3s_hv_interrupts.S | 9 ++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 764b6239ef72..7ee00681ecb5 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3442,9 +3442,19 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long host_psscr = mfspr(SPRN_PSSCR); unsigned long host_pidr = mfspr(SPRN_PID); + /* + * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0, + * so set HDICE before writing HDEC. + */ + mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr | LPCR_HDICE); + isync(); + hdec = time_limit - mftb(); - if (hdec < 0) + if (hdec < 0) { + mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr); + isync(); return BOOK3S_INTERRUPT_HV_DECREMENTER; + } mtspr(SPRN_HDEC, hdec); if (vc->tb_offset) { @@ -3572,7 +3582,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, dec = mfspr(SPRN_DEC); tb = mftb(); - if (dec < 512) + if (dec < 0) return BOOK3S_INTERRUPT_HV_DECREMENTER; local_paca->kvm_hstate.dec_expires = dec + tb; if (local_paca->kvm_hstate.dec_expires < time_limit) diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 59822cba454d..327417d79eac 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -58,13 +58,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) /* * Put whatever is in the decrementer into the * hypervisor decrementer. + * Because of a hardware deviation in P8 and P9, + * we need to set LPCR[HDICE] before writing HDEC. */ -BEGIN_FTR_SECTION ld r5, HSTATE_KVM_VCORE(r13) ld r6, VCORE_KVM(r5) ld r9, KVM_HOST_LPCR(r6) - andis. r9, r9, LPCR_LD@h -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ori r8, r9, LPCR_HDICE + mtspr SPRN_LPCR, r8 + isync + andis. r0, r9, LPCR_LD@h mfspr r8,SPRN_DEC mftb r7 BEGIN_FTR_SECTION From 523b3999e5f620cb5ccce6a7ca2780a4cab579a2 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Thu, 10 Sep 2020 14:33:51 +0100 Subject: [PATCH 087/497] KVM: arm64: Try PMD block mappings if PUD mappings are not supported When userspace uses hugetlbfs for the VM memory, user_mem_abort() tries to use the same block size to map the faulting IPA in stage 2. If stage 2 cannot the same block mapping because the block size doesn't fit in the memslot or the memslot is not properly aligned, user_mem_abort() will fall back to a page mapping, regardless of the block size. We can do better for PUD backed hugetlbfs by checking if a PMD block mapping is supported before deciding to use a page. vma_pagesize is an unsigned long, use 1UL instead of 1ULL when assigning its value. 
Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200910133351.118191-1-alexandru.elisei@arm.com --- arch/arm64/kvm/mmu.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 21b70abf65a7..852497bd6d1e 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -779,16 +779,25 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, else vma_shift = PAGE_SHIFT; - vma_pagesize = 1ULL << vma_shift; if (logging_active || - (vma->vm_flags & VM_PFNMAP) || - !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) { + (vma->vm_flags & VM_PFNMAP)) { force_pte = true; - vma_pagesize = PAGE_SIZE; + vma_shift = PAGE_SHIFT; } + if (vma_shift == PUD_SHIFT && + !fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) + vma_shift = PMD_SHIFT; + + if (vma_shift == PMD_SHIFT && + !fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + force_pte = true; + vma_shift = PAGE_SHIFT; + } + + vma_pagesize = 1UL << vma_shift; if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) - fault_ipa &= huge_page_mask(hstate_vma(vma)); + fault_ipa &= ~(vma_pagesize - 1); gfn = fault_ipa >> PAGE_SHIFT; mmap_read_unlock(current->mm); From 8a4374f97deefb36f927c608af63dcc925521b3c Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Tue, 15 Sep 2020 15:21:52 +0800 Subject: [PATCH 088/497] KVM: arm64: Fix inject_fault.c kernel-doc warnings Fix kernel-doc warnings. arch/arm64/kvm/inject_fault.c:210: warning: Function parameter or member 'vcpu' not described in 'kvm_inject_undefined' Signed-off-by: Tian Tao Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/1600154512-44624-1-git-send-email-tiantao6@hisilicon.com --- arch/arm64/kvm/inject_fault.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index ebfdfc27b2bd..34a96ab244fa 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -202,6 +202,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) /** * kvm_inject_undefined - inject an undefined instruction into the guest + * @vcpu: The vCPU in which to inject the exception * * It is assumed that this code is called from the VCPU thread and that the * VCPU therefore is not currently executing guest code. From cb62e0b5c8db778cd29e63c2f844f36caf6859ed Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 16 Sep 2020 10:50:23 +0800 Subject: [PATCH 089/497] KVM: arm64: vgic-debug: Convert to use DEFINE_SEQ_ATTRIBUTE macro Use DEFINE_SEQ_ATTRIBUTE macro to simplify the code. 
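DEFINE_SEQ_ATTRIBUTE(vgic_debug) generates the open helper and file_operations that are dropped below; it expects the seq_operations instance to be named vgic_debug_sops, hence the rename. Approximately (a sketch of the expansion, not the exact macro body in seq_file.h):

  static int vgic_debug_open(struct inode *inode, struct file *file)
  {
          int ret = seq_open(file, &vgic_debug_sops);

          if (!ret && inode->i_private) {
                  struct seq_file *seq = file->private_data;

                  seq->private = inode->i_private;
          }
          return ret;
  }

  static const struct file_operations vgic_debug_fops = {
          .owner   = THIS_MODULE,
          .open    = vgic_debug_open,
          .read    = seq_read,
          .llseek  = seq_lseek,
          .release = seq_release,
  };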
Signed-off-by: Liu Shixin Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200916025023.3992679-1-liushixin2@huawei.com --- arch/arm64/kvm/vgic/vgic-debug.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index b13a9e3f99dd..f38c40a76251 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -260,34 +260,14 @@ static int vgic_debug_show(struct seq_file *s, void *v) return 0; } -static const struct seq_operations vgic_debug_seq_ops = { +static const struct seq_operations vgic_debug_sops = { .start = vgic_debug_start, .next = vgic_debug_next, .stop = vgic_debug_stop, .show = vgic_debug_show }; -static int debug_open(struct inode *inode, struct file *file) -{ - int ret; - ret = seq_open(file, &vgic_debug_seq_ops); - if (!ret) { - struct seq_file *seq; - /* seq_open will have modified file->private_data */ - seq = file->private_data; - seq->private = inode->i_private; - } - - return ret; -}; - -static const struct file_operations vgic_debug_fops = { - .owner = THIS_MODULE, - .open = debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release -}; +DEFINE_SEQ_ATTRIBUTE(vgic_debug); void vgic_debug_init(struct kvm *kvm) { From ada329e6b5b406f33fae665e62caff7814409906 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 15 Sep 2020 18:04:42 +0100 Subject: [PATCH 090/497] KVM: arm64: Do not flush memslot if FWB is supported As a result of a KVM_SET_USER_MEMORY_REGION ioctl, KVM flushes the dcache for the memslot being changed to ensure a consistent view of memory between the host and the guest: the host runs with caches enabled, and it is possible for the data written by the hypervisor to still be in the caches, but the guest is running with stage 1 disabled, meaning data accesses are to Device-nGnRnE memory, bypassing the caches entirely. Flushing the dcache is not necessary when KVM enables FWB, because it forces the guest to uses cacheable memory accesses. The current behaviour does not change, as the dcache flush helpers execute the cache operation only if FWB is not enabled, but walking the stage 2 table is avoided. Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200915170442.131635-1-alexandru.elisei@arm.com --- arch/arm64/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 852497bd6d1e..4d68c160a7b5 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1344,7 +1344,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, spin_lock(&kvm->mmu_lock); if (ret) unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size); - else + else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) stage2_flush_memslot(kvm, memslot); spin_unlock(&kvm->mmu_lock); out: From c9c0279cc02b4e161686de7ccd1973357f29db8c Mon Sep 17 00:00:00 2001 From: Xiaofei Tan Date: Thu, 17 Sep 2020 09:47:49 +0800 Subject: [PATCH 091/497] KVM: arm64: Fix doc warnings in mmu code Fix following warnings caused by mismatch bewteen function parameters and comments. 
arch/arm64/kvm/mmu.c:128: warning: Function parameter or member 'mmu' not described in '__unmap_stage2_range' arch/arm64/kvm/mmu.c:128: warning: Function parameter or member 'may_block' not described in '__unmap_stage2_range' arch/arm64/kvm/mmu.c:128: warning: Excess function parameter 'kvm' description in '__unmap_stage2_range' arch/arm64/kvm/mmu.c:499: warning: Function parameter or member 'writable' not described in 'kvm_phys_addr_ioremap' arch/arm64/kvm/mmu.c:538: warning: Function parameter or member 'mmu' not described in 'stage2_wp_range' arch/arm64/kvm/mmu.c:538: warning: Excess function parameter 'kvm' description in 'stage2_wp_range' Signed-off-by: Xiaofei Tan Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/1600307269-50957-1-git-send-email-tanxiaofei@huawei.com --- arch/arm64/kvm/mmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 4d68c160a7b5..c5c26a9cb85b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -114,9 +114,10 @@ static bool kvm_is_device_pfn(unsigned long pfn) */ /** * unmap_stage2_range -- Clear stage2 page table entries to unmap a range - * @kvm: The VM pointer + * @mmu: The KVM stage-2 MMU pointer * @start: The intermediate physical base address of the range to unmap * @size: The size of the area to unmap + * @may_block: Whether or not we are permitted to block * * Clear a range of stage-2 mappings, lowering the various ref-counts. Must * be called while holding mmu_lock (unless for freeing the stage2 pgd before @@ -493,6 +494,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) * @guest_ipa: The IPA at which to insert the mapping * @pa: The physical address of the device * @size: The size of the mapping + * @writable: Whether or not to create a writable mapping */ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable) @@ -530,7 +532,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, /** * stage2_wp_range() - write protect stage2 memory region range - * @kvm: The KVM pointer + * @mmu: The KVM stage-2 MMU pointer * @addr: Start address of range * @end: End address of range */ From 64b02e54e50fea8ebe0a2b2dc37ea096d0030cc5 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Mon, 21 Sep 2020 10:51:58 +0200 Subject: [PATCH 092/497] platform/chrome: cros_ec_proto: Update cros_ec_cmd_xfer() call-sites Since all the other call-sites of cros_ec_cmd_xfer() have been converted to use cros_ec_cmd_xfer_status() instead, update the remaining call-sites to prepare for the merge of cros_ec_cmd_xfer() into cros_ec_cmd_xfer_status(). As part of this update, change the error handling inside cros_ec_get_sensor_count() such that the legacy LPC interface is tried on all error values, not just when msg->result != EC_RESULT_SUCCESS. Note that there is a slight change in API in cros_ec_get_sensor_count(): it will return a negative number of sensors when there are no sensors on arm platform when MOTIONSENSE_CMD_DUMP is not supported (typical for sensorless chromebook) instead of 0. However, this is not a problem when probing the EC as we ignore errors only looking for cros_ec_get_sensor_count() returning a positive number of sensors. 
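From a caller's point of view the contract is now: only a strictly positive return value means motion sensors are present, so zero and a negative errno are handled the same way. An illustrative fragment (foo_register_sensors() is a made-up helper, not part of the driver):

  int ret = cros_ec_get_sensor_count(ec);

  if (ret > 0)
          foo_register_sensors(ec, ret);
  /*
   * ret <= 0: no sensors, or the EC does not support MOTIONSENSE_CMD_DUMP
   * and the legacy LPC check found nothing; probing simply continues
   * without motion sensors.
   */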
Signed-off-by: Prashant Malani Reviewed-by: Gwendal Grignou Tested-by: Gwendal Grignou Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_proto.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index dda182132a6a..2cb1defcdd13 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -650,7 +650,7 @@ static int get_next_event_xfer(struct cros_ec_device *ec_dev, msg->insize = size; msg->outsize = 0; - ret = cros_ec_cmd_xfer(ec_dev, msg); + ret = cros_ec_cmd_xfer_status(ec_dev, msg); if (ret > 0) { ec_dev->event_size = ret - 1; ec_dev->event_data = *event; @@ -694,7 +694,7 @@ static int get_keyboard_state_event(struct cros_ec_device *ec_dev) msg->insize = sizeof(ec_dev->event_data.data); msg->outsize = 0; - ec_dev->event_size = cros_ec_cmd_xfer(ec_dev, msg); + ec_dev->event_size = cros_ec_cmd_xfer_status(ec_dev, msg); ec_dev->event_data.event_type = EC_MKBP_EVENT_KEY_MATRIX; memcpy(&ec_dev->event_data.data, msg->data, sizeof(ec_dev->event_data.data)); @@ -883,11 +883,9 @@ int cros_ec_get_sensor_count(struct cros_ec_dev *ec) params = (struct ec_params_motion_sense *)msg->data; params->cmd = MOTIONSENSE_CMD_DUMP; - ret = cros_ec_cmd_xfer(ec->ec_dev, msg); + ret = cros_ec_cmd_xfer_status(ec->ec_dev, msg); if (ret < 0) { sensor_count = ret; - } else if (msg->result != EC_RES_SUCCESS) { - sensor_count = -EPROTO; } else { resp = (struct ec_response_motion_sense *)msg->data; sensor_count = resp->dump.sensor_count; @@ -898,9 +896,7 @@ int cros_ec_get_sensor_count(struct cros_ec_dev *ec) * Check legacy mode: Let's find out if sensors are accessible * via LPC interface. */ - if (sensor_count == -EPROTO && - ec->cmd_offset == 0 && - ec_dev->cmd_readmem) { + if (sensor_count < 0 && ec->cmd_offset == 0 && ec_dev->cmd_readmem) { ret = ec_dev->cmd_readmem(ec_dev, EC_MEMMAP_ACC_STATUS, 1, &status); if (ret >= 0 && @@ -915,9 +911,6 @@ int cros_ec_get_sensor_count(struct cros_ec_dev *ec) */ sensor_count = 0; } - } else if (sensor_count == -EPROTO) { - /* EC responded, but does not understand DUMP command. */ - sensor_count = 0; } return sensor_count; } From 6b194ee986463ec7e6de7ccf3728ae4ede3b8d29 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Mon, 21 Sep 2020 10:51:58 +0200 Subject: [PATCH 093/497] platform/chrome: cros_ec_proto: Drop cros_ec_cmd_xfer() Since cros_ec_cmd_xfer_status() now returns Linux error codes and all other files use that command, remove the now-unused function cros_ec_cmd_xfer(). Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_proto.c | 44 +++++++------------------ 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index 2cb1defcdd13..0ecee8b8773d 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -549,19 +549,22 @@ exit: EXPORT_SYMBOL(cros_ec_query_all); /** - * cros_ec_cmd_xfer() - Send a command to the ChromeOS EC. + * cros_ec_cmd_xfer_status() - Send a command to the ChromeOS EC. * @ec_dev: EC device. * @msg: Message to write. * - * Call this to send a command to the ChromeOS EC. This should be used - * instead of calling the EC's cmd_xfer() callback directly. + * Call this to send a command to the ChromeOS EC. This should be used instead of calling the EC's + * cmd_xfer() callback directly. 
It returns success status only if both the command was transmitted + * successfully and the EC replied with success status. * - * Return: 0 on success or negative error code. + * Return: + * >=0 - The number of bytes transferred + * <0 - Linux error code */ -static int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, +int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev, struct cros_ec_command *msg) { - int ret; + int ret, mapped; mutex_lock(&ec_dev->lock); if (ec_dev->proto_version == EC_PROTO_VERSION_UNKNOWN) { @@ -598,42 +601,17 @@ static int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, return -EMSGSIZE; } } + ret = send_command(ec_dev, msg); mutex_unlock(&ec_dev->lock); - return ret; -} - -/** - * cros_ec_cmd_xfer_status() - Send a command to the ChromeOS EC. - * @ec_dev: EC device. - * @msg: Message to write. - * - * This function is identical to cros_ec_cmd_xfer, except it returns success - * status only if both the command was transmitted successfully and the EC - * replied with success status. It's not necessary to check msg->result when - * using this function. - * - * Return: - * >=0 - The number of bytes transferred - * <0 - Linux error code - */ -int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev, - struct cros_ec_command *msg) -{ - int ret, mapped; - - ret = cros_ec_cmd_xfer(ec_dev, msg); - if (ret < 0) { - dev_err(ec_dev->dev, "Command xfer error (err:%d)\n", ret); - return ret; - } mapped = cros_ec_map_error(msg->result); if (mapped) { dev_dbg(ec_dev->dev, "Command result (err: %d [%d])\n", msg->result, mapped); ret = mapped; } + return ret; } EXPORT_SYMBOL(cros_ec_cmd_xfer_status); From c65176fd49f45bd5a5ffaa1790109745d1fa462c Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Fri, 18 Sep 2020 19:59:30 +0300 Subject: [PATCH 094/497] arm64: dts: ti: k3-j721e: Rename mux header and update macro names We intend to use one header file for SERDES MUX for all TI SoCs so rename the header file. The exsting macros are too generic. Prefix them with SoC name. While at that, add the missing configurations for completeness. 
Fixes: b766e3b0d5f6 ("arm64: dts: ti: k3-j721e-main: Add system controller node and SERDES lane mux") Reported-by: Peter Rosin Signed-off-by: Roger Quadros Signed-off-by: Nishanth Menon Acked-by: Peter Rosin Link: https://lore.kernel.org/r/20200918165930.2031-1-rogerq@ti.com --- .../dts/ti/k3-j721e-common-proc-board.dts | 11 +-- arch/arm64/boot/dts/ti/k3-j721e-main.dtsi | 13 ++-- include/dt-bindings/mux/mux-j721e-wiz.h | 53 -------------- include/dt-bindings/mux/ti-serdes.h | 71 +++++++++++++++++++ 4 files changed, 84 insertions(+), 64 deletions(-) delete mode 100644 include/dt-bindings/mux/mux-j721e-wiz.h create mode 100644 include/dt-bindings/mux/ti-serdes.h diff --git a/arch/arm64/boot/dts/ti/k3-j721e-common-proc-board.dts b/arch/arm64/boot/dts/ti/k3-j721e-common-proc-board.dts index e8fc01d97ada..6f7490efc438 100644 --- a/arch/arm64/boot/dts/ti/k3-j721e-common-proc-board.dts +++ b/arch/arm64/boot/dts/ti/k3-j721e-common-proc-board.dts @@ -404,11 +404,12 @@ }; &serdes_ln_ctrl { - idle-states = , , - , , - , , - , , - , , , ; + idle-states = , , + , , + , , + , , + , , + , ; }; &serdes_wiz3 { diff --git a/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi b/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi index 12ceea9b3c9a..63d221aee9bc 100644 --- a/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi +++ b/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi @@ -6,7 +6,7 @@ */ #include #include -#include +#include &cbass_main { msmc_ram: sram@70000000 { @@ -38,11 +38,12 @@ <0x40b0 0x3>, <0x40b4 0x3>, /* SERDES3 lane0/1 select */ <0x40c0 0x3>, <0x40c4 0x3>, <0x40c8 0x3>, <0x40cc 0x3>; /* SERDES4 lane0/1/2/3 select */ - idle-states = , , - , , - , , - , , - , , , ; + idle-states = , , + , , + , , + , , + , , + , ; }; usb_serdes_mux: mux-controller@4000 { diff --git a/include/dt-bindings/mux/mux-j721e-wiz.h b/include/dt-bindings/mux/mux-j721e-wiz.h deleted file mode 100644 index fd1c4ea9fc7f..000000000000 --- a/include/dt-bindings/mux/mux-j721e-wiz.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This header provides constants for J721E WIZ. 
- */ - -#ifndef _DT_BINDINGS_J721E_WIZ -#define _DT_BINDINGS_J721E_WIZ - -#define SERDES0_LANE0_QSGMII_LANE1 0x0 -#define SERDES0_LANE0_PCIE0_LANE0 0x1 -#define SERDES0_LANE0_USB3_0_SWAP 0x2 - -#define SERDES0_LANE1_QSGMII_LANE2 0x0 -#define SERDES0_LANE1_PCIE0_LANE1 0x1 -#define SERDES0_LANE1_USB3_0 0x2 - -#define SERDES1_LANE0_QSGMII_LANE3 0x0 -#define SERDES1_LANE0_PCIE1_LANE0 0x1 -#define SERDES1_LANE0_USB3_1_SWAP 0x2 -#define SERDES1_LANE0_SGMII_LANE0 0x3 - -#define SERDES1_LANE1_QSGMII_LANE4 0x0 -#define SERDES1_LANE1_PCIE1_LANE1 0x1 -#define SERDES1_LANE1_USB3_1 0x2 -#define SERDES1_LANE1_SGMII_LANE1 0x3 - -#define SERDES2_LANE0_PCIE2_LANE0 0x1 -#define SERDES2_LANE0_SGMII_LANE0 0x3 -#define SERDES2_LANE0_USB3_1_SWAP 0x2 - -#define SERDES2_LANE1_PCIE2_LANE1 0x1 -#define SERDES2_LANE1_USB3_1 0x2 -#define SERDES2_LANE1_SGMII_LANE1 0x3 - -#define SERDES3_LANE0_PCIE3_LANE0 0x1 -#define SERDES3_LANE0_USB3_0_SWAP 0x2 - -#define SERDES3_LANE1_PCIE3_LANE1 0x1 -#define SERDES3_LANE1_USB3_0 0x2 - -#define SERDES4_LANE0_EDP_LANE0 0x0 -#define SERDES4_LANE0_QSGMII_LANE5 0x2 - -#define SERDES4_LANE1_EDP_LANE1 0x0 -#define SERDES4_LANE1_QSGMII_LANE6 0x2 - -#define SERDES4_LANE2_EDP_LANE2 0x0 -#define SERDES4_LANE2_QSGMII_LANE7 0x2 - -#define SERDES4_LANE3_EDP_LANE3 0x0 -#define SERDES4_LANE3_QSGMII_LANE8 0x2 - -#endif /* _DT_BINDINGS_J721E_WIZ */ diff --git a/include/dt-bindings/mux/ti-serdes.h b/include/dt-bindings/mux/ti-serdes.h new file mode 100644 index 000000000000..146d0685a925 --- /dev/null +++ b/include/dt-bindings/mux/ti-serdes.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This header provides constants for SERDES MUX for TI SoCs + */ + +#ifndef _DT_BINDINGS_MUX_TI_SERDES +#define _DT_BINDINGS_MUX_TI_SERDES + +/* J721E */ + +#define J721E_SERDES0_LANE0_QSGMII_LANE1 0x0 +#define J721E_SERDES0_LANE0_PCIE0_LANE0 0x1 +#define J721E_SERDES0_LANE0_USB3_0_SWAP 0x2 +#define J721E_SERDES0_LANE0_IP4_UNUSED 0x3 + +#define J721E_SERDES0_LANE1_QSGMII_LANE2 0x0 +#define J721E_SERDES0_LANE1_PCIE0_LANE1 0x1 +#define J721E_SERDES0_LANE1_USB3_0 0x2 +#define J721E_SERDES0_LANE1_IP4_UNUSED 0x3 + +#define J721E_SERDES1_LANE0_QSGMII_LANE3 0x0 +#define J721E_SERDES1_LANE0_PCIE1_LANE0 0x1 +#define J721E_SERDES1_LANE0_USB3_1_SWAP 0x2 +#define J721E_SERDES1_LANE0_SGMII_LANE0 0x3 + +#define J721E_SERDES1_LANE1_QSGMII_LANE4 0x0 +#define J721E_SERDES1_LANE1_PCIE1_LANE1 0x1 +#define J721E_SERDES1_LANE1_USB3_1 0x2 +#define J721E_SERDES1_LANE1_SGMII_LANE1 0x3 + +#define J721E_SERDES2_LANE0_IP1_UNUSED 0x0 +#define J721E_SERDES2_LANE0_PCIE2_LANE0 0x1 +#define J721E_SERDES2_LANE0_USB3_1_SWAP 0x2 +#define J721E_SERDES2_LANE0_SGMII_LANE0 0x3 + +#define J721E_SERDES2_LANE1_IP1_UNUSED 0x0 +#define J721E_SERDES2_LANE1_PCIE2_LANE1 0x1 +#define J721E_SERDES2_LANE1_USB3_1 0x2 +#define J721E_SERDES2_LANE1_SGMII_LANE1 0x3 + +#define J721E_SERDES3_LANE0_IP1_UNUSED 0x0 +#define J721E_SERDES3_LANE0_PCIE3_LANE0 0x1 +#define J721E_SERDES3_LANE0_USB3_0_SWAP 0x2 +#define J721E_SERDES3_LANE0_IP4_UNUSED 0x3 + +#define J721E_SERDES3_LANE1_IP1_UNUSED 0x0 +#define J721E_SERDES3_LANE1_PCIE3_LANE1 0x1 +#define J721E_SERDES3_LANE1_USB3_0 0x2 +#define J721E_SERDES3_LANE1_IP4_UNUSED 0x3 + +#define J721E_SERDES4_LANE0_EDP_LANE0 0x0 +#define J721E_SERDES4_LANE0_IP2_UNUSED 0x1 +#define J721E_SERDES4_LANE0_QSGMII_LANE5 0x2 +#define J721E_SERDES4_LANE0_IP4_UNUSED 0x3 + +#define J721E_SERDES4_LANE1_EDP_LANE1 0x0 +#define J721E_SERDES4_LANE1_IP2_UNUSED 0x1 +#define J721E_SERDES4_LANE1_QSGMII_LANE6 0x2 +#define 
J721E_SERDES4_LANE1_IP4_UNUSED 0x3 + +#define J721E_SERDES4_LANE2_EDP_LANE2 0x0 +#define J721E_SERDES4_LANE2_IP2_UNUSED 0x1 +#define J721E_SERDES4_LANE2_QSGMII_LANE7 0x2 +#define J721E_SERDES4_LANE2_IP4_UNUSED 0x3 + +#define J721E_SERDES4_LANE3_EDP_LANE3 0x0 +#define J721E_SERDES4_LANE3_IP2_UNUSED 0x1 +#define J721E_SERDES4_LANE3_QSGMII_LANE8 0x2 +#define J721E_SERDES4_LANE3_IP4_UNUSED 0x3 + +#endif /* _DT_BINDINGS_MUX_TI_SERDES */ From 4517076608c51c74966cf0c54bc66dada7dec223 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Sat, 19 Sep 2020 09:29:25 +0800 Subject: [PATCH 095/497] KVM: PPC: Book3S HV: XIVE: Convert to DEFINE_SHOW_ATTRIBUTE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code. Signed-off-by: Qinglang Miao Reviewed-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive_native.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index bdea91df1497..d0c2db0e07fa 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -1227,17 +1227,7 @@ static int xive_native_debug_show(struct seq_file *m, void *private) return 0; } -static int xive_native_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, xive_native_debug_show, inode->i_private); -} - -static const struct file_operations xive_native_debug_fops = { - .open = xive_native_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(xive_native_debug); static void xive_native_debugfs_init(struct kvmppc_xive *xive) { From eb173559c97c3b5d8905cd8a93a5d3f3fb24916d Mon Sep 17 00:00:00 2001 From: Jing Xiangfeng Date: Sat, 19 Sep 2020 15:12:30 +0800 Subject: [PATCH 096/497] KVM: PPC: Book3S: Remove redundant initialization of variable ret The variable ret is being initialized with '-ENOMEM' that is meaningless. So remove it. Signed-off-by: Jing Xiangfeng Reviewed-by: Fabiano Rosas Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_vio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 1a529df0ab44..b277a75cd1be 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -283,7 +283,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *siter; struct mm_struct *mm = kvm->mm; unsigned long npages, size = args->size; - int ret = -ENOMEM; + int ret; if (!args->size || args->page_shift < 12 || args->page_shift > 34 || (args->offset + args->size > (ULLONG_MAX >> args->page_shift))) From cf59eb13e151ef42c37ae31864046c17e481ed8f Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Mon, 21 Sep 2020 11:22:11 +0000 Subject: [PATCH 097/497] KVM: PPC: Book3S: Fix symbol undeclared warnings Build the kernel with `C=2`: arch/powerpc/kvm/book3s_hv_nested.c:572:25: warning: symbol 'kvmhv_alloc_nested' was not declared. Should it be static? arch/powerpc/kvm/book3s_64_mmu_radix.c:350:6: warning: symbol 'kvmppc_radix_set_pte_at' was not declared. Should it be static? arch/powerpc/kvm/book3s_hv.c:3568:5: warning: symbol 'kvmhv_p9_guest_entry' was not declared. Should it be static? arch/powerpc/kvm/book3s_hv_rm_xics.c:767:15: warning: symbol 'eoi_rc' was not declared. Should it be static? 
arch/powerpc/kvm/book3s_64_vio_hv.c:240:13: warning: symbol 'iommu_tce_kill_rm' was not declared. Should it be static? arch/powerpc/kvm/book3s_64_vio.c:492:6: warning: symbol 'kvmppc_tce_iommu_do_map' was not declared. Should it be static? arch/powerpc/kvm/book3s_pr.c:572:6: warning: symbol 'kvmppc_set_pvr_pr' was not declared. Should it be static? Those symbols are used only in the files that define them so make them static to fix the warnings. Signed-off-by: Wang Wensheng Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/powerpc/kvm/book3s_64_vio.c | 2 +- arch/powerpc/kvm/book3s_64_vio_hv.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_nested.c | 2 +- arch/powerpc/kvm/book3s_hv_rm_xics.c | 2 +- arch/powerpc/kvm/book3s_pr.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 22a677b18695..bb35490400e9 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -347,7 +347,7 @@ static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, return __radix_pte_update(ptep, clr, set); } -void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, +static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, pte_t *ptep, pte_t pte) { radix__set_pte_at(kvm->mm, addr, ptep, pte, 0); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index b277a75cd1be..8da93fdfa59e 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -489,7 +489,7 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm, return ret; } -long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, +static long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, unsigned long entry, unsigned long ua, enum dma_data_direction dir) { diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index ac6ac192b8bb..470e7c518a10 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -237,7 +237,7 @@ static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm, return ret; } -extern void iommu_tce_kill_rm(struct iommu_table *tbl, +static void iommu_tce_kill_rm(struct iommu_table *tbl, unsigned long entry, unsigned long pages) { if (tbl->it_ops->tce_kill) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7ee00681ecb5..490a0f6a7285 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3568,7 +3568,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, * Virtual-mode guest entry for POWER9 and later when the host and * guest are both using the radix MMU. The LPIDR has already been set. 
*/ -int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, +static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 6822d23a2da4..33b58549a9aa 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -569,7 +569,7 @@ static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp) kvmhv_set_nested_ptbl(gp); } -struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) +static struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) { struct kvm_nested_guest *gp; long shadow_lpid; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 4d7e5610731a..c2c9c733f359 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -764,7 +764,7 @@ int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) return ics_rm_eoi(vcpu, irq); } -unsigned long eoi_rc; +static unsigned long eoi_rc; static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) { diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 88fac22fbf09..b1fefa63e125 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -569,7 +569,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) #endif } -void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) +static void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) { u32 host_pvr; From bc38325703ebd0a00fecfb965020c255ffc582fe Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 20 Sep 2020 22:26:52 +0200 Subject: [PATCH 098/497] soc: actions: include header to fix missing prototype Include the header with prototype of owl_sps_set_pg to fix: drivers/soc/actions/owl-sps-helper.c:16:5: warning: no previous prototype for 'owl_sps_set_pg' [-Wmissing-prototypes] Signed-off-by: Krzysztof Kozlowski Reviewed-by: Manivannan Sadhasivam Signed-off-by: Manivannan Sadhasivam --- drivers/soc/actions/owl-sps-helper.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/soc/actions/owl-sps-helper.c b/drivers/soc/actions/owl-sps-helper.c index 291a206d6f04..e3f36603dd53 100644 --- a/drivers/soc/actions/owl-sps-helper.c +++ b/drivers/soc/actions/owl-sps-helper.c @@ -10,6 +10,7 @@ #include #include +#include #define OWL_SPS_PG_CTL 0x0 From b753e41d9999c040c601de53cc1e072160472d80 Mon Sep 17 00:00:00 2001 From: Drew Fustini Date: Tue, 22 Sep 2020 00:50:55 +0200 Subject: [PATCH 099/497] ARM: dts: am33xx: modify AM33XX_IOPAD for #pinctrl-cells = 2 Modify the AM33XX_IOPAD macro so that it works now that #pinctrl-cells = <2>. The third parameter is just a zero and the pinctrl-single driver will just OR this with the second parameter so it has no actual effect. There are no longer any dts files using this macro (following my patch to am335x-guardian.dts), but this will keep dts files not in mainline from breaking. 
Fixes: 27c90e5e48d0 ("ARM: dts: am33xx-l4: change #pinctrl-cells from 1 to 2") Suggested-by: Tony Lindgren Reported-by: Trent Piepho Link: https://lore.kernel.org/linux-devicetree/20200921064707.GN7101@atomide.com/ Signed-off-by: Drew Fustini Signed-off-by: Tony Lindgren --- include/dt-bindings/pinctrl/omap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/dt-bindings/pinctrl/omap.h b/include/dt-bindings/pinctrl/omap.h index 2d2a8c737822..f48245ff87e5 100644 --- a/include/dt-bindings/pinctrl/omap.h +++ b/include/dt-bindings/pinctrl/omap.h @@ -64,7 +64,7 @@ #define OMAP3_WKUP_IOPAD(pa, val) OMAP_IOPAD_OFFSET((pa), 0x2a00) (val) #define DM814X_IOPAD(pa, val) OMAP_IOPAD_OFFSET((pa), 0x0800) (val) #define DM816X_IOPAD(pa, val) OMAP_IOPAD_OFFSET((pa), 0x0800) (val) -#define AM33XX_IOPAD(pa, val) OMAP_IOPAD_OFFSET((pa), 0x0800) (val) +#define AM33XX_IOPAD(pa, val) OMAP_IOPAD_OFFSET((pa), 0x0800) (val) (0) #define AM33XX_PADCONF(pa, conf, mux) OMAP_IOPAD_OFFSET((pa), 0x0800) (conf) (mux) /* From 8f04aea048d56f3e39a7e543939450246542a6fc Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 23 Sep 2020 09:16:22 +0300 Subject: [PATCH 100/497] ARM: OMAP2+: Restore MPU power domain if cpu_cluster_pm_enter() fails If cpu_cluster_pm_enter() fails, we need to set MPU power domain back to enabled to prevent the next WFI from potentially triggering an undesired MPU power domain state change. We already do this for omap_enter_idle_smp() but are missing it for omap_enter_idle_coupled(). Fixes: 55be2f50336f ("ARM: OMAP2+: Handle errors for cpu_pm") Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/cpuidle44xx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/mach-omap2/cpuidle44xx.c b/arch/arm/mach-omap2/cpuidle44xx.c index 6f5f89711f25..a92d277f81a0 100644 --- a/arch/arm/mach-omap2/cpuidle44xx.c +++ b/arch/arm/mach-omap2/cpuidle44xx.c @@ -174,8 +174,10 @@ static int omap_enter_idle_coupled(struct cpuidle_device *dev, */ if (mpuss_can_lose_context) { error = cpu_cluster_pm_enter(); - if (error) + if (error) { + omap_set_pwrdm_state(mpu_pd, PWRDM_POWER_ON); goto cpu_cluster_pm_out; + } } } From 08d7a73fffb6769b1cf2278bf697e692daba3abf Mon Sep 17 00:00:00 2001 From: Biju Das Date: Thu, 24 Sep 2020 09:05:35 +0100 Subject: [PATCH 101/497] ARM: dts: iwg20d-q7-common: Fix touch controller probe failure As per the iWave RZ/G1M schematic, the signal LVDS_PPEN controls the supply voltage for the touch panel, LVDS receiver and RGB LCD panel. Add a regulator for these device nodes and remove the powerdown-gpios property from the lvds-receiver node as it results in a touch controller driver probe failure. 
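For illustration only (not part of the applied device tree change), a generic consumer-side sketch of why modelling the rail as a regulator helps: every consumer can request and enable the same supply, instead of one node owning a powerdown GPIO. example_enable_panel_rail() and the "vcc" supply name are assumptions, not the actual touch controller driver code:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/regulator/consumer.h>

static int example_enable_panel_rail(struct device *dev)
{
        struct regulator *supply;
        int ret;

        supply = devm_regulator_get(dev, "vcc");   /* matches a vcc-supply property */
        if (IS_ERR(supply))
                return PTR_ERR(supply);

        ret = regulator_enable(supply);            /* rail switched on per consumer */
        if (ret)
                dev_err(dev, "failed to enable supply: %d\n", ret);

        return ret;
}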
Fixes: 6f89dd9e9325 ("ARM: dts: iwg20d-q7-common: Add LCD support") Signed-off-by: Biju Das Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/20200924080535.3641-1-biju.das.jz@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- arch/arm/boot/dts/iwg20d-q7-common.dtsi | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/iwg20d-q7-common.dtsi b/arch/arm/boot/dts/iwg20d-q7-common.dtsi index ebbe1518ef8a..63cafd220dba 100644 --- a/arch/arm/boot/dts/iwg20d-q7-common.dtsi +++ b/arch/arm/boot/dts/iwg20d-q7-common.dtsi @@ -57,7 +57,7 @@ lvds-receiver { compatible = "ti,ds90cf384a", "lvds-decoder"; - powerdown-gpios = <&gpio7 25 GPIO_ACTIVE_LOW>; + power-supply = <&vcc_3v3_tft1>; ports { #address-cells = <1>; @@ -81,6 +81,7 @@ panel { compatible = "edt,etm0700g0dh6"; backlight = <&lcd_backlight>; + power-supply = <&vcc_3v3_tft1>; port { panel_in: endpoint { @@ -113,6 +114,17 @@ }; }; + vcc_3v3_tft1: regulator-panel { + compatible = "regulator-fixed"; + + regulator-name = "vcc-3v3-tft1"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + enable-active-high; + startup-delay-us = <500>; + gpio = <&gpio7 25 GPIO_ACTIVE_HIGH>; + }; + vcc_sdhi1: regulator-vcc-sdhi1 { compatible = "regulator-fixed"; @@ -207,6 +219,7 @@ reg = <0x38>; interrupt-parent = <&gpio2>; interrupts = <12 IRQ_TYPE_EDGE_FALLING>; + vcc-supply = <&vcc_3v3_tft1>; }; }; From d5cd6f34014592a232ce79dc25e295778bd43c22 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 14 Sep 2020 15:37:25 +0200 Subject: [PATCH 102/497] KVM: nSVM: Avoid freeing uninitialized pointers in svm_set_nested_state() The save and ctl pointers are passed uninitialized to kfree() when svm_set_nested_state() follows the 'goto out_set_gif' path. While the issue could've been fixed by initializing these on-stack variables to NULL, it seems preferable to eliminate the 'out_set_gif' label completely as it is not actually a failure path and duplicating a single svm_set_gif() call doesn't look too bad. [ bp: Drop obscure Addresses-Coverity: tag.
] Fixes: 6ccbd29ade0d ("KVM: SVM: nested: Don't allocate VMCB structures on stack") Reported-by: Dan Carpenter Reported-by: Joerg Roedel Reported-by: Colin King Signed-off-by: Vitaly Kuznetsov Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Acked-by: Joerg Roedel Tested-by: Tom Lendacky Link: https://lkml.kernel.org/r/20200914133725.650221-1-vkuznets@redhat.com Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 598a769f1961..afe5cb3f87a8 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1094,7 +1094,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { svm_leave_nested(svm); - goto out_set_gif; + svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); + return 0; } if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa)) @@ -1148,10 +1149,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, nested_prepare_vmcb_control(svm); if (!nested_svm_vmrun_msrpm(svm)) - return -EINVAL; - -out_set_gif: - svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); + goto out_free; ret = 0; out_free: From efc831338bfd66e40b6ab8b2b332ab48203be9ff Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Fri, 28 Aug 2020 16:56:18 +0800 Subject: [PATCH 103/497] KVM: nVMX: Fix VMX controls MSRs setup when nested VMX enabled KVM supports the nested VM_{EXIT, ENTRY}_LOAD_IA32_PERF_GLOBAL_CTRL and VM_{ENTRY_LOAD, EXIT_CLEAR}_BNDCFGS, but they are not exposed by the system ioctl KVM_GET_MSR. Add them to the setup of nested VMX controls MSR. Signed-off-by: Chenyi Qiang Message-Id: <20200828085622.8365-2-chenyi.qiang@intel.com> Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1bb6b31eb646..07dc5663a48e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6318,7 +6318,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif - VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; + VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | + VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | @@ -6337,7 +6338,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif - VM_ENTRY_LOAD_IA32_PAT; + VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); From b9757a4b6f46560cdd65afdb0e44bc06b7e60340 Mon Sep 17 00:00:00 2001 From: Chenyi Qiang Date: Fri, 28 Aug 2020 16:56:22 +0800 Subject: [PATCH 104/497] KVM: nVMX: Simplify the initialization of nested_vmx_msrs The nested VMX controls MSRs can be initialized by the global capability values stored in vmcs_config. 
Signed-off-by: Chenyi Qiang Reviewed-by: Xiaoyao Li Message-Id: <20200828085622.8365-6-chenyi.qiang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8646a797b7a8..d4dfb7a23a0c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7016,8 +7016,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) } if (nested) - nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, - vmx_capability.ept); + memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); From 2fc4f15dac952b9bbc2695c9040f2c17be70dabb Mon Sep 17 00:00:00 2001 From: Yi Li Date: Fri, 11 Sep 2020 13:56:52 +0800 Subject: [PATCH 105/497] kvm/eventfd: move wildcard calculation outside loop There is no need to calculate wildcard in each iteration since wildcard is not changed. Signed-off-by: Yi Li Message-Id: <20200911055652.3041762-1-yili@winhong.com> Signed-off-by: Paolo Bonzini --- virt/kvm/eventfd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index d6408bb497dc..c2323c27a28b 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -853,15 +853,17 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, struct eventfd_ctx *eventfd; struct kvm_io_bus *bus; int ret = -ENOENT; + bool wildcard; eventfd = eventfd_ctx_fdget(args->fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); + wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); + mutex_lock(&kvm->slots_lock); list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { - bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); if (p->bus_idx != bus_idx || p->eventfd != eventfd || From ae6f24968608f71a51814d80ee8fcd38e799a0fd Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 19 Aug 2020 16:55:26 +0800 Subject: [PATCH 106/497] KVM: LAPIC: Fix updating DFR missing apic map recalculation The apic map recalculation is missing after updating DFR in the INIT reset path, when the vCPU is in x2apic mode and the local APIC was software enabled before. Fix it by introducing the function kvm_apic_set_dfr() and calling it in the INIT reset handling path.
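For illustration only (not part of the diff below), a minimal sketch of the pattern; struct example_apic and example_set_dfr() are hypothetical stand-ins for the real lapic code:

#include <linux/atomic.h>
#include <linux/types.h>

struct example_apic {
        u32 dfr;                /* the register being written */
        atomic_t map_dirty;     /* derived state that must be recalculated */
};

/* Every DFR write goes through one helper, so no call site (MMIO write,
 * INIT reset, ...) can update the register and forget to invalidate the
 * derived map. */
static void example_set_dfr(struct example_apic *apic, u32 val)
{
        apic->dfr = val;
        atomic_set(&apic->map_dirty, 1);
}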
Signed-off-by: Wanpeng Li Message-Id: <1597827327-25055-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 35cca2e0c802..43aab8a84d6d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } +static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) +{ + kvm_lapic_set_reg(apic, APIC_DFR, val); + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); +} + static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) { return ((id >> 4) << 16) | (1 << (id & 0xf)); @@ -1984,10 +1990,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_DFR: - if (!apic_x2apic_mode(apic)) { - kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); - } else + if (!apic_x2apic_mode(apic)) + kvm_apic_set_dfr(apic, val | 0x0FFFFFFF); + else ret = 1; break; @@ -2303,7 +2308,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); - kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU); + kvm_apic_set_dfr(apic, 0xffffffffU); apic_set_spiv(apic, 0xff); kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); if (!apic_x2apic_mode(apic)) From a970e9b216a28b81c0417ab978ff2e248880b86d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 10 Sep 2020 17:50:36 +0800 Subject: [PATCH 107/497] KVM: LAPIC: Return 0 when getting the tscdeadline timer if the lapic is hw disabled Return 0 when getting the tscdeadline timer if the lapic is hw disabled. Suggested-by: Paolo Bonzini Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li Message-Id: <1599731444-3525-2-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 43aab8a84d6d..5af7d03b0ecc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2188,8 +2188,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || - !apic_lvtt_tscdeadline(apic)) + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; From 275038332f22d15796f1921f672b024d5cb392bc Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 10 Sep 2020 17:50:37 +0800 Subject: [PATCH 108/497] KVM: LAPIC: Guarantee the timer is in tsc-deadline mode when setting Check apic_lvtt_tscdeadline() mode directly instead of apic_lvtt_oneshot() and apic_lvtt_period() to guarantee the timer is in tsc-deadline mode when wrmsr MSR_IA32_TSCDEADLINE. 
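For illustration only, a small sketch of the design choice; the example_* names are hypothetical:

enum example_timer_mode { EX_ONESHOT, EX_PERIODIC, EX_TSCDEADLINE };

/* A positive check for the one mode the caller depends on ... */
static bool example_can_use_tscdeadline(enum example_timer_mode mode)
{
        return mode == EX_TSCDEADLINE;
}

/* ... is safer than excluding the modes known today, because a newly
 * added mode would silently pass a "not oneshot and not periodic" test. */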
Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li Message-Id: <1599731444-3525-3-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5af7d03b0ecc..e446bdff70ec 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2198,8 +2198,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return; hrtimer_cancel(&apic->lapic_timer.timer); From 68ca7663c75b099539fe675fc5387dee44d3a6cc Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 10 Sep 2020 17:50:40 +0800 Subject: [PATCH 109/497] KVM: LAPIC: Narrow down the kick target vCPU The kick after setting KVM_REQ_PENDING_TIMER is used to handle the case where the timer fires on a different pCPU from the one the vCPU is running on. This kick costs about 1000 clock cycles and we don't need it when injecting an already-expired timer or when using the VMX preemption timer, because kvm_lapic_expired_hv_timer() is called from the target vCPU. Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li Message-Id: <1599731444-3525-6-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 +++- arch/x86/kvm/x86.c | 6 ------ arch/x86/kvm/x86.h | 1 - 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e446bdff70ec..3b32d3b094b5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1642,7 +1642,9 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } atomic_inc(&apic->lapic_timer.pending); - kvm_set_pending_timer(vcpu); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + if (from_timer_fn) + kvm_vcpu_kick(vcpu); } static void start_sw_tscdeadline(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1994602a0851..836baad47fe9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1774,12 +1774,6 @@ static s64 get_kvmclock_base_ns(void) } #endif -void kvm_set_pending_timer(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - kvm_vcpu_kick(vcpu); -} - static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { int version; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 995ab696dcf0..ea20b8bfd5dc 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -246,7 +246,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu); } -void kvm_set_pending_timer(struct kvm_vcpu *vcpu); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); From 010fd37fddf659ea7525b1191381e6f8c3ccab14 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 10 Sep 2020 17:50:41 +0800 Subject: [PATCH 110/497] KVM: LAPIC: Reduce world switch latency caused by timer_advance_ns All the checks in lapic_timer_int_injected() and __kvm_wait_lapic_expire(), as well as the function calls themselves, waste CPU cycles when the timer mode is not tscdeadline. We can observe ~1.3% world switch time overhead with kvm-unit-tests/vmexit.flat vmcall testing on an AMD server.
This patch reduces the world switch latency caused by the timer_advance_ns feature when the timer mode is not tscdeadline by simply moving the check against apic->lapic_timer.expired_tscdeadline much earlier. Signed-off-by: Wanpeng Li Message-Id: <1599731444-3525-7-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 11 +++++------ arch/x86/kvm/svm/svm.c | 4 +--- arch/x86/kvm/vmx/vmx.c | 4 +--- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3b32d3b094b5..51ed4f0b1390 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1582,9 +1582,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; - if (apic->lapic_timer.expired_tscdeadline == 0) - return; - tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); @@ -1599,7 +1596,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { - if (lapic_timer_int_injected(vcpu)) + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns && + lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); @@ -1635,8 +1635,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - if (apic->lapic_timer.timer_advance_ns) - __kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c91acabf18d0..a74426214a6d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3454,9 +3454,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xsave_state(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d4dfb7a23a0c..f4cfc18366de 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6800,9 +6800,7 @@ reenter_guest: if (enable_preemption_timer) vmx_update_hv_timer(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if From 1feaba144cd395fa0ff5143ba1dc33afddb7a743 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 27 Aug 2020 20:11:38 +0300 Subject: [PATCH 111/497] KVM: SVM: rename a variable in the svm_create_vcpu The 'page' is to hold the vcpu's vmcb, so name it as such to avoid confusion.
Signed-off-by: Maxim Levitsky Reviewed-by: Jim Mattson Message-Id: <20200827171145.374620-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a74426214a6d..d517bced3f84 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1171,7 +1171,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; - struct page *page; + struct page *vmcb_page; struct page *msrpm_pages; struct page *hsave_page; struct page *nested_msrpm_pages; @@ -1181,8 +1181,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); err = -ENOMEM; - page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!page) + vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT); + if (!vmcb_page) goto out; msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); @@ -1216,9 +1216,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->nested.msrpm = page_address(nested_msrpm_pages); svm_vcpu_init_msrpm(svm->nested.msrpm); - svm->vmcb = page_address(page); + svm->vmcb = page_address(vmcb_page); clear_page(svm->vmcb); - svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT); + svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT); svm->asid_generation = 0; init_vmcb(svm); @@ -1234,7 +1234,7 @@ free_page3: free_page2: __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); free_page1: - __free_page(page); + __free_page(vmcb_page); out: return err; } From 0dd16b5b0c9bd62fbff3ea375cdd5125e19317c6 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 27 Aug 2020 20:11:39 +0300 Subject: [PATCH 112/497] KVM: nSVM: rename nested vmcb to vmcb12 This is to be more consistent with VMX, and to support the upcoming addition of vmcb02. Hopefully no functional changes.
Signed-off-by: Maxim Levitsky Message-Id: <20200827171145.374620-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 220 +++++++++++++++++++------------------- arch/x86/kvm/svm/svm.c | 10 +- arch/x86/kvm/svm/svm.h | 2 +- 3 files changed, 115 insertions(+), 117 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index afe5cb3f87a8..e73bdd44b24a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -215,41 +215,39 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) return true; } -static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb) +static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) { - bool nested_vmcb_lma; - if ((vmcb->save.efer & EFER_SVME) == 0) + bool vmcb12_lma; + + if ((vmcb12->save.efer & EFER_SVME) == 0) return false; - if (((vmcb->save.cr0 & X86_CR0_CD) == 0) && - (vmcb->save.cr0 & X86_CR0_NW)) + if (((vmcb12->save.cr0 & X86_CR0_CD) == 0) && (vmcb12->save.cr0 & X86_CR0_NW)) return false; - if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7)) + if (!kvm_dr6_valid(vmcb12->save.dr6) || !kvm_dr7_valid(vmcb12->save.dr7)) return false; - nested_vmcb_lma = - (vmcb->save.efer & EFER_LME) && - (vmcb->save.cr0 & X86_CR0_PG); + vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG); - if (!nested_vmcb_lma) { - if (vmcb->save.cr4 & X86_CR4_PAE) { - if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) + if (!vmcb12_lma) { + if (vmcb12->save.cr4 & X86_CR4_PAE) { + if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) return false; } else { - if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) + if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) return false; } } else { - if (!(vmcb->save.cr4 & X86_CR4_PAE) || - !(vmcb->save.cr0 & X86_CR0_PE) || - (vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK)) + if (!(vmcb12->save.cr4 & X86_CR4_PAE) || + !(vmcb12->save.cr0 & X86_CR0_PE) || + (vmcb12->save.cr3 & MSR_CR3_LONG_RESERVED_MASK)) return false; } - if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4)) + if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) return false; - return nested_vmcb_check_controls(&vmcb->control); + return nested_vmcb_check_controls(&vmcb12->control); } static void load_nested_vmcb_control(struct vcpu_svm *svm, @@ -296,7 +294,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm) * EXIT_INT_INFO. 
*/ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, - struct vmcb *nested_vmcb) + struct vmcb *vmcb12) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 exit_int_info = 0; @@ -308,7 +306,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, if (vcpu->arch.exception.has_error_code) { exit_int_info |= SVM_EVTINJ_VALID_ERR; - nested_vmcb->control.exit_int_info_err = + vmcb12->control.exit_int_info_err = vcpu->arch.exception.error_code; } @@ -325,7 +323,7 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, exit_int_info |= SVM_EVTINJ_TYPE_INTR; } - nested_vmcb->control.exit_int_info = exit_int_info; + vmcb12->control.exit_int_info = exit_int_info; } static inline bool nested_npt_enabled(struct vcpu_svm *svm) @@ -364,31 +362,31 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return 0; } -static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb) +static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { /* Load the nested guest state */ - svm->vmcb->save.es = nested_vmcb->save.es; - svm->vmcb->save.cs = nested_vmcb->save.cs; - svm->vmcb->save.ss = nested_vmcb->save.ss; - svm->vmcb->save.ds = nested_vmcb->save.ds; - svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; - svm->vmcb->save.idtr = nested_vmcb->save.idtr; - kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); - svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); - svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); - svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); - svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; - kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); - kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp); - kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip); + svm->vmcb->save.es = vmcb12->save.es; + svm->vmcb->save.cs = vmcb12->save.cs; + svm->vmcb->save.ss = vmcb12->save.ss; + svm->vmcb->save.ds = vmcb12->save.ds; + svm->vmcb->save.gdtr = vmcb12->save.gdtr; + svm->vmcb->save.idtr = vmcb12->save.idtr; + kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags); + svm_set_efer(&svm->vcpu, vmcb12->save.efer); + svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); + svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); + svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2; + kvm_rax_write(&svm->vcpu, vmcb12->save.rax); + kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp); + kvm_rip_write(&svm->vcpu, vmcb12->save.rip); /* In case we don't even reach vcpu_run, the fields are not updated */ - svm->vmcb->save.rax = nested_vmcb->save.rax; - svm->vmcb->save.rsp = nested_vmcb->save.rsp; - svm->vmcb->save.rip = nested_vmcb->save.rip; - svm->vmcb->save.dr7 = nested_vmcb->save.dr7; - svm->vcpu.arch.dr6 = nested_vmcb->save.dr6; - svm->vmcb->save.cpl = nested_vmcb->save.cpl; + svm->vmcb->save.rax = vmcb12->save.rax; + svm->vmcb->save.rsp = vmcb12->save.rsp; + svm->vmcb->save.rip = vmcb12->save.rip; + svm->vmcb->save.dr7 = vmcb12->save.dr7; + svm->vcpu.arch.dr6 = vmcb12->save.dr6; + svm->vmcb->save.cpl = vmcb12->save.cpl; } static void nested_prepare_vmcb_control(struct vcpu_svm *svm) @@ -426,17 +424,17 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) vmcb_mark_all_dirty(svm->vmcb); } -int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb) +int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, + struct vmcb *vmcb12) { int ret; - svm->nested.vmcb = vmcb_gpa; - load_nested_vmcb_control(svm, &nested_vmcb->control); - nested_prepare_vmcb_save(svm, nested_vmcb); + svm->nested.vmcb12_gpa = 
vmcb12_gpa; + load_nested_vmcb_control(svm, &vmcb12->control); + nested_prepare_vmcb_save(svm, vmcb12); nested_prepare_vmcb_control(svm); - ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3, + ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, nested_npt_enabled(svm)); if (ret) return ret; @@ -449,19 +447,19 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, int nested_svm_vmrun(struct vcpu_svm *svm) { int ret; - struct vmcb *nested_vmcb; + struct vmcb *vmcb12; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; - u64 vmcb_gpa; + u64 vmcb12_gpa; if (is_smm(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } - vmcb_gpa = svm->vmcb->save.rax; - ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); + vmcb12_gpa = svm->vmcb->save.rax; + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb12_gpa), &map); if (ret == -EINVAL) { kvm_inject_gp(&svm->vcpu, 0); return 1; @@ -471,26 +469,26 @@ int nested_svm_vmrun(struct vcpu_svm *svm) ret = kvm_skip_emulated_instruction(&svm->vcpu); - nested_vmcb = map.hva; + vmcb12 = map.hva; - if (!nested_vmcb_checks(svm, nested_vmcb)) { - nested_vmcb->control.exit_code = SVM_EXIT_ERR; - nested_vmcb->control.exit_code_hi = 0; - nested_vmcb->control.exit_info_1 = 0; - nested_vmcb->control.exit_info_2 = 0; + if (!nested_vmcb_checks(svm, vmcb12)) { + vmcb12->control.exit_code = SVM_EXIT_ERR; + vmcb12->control.exit_code_hi = 0; + vmcb12->control.exit_info_1 = 0; + vmcb12->control.exit_info_2 = 0; goto out; } - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, - nested_vmcb->save.rip, - nested_vmcb->control.int_ctl, - nested_vmcb->control.event_inj, - nested_vmcb->control.nested_ctl); + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, + vmcb12->save.rip, + vmcb12->control.int_ctl, + vmcb12->control.event_inj, + vmcb12->control.nested_ctl); - trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, - nested_vmcb->control.intercept_cr >> 16, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept); + trace_kvm_nested_intercepts(vmcb12->control.intercept_cr & 0xffff, + vmcb12->control.intercept_cr >> 16, + vmcb12->control.intercept_exceptions, + vmcb12->control.intercept); /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); @@ -522,7 +520,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm) svm->nested.nested_run_pending = 1; - if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb)) + if (enter_svm_guest_mode(svm, vmcb12_gpa, vmcb12)) goto out_exit_err; if (nested_svm_vmrun_msrpm(svm)) @@ -563,23 +561,23 @@ void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) int nested_svm_vmexit(struct vcpu_svm *svm) { int rc; - struct vmcb *nested_vmcb; + struct vmcb *vmcb12; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map); + rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); if (rc) { if (rc == -EINVAL) kvm_inject_gp(&svm->vcpu, 0); return 1; } - nested_vmcb = map.hva; + vmcb12 = map.hva; /* Exit Guest-Mode */ leave_guest_mode(&svm->vcpu); - svm->nested.vmcb = 0; + svm->nested.vmcb12_gpa = 0; WARN_ON_ONCE(svm->nested.nested_run_pending); /* in case we halted in L2 */ @@ -587,45 +585,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* Give the current vmcb to the guest */ - nested_vmcb->save.es = vmcb->save.es; - nested_vmcb->save.cs = vmcb->save.cs; - nested_vmcb->save.ss = 
vmcb->save.ss; - nested_vmcb->save.ds = vmcb->save.ds; - nested_vmcb->save.gdtr = vmcb->save.gdtr; - nested_vmcb->save.idtr = vmcb->save.idtr; - nested_vmcb->save.efer = svm->vcpu.arch.efer; - nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); - nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); - nested_vmcb->save.cr2 = vmcb->save.cr2; - nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; - nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); - nested_vmcb->save.rip = kvm_rip_read(&svm->vcpu); - nested_vmcb->save.rsp = kvm_rsp_read(&svm->vcpu); - nested_vmcb->save.rax = kvm_rax_read(&svm->vcpu); - nested_vmcb->save.dr7 = vmcb->save.dr7; - nested_vmcb->save.dr6 = svm->vcpu.arch.dr6; - nested_vmcb->save.cpl = vmcb->save.cpl; + vmcb12->save.es = vmcb->save.es; + vmcb12->save.cs = vmcb->save.cs; + vmcb12->save.ss = vmcb->save.ss; + vmcb12->save.ds = vmcb->save.ds; + vmcb12->save.gdtr = vmcb->save.gdtr; + vmcb12->save.idtr = vmcb->save.idtr; + vmcb12->save.efer = svm->vcpu.arch.efer; + vmcb12->save.cr0 = kvm_read_cr0(&svm->vcpu); + vmcb12->save.cr3 = kvm_read_cr3(&svm->vcpu); + vmcb12->save.cr2 = vmcb->save.cr2; + vmcb12->save.cr4 = svm->vcpu.arch.cr4; + vmcb12->save.rflags = kvm_get_rflags(&svm->vcpu); + vmcb12->save.rip = kvm_rip_read(&svm->vcpu); + vmcb12->save.rsp = kvm_rsp_read(&svm->vcpu); + vmcb12->save.rax = kvm_rax_read(&svm->vcpu); + vmcb12->save.dr7 = vmcb->save.dr7; + vmcb12->save.dr6 = svm->vcpu.arch.dr6; + vmcb12->save.cpl = vmcb->save.cpl; - nested_vmcb->control.int_state = vmcb->control.int_state; - nested_vmcb->control.exit_code = vmcb->control.exit_code; - nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; - nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; - nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; + vmcb12->control.int_state = vmcb->control.int_state; + vmcb12->control.exit_code = vmcb->control.exit_code; + vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi; + vmcb12->control.exit_info_1 = vmcb->control.exit_info_1; + vmcb12->control.exit_info_2 = vmcb->control.exit_info_2; - if (nested_vmcb->control.exit_code != SVM_EXIT_ERR) - nested_vmcb_save_pending_event(svm, nested_vmcb); + if (vmcb12->control.exit_code != SVM_EXIT_ERR) + nested_vmcb_save_pending_event(svm, vmcb12); if (svm->nrips_enabled) - nested_vmcb->control.next_rip = vmcb->control.next_rip; + vmcb12->control.next_rip = vmcb->control.next_rip; - nested_vmcb->control.int_ctl = svm->nested.ctl.int_ctl; - nested_vmcb->control.tlb_ctl = svm->nested.ctl.tlb_ctl; - nested_vmcb->control.event_inj = svm->nested.ctl.event_inj; - nested_vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; + vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; + vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl; + vmcb12->control.event_inj = svm->nested.ctl.event_inj; + vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; - nested_vmcb->control.pause_filter_count = + vmcb12->control.pause_filter_count = svm->vmcb->control.pause_filter_count; - nested_vmcb->control.pause_filter_thresh = + vmcb12->control.pause_filter_thresh = svm->vmcb->control.pause_filter_thresh; /* Restore the original control entries */ @@ -659,11 +657,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_all_dirty(svm->vmcb); - trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code, - nested_vmcb->control.exit_info_1, - nested_vmcb->control.exit_info_2, - nested_vmcb->control.exit_int_info, - nested_vmcb->control.exit_int_info_err, + trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code, + 
vmcb12->control.exit_info_1, + vmcb12->control.exit_info_2, + vmcb12->control.exit_int_info, + vmcb12->control.exit_int_info_err, KVM_ISA_SVM); kvm_vcpu_unmap(&svm->vcpu, &map, true); @@ -1020,7 +1018,7 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, /* First fill in the header and copy it out. */ if (is_guest_mode(vcpu)) { - kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb; + kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa; kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE; kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; @@ -1144,7 +1142,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); hsave->save = *save; - svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa; + svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; load_nested_vmcb_control(svm, ctl); nested_prepare_vmcb_control(svm); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d517bced3f84..bd3515f25ce2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1102,7 +1102,7 @@ static void init_vmcb(struct vcpu_svm *svm) } svm->asid_generation = 0; - svm->nested.vmcb = 0; + svm->nested.vmcb12_gpa = 0; svm->vcpu.arch.hflags = 0; if (!kvm_pause_in_guest(svm->vcpu.kvm)) { @@ -3881,7 +3881,7 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) /* FED8h - SVM Guest */ put_smstate(u64, smstate, 0x7ed8, 1); /* FEE0h - SVM Guest VMCB Physical Address */ - put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb); + put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -3903,7 +3903,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8); - u64 vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); + u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); if (guest) { if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) @@ -3913,10 +3913,10 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) return 1; if (kvm_vcpu_map(&svm->vcpu, - gpa_to_gfn(vmcb), &map) == -EINVAL) + gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) return 1; - ret = enter_svm_guest_mode(svm, vmcb, map.hva); + ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva); kvm_vcpu_unmap(&svm->vcpu, &map, true); } } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a798e1731709..ab913468f9cb 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -85,7 +85,7 @@ struct svm_nested_state { struct vmcb *hsave; u64 hsave_msr; u64 vm_cr_msr; - u64 vmcb; + u64 vmcb12_gpa; u32 host_intercept_exceptions; /* These are the merged vectors */ From f4c847a95654b898834ed513870a4fb01c27b29e Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 27 Aug 2020 20:11:40 +0300 Subject: [PATCH 113/497] KVM: SVM: refactor msr permission bitmap allocation Replace svm_vcpu_init_msrpm with svm_vcpu_alloc_msrpm, that also allocates the msr bitmap and add svm_vcpu_free_msrpm to free it. 
This will be used later to move the nested msr permission bitmap allocation to nested.c Signed-off-by: Maxim Levitsky Message-Id: <20200827171145.374620-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 45 +++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bd3515f25ce2..535839cdcb56 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -609,18 +609,29 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, msrpm[offset] = tmp; } -static void svm_vcpu_init_msrpm(u32 *msrpm) +static u32 *svm_vcpu_init_msrpm(void) { int i; + u32 *msrpm; + struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); + if (!pages) + return NULL; + + msrpm = page_address(pages); memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { if (!direct_access_msrs[i].always) continue; - set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); } + return msrpm; +} + +static void svm_vcpu_free_msrpm(u32 *msrpm) +{ + __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER); } static void add_msr_offset(u32 offset) @@ -1172,9 +1183,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *vmcb_page; - struct page *msrpm_pages; struct page *hsave_page; - struct page *nested_msrpm_pages; int err; BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); @@ -1185,21 +1194,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (!vmcb_page) goto out; - msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); - if (!msrpm_pages) - goto free_page1; - - nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); - if (!nested_msrpm_pages) - goto free_page2; - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!hsave_page) - goto free_page3; + goto free_page1; err = avic_init_vcpu(svm); if (err) - goto free_page4; + goto free_page2; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. @@ -1210,11 +1211,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->nested.hsave = page_address(hsave_page); clear_page(svm->nested.hsave); - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + svm->msrpm = svm_vcpu_init_msrpm(); + if (!svm->msrpm) + goto free_page2; - svm->nested.msrpm = page_address(nested_msrpm_pages); - svm_vcpu_init_msrpm(svm->nested.msrpm); + svm->nested.msrpm = svm_vcpu_init_msrpm(); + if (!svm->nested.msrpm) + goto free_page3; svm->vmcb = page_address(vmcb_page); clear_page(svm->vmcb); @@ -1227,12 +1230,10 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) return 0; -free_page4: - __free_page(hsave_page); free_page3: - __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); + svm_vcpu_free_msrpm(svm->msrpm); free_page2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); + __free_page(hsave_page); free_page1: __free_page(vmcb_page); out: From 0681de1b83697bcbc06b843d0c8302da45855205 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 27 Aug 2020 20:11:41 +0300 Subject: [PATCH 114/497] KVM: SVM: use __GFP_ZERO instead of clear_page Another small refactoring. 
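For illustration only (not part of the diff below), a minimal sketch of the allocation pattern; example_alloc_zeroed_page() is a hypothetical helper:

#include <linux/gfp.h>
#include <linux/mm.h>

static void *example_alloc_zeroed_page(void)
{
        struct page *page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);

        if (!page)
                return NULL;

        /* The page arrives already zeroed, so no clear_page() is needed. */
        return page_address(page);
}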
Suggested-by: Jim Mattson Signed-off-by: Maxim Levitsky Message-Id: <20200827171145.374620-5-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 535839cdcb56..22886b1d9af7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1190,11 +1190,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); err = -ENOMEM; - vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT); + vmcb_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!vmcb_page) goto out; - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT); + hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!hsave_page) goto free_page1; @@ -1209,7 +1209,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->avic_is_running = true; svm->nested.hsave = page_address(hsave_page); - clear_page(svm->nested.hsave); svm->msrpm = svm_vcpu_init_msrpm(); if (!svm->msrpm) @@ -1220,7 +1219,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) goto free_page3; svm->vmcb = page_address(vmcb_page); - clear_page(svm->vmcb); svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT); svm->asid_generation = 0; init_vmcb(svm); From 8d22b90e942c26d33dcccaef398cbf7f9b4e39d6 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 27 Aug 2020 20:11:42 +0300 Subject: [PATCH 115/497] KVM: SVM: refactor exit labels in svm_create_vcpu Kernel coding style suggests not to use labels like error1,error2 Suggested-by: Jim Mattson Signed-off-by: Maxim Levitsky Message-Id: <20200827171145.374620-6-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 22886b1d9af7..281e93b47a07 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1196,11 +1196,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!hsave_page) - goto free_page1; + goto error_free_vmcb_page; err = avic_init_vcpu(svm); if (err) - goto free_page2; + goto error_free_hsave_page; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. @@ -1212,11 +1212,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->msrpm = svm_vcpu_init_msrpm(); if (!svm->msrpm) - goto free_page2; + goto error_free_hsave_page; svm->nested.msrpm = svm_vcpu_init_msrpm(); if (!svm->nested.msrpm) - goto free_page3; + goto error_free_msrpm; svm->vmcb = page_address(vmcb_page); svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT); @@ -1228,11 +1228,11 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) return 0; -free_page3: +error_free_msrpm: svm_vcpu_free_msrpm(svm->msrpm); -free_page2: +error_free_hsave_page: __free_page(hsave_page); -free_page1: +error_free_vmcb_page: __free_page(vmcb_page); out: return err; From a90c1ed9f11dbc3d37664a8561e50e3f3695c539 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 11 Sep 2020 14:28:42 -0500 Subject: [PATCH 116/497] KVM: nSVM: Remove unused field host_intercept_exceptions is not used anywhere. Clean it up. 
Signed-off-by: Babu Moger Reviewed-by: Jim Mattson Message-Id: <159985252277.11252.8819848322175521354.stgit@bmoger-ubuntu> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 2 -- arch/x86/kvm/svm/svm.h | 1 - 2 files changed, 3 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index e73bdd44b24a..c10a9a80022e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -108,8 +108,6 @@ void recalc_intercepts(struct vcpu_svm *svm) h = &svm->nested.hsave->control; g = &svm->nested.ctl; - svm->nested.host_intercept_exceptions = h->intercept_exceptions; - c->intercept_cr = h->intercept_cr; c->intercept_dr = h->intercept_dr; c->intercept_exceptions = h->intercept_exceptions; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index ab913468f9cb..e3aca71b24ab 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -86,7 +86,6 @@ struct svm_nested_state { u64 hsave_msr; u64 vm_cr_msr; u64 vmcb12_gpa; - u32 host_intercept_exceptions; /* These are the merged vectors */ u32 *msrpm; From c45ad7229d139ad48e894a271f8df6975e53d12e Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 11 Sep 2020 14:27:58 -0500 Subject: [PATCH 117/497] KVM: SVM: Introduce vmcb_(set_intercept/clr_intercept/_is_intercept) This is in preparation for the future intercept vector additions. Add new functions vmcb_set_intercept, vmcb_clr_intercept and vmcb_is_intercept using the kernel APIs __set_bit, __clear_bit and test_bit respectively. Signed-off-by: Babu Moger Reviewed-by: Jim Mattson Message-Id: <159985247876.11252.16039238014239824460.stgit@bmoger-ubuntu> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 9 +++++++++ arch/x86/kvm/svm/nested.c | 12 ++++++++++++ arch/x86/kvm/svm/svm.h | 18 ++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index cf13f9e78585..debe71f0226c 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -4,6 +4,14 @@ #include +/* + * 32-bit intercept words in the VMCB Control Area, starting + * at Byte offset 000h.
+ */ + +enum intercept_words { + MAX_INTERCEPT, +}; enum { INTERCEPT_INTR, @@ -57,6 +65,7 @@ enum { struct __attribute__ ((__packed__)) vmcb_control_area { + u32 intercepts[MAX_INTERCEPT]; u32 intercept_cr; u32 intercept_dr; u32 intercept_exceptions; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index c10a9a80022e..69dfd7c1a5b7 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -98,6 +98,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h, *g; + unsigned int i; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -108,6 +109,9 @@ void recalc_intercepts(struct vcpu_svm *svm) h = &svm->nested.hsave->control; g = &svm->nested.ctl; + for (i = 0; i < MAX_INTERCEPT; i++) + c->intercepts[i] = h->intercepts[i]; + c->intercept_cr = h->intercept_cr; c->intercept_dr = h->intercept_dr; c->intercept_exceptions = h->intercept_exceptions; @@ -129,6 +133,9 @@ void recalc_intercepts(struct vcpu_svm *svm) /* We don't want to see VMMCALLs from a nested guest */ c->intercept &= ~(1ULL << INTERCEPT_VMMCALL); + for (i = 0; i < MAX_INTERCEPT; i++) + c->intercepts[i] |= g->intercepts[i]; + c->intercept_cr |= g->intercept_cr; c->intercept_dr |= g->intercept_dr; c->intercept_exceptions |= g->intercept_exceptions; @@ -138,6 +145,11 @@ void recalc_intercepts(struct vcpu_svm *svm) static void copy_vmcb_control_area(struct vmcb_control_area *dst, struct vmcb_control_area *from) { + unsigned int i; + + for (i = 0; i < MAX_INTERCEPT; i++) + dst->intercepts[i] = from->intercepts[i]; + dst->intercept_cr = from->intercept_cr; dst->intercept_dr = from->intercept_dr; dst->intercept_exceptions = from->intercept_exceptions; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e3aca71b24ab..83202c40857e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -213,6 +213,24 @@ static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) return svm->vmcb; } +static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) +{ + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + __set_bit(bit, (unsigned long *)&control->intercepts); +} + +static inline void vmcb_clr_intercept(struct vmcb_control_area *control, u32 bit) +{ + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + __clear_bit(bit, (unsigned long *)&control->intercepts); +} + +static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) +{ + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + return test_bit(bit, (unsigned long *)&control->intercepts); +} + static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); From 03bfeeb988a970995479eb6d108c398027ab7525 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 11 Sep 2020 14:28:05 -0500 Subject: [PATCH 118/497] KVM: SVM: Change intercept_cr to generic intercepts Change intercept_cr to generic intercepts in vmcb_control_area. Use the new vmcb_set_intercept, vmcb_clr_intercept and vmcb_is_intercept where applicable. Signed-off-by: Babu Moger Reviewed-by: Jim Mattson Message-Id: <159985248506.11252.9081085950784508671.stgit@bmoger-ubuntu> [Change constant names. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 23 +++++++++++++---------- arch/x86/kvm/svm/nested.c | 14 +++++--------- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/svm/svm.h | 6 +++--- 4 files changed, 23 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index debe71f0226c..20b63418ae36 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -10,9 +10,22 @@ */ enum intercept_words { + INTERCEPT_CR = 0, MAX_INTERCEPT, }; +enum { + /* Byte offset 000h (word 0) */ + INTERCEPT_CR0_READ = 0, + INTERCEPT_CR3_READ = 3, + INTERCEPT_CR4_READ = 4, + INTERCEPT_CR8_READ = 8, + INTERCEPT_CR0_WRITE = 16, + INTERCEPT_CR3_WRITE = 16 + 3, + INTERCEPT_CR4_WRITE = 16 + 4, + INTERCEPT_CR8_WRITE = 16 + 8, +}; + enum { INTERCEPT_INTR, INTERCEPT_NMI, @@ -66,7 +79,6 @@ enum { struct __attribute__ ((__packed__)) vmcb_control_area { u32 intercepts[MAX_INTERCEPT]; - u32 intercept_cr; u32 intercept_dr; u32 intercept_exceptions; u64 intercept; @@ -296,15 +308,6 @@ struct vmcb { #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK #define SVM_SELECTOR_CODE_MASK (1 << 3) -#define INTERCEPT_CR0_READ 0 -#define INTERCEPT_CR3_READ 3 -#define INTERCEPT_CR4_READ 4 -#define INTERCEPT_CR8_READ 8 -#define INTERCEPT_CR0_WRITE (16 + 0) -#define INTERCEPT_CR3_WRITE (16 + 3) -#define INTERCEPT_CR4_WRITE (16 + 4) -#define INTERCEPT_CR8_WRITE (16 + 8) - #define INTERCEPT_DR0_READ 0 #define INTERCEPT_DR1_READ 1 #define INTERCEPT_DR2_READ 2 diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 69dfd7c1a5b7..4a7fcc6d312c 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -112,15 +112,14 @@ void recalc_intercepts(struct vcpu_svm *svm) for (i = 0; i < MAX_INTERCEPT; i++) c->intercepts[i] = h->intercepts[i]; - c->intercept_cr = h->intercept_cr; c->intercept_dr = h->intercept_dr; c->intercept_exceptions = h->intercept_exceptions; c->intercept = h->intercept; if (g->int_ctl & V_INTR_MASKING_MASK) { /* We only want the cr8 intercept bits of L1 */ - c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ); - c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE); + vmcb_clr_intercept(c, INTERCEPT_CR8_READ); + vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); /* * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not @@ -136,7 +135,6 @@ void recalc_intercepts(struct vcpu_svm *svm) for (i = 0; i < MAX_INTERCEPT; i++) c->intercepts[i] |= g->intercepts[i]; - c->intercept_cr |= g->intercept_cr; c->intercept_dr |= g->intercept_dr; c->intercept_exceptions |= g->intercept_exceptions; c->intercept |= g->intercept; @@ -150,7 +148,6 @@ static void copy_vmcb_control_area(struct vmcb_control_area *dst, for (i = 0; i < MAX_INTERCEPT; i++) dst->intercepts[i] = from->intercepts[i]; - dst->intercept_cr = from->intercept_cr; dst->intercept_dr = from->intercept_dr; dst->intercept_exceptions = from->intercept_exceptions; dst->intercept = from->intercept; @@ -495,8 +492,8 @@ int nested_svm_vmrun(struct vcpu_svm *svm) vmcb12->control.event_inj, vmcb12->control.nested_ctl); - trace_kvm_nested_intercepts(vmcb12->control.intercept_cr & 0xffff, - vmcb12->control.intercept_cr >> 16, + trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, + vmcb12->control.intercepts[INTERCEPT_CR] >> 16, vmcb12->control.intercept_exceptions, vmcb12->control.intercept); @@ -775,8 +772,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) vmexit = nested_svm_intercept_ioio(svm); break; case SVM_EXIT_READ_CR0 ... 
SVM_EXIT_WRITE_CR8: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); - if (svm->nested.ctl.intercept_cr & bit) + if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 281e93b47a07..98ba4fa07f29 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2812,8 +2812,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) } pr_err("VMCB Control Area:\n"); - pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); - pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); + pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); + pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 83202c40857e..92938596bd3e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -235,7 +235,7 @@ static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_cr |= (1U << bit); + vmcb_set_intercept(&vmcb->control, bit); recalc_intercepts(svm); } @@ -244,7 +244,7 @@ static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_cr &= ~(1U << bit); + vmcb_clr_intercept(&vmcb->control, bit); recalc_intercepts(svm); } @@ -253,7 +253,7 @@ static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); - return vmcb->control.intercept_cr & (1U << bit); + return vmcb_is_intercept(&vmcb->control, bit); } static inline void set_dr_intercepts(struct vcpu_svm *svm) From 30abaa88382ce078cfc2ecebb61d9e0540fef24d Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 11 Sep 2020 14:28:12 -0500 Subject: [PATCH 119/497] KVM: SVM: Change intercept_dr to generic intercepts Modify intercept_dr to generic intercepts in vmcb_control_area. Use the generic vmcb_set_intercept, vmcb_clr_intercept and vmcb_is_intercept to set/clear/test the intercept_dr bits. 
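For illustration only, a simplified sketch of the generic bit-array layout this series converges on; struct example_control and the example_* helpers are hypothetical, reduced versions of the vmcb helpers:

struct example_control {
        u32 intercepts[4];      /* one u32 per intercept word */
};

static void example_set_intercept(struct example_control *c, unsigned int bit)
{
        c->intercepts[bit / 32] |= 1u << (bit % 32);
}

static bool example_is_intercept(const struct example_control *c, unsigned int bit)
{
        return c->intercepts[bit / 32] & (1u << (bit % 32));
}

/* e.g. INTERCEPT_DR0_WRITE (bit 48) lands in word 1, bit 16, so the DR
 * intercepts need no dedicated field once they live in the array. */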
Signed-off-by: Babu Moger Reviewed-by: Jim Mattson Message-Id: <159985249255.11252.10000868032136333355.stgit@bmoger-ubuntu> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 36 ++++++++++++++++++------------------ arch/x86/kvm/svm/nested.c | 6 +----- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/svm/svm.h | 34 +++++++++++++++++----------------- 4 files changed, 38 insertions(+), 42 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 20b63418ae36..80a4db25e424 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -11,6 +11,7 @@ enum intercept_words { INTERCEPT_CR = 0, + INTERCEPT_DR, MAX_INTERCEPT, }; @@ -24,6 +25,23 @@ enum { INTERCEPT_CR3_WRITE = 16 + 3, INTERCEPT_CR4_WRITE = 16 + 4, INTERCEPT_CR8_WRITE = 16 + 8, + /* Byte offset 004h (word 1) */ + INTERCEPT_DR0_READ = 32, + INTERCEPT_DR1_READ, + INTERCEPT_DR2_READ, + INTERCEPT_DR3_READ, + INTERCEPT_DR4_READ, + INTERCEPT_DR5_READ, + INTERCEPT_DR6_READ, + INTERCEPT_DR7_READ, + INTERCEPT_DR0_WRITE = 48, + INTERCEPT_DR1_WRITE, + INTERCEPT_DR2_WRITE, + INTERCEPT_DR3_WRITE, + INTERCEPT_DR4_WRITE, + INTERCEPT_DR5_WRITE, + INTERCEPT_DR6_WRITE, + INTERCEPT_DR7_WRITE, }; enum { @@ -79,7 +97,6 @@ enum { struct __attribute__ ((__packed__)) vmcb_control_area { u32 intercepts[MAX_INTERCEPT]; - u32 intercept_dr; u32 intercept_exceptions; u64 intercept; u8 reserved_1[40]; @@ -308,23 +325,6 @@ struct vmcb { #define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK #define SVM_SELECTOR_CODE_MASK (1 << 3) -#define INTERCEPT_DR0_READ 0 -#define INTERCEPT_DR1_READ 1 -#define INTERCEPT_DR2_READ 2 -#define INTERCEPT_DR3_READ 3 -#define INTERCEPT_DR4_READ 4 -#define INTERCEPT_DR5_READ 5 -#define INTERCEPT_DR6_READ 6 -#define INTERCEPT_DR7_READ 7 -#define INTERCEPT_DR0_WRITE (16 + 0) -#define INTERCEPT_DR1_WRITE (16 + 1) -#define INTERCEPT_DR2_WRITE (16 + 2) -#define INTERCEPT_DR3_WRITE (16 + 3) -#define INTERCEPT_DR4_WRITE (16 + 4) -#define INTERCEPT_DR5_WRITE (16 + 5) -#define INTERCEPT_DR6_WRITE (16 + 6) -#define INTERCEPT_DR7_WRITE (16 + 7) - #define SVM_EVTINJ_VEC_MASK 0xff #define SVM_EVTINJ_TYPE_SHIFT 8 diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 4a7fcc6d312c..012ab2255b3c 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -112,7 +112,6 @@ void recalc_intercepts(struct vcpu_svm *svm) for (i = 0; i < MAX_INTERCEPT; i++) c->intercepts[i] = h->intercepts[i]; - c->intercept_dr = h->intercept_dr; c->intercept_exceptions = h->intercept_exceptions; c->intercept = h->intercept; @@ -135,7 +134,6 @@ void recalc_intercepts(struct vcpu_svm *svm) for (i = 0; i < MAX_INTERCEPT; i++) c->intercepts[i] |= g->intercepts[i]; - c->intercept_dr |= g->intercept_dr; c->intercept_exceptions |= g->intercept_exceptions; c->intercept |= g->intercept; } @@ -148,7 +146,6 @@ static void copy_vmcb_control_area(struct vmcb_control_area *dst, for (i = 0; i < MAX_INTERCEPT; i++) dst->intercepts[i] = from->intercepts[i]; - dst->intercept_dr = from->intercept_dr; dst->intercept_exceptions = from->intercept_exceptions; dst->intercept = from->intercept; dst->iopm_base_pa = from->iopm_base_pa; @@ -777,8 +774,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; } case SVM_EXIT_READ_DR0 ... 
SVM_EXIT_WRITE_DR7: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); - if (svm->nested.ctl.intercept_dr & bit) + if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 98ba4fa07f29..07a0804d4fe7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2814,8 +2814,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); - pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); - pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); + pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff); + pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16); pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); pr_err("%-20s%016llx\n", "intercepts:", control->intercept); pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 92938596bd3e..2f54829c9ad6 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -260,22 +260,22 @@ static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ) - | (1 << INTERCEPT_DR1_READ) - | (1 << INTERCEPT_DR2_READ) - | (1 << INTERCEPT_DR3_READ) - | (1 << INTERCEPT_DR4_READ) - | (1 << INTERCEPT_DR5_READ) - | (1 << INTERCEPT_DR6_READ) - | (1 << INTERCEPT_DR7_READ) - | (1 << INTERCEPT_DR0_WRITE) - | (1 << INTERCEPT_DR1_WRITE) - | (1 << INTERCEPT_DR2_WRITE) - | (1 << INTERCEPT_DR3_WRITE) - | (1 << INTERCEPT_DR4_WRITE) - | (1 << INTERCEPT_DR5_WRITE) - | (1 << INTERCEPT_DR6_WRITE) - | (1 << INTERCEPT_DR7_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); recalc_intercepts(svm); } @@ -284,7 +284,7 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_dr = 0; + vmcb->control.intercepts[INTERCEPT_DR] = 0; recalc_intercepts(svm); } From 9780d51dc2af1c02bed9687822ba0d6df955c302 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 11 Sep 2020 14:28:20 -0500 Subject: [PATCH 120/497] KVM: SVM: Modify intercept_exceptions to generic intercepts Modify intercept_exceptions to generic intercepts in vmcb_control_area. Use the generic vmcb_set_intercept, vmcb_clr_intercept and vmcb_is_intercept to set/clear/test the intercept_exceptions bits. 
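For illustration (not part of the diff below): an exception vector now maps to bit INTERCEPT_EXCEPTION_OFFSET + vector of the same bitmap, so intercepting #PF (vector 14) amounts to:

    /* sketch: #PF becomes bit 14 of intercepts[INTERCEPT_EXCEPTION] */
    set_exception_intercept(svm, PF_VECTOR);
    /* ...which reduces to... */
    vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR);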
Signed-off-by: Babu Moger Reviewed-by: Jim Mattson Message-Id: <159985250037.11252.1361972528657052410.stgit@bmoger-ubuntu> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 5 ++++- arch/x86/kvm/svm/nested.c | 10 ++++------ arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/svm.h | 10 ++++++---- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 80a4db25e424..caf7a63d65aa 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -3,6 +3,7 @@ #define __SVM_H #include +#include /* * 32-bit intercept words in the VMCB Control Area, starting @@ -12,6 +13,7 @@ enum intercept_words { INTERCEPT_CR = 0, INTERCEPT_DR, + INTERCEPT_EXCEPTION, MAX_INTERCEPT, }; @@ -42,6 +44,8 @@ enum { INTERCEPT_DR5_WRITE, INTERCEPT_DR6_WRITE, INTERCEPT_DR7_WRITE, + /* Byte offset 008h (word 2) */ + INTERCEPT_EXCEPTION_OFFSET = 64, }; enum { @@ -97,7 +101,6 @@ enum { struct __attribute__ ((__packed__)) vmcb_control_area { u32 intercepts[MAX_INTERCEPT]; - u32 intercept_exceptions; u64 intercept; u8 reserved_1[40]; u16 pause_filter_thresh; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 012ab2255b3c..e9e6ad7fdbbe 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -112,7 +112,6 @@ void recalc_intercepts(struct vcpu_svm *svm) for (i = 0; i < MAX_INTERCEPT; i++) c->intercepts[i] = h->intercepts[i]; - c->intercept_exceptions = h->intercept_exceptions; c->intercept = h->intercept; if (g->int_ctl & V_INTR_MASKING_MASK) { @@ -134,7 +133,6 @@ void recalc_intercepts(struct vcpu_svm *svm) for (i = 0; i < MAX_INTERCEPT; i++) c->intercepts[i] |= g->intercepts[i]; - c->intercept_exceptions |= g->intercept_exceptions; c->intercept |= g->intercept; } @@ -146,7 +144,6 @@ static void copy_vmcb_control_area(struct vmcb_control_area *dst, for (i = 0; i < MAX_INTERCEPT; i++) dst->intercepts[i] = from->intercepts[i]; - dst->intercept_exceptions = from->intercept_exceptions; dst->intercept = from->intercept; dst->iopm_base_pa = from->iopm_base_pa; dst->msrpm_base_pa = from->msrpm_base_pa; @@ -491,7 +488,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm) trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, vmcb12->control.intercepts[INTERCEPT_CR] >> 16, - vmcb12->control.intercept_exceptions, + vmcb12->control.intercepts[INTERCEPT_EXCEPTION], vmcb12->control.intercept); /* Clear internal status */ @@ -833,7 +830,7 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm) { unsigned int nr = svm->vcpu.arch.exception.nr; - return (svm->nested.ctl.intercept_exceptions & (1 << nr)); + return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr)); } static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) @@ -982,7 +979,8 @@ int nested_svm_exit_special(struct vcpu_svm *svm) case SVM_EXIT_EXCP_BASE ... 
SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits) + if (get_host_vmcb(svm)->control.intercepts[INTERCEPT_EXCEPTION] & + excp_bits) return NESTED_EXIT_HOST; else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR && svm->vcpu.arch.apf.host_apf_flags) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 07a0804d4fe7..c37d64fb05ff 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2816,7 +2816,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff); pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16); - pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); + pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]); pr_err("%-20s%016llx\n", "intercepts:", control->intercept); pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); pr_err("%-20s%d\n", "pause filter threshold:", diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 2f54829c9ad6..65c054994776 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -289,20 +289,22 @@ static inline void clr_dr_intercepts(struct vcpu_svm *svm) recalc_intercepts(svm); } -static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) +static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_exceptions |= (1U << bit); + WARN_ON_ONCE(bit >= 32); + vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); recalc_intercepts(svm); } -static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) +static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept_exceptions &= ~(1U << bit); + WARN_ON_ONCE(bit >= 32); + vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); recalc_intercepts(svm); } From c62e2e94b9d4a221f489cfeacf665c50aa9ab6cf Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 11 Sep 2020 14:28:28 -0500 Subject: [PATCH 121/497] KVM: SVM: Modify 64 bit intercept field to two 32 bit vectors Convert all the intercepts to one array of 32 bit vectors in vmcb_control_area. This makes it easy for future intercept vector additions. Also update trace functions. 
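As an example of the resulting layout (illustration only): INTERCEPT_INTR opens word 3 at bit 96 and INTERCEPT_VMRUN opens word 4 at bit 128, so a check that used to mask the 64-bit field becomes a generic bit test:

    /* before */
    if (svm->nested.ctl.intercept & (1ULL << INTERCEPT_VMRUN))
            ...
    /* after */
    if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_VMRUN))
            ...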
Signed-off-by: Babu Moger Reviewed-by: Jim Mattson Message-Id: <159985250813.11252.5736581193881040525.stgit@bmoger-ubuntu> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 14 +++++++------- arch/x86/kvm/svm/nested.c | 25 ++++++++++--------------- arch/x86/kvm/svm/svm.c | 17 +++++++---------- arch/x86/kvm/svm/svm.h | 12 ++++++------ arch/x86/kvm/trace.h | 18 +++++++++++------- 5 files changed, 41 insertions(+), 45 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index caf7a63d65aa..c2ae1dfb37a4 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -14,6 +14,8 @@ enum intercept_words { INTERCEPT_CR = 0, INTERCEPT_DR, INTERCEPT_EXCEPTION, + INTERCEPT_WORD3, + INTERCEPT_WORD4, MAX_INTERCEPT, }; @@ -46,10 +48,8 @@ enum { INTERCEPT_DR7_WRITE, /* Byte offset 008h (word 2) */ INTERCEPT_EXCEPTION_OFFSET = 64, -}; - -enum { - INTERCEPT_INTR, + /* Byte offset 00Ch (word 3) */ + INTERCEPT_INTR = 96, INTERCEPT_NMI, INTERCEPT_SMI, INTERCEPT_INIT, @@ -81,7 +81,8 @@ enum { INTERCEPT_TASK_SWITCH, INTERCEPT_FERR_FREEZE, INTERCEPT_SHUTDOWN, - INTERCEPT_VMRUN, + /* Byte offset 010h (word 4) */ + INTERCEPT_VMRUN = 128, INTERCEPT_VMMCALL, INTERCEPT_VMLOAD, INTERCEPT_VMSAVE, @@ -101,8 +102,7 @@ enum { struct __attribute__ ((__packed__)) vmcb_control_area { u32 intercepts[MAX_INTERCEPT]; - u64 intercept; - u8 reserved_1[40]; + u32 reserved_1[15 - MAX_INTERCEPT]; u16 pause_filter_thresh; u16 pause_filter_count; u64 iopm_base_pa; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index e9e6ad7fdbbe..cc0985c0e09e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -112,8 +112,6 @@ void recalc_intercepts(struct vcpu_svm *svm) for (i = 0; i < MAX_INTERCEPT; i++) c->intercepts[i] = h->intercepts[i]; - c->intercept = h->intercept; - if (g->int_ctl & V_INTR_MASKING_MASK) { /* We only want the cr8 intercept bits of L1 */ vmcb_clr_intercept(c, INTERCEPT_CR8_READ); @@ -124,16 +122,14 @@ void recalc_intercepts(struct vcpu_svm *svm) * affect any interrupt we may want to inject; therefore, * interrupt window vmexits are irrelevant to L0. 
*/ - c->intercept &= ~(1ULL << INTERCEPT_VINTR); + vmcb_clr_intercept(c, INTERCEPT_VINTR); } /* We don't want to see VMMCALLs from a nested guest */ - c->intercept &= ~(1ULL << INTERCEPT_VMMCALL); + vmcb_clr_intercept(c, INTERCEPT_VMMCALL); for (i = 0; i < MAX_INTERCEPT; i++) c->intercepts[i] |= g->intercepts[i]; - - c->intercept |= g->intercept; } static void copy_vmcb_control_area(struct vmcb_control_area *dst, @@ -144,7 +140,6 @@ static void copy_vmcb_control_area(struct vmcb_control_area *dst, for (i = 0; i < MAX_INTERCEPT; i++) dst->intercepts[i] = from->intercepts[i]; - dst->intercept = from->intercept; dst->iopm_base_pa = from->iopm_base_pa; dst->msrpm_base_pa = from->msrpm_base_pa; dst->tsc_offset = from->tsc_offset; @@ -177,7 +172,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) */ int i; - if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT))) + if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; for (i = 0; i < MSRPM_OFFSETS; i++) { @@ -203,7 +198,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) static bool nested_vmcb_check_controls(struct vmcb_control_area *control) { - if ((control->intercept & (1ULL << INTERCEPT_VMRUN)) == 0) + if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0) return false; if (control->asid == 0) @@ -489,7 +484,8 @@ int nested_svm_vmrun(struct vcpu_svm *svm) trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, vmcb12->control.intercepts[INTERCEPT_CR] >> 16, vmcb12->control.intercepts[INTERCEPT_EXCEPTION], - vmcb12->control.intercept); + vmcb12->control.intercepts[INTERCEPT_WORD3], + vmcb12->control.intercepts[INTERCEPT_WORD4]); /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); @@ -708,7 +704,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) u32 offset, msr, value; int write, mask; - if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_MSR_PROT))) + if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; @@ -735,7 +731,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) u8 start_bit; u64 gpa; - if (!(svm->nested.ctl.intercept & (1ULL << INTERCEPT_IOIO_PROT))) + if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; @@ -789,8 +785,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; } default: { - u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); - if (svm->nested.ctl.intercept & exit_bits) + if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; } } @@ -898,7 +893,7 @@ static void nested_svm_intr(struct vcpu_svm *svm) static inline bool nested_exit_on_init(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INIT)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); } static void nested_svm_init(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c37d64fb05ff..9c399b393d46 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2217,12 +2217,9 @@ static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, { unsigned long cr0 = svm->vcpu.arch.cr0; bool ret = false; - u64 intercept; - - intercept = svm->nested.ctl.intercept; if (!is_guest_mode(&svm->vcpu) || - (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) + (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) return false; cr0 &= ~SVM_CR0_SELECTIVE_MASK; @@ -2817,7 
+2814,9 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff); pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16); pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]); - pr_err("%-20s%016llx\n", "intercepts:", control->intercept); + pr_err("%-20s%08x %08x\n", "intercepts:", + control->intercepts[INTERCEPT_WORD3], + control->intercepts[INTERCEPT_WORD4]); pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); pr_err("%-20s%d\n", "pause filter threshold:", control->pause_filter_thresh); @@ -3734,7 +3733,6 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, break; case SVM_EXIT_WRITE_CR0: { unsigned long cr0, val; - u64 intercept; if (info->intercept == x86_intercept_cr_write) icpt_info.exit_code += info->modrm_reg; @@ -3743,9 +3741,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, info->intercept == x86_intercept_clts) break; - intercept = svm->nested.ctl.intercept; - - if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) + if (!(vmcb_is_intercept(&svm->nested.ctl, + INTERCEPT_SELECTIVE_CR0))) break; cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; @@ -4013,7 +4010,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) * if an INIT signal is pending. */ return !gif_set(svm) || - (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); + (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT)); } static void svm_vm_destroy(struct kvm *kvm) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 65c054994776..0b8f1b9026e5 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -313,7 +313,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept |= (1ULL << bit); + vmcb_set_intercept(&vmcb->control, bit); recalc_intercepts(svm); } @@ -322,14 +322,14 @@ static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); - vmcb->control.intercept &= ~(1ULL << bit); + vmcb_clr_intercept(&vmcb->control, bit); recalc_intercepts(svm); } static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) { - return (svm->vmcb->control.intercept & (1ULL << bit)) != 0; + return vmcb_is_intercept(&svm->vmcb->control, bit); } static inline bool vgif_enabled(struct vcpu_svm *svm) @@ -393,17 +393,17 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu) static inline bool nested_exit_on_smi(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_SMI)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI); } static inline bool nested_exit_on_intr(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_INTR)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR); } static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) { - return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI)); + return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b66432b015d2..b250c6aa7ae0 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -544,26 +544,30 @@ TRACE_EVENT(kvm_nested_vmrun, ); TRACE_EVENT(kvm_nested_intercepts, - TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept), - TP_ARGS(cr_read, cr_write, exceptions, intercept), + TP_PROTO(__u16 cr_read, __u16 cr_write, 
__u32 exceptions, __u32 intercept1, + __u32 intercept2), + TP_ARGS(cr_read, cr_write, exceptions, intercept1, intercept2), TP_STRUCT__entry( __field( __u16, cr_read ) __field( __u16, cr_write ) __field( __u32, exceptions ) - __field( __u64, intercept ) + __field( __u32, intercept1 ) + __field( __u32, intercept2 ) ), TP_fast_assign( __entry->cr_read = cr_read; __entry->cr_write = cr_write; __entry->exceptions = exceptions; - __entry->intercept = intercept; + __entry->intercept1 = intercept1; + __entry->intercept2 = intercept2; ), - TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx", - __entry->cr_read, __entry->cr_write, __entry->exceptions, - __entry->intercept) + TP_printk("cr_read: %04x cr_write: %04x excp: %08x " + "intercepts: %08x %08x", + __entry->cr_read, __entry->cr_write, __entry->exceptions, + __entry->intercept1, __entry->intercept2) ); /* * Tracepoint for #VMEXIT while nested From 4c44e8d6c19330e48a87103184f840ffe5a423cd Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 11 Sep 2020 14:28:35 -0500 Subject: [PATCH 122/497] KVM: SVM: Add new intercept word in vmcb_control_area MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new intercept bits have been added in vmcb control area to support few more interceptions. Here are the some of them. - INTERCEPT_INVLPGB, - INTERCEPT_INVLPGB_ILLEGAL, - INTERCEPT_INVPCID, - INTERCEPT_MCOMMIT, - INTERCEPT_TLBSYNC, Add a new intercept word in vmcb_control_area to support these instructions. Also update kvm_nested_vmrun trace function to support the new addition. AMD documentation for these instructions is available at "AMD64 Architecture Programmer’s Manual Volume 2: System Programming, Pub. 24593 Rev. 3.34(or later)" The documentation can be obtained at the links below: Link: https://www.amd.com/system/files/TechDocs/24593.pdf Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 Signed-off-by: Babu Moger Reviewed-by: Jim Mattson Message-Id: <159985251547.11252.16994139329949066945.stgit@bmoger-ubuntu> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 7 +++++++ arch/x86/kvm/svm/nested.c | 3 ++- arch/x86/kvm/trace.h | 13 ++++++++----- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index c2ae1dfb37a4..71d630bb5e08 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -16,6 +16,7 @@ enum intercept_words { INTERCEPT_EXCEPTION, INTERCEPT_WORD3, INTERCEPT_WORD4, + INTERCEPT_WORD5, MAX_INTERCEPT, }; @@ -97,6 +98,12 @@ enum { INTERCEPT_MWAIT_COND, INTERCEPT_XSETBV, INTERCEPT_RDPRU, + /* Byte offset 014h (word 5) */ + INTERCEPT_INVLPGB = 160, + INTERCEPT_INVLPGB_ILLEGAL, + INTERCEPT_INVPCID, + INTERCEPT_MCOMMIT, + INTERCEPT_TLBSYNC, }; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index cc0985c0e09e..cf6e74b9461d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -485,7 +485,8 @@ int nested_svm_vmrun(struct vcpu_svm *svm) vmcb12->control.intercepts[INTERCEPT_CR] >> 16, vmcb12->control.intercepts[INTERCEPT_EXCEPTION], vmcb12->control.intercepts[INTERCEPT_WORD3], - vmcb12->control.intercepts[INTERCEPT_WORD4]); + vmcb12->control.intercepts[INTERCEPT_WORD4], + vmcb12->control.intercepts[INTERCEPT_WORD5]); /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b250c6aa7ae0..11cb4c070f0c 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -544,9 +544,10 @@ 
TRACE_EVENT(kvm_nested_vmrun, ); TRACE_EVENT(kvm_nested_intercepts, - TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u32 intercept1, - __u32 intercept2), - TP_ARGS(cr_read, cr_write, exceptions, intercept1, intercept2), + TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, + __u32 intercept1, __u32 intercept2, __u32 intercept3), + TP_ARGS(cr_read, cr_write, exceptions, intercept1, + intercept2, intercept3), TP_STRUCT__entry( __field( __u16, cr_read ) @@ -554,6 +555,7 @@ TRACE_EVENT(kvm_nested_intercepts, __field( __u32, exceptions ) __field( __u32, intercept1 ) __field( __u32, intercept2 ) + __field( __u32, intercept3 ) ), TP_fast_assign( @@ -562,12 +564,13 @@ TRACE_EVENT(kvm_nested_intercepts, __entry->exceptions = exceptions; __entry->intercept1 = intercept1; __entry->intercept2 = intercept2; + __entry->intercept3 = intercept3; ), TP_printk("cr_read: %04x cr_write: %04x excp: %08x " - "intercepts: %08x %08x", + "intercepts: %08x %08x %08x", __entry->cr_read, __entry->cr_write, __entry->exceptions, - __entry->intercept1, __entry->intercept2) + __entry->intercept1, __entry->intercept2, __entry->intercept3) ); /* * Tracepoint for #VMEXIT while nested From 830bd71f2c0684b530970d0aea792a12c0cbcdd2 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 11 Sep 2020 14:28:50 -0500 Subject: [PATCH 123/497] KVM: SVM: Remove set_cr_intercept, clr_cr_intercept and is_cr_intercept Remove set_cr_intercept, clr_cr_intercept and is_cr_intercept. Instead call generic svm_set_intercept, svm_clr_intercept an dsvm_is_intercep tfor all cr intercepts. Signed-off-by: Babu Moger Reviewed-by: Jim Mattson Message-Id: <159985253016.11252.16945893859439811480.stgit@bmoger-ubuntu> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 34 +++++++++++++++++----------------- arch/x86/kvm/svm/svm.h | 25 ------------------------- 2 files changed, 17 insertions(+), 42 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9c399b393d46..633b6249bcb8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1003,14 +1003,14 @@ static void init_vmcb(struct vcpu_svm *svm) svm->vcpu.arch.hflags = 0; - set_cr_intercept(svm, INTERCEPT_CR0_READ); - set_cr_intercept(svm, INTERCEPT_CR3_READ); - set_cr_intercept(svm, INTERCEPT_CR4_READ); - set_cr_intercept(svm, INTERCEPT_CR0_WRITE); - set_cr_intercept(svm, INTERCEPT_CR3_WRITE); - set_cr_intercept(svm, INTERCEPT_CR4_WRITE); + svm_set_intercept(svm, INTERCEPT_CR0_READ); + svm_set_intercept(svm, INTERCEPT_CR3_READ); + svm_set_intercept(svm, INTERCEPT_CR4_READ); + svm_set_intercept(svm, INTERCEPT_CR0_WRITE); + svm_set_intercept(svm, INTERCEPT_CR3_WRITE); + svm_set_intercept(svm, INTERCEPT_CR4_WRITE); if (!kvm_vcpu_apicv_active(&svm->vcpu)) - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm_set_intercept(svm, INTERCEPT_CR8_WRITE); set_dr_intercepts(svm); @@ -1105,8 +1105,8 @@ static void init_vmcb(struct vcpu_svm *svm) control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; svm_clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); - clr_cr_intercept(svm, INTERCEPT_CR3_READ); - clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR3_READ); + svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = svm->vcpu.arch.pat; save->cr3 = 0; save->cr4 = 0; @@ -1548,11 +1548,11 @@ static void update_cr0_intercept(struct vcpu_svm *svm) vmcb_mark_dirty(svm->vmcb, VMCB_CR); if (gcr0 == *hcr0) { - clr_cr_intercept(svm, INTERCEPT_CR0_READ); - clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); + 
svm_clr_intercept(svm, INTERCEPT_CR0_READ); + svm_clr_intercept(svm, INTERCEPT_CR0_WRITE); } else { - set_cr_intercept(svm, INTERCEPT_CR0_READ); - set_cr_intercept(svm, INTERCEPT_CR0_WRITE); + svm_set_intercept(svm, INTERCEPT_CR0_READ); + svm_set_intercept(svm, INTERCEPT_CR0_WRITE); } } @@ -2931,7 +2931,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) + if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; @@ -3054,13 +3054,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) if (nested_svm_virtualize_tpr(vcpu)) return; - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); if (irr == -1) return; if (tpr >= irr) - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); + svm_set_intercept(svm, INTERCEPT_CR8_WRITE); } bool svm_nmi_blocked(struct kvm_vcpu *vcpu) @@ -3248,7 +3248,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) if (nested_svm_virtualize_tpr(vcpu)) return; - if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { + if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) { int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; kvm_set_cr8(vcpu, cr8); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 0b8f1b9026e5..4cd360e8a77a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -231,31 +231,6 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) return test_bit(bit, (unsigned long *)&control->intercepts); } -static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb_set_intercept(&vmcb->control, bit); - - recalc_intercepts(svm); -} - -static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb_clr_intercept(&vmcb->control, bit); - - recalc_intercepts(svm); -} - -static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - return vmcb_is_intercept(&vmcb->control, bit); -} - static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = get_host_vmcb(svm); From 3f3393b3ce383600fc7255f5ec04385209b6f404 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 11 Sep 2020 14:29:05 -0500 Subject: [PATCH 124/497] KVM: X86: Rename and move the function vmx_handle_memory_failure to x86.c Handling of kvm_read/write_guest_virt*() errors can be moved to common code. The same code can be used by both VMX and SVM. 
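The call pattern, as seen in the VMX users converted below, is simply (sketch):

    r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
    if (r != X86EMUL_CONTINUE)
            return kvm_handle_memory_failure(vcpu, r, &e);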
Signed-off-by: Babu Moger Reviewed-by: Jim Mattson Message-Id: <159985254493.11252.6603092560732507607.stgit@bmoger-ubuntu> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 12 ++++++------ arch/x86/kvm/vmx/vmx.c | 29 +---------------------------- arch/x86/kvm/vmx/vmx.h | 2 -- arch/x86/kvm/x86.c | 28 ++++++++++++++++++++++++++++ arch/x86/kvm/x86.h | 2 ++ 5 files changed, 37 insertions(+), 36 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 07dc5663a48e..b6ce9ce91029 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4696,7 +4696,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); if (r != X86EMUL_CONTINUE) { - *ret = vmx_handle_memory_failure(vcpu, r, &e); + *ret = kvm_handle_memory_failure(vcpu, r, &e); return -EINVAL; } @@ -5003,7 +5003,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* _system ok, nested_vmx_check_permission has verified cpl=0 */ r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); } return nested_vmx_succeed(vcpu); @@ -5076,7 +5076,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); } field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); @@ -5238,7 +5238,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, sizeof(gpa_t), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); return nested_vmx_succeed(vcpu); } @@ -5291,7 +5291,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); /* * Nested EPT roots are always held through guest_mmu, @@ -5373,7 +5373,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); if (operand.vpid >> 16) return nested_vmx_fail(vcpu, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f4cfc18366de..76d657d5d5f7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1598,33 +1598,6 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) return 1; } -/* - * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns - * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value - * indicates whether exit to userspace is needed. - */ -int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r, - struct x86_exception *e) -{ - if (r == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_emulated_page_fault(vcpu, e); - return 1; - } - - /* - * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED - * while handling a VMX instruction KVM could've handled the request - * correctly by exiting to userspace and performing I/O but there - * doesn't seem to be a real use-case behind such requests, just return - * KVM_EXIT_INTERNAL_ERROR for now. 
- */ - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - - return 0; -} - /* * Recognizes a pending MTF VM-exit and records the nested state for later * delivery. @@ -5558,7 +5531,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) - return vmx_handle_memory_failure(vcpu, r, &e); + return kvm_handle_memory_failure(vcpu, r, &e); if (operand.pcid >> 12 != 0) { kvm_inject_gp(vcpu, 0); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a2f82127c170..d7ec66db5eb8 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -354,8 +354,6 @@ struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); int vmx_find_msr_index(struct vmx_msrs *m, u32 msr); -int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r, - struct x86_exception *e); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); #define POSTED_INTR_ON 0 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 836baad47fe9..e560687ecf34 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10765,6 +10765,34 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c } EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error); +/* + * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns + * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value + * indicates whether exit to userspace is needed. + */ +int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, + struct x86_exception *e) +{ + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, e); + return 1; + } + + /* + * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED + * while handling a VMX instruction KVM could've handled the request + * correctly by exiting to userspace and performing I/O but there + * doesn't seem to be a real use-case behind such requests, just return + * KVM_EXIT_INTERNAL_ERROR for now. + */ + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_handle_memory_failure); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index ea20b8bfd5dc..c05a953ad29a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -371,6 +371,8 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); +int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, + struct x86_exception *e); #define KVM_MSR_RET_INVALID 2 From 9715092f8d7eaab2e06a86b67bffc61c20e76f17 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 11 Sep 2020 14:29:12 -0500 Subject: [PATCH 125/497] KVM: X86: Move handling of INVPCID types to x86 INVPCID instruction handling is mostly same across both VMX and SVM. So, move the code to common x86.c. 
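With the logic in common code, a vendor exit handler only has to extract the INVPCID type and the linear address of the memory operand and defer to the helper, roughly (sketch; the SVM side is added by a later patch in this series):

    /* type comes from the register operand, gva from the memory operand */
    if (type > 3) {
            kvm_inject_gp(vcpu, 0);
            return 1;
    }
    return kvm_handle_invpcid(vcpu, type, gva);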
Signed-off-by: Babu Moger Reviewed-by: Jim Mattson Message-Id: <159985255212.11252.10322694343971983487.stgit@bmoger-ubuntu> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 68 +----------------------------------- arch/x86/kvm/x86.c | 78 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.h | 1 + 3 files changed, 80 insertions(+), 67 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 76d657d5d5f7..1d20705193c2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5497,16 +5497,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) { u32 vmx_instruction_info; unsigned long type; - bool pcid_enabled; gva_t gva; - struct x86_exception e; - unsigned i; - unsigned long roots_to_free = 0; struct { u64 pcid; u64 gla; } operand; - int r; if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5529,68 +5524,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) sizeof(operand), &gva)) return 1; - r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); - if (r != X86EMUL_CONTINUE) - return kvm_handle_memory_failure(vcpu, r, &e); - - if (operand.pcid >> 12 != 0) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - - switch (type) { - case INVPCID_TYPE_INDIV_ADDR: - if ((!pcid_enabled && (operand.pcid != 0)) || - is_noncanonical_address(operand.gla, vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); - return kvm_skip_emulated_instruction(vcpu); - - case INVPCID_TYPE_SINGLE_CTXT: - if (!pcid_enabled && (operand.pcid != 0)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - if (kvm_get_active_pcid(vcpu) == operand.pcid) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } - - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) - == operand.pcid) - roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); - - kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); - /* - * If neither the current cr3 nor any of the prev_roots use the - * given PCID, then nothing needs to be done here because a - * resync will happen anyway before switching to any other CR3. - */ - - return kvm_skip_emulated_instruction(vcpu); - - case INVPCID_TYPE_ALL_NON_GLOBAL: - /* - * Currently, KVM doesn't mark global entries in the shadow - * page tables, so a non-global flush just degenerates to a - * global flush. If needed, we could optimize this later by - * keeping track of global entries in shadow page tables. 
- */ - - fallthrough; - case INVPCID_TYPE_ALL_INCL_GLOBAL: - kvm_mmu_unload(vcpu); - return kvm_skip_emulated_instruction(vcpu); - - default: - BUG(); /* We have already checked above that type <= 3 */ - } + return kvm_handle_invpcid(vcpu, type, gva); } static int handle_pml_full(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e560687ecf34..d8b40dd90cc1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -10793,6 +10794,83 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, } EXPORT_SYMBOL_GPL(kvm_handle_memory_failure); +int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) +{ + bool pcid_enabled; + struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; + struct { + u64 pcid; + u64 gla; + } operand; + int r; + + r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); + if (r != X86EMUL_CONTINUE) + return kvm_handle_memory_failure(vcpu, r, &e); + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); + /* + * If neither the current cr3 nor any of the prev_roots use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. + */ + + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. 
+ */ + + fallthrough; + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_mmu_unload(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + BUG(); /* We have already checked above that type <= 3 */ + } +} +EXPORT_SYMBOL_GPL(kvm_handle_invpcid); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c05a953ad29a..941f288c38aa 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -373,6 +373,7 @@ int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); +int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); #define KVM_MSR_RET_INVALID 2 From 4407a797e9412afba2f7815fb19395d4b32dca4e Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 11 Sep 2020 14:29:19 -0500 Subject: [PATCH 126/497] KVM: SVM: Enable INVPCID feature on AMD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following intercept bit has been added to support VMEXIT for INVPCID instruction: Code Name Cause A2h VMEXIT_INVPCID INVPCID instruction The following bit has been added to the VMCB layout control area to control intercept of INVPCID: Byte Offset Bit(s) Function 14h 2 intercept INVPCID Enable the interceptions when the the guest is running with shadow page table enabled and handle the tlbflush based on the invpcid instruction type. For the guests with nested page table (NPT) support, the INVPCID feature works as running it natively. KVM does not need to do any special handling in this case. AMD documentation for INVPCID feature is available at "AMD64 Architecture Programmer’s Manual Volume 2: System Programming, Pub. 24593 Rev. 
3.34(or later)" The documentation can be obtained at the links below: Link: https://www.amd.com/system/files/TechDocs/24593.pdf Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 Signed-off-by: Babu Moger Reviewed-by: Jim Mattson Message-Id: <159985255929.11252.17346684135277453258.stgit@bmoger-ubuntu> Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/svm.h | 2 ++ arch/x86/kvm/svm/svm.c | 51 +++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 2e8a30f06c74..522d42dfc28c 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -76,6 +76,7 @@ #define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_XSETBV 0x08d #define SVM_EXIT_RDPRU 0x08e +#define SVM_EXIT_INVPCID 0x0a2 #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 @@ -171,6 +172,7 @@ { SVM_EXIT_MONITOR, "monitor" }, \ { SVM_EXIT_MWAIT, "mwait" }, \ { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_INVPCID, "invpcid" }, \ { SVM_EXIT_NPF, "npf" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 633b6249bcb8..259a724c69c7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -824,6 +824,9 @@ static __init void svm_set_cpu_caps(void) if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + + /* Enable INVPCID feature */ + kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); } static __init int svm_hardware_setup(void) @@ -996,6 +999,21 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) return svm->vmcb->control.tsc_offset; } +static void svm_check_invpcid(struct vcpu_svm *svm) +{ + /* + * Intercept INVPCID instruction only if shadow page table is + * enabled. Interception is not required with nested page table + * enabled. + */ + if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { + if (!npt_enabled) + svm_set_intercept(svm, INTERCEPT_INVPCID); + else + svm_clr_intercept(svm, INTERCEPT_INVPCID); + } +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -1125,6 +1143,8 @@ static void init_vmcb(struct vcpu_svm *svm) svm_clr_intercept(svm, INTERCEPT_PAUSE); } + svm_check_invpcid(svm); + if (kvm_vcpu_apicv_active(&svm->vcpu)) avic_init_vmcb(svm); @@ -2729,6 +2749,33 @@ static int mwait_interception(struct vcpu_svm *svm) return nop_interception(svm); } +static int invpcid_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + unsigned long type; + gva_t gva; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + /* + * For an INVPCID intercept: + * EXITINFO1 provides the linear address of the memory operand. + * EXITINFO2 provides the contents of the register operand. 
+ */ + type = svm->vmcb->control.exit_info_2; + gva = svm->vmcb->control.exit_info_1; + + if (type > 3) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + return kvm_handle_invpcid(vcpu, type, gva); +} + static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -2791,6 +2838,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_RDPRU] = rdpru_interception, + [SVM_EXIT_INVPCID] = invpcid_interception, [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, @@ -3619,6 +3667,9 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); + /* Check again if INVPCID interception if required */ + svm_check_invpcid(svm); + if (!kvm_vcpu_apicv_active(vcpu)) return; From 871c433bae565306cfc3a1a386bf7328bc9d31cb Mon Sep 17 00:00:00 2001 From: Rustam Kovhaev Date: Fri, 18 Sep 2020 05:05:00 -0700 Subject: [PATCH 127/497] KVM: use struct_size() and flex_array_size() helpers in kvm_io_bus_unregister_dev() Make use of the struct_size() helper to avoid any potential type mistakes and protect against potential integer overflows Make use of the flex_array_size() helper to calculate the size of a flexible array member within an enclosing structure Suggested-by: Gustavo A. R. Silva Signed-off-by: Rustam Kovhaev Message-Id: <20200918120500.954436-1-rkovhaev@gmail.com> Reviewed-by: Gustavo A. R. Silva Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cf88233b819a..68edd25dcb11 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4350,10 +4350,10 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), GFP_KERNEL_ACCOUNT); if (new_bus) { - memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); + memcpy(new_bus, bus, struct_size(bus, range, i)); new_bus->dev_count--; memcpy(new_bus->range + i, bus->range + i + 1, - (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); + flex_array_size(new_bus, range, new_bus->dev_count - i)); } else { pr_err("kvm: failed to shrink bus, removing it completely\n"); for (j = 0; j < bus->dev_count; j++) { From cc5b54dd58d0420c1b1f1e9b29f53327076e9355 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 21 Sep 2020 13:38:05 +0300 Subject: [PATCH 128/497] KVM: x86: fix MSR_IA32_TSC read for nested migration MSR reads/writes should always access the L1 state, since the (nested) hypervisor should intercept all the msrs it wants to adjust, and these that it doesn't should be read by the guest as if the host had read it. However IA32_TSC is an exception. Even when not intercepted, guest still reads the value + TSC offset. The write however does not take any TSC offset into account. This is documented in Intel's SDM and seems also to happen on AMD as well. This creates a problem when userspace wants to read the IA32_TSC value and then write it. (e.g for migration) In this case it reads L2 value but write is interpreted as an L1 value. To fix this make the userspace initiated reads of IA32_TSC return L1 value as well. 
Huge thanks to Dave Gilbert for helping me understand this very confusing semantic of MSR writes. Signed-off-by: Maxim Levitsky Message-Id: <20200921103805.9102-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d8b40dd90cc1..cc6992fd4637 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3216,9 +3216,21 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_POWER_CTL: msr_info->data = vcpu->arch.msr_ia32_power_ctl; break; - case MSR_IA32_TSC: - msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset; + case MSR_IA32_TSC: { + /* + * Intel SDM states that MSR_IA32_TSC read adds the TSC offset + * even when not intercepted. AMD manual doesn't explicitly + * state this but appears to behave the same. + * + * However when userspace wants to read this MSR, we should + * return it's real L1 value so that its restore will be correct. + */ + u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset : + vcpu->arch.tsc_offset; + + msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset; break; + } case MSR_MTRRcap: case 0x200 ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); From bddd82d19e2e580b11e62f7a1d86ee52d8e437b6 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Mon, 21 Sep 2020 08:10:25 +0000 Subject: [PATCH 129/497] KVM: nVMX: KVM needs to unset "unrestricted guest" VM-execution control in vmcs02 if vmcs12 doesn't set it Currently, prepare_vmcs02_early() does not check if the "unrestricted guest" VM-execution control in vmcs12 is turned off and leaves the corresponding bit on in vmcs02. Due to this setting, vmentry checks which are supposed to render the nested guest state as invalid when this VM-execution control is not set, are passing in hardware. This patch turns off the "unrestricted guest" VM-execution control in vmcs02 if vmcs12 has turned it off. 
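The helper added to vmx.h below lets the CR0/CR4/segment paths ask whether the current context (L1, or L2 with the control enabled in vmcs02) really is unrestricted, e.g. (sketch taken from the vmx_set_cr0 hunk):

    if (is_unrestricted_guest(vcpu))
            hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;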
Suggested-by: Jim Mattson Suggested-by: Sean Christopherson Signed-off-by: Krish Sadhukhan Message-Id: <20200921081027.23047-2-krish.sadhukhan@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 +++ arch/x86/kvm/vmx/vmx.c | 17 +++++++++-------- arch/x86/kvm/vmx/vmx.h | 7 +++++++ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b6ce9ce91029..9861179d9a8c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2314,6 +2314,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + secondary_exec_controls_set(vmx, exec_control); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1d20705193c2..b73901699ecc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1441,7 +1441,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long old_rflags; - if (enable_unrestricted_guest) { + if (is_unrestricted_guest(vcpu)) { kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); vmx->rflags = rflags; vmcs_writel(GUEST_RFLAGS, rflags); @@ -2240,7 +2240,8 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: - if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) + if (is_unrestricted_guest(vcpu) || + (enable_ept && is_paging(vcpu))) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4: @@ -3006,7 +3007,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) unsigned long hw_cr0; hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); - if (enable_unrestricted_guest) + if (is_unrestricted_guest(vcpu)) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; @@ -3027,7 +3028,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif - if (enable_ept && !enable_unrestricted_guest) + if (enable_ept && !is_unrestricted_guest(vcpu)) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); @@ -3107,7 +3108,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long hw_cr4; hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); - if (enable_unrestricted_guest) + if (is_unrestricted_guest(vcpu)) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; else if (vmx->rmode.vm86_active) hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; @@ -3142,7 +3143,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vcpu->arch.cr4 = cr4; kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); - if (!enable_unrestricted_guest) { + if (!is_unrestricted_guest(vcpu)) { if (enable_ept) { if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; @@ -3282,7 +3283,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) * tree. Newer qemu binaries with that qemu fix would not need this * kvm hack. 
*/ - if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) + if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) var->type |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); @@ -3473,7 +3474,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) */ static bool guest_state_valid(struct kvm_vcpu *vcpu) { - if (enable_unrestricted_guest) + if (is_unrestricted_guest(vcpu)) return true; /* real mode guest state checks */ diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index d7ec66db5eb8..941336dc5ee4 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -553,6 +553,13 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; } +static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) +{ + return enable_unrestricted_guest && (!is_guest_mode(vcpu) || + (secondary_exec_controls_get(to_vmx(vcpu)) & + SECONDARY_EXEC_UNRESTRICTED_GUEST)); +} + void dump_vmcs(void); #endif /* __KVM_X86_VMX_H */ From ae5a2a39e46c1e21d06d275daeef9eb0b46864fe Mon Sep 17 00:00:00 2001 From: Haiwei Li Date: Wed, 16 Sep 2020 16:36:21 +0800 Subject: [PATCH 130/497] KVM: SVM: use __GFP_ZERO instead of clear_page() Use __GFP_ZERO while alloc_page(). Signed-off-by: Haiwei Li Message-Id: <20200916083621.5512-1-lihaiwei.kernel@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index ac830cd50830..f73f84d56452 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -153,20 +153,18 @@ int avic_vm_init(struct kvm *kvm) return 0; /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL_ACCOUNT); + p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!p_page) goto free_avic; kvm_svm->avic_physical_id_table_page = p_page; - clear_page(page_address(p_page)); /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL_ACCOUNT); + l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!l_page) goto free_avic; kvm_svm->avic_logical_id_table_page = l_page; - clear_page(page_address(l_page)); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); again: From 09e3e2a1cc8d8069085785f1236a64c72707e7f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Sep 2020 16:27:02 -0700 Subject: [PATCH 131/497] KVM: x86: Add kvm_x86_ops hook to short circuit emulation Replace the existing kvm_x86_ops.need_emulation_on_page_fault() with a more generic is_emulatable(), and unconditionally call the new function in x86_emulate_instruction(). KVM will use the generic hook to support multiple security related technologies that prevent emulation in one way or another. Similar to the existing AMD #NPF case where emulation of the current instruction is not possible due to lack of information, AMD's SEV-ES and Intel's SGX and TDX will introduce scenarios where emulation is impossible due to the guest's register state being inaccessible. And again similar to the existing #NPF case, emulation can be initiated by kvm_mmu_page_fault(), i.e. outside of the control of vendor-specific code. While the cause and architecturally visible behavior of the various cases are different, e.g. SGX will inject a #UD, AMD #NPF is a clean resume or complete shutdown, and SEV-ES and TDX "return" an error, the impact on the common emulation code is identical: KVM must stop emulation immediately and resume the guest. 
Query is_emulatable() in handle_ud() as well so that the force_emulation_prefix code doesn't incorrectly modify RIP before calling emulate_instruction() in the absurdly unlikely scenario that KVM encounters forced emulation in conjunction with "do not emulate". Cc: Tom Lendacky Signed-off-by: Sean Christopherson Message-Id: <20200915232702.15945-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/mmu.c | 12 ------------ arch/x86/kvm/svm/svm.c | 31 ++++++++++++++++++------------- arch/x86/kvm/vmx/vmx.c | 12 ++++++------ arch/x86/kvm/x86.c | 14 +++++++++++--- 5 files changed, 36 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5303dbc5c9bc..a4a68b2b38d5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1221,7 +1221,7 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); + bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 71aa3da2a0b7..5ae9f972609f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5485,18 +5485,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) emulation_type |= EMULTYPE_ALLOW_RETRY_PF; emulate: - /* - * On AMD platforms, under certain conditions insn_len may be zero on #NPF. - * This can happen if a guest gets a page-fault on data access but the HW - * table walker is not able to read the instruction page (e.g instruction - * page is not present in memory). In those cases we simply restart the - * guest, with the exception of AMD Erratum 1096 which is unrecoverable. - */ - if (unlikely(insn && !insn_len)) { - if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu)) - return 1; - } - return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 259a724c69c7..0f16113ee90e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3984,19 +3984,10 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) } } -static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) { - unsigned long cr4 = kvm_read_cr4(vcpu); - bool smep = cr4 & X86_CR4_SMEP; - bool smap = cr4 & X86_CR4_SMAP; - bool is_user = svm_get_cpl(vcpu) == 3; - - /* - * If RIP is invalid, go ahead with emulation which will cause an - * internal error exit. - */ - if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) - return true; + bool smep, smap, is_user; + unsigned long cr4; /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. @@ -4038,6 +4029,20 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) * instruction pointer so we will not able to workaround it. Lets * print the error and request to kill the guest. */ + if (likely(!insn || insn_len)) + return true; + + /* + * If RIP is invalid, go ahead with emulation which will cause an + * internal error exit. 
+ */ + if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) + return true; + + cr4 = kvm_read_cr4(vcpu); + smep = cr4 & X86_CR4_SMEP; + smap = cr4 & X86_CR4_SMAP; + is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { if (!sev_guest(vcpu->kvm)) return true; @@ -4199,7 +4204,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, - .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, + .can_emulate_instruction = svm_can_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, }; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b73901699ecc..f002d3415840 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1561,6 +1561,11 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } +static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +{ + return true; +} + static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip, orig_rip; @@ -7749,11 +7754,6 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) /* RSM will cause a vmexit anyway. */ } -static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) -{ - return false; -} - static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.vmxon; @@ -7908,7 +7908,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pre_leave_smm = vmx_pre_leave_smm, .enable_smi_window = enable_smi_window, - .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, + .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, .migrate_timers = vmx_migrate_timers, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cc6992fd4637..5f3e9ab34c80 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3222,8 +3222,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * even when not intercepted. AMD manual doesn't explicitly * state this but appears to behave the same. * - * However when userspace wants to read this MSR, we should - * return it's real L1 value so that its restore will be correct. + * Unconditionally return L1's TSC offset on userspace reads + * so that userspace reads and writes always operate on L1's + * offset, e.g. to ensure deterministic behavior for migration. */ u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset : vcpu->arch.tsc_offset; @@ -5714,6 +5715,9 @@ int handle_ud(struct kvm_vcpu *vcpu) char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; + if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0))) + return 1; + if (force_emulation_prefix && kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && @@ -6919,7 +6923,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int r; struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; - bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + bool write_fault_to_spt; + + if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len))) + return 1; vcpu->arch.l1tf_flush_l1d = true; @@ -6927,6 +6934,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * Clear write_fault_to_shadow_pgtable here to ensure it is * never reused. 
*/ + write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; vcpu->arch.write_fault_to_shadow_pgtable = false; kvm_clear_exception_queue(vcpu); From 535f7ef2ab7dbfbcfe732ee7634faca21417a777 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Sep 2020 12:15:04 -0700 Subject: [PATCH 132/497] KVM: VMX: Move IRQ invocation to assembly subroutine Move the asm blob that invokes the appropriate IRQ handler after VM-Exit into a proper subroutine. Unconditionally create a stack frame in the subroutine so that, as objtool sees things, the function has standard stack behavior. The dynamic stack adjustment makes using unwind hints problematic. Suggested-by: Josh Poimboeuf Cc: Uros Bizjak Signed-off-by: Sean Christopherson Message-Id: <20200915191505.10355-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 34 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 33 +++------------------------------ 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 799db084a336..90ad7a6246e3 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -4,6 +4,7 @@ #include #include #include +#include #define WORD_SIZE (BITS_PER_LONG / 8) @@ -294,3 +295,36 @@ SYM_FUNC_START(vmread_error_trampoline) ret SYM_FUNC_END(vmread_error_trampoline) + +SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) + /* + * Unconditionally create a stack frame, getting the correct RSP on the + * stack (for x86-64) would take two instructions anyways, and RBP can + * be used to restore RSP to make objtool happy (see below). + */ + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + +#ifdef CONFIG_X86_64 + /* + * Align RSP to a 16-byte boundary (to emulate CPU behavior) before + * creating the synthetic interrupt stack frame for the IRQ/NMI. + */ + and $-16, %rsp + push $__KERNEL_DS + push %rbp +#endif + pushf + push $__KERNEL_CS + CALL_NOSPEC _ASM_ARG1 + + /* + * "Restore" RSP from RBP, even though IRET has already unwound RSP to + * the correct value. objtool doesn't know the callee will IRET and, + * without the explicit restore, thinks the stack is getting walloped. + * Using an unwind hint is problematic due to x86-64's dynamic alignment. 
+ */ + mov %_ASM_BP, %_ASM_SP + pop %_ASM_BP + ret +SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f002d3415840..6f425a658865 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6323,6 +6323,8 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); } +void vmx_do_interrupt_nmi_irqoff(unsigned long entry); + static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { u32 intr_info = vmx_get_intr_info(&vmx->vcpu); @@ -6344,10 +6346,6 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { unsigned int vector; - unsigned long entry; -#ifdef CONFIG_X86_64 - unsigned long tmp; -#endif gate_desc *desc; u32 intr_info = vmx_get_intr_info(vcpu); @@ -6357,36 +6355,11 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) vector = intr_info & INTR_INFO_VECTOR_MASK; desc = (gate_desc *)host_idt_base + vector; - entry = gate_offset(desc); kvm_before_interrupt(vcpu); - - asm volatile( -#ifdef CONFIG_X86_64 - "mov %%rsp, %[sp]\n\t" - "and $-16, %%rsp\n\t" - "push %[ss]\n\t" - "push %[sp]\n\t" -#endif - "pushf\n\t" - "push %[cs]\n\t" - CALL_NOSPEC - : -#ifdef CONFIG_X86_64 - [sp]"=&r"(tmp), -#endif - ASM_CALL_CONSTRAINT - : - [thunk_target]"r"(entry), -#ifdef CONFIG_X86_64 - [ss]"i"(__KERNEL_DS), -#endif - [cs]"i"(__KERNEL_CS) - ); - + vmx_do_interrupt_nmi_irqoff(gate_offset(desc)); kvm_after_interrupt(vcpu); } -STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { From 1a5488ef0dcf6af5b2a69144ad6732dd67e98146 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Sep 2020 12:15:05 -0700 Subject: [PATCH 133/497] KVM: VMX: Invoke NMI handler via indirect call instead of INTn Rework NMI VM-Exit handling to invoke the kernel handler by function call instead of INTn. INTn microcode is relatively expensive, and aligning the IRQ and NMI handling will make it easier to update KVM should some newfangled method for invoking the handlers come along. 
Suggested-by: Andi Kleen Signed-off-by: Sean Christopherson Message-Id: <20200915191505.10355-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6f425a658865..1eeafec6c791 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6325,40 +6325,40 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) void vmx_do_interrupt_nmi_irqoff(unsigned long entry); +static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) +{ + unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; + gate_desc *desc = (gate_desc *)host_idt_base + vector; + + kvm_before_interrupt(vcpu); + vmx_do_interrupt_nmi_irqoff(gate_offset(desc)); + kvm_after_interrupt(vcpu); +} + static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ - if (is_page_fault(intr_info)) { + if (is_page_fault(intr_info)) vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* Handle machine checks before interrupts are enabled */ - } else if (is_machine_check(intr_info)) { + else if (is_machine_check(intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - } else if (is_nmi(intr_info)) { - kvm_before_interrupt(&vmx->vcpu); - asm("int $2"); - kvm_after_interrupt(&vmx->vcpu); - } + else if (is_nmi(intr_info)) + handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info); } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { - unsigned int vector; - gate_desc *desc; u32 intr_info = vmx_get_intr_info(vcpu); if (WARN_ONCE(!is_external_intr(intr_info), "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; - vector = intr_info & INTR_INFO_VECTOR_MASK; - desc = (gate_desc *)host_idt_base + vector; - - kvm_before_interrupt(vcpu); - vmx_do_interrupt_nmi_irqoff(gate_offset(desc)); - kvm_after_interrupt(vcpu); + handle_interrupt_nmi_irqoff(vcpu, intr_info); } static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) From 4e810adb5362df366aee10ac05041fc7ba0eb2fe Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 14 Sep 2020 14:55:48 +0800 Subject: [PATCH 134/497] KVM: SVM: Analyze is_guest_mode() in svm_vcpu_run() Analyze is_guest_mode() in svm_vcpu_run() instead of svm_exit_handlers_fastpath() in conformity with VMX version. 
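Condensed from the diff below (not the complete function), the check now sits at the call site, matching how the VMX run loop is structured:

    static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
    {
            /* ... guest entry/exit and interrupt completion ... */
            if (is_guest_mode(vcpu))
                    return EXIT_FASTPATH_NONE;

            return svm_exit_handlers_fastpath(vcpu);
    }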
Suggested-by: Vitaly Kuznetsov Signed-off-by: Wanpeng Li Message-Id: <1600066548-4343-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0f16113ee90e..a788ce16e4fa 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3393,8 +3393,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && to_svm(vcpu)->vmcb->control.exit_info_1) return handle_fastpath_set_msr_irqoff(vcpu); @@ -3459,7 +3458,6 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { - fastpath_t exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; @@ -3580,8 +3578,11 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); svm_complete_interrupts(svm); - exit_fastpath = svm_exit_handlers_fastpath(vcpu); - return exit_fastpath; + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + return svm_exit_handlers_fastpath(vcpu); } static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root, From 95b28ac9db2ac27ac7e92edf2f5bbd6f36138933 Mon Sep 17 00:00:00 2001 From: Haiwei Li Date: Fri, 4 Sep 2020 19:25:29 +0800 Subject: [PATCH 135/497] KVM: SVM: Add tracepoint for cr_interception Add trace_kvm_cr_write and trace_kvm_cr_read for svm. Signed-off-by: Haiwei Li Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a788ce16e4fa..08adac8900e0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2277,6 +2277,7 @@ static int cr_interception(struct vcpu_svm *svm) if (cr >= 16) { /* mov to cr */ cr -= 16; val = kvm_register_read(&svm->vcpu, reg); + trace_kvm_cr_write(cr, val); switch (cr) { case 0: if (!check_selective_cr0_intercepted(svm, val)) @@ -2322,6 +2323,7 @@ static int cr_interception(struct vcpu_svm *svm) return 1; } kvm_register_write(&svm->vcpu, reg, val); + trace_kvm_cr_read(cr, val); } return kvm_complete_insn_gp(&svm->vcpu, err); } From 0b7aa58356912989a48dc24ba889f78680756831 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Tue, 23 Jun 2020 21:14:18 +0800 Subject: [PATCH 136/497] KVM: MIPS: clean up redundant kvm_run parameters in assembly In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu' structure. For historical reasons, many kvm-related function parameters retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This patch does a unified cleanup of these remaining redundant parameters. 
Signed-off-by: Tianjia Zhang Reviewed-by: Huacai Chen Tested-by: Jiaxun Yang Message-Id: <20200623131418.31473-6-tianjia.zhang@linux.alibaba.com> Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 4 ++-- arch/mips/kvm/entry.c | 21 ++++++++------------- arch/mips/kvm/mips.c | 3 ++- arch/mips/kvm/trap_emul.c | 2 +- arch/mips/kvm/vz.c | 2 +- 5 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 825d337a505a..24f3d0f9996b 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -341,7 +341,7 @@ struct kvm_mips_tlb { #define KVM_MIPS_GUEST_TLB_SIZE 64 struct kvm_vcpu_arch { void *guest_ebase; - int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); + int (*vcpu_run)(struct kvm_vcpu *vcpu); /* Host registers preserved across guest mode execution */ unsigned long host_stack; @@ -852,7 +852,7 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); -extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu); +extern int kvm_mips_handle_exit(struct kvm_vcpu *vcpu); /* Building of entry/exception code */ int kvm_mips_entry_setup(void); diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index fd716942e302..832475bf2055 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -205,7 +205,7 @@ static inline void build_set_exc_base(u32 **p, unsigned int reg) * Assemble the start of the vcpu_run function to run a guest VCPU. The function * conforms to the following prototype: * - * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu); + * int vcpu_run(struct kvm_vcpu *vcpu); * * The exit from the guest and return to the caller is handled by the code * generated by kvm_mips_build_ret_to_host(). 
@@ -218,8 +218,7 @@ void *kvm_mips_build_vcpu_run(void *addr) unsigned int i; /* - * A0: run - * A1: vcpu + * A0: vcpu */ /* k0/k1 not being used in host kernel context */ @@ -238,10 +237,10 @@ void *kvm_mips_build_vcpu_run(void *addr) kvm_mips_build_save_scratch(&p, V1, K1); /* VCPU scratch register has pointer to vcpu */ - UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_MTC0(&p, A0, scratch_vcpu[0], scratch_vcpu[1]); /* Offset into vcpu->arch */ - UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch)); + UASM_i_ADDIU(&p, K1, A0, offsetof(struct kvm_vcpu, arch)); /* * Save the host stack to VCPU, used for exception processing @@ -645,10 +644,7 @@ void *kvm_mips_build_exit(void *addr) /* Now that context has been saved, we can use other registers */ /* Restore vcpu */ - UASM_i_MFC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); - - /* Restore run (vcpu->run) */ - UASM_i_LW(&p, S0, offsetof(struct kvm_vcpu, run), S1); + UASM_i_MFC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]); /* * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process @@ -810,7 +806,6 @@ void *kvm_mips_build_exit(void *addr) * with this in the kernel */ uasm_i_move(&p, A0, S0); - uasm_i_move(&p, A1, S1); UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit); uasm_i_jalr(&p, RA, T9); UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ); @@ -852,7 +847,7 @@ static void *kvm_mips_build_ret_from_exit(void *addr) * guest, reload k1 */ - uasm_i_move(&p, K1, S1); + uasm_i_move(&p, K1, S0); UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); /* @@ -886,8 +881,8 @@ static void *kvm_mips_build_ret_to_guest(void *addr) { u32 *p = addr; - /* Put the saved pointer to vcpu (s1) back into the scratch register */ - UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); + /* Put the saved pointer to vcpu (s0) back into the scratch register */ + UASM_i_MTC0(&p, S0, scratch_vcpu[0], scratch_vcpu[1]); /* Load up the Guest EBASE to minimize the window where BEV is set */ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 0c50ac444222..3d6a7f5827b1 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1199,8 +1199,9 @@ static void kvm_mips_set_c0_status(void) /* * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV) */ -int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) +int kvm_mips_handle_exit(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; u32 cause = vcpu->arch.host_cp0_cause; u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; u32 __user *opc = (u32 __user *) vcpu->arch.pc; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index f8cba51e1054..0788c00d7e94 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -1241,7 +1241,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_vcpu *vcpu) */ kvm_mips_suspend_mm(cpu); - r = vcpu->arch.vcpu_run(vcpu->run, vcpu); + r = vcpu->arch.vcpu_run(vcpu); /* We may have migrated while handling guest exits */ cpu = smp_processor_id(); diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index c299e5d6d69c..2ffbe9264a31 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -3266,7 +3266,7 @@ static int kvm_vz_vcpu_run(struct kvm_vcpu *vcpu) kvm_vz_vcpu_load_tlb(vcpu, cpu); kvm_vz_vcpu_load_wired(vcpu); - r = vcpu->arch.vcpu_run(vcpu->run, vcpu); + r = vcpu->arch.vcpu_run(vcpu); kvm_vz_vcpu_save_wired(vcpu); From 25bb2cf97139f81e3bb8910d26016a529019528e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: 
Wed, 12 Aug 2020 10:51:29 -0700 Subject: [PATCH 137/497] KVM: nVMX: Morph notification vector IRQ on nested VM-Enter to pending PI On successful nested VM-Enter, check for pending interrupts and convert the highest priority interrupt to a pending posted interrupt if it matches L2's notification vector. If the vCPU receives a notification interrupt before nested VM-Enter (assuming L1 disables IRQs before doing VM-Enter), the pending interrupt (for L1) should be recognized and processed as a posted interrupt when interrupts become unblocked after VM-Enter to L2. This fixes a bug where L1/L2 will get stuck in an infinite loop if L1 is trying to inject an interrupt into L2 by setting the appropriate bit in L2's PIR and sending a self-IPI prior to VM-Enter (as opposed to KVM's method of manually moving the vector from PIR->vIRR/RVI). KVM will observe the IPI while the vCPU is in L1 context and so won't immediately morph it to a posted interrupt for L2. The pending interrupt will be seen by vmx_check_nested_events(), cause KVM to force an immediate exit after nested VM-Enter, and eventually be reflected to L1 as a VM-Exit. After handling the VM-Exit, L1 will see that L2 has a pending interrupt in PIR, send another IPI, and repeat until L2 is killed. Note, posted interrupts require virtual interrupt deliveriy, and virtual interrupt delivery requires exit-on-interrupt, ergo interrupts will be unconditionally unmasked on VM-Enter if posted interrupts are enabled. Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing") Cc: stable@vger.kernel.org Cc: Liran Alon Signed-off-by: Sean Christopherson Message-Id: <20200812175129.12172-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 7 +++++++ arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/vmx/nested.c | 8 ++++++++ 3 files changed, 16 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 51ed4f0b1390..105e7859d1f2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -494,6 +494,12 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) } } +void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec) +{ + apic_clear_irr(vec, vcpu->arch.apic); +} +EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); + static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { struct kvm_vcpu *vcpu; @@ -2465,6 +2471,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) __apic_update_ppr(apic, &ppr); return apic_has_interrupt_for_ppr(apic, ppr); } +EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 754f29beb83e..4fb86e3a9dd3 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -89,6 +89,7 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); +void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 9861179d9a8c..f39491cc92c7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3531,6 +3531,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (unlikely(status != 
NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; + /* Emulate processing of posted interrupts on VM-Enter. */ + if (nested_cpu_has_posted_intr(vmcs12) && + kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { + vmx->nested.pi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); + } + /* Hide L1D cache contents from the nested guest. */ vmx->vcpu.arch.l1tf_flush_l1d = true; From a9e2e0ae686094571378c72d8146b5a1a92d0652 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Fri, 28 Aug 2020 10:23:42 +0800 Subject: [PATCH 138/497] KVM: x86: emulating RDPID failure shall return #UD rather than #GP Per Intel's SDM, RDPID takes a #UD if it is unsupported, which is more or less what KVM is emulating when MSR_TSC_AUX is not available. In fact, there are no scenarios in which RDPID is supposed to #GP. Fixes: fb6d4d340e ("KVM: x86: emulate RDPID") Signed-off-by: Robert Hoo Message-Id: <1598581422-76264-1-git-send-email-robert.hu@linux.intel.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2f6510de6b0c..85111cd0adcd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3606,7 +3606,7 @@ static int em_rdpid(struct x86_emulate_ctxt *ctxt) u64 tsc_aux = 0; if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux)) - return emulate_gp(ctxt, 0); + return emulate_ud(ctxt); ctxt->dst.val = tsc_aux; return X86EMUL_CONTINUE; } From fb0f33fdefe9f473dc5f7b71345096c8d60ab9dd Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Sat, 29 Aug 2020 00:48:22 +0000 Subject: [PATCH 139/497] KVM: nSVM: CR3 MBZ bits are only 63:52 Commit 761e4169346553c180bbd4a383aedd72f905bc9a created a wrong mask for the CR3 MBZ bits. According to APM vol 2, only the upper 12 bits are MBZ. 
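As a worked illustration (a standalone sketch, not the kernel's helper itself), "upper 12 bits" means bits 63:52, which is exactly the new mask value:

    #include <stdint.h>

    /* Bits 63:52 of CR3 are MBZ in long mode; bits 51:0 may legitimately hold
     * the page-table base address and the architectural low control bits. */
    #define CR3_LONG_MBZ_MASK   (~((UINT64_C(1) << 52) - 1)) /* 0xfff0000000000000 */

    static inline int cr3_long_mbz_ok(uint64_t cr3)
    {
            return (cr3 & CR3_LONG_MBZ_MASK) == 0;
    }

The old mask, 0xfff0000000000fe7, additionally rejected low CR3 bits that are not MBZ per the APM.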
Fixes: 761e41693465 ("KVM: nSVM: Check that MBZ bits in CR3 and CR4 are not set on vmrun of nested guests", 2020-07-08) Signed-off-by: Krish Sadhukhan Message-Id: <20200829004824.4577-2-krish.sadhukhan@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/svm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index cf6e74b9461d..da5e87d002e9 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -237,7 +237,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) } else { if (!(vmcb12->save.cr4 & X86_CR4_PAE) || !(vmcb12->save.cr0 & X86_CR0_PE) || - (vmcb12->save.cr3 & MSR_CR3_LONG_RESERVED_MASK)) + (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK)) return false; } if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 4cd360e8a77a..bb3bbc87d3ff 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -339,7 +339,7 @@ static inline bool gif_set(struct vcpu_svm *svm) /* svm.c */ #define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U #define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U -#define MSR_CR3_LONG_RESERVED_MASK 0xfff0000000000fe7U +#define MSR_CR3_LONG_MBZ_MASK 0xfff0000000000000U #define MSR_INVALID 0xffffffffU u32 svm_msrpm_offset(u32 msr); From fc595f35994233a566904bf811ffb9c3c80a84a9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 12 Aug 2020 11:06:15 -0700 Subject: [PATCH 140/497] KVM: nVMX: Add VM-Enter failed tracepoints for super early checks Add tracepoints for the early consistency checks in nested_vmx_run(). The "VMLAUNCH vs. VMRESUME" check in particular is useful to trace, as there is no architectural way to check VMCS.LAUNCH_STATE, and subtle bugs such as VMCLEAR on the wrong HPA can lead to confusing errors in the L1 VMM. Signed-off-by: Sean Christopherson Message-Id: <20200812180615.22372-1-sean.j.christopherson@intel.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f39491cc92c7..473fa40cc924 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3471,11 +3471,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (evmptrld_status == EVMPTRLD_ERROR) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; - } else if (evmptrld_status == EVMPTRLD_VMFAIL) { + } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { return nested_vmx_failInvalid(vcpu); } - if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) + if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)) return nested_vmx_failInvalid(vcpu); vmcs12 = get_vmcs12(vcpu); @@ -3486,7 +3486,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * rather than RFLAGS.ZF, and no error number is stored to the * VM-instruction error field. */ - if (vmcs12->hdr.shadow_vmcs) + if (CC(vmcs12->hdr.shadow_vmcs)) return nested_vmx_failInvalid(vcpu); if (vmx->nested.hv_evmcs) { @@ -3507,10 +3507,10 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * for misconfigurations which will anyway be caught by the processor * when using the merged vmcs02. 
*/ - if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) + if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); - if (vmcs12->launch_state == launch) + if (CC(vmcs12->launch_state == launch)) return nested_vmx_fail(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); From 50085beee878e7c4e8ddbd3d634c0170589802e0 Mon Sep 17 00:00:00 2001 From: Cfir Cohen Date: Fri, 7 Aug 2020 17:37:46 -0700 Subject: [PATCH 141/497] KVM: SVM: Mark SEV launch secret pages as dirty. The LAUNCH_SECRET command performs encryption of the launch secret memory contents. Mark pinned pages as dirty, before unpinning them. This matches the logic in sev_launch_update_data(). Signed-off-by: Cfir Cohen Message-Id: <20200808003746.66687-1-cfir@google.com> Reviewed-by: Brijesh Singh Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7bf7bf734979..bb0e89c79a04 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -856,7 +856,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) struct kvm_sev_launch_secret params; struct page **pages; void *blob, *hdr; - unsigned long n; + unsigned long n, i; int ret, offset; if (!sev_guest(kvm)) @@ -869,6 +869,14 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) if (IS_ERR(pages)) return PTR_ERR(pages); + /* + * The LAUNCH_SECRET command will perform in-place encryption of the + * memory content (i.e it will write the same memory region with C=1). + * It's possible that the cache may contain the data with C=0, i.e., + * unencrypted so invalidate it first. + */ + sev_clflush_pages(pages, n); + /* * The secret must be copied into contiguous memory region, lets verify * that userspace memory pages are contiguous before we issue command. @@ -914,6 +922,11 @@ e_free_blob: e_free: kfree(data); e_unpin_memory: + /* content of memory is updated, mark pages dirty */ + for (i = 0; i < n; i++) { + set_page_dirty_lock(pages[i]); + mark_page_accessed(pages[i]); + } sev_unpin_memory(kvm, pages, n); return ret; } From 14e3dd8d256b7a7a281ac18a65e3c8cc9573ec88 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 23 Sep 2020 13:01:33 -0400 Subject: [PATCH 142/497] KVM: SEV: shorten comments around sev_clflush_pages Very similar content is present in four comments in sev.c. Unfortunately there are small differences that make it harder to place the comment in sev_clflush_pages itself, but at least we can make it more concise. Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index bb0e89c79a04..65e15c22bd3c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -446,10 +446,8 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) } /* - * The LAUNCH_UPDATE command will perform in-place encryption of the - * memory content (i.e it will write the same memory region with C=1). - * It's possible that the cache may contain the data with C=0, i.e., - * unencrypted so invalidate it first. + * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in + * place; the cache may contain the data that was written unencrypted. 
*/ sev_clflush_pages(inpages, npages); @@ -805,10 +803,9 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) } /* - * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the - * memory content (i.e it will write the same memory region with C=1). - * It's possible that the cache may contain the data with C=0, i.e., - * unencrypted so invalidate it first. + * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify + * the pages; flush the destination too so that future accesses do not + * see stale data. */ sev_clflush_pages(src_p, 1); sev_clflush_pages(dst_p, 1); @@ -870,10 +867,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) return PTR_ERR(pages); /* - * The LAUNCH_SECRET command will perform in-place encryption of the - * memory content (i.e it will write the same memory region with C=1). - * It's possible that the cache may contain the data with C=0, i.e., - * unencrypted so invalidate it first. + * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in + * place; the cache may contain the data that was written unencrypted. */ sev_clflush_pages(pages, n); From 28e2b2f1a40dbada5740fdd83e8645949d1c8db6 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 24 Sep 2020 13:41:58 -0500 Subject: [PATCH 143/497] KVM: VMX: Do not perform emulation for INVD intercept The INVD instruction is emulated as a NOP, just skip the instruction instead. Signed-off-by: Tom Lendacky Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1eeafec6c791..dc74bb62d1df 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5127,7 +5127,8 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) static int handle_invd(struct kvm_vcpu *vcpu) { - return kvm_emulate_instruction(vcpu, 0); + /* Treat an INVD instruction as a NOP and just skip it. */ + return kvm_skip_emulated_instruction(vcpu); } static int handle_invlpg(struct kvm_vcpu *vcpu) From 8d921acf98ec3e82cf673a5422890939cd4c24f5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Sep 2020 12:42:46 -0700 Subject: [PATCH 144/497] KVM: VMX: Use precomputed MAXPHYADDR for RTIT base MSR check Use cpuid_maxphyaddr() instead of cpuid_query_maxphyaddr() for the RTIT base MSR check. There is no reason to recompute MAXPHYADDR as the precomputed version is synchronized with CPUID updates, and MSR_IA32_RTIT_OUTPUT_BASE is not written between stuffing CPUID and refreshing vcpu->arch.maxphyaddr. 
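For context, a simplified sketch of the two helpers (the cached variant matches the definition in cpuid.h; the query variant is condensed and omits the extra CPUID leaves the real helper consults):

    /* Cheap: returns the value cached when guest CPUID was last updated. */
    static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
    {
            return vcpu->arch.maxphyaddr;
    }

    /* Expensive: re-walks the guest CPUID entries on every call. */
    int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
    {
            struct kvm_cpuid_entry2 *best;

            best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
            if (best)
                    return best->eax & 0xff;
            return 36;      /* fallback when the leaf is absent */
    }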
Signed-off-by: Sean Christopherson Message-Id: <20200924194250.19137-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index dc74bb62d1df..84dea7357e1e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -147,7 +147,7 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); RTIT_STATUS_BYTECNT)) #define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ - (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) + (~((1UL << cpuid_maxphyaddr(vcpu)) - 1) | 0x7f) /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: From 526ad23bc564ae60170ae18a087d018bbd6618ab Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Sep 2020 12:42:47 -0700 Subject: [PATCH 145/497] KVM: x86: Unexport cpuid_query_maxphyaddr() Stop exporting cpuid_query_maxphyaddr() now that it's not being abused by VMX. Signed-off-by: Sean Christopherson Message-Id: <20200924194250.19137-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7456f9ad424b..37c3668a774f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -186,7 +186,6 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) not_found: return 36; } -EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr); /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, From 1cc6cbc3e405536b971cbd62341197a411e7713c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Sep 2020 12:42:48 -0700 Subject: [PATCH 146/497] KVM: VMX: Replace MSR_IA32_RTIT_OUTPUT_BASE_MASK with helper function Replace the subtly not-a-constant MSR_IA32_RTIT_OUTPUT_BASE_MASK with a proper helper function to check whether or not the specified base is valid. Blindly referencing the local 'vcpu' is especially nasty. Signed-off-by: Sean Christopherson Message-Id: <20200924194250.19137-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 84dea7357e1e..2b5e749ed111 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -146,9 +146,6 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ RTIT_STATUS_BYTECNT)) -#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ - (~((1UL << cpuid_maxphyaddr(vcpu)) - 1) | 0x7f) - /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -1037,6 +1034,12 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); } +static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) +{ + /* The base must be 128-byte aligned and a legal physical address. 
*/ + return !(base & (~((1UL << cpuid_maxphyaddr(vcpu)) - 1) | 0x7f)); +} + static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; @@ -2172,7 +2175,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_single_range_output)) return 1; - if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK) + if (!pt_output_base_valid(vcpu, data)) return 1; vmx->pt_desc.guest.output_base = data; break; From dc46515cf838e37c602016cc905df051e06cb8ea Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Sep 2020 12:42:49 -0700 Subject: [PATCH 147/497] KVM: x86: Move illegal GPA helper out of the MMU code Rename kvm_mmu_is_illegal_gpa() to kvm_vcpu_is_illegal_gpa() and move it to cpuid.h so that's it's colocated with cpuid_maxphyaddr(). The helper is not MMU specific and will gain a user that is completely unrelated to the MMU in a future patch. No functional change intended. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20200924194250.19137-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 5 +++++ arch/x86/kvm/mmu.h | 5 ----- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 3a923ae15f2f..1d2c4f2e4bb6 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -34,6 +34,11 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) return vcpu->arch.maxphyaddr; } +static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); +} + struct cpuid_reg { u32 function; u32 index; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5efc6081ca13..9c4a9c8e43d9 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -155,11 +155,6 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } -static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); -} - /* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5ae9f972609f..c82e365c3f1d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -521,7 +521,7 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { /* Check if guest physical address doesn't exceed guest maximum */ - if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) { + if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) { exception->error_code |= PFERR_RSVD_MASK; return UNMAPPED_GVA; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2b5e749ed111..4e372b4e102d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5314,7 +5314,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * would also use advanced VM-exit information for EPT violations to * reconstruct the page fault error code. 
*/ - if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa))) + if (unlikely(kvm_vcpu_is_illegal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); From 7096cbfb6cb63420f333d0ba238d1a857ba3f290 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Sep 2020 12:42:50 -0700 Subject: [PATCH 148/497] KVM: VMX: Use "illegal GPA" helper for PT/RTIT output base check Use kvm_vcpu_is_illegal_gpa() to check for a legal GPA when validating a PT output base instead of open coding a clever, but difficult to read, variant. Code readability is far more important than shaving a few uops in a slow path. No functional change intended. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20200924194250.19137-6-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4e372b4e102d..2c1abe964b76 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1037,7 +1037,7 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) { /* The base must be 128-byte aligned and a legal physical address. */ - return !(base & (~((1UL << cpuid_maxphyaddr(vcpu)) - 1) | 0x7f)); + return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f); } static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) From b785a442aa2111b6940be19cdfb3ebabf605f6a3 Mon Sep 17 00:00:00 2001 From: Li Qiang Date: Thu, 24 Sep 2020 08:58:00 -0700 Subject: [PATCH 149/497] cpuidle-haltpoll: fix error comments in arch_haltpoll_disable The 'arch_haltpoll_disable' is used to disable guest halt poll. Correct the comments. Fixes: a1c4423b02b21 ("cpuidle-haltpoll: disable host side polling when kvm virtualized") Signed-off-by: Li Qiang Message-Id: <20200924155800.4939-1-liq3ea@163.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9663ba31347c..42c6e0deff9e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -952,7 +952,7 @@ void arch_haltpoll_disable(unsigned int cpu) if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) return; - /* Enable guest halt poll disables host halt poll */ + /* Disable guest halt poll enables host halt poll */ smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1); } EXPORT_SYMBOL_GPL(arch_haltpoll_disable); From becdad8592254eeaa84d8f6f1167137c11c30876 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 09:50:45 -0700 Subject: [PATCH 150/497] KVM: VMX: Rename vmx_*_supported() helpers to cpu_has_vmx_*() Rename helpers for a few controls to conform to the more prevelant style of cpu_has_vmx_(). Consistent names will allow adding macros to consolidate the boilerplate code for adjusting secondary execution controls. No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20200923165048.20486-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/capabilities.h | 8 ++++---- arch/x86/kvm/vmx/vmx.c | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 4bbd8b448d22..5761736d373a 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -196,7 +196,7 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline bool vmx_rdrand_supported(void) +static inline bool cpu_has_vmx_rdrand(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_RDRAND_EXITING; @@ -233,7 +233,7 @@ static inline bool cpu_has_vmx_encls_vmexit(void) SECONDARY_EXEC_ENCLS_EXITING; } -static inline bool vmx_rdseed_supported(void) +static inline bool cpu_has_vmx_rdseed(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_RDSEED_EXITING; @@ -244,13 +244,13 @@ static inline bool cpu_has_vmx_pml(void) return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; } -static inline bool vmx_xsaves_supported(void) +static inline bool cpu_has_vmx_xsaves(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_XSAVES; } -static inline bool vmx_waitpkg_supported(void) +static inline bool cpu_has_vmx_waitpkg(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2c1abe964b76..96adc964afed 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4121,7 +4121,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_pml) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - if (vmx_xsaves_supported()) { + if (cpu_has_vmx_xsaves()) { /* Exposing XSAVES only when XSAVE is exposed */ bool xsaves_enabled = boot_cpu_has(X86_FEATURE_XSAVE) && @@ -4179,7 +4179,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } - if (vmx_rdrand_supported()) { + if (cpu_has_vmx_rdrand()) { bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); if (rdrand_enabled) exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; @@ -4194,7 +4194,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } - if (vmx_rdseed_supported()) { + if (cpu_has_vmx_rdseed()) { bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); if (rdseed_enabled) exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; @@ -4209,7 +4209,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } - if (vmx_waitpkg_supported()) { + if (cpu_has_vmx_waitpkg()) { bool waitpkg_enabled = guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG); @@ -4317,7 +4317,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - if (vmx_xsaves_supported()) + if (cpu_has_vmx_xsaves()) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); if (enable_pml) { @@ -7254,14 +7254,14 @@ static __init void vmx_set_cpu_caps(void) /* CPUID 0xD.1 */ supported_xss = 0; - if (!vmx_xsaves_supported()) + if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); /* CPUID 0x80000001 */ if (!cpu_has_vmx_rdtscp()) kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); - if (vmx_waitpkg_supported()) + if (cpu_has_vmx_waitpkg()) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } From b936d3eb92b7c1258805bebecc7eaac0c8a51b7f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 
2020 09:50:46 -0700 Subject: [PATCH 151/497] KVM: VMX: Unconditionally clear CPUID.INVPCID if !CPUID.PCID If PCID is not exposed to the guest, clear INVPCID in the guest's CPUID even if the VMCS INVPCID enable is not supported. This will allow consolidating the secondary execution control adjustment code without having to special case INVPCID. Technically, this fixes a bug where !CPUID.PCID && CPUID.INVCPID would result in unexpected guest behavior (#UD instead of #GP/#PF), but KVM doesn't support exposing INVPCID if it's not supported in the VMCS, i.e. such a config is broken/bogus no matter what. Signed-off-by: Sean Christopherson Message-Id: <20200923165048.20486-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 96adc964afed..641e5ef513ae 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4158,16 +4158,22 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } + /* + * Expose INVPCID if and only if PCID is also exposed to the guest. + * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF + * if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect + * behavior from the guest perspective (it would expect #GP or #PF). + */ + if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) + guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); + if (cpu_has_vmx_invpcid()) { /* Exposing INVPCID only when PCID is exposed */ bool invpcid_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && - guest_cpuid_has(vcpu, X86_FEATURE_PCID); + guest_cpuid_has(vcpu, X86_FEATURE_INVPCID); - if (!invpcid_enabled) { + if (!invpcid_enabled) exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); - } if (nested) { if (invpcid_enabled) From 7f3603b631362340774291a961712ec07bbf8122 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 09:50:47 -0700 Subject: [PATCH 152/497] KVM: VMX: Rename RDTSCP secondary exec control name to insert "ENABLE" Rename SECONDARY_EXEC_RDTSCP to SECONDARY_EXEC_ENABLE_RDTSCP in preparation for consolidating the logic for adjusting secondary exec controls based on the guest CPUID model. No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20200923165048.20486-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/kvm/vmx/capabilities.h | 2 +- arch/x86/kvm/vmx/nested.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 10 +++++----- tools/testing/selftests/kvm/include/x86_64/vmx.h | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index cd7de4b401fe..f8ba5289ecb0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -52,7 +52,7 @@ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES VMCS_CONTROL_BIT(VIRT_APIC_ACCESSES) #define SECONDARY_EXEC_ENABLE_EPT VMCS_CONTROL_BIT(EPT) #define SECONDARY_EXEC_DESC VMCS_CONTROL_BIT(DESC_EXITING) -#define SECONDARY_EXEC_RDTSCP VMCS_CONTROL_BIT(RDTSCP) +#define SECONDARY_EXEC_ENABLE_RDTSCP VMCS_CONTROL_BIT(RDTSCP) #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE VMCS_CONTROL_BIT(VIRTUAL_X2APIC) #define SECONDARY_EXEC_ENABLE_VPID VMCS_CONTROL_BIT(VPID) #define SECONDARY_EXEC_WBINVD_EXITING VMCS_CONTROL_BIT(WBINVD_EXITING) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 5761736d373a..3a1861403d73 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -151,7 +151,7 @@ static inline bool vmx_umip_emulated(void) static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDTSCP; + SECONDARY_EXEC_ENABLE_RDTSCP; } static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 473fa40cc924..e8048e01c044 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2286,7 +2286,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_ENABLE_INVPCID | - SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | @@ -6404,7 +6404,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high &= SECONDARY_EXEC_DESC | - SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_APIC_REGISTER_VIRT | diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 641e5ef513ae..f60c64b749b3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2430,7 +2430,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_DESC | - SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | @@ -4146,15 +4146,15 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (cpu_has_vmx_rdtscp()) { bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); if (!rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; + exec_control &= ~SECONDARY_EXEC_ENABLE_RDTSCP; if (nested) { if (rdtscp_enabled) vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDTSCP; + SECONDARY_EXEC_ENABLE_RDTSCP; else vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; + ~SECONDARY_EXEC_ENABLE_RDTSCP; } } @@ -7323,7 
+7323,7 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, * Because it is marked as EmulateOnUD, we need to intercept it here. */ case x86_intercept_rdtscp: - if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { + if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { exception->vector = UD_VECTOR; exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 16fa21ebb99c..54d624dd6c10 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -48,7 +48,7 @@ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 #define SECONDARY_EXEC_DESC 0x00000004 -#define SECONDARY_EXEC_RDTSCP 0x00000008 +#define SECONDARY_EXEC_ENABLE_RDTSCP 0x00000008 #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 From 8b50b92f9f1a819cba290e24064337004c00ee36 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Sep 2020 17:30:11 -0700 Subject: [PATCH 153/497] KVM: VMX: Add a helper and macros to reduce boilerplate for sec exec ctls Add a helper function and several wrapping macros to consolidate the copy-paste code in vmx_compute_secondary_exec_control() for adjusting controls that are dependent on guest CPUID bits. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200925003011.21016-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 151 +++++++++++++++++------------------------ 1 file changed, 64 insertions(+), 87 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f60c64b749b3..b0ba1cc79e6a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4081,6 +4081,61 @@ u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +/* + * Adjust a single secondary execution control bit to intercept/allow an + * instruction in the guest. This is usually done based on whether or not a + * feature has been exposed to the guest in order to correctly emulate faults. + */ +static inline void +vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, + u32 control, bool enabled, bool exiting) +{ + /* + * If the control is for an opt-in feature, clear the control if the + * feature is not exposed to the guest, i.e. not enabled. If the + * control is opt-out, i.e. an exiting control, clear the control if + * the feature _is_ exposed to the guest, i.e. exiting/interception is + * disabled for the associated instruction. Note, the caller is + * responsible presetting exec_control to set all supported bits. + */ + if (enabled == exiting) + *exec_control &= ~control; + + /* + * Update the nested MSR settings so that a nested VMM can/can't set + * controls for features that are/aren't exposed to the guest. + */ + if (nested) { + if (enabled) + vmx->nested.msrs.secondary_ctls_high |= control; + else + vmx->nested.msrs.secondary_ctls_high &= ~control; + } +} + +/* + * Wrapper macro for the common case of adjusting a secondary execution control + * based on a single guest CPUID bit, with a dedicated feature bit. This also + * verifies that the control is actually supported by KVM and hardware. 
+ */ +#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ +({ \ + bool __enabled; \ + \ + if (cpu_has_vmx_##name()) { \ + __enabled = guest_cpuid_has(&(vmx)->vcpu, \ + X86_FEATURE_##feat_name); \ + vmx_adjust_secondary_exec_control(vmx, exec_control, \ + SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \ + } \ +}) + +/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ +#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ + vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) + +#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ + vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) { @@ -4130,33 +4185,12 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) vcpu->arch.xsaves_enabled = xsaves_enabled; - if (!xsaves_enabled) - exec_control &= ~SECONDARY_EXEC_XSAVES; - - if (nested) { - if (xsaves_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_XSAVES; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_XSAVES; - } + vmx_adjust_secondary_exec_control(vmx, &exec_control, + SECONDARY_EXEC_XSAVES, + xsaves_enabled, false); } - if (cpu_has_vmx_rdtscp()) { - bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); - if (!rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_ENABLE_RDTSCP; - - if (nested) { - if (rdtscp_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_RDTSCP; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_ENABLE_RDTSCP; - } - } + vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP); /* * Expose INVPCID if and only if PCID is also exposed to the guest. 
@@ -4166,71 +4200,14 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) */ if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); + vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); - if (cpu_has_vmx_invpcid()) { - /* Exposing INVPCID only when PCID is exposed */ - bool invpcid_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_INVPCID); - if (!invpcid_enabled) - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); + vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); - if (nested) { - if (invpcid_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_INVPCID; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_ENABLE_INVPCID; - } - } - - if (cpu_has_vmx_rdrand()) { - bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); - if (rdrand_enabled) - exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; - - if (nested) { - if (rdrand_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDRAND_EXITING; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDRAND_EXITING; - } - } - - if (cpu_has_vmx_rdseed()) { - bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); - if (rdseed_enabled) - exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; - - if (nested) { - if (rdseed_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_RDSEED_EXITING; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_RDSEED_EXITING; - } - } - - if (cpu_has_vmx_waitpkg()) { - bool waitpkg_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG); - - if (!waitpkg_enabled) - exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; - - if (nested) { - if (waitpkg_enabled) - vmx->nested.msrs.secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; - else - vmx->nested.msrs.secondary_ctls_high &= - ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; - } - } + vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, + ENABLE_USR_WAIT_PAUSE, false); vmx->secondary_exec_control = exec_control; } From 4d710de9646a7ea99f628e17d9d700cfaf1af8ca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 12:12:04 -0700 Subject: [PATCH 154/497] KVM: x86/mmu: Stash 'kvm' in a local variable in kvm_mmu_free_roots() To make kvm_mmu_free_roots() a bit more readable, capture 'kvm' in a local variable instead of doing vcpu->kvm over and over (and over). No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20200923191204.8410-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c82e365c3f1d..5950dc92b583 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3603,6 +3603,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, ulong roots_to_free) { + struct kvm *kvm = vcpu->kvm; int i; LIST_HEAD(invalid_list); bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; @@ -3620,22 +3621,21 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return; } - spin_lock(&vcpu->kvm->mmu_lock); + spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) - mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa, + mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, &invalid_list); if (free_active_root) { if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { - mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, - &invalid_list); + mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); } else { for (i = 0; i < 4; ++i) if (mmu->pae_root[i] != 0) - mmu_free_root_page(vcpu->kvm, + mmu_free_root_page(kvm, &mmu->pae_root[i], &invalid_list); mmu->root_hpa = INVALID_PAGE; @@ -3643,8 +3643,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, mmu->root_pgd = 0; } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); From b44f50d87ce2f051909b6bb2509727e05330036c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 24 Sep 2020 16:57:51 +0200 Subject: [PATCH 155/497] KVM: x86: hyper-v: Mention SynDBG CPUID leaves in api.rst We forgot to update KVM_GET_SUPPORTED_HV_CPUID's documentation in api.rst when SynDBG leaves were added. While on it, fix 'KVM_GET_SUPPORTED_CPUID' copy-paste error. Fixes: f97f5a56f597 ("x86/kvm/hyper-v: Add support for synthetic debugger interface") Signed-off-by: Vitaly Kuznetsov Message-Id: <20200924145757.1035782-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 51191b56e61c..9e2a545d8084 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4498,11 +4498,14 @@ Currently, the following list of CPUID leaves are returned: - HYPERV_CPUID_ENLIGHTMENT_INFO - HYPERV_CPUID_IMPLEMENT_LIMITS - HYPERV_CPUID_NESTED_FEATURES + - HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS + - HYPERV_CPUID_SYNDBG_INTERFACE + - HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS). -Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure +Userspace invokes KVM_GET_SUPPORTED_HV_CPUID by passing a kvm_cpuid2 structure with the 'nent' field indicating the number of entries in the variable-size array 'entries'. If the number of entries is too low to describe all Hyper-V feature leaves, an error (E2BIG) is returned. 
If the number is more or equal From dbcf3f96fa662bd5e1f93ea7c10a8dd0dce180ae Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 24 Sep 2020 16:57:52 +0200 Subject: [PATCH 156/497] KVM: x86: hyper-v: disallow configuring SynIC timers with no SynIC Hyper-V Synthetic timers require SynIC but we don't seem to check that upon HV_X64_MSR_STIMER[X]_CONFIG/HV_X64_MSR_STIMER0_COUNT writes. Make the behavior match synic_set_msr(). Signed-off-by: Vitaly Kuznetsov Message-Id: <20200924145757.1035782-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1d330564eed8..67a4f60a89b5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -633,6 +633,11 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, { union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + + if (!synic->active && !host) + return 1; trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); @@ -652,6 +657,12 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, bool host) { + struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer); + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + + if (!synic->active && !host) + return 1; + trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id, stimer->index, count, host); From ace569e0154a8c7a686bab8e791876d27b42d9b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 15:14:05 -0700 Subject: [PATCH 157/497] KVM: x86/mmu: Move flush logic from mmu_page_zap_pte() to FNAME(invlpg) Move the logic that controls whether or not FNAME(invlpg) needs to flush fully into FNAME(invlpg) so that mmu_page_zap_pte() doesn't return a value. This allows a future patch to redefine the return semantics for mmu_page_zap_pte() so that it can recursively zap orphaned child shadow pages for nested TDP MMUs. No functional change intended. 
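As a rough illustration of the new split (a sketch only; the real hunks follow below), FNAME(invlpg) now snapshots the old SPTE and makes the flush decision itself, while mmu_page_zap_pte() simply zaps:

	/* sketch: inside FNAME(invlpg), at the last-level SPTE */
	old_spte = *sptep;
	if (is_last_spte(old_spte, level)) {
		...
		mmu_page_zap_pte(vcpu->kvm, sp, sptep);
		if (is_shadow_present_pte(old_spte))
			kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
					KVM_PAGES_PER_HPAGE(sp->role.level));
	}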
Signed-off-by: Sean Christopherson Message-Id: <20200923221406.16297-2-sean.j.christopherson@intel.com> Reviewed-by: Ben Gardon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 +++------- arch/x86/kvm/mmu/paging_tmpl.h | 7 +++++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5950dc92b583..0122e68ac0ab 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2615,7 +2615,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } -static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, +static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *spte) { u64 pte; @@ -2631,13 +2631,9 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); } - return true; - } - - if (is_mmio_spte(pte)) + } else if (is_mmio_spte(pte)) { mmu_spte_clear_no_track(spte); - - return false; + } } static void kvm_mmu_page_unlink_children(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4dd6b1e5b8cf..3bb624a3dda9 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -895,6 +895,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; + u64 old_spte; int level; u64 *sptep; @@ -917,7 +918,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) sptep = iterator.sptep; sp = sptep_to_sp(sptep); - if (is_last_spte(*sptep, level)) { + old_spte = *sptep; + if (is_last_spte(old_spte, level)) { pt_element_t gpte; gpa_t pte_gpa; @@ -927,7 +929,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) pte_gpa = FNAME(get_level1_sp_gpa)(sp); pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); - if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) + mmu_page_zap_pte(vcpu->kvm, sp, sptep); + if (is_shadow_present_pte(old_spte)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); From 2de4085ccceaf075f7c8d1688338fb82e2349f0f Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 23 Sep 2020 15:14:06 -0700 Subject: [PATCH 158/497] KVM: x86/MMU: Recursively zap nested TDP SPs when zapping last/only parent Recursively zap all to-be-orphaned children, unsynced or otherwise, when zapping a shadow page for a nested TDP MMU. KVM currently only zaps the unsynced child pages, but not the synced ones. This can create problems over time when running many nested guests because it leaves unlinked pages which will not be freed until the page quota is hit. With the default page quota of 20 shadow pages per 1000 guest pages, this looks like a memory leak and can degrade MMU performance. In a recent benchmark, substantial performance degradation was observed: An L1 guest was booted with 64G memory. 2G nested Windows guests were booted, 10 at a time for 20 iterations. (200 total boots) Windows was used in this benchmark because they touch all of their memory on startup. By the end of the benchmark, the nested guests were taking ~10% longer to boot. With this patch there is no degradation in boot time. Without this patch the benchmark ends with hundreds of thousands of stale EPT02 pages cluttering up rmaps and the page hash map. As a result, VM shutdown is also much slower: deleting memslot 0 was observed to take over a minute. 
With this patch it takes just a few milliseconds. Cc: Peter Shier Signed-off-by: Ben Gardon Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20200923221406.16297-3-sean.j.christopherson@intel.com> Reviewed-by: Ben Gardon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 30 +++++++++++++++++++++++------- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0122e68ac0ab..0db1ca0ba5df 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2615,8 +2615,9 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } -static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, - u64 *spte) +/* Returns the number of zapped non-leaf child shadow pages. */ +static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *spte, struct list_head *invalid_list) { u64 pte; struct kvm_mmu_page *child; @@ -2630,19 +2631,34 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, } else { child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); + + /* + * Recursively zap nested TDP SPs, parentless SPs are + * unlikely to be used again in the near future. This + * avoids retaining a large number of stale nested SPs. + */ + if (tdp_enabled && invalid_list && + child->role.guest_mode && !child->parent_ptes.val) + return kvm_mmu_prepare_zap_page(kvm, child, + invalid_list); } } else if (is_mmio_spte(pte)) { mmu_spte_clear_no_track(spte); } + return 0; } -static void kvm_mmu_page_unlink_children(struct kvm *kvm, - struct kvm_mmu_page *sp) +static int kvm_mmu_page_unlink_children(struct kvm *kvm, + struct kvm_mmu_page *sp, + struct list_head *invalid_list) { + int zapped = 0; unsigned i; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - mmu_page_zap_pte(kvm, sp, sp->spt + i); + zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); + + return zapped; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -2688,7 +2704,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); - kvm_mmu_page_unlink_children(kvm, sp); + *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable.
*/ @@ -5396,7 +5412,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u32 base_role = vcpu->arch.mmu->mmu_role.base.word; entry = *spte; - mmu_page_zap_pte(vcpu->kvm, sp, spte); + mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && !((sp->role.word ^ base_role) & ~role_ign.word) && rmap_can_add(vcpu)) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 3bb624a3dda9..e1066226b8f0 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -929,7 +929,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) pte_gpa = FNAME(get_level1_sp_gpa)(sp); pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); - mmu_page_zap_pte(vcpu->kvm, sp, sptep); + mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); if (is_shadow_present_pte(old_spte)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); From 7b367bc9a6415dc6f31135d50ab37e8ee18d6974 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 15:04:22 -0700 Subject: [PATCH 159/497] KVM: x86/mmu: Return -EIO if page fault returns RET_PF_INVALID Exit to userspace with an error if the MMU is buggy and returns RET_PF_INVALID when servicing a page fault. This will allow a future patch to invert the emulation path, i.e. emulate only on RET_PF_EMULATE instead of emulating on anything but RET_PF_RETRY. This technically means that KVM will exit to userspace instead of emulating on RET_PF_INVALID, but practically speaking it's a nop as the MMU never returns RET_PF_INVALID. Signed-off-by: Sean Christopherson Message-Id: <20200923220425.18402-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0db1ca0ba5df..68c0a1a374cf 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5462,7 +5462,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); - WARN_ON(r == RET_PF_INVALID); + if (WARN_ON_ONCE(r == RET_PF_INVALID)) + return -EIO; } if (r == RET_PF_RETRY) From 83a2ba4cb2b596e5e89faa4e29b1d1f3b245b95e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 15:04:23 -0700 Subject: [PATCH 160/497] KVM: x86/mmu: Invert RET_PF_* check when falling through to emulation Explicitly check for RET_PF_EMULATE instead of implicitly doing the same by checking for !RET_PF_RETRY (RET_PF_INVALID is handled earlier). This will allow adding new RET_PF_ types in future patches without breaking the emulation path. No functional change intended.
Signed-off-by: Sean Christopherson Message-Id: <20200923220425.18402-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 68c0a1a374cf..0831e88fb227 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5466,10 +5466,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, return -EIO; } - if (r == RET_PF_RETRY) - return 1; if (r < 0) return r; + if (r != RET_PF_EMULATE) + return 1; /* * Before emulating the instruction, check if the error code From c4371c2a682e0da1ed2cd7e3c5496f055d873554 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 15:04:24 -0700 Subject: [PATCH 161/497] KVM: x86/mmu: Return unique RET_PF_* values if the fault was fixed Introduce RET_PF_FIXED and RET_PF_SPURIOUS to provide unique return values instead of overloading RET_PF_RETRY. In the short term, the unique values add clarity to the code and RET_PF_SPURIOUS will be used by set_spte() to avoid unnecessary work for spurious faults. In the long term, TDX will use RET_PF_FIXED to deterministically map memory during pre-boot. The page fault flow may bail early for benign reasons, e.g. if the mmu_notifier fires for an unrelated address. With only RET_PF_RETRY, it's impossible for the caller to distinguish between "cool, page is mapped" and "darn, need to try again", and thus cannot handle benign cases like the mmu_notifier retry. Signed-off-by: Sean Christopherson Message-Id: <20200923220425.18402-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 46 +++++++++++++++++++------------------ arch/x86/kvm/mmu/mmutrace.h | 13 ++++------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0831e88fb227..2360d33a27b1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -198,17 +198,20 @@ module_param(dbg, bool, 0644); #define PTE_LIST_EXT 3 /* - * Return values of handle_mmio_page_fault and mmu.page_fault: + * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). + * * RET_PF_RETRY: let CPU fault again on the address. * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. - * - * For handle_mmio_page_fault only: * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. + * RET_PF_FIXED: The faulting entry has been fixed. + * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. */ enum { RET_PF_RETRY = 0, - RET_PF_EMULATE = 1, - RET_PF_INVALID = 2, + RET_PF_EMULATE, + RET_PF_INVALID, + RET_PF_FIXED, + RET_PF_SPURIOUS, }; struct pte_list_desc { @@ -3083,7 +3086,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, int was_rmapped = 0; int rmap_count; int set_spte_ret; - int ret = RET_PF_RETRY; + int ret = RET_PF_FIXED; bool flush = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, @@ -3491,21 +3494,19 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) } /* - * Return value: - * - true: let the vcpu to access on the same address again. - * - false: let the real page fault path to fix it. + * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. 
*/ -static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u32 error_code) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - bool fault_handled = false; + int ret = RET_PF_INVALID; u64 spte = 0ull; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) - return false; + return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3531,7 +3532,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * they are always ACC_ALL. */ if (is_access_allowed(error_code, spte)) { - fault_handled = true; + ret = RET_PF_SPURIOUS; break; } @@ -3574,11 +3575,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - fault_handled = fast_pf_fix_direct_spte(vcpu, sp, - iterator.sptep, spte, - new_spte); - if (fault_handled) + if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, + new_spte)) { + ret = RET_PF_FIXED; break; + } if (++retry_count > 4) { printk_once(KERN_WARNING @@ -3589,10 +3590,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, fault_handled); + spte, ret); walk_shadow_page_lockless_end(vcpu); - return fault_handled; + return ret; } static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, @@ -4104,8 +4105,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (fast_page_fault(vcpu, gpa, error_code)) - return RET_PF_RETRY; + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; r = mmu_topup_memory_caches(vcpu, false); if (r) diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 9d15bc0c535b..2080f9c32213 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -244,14 +244,11 @@ TRACE_EVENT( __entry->access) ); -#define __spte_satisfied(__spte) \ - (__entry->retry && is_writable_pte(__entry->__spte)) - TRACE_EVENT( fast_page_fault, TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, - u64 *sptep, u64 old_spte, bool retry), - TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry), + u64 *sptep, u64 old_spte, int ret), + TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret), TP_STRUCT__entry( __field(int, vcpu_id) @@ -260,7 +257,7 @@ TRACE_EVENT( __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) - __field(bool, retry) + __field(int, ret) ), TP_fast_assign( @@ -270,7 +267,7 @@ TRACE_EVENT( __entry->sptep = sptep; __entry->old_spte = old_spte; __entry->new_spte = *sptep; - __entry->retry = retry; + __entry->ret = ret; ), TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx" @@ -278,7 +275,7 @@ TRACE_EVENT( __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|", kvm_mmu_trace_pferr_flags), __entry->sptep, __entry->old_spte, __entry->new_spte, - __spte_satisfied(old_spte), __spte_satisfied(new_spte) + __entry->ret == RET_PF_SPURIOUS, __entry->ret == RET_PF_FIXED ) ); From 127037591c84e8d550e8951d274a5e291d3ed56f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 15:04:25 -0700 Subject: [PATCH 162/497] KVM: x86/mmu: Bail early from final #PF handling on spurious faults Detect spurious page faults, e.g. 
page faults that occur when multiple vCPUs simultaneously access a not-present page, and skip the SPTE write, prefetch, and stats update for spurious faults. Note, the performance benefits of skipping the write and prefetch are likely negligible, and the false positive stats adjustment is probably lost in the noise. The primary motivation is to play nice with TDX's SEPT in the long term. SEAMCALLs (to program SEPT entries) are quite costly, e.g. thousands of cycles, and a spurious SEPT update will result in a SEAMCALL error (which KVM will ideally treat as fatal). Reported-by: Kai Huang Signed-off-by: Sean Christopherson Message-Id: <20200923220425.18402-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 ++++++++++++++++- arch/x86/kvm/mmu/paging_tmpl.h | 3 +++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2360d33a27b1..7ec3d057e8e0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2985,6 +2985,7 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) /* Bits which may be returned by set_spte() */ #define SET_SPTE_WRITE_PROTECTED_PT BIT(0) #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) +#define SET_SPTE_SPURIOUS BIT(2) static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned int pte_access, int level, @@ -3073,7 +3074,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte = mark_spte_for_access_track(spte); set_pte: - if (mmu_spte_update(sptep, spte)) + if (*sptep == spte) + ret |= SET_SPTE_SPURIOUS; + else if (mmu_spte_update(sptep, spte)) ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; return ret; } @@ -3128,6 +3131,15 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (unlikely(is_mmio_spte(*sptep))) ret = RET_PF_EMULATE; + /* + * The fault is fully spurious if and only if the new SPTE and old SPTE + * are identical, and emulation is not required. + */ + if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) { + WARN_ON_ONCE(!was_rmapped); + return RET_PF_SPURIOUS; + } + pgprintk("%s: setting spte %llx\n", __func__, *sptep); trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped && is_large_pte(*sptep)) @@ -3364,6 +3376,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, write, level, base_gfn, pfn, prefault, map_writable); + if (ret == RET_PF_SPURIOUS) + return ret; + direct_pte_prefetch(vcpu, it.sptep); ++vcpu->stat.pf_fixed; return ret; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index e1066226b8f0..188ec2633bc5 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -711,6 +711,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, it.level, base_gfn, pfn, prefault, map_writable); + if (ret == RET_PF_SPURIOUS) + return ret; + FNAME(pte_prefetch)(vcpu, gw, it.sptep); ++vcpu->stat.pf_fixed; return ret; From 8888cdd0996c2d51cd417f9a60a282c034f3fa28 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 23 Sep 2020 11:31:11 -0700 Subject: [PATCH 163/497] KVM: VMX: Extract posted interrupt support to separate files Extract the posted interrupt code so that it can be reused for Trust Domain Extensions (TDX), which requires posted interrupts and can use KVM VMX's implementation almost verbatim. TDX is different enough from raw VMX that it is highly desirable to implement the guts of TDX in a separate file, i.e. 
reusing posted interrupt code by shoving TDX support into vmx.c would be a mess. Signed-off-by: Xiaoyao Li Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20200923183112.3030-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 3 +- arch/x86/kvm/vmx/posted_intr.c | 332 +++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/posted_intr.h | 99 ++++++++++ arch/x86/kvm/vmx/vmx.c | 320 +------------------------------ arch/x86/kvm/vmx/vmx.h | 90 +-------- 5 files changed, 439 insertions(+), 405 deletions(-) create mode 100644 arch/x86/kvm/vmx/posted_intr.c create mode 100644 arch/x86/kvm/vmx/posted_intr.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 4a3081e9f4b5..7f86a14aed0e 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -17,7 +17,8 @@ kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o -kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o +kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ + vmx/evmcs.o vmx/nested.o vmx/posted_intr.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c new file mode 100644 index 000000000000..e4e7adff818c --- /dev/null +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +#include +#include + +#include "lapic.h" +#include "posted_intr.h" +#include "trace.h" +#include "vmx.h" + +/* + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * can find which vCPU should be waken up. + */ +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + +static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + +void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. + */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + return; + + /* + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block is the one expected to change PID.NDST and the + * wakeup handler expects the vCPU to be on the blocked_vcpu_list that + * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up + * correctly. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + goto after_clear_sn; + } + + /* The full case. */ + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + new.sn = 0; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + +after_clear_sn: + + /* + * Clear SN before reading the bitmap. The VT-d firmware + * writes the bitmap and reads SN atomically (5.2.3 in the + * spec), so it doesn't really have a memory barrier that + * pairs with this, but we cannot do that and we need one. 
+ */ + smp_mb__after_atomic(); + + if (!pi_is_pir_empty(pi_desc)) + pi_set_on(pi_desc); +} + +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) + return; + + /* Set SN when the vCPU is preempted */ + if (vcpu->preempted) + pi_set_sn(pi_desc); +} + +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + do { + old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + vcpu->pre_pcpu = -1; + } +} + +/* + * This routine does the following things for vCPU which is going + * to be blocked if VT-d PI is enabled. + * - Store the vCPU to the wakeup list, so when interrupts happen + * we can find the right vCPU to wake up. + * - Change the Posted-interrupt descriptor as below: + * 'NDST' <-- vcpu->pre_pcpu + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR + * - If 'ON' is set during this process, which means at least one + * interrupt is posted for this vCPU, we cannot block it, in + * this case, return 1, otherwise, return 0. + * + */ +int pi_pre_block(struct kvm_vcpu *vcpu) +{ + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) + return 0; + + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } + + do { + old.control = new.control = pi_desc->control; + + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); + + /* + * Since vCPU can be preempted during this process, + * vcpu->cpu could be different with pre_pcpu, we + * need to set pre_pcpu as the destination of wakeup + * notification event, then we can find the right vCPU + * to wakeup in wakeup handler if interrupts happen + * when the vCPU is in blocked state. + */ + dest = cpu_physical_id(vcpu->pre_pcpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'wakeup vector' */ + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + /* We should not block the vCPU if an interrupt is posted for it. 
*/ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); +} + +void pi_post_block(struct kvm_vcpu *vcpu) +{ + if (vcpu->pre_pcpu == -1) + return; + + WARN_ON(irqs_disabled()); + local_irq_disable(); + __pi_post_block(vcpu); + local_irq_enable(); +} + +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +void pi_wakeup_handler(void) +{ + struct kvm_vcpu *vcpu; + int cpu = smp_processor_id(); + + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), + blocked_vcpu_list) { + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (pi_test_on(pi_desc) == 1) + kvm_vcpu_kick(vcpu); + } + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + +void __init pi_init(int cpu) +{ + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + +bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + return pi_test_on(pi_desc) || + (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); +} + + +/* + * pi_update_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, + bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = 0; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(kvm->vcpus[0])) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + * + * In addition, we can only inject generic interrupts using + * the PI mechanism, refuse to route others through it. + */ + + kvm_set_msi_irq(kvm, e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) { + /* + * Make sure the IRTE is in remapped mode if + * we don't handle it in posted mode. 
+ */ + ret = irq_set_vcpu_affinity(host_irq, NULL); + if (ret < 0) { + printk(KERN_INFO + "failed to back to remapped mode, irq: %u\n", + host_irq); + goto out; + } + + continue; + } + + vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else + ret = irq_set_vcpu_affinity(host_irq, NULL); + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h new file mode 100644 index 000000000000..e53b97f82097 --- /dev/null +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_POSTED_INTR_H +#define __KVM_X86_VMX_POSTED_INTR_H + +#define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; +} __aligned(64); + +static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); +} + +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_on(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); +int pi_pre_block(struct kvm_vcpu *vcpu); +void pi_post_block(struct kvm_vcpu *vcpu); +void pi_wakeup_handler(void); +void __init pi_init(int cpu); +bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); +int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, + bool set); + +#endif /* __KVM_X86_VMX_POSTED_INTR_H */ \ No newline at end of file diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b0ba1cc79e6a..2cf068f45227 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ 
b/arch/x86/kvm/vmx/vmx.c @@ -395,13 +395,6 @@ DEFINE_PER_CPU(struct vmcs *, current_vmcs); */ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); -/* - * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we - * can find which vCPU should be waken up. - */ -static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); -static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); - static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -1256,62 +1249,6 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) } #endif -static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - /* - * In case of hot-plug or hot-unplug, we may have to undo - * vmx_vcpu_pi_put even if there is no assigned device. And we - * always keep PI.NDST up to date for simplicity: it makes the - * code easier, and CPU migration is not a fast path. - */ - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) - return; - - /* - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block is the one expected to change PID.NDST and the - * wakeup handler expects the vCPU to be on the blocked_vcpu_list that - * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up - * correctly. - */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - goto after_clear_sn; - } - - /* The full case. */ - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - new.sn = 0; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - -after_clear_sn: - - /* - * Clear SN before reading the bitmap. The VT-d firmware - * writes the bitmap and reads SN atomically (5.2.3 in the - * spec), so it doesn't really have a memory barrier that - * pairs with this, but we cannot do that and we need one. - */ - smp_mb__after_atomic(); - - if (!pi_is_pir_empty(pi_desc)) - pi_set_on(pi_desc); -} - void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy) { @@ -1395,20 +1332,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx->host_debugctlmsr = get_debugctlmsr(); } -static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); -} - static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); @@ -5408,25 +5331,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -/* - * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 
- */ -static void wakeup_handler(void) -{ - struct kvm_vcpu *vcpu; - int cpu = smp_processor_id(); - - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (pi_test_on(pi_desc) == 1) - kvm_vcpu_kick(vcpu); - } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); -} - static void vmx_enable_tdp(void) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, @@ -6283,14 +6187,6 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } -static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - return pi_test_on(pi_desc) || - (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); -} - static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -7432,107 +7328,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } -static void __pi_post_block(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - do { - old.control = new.control = pi_desc->control; - WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the VCPU is blocked\n"); - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - - if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - vcpu->pre_pcpu = -1; - } -} - -/* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. - * - Change the Posted-interrupt descriptor as below: - * 'NDST' <-- vcpu->pre_pcpu - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. 
- * - */ -static int pi_pre_block(struct kvm_vcpu *vcpu) -{ - unsigned int dest; - struct pi_desc old, new; - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) - return 0; - - WARN_ON(irqs_disabled()); - local_irq_disable(); - if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { - vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - } - - do { - old.control = new.control = pi_desc->control; - - WARN((pi_desc->sn == 1), - "Warning: SN field of posted-interrupts " - "is set before blocking\n"); - - /* - * Since vCPU can be preempted during this process, - * vcpu->cpu could be different with pre_pcpu, we - * need to set pre_pcpu as the destination of wakeup - * notification event, then we can find the right vCPU - * to wakeup in wakeup handler if interrupts happen - * when the vCPU is in blocked state. - */ - dest = cpu_physical_id(vcpu->pre_pcpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* set 'NV' to 'wakeup vector' */ - new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - - /* We should not block the vCPU if an interrupt is posted for it. */ - if (pi_test_on(pi_desc) == 1) - __pi_post_block(vcpu); - - local_irq_enable(); - return (vcpu->pre_pcpu == -1); -} - static int vmx_pre_block(struct kvm_vcpu *vcpu) { if (pi_pre_block(vcpu)) @@ -7544,17 +7339,6 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) return 0; } -static void pi_post_block(struct kvm_vcpu *vcpu) -{ - if (vcpu->pre_pcpu == -1) - return; - - WARN_ON(irqs_disabled()); - local_irq_disable(); - __pi_post_block(vcpu); - local_irq_enable(); -} - static void vmx_post_block(struct kvm_vcpu *vcpu) { if (kvm_x86_ops.set_hv_timer) @@ -7563,100 +7347,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) pi_post_block(vcpu); } -/* - * vmx_update_pi_irte - set IRTE for Posted-Interrupts - * - * @kvm: kvm - * @host_irq: host irq of the interrupt - * @guest_irq: gsi of the interrupt - * @set: set or unset PI - * returns 0 on success, < 0 on failure - */ -static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - struct kvm_kernel_irq_routing_entry *e; - struct kvm_irq_routing_table *irq_rt; - struct kvm_lapic_irq irq; - struct kvm_vcpu *vcpu; - struct vcpu_data vcpu_info; - int idx, ret = 0; - - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) - return 0; - - idx = srcu_read_lock(&kvm->irq_srcu); - irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - if (guest_irq >= irq_rt->nr_rt_entries || - hlist_empty(&irq_rt->map[guest_irq])) { - pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", - guest_irq, irq_rt->nr_rt_entries); - goto out; - } - - hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { - if (e->type != KVM_IRQ_ROUTING_MSI) - continue; - /* - * VT-d PI cannot support posting multicast/broadcast - * interrupts to a vCPU, we still use interrupt remapping - * for these kind of interrupts. - * - * For lowest-priority interrupts, we only support - * those with single CPU as the destination, e.g. 
user - * configures the interrupts via /proc/irq or uses - * irqbalance to make the interrupts single-CPU. - * - * We will support full lowest-priority interrupt later. - * - * In addition, we can only inject generic interrupts using - * the PI mechanism, refuse to route others through it. - */ - - kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. - */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - - continue; - } - - vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); - vcpu_info.vector = irq.vector; - - trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, - vcpu_info.vector, vcpu_info.pi_desc_addr, set); - - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else - ret = irq_set_vcpu_affinity(host_irq, NULL); - - if (ret < 0) { - printk(KERN_INFO "%s: failed to update PI IRTE\n", - __func__); - goto out; - } - } - - ret = 0; -out: - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -7820,7 +7510,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, - .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, + .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, @@ -7854,7 +7544,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, - .update_pi_irte = vmx_update_pi_irte, + .update_pi_irte = pi_update_irte, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, @@ -8020,7 +7710,7 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_set_posted_intr_wakeup_handler(wakeup_handler); + kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); kvm_mce_cap_supported |= MCG_LMCE_P; @@ -8159,8 +7849,8 @@ static int __init vmx_init(void) for_each_possible_cpu(cpu) { INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + + pi_init(cpu); } #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 941336dc5ee4..e0a655c7a0fe 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -10,6 +10,7 @@ #include "capabilities.h" #include "kvm_cache_regs.h" #include "ops.h" +#include "posted_intr.h" #include "vmcs.h" #include "cpuid.h" @@ -49,29 +50,6 @@ enum segment_cache_field { SEG_FIELD_NR = 4 }; -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - #define RTIT_ADDR_RANGE 4 struct pt_ctx { @@ -356,67 +334,6 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned 
long host_rsp); int vmx_find_msr_index(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} - -static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) -{ - return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_on(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_on(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline int pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - static inline u8 vmx_get_rvi(void) { return vmcs_read16(GUEST_INTR_STATUS) & 0xff; @@ -497,11 +414,6 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) -{ - return &(to_vmx(vcpu)->pi_desc); -} - static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); From 5a085326d51d0c92bc5c1f62d60dcf3db450af69 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:31:12 -0700 Subject: [PATCH 164/497] KVM: VMX: Rename ops.h to vmx_ops.h Rename ops.h to vmx_ops.h to allow adding a tdx_ops.h in the future without causing massive confusion. Trust Domain Extensions (TDX) is built on VMX, but KVM cannot directly access the VMCS(es) for a TDX guest, thus TDX will need its own "ops" implementation for wrapping the low level operations. 
Signed-off-by: Sean Christopherson Message-Id: <20200923183112.3030-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 1 - arch/x86/kvm/vmx/vmx.h | 2 +- arch/x86/kvm/vmx/{ops.h => vmx_ops.h} | 0 3 files changed, 1 insertion(+), 2 deletions(-) rename arch/x86/kvm/vmx/{ops.h => vmx_ops.h} (100%) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2cf068f45227..68b9a9b3661c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -56,7 +56,6 @@ #include "lapic.h" #include "mmu.h" #include "nested.h" -#include "ops.h" #include "pmu.h" #include "trace.h" #include "vmcs.h" diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index e0a655c7a0fe..60bb7a125f0b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -9,9 +9,9 @@ #include "capabilities.h" #include "kvm_cache_regs.h" -#include "ops.h" #include "posted_intr.h" #include "vmcs.h" +#include "vmx_ops.h" #include "cpuid.h" extern const u32 vmx_msr_index[]; diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/vmx_ops.h similarity index 100% rename from arch/x86/kvm/vmx/ops.h rename to arch/x86/kvm/vmx/vmx_ops.h From e89505698c9f70125651060547da4ff5046124fc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:37:28 -0700 Subject: [PATCH 165/497] KVM: x86/mmu: Commit zap of remaining invalid pages when recovering lpages Call kvm_mmu_commit_zap_page() after exiting the "prepare zap" loop in kvm_recover_nx_lpages() to finish zapping pages in the unlikely event that the loop exited due to lpage_disallowed_mmu_pages being empty. Because the recovery thread drops mmu_lock() when rescheduling, it's possible that lpage_disallowed_mmu_pages could be emptied by a different thread without to_zap reaching zero despite to_zap being derived from the number of disallowed lpages. Fixes: 1aa9b9572b105 ("kvm: x86: mmu: Recovery of shattered NX large pages") Cc: Junaid Shahid Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20200923183735.584-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7ec3d057e8e0..340eacf3327b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6394,6 +6394,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) cond_resched_lock(&kvm->mmu_lock); } } + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); From 7d919c7a38fb2e34d4d27e535f19979e1b415f68 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:37:29 -0700 Subject: [PATCH 166/497] KVM: x86/mmu: Refactor the zap loop for recovering NX lpages Refactor the zap loop in kvm_recover_nx_lpages() to be a for loop that iterates on to_zap and drop the !to_zap check that leads to the in-loop calling of kvm_mmu_commit_zap_page(). The in-loop commit when to_zap hits zero is superfluous now that there's an unconditional commit after the loop to handle the case where lpage_disallowed_mmu_pages is emptied. 
Signed-off-by: Sean Christopherson Message-Id: <20200923183735.584-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 340eacf3327b..a4cbf5e14991 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6375,7 +6375,10 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; - while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + for ( ; to_zap; --to_zap) { + if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) + break; + /* * We use a separate list instead of just using active_mmu_pages * because the number of lpage_disallowed pages is expected to @@ -6388,10 +6391,9 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->lpage_disallowed); - if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (to_zap) - cond_resched_lock(&kvm->mmu_lock); + cond_resched_lock(&kvm->mmu_lock); } } kvm_mmu_commit_zap_page(kvm, &invalid_list); From 6c2fd34f5c7970915f040b502563cf339fb661cd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:37:30 -0700 Subject: [PATCH 167/497] KVM: x86/mmu: Move "huge page disallowed" calculation into mapping helpers Calculate huge_page_disallowed in __direct_map() and FNAME(fetch) in preparation for reworking the calculation so that it preserves the requested map level and eventually to avoid flagging a shadow page as being disallowed for being used as a large/huge page when it couldn't have been huge in the first place, e.g. because the backing page in the host is not large. Pass the error code into the helpers and use it to recalculate exec and write_fault instead of adding yet more booleans to the parameters. Opportunistically use huge_page_disallowed instead of lpage_disallowed to match the nomenclature used within the mapping helpers (though even they have existing inconsistencies). No functional change intended.
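As a standalone sketch (not part of this patch; names and structure are illustrative only), the derivation the mapping helpers now do internally looks roughly like this. The bit positions mirror the architectural #PF error code, where write access is bit 1 and instruction fetch is bit 4:

	#include <stdbool.h>
	#include <stdint.h>

	#define PFERR_WRITE_MASK  (1u << 1)	/* write access */
	#define PFERR_FETCH_MASK  (1u << 4)	/* instruction fetch */

	struct fault_flavor {
		bool write_fault;
		bool exec_fault;
		bool huge_page_disallowed;
	};

	static struct fault_flavor classify_fault(uint32_t error_code,
						  bool nx_workaround_enabled)
	{
		struct fault_flavor f = {
			.write_fault = error_code & PFERR_WRITE_MASK,
			.exec_fault  = error_code & PFERR_FETCH_MASK,
		};

		/* Only instruction fetches can trip the iTLB multihit workaround. */
		f.huge_page_disallowed = f.exec_fault && nx_workaround_enabled;
		return f;
	}

Passing the raw error code keeps the helper signatures stable as more per-fault flags are needed.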
Signed-off-by: Sean Christopherson Message-Id: <20200923183735.584-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 22 ++++++++++++---------- arch/x86/kvm/mmu/paging_tmpl.h | 24 ++++++++++++++---------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a4cbf5e14991..fb156833af00 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3335,10 +3335,14 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, } } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault, bool account_disallowed_nx_lpage) + bool prefault, bool is_tdp) { + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; int level, ret; @@ -3348,6 +3352,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; + if (huge_page_disallowed) + max_level = PG_LEVEL_4K; + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn); trace_kvm_mmu_spte_requested(gpa, level, pfn); @@ -3368,7 +3375,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (account_disallowed_nx_lpage) + if (is_tdp && huge_page_disallowed) account_huge_nx_page(vcpu->kvm, sp); } } @@ -4108,8 +4115,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault, int max_level, bool is_tdp) { bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool lpage_disallowed = exec && is_nx_huge_page_enabled(); bool map_writable; gfn_t gfn = gpa >> PAGE_SHIFT; @@ -4128,9 +4133,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (r) return r; - if (lpage_disallowed) - max_level = PG_LEVEL_4K; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -4147,8 +4149,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, - prefault, is_tdp && lpage_disallowed); + r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, + prefault, is_tdp); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 188ec2633bc5..cbb6ed32fead 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -625,11 +625,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * emulate this operation, return 1 to indicate this case. 
*/ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, - struct guest_walker *gw, - int write_fault, int max_level, - kvm_pfn_t pfn, bool map_writable, bool prefault, - bool lpage_disallowed) + struct guest_walker *gw, u32 error_code, + int max_level, kvm_pfn_t pfn, bool map_writable, + bool prefault) { + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + int write_fault = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; @@ -679,6 +682,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } + if (huge_page_disallowed) + max_level = PG_LEVEL_4K; + hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); @@ -704,7 +710,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, sp = kvm_mmu_get_page(vcpu, base_gfn, addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); - if (lpage_disallowed) + if (huge_page_disallowed) account_huge_nx_page(vcpu->kvm, sp); } } @@ -786,8 +792,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && - is_nx_huge_page_enabled(); int max_level; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -828,7 +832,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (lpage_disallowed || is_self_change_mapping) + if (is_self_change_mapping) max_level = PG_LEVEL_4K; else max_level = walker.level; @@ -872,8 +876,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, - map_writable, prefault, lpage_disallowed); + r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn, + map_writable, prefault); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: From 3cf066127e8754f680ecb3b63ad7dd6949e5e54c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:37:31 -0700 Subject: [PATCH 168/497] KVM: x86/mmu: Capture requested page level before NX huge page workaround Apply the "huge page disallowed" adjustment of the max level only after capturing the original requested level. The requested level will be used in a future patch to skip adding pages to the list of disallowed huge pages if a huge page wasn't possible anyways, e.g. if the page isn't mapped as a huge page in the host. No functional change intended. 
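A minimal sketch of the ordering this patch sets up (illustrative names, not the kernel code): record the level the fault could have used before the iTLB multihit workaround clamps the mapping, so later accounting can tell whether a huge page was genuinely disallowed.

	#include <stdbool.h>

	enum { PG_LEVEL_4K = 1, PG_LEVEL_2M, PG_LEVEL_1G };

	static int adjust_mapping_level(int host_level, int max_level,
					bool huge_page_disallowed, int *req_level)
	{
		int level = host_level < max_level ? host_level : max_level;

		*req_level = level;		/* what the fault asked for */

		if (huge_page_disallowed)
			return PG_LEVEL_4K;	/* what the workaround permits */

		return level;
	}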
Signed-off-by: Sean Christopherson Message-Id: <20200923183735.584-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 22 +++++++++++++++------- arch/x86/kvm/mmu/paging_tmpl.h | 8 +++----- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index fb156833af00..0f35f1a32960 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3267,7 +3267,8 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, } static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp) + int max_level, kvm_pfn_t *pfnp, + bool huge_page_disallowed, int *req_level) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; @@ -3275,6 +3276,8 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t mask; int level; + *req_level = PG_LEVEL_4K; + if (unlikely(max_level == PG_LEVEL_4K)) return PG_LEVEL_4K; @@ -3299,7 +3302,14 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (level == PG_LEVEL_4K) return level; - level = min(level, max_level); + *req_level = level = min(level, max_level); + + /* + * Enforce the iTLB multihit workaround after capturing the requested + * level, which will be used to do precise, accurate accounting. + */ + if (huge_page_disallowed) + return PG_LEVEL_4K; /* * mmu_notifier_retry() was successful and mmu_lock is held, so @@ -3345,17 +3355,15 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int level, ret; + int level, req_level, ret; gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - if (huge_page_disallowed) - max_level = PG_LEVEL_4K; - - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn); + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index cbb6ed32fead..ba9af7c06f9d 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -636,7 +636,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, hlevel, ret; + int top_level, hlevel, req_level, ret; gfn_t base_gfn = gw->gfn; direct_access = gw->pte_access; @@ -682,10 +682,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } - if (huge_page_disallowed) - max_level = PG_LEVEL_4K; - - hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); + hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, + huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); From 5bcaf3e1715f49de8935fc77bf74837287cae77d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:37:32 -0700 Subject: [PATCH 169/497] KVM: x86/mmu: Account NX huge page disallowed iff huge page was requested Condition the accounting of a disallowed huge NX page on the original requested level of the page being greater than the current iterator level. 
This does two things: accounts the page if and only if a huge page was actually disallowed, and accounts the shadow page if and only if it was the level at which the huge page was disallowed. For the latter case, the previous logic would account all shadow pages used to create the translation for the forced small page, e.g. even PML4, which can't be a huge page on current hardware, would be accounted as having been a disallowed huge page when using 5-level EPT. The overzealous accounting is purely a performance issue, i.e. the recovery thread will spuriously zap shadow pages, but otherwise the bad behavior is harmless. Cc: Junaid Shahid Fixes: b8e8c8303ff28 ("kvm: mmu: ITLB_MULTIHIT mitigation") Signed-off-by: Sean Christopherson Message-Id: <20200923183735.584-6-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 ++- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0f35f1a32960..4b8d33657cd3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3383,7 +3383,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (is_tdp && huge_page_disallowed) + if (is_tdp && huge_page_disallowed && + req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index ba9af7c06f9d..24dd2e9ad816 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -708,7 +708,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, sp = kvm_mmu_get_page(vcpu, base_gfn, addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); - if (huge_page_disallowed) + if (huge_page_disallowed && req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } From 1d4a7372e149914095e489ada0e64395bbcc8483 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:37:33 -0700 Subject: [PATCH 170/497] KVM: x86/mmu: Rename 'hlevel' to 'level' in FNAME(fetch) Rename 'hlevel', which presumably stands for 'host level', to simply 'level' in FNAME(fetch). The variable hasn't tracked the host level for quite some time. 
Signed-off-by: Sean Christopherson Message-Id: <20200923183735.584-7-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 24dd2e9ad816..c7af4457bcf6 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -636,7 +636,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, hlevel, req_level, ret; + int top_level, level, req_level, ret; gfn_t base_gfn = gw->gfn; direct_access = gw->pte_access; @@ -682,8 +682,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } - hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, + huge_page_disallowed, &req_level); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); @@ -694,10 +694,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel); + disallowed_hugepage_adjust(it, gw->gfn, &pfn, &level); base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == hlevel) + if (it.level == level) break; validate_direct_spte(vcpu, it.sptep, direct_access); From dcc7065170d7018d9db1748d04026b2f4ce664dd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:37:34 -0700 Subject: [PATCH 171/497] KVM: x86/mmu: Hoist ITLB multi-hit workaround check up a level Move the "ITLB multi-hit workaround enabled" check into the callers of disallowed_hugepage_adjust() to make it more obvious that the helper is specific to the workaround, and to be consistent with the accounting, i.e. account_huge_nx_page() is called if and only if the workaround is enabled. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200923183735.584-8-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/mmu/paging_tmpl.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4b8d33657cd3..ae47a0abf347 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3329,7 +3329,6 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, u64 spte = *it.sptep; if (it.level == level && level > PG_LEVEL_4K && - is_nx_huge_page_enabled() && is_shadow_present_pte(spte) && !is_large_pte(spte)) { /* @@ -3371,7 +3370,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. 
*/ - disallowed_hugepage_adjust(it, gfn, &pfn, &level); + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(it, gfn, &pfn, &level); base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index c7af4457bcf6..479b65df9b2d 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -694,7 +694,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gw->gfn, &pfn, &level); + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(it, gw->gfn, &pfn, &level); base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) From e88b8093698f264c990a19f792aa409fd33fb35d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:37:35 -0700 Subject: [PATCH 172/497] KVM: x86/mmu: Track write/user faults using bools Use bools to track write and user faults throughout the page fault paths and down into mmu_set_spte(). The actual usage is purely boolean, but that's not obvious without digging into all paths as the current code uses a mix of bools (TDP and try_async_pf) and ints (shadow paging and mmu_set_spte()). No true functional change intended (although the pgprintk() will now print 0/1 instead of 0/PFERR_WRITE_MASK). Signed-off-by: Sean Christopherson Message-Id: <20200923183735.584-9-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/mmu/paging_tmpl.h | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ae47a0abf347..d58658d15366 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3082,7 +3082,7 @@ set_pte: } static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, int write_fault, int level, + unsigned int pte_access, bool write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool host_writable) { @@ -3188,7 +3188,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, + mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, page_to_pfn(pages[i]), true, true); put_page(pages[i]); } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 479b65df9b2d..9a1a15f19beb 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -550,7 +550,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * we call mmu_set_spte() with host_writable = true because * pte_prefetch_gfn_to_pfn always gets a writable pfn. 
*/ - mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn, + mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn, true, true); kvm_release_pfn_clean(pfn); @@ -630,7 +630,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, bool prefault) { bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - int write_fault = error_code & PFERR_WRITE_MASK; + bool write_fault = error_code & PFERR_WRITE_MASK; bool exec = error_code & PFERR_FETCH_MASK; bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; @@ -746,7 +746,7 @@ out_gpte_changed: */ static bool FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, int user_fault, + struct guest_walker *walker, bool user_fault, bool *write_fault_to_shadow_pgtable) { int level; @@ -784,8 +784,8 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, bool prefault) { - int write_fault = error_code & PFERR_WRITE_MASK; - int user_fault = error_code & PFERR_USER_MASK; + bool write_fault = error_code & PFERR_WRITE_MASK; + bool user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; kvm_pfn_t pfn; From fc387d8daf3960c5e1bc18fa353768056f4fd394 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:44:46 -0700 Subject: [PATCH 173/497] KVM: nVMX: Reset the segment cache when stuffing guest segs Explicitly reset the segment cache after stuffing guest segment regs in prepare_vmcs02_rare(). Although the cache is reset when switching to vmcs02, there is nothing that prevents KVM from re-populating the cache prior to writing vmcs02 with vmcs12's values. E.g. if the vCPU is preempted after switching to vmcs02 but before prepare_vmcs02_rare(), kvm_arch_vcpu_put() will dereference GUEST_SS_AR_BYTES via .get_cpl() and cache the stale vmcs02 value. While the current code base only caches stale data in the preemption case, it's theoretically possible future code could read a segment register during the nested flow itself, i.e. this isn't technically illegal behavior in kvm_arch_vcpu_put(), although it did introduce the bug. This manifests as an unexpected nested VM-Enter failure when running with unrestricted guest disabled if the above preemption case coincides with L1 switching L2's CPL, e.g. when switching from a L2 vCPU at CPL3 to to a L2 vCPU at CPL0. stack_segment_valid() will see the new SS_SEL but the old SS_AR_BYTES and incorrectly mark the guest state as invalid due to SS.dpl != SS.rpl. Don't bother updating the cache even though prepare_vmcs02_rare() writes every segment. With unrestricted guest, guest segments are almost never read, let alone L2 guest segments. On the other hand, populating the cache requires a large number of memory writes, i.e. it's unlikely to be a net win. Updating the cache would be a win when unrestricted guest is not supported, as guest_state_valid() will immediately cache all segment registers. But, nested virtualization without unrestricted guest is dirt slow, saving some VMREADs won't change that, and every CPU manufactured in the last decade supports unrestricted guest. In other words, the extra (minor) complexity isn't worth the trouble. Note, kvm_arch_vcpu_put() may see stale data when querying guest CPL depending on when preemption occurs. This is "ok" in that the usage is imperfect by nature, i.e. it's used heuristically to improve performance but doesn't affect functionality. 
kvm_arch_vcpu_put() could be "fixed" by also disabling preemption while loading segments, but that's pointless and misleading as reading state from kvm_sched_{in,out}() is guaranteed to see stale data in one form or another. E.g. even if all the usage of regs_avail is fixed to call kvm_register_mark_available() after the associated state is set, the individual state might still be stale with respect to the overall vCPU state. I.e. making functional decisions in an asynchronous hook is doomed from the get go. Thankfully KVM doesn't do that. Fixes: de63ad4cf4973 ("KVM: X86: implement the logic for spinlock optimization") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20200923184452.980-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e8048e01c044..2e0f9dbc0ab6 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2411,6 +2411,8 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + + vmx->segment_cache.bitmask = 0; } if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & From b89d5ad00e789967a5e2c5335f75c48755bebd88 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:44:47 -0700 Subject: [PATCH 174/497] KVM: nVMX: Reload vmcs01 if getting vmcs12's pages fails Reload vmcs01 when bailing from nested_vmx_enter_non_root_mode() as KVM expects vmcs01 to be loaded when is_guest_mode() is false. Fixes: 671ddc700fd08 ("KVM: nVMX: Don't leak L1 MMIO regions to L2") Cc: stable@vger.kernel.org Cc: Dan Cross Cc: Jim Mattson Cc: Peter Shier Signed-off-by: Sean Christopherson Message-Id: <20200923184452.980-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2e0f9dbc0ab6..321a6790ad9f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3349,8 +3349,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, prepare_vmcs02_early(vmx, vmcs12); if (from_vmentry) { - if (unlikely(!nested_get_vmcs12_pages(vcpu))) + if (unlikely(!nested_get_vmcs12_pages(vcpu))) { + vmx_switch_vmcs(vcpu, &vmx->vmcs01); return NVMX_VMENTRY_KVM_INTERNAL_ERROR; + } if (nested_vmx_check_vmentry_hw(vcpu)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); From 2ba4493a8b195cc1d7dd5b762a84e6235c2d67bd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:44:48 -0700 Subject: [PATCH 175/497] KVM: nVMX: Explicitly check for valid guest state for !unrestricted guest Call guest_state_valid() directly instead of querying emulation_required when checking if L1 is attempting VM-Enter with invalid guest state. If emulate_invalid_guest_state is false, KVM will fixup segment regs to avoid emulation and will never set emulation_required, i.e. KVM will incorrectly miss the associated consistency checks because the nested path stuffs segments directly into vmcs02. Opportunistically add Consistency Check tracing to make future debug suck a little less.
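A minimal model of the consistency-check tracing pattern referenced above (a sketch only, not KVM's exact CC() macro): evaluate the check and, when it fails, report the stringified expression so the offending VM-Enter condition can be spotted in a trace.

	#include <stdbool.h>
	#include <stdio.h>

	#define CC(check)							\
	({									\
		bool failed_ = (check);						\
		if (failed_)							\
			fprintf(stderr, "consistency check failed: %s\n", #check); \
		failed_;							\
	})

	/* Usage, mirroring the shape of the change below: */
	static int validate_guest_state(bool state_valid)
	{
		if (CC(!state_valid))
			return -1;	/* fail the VM-Enter */
		return 0;
	}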
Fixes: 2bb8cafea80bf ("KVM: vVMX: signal failure for nested VMEntry if emulation_required") Fixes: 3184a995f782c ("KVM: nVMX: fix vmentry failure code when L2 state would require emulation") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20200923184452.980-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 8 ++------ arch/x86/kvm/vmx/vmx.h | 7 +++++++ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 321a6790ad9f..e004ab8c2ea6 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2576,7 +2576,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * which means L1 attempted VMEntry to L2 with invalid state. * Fail the VMEntry. */ - if (vmx->emulation_required) { + if (CC(!vmx_guest_state_valid(vcpu))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 68b9a9b3661c..d87c8d2892ca 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -337,7 +337,6 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = { }; module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); -static bool guest_state_valid(struct kvm_vcpu *vcpu); static u32 vmx_segment_access_rights(struct kvm_segment *var); static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr, int type); @@ -1340,7 +1339,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) static bool emulation_required(struct kvm_vcpu *vcpu) { - return emulate_invalid_guest_state && !guest_state_valid(vcpu); + return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); } unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -3402,11 +3401,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) * not. 
* We assume that registers are always usable */ -static bool guest_state_valid(struct kvm_vcpu *vcpu) +bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) { - if (is_unrestricted_guest(vcpu)) - return true; - /* real mode guest state checks */ if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 60bb7a125f0b..e87fae204d3d 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -321,6 +321,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, int root_level); + void update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); @@ -472,6 +473,12 @@ static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) SECONDARY_EXEC_UNRESTRICTED_GUEST)); } +bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu); +static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) +{ + return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu); +} + void dump_vmcs(void); #endif /* __KVM_X86_VMX_H */ From c61ca2fcbcea62331b82a587ee6e423a97ae6f66 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:44:49 -0700 Subject: [PATCH 176/497] KVM: nVMX: Move free_nested() below vmx_switch_vmcs() Move free_nested() down below vmx_switch_vmcs() so that a future patch can do an "emergency" invocation of vmx_switch_vmcs() if vmcs01 is not the loaded VMCS when freeing nested resources. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200923184452.980-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 76 +++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e004ab8c2ea6..ac1e1761dd87 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -233,6 +233,44 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) vmx->nested.hv_evmcs = NULL; } +static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, + struct loaded_vmcs *prev) +{ + struct vmcs_host_state *dest, *src; + + if (unlikely(!vmx->guest_state_loaded)) + return; + + src = &prev->host_state; + dest = &vmx->loaded_vmcs->host_state; + + vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); + dest->ldt_sel = src->ldt_sel; +#ifdef CONFIG_X86_64 + dest->ds_sel = src->ds_sel; + dest->es_sel = src->es_sel; +#endif +} + +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *prev; + int cpu; + + if (vmx->loaded_vmcs == vmcs) + return; + + cpu = get_cpu(); + prev = vmx->loaded_vmcs; + vmx->loaded_vmcs = vmcs; + vmx_vcpu_load_vmcs(vcpu, cpu, prev); + vmx_sync_vmcs_host_state(vmx, prev); + put_cpu(); + + vmx_register_cache_reset(vcpu); +} + /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. 
@@ -277,44 +315,6 @@ static void free_nested(struct kvm_vcpu *vcpu) free_loaded_vmcs(&vmx->nested.vmcs02); } -static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, - struct loaded_vmcs *prev) -{ - struct vmcs_host_state *dest, *src; - - if (unlikely(!vmx->guest_state_loaded)) - return; - - src = &prev->host_state; - dest = &vmx->loaded_vmcs->host_state; - - vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); - dest->ldt_sel = src->ldt_sel; -#ifdef CONFIG_X86_64 - dest->ds_sel = src->ds_sel; - dest->es_sel = src->es_sel; -#endif -} - -static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct loaded_vmcs *prev; - int cpu; - - if (vmx->loaded_vmcs == vmcs) - return; - - cpu = get_cpu(); - prev = vmx->loaded_vmcs; - vmx->loaded_vmcs = vmcs; - vmx_vcpu_load_vmcs(vcpu, cpu, prev); - vmx_sync_vmcs_host_state(vmx, prev); - put_cpu(); - - vmx_register_cache_reset(vcpu); -} - /* * Ensure that the current vmcs of the logical processor is the * vmcs01 of the vcpu before calling free_nested(). From df82a24b29d1b10b965eb7d66b68867b7da76e76 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:44:50 -0700 Subject: [PATCH 177/497] KVM: nVMX: Ensure vmcs01 is the loaded VMCS when freeing nested state Add a WARN in free_nested() to ensure vmcs01 is loaded prior to freeing vmcs02 and friends, and explicitly switch to vmcs01 if it's not. KVM is supposed to keep is_guest_mode() and loaded_vmcs==vmcs02 synchronized, but bugs happen and freeing vmcs02 while it's in use will escalate a KVM error to a use-after-free and potentially crash the kernel. Do the WARN and switch even in the !vmxon case to help detect latent bugs. free_nested() is not a hot path, and the check is cheap. Signed-off-by: Sean Christopherson Message-Id: <20200923184452.980-6-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ac1e1761dd87..33c25ec8b198 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -279,6 +279,9 @@ static void free_nested(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) + vmx_switch_vmcs(vcpu, &vmx->vmcs01); + if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; From ebec153a056108ca7818caad549d3ad7c0f2369d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:44:51 -0700 Subject: [PATCH 178/497] KVM: nVMX: Drop redundant VMCS switch and free_nested() call Remove the explicit switch to vmcs01 and the call to free_nested() in nested_vmx_free_vcpu(). free_nested(), which is called unconditionally by vmx_leave_nested(), ensures vmcs01 is loaded prior to freeing vmcs02 and friends. 
Signed-off-by: Sean Christopherson Message-Id: <20200923184452.980-7-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 33c25ec8b198..bfd69fcefc72 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -326,8 +326,6 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); vmx_leave_nested(vcpu); - vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); - free_nested(vcpu); vcpu_put(vcpu); } From 138534a810aa6aa3a6f27ac36ac52c0c685859a1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:44:52 -0700 Subject: [PATCH 179/497] KVM: nVMX: WARN on attempt to switch the currently loaded VMCS WARN if KVM attempts to switch to the currently loaded VMCS. Now that nested_vmx_free_vcpu() doesn't blindly call vmx_switch_vmcs(), all paths that lead to vmx_switch_vmcs() are implicitly guarded by guest vs. host mode, e.g. KVM should never emulate VMX instructions when guest mode is active, and nested_vmx_vmexit() should never be called when host mode is active. Signed-off-by: Sean Christopherson Message-Id: <20200923184452.980-8-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bfd69fcefc72..7911c2797220 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -258,7 +258,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) struct loaded_vmcs *prev; int cpu; - if (vmx->loaded_vmcs == vmcs) + if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) return; cpu = get_cpu(); From b2d522552ca02de3b0f72104705fb2e62fcf1ce6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 13:13:43 -0700 Subject: [PATCH 180/497] KVM: x86: Add RIP to the kvm_entry, i.e. VM-Enter, tracepoint Add RIP to the kvm_entry tracepoint to help debug if the kvm_exit tracepoint is disabled or if VM-Enter fails, in which case the kvm_exit tracepoint won't be hit. Read RIP from within the tracepoint itself to avoid a potential VMREAD and retpoline if the guest's RIP isn't available. Signed-off-by: Sean Christopherson Message-Id: <20200923201349.16097-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 10 ++++++---- arch/x86/kvm/x86.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 11cb4c070f0c..7e8edc7a1f73 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -15,18 +15,20 @@ * Tracepoint for guest mode entry. 
*/ TRACE_EVENT(kvm_entry, - TP_PROTO(unsigned int vcpu_id), - TP_ARGS(vcpu_id), + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) + __field( unsigned long, rip ) ), TP_fast_assign( - __entry->vcpu_id = vcpu_id; + __entry->vcpu_id = vcpu->vcpu_id; + __entry->rip = kvm_rip_read(vcpu); ), - TP_printk("vcpu %u", __entry->vcpu_id) + TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip) ); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5f3e9ab34c80..b4c581c3c187 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8563,7 +8563,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops.request_immediate_exit(vcpu); } - trace_kvm_entry(vcpu->vcpu_id); + trace_kvm_entry(vcpu); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) From a9d7d76c66ede215542f12b18980ef36798564b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 13:13:44 -0700 Subject: [PATCH 181/497] KVM: x86: Read guest RIP from within the kvm_nested_vmexit tracepoint Use kvm_rip_read() to read the guest's RIP for the nested VM-Exit tracepoint instead of having the caller pass in an argument. Params that are passed into a tracepoint are evaluated even if the tracepoint is disabled, i.e. passing in RIP for VMX incurs a VMREAD and retpoline to retrieve a value that may never be used, e.g. if the exit is due to a hardware interrupt. Signed-off-by: Sean Christopherson Message-Id: <20200923201349.16097-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/trace.h | 6 +++--- arch/x86/kvm/vmx/nested.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 08adac8900e0..c02c2d4c0400 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2989,7 +2989,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (is_guest_mode(vcpu)) { int vmexit; - trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, + trace_kvm_nested_vmexit(vcpu, exit_code, svm->vmcb->control.exit_info_1, svm->vmcb->control.exit_info_2, svm->vmcb->control.exit_int_info, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 7e8edc7a1f73..bb5e44f83262 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -578,10 +578,10 @@ TRACE_EVENT(kvm_nested_intercepts, * Tracepoint for #VMEXIT while nested */ TRACE_EVENT(kvm_nested_vmexit, - TP_PROTO(__u64 rip, __u32 exit_code, + TP_PROTO(struct kvm_vcpu *vcpu, __u32 exit_code, __u64 exit_info1, __u64 exit_info2, __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), - TP_ARGS(rip, exit_code, exit_info1, exit_info2, + TP_ARGS(vcpu, exit_code, exit_info1, exit_info2, exit_int_info, exit_int_info_err, isa), TP_STRUCT__entry( @@ -595,7 +595,7 @@ TRACE_EVENT(kvm_nested_vmexit, ), TP_fast_assign( - __entry->rip = rip; + __entry->rip = kvm_rip_read(vcpu); __entry->exit_code = exit_code; __entry->exit_info1 = exit_info1; __entry->exit_info2 = exit_info2; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7911c2797220..d440d38c5a86 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5937,7 +5937,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) exit_intr_info = vmx_get_intr_info(vcpu); exit_qual = vmx_get_exit_qual(vcpu); - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual, + trace_kvm_nested_vmexit(vcpu, exit_reason, exit_qual, vmx->idt_vectoring_info, 
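The cost argument can be seen with a small userspace model (hypothetical names, unrelated to the kernel's tracepoint machinery): an argument passed to the trace call is computed at the call site even when tracing is off, whereas a read done inside the handler only happens when the event actually fires.

	#include <stdbool.h>
	#include <stdio.h>

	static bool trace_enabled;	/* off, as in the common case */

	static unsigned long read_guest_rip(void)	/* stands in for a VMREAD */
	{
		puts("expensive read");
		return 0xfff0;
	}

	/* Old shape: the caller pays for the read unconditionally. */
	static void trace_entry_old(unsigned long rip)
	{
		if (trace_enabled)
			printf("rip 0x%lx\n", rip);
	}

	/* New shape: the read runs only if the event fires. */
	static void trace_entry_new(void)
	{
		if (trace_enabled)
			printf("rip 0x%lx\n", read_guest_rip());
	}

	int main(void)
	{
		trace_entry_old(read_guest_rip());	/* still prints "expensive read" */
		trace_entry_new();			/* reads nothing */
		return 0;
	}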
exit_intr_info, vmcs_read32(VM_EXIT_INTR_ERROR_CODE), KVM_ISA_VMX); From f315f2b140156f456a091393fd0392acf9e6fb31 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 13:13:45 -0700 Subject: [PATCH 182/497] KVM: VMX: Add a helper to test for a valid error code given an intr info Add a helper, is_exception_with_error_code(), to provide the simple but difficult to read code of checking for a valid exception with an error code given a vmcs.VM_EXIT_INTR_INFO value. The helper will gain another user, vmx_get_exit_info(), in a future patch. Signed-off-by: Sean Christopherson Message-Id: <20200923201349.16097-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 +--- arch/x86/kvm/vmx/vmcs.h | 7 +++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d440d38c5a86..4e3c620fafe2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5956,9 +5956,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) * need to be synthesized by querying the in-kernel LAPIC, but external * interrupts are never reflected to L1 so it's a non-issue. */ - if ((exit_intr_info & - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { + if (is_exception_with_error_code(exit_intr_info)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); vmcs12->vm_exit_intr_error_code = diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 7a3675fddec2..1472c6c376f7 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -138,6 +138,13 @@ static inline bool is_external_intr(u32 intr_info) return is_intr_type(intr_info, INTR_TYPE_EXT_INTR); } +static inline bool is_exception_with_error_code(u32 intr_info) +{ + const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK; + + return (intr_info & mask) == mask; +} + enum vmcs_field_width { VMCS_FIELD_WIDTH_U16 = 0, VMCS_FIELD_WIDTH_U64 = 1, From 235ba74f008d2e0936b29f77f68d4e2f73ffd24a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 13:13:46 -0700 Subject: [PATCH 183/497] KVM: x86: Add intr/vectoring info and error code to kvm_exit tracepoint Extend the kvm_exit tracepoint to align it with kvm_nested_vmexit in terms of what information is captured. On SVM, add interrupt info and error code, while on VMX it add IDT vectoring and error code. This sets the stage for macrofying the kvm_exit tracepoint definition so that it can be reused for kvm_nested_vmexit without loss of information. Opportunistically stuff a zero for VM_EXIT_INTR_INFO if the VM-Enter failed, as the field is guaranteed to be invalid. Note, it'd be possible to further filter the interrupt/exception fields based on the VM-Exit reason, but the helper is intended only for tracepoints, i.e. an extra VMREAD or two is a non-issue, the failed VM-Enter case is just low hanging fruit. 
Signed-off-by: Sean Christopherson Message-Id: <20200923201349.16097-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 7 ++++++- arch/x86/kvm/svm/svm.c | 9 ++++++++- arch/x86/kvm/trace.h | 12 +++++++++--- arch/x86/kvm/vmx/vmx.c | 18 ++++++++++++++++-- 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a4a68b2b38d5..3f0ccb646db8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1143,7 +1143,12 @@ struct kvm_x86_ops { /* Returns actual tsc_offset set in active VMCS */ u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); + /* + * Retrieve somewhat arbitrary exit information. Intended to be used + * only from within tracepoints to avoid VMREADs when tracing is off. + */ + void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + u32 *exit_int_info, u32 *exit_int_info_err_code); int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c02c2d4c0400..eb0a97d687a4 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2965,12 +2965,19 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } -static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) +static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + u32 *intr_info, u32 *error_code) { struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; *info1 = control->exit_info_1; *info2 = control->exit_info_2; + *intr_info = control->exit_int_info; + if ((*intr_info & SVM_EXITINTINFO_VALID) && + (*intr_info & SVM_EXITINTINFO_VALID_ERR)) + *error_code = control->exit_int_info_err; + else + *error_code = 0; } static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index bb5e44f83262..7e3ad6419f90 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -248,6 +248,8 @@ TRACE_EVENT(kvm_exit, __field( u32, isa ) __field( u64, info1 ) __field( u64, info2 ) + __field( u32, intr_info ) + __field( u32, error_code ) __field( unsigned int, vcpu_id ) ), @@ -257,13 +259,17 @@ TRACE_EVENT(kvm_exit, __entry->isa = isa; __entry->vcpu_id = vcpu->vcpu_id; kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, - &__entry->info2); + &__entry->info2, + &__entry->intr_info, + &__entry->error_code); ), - TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx", + TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " + "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", __entry->vcpu_id, kvm_print_exit_reason(__entry->exit_reason, __entry->isa), - __entry->guest_rip, __entry->info1, __entry->info2) + __entry->guest_rip, __entry->info1, __entry->info2, + __entry->intr_info, __entry->error_code) ); /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d87c8d2892ca..bac423abcfda 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5545,10 +5545,24 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) +static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + u32 *intr_info, u32 *error_code) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + 
*info1 = vmx_get_exit_qual(vcpu); - *info2 = vmx_get_intr_info(vcpu); + if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + *info2 = vmx->idt_vectoring_info; + *intr_info = vmx_get_intr_info(vcpu); + if (is_exception_with_error_code(*intr_info)) + *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + else + *error_code = 0; + } else { + *info2 = 0; + *intr_info = 0; + *error_code = 0; + } } static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) From 029e8c8ad655a253658239438644e0eea2b932fa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 13:13:47 -0700 Subject: [PATCH 184/497] KVM: x86: Add macro wrapper for defining kvm_exit tracepoint Macrofy the definition of kvm_exit so that the definition can be reused verbatim by kvm_nested_vmexit. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200923201349.16097-6-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 69 +++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 7e3ad6419f90..f1d84614f94e 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -235,42 +235,45 @@ TRACE_EVENT(kvm_apic, (isa == KVM_ISA_VMX) ? \ __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" +#define TRACE_EVENT_KVM_EXIT(name) \ +TRACE_EVENT(name, \ + TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \ + TP_ARGS(exit_reason, vcpu, isa), \ + \ + TP_STRUCT__entry( \ + __field( unsigned int, exit_reason ) \ + __field( unsigned long, guest_rip ) \ + __field( u32, isa ) \ + __field( u64, info1 ) \ + __field( u64, info2 ) \ + __field( u32, intr_info ) \ + __field( u32, error_code ) \ + __field( unsigned int, vcpu_id ) \ + ), \ + \ + TP_fast_assign( \ + __entry->exit_reason = exit_reason; \ + __entry->guest_rip = kvm_rip_read(vcpu); \ + __entry->isa = isa; \ + __entry->vcpu_id = vcpu->vcpu_id; \ + kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, \ + &__entry->info2, \ + &__entry->intr_info, \ + &__entry->error_code); \ + ), \ + \ + TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \ + "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", \ + __entry->vcpu_id, \ + kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \ + __entry->guest_rip, __entry->info1, __entry->info2, \ + __entry->intr_info, __entry->error_code) \ +) + /* * Tracepoint for kvm guest exit: */ -TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), - TP_ARGS(exit_reason, vcpu, isa), - - TP_STRUCT__entry( - __field( unsigned int, exit_reason ) - __field( unsigned long, guest_rip ) - __field( u32, isa ) - __field( u64, info1 ) - __field( u64, info2 ) - __field( u32, intr_info ) - __field( u32, error_code ) - __field( unsigned int, vcpu_id ) - ), - - TP_fast_assign( - __entry->exit_reason = exit_reason; - __entry->guest_rip = kvm_rip_read(vcpu); - __entry->isa = isa; - __entry->vcpu_id = vcpu->vcpu_id; - kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, - &__entry->info2, - &__entry->intr_info, - &__entry->error_code); - ), - - TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " - "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", - __entry->vcpu_id, - kvm_print_exit_reason(__entry->exit_reason, __entry->isa), - __entry->guest_rip, __entry->info1, __entry->info2, - __entry->intr_info, __entry->error_code) -); +TRACE_EVENT_KVM_EXIT(kvm_exit); /* * Tracepoint for kvm interrupt injection: From 
cc167bd7ee99c38615da04bdadb67390eef093b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 13:13:48 -0700 Subject: [PATCH 185/497] KVM: x86: Use common definition for kvm_nested_vmexit tracepoint Use the newly introduced TRACE_EVENT_KVM_EXIT to define the guts of kvm_nested_vmexit so that it captures and prints the same information as kvm_exit. This has the bonus side effect of fixing the interrupt info and error code printing for the case where they're invalid, e.g. if the exit was a failed VM-Entry. This also sets the stage for retrieving EXIT_QUALIFICATION and VM_EXIT_INTR_INFO in nested_vmx_reflect_vmexit() if and only if the VM-Exit is being routed to L1. Signed-off-by: Sean Christopherson Message-Id: <20200923201349.16097-7-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 7 +------ arch/x86/kvm/trace.h | 34 +--------------------------------- arch/x86/kvm/vmx/nested.c | 5 +---- 3 files changed, 3 insertions(+), 43 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index eb0a97d687a4..ecefec42439f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2996,12 +2996,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (is_guest_mode(vcpu)) { int vmexit; - trace_kvm_nested_vmexit(vcpu, exit_code, - svm->vmcb->control.exit_info_1, - svm->vmcb->control.exit_info_2, - svm->vmcb->control.exit_int_info, - svm->vmcb->control.exit_int_info_err, - KVM_ISA_SVM); + trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM); vmexit = nested_svm_exit_special(svm); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index f1d84614f94e..aef960f90f26 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -586,39 +586,7 @@ TRACE_EVENT(kvm_nested_intercepts, /* * Tracepoint for #VMEXIT while nested */ -TRACE_EVENT(kvm_nested_vmexit, - TP_PROTO(struct kvm_vcpu *vcpu, __u32 exit_code, - __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), - TP_ARGS(vcpu, exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err, isa), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, exit_code ) - __field( __u64, exit_info1 ) - __field( __u64, exit_info2 ) - __field( __u32, exit_int_info ) - __field( __u32, exit_int_info_err ) - __field( __u32, isa ) - ), - - TP_fast_assign( - __entry->rip = kvm_rip_read(vcpu); - __entry->exit_code = exit_code; - __entry->exit_info1 = exit_info1; - __entry->exit_info2 = exit_info2; - __entry->exit_int_info = exit_int_info; - __entry->exit_int_info_err = exit_int_info_err; - __entry->isa = isa; - ), - TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - __entry->rip, - kvm_print_exit_reason(__entry->exit_code, __entry->isa), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) -); +TRACE_EVENT_KVM_EXIT(kvm_nested_vmexit); /* * Tracepoint for #VMEXIT reinjected to the guest diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4e3c620fafe2..632b52e20383 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5937,10 +5937,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) exit_intr_info = vmx_get_intr_info(vcpu); exit_qual = vmx_get_exit_qual(vcpu); - trace_kvm_nested_vmexit(vcpu, exit_reason, exit_qual, - vmx->idt_vectoring_info, exit_intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); + 
trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX); /* If L0 (KVM) wants the exit, it trumps L1's desires. */ if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) From 02f1965ff83b2f6162205576b7d708c9ad5bd138 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 13:13:49 -0700 Subject: [PATCH 186/497] KVM: nVMX: Read EXIT_QUAL and INTR_INFO only when needed for nested exit Read vmcs.EXIT_QUALIFICATION and vmcs.VM_EXIT_INTR_INFO only if the VM-Exit is being reflected to L1 now that they are no longer passed directly to the kvm_nested_vmexit tracepoint. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200923201349.16097-8-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 632b52e20383..aa9204710705 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5934,9 +5934,6 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) goto reflect_vmexit; } - exit_intr_info = vmx_get_intr_info(vcpu); - exit_qual = vmx_get_exit_qual(vcpu); - trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX); /* If L0 (KVM) wants the exit, it trumps L1's desires. */ @@ -5953,12 +5950,14 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) * need to be synthesized by querying the in-kernel LAPIC, but external * interrupts are never reflected to L1 so it's a non-issue. */ + exit_intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(exit_intr_info)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); } + exit_qual = vmx_get_exit_qual(vcpu); reflect_vmexit: nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual); From 04d28e375271bf0e6e31abeeb92e4f8b31443d23 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 09:33:14 -0700 Subject: [PATCH 187/497] KVM: x86/mmu: Move individual kvm_mmu initialization into common helper Move initialization of 'struct kvm_mmu' fields into alloc_mmu_pages() to consolidate code, and rename the helper to __kvm_mmu_create(). No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20200923163314.8181-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d58658d15366..32e0e5c0524e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5711,11 +5711,17 @@ static void free_mmu_pages(struct kvm_mmu *mmu) free_page((unsigned long)mmu->lm_root); } -static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { struct page *page; int i; + mmu->root_hpa = INVALID_PAGE; + mmu->root_pgd = 0; + mmu->translate_gpa = translate_gpa; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + /* * When using PAE paging, the four PDPTEs are treated as 'root' pages, * while the PDP table is a per-vCPU construct that's allocated at MMU @@ -5741,7 +5747,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { - uint i; int ret; vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; @@ -5755,25 +5760,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; - vcpu->arch.root_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.root_mmu.root_pgd = 0; - vcpu->arch.root_mmu.translate_gpa = translate_gpa; - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - - vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.guest_mmu.root_pgd = 0; - vcpu->arch.guest_mmu.translate_gpa = translate_gpa; - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; - ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu); + ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); if (ret) return ret; - ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu); + ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); if (ret) goto fail_allocate_root; From 7e34fbd05c6350b30161be84d4f8954c9d321292 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:03:55 -0700 Subject: [PATCH 188/497] KVM: x86: Rename "shared_msrs" to "user_return_msrs" Rename the "shared_msrs" mechanism, which is used to defer restoring MSRs that are only consumed when running in userspace, to a more banal but less likely to be confusing "user_return_msrs". The "shared" nomenclature is confusing as it's not obvious who is sharing what, e.g. reasonable interpretations are that the guest value is shared by vCPUs in a VM, or that the MSR value is shared/common to guest and host, both of which are wrong. "shared" is also misleading as the MSR value (in hardware) is not guaranteed to be shared/reused between VMs (if that's indeed the correct interpretation of the name), as the ability to share values between VMs is simply a side effect (albiet a very nice side effect) of deferring restoration of the host value until returning from userspace. "user_return" avoids the above confusion by describing the mechanism itself instead of its effects. 
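As a rough illustration of the renamed mechanism, a vendor module is expected to declare its deferrable MSRs once at setup and then load guest values lazily before entering the guest; the host values are snapshotted per-CPU and written back only by the user-return notifier when the CPU actually returns to userspace. A minimal sketch using only the APIs renamed by this patch (the slot number and guest_lstar_value are placeholders, not VMX's real table):

	/* Once, at hardware setup: declare an MSR whose restore may be deferred. */
	kvm_define_user_return_msr(0, MSR_LSTAR);

	/*
	 * Before entering the guest: load the guest value.  The host value is
	 * restored by the user-return notifier only on the next return to
	 * userspace, so an unchanged guest value costs no WRMSR on subsequent
	 * guest entries.
	 */
	kvm_set_user_return_msr(0, guest_lstar_value, -1ull);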
Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 +- arch/x86/kvm/vmx/vmx.c | 11 ++-- arch/x86/kvm/x86.c | 101 +++++++++++++++++--------------- 3 files changed, 60 insertions(+), 56 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3f0ccb646db8..5d4c39c37390 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1617,8 +1617,8 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); -void kvm_define_shared_msr(unsigned index, u32 msr); -int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +void kvm_define_user_return_msr(unsigned index, u32 msr); +int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bac423abcfda..b41649e54217 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -639,8 +639,7 @@ static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, msr->data = data; if (msr - vmx->guest_msrs < vmx->save_nmsrs) { preempt_disable(); - ret = kvm_set_shared_msr(msr->index, msr->data, - msr->mask); + ret = kvm_set_user_return_msr(msr->index, msr->data, msr->mask); preempt_enable(); if (ret) msr->data = old_msr_data; @@ -1138,9 +1137,9 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) if (!vmx->guest_msrs_ready) { vmx->guest_msrs_ready = true; for (i = 0; i < vmx->save_nmsrs; ++i) - kvm_set_shared_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data, - vmx->guest_msrs[i].mask); + kvm_set_user_return_msr(vmx->guest_msrs[i].index, + vmx->guest_msrs[i].data, + vmx->guest_msrs[i].mask); } @@ -7582,7 +7581,7 @@ static __init int hardware_setup(void) host_idt_base = dt.address; for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); + kvm_define_user_return_msr(i, vmx_msr_index[i]); if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b4c581c3c187..a547b0052396 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -162,24 +162,29 @@ module_param(force_emulation_prefix, bool, S_IRUGO); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); -#define KVM_NR_SHARED_MSRS 16 +/* + * Restoring the host value for MSRs that are only consumed when running in + * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU + * returns to userspace, i.e. the kernel can run with the guest's value. 
+ */ +#define KVM_MAX_NR_USER_RETURN_MSRS 16 -struct kvm_shared_msrs_global { +struct kvm_user_return_msrs_global { int nr; - u32 msrs[KVM_NR_SHARED_MSRS]; + u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS]; }; -struct kvm_shared_msrs { +struct kvm_user_return_msrs { struct user_return_notifier urn; bool registered; - struct kvm_shared_msr_values { + struct kvm_user_return_msr_values { u64 host; u64 curr; - } values[KVM_NR_SHARED_MSRS]; + } values[KVM_MAX_NR_USER_RETURN_MSRS]; }; -static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; -static struct kvm_shared_msrs __percpu *shared_msrs; +static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global; +static struct kvm_user_return_msrs __percpu *user_return_msrs; #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ @@ -294,9 +299,9 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msrs *locals - = container_of(urn, struct kvm_shared_msrs, urn); - struct kvm_shared_msr_values *values; + struct kvm_user_return_msrs *msrs + = container_of(urn, struct kvm_user_return_msrs, urn); + struct kvm_user_return_msr_values *values; unsigned long flags; /* @@ -304,73 +309,73 @@ static void kvm_on_user_return(struct user_return_notifier *urn) * interrupted and executed through kvm_arch_hardware_disable() */ local_irq_save(flags); - if (locals->registered) { - locals->registered = false; + if (msrs->registered) { + msrs->registered = false; user_return_notifier_unregister(urn); } local_irq_restore(flags); - for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - values = &locals->values[slot]; + for (slot = 0; slot < user_return_msrs_global.nr; ++slot) { + values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(shared_msrs_global.msrs[slot], values->host); + wrmsrl(user_return_msrs_global.msrs[slot], values->host); values->curr = values->host; } } } -void kvm_define_shared_msr(unsigned slot, u32 msr) +void kvm_define_user_return_msr(unsigned slot, u32 msr) { - BUG_ON(slot >= KVM_NR_SHARED_MSRS); - shared_msrs_global.msrs[slot] = msr; - if (slot >= shared_msrs_global.nr) - shared_msrs_global.nr = slot + 1; + BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); + user_return_msrs_global.msrs[slot] = msr; + if (slot >= user_return_msrs_global.nr) + user_return_msrs_global.nr = slot + 1; } -EXPORT_SYMBOL_GPL(kvm_define_shared_msr); +EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); -static void kvm_shared_msr_cpu_online(void) +static void kvm_user_return_msr_cpu_online(void) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); u64 value; int i; - for (i = 0; i < shared_msrs_global.nr; ++i) { - rdmsrl_safe(shared_msrs_global.msrs[i], &value); - smsr->values[i].host = value; - smsr->values[i].curr = value; + for (i = 0; i < user_return_msrs_global.nr; ++i) { + rdmsrl_safe(user_return_msrs_global.msrs[i], &value); + msrs->values[i].host = value; + msrs->values[i].curr = value; } } -int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) +int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); int err; - value = (value & mask) | 
(smsr->values[slot].host & ~mask); - if (value == smsr->values[slot].curr) + value = (value & mask) | (msrs->values[slot].host & ~mask); + if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); + err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); if (err) return 1; - smsr->values[slot].curr = value; - if (!smsr->registered) { - smsr->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&smsr->urn); - smsr->registered = true; + msrs->values[slot].curr = value; + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_shared_msr); +EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); static void drop_user_return_notifiers(void) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); - if (smsr->registered) - kvm_on_user_return(&smsr->urn); + if (msrs->registered) + kvm_on_user_return(&msrs->urn); } u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) @@ -7529,9 +7534,9 @@ int kvm_arch_init(void *opaque) goto out_free_x86_fpu_cache; } - shared_msrs = alloc_percpu(struct kvm_shared_msrs); - if (!shared_msrs) { - printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); + user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); + if (!user_return_msrs) { + printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); goto out_free_x86_emulator_cache; } @@ -7564,7 +7569,7 @@ int kvm_arch_init(void *opaque) return 0; out_free_percpu: - free_percpu(shared_msrs); + free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); out_free_x86_fpu_cache: @@ -7591,7 +7596,7 @@ void kvm_arch_exit(void) #endif kvm_x86_ops.hardware_enable = NULL; kvm_mmu_module_exit(); - free_percpu(shared_msrs); + free_percpu(user_return_msrs); kmem_cache_destroy(x86_fpu_cache); } @@ -9722,7 +9727,7 @@ int kvm_arch_hardware_enable(void) u64 max_tsc = 0; bool stable, backwards_tsc = false; - kvm_shared_msr_cpu_online(); + kvm_user_return_msr_cpu_online(); ret = kvm_x86_ops.hardware_enable(); if (ret != 0) return ret; From ce833b2324ba6e5367a54e641a36b61dedb1199b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:03:56 -0700 Subject: [PATCH 189/497] KVM: VMX: Prepend "MAX_" to MSR array size defines Add "MAX" to the LOADSTORE and so called SHARED MSR defines to make it more clear that the define controls the array size, as opposed to the actual number of valid entries that are in the array. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 6 +++--- arch/x86/kvm/vmx/vmx.h | 10 +++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index aa9204710705..b4f63212272b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1041,7 +1041,7 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); if (in_vmcs12_store_list && !in_autostore_list) { - if (autostore->nr == NR_LOADSTORE_MSRS) { + if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { /* * Emulated VMEntry does not fail here. 
Instead a less * accurate value will be returned by diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b41649e54217..9de90907543f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -917,8 +917,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, if (!entry_only) j = vmx_find_msr_index(&m->host, msr); - if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) || - (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) { + if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || + (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; @@ -6721,7 +6721,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS); + BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != MAX_NR_SHARED_MSRS); for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { u32 index = vmx_msr_index[i]; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index e87fae204d3d..2e37c17733a8 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -23,16 +23,16 @@ extern const u32 vmx_msr_index[]; #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) #ifdef CONFIG_X86_64 -#define NR_SHARED_MSRS 7 +#define MAX_NR_SHARED_MSRS 7 #else -#define NR_SHARED_MSRS 4 +#define MAX_NR_SHARED_MSRS 4 #endif -#define NR_LOADSTORE_MSRS 8 +#define MAX_NR_LOADSTORE_MSRS 8 struct vmx_msrs { unsigned int nr; - struct vmx_msr_entry val[NR_LOADSTORE_MSRS]; + struct vmx_msr_entry val[MAX_NR_LOADSTORE_MSRS]; }; struct shared_msr_entry { @@ -196,7 +196,7 @@ struct vcpu_vmx { u32 idt_vectoring_info; ulong rflags; - struct shared_msr_entry guest_msrs[NR_SHARED_MSRS]; + struct shared_msr_entry guest_msrs[MAX_NR_SHARED_MSRS]; int nmsrs; int save_nmsrs; bool guest_msrs_ready; From a128a934f202530036f867b284b3554e754452e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:03:57 -0700 Subject: [PATCH 190/497] KVM: VMX: Rename "vmx_find_msr_index" to "vmx_find_loadstore_msr_slot" Add "loadstore" to vmx_find_msr_index() to differentiate it from the so called shared MSRs helpers (which will soon be renamed), and replace "index" with "slot" to better convey that the helper returns slot in the array, not the MSR index (the value that gets stuffed into ECX). No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 16 ++++++++-------- arch/x86/kvm/vmx/vmx.c | 10 +++++----- arch/x86/kvm/vmx/vmx.h | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b4f63212272b..a4a5aa18be8a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -939,11 +939,11 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, * VM-exit in L0, use the more accurate value. 
*/ if (msr_index == MSR_IA32_TSC) { - int index = vmx_find_msr_index(&vmx->msr_autostore.guest, - MSR_IA32_TSC); + int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, + MSR_IA32_TSC); - if (index >= 0) { - u64 val = vmx->msr_autostore.guest.val[index].value; + if (i >= 0) { + u64 val = vmx->msr_autostore.guest.val[i].value; *data = kvm_read_l1_tsc(vcpu, val); return true; @@ -1032,12 +1032,12 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_msrs *autostore = &vmx->msr_autostore.guest; bool in_vmcs12_store_list; - int msr_autostore_index; + int msr_autostore_slot; bool in_autostore_list; int last; - msr_autostore_index = vmx_find_msr_index(autostore, msr_index); - in_autostore_list = msr_autostore_index >= 0; + msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); + in_autostore_list = msr_autostore_slot >= 0; in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); if (in_vmcs12_store_list && !in_autostore_list) { @@ -1058,7 +1058,7 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, autostore->val[last].index = msr_index; } else if (!in_vmcs12_store_list && in_autostore_list) { last = --autostore->nr; - autostore->val[msr_autostore_index] = autostore->val[last]; + autostore->val[msr_autostore_slot] = autostore->val[last]; } } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9de90907543f..2329f1bbbe6f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -812,7 +812,7 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_clearbit(vmx, exit); } -int vmx_find_msr_index(struct vmx_msrs *m, u32 msr) +int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) { unsigned int i; @@ -846,7 +846,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } - i = vmx_find_msr_index(&m->guest, msr); + i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (i < 0) goto skip_guest; --m->guest.nr; @@ -854,7 +854,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); skip_guest: - i = vmx_find_msr_index(&m->host, msr); + i = vmx_find_loadstore_msr_slot(&m->host, msr); if (i < 0) return; @@ -913,9 +913,9 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } - i = vmx_find_msr_index(&m->guest, msr); + i = vmx_find_loadstore_msr_slot(&m->guest, msr); if (!entry_only) - j = vmx_find_msr_index(&m->host, msr); + j = vmx_find_loadstore_msr_slot(&m->host, msr); if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 2e37c17733a8..bce8cc4c9792 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -332,7 +332,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); -int vmx_find_msr_index(struct vmx_msrs *m, u32 msr); +int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); static inline u8 vmx_get_rvi(void) From eb3db1b1378882a2d0abc1b6103040c97bcc91e4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:03:58 -0700 Subject: [PATCH 191/497] KVM: VMX: Rename the 
"shared_msr_entry" struct to "vmx_uret_msr" Rename struct "shared_msr_entry" to "vmx_uret_msr" to align with x86's rename of "shared_msrs" to "user_return_msrs", and to call out that the struct is specific to VMX, i.e. not part of the generic "shared_msrs" framework. Abbreviate "user_return" as "uret" to keep line lengths marginally sane and code more or less readable. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 58 +++++++++++++++++++-------------------- arch/x86/kvm/vmx/vmx.h | 10 +++---- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a4a5aa18be8a..683e0627d4ce 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4273,7 +4273,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) { - struct shared_msr_entry *efer_msr; + struct vmx_uret_msr *efer_msr; unsigned int i; if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2329f1bbbe6f..91e9a3657c4b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -616,28 +616,28 @@ static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) int i; for (i = 0; i < vmx->nmsrs; ++i) - if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) + if (vmx_msr_index[vmx->guest_uret_msrs[i].index] == msr) return i; return -1; } -struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +struct vmx_uret_msr *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; i = __find_msr_index(vmx, msr); if (i >= 0) - return &vmx->guest_msrs[i]; + return &vmx->guest_uret_msrs[i]; return NULL; } -static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data) +static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct vmx_uret_msr *msr, u64 data) { int ret = 0; u64 old_msr_data = msr->data; msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) { + if (msr - vmx->guest_uret_msrs < vmx->save_nmsrs) { preempt_disable(); ret = kvm_set_user_return_msr(msr->index, msr->data, msr->mask); preempt_enable(); @@ -982,8 +982,8 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) guest_efer &= ~ignore_bits; guest_efer |= host_efer & ignore_bits; - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + vmx->guest_uret_msrs[efer_offset].data = guest_efer; + vmx->guest_uret_msrs[efer_offset].mask = ~ignore_bits; return true; } @@ -1137,9 +1137,9 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) if (!vmx->guest_msrs_ready) { vmx->guest_msrs_ready = true; for (i = 0; i < vmx->save_nmsrs; ++i) - kvm_set_user_return_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data, - vmx->guest_msrs[i].mask); + kvm_set_user_return_msr(vmx->guest_uret_msrs[i].index, + vmx->guest_uret_msrs[i].data, + vmx->guest_uret_msrs[i].mask); } @@ -1614,11 +1614,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) */ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) { - struct shared_msr_entry tmp; + struct vmx_uret_msr tmp; - tmp = vmx->guest_msrs[to]; - vmx->guest_msrs[to] = vmx->guest_msrs[from]; - vmx->guest_msrs[from] = tmp; + tmp = vmx->guest_uret_msrs[to]; + vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from]; + 
vmx->guest_uret_msrs[from] = tmp; } /* @@ -1729,7 +1729,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; + struct vmx_uret_msr *msr; u32 index; switch (msr_info->index) { @@ -1750,7 +1750,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) return 1; - goto find_shared_msr; + goto find_uret_msr; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1; @@ -1857,9 +1857,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; - goto find_shared_msr; + goto find_uret_msr; default: - find_shared_msr: + find_uret_msr: msr = find_msr_entry(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; @@ -1889,7 +1889,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; + struct vmx_uret_msr *msr; int ret = 0; u32 msr_index = msr_info->index; u64 data = msr_info->data; @@ -1993,7 +1993,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) return 1; - goto find_shared_msr; + goto find_uret_msr; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -2130,10 +2130,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) return 1; - goto find_shared_msr; + goto find_uret_msr; default: - find_shared_msr: + find_uret_msr: msr = find_msr_entry(vmx, msr_index); if (msr) ret = vmx_set_guest_msr(vmx, msr, data); @@ -2767,7 +2767,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + struct vmx_uret_msr *msr = find_msr_entry(vmx, MSR_EFER); if (!msr) return; @@ -6721,7 +6721,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != MAX_NR_SHARED_MSRS); + BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != MAX_NR_USER_RETURN_MSRS); for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { u32 index = vmx_msr_index[i]; @@ -6733,8 +6733,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) if (wrmsr_safe(index, data_low, data_high) < 0) continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; + vmx->guest_uret_msrs[j].index = i; + vmx->guest_uret_msrs[j].data = 0; switch (index) { case MSR_IA32_TSX_CTRL: /* @@ -6742,10 +6742,10 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) * let's avoid changing CPUID bits under the host * kernel's feet. 
*/ - vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; break; default: - vmx->guest_msrs[j].mask = -1ull; + vmx->guest_uret_msrs[j].mask = -1ull; break; } ++vmx->nmsrs; @@ -7111,7 +7111,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) update_intel_pt_cfg(vcpu); if (boot_cpu_has(X86_FEATURE_RTM)) { - struct shared_msr_entry *msr; + struct vmx_uret_msr *msr; msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL); if (msr) { bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index bce8cc4c9792..a13b61ef92b0 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -23,9 +23,9 @@ extern const u32 vmx_msr_index[]; #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) #ifdef CONFIG_X86_64 -#define MAX_NR_SHARED_MSRS 7 +#define MAX_NR_USER_RETURN_MSRS 7 #else -#define MAX_NR_SHARED_MSRS 4 +#define MAX_NR_USER_RETURN_MSRS 4 #endif #define MAX_NR_LOADSTORE_MSRS 8 @@ -35,7 +35,7 @@ struct vmx_msrs { struct vmx_msr_entry val[MAX_NR_LOADSTORE_MSRS]; }; -struct shared_msr_entry { +struct vmx_uret_msr { unsigned index; u64 data; u64 mask; @@ -196,7 +196,7 @@ struct vcpu_vmx { u32 idt_vectoring_info; ulong rflags; - struct shared_msr_entry guest_msrs[MAX_NR_SHARED_MSRS]; + struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; int nmsrs; int save_nmsrs; bool guest_msrs_ready; @@ -329,7 +329,7 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); -struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); +struct vmx_uret_msr *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); From fbc18007382cb99ca68b1c6cb49d38f4f3b7a5da Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:03:59 -0700 Subject: [PATCH 192/497] KVM: VMX: Rename vcpu_vmx's "nmsrs" to "nr_uret_msrs" Rename vcpu_vmx.nsmrs to vcpu_vmx.nr_uret_msrs to explicitly associate it with the guest_uret_msrs array. No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-6-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 +++--- arch/x86/kvm/vmx/vmx.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 91e9a3657c4b..07249571f6d5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -615,7 +615,7 @@ static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; - for (i = 0; i < vmx->nmsrs; ++i) + for (i = 0; i < vmx->nr_uret_msrs; ++i) if (vmx_msr_index[vmx->guest_uret_msrs[i].index] == msr) return i; return -1; @@ -6726,7 +6726,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; - int j = vmx->nmsrs; + int j = vmx->nr_uret_msrs; if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; @@ -6748,7 +6748,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx->guest_uret_msrs[j].mask = -1ull; break; } - ++vmx->nmsrs; + ++vmx->nr_uret_msrs; } err = alloc_loaded_vmcs(&vmx->vmcs01); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a13b61ef92b0..82c39ac53165 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -197,7 +197,7 @@ struct vcpu_vmx { ulong rflags; struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; - int nmsrs; + int nr_uret_msrs; int save_nmsrs; bool guest_msrs_ready; #ifdef CONFIG_X86_64 From e9bb1ae92d62bd55b8093918fdec91f6fc3d3a56 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:04:00 -0700 Subject: [PATCH 193/497] KVM: VMX: Rename vcpu_vmx's "save_nmsrs" to "nr_active_uret_msrs" Add "uret" into the name of "save_nmsrs" to explicitly associate it with the guest_uret_msrs array, and replace "save" with "active" (for lack of a better word) to better describe what is being tracked. While "save" is more or less accurate when viewed as a literal description of the field, e.g. it holds the number of MSRs that were saved into the array the last time setup_msrs() was invoked, it can easily be misinterpreted by the reader, e.g. as meaning the number of MSRs that were saved from hardware at some point in the past, or as the number of MSRs that need to be saved at some point in the future, both of which are wrong. No functional change intended. 
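Concretely, after setup_msrs() the array obeys a simple invariant: entries [0, nr_active_uret_msrs) are the MSRs relevant to the current guest configuration, while the remaining entries up to nr_uret_msrs are supported by the host but unused for this guest. Only the active prefix is handed to the common code when guest state is loaded, roughly as in the loop in vmx_prepare_switch_to_guest() (at this point in the series the slot field is still named "index"):

	for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
		kvm_set_user_return_msr(vmx->guest_uret_msrs[i].index,
					vmx->guest_uret_msrs[i].data,
					vmx->guest_uret_msrs[i].mask);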
Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-7-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 22 +++++++++++----------- arch/x86/kvm/vmx/vmx.h | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 07249571f6d5..24087abf6077 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -637,7 +637,7 @@ static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct vmx_uret_msr *msr, u64 u64 old_msr_data = msr->data; msr->data = data; - if (msr - vmx->guest_uret_msrs < vmx->save_nmsrs) { + if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) { preempt_disable(); ret = kvm_set_user_return_msr(msr->index, msr->data, msr->mask); preempt_enable(); @@ -1136,7 +1136,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) */ if (!vmx->guest_msrs_ready) { vmx->guest_msrs_ready = true; - for (i = 0; i < vmx->save_nmsrs; ++i) + for (i = 0; i < vmx->nr_active_uret_msrs; ++i) kvm_set_user_return_msr(vmx->guest_uret_msrs[i].index, vmx->guest_uret_msrs[i].data, vmx->guest_uret_msrs[i].mask); @@ -1628,9 +1628,9 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) */ static void setup_msrs(struct vcpu_vmx *vmx) { - int save_nmsrs, index; + int nr_active_uret_msrs, index; - save_nmsrs = 0; + nr_active_uret_msrs = 0; #ifdef CONFIG_X86_64 /* * The SYSCALL MSRs are only needed on long mode guests, and only @@ -1639,26 +1639,26 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { index = __find_msr_index(vmx, MSR_STAR); if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); + move_msr_up(vmx, index, nr_active_uret_msrs++); index = __find_msr_index(vmx, MSR_LSTAR); if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); + move_msr_up(vmx, index, nr_active_uret_msrs++); index = __find_msr_index(vmx, MSR_SYSCALL_MASK); if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); + move_msr_up(vmx, index, nr_active_uret_msrs++); } #endif index = __find_msr_index(vmx, MSR_EFER); if (index >= 0 && update_transition_efer(vmx, index)) - move_msr_up(vmx, index, save_nmsrs++); + move_msr_up(vmx, index, nr_active_uret_msrs++); index = __find_msr_index(vmx, MSR_TSC_AUX); if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) - move_msr_up(vmx, index, save_nmsrs++); + move_msr_up(vmx, index, nr_active_uret_msrs++); index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL); if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); + move_msr_up(vmx, index, nr_active_uret_msrs++); - vmx->save_nmsrs = save_nmsrs; + vmx->nr_active_uret_msrs = nr_active_uret_msrs; vmx->guest_msrs_ready = false; if (cpu_has_vmx_msr_bitmap()) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 82c39ac53165..3928992de023 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -198,7 +198,7 @@ struct vcpu_vmx { struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; int nr_uret_msrs; - int save_nmsrs; + int nr_active_uret_msrs; bool guest_msrs_ready; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; From 658ece84f5da1cd8e36f0d13449aa95c61667b4b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:04:01 -0700 Subject: [PATCH 194/497] KVM: VMX: Rename vcpu_vmx's "guest_msrs_ready" to "guest_uret_msrs_loaded" Add "uret" to "guest_msrs_ready" to explicitly associate it with the "guest_uret_msrs" array, and replace "ready" with "loaded" to more precisely reflect what it 
tracks, e.g. "ready" could be interpreted as meaning ready for processing (setup_msrs() has run), which is wrong. "loaded" also aligns with the similar "guest_state_loaded" field. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-8-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 8 ++++---- arch/x86/kvm/vmx/vmx.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 24087abf6077..252cce60af70 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1134,8 +1134,8 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * when guest state is loaded. This happens when guest transitions * to/from long-mode by setting MSR_EFER.LMA. */ - if (!vmx->guest_msrs_ready) { - vmx->guest_msrs_ready = true; + if (!vmx->guest_uret_msrs_loaded) { + vmx->guest_uret_msrs_loaded = true; for (i = 0; i < vmx->nr_active_uret_msrs; ++i) kvm_set_user_return_msr(vmx->guest_uret_msrs[i].index, vmx->guest_uret_msrs[i].data, @@ -1223,7 +1223,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #endif load_fixmap_gdt(raw_smp_processor_id()); vmx->guest_state_loaded = false; - vmx->guest_msrs_ready = false; + vmx->guest_uret_msrs_loaded = false; } #ifdef CONFIG_X86_64 @@ -1659,7 +1659,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) move_msr_up(vmx, index, nr_active_uret_msrs++); vmx->nr_active_uret_msrs = nr_active_uret_msrs; - vmx->guest_msrs_ready = false; + vmx->guest_uret_msrs_loaded = false; if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(&vmx->vcpu); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 3928992de023..8cb04e75defc 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -199,7 +199,7 @@ struct vcpu_vmx { struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; int nr_uret_msrs; int nr_active_uret_msrs; - bool guest_msrs_ready; + bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; From 1e7a483037e8fa884c688f2213c8af10f2e8e96a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:04:02 -0700 Subject: [PATCH 195/497] KVM: VMX: Rename "__find_msr_index" to "__vmx_find_uret_msr" Rename "__find_msr_index" to scope it to VMX, associate it with guest_uret_msrs, and to avoid conflating "MSR's ECX index" with "MSR's array index". Similarly, don't use "slot" in the name so as to avoid colliding the common x86's half of "user_return_msrs" (the slot in kvm_user_return_msrs is not the same slot in guest_uret_msrs). No functional change intended. 
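The rename is easier to follow once the three numbers involved are kept apart; as a rough aside (MSR_LSTAR is just an example):

	/*
	 * Three different numbers are in play for each MSR, e.g. for
	 * MSR_LSTAR (0xc0000082):
	 *
	 *  - the MSR's ECX index (0xc0000082 itself), what RDMSR/WRMSR and
	 *    the lookup helpers take as "msr";
	 *  - the return value of __vmx_find_uret_msr(), i.e. the MSR's
	 *    current position within vmx->guest_uret_msrs[];
	 *  - guest_uret_msrs[i].index, the MSR's slot in the common x86
	 *    kvm_user_return_msrs array (renamed to "slot" later in this
	 *    series).
	 */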
Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-9-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 252cce60af70..ae3c01cde79d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -611,7 +611,7 @@ static inline bool report_flexpriority(void) return flexpriority_enabled; } -static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) +static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -625,7 +625,7 @@ struct vmx_uret_msr *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; - i = __find_msr_index(vmx, msr); + i = __vmx_find_uret_msr(vmx, msr); if (i >= 0) return &vmx->guest_uret_msrs[i]; return NULL; @@ -1637,24 +1637,24 @@ static void setup_msrs(struct vcpu_vmx *vmx) * when EFER.SCE is set. */ if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { - index = __find_msr_index(vmx, MSR_STAR); + index = __vmx_find_uret_msr(vmx, MSR_STAR); if (index >= 0) move_msr_up(vmx, index, nr_active_uret_msrs++); - index = __find_msr_index(vmx, MSR_LSTAR); + index = __vmx_find_uret_msr(vmx, MSR_LSTAR); if (index >= 0) move_msr_up(vmx, index, nr_active_uret_msrs++); - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); + index = __vmx_find_uret_msr(vmx, MSR_SYSCALL_MASK); if (index >= 0) move_msr_up(vmx, index, nr_active_uret_msrs++); } #endif - index = __find_msr_index(vmx, MSR_EFER); + index = __vmx_find_uret_msr(vmx, MSR_EFER); if (index >= 0 && update_transition_efer(vmx, index)) move_msr_up(vmx, index, nr_active_uret_msrs++); - index = __find_msr_index(vmx, MSR_TSC_AUX); + index = __vmx_find_uret_msr(vmx, MSR_TSC_AUX); if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) move_msr_up(vmx, index, nr_active_uret_msrs++); - index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL); + index = __vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (index >= 0) move_msr_up(vmx, index, nr_active_uret_msrs++); From ef1d2ee12e6cd5abbf9986f35238f2fb2cd21a6d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:04:03 -0700 Subject: [PATCH 196/497] KVM: VMX: Check guest support for RDTSCP before processing MSR_TSC_AUX Check for RDTSCP support prior to checking if MSR_TSC_AUX is in the uret MSRs array so that the array lookup and manipulation are back-to-back. This paves the way toward adding a helper to wrap the lookup and manipulation. No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-10-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ae3c01cde79d..068783a13ac8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1651,9 +1651,11 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __vmx_find_uret_msr(vmx, MSR_EFER); if (index >= 0 && update_transition_efer(vmx, index)) move_msr_up(vmx, index, nr_active_uret_msrs++); - index = __vmx_find_uret_msr(vmx, MSR_TSC_AUX); - if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) - move_msr_up(vmx, index, nr_active_uret_msrs++); + if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) { + index = __vmx_find_uret_msr(vmx, MSR_TSC_AUX); + if (index >= 0) + move_msr_up(vmx, index, nr_active_uret_msrs++); + } index = __vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (index >= 0) move_msr_up(vmx, index, nr_active_uret_msrs++); From 86e3e494fe32d1e7e9180458d857916155dd2856 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:04:04 -0700 Subject: [PATCH 197/497] KVM: VMX: Move uret MSR lookup into update_transition_efer() Move checking for the existence of MSR_EFER in the uret MSR array into update_transition_efer() so that the lookup and manipulation of the array in setup_msrs() occur back-to-back. This paves the way toward adding a helper to wrap the lookup and manipulation. To avoid unnecessary overhead, defer the lookup until the uret array would actually be modified in update_transition_efer(). EFER obviously exists on CPUs that support the dedicated VMCS fields for switching EFER, and EFER must exist for the guest and host EFER.NX value to diverge, i.e. there is no danger of attempting to read/write EFER when it doesn't exist. Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-11-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 068783a13ac8..1ad9faca44ef 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -941,10 +941,11 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, m->host.val[j].value = host_val; } -static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) +static bool update_transition_efer(struct vcpu_vmx *vmx) { u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; + int i; /* Shadow paging assumes NX to be available. 
*/ if (!enable_ept) @@ -976,17 +977,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) else clear_atomic_switch_msr(vmx, MSR_EFER); return false; - } else { - clear_atomic_switch_msr(vmx, MSR_EFER); - - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; - - vmx->guest_uret_msrs[efer_offset].data = guest_efer; - vmx->guest_uret_msrs[efer_offset].mask = ~ignore_bits; - - return true; } + + i = __vmx_find_uret_msr(vmx, MSR_EFER); + if (i < 0) + return false; + + clear_atomic_switch_msr(vmx, MSR_EFER); + + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; + + vmx->guest_uret_msrs[i].data = guest_efer; + vmx->guest_uret_msrs[i].mask = ~ignore_bits; + + return true; } #ifdef CONFIG_X86_32 @@ -1648,9 +1653,11 @@ static void setup_msrs(struct vcpu_vmx *vmx) move_msr_up(vmx, index, nr_active_uret_msrs++); } #endif - index = __vmx_find_uret_msr(vmx, MSR_EFER); - if (index >= 0 && update_transition_efer(vmx, index)) - move_msr_up(vmx, index, nr_active_uret_msrs++); + if (update_transition_efer(vmx)) { + index = __vmx_find_uret_msr(vmx, MSR_EFER); + if (index >= 0) + move_msr_up(vmx, index, nr_active_uret_msrs++); + } if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) { index = __vmx_find_uret_msr(vmx, MSR_TSC_AUX); if (index >= 0) From bd65ba82b324e1765121f3602f9b0a89b7aa1c08 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:04:05 -0700 Subject: [PATCH 198/497] KVM: VMX: Add vmx_setup_uret_msr() to handle lookup and swap Add vmx_setup_uret_msr() to wrap the lookup and manipulation of the uret MSRs array during setup_msrs(). In addition to consolidating code, this eliminates move_msr_up(), which while being a very literally description of the function, isn't exacly helpful in understanding the net effect of the code. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-12-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 49 ++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1ad9faca44ef..3300e373fadf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1614,12 +1614,15 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) vmx_clear_hlt(vcpu); } -/* - * Swap MSR entry in host/guest MSR entry array. - */ -static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) +static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr) { struct vmx_uret_msr tmp; + int from, to; + + from = __vmx_find_uret_msr(vmx, msr); + if (from < 0) + return; + to = vmx->nr_active_uret_msrs++; tmp = vmx->guest_uret_msrs[to]; vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from]; @@ -1633,42 +1636,26 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) */ static void setup_msrs(struct vcpu_vmx *vmx) { - int nr_active_uret_msrs, index; - - nr_active_uret_msrs = 0; + vmx->guest_uret_msrs_loaded = false; + vmx->nr_active_uret_msrs = 0; #ifdef CONFIG_X86_64 /* * The SYSCALL MSRs are only needed on long mode guests, and only * when EFER.SCE is set. 
*/ if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { - index = __vmx_find_uret_msr(vmx, MSR_STAR); - if (index >= 0) - move_msr_up(vmx, index, nr_active_uret_msrs++); - index = __vmx_find_uret_msr(vmx, MSR_LSTAR); - if (index >= 0) - move_msr_up(vmx, index, nr_active_uret_msrs++); - index = __vmx_find_uret_msr(vmx, MSR_SYSCALL_MASK); - if (index >= 0) - move_msr_up(vmx, index, nr_active_uret_msrs++); + vmx_setup_uret_msr(vmx, MSR_STAR); + vmx_setup_uret_msr(vmx, MSR_LSTAR); + vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK); } #endif - if (update_transition_efer(vmx)) { - index = __vmx_find_uret_msr(vmx, MSR_EFER); - if (index >= 0) - move_msr_up(vmx, index, nr_active_uret_msrs++); - } - if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) { - index = __vmx_find_uret_msr(vmx, MSR_TSC_AUX); - if (index >= 0) - move_msr_up(vmx, index, nr_active_uret_msrs++); - } - index = __vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); - if (index >= 0) - move_msr_up(vmx, index, nr_active_uret_msrs++); + if (update_transition_efer(vmx)) + vmx_setup_uret_msr(vmx, MSR_EFER); - vmx->nr_active_uret_msrs = nr_active_uret_msrs; - vmx->guest_uret_msrs_loaded = false; + if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) + vmx_setup_uret_msr(vmx, MSR_TSC_AUX); + + vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (cpu_has_vmx_msr_bitmap()) vmx_update_msr_bitmap(&vmx->vcpu); From d85a8034c016a1f50e879124bc5839c986d87a0a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:04:06 -0700 Subject: [PATCH 199/497] KVM: VMX: Rename "find_msr_entry" to "vmx_find_uret_msr" Rename "find_msr_entry" to scope it to VMX and to associate it with guest_uret_msrs. Drop the "entry" so that the function name pairs with the existing __vmx_find_uret_msr(), which intentionally uses a double underscore prefix instead of appending "index" or "slot" as those names are already claimed by other pieces of the user return MSR stack. No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-13-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 10 +++++----- arch/x86/kvm/vmx/vmx.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 683e0627d4ce..feb26457d7b2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4287,7 +4287,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) return vmx->msr_autoload.guest.val[i].value; } - efer_msr = find_msr_entry(vmx, MSR_EFER); + efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); if (efer_msr) return efer_msr->data; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3300e373fadf..4cf1ccdb99ce 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -621,7 +621,7 @@ static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) return -1; } -struct vmx_uret_msr *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -1856,7 +1856,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) goto find_uret_msr; default: find_uret_msr: - msr = find_msr_entry(vmx, msr_info->index); + msr = vmx_find_uret_msr(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; break; @@ -2130,7 +2130,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: find_uret_msr: - msr = find_msr_entry(vmx, msr_index); + msr = vmx_find_uret_msr(vmx, msr_index); if (msr) ret = vmx_set_guest_msr(vmx, msr, data); else @@ -2763,7 +2763,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmx_uret_msr *msr = find_msr_entry(vmx, MSR_EFER); + struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER); if (!msr) return; @@ -7108,7 +7108,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_RTM)) { struct vmx_uret_msr *msr; - msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL); + msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (msr) { bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 8cb04e75defc..45c33ffae344 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -329,7 +329,7 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); -struct vmx_uret_msr *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); +struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); From 7bf662bb5ea806e3958638e9b80f203d11fbcc87 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:04:07 -0700 Subject: [PATCH 200/497] KVM: VMX: Rename "vmx_set_guest_msr" to "vmx_set_guest_uret_msr" Add "uret" to vmx_set_guest_msr() to explicitly associate it with the guest_uret_msrs array, and to differentiate it from vmx_set_msr() as well as VMX's load/store MSRs. No functional change intended. 
Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-14-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4cf1ccdb99ce..0f75b17d97a3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -631,7 +631,8 @@ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) return NULL; } -static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct vmx_uret_msr *msr, u64 data) +static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, + struct vmx_uret_msr *msr, u64 data) { int ret = 0; @@ -2132,7 +2133,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) find_uret_msr: msr = vmx_find_uret_msr(vmx, msr_index); if (msr) - ret = vmx_set_guest_msr(vmx, msr, data); + ret = vmx_set_guest_uret_msr(vmx, msr, data); else ret = kvm_set_msr_common(vcpu, msr_info); } @@ -7111,7 +7112,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (msr) { bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); - vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); + vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); } } } From 14a61b642de9e5612f091b3295da6d0c89449a49 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:04:08 -0700 Subject: [PATCH 201/497] KVM: VMX: Rename "vmx_msr_index" to "vmx_uret_msrs_list" Rename "vmx_msr_index" to "vmx_uret_msrs_list" to associate it with the uret MSRs array, and to avoid conflating "MSR's ECX index" with "MSR's index into an array". Similarly, don't use "slot" in the name as that terminology is claimed by the common x86 "user_return_msrs" mechanism. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-15-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0f75b17d97a3..07781403db08 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -435,9 +435,9 @@ static unsigned long host_idt_base; * will emulate SYSCALL in legacy mode if the vendor string in guest * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To * support this emulation, IA32_STAR must always be included in - * vmx_msr_index[], even in i386 builds. + * vmx_uret_msrs_list[], even in i386 builds. 
*/ -const u32 vmx_msr_index[] = { +const u32 vmx_uret_msrs_list[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif @@ -616,7 +616,7 @@ static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) int i; for (i = 0; i < vmx->nr_uret_msrs; ++i) - if (vmx_msr_index[vmx->guest_uret_msrs[i].index] == msr) + if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].index] == msr) return i; return -1; } @@ -6718,10 +6718,10 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != MAX_NR_USER_RETURN_MSRS); + BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { - u32 index = vmx_msr_index[i]; + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) { + u32 index = vmx_uret_msrs_list[i]; u32 data_low, data_high; int j = vmx->nr_uret_msrs; @@ -7577,8 +7577,8 @@ static __init int hardware_setup(void) store_idt(&dt); host_idt_base = dt.address; - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) - kvm_define_user_return_msr(i, vmx_msr_index[i]); + for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) + kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]); if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; From 802145c56a0445bba39de481214302eb3d34e4fb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Sep 2020 11:04:09 -0700 Subject: [PATCH 202/497] KVM: VMX: Rename vmx_uret_msr's "index" to "slot" Rename "index" to "slot" in struct vmx_uret_msr to align with the terminology used by common x86's kvm_user_return_msrs, and to avoid conflating "MSR's ECX index" with "MSR's index into an array". No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200923180409.32255-16-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 8 ++++---- arch/x86/kvm/vmx/vmx.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 07781403db08..58fb403ccd1c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -616,7 +616,7 @@ static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) int i; for (i = 0; i < vmx->nr_uret_msrs; ++i) - if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].index] == msr) + if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr) return i; return -1; } @@ -640,7 +640,7 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, msr->data = data; if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) { preempt_disable(); - ret = kvm_set_user_return_msr(msr->index, msr->data, msr->mask); + ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask); preempt_enable(); if (ret) msr->data = old_msr_data; @@ -1143,7 +1143,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) if (!vmx->guest_uret_msrs_loaded) { vmx->guest_uret_msrs_loaded = true; for (i = 0; i < vmx->nr_active_uret_msrs; ++i) - kvm_set_user_return_msr(vmx->guest_uret_msrs[i].index, + kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot, vmx->guest_uret_msrs[i].data, vmx->guest_uret_msrs[i].mask); @@ -6730,7 +6730,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) if (wrmsr_safe(index, data_low, data_high) < 0) continue; - vmx->guest_uret_msrs[j].index = i; + vmx->guest_uret_msrs[j].slot = i; vmx->guest_uret_msrs[j].data = 0; switch (index) { case MSR_IA32_TSX_CTRL: diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 45c33ffae344..fc63ff43929f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ 
b/arch/x86/kvm/vmx/vmx.h @@ -36,7 +36,7 @@ struct vmx_msrs { }; struct vmx_uret_msr { - unsigned index; + unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */ u64 data; u64 mask; }; From 90218e434c4118fd7bba7ea7b073e6c6454140a2 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 25 Sep 2020 16:34:15 +0200 Subject: [PATCH 203/497] KVM: x86: Return -ENOENT on unimplemented MSRs When we find an MSR that we can not handle, bubble up that error code as MSR error return code. Follow up patches will use that to expose the fact that an MSR is not handled by KVM to user space. Suggested-by: Aaron Lewis Signed-off-by: Alexander Graf Message-Id: <20200925143422.21718-2-graf@amazon.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a547b0052396..43173382f02f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -272,7 +272,7 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, } else { vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n", op, msr, data); - return 1; + return -ENOENT; } } From 1ae099540e8c7f1ee066b3ad45cc91f582bb1ce8 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 25 Sep 2020 16:34:16 +0200 Subject: [PATCH 204/497] KVM: x86: Allow deflecting unknown MSR accesses to user space MSRs are weird. Some of them are normal control registers, such as EFER. Some however are registers that really are model specific, not very interesting to virtualization workloads, and not performance critical. Others again are really just windows into package configuration. Out of these MSRs, only the first category is necessary to implement in kernel space. Rarely accessed MSRs, MSRs that should be fine tunes against certain CPU models and MSRs that contain information on the package level are much better suited for user space to process. However, over time we have accumulated a lot of MSRs that are not the first category, but still handled by in-kernel KVM code. This patch adds a generic interface to handle WRMSR and RDMSR from user space. With this, any future MSR that is part of the latter categories can be handled in user space. Furthermore, it allows us to replace the existing "ignore_msrs" logic with something that applies per-VM rather than on the full system. That way you can run productive VMs in parallel to experimental ones where you don't care about proper MSR handling. Signed-off-by: Alexander Graf Reviewed-by: Jim Mattson Message-Id: <20200925143422.21718-3-graf@amazon.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 85 +++++++++++++++++++--- arch/x86/include/asm/kvm_host.h | 3 + arch/x86/kvm/emulate.c | 18 ++++- arch/x86/kvm/x86.c | 120 ++++++++++++++++++++++++++++++-- include/trace/events/kvm.h | 2 +- include/uapi/linux/kvm.h | 13 ++++ 6 files changed, 226 insertions(+), 15 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9e2a545d8084..4fdba43d83e8 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4872,14 +4872,13 @@ to the byte array. .. note:: - For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and - KVM_EXIT_EPR the corresponding - -operations are complete (and guest state is consistent) only after userspace -has re-entered the kernel with KVM_RUN. The kernel side will first finish -incomplete operations and then check for pending signals. 
Userspace -can re-enter the guest with an unmasked signal pending to complete -pending operations. + For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, + KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding + operations are complete (and guest state is consistent) only after userspace + has re-entered the kernel with KVM_RUN. The kernel side will first finish + incomplete operations and then check for pending signals. Userspace + can re-enter the guest with an unmasked signal pending to complete + pending operations. :: @@ -5166,6 +5165,43 @@ Note that KVM does not skip the faulting instruction as it does for KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state if it decides to decode and emulate the instruction. +:: + + /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ + struct { + __u8 error; /* user -> kernel */ + __u8 pad[7]; + __u32 reason; /* kernel -> user */ + __u32 index; /* kernel -> user */ + __u64 data; /* kernel <-> user */ + } msr; + +Used on x86 systems. When the VM capability KVM_CAP_X86_USER_SPACE_MSR is +enabled, MSR accesses to registers that would invoke a #GP by KVM kernel code +will instead trigger a KVM_EXIT_X86_RDMSR exit for reads and KVM_EXIT_X86_WRMSR +exit for writes. + +The "reason" field specifies why the MSR trap occurred. User space will only +receive MSR exit traps when a particular reason was requested during through +ENABLE_CAP. Currently valid exit reasons are: + + KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM + KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits + +For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest +wants to read. To respond to this request with a successful read, user space +writes the respective data into the "data" field and must continue guest +execution to ensure the read data is transferred into guest register state. + +If the RDMSR request was unsuccessful, user space indicates that with a "1" in +the "error" field. This will inject a #GP into the guest when the VCPU is +executed again. + +For KVM_EXIT_X86_WRMSR, the "index" field tells user space which MSR the guest +wants to write. Once finished processing the event, user space must continue +vCPU execution. If the MSR write was unsuccessful, user space also sets the +"error" field to "1". + :: /* Fix the size of the union. */ @@ -5855,6 +5891,28 @@ controlled by the kvm module parameter halt_poll_ns. This capability allows the maximum halt time to specified on a per-VM basis, effectively overriding the module parameter for the target VM. +7.21 KVM_CAP_X86_USER_SPACE_MSR +------------------------------- + +:Architectures: x86 +:Target: VM +:Parameters: args[0] contains the mask of KVM_MSR_EXIT_REASON_* events to report +:Returns: 0 on success; -1 on error + +This capability enables trapping of #GP invoking RDMSR and WRMSR instructions +into user space. + +When a guest requests to read or write an MSR, KVM may not implement all MSRs +that are relevant to a respective system. It also does not differentiate by +CPU type. + +To allow more fine grained control over MSR handling, user space may enable +this capability. With it enabled, MSR accesses that match the mask specified in +args[0] and trigger a #GP event inside the guest by KVM will instead trigger +KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space +can then handle to implement model specific MSR handling and/or user notifications +to inform a user that an MSR was not handled. + 8. 
Other capabilities. ====================== @@ -6196,3 +6254,14 @@ distribution...) If this capability is available, then the CPNC and CPVC can be synchronized between KVM and userspace via the sync regs mechanism (KVM_SYNC_DIAG318). + +8.26 KVM_CAP_X86_USER_SPACE_MSR +------------------------------- + +:Architectures: x86 + +This capability indicates that KVM supports deflection of MSR reads and +writes to user space. It can be enabled on a VM level. If enabled, MSR +accesses that would usually trigger a #GP by KVM into the guest will +instead get bounced to user space through the KVM_EXIT_X86_RDMSR and +KVM_EXIT_X86_WRMSR exit notifications. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5d4c39c37390..dd2665504dc0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -961,6 +961,9 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ + u32 user_space_msr_mask; + struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 85111cd0adcd..0cc0db500f71 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3701,11 +3701,18 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt) static int em_wrmsr(struct x86_emulate_ctxt *ctxt) { + u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; + int r; msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); - if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data)) + r = ctxt->ops->set_msr(ctxt, msr_index, msr_data); + + if (r == X86EMUL_IO_NEEDED) + return r; + + if (r) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -3713,9 +3720,16 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt) static int em_rdmsr(struct x86_emulate_ctxt *ctxt) { + u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; + int r; - if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data)) + r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data); + + if (r == X86EMUL_IO_NEEDED) + return r; + + if (r) return emulate_gp(ctxt, 0); *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43173382f02f..af6d008145cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1590,12 +1590,89 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_msr); +static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read) +{ + if (vcpu->run->msr.error) { + kvm_inject_gp(vcpu, 0); + return 1; + } else if (is_read) { + kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); + kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); + } + + return kvm_skip_emulated_instruction(vcpu); +} + +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, true); +} + +static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, false); +} + +static u64 kvm_msr_reason(int r) +{ + switch (r) { + case -ENOENT: + return KVM_MSR_EXIT_REASON_UNKNOWN; + default: + return KVM_MSR_EXIT_REASON_INVAL; + } +} + +static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, + u32 exit_reason, u64 data, + int (*completion)(struct kvm_vcpu *vcpu), + int r) +{ + u64 msr_reason = kvm_msr_reason(r); + + /* Check if the user wanted to know about this MSR fault */ + if 
(!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) + return 0; + + vcpu->run->exit_reason = exit_reason; + vcpu->run->msr.error = 0; + memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); + vcpu->run->msr.reason = msr_reason; + vcpu->run->msr.index = index; + vcpu->run->msr.data = data; + vcpu->arch.complete_userspace_io = completion; + + return 1; +} + +static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r); +} + +static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_wrmsr, r); +} + int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; + int r; - if (kvm_get_msr(vcpu, ecx, &data)) { + r = kvm_get_msr(vcpu, ecx, &data); + + /* MSR read failed? See if we should ask user space */ + if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { + /* Bounce to user space */ + return 0; + } + + /* MSR read failed? Inject a #GP */ + if (r) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; @@ -1613,8 +1690,18 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data = kvm_read_edx_eax(vcpu); + int r; - if (kvm_set_msr(vcpu, ecx, data)) { + r = kvm_set_msr(vcpu, ecx, data); + + /* MSR write failed? See if we should ask user space */ + if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) { + /* Bounce to user space */ + return 0; + } + + /* MSR write failed? Inject a #GP */ + if (r) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -3526,6 +3613,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXCEPTION_PAYLOAD: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: + case KVM_CAP_X86_USER_SPACE_MSR: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -5046,6 +5134,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_USER_SPACE_MSR: + kvm->arch.user_space_msr_mask = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -6378,13 +6470,33 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_get_msr(vcpu, msr_index, pdata); + + if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_set_msr(vcpu, msr_index, data); + + if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 9417a34aad08..26cfb0fa8e7e 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -17,7 +17,7 @@ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ - ERSN(HYPERV), ERSN(ARM_NISV) + ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), 
ERSN(X86_WRMSR) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7d8eced6f459..31292a3cdfc2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -248,6 +248,8 @@ struct kvm_hyperv_exit { #define KVM_EXIT_IOAPIC_EOI 26 #define KVM_EXIT_HYPERV 27 #define KVM_EXIT_ARM_NISV 28 +#define KVM_EXIT_X86_RDMSR 29 +#define KVM_EXIT_X86_WRMSR 30 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -413,6 +415,16 @@ struct kvm_run { __u64 esr_iss; __u64 fault_ipa; } arm_nisv; + /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ + struct { + __u8 error; /* user -> kernel */ + __u8 pad[7]; +#define KVM_MSR_EXIT_REASON_INVAL (1 << 0) +#define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1) + __u32 reason; /* kernel -> user */ + __u32 index; /* kernel -> user */ + __u64 data; /* kernel <-> user */ + } msr; /* Fix the size of the union. */ char padding[256]; }; @@ -1037,6 +1049,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SMALLER_MAXPHYADDR 185 #define KVM_CAP_S390_DIAG318 186 #define KVM_CAP_STEAL_TIME 187 +#define KVM_CAP_X86_USER_SPACE_MSR 188 #ifdef KVM_CAP_IRQ_ROUTING From 51de8151bd21bae0ce76f604eec6a7176f7093b3 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 25 Sep 2020 16:34:17 +0200 Subject: [PATCH 205/497] KVM: x86: Add infrastructure for MSR filtering In the following commits we will add pieces of MSR filtering. To ensure that code compiles even with the feature half-merged, let's add a few stubs and struct definitions before the real patches start. Signed-off-by: Alexander Graf Message-Id: <20200925143422.21718-4-graf@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/uapi/asm/kvm.h | 2 ++ arch/x86/kvm/x86.c | 6 ++++++ arch/x86/kvm/x86.h | 1 + 4 files changed, 10 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd2665504dc0..f4a2443219bc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1235,6 +1235,7 @@ struct kvm_x86_ops { int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); + void (*msr_filter_changed)(struct kvm_vcpu *vcpu); }; struct kvm_x86_nested_ops { diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 0780f97c1850..c2fd0aa2f587 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -192,6 +192,8 @@ struct kvm_msr_list { __u32 indices[0]; }; +#define KVM_MSR_FILTER_READ (1 << 0) +#define KVM_MSR_FILTER_WRITE (1 << 1) struct kvm_cpuid_entry { __u32 function; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index af6d008145cd..60219882fee2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1488,6 +1488,12 @@ void kvm_enable_efer_bits(u64 mask) } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); +bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) +{ + return true; +} +EXPORT_SYMBOL_GPL(kvm_msr_allowed); + /* * Write @data into the MSR specified by @index. Select MSR specific fault * checks are bypassed if @host_initiated is %true. 
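
Before the filtering plumbing continues, it is worth showing how user space opts into the MSR deflection added earlier in this series by KVM_CAP_X86_USER_SPACE_MSR: a plain KVM_ENABLE_CAP vm ioctl with the desired KVM_MSR_EXIT_REASON_* mask in args[0]. A minimal, illustrative sketch, not part of these patches (helper name is arbitrary, error handling omitted):

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /* Ask KVM to bounce accesses to MSRs it does not know about to user space. */
  static int enable_user_space_msr(int vm_fd)
  {
          struct kvm_enable_cap cap = {
                  .cap = KVM_CAP_X86_USER_SPACE_MSR,
                  .args[0] = KVM_MSR_EXIT_REASON_UNKNOWN,
          };

          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }
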
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 941f288c38aa..3900ab0c6004 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -374,6 +374,7 @@ bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); +bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); #define KVM_MSR_RET_INVALID 2 From 476c9bd8e997b495524500cd82471e59b3aac20e Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Fri, 25 Sep 2020 16:34:18 +0200 Subject: [PATCH 206/497] KVM: x86: Prepare MSR bitmaps for userspace tracked MSRs Prepare vmx and svm for a subsequent change that ensures the MSR permission bitmap is set to allow an MSR that userspace is tracking to force a vmx_vmexit in the guest. Signed-off-by: Aaron Lewis Reviewed-by: Oliver Upton [agraf: rebase, adapt SVM scheme to nested changes that came in between] Signed-off-by: Alexander Graf Message-Id: <20200925143422.21718-5-graf@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 60 ++++++++++++++++------------ arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 83 +++++++++++++++++++-------------------- arch/x86/kvm/vmx/vmx.h | 2 +- 4 files changed, 77 insertions(+), 70 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ecefec42439f..bfb3330cd580 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -564,7 +564,7 @@ static bool valid_msr_intercept(u32 index) return false; } -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) { u8 bit_write; unsigned long tmp; @@ -583,7 +583,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr) return !!test_bit(bit_write, &tmp); } -static void set_msr_interception(u32 *msrpm, unsigned msr, +static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write) { u8 bit_read, bit_write; @@ -609,11 +609,10 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, msrpm[offset] = tmp; } -static u32 *svm_vcpu_init_msrpm(void) +static u32 *svm_vcpu_alloc_msrpm(void) { - int i; - u32 *msrpm; struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); + u32 *msrpm; if (!pages) return NULL; @@ -621,12 +620,18 @@ static u32 *svm_vcpu_init_msrpm(void) msrpm = page_address(pages); memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + return msrpm; +} + +static void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) +{ + int i; + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { if (!direct_access_msrs[i].always) continue; - set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); + set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1); } - return msrpm; } static void svm_vcpu_free_msrpm(u32 *msrpm) @@ -677,26 +682,26 @@ static void init_msrpm_offsets(void) } } -static void svm_enable_lbrv(struct vcpu_svm *svm) +static void svm_enable_lbrv(struct kvm_vcpu *vcpu) { - u32 *msrpm = svm->msrpm; + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); + 
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } -static void svm_disable_lbrv(struct vcpu_svm *svm) +static void svm_disable_lbrv(struct kvm_vcpu *vcpu) { - u32 *msrpm = svm->msrpm; + struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } void disable_nmi_singlestep(struct vcpu_svm *svm) @@ -1230,14 +1235,19 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->nested.hsave = page_address(hsave_page); - svm->msrpm = svm_vcpu_init_msrpm(); + svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) goto error_free_hsave_page; - svm->nested.msrpm = svm_vcpu_init_msrpm(); + svm_vcpu_init_msrpm(vcpu, svm->msrpm); + + svm->nested.msrpm = svm_vcpu_alloc_msrpm(); if (!svm->nested.msrpm) goto error_free_msrpm; + /* We only need the L1 pass-through MSR state, so leave vcpu as NULL */ + svm_vcpu_init_msrpm(vcpu, svm->nested.msrpm); + svm->vmcb = page_address(vmcb_page); svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT); svm->asid_generation = 0; @@ -2574,7 +2584,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * We update the L1 MSR bit as well since it will end up * touching the MSR anyway now. 
*/ - set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && @@ -2589,7 +2599,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && @@ -2653,9 +2663,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->vmcb->save.dbgctl = data; vmcb_mark_dirty(svm->vmcb, VMCB_LBR); if (data & (1ULL<<0)) - svm_enable_lbrv(svm); + svm_enable_lbrv(vcpu); else - svm_disable_lbrv(svm); + svm_disable_lbrv(vcpu); break; case MSR_VM_HSAVE_PA: svm->nested.hsave_msr = data; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index feb26457d7b2..91ce6e642db2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4776,7 +4776,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (vmx_pt_mode_is_host_guest()) { vmx->pt_desc.guest.ctl = 0; - pt_update_intercept_for_msr(vmx); + pt_update_intercept_for_msr(vcpu); } return 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 58fb403ccd1c..b2307c115268 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -338,7 +338,7 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = { module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); static u32 vmx_segment_access_rights(struct kvm_segment *var); -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); void vmx_vmexit(void); @@ -1980,7 +1980,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * in the merging. We update the vmcs01 here for L1 as well * since it will end up touching the MSR anyway now. */ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; @@ -2016,8 +2016,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. 
*/ - vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, - MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W); break; case MSR_IA32_CR_PAT: if (!kvm_pat_valid(data)) @@ -2067,7 +2066,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_IA32_RTIT_CTL, data); vmx->pt_desc.guest.ctl = data; - pt_update_intercept_for_msr(vmx); + pt_update_intercept_for_msr(vcpu); break; case MSR_IA32_RTIT_STATUS: if (!pt_can_write_msr(vmx)) @@ -3584,9 +3583,11 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) @@ -3622,9 +3623,11 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit } } -static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, +static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) @@ -3660,13 +3663,13 @@ static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitm } } -static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type, bool value) +static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, + u32 msr, int type, bool value) { if (value) - vmx_enable_intercept_for_msr(msr_bitmap, msr, type); + vmx_enable_intercept_for_msr(vcpu, msr, type); else - vmx_disable_intercept_for_msr(msr_bitmap, msr, type); + vmx_disable_intercept_for_msr(vcpu, msr, type); } static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) @@ -3684,8 +3687,8 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) return mode; } -static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, - u8 mode) +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, + unsigned long *msr_bitmap, u8 mode) { int msr; @@ -3700,11 +3703,11 @@ static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, * TPR reads and writes can be virtualized even if virtual interrupt * delivery is not in use. 
*/ - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { - vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } } @@ -3720,30 +3723,24 @@ void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) return; if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); + vmx_update_msr_bitmap_x2apic(vcpu, msr_bitmap, mode); vmx->msr_bitmap_mode = mode; } -void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + struct vcpu_vmx *vmx = to_vmx(vcpu); bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); u32 i; - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, - MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, - MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); for (i = 0; i < vmx->pt_desc.addr_range; i++) { - vmx_set_intercept_for_msr(msr_bitmap, - MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); - vmx_set_intercept_for_msr(msr_bitmap, - MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); } } @@ -6753,18 +6750,18 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_pml; msr_bitmap = vmx->vmcs01.msr_bitmap; - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); if (kvm_cstate_in_guest(vcpu->kvm)) { - vmx_disable_intercept_for_msr(msr_bitmap, 
MSR_CORE_C1_RES, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } vmx->msr_bitmap_mode = 0; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index fc63ff43929f..64fe6c91435f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -330,7 +330,7 @@ bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr); -void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); +void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); From fd6fa73d13377f2bff6ed668c99ca76adcda1336 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 25 Sep 2020 16:34:19 +0200 Subject: [PATCH 207/497] KVM: x86: SVM: Prevent MSR passthrough when MSR access is denied We will introduce the concept of MSRs that may not be handled in kernel space soon. Some MSRs are directly passed through to the guest, effectively making them handled by KVM from user space's point of view. This patch introduces all logic required to ensure that MSRs that user space wants trapped are not marked as direct access for guests. Signed-off-by: Alexander Graf Message-Id: <20200925143422.21718-6-graf@amazon.com> [Make terminology a bit more similar to VMX. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 77 +++++++++++++++++++++++++++++++++++++----- arch/x86/kvm/svm/svm.h | 7 ++++ 2 files changed, 76 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bfb3330cd580..4f401fc6a05d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -91,7 +91,7 @@ static DEFINE_PER_CPU(u64, current_tsc_ratio); static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is always on */ -} direct_access_msrs[] = { +} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = { { .index = MSR_STAR, .always = true }, { .index = MSR_IA32_SYSENTER_CS, .always = true }, #ifdef CONFIG_X86_64 @@ -553,15 +553,41 @@ free_cpu_data: } -static bool valid_msr_intercept(u32 index) +static int direct_access_msr_slot(u32 msr) { - int i; + u32 i; for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) - if (direct_access_msrs[i].index == index) - return true; + if (direct_access_msrs[i].index == msr) + return i; - return false; + return -ENOENT; +} + +static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read, + int write) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int slot = direct_access_msr_slot(msr); + + if (slot == -ENOENT) + return; + + /* Set the shadow bitmaps to the desired intercept states */ + if (read) + set_bit(slot, svm->shadow_msr_intercept.read); + else + clear_bit(slot, svm->shadow_msr_intercept.read); + + if (write) + set_bit(slot, svm->shadow_msr_intercept.write); + else + clear_bit(slot, svm->shadow_msr_intercept.write); +} + +static bool valid_msr_intercept(u32 index) +{ + return direct_access_msr_slot(index) != -ENOENT; } static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) @@ -583,8 +609,8 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) return !!test_bit(bit_write, &tmp); } -static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, - int read, int write) +static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, + u32 msr, int read, int write) { u8 bit_read, bit_write; unsigned long tmp; @@ -596,6 +622,13 @@ static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, */ WARN_ON(!valid_msr_intercept(msr)); + /* Enforce non allowed MSRs to trap */ + if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) + read = 0; + + if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) + write = 0; + offset = svm_msrpm_offset(msr); bit_read = 2 * (msr & 0x0f); bit_write = 2 * (msr & 0x0f) + 1; @@ -609,6 +642,13 @@ static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, msrpm[offset] = tmp; } +static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, + int read, int write) +{ + set_shadow_msr_intercept(vcpu, msr, read, write); + set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); +} + static u32 *svm_vcpu_alloc_msrpm(void) { struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); @@ -639,6 +679,25 @@ static void svm_vcpu_free_msrpm(u32 *msrpm) __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER); } +static void svm_msr_filter_changed(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u32 i; + + /* + * Set intercept permissions for all direct access MSRs again. They + * will automatically get filtered through the MSR filter, so we are + * back in sync after this. 
+ */ + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { + u32 msr = direct_access_msrs[i].index; + u32 read = test_bit(i, svm->shadow_msr_intercept.read); + u32 write = test_bit(i, svm->shadow_msr_intercept.write); + + set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write); + } +} + static void add_msr_offset(u32 offset) { int i; @@ -4222,6 +4281,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .can_emulate_instruction = svm_can_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, + + .msr_filter_changed = svm_msr_filter_changed, }; static struct kvm_x86_init_ops svm_init_ops __initdata = { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index bb3bbc87d3ff..a7f997459b87 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -31,6 +31,7 @@ static const u32 host_save_user_msrs[] = { #define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) +#define MAX_DIRECT_ACCESS_MSRS 15 #define MSRPM_OFFSETS 16 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -157,6 +158,12 @@ struct vcpu_svm { */ struct list_head ir_list; spinlock_t ir_list_lock; + + /* Save desired MSR intercept (read: pass-through) state */ + struct { + DECLARE_BITMAP(read, MAX_DIRECT_ACCESS_MSRS); + DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); + } shadow_msr_intercept; }; struct svm_cpu_data { From 3eb900173c71392087f4b0ada66f67ceae7e75f0 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 25 Sep 2020 16:34:20 +0200 Subject: [PATCH 208/497] KVM: x86: VMX: Prevent MSR passthrough when MSR access is denied We will introduce the concept of MSRs that may not be handled in kernel space soon. Some MSRs are directly passed through to the guest, effectively making them handled by KVM from user space's point of view. This patch introduces all logic required to ensure that MSRs that user space wants trapped are not marked as direct access for guests. Signed-off-by: Alexander Graf Message-Id: <20200925143422.21718-7-graf@amazon.com> [Replace "_idx" with "_slot". - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 230 +++++++++++++++++++++++++++++++---------- arch/x86/kvm/vmx/vmx.h | 7 ++ 2 files changed, 183 insertions(+), 54 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b2307c115268..4551a7e80ebc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -145,6 +145,26 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ RTIT_STATUS_BYTECNT)) +/* + * List of MSRs that can be directly passed to the guest. + * In addition to these x2apic and PT MSRs are handled specially. 
+ */ +static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { + MSR_IA32_SPEC_CTRL, + MSR_IA32_PRED_CMD, + MSR_IA32_TSC, + MSR_FS_BASE, + MSR_GS_BASE, + MSR_KERNEL_GS_BASE, + MSR_IA32_SYSENTER_CS, + MSR_IA32_SYSENTER_ESP, + MSR_IA32_SYSENTER_EIP, + MSR_CORE_C1_RES, + MSR_CORE_C3_RESIDENCY, + MSR_CORE_C6_RESIDENCY, + MSR_CORE_C7_RESIDENCY, +}; + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -611,6 +631,41 @@ static inline bool report_flexpriority(void) return flexpriority_enabled; } +static int possible_passthrough_msr_slot(u32 msr) +{ + u32 i; + + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) + if (vmx_possible_passthrough_msrs[i] == msr) + return i; + + return -ENOENT; +} + +static bool is_valid_passthrough_msr(u32 msr) +{ + bool r; + + switch (msr) { + case 0x800 ... 0x8ff: + /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ + return true; + case MSR_IA32_RTIT_STATUS: + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + case MSR_IA32_RTIT_CR3_MATCH: + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ + return true; + } + + r = possible_passthrough_msr_slot(msr) != -ENOENT; + + WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); + + return r; +} + static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -3583,12 +3638,51 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } +static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __clear_bit(msr, msr_bitmap + 0x000 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); +} + +static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __clear_bit(msr, msr_bitmap + 0x800 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); +} + +static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __set_bit(msr, msr_bitmap + 0x000 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); +} + +static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr) +{ + int f = sizeof(unsigned long); + + if (msr <= 0x1fff) + __set_bit(msr, msr_bitmap + 0x800 / f); + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) + __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); +} + static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; - int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) return; @@ -3597,30 +3691,37 @@ static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, evmcs_touch_msr_bitmap(); /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
- */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __clear_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __clear_bit(msr, msr_bitmap + 0x400 / f); - - if (type & MSR_TYPE_W) - /* write-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); + * Mark the desired intercept state in shadow bitmap, this is needed + * for resync when the MSR filters change. + */ + if (is_valid_passthrough_msr(msr)) { + int idx = possible_passthrough_msr_slot(msr); + if (idx != -ENOENT) { + if (type & MSR_TYPE_R) + clear_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + clear_bit(idx, vmx->shadow_msr_intercept.write); + } } + + if ((type & MSR_TYPE_R) && + !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { + vmx_set_msr_bitmap_read(msr_bitmap, msr); + type &= ~MSR_TYPE_R; + } + + if ((type & MSR_TYPE_W) && + !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { + vmx_set_msr_bitmap_write(msr_bitmap, msr); + type &= ~MSR_TYPE_W; + } + + if (type & MSR_TYPE_R) + vmx_clear_msr_bitmap_read(msr_bitmap, msr); + + if (type & MSR_TYPE_W) + vmx_clear_msr_bitmap_write(msr_bitmap, msr); } static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, @@ -3628,7 +3729,6 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; - int f = sizeof(unsigned long); if (!cpu_has_vmx_msr_bitmap()) return; @@ -3637,30 +3737,25 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, evmcs_touch_msr_bitmap(); /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __set_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __set_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __set_bit(msr, msr_bitmap + 0x400 / f); - - if (type & MSR_TYPE_W) - /* write-high */ - __set_bit(msr, msr_bitmap + 0xc00 / f); + * Mark the desired intercept state in shadow bitmap, this is needed + * for resync when the MSR filter changes. + */ + if (is_valid_passthrough_msr(msr)) { + int idx = possible_passthrough_msr_slot(msr); + if (idx != -ENOENT) { + if (type & MSR_TYPE_R) + set_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + set_bit(idx, vmx->shadow_msr_intercept.write); + } } + + if (type & MSR_TYPE_R) + vmx_set_msr_bitmap_read(msr_bitmap, msr); + + if (type & MSR_TYPE_W) + vmx_set_msr_bitmap_write(msr_bitmap, msr); } static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, @@ -3687,15 +3782,14 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) return mode; } -static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, - unsigned long *msr_bitmap, u8 mode) +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) { int msr; - for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { - unsigned word = msr / BITS_PER_LONG; - msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 
0 : ~0; - msr_bitmap[word + (0x800 / sizeof(long))] = ~0; + for (msr = 0x800; msr <= 0x8ff; msr++) { + bool intercepted = !!(mode & MSR_BITMAP_MODE_X2APIC_APICV); + + vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_RW, intercepted); } if (mode & MSR_BITMAP_MODE_X2APIC) { @@ -3715,7 +3809,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; u8 mode = vmx_msr_bitmap_mode(vcpu); u8 changed = mode ^ vmx->msr_bitmap_mode; @@ -3723,7 +3816,7 @@ void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) return; if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(vcpu, msr_bitmap, mode); + vmx_update_msr_bitmap_x2apic(vcpu, mode); vmx->msr_bitmap_mode = mode; } @@ -3764,6 +3857,29 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) return ((rvi & 0xf0) > (vppr & 0xf0)); } +static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 i; + + /* + * Set intercept permissions for all potentially passed through MSRs + * again. They will automatically get filtered through the MSR filter, + * so we are back in sync after this. + */ + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { + u32 msr = vmx_possible_passthrough_msrs[i]; + bool read = test_bit(i, vmx->shadow_msr_intercept.read); + bool write = test_bit(i, vmx->shadow_msr_intercept.write); + + vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read); + vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write); + } + + pt_update_intercept_for_msr(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu)); +} + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, bool nested) { @@ -6749,6 +6865,10 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) if (err < 0) goto free_pml; + /* The MSR bitmap starts with all ones */ + bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); + bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); + msr_bitmap = vmx->vmcs01.msr_bitmap; vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); @@ -7563,6 +7683,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .can_emulate_instruction = vmx_can_emulate_instruction, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, .migrate_timers = vmx_migrate_timers, + + .msr_filter_changed = vmx_msr_filter_changed, }; static __init int hardware_setup(void) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 64fe6c91435f..5961cb897125 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -279,6 +279,13 @@ struct vcpu_vmx { u64 ept_pointer; struct pt_desc pt_desc; + + /* Save desired MSR intercept (read: pass-through) state */ +#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13 + struct { + DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); + DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); + } shadow_msr_intercept; }; enum ept_pointers_status { From 1a155254ff937ac92cf9940d273ea597b2c667a2 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 25 Sep 2020 16:34:21 +0200 Subject: [PATCH 209/497] KVM: x86: Introduce MSR filtering It's not desireable to have all MSRs always handled by KVM kernel space. 
Some MSRs would be useful to handle in user space to either emulate behavior (like uCode updates) or differentiate whether they are valid based on the CPU model. To allow user space to specify which MSRs it wants to see handled by KVM, this patch introduces a new ioctl to push filter rules with bitmaps into KVM. Based on these bitmaps, KVM can then decide whether to reject MSR access. With the addition of KVM_CAP_X86_USER_SPACE_MSR it can also deflect the denied MSR events to user space to operate on. If no filter is populated, MSR handling stays identical to before. Signed-off-by: Alexander Graf Message-Id: <20200925143422.21718-8-graf@amazon.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 108 ++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 14 +++ arch/x86/include/uapi/asm/kvm.h | 18 ++++ arch/x86/kvm/x86.c | 145 +++++++++++++++++++++++++++++++- include/uapi/linux/kvm.h | 5 ++ 5 files changed, 289 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 4fdba43d83e8..425325ff4434 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4707,6 +4707,99 @@ KVM_PV_VM_VERIFY Verify the integrity of the unpacked image. Only if this succeeds, KVM is allowed to start protected VCPUs. +4.126 KVM_X86_SET_MSR_FILTER +---------------------------- + +:Capability: KVM_X86_SET_MSR_FILTER +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_msr_filter +:Returns: 0 on success, < 0 on error + +:: + + struct kvm_msr_filter_range { + #define KVM_MSR_FILTER_READ (1 << 0) + #define KVM_MSR_FILTER_WRITE (1 << 1) + __u32 flags; + __u32 nmsrs; /* number of msrs in bitmap */ + __u32 base; /* MSR index the bitmap starts at */ + __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */ + }; + + #define KVM_MSR_FILTER_MAX_RANGES 16 + struct kvm_msr_filter { + #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) + #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) + __u32 flags; + struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; + }; + +flags values for struct kvm_msr_filter_range: + +KVM_MSR_FILTER_READ + + Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap + indicates that a read should immediately fail, while a 1 indicates that + a read for a particular MSR should be handled regardless of the default + filter action. + +KVM_MSR_FILTER_WRITE + + Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap + indicates that a write should immediately fail, while a 1 indicates that + a write for a particular MSR should be handled regardless of the default + filter action. + +KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE + + Filter both read and write accesses to MSRs using the given bitmap. A 0 + in the bitmap indicates that both reads and writes should immediately fail, + while a 1 indicates that reads and writes for a particular MSR are not + filtered by this range. + +flags values for struct kvm_msr_filter: + +KVM_MSR_FILTER_DEFAULT_ALLOW + + If no filter range matches an MSR index that is getting accessed, KVM will + fall back to allowing access to the MSR. + +KVM_MSR_FILTER_DEFAULT_DENY + + If no filter range matches an MSR index that is getting accessed, KVM will + fall back to rejecting access to the MSR. In this mode, all MSRs that should + be processed by KVM need to explicitly be marked as allowed in the bitmaps. 
+ +This ioctl allows user space to define up to 16 bitmaps of MSR ranges to +specify whether a certain MSR access should be explicitly filtered for or not. + +If this ioctl has never been invoked, MSR accesses are not guarded and the +old KVM in-kernel emulation behavior is fully preserved. + +As soon as the filtering is in place, every MSR access is processed through +the filtering. If a bit is within one of the defined ranges, read and write +accesses are guarded by the bitmap's value for the MSR index. If it is not +defined in any range, whether MSR access is rejected is determined by the flags +field in the kvm_msr_filter struct: KVM_MSR_FILTER_DEFAULT_ALLOW and +KVM_MSR_FILTER_DEFAULT_DENY. + +Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR +filtering. In that mode, KVM_MSR_FILTER_DEFAULT_DENY no longer has any effect. + +Each bitmap range specifies a range of MSRs to potentially allow access on. +The range goes from MSR index [base .. base+nmsrs]. The flags field +indicates whether reads, writes or both reads and writes are filtered +by setting a 1 bit in the bitmap for the corresponding MSR index. + +If an MSR access is not permitted through the filtering, it generates a +#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, that +allows user space to deflect and potentially handle various MSR accesses +into user space. + +If a vCPU is in running state while this ioctl is invoked, the vCPU may +experience inconsistent filtering behavior on MSR accesses. + 5. The kvm_run structure ======================== @@ -5187,6 +5280,7 @@ ENABLE_CAP. Currently valid exit reasons are: KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits + KVM_MSR_EXIT_REASON_FILTER - access blocked by KVM_X86_SET_MSR_FILTER For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest wants to read. To respond to this request with a successful read, user space @@ -6265,3 +6359,17 @@ writes to user space. It can be enabled on a VM level. If enabled, MSR accesses that would usually trigger a #GP by KVM into the guest will instead get bounced to user space through the KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications. + +8.25 KVM_X86_SET_MSR_FILTER +--------------------------- + +:Architectures: x86 + +This capability indicates that KVM supports that accesses to user defined MSRs +may be rejected. With this capability exposed, KVM exports new VM ioctl +KVM_X86_SET_MSR_FILTER which user space can call to specify bitmaps of MSR +ranges that KVM should reject access to. + +In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to +trap and emulate MSRs that are outside of the scope of KVM as well as +limit the attack surface on KVM's MSR emulation code. 
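
To make the new ioctl concrete, the following is a minimal, illustrative sketch of how a VMM could install a deny-by-default filter that still permits access to a single example MSR. It is not taken from these patches; the helper name and the chosen MSR index (0x1fc, IA32_POWER_CTL) are arbitrary:

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  static int install_msr_filter(int vm_fd)
  {
          /* One bit per MSR starting at "base"; a set bit allows the access. */
          __u8 allow_bitmap[1] = { 0x01 };        /* allow base + 0 only */
          struct kvm_msr_filter filter = {
                  .flags = KVM_MSR_FILTER_DEFAULT_DENY,
          };

          filter.ranges[0] = (struct kvm_msr_filter_range) {
                  .flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
                  .base = 0x1fc,          /* example: IA32_POWER_CTL */
                  .nmsrs = 1,
                  .bitmap = allow_bitmap,
          };

          /* Every other MSR access is now rejected by KVM, or deflected to
           * user space if KVM_CAP_X86_USER_SPACE_MSR was enabled with
           * KVM_MSR_EXIT_REASON_FILTER in its mask. */
          return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
  }

A real VMM running in deny-by-default mode would of course have to allow many more ranges; the sketch only demonstrates the shape of the call.
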
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f4a2443219bc..dc7a58b39faf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -87,6 +87,7 @@ #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) +#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -860,6 +861,13 @@ struct kvm_hv { struct kvm_hv_syndbg hv_syndbg; }; +struct msr_bitmap_range { + u32 flags; + u32 nmsrs; + u32 base; + unsigned long *bitmap; +}; + enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ @@ -964,6 +972,12 @@ struct kvm_arch { /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; + struct { + u8 count; + bool default_allow:1; + struct msr_bitmap_range ranges[16]; + } msr_filter; + struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; }; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index c2fd0aa2f587..89e5f3d1bba8 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -192,8 +192,26 @@ struct kvm_msr_list { __u32 indices[0]; }; +/* Maximum size of any access bitmap in bytes */ +#define KVM_MSR_FILTER_MAX_BITMAP_SIZE 0x600 + +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range { #define KVM_MSR_FILTER_READ (1 << 0) #define KVM_MSR_FILTER_WRITE (1 << 1) + __u32 flags; + __u32 nmsrs; /* number of msrs in bitmap */ + __u32 base; /* MSR index the bitmap starts at */ + __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */ +}; + +#define KVM_MSR_FILTER_MAX_RANGES 16 +struct kvm_msr_filter { +#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) +#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) + __u32 flags; + struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; struct kvm_cpuid_entry { __u32 function; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 60219882fee2..72f91f3640f3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1490,7 +1490,35 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { - return true; + struct kvm *kvm = vcpu->kvm; + struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; + u32 count = kvm->arch.msr_filter.count; + u32 i; + bool r = kvm->arch.msr_filter.default_allow; + int idx; + + /* MSR filtering not set up, allow everything */ + if (!count) + return true; + + /* Prevent collision with set_msr_filter */ + idx = srcu_read_lock(&kvm->srcu); + + for (i = 0; i < count; i++) { + u32 start = ranges[i].base; + u32 end = start + ranges[i].nmsrs; + u32 flags = ranges[i].flags; + unsigned long *bitmap = ranges[i].bitmap; + + if ((index >= start) && (index < end) && (flags & type)) { + r = !!test_bit(index - start, bitmap); + break; + } + } + + srcu_read_unlock(&kvm->srcu, idx); + + return r; } EXPORT_SYMBOL_GPL(kvm_msr_allowed); @@ -1505,6 +1533,9 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, { struct msr_data msr; + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return -EPERM; + switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: @@ -1561,6 +1592,9 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, struct msr_data msr; int ret; + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) 
+ return -EPERM; + msr.index = index; msr.host_initiated = host_initiated; @@ -1624,6 +1658,8 @@ static u64 kvm_msr_reason(int r) switch (r) { case -ENOENT: return KVM_MSR_EXIT_REASON_UNKNOWN; + case -EPERM: + return KVM_MSR_EXIT_REASON_FILTER; default: return KVM_MSR_EXIT_REASON_INVAL; } @@ -3620,6 +3656,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: + case KVM_CAP_X86_MSR_FILTER: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -5151,6 +5188,103 @@ split_irqchip_unlock: return r; } +static void kvm_clear_msr_filter(struct kvm *kvm) +{ + u32 i; + u32 count = kvm->arch.msr_filter.count; + struct msr_bitmap_range ranges[16]; + + mutex_lock(&kvm->lock); + kvm->arch.msr_filter.count = 0; + memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0])); + mutex_unlock(&kvm->lock); + synchronize_srcu(&kvm->srcu); + + for (i = 0; i < count; i++) + kfree(ranges[i].bitmap); +} + +static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range) +{ + struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; + struct msr_bitmap_range range; + unsigned long *bitmap = NULL; + size_t bitmap_size; + int r; + + if (!user_range->nmsrs) + return 0; + + bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); + if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) + return -EINVAL; + + bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + range = (struct msr_bitmap_range) { + .flags = user_range->flags, + .base = user_range->base, + .nmsrs = user_range->nmsrs, + .bitmap = bitmap, + }; + + if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) { + r = -EINVAL; + goto err; + } + + if (!range.flags) { + r = -EINVAL; + goto err; + } + + /* Everything ok, add this range identifier to our global pool */ + ranges[kvm->arch.msr_filter.count] = range; + /* Make sure we filled the array before we tell anyone to walk it */ + smp_wmb(); + kvm->arch.msr_filter.count++; + + return 0; +err: + kfree(bitmap); + return r; +} + +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) +{ + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter filter; + bool default_allow; + int r = 0; + u32 i; + + if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) + return -EFAULT; + + kvm_clear_msr_filter(kvm); + + default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); + kvm->arch.msr_filter.default_allow = default_allow; + + /* + * Protect from concurrent calls to this function that could trigger + * a TOCTOU violation on kvm->arch.msr_filter.count. 
+ */ + mutex_lock(&kvm->lock); + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { + r = kvm_add_msr_filter(kvm, &filter.ranges[i]); + if (r) + break; + } + + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); + mutex_unlock(&kvm->lock); + + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -5457,6 +5591,9 @@ set_pit2_out: case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; + case KVM_X86_SET_MSR_FILTER: + r = kvm_vm_ioctl_set_msr_filter(kvm, argp); + break; default: r = -ENOTTY; } @@ -8611,6 +8748,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_vcpu_update_apicv(vcpu); if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); + if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) + kvm_x86_ops.msr_filter_changed(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -10163,6 +10302,8 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + u32 i; + if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, @@ -10179,6 +10320,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops.vm_destroy) kvm_x86_ops.vm_destroy(kvm); + for (i = 0; i < kvm->arch.msr_filter.count; i++) + kfree(kvm->arch.msr_filter.ranges[i].bitmap); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 31292a3cdfc2..58f43aa1fc21 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -421,6 +421,7 @@ struct kvm_run { __u8 pad[7]; #define KVM_MSR_EXIT_REASON_INVAL (1 << 0) #define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1) +#define KVM_MSR_EXIT_REASON_FILTER (1 << 2) __u32 reason; /* kernel -> user */ __u32 index; /* kernel -> user */ __u64 data; /* kernel <-> user */ @@ -1050,6 +1051,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_DIAG318 186 #define KVM_CAP_STEAL_TIME 187 #define KVM_CAP_X86_USER_SPACE_MSR 188 +#define KVM_CAP_X86_MSR_FILTER 189 #ifdef KVM_CAP_IRQ_ROUTING @@ -1551,6 +1553,9 @@ struct kvm_pv_cmd { /* Available with KVM_CAP_S390_PROTECTED */ #define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd) +/* Available with KVM_CAP_X86_MSR_FILTER */ +#define KVM_X86_SET_MSR_FILTER _IOW(KVMIO, 0xc6, struct kvm_msr_filter) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ From d468706e313ca3ec85b0e6e71a960ee0bbadd9f3 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 25 Sep 2020 16:34:22 +0200 Subject: [PATCH 210/497] KVM: selftests: Add test for user space MSR handling Now that we have the ability to handle MSRs from user space and also to select which ones we do want to prevent in-kernel KVM code from handling, let's add a selftest to show case and verify the API. 
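For context, and purely as an illustration that is not part of this patch, the user-space flow the selftest exercises looks roughly like the sketch below. It assumes vm_fd is an already-created VM file descriptor; the structures and constants are the ones introduced by the uapi changes earlier in this series, and a cleared bitmap bit denies the access:

/*
 * Illustrative sketch only: enable user-space MSR exits and install a
 * filter that denies all reads and writes in the 0xc0000000 MSR range.
 * vm_fd is assumed to be an existing VM file descriptor.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int install_msr_filter(int vm_fd)
{
	static __u8 bitmap[KVM_MSR_FILTER_MAX_BITMAP_SIZE];	/* all zero: deny */
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_USER_SPACE_MSR,
		.args[0] = KVM_MSR_EXIT_REASON_INVAL |
			   KVM_MSR_EXIT_REASON_UNKNOWN |
			   KVM_MSR_EXIT_REASON_FILTER,
	};
	struct kvm_msr_filter filter = {
		.flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
	};

	filter.ranges[0] = (struct kvm_msr_filter_range) {
		.flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
		.base = 0xc0000000,
		.nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * 8,
		.bitmap = bitmap,
	};

	/* Route denied and unhandled MSR accesses to user space ... */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
		return -1;

	/*
	 * ... and install the filter; denied accesses now exit with
	 * KVM_EXIT_X86_RDMSR/KVM_EXIT_X86_WRMSR and reason
	 * KVM_MSR_EXIT_REASON_FILTER.
	 */
	return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
}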
Signed-off-by: Alexander Graf Message-Id: <20200925143422.21718-9-graf@amazon.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/x86_64/user_msr_test.c | 248 ++++++++++++++++++ 3 files changed, 250 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/user_msr_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 452787152748..307ceaadbbb9 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -11,6 +11,7 @@ /x86_64/set_sregs_test /x86_64/smm_test /x86_64/state_test +/x86_64/user_msr_test /x86_64/vmx_preemption_timer_test /x86_64/svm_vmcall_test /x86_64/sync_regs_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 4a166588d99f..80d5c348354c 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -55,6 +55,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test TEST_GEN_PROGS_x86_64 += x86_64/debug_regs +TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test diff --git a/tools/testing/selftests/kvm/x86_64/user_msr_test.c b/tools/testing/selftests/kvm/x86_64/user_msr_test.c new file mode 100644 index 000000000000..cbe1b08890ff --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/user_msr_test.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tests for KVM_CAP_X86_USER_SPACE_MSR and KVM_X86_SET_MSR_FILTER + * + * Copyright (C) 2020, Amazon Inc. + * + * This is a functional test to verify that we can deflect MSR events + * into user space. 
+ */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" + +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 5 + +static u32 msr_reads, msr_writes; + +static u8 bitmap_00000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_00000000_write[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_40000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_c0000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_c0000000_read[KVM_MSR_FILTER_MAX_BITMAP_SIZE]; +static u8 bitmap_deadbeef[1] = { 0x1 }; + +static void deny_msr(uint8_t *bitmap, u32 msr) +{ + u32 idx = msr & (KVM_MSR_FILTER_MAX_BITMAP_SIZE - 1); + + bitmap[idx / 8] &= ~(1 << (idx % 8)); +} + +static void prepare_bitmaps(void) +{ + memset(bitmap_00000000, 0xff, sizeof(bitmap_00000000)); + memset(bitmap_00000000_write, 0xff, sizeof(bitmap_00000000_write)); + memset(bitmap_40000000, 0xff, sizeof(bitmap_40000000)); + memset(bitmap_c0000000, 0xff, sizeof(bitmap_c0000000)); + memset(bitmap_c0000000_read, 0xff, sizeof(bitmap_c0000000_read)); + + deny_msr(bitmap_00000000_write, MSR_IA32_POWER_CTL); + deny_msr(bitmap_c0000000_read, MSR_SYSCALL_MASK); + deny_msr(bitmap_c0000000_read, MSR_GS_BASE); +} + +struct kvm_msr_filter filter = { + .flags = KVM_MSR_FILTER_DEFAULT_DENY, + .ranges = { + { + .flags = KVM_MSR_FILTER_READ, + .base = 0x00000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_00000000, + }, { + .flags = KVM_MSR_FILTER_WRITE, + .base = 0x00000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_00000000_write, + }, { + .flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE, + .base = 0x40000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_40000000, + }, { + .flags = KVM_MSR_FILTER_READ, + .base = 0xc0000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_c0000000_read, + }, { + .flags = KVM_MSR_FILTER_WRITE, + .base = 0xc0000000, + .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE, + .bitmap = bitmap_c0000000, + }, { + .flags = KVM_MSR_FILTER_WRITE | KVM_MSR_FILTER_READ, + .base = 0xdeadbeef, + .nmsrs = 1, + .bitmap = bitmap_deadbeef, + }, + }, +}; + +struct kvm_msr_filter no_filter = { + .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, +}; + +static void guest_msr_calls(bool trapped) +{ + /* This goes into the in-kernel emulation */ + wrmsr(MSR_SYSCALL_MASK, 0); + + if (trapped) { + /* This goes into user space emulation */ + GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) == MSR_SYSCALL_MASK); + GUEST_ASSERT(rdmsr(MSR_GS_BASE) == MSR_GS_BASE); + } else { + GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) != MSR_SYSCALL_MASK); + GUEST_ASSERT(rdmsr(MSR_GS_BASE) != MSR_GS_BASE); + } + + /* If trapped == true, this goes into user space emulation */ + wrmsr(MSR_IA32_POWER_CTL, 0x1234); + + /* This goes into the in-kernel emulation */ + rdmsr(MSR_IA32_POWER_CTL); + + /* Invalid MSR, should always be handled by user space exit */ + GUEST_ASSERT(rdmsr(0xdeadbeef) == 0xdeadbeef); + wrmsr(0xdeadbeef, 0x1234); +} + +static void guest_code(void) +{ + guest_msr_calls(true); + + /* + * Disable msr filtering, so that the kernel + * handles everything in the next round + */ + GUEST_SYNC(0); + + guest_msr_calls(false); + + GUEST_DONE(); +} + +static int handle_ucall(struct kvm_vm *vm) +{ + struct ucall uc; + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("Guest assertion not met"); + break; + case UCALL_SYNC: + 
vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &no_filter); + break; + case UCALL_DONE: + return 1; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + return 0; +} + +static void handle_rdmsr(struct kvm_run *run) +{ + run->msr.data = run->msr.index; + msr_reads++; + + if (run->msr.index == MSR_SYSCALL_MASK || + run->msr.index == MSR_GS_BASE) { + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER, + "MSR read trap w/o access fault"); + } + + if (run->msr.index == 0xdeadbeef) { + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN, + "MSR deadbeef read trap w/o inval fault"); + } +} + +static void handle_wrmsr(struct kvm_run *run) +{ + /* ignore */ + msr_writes++; + + if (run->msr.index == MSR_IA32_POWER_CTL) { + TEST_ASSERT(run->msr.data == 0x1234, + "MSR data for MSR_IA32_POWER_CTL incorrect"); + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER, + "MSR_IA32_POWER_CTL trap w/o access fault"); + } + + if (run->msr.index == 0xdeadbeef) { + TEST_ASSERT(run->msr.data == 0x1234, + "MSR data for deadbeef incorrect"); + TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN, + "deadbeef trap w/o inval fault"); + } +} + +int main(int argc, char *argv[]) +{ + struct kvm_enable_cap cap = { + .cap = KVM_CAP_X86_USER_SPACE_MSR, + .args[0] = KVM_MSR_EXIT_REASON_INVAL | + KVM_MSR_EXIT_REASON_UNKNOWN | + KVM_MSR_EXIT_REASON_FILTER, + }; + struct kvm_vm *vm; + struct kvm_run *run; + int rc; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + run = vcpu_state(vm, VCPU_ID); + + rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR); + TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available"); + vm_enable_cap(vm, &cap); + + rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER); + TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available"); + + prepare_bitmaps(); + vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter); + + while (1) { + rc = _vcpu_run(vm, VCPU_ID); + + TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc); + + switch (run->exit_reason) { + case KVM_EXIT_X86_RDMSR: + handle_rdmsr(run); + break; + case KVM_EXIT_X86_WRMSR: + handle_wrmsr(run); + break; + case KVM_EXIT_IO: + if (handle_ucall(vm)) + goto done; + break; + } + + } + +done: + TEST_ASSERT(msr_reads == 4, "Handled 4 rdmsr in user space"); + TEST_ASSERT(msr_writes == 3, "Handled 3 wrmsr in user space"); + + kvm_vm_free(vm); + + return 0; +} From 729c15c20f1a7c9ad1d09a603ad1aa7fb60b1f88 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 22 Sep 2020 06:53:57 -0400 Subject: [PATCH 211/497] KVM: x86: rename KVM_REQ_GET_VMCS12_PAGES We are going to use it for SVM too, so use a more generic name. 
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/vmx/nested.c | 8 ++++---- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dc7a58b39faf..d0f77235da92 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -80,7 +80,7 @@ #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) -#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) +#define KVM_REQ_GET_NESTED_STATE_PAGES KVM_ARCH_REQ(24) #define KVM_REQ_APICV_UPDATE \ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) @@ -1261,7 +1261,7 @@ struct kvm_x86_nested_ops { int (*set_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state); - bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu); int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int (*enable_evmcs)(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 91ce6e642db2..6eca8a7deed1 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -285,7 +285,7 @@ static void free_nested(struct kvm_vcpu *vcpu) if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; - kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; @@ -3395,7 +3395,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * to nested_get_vmcs12_pages before the next VM-entry. The MSRs * have already been set at vmentry time and should not be reset. */ - kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } /* @@ -6192,7 +6192,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, * restored yet. EVMCS will be mapped from * nested_get_vmcs12_pages(). 
*/ - kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } else { return -EINVAL; } @@ -6573,7 +6573,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .hv_timer_pending = nested_vmx_preemption_timer_pending, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, - .get_vmcs12_pages = nested_get_vmcs12_pages, + .get_nested_state_pages = nested_get_vmcs12_pages, .write_log_dirty = nested_vmx_write_pml_buffer, .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 72f91f3640f3..411f6103532b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8640,8 +8640,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { - if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) { + if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { + if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; goto out; } From a7d5c7ce41ac1e2537d78ddb57ef0ac4f737aa19 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 22 Sep 2020 07:43:14 -0400 Subject: [PATCH 212/497] KVM: nSVM: delay MSR permission processing to first nested VM run Allow userspace to set up the memory map after KVM_SET_NESTED_STATE; to do so, move the call to nested_svm_vmrun_msrpm inside the KVM_REQ_GET_NESTED_STATE_PAGES handler (which is currently not used by nSVM). This is similar to what VMX does already. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index da5e87d002e9..ba50ff6e35c7 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -196,6 +196,20 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) return true; } +static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + if (!nested_svm_vmrun_msrpm(svm)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; + } + + return true; +} + static bool nested_vmcb_check_controls(struct vmcb_control_area *control) { if ((vmcb_is_intercept(control, INTERCEPT_VMRUN)) == 0) @@ -698,6 +712,8 @@ void svm_leave_nested(struct vcpu_svm *svm) copy_vmcb_control_area(&vmcb->control, &hsave->control); nested_svm_uninit_mmu_context(&svm->vcpu); } + + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); } static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) @@ -1142,9 +1158,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, load_nested_vmcb_control(svm, ctl); nested_prepare_vmcb_control(svm); - if (!nested_svm_vmrun_msrpm(svm)) - goto out_free; - + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; out_free: kfree(save); @@ -1155,6 +1169,7 @@ out_free: struct kvm_x86_nested_ops svm_nested_ops = { .check_events = svm_check_nested_events, + .get_nested_state_pages = svm_get_nested_state_pages, .get_state = svm_get_nested_state, .set_state = svm_set_nested_state, }; From 0c899c25d754ae386940f0e1b86b31d3921480b6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 24 Sep 2020 14:45:27 +0200 Subject: [PATCH 213/497] KVM: x86: do not attempt TSC synchronization on guest writes KVM special-cases writes to 
MSR_IA32_TSC so that all CPUs have the same base for the TSC. This logic is complicated, and we do not want it to have any effect once the VM is started. In particular, if any guest started to synchronize its TSCs with writes to MSR_IA32_TSC rather than MSR_IA32_TSC_ADJUST, the additional effect of kvm_write_tsc code would be uncharted territory. Therefore, this patch makes writes to MSR_IA32_TSC behave essentially the same as writes to MSR_IA32_TSC_ADJUST when they come from the guest. A new selftest (which passes both before and after the patch) checks the current semantics of writes to MSR_IA32_TSC and MSR_IA32_TSC_ADJUST originating from both the host and the guest. Upcoming work to remove the special side effects of host-initiated writes to MSR_IA32_TSC and MSR_IA32_TSC_ADJUST will be able to build onto this test, adjusting the host side to use the new APIs and achieve the same effect. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 30 ++-- tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/x86_64/tsc_msrs_test.c | 168 ++++++++++++++++++ 3 files changed, 179 insertions(+), 20 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 411f6103532b..c4015a43cc8a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2107,12 +2107,6 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) #endif } -static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) -{ - u64 curr_offset = vcpu->arch.l1_tsc_offset; - vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; -} - /* * Multiply tsc by a fixed point number represented by ratio. * @@ -2174,14 +2168,13 @@ static inline bool kvm_check_tsc_unstable(void) return check_tsc_unstable(); } -void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) +static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; bool matched; bool already_matched; - u64 data = msr->data; bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -2190,7 +2183,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { - if (data == 0 && msr->host_initiated) { + if (data == 0) { /* * detection of vcpu initialization -- need to sync * with other vCPUs. 
This particularly helps to keep @@ -2260,9 +2253,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) - update_ia32_tsc_adjust_msr(vcpu, offset); - kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); @@ -2277,8 +2267,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } -EXPORT_SYMBOL_GPL(kvm_write_tsc); - static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { @@ -3073,7 +3061,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.msr_ia32_power_ctl = data; break; case MSR_IA32_TSC: - kvm_write_tsc(vcpu, msr_info); + if (msr_info->host_initiated) { + kvm_synchronize_tsc(vcpu, data); + } else { + u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; + adjust_tsc_offset_guest(vcpu, adj); + vcpu->arch.ia32_tsc_adjust_msr += adj; + } break; case MSR_IA32_XSS: if (!msr_info->host_initiated && @@ -9839,7 +9833,6 @@ fail_mmu_destroy: void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - struct msr_data msr; struct kvm *kvm = vcpu->kvm; kvm_hv_vcpu_postcreate(vcpu); @@ -9847,10 +9840,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); - msr.data = 0x0; - msr.index = MSR_IA32_TSC; - msr.host_initiated = true; - kvm_write_tsc(vcpu, &msr); + kvm_synchronize_tsc(vcpu, 0); vcpu_put(vcpu); /* poll control enabled by default */ diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 80d5c348354c..7ebe71fbca53 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -55,6 +55,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test TEST_GEN_PROGS_x86_64 += x86_64/debug_regs +TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_x86_64 += demand_paging_test diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c new file mode 100644 index 000000000000..f8e761149daa --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests for MSR_IA32_TSC and MSR_IA32_TSC_ADJUST. + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#include +#include +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 0 + +#define UNITY (1ull << 30) +#define HOST_ADJUST (UNITY * 64) +#define GUEST_STEP (UNITY * 4) +#define ROUND(x) ((x + UNITY / 2) & -UNITY) +#define rounded_rdmsr(x) ROUND(rdmsr(x)) +#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vm, 0, x)) + +#define GUEST_ASSERT_EQ(a, b) do { \ + __typeof(a) _a = (a); \ + __typeof(b) _b = (b); \ + if (_a != _b) \ + ucall(UCALL_ABORT, 4, \ + "Failed guest assert: " \ + #a " == " #b, __LINE__, _a, _b); \ + } while(0) + +static void guest_code(void) +{ + u64 val = 0; + + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC affect both MSRs. 
*/ + val = 1ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */ + GUEST_SYNC(2); + val = 2ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC_ADJUST, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Host: setting the TSC offset. */ + GUEST_SYNC(3); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the + * host-side offset and affect both MSRs. + */ + GUEST_SYNC(4); + val = 3ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC_ADJUST, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side + * offset is now visible in MSR_IA32_TSC_ADJUST. + */ + GUEST_SYNC(5); + val = 4ull * GUEST_STEP; + wrmsr(MSR_IA32_TSC, val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val); + GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST); + + GUEST_DONE(); +} + +static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) +{ + struct ucall uc; + + vcpu_args_set(vm, vcpuid, 1, vcpuid); + + vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); + + switch (get_ucall(vm, vcpuid, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + return; + case UCALL_DONE: + return; + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%ld\n" \ + "\tvalues: %#lx, %#lx", (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2], uc.args[3]); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + } +} + +int main(void) +{ + struct kvm_vm *vm; + uint64_t val; + + vm = vm_create_default(VCPU_ID, 0, guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + val = 0; + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC affect both MSRs. */ + run_vcpu(vm, VCPU_ID, 1); + val = 1ull * GUEST_STEP; + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */ + run_vcpu(vm, VCPU_ID, 2); + val = 2ull * GUEST_STEP; + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Host: writes to MSR_IA32_TSC set the host-side offset + * and therefore do not change MSR_IA32_TSC_ADJUST. + */ + vcpu_set_msr(vm, 0, MSR_IA32_TSC, HOST_ADJUST + val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + run_vcpu(vm, VCPU_ID, 3); + + /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC. */ + vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, UNITY * 123456); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + ASSERT_EQ(vcpu_get_msr(vm, 0, MSR_IA32_TSC_ADJUST), UNITY * 123456); + + /* Restore previous value. 
*/ + vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the + * host-side offset and affect both MSRs. + */ + run_vcpu(vm, VCPU_ID, 4); + val = 3ull * GUEST_STEP; + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val); + + /* + * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side + * offset is now visible in MSR_IA32_TSC_ADJUST. + */ + run_vcpu(vm, VCPU_ID, 5); + val = 4ull * GUEST_STEP; + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val); + ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST); + + kvm_vm_free(vm); + + return 0; +} From 7956b0d4694fb4bf75c4b1b4bcb6cf7092bd5195 Mon Sep 17 00:00:00 2001 From: Artur Rojek Date: Mon, 28 Sep 2020 16:09:09 -0700 Subject: [PATCH 214/497] dt-bindings: input: Add docs for ADC driven joystick Add documentation for the adc-joystick driver, used to provide support for joysticks connected over ADC. Signed-off-by: Artur Rojek Tested-by: Paul Cercueil Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20200927123302.31062-1-contact@artur-rojek.eu Signed-off-by: Dmitry Torokhov --- .../bindings/input/adc-joystick.yaml | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/adc-joystick.yaml diff --git a/Documentation/devicetree/bindings/input/adc-joystick.yaml b/Documentation/devicetree/bindings/input/adc-joystick.yaml new file mode 100644 index 000000000000..054406bbd22b --- /dev/null +++ b/Documentation/devicetree/bindings/input/adc-joystick.yaml @@ -0,0 +1,121 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +# Copyright 2019-2020 Artur Rojek +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/input/adc-joystick.yaml#" +$schema: "http://devicetree.org/meta-schemas/core.yaml#" + +title: ADC attached joystick + +maintainers: + - Artur Rojek + +description: > + Bindings for joystick devices connected to ADC controllers supporting + the Industrial I/O subsystem. + +properties: + compatible: + const: adc-joystick + + io-channels: + minItems: 1 + maxItems: 1024 + description: > + List of phandle and IIO specifier pairs. + Each pair defines one ADC channel to which a joystick axis is connected. + See Documentation/devicetree/bindings/iio/iio-bindings.txt for details. + + '#address-cells': + const: 1 + + '#size-cells': + const: 0 + +required: + - compatible + - io-channels + - '#address-cells' + - '#size-cells' + +additionalProperties: false + +patternProperties: + "^axis@[0-9a-f]+$": + type: object + description: > + Represents a joystick axis bound to the given ADC channel. + For each entry in the io-channels list, one axis subnode with a matching + reg property must be specified. + + properties: + reg: + minimum: 0 + maximum: 1023 + description: Index of an io-channels list entry bound to this axis. + + linux,code: + $ref: /schemas/types.yaml#/definitions/uint32 + description: EV_ABS specific event code generated by the axis. + + abs-range: + allOf: + - $ref: /schemas/types.yaml#/definitions/uint32-array + - items: + - description: minimum value + - description: maximum value + description: > + Minimum and maximum values produced by the axis. + For an ABS_X axis this will be the left-most and right-most + inclination of the joystick. If min > max, it is left to userspace to + treat the axis as inverted. 
+ This property is interpreted as two signed 32 bit values. + + abs-fuzz: + $ref: /schemas/types.yaml#/definitions/uint32 + description: > + Amount of noise in the input value. + Omitting this property indicates the axis is precise. + + abs-flat: + $ref: /schemas/types.yaml#/definitions/uint32 + description: > + Axial "deadzone", or area around the center position, where the axis + is considered to be at rest. + Omitting this property indicates the axis always returns to exactly + the center position. + + required: + - reg + - linux,code + - abs-range + + additionalProperties: false + +examples: + - | + #include + #include + + joystick: adc-joystick { + compatible = "adc-joystick"; + io-channels = <&adc INGENIC_ADC_TOUCH_XP>, + <&adc INGENIC_ADC_TOUCH_YP>; + #address-cells = <1>; + #size-cells = <0>; + + axis@0 { + reg = <0>; + linux,code = ; + abs-range = <3300 0>; + abs-fuzz = <4>; + abs-flat = <200>; + }; + axis@1 { + reg = <1>; + linux,code = ; + abs-range = <0 3300>; + abs-fuzz = <4>; + abs-flat = <200>; + }; + }; From 2c2b364fddd551f0da98953618e264c098dfa140 Mon Sep 17 00:00:00 2001 From: Artur Rojek Date: Mon, 28 Sep 2020 16:12:46 -0700 Subject: [PATCH 215/497] Input: joystick - add ADC attached joystick driver. Add a driver for joystick devices connected to ADC controllers supporting the Industrial I/O subsystem. Signed-off-by: Artur Rojek Tested-by: Paul Cercueil Tested-by: Heiko Stuebner Link: https://lore.kernel.org/r/20200927123302.31062-2-contact@artur-rojek.eu Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/Kconfig | 10 + drivers/input/joystick/Makefile | 1 + drivers/input/joystick/adc-joystick.c | 264 ++++++++++++++++++++++++++ 3 files changed, 275 insertions(+) create mode 100644 drivers/input/joystick/adc-joystick.c diff --git a/drivers/input/joystick/Kconfig b/drivers/input/joystick/Kconfig index 6f73f02059b5..2a5dbf993aaa 100644 --- a/drivers/input/joystick/Kconfig +++ b/drivers/input/joystick/Kconfig @@ -42,6 +42,16 @@ config JOYSTICK_A3D To compile this driver as a module, choose M here: the module will be called a3d. +config JOYSTICK_ADC + tristate "Simple joystick connected over ADC" + depends on IIO + select IIO_BUFFER_CB + help + Say Y here if you have a simple joystick connected over ADC. + + To compile this driver as a module, choose M here: the + module will be called adc-joystick. + config JOYSTICK_ADI tristate "Logitech ADI digital joysticks and gamepads" select GAMEPORT diff --git a/drivers/input/joystick/Makefile b/drivers/input/joystick/Makefile index 8656023f6ef5..58232b3057d3 100644 --- a/drivers/input/joystick/Makefile +++ b/drivers/input/joystick/Makefile @@ -6,6 +6,7 @@ # Each configuration option enables a list of files. obj-$(CONFIG_JOYSTICK_A3D) += a3d.o +obj-$(CONFIG_JOYSTICK_ADC) += adc-joystick.o obj-$(CONFIG_JOYSTICK_ADI) += adi.o obj-$(CONFIG_JOYSTICK_AMIGA) += amijoy.o obj-$(CONFIG_JOYSTICK_AS5011) += as5011.o diff --git a/drivers/input/joystick/adc-joystick.c b/drivers/input/joystick/adc-joystick.c new file mode 100644 index 000000000000..78ebca7d400a --- /dev/null +++ b/drivers/input/joystick/adc-joystick.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Input driver for joysticks connected over ADC. 
+ * Copyright (c) 2019-2020 Artur Rojek + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +struct adc_joystick_axis { + u32 code; + s32 range[2]; + s32 fuzz; + s32 flat; +}; + +struct adc_joystick { + struct input_dev *input; + struct iio_cb_buffer *buffer; + struct adc_joystick_axis *axes; + struct iio_channel *chans; + int num_chans; +}; + +static int adc_joystick_handle(const void *data, void *private) +{ + struct adc_joystick *joy = private; + enum iio_endian endianness; + int bytes, msb, val, idx, i; + const u16 *data_u16; + bool sign; + + bytes = joy->chans[0].channel->scan_type.storagebits >> 3; + + for (i = 0; i < joy->num_chans; ++i) { + idx = joy->chans[i].channel->scan_index; + endianness = joy->chans[i].channel->scan_type.endianness; + msb = joy->chans[i].channel->scan_type.realbits - 1; + sign = tolower(joy->chans[i].channel->scan_type.sign) == 's'; + + switch (bytes) { + case 1: + val = ((const u8 *)data)[idx]; + break; + case 2: + data_u16 = (const u16 *)data + idx; + + /* + * Data is aligned to the sample size by IIO core. + * Call `get_unaligned_xe16` to hide type casting. + */ + if (endianness == IIO_BE) + val = get_unaligned_be16(data_u16); + else if (endianness == IIO_LE) + val = get_unaligned_le16(data_u16); + else /* IIO_CPU */ + val = *data_u16; + break; + default: + return -EINVAL; + } + + val >>= joy->chans[i].channel->scan_type.shift; + if (sign) + val = sign_extend32(val, msb); + else + val &= GENMASK(msb, 0); + input_report_abs(joy->input, joy->axes[i].code, val); + } + + input_sync(joy->input); + + return 0; +} + +static int adc_joystick_open(struct input_dev *dev) +{ + struct adc_joystick *joy = input_get_drvdata(dev); + struct device *devp = &dev->dev; + int ret; + + ret = iio_channel_start_all_cb(joy->buffer); + if (ret) + dev_err(devp, "Unable to start callback buffer: %d\n", ret); + + return ret; +} + +static void adc_joystick_close(struct input_dev *dev) +{ + struct adc_joystick *joy = input_get_drvdata(dev); + + iio_channel_stop_all_cb(joy->buffer); +} + +static void adc_joystick_cleanup(void *data) +{ + iio_channel_release_all_cb(data); +} + +static int adc_joystick_set_axes(struct device *dev, struct adc_joystick *joy) +{ + struct adc_joystick_axis *axes; + struct fwnode_handle *child; + int num_axes, error, i; + + num_axes = device_get_child_node_count(dev); + if (!num_axes) { + dev_err(dev, "Unable to find child nodes\n"); + return -EINVAL; + } + + if (num_axes != joy->num_chans) { + dev_err(dev, "Got %d child nodes for %d channels\n", + num_axes, joy->num_chans); + return -EINVAL; + } + + axes = devm_kmalloc_array(dev, num_axes, sizeof(*axes), GFP_KERNEL); + if (!axes) + return -ENOMEM; + + device_for_each_child_node(dev, child) { + error = fwnode_property_read_u32(child, "reg", &i); + if (error) { + dev_err(dev, "reg invalid or missing\n"); + goto err_fwnode_put; + } + + if (i >= num_axes) { + error = -EINVAL; + dev_err(dev, "No matching axis for reg %d\n", i); + goto err_fwnode_put; + } + + error = fwnode_property_read_u32(child, "linux,code", + &axes[i].code); + if (error) { + dev_err(dev, "linux,code invalid or missing\n"); + goto err_fwnode_put; + } + + error = fwnode_property_read_u32_array(child, "abs-range", + axes[i].range, 2); + if (error) { + dev_err(dev, "abs-range invalid or missing\n"); + goto err_fwnode_put; + } + + fwnode_property_read_u32(child, "abs-fuzz", &axes[i].fuzz); + fwnode_property_read_u32(child, "abs-flat", &axes[i].flat); + + input_set_abs_params(joy->input, axes[i].code, + 
axes[i].range[0], axes[i].range[1], + axes[i].fuzz, axes[i].flat); + input_set_capability(joy->input, EV_ABS, axes[i].code); + } + + joy->axes = axes; + + return 0; + +err_fwnode_put: + fwnode_handle_put(child); + return error; +} + +static int adc_joystick_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct adc_joystick *joy; + struct input_dev *input; + int error; + int bits; + int i; + + joy = devm_kzalloc(dev, sizeof(*joy), GFP_KERNEL); + if (!joy) + return -ENOMEM; + + joy->chans = devm_iio_channel_get_all(dev); + if (IS_ERR(joy->chans)) { + error = PTR_ERR(joy->chans); + if (error != -EPROBE_DEFER) + dev_err(dev, "Unable to get IIO channels"); + return error; + } + + /* Count how many channels we got. NULL terminated. */ + for (i = 0; joy->chans[i].indio_dev; i++) { + bits = joy->chans[i].channel->scan_type.storagebits; + if (!bits || bits > 16) { + dev_err(dev, "Unsupported channel storage size\n"); + return -EINVAL; + } + if (bits != joy->chans[0].channel->scan_type.storagebits) { + dev_err(dev, "Channels must have equal storage size\n"); + return -EINVAL; + } + } + joy->num_chans = i; + + input = devm_input_allocate_device(dev); + if (!input) { + dev_err(dev, "Unable to allocate input device\n"); + return -ENOMEM; + } + + joy->input = input; + input->name = pdev->name; + input->id.bustype = BUS_HOST; + input->open = adc_joystick_open; + input->close = adc_joystick_close; + + error = adc_joystick_set_axes(dev, joy); + if (error) + return error; + + input_set_drvdata(input, joy); + error = input_register_device(input); + if (error) { + dev_err(dev, "Unable to register input device\n"); + return error; + } + + joy->buffer = iio_channel_get_all_cb(dev, adc_joystick_handle, joy); + if (IS_ERR(joy->buffer)) { + dev_err(dev, "Unable to allocate callback buffer\n"); + return PTR_ERR(joy->buffer); + } + + error = devm_add_action_or_reset(dev, adc_joystick_cleanup, joy->buffer); + if (error) { + dev_err(dev, "Unable to add action\n"); + return error; + } + + return 0; +} + +static const struct of_device_id adc_joystick_of_match[] = { + { .compatible = "adc-joystick", }, + { } +}; +MODULE_DEVICE_TABLE(of, adc_joystick_of_match); + +static struct platform_driver adc_joystick_driver = { + .driver = { + .name = "adc-joystick", + .of_match_table = adc_joystick_of_match, + }, + .probe = adc_joystick_probe, +}; +module_platform_driver(adc_joystick_driver); + +MODULE_DESCRIPTION("Input driver for joysticks connected over ADC"); +MODULE_AUTHOR("Artur Rojek "); +MODULE_LICENSE("GPL"); From 9aa7bd452af17a3966e3583d959dcacab030bb8e Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Sat, 26 Sep 2020 15:03:37 +0800 Subject: [PATCH 216/497] platform/chrome: Use kobj_to_dev() instead of container_of() Use kobj_to_dev() instead of container_of(). 
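For reference (not part of this patch), kobj_to_dev() is simply a type-safe wrapper around the same container_of() expression, essentially:

static inline struct device *kobj_to_dev(struct kobject *kobj)
{
	return container_of(kobj, struct device, kobj);
}

so the conversion is purely cosmetic and does not change behaviour.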
Signed-off-by: Wang Qing Reviewed-by: Guenter Roeck Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_sysfs.c | 2 +- drivers/platform/chrome/cros_ec_vbc.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_sysfs.c b/drivers/platform/chrome/cros_ec_sysfs.c index 9c1e0998a721..f521a5c65091 100644 --- a/drivers/platform/chrome/cros_ec_sysfs.c +++ b/drivers/platform/chrome/cros_ec_sysfs.c @@ -320,7 +320,7 @@ static struct attribute *__ec_attrs[] = { static umode_t cros_ec_ctrl_visible(struct kobject *kobj, struct attribute *a, int n) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct cros_ec_dev *ec = to_cros_ec_dev(dev); if (a == &dev_attr_kb_wake_angle.attr && !ec->has_kb_wake_angle) diff --git a/drivers/platform/chrome/cros_ec_vbc.c b/drivers/platform/chrome/cros_ec_vbc.c index 46482d12cffe..f3a70a312b43 100644 --- a/drivers/platform/chrome/cros_ec_vbc.c +++ b/drivers/platform/chrome/cros_ec_vbc.c @@ -17,7 +17,7 @@ static ssize_t vboot_context_read(struct file *filp, struct kobject *kobj, struct bin_attribute *att, char *buf, loff_t pos, size_t count) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct cros_ec_dev *ec = to_cros_ec_dev(dev); struct cros_ec_device *ecdev = ec->ec_dev; struct ec_params_vbnvcontext *params; @@ -57,7 +57,7 @@ static ssize_t vboot_context_write(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t pos, size_t count) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct cros_ec_dev *ec = to_cros_ec_dev(dev); struct cros_ec_device *ecdev = ec->ec_dev; struct ec_params_vbnvcontext *params; From 6a2e0923b2df5c00d1c691b0393daf1a87ffb924 Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Mon, 28 Sep 2020 23:37:14 +0800 Subject: [PATCH 217/497] KVM: VMX: vmx_uret_msrs_list[] can be static Fixes: 14a61b642de9 ("KVM: VMX: Rename "vmx_msr_index" to "vmx_uret_msrs_list"") Signed-off-by: kernel test robot Message-Id: <20200928153714.GA6285@a3a878002045> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4551a7e80ebc..f7b6677e2a51 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -457,7 +457,7 @@ static unsigned long host_idt_base; * support this emulation, IA32_STAR must always be included in * vmx_uret_msrs_list[], even in i386 builds. */ -const u32 vmx_uret_msrs_list[] = { +static const u32 vmx_uret_msrs_list[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif From 42223fb100b43430daf0c396701cd75b2579ddb7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 12 Mar 2020 17:27:36 +0000 Subject: [PATCH 218/497] KVM: arm64: Refactor PMU attribute error handling The PMU emulation error handling is pretty messy when dealing with attributes. Let's refactor it so that we have less duplication, and that it is easy to extend later on. A functional change is that kvm_arm_pmu_v3_init() used to return -ENXIO when the PMU feature wasn't set. The error is now reported as -ENODEV, matching the documentation. -ENXIO is still returned when the interrupt isn't properly configured. 
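Illustration only, not part of this patch: from user space these attributes are driven through KVM_SET_DEVICE_ATTR on the vcpu file descriptor, so the change is visible when interpreting errno. A minimal, hypothetical helper (assuming vcpu_fd was created with the KVM_ARM_VCPU_PMU_V3 feature requested) might look like:

#include <errno.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static int init_vcpu_pmu(int vcpu_fd)
{
	struct kvm_device_attr attr = {
		.group = KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr  = KVM_ARM_VCPU_PMU_V3_INIT,
	};

	if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr) == 0)
		return 0;

	if (errno == ENODEV)		/* PMUv3 not supported / feature not set */
		fprintf(stderr, "vcpu PMU: not available\n");
	else if (errno == ENXIO)	/* interrupt or irqchip not configured */
		fprintf(stderr, "vcpu PMU: not properly configured\n");

	return -errno;
}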
Signed-off-by: Marc Zyngier --- arch/arm64/kvm/pmu-emul.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index f0d0312c0a55..363f188fb707 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -735,15 +735,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) { - if (!kvm_arm_support_pmu_v3()) - return -ENODEV; - - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENXIO; - - if (vcpu->arch.pmu.created) - return -EBUSY; - if (irqchip_in_kernel(vcpu->kvm)) { int ret; @@ -796,6 +787,13 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { + if (!kvm_arm_support_pmu_v3() || + !test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) + return -ENODEV; + + if (vcpu->arch.pmu.created) + return -EBUSY; + switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { int __user *uaddr = (int __user *)(long)attr->addr; @@ -804,9 +802,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; - if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) - return -ENODEV; - if (get_user(irq, uaddr)) return -EFAULT; From fd65a3b5f855c37167890d86e261a20bab1a14a4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Mar 2020 11:11:56 +0000 Subject: [PATCH 219/497] KVM: arm64: Use event mask matching architecture revision The PMU code suffers from a small defect where we assume that the event number provided by the guest is always 16 bit wide, even if the CPU only implements the ARMv8.0 architecture. This isn't really problematic in the sense that the event number ends up in a system register, cropping it to the right width, but still this needs fixing. In order to make it work, let's probe the version of the PMU that the guest is going to use. This is done by temporarily creating a kernel event and looking at the PMUVer field that has been saved at probe time in the associated arm_pmu structure. This in turn gets saved in the kvm structure, and subsequently used to compute the event mask that gets used throughout the PMU code. Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 + arch/arm64/kvm/pmu-emul.c | 80 +++++++++++++++++++++++++++++-- 2 files changed, 77 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e52c927aade5..33e0f0a64c42 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -110,6 +110,8 @@ struct kvm_arch { * supported. 
*/ bool return_nisv_io_abort_to_user; + + unsigned int pmuver; }; struct kvm_vcpu_fault_info { diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 363f188fb707..10d3860a9c0a 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -20,6 +20,21 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc); #define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1 +static u32 kvm_pmu_event_mask(struct kvm *kvm) +{ + switch (kvm->arch.pmuver) { + case 1: /* ARMv8.0 */ + return GENMASK(9, 0); + case 4: /* ARMv8.1 */ + case 5: /* ARMv8.4 */ + case 6: /* ARMv8.5 */ + return GENMASK(15, 0); + default: /* Shouldn't be here, just for sanity */ + WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver); + return 0; + } +} + /** * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter * @vcpu: The vcpu pointer @@ -100,7 +115,7 @@ static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx) return false; reg = PMEVTYPER0_EL0 + select_idx; - eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT; + eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm); return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN; } @@ -495,7 +510,7 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) /* PMSWINC only applies to ... SW_INC! */ type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); - type &= ARMV8_PMU_EVTYPE_EVENT; + type &= kvm_pmu_event_mask(vcpu->kvm); if (type != ARMV8_PMUV3_PERFCTR_SW_INCR) continue; @@ -578,7 +593,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) data = __vcpu_sys_reg(vcpu, reg); kvm_pmu_stop_counter(vcpu, pmc); - eventsel = data & ARMV8_PMU_EVTYPE_EVENT; + eventsel = data & kvm_pmu_event_mask(vcpu->kvm);; /* Software increment event does't need to be backed by a perf event */ if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR && @@ -679,17 +694,66 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx) void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 select_idx) { - u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK; + u64 reg, mask; + + mask = ARMV8_PMU_EVTYPE_MASK; + mask &= ~ARMV8_PMU_EVTYPE_EVENT; + mask |= kvm_pmu_event_mask(vcpu->kvm); reg = (select_idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx; - __vcpu_sys_reg(vcpu, reg) = event_type; + __vcpu_sys_reg(vcpu, reg) = data & mask; kvm_pmu_update_pmc_chained(vcpu, select_idx); kvm_pmu_create_perf_event(vcpu, select_idx); } +static int kvm_pmu_probe_pmuver(void) +{ + struct perf_event_attr attr = { }; + struct perf_event *event; + struct arm_pmu *pmu; + int pmuver = 0xf; + + /* + * Create a dummy event that only counts user cycles. As we'll never + * leave this function with the event being live, it will never + * count anything. But it allows us to probe some of the PMU + * details. Yes, this is terrible. 
+ */ + attr.type = PERF_TYPE_RAW; + attr.size = sizeof(attr); + attr.pinned = 1; + attr.disabled = 0; + attr.exclude_user = 0; + attr.exclude_kernel = 1; + attr.exclude_hv = 1; + attr.exclude_host = 1; + attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; + attr.sample_period = GENMASK(63, 0); + + event = perf_event_create_kernel_counter(&attr, -1, current, + kvm_pmu_perf_overflow, &attr); + + if (IS_ERR(event)) { + pr_err_once("kvm: pmu event creation failed %ld\n", + PTR_ERR(event)); + return 0xf; + } + + if (event->pmu) { + pmu = to_arm_pmu(event->pmu); + if (pmu->pmuver) + pmuver = pmu->pmuver; + } + + perf_event_disable(event); + perf_event_release_kernel(event); + + return pmuver; +} + bool kvm_arm_support_pmu_v3(void) { /* @@ -794,6 +858,12 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (vcpu->arch.pmu.created) return -EBUSY; + if (!vcpu->kvm->arch.pmuver) + vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver(); + + if (vcpu->kvm->arch.pmuver == 0xf) + return -ENODEV; + switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: { int __user *uaddr = (int __user *)(long)attr->addr; From d7eec2360e389cc877a76c2b098f7b745007d2b2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 12 Feb 2020 11:31:02 +0000 Subject: [PATCH 220/497] KVM: arm64: Add PMU event filtering infrastructure It can be desirable to expose a PMU to a guest, and yet not want the guest to be able to count some of the implemented events (because this would give information on shared resources, for example. For this, let's extend the PMUv3 device API, and offer a way to setup a bitmap of the allowed events (the default being no bitmap, and thus no filtering). Userspace can thus allow/deny ranges of event. The default policy depends on the "polarity" of the first filter setup (default deny if the filter allows events, and default allow if the filter denies events). This allows to setup exactly what is allowed for a given guest. Note that although the ioctl is per-vcpu, the map of allowed events is global to the VM (it can be setup from any vcpu until the vcpu PMU is initialized). Reviewed-by: Andrew Jones Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 5 +++ arch/arm64/include/uapi/asm/kvm.h | 16 +++++++ arch/arm64/kvm/arm.c | 2 + arch/arm64/kvm/pmu-emul.c | 69 ++++++++++++++++++++++++++++--- 4 files changed, 86 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 33e0f0a64c42..0384defc2a01 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -111,6 +111,11 @@ struct kvm_arch { */ bool return_nisv_io_abort_to_user; + /* + * VM-wide PMU filter, implemented as a bitmap and big enough for + * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+). + */ + unsigned long *pmu_filter; unsigned int pmuver; }; diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index ba85bb23f060..7b1511d6ce44 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -159,6 +159,21 @@ struct kvm_sync_regs { struct kvm_arch_memory_slot { }; +/* + * PMU filter structure. Describe a range of events with a particular + * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER. 
+ */ +struct kvm_pmu_event_filter { + __u16 base_event; + __u16 nevents; + +#define KVM_PMU_EVENT_ALLOW 0 +#define KVM_PMU_EVENT_DENY 1 + + __u8 action; + __u8 pad[3]; +}; + /* for KVM_GET/SET_VCPU_EVENTS */ struct kvm_vcpu_events { struct { @@ -329,6 +344,7 @@ struct kvm_vcpu_events { #define KVM_ARM_VCPU_PMU_V3_CTRL 0 #define KVM_ARM_VCPU_PMU_V3_IRQ 0 #define KVM_ARM_VCPU_PMU_V3_INIT 1 +#define KVM_ARM_VCPU_PMU_V3_FILTER 2 #define KVM_ARM_VCPU_TIMER_CTRL 1 #define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 46dc3d75cf13..427848ff6f20 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -145,6 +145,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) { int i; + bitmap_free(kvm->arch.pmu_filter); + kvm_vgic_destroy(kvm); for (i = 0; i < KVM_MAX_VCPUS; ++i) { diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 10d3860a9c0a..8cd365968af1 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -593,11 +593,21 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) data = __vcpu_sys_reg(vcpu, reg); kvm_pmu_stop_counter(vcpu, pmc); - eventsel = data & kvm_pmu_event_mask(vcpu->kvm);; + if (pmc->idx == ARMV8_PMU_CYCLE_IDX) + eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; + else + eventsel = data & kvm_pmu_event_mask(vcpu->kvm); - /* Software increment event does't need to be backed by a perf event */ - if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR && - pmc->idx != ARMV8_PMU_CYCLE_IDX) + /* Software increment event doesn't need to be backed by a perf event */ + if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR) + return; + + /* + * If we have a filter in place and that the event isn't allowed, do + * not install a perf event either. + */ + if (vcpu->kvm->arch.pmu_filter && + !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) return; memset(&attr, 0, sizeof(struct perf_event_attr)); @@ -609,8 +619,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx) attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0; attr.exclude_hv = 1; /* Don't count EL2 events */ attr.exclude_host = 1; /* Don't count host events */ - attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ? - ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel; + attr.config = eventsel; counter = kvm_pmu_get_pair_counter_value(vcpu, pmc); @@ -889,6 +898,53 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) vcpu->arch.pmu.irq_num = irq; return 0; } + case KVM_ARM_VCPU_PMU_V3_FILTER: { + struct kvm_pmu_event_filter __user *uaddr; + struct kvm_pmu_event_filter filter; + int nr_events; + + nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1; + + uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr; + + if (copy_from_user(&filter, uaddr, sizeof(filter))) + return -EFAULT; + + if (((u32)filter.base_event + filter.nevents) > nr_events || + (filter.action != KVM_PMU_EVENT_ALLOW && + filter.action != KVM_PMU_EVENT_DENY)) + return -EINVAL; + + mutex_lock(&vcpu->kvm->lock); + + if (!vcpu->kvm->arch.pmu_filter) { + vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL); + if (!vcpu->kvm->arch.pmu_filter) { + mutex_unlock(&vcpu->kvm->lock); + return -ENOMEM; + } + + /* + * The default depends on the first applied filter. + * If it allows events, the default is to deny. + * Conversely, if the first filter denies a set of + * events, the default is to allow. 
+ */ + if (filter.action == KVM_PMU_EVENT_ALLOW) + bitmap_zero(vcpu->kvm->arch.pmu_filter, nr_events); + else + bitmap_fill(vcpu->kvm->arch.pmu_filter, nr_events); + } + + if (filter.action == KVM_PMU_EVENT_ALLOW) + bitmap_set(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents); + else + bitmap_clear(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents); + + mutex_unlock(&vcpu->kvm->lock); + + return 0; + } case KVM_ARM_VCPU_PMU_V3_INIT: return kvm_arm_pmu_v3_init(vcpu); } @@ -925,6 +981,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) switch (attr->attr) { case KVM_ARM_VCPU_PMU_V3_IRQ: case KVM_ARM_VCPU_PMU_V3_INIT: + case KVM_ARM_VCPU_PMU_V3_FILTER: if (kvm_arm_support_pmu_v3() && test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features)) return 0; From 88865beca90621ae33217d5f23664d382f99205a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 12 Mar 2020 16:11:24 +0000 Subject: [PATCH 221/497] KVM: arm64: Mask out filtered events in PCMEID{0,1}_EL1 As we can now hide events from the guest, let's also adjust its view of PCMEID{0,1}_EL1 so that it can figure out why some common events are not counting as they should. The astute user can still look into the TRM for their CPU and find out they've been cheated, though. Nobody's perfect. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/pmu-emul.c | 29 +++++++++++++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 5 +---- include/kvm/arm_pmu.h | 5 +++++ 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 8cd365968af1..ee13c5eecd3d 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -763,6 +763,35 @@ static int kvm_pmu_probe_pmuver(void) return pmuver; } +u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) +{ + unsigned long *bmap = vcpu->kvm->arch.pmu_filter; + u64 val, mask = 0; + int base, i; + + if (!pmceid1) { + val = read_sysreg(pmceid0_el0); + base = 0; + } else { + val = read_sysreg(pmceid1_el0); + base = 32; + } + + if (!bmap) + return val; + + for (i = 0; i < 32; i += 8) { + u64 byte; + + byte = bitmap_get_value8(bmap, base + i); + mask |= byte << i; + byte = bitmap_get_value8(bmap, 0x4000 + base + i); + mask |= byte << (32 + i); + } + + return val & mask; +} + bool kvm_arm_support_pmu_v3(void) { /* diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 077293b5115f..20ab2a7d37ca 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -769,10 +769,7 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_access_el0_disabled(vcpu)) return false; - if (!(p->Op2 & 1)) - pmceid = read_sysreg(pmceid0_el0); - else - pmceid = read_sysreg(pmceid1_el0); + pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1)); p->regval = pmceid; diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 6db030439e29..98cbfe885a53 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -34,6 +34,7 @@ struct kvm_pmu { u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu); +u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1); void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu); @@ -108,6 +109,10 @@ static inline int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) { return 0; } +static inline u64 
kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) +{ + return 0; +} #endif #endif From 8be86a5eec046cb8bd06b4393d373cd433a32cba Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 12 Feb 2020 14:40:24 +0000 Subject: [PATCH 222/497] KVM: arm64: Document PMU filtering API Add a small blurb describing how the event filtering API gets used. Reviewed-by: Andrew Jones Signed-off-by: Marc Zyngier --- Documentation/virt/kvm/devices/vcpu.rst | 46 +++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index ca374d3fe085..c8b707afca6e 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -55,6 +55,52 @@ Request the initialization of the PMUv3. If using the PMUv3 with an in-kernel virtual GIC implementation, this must be done after initializing the in-kernel irqchip. +1.3 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_FILTER +----------------------------------------- + +:Parameters: in kvm_device_attr.addr the address for a PMU event filter is a + pointer to a struct kvm_pmu_event_filter + +:Returns: + + ======= ====================================================== + -ENODEV: PMUv3 not supported or GIC not initialized + -ENXIO: PMUv3 not properly configured or in-kernel irqchip not + configured as required prior to calling this attribute + -EBUSY: PMUv3 already initialized + -EINVAL: Invalid filter range + ======= ====================================================== + +Request the installation of a PMU event filter described as follows: + +struct kvm_pmu_event_filter { + __u16 base_event; + __u16 nevents; + +#define KVM_PMU_EVENT_ALLOW 0 +#define KVM_PMU_EVENT_DENY 1 + + __u8 action; + __u8 pad[3]; +}; + +A filter range is defined as the range [@base_event, @base_event + @nevents), +together with an @action (KVM_PMU_EVENT_ALLOW or KVM_PMU_EVENT_DENY). The +first registered range defines the global policy (global ALLOW if the first +@action is DENY, global DENY if the first @action is ALLOW). Multiple ranges +can be programmed, and must fit within the event space defined by the PMU +architecture (10 bits on ARMv8.0, 16 bits from ARMv8.1 onwards). + +Note: "Cancelling" a filter by registering the opposite action for the same +range doesn't change the default action. For example, installing an ALLOW +filter for event range [0:10) as the first filter and then applying a DENY +action for the same range will leave the whole range as disabled. + +Restrictions: Event 0 (SW_INCR) is never filtered, as it doesn't count a +hardware event. Filtering event 0x1E (CHAIN) has no effect either, as it +isn't strictly speaking an event. Filtering the cycle counter is possible +using event 0x11 (CPU_CYCLES). + 2. GROUP: KVM_ARM_VCPU_TIMER_CTRL ================================= From af130d0adc8e48c73030c0d71a59ce1f7809a8fa Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Thu, 24 Sep 2020 13:37:30 +0100 Subject: [PATCH 223/497] KVM: arm64: Add undocumented return values for PMU device control group KVM_ARM_VCPU_PMU_V3_IRQ returns -EFAULT if get_user() fails when reading the interrupt number from kvm_device_attr.addr. KVM_ARM_VCPU_PMU_V3_INIT returns the error value from kvm_vgic_set_owner(). kvm_arm_pmu_v3_init() checks that the vgic has been initialized and the interrupt number is valid, but kvm_vgic_set_owner() can still return the error code -EEXIST if another device has already claimed the interrupt. 
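As an illustrative sketch of how these attributes fit together from userspace (assuming KVM_ARM_VCPU_PMU_V3 was requested at vcpu init; the vcpu fd, interrupt number and filter range below are placeholders and error handling is reduced to returning -errno), the overflow IRQ is programmed first, an optional event filter is installed next, and KVM_ARM_VCPU_PMU_V3_INIT comes last:

    #include <errno.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int vcpu_pmu_setup(int vcpu_fd, int irq)
    {
            struct kvm_pmu_event_filter filter = {
                    .base_event = 0x11,             /* CPU_CYCLES */
                    .nevents    = 1,
                    .action     = KVM_PMU_EVENT_ALLOW,
            };
            struct kvm_device_attr attr = {
                    .group = KVM_ARM_VCPU_PMU_V3_CTRL,
                    .attr  = KVM_ARM_VCPU_PMU_V3_IRQ,
                    .addr  = (__u64)(unsigned long)&irq,
            };

            if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr))
                    return -errno;          /* e.g. -EFAULT, -EINVAL, -EBUSY */

            attr.attr = KVM_ARM_VCPU_PMU_V3_FILTER;         /* optional */
            attr.addr = (__u64)(unsigned long)&filter;
            if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr))
                    return -errno;          /* e.g. -EINVAL for a bad range */

            attr.attr = KVM_ARM_VCPU_PMU_V3_INIT;
            attr.addr = 0;
            if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr))
                    return -errno;          /* e.g. -EEXIST if the IRQ is already claimed */

            return 0;
    }
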
Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20200924123731.268177-2-alexandru.elisei@arm.com --- Documentation/virt/kvm/devices/vcpu.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index c8b707afca6e..1da576a4b767 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -25,6 +25,7 @@ Returns: ======= ======================================================== -EBUSY The PMU overflow interrupt is already set + -EFAULT Error reading interrupt number -ENXIO The overflow interrupt not set when attempting to get it -ENODEV PMUv3 not supported -EINVAL Invalid PMU overflow interrupt number supplied or @@ -45,6 +46,7 @@ all vcpus, while as an SPI it must be a separate number per vcpu. Returns: ======= ====================================================== + -EEXIST Interrupt number already used -ENODEV PMUv3 not supported or GIC not initialized -ENXIO PMUv3 not properly configured or in-kernel irqchip not configured as required prior to calling this attribute From 51dd2eb98c72c1ff4cdb1526fff9a259a563c257 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Thu, 24 Sep 2020 13:37:31 +0100 Subject: [PATCH 224/497] KVM: arm64: Match PMU error code descriptions with error conditions Update the description of the PMU KVM_{GET, SET}_DEVICE_ATTR error codes to be a better match for the code that returns them. Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20200924123731.268177-3-alexandru.elisei@arm.com --- Documentation/virt/kvm/devices/vcpu.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index 1da576a4b767..da7c2ef7dafc 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -26,8 +26,9 @@ Returns: ======= ======================================================== -EBUSY The PMU overflow interrupt is already set -EFAULT Error reading interrupt number - -ENXIO The overflow interrupt not set when attempting to get it - -ENODEV PMUv3 not supported + -ENXIO PMUv3 not supported or the overflow interrupt not set + when attempting to get it + -ENODEV KVM_ARM_VCPU_PMU_V3 feature missing from VCPU -EINVAL Invalid PMU overflow interrupt number supplied or trying to set the IRQ number without using an in-kernel irqchip. @@ -48,8 +49,8 @@ Returns: ======= ====================================================== -EEXIST Interrupt number already used -ENODEV PMUv3 not supported or GIC not initialized - -ENXIO PMUv3 not properly configured or in-kernel irqchip not - configured as required prior to calling this attribute + -ENXIO PMUv3 not supported, missing VCPU feature or interrupt + number not set -EBUSY PMUv3 already initialized ======= ====================================================== From ab25464bdabd45f283cc1194e332040f89071106 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 22 Sep 2020 21:49:01 +0100 Subject: [PATCH 225/497] kvm: arm64: Partially link nVHE hyp code, simplify HYPCOPY Relying on objcopy to prefix the ELF section names of the nVHE hyp code is brittle and prevents us from using wildcards to match specific section names. Improve the build rules by partially linking all '.nvhe.o' files and prefixing their ELF section names using a linker script. 
Continue using objcopy for prefixing ELF symbol names. One immediate advantage of this approach is that all subsections matching a pattern can be merged into a single prefixed section, eg. .text and .text.* can be linked into a single '.hyp.text'. This removes the need for -fno-reorder-functions on GCC and will be useful in the future too: LTO builds use .text subsections, compilers routinely generate .rodata subsections, etc. Partially linking all hyp code into a single object file also makes it easier to analyze. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200922204910.7265-2-dbrazdil@google.com --- arch/arm64/include/asm/hyp_image.h | 24 ++++++++++++ arch/arm64/kvm/hyp/nvhe/.gitignore | 2 + arch/arm64/kvm/hyp/nvhe/Makefile | 60 ++++++++++++++++-------------- arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 13 +++++++ 4 files changed, 72 insertions(+), 27 deletions(-) create mode 100644 arch/arm64/include/asm/hyp_image.h create mode 100644 arch/arm64/kvm/hyp/nvhe/.gitignore create mode 100644 arch/arm64/kvm/hyp/nvhe/hyp.lds.S diff --git a/arch/arm64/include/asm/hyp_image.h b/arch/arm64/include/asm/hyp_image.h new file mode 100644 index 000000000000..2e38fcda02fc --- /dev/null +++ b/arch/arm64/include/asm/hyp_image.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Google LLC. + * Written by David Brazdil + */ + +#ifndef __ARM64_HYP_IMAGE_H__ +#define __ARM64_HYP_IMAGE_H__ + +#ifdef LINKER_SCRIPT + +/* + * KVM nVHE ELF section names are prefixed with .hyp, to separate them + * from the kernel proper. + */ +#define HYP_SECTION_NAME(NAME) .hyp##NAME + +/* Defines an ELF hyp section from input section @NAME and its subsections. */ +#define HYP_SECTION(NAME) \ + HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) } + +#endif /* LINKER_SCRIPT */ + +#endif /* __ARM64_HYP_IMAGE_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/.gitignore b/arch/arm64/kvm/hyp/nvhe/.gitignore new file mode 100644 index 000000000000..695d73d0249e --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +hyp.lds diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index aef76487edc2..2b27b60182f9 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -10,40 +10,46 @@ obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o -obj-y := $(patsubst %.o,%.hyp.o,$(obj-y)) -extra-y := $(patsubst %.hyp.o,%.hyp.tmp.o,$(obj-y)) +## +## Build rules for compiling nVHE hyp code +## Output of this folder is `kvm_nvhe.o`, a partially linked object +## file containing all nVHE hyp code and data. +## -$(obj)/%.hyp.tmp.o: $(src)/%.c FORCE +hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y)) +obj-y := kvm_nvhe.o +extra-y := $(hyp-obj) kvm_nvhe.tmp.o hyp.lds + +# 1) Compile all source files to `.nvhe.o` object files. The file extension +# avoids file name clashes for files shared with VHE. +$(obj)/%.nvhe.o: $(src)/%.c FORCE $(call if_changed_rule,cc_o_c) -$(obj)/%.hyp.tmp.o: $(src)/%.S FORCE +$(obj)/%.nvhe.o: $(src)/%.S FORCE $(call if_changed_rule,as_o_S) -$(obj)/%.hyp.o: $(obj)/%.hyp.tmp.o FORCE + +# 2) Compile linker script. +$(obj)/hyp.lds: $(src)/hyp.lds.S FORCE + $(call if_changed_dep,cpp_lds_S) + +# 3) Partially link all '.nvhe.o' files and apply the linker script. +# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'. 
+# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before +# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to +# keep the dependency on the target while avoiding an error from +# GNU ld if the linker script is passed to it twice. +LDFLAGS_kvm_nvhe.tmp.o := -r -T +$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE + $(call if_changed,ld) + +# 4) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'. +# Prefixes names of ELF symbols with '__kvm_nvhe_'. +$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.tmp.o FORCE $(call if_changed,hypcopy) -# Disable reordering functions by GCC (enabled at -O2). -# This pass puts functions into '.text.*' sections to aid the linker -# in optimizing ELF layout. See HYPCOPY comment below for more info. -ccflags-y += $(call cc-option,-fno-reorder-functions) - # The HYPCOPY command uses `objcopy` to prefix all ELF symbol names -# and relevant ELF section names to avoid clashes with VHE code/data. -# -# Hyp code is assumed to be in the '.text' section of the input object -# files (with the exception of specialized sections such as -# '.hyp.idmap.text'). This assumption may be broken by a compiler that -# divides code into sections like '.text.unlikely' so as to optimize -# ELF layout. HYPCOPY checks that no such sections exist in the input -# using `objdump`, otherwise they would be linked together with other -# kernel code and not memory-mapped correctly at runtime. +# to avoid clashes with VHE code/data. quiet_cmd_hypcopy = HYPCOPY $@ - cmd_hypcopy = \ - if $(OBJDUMP) -h $< | grep -F '.text.'; then \ - echo "$@: function reordering not supported in nVHE hyp code" >&2; \ - /bin/false; \ - fi; \ - $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ \ - --rename-section=.text=.hyp.text \ - $< $@ + cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@ # Remove ftrace and Shadow Call Stack CFLAGS. # This is equivalent to the 'notrace' and '__noscs' annotations. diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S new file mode 100644 index 000000000000..3b13d1c7cd1a --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Google LLC. + * Written by David Brazdil + * + * Linker script used for partial linking of nVHE EL2 object files. + */ + +#include + +SECTIONS { + HYP_SECTION(.text) +} From ce492a16ffb8814d9651c3fdafc363bfa1b01189 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 22 Sep 2020 21:49:02 +0100 Subject: [PATCH 226/497] kvm: arm64: Move nVHE hyp namespace macros to hyp_image.h Minor cleanup to move all macros related to prefixing nVHE hyp section and symbol names into one place: hyp_image.h. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200922204910.7265-3-dbrazdil@google.com --- arch/arm64/include/asm/hyp_image.h | 12 ++++++++++++ arch/arm64/include/asm/kvm_asm.h | 8 +------- arch/arm64/kernel/image-vars.h | 2 -- arch/arm64/kernel/vmlinux.lds.S | 1 + 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/hyp_image.h b/arch/arm64/include/asm/hyp_image.h index 2e38fcda02fc..daa1a1da539e 100644 --- a/arch/arm64/include/asm/hyp_image.h +++ b/arch/arm64/include/asm/hyp_image.h @@ -7,6 +7,12 @@ #ifndef __ARM64_HYP_IMAGE_H__ #define __ARM64_HYP_IMAGE_H__ +/* + * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_, + * to separate it from the kernel proper. 
+ */ +#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym + #ifdef LINKER_SCRIPT /* @@ -19,6 +25,12 @@ #define HYP_SECTION(NAME) \ HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) } +/* + * Defines a linker script alias of a kernel-proper symbol referenced by + * KVM nVHE hyp code. + */ +#define KVM_NVHE_ALIAS(sym) kvm_nvhe_sym(sym) = sym; + #endif /* LINKER_SCRIPT */ #endif /* __ARM64_HYP_IMAGE_H__ */ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 6f98fbd0ac81..c085032e2e3e 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -7,6 +7,7 @@ #ifndef __ARM_KVM_ASM_H__ #define __ARM_KVM_ASM_H__ +#include #include #define VCPU_WORKAROUND_2_FLAG_SHIFT 0 @@ -42,13 +43,6 @@ #include -/* - * Translate name of a symbol defined in nVHE hyp to the name seen - * by kernel proper. All nVHE symbols are prefixed by the build system - * to avoid clashes with the VHE variants. - */ -#define kvm_nvhe_sym(sym) __kvm_nvhe_##sym - #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] #define DECLARE_KVM_NVHE_SYM(sym) extern char kvm_nvhe_sym(sym)[] diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 8982b68289b7..76da2ad1010c 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -61,8 +61,6 @@ __efistub__ctype = _ctype; * memory mappings. */ -#define KVM_NVHE_ALIAS(sym) __kvm_nvhe_##sym = sym; - /* Alternative callbacks for init-time patching of nVHE hyp code. */ KVM_NVHE_ALIAS(arm64_enable_wa2_handling); KVM_NVHE_ALIAS(kvm_patch_vector_branch); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 7cba7623fcec..fbb13f38d0c5 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -9,6 +9,7 @@ #include #include +#include #include #include #include From 3471ee06e33e413d7fa73c1aa3092e6e794b9e05 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 22 Sep 2020 21:49:03 +0100 Subject: [PATCH 227/497] kvm: arm64: Only define __kvm_ex_table for CONFIG_KVM Minor cleanup that only creates __kvm_ex_table ELF section and related symbols if CONFIG_KVM is enabled. Also useful as more hyp-specific sections will be added. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200922204910.7265-4-dbrazdil@google.com --- arch/arm64/kernel/vmlinux.lds.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index fbb13f38d0c5..d14166012e51 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -22,11 +22,15 @@ ENTRY(_text) jiffies = jiffies_64; +#ifdef CONFIG_KVM #define HYPERVISOR_EXTABLE \ . = ALIGN(SZ_8); \ __start___kvm_ex_table = .; \ *(__kvm_ex_table) \ __stop___kvm_ex_table = .; +#else /* CONFIG_KVM */ +#define HYPERVISOR_EXTABLE +#endif #define HYPERVISOR_TEXT \ /* \ From 717cf94adb54095d14a6674baea73123188f2901 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 22 Sep 2020 21:49:04 +0100 Subject: [PATCH 228/497] kvm: arm64: Remove __hyp_this_cpu_read this_cpu_ptr is meant for use in kernel proper because it selects between TPIDR_EL1/2 based on nVHE/VHE. __hyp_this_cpu_ptr was used in hyp to always select TPIDR_EL2. Unify all users behind this_cpu_ptr and friends by selecting _EL2 register under __KVM_NVHE_HYPERVISOR__. VHE continues selecting the register using alternatives. Under CONFIG_DEBUG_PREEMPT, the kernel helpers perform a preemption check which is omitted by the hyp helpers. 
Preserve the behavior for nVHE by overriding the corresponding macros under __KVM_NVHE_HYPERVISOR__. Extend the checks into VHE hyp code. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Andrew Scull Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200922204910.7265-5-dbrazdil@google.com --- arch/arm64/include/asm/kvm_asm.h | 20 ---------------- arch/arm64/include/asm/percpu.h | 28 +++++++++++++++++++++-- arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 4 ++-- arch/arm64/kvm/hyp/include/hyp/switch.h | 8 +++---- arch/arm64/kvm/hyp/nvhe/switch.c | 2 +- arch/arm64/kvm/hyp/vhe/switch.c | 2 +- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 4 ++-- 7 files changed, 36 insertions(+), 32 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index c085032e2e3e..c196eec25498 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -143,26 +143,6 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; addr; \ }) -/* - * Home-grown __this_cpu_{ptr,read} variants that always work at HYP, - * provided that sym is really a *symbol* and not a pointer obtained from - * a data structure. As for SHIFT_PERCPU_PTR(), the creative casting keeps - * sparse quiet. - */ -#define __hyp_this_cpu_ptr(sym) \ - ({ \ - void *__ptr; \ - __verify_pcpu_ptr(&sym); \ - __ptr = hyp_symbol_addr(sym); \ - __ptr += read_sysreg(tpidr_el2); \ - (typeof(sym) __kernel __force *)__ptr; \ - }) - -#define __hyp_this_cpu_read(sym) \ - ({ \ - *__hyp_this_cpu_ptr(sym); \ - }) - #define __KVM_EXTABLE(from, to) \ " .pushsection __kvm_ex_table, \"a\"\n" \ " .align 3\n" \ diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 0b6409b89e5e..1599e17379d8 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -19,7 +19,16 @@ static inline void set_my_cpu_offset(unsigned long off) :: "r" (off) : "memory"); } -static inline unsigned long __my_cpu_offset(void) +static inline unsigned long __hyp_my_cpu_offset(void) +{ + /* + * Non-VHE hyp code runs with preemption disabled. No need to hazard + * the register access against barrier() as in __kern_my_cpu_offset. + */ + return read_sysreg(tpidr_el2); +} + +static inline unsigned long __kern_my_cpu_offset(void) { unsigned long off; @@ -35,7 +44,12 @@ static inline unsigned long __my_cpu_offset(void) return off; } -#define __my_cpu_offset __my_cpu_offset() + +#ifdef __KVM_NVHE_HYPERVISOR__ +#define __my_cpu_offset __hyp_my_cpu_offset() +#else +#define __my_cpu_offset __kern_my_cpu_offset() +#endif #define PERCPU_RW_OPS(sz) \ static inline unsigned long __percpu_read_##sz(void *ptr) \ @@ -227,4 +241,14 @@ PERCPU_RET_OP(add, add, ldadd) #include +/* Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies. 
*/ +#if defined(__KVM_NVHE_HYPERVISOR__) && defined(CONFIG_DEBUG_PREEMPT) +#undef this_cpu_ptr +#define this_cpu_ptr raw_cpu_ptr +#undef __this_cpu_read +#define __this_cpu_read raw_cpu_read +#undef __this_cpu_write +#define __this_cpu_write raw_cpu_write +#endif + #endif /* __ASM_PERCPU_H */ diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 5e28ea6aa097..4ebe9f558f3a 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -135,7 +135,7 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu) if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) return; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; guest_ctxt = &vcpu->arch.ctxt; host_dbg = &vcpu->arch.host_debug_state.regs; guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); @@ -154,7 +154,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) return; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; guest_ctxt = &vcpu->arch.ctxt; host_dbg = &vcpu->arch.host_debug_state.regs; guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 5b6b8fa00f0a..f150407fa798 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -386,7 +386,7 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) !esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) return false; - ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; __ptrauth_save_key(ctxt, APIA); __ptrauth_save_key(ctxt, APIB); __ptrauth_save_key(ctxt, APDA); @@ -495,7 +495,7 @@ static inline void __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu) * guest wants it disabled, so be it... */ if (__needs_ssbd_off(vcpu) && - __hyp_this_cpu_read(arm64_ssbd_callback_required)) + __this_cpu_read(arm64_ssbd_callback_required)) arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 0, NULL); #endif } @@ -507,7 +507,7 @@ static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu) * If the guest has disabled the workaround, bring it back on. 
*/ if (__needs_ssbd_off(vcpu) && - __hyp_this_cpu_read(arm64_ssbd_callback_required)) + __this_cpu_read(arm64_ssbd_callback_required)) arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 1, NULL); #endif } @@ -521,7 +521,7 @@ static inline void __kvm_unexpected_el2_exception(void) entry = hyp_symbol_addr(__start___kvm_ex_table); end = hyp_symbol_addr(__stop___kvm_ex_table); - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; while (entry < end) { addr = (unsigned long)&entry->insn + entry->insn; diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 0970442d2dbc..cc4f8e790fb3 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -175,7 +175,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) vcpu = kern_hyp_va(vcpu); - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index c1da4f86ccac..575e8054f116 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -108,7 +108,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *guest_ctxt; u64 exit_code; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; host_ctxt->__hyp_running_vcpu = vcpu; guest_ctxt = &vcpu->arch.ctxt; diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 996471e4c138..2a0b8c88d74f 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -66,7 +66,7 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; __sysreg_save_user_state(host_ctxt); /* @@ -100,7 +100,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; - host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt; + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; deactivate_traps_vhe_put(); __sysreg_save_el1_state(guest_ctxt); From ea391027d35546d9155f1350123b5af8bddec706 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 22 Sep 2020 21:49:05 +0100 Subject: [PATCH 229/497] kvm: arm64: Remove hyp_adr/ldr_this_cpu The hyp_adr/ldr_this_cpu helpers were introduced for use in hyp code because they always needed to use TPIDR_EL2 for base, while adr/ldr_this_cpu from kernel proper would select between TPIDR_EL2 and _EL1 based on VHE/nVHE. Simplify this now that the hyp mode case can be handled using the __KVM_VHE/NVHE_HYPERVISOR__ macros. 
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Andrew Scull Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200922204910.7265-6-dbrazdil@google.com --- arch/arm64/include/asm/assembler.h | 29 +++++++++++++++++++---------- arch/arm64/include/asm/kvm_asm.h | 14 +------------- arch/arm64/kvm/hyp/hyp-entry.S | 2 +- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 54d181177656..ddbe6bf00e33 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -218,6 +218,23 @@ lr .req x30 // link register str \src, [\tmp, :lo12:\sym] .endm + /* + * @dst: destination register + */ +#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__) + .macro this_cpu_offset, dst + mrs \dst, tpidr_el2 + .endm +#else + .macro this_cpu_offset, dst +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN + mrs \dst, tpidr_el1 +alternative_else + mrs \dst, tpidr_el2 +alternative_endif + .endm +#endif + /* * @dst: Result of per_cpu(sym, smp_processor_id()) (can be SP) * @sym: The name of the per-cpu variable @@ -226,11 +243,7 @@ lr .req x30 // link register .macro adr_this_cpu, dst, sym, tmp adrp \tmp, \sym add \dst, \tmp, #:lo12:\sym -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - mrs \tmp, tpidr_el1 -alternative_else - mrs \tmp, tpidr_el2 -alternative_endif + this_cpu_offset \tmp add \dst, \dst, \tmp .endm @@ -241,11 +254,7 @@ alternative_endif */ .macro ldr_this_cpu dst, sym, tmp adr_l \dst, \sym -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - mrs \tmp, tpidr_el1 -alternative_else - mrs \tmp, tpidr_el2 -alternative_endif + this_cpu_offset \tmp ldr \dst, [\dst, \tmp] .endm diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index c196eec25498..cf9456663289 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -173,20 +173,8 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ]; #else /* __ASSEMBLY__ */ -.macro hyp_adr_this_cpu reg, sym, tmp - adr_l \reg, \sym - mrs \tmp, tpidr_el2 - add \reg, \reg, \tmp -.endm - -.macro hyp_ldr_this_cpu reg, sym, tmp - adr_l \reg, \sym - mrs \tmp, tpidr_el2 - ldr \reg, [\reg, \tmp] -.endm - .macro get_host_ctxt reg, tmp - hyp_adr_this_cpu \reg, kvm_host_data, \tmp + adr_this_cpu \reg, kvm_host_data, \tmp add \reg, \reg, #HOST_DATA_CONTEXT .endm diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 46b4dab933d0..fba91c2ab410 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -132,7 +132,7 @@ alternative_cb_end str x0, [x2, #VCPU_WORKAROUND_FLAGS] /* Check that we actually need to perform the call */ - hyp_ldr_this_cpu x0, arm64_ssbd_callback_required, x2 + ldr_this_cpu x0, arm64_ssbd_callback_required, x2 cbz x0, wa2_end mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2 From 572494995bc3d282336bfd8162741929402910b9 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 22 Sep 2020 21:49:06 +0100 Subject: [PATCH 230/497] kvm: arm64: Add helpers for accessing nVHE hyp per-cpu vars Defining a per-CPU variable in hyp/nvhe will result in its name being prefixed with __kvm_nvhe_. Add helpers for declaring these variables in kernel proper and accessing them with this_cpu_ptr and per_cpu_ptr. 
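As a minimal sketch of the resulting usage (the variable name here is made up; the real users are introduced by the following patches), a per-CPU variable defined in nVHE hyp code keeps its plain DEFINE_PER_CPU spelling, while kernel proper declares and dereferences the prefixed symbol through the new helpers:

    /* In a hyp/nvhe/ source file: linked as __kvm_nvhe_example_count. */
    DEFINE_PER_CPU(u64, example_count);

    /* In kernel proper: */
    DECLARE_KVM_NVHE_PER_CPU(u64, example_count);

    static u64 read_hyp_example_count(int cpu)
    {
            return *per_cpu_ptr_nvhe_sym(example_count, cpu);
    }
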
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200922204910.7265-7-dbrazdil@google.com --- arch/arm64/include/asm/kvm_asm.h | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index cf9456663289..911d91787fa0 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -54,9 +54,21 @@ DECLARE_KVM_VHE_SYM(sym); \ DECLARE_KVM_NVHE_SYM(sym) +#define DECLARE_KVM_VHE_PER_CPU(type, sym) \ + DECLARE_PER_CPU(type, sym) +#define DECLARE_KVM_NVHE_PER_CPU(type, sym) \ + DECLARE_PER_CPU(type, kvm_nvhe_sym(sym)) + +#define DECLARE_KVM_HYP_PER_CPU(type, sym) \ + DECLARE_KVM_VHE_PER_CPU(type, sym); \ + DECLARE_KVM_NVHE_PER_CPU(type, sym) + #define CHOOSE_VHE_SYM(sym) sym #define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym) +#define this_cpu_ptr_nvhe_sym(sym) this_cpu_ptr(&kvm_nvhe_sym(sym)) +#define per_cpu_ptr_nvhe_sym(sym, cpu) per_cpu_ptr(&kvm_nvhe_sym(sym), cpu) + #ifndef __KVM_NVHE_HYPERVISOR__ /* * BIG FAT WARNINGS: @@ -69,12 +81,21 @@ * - Don't let the nVHE hypervisor have access to this, as it will * pick the *wrong* symbol (yes, it runs at EL2...). */ -#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() ? CHOOSE_VHE_SYM(sym) \ +#define CHOOSE_HYP_SYM(sym) (is_kernel_in_hyp_mode() \ + ? CHOOSE_VHE_SYM(sym) \ : CHOOSE_NVHE_SYM(sym)) +#define this_cpu_ptr_hyp_sym(sym) (is_kernel_in_hyp_mode() \ + ? this_cpu_ptr(&sym) \ + : this_cpu_ptr_nvhe_sym(sym)) +#define per_cpu_ptr_hyp_sym(sym, cpu) (is_kernel_in_hyp_mode() \ + ? per_cpu_ptr(&sym, cpu) \ + : per_cpu_ptr_nvhe_sym(sym, cpu)) #else /* The nVHE hypervisor shouldn't even try to access anything */ extern void *__nvhe_undefined_symbol; -#define CHOOSE_HYP_SYM(sym) __nvhe_undefined_symbol +#define CHOOSE_HYP_SYM(sym) __nvhe_undefined_symbol +#define this_cpu_ptr_hyp_sym(sym) (&__nvhe_undefined_symbol) +#define per_cpu_ptr_hyp_sym(sym, cpu) (&__nvhe_undefined_symbol) #endif /* Translate a kernel address @ptr into its equivalent linear mapping */ From df4c8214a18d202fa0ec221a001f640e020f7e44 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 22 Sep 2020 21:49:07 +0100 Subject: [PATCH 231/497] kvm: arm64: Duplicate arm64_ssbd_callback_required for nVHE hyp Hyp keeps track of which cores require SSBD callback by accessing a kernel-proper global variable. Create an nVHE symbol of the same name and copy the value from kernel proper to nVHE as KVM is being enabled on a core. Done in preparation for separating percpu memory owned by kernel proper and nVHE. 
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200922204910.7265-8-dbrazdil@google.com --- arch/arm64/include/asm/kvm_mmu.h | 14 +++++++++++++- arch/arm64/kernel/image-vars.h | 1 - arch/arm64/kvm/arm.c | 3 +++ arch/arm64/kvm/hyp/nvhe/switch.c | 3 +++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 189839c3706a..e134c2ba2c5d 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -529,6 +529,7 @@ static inline int kvm_map_vectors(void) #ifdef CONFIG_ARM64_SSBD DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); +DECLARE_KVM_NVHE_PER_CPU(u64, arm64_ssbd_callback_required); static inline int hyp_map_aux_data(void) { @@ -537,18 +538,29 @@ static inline int hyp_map_aux_data(void) for_each_possible_cpu(cpu) { u64 *ptr; - ptr = per_cpu_ptr(&arm64_ssbd_callback_required, cpu); + ptr = per_cpu_ptr_nvhe_sym(arm64_ssbd_callback_required, cpu); err = create_hyp_mappings(ptr, ptr + 1, PAGE_HYP); if (err) return err; } return 0; } + +static inline void hyp_init_aux_data(void) +{ + u64 *ptr; + + /* Copy arm64_ssbd_callback_required value from kernel to hyp. */ + ptr = this_cpu_ptr_nvhe_sym(arm64_ssbd_callback_required); + *ptr = __this_cpu_read(arm64_ssbd_callback_required); +} #else static inline int hyp_map_aux_data(void) { return 0; } + +static inline void hyp_init_aux_data(void) {} #endif #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 76da2ad1010c..59d12a0b4622 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -67,7 +67,6 @@ KVM_NVHE_ALIAS(kvm_patch_vector_branch); KVM_NVHE_ALIAS(kvm_update_va_mask); /* Global kernel state accessed by nVHE hyp code. */ -KVM_NVHE_ALIAS(arm64_ssbd_callback_required); KVM_NVHE_ALIAS(kvm_host_data); KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 46dc3d75cf13..cbfff390f55f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1295,6 +1295,9 @@ static void cpu_init_hyp_mode(void) arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) { kvm_call_hyp_nvhe(__kvm_enable_ssbs); } + + /* Copy information whether SSBD callback is required to hyp. */ + hyp_init_aux_data(); } static void cpu_hyp_reset(void) diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index cc4f8e790fb3..4662df6330d7 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -27,6 +27,9 @@ #include #include +/* Non-VHE copy of the kernel symbol. */ +DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; From 2a1198c9b436402582f7beed57028044b819329c Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 22 Sep 2020 21:49:08 +0100 Subject: [PATCH 232/497] kvm: arm64: Create separate instances of kvm_host_data for VHE/nVHE Host CPU context is stored in a global per-cpu variable `kvm_host_data`. In preparation for introducing independent per-CPU region for nVHE hyp, create two separate instances of `kvm_host_data`, one for VHE and one for nVHE. 
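A condensed view of the kernel-proper side of this pattern (the hunks below are the authoritative change; this snippet only restates it): each hyp flavour defines its own copy of the variable in its switch.c, and callers resolve whichever copy is live on the current CPU at run time:

    /*
     * Kernel proper no longer names a single instance directly; it picks
     * the VHE or nVHE copy of kvm_host_data for this CPU.
     */
    struct kvm_host_data *host = this_cpu_ptr_hyp_sym(kvm_host_data);
    struct kvm_cpu_context *host_ctxt = &host->host_ctxt;
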
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200922204910.7265-9-dbrazdil@google.com --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kernel/image-vars.h | 1 - arch/arm64/kvm/arm.c | 5 ++--- arch/arm64/kvm/hyp/nvhe/switch.c | 3 +++ arch/arm64/kvm/hyp/vhe/switch.c | 3 +++ arch/arm64/kvm/pmu.c | 8 ++++---- 6 files changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e52c927aade5..964e05777fe3 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -565,7 +565,7 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome); struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); -DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data); +DECLARE_KVM_HYP_PER_CPU(kvm_host_data_t, kvm_host_data); static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) { diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 59d12a0b4622..80da861b8180 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -67,7 +67,6 @@ KVM_NVHE_ALIAS(kvm_patch_vector_branch); KVM_NVHE_ALIAS(kvm_update_va_mask); /* Global kernel state accessed by nVHE hyp code. */ -KVM_NVHE_ALIAS(kvm_host_data); KVM_NVHE_ALIAS(kvm_vgic_global_state); /* Kernel constant needed to compute idmap addresses. */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index cbfff390f55f..a9d6f095031d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -46,7 +46,6 @@ __asm__(".arch_extension virt"); #endif -DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); /* The VMID used in the VTTBR */ @@ -1308,7 +1307,7 @@ static void cpu_hyp_reset(void) static void cpu_hyp_reinit(void) { - kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt); + kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); cpu_hyp_reset(); @@ -1543,7 +1542,7 @@ static int init_hyp_mode(void) for_each_possible_cpu(cpu) { kvm_host_data_t *cpu_data; - cpu_data = per_cpu_ptr(&kvm_host_data, cpu); + cpu_data = per_cpu_ptr_hyp_sym(kvm_host_data, cpu); err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP); if (err) { diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 4662df6330d7..a7e9b03bd9d1 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -30,6 +30,9 @@ /* Non-VHE copy of the kernel symbol. */ DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); +/* Non-VHE instance of kvm_host_data. */ +DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data); + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 575e8054f116..0949fc97bf03 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -28,6 +28,9 @@ const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n"; +/* VHE instance of kvm_host_data. 
*/ +DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data); + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 3c224162b3dd..c869c851d2dd 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -31,7 +31,7 @@ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr) */ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) { - struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data); + struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); if (!kvm_pmu_switch_needed(attr)) return; @@ -47,7 +47,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) */ void kvm_clr_pmu_events(u32 clr) { - struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data); + struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); ctx->pmu_events.events_host &= ~clr; ctx->pmu_events.events_guest &= ~clr; @@ -173,7 +173,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) return; preempt_disable(); - host = this_cpu_ptr(&kvm_host_data); + host = this_cpu_ptr_hyp_sym(kvm_host_data); events_guest = host->pmu_events.events_guest; events_host = host->pmu_events.events_host; @@ -193,7 +193,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) if (!has_vhe()) return; - host = this_cpu_ptr(&kvm_host_data); + host = this_cpu_ptr_hyp_sym(kvm_host_data); events_guest = host->pmu_events.events_guest; events_host = host->pmu_events.events_host; From 30c953911c4370bfb622ee1c2fcc7e78c84df800 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 22 Sep 2020 21:49:09 +0100 Subject: [PATCH 233/497] kvm: arm64: Set up hyp percpu data for nVHE Add hyp percpu section to linker script and rename the corresponding ELF sections of hyp/nvhe object files. This moves all nVHE-specific percpu variables to the new hyp percpu section. Allocate sufficient amount of memory for all percpu hyp regions at global KVM init time and create corresponding hyp mappings. The base addresses of hyp percpu regions are kept in a dynamically allocated array in the kernel. Add NULL checks in PMU event-reset code as it may run before KVM memory is initialized. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200922204910.7265-10-dbrazdil@google.com --- arch/arm64/include/asm/kvm_asm.h | 19 +++++++++-- arch/arm64/kernel/vmlinux.lds.S | 8 +++++ arch/arm64/kvm/arm.c | 55 +++++++++++++++++++++++++++++-- arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 6 ++++ arch/arm64/kvm/pmu.c | 5 ++- 5 files changed, 87 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 911d91787fa0..863f669d4dc8 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -66,8 +66,19 @@ #define CHOOSE_VHE_SYM(sym) sym #define CHOOSE_NVHE_SYM(sym) kvm_nvhe_sym(sym) -#define this_cpu_ptr_nvhe_sym(sym) this_cpu_ptr(&kvm_nvhe_sym(sym)) -#define per_cpu_ptr_nvhe_sym(sym, cpu) per_cpu_ptr(&kvm_nvhe_sym(sym), cpu) +/* + * Compute pointer to a symbol defined in nVHE percpu region. + * Returns NULL if percpu memory has not been allocated yet. + */ +#define this_cpu_ptr_nvhe_sym(sym) per_cpu_ptr_nvhe_sym(sym, smp_processor_id()) +#define per_cpu_ptr_nvhe_sym(sym, cpu) \ + ({ \ + unsigned long base, off; \ + base = kvm_arm_hyp_percpu_base[cpu]; \ + off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \ + (unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \ + base ? 
(typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \ + }) #ifndef __KVM_NVHE_HYPERVISOR__ /* @@ -117,6 +128,10 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector); #define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) #define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) +extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; +DECLARE_KVM_NVHE_SYM(__per_cpu_start); +DECLARE_KVM_NVHE_SYM(__per_cpu_end); + #ifdef CONFIG_KVM_INDIRECT_VECTORS extern atomic_t arm64_el2_vector_last_slot; DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index d14166012e51..d52e6b5dbfd3 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -28,8 +28,15 @@ jiffies = jiffies_64; __start___kvm_ex_table = .; \ *(__kvm_ex_table) \ __stop___kvm_ex_table = .; + +#define HYPERVISOR_PERCPU_SECTION \ + . = ALIGN(PAGE_SIZE); \ + HYP_SECTION_NAME(.data..percpu) : { \ + *(HYP_SECTION_NAME(.data..percpu)) \ + } #else /* CONFIG_KVM */ #define HYPERVISOR_EXTABLE +#define HYPERVISOR_PERCPU_SECTION #endif #define HYPERVISOR_TEXT \ @@ -195,6 +202,7 @@ SECTIONS } PERCPU_SECTION(L1_CACHE_BYTES) + HYPERVISOR_PERCPU_SECTION .rela.dyn : ALIGN(8) { *(.rela .rela*) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a9d6f095031d..35bed6b6d0a6 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -47,6 +47,7 @@ __asm__(".arch_extension virt"); #endif static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); +unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; /* The VMID used in the VTTBR */ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); @@ -1255,6 +1256,19 @@ long kvm_arch_vm_ioctl(struct file *filp, } } +static unsigned long nvhe_percpu_size(void) +{ + return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) - + (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start); +} + +static unsigned long nvhe_percpu_order(void) +{ + unsigned long size = nvhe_percpu_size(); + + return size ? get_order(size) : 0; +} + static void cpu_init_hyp_mode(void) { phys_addr_t pgd_ptr; @@ -1270,8 +1284,8 @@ static void cpu_init_hyp_mode(void) * kernel's mapping to the linear mapping, and store it in tpidr_el2 * so that we can use adr_l to access per-cpu variables in EL2. */ - tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) - - (unsigned long)kvm_ksym_ref(&kvm_host_data)); + tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) - + (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); pgd_ptr = kvm_mmu_get_httbr(); hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE; @@ -1461,8 +1475,10 @@ static void teardown_hyp_mode(void) int cpu; free_hyp_pgds(); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); + free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order()); + } } /** @@ -1495,6 +1511,24 @@ static int init_hyp_mode(void) per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; } + /* + * Allocate and initialize pages for Hypervisor-mode percpu regions. 
+ */ + for_each_possible_cpu(cpu) { + struct page *page; + void *page_addr; + + page = alloc_pages(GFP_KERNEL, nvhe_percpu_order()); + if (!page) { + err = -ENOMEM; + goto out_err; + } + + page_addr = page_address(page); + memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size()); + kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr; + } + /* * Map the Hyp-code called directly from the host */ @@ -1539,6 +1573,21 @@ static int init_hyp_mode(void) } } + /* + * Map Hyp percpu pages + */ + for_each_possible_cpu(cpu) { + char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu]; + char *percpu_end = percpu_begin + nvhe_percpu_size(); + + err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP); + + if (err) { + kvm_err("Cannot map hyp percpu region\n"); + goto out_err; + } + } + for_each_possible_cpu(cpu) { kvm_host_data_t *cpu_data; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S index 3b13d1c7cd1a..bb2d986ff696 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -7,7 +7,13 @@ */ #include +#include +#include +#include SECTIONS { HYP_SECTION(.text) + HYP_SECTION_NAME(.data..percpu) : { + PERCPU_INPUT(L1_CACHE_BYTES) + } } diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index c869c851d2dd..faf32a44ba04 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -33,7 +33,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) { struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); - if (!kvm_pmu_switch_needed(attr)) + if (!ctx || !kvm_pmu_switch_needed(attr)) return; if (!attr->exclude_host) @@ -49,6 +49,9 @@ void kvm_clr_pmu_events(u32 clr) { struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); + if (!ctx) + return; + ctx->pmu_events.events_host &= ~clr; ctx->pmu_events.events_guest &= ~clr; } From a3bb9c3a00551726590137e3974495ce6cf6b758 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 22 Sep 2020 21:49:10 +0100 Subject: [PATCH 234/497] kvm: arm64: Remove unnecessary hyp mappings With all nVHE per-CPU variables being part of the hyp per-CPU region, mapping them individual is not necessary any longer. They are mapped to hyp as part of the overall per-CPU region. 
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Andrew Scull Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200922204910.7265-11-dbrazdil@google.com --- arch/arm64/include/asm/kvm_mmu.h | 20 -------------------- arch/arm64/kvm/arm.c | 16 ---------------- 2 files changed, 36 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index e134c2ba2c5d..8e7919801196 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -531,21 +531,6 @@ static inline int kvm_map_vectors(void) DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); DECLARE_KVM_NVHE_PER_CPU(u64, arm64_ssbd_callback_required); -static inline int hyp_map_aux_data(void) -{ - int cpu, err; - - for_each_possible_cpu(cpu) { - u64 *ptr; - - ptr = per_cpu_ptr_nvhe_sym(arm64_ssbd_callback_required, cpu); - err = create_hyp_mappings(ptr, ptr + 1, PAGE_HYP); - if (err) - return err; - } - return 0; -} - static inline void hyp_init_aux_data(void) { u64 *ptr; @@ -555,11 +540,6 @@ static inline void hyp_init_aux_data(void) *ptr = __this_cpu_read(arm64_ssbd_callback_required); } #else -static inline int hyp_map_aux_data(void) -{ - return 0; -} - static inline void hyp_init_aux_data(void) {} #endif diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 35bed6b6d0a6..92c88deea357 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1588,22 +1588,6 @@ static int init_hyp_mode(void) } } - for_each_possible_cpu(cpu) { - kvm_host_data_t *cpu_data; - - cpu_data = per_cpu_ptr_hyp_sym(kvm_host_data, cpu); - err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP); - - if (err) { - kvm_err("Cannot map host CPU state: %d\n", err); - goto out_err; - } - } - - err = hyp_map_aux_data(); - if (err) - kvm_err("Cannot map host auxiliary data: %d\n", err); - return 0; out_err: From 3e98fd6d816cd82f529345870efd435f06e02803 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 25 Sep 2020 09:26:04 -0700 Subject: [PATCH 235/497] ARM: dts: cros-ec-keyboard: Add alternate keymap for KEY_LEFTMETA On newer keyboards this key is in a different place. Add both options to the keymap so that both new and old keyboards work. Cc: Douglas Anderson Signed-off-by: Stephen Boyd Reviewed-by: Douglas Anderson Reviewed-by: Heiko Stuebner Reviewed-by: Matthias Brugger Signed-off-by: Enric Balletbo i Serra --- arch/arm/boot/dts/cros-ec-keyboard.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/cros-ec-keyboard.dtsi b/arch/arm/boot/dts/cros-ec-keyboard.dtsi index 4a0c1037fbc0..165c5bcd510e 100644 --- a/arch/arm/boot/dts/cros-ec-keyboard.dtsi +++ b/arch/arm/boot/dts/cros-ec-keyboard.dtsi @@ -46,6 +46,7 @@ MATRIX_KEY(0x02, 0x09, KEY_F8) MATRIX_KEY(0x02, 0x0a, KEY_YEN) + MATRIX_KEY(0x03, 0x00, KEY_LEFTMETA) MATRIX_KEY(0x03, 0x01, KEY_GRAVE) MATRIX_KEY(0x03, 0x02, KEY_F2) MATRIX_KEY(0x03, 0x03, KEY_5) From 030bdf3698b7c3af190dd1fe714f0545f23441d0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 2 Oct 2020 07:49:46 +0200 Subject: [PATCH 236/497] KVM: arm64: Fix some documentation build warnings As warned with make htmldocs: .../Documentation/virt/kvm/devices/vcpu.rst:70: WARNING: Malformed table. Text in column margin in table line 2. 
======= ====================================================== -ENODEV: PMUv3 not supported or GIC not initialized -ENXIO: PMUv3 not properly configured or in-kernel irqchip not configured as required prior to calling this attribute -EBUSY: PMUv3 already initialized -EINVAL: Invalid filter range ======= ====================================================== The ':' character for two lines are above the size of the column. Besides that, other tables at the file doesn't use ':', so just drop them. While here, also fix this warning also introduced at the same patch: .../Documentation/virt/kvm/devices/vcpu.rst:88: WARNING: Block quote ends without a blank line; unexpected unindent. By marking the C code as a literal block. Fixes: 8be86a5eec04 ("KVM: arm64: Document PMU filtering API") Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Marc Zyngier Acked-by: Paolo Bonzini Link: https://lore.kernel.org/r/b5385dd0213f1f070667925bf7a807bf5270ba78.1601616399.git.mchehab+huawei@kernel.org --- Documentation/virt/kvm/devices/vcpu.rst | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index da7c2ef7dafc..2acec3b9ef65 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -67,25 +67,25 @@ irqchip. :Returns: ======= ====================================================== - -ENODEV: PMUv3 not supported or GIC not initialized - -ENXIO: PMUv3 not properly configured or in-kernel irqchip not + -ENODEV PMUv3 not supported or GIC not initialized + -ENXIO PMUv3 not properly configured or in-kernel irqchip not configured as required prior to calling this attribute - -EBUSY: PMUv3 already initialized - -EINVAL: Invalid filter range + -EBUSY PMUv3 already initialized + -EINVAL Invalid filter range ======= ====================================================== -Request the installation of a PMU event filter described as follows: +Request the installation of a PMU event filter described as follows:: -struct kvm_pmu_event_filter { - __u16 base_event; - __u16 nevents; + struct kvm_pmu_event_filter { + __u16 base_event; + __u16 nevents; -#define KVM_PMU_EVENT_ALLOW 0 -#define KVM_PMU_EVENT_DENY 1 + #define KVM_PMU_EVENT_ALLOW 0 + #define KVM_PMU_EVENT_DENY 1 - __u8 action; - __u8 pad[3]; -}; + __u8 action; + __u8 pad[3]; + }; A filter range is defined as the range [@base_event, @base_event + @nevents), together with an @action (KVM_PMU_EVENT_ALLOW or KVM_PMU_EVENT_DENY). The From b259d137e91d80bf92eac453ffab179eb7941ede Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Sep 2020 14:18:01 +0100 Subject: [PATCH 237/497] KVM: arm64: Pass level hint to TLBI during stage-2 permission fault Alex pointed out that we don't pass a level hint to the TLBI instruction when handling a stage-2 permission fault, even though the walker does at some point have the level information in its hands. Rework stage2_update_leaf_attrs() so that it can optionally return the level of the updated pte to its caller, which can in turn be used to provide the correct TLBI level hint. 
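Condensed, the caller-side pattern described above looks like this (taken from the kvm_pgtable_stage2_relax_perms() hunk below; callers that do not need the level keep passing NULL, and the level is only used when the walk succeeded):

    u32 level;
    int ret;

    ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level);
    if (!ret)
            kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
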
Reported-by: Alexandru Elisei Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Alexandru Elisei Reviewed-by: Gavin Shan Cc: Marc Zyngier Link: https://lore.kernel.org/r/595cc73e-636e-8b3a-f93a-b4e9fb218db8@arm.com Link: https://lore.kernel.org/r/20200930131801.16889-1-will@kernel.org --- arch/arm64/kvm/hyp/pgtable.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 603d6b415337..0cdf6e461cbd 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -694,6 +694,7 @@ struct stage2_attr_data { kvm_pte_t attr_set; kvm_pte_t attr_clr; kvm_pte_t pte; + u32 level; }; static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, @@ -706,6 +707,7 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, if (!kvm_pte_valid(pte)) return 0; + data->level = level; data->pte = pte; pte &= ~data->attr_clr; pte |= data->attr_set; @@ -723,7 +725,8 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, u64 size, kvm_pte_t attr_set, - kvm_pte_t attr_clr, kvm_pte_t *orig_pte) + kvm_pte_t attr_clr, kvm_pte_t *orig_pte, + u32 *level) { int ret; kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; @@ -743,20 +746,24 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, if (orig_pte) *orig_pte = data.pte; + + if (level) + *level = data.level; return 0; } int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) { return stage2_update_leaf_attrs(pgt, addr, size, 0, - KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, NULL); + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, + NULL, NULL); } kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) { kvm_pte_t pte = 0; stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, - &pte); + &pte, NULL); dsb(ishst); return pte; } @@ -765,7 +772,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr) { kvm_pte_t pte = 0; stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF, - &pte); + &pte, NULL); /* * "But where's the TLBI?!", you scream. * "Over in the core code", I sigh. 
@@ -778,7 +785,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr) bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr) { kvm_pte_t pte = 0; - stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte); + stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL); return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF; } @@ -786,6 +793,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot) { int ret; + u32 level; kvm_pte_t set = 0, clr = 0; if (prot & KVM_PGTABLE_PROT_R) @@ -797,8 +805,9 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_X) clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; - ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL); - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, 0); + ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level); + if (!ret) + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level); return ret; } From ffd1b63a5860968068e943eab33383a766d30f64 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Sep 2020 11:24:42 +0100 Subject: [PATCH 238/497] KVM: arm64: Ensure user_mem_abort() return value is initialised If a change in the MMU notifier sequence number forces user_mem_abort() to return early when attempting to handle a stage-2 fault, we return uninitialised stack to kvm_handle_guest_abort(), which could potentially result in the injection of an external abort into the guest or a spurious return to userspace. Neither of these are what we want to do. Initialise 'ret' to 0 in user_mem_abort() so that bailing due to a change in the MMU notifier sequence number is treated as though the fault was handled. Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Reviewed-by: Alexandru Elisei Reviewed-by: Gavin Shan Cc: Gavin Shan Cc: Alexandru Elisei Link: https://lore.kernel.org/r/20200930102442.16142-1-will@kernel.org --- arch/arm64/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c5c26a9cb85b..a816cb8e619b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -742,7 +742,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) { - int ret; + int ret = 0; bool write_fault, writable, force_pte = false; bool exec_fault; bool device = false; From 8f445ffa851e999308bbd4d133ea849936f95179 Mon Sep 17 00:00:00 2001 From: Michael Srba Date: Sun, 4 Oct 2020 14:37:18 -0700 Subject: [PATCH 239/497] dt-bindings: input/touchscreen: add bindings for zinitix This patch adds dts bindings for the zinitix bt541 touchscreen.
Signed-off-by: Michael Srba Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20201001122949.16846-2-michael.srba@seznam.cz Signed-off-by: Dmitry Torokhov --- .../bindings/input/touchscreen/zinitix.txt | 40 +++++++++++++++++++ .../devicetree/bindings/vendor-prefixes.yaml | 2 + 2 files changed, 42 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/touchscreen/zinitix.txt diff --git a/Documentation/devicetree/bindings/input/touchscreen/zinitix.txt b/Documentation/devicetree/bindings/input/touchscreen/zinitix.txt new file mode 100644 index 000000000000..446efb9f5f55 --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/zinitix.txt @@ -0,0 +1,40 @@ +Device tree bindings for Zinitix BT541 touchscreen controller + +Required properties: + + - compatible : Should be "zinitix,bt541" + - reg : I2C address of the chip. Should be 0x20 + - interrupts : Interrupt to which the chip is connected + +Optional properties: + + - vdd-supply : Analog power supply regulator on VCCA pin + - vddo-supply : Digital power supply regulator on VDD pin + - zinitix,mode : Mode of reporting touch points. Some modes may not work + with a particular ts firmware for unknown reasons. Available + modes are 1 and 2. Mode 2 is the default and preferred. + +The touchscreen-* properties are documented in touchscreen.txt in this +directory. + +Example: + + i2c@00000000 { + /* ... */ + + bt541@20 { + compatible = "zinitix,bt541"; + reg = <0x20>; + interrupt-parent = <&msmgpio>; + interrupts = <13 IRQ_TYPE_EDGE_FALLING>; + pinctrl-names = "default"; + pinctrl-0 = <&tsp_default>; + vdd-supply = <&reg_vdd_tsp>; + vddo-supply = <&pm8916_l6>; + touchscreen-size-x = <540>; + touchscreen-size-y = <960>; + zinitix,mode = <2>; + }; + + /* ... */ + }; diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index d3891386d671..bd7b2d404124 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -1143,6 +1143,8 @@ patternProperties: description: Shenzhen Zidoo Technology Co., Ltd. "^zii,.*": description: Zodiac Inflight Innovations + "^zinitix,.*": + description: Zinitix Co., Ltd "^zte,.*": description: ZTE Corp. "^zyxel,.*": From 26822652c85eff14e40115255727b2693400c524 Mon Sep 17 00:00:00 2001 From: Michael Srba Date: Sun, 4 Oct 2020 14:37:47 -0700 Subject: [PATCH 240/497] Input: add zinitix touchscreen driver Add support for the bt541 touchscreen IC from zinitix, loosely based on a downstream driver. The driver currently supports multitouch (5 touch points). The bt541 seems to support touch keys, but the support was not added because that functionality is not being utilized by the touchscreen used for testing. Based on the similarities between downstream drivers, it seems likely that other similar touchscreen ICs can be supported with this driver in the future.
Signed-off-by: Michael Srba Link: https://lore.kernel.org/r/20201001122949.16846-1-michael.srba@seznam.cz Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/Kconfig | 12 + drivers/input/touchscreen/Makefile | 1 + drivers/input/touchscreen/zinitix.c | 581 ++++++++++++++++++++++++++++ 3 files changed, 594 insertions(+) create mode 100644 drivers/input/touchscreen/zinitix.c diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index 35c867b2d9a7..f012fe746df0 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -1322,4 +1322,16 @@ config TOUCHSCREEN_IQS5XX To compile this driver as a module, choose M here: the module will be called iqs5xx. +config TOUCHSCREEN_ZINITIX + tristate "Zinitix touchscreen support" + depends on I2C + help + Say Y here if you have a touchscreen using Zinitix bt541, + or something similar enough. + + If unsure, say N. + + To compile this driver as a module, choose M here: the + module will be called zinitix. + endif diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile index 30d1e1b42492..6233541e9173 100644 --- a/drivers/input/touchscreen/Makefile +++ b/drivers/input/touchscreen/Makefile @@ -111,3 +111,4 @@ obj-$(CONFIG_TOUCHSCREEN_COLIBRI_VF50) += colibri-vf50-ts.o obj-$(CONFIG_TOUCHSCREEN_ROHM_BU21023) += rohm_bu21023.o obj-$(CONFIG_TOUCHSCREEN_RASPBERRYPI_FW) += raspberrypi-ts.o obj-$(CONFIG_TOUCHSCREEN_IQS5XX) += iqs5xx.o +obj-$(CONFIG_TOUCHSCREEN_ZINITIX) += zinitix.o diff --git a/drivers/input/touchscreen/zinitix.c b/drivers/input/touchscreen/zinitix.c new file mode 100644 index 000000000000..1acc2eb2bcb3 --- /dev/null +++ b/drivers/input/touchscreen/zinitix.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Register Map */ + +#define BT541_SWRESET_CMD 0x0000 +#define BT541_WAKEUP_CMD 0x0001 + +#define BT541_IDLE_CMD 0x0004 +#define BT541_SLEEP_CMD 0x0005 + +#define BT541_CLEAR_INT_STATUS_CMD 0x0003 +#define BT541_CALIBRATE_CMD 0x0006 +#define BT541_SAVE_STATUS_CMD 0x0007 +#define BT541_SAVE_CALIBRATION_CMD 0x0008 +#define BT541_RECALL_FACTORY_CMD 0x000f + +#define BT541_THRESHOLD 0x0020 + +#define BT541_LARGE_PALM_REJECT_AREA_TH 0x003F + +#define BT541_DEBUG_REG 0x0115 /* 0~7 */ + +#define BT541_TOUCH_MODE 0x0010 +#define BT541_CHIP_REVISION 0x0011 +#define BT541_FIRMWARE_VERSION 0x0012 + +#define ZINITIX_USB_DETECT 0x116 + +#define BT541_MINOR_FW_VERSION 0x0121 + +#define BT541_VENDOR_ID 0x001C +#define BT541_HW_ID 0x0014 + +#define BT541_DATA_VERSION_REG 0x0013 +#define BT541_SUPPORTED_FINGER_NUM 0x0015 +#define BT541_EEPROM_INFO 0x0018 +#define BT541_INITIAL_TOUCH_MODE 0x0019 + +#define BT541_TOTAL_NUMBER_OF_X 0x0060 +#define BT541_TOTAL_NUMBER_OF_Y 0x0061 + +#define BT541_DELAY_RAW_FOR_HOST 0x007f + +#define BT541_BUTTON_SUPPORTED_NUM 0x00B0 +#define BT541_BUTTON_SENSITIVITY 0x00B2 +#define BT541_DUMMY_BUTTON_SENSITIVITY 0X00C8 + +#define BT541_X_RESOLUTION 0x00C0 +#define BT541_Y_RESOLUTION 0x00C1 + +#define BT541_POINT_STATUS_REG 0x0080 +#define BT541_ICON_STATUS_REG 0x00AA + +#define BT541_POINT_COORD_REG (BT541_POINT_STATUS_REG + 2) + +#define BT541_AFE_FREQUENCY 0x0100 +#define BT541_DND_N_COUNT 0x0122 +#define BT541_DND_U_COUNT 0x0135 + +#define BT541_RAWDATA_REG 0x0200 + +#define BT541_EEPROM_INFO_REG 0x0018 + +#define BT541_INT_ENABLE_FLAG 0x00f0 +#define BT541_PERIODICAL_INTERRUPT_INTERVAL 0x00f1 + +#define 
BT541_BTN_WIDTH 0x016d + +#define BT541_CHECKSUM_RESULT 0x012c + +#define BT541_INIT_FLASH 0x01d0 +#define BT541_WRITE_FLASH 0x01d1 +#define BT541_READ_FLASH 0x01d2 + +#define ZINITIX_INTERNAL_FLAG_02 0x011e +#define ZINITIX_INTERNAL_FLAG_03 0x011f + +#define ZINITIX_I2C_CHECKSUM_WCNT 0x016a +#define ZINITIX_I2C_CHECKSUM_RESULT 0x016c + +/* Interrupt & status register flags */ + +#define BIT_PT_CNT_CHANGE BIT(0) +#define BIT_DOWN BIT(1) +#define BIT_MOVE BIT(2) +#define BIT_UP BIT(3) +#define BIT_PALM BIT(4) +#define BIT_PALM_REJECT BIT(5) +#define BIT_RESERVED_0 BIT(6) +#define BIT_RESERVED_1 BIT(7) +#define BIT_WEIGHT_CHANGE BIT(8) +#define BIT_PT_NO_CHANGE BIT(9) +#define BIT_REJECT BIT(10) +#define BIT_PT_EXIST BIT(11) +#define BIT_RESERVED_2 BIT(12) +#define BIT_ERROR BIT(13) +#define BIT_DEBUG BIT(14) +#define BIT_ICON_EVENT BIT(15) + +#define SUB_BIT_EXIST BIT(0) +#define SUB_BIT_DOWN BIT(1) +#define SUB_BIT_MOVE BIT(2) +#define SUB_BIT_UP BIT(3) +#define SUB_BIT_UPDATE BIT(4) +#define SUB_BIT_WAIT BIT(5) + +#define DEFAULT_TOUCH_POINT_MODE 2 +#define MAX_SUPPORTED_FINGER_NUM 5 + +#define CHIP_ON_DELAY 15 // ms +#define FIRMWARE_ON_DELAY 40 // ms + +struct point_coord { + __le16 x; + __le16 y; + u8 width; + u8 sub_status; + // currently unused, but needed as padding: + u8 minor_width; + u8 angle; +}; + +struct touch_event { + __le16 status; + u8 finger_cnt; + u8 time_stamp; + struct point_coord point_coord[MAX_SUPPORTED_FINGER_NUM]; +}; + +struct bt541_ts_data { + struct i2c_client *client; + struct input_dev *input_dev; + struct touchscreen_properties prop; + struct regulator_bulk_data supplies[2]; + u32 zinitix_mode; +}; + +static int zinitix_read_data(struct i2c_client *client, + u16 reg, void *values, size_t length) +{ + __le16 reg_le = cpu_to_le16(reg); + int ret; + + /* A single i2c_transfer() transaction does not work here. */ + ret = i2c_master_send(client, (u8 *)®_le, sizeof(reg_le)); + if (ret != sizeof(reg_le)) + return ret < 0 ? ret : -EIO; + + ret = i2c_master_recv(client, (u8 *)values, length); + if (ret != length) + return ret < 0 ? ret : -EIO; ; + + return 0; +} + +static int zinitix_write_u16(struct i2c_client *client, u16 reg, u16 value) +{ + __le16 packet[2] = {cpu_to_le16(reg), cpu_to_le16(value)}; + int ret; + + ret = i2c_master_send(client, (u8 *)packet, sizeof(packet)); + if (ret != sizeof(packet)) + return ret < 0 ? ret : -EIO; + + return 0; +} + +static int zinitix_write_cmd(struct i2c_client *client, u16 reg) +{ + __le16 reg_le = cpu_to_le16(reg); + int ret; + + ret = i2c_master_send(client, (u8 *)®_le, sizeof(reg_le)); + if (ret != sizeof(reg_le)) + return ret < 0 ? 
ret : -EIO; + + return 0; +} + +static bool zinitix_init_touch(struct bt541_ts_data *bt541) +{ + struct i2c_client *client = bt541->client; + int i; + int error; + + error = zinitix_write_cmd(client, BT541_SWRESET_CMD); + if (error) { + dev_err(&client->dev, "Failed to write reset command\n"); + return error; + } + + error = zinitix_write_u16(client, BT541_INT_ENABLE_FLAG, 0x0); + if (error) { + dev_err(&client->dev, + "Failed to reset interrupt enable flag\n"); + return error; + } + + /* initialize */ + error = zinitix_write_u16(client, BT541_X_RESOLUTION, + bt541->prop.max_x); + if (error) + return error; + + error = zinitix_write_u16(client, BT541_Y_RESOLUTION, + bt541->prop.max_y); + if (error) + return error; + + error = zinitix_write_u16(client, BT541_SUPPORTED_FINGER_NUM, + MAX_SUPPORTED_FINGER_NUM); + if (error) + return error; + + error = zinitix_write_u16(client, BT541_INITIAL_TOUCH_MODE, + bt541->zinitix_mode); + if (error) + return error; + + error = zinitix_write_u16(client, BT541_TOUCH_MODE, + bt541->zinitix_mode); + if (error) + return error; + + error = zinitix_write_u16(client, BT541_INT_ENABLE_FLAG, + BIT_PT_CNT_CHANGE | BIT_DOWN | BIT_MOVE | + BIT_UP); + if (error) + return error; + + /* clear queue */ + for (i = 0; i < 10; i++) { + zinitix_write_cmd(client, BT541_CLEAR_INT_STATUS_CMD); + udelay(10); + } + + return 0; +} + +static int zinitix_init_regulators(struct bt541_ts_data *bt541) +{ + struct i2c_client *client = bt541->client; + int error; + + bt541->supplies[0].supply = "vdd"; + bt541->supplies[1].supply = "vddo"; + error = devm_regulator_bulk_get(&client->dev, + ARRAY_SIZE(bt541->supplies), + bt541->supplies); + if (error < 0) { + dev_err(&client->dev, "Failed to get regulators: %d\n", error); + return error; + } + + return 0; +} + +static int zinitix_send_power_on_sequence(struct bt541_ts_data *bt541) +{ + int error; + struct i2c_client *client = bt541->client; + + error = zinitix_write_u16(client, 0xc000, 0x0001); + if (error) { + dev_err(&client->dev, + "Failed to send power sequence(vendor cmd enable)\n"); + return error; + } + udelay(10); + + error = zinitix_write_cmd(client, 0xc004); + if (error) { + dev_err(&client->dev, + "Failed to send power sequence (intn clear)\n"); + return error; + } + udelay(10); + + error = zinitix_write_u16(client, 0xc002, 0x0001); + if (error) { + dev_err(&client->dev, + "Failed to send power sequence (nvm init)\n"); + return error; + } + mdelay(2); + + error = zinitix_write_u16(client, 0xc001, 0x0001); + if (error) { + dev_err(&client->dev, + "Failed to send power sequence (program start)\n"); + return error; + } + msleep(FIRMWARE_ON_DELAY); + + return 0; +} + +static void zinitix_report_finger(struct bt541_ts_data *bt541, int slot, + const struct point_coord *p) +{ + input_mt_slot(bt541->input_dev, slot); + input_mt_report_slot_state(bt541->input_dev, MT_TOOL_FINGER, true); + touchscreen_report_pos(bt541->input_dev, &bt541->prop, + le16_to_cpu(p->x), le16_to_cpu(p->y), true); + input_report_abs(bt541->input_dev, ABS_MT_TOUCH_MAJOR, p->width); +} + +static irqreturn_t zinitix_ts_irq_handler(int irq, void *bt541_handler) +{ + struct bt541_ts_data *bt541 = bt541_handler; + struct i2c_client *client = bt541->client; + struct touch_event touch_event; + int error; + int i; + + memset(&touch_event, 0, sizeof(struct touch_event)); + + error = zinitix_read_data(bt541->client, BT541_POINT_STATUS_REG, + &touch_event, sizeof(struct touch_event)); + if (error) { + dev_err(&client->dev, "Failed to read in touchpoint struct\n"); + goto out; 
+ } + + for (i = 0; i < MAX_SUPPORTED_FINGER_NUM; i++) + if (touch_event.point_coord[i].sub_status & SUB_BIT_EXIST) + zinitix_report_finger(bt541, i, + &touch_event.point_coord[i]); + + input_mt_sync_frame(bt541->input_dev); + input_sync(bt541->input_dev); + +out: + zinitix_write_cmd(bt541->client, BT541_CLEAR_INT_STATUS_CMD); + return IRQ_HANDLED; +} + +static int zinitix_start(struct bt541_ts_data *bt541) +{ + int error; + + error = regulator_bulk_enable(ARRAY_SIZE(bt541->supplies), + bt541->supplies); + if (error) { + dev_err(&bt541->client->dev, + "Failed to enable regulators: %d\n", error); + return error; + } + + msleep(CHIP_ON_DELAY); + + error = zinitix_send_power_on_sequence(bt541); + if (error) { + dev_err(&bt541->client->dev, + "Error while sending power-on sequence: %d\n", error); + return error; + } + + error = zinitix_init_touch(bt541); + if (error) { + dev_err(&bt541->client->dev, + "Error while configuring touch IC\n"); + return error; + } + + enable_irq(bt541->client->irq); + + return 0; +} + +static int zinitix_stop(struct bt541_ts_data *bt541) +{ + int error; + + disable_irq(bt541->client->irq); + + error = regulator_bulk_disable(ARRAY_SIZE(bt541->supplies), + bt541->supplies); + if (error) { + dev_err(&bt541->client->dev, + "Failed to disable regulators: %d\n", error); + return error; + } + + return 0; +} + +static int zinitix_input_open(struct input_dev *dev) +{ + struct bt541_ts_data *bt541 = input_get_drvdata(dev); + + return zinitix_start(bt541); +} + +static void zinitix_input_close(struct input_dev *dev) +{ + struct bt541_ts_data *bt541 = input_get_drvdata(dev); + + zinitix_stop(bt541); +} + +static int zinitix_init_input_dev(struct bt541_ts_data *bt541) +{ + struct input_dev *input_dev; + int error; + + input_dev = devm_input_allocate_device(&bt541->client->dev); + if (!input_dev) { + dev_err(&bt541->client->dev, + "Failed to allocate input device."); + return -ENOMEM; + } + + input_set_drvdata(input_dev, bt541); + bt541->input_dev = input_dev; + + input_dev->name = "Zinitix Capacitive TouchScreen"; + input_dev->phys = "input/ts"; + input_dev->id.bustype = BUS_I2C; + input_dev->open = zinitix_input_open; + input_dev->close = zinitix_input_close; + + input_set_capability(input_dev, EV_ABS, ABS_MT_POSITION_X); + input_set_capability(input_dev, EV_ABS, ABS_MT_POSITION_Y); + input_set_abs_params(input_dev, ABS_MT_WIDTH_MAJOR, 0, 255, 0, 0); + input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR, 0, 255, 0, 0); + + touchscreen_parse_properties(input_dev, true, &bt541->prop); + if (!bt541->prop.max_x || !bt541->prop.max_y) { + dev_err(&bt541->client->dev, + "Touchscreen-size-x and/or touchscreen-size-y not set in dts\n"); + return -EINVAL; + } + + error = input_mt_init_slots(input_dev, MAX_SUPPORTED_FINGER_NUM, + INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED); + if (error) { + dev_err(&bt541->client->dev, + "Failed to initialize MT slots: %d", error); + return error; + } + + error = input_register_device(input_dev); + if (error) { + dev_err(&bt541->client->dev, + "Failed to register input device: %d", error); + return error; + } + + return 0; +} + +static int zinitix_ts_probe(struct i2c_client *client) +{ + struct bt541_ts_data *bt541; + int error; + + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { + dev_err(&client->dev, + "Failed to assert adapter's support for plain I2C.\n"); + return -ENXIO; + } + + bt541 = devm_kzalloc(&client->dev, sizeof(*bt541), GFP_KERNEL); + if (!bt541) + return -ENOMEM; + + bt541->client = client; + i2c_set_clientdata(client, bt541); 
+ + error = zinitix_init_regulators(bt541); + if (error) { + dev_err(&client->dev, + "Failed to initialize regulators: %d\n", error); + return error; + } + + error = zinitix_init_input_dev(bt541); + if (error) { + dev_err(&client->dev, + "Failed to initialize input device: %d\n", error); + return error; + } + + error = device_property_read_u32(&client->dev, "zinitix,mode", + &bt541->zinitix_mode); + if (error < 0) { + /* fall back to mode 2 */ + bt541->zinitix_mode = DEFAULT_TOUCH_POINT_MODE; + } + + if (bt541->zinitix_mode != 2) { + /* + * If there are devices that don't support mode 2, support + * for other modes (0, 1) will be needed. + */ + dev_err(&client->dev, + "Malformed zinitix,mode property, must be 2 (supplied: %d)\n", + bt541->zinitix_mode); + return -EINVAL; + } + + irq_set_status_flags(client->irq, IRQ_NOAUTOEN); + error = devm_request_threaded_irq(&client->dev, client->irq, + NULL, zinitix_ts_irq_handler, + IRQF_ONESHOT, client->name, bt541); + if (error) { + dev_err(&client->dev, "Failed to request IRQ: %d\n", error); + return error; + } + + return 0; +} + +static int __maybe_unused zinitix_suspend(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct bt541_ts_data *bt541 = i2c_get_clientdata(client); + + mutex_lock(&bt541->input_dev->mutex); + + if (bt541->input_dev->users) + zinitix_stop(bt541); + + mutex_unlock(&bt541->input_dev->mutex); + + return 0; +} + +static int __maybe_unused zinitix_resume(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct bt541_ts_data *bt541 = i2c_get_clientdata(client); + int ret = 0; + + mutex_lock(&bt541->input_dev->mutex); + + if (bt541->input_dev->users) + ret = zinitix_start(bt541); + + mutex_unlock(&bt541->input_dev->mutex); + + return ret; +} + +static SIMPLE_DEV_PM_OPS(zinitix_pm_ops, zinitix_suspend, zinitix_resume); + +#ifdef CONFIG_OF +static const struct of_device_id zinitix_of_match[] = { + { .compatible = "zinitix,bt541" }, + { } +}; +MODULE_DEVICE_TABLE(of, zinitix_of_match); +#endif + +static struct i2c_driver zinitix_ts_driver = { + .probe_new = zinitix_ts_probe, + .driver = { + .name = "Zinitix-TS", + .pm = &zinitix_pm_ops, + .of_match_table = of_match_ptr(zinitix_of_match), + }, +}; +module_i2c_driver(zinitix_ts_driver); + +MODULE_AUTHOR("Michael Srba "); +MODULE_DESCRIPTION("Zinitix touchscreen driver"); +MODULE_LICENSE("GPL v2"); From 261bfb3328b89c63ca410ae30a0c87cd3955344c Mon Sep 17 00:00:00 2001 From: Vincent Huang Date: Sun, 4 Oct 2020 19:42:47 -0700 Subject: [PATCH 241/497] Input: synaptics-rmi4 - rename f30_data to gpio_data f30_data in rmi_device_platform_data could be also referenced by RMI function 3A, so rename it and the structure name to avoid confusion. 
Signed-off-by: Vincent Huang Reviewed-by: Hans de Goede Tested-by: Hans de Goede Reviewed-by: Andrew Duggan Link: https://lore.kernel.org/r/20200930094147.635556-2-vincent.huang@tw.synaptics.com Signed-off-by: Dmitry Torokhov --- drivers/hid/hid-rmi.c | 2 +- drivers/input/mouse/synaptics.c | 2 +- drivers/input/rmi4/rmi_f30.c | 14 +++++++------- include/linux/rmi.h | 11 ++++++----- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/hid/hid-rmi.c b/drivers/hid/hid-rmi.c index 8cffa84c9650..acac09ba7dd5 100644 --- a/drivers/hid/hid-rmi.c +++ b/drivers/hid/hid-rmi.c @@ -721,7 +721,7 @@ static int rmi_probe(struct hid_device *hdev, const struct hid_device_id *id) } if (data->device_flags & RMI_DEVICE_HAS_PHYS_BUTTONS) - rmi_hid_pdata.f30_data.disable = true; + rmi_hid_pdata.gpio_data.disable = true; data->xport.dev = hdev->dev.parent; data->xport.pdata = rmi_hid_pdata; diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index 4b81b2d0fe06..8a54efd6eb95 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -1752,7 +1752,7 @@ static int synaptics_create_intertouch(struct psmouse *psmouse, .kernel_tracking = false, .topbuttonpad = topbuttonpad, }, - .f30_data = { + .gpio_data = { .buttonpad = SYN_CAP_CLICKPAD(info->ext_cap_0c), .trackstick_buttons = !!SYN_CAP_EXT_BUTTONS_STICK(info->ext_cap_10), diff --git a/drivers/input/rmi4/rmi_f30.c b/drivers/input/rmi4/rmi_f30.c index a90dad1d9ac7..35045f161dc2 100644 --- a/drivers/input/rmi4/rmi_f30.c +++ b/drivers/input/rmi4/rmi_f30.c @@ -168,17 +168,17 @@ static int rmi_f30_config(struct rmi_function *fn) rmi_get_platform_data(fn->rmi_dev); int error; - /* can happen if f30_data.disable is set */ + /* can happen if gpio_data.disable is set */ if (!f30) return 0; - if (pdata->f30_data.trackstick_buttons) { + if (pdata->gpio_data.trackstick_buttons) { /* Try [re-]establish link to F03. */ f30->f03 = rmi_find_function(fn->rmi_dev, 0x03); f30->trackstick_buttons = f30->f03 != NULL; } - if (pdata->f30_data.disable) { + if (pdata->gpio_data.disable) { drv->clear_irq_bits(fn->rmi_dev, fn->irq_mask); } else { /* Write Control Register values back to device */ @@ -245,10 +245,10 @@ static int rmi_f30_map_gpios(struct rmi_function *fn, if (!rmi_f30_is_valid_button(i, f30->ctrl)) continue; - if (pdata->f30_data.trackstick_buttons && + if (pdata->gpio_data.trackstick_buttons && i >= TRACKSTICK_RANGE_START && i < TRACKSTICK_RANGE_END) { f30->gpioled_key_map[i] = trackstick_button++; - } else if (!pdata->f30_data.buttonpad || !button_mapped) { + } else if (!pdata->gpio_data.buttonpad || !button_mapped) { f30->gpioled_key_map[i] = button; input_set_capability(input, EV_KEY, button++); button_mapped = true; @@ -264,7 +264,7 @@ static int rmi_f30_map_gpios(struct rmi_function *fn, * but I am not sure, so use only the pdata info and the number of * mapped buttons. 
*/ - if (pdata->f30_data.buttonpad || (button - BTN_LEFT == 1)) + if (pdata->gpio_data.buttonpad || (button - BTN_LEFT == 1)) __set_bit(INPUT_PROP_BUTTONPAD, input->propbit); return 0; @@ -372,7 +372,7 @@ static int rmi_f30_probe(struct rmi_function *fn) struct f30_data *f30; int error; - if (pdata->f30_data.disable) + if (pdata->gpio_data.disable) return 0; if (!drv_data->input) { diff --git a/include/linux/rmi.h b/include/linux/rmi.h index 8ed37f93f3c8..ab7eea01ab42 100644 --- a/include/linux/rmi.h +++ b/include/linux/rmi.h @@ -102,15 +102,16 @@ struct rmi_2d_sensor_platform_data { }; /** - * struct rmi_f30_data - overrides defaults for a single F30 GPIOs/LED chip. + * struct rmi_gpio_data - overrides defaults for a single F30/F3A GPIOs/LED + * chip. * @buttonpad - the touchpad is a buttonpad, so enable only the first actual * button that is found. - * @trackstick_buttons - Set when the function 30 is handling the physical + * @trackstick_buttons - Set when the function 30 or 3a is handling the physical * buttons of the trackstick (as a PS/2 passthrough device). - * @disable - the touchpad incorrectly reports F30 and it should be ignored. + * @disable - the touchpad incorrectly reports F30/F3A and it should be ignored. * This is a special case which is due to misconfigured firmware. */ -struct rmi_f30_data { +struct rmi_gpio_data { bool buttonpad; bool trackstick_buttons; bool disable; @@ -218,7 +219,7 @@ struct rmi_device_platform_data { /* function handler pdata */ struct rmi_2d_sensor_platform_data sensor_pdata; struct rmi_f01_power_management power_management; - struct rmi_f30_data f30_data; + struct rmi_gpio_data gpio_data; }; /** From 9e4c596bfd004f447a652205163234dfd4aafa69 Mon Sep 17 00:00:00 2001 From: Vincent Huang Date: Sun, 4 Oct 2020 19:42:24 -0700 Subject: [PATCH 242/497] Input: synaptics-rmi4 - add support for F3A RMI4 F3A supports the touchpad GPIO function, it's designed to support more GPIOs and used on newer touchpads. This patch adds support of the touchpad buttons. Signed-off-by: Vincent Huang Reviewed-by: Hans de Goede Tested-by: Hans de Goede Reviewed-by: Andrew Duggan Link: https://lore.kernel.org/r/20200930094147.635556-3-vincent.huang@tw.synaptics.com Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/Kconfig | 8 ++ drivers/input/rmi4/Makefile | 1 + drivers/input/rmi4/rmi_bus.c | 3 + drivers/input/rmi4/rmi_driver.h | 1 + drivers/input/rmi4/rmi_f3a.c | 241 ++++++++++++++++++++++++++++++++ 5 files changed, 254 insertions(+) create mode 100644 drivers/input/rmi4/rmi_f3a.c diff --git a/drivers/input/rmi4/Kconfig b/drivers/input/rmi4/Kconfig index a212ff706f74..16119f760d11 100644 --- a/drivers/input/rmi4/Kconfig +++ b/drivers/input/rmi4/Kconfig @@ -100,6 +100,14 @@ config RMI4_F34 device via the firmware loader interface. This is triggered using a sysfs attribute. +config RMI4_F3A + bool "RMI4 Function 3A (GPIO)" + help + Say Y here if you want to add support for RMI4 function 3A. + + Function 3A provides GPIO support for RMI4 devices. This includes + support for buttons on TouchPads and ClickPads. 
+ config RMI4_F54 bool "RMI4 Function 54 (Analog diagnostics)" depends on VIDEO_V4L2=y || (RMI4_CORE=m && VIDEO_V4L2=m) diff --git a/drivers/input/rmi4/Makefile b/drivers/input/rmi4/Makefile index f17631656987..02f14c846861 100644 --- a/drivers/input/rmi4/Makefile +++ b/drivers/input/rmi4/Makefile @@ -10,6 +10,7 @@ rmi_core-$(CONFIG_RMI4_F11) += rmi_f11.o rmi_core-$(CONFIG_RMI4_F12) += rmi_f12.o rmi_core-$(CONFIG_RMI4_F30) += rmi_f30.o rmi_core-$(CONFIG_RMI4_F34) += rmi_f34.o rmi_f34v7.o +rmi_core-$(CONFIG_RMI4_F3A) += rmi_f3a.o rmi_core-$(CONFIG_RMI4_F54) += rmi_f54.o rmi_core-$(CONFIG_RMI4_F55) += rmi_f55.o diff --git a/drivers/input/rmi4/rmi_bus.c b/drivers/input/rmi4/rmi_bus.c index af706a583656..47d1b97ed6cf 100644 --- a/drivers/input/rmi4/rmi_bus.c +++ b/drivers/input/rmi4/rmi_bus.c @@ -365,6 +365,9 @@ static struct rmi_function_handler *fn_handlers[] = { #ifdef CONFIG_RMI4_F34 &rmi_f34_handler, #endif +#ifdef CONFIG_RMI4_F3A + &rmi_f3a_handler, +#endif #ifdef CONFIG_RMI4_F54 &rmi_f54_handler, #endif diff --git a/drivers/input/rmi4/rmi_driver.h b/drivers/input/rmi4/rmi_driver.h index 65bfaa95e193..1c6c6086c0e5 100644 --- a/drivers/input/rmi4/rmi_driver.h +++ b/drivers/input/rmi4/rmi_driver.h @@ -135,6 +135,7 @@ extern struct rmi_function_handler rmi_f11_handler; extern struct rmi_function_handler rmi_f12_handler; extern struct rmi_function_handler rmi_f30_handler; extern struct rmi_function_handler rmi_f34_handler; +extern struct rmi_function_handler rmi_f3a_handler; extern struct rmi_function_handler rmi_f54_handler; extern struct rmi_function_handler rmi_f55_handler; #endif diff --git a/drivers/input/rmi4/rmi_f3a.c b/drivers/input/rmi4/rmi_f3a.c new file mode 100644 index 000000000000..0e8baed84dbb --- /dev/null +++ b/drivers/input/rmi4/rmi_f3a.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2012-2020 Synaptics Incorporated + */ + +#include +#include +#include +#include +#include "rmi_driver.h" + +#define RMI_F3A_MAX_GPIO_COUNT 128 +#define RMI_F3A_MAX_REG_SIZE DIV_ROUND_UP(RMI_F3A_MAX_GPIO_COUNT, 8) + +/* Defs for Query 0 */ +#define RMI_F3A_GPIO_COUNT 0x7F + +#define RMI_F3A_DATA_REGS_MAX_SIZE RMI_F3A_MAX_REG_SIZE + +#define TRACKSTICK_RANGE_START 3 +#define TRACKSTICK_RANGE_END 6 + +struct f3a_data { + /* Query Data */ + u8 gpio_count; + + u8 register_count; + + u8 data_regs[RMI_F3A_DATA_REGS_MAX_SIZE]; + u16 *gpio_key_map; + + struct input_dev *input; + + struct rmi_function *f03; + bool trackstick_buttons; +}; + +static void rmi_f3a_report_button(struct rmi_function *fn, + struct f3a_data *f3a, unsigned int button) +{ + u16 key_code = f3a->gpio_key_map[button]; + bool key_down = !(f3a->data_regs[0] & BIT(button)); + + if (f3a->trackstick_buttons && + button >= TRACKSTICK_RANGE_START && + button <= TRACKSTICK_RANGE_END) { + rmi_f03_overwrite_button(f3a->f03, key_code, key_down); + } else { + rmi_dbg(RMI_DEBUG_FN, &fn->dev, + "%s: call input report key (0x%04x) value (0x%02x)", + __func__, key_code, key_down); + input_report_key(f3a->input, key_code, key_down); + } +} + +static irqreturn_t rmi_f3a_attention(int irq, void *ctx) +{ + struct rmi_function *fn = ctx; + struct f3a_data *f3a = dev_get_drvdata(&fn->dev); + struct rmi_driver_data *drvdata = dev_get_drvdata(&fn->rmi_dev->dev); + int error; + int i; + + if (drvdata->attn_data.data) { + if (drvdata->attn_data.size < f3a->register_count) { + dev_warn(&fn->dev, + "F3A interrupted, but data is missing\n"); + return IRQ_HANDLED; + } + memcpy(f3a->data_regs, drvdata->attn_data.data, + 
f3a->register_count); + drvdata->attn_data.data += f3a->register_count; + drvdata->attn_data.size -= f3a->register_count; + } else { + error = rmi_read_block(fn->rmi_dev, fn->fd.data_base_addr, + f3a->data_regs, f3a->register_count); + if (error) { + dev_err(&fn->dev, + "%s: Failed to read F3a data registers: %d\n", + __func__, error); + return IRQ_RETVAL(error); + } + } + + for (i = 0; i < f3a->gpio_count; i++) + if (f3a->gpio_key_map[i] != KEY_RESERVED) + rmi_f3a_report_button(fn, f3a, i); + if (f3a->trackstick_buttons) + rmi_f03_commit_buttons(f3a->f03); + + return IRQ_HANDLED; +} + +static int rmi_f3a_config(struct rmi_function *fn) +{ + struct f3a_data *f3a = dev_get_drvdata(&fn->dev); + struct rmi_driver *drv = fn->rmi_dev->driver; + const struct rmi_device_platform_data *pdata = + rmi_get_platform_data(fn->rmi_dev); + + if (!f3a) + return 0; + + if (pdata->gpio_data.trackstick_buttons) { + /* Try [re-]establish link to F03. */ + f3a->f03 = rmi_find_function(fn->rmi_dev, 0x03); + f3a->trackstick_buttons = f3a->f03 != NULL; + } + + drv->set_irq_bits(fn->rmi_dev, fn->irq_mask); + + return 0; +} + +static bool rmi_f3a_is_valid_button(int button, struct f3a_data *f3a, + u8 *query1_regs, u8 *ctrl1_regs) +{ + /* gpio exist && direction input */ + return (query1_regs[0] & BIT(button)) && !(ctrl1_regs[0] & BIT(button)); +} + +static int rmi_f3a_map_gpios(struct rmi_function *fn, struct f3a_data *f3a, + u8 *query1_regs, u8 *ctrl1_regs) +{ + const struct rmi_device_platform_data *pdata = + rmi_get_platform_data(fn->rmi_dev); + struct input_dev *input = f3a->input; + unsigned int button = BTN_LEFT; + unsigned int trackstick_button = BTN_LEFT; + bool button_mapped = false; + int i; + int button_count = min_t(u8, f3a->gpio_count, TRACKSTICK_RANGE_END); + + f3a->gpio_key_map = devm_kcalloc(&fn->dev, + button_count, + sizeof(f3a->gpio_key_map[0]), + GFP_KERNEL); + if (!f3a->gpio_key_map) { + dev_err(&fn->dev, "Failed to allocate gpio map memory.\n"); + return -ENOMEM; + } + + for (i = 0; i < button_count; i++) { + if (!rmi_f3a_is_valid_button(i, f3a, query1_regs, ctrl1_regs)) + continue; + + if (pdata->gpio_data.trackstick_buttons && + i >= TRACKSTICK_RANGE_START && + i < TRACKSTICK_RANGE_END) { + f3a->gpio_key_map[i] = trackstick_button++; + } else if (!pdata->gpio_data.buttonpad || !button_mapped) { + f3a->gpio_key_map[i] = button; + input_set_capability(input, EV_KEY, button++); + button_mapped = true; + } + } + input->keycode = f3a->gpio_key_map; + input->keycodesize = sizeof(f3a->gpio_key_map[0]); + input->keycodemax = f3a->gpio_count; + + if (pdata->gpio_data.buttonpad || (button - BTN_LEFT == 1)) + __set_bit(INPUT_PROP_BUTTONPAD, input->propbit); + + return 0; +} + +static int rmi_f3a_initialize(struct rmi_function *fn, struct f3a_data *f3a) +{ + u8 query1[RMI_F3A_MAX_REG_SIZE]; + u8 ctrl1[RMI_F3A_MAX_REG_SIZE]; + u8 buf; + int error; + + error = rmi_read(fn->rmi_dev, fn->fd.query_base_addr, &buf); + if (error < 0) { + dev_err(&fn->dev, "Failed to read general info register: %d\n", + error); + return -ENODEV; + } + + f3a->gpio_count = buf & RMI_F3A_GPIO_COUNT; + f3a->register_count = DIV_ROUND_UP(f3a->gpio_count, 8); + + /* Query1 -> gpio exist */ + error = rmi_read_block(fn->rmi_dev, fn->fd.query_base_addr + 1, + query1, f3a->register_count); + if (error) { + dev_err(&fn->dev, "Failed to read query1 register\n"); + return error; + } + + /* Ctrl1 -> gpio direction */ + error = rmi_read_block(fn->rmi_dev, fn->fd.control_base_addr + 1, + ctrl1, f3a->register_count); + if (error) { + 
dev_err(&fn->dev, "Failed to read control1 register\n"); + return error; + } + + error = rmi_f3a_map_gpios(fn, f3a, query1, ctrl1); + if (error) + return error; + + return 0; +} + +static int rmi_f3a_probe(struct rmi_function *fn) +{ + struct rmi_device *rmi_dev = fn->rmi_dev; + struct rmi_driver_data *drv_data = dev_get_drvdata(&rmi_dev->dev); + struct f3a_data *f3a; + int error; + + if (!drv_data->input) { + dev_info(&fn->dev, "F3A: no input device found, ignoring\n"); + return -ENXIO; + } + + f3a = devm_kzalloc(&fn->dev, sizeof(*f3a), GFP_KERNEL); + if (!f3a) + return -ENOMEM; + + f3a->input = drv_data->input; + + error = rmi_f3a_initialize(fn, f3a); + if (error) + return error; + + dev_set_drvdata(&fn->dev, f3a); + return 0; +} + +struct rmi_function_handler rmi_f3a_handler = { + .driver = { + .name = "rmi4_f3a", + }, + .func = 0x3a, + .probe = rmi_f3a_probe, + .config = rmi_f3a_config, + .attention = rmi_f3a_attention, +}; From a6977d758fed511a9977639465689785ac398b01 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 4 Oct 2020 19:47:07 -0700 Subject: [PATCH 243/497] Input: synaptics-rmi4 - support bootloader v8 in f34v7 With the recent addition of the F3A support, we can now accept bootloader v8, which will help support recent Thinkpads. Acked-by: Lyude Paul Signed-off-by: Jason A. Donenfeld Link: https://lore.kernel.org/r/20200930225046.173190-2-Jason@zx2c4.com Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f34v7.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/input/rmi4/rmi_f34v7.c b/drivers/input/rmi4/rmi_f34v7.c index 74f7c6f214ff..8d7ec9d89b18 100644 --- a/drivers/input/rmi4/rmi_f34v7.c +++ b/drivers/input/rmi4/rmi_f34v7.c @@ -1364,9 +1364,14 @@ int rmi_f34v7_probe(struct f34_data *f34) f34->bl_version = 6; } else if (f34->bootloader_id[1] == 7) { f34->bl_version = 7; + } else if (f34->bootloader_id[1] == 8) { + f34->bl_version = 8; } else { - dev_err(&f34->fn->dev, "%s: Unrecognized bootloader version\n", - __func__); + dev_err(&f34->fn->dev, + "%s: Unrecognized bootloader version: %d (%c) %d (%c)\n", + __func__, + f34->bootloader_id[0], f34->bootloader_id[0], + f34->bootloader_id[1], f34->bootloader_id[1]); return -EINVAL; } From 127e4a1bc11e0e3d30f4d20bb888a1f680296990 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 4 Oct 2020 19:49:08 -0700 Subject: [PATCH 244/497] Input: synaptics - enable InterTouch for ThinkPad P1/X1E gen 2 With the new RMI4 F3A support, we're now able to enable full RMI4 support for this model. We also tidy up the comments a bit, as the X1E is essentially the same computer as the P1. Acked-by: Lyude Paul Signed-off-by: Jason A. 
Donenfeld Link: https://lore.kernel.org/r/20200930225046.173190-3-Jason@zx2c4.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/synaptics.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index 8a54efd6eb95..43a200ca68b8 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -179,8 +179,9 @@ static const char * const smbus_pnp_ids[] = { "LEN0093", /* T480 */ "LEN0096", /* X280 */ "LEN0097", /* X280 -> ALPS trackpoint */ - "LEN0099", /* X1 Extreme 1st */ + "LEN0099", /* X1 Extreme Gen 1 / P1 Gen 1 */ "LEN009b", /* T580 */ + "LEN0402", /* X1 Extreme Gen 2 / P1 Gen 2 */ "LEN200f", /* T450s */ "LEN2044", /* L470 */ "LEN2054", /* E480 */ From 470d154a62c4ef22b4de384ae91798851c9631a7 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 5 Oct 2020 10:13:18 -0700 Subject: [PATCH 245/497] Input: synaptics - enable InterTouch for ThinkPad T14 Gen 1 With the new RMI4 F3A support, we're now able to enable full RMI4 support for this model. Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20201005114919.371592-1-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/synaptics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index 43a200ca68b8..82577095e175 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -186,6 +186,7 @@ static const char * const smbus_pnp_ids[] = { "LEN2044", /* L470 */ "LEN2054", /* E480 */ "LEN2055", /* E580 */ + "LEN2068", /* T14 Gen 1 */ "SYN3052", /* HP EliteBook 840 G4 */ "SYN3221", /* HP 15-ay000 */ "SYN323d", /* HP Spectre X360 13-w013dx */ From 4ba8b8aec58bf8de3ca29ea08d7eb11a127e7b90 Mon Sep 17 00:00:00 2001 From: Kenny Levinsen Date: Mon, 5 Oct 2020 11:15:55 -0700 Subject: [PATCH 246/497] Input: evdev - per-client waitgroups All evdev clients share a common waitgroup. On new input events, all clients waiting on this waitgroup are woken up, even those filtering out the events, possibly more than once per event. This leads to duplicated and unwanted wakeups. Split the shared waitgroup into per-client waitgroups for more fine-grained wakeups. 
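To make the shape of the change easier to follow before reading the diff, a minimal sketch of the per-client wakeup pattern being introduced (illustrative only; simplified from the real evdev structures, omitting locking and the full set of poll flags):

	/* each client now owns a wait queue, initialised at open() time */
	init_waitqueue_head(&client->wait);

	/* delivering an event wakes only the client whose buffer was filled */
	wake_up_interruptible_poll(&client->wait, EPOLLIN | EPOLLRDNORM);

	/* readers block on their own queue rather than the shared device one */
	error = wait_event_interruptible(client->wait,
					 client->packet_head != client->tail);

With this split, a client whose filter drops an event is simply never woken for it, which is where the saved wakeups come from.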
Signed-off-by: Kenny Levinsen Link: https://lore.kernel.org/r/20200429184126.2155-1-kl@kl.wtf Signed-off-by: Dmitry Torokhov --- drivers/input/evdev.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index e494295d1c7b..95f90699d2b1 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -28,7 +28,6 @@ struct evdev { int open; struct input_handle handle; - wait_queue_head_t wait; struct evdev_client __rcu *grab; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ @@ -43,6 +42,7 @@ struct evdev_client { unsigned int tail; unsigned int packet_head; /* [future] position of the first element of next packet */ spinlock_t buffer_lock; /* protects access to buffer, head and tail */ + wait_queue_head_t wait; struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; @@ -245,7 +245,6 @@ static void evdev_pass_values(struct evdev_client *client, const struct input_value *vals, unsigned int count, ktime_t *ev_time) { - struct evdev *evdev = client->evdev; const struct input_value *v; struct input_event event; struct timespec64 ts; @@ -282,7 +281,7 @@ static void evdev_pass_values(struct evdev_client *client, spin_unlock(&client->buffer_lock); if (wakeup) - wake_up_interruptible_poll(&evdev->wait, + wake_up_interruptible_poll(&client->wait, EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM); } @@ -426,11 +425,11 @@ static void evdev_hangup(struct evdev *evdev) struct evdev_client *client; spin_lock(&evdev->client_lock); - list_for_each_entry(client, &evdev->client_list, node) + list_for_each_entry(client, &evdev->client_list, node) { kill_fasync(&client->fasync, SIGIO, POLL_HUP); + wake_up_interruptible_poll(&client->wait, EPOLLHUP | EPOLLERR); + } spin_unlock(&evdev->client_lock); - - wake_up_interruptible_poll(&evdev->wait, EPOLLHUP | EPOLLERR); } static int evdev_release(struct inode *inode, struct file *file) @@ -479,6 +478,7 @@ static int evdev_open(struct inode *inode, struct file *file) if (!client) return -ENOMEM; + init_waitqueue_head(&client->wait); client->bufsize = bufsize; spin_lock_init(&client->buffer_lock); client->evdev = evdev; @@ -595,7 +595,7 @@ static ssize_t evdev_read(struct file *file, char __user *buffer, break; if (!(file->f_flags & O_NONBLOCK)) { - error = wait_event_interruptible(evdev->wait, + error = wait_event_interruptible(client->wait, client->packet_head != client->tail || !evdev->exist || client->revoked); if (error) @@ -613,7 +613,7 @@ static __poll_t evdev_poll(struct file *file, poll_table *wait) struct evdev *evdev = client->evdev; __poll_t mask; - poll_wait(file, &evdev->wait, wait); + poll_wait(file, &client->wait, wait); if (evdev->exist && !client->revoked) mask = EPOLLOUT | EPOLLWRNORM; @@ -946,7 +946,7 @@ static int evdev_revoke(struct evdev *evdev, struct evdev_client *client, client->revoked = true; evdev_ungrab(evdev, client); input_flush_device(&evdev->handle, file); - wake_up_interruptible_poll(&evdev->wait, EPOLLHUP | EPOLLERR); + wake_up_interruptible_poll(&client->wait, EPOLLHUP | EPOLLERR); return 0; } @@ -1358,7 +1358,6 @@ static int evdev_connect(struct input_handler *handler, struct input_dev *dev, INIT_LIST_HEAD(&evdev->client_list); spin_lock_init(&evdev->client_lock); mutex_init(&evdev->mutex); - init_waitqueue_head(&evdev->wait); evdev->exist = true; dev_no = minor; From 8a30c6eb29e036f729b4dd6982462ca15d062087 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Penttil=C3=A4?= Date: Tue, 6 Oct 2020 21:10:35 
-0700 Subject: [PATCH 247/497] Input: Add MAINTAINERS entry for SiS i2c touch input driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I am the author of this SiS touch driver, and willing to maintain and develop it further. Signed-off-by: Mika Penttilä Link: https://lore.kernel.org/r/20201007035108.58636-1-mika.penttila@nextfour.com Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index d7e3eab852d3..5008e811f1cc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15441,6 +15441,13 @@ F: Documentation/fb/sisfb.rst F: drivers/video/fbdev/sis/ F: include/video/sisfb.h +SIS I2C TOUCHSCREEN DRIVER +M: Mika Penttilä +L: linux-input@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/input/touchscreen/sis_i2c.txt +F: drivers/input/touchscreen/sis_i2c.c + SIS USB2VGA DRIVER M: Thomas Winischhofer S: Maintained From 2ba87c43872fc6bf30ff8c94adb96f82b61ee1d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 Oct 2020 10:41:21 +0200 Subject: [PATCH 248/497] scsi: core: Don't export scsi_device_from_queue() This function is only used by code built into scsi_mod.ko. Link: https://lore.kernel.org/r/20201005084130.143273-2-hch@lst.de Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 4e49469b6c53..996ab5ab6fda 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1968,7 +1968,6 @@ struct scsi_device *scsi_device_from_queue(struct request_queue *q) return sdev; } -EXPORT_SYMBOL_GPL(scsi_device_from_queue); /** * scsi_block_requests - Utility function used by low-level drivers to prevent From 3a8dc5bbc8c04db2646ed0842118b0f66659a3db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 Oct 2020 10:41:22 +0200 Subject: [PATCH 249/497] scsi: core: Remove scsi_init_cmd_errh There is no good reason to keep this functionality as a separate function, just merge it into the only caller. Link: https://lore.kernel.org/r/20201005084130.143273-3-hch@lst.de Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 996ab5ab6fda..b515560eaf02 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -293,21 +293,6 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, } EXPORT_SYMBOL(__scsi_execute); -/** - * scsi_init_cmd_errh - Initialize cmd fields related to error handling. - * @cmd: command that is ready to be queued. - * - * This function has the job of initializing a number of fields related to error - * handling. Typically this will be called once for each command, as required. - */ -static void scsi_init_cmd_errh(struct scsi_cmnd *cmd) -{ - scsi_set_resid(cmd, 0); - memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); - if (cmd->cmd_len == 0) - cmd->cmd_len = scsi_command_size(cmd->cmnd); -} - /* * Wake up the error handler if necessary. 
Avoid as follows that the error * handler is not woken up if host in-flight requests number == @@ -1707,7 +1692,10 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (bd->last) cmd->flags |= SCMD_LAST; - scsi_init_cmd_errh(cmd); + scsi_set_resid(cmd, 0); + memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); + if (cmd->cmd_len == 0) + cmd->cmd_len = scsi_command_size(cmd->cmnd); cmd->scsi_done = scsi_mq_done; reason = scsi_dispatch_cmd(cmd); From 2ceda20f0a99a74a82b78870f3b3e5fa93087a7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 Oct 2020 10:41:23 +0200 Subject: [PATCH 250/497] scsi: core: Move command size detection out of the fast path We only need to detect the command size for ioctl requests from userspace, which is limited to the passthrough path. Move the check there instead of doing it for all queuecommand invocations. Link: https://lore.kernel.org/r/20201005084130.143273-4-hch@lst.de Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index b515560eaf02..b0bd15f53467 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1181,6 +1181,8 @@ static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev, } cmd->cmd_len = scsi_req(req)->cmd_len; + if (cmd->cmd_len == 0) + cmd->cmd_len = scsi_command_size(cmd->cmnd); cmd->cmnd = scsi_req(req)->cmd; cmd->transfersize = blk_rq_bytes(req); cmd->allowed = scsi_req(req)->retries; @@ -1694,8 +1696,6 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, scsi_set_resid(cmd, 0); memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); - if (cmd->cmd_len == 0) - cmd->cmd_len = scsi_command_size(cmd->cmnd); cmd->scsi_done = scsi_mq_done; reason = scsi_dispatch_cmd(cmd); From 40b93836a136a46d7240aa13840c87fb239c865c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 Oct 2020 10:41:25 +0200 Subject: [PATCH 251/497] scsi: core: Use rq_dma_dir in scsi_setup_cmnd() Link: https://lore.kernel.org/r/20201005084130.143273-6-hch@lst.de Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index b0bd15f53467..3325ca22bf6d 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1215,12 +1215,7 @@ static blk_status_t scsi_setup_cmnd(struct scsi_device *sdev, struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); blk_status_t ret; - if (!blk_rq_bytes(req)) - cmd->sc_data_direction = DMA_NONE; - else if (rq_data_dir(req) == WRITE) - cmd->sc_data_direction = DMA_TO_DEVICE; - else - cmd->sc_data_direction = DMA_FROM_DEVICE; + cmd->sc_data_direction = rq_dma_dir(req); if (blk_rq_is_scsi(req)) ret = scsi_setup_scsi_cmnd(sdev, req); From 822bd2db798b3c300bbbcaa67701a7661e382533 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 Oct 2020 10:41:26 +0200 Subject: [PATCH 252/497] scsi: core: Rename scsi_prep_state_check() to scsi_device_state_check() The old name is rather confusing now that the legacy prep_fn is gone. Link: https://lore.kernel.org/r/20201005084130.143273-7-hch@lst.de Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 3325ca22bf6d..6a586715379c 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1229,7 +1229,7 @@ static blk_status_t scsi_setup_cmnd(struct scsi_device *sdev, } static blk_status_t -scsi_prep_state_check(struct scsi_device *sdev, struct request *req) +scsi_device_state_check(struct scsi_device *sdev, struct request *req) { switch (sdev->sdev_state) { case SDEV_OFFLINE: @@ -1662,7 +1662,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, * commands. */ if (unlikely(sdev->sdev_state != SDEV_RUNNING)) { - ret = scsi_prep_state_check(sdev, req); + ret = scsi_device_state_check(sdev, req); if (ret != BLK_STS_OK) goto out_put_budget; } From 5843cc3d5acd8e6654eb2477f3e48c30e3a2e4f7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 Oct 2020 10:41:27 +0200 Subject: [PATCH 253/497] scsi: core: Rename scsi_mq_prep_fn() to scsi_prepare_cmd() The old name is rather confusing now that the legacy prep_fn is gone. Link: https://lore.kernel.org/r/20201005084130.143273-8-hch@lst.de Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 6a586715379c..a6419e45b50f 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1571,7 +1571,7 @@ static unsigned int scsi_mq_inline_sgl_size(struct Scsi_Host *shost) sizeof(struct scatterlist); } -static blk_status_t scsi_mq_prep_fn(struct request *req) +static blk_status_t scsi_prepare_cmd(struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); struct scsi_device *sdev = req->q->queuedata; @@ -1674,7 +1674,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, goto out_dec_target_busy; if (!(req->rq_flags & RQF_DONTPREP)) { - ret = scsi_mq_prep_fn(req); + ret = scsi_prepare_cmd(req); if (ret != BLK_STS_OK) goto out_dec_host_busy; req->rq_flags |= RQF_DONTPREP; From 7007e9dd56767a95de0947b3f7599bcc2f21687f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 Oct 2020 10:41:28 +0200 Subject: [PATCH 254/497] scsi: core: Clean up allocation and freeing of sgtables Rename scsi_init_io() to scsi_alloc_sgtables(), and ensure callers call scsi_free_sgtables() to clean up failures close to scsi_init_io() instead of leaking it down the generic I/O submission path. Link: https://lore.kernel.org/r/20201005084130.143273-9-hch@lst.de Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_lib.c | 22 ++++++++-------------- drivers/scsi/sd.c | 27 +++++++++++++++------------ drivers/scsi/sr.c | 16 ++++++---------- include/scsi/scsi_cmnd.h | 3 ++- 4 files changed, 31 insertions(+), 37 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index a6419e45b50f..55a1a060cd11 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -515,7 +515,7 @@ static void scsi_uninit_cmd(struct scsi_cmnd *cmd) } } -static void scsi_free_sgtables(struct scsi_cmnd *cmd) +void scsi_free_sgtables(struct scsi_cmnd *cmd) { if (cmd->sdb.table.nents) sg_free_table_chained(&cmd->sdb.table, @@ -524,6 +524,7 @@ static void scsi_free_sgtables(struct scsi_cmnd *cmd) sg_free_table_chained(&cmd->prot_sdb->table, SCSI_INLINE_PROT_SG_CNT); } +EXPORT_SYMBOL_GPL(scsi_free_sgtables); static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd) { @@ -983,7 +984,7 @@ static inline bool scsi_cmd_needs_dma_drain(struct scsi_device *sdev, } /** - * scsi_init_io - SCSI I/O initialization function. + * scsi_alloc_sgtables - allocate S/G tables for a command * @cmd: command descriptor we wish to initialize * * Returns: @@ -991,7 +992,7 @@ static inline bool scsi_cmd_needs_dma_drain(struct scsi_device *sdev, * * BLK_STS_RESOURCE - if the failure is retryable * * BLK_STS_IOERR - if the failure is fatal */ -blk_status_t scsi_init_io(struct scsi_cmnd *cmd) +blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd) { struct scsi_device *sdev = cmd->device; struct request *rq = cmd->request; @@ -1083,7 +1084,7 @@ out_free_sgtables: scsi_free_sgtables(cmd); return ret; } -EXPORT_SYMBOL(scsi_init_io); +EXPORT_SYMBOL(scsi_alloc_sgtables); /** * scsi_initialize_rq - initialize struct scsi_cmnd partially @@ -1171,7 +1172,7 @@ static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev, * submit a request without an attached bio. */ if (req->bio) { - blk_status_t ret = scsi_init_io(cmd); + blk_status_t ret = scsi_alloc_sgtables(cmd); if (unlikely(ret != BLK_STS_OK)) return ret; } else { @@ -1213,19 +1214,12 @@ static blk_status_t scsi_setup_cmnd(struct scsi_device *sdev, struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); - blk_status_t ret; cmd->sc_data_direction = rq_dma_dir(req); if (blk_rq_is_scsi(req)) - ret = scsi_setup_scsi_cmnd(sdev, req); - else - ret = scsi_setup_fs_cmnd(sdev, req); - - if (ret != BLK_STS_OK) - scsi_free_sgtables(cmd); - - return ret; + return scsi_setup_scsi_cmnd(sdev, req); + return scsi_setup_fs_cmnd(sdev, req); } static blk_status_t diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 7aa69888d062..cfe59edaf838 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -902,7 +902,7 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) cmd->transfersize = data_len; rq->timeout = SD_TIMEOUT; - return scsi_init_io(cmd); + return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, @@ -934,7 +934,7 @@ static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; - return scsi_init_io(cmd); + return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, @@ -966,7 +966,7 @@ static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, cmd->transfersize = data_len; rq->timeout = unmap ? 
SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; - return scsi_init_io(cmd); + return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) @@ -1107,7 +1107,7 @@ static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) * knows how much to actually write. */ rq->__data_len = sdp->sector_size; - ret = scsi_init_io(cmd); + ret = scsi_alloc_sgtables(cmd); rq->__data_len = blk_rq_bytes(rq); return ret; @@ -1226,23 +1226,24 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) unsigned int dif; bool dix; - ret = scsi_init_io(cmd); + ret = scsi_alloc_sgtables(cmd); if (ret != BLK_STS_OK) return ret; + ret = BLK_STS_IOERR; if (!scsi_device_online(sdp) || sdp->changed) { scmd_printk(KERN_ERR, cmd, "device offline or changed\n"); - return BLK_STS_IOERR; + goto fail; } if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->rq_disk)) { scmd_printk(KERN_ERR, cmd, "access beyond end of device\n"); - return BLK_STS_IOERR; + goto fail; } if ((blk_rq_pos(rq) & mask) || (blk_rq_sectors(rq) & mask)) { scmd_printk(KERN_ERR, cmd, "request not aligned to the logical block size\n"); - return BLK_STS_IOERR; + goto fail; } /* @@ -1264,7 +1265,7 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) if (req_op(rq) == REQ_OP_ZONE_APPEND) { ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks); if (ret) - return ret; + goto fail; } fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0; @@ -1292,7 +1293,7 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) } if (unlikely(ret != BLK_STS_OK)) - return ret; + goto fail; /* * We shouldn't disconnect in the middle of a sector, so with a dumb @@ -1316,10 +1317,12 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) blk_rq_sectors(rq))); /* - * This indicates that the command is ready from our end to be - * queued. + * This indicates that the command is ready from our end to be queued. */ return BLK_STS_OK; +fail: + scsi_free_sgtables(cmd); + return ret; } static blk_status_t sd_init_command(struct scsi_cmnd *cmd) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 0c4aa4665a2f..b74dfd8dc116 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -392,15 +392,11 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) struct request *rq = SCpnt->request; blk_status_t ret; - ret = scsi_init_io(SCpnt); + ret = scsi_alloc_sgtables(SCpnt); if (ret != BLK_STS_OK) - goto out; + return ret; cd = scsi_cd(rq->rq_disk); - /* from here on until we're complete, any goto out - * is used for a killable error condition */ - ret = BLK_STS_IOERR; - SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt, "Doing sr request, block = %d\n", block)); @@ -509,12 +505,12 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) SCpnt->allowed = MAX_RETRIES; /* - * This indicates that the command is ready from our end to be - * queued. + * This indicates that the command is ready from our end to be queued. 
*/ - ret = BLK_STS_OK; + return BLK_STS_OK; out: - return ret; + scsi_free_sgtables(SCpnt); + return BLK_STS_IOERR; } static int sr_block_open(struct block_device *bdev, fmode_t mode) diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index e76bac4d14c5..69ade4fb71aa 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -165,7 +165,8 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, size_t *offset, size_t *len); extern void scsi_kunmap_atomic_sg(void *virt); -extern blk_status_t scsi_init_io(struct scsi_cmnd *cmd); +blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd); +void scsi_free_sgtables(struct scsi_cmnd *cmd); #ifdef CONFIG_SCSI_DMA extern int scsi_dma_map(struct scsi_cmnd *cmd); From 74e5e6c1b18ca0ed1c8499cbaf2e3b758f67dbc6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 Oct 2020 10:41:29 +0200 Subject: [PATCH 255/497] scsi: core: Remove scsi_setup_cmnd() and scsi_setup_fs_cmnd() Move this trivial functionality into scsi_prepare_cmd() instead of splitting it over multiple small functions, and update the comments to better document passthrough commands as the special case. Link: https://lore.kernel.org/r/20201005084130.143273-10-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 51 +++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 55a1a060cd11..6bf043dbae83 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1190,38 +1190,6 @@ static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev, return BLK_STS_OK; } -/* - * Setup a normal block command. These are simple request from filesystems - * that still need to be translated to SCSI CDBs from the ULD. 
- */ -static blk_status_t scsi_setup_fs_cmnd(struct scsi_device *sdev, - struct request *req) -{ - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); - - if (unlikely(sdev->handler && sdev->handler->prep_fn)) { - blk_status_t ret = sdev->handler->prep_fn(sdev, req); - if (ret != BLK_STS_OK) - return ret; - } - - cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd; - memset(cmd->cmnd, 0, BLK_MAX_CDB); - return scsi_cmd_to_driver(cmd)->init_command(cmd); -} - -static blk_status_t scsi_setup_cmnd(struct scsi_device *sdev, - struct request *req) -{ - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); - - cmd->sc_data_direction = rq_dma_dir(req); - - if (blk_rq_is_scsi(req)) - return scsi_setup_scsi_cmnd(sdev, req); - return scsi_setup_fs_cmnd(sdev, req); -} - static blk_status_t scsi_device_state_check(struct scsi_device *sdev, struct request *req) { @@ -1577,6 +1545,7 @@ static blk_status_t scsi_prepare_cmd(struct request *req) cmd->request = req; cmd->tag = req->tag; cmd->prot_op = SCSI_PROT_NORMAL; + cmd->sc_data_direction = rq_dma_dir(req); sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size; cmd->sdb.table.sgl = sg; @@ -1590,7 +1559,23 @@ static blk_status_t scsi_prepare_cmd(struct request *req) blk_mq_start_request(req); - return scsi_setup_cmnd(sdev, req); + /* + * Special handling for passthrough commands, which don't go to the ULP + * at all: + */ + if (blk_rq_is_scsi(req)) + return scsi_setup_scsi_cmnd(sdev, req); + + if (sdev->handler && sdev->handler->prep_fn) { + blk_status_t ret = sdev->handler->prep_fn(sdev, req); + + if (ret != BLK_STS_OK) + return ret; + } + + cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd; + memset(cmd->cmnd, 0, BLK_MAX_CDB); + return scsi_cmd_to_driver(cmd)->init_command(cmd); } static void scsi_mq_done(struct scsi_cmnd *cmd) From ed7fb2d018fdda4fcf0e9a8602e5000c7f0d8851 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 Oct 2020 10:41:30 +0200 Subject: [PATCH 256/497] scsi: core: Only start the request just before dispatching This has no change in behavior, but improves the accounting a bit. Link: https://lore.kernel.org/r/20201005084130.143273-11-hch@lst.de Reviewed-by: Bart Van Assche Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_lib.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 6bf043dbae83..6b0fccda9af2 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1557,8 +1557,6 @@ static blk_status_t scsi_prepare_cmd(struct request *req) (struct scatterlist *)(cmd->prot_sdb + 1); } - blk_mq_start_request(req); - /* * Special handling for passthrough commands, which don't go to the ULP * at all: @@ -1659,7 +1657,6 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, req->rq_flags |= RQF_DONTPREP; } else { clear_bit(SCMD_STATE_COMPLETE, &cmd->state); - blk_mq_start_request(req); } cmd->flags &= SCMD_PRESERVED_FLAGS; @@ -1672,6 +1669,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); cmd->scsi_done = scsi_mq_done; + blk_mq_start_request(req); reason = scsi_dispatch_cmd(cmd); if (reason) { scsi_set_blocked(cmd, reason); From 75c31c80a77d673062684d4d58260bae337c6934 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 17 Sep 2020 15:10:44 +0800 Subject: [PATCH 257/497] scsi: dc395x: Use module_pci_driver() to simplify the code Use the module_pci_driver() macro to make the code simpler by eliminating module_init() and module_exit() calls. Link: https://lore.kernel.org/r/20200917071044.1909268-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Signed-off-by: Martin K. Petersen --- drivers/scsi/dc395x.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c index 0c251a3b99b7..fa16894d8758 100644 --- a/drivers/scsi/dc395x.c +++ b/drivers/scsi/dc395x.c @@ -4721,30 +4721,7 @@ static struct pci_driver dc395x_driver = { .probe = dc395x_init_one, .remove = dc395x_remove_one, }; - - -/** - * dc395x_module_init - Module initialization function - * - * Used by both module and built-in driver to initialise this driver. - **/ -static int __init dc395x_module_init(void) -{ - return pci_register_driver(&dc395x_driver); -} - - -/** - * dc395x_module_exit - Module cleanup function. - **/ -static void __exit dc395x_module_exit(void) -{ - pci_unregister_driver(&dc395x_driver); -} - - -module_init(dc395x_module_init); -module_exit(dc395x_module_exit); +module_pci_driver(dc395x_driver); MODULE_AUTHOR("C.L. Huang / Erich Chen / Kurt Garloff"); MODULE_DESCRIPTION("SCSI host adapter driver for Tekram TRM-S1040 based adapters: Tekram DC395 and DC315 series"); From ca57b069954ab68d936af65c871d3a122938de2a Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Thu, 17 Sep 2020 15:10:45 +0800 Subject: [PATCH 258/497] scsi: initio: Use module_pci_driver() to simplify the code Use the module_pci_driver() macro to make the code simpler by eliminating module_init() and module_exit() calls. Link: https://lore.kernel.org/r/20200917071045.1909320-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Signed-off-by: Martin K. 
Petersen --- drivers/scsi/initio.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c index 1d39628ac947..ca16ef45d8dc 100644 --- a/drivers/scsi/initio.c +++ b/drivers/scsi/initio.c @@ -2962,20 +2962,8 @@ static struct pci_driver initio_pci_driver = { .probe = initio_probe_one, .remove = initio_remove_one, }; - -static int __init initio_init_driver(void) -{ - return pci_register_driver(&initio_pci_driver); -} - -static void __exit initio_exit_driver(void) -{ - pci_unregister_driver(&initio_pci_driver); -} +module_pci_driver(initio_pci_driver); MODULE_DESCRIPTION("Initio INI-9X00U/UW SCSI device driver"); MODULE_AUTHOR("Initio Corporation"); MODULE_LICENSE("GPL"); - -module_init(initio_init_driver); -module_exit(initio_exit_driver); From 938b9e9ffbf8eadc09480eaa4062e033b16c395d Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 18 Sep 2020 11:49:20 +0800 Subject: [PATCH 259/497] scsi: gdth: Make option_setup() static Move the two functions around the '__setup' macro which uses them to avoid an 'unused-function' warning. This addresses the following sparse warning: drivers/scsi/gdth.c:3229:12: warning: symbol 'option_setup' was not declared. Should it be static? Link: https://lore.kernel.org/r/20200918034920.3199926-1-yanaijie@huawei.com Reported-by: Hulk Robot Signed-off-by: Jason Yan Signed-off-by: Martin K. Petersen --- drivers/scsi/gdth.c | 151 ++++++++++++++++++++++---------------------- 1 file changed, 76 insertions(+), 75 deletions(-) diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c index dc0e17729acf..5d801388680b 100644 --- a/drivers/scsi/gdth.c +++ b/drivers/scsi/gdth.c @@ -3168,81 +3168,6 @@ static inline void gdth_timer_init(void) } #endif -static void __init internal_setup(char *str,int *ints) -{ - int i; - char *cur_str, *argv; - - TRACE2(("internal_setup() str %s ints[0] %d\n", - str ? str:"NULL", ints ? ints[0]:0)); - - /* analyse string */ - argv = str; - while (argv && (cur_str = strchr(argv, ':'))) { - int val = 0, c = *++cur_str; - - if (c == 'n' || c == 'N') - val = 0; - else if (c == 'y' || c == 'Y') - val = 1; - else - val = (int)simple_strtoul(cur_str, NULL, 0); - - if (!strncmp(argv, "disable:", 8)) - disable = val; - else if (!strncmp(argv, "reserve_mode:", 13)) - reserve_mode = val; - else if (!strncmp(argv, "reverse_scan:", 13)) - reverse_scan = val; - else if (!strncmp(argv, "hdr_channel:", 12)) - hdr_channel = val; - else if (!strncmp(argv, "max_ids:", 8)) - max_ids = val; - else if (!strncmp(argv, "rescan:", 7)) - rescan = val; - else if (!strncmp(argv, "shared_access:", 14)) - shared_access = val; - else if (!strncmp(argv, "reserve_list:", 13)) { - reserve_list[0] = val; - for (i = 1; i < MAX_RES_ARGS; i++) { - cur_str = strchr(cur_str, ','); - if (!cur_str) - break; - if (!isdigit((int)*++cur_str)) { - --cur_str; - break; - } - reserve_list[i] = - (int)simple_strtoul(cur_str, NULL, 0); - } - if (!cur_str) - break; - argv = ++cur_str; - continue; - } - - if ((argv = strchr(argv, ','))) - ++argv; - } -} - -int __init option_setup(char *str) -{ - int ints[MAXHA]; - char *cur = str; - int i = 1; - - TRACE2(("option_setup() str %s\n", str ? 
str:"NULL")); - - while (cur && isdigit(*cur) && i < MAXHA) { - ints[i++] = simple_strtoul(cur, NULL, 0); - if ((cur = strchr(cur, ',')) != NULL) cur++; - } - - ints[0] = i - 1; - internal_setup(cur, ints); - return 1; -} static const char *gdth_ctr_name(gdth_ha_str *ha) { @@ -4317,5 +4242,81 @@ module_init(gdth_init); module_exit(gdth_exit); #ifndef MODULE +static void __init internal_setup(char *str,int *ints) +{ + int i; + char *cur_str, *argv; + + TRACE2(("internal_setup() str %s ints[0] %d\n", + str ? str:"NULL", ints ? ints[0]:0)); + + /* analyse string */ + argv = str; + while (argv && (cur_str = strchr(argv, ':'))) { + int val = 0, c = *++cur_str; + + if (c == 'n' || c == 'N') + val = 0; + else if (c == 'y' || c == 'Y') + val = 1; + else + val = (int)simple_strtoul(cur_str, NULL, 0); + + if (!strncmp(argv, "disable:", 8)) + disable = val; + else if (!strncmp(argv, "reserve_mode:", 13)) + reserve_mode = val; + else if (!strncmp(argv, "reverse_scan:", 13)) + reverse_scan = val; + else if (!strncmp(argv, "hdr_channel:", 12)) + hdr_channel = val; + else if (!strncmp(argv, "max_ids:", 8)) + max_ids = val; + else if (!strncmp(argv, "rescan:", 7)) + rescan = val; + else if (!strncmp(argv, "shared_access:", 14)) + shared_access = val; + else if (!strncmp(argv, "reserve_list:", 13)) { + reserve_list[0] = val; + for (i = 1; i < MAX_RES_ARGS; i++) { + cur_str = strchr(cur_str, ','); + if (!cur_str) + break; + if (!isdigit((int)*++cur_str)) { + --cur_str; + break; + } + reserve_list[i] = + (int)simple_strtoul(cur_str, NULL, 0); + } + if (!cur_str) + break; + argv = ++cur_str; + continue; + } + + if ((argv = strchr(argv, ','))) + ++argv; + } +} + +static int __init option_setup(char *str) +{ + int ints[MAXHA]; + char *cur = str; + int i = 1; + + TRACE2(("option_setup() str %s\n", str ? str:"NULL")); + + while (cur && isdigit(*cur) && i < MAXHA) { + ints[i++] = simple_strtoul(cur, NULL, 0); + if ((cur = strchr(cur, ',')) != NULL) cur++; + } + + ints[0] = i - 1; + internal_setup(cur, ints); + return 1; +} + __setup("gdth=", option_setup); #endif From ffab5e016b9b3ffee02fbf1ad688b59a56cacd62 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Fri, 18 Sep 2020 15:14:22 +0800 Subject: [PATCH 260/497] scsi: 53c700: Remove set but not used variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes gcc '-Wunused-but-set-variable' warning: drivers/scsi/53c700.c: In function NCR_700_intr: drivers/scsi/53c700.c:1488:27: warning: variable ‘state’ set but not used [-Wunused-but-set-variable] drivers/scsi/53c700.c: In function NCR_700_queuecommand_lck: drivers/scsi/53c700.c:1742:26: warning: variable ‘direction’ set but not used [-Wunused-but-set-variable] these variable is never used, so remove it. Link: https://lore.kernel.org/r/20200918071422.19566-1-zhengyongjun3@huawei.com Signed-off-by: Zheng Yongjun Signed-off-by: Martin K. 
Petersen --- drivers/scsi/53c700.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/scsi/53c700.c b/drivers/scsi/53c700.c index 461b3babb601..81228b953703 100644 --- a/drivers/scsi/53c700.c +++ b/drivers/scsi/53c700.c @@ -1485,10 +1485,8 @@ NCR_700_intr(int irq, void *dev_id) __u8 sstat0 = 0, dstat = 0; __u32 dsp; struct scsi_cmnd *SCp = hostdata->cmd; - enum NCR_700_Host_State state; handled = 1; - state = hostdata->state; SCp = hostdata->cmd; if(istat & SCSI_INT_PENDING) { @@ -1739,7 +1737,6 @@ NCR_700_queuecommand_lck(struct scsi_cmnd *SCp, void (*done)(struct scsi_cmnd *) struct NCR_700_Host_Parameters *hostdata = (struct NCR_700_Host_Parameters *)SCp->device->host->hostdata[0]; __u32 move_ins; - enum dma_data_direction direction; struct NCR_700_command_slot *slot; if(hostdata->command_slot_count >= NCR_700_COMMAND_SLOTS_PER_HOST) { @@ -1856,7 +1853,6 @@ NCR_700_queuecommand_lck(struct scsi_cmnd *SCp, void (*done)(struct scsi_cmnd *) } /* now build the scatter gather list */ - direction = SCp->sc_data_direction; if(move_ins != 0) { int i; int sg_count; From b994718760fa6de431ee7504ca4553536c77ee43 Mon Sep 17 00:00:00 2001 From: "Pavel Machek (CIP)" Date: Mon, 21 Sep 2020 13:23:40 +0200 Subject: [PATCH 261/497] scsi: qla2xxx: Use constant when it is known Directly return constant when it is known to make code easier to understand. Link: https://lore.kernel.org/r/20200921112340.GA19336@duo.ucw.cz Reviewed-by: Himanshu Madhani Signed-off-by: Pavel Machek (CIP) Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_nvme.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 5cc1bbb1ed74..ccec660b13d4 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -542,7 +542,7 @@ static int qla_nvme_post_cmd(struct nvme_fc_local_port *lport, fc_port_t *fcport; struct srb_iocb *nvme; struct scsi_qla_host *vha; - int rval = -ENODEV; + int rval; srb_t *sp; struct qla_qpair *qpair = hw_queue_handle; struct nvme_private *priv = fd->private; @@ -550,14 +550,14 @@ static int qla_nvme_post_cmd(struct nvme_fc_local_port *lport, if (!priv) { /* nvme association has been torn down */ - return rval; + return -ENODEV; } fcport = qla_rport->fcport; if (!qpair || !fcport || (qpair && !qpair->fw_started) || (fcport && fcport->deleted)) - return rval; + return -ENODEV; vha = fcport->vha; From 657ed8a8a61b15eba70f7c4e2b61f643cf1ba91f Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 29 Sep 2020 09:38:02 +0200 Subject: [PATCH 262/497] scsi: qla2xxx: Do not consume srb greedily qla2xx_process_get_sp_from_handle() will clear the slot in which the current srb is stored. As a result it can't be used in qla24xx_process_mbx_iocb_response() to check for consistency and later again in qla24xx_mbx_iocb_entry(). Move the consistency check directly into qla24xx_mbx_iocb_entry() and avoid the double call or any open coding of the qla2xx_process_get_sp_from_handle() functionality. Link: https://lore.kernel.org/r/20200929073802.18770-1-dwagner@suse.de Fixes: 31a3271ff11b ("scsi: qla2xxx: Handle incorrect entry_type entries") Reviewed-by: Arun Easi Reviewed-by: Himanshu Madhani Signed-off-by: Daniel Wagner Signed-off-by: Martin K. 
Petersen --- drivers/scsi/qla2xxx/qla_isr.c | 42 ++++++++++++---------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index 62097733c6e0..d76ec0ff93e8 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -1839,6 +1839,7 @@ qla24xx_mbx_iocb_entry(scsi_qla_host_t *vha, struct req_que *req, struct mbx_24xx_entry *pkt) { const char func[] = "MBX-IOCB2"; + struct qla_hw_data *ha = vha->hw; srb_t *sp; struct srb_iocb *si; u16 sz, i; @@ -1848,6 +1849,18 @@ qla24xx_mbx_iocb_entry(scsi_qla_host_t *vha, struct req_que *req, if (!sp) return; + if (sp->type == SRB_SCSI_CMD || + sp->type == SRB_NVME_CMD || + sp->type == SRB_TM_CMD) { + ql_log(ql_log_warn, vha, 0x509d, + "Inconsistent event entry type %d\n", sp->type); + if (IS_P3P_TYPE(ha)) + set_bit(FCOE_CTX_RESET_NEEDED, &vha->dpc_flags); + else + set_bit(ISP_ABORT_NEEDED, &vha->dpc_flags); + return; + } + si = &sp->u.iocb_cmd; sz = min(ARRAY_SIZE(pkt->mb), ARRAY_SIZE(sp->u.iocb_cmd.u.mbx.in_mb)); @@ -3400,32 +3413,6 @@ void qla24xx_nvme_ls4_iocb(struct scsi_qla_host *vha, sp->done(sp, comp_status); } -static void qla24xx_process_mbx_iocb_response(struct scsi_qla_host *vha, - struct rsp_que *rsp, struct sts_entry_24xx *pkt) -{ - struct qla_hw_data *ha = vha->hw; - srb_t *sp; - static const char func[] = "MBX-IOCB2"; - - sp = qla2x00_get_sp_from_handle(vha, func, rsp->req, pkt); - if (!sp) - return; - - if (sp->type == SRB_SCSI_CMD || - sp->type == SRB_NVME_CMD || - sp->type == SRB_TM_CMD) { - ql_log(ql_log_warn, vha, 0x509d, - "Inconsistent event entry type %d\n", sp->type); - if (IS_P3P_TYPE(ha)) - set_bit(FCOE_CTX_RESET_NEEDED, &vha->dpc_flags); - else - set_bit(ISP_ABORT_NEEDED, &vha->dpc_flags); - return; - } - - qla24xx_mbx_iocb_entry(vha, rsp->req, (struct mbx_24xx_entry *)pkt); -} - /** * qla24xx_process_response_queue() - Process response queue entries. * @vha: SCSI driver HA context @@ -3535,7 +3522,8 @@ process_err: (struct abort_entry_24xx *)pkt); break; case MBX_IOCB_TYPE: - qla24xx_process_mbx_iocb_response(vha, rsp, pkt); + qla24xx_mbx_iocb_entry(vha, rsp->req, + (struct mbx_24xx_entry *)pkt); break; case VP_CTRL_IOCB_TYPE: qla_ctrlvp_completed(vha, rsp->req, From 21a6cd48bb483e16703f4e4415ad2e9e3c71a5dd Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 5 Oct 2020 07:45:44 -0700 Subject: [PATCH 263/497] scsi: qla2xxx: Initialize variable in qla8044_poll_reg() clang static analysis reports this problem: qla_nx2.c:694:3: warning: 6th function call argument is an uninitialized value ql_log(ql_log_fatal, vha, 0xb090, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In qla8044_poll_reg(), when reading the reg fails, the error is reported by reusing the timeout error reporter. Because the value is unset, a garbage value will be reported. Initialize the value. Link: https://lore.kernel.org/r/20201005144544.25335-1-trix@redhat.com Reviewed-by: Himanshu Madhani Signed-off-by: Tom Rix Signed-off-by: Martin K. 
Petersen --- drivers/scsi/qla2xxx/qla_nx2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_nx2.c b/drivers/scsi/qla2xxx/qla_nx2.c index 50e57603ce3d..36cbd7286f52 100644 --- a/drivers/scsi/qla2xxx/qla_nx2.c +++ b/drivers/scsi/qla2xxx/qla_nx2.c @@ -660,7 +660,7 @@ static int qla8044_poll_reg(struct scsi_qla_host *vha, uint32_t addr, int duration, uint32_t test_mask, uint32_t test_result) { - uint32_t value; + uint32_t value = 0; int timeout_error; uint8_t retries; int ret_val = QLA_SUCCESS; From 5e7e6472eda90433b251566755533cf1e9c2e522 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Sat, 19 Sep 2020 10:52:02 +0800 Subject: [PATCH 264/497] scsi: qla2xxx: Convert to DEFINE_SHOW_ATTRIBUTE Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code. Link: https://lore.kernel.org/r/20200919025202.17531-1-miaoqinglang@huawei.com Reviewed-by: Himanshu Madhani Acked-by: Arun Easi Signed-off-by: Qinglang Miao Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_dfs.c | 68 ++++------------------------------ 1 file changed, 8 insertions(+), 60 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_dfs.c b/drivers/scsi/qla2xxx/qla_dfs.c index 6f5f18fc974a..b1ecd9fdb8ec 100644 --- a/drivers/scsi/qla2xxx/qla_dfs.c +++ b/drivers/scsi/qla2xxx/qla_dfs.c @@ -171,20 +171,7 @@ qla2x00_dfs_tgt_sess_show(struct seq_file *s, void *unused) return 0; } -static int -qla2x00_dfs_tgt_sess_open(struct inode *inode, struct file *file) -{ - scsi_qla_host_t *vha = inode->i_private; - - return single_open(file, qla2x00_dfs_tgt_sess_show, vha); -} - -static const struct file_operations dfs_tgt_sess_ops = { - .open = qla2x00_dfs_tgt_sess_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(qla2x00_dfs_tgt_sess); static int qla2x00_dfs_tgt_port_database_show(struct seq_file *s, void *unused) @@ -240,20 +227,7 @@ out_free_id_list: return 0; } -static int -qla2x00_dfs_tgt_port_database_open(struct inode *inode, struct file *file) -{ - scsi_qla_host_t *vha = inode->i_private; - - return single_open(file, qla2x00_dfs_tgt_port_database_show, vha); -} - -static const struct file_operations dfs_tgt_port_database_ops = { - .open = qla2x00_dfs_tgt_port_database_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(qla2x00_dfs_tgt_port_database); static int qla_dfs_fw_resource_cnt_show(struct seq_file *s, void *unused) @@ -302,20 +276,7 @@ qla_dfs_fw_resource_cnt_show(struct seq_file *s, void *unused) return 0; } -static int -qla_dfs_fw_resource_cnt_open(struct inode *inode, struct file *file) -{ - struct scsi_qla_host *vha = inode->i_private; - - return single_open(file, qla_dfs_fw_resource_cnt_show, vha); -} - -static const struct file_operations dfs_fw_resource_cnt_ops = { - .open = qla_dfs_fw_resource_cnt_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(qla_dfs_fw_resource_cnt); static int qla_dfs_tgt_counters_show(struct seq_file *s, void *unused) @@ -392,20 +353,7 @@ qla_dfs_tgt_counters_show(struct seq_file *s, void *unused) return 0; } -static int -qla_dfs_tgt_counters_open(struct inode *inode, struct file *file) -{ - struct scsi_qla_host *vha = inode->i_private; - - return single_open(file, qla_dfs_tgt_counters_show, vha); -} - -static const struct file_operations dfs_tgt_counters_ops = { - .open = qla_dfs_tgt_counters_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; 
+DEFINE_SHOW_ATTRIBUTE(qla_dfs_tgt_counters); static int qla2x00_dfs_fce_show(struct seq_file *s, void *unused) @@ -607,19 +555,19 @@ create_dir: create_nodes: ha->dfs_fw_resource_cnt = debugfs_create_file("fw_resource_count", - S_IRUSR, ha->dfs_dir, vha, &dfs_fw_resource_cnt_ops); + S_IRUSR, ha->dfs_dir, vha, &qla_dfs_fw_resource_cnt_fops); ha->dfs_tgt_counters = debugfs_create_file("tgt_counters", S_IRUSR, - ha->dfs_dir, vha, &dfs_tgt_counters_ops); + ha->dfs_dir, vha, &qla_dfs_tgt_counters_fops); ha->tgt.dfs_tgt_port_database = debugfs_create_file("tgt_port_database", - S_IRUSR, ha->dfs_dir, vha, &dfs_tgt_port_database_ops); + S_IRUSR, ha->dfs_dir, vha, &qla2x00_dfs_tgt_port_database_fops); ha->dfs_fce = debugfs_create_file("fce", S_IRUSR, ha->dfs_dir, vha, &dfs_fce_ops); ha->tgt.dfs_tgt_sess = debugfs_create_file("tgt_sess", - S_IRUSR, ha->dfs_dir, vha, &dfs_tgt_sess_ops); + S_IRUSR, ha->dfs_dir, vha, &qla2x00_dfs_tgt_sess_fops); if (IS_QLA27XX(ha) || IS_QLA83XX(ha) || IS_QLA28XX(ha)) { ha->tgt.dfs_naqp = debugfs_create_file("naqp", From 4b217e015b753c83418cc548acf9fac97961e0a3 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 20 Sep 2020 13:26:14 +0200 Subject: [PATCH 265/497] scsi: target: rd: Drop double zeroing sg_init_table zeroes its first argument, so the allocation of that argument doesn't have to. the semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression x,n,flags; @@ x = - kcalloc + kmalloc_array (n,sizeof(*x),flags) ... sg_init_table(x,n) // Link: https://lore.kernel.org/r/1600601186-7420-3-git-send-email-Julia.Lawall@inria.fr Signed-off-by: Julia Lawall Signed-off-by: Martin K. Petersen --- drivers/target/target_core_rd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/target_core_rd.c b/drivers/target/target_core_rd.c index 408bd975170b..bf936bbeccfe 100644 --- a/drivers/target/target_core_rd.c +++ b/drivers/target/target_core_rd.c @@ -131,7 +131,7 @@ static int rd_allocate_sgl_table(struct rd_dev *rd_dev, struct rd_dev_sg_table * if (sg_per_table < total_sg_needed) chain_entry = 1; - sg = kcalloc(sg_per_table + chain_entry, sizeof(*sg), + sg = kmalloc_array(sg_per_table + chain_entry, sizeof(*sg), GFP_KERNEL); if (!sg) return -ENOMEM; From 39d0c6e770c2478c33b405b5729289aa1ad37957 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 21 Sep 2020 16:24:52 +0800 Subject: [PATCH 266/497] scsi: fnic: Simplify the return expression of vnic_wq_copy_alloc() Simplify the return expression. Link: https://lore.kernel.org/r/20200921082452.2592085-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Signed-off-by: Martin K. 
Petersen --- drivers/scsi/fnic/vnic_wq_copy.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/scsi/fnic/vnic_wq_copy.c b/drivers/scsi/fnic/vnic_wq_copy.c index 9eab7e7caf38..7b18635df7e6 100644 --- a/drivers/scsi/fnic/vnic_wq_copy.c +++ b/drivers/scsi/fnic/vnic_wq_copy.c @@ -79,8 +79,6 @@ int vnic_wq_copy_alloc(struct vnic_dev *vdev, struct vnic_wq_copy *wq, unsigned int index, unsigned int desc_count, unsigned int desc_size) { - int err; - wq->index = index; wq->vdev = vdev; wq->to_use_index = wq->to_clean_index = 0; @@ -92,11 +90,7 @@ int vnic_wq_copy_alloc(struct vnic_dev *vdev, struct vnic_wq_copy *wq, vnic_wq_copy_disable(wq); - err = vnic_dev_alloc_desc_ring(vdev, &wq->ring, desc_count, desc_size); - if (err) - return err; - - return 0; + return vnic_dev_alloc_desc_ring(vdev, &wq->ring, desc_count, desc_size); } void vnic_wq_copy_init(struct vnic_wq_copy *wq, unsigned int cq_index, From 6afc12fa6e503670939f75e4bd3f1777401ffb99 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 21 Sep 2020 16:24:55 +0800 Subject: [PATCH 267/497] scsi: snic: Simplify the return expression of svnic_cq_alloc() Simplify the return expression. Link: https://lore.kernel.org/r/20200921082455.2592190-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Signed-off-by: Martin K. Petersen --- drivers/scsi/snic/vnic_cq.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/scsi/snic/vnic_cq.c b/drivers/scsi/snic/vnic_cq.c index 4c8e64e4fba6..3455dd7e73f4 100644 --- a/drivers/scsi/snic/vnic_cq.c +++ b/drivers/scsi/snic/vnic_cq.c @@ -31,8 +31,6 @@ void svnic_cq_free(struct vnic_cq *cq) int svnic_cq_alloc(struct vnic_dev *vdev, struct vnic_cq *cq, unsigned int index, unsigned int desc_count, unsigned int desc_size) { - int err; - cq->index = index; cq->vdev = vdev; @@ -43,11 +41,7 @@ int svnic_cq_alloc(struct vnic_dev *vdev, struct vnic_cq *cq, return -EINVAL; } - err = svnic_dev_alloc_desc_ring(vdev, &cq->ring, desc_count, desc_size); - if (err) - return err; - - return 0; + return svnic_dev_alloc_desc_ring(vdev, &cq->ring, desc_count, desc_size); } void svnic_cq_init(struct vnic_cq *cq, unsigned int flow_control_enable, From de6c063fa09a671cce7bea996cfe2a9288f794a8 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Mon, 21 Sep 2020 21:11:02 +0800 Subject: [PATCH 268/497] scsi: fcoe: Simplify the return expression of fcoe_sysfs_setup() Simplify the return expression. Link: https://lore.kernel.org/r/20200921131102.93084-1-miaoqinglang@huawei.com Signed-off-by: Qinglang Miao Signed-off-by: Martin K. Petersen --- drivers/scsi/fcoe/fcoe_sysfs.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/scsi/fcoe/fcoe_sysfs.c b/drivers/scsi/fcoe/fcoe_sysfs.c index 2cb7a8c93a15..ffef2c8eddc6 100644 --- a/drivers/scsi/fcoe/fcoe_sysfs.c +++ b/drivers/scsi/fcoe/fcoe_sysfs.c @@ -1053,16 +1053,10 @@ EXPORT_SYMBOL_GPL(fcoe_fcf_device_add); int __init fcoe_sysfs_setup(void) { - int error; - atomic_set(&ctlr_num, 0); atomic_set(&fcf_num, 0); - error = bus_register(&fcoe_bus_type); - if (error) - return error; - - return 0; + return bus_register(&fcoe_bus_type); } void __exit fcoe_sysfs_teardown(void) From f0f6c3a4fcb80fcbcce4ff6739996dd98c228afd Mon Sep 17 00:00:00 2001 From: Jing Xiangfeng Date: Fri, 25 Sep 2020 14:24:23 +0800 Subject: [PATCH 269/497] scsi: bfa: Fix error return in bfad_pci_init() Fix to return error code -ENODEV from the error handling case instead of 0. 
Link: https://lore.kernel.org/r/20200925062423.161504-1-jingxiangfeng@huawei.com Fixes: 11ea3824140c ("scsi: bfa: fix calls to dma_set_mask_and_coherent()") Signed-off-by: Jing Xiangfeng Signed-off-by: Martin K. Petersen --- drivers/scsi/bfa/bfad.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c index bc5d84f87d8f..440ef32be048 100644 --- a/drivers/scsi/bfa/bfad.c +++ b/drivers/scsi/bfa/bfad.c @@ -749,6 +749,7 @@ bfad_pci_init(struct pci_dev *pdev, struct bfad_s *bfad) if (bfad->pci_bar0_kva == NULL) { printk(KERN_ERR "Fail to map bar0\n"); + rc = -ENODEV; goto out_release_region; } From 5f6dcb55a7fa347a0c72fc5afb860e4012628902 Mon Sep 17 00:00:00 2001 From: Jing Xiangfeng Date: Tue, 29 Sep 2020 10:24:58 +0800 Subject: [PATCH 270/497] scsi: myrb: Remove redundant assignment to variable timeout The variable timeout has already been initialized to '0'. The assignment before the while loop is redundant. Remove it. Link: https://lore.kernel.org/r/20200929022458.40652-1-jingxiangfeng@huawei.com Signed-off-by: Jing Xiangfeng Signed-off-by: Martin K. Petersen --- drivers/scsi/myrb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/myrb.c b/drivers/scsi/myrb.c index b8020eaa6320..31c8cbbba45c 100644 --- a/drivers/scsi/myrb.c +++ b/drivers/scsi/myrb.c @@ -2732,7 +2732,6 @@ static int DAC960_LA_hw_init(struct pci_dev *pdev, DAC960_LA_disable_intr(base); DAC960_LA_ack_hw_mbox_status(base); udelay(1000); - timeout = 0; while (DAC960_LA_init_in_progress(base) && timeout < MYRB_MAILBOX_TIMEOUT) { if (DAC960_LA_read_error_status(base, &error, From fc29f04a5c6b946bde125a36b2bfe5414c7c03f7 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Wed, 30 Sep 2020 10:16:37 +0800 Subject: [PATCH 271/497] scsi: myrb: Fix inconsistent format argument types Fix the following warnings: [drivers/scsi/myrb.c:1052]: (warning) %d in format string (no. 1) requires 'int' but the argument type is 'unsigned int'. [drivers/scsi/myrb.c:1052]: (warning) %d in format string (no. 2) requires 'int' but the argument type is 'unsigned int'. [drivers/scsi/myrb.c:1052]: (warning) %d in format string (no. 4) requires 'int' but the argument type is 'unsigned int'. [drivers/scsi/myrb.c:2170]: (warning) %d in format string (no. 1) requires 'int' but the argument type is 'unsigned int'. Link: https://lore.kernel.org/r/20200930021637.2831618-1-yebin10@huawei.com Reported-by: Hulk Robot Signed-off-by: Ye Bin Signed-off-by: Martin K. 
Petersen --- drivers/scsi/myrb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/myrb.c b/drivers/scsi/myrb.c index 31c8cbbba45c..48e53861cd34 100644 --- a/drivers/scsi/myrb.c +++ b/drivers/scsi/myrb.c @@ -1050,7 +1050,7 @@ static int myrb_get_hba_config(struct myrb_hba *cb) enquiry2->fw.turn_id = 0; } snprintf(cb->fw_version, sizeof(cb->fw_version), - "%d.%02d-%c-%02d", + "%u.%02u-%c-%02u", enquiry2->fw.major_version, enquiry2->fw.minor_version, enquiry2->fw.firmware_type, @@ -2167,7 +2167,7 @@ static ssize_t ctlr_num_show(struct device *dev, struct Scsi_Host *shost = class_to_shost(dev); struct myrb_hba *cb = shost_priv(shost); - return snprintf(buf, 20, "%d\n", cb->ctlr_num); + return snprintf(buf, 20, "%u\n", cb->ctlr_num); } static DEVICE_ATTR_RO(ctlr_num); From 5ccdd101351df3b981b9a36f3f039e6afa531cdf Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Wed, 30 Sep 2020 10:22:28 +0800 Subject: [PATCH 272/497] scsi: qla4xxx: Fix inconsistent format argument type Fix the following warning: [drivers/scsi/qla4xxx/ql4_nx.c:3228]: (warning) %ld in format string (no. 1) requires 'long' but the argument type is 'unsigned long'. Link: https://lore.kernel.org/r/20200930022228.2840587-1-yebin10@huawei.com Reported-by: Hulk Robot Signed-off-by: Ye Bin Signed-off-by: Martin K. Petersen --- drivers/scsi/qla4xxx/ql4_nx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla4xxx/ql4_nx.c b/drivers/scsi/qla4xxx/ql4_nx.c index 038e19b1e3c2..4e3b589634e5 100644 --- a/drivers/scsi/qla4xxx/ql4_nx.c +++ b/drivers/scsi/qla4xxx/ql4_nx.c @@ -3226,7 +3226,7 @@ static void qla4_8xxx_uevent_emit(struct scsi_qla_host *ha, u32 code) switch (code) { case QL4_UEVENT_CODE_FW_DUMP: - snprintf(event_string, sizeof(event_string), "FW_DUMP=%ld", + snprintf(event_string, sizeof(event_string), "FW_DUMP=%lu", ha->host_no); break; default: From 45660591ee8f5b08bfc6e3aec50639df82a19ca3 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 3 Oct 2020 07:57:09 +0200 Subject: [PATCH 273/497] scsi: isci: Fix a typo in a comment s/remtoe/remote/ and add a missing '.' Link: https://lore.kernel.org/r/20201003055709.766119-1-christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/scsi/isci/remote_node_table.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/isci/remote_node_table.h b/drivers/scsi/isci/remote_node_table.h index 721ab982d2ac..0ddfdda2b248 100644 --- a/drivers/scsi/isci/remote_node_table.h +++ b/drivers/scsi/isci/remote_node_table.h @@ -61,7 +61,7 @@ /** * * - * Remote node sets are sets of remote node index in the remtoe node table The + * Remote node sets are sets of remote node index in the remote node table. The * SCU hardware requires that STP remote node entries take three consecutive * remote node index so the table is arranged in sets of three. The bits are * used as 0111 0111 to make a byte and the bits define the set of three remote From 1725ba8d6ff1e98ce9d1f79d2bad7580cceace05 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 6 Oct 2020 12:02:52 +0100 Subject: [PATCH 274/497] scsi: sym53c8xx_2: Fix sizeof() mismatch An incorrect sizeof() is being used, struct sym_ccb ** is not correct, it should be struct sym_ccb *. Note that since ** is the same size as * this is not causing any issues. Improve this fix by using the idiom sizeof(*np->ccbh) as this allows one to not even reference the type of the pointer. 
[ Note: this is an ancient 2005 buglet, the sha is from the tglx/history repo ] Link: https://lore.kernel.org/r/20201006110252.536641-1-colin.king@canonical.com Fixes: 473c67f96e06 ("[PATCH] sym2 version 2.2.0") Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen Addresses-Coverity: ("Sizeof not portable (SIZEOF_MISMATCH)") --- drivers/scsi/sym53c8xx_2/sym_hipd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.c b/drivers/scsi/sym53c8xx_2/sym_hipd.c index 8410117d5aa4..d9bcb6faea95 100644 --- a/drivers/scsi/sym53c8xx_2/sym_hipd.c +++ b/drivers/scsi/sym53c8xx_2/sym_hipd.c @@ -5656,7 +5656,7 @@ int sym_hcb_attach(struct Scsi_Host *shost, struct sym_fw *fw, struct sym_nvram /* * Allocate the array of lists of CCBs hashed by DSA. */ - np->ccbh = kcalloc(CCB_HASH_SIZE, sizeof(struct sym_ccb **), GFP_KERNEL); + np->ccbh = kcalloc(CCB_HASH_SIZE, sizeof(*np->ccbh), GFP_KERNEL); if (!np->ccbh) goto attach_failed; From 05c6c029a44d9f43715577e33e95eba87f44d285 Mon Sep 17 00:00:00 2001 From: Viswas G Date: Mon, 5 Oct 2020 20:20:08 +0530 Subject: [PATCH 275/497] scsi: pm80xx: Increase number of supported queues The driver currently uses a fixed number of Inbound and Outbound queues, and all I/O, TMF and internal requests are submitted through those. A global spin lock is used to control the shared access. This can create lock contention, which is a real bottleneck in the I/O path. To avoid this, the number of supported Inbound and Outbound queues is increased to 64, and the number of queues used is decided based on the number of online CPU cores and the number of allocated MSI-X vectors. Also add locks per queue instead of using the global lock. Link: https://lore.kernel.org/r/20201005145011.23674-2-Viswas.G@microchip.com.com Acked-by: Jack Wang Signed-off-by: Viswas G Signed-off-by: Ruksar Devadi Signed-off-by: Martin K. 
Petersen --- drivers/scsi/pm8001/pm8001_ctl.c | 6 +- drivers/scsi/pm8001/pm8001_defs.h | 17 ++--- drivers/scsi/pm8001/pm8001_hwi.c | 32 ++++---- drivers/scsi/pm8001/pm8001_init.c | 117 +++++++++++++++++++----------- drivers/scsi/pm8001/pm8001_sas.h | 11 ++- drivers/scsi/pm8001/pm80xx_hwi.c | 81 ++++++++++++--------- 6 files changed, 160 insertions(+), 104 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_ctl.c b/drivers/scsi/pm8001/pm8001_ctl.c index 77c805db2724..3587f7c8a428 100644 --- a/drivers/scsi/pm8001/pm8001_ctl.c +++ b/drivers/scsi/pm8001/pm8001_ctl.c @@ -408,9 +408,10 @@ static ssize_t pm8001_ctl_ib_queue_log_show(struct device *cdev, int offset; char *str = buf; int start = 0; + u32 ib_offset = pm8001_ha->ib_offset; #define IB_MEMMAP(c) \ (*(u32 *)((u8 *)pm8001_ha-> \ - memoryMap.region[IB].virt_ptr + \ + memoryMap.region[ib_offset].virt_ptr + \ pm8001_ha->evtlog_ib_offset + (c))) for (offset = 0; offset < IB_OB_READ_TIMES; offset++) { @@ -442,9 +443,10 @@ static ssize_t pm8001_ctl_ob_queue_log_show(struct device *cdev, int offset; char *str = buf; int start = 0; + u32 ob_offset = pm8001_ha->ob_offset; #define OB_MEMMAP(c) \ (*(u32 *)((u8 *)pm8001_ha-> \ - memoryMap.region[OB].virt_ptr + \ + memoryMap.region[ob_offset].virt_ptr + \ pm8001_ha->evtlog_ob_offset + (c))) for (offset = 0; offset < IB_OB_READ_TIMES; offset++) { diff --git a/drivers/scsi/pm8001/pm8001_defs.h b/drivers/scsi/pm8001/pm8001_defs.h index 1c7f15fd69ce..a4f52a5a449e 100644 --- a/drivers/scsi/pm8001/pm8001_defs.h +++ b/drivers/scsi/pm8001/pm8001_defs.h @@ -77,10 +77,8 @@ enum port_type { /* driver compile-time configuration */ #define PM8001_MAX_CCB 256 /* max ccbs supported */ #define PM8001_MPI_QUEUE 1024 /* maximum mpi queue entries */ -#define PM8001_MAX_INB_NUM 1 -#define PM8001_MAX_OUTB_NUM 1 -#define PM8001_MAX_SPCV_INB_NUM 1 -#define PM8001_MAX_SPCV_OUTB_NUM 4 +#define PM8001_MAX_INB_NUM 64 +#define PM8001_MAX_OUTB_NUM 64 #define PM8001_CAN_QUEUE 508 /* SCSI Queue depth */ /* Inbound/Outbound queue size */ @@ -94,11 +92,6 @@ enum port_type { #define PM8001_MAX_MSIX_VEC 64 /* max msi-x int for spcv/ve */ #define USI_MAX_MEMCNT_BASE 5 -#define IB (USI_MAX_MEMCNT_BASE + 1) -#define CI (IB + PM8001_MAX_SPCV_INB_NUM) -#define OB (CI + PM8001_MAX_SPCV_INB_NUM) -#define PI (OB + PM8001_MAX_SPCV_OUTB_NUM) -#define USI_MAX_MEMCNT (PI + PM8001_MAX_SPCV_OUTB_NUM) #define CONFIG_SCSI_PM8001_MAX_DMA_SG 528 #define PM8001_MAX_DMA_SG CONFIG_SCSI_PM8001_MAX_DMA_SG enum memory_region_num { @@ -112,6 +105,12 @@ enum memory_region_num { }; #define PM8001_EVENT_LOG_SIZE (128 * 1024) +/** + * maximum DMA memory regions(number of IBQ + number of IBQ CI + * + number of OBQ + number of OBQ PI) + */ +#define USI_MAX_MEMCNT (USI_MAX_MEMCNT_BASE + 1 + ((2 * PM8001_MAX_INB_NUM) \ + + (2 * PM8001_MAX_OUTB_NUM))) /*error code*/ enum mpi_err { MPI_IO_STATUS_SUCCESS = 0x0, diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index e9a939230b15..e9106575a30f 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -189,6 +189,10 @@ static void init_default_table_values(struct pm8001_hba_info *pm8001_ha) u32 offsetib, offsetob; void __iomem *addressib = pm8001_ha->inbnd_q_tbl_addr; void __iomem *addressob = pm8001_ha->outbnd_q_tbl_addr; + u32 ib_offset = pm8001_ha->ib_offset; + u32 ob_offset = pm8001_ha->ob_offset; + u32 ci_offset = pm8001_ha->ci_offset; + u32 pi_offset = pm8001_ha->pi_offset; pm8001_ha->main_cfg_tbl.pm8001_tbl.inbound_q_nppd_hppd = 0; 
pm8001_ha->main_cfg_tbl.pm8001_tbl.outbound_hw_event_pid0_3 = 0; @@ -223,19 +227,19 @@ static void init_default_table_values(struct pm8001_hba_info *pm8001_ha) pm8001_ha->inbnd_q_tbl[i].element_pri_size_cnt = PM8001_MPI_QUEUE | (pm8001_ha->iomb_size << 16) | (0x00<<30); pm8001_ha->inbnd_q_tbl[i].upper_base_addr = - pm8001_ha->memoryMap.region[IB + i].phys_addr_hi; + pm8001_ha->memoryMap.region[ib_offset + i].phys_addr_hi; pm8001_ha->inbnd_q_tbl[i].lower_base_addr = - pm8001_ha->memoryMap.region[IB + i].phys_addr_lo; + pm8001_ha->memoryMap.region[ib_offset + i].phys_addr_lo; pm8001_ha->inbnd_q_tbl[i].base_virt = - (u8 *)pm8001_ha->memoryMap.region[IB + i].virt_ptr; + (u8 *)pm8001_ha->memoryMap.region[ib_offset + i].virt_ptr; pm8001_ha->inbnd_q_tbl[i].total_length = - pm8001_ha->memoryMap.region[IB + i].total_len; + pm8001_ha->memoryMap.region[ib_offset + i].total_len; pm8001_ha->inbnd_q_tbl[i].ci_upper_base_addr = - pm8001_ha->memoryMap.region[CI + i].phys_addr_hi; + pm8001_ha->memoryMap.region[ci_offset + i].phys_addr_hi; pm8001_ha->inbnd_q_tbl[i].ci_lower_base_addr = - pm8001_ha->memoryMap.region[CI + i].phys_addr_lo; + pm8001_ha->memoryMap.region[ci_offset + i].phys_addr_lo; pm8001_ha->inbnd_q_tbl[i].ci_virt = - pm8001_ha->memoryMap.region[CI + i].virt_ptr; + pm8001_ha->memoryMap.region[ci_offset + i].virt_ptr; offsetib = i * 0x20; pm8001_ha->inbnd_q_tbl[i].pi_pci_bar = get_pci_bar_index(pm8001_mr32(addressib, @@ -249,21 +253,21 @@ static void init_default_table_values(struct pm8001_hba_info *pm8001_ha) pm8001_ha->outbnd_q_tbl[i].element_size_cnt = PM8001_MPI_QUEUE | (pm8001_ha->iomb_size << 16) | (0x01<<30); pm8001_ha->outbnd_q_tbl[i].upper_base_addr = - pm8001_ha->memoryMap.region[OB + i].phys_addr_hi; + pm8001_ha->memoryMap.region[ob_offset + i].phys_addr_hi; pm8001_ha->outbnd_q_tbl[i].lower_base_addr = - pm8001_ha->memoryMap.region[OB + i].phys_addr_lo; + pm8001_ha->memoryMap.region[ob_offset + i].phys_addr_lo; pm8001_ha->outbnd_q_tbl[i].base_virt = - (u8 *)pm8001_ha->memoryMap.region[OB + i].virt_ptr; + (u8 *)pm8001_ha->memoryMap.region[ob_offset + i].virt_ptr; pm8001_ha->outbnd_q_tbl[i].total_length = - pm8001_ha->memoryMap.region[OB + i].total_len; + pm8001_ha->memoryMap.region[ob_offset + i].total_len; pm8001_ha->outbnd_q_tbl[i].pi_upper_base_addr = - pm8001_ha->memoryMap.region[PI + i].phys_addr_hi; + pm8001_ha->memoryMap.region[pi_offset + i].phys_addr_hi; pm8001_ha->outbnd_q_tbl[i].pi_lower_base_addr = - pm8001_ha->memoryMap.region[PI + i].phys_addr_lo; + pm8001_ha->memoryMap.region[pi_offset + i].phys_addr_lo; pm8001_ha->outbnd_q_tbl[i].interrup_vec_cnt_delay = 0 | (10 << 16) | (i << 24); pm8001_ha->outbnd_q_tbl[i].pi_virt = - pm8001_ha->memoryMap.region[PI + i].virt_ptr; + pm8001_ha->memoryMap.region[pi_offset + i].virt_ptr; offsetob = i * 0x24; pm8001_ha->outbnd_q_tbl[i].ci_pci_bar = get_pci_bar_index(pm8001_mr32(addressob, diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index 20fa96cbc9d3..cddcc9205d8c 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -264,12 +264,36 @@ static u32 pm8001_request_irq(struct pm8001_hba_info *pm8001_ha); static int pm8001_alloc(struct pm8001_hba_info *pm8001_ha, const struct pci_device_id *ent) { - int i; + int i, count = 0, rc = 0; + u32 ci_offset, ib_offset, ob_offset, pi_offset; + struct inbound_queue_table *circularQ; + spin_lock_init(&pm8001_ha->lock); spin_lock_init(&pm8001_ha->bitmap_lock); PM8001_INIT_DBG(pm8001_ha, pm8001_printk("pm8001_alloc: PHY:%x\n", 
pm8001_ha->chip->n_phy)); + + /* Setup Interrupt */ + rc = pm8001_setup_irq(pm8001_ha); + if (rc) { + PM8001_FAIL_DBG(pm8001_ha, pm8001_printk( + "pm8001_setup_irq failed [ret: %d]\n", rc)); + goto err_out_shost; + } + /* Request Interrupt */ + rc = pm8001_request_irq(pm8001_ha); + if (rc) + goto err_out_shost; + + count = pm8001_ha->max_q_num; + /* Queues are chosen based on the number of cores/msix availability */ + ib_offset = pm8001_ha->ib_offset = USI_MAX_MEMCNT_BASE + 1; + ci_offset = pm8001_ha->ci_offset = ib_offset + count; + ob_offset = pm8001_ha->ob_offset = ci_offset + count; + pi_offset = pm8001_ha->pi_offset = ob_offset + count; + pm8001_ha->max_memcnt = pi_offset + count; + for (i = 0; i < pm8001_ha->chip->n_phy; i++) { pm8001_phy_init(pm8001_ha, i); pm8001_ha->port[i].wide_port_phymap = 0; @@ -293,54 +317,62 @@ static int pm8001_alloc(struct pm8001_hba_info *pm8001_ha, pm8001_ha->memoryMap.region[IOP].total_len = PM8001_EVENT_LOG_SIZE; pm8001_ha->memoryMap.region[IOP].alignment = 32; - for (i = 0; i < PM8001_MAX_SPCV_INB_NUM; i++) { + for (i = 0; i < count; i++) { + circularQ = &pm8001_ha->inbnd_q_tbl[i]; + spin_lock_init(&circularQ->iq_lock); /* MPI Memory region 3 for consumer Index of inbound queues */ - pm8001_ha->memoryMap.region[CI+i].num_elements = 1; - pm8001_ha->memoryMap.region[CI+i].element_size = 4; - pm8001_ha->memoryMap.region[CI+i].total_len = 4; - pm8001_ha->memoryMap.region[CI+i].alignment = 4; + pm8001_ha->memoryMap.region[ci_offset+i].num_elements = 1; + pm8001_ha->memoryMap.region[ci_offset+i].element_size = 4; + pm8001_ha->memoryMap.region[ci_offset+i].total_len = 4; + pm8001_ha->memoryMap.region[ci_offset+i].alignment = 4; if ((ent->driver_data) != chip_8001) { /* MPI Memory region 5 inbound queues */ - pm8001_ha->memoryMap.region[IB+i].num_elements = + pm8001_ha->memoryMap.region[ib_offset+i].num_elements = PM8001_MPI_QUEUE; - pm8001_ha->memoryMap.region[IB+i].element_size = 128; - pm8001_ha->memoryMap.region[IB+i].total_len = + pm8001_ha->memoryMap.region[ib_offset+i].element_size + = 128; + pm8001_ha->memoryMap.region[ib_offset+i].total_len = PM8001_MPI_QUEUE * 128; - pm8001_ha->memoryMap.region[IB+i].alignment = 128; + pm8001_ha->memoryMap.region[ib_offset+i].alignment + = 128; } else { - pm8001_ha->memoryMap.region[IB+i].num_elements = + pm8001_ha->memoryMap.region[ib_offset+i].num_elements = PM8001_MPI_QUEUE; - pm8001_ha->memoryMap.region[IB+i].element_size = 64; - pm8001_ha->memoryMap.region[IB+i].total_len = + pm8001_ha->memoryMap.region[ib_offset+i].element_size + = 64; + pm8001_ha->memoryMap.region[ib_offset+i].total_len = PM8001_MPI_QUEUE * 64; - pm8001_ha->memoryMap.region[IB+i].alignment = 64; + pm8001_ha->memoryMap.region[ib_offset+i].alignment = 64; } } - for (i = 0; i < PM8001_MAX_SPCV_OUTB_NUM; i++) { + for (i = 0; i < count; i++) { /* MPI Memory region 4 for producer Index of outbound queues */ - pm8001_ha->memoryMap.region[PI+i].num_elements = 1; - pm8001_ha->memoryMap.region[PI+i].element_size = 4; - pm8001_ha->memoryMap.region[PI+i].total_len = 4; - pm8001_ha->memoryMap.region[PI+i].alignment = 4; + pm8001_ha->memoryMap.region[pi_offset+i].num_elements = 1; + pm8001_ha->memoryMap.region[pi_offset+i].element_size = 4; + pm8001_ha->memoryMap.region[pi_offset+i].total_len = 4; + pm8001_ha->memoryMap.region[pi_offset+i].alignment = 4; if (ent->driver_data != chip_8001) { /* MPI Memory region 6 Outbound queues */ - pm8001_ha->memoryMap.region[OB+i].num_elements = + pm8001_ha->memoryMap.region[ob_offset+i].num_elements = PM8001_MPI_QUEUE; 
- pm8001_ha->memoryMap.region[OB+i].element_size = 128; - pm8001_ha->memoryMap.region[OB+i].total_len = + pm8001_ha->memoryMap.region[ob_offset+i].element_size + = 128; + pm8001_ha->memoryMap.region[ob_offset+i].total_len = PM8001_MPI_QUEUE * 128; - pm8001_ha->memoryMap.region[OB+i].alignment = 128; + pm8001_ha->memoryMap.region[ob_offset+i].alignment + = 128; } else { /* MPI Memory region 6 Outbound queues */ - pm8001_ha->memoryMap.region[OB+i].num_elements = + pm8001_ha->memoryMap.region[ob_offset+i].num_elements = PM8001_MPI_QUEUE; - pm8001_ha->memoryMap.region[OB+i].element_size = 64; - pm8001_ha->memoryMap.region[OB+i].total_len = + pm8001_ha->memoryMap.region[ob_offset+i].element_size + = 64; + pm8001_ha->memoryMap.region[ob_offset+i].total_len = PM8001_MPI_QUEUE * 64; - pm8001_ha->memoryMap.region[OB+i].alignment = 64; + pm8001_ha->memoryMap.region[ob_offset+i].alignment = 64; } } @@ -369,7 +401,7 @@ static int pm8001_alloc(struct pm8001_hba_info *pm8001_ha, pm8001_ha->memoryMap.region[FORENSIC_MEM].total_len = 0x10000; pm8001_ha->memoryMap.region[FORENSIC_MEM].element_size = 0x10000; pm8001_ha->memoryMap.region[FORENSIC_MEM].alignment = 0x10000; - for (i = 0; i < USI_MAX_MEMCNT; i++) { + for (i = 0; i < pm8001_ha->max_memcnt; i++) { if (pm8001_mem_alloc(pm8001_ha->pdev, &pm8001_ha->memoryMap.region[i].virt_ptr, &pm8001_ha->memoryMap.region[i].phys_addr, @@ -405,6 +437,8 @@ static int pm8001_alloc(struct pm8001_hba_info *pm8001_ha, /* Initialize tags */ pm8001_tag_init(pm8001_ha); return 0; +err_out_shost: + scsi_remove_host(pm8001_ha->shost); err_out: return 1; } @@ -899,7 +933,8 @@ static int pm8001_configure_phy_settings(struct pm8001_hba_info *pm8001_ha) static u32 pm8001_setup_msix(struct pm8001_hba_info *pm8001_ha) { u32 number_of_intr; - int rc; + int rc, cpu_online_count; + unsigned int allocated_irq_vectors; /* SPCv controllers supports 64 msi-x */ if (pm8001_ha->chip_id == chip_8001) { @@ -908,13 +943,21 @@ static u32 pm8001_setup_msix(struct pm8001_hba_info *pm8001_ha) number_of_intr = PM8001_MAX_MSIX_VEC; } + cpu_online_count = num_online_cpus(); + number_of_intr = min_t(int, cpu_online_count, number_of_intr); rc = pci_alloc_irq_vectors(pm8001_ha->pdev, number_of_intr, number_of_intr, PCI_IRQ_MSIX); - number_of_intr = rc; + allocated_irq_vectors = rc; if (rc < 0) return rc; + + /* Assigns the number of interrupts */ + number_of_intr = min_t(int, allocated_irq_vectors, number_of_intr); pm8001_ha->number_of_intr = number_of_intr; + /* Maximum queue number updating in HBA structure */ + pm8001_ha->max_q_num = number_of_intr; + PM8001_INIT_DBG(pm8001_ha, pm8001_printk( "pci_alloc_irq_vectors request ret:%d no of intr %d\n", rc, pm8001_ha->number_of_intr)); @@ -1069,13 +1112,6 @@ static int pm8001_pci_probe(struct pci_dev *pdev, rc = -ENOMEM; goto err_out_free; } - /* Setup Interrupt */ - rc = pm8001_setup_irq(pm8001_ha); - if (rc) { - PM8001_FAIL_DBG(pm8001_ha, pm8001_printk( - "pm8001_setup_irq failed [ret: %d]\n", rc)); - goto err_out_shost; - } PM8001_CHIP_DISP->chip_soft_rst(pm8001_ha); rc = PM8001_CHIP_DISP->chip_init(pm8001_ha); @@ -1088,13 +1124,6 @@ static int pm8001_pci_probe(struct pci_dev *pdev, rc = scsi_add_host(shost, &pdev->dev); if (rc) goto err_out_ha_free; - /* Request Interrupt */ - rc = pm8001_request_irq(pm8001_ha); - if (rc) { - PM8001_FAIL_DBG(pm8001_ha, pm8001_printk( - "pm8001_request_irq failed [ret: %d]\n", rc)); - goto err_out_shost; - } PM8001_CHIP_DISP->interrupt_enable(pm8001_ha, 0); if (pm8001_ha->chip_id != chip_8001) { diff --git 
a/drivers/scsi/pm8001/pm8001_sas.h b/drivers/scsi/pm8001/pm8001_sas.h index ae7ba9b3c4bc..bdfce3c3f619 100644 --- a/drivers/scsi/pm8001/pm8001_sas.h +++ b/drivers/scsi/pm8001/pm8001_sas.h @@ -468,6 +468,7 @@ struct inbound_queue_table { u32 reserved; __le32 consumer_index; u32 producer_idx; + spinlock_t iq_lock; }; struct outbound_queue_table { u32 element_size_cnt; @@ -524,8 +525,8 @@ struct pm8001_hba_info { void __iomem *fatal_tbl_addr; /*MPI IVT Table Addr */ union main_cfg_table main_cfg_tbl; union general_status_table gs_tbl; - struct inbound_queue_table inbnd_q_tbl[PM8001_MAX_SPCV_INB_NUM]; - struct outbound_queue_table outbnd_q_tbl[PM8001_MAX_SPCV_OUTB_NUM]; + struct inbound_queue_table inbnd_q_tbl[PM8001_MAX_INB_NUM]; + struct outbound_queue_table outbnd_q_tbl[PM8001_MAX_OUTB_NUM]; struct sas_phy_attribute_table phy_attr_table; /* MPI SAS PHY attributes */ u8 sas_addr[SAS_ADDR_SIZE]; @@ -561,6 +562,12 @@ struct pm8001_hba_info { u32 reset_in_progress; u32 non_fatal_count; u32 non_fatal_read_length; + u32 max_q_num; + u32 ib_offset; + u32 ob_offset; + u32 ci_offset; + u32 pi_offset; + u32 max_memcnt; }; struct pm8001_work { diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index b42f41d1ed49..26e9e8877107 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -720,7 +720,7 @@ static void read_inbnd_queue_table(struct pm8001_hba_info *pm8001_ha) { int i; void __iomem *address = pm8001_ha->inbnd_q_tbl_addr; - for (i = 0; i < PM8001_MAX_SPCV_INB_NUM; i++) { + for (i = 0; i < PM8001_MAX_INB_NUM; i++) { u32 offset = i * 0x20; pm8001_ha->inbnd_q_tbl[i].pi_pci_bar = get_pci_bar_index(pm8001_mr32(address, @@ -738,7 +738,7 @@ static void read_outbnd_queue_table(struct pm8001_hba_info *pm8001_ha) { int i; void __iomem *address = pm8001_ha->outbnd_q_tbl_addr; - for (i = 0; i < PM8001_MAX_SPCV_OUTB_NUM; i++) { + for (i = 0; i < PM8001_MAX_OUTB_NUM; i++) { u32 offset = i * 0x24; pm8001_ha->outbnd_q_tbl[i].ci_pci_bar = get_pci_bar_index(pm8001_mr32(address, @@ -758,6 +758,10 @@ static void init_default_table_values(struct pm8001_hba_info *pm8001_ha) u32 offsetib, offsetob; void __iomem *addressib = pm8001_ha->inbnd_q_tbl_addr; void __iomem *addressob = pm8001_ha->outbnd_q_tbl_addr; + u32 ib_offset = pm8001_ha->ib_offset; + u32 ob_offset = pm8001_ha->ob_offset; + u32 ci_offset = pm8001_ha->ci_offset; + u32 pi_offset = pm8001_ha->pi_offset; pm8001_ha->main_cfg_tbl.pm80xx_tbl.upper_event_log_addr = pm8001_ha->memoryMap.region[AAP1].phys_addr_hi; @@ -778,23 +782,23 @@ static void init_default_table_values(struct pm8001_hba_info *pm8001_ha) /* Disable end to end CRC checking */ pm8001_ha->main_cfg_tbl.pm80xx_tbl.crc_core_dump = (0x1 << 16); - for (i = 0; i < PM8001_MAX_SPCV_INB_NUM; i++) { + for (i = 0; i < pm8001_ha->max_q_num; i++) { pm8001_ha->inbnd_q_tbl[i].element_pri_size_cnt = PM8001_MPI_QUEUE | (pm8001_ha->iomb_size << 16) | (0x00<<30); pm8001_ha->inbnd_q_tbl[i].upper_base_addr = - pm8001_ha->memoryMap.region[IB + i].phys_addr_hi; + pm8001_ha->memoryMap.region[ib_offset + i].phys_addr_hi; pm8001_ha->inbnd_q_tbl[i].lower_base_addr = - pm8001_ha->memoryMap.region[IB + i].phys_addr_lo; + pm8001_ha->memoryMap.region[ib_offset + i].phys_addr_lo; pm8001_ha->inbnd_q_tbl[i].base_virt = - (u8 *)pm8001_ha->memoryMap.region[IB + i].virt_ptr; + (u8 *)pm8001_ha->memoryMap.region[ib_offset + i].virt_ptr; pm8001_ha->inbnd_q_tbl[i].total_length = - pm8001_ha->memoryMap.region[IB + i].total_len; + pm8001_ha->memoryMap.region[ib_offset + 
i].total_len; pm8001_ha->inbnd_q_tbl[i].ci_upper_base_addr = - pm8001_ha->memoryMap.region[CI + i].phys_addr_hi; + pm8001_ha->memoryMap.region[ci_offset + i].phys_addr_hi; pm8001_ha->inbnd_q_tbl[i].ci_lower_base_addr = - pm8001_ha->memoryMap.region[CI + i].phys_addr_lo; + pm8001_ha->memoryMap.region[ci_offset + i].phys_addr_lo; pm8001_ha->inbnd_q_tbl[i].ci_virt = - pm8001_ha->memoryMap.region[CI + i].virt_ptr; + pm8001_ha->memoryMap.region[ci_offset + i].virt_ptr; offsetib = i * 0x20; pm8001_ha->inbnd_q_tbl[i].pi_pci_bar = get_pci_bar_index(pm8001_mr32(addressib, @@ -809,25 +813,25 @@ static void init_default_table_values(struct pm8001_hba_info *pm8001_ha) pm8001_ha->inbnd_q_tbl[i].pi_pci_bar, pm8001_ha->inbnd_q_tbl[i].pi_offset)); } - for (i = 0; i < PM8001_MAX_SPCV_OUTB_NUM; i++) { + for (i = 0; i < pm8001_ha->max_q_num; i++) { pm8001_ha->outbnd_q_tbl[i].element_size_cnt = PM8001_MPI_QUEUE | (pm8001_ha->iomb_size << 16) | (0x01<<30); pm8001_ha->outbnd_q_tbl[i].upper_base_addr = - pm8001_ha->memoryMap.region[OB + i].phys_addr_hi; + pm8001_ha->memoryMap.region[ob_offset + i].phys_addr_hi; pm8001_ha->outbnd_q_tbl[i].lower_base_addr = - pm8001_ha->memoryMap.region[OB + i].phys_addr_lo; + pm8001_ha->memoryMap.region[ob_offset + i].phys_addr_lo; pm8001_ha->outbnd_q_tbl[i].base_virt = - (u8 *)pm8001_ha->memoryMap.region[OB + i].virt_ptr; + (u8 *)pm8001_ha->memoryMap.region[ob_offset + i].virt_ptr; pm8001_ha->outbnd_q_tbl[i].total_length = - pm8001_ha->memoryMap.region[OB + i].total_len; + pm8001_ha->memoryMap.region[ob_offset + i].total_len; pm8001_ha->outbnd_q_tbl[i].pi_upper_base_addr = - pm8001_ha->memoryMap.region[PI + i].phys_addr_hi; + pm8001_ha->memoryMap.region[pi_offset + i].phys_addr_hi; pm8001_ha->outbnd_q_tbl[i].pi_lower_base_addr = - pm8001_ha->memoryMap.region[PI + i].phys_addr_lo; + pm8001_ha->memoryMap.region[pi_offset + i].phys_addr_lo; /* interrupt vector based on oq */ pm8001_ha->outbnd_q_tbl[i].interrup_vec_cnt_delay = (i << 24); pm8001_ha->outbnd_q_tbl[i].pi_virt = - pm8001_ha->memoryMap.region[PI + i].virt_ptr; + pm8001_ha->memoryMap.region[pi_offset + i].virt_ptr; offsetob = i * 0x24; pm8001_ha->outbnd_q_tbl[i].ci_pci_bar = get_pci_bar_index(pm8001_mr32(addressob, @@ -871,7 +875,7 @@ static void update_main_config_table(struct pm8001_hba_info *pm8001_ha) pm8001_ha->main_cfg_tbl.pm80xx_tbl.pcs_event_log_severity); /* Update Fatal error interrupt vector */ pm8001_ha->main_cfg_tbl.pm80xx_tbl.fatal_err_interrupt |= - ((pm8001_ha->number_of_intr - 1) << 8); + ((pm8001_ha->max_q_num - 1) << 8); pm8001_mw32(address, MAIN_FATAL_ERROR_INTERRUPT, pm8001_ha->main_cfg_tbl.pm80xx_tbl.fatal_err_interrupt); PM8001_DEV_DBG(pm8001_ha, pm8001_printk( @@ -1010,8 +1014,12 @@ static int mpi_init_check(struct pm8001_hba_info *pm8001_ha) value &= SPCv_MSGU_CFG_TABLE_UPDATE; } while ((value != 0) && (--max_wait_count)); - if (!max_wait_count) - return -1; + if (!max_wait_count) { + /* additional check */ + PM8001_FAIL_DBG(pm8001_ha, pm8001_printk( + "Inb doorbell clear not toggled[value:%x]\n", value)); + return -EBUSY; + } /* check the MPI-State for initialization upto 100ms*/ max_wait_count = 100 * 1000;/* 100 msec */ do { @@ -1022,12 +1030,12 @@ static int mpi_init_check(struct pm8001_hba_info *pm8001_ha) } while ((GST_MPI_STATE_INIT != (gst_len_mpistate & GST_MPI_STATE_MASK)) && (--max_wait_count)); if (!max_wait_count) - return -1; + return -EBUSY; /* check MPI Initialization error */ gst_len_mpistate = gst_len_mpistate >> 16; if (0x0000 != gst_len_mpistate) - return -1; + return -EBUSY; 
return 0; } @@ -1469,11 +1477,10 @@ static int pm80xx_chip_init(struct pm8001_hba_info *pm8001_ha) /* update main config table ,inbound table and outbound table */ update_main_config_table(pm8001_ha); - for (i = 0; i < PM8001_MAX_SPCV_INB_NUM; i++) + for (i = 0; i < pm8001_ha->max_q_num; i++) { update_inbnd_queue_table(pm8001_ha, i); - for (i = 0; i < PM8001_MAX_SPCV_OUTB_NUM; i++) update_outbnd_queue_table(pm8001_ha, i); - + } /* notify firmware update finished and check initialization status */ if (0 == mpi_init_check(pm8001_ha)) { PM8001_INIT_DBG(pm8001_ha, @@ -4191,7 +4198,7 @@ static int process_oq(struct pm8001_hba_info *pm8001_ha, u8 vec) unsigned long flags; u32 regval; - if (vec == (pm8001_ha->number_of_intr - 1)) { + if (vec == (pm8001_ha->max_q_num - 1)) { regval = pm8001_cr32(pm8001_ha, 0, MSGU_SCRATCH_PAD_1); if ((regval & SCRATCH_PAD_MIPSALL_READY) != SCRATCH_PAD_MIPSALL_READY) { @@ -4274,6 +4281,7 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha, char *preq_dma_addr = NULL; __le64 tmp_addr; u32 i, length; + unsigned long flags; memset(&smp_cmd, 0, sizeof(smp_cmd)); /* @@ -4369,8 +4377,10 @@ static int pm80xx_chip_smp_req(struct pm8001_hba_info *pm8001_ha, build_smp_cmd(pm8001_dev->device_id, smp_cmd.tag, &smp_cmd, pm8001_ha->smp_exp_mode, length); + spin_lock_irqsave(&circularQ->iq_lock, flags); rc = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &smp_cmd, sizeof(smp_cmd), 0); + spin_unlock_irqrestore(&circularQ->iq_lock, flags); if (rc) goto err_out_2; return 0; @@ -4434,7 +4444,8 @@ static int pm80xx_chip_ssp_io_req(struct pm8001_hba_info *pm8001_ha, u64 phys_addr, start_addr, end_addr; u32 end_addr_high, end_addr_low; struct inbound_queue_table *circularQ; - u32 q_index; + unsigned long flags; + u32 q_index, cpu_id; u32 opc = OPC_INB_SSPINIIOSTART; memset(&ssp_cmd, 0, sizeof(ssp_cmd)); memcpy(ssp_cmd.ssp_iu.lun, task->ssp_task.LUN, 8); @@ -4453,7 +4464,8 @@ static int pm80xx_chip_ssp_io_req(struct pm8001_hba_info *pm8001_ha, ssp_cmd.ssp_iu.efb_prio_attr |= (task->ssp_task.task_attr & 7); memcpy(ssp_cmd.ssp_iu.cdb, task->ssp_task.cmd->cmnd, task->ssp_task.cmd->cmd_len); - q_index = (u32) (pm8001_dev->id & 0x00ffffff) % PM8001_MAX_INB_NUM; + cpu_id = smp_processor_id(); + q_index = (u32) (cpu_id) % (pm8001_ha->max_q_num); circularQ = &pm8001_ha->inbnd_q_tbl[q_index]; /* Check if encryption is set */ @@ -4576,9 +4588,10 @@ static int pm80xx_chip_ssp_io_req(struct pm8001_hba_info *pm8001_ha, ssp_cmd.esgl = 0; } } - q_index = (u32) (pm8001_dev->id & 0x00ffffff) % PM8001_MAX_OUTB_NUM; + spin_lock_irqsave(&circularQ->iq_lock, flags); ret = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &ssp_cmd, sizeof(ssp_cmd), q_index); + spin_unlock_irqrestore(&circularQ->iq_lock, flags); return ret; } @@ -4590,7 +4603,7 @@ static int pm80xx_chip_sata_req(struct pm8001_hba_info *pm8001_ha, struct pm8001_device *pm8001_ha_dev = dev->lldd_dev; u32 tag = ccb->ccb_tag; int ret; - u32 q_index; + u32 q_index, cpu_id; struct sata_start_req sata_cmd; u32 hdr_tag, ncg_tag = 0; u64 phys_addr, start_addr, end_addr; @@ -4601,7 +4614,8 @@ static int pm80xx_chip_sata_req(struct pm8001_hba_info *pm8001_ha, unsigned long flags; u32 opc = OPC_INB_SATA_HOST_OPSTART; memset(&sata_cmd, 0, sizeof(sata_cmd)); - q_index = (u32) (pm8001_ha_dev->id & 0x00ffffff) % PM8001_MAX_INB_NUM; + cpu_id = smp_processor_id(); + q_index = (u32) (cpu_id) % (pm8001_ha->max_q_num); circularQ = &pm8001_ha->inbnd_q_tbl[q_index]; if (task->data_dir == DMA_NONE) { @@ -4817,9 +4831,10 @@ static int 
pm80xx_chip_sata_req(struct pm8001_hba_info *pm8001_ha, } } } - q_index = (u32) (pm8001_ha_dev->id & 0x00ffffff) % PM8001_MAX_OUTB_NUM; + spin_lock_irqsave(&circularQ->iq_lock, flags); ret = pm8001_mpi_build_cmd(pm8001_ha, circularQ, opc, &sata_cmd, sizeof(sata_cmd), q_index); + spin_unlock_irqrestore(&circularQ->iq_lock, flags); return ret; } From 27bc43bd7c42b3995d16ad63794e355ae865a3a3 Mon Sep 17 00:00:00 2001 From: Viswas G Date: Mon, 5 Oct 2020 20:20:09 +0530 Subject: [PATCH 276/497] scsi: pm80xx: Remove DMA memory allocation for ccb and device structures Remove DMA memory allocation for Devices and CCB structure. Instead allocate memory outside of DMA memory. DMA memory is a limited system resource and it is better to allocate memory outside of DMA memory when possible. Link: https://lore.kernel.org/r/20201005145011.23674-3-Viswas.G@microchip.com.com Acked-by: Jack Wang Signed-off-by: Viswas G Signed-off-by: Ruksar Devadi Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm8001_defs.h | 8 ++---- drivers/scsi/pm8001/pm8001_init.c | 48 +++++++++++++++++++------------ 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_defs.h b/drivers/scsi/pm8001/pm8001_defs.h index a4f52a5a449e..1bf1bcfaf010 100644 --- a/drivers/scsi/pm8001/pm8001_defs.h +++ b/drivers/scsi/pm8001/pm8001_defs.h @@ -91,17 +91,15 @@ enum port_type { #define PM8001_MAX_DEVICES 2048 /* max supported device */ #define PM8001_MAX_MSIX_VEC 64 /* max msi-x int for spcv/ve */ -#define USI_MAX_MEMCNT_BASE 5 #define CONFIG_SCSI_PM8001_MAX_DMA_SG 528 #define PM8001_MAX_DMA_SG CONFIG_SCSI_PM8001_MAX_DMA_SG enum memory_region_num { AAP1 = 0x0, /* application acceleration processor */ IOP, /* IO processor */ NVMD, /* NVM device */ - DEV_MEM, /* memory for devices */ - CCB_MEM, /* memory for command control block */ FW_FLASH, /* memory for fw flash update */ - FORENSIC_MEM /* memory for fw forensic data */ + FORENSIC_MEM, /* memory for fw forensic data */ + USI_MAX_MEMCNT_BASE }; #define PM8001_EVENT_LOG_SIZE (128 * 1024) @@ -109,7 +107,7 @@ enum memory_region_num { * maximum DMA memory regions(number of IBQ + number of IBQ CI * + number of OBQ + number of OBQ PI) */ -#define USI_MAX_MEMCNT (USI_MAX_MEMCNT_BASE + 1 + ((2 * PM8001_MAX_INB_NUM) \ +#define USI_MAX_MEMCNT (USI_MAX_MEMCNT_BASE + ((2 * PM8001_MAX_INB_NUM) \ + (2 * PM8001_MAX_OUTB_NUM))) /*error code*/ enum mpi_err { diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index cddcc9205d8c..2117a3e818fd 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -288,7 +288,7 @@ static int pm8001_alloc(struct pm8001_hba_info *pm8001_ha, count = pm8001_ha->max_q_num; /* Queues are chosen based on the number of cores/msix availability */ - ib_offset = pm8001_ha->ib_offset = USI_MAX_MEMCNT_BASE + 1; + ib_offset = pm8001_ha->ib_offset = USI_MAX_MEMCNT_BASE; ci_offset = pm8001_ha->ci_offset = ib_offset + count; ob_offset = pm8001_ha->ob_offset = ci_offset + count; pi_offset = pm8001_ha->pi_offset = ob_offset + count; @@ -380,19 +380,6 @@ static int pm8001_alloc(struct pm8001_hba_info *pm8001_ha, pm8001_ha->memoryMap.region[NVMD].num_elements = 1; pm8001_ha->memoryMap.region[NVMD].element_size = 4096; pm8001_ha->memoryMap.region[NVMD].total_len = 4096; - /* Memory region for devices*/ - pm8001_ha->memoryMap.region[DEV_MEM].num_elements = 1; - pm8001_ha->memoryMap.region[DEV_MEM].element_size = PM8001_MAX_DEVICES * - sizeof(struct pm8001_device); - 
pm8001_ha->memoryMap.region[DEV_MEM].total_len = PM8001_MAX_DEVICES * - sizeof(struct pm8001_device); - - /* Memory region for ccb_info*/ - pm8001_ha->memoryMap.region[CCB_MEM].num_elements = 1; - pm8001_ha->memoryMap.region[CCB_MEM].element_size = PM8001_MAX_CCB * - sizeof(struct pm8001_ccb_info); - pm8001_ha->memoryMap.region[CCB_MEM].total_len = PM8001_MAX_CCB * - sizeof(struct pm8001_ccb_info); /* Memory region for fw flash */ pm8001_ha->memoryMap.region[FW_FLASH].total_len = 4096; @@ -416,18 +403,30 @@ static int pm8001_alloc(struct pm8001_hba_info *pm8001_ha, } } - pm8001_ha->devices = pm8001_ha->memoryMap.region[DEV_MEM].virt_ptr; + /* Memory region for devices*/ + pm8001_ha->devices = kzalloc(PM8001_MAX_DEVICES + * sizeof(struct pm8001_device), GFP_KERNEL); + if (!pm8001_ha->devices) { + rc = -ENOMEM; + goto err_out_nodev; + } for (i = 0; i < PM8001_MAX_DEVICES; i++) { pm8001_ha->devices[i].dev_type = SAS_PHY_UNUSED; pm8001_ha->devices[i].id = i; pm8001_ha->devices[i].device_id = PM8001_MAX_DEVICES; pm8001_ha->devices[i].running_req = 0; } - pm8001_ha->ccb_info = pm8001_ha->memoryMap.region[CCB_MEM].virt_ptr; + /* Memory region for ccb_info*/ + pm8001_ha->ccb_info = kzalloc(PM8001_MAX_CCB + * sizeof(struct pm8001_ccb_info), GFP_KERNEL); + if (!pm8001_ha->ccb_info) { + rc = -ENOMEM; + goto err_out_noccb; + } for (i = 0; i < PM8001_MAX_CCB; i++) { pm8001_ha->ccb_info[i].ccb_dma_handle = - pm8001_ha->memoryMap.region[CCB_MEM].phys_addr + - i * sizeof(struct pm8001_ccb_info); + virt_to_phys(pm8001_ha->ccb_info) + + (i * sizeof(struct pm8001_ccb_info)); pm8001_ha->ccb_info[i].task = NULL; pm8001_ha->ccb_info[i].ccb_tag = 0xffffffff; pm8001_ha->ccb_info[i].device = NULL; @@ -437,8 +436,21 @@ static int pm8001_alloc(struct pm8001_hba_info *pm8001_ha, /* Initialize tags */ pm8001_tag_init(pm8001_ha); return 0; + +err_out_noccb: + kfree(pm8001_ha->devices); err_out_shost: scsi_remove_host(pm8001_ha->shost); +err_out_nodev: + for (i = 0; i < pm8001_ha->max_memcnt; i++) { + if (pm8001_ha->memoryMap.region[i].virt_ptr != NULL) { + pci_free_consistent(pm8001_ha->pdev, + (pm8001_ha->memoryMap.region[i].total_len + + pm8001_ha->memoryMap.region[i].alignment), + pm8001_ha->memoryMap.region[i].virt_ptr, + pm8001_ha->memoryMap.region[i].phys_addr); + } + } err_out: return 1; } From 5a141315ed7c2647fa4518570c4941a02588b466 Mon Sep 17 00:00:00 2001 From: Viswas G Date: Mon, 5 Oct 2020 20:20:10 +0530 Subject: [PATCH 277/497] scsi: pm80xx: Increase the number of outstanding I/O supported to 1024 The pm80xx driver currently sets the controller queue depth to 256. However, the controller supports up to 1024 outstanding I/Os. Increase the number of outstanding I/Os from 256 to 1024. CCBs and tags are allocated according to outstanding I/Os. Also update the can_queue value (max_out_io - PM8001_RESERVE_SLOT) used by the SCSI midlayer. [mkp: fixed zeroday complaint] Link: https://lore.kernel.org/r/20201005145011.23674-4-Viswas.G@microchip.com.com Reported-by: kernel test robot Acked-by: Jack Wang Signed-off-by: Viswas G Signed-off-by: Ruksar Devadi Signed-off-by: Martin K. 
Petersen --- drivers/scsi/pm8001/pm8001_defs.h | 4 +- drivers/scsi/pm8001/pm8001_hwi.c | 6 +-- drivers/scsi/pm8001/pm8001_init.c | 80 +++++++++++++++++++++++-------- drivers/scsi/pm8001/pm8001_sas.h | 2 +- drivers/scsi/pm8001/pm80xx_hwi.c | 28 ++++------- 5 files changed, 73 insertions(+), 47 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_defs.h b/drivers/scsi/pm8001/pm8001_defs.h index 1bf1bcfaf010..501b574239e8 100644 --- a/drivers/scsi/pm8001/pm8001_defs.h +++ b/drivers/scsi/pm8001/pm8001_defs.h @@ -75,7 +75,7 @@ enum port_type { }; /* driver compile-time configuration */ -#define PM8001_MAX_CCB 256 /* max ccbs supported */ +#define PM8001_MAX_CCB 1024 /* max ccbs supported */ #define PM8001_MPI_QUEUE 1024 /* maximum mpi queue entries */ #define PM8001_MAX_INB_NUM 64 #define PM8001_MAX_OUTB_NUM 64 @@ -90,9 +90,11 @@ enum port_type { #define PM8001_MAX_PORTS 16 /* max. possible ports */ #define PM8001_MAX_DEVICES 2048 /* max supported device */ #define PM8001_MAX_MSIX_VEC 64 /* max msi-x int for spcv/ve */ +#define PM8001_RESERVE_SLOT 8 #define CONFIG_SCSI_PM8001_MAX_DMA_SG 528 #define PM8001_MAX_DMA_SG CONFIG_SCSI_PM8001_MAX_DMA_SG + enum memory_region_num { AAP1 = 0x0, /* application acceleration processor */ IOP, /* IO processor */ diff --git a/drivers/scsi/pm8001/pm8001_hwi.c b/drivers/scsi/pm8001/pm8001_hwi.c index e9106575a30f..2b7b2954ec31 100644 --- a/drivers/scsi/pm8001/pm8001_hwi.c +++ b/drivers/scsi/pm8001/pm8001_hwi.c @@ -4375,8 +4375,7 @@ static int pm8001_chip_ssp_io_req(struct pm8001_hba_info *pm8001_ha, /* fill in PRD (scatter/gather) table, if any */ if (task->num_scatter > 1) { pm8001_chip_make_sg(task->scatter, ccb->n_elem, ccb->buf_prd); - phys_addr = ccb->ccb_dma_handle + - offsetof(struct pm8001_ccb_info, buf_prd[0]); + phys_addr = ccb->ccb_dma_handle; ssp_cmd.addr_low = cpu_to_le32(lower_32_bits(phys_addr)); ssp_cmd.addr_high = cpu_to_le32(upper_32_bits(phys_addr)); ssp_cmd.esgl = cpu_to_le32(1<<31); @@ -4449,8 +4448,7 @@ static int pm8001_chip_sata_req(struct pm8001_hba_info *pm8001_ha, /* fill in PRD (scatter/gather) table, if any */ if (task->num_scatter > 1) { pm8001_chip_make_sg(task->scatter, ccb->n_elem, ccb->buf_prd); - phys_addr = ccb->ccb_dma_handle + - offsetof(struct pm8001_ccb_info, buf_prd[0]); + phys_addr = ccb->ccb_dma_handle; sata_cmd.addr_low = lower_32_bits(phys_addr); sata_cmd.addr_high = upper_32_bits(phys_addr); sata_cmd.esgl = cpu_to_le32(1 << 31); diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index 2117a3e818fd..3cf3e58b6979 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -56,6 +56,7 @@ MODULE_PARM_DESC(link_rate, "Enable link rate.\n" " 8: Link rate 12.0G\n"); static struct scsi_transport_template *pm8001_stt; +static int pm8001_init_ccb_tag(struct pm8001_hba_info *, struct Scsi_Host *, struct pci_dev *); /* * chip info structure to identify chip key functionality as @@ -302,9 +303,6 @@ static int pm8001_alloc(struct pm8001_hba_info *pm8001_ha, INIT_LIST_HEAD(&pm8001_ha->port[i].list); } - pm8001_ha->tags = kzalloc(PM8001_MAX_CCB, GFP_KERNEL); - if (!pm8001_ha->tags) - goto err_out; /* MPI Memory region 1 for AAP Event Log for fw */ pm8001_ha->memoryMap.region[AAP1].num_elements = 1; pm8001_ha->memoryMap.region[AAP1].element_size = PM8001_EVENT_LOG_SIZE; @@ -416,29 +414,11 @@ static int pm8001_alloc(struct pm8001_hba_info *pm8001_ha, pm8001_ha->devices[i].device_id = PM8001_MAX_DEVICES; pm8001_ha->devices[i].running_req = 0; } - /* Memory region for 
ccb_info*/ - pm8001_ha->ccb_info = kzalloc(PM8001_MAX_CCB - * sizeof(struct pm8001_ccb_info), GFP_KERNEL); - if (!pm8001_ha->ccb_info) { - rc = -ENOMEM; - goto err_out_noccb; - } - for (i = 0; i < PM8001_MAX_CCB; i++) { - pm8001_ha->ccb_info[i].ccb_dma_handle = - virt_to_phys(pm8001_ha->ccb_info) + - (i * sizeof(struct pm8001_ccb_info)); - pm8001_ha->ccb_info[i].task = NULL; - pm8001_ha->ccb_info[i].ccb_tag = 0xffffffff; - pm8001_ha->ccb_info[i].device = NULL; - ++pm8001_ha->tags_num; - } pm8001_ha->flags = PM8001F_INIT_TIME; /* Initialize tags */ pm8001_tag_init(pm8001_ha); return 0; -err_out_noccb: - kfree(pm8001_ha->devices); err_out_shost: scsi_remove_host(pm8001_ha->shost); err_out_nodev: @@ -1133,6 +1113,10 @@ static int pm8001_pci_probe(struct pci_dev *pdev, goto err_out_ha_free; } + rc = pm8001_init_ccb_tag(pm8001_ha, shost, pdev); + if (rc) + goto err_out_enable; + rc = scsi_add_host(shost, &pdev->dev); if (rc) goto err_out_ha_free; @@ -1178,6 +1162,60 @@ err_out_enable: return rc; } +/* + * pm8001_init_ccb_tag - allocate memory to CCB and tag. + * @pm8001_ha: our hba card information. + * @shost: scsi host which has been allocated outside. + */ +static int +pm8001_init_ccb_tag(struct pm8001_hba_info *pm8001_ha, struct Scsi_Host *shost, + struct pci_dev *pdev) +{ + int i = 0; + u32 max_out_io, ccb_count; + u32 can_queue; + + max_out_io = pm8001_ha->main_cfg_tbl.pm80xx_tbl.max_out_io; + ccb_count = min_t(int, PM8001_MAX_CCB, max_out_io); + + /* Update to the scsi host*/ + can_queue = ccb_count - PM8001_RESERVE_SLOT; + shost->can_queue = can_queue; + + pm8001_ha->tags = kzalloc(ccb_count, GFP_KERNEL); + if (!pm8001_ha->tags) + goto err_out; + + /* Memory region for ccb_info*/ + pm8001_ha->ccb_info = (struct pm8001_ccb_info *) + kcalloc(ccb_count, sizeof(struct pm8001_ccb_info), GFP_KERNEL); + if (!pm8001_ha->ccb_info) { + PM8001_FAIL_DBG(pm8001_ha, pm8001_printk + ("Unable to allocate memory for ccb\n")); + goto err_out_noccb; + } + for (i = 0; i < ccb_count; i++) { + pm8001_ha->ccb_info[i].buf_prd = pci_alloc_consistent(pdev, + sizeof(struct pm8001_prd) * PM8001_MAX_DMA_SG, + &pm8001_ha->ccb_info[i].ccb_dma_handle); + if (!pm8001_ha->ccb_info[i].buf_prd) { + PM8001_FAIL_DBG(pm8001_ha, pm8001_printk + ("pm80xx: ccb prd memory allocation error\n")); + goto err_out; + } + pm8001_ha->ccb_info[i].task = NULL; + pm8001_ha->ccb_info[i].ccb_tag = 0xffffffff; + pm8001_ha->ccb_info[i].device = NULL; + ++pm8001_ha->tags_num; + } + return 0; + +err_out_noccb: + kfree(pm8001_ha->devices); +err_out: + return -ENOMEM; +} + static void pm8001_pci_remove(struct pci_dev *pdev) { struct sas_ha_struct *sha = pci_get_drvdata(pdev); diff --git a/drivers/scsi/pm8001/pm8001_sas.h b/drivers/scsi/pm8001/pm8001_sas.h index bdfce3c3f619..9d7796a74ed4 100644 --- a/drivers/scsi/pm8001/pm8001_sas.h +++ b/drivers/scsi/pm8001/pm8001_sas.h @@ -315,7 +315,7 @@ struct pm8001_ccb_info { u32 ccb_tag; dma_addr_t ccb_dma_handle; struct pm8001_device *device; - struct pm8001_prd buf_prd[PM8001_MAX_DMA_SG]; + struct pm8001_prd *buf_prd; struct fw_control_ex *fw_control_context; u8 open_retry; }; diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index 26e9e8877107..7593f248afb2 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -4483,8 +4483,7 @@ static int pm80xx_chip_ssp_io_req(struct pm8001_hba_info *pm8001_ha, if (task->num_scatter > 1) { pm8001_chip_make_sg(task->scatter, ccb->n_elem, ccb->buf_prd); - phys_addr = ccb->ccb_dma_handle + - offsetof(struct 
pm8001_ccb_info, buf_prd[0]); + phys_addr = ccb->ccb_dma_handle; ssp_cmd.enc_addr_low = cpu_to_le32(lower_32_bits(phys_addr)); ssp_cmd.enc_addr_high = @@ -4513,9 +4512,7 @@ static int pm80xx_chip_ssp_io_req(struct pm8001_hba_info *pm8001_ha, end_addr_high, end_addr_low)); pm8001_chip_make_sg(task->scatter, 1, ccb->buf_prd); - phys_addr = ccb->ccb_dma_handle + - offsetof(struct pm8001_ccb_info, - buf_prd[0]); + phys_addr = ccb->ccb_dma_handle; ssp_cmd.enc_addr_low = cpu_to_le32(lower_32_bits(phys_addr)); ssp_cmd.enc_addr_high = @@ -4543,8 +4540,7 @@ static int pm80xx_chip_ssp_io_req(struct pm8001_hba_info *pm8001_ha, if (task->num_scatter > 1) { pm8001_chip_make_sg(task->scatter, ccb->n_elem, ccb->buf_prd); - phys_addr = ccb->ccb_dma_handle + - offsetof(struct pm8001_ccb_info, buf_prd[0]); + phys_addr = ccb->ccb_dma_handle; ssp_cmd.addr_low = cpu_to_le32(lower_32_bits(phys_addr)); ssp_cmd.addr_high = @@ -4572,9 +4568,7 @@ static int pm80xx_chip_ssp_io_req(struct pm8001_hba_info *pm8001_ha, end_addr_high, end_addr_low)); pm8001_chip_make_sg(task->scatter, 1, ccb->buf_prd); - phys_addr = ccb->ccb_dma_handle + - offsetof(struct pm8001_ccb_info, - buf_prd[0]); + phys_addr = ccb->ccb_dma_handle; ssp_cmd.addr_low = cpu_to_le32(lower_32_bits(phys_addr)); ssp_cmd.addr_high = @@ -4666,8 +4660,7 @@ static int pm80xx_chip_sata_req(struct pm8001_hba_info *pm8001_ha, if (task->num_scatter > 1) { pm8001_chip_make_sg(task->scatter, ccb->n_elem, ccb->buf_prd); - phys_addr = ccb->ccb_dma_handle + - offsetof(struct pm8001_ccb_info, buf_prd[0]); + phys_addr = ccb->ccb_dma_handle; sata_cmd.enc_addr_low = lower_32_bits(phys_addr); sata_cmd.enc_addr_high = upper_32_bits(phys_addr); sata_cmd.enc_esgl = cpu_to_le32(1 << 31); @@ -4692,9 +4685,7 @@ static int pm80xx_chip_sata_req(struct pm8001_hba_info *pm8001_ha, end_addr_high, end_addr_low)); pm8001_chip_make_sg(task->scatter, 1, ccb->buf_prd); - phys_addr = ccb->ccb_dma_handle + - offsetof(struct pm8001_ccb_info, - buf_prd[0]); + phys_addr = ccb->ccb_dma_handle; sata_cmd.enc_addr_low = lower_32_bits(phys_addr); sata_cmd.enc_addr_high = @@ -4732,8 +4723,7 @@ static int pm80xx_chip_sata_req(struct pm8001_hba_info *pm8001_ha, if (task->num_scatter > 1) { pm8001_chip_make_sg(task->scatter, ccb->n_elem, ccb->buf_prd); - phys_addr = ccb->ccb_dma_handle + - offsetof(struct pm8001_ccb_info, buf_prd[0]); + phys_addr = ccb->ccb_dma_handle; sata_cmd.addr_low = lower_32_bits(phys_addr); sata_cmd.addr_high = upper_32_bits(phys_addr); sata_cmd.esgl = cpu_to_le32(1 << 31); @@ -4758,9 +4748,7 @@ static int pm80xx_chip_sata_req(struct pm8001_hba_info *pm8001_ha, end_addr_high, end_addr_low)); pm8001_chip_make_sg(task->scatter, 1, ccb->buf_prd); - phys_addr = ccb->ccb_dma_handle + - offsetof(struct pm8001_ccb_info, - buf_prd[0]); + phys_addr = ccb->ccb_dma_handle; sata_cmd.addr_low = lower_32_bits(phys_addr); sata_cmd.addr_high = From 39a45d538dba5f0ff0a056f0dcdbf3ac30a7beb7 Mon Sep 17 00:00:00 2001 From: Viswas G Date: Mon, 5 Oct 2020 20:20:11 +0530 Subject: [PATCH 278/497] scsi: pm80xx: Driver version update Update driver version from "0.1.39" -> "0.1.40" Link: https://lore.kernel.org/r/20201005145011.23674-5-Viswas.G@microchip.com.com Acked-by: Jack Wang Signed-off-by: Viswas G Signed-off-by: Ruksar Devadi Signed-off-by: Martin K. 
Petersen --- drivers/scsi/pm8001/pm8001_sas.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/pm8001/pm8001_sas.h b/drivers/scsi/pm8001/pm8001_sas.h index 9d7796a74ed4..95663e138083 100644 --- a/drivers/scsi/pm8001/pm8001_sas.h +++ b/drivers/scsi/pm8001/pm8001_sas.h @@ -58,7 +58,7 @@ #include "pm8001_defs.h" #define DRV_NAME "pm80xx" -#define DRV_VERSION "0.1.39" +#define DRV_VERSION "0.1.40" #define PM8001_FAIL_LOGGING 0x01 /* Error message logging */ #define PM8001_INIT_LOGGING 0x02 /* driver init logging */ #define PM8001_DISC_LOGGING 0x04 /* discovery layer logging */ From 9aae1c1fe627c850da1cc2eba02af98e060251d3 Mon Sep 17 00:00:00 2001 From: ching Huang Date: Mon, 28 Sep 2020 18:45:32 +0800 Subject: [PATCH 279/497] scsi: arcmsr: Remove unnecessary syntax Remove unnecessary syntax. Link: https://lore.kernel.org/r/29486c1a50df3bb1312fb9d6a2dec075f212e4d5.camel@areca.com.tw Signed-off-by: ching Huang Signed-off-by: Martin K. Petersen --- drivers/scsi/arcmsr/arcmsr_hba.c | 63 +++++++++----------------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index fa562a085600..5076480b336f 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -317,20 +317,16 @@ static bool arcmsr_remap_pciregion(struct AdapterControlBlock *acb) static void arcmsr_unmap_pciregion(struct AdapterControlBlock *acb) { switch (acb->adapter_type) { - case ACB_ADAPTER_TYPE_A:{ + case ACB_ADAPTER_TYPE_A: iounmap(acb->pmuA); - } - break; - case ACB_ADAPTER_TYPE_B:{ + break; + case ACB_ADAPTER_TYPE_B: iounmap(acb->mem_base0); iounmap(acb->mem_base1); - } - - break; - case ACB_ADAPTER_TYPE_C:{ + break; + case ACB_ADAPTER_TYPE_C: iounmap(acb->pmuC); - } - break; + break; case ACB_ADAPTER_TYPE_D: iounmap(acb->mem_base0); break; @@ -552,18 +548,14 @@ static void arcmsr_flush_adapter_cache(struct AdapterControlBlock *acb) { switch (acb->adapter_type) { - case ACB_ADAPTER_TYPE_A: { + case ACB_ADAPTER_TYPE_A: arcmsr_hbaA_flush_cache(acb); - } break; - - case ACB_ADAPTER_TYPE_B: { + case ACB_ADAPTER_TYPE_B: arcmsr_hbaB_flush_cache(acb); - } break; - case ACB_ADAPTER_TYPE_C: { + case ACB_ADAPTER_TYPE_C: arcmsr_hbaC_flush_cache(acb); - } break; case ACB_ADAPTER_TYPE_D: arcmsr_hbaD_flush_cache(acb); @@ -1213,21 +1205,15 @@ static uint8_t arcmsr_abort_allcmd(struct AdapterControlBlock *acb) { uint8_t rtnval = 0; switch (acb->adapter_type) { - case ACB_ADAPTER_TYPE_A: { + case ACB_ADAPTER_TYPE_A: rtnval = arcmsr_hbaA_abort_allcmd(acb); - } break; - - case ACB_ADAPTER_TYPE_B: { + case ACB_ADAPTER_TYPE_B: rtnval = arcmsr_hbaB_abort_allcmd(acb); - } break; - - case ACB_ADAPTER_TYPE_C: { + case ACB_ADAPTER_TYPE_C: rtnval = arcmsr_hbaC_abort_allcmd(acb); - } break; - case ACB_ADAPTER_TYPE_D: rtnval = arcmsr_hbaD_abort_allcmd(acb); break; @@ -1916,18 +1902,14 @@ static void arcmsr_hbaE_stop_bgrb(struct AdapterControlBlock *pACB) static void arcmsr_stop_adapter_bgrb(struct AdapterControlBlock *acb) { switch (acb->adapter_type) { - case ACB_ADAPTER_TYPE_A: { + case ACB_ADAPTER_TYPE_A: arcmsr_hbaA_stop_bgrb(acb); - } break; - - case ACB_ADAPTER_TYPE_B: { + case ACB_ADAPTER_TYPE_B: arcmsr_hbaB_stop_bgrb(acb); - } break; - case ACB_ADAPTER_TYPE_C: { + case ACB_ADAPTER_TYPE_C: arcmsr_hbaC_stop_bgrb(acb); - } break; case ACB_ADAPTER_TYPE_D: arcmsr_hbaD_stop_bgrb(acb); @@ -1951,7 +1933,6 @@ static void arcmsr_iop_message_read(struct AdapterControlBlock *acb) 
writel(ARCMSR_INBOUND_DRIVER_DATA_READ_OK, ®->inbound_doorbell); } break; - case ACB_ADAPTER_TYPE_B: { struct MessageUnit_B *reg = acb->pmuB; writel(ARCMSR_DRV2IOP_DATA_READ_OK, reg->drv2iop_doorbell); @@ -2034,7 +2015,6 @@ struct QBUFFER __iomem *arcmsr_get_iop_rqbuffer(struct AdapterControlBlock *acb) qbuffer = (struct QBUFFER __iomem *)®->message_rbuffer; } break; - case ACB_ADAPTER_TYPE_B: { struct MessageUnit_B *reg = acb->pmuB; qbuffer = (struct QBUFFER __iomem *)reg->message_rbuffer; @@ -2069,7 +2049,6 @@ static struct QBUFFER __iomem *arcmsr_get_iop_wqbuffer(struct AdapterControlBloc pqbuffer = (struct QBUFFER __iomem *) ®->message_wbuffer; } break; - case ACB_ADAPTER_TYPE_B: { struct MessageUnit_B *reg = acb->pmuB; pqbuffer = (struct QBUFFER __iomem *)reg->message_wbuffer; @@ -2699,10 +2678,8 @@ static irqreturn_t arcmsr_interrupt(struct AdapterControlBlock *acb) switch (acb->adapter_type) { case ACB_ADAPTER_TYPE_A: return arcmsr_hbaA_handle_isr(acb); - break; case ACB_ADAPTER_TYPE_B: return arcmsr_hbaB_handle_isr(acb); - break; case ACB_ADAPTER_TYPE_C: return arcmsr_hbaC_handle_isr(acb); case ACB_ADAPTER_TYPE_D: @@ -3634,18 +3611,14 @@ static int arcmsr_polling_ccbdone(struct AdapterControlBlock *acb, int rtn = 0; switch (acb->adapter_type) { - case ACB_ADAPTER_TYPE_A: { + case ACB_ADAPTER_TYPE_A: rtn = arcmsr_hbaA_polling_ccbdone(acb, poll_ccb); - } break; - - case ACB_ADAPTER_TYPE_B: { + case ACB_ADAPTER_TYPE_B: rtn = arcmsr_hbaB_polling_ccbdone(acb, poll_ccb); - } break; - case ACB_ADAPTER_TYPE_C: { + case ACB_ADAPTER_TYPE_C: rtn = arcmsr_hbaC_polling_ccbdone(acb, poll_ccb); - } break; case ACB_ADAPTER_TYPE_D: rtn = arcmsr_hbaD_polling_ccbdone(acb, poll_ccb); From 893f4a14b115c7c9fca3c78bc42d30548cd76686 Mon Sep 17 00:00:00 2001 From: ching Huang Date: Mon, 28 Sep 2020 18:31:27 +0800 Subject: [PATCH 280/497] scsi: arcmsr: Fix device hot-plug monitoring timer stop Fix device hot-plug monitoring timer stop. Link: https://lore.kernel.org/r/969213d4f124e230c3febc01e2b1db291bf4585c.camel@areca.com.tw Signed-off-by: ching Huang Signed-off-by: Martin K. 
Petersen --- drivers/scsi/arcmsr/arcmsr.h | 2 -- drivers/scsi/arcmsr/arcmsr_hba.c | 27 +++------------------------ 2 files changed, 3 insertions(+), 26 deletions(-) diff --git a/drivers/scsi/arcmsr/arcmsr.h b/drivers/scsi/arcmsr/arcmsr.h index 9220bcf8388f..0ae401dfdc7f 100644 --- a/drivers/scsi/arcmsr/arcmsr.h +++ b/drivers/scsi/arcmsr/arcmsr.h @@ -836,8 +836,6 @@ struct AdapterControlBlock #define FW_NORMAL 0x0000 #define FW_BOG 0x0001 #define FW_DEADLOCK 0x0010 - atomic_t rq_map_token; - atomic_t ante_token_value; uint32_t maxOutstanding; int vector_count; uint32_t maxFreeCCB; diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 5076480b336f..86f84d7f4d3d 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -777,7 +777,6 @@ static void arcmsr_message_isr_bh_fn(struct work_struct *work) struct scsi_device *psdev; char diff, temp; - acb->acb_flags &= ~ACB_F_MSG_GET_CONFIG; switch (acb->adapter_type) { case ACB_ADAPTER_TYPE_A: { struct MessageUnit_A __iomem *reg = acb->pmuA; @@ -815,7 +814,6 @@ static void arcmsr_message_isr_bh_fn(struct work_struct *work) break; } } - atomic_inc(&acb->rq_map_token); if (readl(signature) != ARCMSR_SIGNATURE_GET_CONFIG) return; for (target = 0; target < ARCMSR_MAX_TARGETID - 1; @@ -846,6 +844,7 @@ static void arcmsr_message_isr_bh_fn(struct work_struct *work) devicemap++; acb_dev_map++; } + acb->acb_flags &= ~ACB_F_MSG_GET_CONFIG; } static int @@ -898,8 +897,6 @@ out_free_irq: static void arcmsr_init_get_devmap_timer(struct AdapterControlBlock *pacb) { INIT_WORK(&pacb->arcmsr_do_message_isr_bh, arcmsr_message_isr_bh_fn); - atomic_set(&pacb->rq_map_token, 16); - atomic_set(&pacb->ante_token_value, 16); pacb->fw_flag = FW_NORMAL; timer_setup(&pacb->eternal_timer, arcmsr_request_device_map, 0); pacb->eternal_timer.expires = jiffies + msecs_to_jiffies(6 * HZ); @@ -3925,24 +3922,10 @@ static void arcmsr_wait_firmware_ready(struct AdapterControlBlock *acb) static void arcmsr_request_device_map(struct timer_list *t) { struct AdapterControlBlock *acb = from_timer(acb, t, eternal_timer); - if (unlikely(atomic_read(&acb->rq_map_token) == 0) || - (acb->acb_flags & ACB_F_BUS_RESET) || - (acb->acb_flags & ACB_F_ABORT)) { - mod_timer(&acb->eternal_timer, - jiffies + msecs_to_jiffies(6 * HZ)); + if (acb->acb_flags & (ACB_F_MSG_GET_CONFIG | ACB_F_BUS_RESET | ACB_F_ABORT)) { + mod_timer(&acb->eternal_timer, jiffies + msecs_to_jiffies(6 * HZ)); } else { acb->fw_flag = FW_NORMAL; - if (atomic_read(&acb->ante_token_value) == - atomic_read(&acb->rq_map_token)) { - atomic_set(&acb->rq_map_token, 16); - } - atomic_set(&acb->ante_token_value, - atomic_read(&acb->rq_map_token)); - if (atomic_dec_and_test(&acb->rq_map_token)) { - mod_timer(&acb->eternal_timer, jiffies + - msecs_to_jiffies(6 * HZ)); - return; - } switch (acb->adapter_type) { case ACB_ADAPTER_TYPE_A: { struct MessageUnit_A __iomem *reg = acb->pmuA; @@ -4362,8 +4345,6 @@ wait_reset_done: goto wait_reset_done; } arcmsr_iop_init(acb); - atomic_set(&acb->rq_map_token, 16); - atomic_set(&acb->ante_token_value, 16); acb->fw_flag = FW_NORMAL; mod_timer(&acb->eternal_timer, jiffies + msecs_to_jiffies(6 * HZ)); @@ -4372,8 +4353,6 @@ wait_reset_done: pr_notice("arcmsr: scsi bus reset eh returns with success\n"); } else { acb->acb_flags &= ~ACB_F_BUS_RESET; - atomic_set(&acb->rq_map_token, 16); - atomic_set(&acb->ante_token_value, 16); acb->fw_flag = FW_NORMAL; mod_timer(&acb->eternal_timer, jiffies + msecs_to_jiffies(6 * HZ)); From 
ae897ae28f9a1da2e04779a16f0a1112804a58fb Mon Sep 17 00:00:00 2001 From: ching Huang Date: Mon, 28 Sep 2020 18:35:05 +0800 Subject: [PATCH 281/497] scsi: arcmsr: Add support for ARC-1886 series RAID controllers Add support for ARC-1886 series RAID controllers. [mkp: apply zeroday build warning fixes] Link: https://lore.kernel.org/r/78ae03d0ac05054c721cc3a94f41f9e656a5e176.camel@areca.com.tw Signed-off-by: ching Huang Signed-off-by: Martin K. Petersen --- drivers/scsi/arcmsr/arcmsr.h | 98 +++++++++++ drivers/scsi/arcmsr/arcmsr_hba.c | 287 +++++++++++++++++++++++++++++-- 2 files changed, 367 insertions(+), 18 deletions(-) diff --git a/drivers/scsi/arcmsr/arcmsr.h b/drivers/scsi/arcmsr/arcmsr.h index 0ae401dfdc7f..5e32f1721d1d 100644 --- a/drivers/scsi/arcmsr/arcmsr.h +++ b/drivers/scsi/arcmsr/arcmsr.h @@ -80,6 +80,7 @@ struct device_attribute; #ifndef PCI_DEVICE_ID_ARECA_1884 #define PCI_DEVICE_ID_ARECA_1884 0x1884 #endif +#define PCI_DEVICE_ID_ARECA_1886 0x188A #define ARCMSR_HOURS (1000 * 60 * 60 * 4) #define ARCMSR_MINUTES (1000 * 60 * 60) /* @@ -436,6 +437,21 @@ struct FIRMWARE_INFO #define ARCMSR_HBEMU_DOORBELL_SYNC 0x100 #define ARCMSR_ARC188X_RESET_ADAPTER 0x00000004 #define ARCMSR_ARC1884_DiagWrite_ENABLE 0x00000080 + +/* +******************************************************************************* +** SPEC. for Areca Type F adapter +******************************************************************************* +*/ +#define ARCMSR_SIGNATURE_1886 0x188617D3 +// Doorbell and interrupt definition are same as Type E adapter +/* ARC-1886 doorbell sync */ +#define ARCMSR_HBFMU_DOORBELL_SYNC 0x100 +//set host rw buffer physical address at inbound message 0, 1 (low,high) +#define ARCMSR_HBFMU_DOORBELL_SYNC1 0x300 +#define ARCMSR_HBFMU_MESSAGE_FIRMWARE_OK 0x80000000 +#define ARCMSR_HBFMU_MESSAGE_NO_VOLUME_CHANGE 0x20000000 + /* ******************************************************************************* ** ARECA SCSI COMMAND DESCRIPTOR BLOCK size 0x1F8 (504) @@ -720,6 +736,80 @@ struct MessageUnit_E{ uint32_t msgcode_rwbuffer[256]; /*2200 23FF*/ }; +/* +********************************************************************* +** Messaging Unit (MU) of Type F processor(LSI) +********************************************************************* +*/ +struct MessageUnit_F { + uint32_t iobound_doorbell; /*0000 0003*/ + uint32_t write_sequence_3xxx; /*0004 0007*/ + uint32_t host_diagnostic_3xxx; /*0008 000B*/ + uint32_t posted_outbound_doorbell; /*000C 000F*/ + uint32_t master_error_attribute; /*0010 0013*/ + uint32_t master_error_address_low; /*0014 0017*/ + uint32_t master_error_address_high; /*0018 001B*/ + uint32_t hcb_size; /*001C 001F*/ + uint32_t inbound_doorbell; /*0020 0023*/ + uint32_t diagnostic_rw_data; /*0024 0027*/ + uint32_t diagnostic_rw_address_low; /*0028 002B*/ + uint32_t diagnostic_rw_address_high; /*002C 002F*/ + uint32_t host_int_status; /*0030 0033*/ + uint32_t host_int_mask; /*0034 0037*/ + uint32_t dcr_data; /*0038 003B*/ + uint32_t dcr_address; /*003C 003F*/ + uint32_t inbound_queueport; /*0040 0043*/ + uint32_t outbound_queueport; /*0044 0047*/ + uint32_t hcb_pci_address_low; /*0048 004B*/ + uint32_t hcb_pci_address_high; /*004C 004F*/ + uint32_t iop_int_status; /*0050 0053*/ + uint32_t iop_int_mask; /*0054 0057*/ + uint32_t iop_inbound_queue_port; /*0058 005B*/ + uint32_t iop_outbound_queue_port; /*005C 005F*/ + uint32_t inbound_free_list_index; /*0060 0063*/ + uint32_t inbound_post_list_index; /*0064 0067*/ + uint32_t reply_post_producer_index; /*0068 006B*/ + 
uint32_t reply_post_consumer_index; /*006C 006F*/ + uint32_t inbound_doorbell_clear; /*0070 0073*/ + uint32_t i2o_message_unit_control; /*0074 0077*/ + uint32_t last_used_message_source_address_low; /*0078 007B*/ + uint32_t last_used_message_source_address_high; /*007C 007F*/ + uint32_t pull_mode_data_byte_count[4]; /*0080 008F*/ + uint32_t message_dest_address_index; /*0090 0093*/ + uint32_t done_queue_not_empty_int_counter_timer; /*0094 0097*/ + uint32_t utility_A_int_counter_timer; /*0098 009B*/ + uint32_t outbound_doorbell; /*009C 009F*/ + uint32_t outbound_doorbell_clear; /*00A0 00A3*/ + uint32_t message_source_address_index; /*00A4 00A7*/ + uint32_t message_done_queue_index; /*00A8 00AB*/ + uint32_t reserved0; /*00AC 00AF*/ + uint32_t inbound_msgaddr0; /*00B0 00B3*/ + uint32_t inbound_msgaddr1; /*00B4 00B7*/ + uint32_t outbound_msgaddr0; /*00B8 00BB*/ + uint32_t outbound_msgaddr1; /*00BC 00BF*/ + uint32_t inbound_queueport_low; /*00C0 00C3*/ + uint32_t inbound_queueport_high; /*00C4 00C7*/ + uint32_t outbound_queueport_low; /*00C8 00CB*/ + uint32_t outbound_queueport_high; /*00CC 00CF*/ + uint32_t iop_inbound_queue_port_low; /*00D0 00D3*/ + uint32_t iop_inbound_queue_port_high; /*00D4 00D7*/ + uint32_t iop_outbound_queue_port_low; /*00D8 00DB*/ + uint32_t iop_outbound_queue_port_high; /*00DC 00DF*/ + uint32_t message_dest_queue_port_low; /*00E0 00E3*/ + uint32_t message_dest_queue_port_high; /*00E4 00E7*/ + uint32_t last_used_message_dest_address_low; /*00E8 00EB*/ + uint32_t last_used_message_dest_address_high; /*00EC 00EF*/ + uint32_t message_done_queue_base_address_low; /*00F0 00F3*/ + uint32_t message_done_queue_base_address_high; /*00F4 00F7*/ + uint32_t host_diagnostic; /*00F8 00FB*/ + uint32_t write_sequence; /*00FC 00FF*/ + uint32_t reserved1[46]; /*0100 01B7*/ + uint32_t reply_post_producer_index1; /*01B8 01BB*/ + uint32_t reply_post_consumer_index1; /*01BC 01BF*/ +}; + +#define MESG_RW_BUFFER_SIZE (256 * 3) + typedef struct deliver_completeQ { uint16_t cmdFlag; uint16_t cmdSMID; @@ -739,6 +829,7 @@ struct AdapterControlBlock #define ACB_ADAPTER_TYPE_C 0x00000002 /* hbc L IOP */ #define ACB_ADAPTER_TYPE_D 0x00000003 /* hbd M IOP */ #define ACB_ADAPTER_TYPE_E 0x00000004 /* hba L IOP */ +#define ACB_ADAPTER_TYPE_F 0x00000005 /* hba L IOP */ u32 ioqueue_size; struct pci_dev * pdev; struct Scsi_Host * host; @@ -760,10 +851,16 @@ struct AdapterControlBlock struct MessageUnit_C __iomem *pmuC; struct MessageUnit_D *pmuD; struct MessageUnit_E __iomem *pmuE; + struct MessageUnit_F __iomem *pmuF; }; /* message unit ATU inbound base address0 */ void __iomem *mem_base0; void __iomem *mem_base1; + //0x000 - COMPORT_IN (Host sent to ROC) + uint32_t *message_wbuffer; + //0x100 - COMPORT_OUT (ROC sent to Host) + uint32_t *message_rbuffer; + uint32_t *msgcode_rwbuffer; //0x200 - BIOS_AREA uint32_t acb_flags; u16 dev_id; uint8_t adapter_index; @@ -846,6 +943,7 @@ struct AdapterControlBlock uint32_t out_doorbell; uint32_t completionQ_entry; pCompletion_Q pCompletionQ; + uint32_t completeQ_size; };/* HW_DEVICE_EXTENSION */ /* ******************************************************************************* diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 86f84d7f4d3d..1e358d993c8b 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -133,6 +133,7 @@ static void arcmsr_hbaC_message_isr(struct AdapterControlBlock *pACB); static void arcmsr_hbaD_message_isr(struct AdapterControlBlock *acb); static void 
arcmsr_hbaE_message_isr(struct AdapterControlBlock *acb); static void arcmsr_hbaE_postqueue_isr(struct AdapterControlBlock *acb); +static void arcmsr_hbaF_postqueue_isr(struct AdapterControlBlock *acb); static void arcmsr_hardware_reset(struct AdapterControlBlock *acb); static const char *arcmsr_info(struct Scsi_Host *); static irqreturn_t arcmsr_interrupt(struct AdapterControlBlock *acb); @@ -209,6 +210,8 @@ static struct pci_device_id arcmsr_device_id_table[] = { .driver_data = ACB_ADAPTER_TYPE_C}, {PCI_DEVICE(PCI_VENDOR_ID_ARECA, PCI_DEVICE_ID_ARECA_1884), .driver_data = ACB_ADAPTER_TYPE_E}, + {PCI_DEVICE(PCI_VENDOR_ID_ARECA, PCI_DEVICE_ID_ARECA_1886), + .driver_data = ACB_ADAPTER_TYPE_F}, {0, 0}, /* Terminating entry */ }; MODULE_DEVICE_TABLE(pci, arcmsr_device_id_table); @@ -232,12 +235,12 @@ static void arcmsr_free_io_queue(struct AdapterControlBlock *acb) switch (acb->adapter_type) { case ACB_ADAPTER_TYPE_B: case ACB_ADAPTER_TYPE_D: - case ACB_ADAPTER_TYPE_E: { + case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: dma_free_coherent(&acb->pdev->dev, acb->ioqueue_size, acb->dma_coherent2, acb->dma_coherent_handle2); break; } - } } static bool arcmsr_remap_pciregion(struct AdapterControlBlock *acb) @@ -310,6 +313,19 @@ static bool arcmsr_remap_pciregion(struct AdapterControlBlock *acb) acb->out_doorbell = 0; break; } + case ACB_ADAPTER_TYPE_F: { + acb->pmuF = ioremap(pci_resource_start(pdev, 0), pci_resource_len(pdev, 0)); + if (!acb->pmuF) { + pr_notice("arcmsr%d: memory mapping region fail\n", + acb->host->host_no); + return false; + } + writel(0, &acb->pmuF->host_int_status); /* clear interrupt */ + writel(ARCMSR_HBFMU_DOORBELL_SYNC, &acb->pmuF->iobound_doorbell); + acb->in_doorbell = 0; + acb->out_doorbell = 0; + break; + } } return true; } @@ -333,6 +349,9 @@ static void arcmsr_unmap_pciregion(struct AdapterControlBlock *acb) case ACB_ADAPTER_TYPE_E: iounmap(acb->pmuE); break; + case ACB_ADAPTER_TYPE_F: + iounmap(acb->pmuF); + break; } } @@ -561,6 +580,7 @@ static void arcmsr_flush_adapter_cache(struct AdapterControlBlock *acb) arcmsr_hbaD_flush_cache(acb); break; case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: arcmsr_hbaE_flush_cache(acb); break; } @@ -618,6 +638,27 @@ static void arcmsr_hbaD_assign_regAddr(struct AdapterControlBlock *acb) reg->msgcode_rwbuffer = MEM_BASE0(ARCMSR_ARC1214_MESSAGE_RWBUFFER); } +static void arcmsr_hbaF_assign_regAddr(struct AdapterControlBlock *acb) +{ + dma_addr_t host_buffer_dma; + struct MessageUnit_F __iomem *pmuF; + + memset(acb->dma_coherent2, 0xff, acb->completeQ_size); + acb->message_wbuffer = (uint32_t *)round_up((unsigned long)acb->dma_coherent2 + + acb->completeQ_size, 4); + acb->message_rbuffer = ((void *)acb->message_wbuffer) + 0x100; + acb->msgcode_rwbuffer = ((void *)acb->message_wbuffer) + 0x200; + memset((void *)acb->message_wbuffer, 0, MESG_RW_BUFFER_SIZE); + host_buffer_dma = round_up(acb->dma_coherent_handle2 + acb->completeQ_size, 4); + pmuF = acb->pmuF; + /* host buffer low address, bit0:1 all buffer active */ + writel(lower_32_bits(host_buffer_dma | 1), &pmuF->inbound_msgaddr0); + /* host buffer high address */ + writel(upper_32_bits(host_buffer_dma), &pmuF->inbound_msgaddr1); + /* set host buffer physical address */ + writel(ARCMSR_HBFMU_DOORBELL_SYNC1, &pmuF->iobound_doorbell); +} + static bool arcmsr_alloc_io_queue(struct AdapterControlBlock *acb) { bool rtn = true; @@ -671,6 +712,28 @@ static bool arcmsr_alloc_io_queue(struct AdapterControlBlock *acb) acb->doneq_index = 0; } break; + case ACB_ADAPTER_TYPE_F: { + 
uint32_t QueueDepth; + uint32_t depthTbl[] = {256, 512, 1024, 128, 64, 32}; + + arcmsr_wait_firmware_ready(acb); + QueueDepth = depthTbl[readl(&acb->pmuF->outbound_msgaddr1) & 7]; + acb->completeQ_size = sizeof(struct deliver_completeQ) * QueueDepth + 128; + acb->ioqueue_size = roundup(acb->completeQ_size + MESG_RW_BUFFER_SIZE, 32); + dma_coherent = dma_alloc_coherent(&pdev->dev, acb->ioqueue_size, + &dma_coherent_handle, GFP_KERNEL); + if (!dma_coherent) { + pr_notice("arcmsr%d: DMA allocation failed\n", acb->host->host_no); + return false; + } + acb->dma_coherent_handle2 = dma_coherent_handle; + acb->dma_coherent2 = dma_coherent; + acb->pCompletionQ = dma_coherent; + acb->completionQ_entry = acb->completeQ_size / sizeof(struct deliver_completeQ); + acb->doneq_index = 0; + arcmsr_hbaF_assign_regAddr(acb); + } + break; default: break; } @@ -705,7 +768,8 @@ static int arcmsr_alloc_ccb_pool(struct AdapterControlBlock *acb) acb->host->sg_tablesize = max_sg_entrys; roundup_ccbsize = roundup(sizeof(struct CommandControlBlock) + (max_sg_entrys - 1) * sizeof(struct SG64ENTRY), 32); acb->uncache_size = roundup_ccbsize * acb->maxFreeCCB; - acb->uncache_size += acb->ioqueue_size; + if (acb->adapter_type != ACB_ADAPTER_TYPE_F) + acb->uncache_size += acb->ioqueue_size; dma_coherent = dma_alloc_coherent(&pdev->dev, acb->uncache_size, &dma_coherent_handle, GFP_KERNEL); if(!dma_coherent){ printk(KERN_NOTICE "arcmsr%d: dma_alloc_coherent got error\n", acb->host->host_no); @@ -728,6 +792,7 @@ static int arcmsr_alloc_ccb_pool(struct AdapterControlBlock *acb) case ACB_ADAPTER_TYPE_C: case ACB_ADAPTER_TYPE_D: case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: ccb_tmp->cdb_phyaddr = cdb_phyaddr; break; } @@ -746,8 +811,10 @@ static int arcmsr_alloc_ccb_pool(struct AdapterControlBlock *acb) ccb_tmp = (struct CommandControlBlock *)((unsigned long)ccb_tmp + roundup_ccbsize); dma_coherent_handle = next_ccb_phy; } - acb->dma_coherent_handle2 = dma_coherent_handle; - acb->dma_coherent2 = ccb_tmp; + if (acb->adapter_type != ACB_ADAPTER_TYPE_F) { + acb->dma_coherent_handle2 = dma_coherent_handle; + acb->dma_coherent2 = ccb_tmp; + } switch (acb->adapter_type) { case ACB_ADAPTER_TYPE_B: acb->pmuB = (struct MessageUnit_B *)acb->dma_coherent2; @@ -813,6 +880,11 @@ static void arcmsr_message_isr_bh_fn(struct work_struct *work) devicemap = (char __iomem *)(®->msgcode_rwbuffer[21]); break; } + case ACB_ADAPTER_TYPE_F: { + signature = (uint32_t __iomem *)(&acb->msgcode_rwbuffer[0]); + devicemap = (char __iomem *)(&acb->msgcode_rwbuffer[21]); + break; + } } if (readl(signature) != ARCMSR_SIGNATURE_GET_CONFIG) return; @@ -998,7 +1070,8 @@ static int arcmsr_probe(struct pci_dev *pdev, const struct pci_device_id *id) if(!error){ goto free_hbb_mu; } - arcmsr_free_io_queue(acb); + if (acb->adapter_type != ACB_ADAPTER_TYPE_F) + arcmsr_free_io_queue(acb); error = arcmsr_alloc_ccb_pool(acb); if(error){ goto unmap_pci_region; @@ -1111,6 +1184,14 @@ static int arcmsr_resume(struct pci_dev *pdev) acb->out_doorbell = 0; acb->doneq_index = 0; break; + case ACB_ADAPTER_TYPE_F: + writel(0, &acb->pmuF->host_int_status); + writel(ARCMSR_HBFMU_DOORBELL_SYNC, &acb->pmuF->iobound_doorbell); + acb->in_doorbell = 0; + acb->out_doorbell = 0; + acb->doneq_index = 0; + arcmsr_hbaF_assign_regAddr(acb); + break; } arcmsr_iop_init(acb); arcmsr_init_get_devmap_timer(acb); @@ -1123,6 +1204,8 @@ controller_stop: controller_unregister: scsi_remove_host(host); arcmsr_free_ccb_pool(acb); + if (acb->adapter_type == ACB_ADAPTER_TYPE_F) + 
arcmsr_free_io_queue(acb); arcmsr_unmap_pciregion(acb); pci_release_regions(pdev); scsi_host_put(host); @@ -1215,6 +1298,7 @@ static uint8_t arcmsr_abort_allcmd(struct AdapterControlBlock *acb) rtnval = arcmsr_hbaD_abort_allcmd(acb); break; case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: rtnval = arcmsr_hbaE_abort_allcmd(acb); break; } @@ -1290,7 +1374,8 @@ static u32 arcmsr_disable_outbound_ints(struct AdapterControlBlock *acb) writel(ARCMSR_ARC1214_ALL_INT_DISABLE, reg->pcief0_int_enable); } break; - case ACB_ADAPTER_TYPE_E: { + case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: { struct MessageUnit_E __iomem *reg = acb->pmuE; orig_mask = readl(®->host_int_mask); writel(orig_mask | ARCMSR_HBEMU_OUTBOUND_DOORBELL_ISR | ARCMSR_HBEMU_OUTBOUND_POSTQUEUE_ISR, ®->host_int_mask); @@ -1497,6 +1582,9 @@ static void arcmsr_done4abort_postqueue(struct AdapterControlBlock *acb) case ACB_ADAPTER_TYPE_E: arcmsr_hbaE_postqueue_isr(acb); break; + case ACB_ADAPTER_TYPE_F: + arcmsr_hbaF_postqueue_isr(acb); + break; } } @@ -1551,6 +1639,8 @@ static void arcmsr_free_pcidev(struct AdapterControlBlock *acb) pdev = acb->pdev; arcmsr_free_irq(pdev, acb); arcmsr_free_ccb_pool(acb); + if (acb->adapter_type == ACB_ADAPTER_TYPE_F) + arcmsr_free_io_queue(acb); arcmsr_unmap_pciregion(acb); pci_release_regions(pdev); scsi_host_put(host); @@ -1608,6 +1698,8 @@ static void arcmsr_remove(struct pci_dev *pdev) } arcmsr_free_irq(pdev, acb); arcmsr_free_ccb_pool(acb); + if (acb->adapter_type == ACB_ADAPTER_TYPE_F) + arcmsr_free_io_queue(acb); arcmsr_unmap_pciregion(acb); pci_release_regions(pdev); scsi_host_put(host); @@ -1685,7 +1777,8 @@ static void arcmsr_enable_outbound_ints(struct AdapterControlBlock *acb, writel(intmask_org | mask, reg->pcief0_int_enable); break; } - case ACB_ADAPTER_TYPE_E: { + case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: { struct MessageUnit_E __iomem *reg = acb->pmuE; mask = ~(ARCMSR_HBEMU_OUTBOUND_DOORBELL_ISR | ARCMSR_HBEMU_OUTBOUND_POSTQUEUE_ISR); @@ -1829,6 +1922,19 @@ static void arcmsr_post_ccb(struct AdapterControlBlock *acb, struct CommandContr writel(ccb_post_stamp, &pmu->inbound_queueport_low); break; } + case ACB_ADAPTER_TYPE_F: { + struct MessageUnit_F __iomem *pmu = acb->pmuF; + u32 ccb_post_stamp, arc_cdb_size; + + if (ccb->arc_cdb_size <= 0x300) + arc_cdb_size = (ccb->arc_cdb_size - 1) >> 6 | 1; + else + arc_cdb_size = (((ccb->arc_cdb_size + 0xff) >> 8) + 2) << 1 | 1; + ccb_post_stamp = (ccb->smid | arc_cdb_size); + writel(0, &pmu->inbound_queueport_high); + writel(ccb_post_stamp, &pmu->inbound_queueport_low); + break; + } } } @@ -1912,6 +2018,7 @@ static void arcmsr_stop_adapter_bgrb(struct AdapterControlBlock *acb) arcmsr_hbaD_stop_bgrb(acb); break; case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: arcmsr_hbaE_stop_bgrb(acb); break; } @@ -1947,7 +2054,8 @@ static void arcmsr_iop_message_read(struct AdapterControlBlock *acb) reg->inbound_doorbell); } break; - case ACB_ADAPTER_TYPE_E: { + case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: { struct MessageUnit_E __iomem *reg = acb->pmuE; acb->out_doorbell ^= ARCMSR_HBEMU_DRV2IOP_DATA_READ_OK; writel(acb->out_doorbell, ®->iobound_doorbell); @@ -1993,7 +2101,8 @@ static void arcmsr_iop_message_wrote(struct AdapterControlBlock *acb) reg->inbound_doorbell); } break; - case ACB_ADAPTER_TYPE_E: { + case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: { struct MessageUnit_E __iomem *reg = acb->pmuE; acb->out_doorbell ^= ARCMSR_HBEMU_DRV2IOP_DATA_WRITE_OK; writel(acb->out_doorbell, ®->iobound_doorbell); @@ -2032,6 +2141,10 @@ 
struct QBUFFER __iomem *arcmsr_get_iop_rqbuffer(struct AdapterControlBlock *acb) qbuffer = (struct QBUFFER __iomem *)®->message_rbuffer; } break; + case ACB_ADAPTER_TYPE_F: { + qbuffer = (struct QBUFFER __iomem *)acb->message_rbuffer; + } + break; } return qbuffer; } @@ -2066,6 +2179,9 @@ static struct QBUFFER __iomem *arcmsr_get_iop_wqbuffer(struct AdapterControlBloc pqbuffer = (struct QBUFFER __iomem *)®->message_wbuffer; } break; + case ACB_ADAPTER_TYPE_F: + pqbuffer = (struct QBUFFER __iomem *)acb->message_wbuffer; + break; } return pqbuffer; } @@ -2480,6 +2596,36 @@ static void arcmsr_hbaE_postqueue_isr(struct AdapterControlBlock *acb) spin_unlock_irqrestore(&acb->doneq_lock, flags); } +static void arcmsr_hbaF_postqueue_isr(struct AdapterControlBlock *acb) +{ + uint32_t doneq_index; + uint16_t cmdSMID; + int error; + struct MessageUnit_F __iomem *phbcmu; + struct CommandControlBlock *ccb; + unsigned long flags; + + spin_lock_irqsave(&acb->doneq_lock, flags); + doneq_index = acb->doneq_index; + phbcmu = acb->pmuF; + while (1) { + cmdSMID = acb->pCompletionQ[doneq_index].cmdSMID; + if (cmdSMID == 0xffff) + break; + ccb = acb->pccb_pool[cmdSMID]; + error = (acb->pCompletionQ[doneq_index].cmdFlag & + ARCMSR_CCBREPLY_FLAG_ERROR_MODE1) ? true : false; + arcmsr_drain_donequeue(acb, ccb, error); + acb->pCompletionQ[doneq_index].cmdSMID = 0xffff; + doneq_index++; + if (doneq_index >= acb->completionQ_entry) + doneq_index = 0; + } + acb->doneq_index = doneq_index; + writel(doneq_index, &phbcmu->reply_post_consumer_index); + spin_unlock_irqrestore(&acb->doneq_lock, flags); +} + /* ********************************************************************************** ** Handle a message interrupt @@ -2670,6 +2816,31 @@ static irqreturn_t arcmsr_hbaE_handle_isr(struct AdapterControlBlock *pACB) return IRQ_HANDLED; } +static irqreturn_t arcmsr_hbaF_handle_isr(struct AdapterControlBlock *pACB) +{ + uint32_t host_interrupt_status; + struct MessageUnit_F __iomem *phbcmu = pACB->pmuF; + + host_interrupt_status = readl(&phbcmu->host_int_status) & + (ARCMSR_HBEMU_OUTBOUND_POSTQUEUE_ISR | + ARCMSR_HBEMU_OUTBOUND_DOORBELL_ISR); + if (!host_interrupt_status) + return IRQ_NONE; + do { + /* MU post queue interrupts*/ + if (host_interrupt_status & ARCMSR_HBEMU_OUTBOUND_POSTQUEUE_ISR) + arcmsr_hbaF_postqueue_isr(pACB); + + /* MU ioctl transfer doorbell interrupts*/ + if (host_interrupt_status & ARCMSR_HBEMU_OUTBOUND_DOORBELL_ISR) + arcmsr_hbaE_doorbell_isr(pACB); + + host_interrupt_status = readl(&phbcmu->host_int_status); + } while (host_interrupt_status & (ARCMSR_HBEMU_OUTBOUND_POSTQUEUE_ISR | + ARCMSR_HBEMU_OUTBOUND_DOORBELL_ISR)); + return IRQ_HANDLED; +} + static irqreturn_t arcmsr_interrupt(struct AdapterControlBlock *acb) { switch (acb->adapter_type) { @@ -2683,6 +2854,8 @@ static irqreturn_t arcmsr_interrupt(struct AdapterControlBlock *acb) return arcmsr_hbaD_handle_isr(acb); case ACB_ADAPTER_TYPE_E: return arcmsr_hbaE_handle_isr(acb); + case ACB_ADAPTER_TYPE_F: + return arcmsr_hbaF_handle_isr(acb); default: return IRQ_NONE; } @@ -3231,6 +3404,31 @@ static bool arcmsr_hbaE_get_config(struct AdapterControlBlock *pACB) return true; } +static bool arcmsr_hbaF_get_config(struct AdapterControlBlock *pACB) +{ + struct MessageUnit_F __iomem *reg = pACB->pmuF; + uint32_t intmask_org; + + /* disable all outbound interrupt */ + intmask_org = readl(®->host_int_mask); /* disable outbound message0 int */ + writel(intmask_org | ARCMSR_HBEMU_ALL_INTMASKENABLE, ®->host_int_mask); + /* wait firmware ready */ + 
arcmsr_wait_firmware_ready(pACB); + /* post "get config" instruction */ + writel(ARCMSR_INBOUND_MESG0_GET_CONFIG, ®->inbound_msgaddr0); + + pACB->out_doorbell ^= ARCMSR_HBEMU_DRV2IOP_MESSAGE_CMD_DONE; + writel(pACB->out_doorbell, ®->iobound_doorbell); + /* wait message ready */ + if (!arcmsr_hbaE_wait_msgint_ready(pACB)) { + pr_notice("arcmsr%d: wait get adapter firmware miscellaneous data timeout\n", + pACB->host->host_no); + return false; + } + arcmsr_get_adapter_config(pACB, pACB->msgcode_rwbuffer); + return true; +} + static bool arcmsr_get_firmware_spec(struct AdapterControlBlock *acb) { bool rtn = false; @@ -3251,6 +3449,9 @@ static bool arcmsr_get_firmware_spec(struct AdapterControlBlock *acb) case ACB_ADAPTER_TYPE_E: rtn = arcmsr_hbaE_get_config(acb); break; + case ACB_ADAPTER_TYPE_F: + rtn = arcmsr_hbaF_get_config(acb); + break; default: break; } @@ -3621,6 +3822,7 @@ static int arcmsr_polling_ccbdone(struct AdapterControlBlock *acb, rtn = arcmsr_hbaD_polling_ccbdone(acb, poll_ccb); break; case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: rtn = arcmsr_hbaE_polling_ccbdone(acb, poll_ccb); break; } @@ -3701,6 +3903,16 @@ static void arcmsr_set_iop_datetime(struct timer_list *t) writel(pacb->out_doorbell, ®->iobound_doorbell); break; } + case ACB_ADAPTER_TYPE_F: { + struct MessageUnit_F __iomem *reg = pacb->pmuF; + + pacb->msgcode_rwbuffer[0] = datetime.b.msg_time[0]; + pacb->msgcode_rwbuffer[1] = datetime.b.msg_time[1]; + writel(ARCMSR_INBOUND_MESG0_SYNC_TIMER, ®->inbound_msgaddr0); + pacb->out_doorbell ^= ARCMSR_HBEMU_DRV2IOP_MESSAGE_CMD_DONE; + writel(pacb->out_doorbell, ®->iobound_doorbell); + break; + } } if (sys_tz.tz_minuteswest) next_time = ARCMSR_HOURS; @@ -3726,6 +3938,7 @@ static int arcmsr_iop_confirm(struct AdapterControlBlock *acb) dma_coherent_handle = acb->dma_coherent_handle2; break; case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: dma_coherent_handle = acb->dma_coherent_handle + offsetof(struct CommandControlBlock, arcmsr_cdb); break; @@ -3843,11 +4056,8 @@ static int arcmsr_iop_confirm(struct AdapterControlBlock *acb) writel(cdb_phyaddr, ®->msgcode_rwbuffer[2]); writel(cdb_phyaddr_hi32, ®->msgcode_rwbuffer[3]); writel(acb->ccbsize, ®->msgcode_rwbuffer[4]); - dma_coherent_handle = acb->dma_coherent_handle2; - cdb_phyaddr = (uint32_t)(dma_coherent_handle & 0xffffffff); - cdb_phyaddr_hi32 = (uint32_t)((dma_coherent_handle >> 16) >> 16); - writel(cdb_phyaddr, ®->msgcode_rwbuffer[5]); - writel(cdb_phyaddr_hi32, ®->msgcode_rwbuffer[6]); + writel(lower_32_bits(acb->dma_coherent_handle2), ®->msgcode_rwbuffer[5]); + writel(upper_32_bits(acb->dma_coherent_handle2), ®->msgcode_rwbuffer[6]); writel(acb->ioqueue_size, ®->msgcode_rwbuffer[7]); writel(ARCMSR_INBOUND_MESG0_SET_CONFIG, ®->inbound_msgaddr0); acb->out_doorbell ^= ARCMSR_HBEMU_DRV2IOP_MESSAGE_CMD_DONE; @@ -3859,6 +4069,27 @@ static int arcmsr_iop_confirm(struct AdapterControlBlock *acb) } } break; + case ACB_ADAPTER_TYPE_F: { + struct MessageUnit_F __iomem *reg = acb->pmuF; + + acb->msgcode_rwbuffer[0] = ARCMSR_SIGNATURE_SET_CONFIG; + acb->msgcode_rwbuffer[1] = ARCMSR_SIGNATURE_1886; + acb->msgcode_rwbuffer[2] = cdb_phyaddr; + acb->msgcode_rwbuffer[3] = cdb_phyaddr_hi32; + acb->msgcode_rwbuffer[4] = acb->ccbsize; + acb->msgcode_rwbuffer[5] = lower_32_bits(acb->dma_coherent_handle2); + acb->msgcode_rwbuffer[6] = upper_32_bits(acb->dma_coherent_handle2); + acb->msgcode_rwbuffer[7] = acb->completeQ_size; + writel(ARCMSR_INBOUND_MESG0_SET_CONFIG, ®->inbound_msgaddr0); + acb->out_doorbell ^= 
ARCMSR_HBEMU_DRV2IOP_MESSAGE_CMD_DONE; + writel(acb->out_doorbell, &reg->iobound_doorbell); + if (!arcmsr_hbaE_wait_msgint_ready(acb)) { + pr_notice("arcmsr%d: 'set command Q window' timeout\n", + acb->host->host_no); + return 1; + } + } + break; } return 0; } @@ -3907,7 +4138,8 @@ static void arcmsr_wait_firmware_ready(struct AdapterControlBlock *acb) ARCMSR_ARC1214_MESSAGE_FIRMWARE_OK) == 0); } break; - case ACB_ADAPTER_TYPE_E: { + case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: { struct MessageUnit_E __iomem *reg = acb->pmuE; do { if (!(acb->acb_flags & ACB_F_IOP_INITED)) @@ -3955,10 +4187,23 @@ static void arcmsr_request_device_map(struct timer_list *t) writel(acb->out_doorbell, &reg->iobound_doorbell); break; } + case ACB_ADAPTER_TYPE_F: { + struct MessageUnit_F __iomem *reg = acb->pmuF; + uint32_t outMsg1 = readl(&reg->outbound_msgaddr1); + + if (!(outMsg1 & ARCMSR_HBFMU_MESSAGE_FIRMWARE_OK) || + (outMsg1 & ARCMSR_HBFMU_MESSAGE_NO_VOLUME_CHANGE)) + goto nxt6s; + writel(ARCMSR_INBOUND_MESG0_GET_CONFIG, &reg->inbound_msgaddr0); + acb->out_doorbell ^= ARCMSR_HBEMU_DRV2IOP_MESSAGE_CMD_DONE; + writel(acb->out_doorbell, &reg->iobound_doorbell); + break; + } default: return; } acb->acb_flags |= ACB_F_MSG_GET_CONFIG; +nxt6s: mod_timer(&acb->eternal_timer, jiffies + msecs_to_jiffies(6 * HZ)); } } @@ -4040,6 +4285,7 @@ static void arcmsr_start_adapter_bgrb(struct AdapterControlBlock *acb) arcmsr_hbaD_start_bgrb(acb); break; case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: arcmsr_hbaE_start_bgrb(acb); break; } @@ -4119,7 +4365,8 @@ static void arcmsr_clear_doorbell_queue_buffer(struct AdapterControlBlock *acb) } } break; - case ACB_ADAPTER_TYPE_E: { + case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F: { struct MessageUnit_E __iomem *reg = acb->pmuE; uint32_t i, tmp; @@ -4246,7 +4493,8 @@ static bool arcmsr_reset_in_progress(struct AdapterControlBlock *acb) true : false; } break; - case ACB_ADAPTER_TYPE_E:{ + case ACB_ADAPTER_TYPE_E: + case ACB_ADAPTER_TYPE_F:{ struct MessageUnit_E __iomem *reg = acb->pmuE; rtn = (readl(&reg->host_diagnostic_3xxx) & ARCMSR_ARC188X_RESET_ADAPTER) ? true : false; @@ -4445,6 +4693,9 @@ static const char *arcmsr_info(struct Scsi_Host *host) case PCI_DEVICE_ID_ARECA_1884: type = "SAS/SATA"; break; + case PCI_DEVICE_ID_ARECA_1886: + type = "NVMe/SAS/SATA"; + break; default: type = "unknown"; raid6 = 0; From c881fb5cd5ffc407f78878cc43ce13699946844d Mon Sep 17 00:00:00 2001 From: ching Huang Date: Mon, 28 Sep 2020 18:38:09 +0800 Subject: [PATCH 282/497] scsi: arcmsr: Update driver version to v1.50.00.02-20200819 Update driver version to v1.50.00.02-20200819. Link: https://lore.kernel.org/r/b41f9af781bc36f4e5f82fccabc86ebbd0e587f8.camel@areca.com.tw Signed-off-by: ching Huang Signed-off-by: Martin K.
Petersen --- drivers/scsi/arcmsr/arcmsr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/arcmsr/arcmsr.h b/drivers/scsi/arcmsr/arcmsr.h index 5e32f1721d1d..5d054d5c70a5 100644 --- a/drivers/scsi/arcmsr/arcmsr.h +++ b/drivers/scsi/arcmsr/arcmsr.h @@ -49,7 +49,7 @@ struct device_attribute; #define ARCMSR_MAX_OUTSTANDING_CMD 1024 #define ARCMSR_DEFAULT_OUTSTANDING_CMD 128 #define ARCMSR_MIN_OUTSTANDING_CMD 32 -#define ARCMSR_DRIVER_VERSION "v1.40.00.10-20190116" +#define ARCMSR_DRIVER_VERSION "v1.50.00.02-20200819" #define ARCMSR_SCSI_INITIATOR_ID 255 #define ARCMSR_MAX_XFER_SECTORS 512 #define ARCMSR_MAX_XFER_SECTORS_B 4096 From 9120ac54cce6c37de7580d7567119d28c8988ebc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Oct 2020 22:06:10 +0200 Subject: [PATCH 283/497] scsi: sr: Initialize ->cmd_len Ensure the command length is properly set. Previously the command code tried to find this out using the command opcode. Link: https://lore.kernel.org/r/20201008200611.1818099-2-hch@lst.de Fixes: 2ceda20f0a99 ("scsi: core: Move command size detection out of the fast path") Reported-by: Qian Cai Reported-by: Naresh Kamboju Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/sr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index b74dfd8dc116..c20b43918378 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -503,6 +503,7 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) SCpnt->transfersize = cd->device->sector_size; SCpnt->underflow = this_count << 9; SCpnt->allowed = MAX_RETRIES; + SCpnt->cmd_len = 10; /* * This indicates that the command is ready from our end to be queued. From b646e7db1ce85e24bdb3afd16e542a348a53d2ab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Oct 2020 22:06:11 +0200 Subject: [PATCH 284/497] scsi: core: Set sc_data_direction to DMA_NONE for no-transfer commands Not having the special DMA_NONE logic makes libata rather unhappy. Link: https://lore.kernel.org/r/20201008200611.1818099-3-hch@lst.de Fixes: 40b93836a136 ("scsi: core: Use rq_dma_dir in scsi_setup_cmnd()") Reported-by: Qian Cai Reported-by: Naresh Kamboju Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 6b0fccda9af2..1a2e9bab42ef 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1545,7 +1545,10 @@ static blk_status_t scsi_prepare_cmd(struct request *req) cmd->request = req; cmd->tag = req->tag; cmd->prot_op = SCSI_PROT_NORMAL; - cmd->sc_data_direction = rq_dma_dir(req); + if (blk_rq_bytes(req)) + cmd->sc_data_direction = rq_dma_dir(req); + else + cmd->sc_data_direction = DMA_NONE; sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size; cmd->sdb.table.sgl = sg; From 1ef16a407f544408d3559e4de2ed05591df4da75 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 8 Oct 2020 19:32:39 +0100 Subject: [PATCH 285/497] scsi: qla2xxx: Fix return of uninitialized value in rval A previous change removed the initialization of rval and there is now an error where an uninitialized rval is being returned on an error return path. Fix this by returning -ENODEV.
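For readers unfamiliar with this class of bug, the hazard is simply an automatic variable being returned on an early-exit path before anything assigns it. The fragment below is a hypothetical illustration only (the names are invented; it is not the qla2xxx code) -- the actual one-line fix follows in the diff.

/* Hypothetical sketch of the bug class: rval is returned on the
 * early-exit path before it has ever been assigned, so the caller
 * sees whatever happened to be on the stack.
 */
static int example_post_cmd(struct example_ctx *ctx)
{
	int rval;			/* initializer was removed */

	if (!ctx->registered)
		return rval;		/* BUG: uninitialized value */

	rval = example_submit(ctx);
	return rval;
}
/* Fix: return a concrete error code, e.g. -ENODEV, on that path. */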
Link: https://lore.kernel.org/r/20201008183239.200358-1-colin.king@canonical.com Fixes: b994718760fa ("scsi: qla2xxx: Use constant when it is known") Reviewed-by: Himanshu Madhani Acked-by: Pavel Machek (CIP) Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen Addresses-Coverity: ("Uninitialized scalar variable") --- drivers/scsi/qla2xxx/qla_nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index ccec660b13d4..2cd9bd288910 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -562,7 +562,7 @@ static int qla_nvme_post_cmd(struct nvme_fc_local_port *lport, vha = fcport->vha; if (!(fcport->nvme_flag & NVME_FLAG_REGISTERED)) - return rval; + return -ENODEV; if (test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags) || (qpair && !qpair->fw_started) || fcport->deleted) From 57c176074057531b249cf522d90c22313fa74b0b Mon Sep 17 00:00:00 2001 From: Boris Protopopov Date: Thu, 24 Sep 2020 00:36:38 +0000 Subject: [PATCH 286/497] Convert trailing spaces and periods in path components When converting trailing spaces and periods in paths, do so for every component of the path, not just the last component. If the conversion is not done for every path component, then subsequent operations in directories with trailing spaces or periods (e.g. create(), mkdir()) will fail with ENOENT. This is because on the server, the directory will have a special symbol in its name, and the client needs to provide the same. Signed-off-by: Boris Protopopov Acked-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifs_unicode.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 498777d859eb..9bd03a231032 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -488,7 +488,13 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, else if (map_chars == SFM_MAP_UNI_RSVD) { bool end_of_string; - if (i == srclen - 1) + /** + * Remap spaces and periods found at the end of every + * component of the path. The special cases of '.' and + * '..' do not need to be dealt with explicitly because + * they are addressed in namei.c:link_path_walk(). 
+ **/ + if ((i == srclen - 1) || (source[i+1] == '\\')) end_of_string = true; else end_of_string = false; From 119e489681f769e497637da16d5df530213f5720 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 18 Mar 2020 00:51:48 -0500 Subject: [PATCH 287/497] smb3: add defines for new crypto algorithms In encryption capabilities negotiate context can now request AES256 GCM or CCM Signed-off-by: Steve French Acked-by: Ronnie Sahlberg --- fs/cifs/smb2pdu.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index c3f1baf5bde2..7b6db162f516 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -325,6 +325,8 @@ struct smb2_preauth_neg_context { /* Encryption Algorithms Ciphers */ #define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) #define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) +#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003) +#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004) /* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */ #define MIN_ENCRYPT_CTXT_DATA_LEN 4 From 3984bdc049462003c3025b0915fbb90d2a723d78 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 4 Oct 2020 17:42:27 -0500 Subject: [PATCH 288/497] update structure definitions from updated protocol documentation MS-SMB2 was updated recently to include new protocol definitions for updated compression payload header and new RDMA transform capabilities Update structure definitions in smb2pdu.h to match Signed-off-by: Steve French Acked-by: Ronnie Sahlberg --- fs/cifs/smb2pdu.h | 64 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 7 deletions(-) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 7b6db162f516..4dfb51dd7065 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -153,10 +153,14 @@ struct smb2_compression_transform_hdr { } __packed; /* See MS-SMB2 2.2.42.1 */ +#define SMB2_COMPRESSION_FLAG_NONE 0x0000 +#define SMB2_COMPRESSION_FLAG_CHAINDED 0x0001 + struct compression_payload_header { - __le16 AlgorithmId; - __le16 Reserved; - __le32 Length; + __le16 CompressionAlgorithm; + __le16 Flags; + __le32 Length; /* length of compressed playload including field below if present */ + /* __le32 OriginalPayloadSize; */ /* optional */ } __packed; /* See MS-SMB2 2.2.42.2 */ @@ -167,6 +171,26 @@ struct compression_pattern_payload_v1 { __le32 Repetitions; } __packed; +/* See MS-SMB2 2.2.43 */ +struct smb2_rdma_transform { + __le16 RdmaDescriptorOffset; + __le16 RdmaDescriptorLength; + __le32 Channel; /* for values see channel description in smb2 read above */ + __le16 TransformCount; + __le16 Reserved1; + __le32 Reserved2; +} __packed; + +struct smb2_rdma_encryption_transform { + __le16 TransformType; + __le16 SignatureLength; + __le16 NonceLength; + __u16 Reserved; + __u8 Signature[]; /* variable length */ + /* u8 Nonce[] */ + /* followed by padding */ +} __packed; + /* * SMB2 flag definitions */ @@ -297,6 +321,8 @@ struct smb2_negotiate_req { #define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) #define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3) #define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5) +#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6) +#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7) #define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) struct smb2_neg_context { @@ -353,10 +379,10 @@ struct smb2_encryption_neg_context { struct smb2_compression_capabilities_context { __le16 ContextType; /* 3 */ __le16 DataLength; - __u32 Flags; + __u32 Reserved; __le16 CompressionAlgorithmCount; 
__u16 Padding; - __u32 Reserved1; + __u32 Flags; __le16 CompressionAlgorithms[3]; } __packed; @@ -365,12 +391,31 @@ struct smb2_compression_capabilities_context { * Its struct simply contains NetName, an array of Unicode characters */ struct smb2_netname_neg_context { - __le16 ContextType; /* 0x100 */ + __le16 ContextType; /* 5 */ __le16 DataLength; __le32 Reserved; __le16 NetName[]; /* hostname of target converted to UCS-2 */ } __packed; +/* + * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 + * and 2.2.4.1.5 + */ + +/* RDMA Transform IDs */ +#define SMB2_RDMA_TRANSFORM_NONE 0x0000 +#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001 + +struct smb2_rdma_transform_capabilities_context { + __le16 ContextType; /* 7 */ + __le16 DataLength; + __u32 Reserved; + __le16 TransformCount; + __u16 Reserved1; + __u32 Reserved2; + __le16 RDMATransformIds[1]; +} __packed; + #define POSIX_CTXT_DATA_LEN 16 struct smb2_posix_neg_context { __le16 ContextType; /* 0x100 */ @@ -1180,6 +1225,7 @@ struct smb2_flush_rsp { #define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000) #define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */ #define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */ +#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) /* >= SMB3.02, only used on write */ /* SMB2 read request without RFC1001 length at the beginning */ struct smb2_read_plain_req { @@ -1199,6 +1245,10 @@ struct smb2_read_plain_req { __u8 Buffer[1]; } __packed; +/* Read flags */ +#define SMB2_READFLAG_RESPONSE_NONE 0x00000000 +#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM 0x00000001 + struct smb2_read_rsp { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 17 */ @@ -1206,7 +1256,7 @@ struct smb2_read_rsp { __u8 Reserved; __le32 DataLength; __le32 DataRemaining; - __u32 Reserved2; + __u32 Flags; __u8 Buffer[1]; } __packed; From 9e81e8ff74b90af0c6eafec3ba3d7817bea16997 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Mon, 5 Oct 2020 12:37:52 +1000 Subject: [PATCH 289/497] cifs: return cached_fid from open_shroot Cleanup patch for followon to cache additional information for the root directory when directory lease held. 
Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2inode.c | 11 ++++++----- fs/cifs/smb2ops.c | 22 +++++++++++++++------- fs/cifs/smb2proto.h | 3 ++- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index eba01d0908dd..df6212e55e10 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -511,9 +511,9 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_file_all_info *smb2_data; __u32 create_options = 0; - struct cifs_fid fid; bool no_cached_open = tcon->nohandlecache; struct cifsFileInfo *cfile; + struct cached_fid *cfid = NULL; *adjust_tz = false; *symlink = false; @@ -525,7 +525,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* If it is a root and its handle is cached then use it */ if (!strlen(full_path) && !no_cached_open) { - rc = open_shroot(xid, tcon, cifs_sb, &fid); + rc = open_shroot(xid, tcon, cifs_sb, &cfid); if (rc) goto out; @@ -533,12 +533,13 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, move_smb2_info_to_cifs(data, &tcon->crfid.file_all_info); } else { - rc = SMB2_query_info(xid, tcon, fid.persistent_fid, - fid.volatile_fid, smb2_data); + rc = SMB2_query_info(xid, tcon, + cfid->fid->persistent_fid, + cfid->fid->volatile_fid, smb2_data); if (!rc) move_smb2_info_to_cifs(data, smb2_data); } - close_shroot(&tcon->crfid); + close_shroot(cfid); goto out; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d44df8f95bcd..5f3b6e77b2a7 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -651,7 +651,8 @@ smb2_cached_lease_break(struct work_struct *work) * Open the directory at the root of a share */ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, struct cifs_fid *pfid) + struct cifs_sb_info *cifs_sb, + struct cached_fid **cfid) { struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = ses->server; @@ -666,11 +667,12 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, int rc, flags = 0; __le16 utf16_path = 0; /* Null - since an open of top of share */ u8 oplock = SMB2_OPLOCK_LEVEL_II; + struct cifs_fid *pfid; mutex_lock(&tcon->crfid.fid_mutex); if (tcon->crfid.is_valid) { cifs_dbg(FYI, "found a cached root file handle\n"); - memcpy(pfid, tcon->crfid.fid, sizeof(struct cifs_fid)); + *cfid = &tcon->crfid; kref_get(&tcon->crfid.refcount); mutex_unlock(&tcon->crfid.fid_mutex); return 0; @@ -691,6 +693,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, if (!server->ops->new_lease_key) return -EIO; + pfid = tcon->crfid.fid; server->ops->new_lease_key(pfid); memset(rqst, 0, sizeof(rqst)); @@ -820,6 +823,8 @@ oshr_free: SMB2_query_info_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + if (rc == 0) + *cfid = &tcon->crfid; return rc; } @@ -833,6 +838,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; bool no_cached_open = tcon->nohandlecache; + struct cached_fid *cfid = NULL; oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES; @@ -841,12 +847,14 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - if (no_cached_open) + if (no_cached_open) { rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL, NULL); - else - rc = open_shroot(xid, tcon, cifs_sb, &fid); - + } else { + rc = open_shroot(xid, tcon, cifs_sb, &cfid); + 
if (rc == 0) + memcpy(&fid, cfid->fid, sizeof(struct cifs_fid)); + } if (rc) return; @@ -863,7 +871,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, if (no_cached_open) SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); else - close_shroot(&tcon->crfid); + close_shroot(cfid); } static void diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 2f8ecbf54214..67c50d78caa1 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -70,7 +70,8 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, struct cifs_fid *pfid); + struct cifs_sb_info *cifs_sb, + struct cached_fid **cfid); extern void close_shroot(struct cached_fid *cfid); extern void close_shroot_lease(struct cached_fid *cfid); extern void close_shroot_lease_locked(struct cached_fid *cfid); From d1542cf6165e526a6837cc9d1e20cb0da841bd0b Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Mon, 5 Oct 2020 12:37:53 +1000 Subject: [PATCH 290/497] cifs: compute full_path already in cifs_readdir() Cleanup patch for followon to cache additional information for the root directory when directory lease held. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/readdir.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 6df0922e7e30..31a18aae5e64 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -360,11 +360,11 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, */ static int -initiate_cifs_search(const unsigned int xid, struct file *file) +initiate_cifs_search(const unsigned int xid, struct file *file, + char *full_path) { __u16 search_flags; int rc = 0; - char *full_path = NULL; struct cifsFileInfo *cifsFile; struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct tcon_link *tlink = NULL; @@ -400,12 +400,6 @@ initiate_cifs_search(const unsigned int xid, struct file *file) cifsFile->invalidHandle = true; cifsFile->srch_inf.endOfSearch = false; - full_path = build_path_from_dentry(file_dentry(file)); - if (full_path == NULL) { - rc = -ENOMEM; - goto error_exit; - } - cifs_dbg(FYI, "Full path: %s start at: %lld\n", full_path, file->f_pos); ffirst_retry: @@ -444,7 +438,6 @@ ffirst_retry: goto ffirst_retry; } error_exit: - kfree(full_path); cifs_put_tlink(tlink); return rc; } @@ -688,7 +681,8 @@ static int cifs_save_resume_key(const char *current_entry, */ static int find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, - struct file *file, char **current_entry, int *num_to_ret) + struct file *file, char *full_path, + char **current_entry, int *num_to_ret) { __u16 search_flags; int rc = 0; @@ -741,7 +735,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, ntwrk_buf_start); cfile->srch_inf.ntwrk_buf_start = NULL; } - rc = initiate_cifs_search(xid, file); + rc = initiate_cifs_search(xid, file, full_path); if (rc) { cifs_dbg(FYI, "error %d reinitiating a search on rewind\n", rc); @@ -925,15 +919,22 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) char *tmp_buf = NULL; char *end_of_smb; unsigned int max_len; + char *full_path = NULL; xid = get_xid(); + full_path = build_path_from_dentry(file_dentry(file)); + if (full_path == NULL) { + rc = -ENOMEM; + goto rddir2_exit; + } + /* * Ensure FindFirst doesn't fail before doing filldir() for '.' and * '..'. 
Otherwise we won't be able to notify VFS in case of failure. */ if (file->private_data == NULL) { - rc = initiate_cifs_search(xid, file); + rc = initiate_cifs_search(xid, file, full_path); cifs_dbg(FYI, "initiate cifs search rc %d\n", rc); if (rc) goto rddir2_exit; @@ -960,8 +961,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) } */ tcon = tlink_tcon(cifsFile->tlink); - rc = find_cifs_entry(xid, tcon, ctx->pos, file, &current_entry, - &num_to_fill); + rc = find_cifs_entry(xid, tcon, ctx->pos, file, full_path, + &current_entry, &num_to_fill); if (rc) { cifs_dbg(FYI, "fce error %d\n", rc); goto rddir2_exit; @@ -1019,6 +1020,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) kfree(tmp_buf); rddir2_exit: + kfree(full_path); free_xid(xid); return rc; } From 5a61ae1402f15276ee4e003e198aab816958ca69 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 28 Aug 2020 23:44:36 +0200 Subject: [PATCH 291/497] gfs2: Make sure we don't miss any delayed withdraws Commit ca399c96e96e changes gfs2_log_flush to not withdraw the filesystem while holding the log flush lock, but it fails to check if the filesystem needs to be withdrawn once the log flush lock has been released. Likewise, commit f05b86db314d depends on gfs2_log_flush to trigger for delayed withdraws. Add that and clean up the code flow somewhat. In gfs2_put_super, add a check for delayed withdraws that have been missed to prevent these kinds of bugs in the future. Fixes: ca399c96e96e ("gfs2: flesh out delayed withdraw for gfs2_log_flush") Fixes: f05b86db314d ("gfs2: Prepare to withdraw as soon as an IO error occurs in log write") Cc: stable@vger.kernel.org # v5.7+: 462582b99b607: gfs2: add some much needed cleanup for log flushes that fail Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 61 +++++++++++++++++++++++++------------------------ fs/gfs2/super.c | 2 ++ fs/gfs2/util.h | 10 ++++++++ 3 files changed, 43 insertions(+), 30 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 3763c9ff1406..93032feb5159 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -954,10 +954,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) goto out; /* Log might have been flushed while we waited for the flush lock */ - if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) { - up_write(&sdp->sd_log_flush_lock); - return; - } + if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) + goto out; trace_gfs2_log_flush(sdp, 1, flags); if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN) @@ -971,25 +969,25 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) if (unlikely (state == SFS_FROZEN)) if (gfs2_assert_withdraw_delayed(sdp, !tr->tr_num_buf_new && !tr->tr_num_databuf_new)) - goto out; + goto out_withdraw; } if (unlikely(state == SFS_FROZEN)) if (gfs2_assert_withdraw_delayed(sdp, !sdp->sd_log_num_revoke)) - goto out; + goto out_withdraw; if (gfs2_assert_withdraw_delayed(sdp, sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke)) - goto out; + goto out_withdraw; gfs2_ordered_write(sdp); if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; lops_before_commit(sdp, tr); if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); @@ -1000,7 +998,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) log_write_header(sdp, flags); } if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw;
lops_after_commit(sdp, tr); gfs2_log_lock(sdp); @@ -1020,7 +1018,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) if (!sdp->sd_log_idle) { empty_ail1_list(sdp); if (gfs2_withdrawn(sdp)) - goto out; + goto out_withdraw; atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ trace_gfs2_log_blocks(sdp, -1); log_write_header(sdp, flags); @@ -1033,27 +1031,30 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); } -out: - if (gfs2_withdrawn(sdp)) { - trans_drain(tr); - /** - * If the tr_list is empty, we're withdrawing during a log - * flush that targets a transaction, but the transaction was - * never queued onto any of the ail lists. Here we add it to - * ail1 just so that ail_drain() will find and free it. - */ - spin_lock(&sdp->sd_ail_lock); - if (tr && list_empty(&tr->tr_list)) - list_add(&tr->tr_list, &sdp->sd_ail1_list); - spin_unlock(&sdp->sd_ail_lock); - ail_drain(sdp); /* frees all transactions */ - tr = NULL; - } - +out_end: trace_gfs2_log_flush(sdp, 0, flags); +out: up_write(&sdp->sd_log_flush_lock); - gfs2_trans_free(sdp, tr); + if (gfs2_withdrawing(sdp)) + gfs2_withdraw(sdp); + return; + +out_withdraw: + trans_drain(tr); + /** + * If the tr_list is empty, we're withdrawing during a log + * flush that targets a transaction, but the transaction was + * never queued onto any of the ail lists. Here we add it to + * ail1 just so that ail_drain() will find and free it. + */ + spin_lock(&sdp->sd_ail_lock); + if (tr && list_empty(&tr->tr_list)) + list_add(&tr->tr_list, &sdp->sd_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + ail_drain(sdp); /* frees all transactions */ + tr = NULL; + goto out_end; } /** diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9f4d9e7be839..19add2da1013 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -702,6 +702,8 @@ restart: if (error) gfs2_io_error(sdp); } + WARN_ON(gfs2_withdrawing(sdp)); + /* At this point, we're through modifying the disk */ /* Release stuff */ diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 6d9157efe16c..d7562981b3a0 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -205,6 +205,16 @@ static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp) test_bit(SDF_WITHDRAWING, &sdp->sd_flags); } +/** + * gfs2_withdrawing - check if a withdraw is pending + * @sdp: the superblock + */ +static inline bool gfs2_withdrawing(struct gfs2_sbd *sdp) +{ + return test_bit(SDF_WITHDRAWING, &sdp->sd_flags) && + !test_bit(SDF_WITHDRAWN, &sdp->sd_flags); +} + #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) From 521031fa970109fbf7a808322fb0c92092a51cf0 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 31 Aug 2020 08:12:15 -0500 Subject: [PATCH 292/497] gfs2: Fix bad comment for trans_drain Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 93032feb5159..a83e6426e26e 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -902,7 +902,7 @@ static void empty_ail1_list(struct gfs2_sbd *sdp) } /** - * drain_bd - drain the buf and databuf queue for a failed transaction + * trans_drain - drain the buf and databuf queue for a failed transaction * @tr: the transaction to drain * * When this is called, we're taking an error exit for a log write that failed From e8a8023ee0bda6dcb32e4695bb080499d7a1ffea Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 16 Sep 2020 10:50:21 
+0800 Subject: [PATCH 293/497] gfs2: convert to use DEFINE_SEQ_ATTRIBUTE macro Use DEFINE_SEQ_ATTRIBUTE macro to simplify the code. Signed-off-by: Liu Shixin Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f13b136654ca..15c5c26f6ae7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2415,7 +2415,7 @@ static const struct seq_operations gfs2_glstats_seq_ops = { .show = gfs2_glstats_seq_show, }; -static const struct seq_operations gfs2_sbstats_seq_ops = { +static const struct seq_operations gfs2_sbstats_sops = { .start = gfs2_sbstats_seq_start, .next = gfs2_sbstats_seq_next, .stop = gfs2_sbstats_seq_stop, @@ -2468,16 +2468,6 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file) return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops); } -static int gfs2_sbstats_open(struct inode *inode, struct file *file) -{ - int ret = seq_open(file, &gfs2_sbstats_seq_ops); - if (ret == 0) { - struct seq_file *seq = file->private_data; - seq->private = inode->i_private; /* sdp */ - } - return ret; -} - static const struct file_operations gfs2_glocks_fops = { .owner = THIS_MODULE, .open = gfs2_glocks_open, @@ -2494,13 +2484,7 @@ static const struct file_operations gfs2_glstats_fops = { .release = gfs2_glocks_release, }; -static const struct file_operations gfs2_sbstats_fops = { - .owner = THIS_MODULE, - .open = gfs2_sbstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats); void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) { From 23d828fc3f1e309bbc23bab817e6b5c40b06d9b9 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 11 Sep 2020 10:56:31 -0500 Subject: [PATCH 294/497] gfs2: rename variable error to ret in gfs2_evict_inode Function gfs2_evict_inode is too big and unreadable. This patch is just a baby step toward improving that. This first step just renames variable error to ret. This will help make future patches more readable. 
Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/super.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 19add2da1013..ab08b9a1102c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1338,7 +1338,7 @@ static void gfs2_evict_inode(struct inode *inode) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; struct address_space *metamapping; - int error; + int ret; if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { clear_inode(inode); @@ -1362,8 +1362,8 @@ static void gfs2_evict_inode(struct inode *inode) goto out; /* Must not read inode block until block type has been verified */ - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); - if (unlikely(error)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); + if (unlikely(ret)) { glock_clear_object(ip->i_iopen_gh.gh_gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); @@ -1372,13 +1372,13 @@ static void gfs2_evict_inode(struct inode *inode) if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) goto out_truncate; - error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); - if (error) + ret = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (ret) goto out_truncate; if (test_bit(GIF_INVALID, &ip->i_flags)) { - error = gfs2_inode_refresh(ip); - if (error) + ret = gfs2_inode_refresh(ip); + if (ret) goto out_truncate; } @@ -1399,20 +1399,20 @@ out_delete: if (S_ISDIR(inode->i_mode) && (ip->i_diskflags & GFS2_DIF_EXHASH)) { - error = gfs2_dir_exhash_dealloc(ip); - if (error) + ret = gfs2_dir_exhash_dealloc(ip); + if (ret) goto out_unlock; } if (ip->i_eattr) { - error = gfs2_ea_dealloc(ip); - if (error) + ret = gfs2_ea_dealloc(ip); + if (ret) goto out_unlock; } if (!gfs2_is_stuffed(ip)) { - error = gfs2_file_dealloc(ip); - if (error) + ret = gfs2_file_dealloc(ip); + if (ret) goto out_unlock; } @@ -1421,7 +1421,7 @@ out_delete: location and try to set gl_object again. We clear gl_object here so that subsequent inode creates don't see an old gl_object. */ glock_clear_object(ip->i_gl, ip); - error = gfs2_dinode_dealloc(ip); + ret = gfs2_dinode_dealloc(ip); gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); goto out_unlock; @@ -1436,8 +1436,8 @@ out_truncate: write_inode_now(inode, 1); gfs2_ail_flush(ip->i_gl, 0); - error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); - if (error) + ret = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); + if (ret) goto out_unlock; /* Needs to be done before glock release & also in a transaction */ truncate_inode_pages(&inode->i_data, 0); @@ -1452,8 +1452,8 @@ out_unlock: glock_clear_object(ip->i_gl, ip); gfs2_glock_dq_uninit(&gh); } - if (error && error != GLR_TRYFAILED && error != -EROFS) - fs_warn(sdp, "gfs2_evict_inode: %d\n", error); + if (ret && ret != GLR_TRYFAILED && ret != -EROFS) + fs_warn(sdp, "gfs2_evict_inode: %d\n", ret); out: truncate_inode_pages_final(&inode->i_data); if (ip->i_qadata) From 6e7e9a505571db3edc926f4bc972c7ed3da29a9d Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 11 Sep 2020 09:29:25 -0500 Subject: [PATCH 295/497] gfs2: factor evict_unlinked_inode out of gfs2_evict_inode Function gfs2_evict_inode is way too big, complex and unreadable. This is a baby step toward breaking it apart to be more readable. It factors out the portion that deletes the online bits for a dinode that is unlinked and needs to be deleted. 
A future patch will factor out more. (If I factor out too much, the patch itself becomes unreadable). Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/super.c | 67 +++++++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ab08b9a1102c..b5279a1d9cb0 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1310,6 +1310,45 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) return true; } +/** + * evict_unlinked_inode - delete the pieces of an unlinked evicted inode + * @inode: The inode to evict + */ +static int evict_unlinked_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + int ret; + + if (S_ISDIR(inode->i_mode) && + (ip->i_diskflags & GFS2_DIF_EXHASH)) { + ret = gfs2_dir_exhash_dealloc(ip); + if (ret) + goto out; + } + + if (ip->i_eattr) { + ret = gfs2_ea_dealloc(ip); + if (ret) + goto out; + } + + if (!gfs2_is_stuffed(ip)) { + ret = gfs2_file_dealloc(ip); + if (ret) + goto out; + } + + /* We're about to clear the bitmap for the dinode, but as soon as we + do, gfs2_create_inode can create another inode at the same block + location and try to set gl_object again. We clear gl_object here so + that subsequent inode creates don't see an old gl_object. */ + glock_clear_object(ip->i_gl, ip); + ret = gfs2_dinode_dealloc(ip); + gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); +out: + return ret; +} + /** * gfs2_evict_inode - Remove an inode from cache * @inode: The inode to evict @@ -1396,33 +1435,7 @@ out_delete: goto out_truncate; } } - - if (S_ISDIR(inode->i_mode) && - (ip->i_diskflags & GFS2_DIF_EXHASH)) { - ret = gfs2_dir_exhash_dealloc(ip); - if (ret) - goto out_unlock; - } - - if (ip->i_eattr) { - ret = gfs2_ea_dealloc(ip); - if (ret) - goto out_unlock; - } - - if (!gfs2_is_stuffed(ip)) { - ret = gfs2_file_dealloc(ip); - if (ret) - goto out_unlock; - } - - /* We're about to clear the bitmap for the dinode, but as soon as we - do, gfs2_create_inode can create another inode at the same block - location and try to set gl_object again. We clear gl_object here so - that subsequent inode creates don't see an old gl_object. */ - glock_clear_object(ip->i_gl, ip); - ret = gfs2_dinode_dealloc(ip); - gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); + ret = evict_unlinked_inode(inode); goto out_unlock; out_truncate: From 53dbc27eb18999619766ff994d2b33c4655c5588 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 11 Sep 2020 10:30:26 -0500 Subject: [PATCH 296/497] gfs2: further simplify gfs2_evict_inode with new func evict_should_delete This patch further simplifies function gfs2_evict_inode() by adding a new function evict_should_delete. The function may also lock the inode glock. 
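Taken together with the surrounding patches in this series, the refactoring is heading toward an eviction path that reads as one decision plus two helpers. A condensed sketch of that end state, pieced together from the later diffs in this series (error handling and the final teardown abbreviated):

static void gfs2_evict_inode_sketch(struct inode *inode)
{
	struct gfs2_holder gh;
	int ret;

	gfs2_holder_mark_uninitialized(&gh);
	ret = evict_should_delete(inode, &gh);	/* tri-state decision */
	if (ret == SHOULD_DEFER_EVICTION)
		goto out;
	if (ret == SHOULD_DELETE_DINODE)
		ret = evict_unlinked_inode(inode);	/* deallocate on disk */
	else
		ret = evict_linked_inode(inode);	/* write back and truncate */
out:
	/* reservation, glock and page-cache teardown as before */
	return;
}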
Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/super.c | 122 ++++++++++++++++++++++++++++++------------------ 1 file changed, 77 insertions(+), 45 deletions(-) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b5279a1d9cb0..e44030644fcd 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -44,6 +44,12 @@ #include "xattr.h" #include "lops.h" +enum dinode_demise { + SHOULD_DELETE_DINODE, + SHOULD_NOT_DELETE_DINODE, + SHOULD_DEFER_EVICTION, +}; + /** * gfs2_jindex_free - Clear all the journal index information * @sdp: The GFS2 superblock @@ -1310,6 +1316,73 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) return true; } +/** + * evict_should_delete - determine whether the inode is eligible for deletion + * @inode: The inode to evict + * + * This function determines whether the evicted inode is eligible to be deleted + * and locks the inode glock. + * + * Returns: the fate of the dinode + */ +static enum dinode_demise evict_should_delete(struct inode *inode, + struct gfs2_holder *gh) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + int ret; + + if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { + BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl)); + goto should_delete; + } + + if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags)) + return SHOULD_DEFER_EVICTION; + + /* Deletes should never happen under memory pressure anymore. */ + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + return SHOULD_DEFER_EVICTION; + + /* Must not read inode block until block type has been verified */ + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh); + if (unlikely(ret)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + return SHOULD_DEFER_EVICTION; + } + + if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) + return SHOULD_NOT_DELETE_DINODE; + ret = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (ret) + return SHOULD_NOT_DELETE_DINODE; + + if (test_bit(GIF_INVALID, &ip->i_flags)) { + ret = gfs2_inode_refresh(ip); + if (ret) + return SHOULD_NOT_DELETE_DINODE; + } + + /* + * The inode may have been recreated in the meantime. + */ + if (inode->i_nlink) + return SHOULD_NOT_DELETE_DINODE; + +should_delete: + if (gfs2_holder_initialized(&ip->i_iopen_gh) && + test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { + if (!gfs2_upgrade_iopen_glock(inode)) { + gfs2_holder_uninit(&ip->i_iopen_gh); + return SHOULD_NOT_DELETE_DINODE; + } + } + return SHOULD_DELETE_DINODE; +} + /** * evict_unlinked_inode - delete the pieces of an unlinked evicted inode * @inode: The inode to evict @@ -1387,54 +1460,13 @@ static void gfs2_evict_inode(struct inode *inode) if (inode->i_nlink || sb_rdonly(sb)) goto out; - if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) { - BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl)); - gfs2_holder_mark_uninitialized(&gh); - goto out_delete; - } - - if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags)) + gfs2_holder_mark_uninitialized(&gh); + ret = evict_should_delete(inode, &gh); + if (ret == SHOULD_DEFER_EVICTION) goto out; - - /* Deletes should never happen under memory pressure anymore. 
*/ - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) - goto out; - - /* Must not read inode block until block type has been verified */ - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); - if (unlikely(ret)) { - glock_clear_object(ip->i_iopen_gh.gh_gl, ip); - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - goto out; - } - - if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) - goto out_truncate; - ret = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); - if (ret) + if (ret == SHOULD_NOT_DELETE_DINODE) goto out_truncate; - if (test_bit(GIF_INVALID, &ip->i_flags)) { - ret = gfs2_inode_refresh(ip); - if (ret) - goto out_truncate; - } - - /* - * The inode may have been recreated in the meantime. - */ - if (inode->i_nlink) - goto out_truncate; - -out_delete: - if (gfs2_holder_initialized(&ip->i_iopen_gh) && - test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { - if (!gfs2_upgrade_iopen_glock(inode)) { - gfs2_holder_uninit(&ip->i_iopen_gh); - goto out_truncate; - } - } ret = evict_unlinked_inode(inode); goto out_unlock; From d90be6ab9ad76d4f5492e15b73b9cca5cbb03f91 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 11 Sep 2020 14:53:52 -0500 Subject: [PATCH 297/497] gfs2: factor evict_linked_inode out of gfs2_evict_inode Now that we've factored out the delete-dinode case to simplify gfs2_evict_inode, we take it a step further and factor out the other case: where we don't delete the inode. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/super.c | 52 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e44030644fcd..ba31952e21b9 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1422,6 +1422,39 @@ out: return ret; } +/* + * evict_linked_inode - evict an inode whose dinode has not been unlinked + * @inode: The inode to evict + */ +static int evict_linked_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct address_space *metamapping; + int ret; + + gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_EVICT_INODE); + metamapping = gfs2_glock2aspace(ip->i_gl); + if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) { + filemap_fdatawrite(metamapping); + filemap_fdatawait(metamapping); + } + write_inode_now(inode, 1); + gfs2_ail_flush(ip->i_gl, 0); + + ret = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); + if (ret) + return ret; + + /* Needs to be done before glock release & also in a transaction */ + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(metamapping, 0); + gfs2_trans_end(sdp); + return 0; +} + /** * gfs2_evict_inode - Remove an inode from cache * @inode: The inode to evict @@ -1449,7 +1482,6 @@ static void gfs2_evict_inode(struct inode *inode) struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; - struct address_space *metamapping; int ret; if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { @@ -1471,23 +1503,7 @@ static void gfs2_evict_inode(struct inode *inode) goto out_unlock; out_truncate: - gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | - GFS2_LFC_EVICT_INODE); - metamapping = gfs2_glock2aspace(ip->i_gl); - if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) { - filemap_fdatawrite(metamapping); - filemap_fdatawait(metamapping); - } - write_inode_now(inode, 1); - gfs2_ail_flush(ip->i_gl, 0); - - 
ret = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); - if (ret) - goto out_unlock; - /* Needs to be done before glock release & also in a transaction */ - truncate_inode_pages(&inode->i_data, 0); - truncate_inode_pages(metamapping, 0); - gfs2_trans_end(sdp); + ret = evict_linked_inode(inode); out_unlock: if (gfs2_rs_active(&ip->i_res)) From 0a0d9f55c211d7a03b8ec5ad2d8f5b3062b4387c Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 16 Sep 2020 08:50:44 -0500 Subject: [PATCH 298/497] gfs2: simplify the logic in gfs2_evict_inode Now that we've factored out the deleted and undeleted dinode cases in gfs2_evict_inode, we can greatly simplify the logic. Now the function is easy to read and understand. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/super.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ba31952e21b9..3d9daac44e1c 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1496,16 +1496,11 @@ static void gfs2_evict_inode(struct inode *inode) ret = evict_should_delete(inode, &gh); if (ret == SHOULD_DEFER_EVICTION) goto out; - if (ret == SHOULD_NOT_DELETE_DINODE) - goto out_truncate; + if (ret == SHOULD_DELETE_DINODE) + ret = evict_unlinked_inode(inode); + else + ret = evict_linked_inode(inode); - ret = evict_unlinked_inode(inode); - goto out_unlock; - -out_truncate: - ret = evict_linked_inode(inode); - -out_unlock: if (gfs2_rs_active(&ip->i_res)) gfs2_rs_deltree(&ip->i_res); From ee1e2c773e4f4ce2213f9d77cc703b669ca6fa3f Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 16 Sep 2020 11:06:23 -0500 Subject: [PATCH 299/497] gfs2: call truncate_inode_pages_final for address space glocks Before this patch, we were not calling truncate_inode_pages_final for the address space for glocks, which left the possibility of a leak. We now take care of the problem instead of complaining, and we do it during glock tear-down.. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 15c5c26f6ae7..19d4db4c44e7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -270,7 +270,12 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); - GLOCK_BUG_ON(gl, mapping && mapping->nrpages && !gfs2_withdrawn(sdp)); + if (mapping) { + truncate_inode_pages_final(mapping); + if (!gfs2_withdrawn(sdp)) + GLOCK_BUG_ON(gl, mapping->nrpages || + mapping->nrexceptional); + } trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } From 2164f9b9186962ffb7c687e18ec6f5255525f09d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jul 2019 23:54:39 +0200 Subject: [PATCH 300/497] gfs2: use iomap for buffered I/O in ordered and writeback mode Switch to using the iomap readpage and writepage helpers for all I/O in the ordered and writeback modes, and thus eliminate using buffer_heads for I/O in these cases. The journaled data mode is left untouched. (Andreas Gruenbacher: In gfs2_unstuffer_page, switch from mark_buffer_dirty to set_page_dirty instead of accidentally leaving the page / buffer clean.) 
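For context, the iomap writeback interface used in this conversion asks the filesystem for exactly one thing: a ->map_blocks callback that fills wpc->iomap with an extent mapping for the offset being written back; iomap_writepage() and iomap_writepages() then build and submit the bios. The sketch below mirrors the shape of the gfs2 diff that follows, but the "myfs_" names and the extent-lookup helper are placeholders, not part of this patch.

/* Minimal sketch of hooking a filesystem into the iomap writeback path. */
static int myfs_map_blocks(struct iomap_writepage_ctx *wpc,
			   struct inode *inode, loff_t offset)
{
	/* Reuse the cached mapping if it still covers this offset. */
	if (offset >= wpc->iomap.offset &&
	    offset < wpc->iomap.offset + wpc->iomap.length)
		return 0;

	/* Otherwise ask the filesystem for a fresh mapping (placeholder). */
	return myfs_lookup_extent(inode, offset, &wpc->iomap);
}

static const struct iomap_writeback_ops myfs_writeback_ops = {
	.map_blocks = myfs_map_blocks,
};

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepage(page, wbc, &wpc, &myfs_writeback_ops);
}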
Signed-off-by: Christoph Hellwig Signed-off-by: Andreas Gruenbacher --- fs/gfs2/aops.c | 42 ++++++++++++++++++++---------------------- fs/gfs2/bmap.c | 48 ++++++++++++++++++++++++++++++++++++------------ fs/gfs2/bmap.h | 1 + 3 files changed, 57 insertions(+), 34 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index d4af283fc888..a195eb60624e 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -91,22 +91,13 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc) struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; + struct iomap_writepage_ctx wpc = { }; if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) goto out; if (current->journal_info) goto redirty; - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE-1); - if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); - goto out; - } - - return nobh_writepage(page, gfs2_get_block_noalloc, wbc); + return iomap_writepage(page, wbc, &wpc, &gfs2_writeback_ops); redirty: redirty_page_for_writepage(wbc, page); @@ -208,7 +199,8 @@ static int gfs2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); - int ret = mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + struct iomap_writepage_ctx wpc = { }; + int ret; /* * Even if we didn't write any pages here, we might still be holding @@ -216,9 +208,9 @@ static int gfs2_writepages(struct address_space *mapping, * want balance_dirty_pages() to loop indefinitely trying to write out * pages held in the ail that it can't find. 
*/ + ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops); if (ret == 0) set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); - return ret; } @@ -470,12 +462,13 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) static int __gfs2_readpage(void *file, struct page *page) { - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); int error; - if (i_blocksize(page->mapping->host) == PAGE_SIZE && - !page_has_buffers(page)) { + if (!gfs2_is_jdata(ip) || + (i_blocksize(inode) == PAGE_SIZE && !page_has_buffers(page))) { error = iomap_readpage(page, &gfs2_iomap_ops); } else if (gfs2_is_stuffed(ip)) { error = stuffed_readpage(ip, page); @@ -563,8 +556,12 @@ static void gfs2_readahead(struct readahead_control *rac) struct inode *inode = rac->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - if (!gfs2_is_stuffed(ip)) + if (gfs2_is_stuffed(ip)) + ; + else if (gfs2_is_jdata(ip)) mpage_readahead(rac, gfs2_block_map); + else + iomap_readahead(rac, &gfs2_iomap_ops); } /** @@ -784,12 +781,13 @@ static const struct address_space_operations gfs2_aops = { .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readahead = gfs2_readahead, + .set_page_dirty = iomap_set_page_dirty, + .releasepage = iomap_releasepage, + .invalidatepage = iomap_invalidatepage, .bmap = gfs2_bmap, - .invalidatepage = gfs2_invalidatepage, - .releasepage = gfs2_releasepage, .direct_IO = noop_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, + .migratepage = iomap_migrate_page, + .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 0f69fbd4af66..ed425d30e636 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -56,7 +56,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, u64 block, struct page *page) { struct inode *inode = &ip->i_inode; - struct buffer_head *bh; int release = 0; if (!page || page->index) { @@ -80,20 +79,21 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, SetPageUptodate(page); } - if (!page_has_buffers(page)) - create_empty_buffers(page, BIT(inode->i_blkbits), - BIT(BH_Uptodate)); + if (gfs2_is_jdata(ip)) { + struct buffer_head *bh; - bh = page_buffers(page); + if (!page_has_buffers(page)) + create_empty_buffers(page, BIT(inode->i_blkbits), + BIT(BH_Uptodate)); - if (!buffer_mapped(bh)) - map_bh(bh, inode->i_sb, block); + bh = page_buffers(page); + if (!buffer_mapped(bh)) + map_bh(bh, inode->i_sb, block); - set_buffer_uptodate(bh); - if (gfs2_is_jdata(ip)) + set_buffer_uptodate(bh); gfs2_trans_add_data(ip->i_gl, bh); - else { - mark_buffer_dirty(bh); + } else { + set_page_dirty(page); gfs2_ordered_add_inode(ip); } @@ -1158,7 +1158,8 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, struct metapath mp = { .mp_aheight = 1, }; int ret; - iomap->flags |= IOMAP_F_BUFFER_HEAD; + if (gfs2_is_jdata(ip)) + iomap->flags |= IOMAP_F_BUFFER_HEAD; trace_gfs2_iomap_start(ip, pos, length, flags); if (gfs2_iomap_need_write_lock(flags)) { @@ -2518,3 +2519,26 @@ out: gfs2_trans_end(sdp); return error; } + +static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, + loff_t offset) +{ + struct metapath mp = { .mp_aheight = 1, }; + int ret; + + if 
(WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode)))) + return -EIO; + + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + + memset(&wpc->iomap, 0, sizeof(wpc->iomap)); + ret = gfs2_iomap_get(inode, offset, INT_MAX, 0, &wpc->iomap, &mp); + release_metapath(&mp); + return ret; +} + +const struct iomap_writeback_ops gfs2_writeback_ops = { + .map_blocks = gfs2_map_blocks, +}; diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index b88fd45ab79f..aed4632d47d3 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -44,6 +44,7 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, } extern const struct iomap_ops gfs2_iomap_ops; +extern const struct iomap_writeback_ops gfs2_writeback_ops; extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); extern int gfs2_block_map(struct inode *inode, sector_t lblock, From 0e539ca1bbbe85a86549c97a30a765ada4a09df9 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Wed, 7 Oct 2020 12:30:58 +0100 Subject: [PATCH 301/497] gfs2: Fix NULL pointer dereference in gfs2_rgrp_dump When an rindex entry is found to be corrupt, compute_bitstructs() calls gfs2_consist_rgrpd() which calls gfs2_rgrp_dump() like this: gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf); gfs2_rgrp_dump then dereferences the gl without checking it and we get BUG: KASAN: null-ptr-deref in gfs2_rgrp_dump+0x28/0x280 because there's no rgrp glock involved while reading the rindex on mount. Fix this by changing gfs2_rgrp_dump to take an rgrp argument. Reported-by: syzbot+43fa87986bdd31df9de6@syzkaller.appspotmail.com Signed-off-by: Andrew Price Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glops.c | 11 ++++++++++- fs/gfs2/rgrp.c | 9 +++------ fs/gfs2/rgrp.h | 2 +- fs/gfs2/util.c | 2 +- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index de1d5f1d9ff8..c2c90747d79b 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -227,6 +227,15 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) rgd->rd_flags &= ~GFS2_RDF_UPTODATE; } +static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl, + const char *fs_id_buf) +{ + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + + if (rgd) + gfs2_rgrp_dump(seq, rgd, fs_id_buf); +} + static struct gfs2_inode *gfs2_glock2inode(struct gfs2_glock *gl) { struct gfs2_inode *ip; @@ -712,7 +721,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_sync = rgrp_go_sync, .go_inval = rgrp_go_inval, .go_lock = gfs2_rgrp_go_lock, - .go_dump = gfs2_rgrp_dump, + .go_dump = gfs2_rgrp_go_dump, .go_type = LM_TYPE_RGRP, .go_flags = GLOF_LVB, }; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 074f228ea839..1bba5a9d45fa 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -2209,20 +2209,17 @@ static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd, /** * gfs2_rgrp_dump - print out an rgrp * @seq: The iterator - * @gl: The glock in question + * @rgd: The rgrp in question * @fs_id_buf: pointer to file system id (if requested) * */ -void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl, +void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, const char *fs_id_buf) { - struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_blkreserv *trs; const struct rb_node *n; - if (rgd == NULL) - return; gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", fs_id_buf, (unsigned long long)rgd->rd_addr, rgd->rd_flags, @@ -2253,7 +2250,7 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) (unsigned long 
long)rgd->rd_addr); fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); - gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf); + gfs2_rgrp_dump(NULL, rgd, fs_id_buf); rgd->rd_flags |= GFS2_RDF_ERROR; } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index a1d7e14fc55b..9a587ada51ed 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -67,7 +67,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl, +extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, const char *fs_id_buf); extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 1cd0328cae20..0fba3bf64189 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -419,7 +419,7 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, char fs_id_buf[sizeof(sdp->sd_fsname) + 7]; sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); - gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf); + gfs2_rgrp_dump(NULL, rgd, fs_id_buf); gfs2_lm(sdp, "fatal: filesystem consistency error\n" " RG = %llu\n" From c2a04b02c060c4858762edce4674d5cba3e5a96f Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Mon, 12 Oct 2020 14:13:09 +0100 Subject: [PATCH 302/497] gfs2: use-after-free in sysfs deregistration syzkaller found the following splat with CONFIG_DEBUG_KOBJECT_RELEASE=y: Read of size 1 at addr ffff000028e896b8 by task kworker/1:2/228 CPU: 1 PID: 228 Comm: kworker/1:2 Tainted: G S 5.9.0-rc8+ #101 Hardware name: linux,dummy-virt (DT) Workqueue: events kobject_delayed_cleanup Call trace: dump_backtrace+0x0/0x4d8 show_stack+0x34/0x48 dump_stack+0x174/0x1f8 print_address_description.constprop.0+0x5c/0x550 kasan_report+0x13c/0x1c0 __asan_report_load1_noabort+0x34/0x60 memcmp+0xd0/0xd8 gfs2_uevent+0xc4/0x188 kobject_uevent_env+0x54c/0x1240 kobject_uevent+0x2c/0x40 __kobject_del+0x190/0x1d8 kobject_delayed_cleanup+0x2bc/0x3b8 process_one_work+0x96c/0x18c0 worker_thread+0x3f0/0xc30 kthread+0x390/0x498 ret_from_fork+0x10/0x18 Allocated by task 1110: kasan_save_stack+0x28/0x58 __kasan_kmalloc.isra.0+0xc8/0xe8 kasan_kmalloc+0x10/0x20 kmem_cache_alloc_trace+0x1d8/0x2f0 alloc_super+0x64/0x8c0 sget_fc+0x110/0x620 get_tree_bdev+0x190/0x648 gfs2_get_tree+0x50/0x228 vfs_get_tree+0x84/0x2e8 path_mount+0x1134/0x1da8 do_mount+0x124/0x138 __arm64_sys_mount+0x164/0x238 el0_svc_common.constprop.0+0x15c/0x598 do_el0_svc+0x60/0x150 el0_svc+0x34/0xb0 el0_sync_handler+0xc8/0x5b4 el0_sync+0x15c/0x180 Freed by task 228: kasan_save_stack+0x28/0x58 kasan_set_track+0x28/0x40 kasan_set_free_info+0x24/0x48 __kasan_slab_free+0x118/0x190 kasan_slab_free+0x14/0x20 slab_free_freelist_hook+0x6c/0x210 kfree+0x13c/0x460 Use the same pattern as f2fs + ext4 where the kobject destruction must complete before allowing the FS itself to be freed. This means that we need an explicit free_sbd in the callers. Cc: Bob Peterson Cc: Andreas Gruenbacher Signed-off-by: Jamie Iles [Also go to fail_free when init_names fails.] 
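The f2fs/ext4 pattern referred to above gates the final free on a completion that is signalled only from the kobject release callback, so the superblock data cannot be freed while a delayed kobject cleanup is still pending. Condensed from the gfs2 diff below, the three pieces of the pattern look like this:

/* 1) Registration (gfs2_sys_fs_add): arm the completion before the
 *    kobject becomes visible. */
	init_completion(&sdp->sd_kobj_unregister);
	error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
				     "%s", sdp->sd_table_name);

/* 2) ktype release callback: signal only -- never free from here. */
static void gfs2_sbd_release(struct kobject *kobj)
{
	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);

	complete(&sdp->sd_kobj_unregister);
}

/* 3) Teardown (gfs2_sys_fs_del and its callers): wait until the release
 *    callback has run, then free the superblock data explicitly. */
	kobject_put(&sdp->sd_kobj);
	wait_for_completion(&sdp->sd_kobj_unregister);
	free_sbd(sdp);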
Signed-off-by: Andreas Gruenbacher --- fs/gfs2/incore.h | 1 + fs/gfs2/ops_fstype.c | 22 +++++----------------- fs/gfs2/super.c | 1 + fs/gfs2/sys.c | 5 ++++- 4 files changed, 11 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index ca2ec02436ec..387e99d6eda9 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -705,6 +705,7 @@ struct gfs2_sbd { struct super_block *sd_vfs; struct gfs2_pcpu_lkstats __percpu *sd_lkstats; struct kobject sd_kobj; + struct completion sd_kobj_unregister; unsigned long sd_flags; /* SDF_... */ struct gfs2_sb_host sd_sb; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 6d18d2c91add..5bd602a290f7 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1062,26 +1062,14 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) } error = init_names(sdp, silent); - if (error) { - /* In this case, we haven't initialized sysfs, so we have to - manually free the sdp. */ - free_sbd(sdp); - sb->s_fs_info = NULL; - return error; - } + if (error) + goto fail_free; snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name); error = gfs2_sys_fs_add(sdp); - /* - * If we hit an error here, gfs2_sys_fs_add will have called function - * kobject_put which causes the sysfs usage count to go to zero, which - * causes sysfs to call function gfs2_sbd_release, which frees sdp. - * Subsequent error paths here will call gfs2_sys_fs_del, which also - * kobject_put to free sdp. - */ if (error) - return error; + goto fail_free; gfs2_create_debugfs_file(sdp); @@ -1179,9 +1167,9 @@ fail_lm: gfs2_lm_unmount(sdp); fail_debug: gfs2_delete_debugfs_file(sdp); - /* gfs2_sys_fs_del must be the last thing we do, since it causes - * sysfs to call function gfs2_sbd_release, which frees sdp. */ gfs2_sys_fs_del(sdp); +fail_free: + free_sbd(sdp); sb->s_fs_info = NULL; return error; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 3d9daac44e1c..8e250ec42e91 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -744,6 +744,7 @@ restart: /* At this point, we're through participating in the lockspace */ gfs2_sys_fs_del(sdp); + free_sbd(sdp); } /** diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index d28c41bd69b0..c3e72dba7418 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -303,7 +303,7 @@ static void gfs2_sbd_release(struct kobject *kobj) { struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - free_sbd(sdp); + complete(&sdp->sd_kobj_unregister); } static struct kobj_type gfs2_ktype = { @@ -655,6 +655,7 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) sprintf(ro, "RDONLY=%d", sb_rdonly(sb)); sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + init_completion(&sdp->sd_kobj_unregister); sdp->sd_kobj.kset = gfs2_kset; error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, "%s", sdp->sd_table_name); @@ -685,6 +686,7 @@ fail_tune: fail_reg: fs_err(sdp, "error %d adding sysfs files\n", error); kobject_put(&sdp->sd_kobj); + wait_for_completion(&sdp->sd_kobj_unregister); sb->s_fs_info = NULL; return error; } @@ -695,6 +697,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) sysfs_remove_group(&sdp->sd_kobj, &tune_group); sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); + wait_for_completion(&sdp->sd_kobj_unregister); } static int gfs2_uevent(struct kset *kset, struct kobject *kobj, From 02e83f46ebfaf9405881e290794c913d457541f0 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Wed, 14 Oct 2020 16:47:08 -0700 Subject: [PATCH 303/497] vfs: move generic_remap_checks out of mm I would like to move all the generic helpers for the vfs remap range functionality (aka clonerange and dedupe) into a separate file so that they won't be scattered across the vfs and the mm subsystems. The eventual goal is to be able to deselect remap_range.c if none of the filesystems need that code, but the tricky part here is picking a stable(ish) part of the merge window to rearrange code. Signed-off-by: Darrick J. Wong --- fs/Makefile | 3 +- fs/remap_range.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 2 + mm/filemap.c | 81 +------------------------------------- 4 files changed, 102 insertions(+), 81 deletions(-) create mode 100644 fs/remap_range.c diff --git a/fs/Makefile b/fs/Makefile index 1c7b0e3f6daa..7173350739c5 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -13,7 +13,8 @@ obj-y := open.o read_write.o file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o fs_context.o fs_parser.o fsopen.o init.o + fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ + remap_range.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/remap_range.c b/fs/remap_range.c new file mode 100644 index 000000000000..f0a8c7065d5b --- /dev/null +++ b/fs/remap_range.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#include +#include + +/* + * Performs necessary checks before doing a clone. + * + * Can adjust amount of bytes to clone via @req_count argument. + * Returns appropriate error code that caller should return or + * zero in case the clone should be allowed. + */ +int generic_remap_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *req_count, unsigned int remap_flags) +{ + struct inode *inode_in = file_in->f_mapping->host; + struct inode *inode_out = file_out->f_mapping->host; + uint64_t count = *req_count; + uint64_t bcount; + loff_t size_in, size_out; + loff_t bs = inode_out->i_sb->s_blocksize; + int ret; + + /* The start of both ranges must be aligned to an fs block. */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) + return -EINVAL; + + /* Ensure offsets don't wrap. */ + if (pos_in + count < pos_in || pos_out + count < pos_out) + return -EINVAL; + + size_in = i_size_read(inode_in); + size_out = i_size_read(inode_out); + + /* Dedupe requires both ranges to be within EOF. */ + if ((remap_flags & REMAP_FILE_DEDUP) && + (pos_in >= size_in || pos_in + count > size_in || + pos_out >= size_out || pos_out + count > size_out)) + return -EINVAL; + + /* Ensure the infile range is within the infile. */ + if (pos_in >= size_in) + return -EINVAL; + count = min(count, size_in - (uint64_t)pos_in); + + ret = generic_write_check_limits(file_out, pos_out, &count); + if (ret) + return ret; + + /* + * If the user wanted us to link to the infile's EOF, round up to the + * next block boundary for this check. + * + * Otherwise, make sure the count is also block-aligned, having + * already confirmed the starting offsets' block alignment. 
+ */ + if (pos_in + count == size_in) { + bcount = ALIGN(size_in, bs) - pos_in; + } else { + if (!IS_ALIGNED(count, bs)) + count = ALIGN_DOWN(count, bs); + bcount = count; + } + + /* Don't allow overlapped cloning within the same file. */ + if (inode_in == inode_out && + pos_out + bcount > pos_in && + pos_out < pos_in + bcount) + return -EINVAL; + + /* + * We shortened the request but the caller can't deal with that, so + * bounce the request back to userspace. + */ + if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return -EINVAL; + + *req_count = count; + return 0; +} diff --git a/include/linux/fs.h b/include/linux/fs.h index 7519ae003a08..eea754a8dd67 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3012,6 +3012,8 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); extern int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); +extern int generic_write_check_limits(struct file *file, loff_t pos, + loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, diff --git a/mm/filemap.c b/mm/filemap.c index 99c49eeae71b..cf20e5aeb11b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3098,8 +3098,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ -static int generic_write_check_limits(struct file *file, loff_t pos, - loff_t *count) +int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; @@ -3161,84 +3160,6 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) } EXPORT_SYMBOL(generic_write_checks); -/* - * Performs necessary checks before doing a clone. - * - * Can adjust amount of bytes to clone via @req_count argument. - * Returns appropriate error code that caller should return or - * zero in case the clone should be allowed. - */ -int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) -{ - struct inode *inode_in = file_in->f_mapping->host; - struct inode *inode_out = file_out->f_mapping->host; - uint64_t count = *req_count; - uint64_t bcount; - loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; - int ret; - - /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) - return -EINVAL; - - /* Ensure offsets don't wrap. */ - if (pos_in + count < pos_in || pos_out + count < pos_out) - return -EINVAL; - - size_in = i_size_read(inode_in); - size_out = i_size_read(inode_out); - - /* Dedupe requires both ranges to be within EOF. */ - if ((remap_flags & REMAP_FILE_DEDUP) && - (pos_in >= size_in || pos_in + count > size_in || - pos_out >= size_out || pos_out + count > size_out)) - return -EINVAL; - - /* Ensure the infile range is within the infile. */ - if (pos_in >= size_in) - return -EINVAL; - count = min(count, size_in - (uint64_t)pos_in); - - ret = generic_write_check_limits(file_out, pos_out, &count); - if (ret) - return ret; - - /* - * If the user wanted us to link to the infile's EOF, round up to the - * next block boundary for this check. 
- * - * Otherwise, make sure the count is also block-aligned, having - * already confirmed the starting offsets' block alignment. - */ - if (pos_in + count == size_in) { - bcount = ALIGN(size_in, bs) - pos_in; - } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); - bcount = count; - } - - /* Don't allow overlapped cloning within the same file. */ - if (inode_in == inode_out && - pos_out + bcount > pos_in && - pos_out < pos_in + bcount) - return -EINVAL; - - /* - * We shortened the request but the caller can't deal with that, so - * bounce the request back to userspace. - */ - if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) - return -EINVAL; - - *req_count = count; - return 0; -} - - /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. From 0ddc5154b24c96f20e94d653b0a814438de6032b Mon Sep 17 00:00:00 2001 From: Anant Thazhemadam Date: Wed, 14 Oct 2020 22:01:09 +0530 Subject: [PATCH 304/497] gfs2: add validation checks for size of superblock In gfs2_check_sb(), no validation checks are performed with regards to the size of the superblock. syzkaller detected a slab-out-of-bounds bug that was primarily caused because the block size for a superblock was set to zero. A valid size for a superblock is a power of 2 between 512 and PAGE_SIZE. Performing validation checks and ensuring that the size of the superblock is valid fixes this bug. Reported-by: syzbot+af90d47a37376844e731@syzkaller.appspotmail.com Tested-by: syzbot+af90d47a37376844e731@syzkaller.appspotmail.com Suggested-by: Andrew Price Signed-off-by: Anant Thazhemadam [Minor code reordering.] Signed-off-by: Andreas Gruenbacher --- fs/gfs2/ops_fstype.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 5bd602a290f7..03c33fc03c05 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -169,15 +169,19 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) return -EINVAL; } - /* If format numbers match exactly, we're done. */ + if (sb->sb_fs_format != GFS2_FORMAT_FS || + sb->sb_multihost_format != GFS2_FORMAT_MULTI) { + fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); + return -EINVAL; + } - if (sb->sb_fs_format == GFS2_FORMAT_FS && - sb->sb_multihost_format == GFS2_FORMAT_MULTI) - return 0; + if (sb->sb_bsize < 512 || sb->sb_bsize > PAGE_SIZE || + (sb->sb_bsize & (sb->sb_bsize - 1))) { + pr_warn("Invalid superblock size\n"); + return -EINVAL; + } - fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); - - return -EINVAL; + return 0; } static void end_bio_io_page(struct bio *bio) From 21b6924bb70e77c83d508b0d188d88f5e403f1f5 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 22 Jul 2020 11:51:09 -0500 Subject: [PATCH 305/497] gfs2: rename gfs2_write_full_page to gfs2_write_jdata_page, remove parm Since the function is only used for writing jdata pages, this patch simply renames function gfs2_write_full_page to a more appropriate name: gfs2_write_jdata_page. This makes the code easier to understand. The function was only called in one place, which passed in a pointer to function gfs2_get_block_noalloc. The function doesn't need to be passed in. Therefore, this also eliminates the unnecessary parameter to increase efficiency. I also took the liberty of cleaning up the function comments. 
Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/aops.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index a195eb60624e..287d160ccf7a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -106,11 +106,16 @@ out: return 0; } -/* This is the same as calling block_write_full_page, but it also +/** + * gfs2_write_jdata_page - gfs2 jdata-specific version of block_write_full_page + * @page: The page to write + * @wbc: The writeback control + * + * This is the same as calling block_write_full_page, but it also * writes pages outside of i_size */ -static int gfs2_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) +static int gfs2_write_jdata_page(struct page *page, + struct writeback_control *wbc) { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); @@ -128,7 +133,7 @@ static int gfs2_write_full_page(struct page *page, get_block_t *get_block, if (page->index == end_index && offset) zero_user_segment(page, offset, PAGE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc, + return __block_write_full_page(inode, page, gfs2_get_block_noalloc, wbc, end_buffer_async_write); } @@ -157,7 +162,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w } gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize); } - return gfs2_write_full_page(page, gfs2_get_block_noalloc, wbc); + return gfs2_write_jdata_page(page, wbc); } /** From 77650bdbd293ac56807b8a3f24ca8152035eb2e3 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 20 Aug 2020 08:53:29 -0500 Subject: [PATCH 306/497] gfs2: add missing log_blocks trace points in gfs2_write_revokes Function gfs2_write_revokes was incrementing and decrementing the number of log blocks free, but there was never a log_blocks trace point for it. Thus, the free blocks from a log_blocks trace would jump around mysteriously. This patch adds the missing trace points so the trace makes more sense. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index a83e6426e26e..3712059f19f9 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -716,16 +716,24 @@ void gfs2_write_revokes(struct gfs2_sbd *sdp) atomic_dec(&sdp->sd_log_blks_free); /* If no blocks have been reserved, we need to also * reserve a block for the header */ - if (!sdp->sd_log_blks_reserved) + if (!sdp->sd_log_blks_reserved) { atomic_dec(&sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, -2); + } else { + trace_gfs2_log_blocks(sdp, -1); + } } gfs2_ail1_empty(sdp, max_revokes); gfs2_log_unlock(sdp); if (!sdp->sd_log_num_revoke) { atomic_inc(&sdp->sd_log_blks_free); - if (!sdp->sd_log_blks_reserved) + if (!sdp->sd_log_blks_reserved) { atomic_inc(&sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, 2); + } else { + trace_gfs2_log_blocks(sdp, 1); + } } } From 97c5e43d51a44bffdc0eb3c4b40c8d2d4cbe1a73 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 20 Aug 2020 08:59:28 -0500 Subject: [PATCH 307/497] gfs2: enhance log_blocks trace point to show log blocks free This patch adds some code to enhance the log_blocks trace point. It reports the number of free log blocks. This makes the trace point much more useful, especially for debugging performance problems when we can tell when the journal gets full and needs to wait for flushes, etc. 
Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/trace_gfs2.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index e0025258107a..fe140ceef74d 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -388,15 +388,17 @@ TRACE_EVENT(gfs2_log_blocks, TP_STRUCT__entry( __field( dev_t, dev ) __field( int, blocks ) + __field( int, blks_free ) ), TP_fast_assign( __entry->dev = sdp->sd_vfs->s_dev; __entry->blocks = blocks; + __entry->blks_free = atomic_read(&sdp->sd_log_blks_free); ), - TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev), - MINOR(__entry->dev), __entry->blocks) + TP_printk("%u,%u log reserve %d %d", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->blocks, __entry->blks_free) ); /* Writing back the AIL */ From 68942870c66a7983f1d2efb9c1c524f7ebef8097 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 22 Jul 2020 10:27:50 -0500 Subject: [PATCH 308/497] gfs2: Wipe jdata and ail1 in gfs2_journal_wipe, formerly gfs2_meta_wipe Before this patch, when blocks were freed, it called gfs2_meta_wipe to take the metadata out of the pending journal blocks. It did this mostly by calling another function called gfs2_remove_from_journal. This is shortsighted because it does not do anything with jdata blocks which may also be in the journal. This patch expands the function so that it wipes out jdata blocks from the journal as well, and it wipes it from the ail1 list if it hasn't been written back yet. Since it now processes jdata blocks as well, the function has been renamed from gfs2_meta_wipe to gfs2_journal_wipe. New function gfs2_ail1_wipe wants a static view of the ail list, so it locks the sd_ail_lock when removing items. To accomplish this, function gfs2_remove_from_journal no longer locks the sd_ail_lock, and it's now the caller's responsibility to do so. I was going to make sd_ail_lock locking conditional, but the practice is generally frowned upon. 
For details, see: https://lwn.net/Articles/109066/ Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/aops.c | 5 ++- fs/gfs2/log.c | 2 +- fs/gfs2/log.h | 2 +- fs/gfs2/meta_io.c | 81 ++++++++++++++++++++++++++++++++++++++++++++--- fs/gfs2/meta_io.h | 2 +- fs/gfs2/rgrp.c | 6 ++-- 6 files changed, 86 insertions(+), 12 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 287d160ccf7a..801f244690bf 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -665,8 +665,11 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) if (bd) { if (!list_empty(&bd->bd_list) && !buffer_pinned(bh)) list_del_init(&bd->bd_list); - else + else { + spin_lock(&sdp->sd_ail_lock); gfs2_remove_from_journal(bh, REMOVE_JDATA); + spin_unlock(&sdp->sd_ail_lock); + } } bh->b_bdev = NULL; clear_buffer_mapped(bh); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 3712059f19f9..cd225ccce8e0 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -70,7 +70,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct) * */ -static void gfs2_remove_from_ail(struct gfs2_bufdata *bd) +void gfs2_remove_from_ail(struct gfs2_bufdata *bd) { bd->bd_tr = NULL; list_del_init(&bd->bd_ail_st_list); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 8965c751a303..79f97290146e 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -63,7 +63,7 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); - +extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 9856cc2e0795..2db573e31f78 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -348,38 +348,109 @@ void gfs2_remove_from_journal(struct buffer_head *bh, int meta) brelse(bh); } if (bd) { - spin_lock(&sdp->sd_ail_lock); if (bd->bd_tr) { gfs2_trans_add_revoke(sdp, bd); } else if (was_pinned) { bh->b_private = NULL; kmem_cache_free(gfs2_bufdata_cachep, bd); + } else if (!list_empty(&bd->bd_ail_st_list) && + !list_empty(&bd->bd_ail_gl_list)) { + gfs2_remove_from_ail(bd); } - spin_unlock(&sdp->sd_ail_lock); } clear_buffer_dirty(bh); clear_buffer_uptodate(bh); } /** - * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore + * gfs2_ail1_wipe - remove deleted/freed buffers from the ail1 list + * @sdp: superblock + * @bstart: starting block address of buffers to remove + * @blen: length of buffers to be removed + * + * This function is called from gfs2_journal wipe, whose job is to remove + * buffers, corresponding to deleted blocks, from the journal. If we find any + * bufdata elements on the system ail1 list, they haven't been written to + * the journal yet. So we remove them. 
+ */ +static void gfs2_ail1_wipe(struct gfs2_sbd *sdp, u64 bstart, u32 blen) +{ + struct gfs2_trans *tr, *s; + struct gfs2_bufdata *bd, *bs; + struct buffer_head *bh; + u64 end = bstart + blen; + + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_safe(tr, s, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_safe(bd, bs, &tr->tr_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + if (bh->b_blocknr < bstart || bh->b_blocknr >= end) + continue; + + gfs2_remove_from_journal(bh, REMOVE_JDATA); + } + } + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); +} + +static struct buffer_head *gfs2_getjdatabuf(struct gfs2_inode *ip, u64 blkno) +{ + struct address_space *mapping = ip->i_inode.i_mapping; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct page *page; + struct buffer_head *bh; + unsigned int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; + unsigned long index = blkno >> shift; /* convert block to page */ + unsigned int bufnum = blkno - (index << shift); + + page = find_get_page_flags(mapping, index, FGP_LOCK|FGP_ACCESSED); + if (!page) + return NULL; + if (!page_has_buffers(page)) { + unlock_page(page); + put_page(page); + return NULL; + } + /* Locate header for our buffer within our page */ + for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) + /* Do nothing */; + get_bh(bh); + unlock_page(page); + put_page(page); + return bh; +} + +/** + * gfs2_journal_wipe - make inode's buffers so they aren't dirty/pinned anymore * @ip: the inode who owns the buffers * @bstart: the first buffer in the run * @blen: the number of buffers in the run * */ -void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) +void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *bh; + int ty; + gfs2_ail1_wipe(sdp, bstart, blen); while (blen) { + ty = REMOVE_META; bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE); + if (!bh && gfs2_is_jdata(ip)) { + bh = gfs2_getjdatabuf(ip, bstart); + ty = REMOVE_JDATA; + } if (bh) { lock_buffer(bh); gfs2_log_lock(sdp); - gfs2_remove_from_journal(bh, REMOVE_META); + spin_lock(&sdp->sd_ail_lock); + gfs2_remove_from_journal(bh, ty); + spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); unlock_buffer(bh); brelse(bh); diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index eafb74e861c6..4a8c01929b79 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -60,7 +60,7 @@ enum { }; extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta); -extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, struct buffer_head **bhp); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 1bba5a9d45fa..9de676cfd3d7 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -2442,8 +2442,8 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); /* Directories keep their data in the metadata address space */ - if (meta || ip->i_depth) - gfs2_meta_wipe(ip, bstart, blen); + if (meta || ip->i_depth || gfs2_is_jdata(ip)) + gfs2_journal_wipe(ip, bstart, blen); } /** @@ -2499,7 +2499,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) gfs2_statfs_change(sdp, 0, +1, -1); trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid); - 
gfs2_meta_wipe(ip, ip->i_no_addr, 1); + gfs2_journal_wipe(ip, ip->i_no_addr, 1); } /** From 36c783092d49ae71fa87963de9bb5bd9daf4d8c2 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 18 Aug 2020 08:05:14 -0500 Subject: [PATCH 309/497] gfs2: make gfs2_ail1_empty_one return the count of active items This patch is one baby step toward simplifying the journal management. It simply changes function gfs2_ail1_empty_one from a void to an int and makes it return a count of active items. This allows the caller to check the return code rather than list_empty on the tr_ail1_list. This way we can, in a later patch, combine transaction ail1 and ail2 lists. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index cd225ccce8e0..9133b3178677 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -244,13 +244,15 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) * @tr: the transaction * @max_revokes: If nonzero, issue revokes for the bd items for written buffers * + * returns: the transaction's count of remaining active items */ -static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, +static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, int *max_revokes) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; + int active_count = 0; list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { @@ -265,8 +267,10 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, * If the ail buffer is not busy and caught an error, flag it * for others. */ - if (!sdp->sd_log_error && buffer_busy(bh)) + if (!sdp->sd_log_error && buffer_busy(bh)) { + active_count++; continue; + } if (!buffer_uptodate(bh) && !cmpxchg(&sdp->sd_log_error, 0, -EIO)) { gfs2_io_error_bh(sdp, bh); @@ -285,6 +289,7 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, } list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); } + return active_count; } /** @@ -303,8 +308,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { - gfs2_ail1_empty_one(sdp, tr, &max_revokes); - if (list_empty(&tr->tr_ail1_list) && oldest_tr) + if (!gfs2_ail1_empty_one(sdp, tr, &max_revokes) && oldest_tr) list_move(&tr->tr_list, &sdp->sd_ail2_list); else oldest_tr = 0; From 249ffe18c68ee74dcfb51a16e151b45ee07e1c10 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 18 Aug 2020 11:50:28 -0500 Subject: [PATCH 310/497] gfs2: don't lock sd_ail_lock in gfs2_releasepage Patch 380f7c65a7eb3288e4b6812acf3474a1de230707 changed gfs2_releasepage so that it held the sd_ail_lock spin_lock for most of its processing. It did this for some mysterious undocumented bug somewhere in the evict code path. But in the nine years since, evict has been reworked and fixed many times, and so have the transactions and ail list. I can't see a reason to hold the sd_ail_lock unless it's protecting the actual ail lists hung off the transactions. Therefore, this patch removes the locking to increase speed and efficiency, and to further help us rework the log flush code to be more concurrent with transactions. 
Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/aops.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 801f244690bf..1e7ab519bfea 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -741,7 +741,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) */ gfs2_log_lock(sdp); - spin_lock(&sdp->sd_ail_lock); head = bh = page_buffers(page); do { if (atomic_read(&bh->b_count)) @@ -753,7 +752,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) goto cannot_release; bh = bh->b_this_page; } while(bh != head); - spin_unlock(&sdp->sd_ail_lock); head = bh = page_buffers(page); do { @@ -779,7 +777,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) return try_to_free_buffers(page); cannot_release: - spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); return 0; } From 6302d6f43e3550495747ffbaeb75a46ea8c15b32 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 19 Aug 2020 08:55:18 -0500 Subject: [PATCH 311/497] gfs2: Only set PageChecked if we have a transaction With jdata writes, we frequently got into situations where gfs2 deadlocked because of this calling sequence: gfs2_ail1_start gfs2_ail1_flush - for every tr on the sd_ail1_list: gfs2_ail1_start_one - for every bd on the tr's tr_ail1_list: generic_writepages write_cache_pages passing __writepage() calls clear_page_dirty_for_io which calls set_page_dirty: which calls jdata_set_page_dirty which sets PageChecked. __writepage() calls mapping->a_ops->writepage AKA gfs2_jdata_writepage However, gfs2_jdata_writepage checks if PageChecked is set, and if so, it ignores the write and redirties the page. The problem is that write_cache_pages calls clear_page_dirty_for_io, which often calls set_page_dirty(). See comments in page-writeback.c starting with "Yes, Virginia". If it's jdata, set_page_dirty will call jdata_set_page_dirty which will set PageChecked. That causes a conflict because it makes it look like the page has been redirtied by another writer, in which case we need to skip writing it and redirty the page. That ends up in a deadlock because it isn't a "real" writer and nothing will ever clear PageChecked. If we do have a real writer, it will have started a transaction. So this patch checks if a transaction is in use, and if not, it skips setting PageChecked. That way, the page will be dirtied, cleaned, and written appropriately. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/aops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 1e7ab519bfea..9cd2ecad07db 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -623,7 +623,8 @@ out: static int jdata_set_page_dirty(struct page *page) { - SetPageChecked(page); + if (current->journal_info) + SetPageChecked(page); return __set_page_dirty_buffers(page); } From a6645745d45da5b3dd3ff616a3e44f7651eda9f9 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 19 Aug 2020 09:24:48 -0500 Subject: [PATCH 312/497] gfs2: simplify gfs2_block_map Function gfs2_block_map had a lot of redundancy between its create and no_create paths. This patch simplifies the code to eliminate the redundancy. 
Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/bmap.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index ed425d30e636..62d9081d1e26 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1292,6 +1292,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, loff_t length = bh_map->b_size; struct metapath mp = { .mp_aheight = 1, }; struct iomap iomap = { }; + int flags = create ? IOMAP_WRITE : 0; int ret; clear_buffer_mapped(bh_map); @@ -1299,15 +1300,10 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, clear_buffer_boundary(bh_map); trace_gfs2_bmap(ip, bh_map, lblock, create, 1); - if (create) { - ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp); - if (!ret && iomap.type == IOMAP_HOLE) - ret = gfs2_iomap_alloc(inode, &iomap, &mp); - release_metapath(&mp); - } else { - ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp); - release_metapath(&mp); - } + ret = gfs2_iomap_get(inode, pos, length, flags, &iomap, &mp); + if (create && !ret && iomap.type == IOMAP_HOLE) + ret = gfs2_iomap_alloc(inode, &iomap, &mp); + release_metapath(&mp); if (ret) goto out; From b2a846dbef4ef54ef032f0f5ee188c609a0278a7 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 27 Aug 2020 12:18:37 -0500 Subject: [PATCH 313/497] gfs2: Ignore journal log writes for jdata holes When flushing out its ail1 list, gfs2_write_jdata_page calls function __block_write_full_page passing in function gfs2_get_block_noalloc. But there was a problem when a process wrote to a jdata file, then truncated it or punched a hole, leaving references to the blocks within the new hole in its ail list, which are to be written to the journal log. In writing them to the journal, after calling gfs2_block_map, function gfs2_get_block_noalloc determined that the (hole-punched) block was not mapped, so it returned -EIO to generic_writepages, which passed it back to gfs2_ail1_start_one. This, in turn, performed a withdraw, assuming there was a real IO error writing to the journal. This might be a valid error when writing metadata to the journal, but for journaled data writes, it does not warrant a withdraw. This patch adds a check to function gfs2_block_map that makes an exception for journaled data writes that correspond to jdata holes: If the iomap get function returns a block type of IOMAP_HOLE, it instead returns -ENODATA which does not cause the withdraw. Other errors are returned as before. 
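As an illustration of the consumer side of this change (a sketch only; the real gfs2_get_block_noalloc is not part of this hunk and may differ), a no-allocation get_block step used during journal writeback can now tell a punched-out jdata block apart from a genuine mapping failure:

	/* Hypothetical helper, assuming the usual buffer_head interfaces. */
	static int noalloc_map_for_journal(struct inode *inode, sector_t lblock,
					   struct buffer_head *bh_result)
	{
		int error = gfs2_block_map(inode, lblock, bh_result, 0);

		/*
		 * -ENODATA now identifies a punched-out jdata block: the
		 * caller can skip logging it instead of withdrawing.  Any
		 * other error is still treated as fatal for journal writes.
		 */
		if (error)
			return error;
		return buffer_mapped(bh_result) ? 0 : -EIO;
	}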
Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/bmap.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 62d9081d1e26..8dff9cbd0a87 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1301,8 +1301,12 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, trace_gfs2_bmap(ip, bh_map, lblock, create, 1); ret = gfs2_iomap_get(inode, pos, length, flags, &iomap, &mp); - if (create && !ret && iomap.type == IOMAP_HOLE) - ret = gfs2_iomap_alloc(inode, &iomap, &mp); + if (!ret && iomap.type == IOMAP_HOLE) { + if (create) + ret = gfs2_iomap_alloc(inode, &iomap, &mp); + else + ret = -ENODATA; + } release_metapath(&mp); if (ret) goto out; From e2c6c8a797eea88b267743d99d593d368aa43481 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 12 Oct 2020 13:57:37 -0500 Subject: [PATCH 314/497] gfs2: eliminate GLF_QUEUED flag in favor of list_empty(gl_holders) Before this patch, glock.c maintained a flag, GLF_QUEUED, which indicated when a glock had a holder queued. It was only checked for inode glocks, although set and cleared by all glocks, and it was only used to determine whether the glock should be held for the minimum hold time before releasing. The problem is that the flag is not accurate at all. If a process holds the glock, the flag is set. When they dequeue the glock, the flag is only cleared in cases where the state actually changes. So if the state doesn't change, the flag may still be set, even when nothing is queued. This happens to iopen glocks often: they get held in SH, then the file is closed, but the glock remains in SH mode. We don't need a special flag to indicate this: we can simply tell whether the glock has any items queued to the holders queue. It's a waste of cpu time to maintain it. This patch eliminates the flag in favor of simply checking list_empty on the glock holders.
Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 11 +++-------- fs/gfs2/incore.h | 1 - fs/gfs2/trace_gfs2.h | 1 - 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 19d4db4c44e7..c0d441228562 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -458,9 +458,6 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) else gl->gl_lockref.count--; } - if (held1 && held2 && list_empty(&gl->gl_holders)) - clear_bit(GLF_QUEUED, &gl->gl_flags); - if (new_state != gl->gl_target) /* shorten our minimum hold time */ gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR, @@ -1350,7 +1347,6 @@ fail: if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) insert_pt = &gh2->gh_list; } - set_bit(GLF_QUEUED, &gl->gl_flags); trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); @@ -1651,16 +1647,15 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) unsigned long now = jiffies; gfs2_glock_hold(gl); + spin_lock(&gl->gl_lockref.lock); holdtime = gl->gl_tchange + gl->gl_hold_time; - if (test_bit(GLF_QUEUED, &gl->gl_flags) && + if (!list_empty(&gl->gl_holders) && gl->gl_name.ln_type == LM_TYPE_INODE) { if (time_before(now, holdtime)) delay = holdtime - now; if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) delay = gl->gl_hold_time; } - - spin_lock(&gl->gl_lockref.lock); handle_callback(gl, state, delay, true); __gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); @@ -2105,7 +2100,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'I'; if (test_bit(GLF_FROZEN, gflags)) *p++ = 'F'; - if (test_bit(GLF_QUEUED, gflags)) + if (!list_empty(&gl->gl_holders)) *p++ = 'q'; if (test_bit(GLF_LRU, gflags)) *p++ = 'L'; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 387e99d6eda9..346693faf049 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -340,7 +340,6 @@ enum { GLF_REPLY_PENDING = 9, GLF_INITIAL = 10, GLF_FROZEN = 11, - GLF_QUEUED = 12, GLF_LRU = 13, GLF_OBJECT = 14, /* Used only for tracing */ GLF_BLOCKING = 15, diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index fe140ceef74d..0b2f858d9a8c 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -56,7 +56,6 @@ {(1UL << GLF_REPLY_PENDING), "r" }, \ {(1UL << GLF_INITIAL), "I" }, \ {(1UL << GLF_FROZEN), "F" }, \ - {(1UL << GLF_QUEUED), "q" }, \ {(1UL << GLF_LRU), "L" }, \ {(1UL << GLF_OBJECT), "o" }, \ {(1UL << GLF_BLOCKING), "b" }) From 1b2c54d63cde7e8cf15aa6319aba168d81c7e364 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 14 Oct 2020 16:38:47 -0700 Subject: [PATCH 315/497] vfs: move the remap range helpers to remap_range.c Complete the migration by moving the file remapping helper functions out of read_write.c and into remap_range.c. This reduces the clutter in the first file and (eventually) will make it so that we can compile out the second file if it isn't needed. Signed-off-by: Darrick J. 
Wong --- fs/read_write.c | 473 -------------------------------------------- fs/remap_range.c | 480 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/fs.h | 3 - 3 files changed, 477 insertions(+), 479 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index d3428189f36b..f0877f1c0c49 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1832,476 +1832,3 @@ out1: out2: return ret; } - -static int remap_verify_area(struct file *file, loff_t pos, loff_t len, - bool write) -{ - struct inode *inode = file_inode(file); - - if (unlikely(pos < 0 || len < 0)) - return -EINVAL; - - if (unlikely((loff_t) (pos + len) < 0)) - return -EINVAL; - - if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - loff_t end = len ? pos + len - 1 : OFFSET_MAX; - int retval; - - retval = locks_mandatory_area(inode, file, pos, end, - write ? F_WRLCK : F_RDLCK); - if (retval < 0) - return retval; - } - - return security_file_permission(file, write ? MAY_WRITE : MAY_READ); -} -/* - * Ensure that we don't remap a partial EOF block in the middle of something - * else. Assume that the offsets have already been checked for block - * alignment. - * - * For clone we only link a partial EOF block above or at the destination file's - * EOF. For deduplication we accept a partial EOF block only if it ends at the - * destination file's EOF (can not link it into the middle of a file). - * - * Shorten the request if possible. - */ -static int generic_remap_check_len(struct inode *inode_in, - struct inode *inode_out, - loff_t pos_out, - loff_t *len, - unsigned int remap_flags) -{ - u64 blkmask = i_blocksize(inode_in) - 1; - loff_t new_len = *len; - - if ((*len & blkmask) == 0) - return 0; - - if (pos_out + *len < i_size_read(inode_out)) - new_len &= ~blkmask; - - if (new_len == *len) - return 0; - - if (remap_flags & REMAP_FILE_CAN_SHORTEN) { - *len = new_len; - return 0; - } - - return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; -} - -/* Read a page's worth of file data into the page cache. */ -static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) -{ - struct page *page; - - page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - return page; -} - -/* - * Lock two pages, ensuring that we lock in offset order if the pages are from - * the same file. - */ -static void vfs_lock_two_pages(struct page *page1, struct page *page2) -{ - /* Always lock in order of increasing index. */ - if (page1->index > page2->index) - swap(page1, page2); - - lock_page(page1); - if (page1 != page2) - lock_page(page2); -} - -/* Unlock two pages, being careful not to unlock the same page twice. */ -static void vfs_unlock_two_pages(struct page *page1, struct page *page2) -{ - unlock_page(page1); - if (page1 != page2) - unlock_page(page2); -} - -/* - * Compare extents of two files to see if they are the same. - * Caller must have locked both inodes to prevent write races. 
- */ -static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, - loff_t len, bool *is_same) -{ - loff_t src_poff; - loff_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - loff_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - if (cmp_len <= 0) - goto out_error; - - src_page = vfs_dedupe_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = vfs_dedupe_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - put_page(src_page); - goto out_error; - } - - vfs_lock_two_pages(src_page, dest_page); - - /* - * Now that we've locked both pages, make sure they're still - * mapped to the file data we're interested in. If not, - * someone is invalidating pages on us and we lose. - */ - if (!PageUptodate(src_page) || !PageUptodate(dest_page) || - src_page->mapping != src->i_mapping || - dest_page->mapping != dest->i_mapping) { - same = false; - goto unlock; - } - - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); - - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; - - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); -unlock: - vfs_unlock_two_pages(src_page, dest_page); - put_page(dest_page); - put_page(src_page); - - if (!same) - break; - - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; - return 0; - -out_error: - return error; -} - -/* - * Check that the two inodes are eligible for cloning, the ranges make - * sense, and then flush all dirty data. Caller must ensure that the - * inodes have been locked against any other modifications. - * - * If there's an error, then the usual negative error code is returned. - * Otherwise returns 0 with *len set to the request length. - */ -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - bool same_inode = (inode_in == inode_out); - int ret; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Don't reflink dirs, pipes, sockets... */ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - /* Zero length dedupe exits immediately; reflink goes to EOF. */ - if (*len == 0) { - loff_t isize = i_size_read(inode_in); - - if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) - return 0; - if (pos_in > isize) - return -EINVAL; - *len = isize - pos_in; - if (*len == 0) - return 0; - } - - /* Check that we don't violate system file offset limits. 
*/ - ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + *len - 1); - if (ret) - return ret; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + *len - 1); - if (ret) - return ret; - - /* - * Check that the extents are the same. - */ - if (remap_flags & REMAP_FILE_DEDUP) { - bool is_same = false; - - ret = vfs_dedupe_file_range_compare(inode_in, pos_in, - inode_out, pos_out, *len, &is_same); - if (ret) - return ret; - if (!is_same) - return -EBADE; - } - - ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); - if (ret) - return ret; - - /* If can't alter the file contents, we're done. */ - if (!(remap_flags & REMAP_FILE_DEDUP)) - ret = file_modified(file_out); - - return ret; -} -EXPORT_SYMBOL(generic_remap_file_range_prep); - -loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; - - WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); - - /* - * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on - * the same mount. Practically, they only need to be on the same file - * system. - */ - if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) - return -EXDEV; - - ret = generic_file_rw_checks(file_in, file_out); - if (ret < 0) - return ret; - - if (!file_in->f_op->remap_file_range) - return -EOPNOTSUPP; - - ret = remap_verify_area(file_in, pos_in, len, false); - if (ret) - return ret; - - ret = remap_verify_area(file_out, pos_out, len, true); - if (ret) - return ret; - - ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, len, remap_flags); - if (ret < 0) - return ret; - - fsnotify_access(file_in); - fsnotify_modify(file_out); - return ret; -} -EXPORT_SYMBOL(do_clone_file_range); - -loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; - - file_start_write(file_out); - ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, - remap_flags); - file_end_write(file_out); - - return ret; -} -EXPORT_SYMBOL(vfs_clone_file_range); - -/* Check whether we are allowed to dedupe the destination file */ -static bool allow_file_dedupe(struct file *file) -{ - if (capable(CAP_SYS_ADMIN)) - return true; - if (file->f_mode & FMODE_WRITE) - return true; - if (uid_eq(current_fsuid(), file_inode(file)->i_uid)) - return true; - if (!inode_permission(file_inode(file), MAY_WRITE)) - return true; - return false; -} - -loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, - struct file *dst_file, loff_t dst_pos, - loff_t len, unsigned int remap_flags) -{ - loff_t ret; - - WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | - REMAP_FILE_CAN_SHORTEN)); - - ret = mnt_want_write_file(dst_file); - if (ret) - return ret; - - ret = remap_verify_area(dst_file, dst_pos, len, true); - if (ret < 0) - goto out_drop_write; - - ret = -EPERM; - if (!allow_file_dedupe(dst_file)) - goto out_drop_write; - - ret = -EXDEV; - if (src_file->f_path.mnt != dst_file->f_path.mnt) - goto out_drop_write; - - ret = -EISDIR; - if (S_ISDIR(file_inode(dst_file)->i_mode)) - goto out_drop_write; - - ret = -EINVAL; - if 
(!dst_file->f_op->remap_file_range) - goto out_drop_write; - - if (len == 0) { - ret = 0; - goto out_drop_write; - } - - ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, - dst_pos, len, remap_flags | REMAP_FILE_DEDUP); -out_drop_write: - mnt_drop_write_file(dst_file); - - return ret; -} -EXPORT_SYMBOL(vfs_dedupe_file_range_one); - -int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) -{ - struct file_dedupe_range_info *info; - struct inode *src = file_inode(file); - u64 off; - u64 len; - int i; - int ret; - u16 count = same->dest_count; - loff_t deduped; - - if (!(file->f_mode & FMODE_READ)) - return -EINVAL; - - if (same->reserved1 || same->reserved2) - return -EINVAL; - - off = same->src_offset; - len = same->src_length; - - if (S_ISDIR(src->i_mode)) - return -EISDIR; - - if (!S_ISREG(src->i_mode)) - return -EINVAL; - - if (!file->f_op->remap_file_range) - return -EOPNOTSUPP; - - ret = remap_verify_area(file, off, len, false); - if (ret < 0) - return ret; - ret = 0; - - if (off + len > i_size_read(src)) - return -EINVAL; - - /* Arbitrary 1G limit on a single dedupe request, can be raised. */ - len = min_t(u64, len, 1 << 30); - - /* pre-format output fields to sane values */ - for (i = 0; i < count; i++) { - same->info[i].bytes_deduped = 0ULL; - same->info[i].status = FILE_DEDUPE_RANGE_SAME; - } - - for (i = 0, info = same->info; i < count; i++, info++) { - struct fd dst_fd = fdget(info->dest_fd); - struct file *dst_file = dst_fd.file; - - if (!dst_file) { - info->status = -EBADF; - goto next_loop; - } - - if (info->reserved) { - info->status = -EINVAL; - goto next_fdput; - } - - deduped = vfs_dedupe_file_range_one(file, off, dst_file, - info->dest_offset, len, - REMAP_FILE_CAN_SHORTEN); - if (deduped == -EBADE) - info->status = FILE_DEDUPE_RANGE_DIFFERS; - else if (deduped < 0) - info->status = deduped; - else - info->bytes_deduped = len; - -next_fdput: - fdput(dst_fd); -next_loop: - if (fatal_signal_pending(current)) - break; - } - return ret; -} -EXPORT_SYMBOL(vfs_dedupe_file_range); diff --git a/fs/remap_range.c b/fs/remap_range.c index f0a8c7065d5b..e6099beefa97 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -26,9 +26,9 @@ * Returns appropriate error code that caller should return or * zero in case the clone should be allowed. */ -int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) +static int generic_remap_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *req_count, unsigned int remap_flags) { struct inode *inode_in = file_in->f_mapping->host; struct inode *inode_out = file_out->f_mapping->host; @@ -95,3 +95,477 @@ int generic_remap_checks(struct file *file_in, loff_t pos_in, *req_count = count; return 0; } + +static int remap_verify_area(struct file *file, loff_t pos, loff_t len, + bool write) +{ + struct inode *inode = file_inode(file); + + if (unlikely(pos < 0 || len < 0)) + return -EINVAL; + + if (unlikely((loff_t) (pos + len) < 0)) + return -EINVAL; + + if (unlikely(inode->i_flctx && mandatory_lock(inode))) { + loff_t end = len ? pos + len - 1 : OFFSET_MAX; + int retval; + + retval = locks_mandatory_area(inode, file, pos, end, + write ? F_WRLCK : F_RDLCK); + if (retval < 0) + return retval; + } + + return security_file_permission(file, write ? MAY_WRITE : MAY_READ); +} + +/* + * Ensure that we don't remap a partial EOF block in the middle of something + * else. 
Assume that the offsets have already been checked for block + * alignment. + * + * For clone we only link a partial EOF block above or at the destination file's + * EOF. For deduplication we accept a partial EOF block only if it ends at the + * destination file's EOF (can not link it into the middle of a file). + * + * Shorten the request if possible. + */ +static int generic_remap_check_len(struct inode *inode_in, + struct inode *inode_out, + loff_t pos_out, + loff_t *len, + unsigned int remap_flags) +{ + u64 blkmask = i_blocksize(inode_in) - 1; + loff_t new_len = *len; + + if ((*len & blkmask) == 0) + return 0; + + if (pos_out + *len < i_size_read(inode_out)) + new_len &= ~blkmask; + + if (new_len == *len) + return 0; + + if (remap_flags & REMAP_FILE_CAN_SHORTEN) { + *len = new_len; + return 0; + } + + return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; +} + +/* Read a page's worth of file data into the page cache. */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct page *page; + + page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +/* + * Lock two pages, ensuring that we lock in offset order if the pages are from + * the same file. + */ +static void vfs_lock_two_pages(struct page *page1, struct page *page2) +{ + /* Always lock in order of increasing index. */ + if (page1->index > page2->index) + swap(page1, page2); + + lock_page(page1); + if (page1 != page2) + lock_page(page2); +} + +/* Unlock two pages, being careful not to unlock the same page twice. */ +static void vfs_unlock_two_pages(struct page *page1, struct page *page2) +{ + unlock_page(page1); + if (page1 != page2) + unlock_page(page2); +} + +/* + * Compare extents of two files to see if they are the same. + * Caller must have locked both inodes to prevent write races. + */ +static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + put_page(src_page); + goto out_error; + } + + vfs_lock_two_pages(src_page, dest_page); + + /* + * Now that we've locked both pages, make sure they're still + * mapped to the file data we're interested in. If not, + * someone is invalidating pages on us and we lose. 
+ */ + if (!PageUptodate(src_page) || !PageUptodate(dest_page) || + src_page->mapping != src->i_mapping || + dest_page->mapping != dest->i_mapping) { + same = false; + goto unlock; + } + + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); +unlock: + vfs_unlock_two_pages(src_page, dest_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} + +/* + * Check that the two inodes are eligible for cloning, the ranges make + * sense, and then flush all dirty data. Caller must ensure that the + * inodes have been locked against any other modifications. + * + * If there's an error, then the usual negative error code is returned. + * Otherwise returns 0 with *len set to the request length. + */ +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + bool same_inode = (inode_in == inode_out); + int ret; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Don't reflink dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + /* Zero length dedupe exits immediately; reflink goes to EOF. */ + if (*len == 0) { + loff_t isize = i_size_read(inode_in); + + if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) + return 0; + if (pos_in > isize) + return -EINVAL; + *len = isize - pos_in; + if (*len == 0) + return 0; + } + + /* Check that we don't violate system file offset limits. */ + ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, + remap_flags); + if (ret) + return ret; + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + ret = filemap_write_and_wait_range(inode_in->i_mapping, + pos_in, pos_in + *len - 1); + if (ret) + return ret; + + ret = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + *len - 1); + if (ret) + return ret; + + /* + * Check that the extents are the same. + */ + if (remap_flags & REMAP_FILE_DEDUP) { + bool is_same = false; + + ret = vfs_dedupe_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len, &is_same); + if (ret) + return ret; + if (!is_same) + return -EBADE; + } + + ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, + remap_flags); + if (ret) + return ret; + + /* If can't alter the file contents, we're done. */ + if (!(remap_flags & REMAP_FILE_DEDUP)) + ret = file_modified(file_out); + + return ret; +} +EXPORT_SYMBOL(generic_remap_file_range_prep); + +loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) +{ + loff_t ret; + + WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); + + /* + * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on + * the same mount. 
Practically, they only need to be on the same file + * system. + */ + if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) + return -EXDEV; + + ret = generic_file_rw_checks(file_in, file_out); + if (ret < 0) + return ret; + + if (!file_in->f_op->remap_file_range) + return -EOPNOTSUPP; + + ret = remap_verify_area(file_in, pos_in, len, false); + if (ret) + return ret; + + ret = remap_verify_area(file_out, pos_out, len, true); + if (ret) + return ret; + + ret = file_in->f_op->remap_file_range(file_in, pos_in, + file_out, pos_out, len, remap_flags); + if (ret < 0) + return ret; + + fsnotify_access(file_in); + fsnotify_modify(file_out); + return ret; +} +EXPORT_SYMBOL(do_clone_file_range); + +loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t len, unsigned int remap_flags) +{ + loff_t ret; + + file_start_write(file_out); + ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len, + remap_flags); + file_end_write(file_out); + + return ret; +} +EXPORT_SYMBOL(vfs_clone_file_range); + +/* Check whether we are allowed to dedupe the destination file */ +static bool allow_file_dedupe(struct file *file) +{ + if (capable(CAP_SYS_ADMIN)) + return true; + if (file->f_mode & FMODE_WRITE) + return true; + if (uid_eq(current_fsuid(), file_inode(file)->i_uid)) + return true; + if (!inode_permission(file_inode(file), MAY_WRITE)) + return true; + return false; +} + +loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct file *dst_file, loff_t dst_pos, + loff_t len, unsigned int remap_flags) +{ + loff_t ret; + + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | + REMAP_FILE_CAN_SHORTEN)); + + ret = mnt_want_write_file(dst_file); + if (ret) + return ret; + + ret = remap_verify_area(dst_file, dst_pos, len, true); + if (ret < 0) + goto out_drop_write; + + ret = -EPERM; + if (!allow_file_dedupe(dst_file)) + goto out_drop_write; + + ret = -EXDEV; + if (src_file->f_path.mnt != dst_file->f_path.mnt) + goto out_drop_write; + + ret = -EISDIR; + if (S_ISDIR(file_inode(dst_file)->i_mode)) + goto out_drop_write; + + ret = -EINVAL; + if (!dst_file->f_op->remap_file_range) + goto out_drop_write; + + if (len == 0) { + ret = 0; + goto out_drop_write; + } + + ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, + dst_pos, len, remap_flags | REMAP_FILE_DEDUP); +out_drop_write: + mnt_drop_write_file(dst_file); + + return ret; +} +EXPORT_SYMBOL(vfs_dedupe_file_range_one); + +int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) +{ + struct file_dedupe_range_info *info; + struct inode *src = file_inode(file); + u64 off; + u64 len; + int i; + int ret; + u16 count = same->dest_count; + loff_t deduped; + + if (!(file->f_mode & FMODE_READ)) + return -EINVAL; + + if (same->reserved1 || same->reserved2) + return -EINVAL; + + off = same->src_offset; + len = same->src_length; + + if (S_ISDIR(src->i_mode)) + return -EISDIR; + + if (!S_ISREG(src->i_mode)) + return -EINVAL; + + if (!file->f_op->remap_file_range) + return -EOPNOTSUPP; + + ret = remap_verify_area(file, off, len, false); + if (ret < 0) + return ret; + ret = 0; + + if (off + len > i_size_read(src)) + return -EINVAL; + + /* Arbitrary 1G limit on a single dedupe request, can be raised. 
*/ + len = min_t(u64, len, 1 << 30); + + /* pre-format output fields to sane values */ + for (i = 0; i < count; i++) { + same->info[i].bytes_deduped = 0ULL; + same->info[i].status = FILE_DEDUPE_RANGE_SAME; + } + + for (i = 0, info = same->info; i < count; i++, info++) { + struct fd dst_fd = fdget(info->dest_fd); + struct file *dst_file = dst_fd.file; + + if (!dst_file) { + info->status = -EBADF; + goto next_loop; + } + + if (info->reserved) { + info->status = -EINVAL; + goto next_fdput; + } + + deduped = vfs_dedupe_file_range_one(file, off, dst_file, + info->dest_offset, len, + REMAP_FILE_CAN_SHORTEN); + if (deduped == -EBADE) + info->status = FILE_DEDUPE_RANGE_DIFFERS; + else if (deduped < 0) + info->status = deduped; + else + info->bytes_deduped = len; + +next_fdput: + fdput(dst_fd); +next_loop: + if (fatal_signal_pending(current)) + break; + } + return ret; +} +EXPORT_SYMBOL(vfs_dedupe_file_range); diff --git a/include/linux/fs.h b/include/linux/fs.h index eea754a8dd67..073da53b59b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3009,9 +3009,6 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); -extern int generic_remap_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *count, unsigned int remap_flags); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); From 407e9c63ee571f44a2dfb0828fc30daa02abb6dc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 15 Oct 2020 09:21:17 -0700 Subject: [PATCH 316/497] vfs: move the generic write and copy checks out of mm The generic write check helpers also don't have much to do with the page cache, so move them to the vfs. Signed-off-by: Darrick J. Wong --- fs/read_write.c | 143 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 3 - mm/filemap.c | 143 --------------------------------------------- 3 files changed, 143 insertions(+), 146 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index f0877f1c0c49..016444255d3e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1701,6 +1701,59 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in, flags); } +/* + * Performs necessary checks before doing a file copy + * + * Can adjust amount of bytes to copy via @req_count argument. + * Returns appropriate error code that caller should return or + * zero in case the copy should be allowed. + */ +static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t *req_count, unsigned int flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + uint64_t count = *req_count; + loff_t size_in; + int ret; + + ret = generic_file_rw_checks(file_in, file_out); + if (ret) + return ret; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Ensure offsets don't wrap. 
*/ + if (pos_in + count < pos_in || pos_out + count < pos_out) + return -EOVERFLOW; + + /* Shorten the copy to EOF */ + size_in = i_size_read(inode_in); + if (pos_in >= size_in) + count = 0; + else + count = min(count, size_in - (uint64_t)pos_in); + + ret = generic_write_check_limits(file_out, pos_out, &count); + if (ret) + return ret; + + /* Don't allow overlapped copying within the same file. */ + if (inode_in == inode_out && + pos_out + count > pos_in && + pos_out < pos_in + count) + return -EINVAL; + + *req_count = count; + return 0; +} + /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to @@ -1832,3 +1885,93 @@ out1: out2: return ret; } + +/* + * Don't operate on ranges the page cache doesn't support, and don't exceed the + * LFS limits. If pos is under the limit it becomes a short access. If it + * exceeds the limit we return -EFBIG. + */ +int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) +{ + struct inode *inode = file->f_mapping->host; + loff_t max_size = inode->i_sb->s_maxbytes; + loff_t limit = rlimit(RLIMIT_FSIZE); + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + *count = min(*count, limit - pos); + } + + if (!(file->f_flags & O_LARGEFILE)) + max_size = MAX_NON_LFS; + + if (unlikely(pos >= max_size)) + return -EFBIG; + + *count = min(*count, max_size - pos); + + return 0; +} + +/* + * Performs necessary checks before doing a write + * + * Can adjust writing position or amount of bytes to write. + * Returns appropriate error code that caller should return or + * zero in case that write should be allowed. + */ +ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + loff_t count; + int ret; + + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + if (!iov_iter_count(from)) + return 0; + + /* FIXME: this is for backwards compatibility with 2.4 */ + if (iocb->ki_flags & IOCB_APPEND) + iocb->ki_pos = i_size_read(inode); + + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + + count = iov_iter_count(from); + ret = generic_write_check_limits(file, iocb->ki_pos, &count); + if (ret) + return ret; + + iov_iter_truncate(from, count); + return iov_iter_count(from); +} +EXPORT_SYMBOL(generic_write_checks); + +/* + * Performs common checks before doing a file copy/clone + * from @file_in to @file_out. + */ +int generic_file_rw_checks(struct file *file_in, struct file *file_out) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + + /* Don't copy dirs, pipes, sockets... 
*/ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + if (!(file_in->f_mode & FMODE_READ) || + !(file_out->f_mode & FMODE_WRITE) || + (file_out->f_flags & O_APPEND)) + return -EBADF; + + return 0; +} diff --git a/include/linux/fs.h b/include/linux/fs.h index 073da53b59b0..8fb063ab7d50 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3012,9 +3012,6 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); -extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t *count, unsigned int flags); extern ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/filemap.c b/mm/filemap.c index cf20e5aeb11b..9962fd682f20 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3093,149 +3093,6 @@ struct page *read_cache_page_gfp(struct address_space *mapping, } EXPORT_SYMBOL(read_cache_page_gfp); -/* - * Don't operate on ranges the page cache doesn't support, and don't exceed the - * LFS limits. If pos is under the limit it becomes a short access. If it - * exceeds the limit we return -EFBIG. - */ -int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) -{ - struct inode *inode = file->f_mapping->host; - loff_t max_size = inode->i_sb->s_maxbytes; - loff_t limit = rlimit(RLIMIT_FSIZE); - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - *count = min(*count, limit - pos); - } - - if (!(file->f_flags & O_LARGEFILE)) - max_size = MAX_NON_LFS; - - if (unlikely(pos >= max_size)) - return -EFBIG; - - *count = min(*count, max_size - pos); - - return 0; -} - -/* - * Performs necessary checks before doing a write - * - * Can adjust writing position or amount of bytes to write. - * Returns appropriate error code that caller should return or - * zero in case that write should be allowed. - */ -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - loff_t count; - int ret; - - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - - if (!iov_iter_count(from)) - return 0; - - /* FIXME: this is for backwards compatibility with 2.4 */ - if (iocb->ki_flags & IOCB_APPEND) - iocb->ki_pos = i_size_read(inode); - - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) - return -EINVAL; - - count = iov_iter_count(from); - ret = generic_write_check_limits(file, iocb->ki_pos, &count); - if (ret) - return ret; - - iov_iter_truncate(from, count); - return iov_iter_count(from); -} -EXPORT_SYMBOL(generic_write_checks); - -/* - * Performs common checks before doing a file copy/clone - * from @file_in to @file_out. - */ -int generic_file_rw_checks(struct file *file_in, struct file *file_out) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - - /* Don't copy dirs, pipes, sockets... 
*/ - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - return -EISDIR; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - return -EINVAL; - - if (!(file_in->f_mode & FMODE_READ) || - !(file_out->f_mode & FMODE_WRITE) || - (file_out->f_flags & O_APPEND)) - return -EBADF; - - return 0; -} - -/* - * Performs necessary checks before doing a file copy - * - * Can adjust amount of bytes to copy via @req_count argument. - * Returns appropriate error code that caller should return or - * zero in case the copy should be allowed. - */ -int generic_copy_file_checks(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t *req_count, unsigned int flags) -{ - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - uint64_t count = *req_count; - loff_t size_in; - int ret; - - ret = generic_file_rw_checks(file_in, file_out); - if (ret) - return ret; - - /* Don't touch certain kinds of inodes */ - if (IS_IMMUTABLE(inode_out)) - return -EPERM; - - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - return -ETXTBSY; - - /* Ensure offsets don't wrap. */ - if (pos_in + count < pos_in || pos_out + count < pos_out) - return -EOVERFLOW; - - /* Shorten the copy to EOF */ - size_in = i_size_read(inode_in); - if (pos_in >= size_in) - count = 0; - else - count = min(count, size_in - (uint64_t)pos_in); - - ret = generic_write_check_limits(file_out, pos_out, &count); - if (ret) - return ret; - - /* Don't allow overlapped copying within the same file. */ - if (inode_in == inode_out && - pos_out + count > pos_in && - pos_out < pos_in + count) - return -EINVAL; - - *req_count = count; - return 0; -} - int pagecache_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) From 8e670f77c4a55013db6d23b962f9bf6673a5e7b6 Mon Sep 17 00:00:00 2001 From: Rohith Surabattula Date: Fri, 18 Sep 2020 05:37:28 +0000 Subject: [PATCH 317/497] Handle STATUS_IO_TIMEOUT gracefully Currently STATUS_IO_TIMEOUT is not treated as retriable error. It is currently mapped to ETIMEDOUT and returned to userspace for most system calls. STATUS_IO_TIMEOUT is returned by server in case of unavailability or throttling errors. This patch will map the STATUS_IO_TIMEOUT to EAGAIN, so that it can be retried. Also, added a check to drop the connection to not overload the server in case of ongoing unavailability. 
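With the error now mapped to -EAGAIN, the existing cifs retry paths can treat
it as transient rather than fatal. As a rough illustrative sketch (not part of
the diff below, and using a made-up function name), the retriable-error test
those paths rely on - the same idea the cifs_setattr() change later in this
series uses via is_retryable_error() - has this shape:

	/*
	 * Illustrative only: classify an error as transient so callers
	 * retry instead of failing. -EAGAIN now covers STATUS_IO_TIMEOUT.
	 */
	static bool is_retryable_error_sketch(int error)
	{
		return error == -EINTR || error == -EAGAIN;
	}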
Signed-off-by: Rohith Surabattula Reviewed-by: Aurelien Aptel Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 ++ fs/cifs/connect.c | 15 ++++++++++++++- fs/cifs/smb2maperror.c | 2 +- fs/cifs/smb2ops.c | 15 +++++++++++++++ 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b565d83ba89e..5a491afafacc 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -510,6 +510,8 @@ struct smb_version_operations { struct fiemap_extent_info *, u64, u64); /* version specific llseek implementation */ loff_t (*llseek)(struct file *, struct cifs_tcon *, loff_t, int); + /* Check for STATUS_IO_TIMEOUT */ + bool (*is_status_io_timeout)(char *buf); }; struct smb_version_values { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a5731dd6e656..1a3b7793095e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -69,6 +69,9 @@ extern bool disable_legacy_dialects; #define TLINK_ERROR_EXPIRE (1 * HZ) #define TLINK_IDLE_EXPIRE (600 * HZ) +/* Drop the connection to not overload the server */ +#define NUM_STATUS_IO_TIMEOUT 5 + enum { /* Mount options that take no arguments */ Opt_user_xattr, Opt_nouser_xattr, @@ -1117,7 +1120,7 @@ cifs_demultiplex_thread(void *p) struct task_struct *task_to_wake = NULL; struct mid_q_entry *mids[MAX_COMPOUND]; char *bufs[MAX_COMPOUND]; - unsigned int noreclaim_flag; + unsigned int noreclaim_flag, num_io_timeout = 0; noreclaim_flag = memalloc_noreclaim_save(); cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current)); @@ -1213,6 +1216,16 @@ next_pdu: continue; } + if (server->ops->is_status_io_timeout && + server->ops->is_status_io_timeout(buf)) { + num_io_timeout++; + if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) { + cifs_reconnect(server); + num_io_timeout = 0; + continue; + } + } + server->lstrp = jiffies; for (i = 0; i < num_mids; i++) { diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 7fde3775cb57..b004cf87692a 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -488,7 +488,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_PIPE_CONNECTED, -EIO, "STATUS_PIPE_CONNECTED"}, {STATUS_PIPE_LISTENING, -EIO, "STATUS_PIPE_LISTENING"}, {STATUS_INVALID_READ_MODE, -EIO, "STATUS_INVALID_READ_MODE"}, - {STATUS_IO_TIMEOUT, -ETIMEDOUT, "STATUS_IO_TIMEOUT"}, + {STATUS_IO_TIMEOUT, -EAGAIN, "STATUS_IO_TIMEOUT"}, {STATUS_FILE_FORCED_CLOSED, -EIO, "STATUS_FILE_FORCED_CLOSED"}, {STATUS_PROFILING_NOT_STARTED, -EIO, "STATUS_PROFILING_NOT_STARTED"}, {STATUS_PROFILING_NOT_STOPPED, -EIO, "STATUS_PROFILING_NOT_STOPPED"}, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 5f3b6e77b2a7..76d82a60a550 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2354,6 +2354,17 @@ smb2_is_session_expired(char *buf) return true; } +static bool +smb2_is_status_io_timeout(char *buf) +{ + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + + if (shdr->Status == STATUS_IO_TIMEOUT) + return true; + else + return false; +} + static int smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, struct cifsInodeInfo *cinode) @@ -4817,6 +4828,7 @@ struct smb_version_operations smb20_operations = { .make_node = smb2_make_node, .fiemap = smb3_fiemap, .llseek = smb3_llseek, + .is_status_io_timeout = smb2_is_status_io_timeout, }; struct smb_version_operations smb21_operations = { @@ -4917,6 +4929,7 @@ struct smb_version_operations smb21_operations = { .make_node = smb2_make_node, .fiemap = smb3_fiemap, .llseek = smb3_llseek, + 
.is_status_io_timeout = smb2_is_status_io_timeout, }; struct smb_version_operations smb30_operations = { @@ -5027,6 +5040,7 @@ struct smb_version_operations smb30_operations = { .make_node = smb2_make_node, .fiemap = smb3_fiemap, .llseek = smb3_llseek, + .is_status_io_timeout = smb2_is_status_io_timeout, }; struct smb_version_operations smb311_operations = { @@ -5138,6 +5152,7 @@ struct smb_version_operations smb311_operations = { .make_node = smb2_make_node, .fiemap = smb3_fiemap, .llseek = smb3_llseek, + .is_status_io_timeout = smb2_is_status_io_timeout, }; struct smb_version_values smb20_values = { From c6cc4c5a72505a0ecefc9b413f16bec512f38078 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 9 Oct 2020 09:32:56 +1000 Subject: [PATCH 318/497] cifs: handle -EINTR in cifs_setattr RHBZ: 1848178 Some calls that set attributes, like utimensat(), are not supposed to return -EINTR and thus do not have handlers for this in glibc which causes us to leak -EINTR to the applications which are also unprepared to handle it. For example tar will break if utimensat() return -EINTR and abort unpacking the archive. Other applications may break too. To handle this we add checks, and retry, for -EINTR in cifs_setattr() Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 1f75b25e559a..daec31be8571 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2883,13 +2883,18 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); + int rc, retries = 0; - if (pTcon->unix_ext) - return cifs_setattr_unix(direntry, attrs); - - return cifs_setattr_nounix(direntry, attrs); + do { + if (pTcon->unix_ext) + rc = cifs_setattr_unix(direntry, attrs); + else + rc = cifs_setattr_nounix(direntry, attrs); + retries++; + } while (is_retryable_error(rc) && retries < 2); /* BB: add cifs_setattr_legacy for really old servers */ + return rc; } #if 0 From 682955491a0d89ed7993d7b7f5051c858634ee70 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 10 Oct 2020 20:11:47 -0500 Subject: [PATCH 319/497] SMB3.1.1: add defines for new signing negotiate context Currently there are three supported signing algorithms Reviewed-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/smb2pdu.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 4dfb51dd7065..5932fc0dc62c 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -323,6 +323,7 @@ struct smb2_negotiate_req { #define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5) #define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6) #define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7) +#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8) #define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) struct smb2_neg_context { @@ -416,6 +417,19 @@ struct smb2_rdma_transform_capabilities_context { __le16 RDMATransformIds[1]; } __packed; +/* Signing algorithms */ +#define SIGNING_ALG_HMAC_SHA256 0 +#define SIGNING_ALG_AES_CMAC 1 +#define SIGNING_ALG_AES_GMAC 2 + +struct smb2_signing_capabilities { + __le16 ContextType; /* 8 */ + __le16 DataLength; + __u32 Reserved; + __le16 SigningAlgorithmCount; + __le16 SigningAlgorithms[]; +} __packed; + #define POSIX_CTXT_DATA_LEN 16 struct smb2_posix_neg_context { __le16 ContextType; /* 0x100 */ From 330857a5d8224fa073bcb88d78e5be51f0848a75 
Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Mon, 24 Feb 2020 14:15:09 +0100 Subject: [PATCH 320/497] cifs: map STATUS_ACCOUNT_LOCKED_OUT to -EACCES This is basically the same as STATUS_LOGON_FAILURE, but after the account is locked out. Signed-off-by: Stefan Metzmacher Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2maperror.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index b004cf87692a..c775682ee973 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -814,7 +814,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_INVALID_VARIANT, -EIO, "STATUS_INVALID_VARIANT"}, {STATUS_DOMAIN_CONTROLLER_NOT_FOUND, -EIO, "STATUS_DOMAIN_CONTROLLER_NOT_FOUND"}, - {STATUS_ACCOUNT_LOCKED_OUT, -EIO, "STATUS_ACCOUNT_LOCKED_OUT"}, + {STATUS_ACCOUNT_LOCKED_OUT, -EACCES, "STATUS_ACCOUNT_LOCKED_OUT"}, {STATUS_HANDLE_NOT_CLOSABLE, -EIO, "STATUS_HANDLE_NOT_CLOSABLE"}, {STATUS_CONNECTION_REFUSED, -EIO, "STATUS_CONNECTION_REFUSED"}, {STATUS_GRACEFUL_DISCONNECT, -EIO, "STATUS_GRACEFUL_DISCONNECT"}, From fbfd0b46afa9a8b50a061b0f28204fc94c7bddcf Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 11 Sep 2020 16:19:28 -0500 Subject: [PATCH 321/497] smb3.1.1: add new module load parm require_gcm_256 Add new module load parameter require_gcm_256. If set, then only request AES-256-GCM (strongest encryption type). Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 4 ++++ fs/cifs/cifsglob.h | 1 + fs/cifs/smb2pdu.c | 14 ++++++++++---- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 0fb99d25e8a8..462dbbd17c5f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -71,6 +71,7 @@ bool enable_oplocks = true; bool linuxExtEnabled = true; bool lookupCacheEnabled = true; bool disable_legacy_dialects; /* false by default */ +bool require_gcm_256; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ unsigned int sign_CIFS_PDUs = 1; @@ -104,6 +105,9 @@ MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait " module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); +module_param(require_gcm_256, bool, 0644); +MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. 
Default: n/N/0"); + module_param(disable_legacy_dialects, bool, 0644); MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be " "helpful to restrict the ability to " diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 5a491afafacc..ec21af833749 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1956,6 +1956,7 @@ extern bool lookupCacheEnabled; extern unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ extern unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ +extern bool require_gcm_256; /* require use of strongest signing (aes-gcm-256) */ extern bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ extern unsigned int CIFSMaxBufSize; /* max size not including hdr */ extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 96c172d94fba..fcae1e3dfcc5 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -449,10 +449,16 @@ static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) { pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES; - pneg_ctxt->DataLength = cpu_to_le16(6); /* Cipher Count + two ciphers */ - pneg_ctxt->CipherCount = cpu_to_le16(2); - pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; - pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; + if (require_gcm_256) { + pneg_ctxt->DataLength = cpu_to_le16(4); /* Cipher Count + 1 cipher */ + pneg_ctxt->CipherCount = cpu_to_le16(1); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES256_GCM; + } else { + pneg_ctxt->DataLength = cpu_to_le16(6); /* Cipher Count + 2 ciphers */ + pneg_ctxt->CipherCount = cpu_to_le16(2); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; + pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; + } } static unsigned int From 29e279230413cdd5e00fb5d269cae1099184ab85 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 14 Oct 2020 20:24:09 -0500 Subject: [PATCH 322/497] smb3.1.1: add new module load parm enable_gcm_256 Add new module load parameter enable_gcm_256. If set, then add AES-256-GCM (strongest encryption type) to the list of encryption types requested. Put it in the list as the second choice (since AES-128-GCM is faster and much more broadly supported by SMB3 servers). To make this stronger encryption type, GCM-256, required (the first and only choice, you would use module parameter "require_gcm_256." Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 4 ++++ fs/cifs/cifsglob.h | 1 + fs/cifs/smb2pdu.c | 6 ++++++ fs/cifs/smb2pdu.h | 5 +++-- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 462dbbd17c5f..472cb7777e3e 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -71,6 +71,7 @@ bool enable_oplocks = true; bool linuxExtEnabled = true; bool lookupCacheEnabled = true; bool disable_legacy_dialects; /* false by default */ +bool enable_gcm_256; /* false by default, change when more servers support it */ bool require_gcm_256; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ @@ -105,6 +106,9 @@ MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait " module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); +module_param(enable_gcm_256, bool, 0644); +MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. 
Default: n/N/0"); + module_param(require_gcm_256, bool, 0644); MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0"); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ec21af833749..a1a1a16acb38 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1956,6 +1956,7 @@ extern bool lookupCacheEnabled; extern unsigned int global_secflags; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ extern unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ +extern bool enable_gcm_256; /* allow optional negotiate of strongest signing (aes-gcm-256) */ extern bool require_gcm_256; /* require use of strongest signing (aes-gcm-256) */ extern bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ extern unsigned int CIFSMaxBufSize; /* max size not including hdr */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index fcae1e3dfcc5..8cfc3122ae5c 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -453,6 +453,12 @@ build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) pneg_ctxt->DataLength = cpu_to_le16(4); /* Cipher Count + 1 cipher */ pneg_ctxt->CipherCount = cpu_to_le16(1); pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES256_GCM; + } else if (enable_gcm_256) { + pneg_ctxt->DataLength = cpu_to_le16(8); /* Cipher Count + 3 ciphers */ + pneg_ctxt->CipherCount = cpu_to_le16(3); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; + pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES256_GCM; + pneg_ctxt->Ciphers[2] = SMB2_ENCRYPTION_AES128_CCM; } else { pneg_ctxt->DataLength = cpu_to_le16(6); /* Cipher Count + 2 ciphers */ pneg_ctxt->CipherCount = cpu_to_le16(2); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 5932fc0dc62c..6f65f1cec8ad 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -361,8 +361,9 @@ struct smb2_encryption_neg_context { __le16 ContextType; /* 2 */ __le16 DataLength; __le32 Reserved; - __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */ - __le16 Ciphers[2]; + /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */ + __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */ + __le16 Ciphers[3]; } __packed; /* See MS-SMB2 2.2.3.1.3 */ From 97148d0ae5303bcc18fcd1c9b968a9485292f32a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 13 Oct 2020 10:42:47 +0530 Subject: [PATCH 323/497] cpufreq: Improve code around unlisted freq check The cpufreq core checks if the frequency programmed by the bootloaders is not listed in the freq table and programs one from the table in such a case. This is done only if the driver has set the CPUFREQ_NEED_INITIAL_FREQ_CHECK flag. Currently we print two separate messages, with almost the same content, and do this with a pr_warn() which may be a bit too much as the driver only asked us to check this as it expected this to be the case. Lower down the severity of the print message by switching to pr_info() instead and print a single message only. Reported-by: Sumit Gupta Signed-off-by: Viresh Kumar Reviewed-by: Sumit Gupta Tested-by: Sumit Gupta Signed-off-by: Rafael J. 
Wysocki
---
 drivers/cpufreq/cpufreq.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 1877f5e2e5b0..f4b60663efe6 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1454,14 +1454,13 @@ static int cpufreq_online(unsigned int cpu)
 */
 if ((cpufreq_driver->flags & CPUFREQ_NEED_INITIAL_FREQ_CHECK) && has_target()) {
+ unsigned int old_freq = policy->cur;
+
 /* Are we running at unknown frequency ? */
- ret = cpufreq_frequency_table_get_index(policy, policy->cur);
+ ret = cpufreq_frequency_table_get_index(policy, old_freq);
 if (ret == -EINVAL) {
- /* Warn user and fix it */
- pr_warn("%s: CPU%d: Running at unlisted freq: %u KHz\n",
- __func__, policy->cpu, policy->cur);
- ret = __cpufreq_driver_target(policy, policy->cur - 1,
- CPUFREQ_RELATION_L);
+ ret = __cpufreq_driver_target(policy, old_freq - 1,
+ CPUFREQ_RELATION_L);

 /*
 * Reaching here after boot in a few seconds may not
 * frequency for longer duration. Hence, a BUG_ON().
 */
 BUG_ON(ret);
- pr_warn("%s: CPU%d: Unlisted initial frequency changed to: %u KHz\n",
- __func__, policy->cpu, policy->cur);
+ pr_info("%s: CPU%d: Running at unlisted initial frequency: %u KHz, changing to: %u KHz\n",
+ __func__, policy->cpu, old_freq, policy->cur);
 }
 }

From cdc1719cd885ef490e30c14c01a6e7fee42bf2e2 Mon Sep 17 00:00:00 2001
From: Chen Yu
Date: Fri, 9 Oct 2020 11:30:38 +0800
Subject: [PATCH 324/497] cpufreq: intel_pstate: Delete intel_pstate sysfs if failed to register the driver

There is a corner case where, if the intel_pstate driver fails to be
registered (might be due to invalid MSR access) and acpi_cpufreq takes over,
the intel_pstate sysfs interface is still populated, which may confuse user
space (turbostat for example):

grep . /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver
acpi-cpufreq

grep . /sys/devices/system/cpu/intel_pstate/*
/sys/devices/system/cpu/intel_pstate/max_perf_pct:0
/sys/devices/system/cpu/intel_pstate/min_perf_pct:0
grep: /sys/devices/system/cpu/intel_pstate/no_turbo: Resource temporarily unavailable
grep: /sys/devices/system/cpu/intel_pstate/num_pstates: Resource temporarily unavailable
/sys/devices/system/cpu/intel_pstate/status:off
grep: /sys/devices/system/cpu/intel_pstate/turbo_pct: Resource temporarily unavailable

The mere presence of the intel_pstate sysfs interface does not mean that
intel_pstate is in use (for example, echo "off" to "status"), but it should
not be created in the failing case.

Fix this issue by deleting the intel_pstate sysfs if the driver registration
fails.
Reported-by: Wendy Wang Suggested-by: Zhang Rui Signed-off-by: Chen Yu Acked-by: Srinivas Pandruvada --- drivers/cpufreq/intel_pstate.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 9a515c460a00..3c1455518738 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1420,6 +1420,24 @@ static void __init intel_pstate_sysfs_expose_params(void) } } +static void __init intel_pstate_sysfs_remove(void) +{ + if (!intel_pstate_kobject) + return; + + sysfs_remove_group(intel_pstate_kobject, &intel_pstate_attr_group); + + if (!per_cpu_limits) { + sysfs_remove_file(intel_pstate_kobject, &max_perf_pct.attr); + sysfs_remove_file(intel_pstate_kobject, &min_perf_pct.attr); + + if (x86_match_cpu(intel_pstate_cpu_ee_disable_ids)) + sysfs_remove_file(intel_pstate_kobject, &energy_efficiency.attr); + } + + kobject_put(intel_pstate_kobject); +} + static void intel_pstate_sysfs_expose_hwp_dynamic_boost(void) { int rc; @@ -3063,8 +3081,10 @@ hwp_cpu_matched: mutex_lock(&intel_pstate_driver_lock); rc = intel_pstate_register_driver(default_driver); mutex_unlock(&intel_pstate_driver_lock); - if (rc) + if (rc) { + intel_pstate_sysfs_remove(); return rc; + } if (hwp_active) { const struct x86_cpu_id *id; From e05783346441ce03f5b422247ca571613360218d Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 1 Oct 2020 11:28:52 +0200 Subject: [PATCH 325/497] MAINTAINERS: Add section for cpuidle-psci PM domain The cpuidle-psci-domain.c is not listed in the section for the cpuidle driver for ARM PSCI. From discussions at LKML, Lorenzo and Sudeep prefer to add a separate section for it, so do that and add myself as the maintainer for that part. Signed-off-by: Ulf Hansson Acked-by: Lorenzo Pieralisi Acked-by: Sudeep Holla [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8c5556df75a8..f13dcd1441e4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4588,6 +4588,14 @@ L: linux-arm-kernel@lists.infradead.org S: Supported F: drivers/cpuidle/cpuidle-psci.c +CPUIDLE DRIVER - ARM PSCI PM DOMAIN +M: Ulf Hansson +L: linux-pm@vger.kernel.org +L: linux-arm-kernel@lists.infradead.org +S: Supported +F: drivers/cpuidle/cpuidle-psci.h +F: drivers/cpuidle/cpuidle-psci-domain.c + CRAMFS FILESYSTEM M: Nicolas Pitre S: Maintained From 8bb2e2a887afdf8a39e68fa0dccf82a168aae655 Mon Sep 17 00:00:00 2001 From: Alexander Monakov Date: Mon, 12 Oct 2020 15:50:33 +0300 Subject: [PATCH 326/497] intel_idle: mention assumption that WBINVD is not needed Intel SDM does not explicitly say that entering a C-state via MWAIT will implicitly flush CPU caches as appropriate for that C-state. However, documentation for individual Intel CPU generations does mention this behavior. Since intel_idle binds to any Intel CPU with MWAIT, list this assumption of MWAIT behavior. In passing, reword opening comment to make it clear that the driver can load on any old and future Intel CPU with MWAIT. Signed-off-by: Alexander Monakov Signed-off-by: Rafael J. 
Wysocki --- drivers/idle/intel_idle.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 9a810e4a7946..bf3dd4a19fef 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -8,7 +8,7 @@ */ /* - * intel_idle is a cpuidle driver that loads on specific Intel processors + * intel_idle is a cpuidle driver that loads on all Intel CPUs with MWAIT * in lieu of the legacy ACPI processor_idle driver. The intent is to * make Linux more efficient on these processors, as intel_idle knows * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs. @@ -20,7 +20,11 @@ * All CPUs have same idle states as boot CPU * * Chipset BM_STS (bus master status) bit is a NOP - * for preventing entry into deep C-stats + * for preventing entry into deep C-states + * + * CPU will flush caches as needed when entering a C-state via MWAIT + * (in contrast to entering ACPI C3, in which case the WBINVD + * instruction needs to be executed to flush the caches) */ /* From bae314dd5d8dfdd90ee584003a0f8c06e1bf3ea2 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 15 Oct 2020 16:44:27 +0200 Subject: [PATCH 327/497] cpuidle: Remove pointless stub The cpuidle.h header is declaring a function with an empty stub for the cpuidle disabled case, but that function is only called by cpuidle governors which depend on cpuidle anyway. In other words, the function is only called when cpuidle is enabled, so there is no need for the stub. Remove the pointless stub. Signed-off-by: Daniel Lezcano [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- include/linux/cpuidle.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index ed0da0e58e8b..bd605b5585cf 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -271,13 +271,8 @@ struct cpuidle_governor { void (*reflect) (struct cpuidle_device *dev, int index); }; -#ifdef CONFIG_CPU_IDLE extern int cpuidle_register_governor(struct cpuidle_governor *gov); extern s64 cpuidle_governor_latency_req(unsigned int cpu); -#else -static inline int cpuidle_register_governor(struct cpuidle_governor *gov) -{return 0;} -#endif #define __CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, \ idx, \ From 75af76d0a34e048651a6af311781d7206b6964c7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 16 Oct 2020 17:28:32 +0200 Subject: [PATCH 328/497] intel_idle: Ignore _CST if control cannot be taken from the platform e6d4f08a6776 ("intel_idle: Use ACPI _CST on server systems") avoids enabling c-states that have been disabled by the platform with the exception of C1E. Unfortunately, BIOS implementations are not always consistent in terms of how capabilities are advertised and control cannot always be handed over. If control cannot be handed over then intel_idle reports that "ACPI _CST not found or not usable" but does not clear acpi_state_table.count meaning the information is still partially used. This patch ignores ACPI information if CST control cannot be requested from the platform. This was only observed on a number of Haswell platforms that had identical CPUs but not identical BIOS versions. While this problem may be rare overall, 24 separate test cases bisected to this specific commit across 4 separate test machines and is worth addressing. If the situation occurs, the kernel behaves as it did before commit e6d4f08a6776 and uses any c-states that are discovered. 
The affected test cases were all ones that involved a small number of processes -- exec microbenchmark, pipe microbenchmark, git test suite, netperf, tbench with one client and system call microbenchmark. Each case benefits from being able to use turboboost which is prevented if the lower c-states are unavailable. This may mask real regressions specific to older hardware so it is worth addressing. C-state status before and after the patch 5.9.0-vanilla POLL latency:0 disabled:0 default:enabled 5.9.0-vanilla C1 latency:2 disabled:0 default:enabled 5.9.0-vanilla C1E latency:10 disabled:0 default:enabled 5.9.0-vanilla C3 latency:33 disabled:1 default:disabled 5.9.0-vanilla C6 latency:133 disabled:1 default:disabled 5.9.0-ignore-cst-v1r1 POLL latency:0 disabled:0 default:enabled 5.9.0-ignore-cst-v1r1 C1 latency:2 disabled:0 default:enabled 5.9.0-ignore-cst-v1r1 C1E latency:10 disabled:0 default:enabled 5.9.0-ignore-cst-v1r1 C3 latency:33 disabled:0 default:enabled 5.9.0-ignore-cst-v1r1 C6 latency:133 disabled:0 default:enabled Patch enables C3/C6. Netperf UDP_STREAM netperf-udp 5.5.0 5.9.0 vanilla ignore-cst-v1r1 Hmean send-64 193.41 ( 0.00%) 226.54 * 17.13%* Hmean send-128 392.16 ( 0.00%) 450.54 * 14.89%* Hmean send-256 769.94 ( 0.00%) 881.85 * 14.53%* Hmean send-1024 2994.21 ( 0.00%) 3468.95 * 15.85%* Hmean send-2048 5725.60 ( 0.00%) 6628.99 * 15.78%* Hmean send-3312 8468.36 ( 0.00%) 10288.02 * 21.49%* Hmean send-4096 10135.46 ( 0.00%) 12387.57 * 22.22%* Hmean send-8192 17142.07 ( 0.00%) 19748.11 * 15.20%* Hmean send-16384 28539.71 ( 0.00%) 30084.45 * 5.41%* Fixes: e6d4f08a6776 ("intel_idle: Use ACPI _CST on server systems") Signed-off-by: Mel Gorman Cc: 5.6+ # 5.6+ Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index bf3dd4a19fef..56f5b8077cba 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1216,14 +1216,13 @@ static bool __init intel_idle_acpi_cst_extract(void) if (!intel_idle_cst_usable()) continue; - if (!acpi_processor_claim_cst_control()) { - acpi_state_table.count = 0; - return false; - } + if (!acpi_processor_claim_cst_control()) + break; return true; } + acpi_state_table.count = 0; pr_debug("ACPI _CST not found or not usable\n"); return false; } From 7a57e9f112adebc9e5dc787c2a59dbc06ae5060d Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 13 Oct 2020 15:42:40 +0800 Subject: [PATCH 329/497] powercap/intel_rapl: Fix domain detection As only the low 32 bits of the RAPL_DOMAIN_REG_STATUS register represents the energy counter, and the high 32 bits are reserved, detect the existence of a RAPL domain by checking the low 32 bits only. Signed-off-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 983d75bd5bd1..2651ea6cd6d3 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1228,7 +1228,7 @@ static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp) * values, otherwise skip it. 
*/
- ra.mask = ~0;
+ ra.mask = ENERGY_STATUS_MASK;
 if (rp->priv->read_raw(cpu, &ra) || !ra.value)
 return -ENODEV;

From f1e8d7560d3051b38f73a0cf6acc1b0bf5305ad9 Mon Sep 17 00:00:00 2001
From: Zhang Rui
Date: Tue, 13 Oct 2020 15:42:41 +0800
Subject: [PATCH 330/497] powercap/intel_rapl: enumerate Psys RAPL domain together with package RAPL domain

On multi-package systems, the Psys MSR is only valid for CPUs on a specific
package (master package). The current code makes the assumption that package 0
is the master package, but this is not true on new platforms like SPR.

Fix the problem by enumerating the Psys RAPL domain for every package, so
CPUs in slave packages will read 0 for the Psys energy counter and only CPUs
in master packages can get a valid reading and register the Psys RAPL domain.

The sysfs I/F for the Psys RAPL domain is not changed.

Signed-off-by: Zhang Rui
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki
---
 drivers/powercap/intel_rapl_common.c | 80 +++++-----------------------
 drivers/powercap/intel_rapl_msr.c | 5 +-
 include/linux/intel_rapl.h | 7 ++-
 3 files changed, 18 insertions(+), 74 deletions(-)

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 2651ea6cd6d3..0b2830efc574 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -544,7 +544,14 @@ static void rapl_init_domains(struct rapl_package *rp)
 continue;
 rd->rp = rp;
- rd->name = rapl_domain_names[i];
+
+ if (i == RAPL_DOMAIN_PLATFORM && rp->id > 0) {
+ snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "psys-%d",
+ cpu_data(rp->lead_cpu).phys_proc_id);
+ } else
+ snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "%s",
+ rapl_domain_names[i]);
+
 rd->id = i;
 rd->rpl[0].prim_id = PL1_ENABLE;
 rd->rpl[0].name = pl1_name;
@@ -1112,13 +1119,17 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
 }
 /* now register domains as children of the socket/package */
 for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
+ struct powercap_zone *parent = rp->power_zone;
+
 if (rd->id == RAPL_DOMAIN_PACKAGE)
 continue;
+ if (rd->id == RAPL_DOMAIN_PLATFORM)
+ parent = NULL;
 /* number of power limits per domain varies */
 nr_pl = find_nr_power_limit(rd);
 power_zone = powercap_register_zone(&rd->power_zone,
 rp->priv->control_type,
- rd->name, rp->power_zone,
+ rd->name, parent,
 &zone_ops[rd->id], nr_pl,
 &constraint_ops);
@@ -1145,67 +1156,6 @@ err_cleanup:
 return ret;
 }

-int rapl_add_platform_domain(struct rapl_if_priv *priv)
-{
- struct rapl_domain *rd;
- struct powercap_zone *power_zone;
- struct reg_action ra;
- int ret;
-
- ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
- ra.mask = ~0;
- ret = priv->read_raw(0, &ra);
- if (ret || !ra.value)
- return -ENODEV;
-
- ra.reg = priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
- ra.mask = ~0;
- ret = priv->read_raw(0, &ra);
- if (ret || !ra.value)
- return -ENODEV;
-
- rd = kzalloc(sizeof(*rd), GFP_KERNEL);
- if (!rd)
- return -ENOMEM;
-
- rd->name = rapl_domain_names[RAPL_DOMAIN_PLATFORM];
- rd->id = RAPL_DOMAIN_PLATFORM;
- rd->regs[RAPL_DOMAIN_REG_LIMIT] =
- priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT];
- rd->regs[RAPL_DOMAIN_REG_STATUS] =
- priv->regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS];
- rd->rpl[0].prim_id = PL1_ENABLE;
- rd->rpl[0].name = pl1_name;
- rd->rpl[1].prim_id = PL2_ENABLE;
- rd->rpl[1].name = pl2_name;
- rd->rp = rapl_find_package_domain(0, priv);
-
- power_zone = powercap_register_zone(&rd->power_zone,
priv->control_type,
- "psys", NULL,
- &zone_ops[RAPL_DOMAIN_PLATFORM],
- 2, &constraint_ops);
-
- if (IS_ERR(power_zone)) {
- kfree(rd);
- return PTR_ERR(power_zone);
- }
-
- priv->platform_rapl_domain = rd;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(rapl_add_platform_domain);
-
-void rapl_remove_platform_domain(struct rapl_if_priv *priv)
-{
- if (priv->platform_rapl_domain) {
- powercap_unregister_zone(priv->control_type,
- &priv->platform_rapl_domain->power_zone);
- kfree(priv->platform_rapl_domain);
- }
-}
-EXPORT_SYMBOL_GPL(rapl_remove_platform_domain);
-
 static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
 {
 struct reg_action ra;
@@ -1215,11 +1165,9 @@ static int rapl_check_domain(int cpu, int domain, struct rapl_package *rp)
 case RAPL_DOMAIN_PP0:
 case RAPL_DOMAIN_PP1:
 case RAPL_DOMAIN_DRAM:
+ case RAPL_DOMAIN_PLATFORM:
 ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
 break;
- case RAPL_DOMAIN_PLATFORM:
- /* PSYS(PLATFORM) is not a CPU domain, so avoid printng error */
- return -EINVAL;
 default:
 pr_err("invalid domain id %d\n", domain);
 return -EINVAL;
diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
index d2a2627507a9..1646808d354c 100644
--- a/drivers/powercap/intel_rapl_msr.c
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -44,6 +44,7 @@ static struct rapl_if_priv rapl_msr_priv = {
 .regs[RAPL_DOMAIN_PLATFORM] = {
 MSR_PLATFORM_POWER_LIMIT, MSR_PLATFORM_ENERGY_STATUS, 0, 0, 0},
 .limits[RAPL_DOMAIN_PACKAGE] = 2,
+ .limits[RAPL_DOMAIN_PLATFORM] = 2,
 };

 /* Handles CPU hotplug on multi-socket systems.
@@ -157,9 +158,6 @@ static int rapl_msr_probe(struct platform_device *pdev)
 goto out;
 rapl_msr_priv.pcap_rapl_online = ret;

- /* Don't bail out if PSys is not supported */
- rapl_add_platform_domain(&rapl_msr_priv);
-
 return 0;

 out:
@@ -171,7 +169,6 @@ out:
 static int rapl_msr_remove(struct platform_device *pdev)
 {
 cpuhp_remove_state(rapl_msr_priv.pcap_rapl_online);
- rapl_remove_platform_domain(&rapl_msr_priv);
 powercap_unregister_control_type(rapl_msr_priv.control_type);
 return 0;
 }
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index 3582176a1eca..50b8398ffd21 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -79,8 +79,10 @@ struct rapl_power_limit {

 struct rapl_package;

+#define RAPL_DOMAIN_NAME_LENGTH 16
+
 struct rapl_domain {
- const char *name;
+ char name[RAPL_DOMAIN_NAME_LENGTH];
 enum rapl_domain_type id;
 u64 regs[RAPL_DOMAIN_REG_MAX];
 struct powercap_zone power_zone;
@@ -152,7 +154,4 @@ struct rapl_package *rapl_find_package_domain(int cpu, struct rapl_if_priv *priv
 struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv);
 void rapl_remove_package(struct rapl_package *rp);

-int rapl_add_platform_domain(struct rapl_if_priv *priv);
-void rapl_remove_platform_domain(struct rapl_if_priv *priv);
-
 #endif /* __INTEL_RAPL_H__ */

From d4f8138354b9ec290de0c7ba527a945c5549e32b Mon Sep 17 00:00:00 2001
From: Ulf Hansson
Date: Tue, 13 Oct 2020 14:23:39 +0200
Subject: [PATCH 331/497] PM: domains: Add support for PM domain on/off notifiers for genpd

A device may have specific HW constraints that must be obeyed before its
corresponding PM domain (genpd) can be powered off - and vice versa at power
on. These constraints can't be managed through the regular runtime PM based
deployment for a device, because the access pattern for it isn't always
request based. In other words, using the runtime PM callbacks to deal with
the constraints doesn't work for these cases.
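For illustration, a rough consumer-side sketch of the power on/off notifier
API added by this patch (dev_pm_genpd_add_notifier() and the GENPD_NOTIFY_*
events, described in the remainder of this changelog). The "foo" driver,
data-structure and helper names are hypothetical and not part of this patch;
struct foo_priv is assumed to embed the notifier_block, and the device is
assumed to already be attached to its genpd:

	static int foo_pd_notifier(struct notifier_block *nb,
				   unsigned long action, void *data)
	{
		/* foo_priv and its helpers are made-up, illustrative names. */
		struct foo_priv *priv = container_of(nb, struct foo_priv, pd_nb);

		switch (action) {
		case GENPD_NOTIFY_PRE_OFF:
			/* Obey the HW constraints before the domain powers off. */
			foo_quiesce_hw(priv);
			break;
		case GENPD_NOTIFY_ON:
			/* The domain is powered on again; restore the HW state. */
			foo_restore_hw(priv);
			break;
		default:
			break;
		}

		return NOTIFY_OK;
	}

	static int foo_probe(struct platform_device *pdev)
	{
		struct foo_priv *priv = devm_kzalloc(&pdev->dev, sizeof(*priv),
						     GFP_KERNEL);

		if (!priv)
			return -ENOMEM;

		priv->pd_nb.notifier_call = foo_pd_notifier;
		/* The device must already be attached to its genpd here. */
		return dev_pm_genpd_add_notifier(&pdev->dev, &priv->pd_nb);
	}

A GENPD_NOTIFY_PRE_OFF handler can also return an error (for example via
notifier_from_errno()) to abort the power off, which is what the "busy" path
in the diff below handles.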
For these reasons, let's instead add a PM domain power on/off notification mechanism to genpd. To add/remove a notifier for a device, the device must already have been attached to the genpd, which also means that it needs to be a part of the PM domain topology. To add/remove a notifier, let's introduce two genpd specific functions: - dev_pm_genpd_add|remove_notifier() Note that, to further clarify when genpd power on/off notifiers may be used, one can compare with the existing CPU_CLUSTER_PM_ENTER|EXIT notifiers. In the long run, the genpd power on/off notifiers should be able to replace them, but that requires additional genpd based platform support for the current users. Signed-off-by: Ulf Hansson Tested-by: Lina Iyer Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 161 +++++++++++++++++++++++++++++++++--- include/linux/pm_domain.h | 22 +++++ 2 files changed, 171 insertions(+), 12 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 05bb4d4401b2..c2a8821bdb26 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -413,28 +413,47 @@ static int _genpd_power_on(struct generic_pm_domain *genpd, bool timed) unsigned int state_idx = genpd->state_idx; ktime_t time_start; s64 elapsed_ns; - int ret; + int ret, nr_calls = 0; + + /* Notify consumers that we are about to power on. */ + ret = __raw_notifier_call_chain(&genpd->power_notifiers, + GENPD_NOTIFY_PRE_ON, NULL, -1, + &nr_calls); + ret = notifier_to_errno(ret); + if (ret) + goto err; if (!genpd->power_on) - return 0; + goto out; - if (!timed) - return genpd->power_on(genpd); + if (!timed) { + ret = genpd->power_on(genpd); + if (ret) + goto err; + + goto out; + } time_start = ktime_get(); ret = genpd->power_on(genpd); if (ret) - return ret; + goto err; elapsed_ns = ktime_to_ns(ktime_sub(ktime_get(), time_start)); if (elapsed_ns <= genpd->states[state_idx].power_on_latency_ns) - return ret; + goto out; genpd->states[state_idx].power_on_latency_ns = elapsed_ns; genpd->max_off_time_changed = true; pr_debug("%s: Power-%s latency exceeded, new value %lld ns\n", genpd->name, "on", elapsed_ns); +out: + raw_notifier_call_chain(&genpd->power_notifiers, GENPD_NOTIFY_ON, NULL); + return 0; +err: + raw_notifier_call_chain(&genpd->power_notifiers, GENPD_NOTIFY_OFF, + NULL); return ret; } @@ -443,29 +462,51 @@ static int _genpd_power_off(struct generic_pm_domain *genpd, bool timed) unsigned int state_idx = genpd->state_idx; ktime_t time_start; s64 elapsed_ns; - int ret; + int ret, nr_calls = 0; + + /* Notify consumers that we are about to power off. 
*/ + ret = __raw_notifier_call_chain(&genpd->power_notifiers, + GENPD_NOTIFY_PRE_OFF, NULL, -1, + &nr_calls); + ret = notifier_to_errno(ret); + if (ret) + goto busy; if (!genpd->power_off) - return 0; + goto out; - if (!timed) - return genpd->power_off(genpd); + if (!timed) { + ret = genpd->power_off(genpd); + if (ret) + goto busy; + + goto out; + } time_start = ktime_get(); ret = genpd->power_off(genpd); if (ret) - return ret; + goto busy; elapsed_ns = ktime_to_ns(ktime_sub(ktime_get(), time_start)); if (elapsed_ns <= genpd->states[state_idx].power_off_latency_ns) - return 0; + goto out; genpd->states[state_idx].power_off_latency_ns = elapsed_ns; genpd->max_off_time_changed = true; pr_debug("%s: Power-%s latency exceeded, new value %lld ns\n", genpd->name, "off", elapsed_ns); +out: + raw_notifier_call_chain(&genpd->power_notifiers, GENPD_NOTIFY_OFF, + NULL); return 0; +busy: + if (nr_calls) + __raw_notifier_call_chain(&genpd->power_notifiers, + GENPD_NOTIFY_ON, NULL, nr_calls - 1, + NULL); + return ret; } /** @@ -1592,6 +1633,101 @@ int pm_genpd_remove_device(struct device *dev) } EXPORT_SYMBOL_GPL(pm_genpd_remove_device); +/** + * dev_pm_genpd_add_notifier - Add a genpd power on/off notifier for @dev + * + * @dev: Device that should be associated with the notifier + * @nb: The notifier block to register + * + * Users may call this function to add a genpd power on/off notifier for an + * attached @dev. Only one notifier per device is allowed. The notifier is + * sent when genpd is powering on/off the PM domain. + * + * It is assumed that the user guarantee that the genpd wouldn't be detached + * while this routine is getting called. + * + * Returns 0 on success and negative error values on failures. + */ +int dev_pm_genpd_add_notifier(struct device *dev, struct notifier_block *nb) +{ + struct generic_pm_domain *genpd; + struct generic_pm_domain_data *gpd_data; + int ret; + + genpd = dev_to_genpd_safe(dev); + if (!genpd) + return -ENODEV; + + if (WARN_ON(!dev->power.subsys_data || + !dev->power.subsys_data->domain_data)) + return -EINVAL; + + gpd_data = to_gpd_data(dev->power.subsys_data->domain_data); + if (gpd_data->power_nb) + return -EEXIST; + + genpd_lock(genpd); + ret = raw_notifier_chain_register(&genpd->power_notifiers, nb); + genpd_unlock(genpd); + + if (ret) { + dev_warn(dev, "failed to add notifier for PM domain %s\n", + genpd->name); + return ret; + } + + gpd_data->power_nb = nb; + return 0; +} +EXPORT_SYMBOL_GPL(dev_pm_genpd_add_notifier); + +/** + * dev_pm_genpd_remove_notifier - Remove a genpd power on/off notifier for @dev + * + * @dev: Device that is associated with the notifier + * + * Users may call this function to remove a genpd power on/off notifier for an + * attached @dev. + * + * It is assumed that the user guarantee that the genpd wouldn't be detached + * while this routine is getting called. + * + * Returns 0 on success and negative error values on failures. 
+ */ +int dev_pm_genpd_remove_notifier(struct device *dev) +{ + struct generic_pm_domain *genpd; + struct generic_pm_domain_data *gpd_data; + int ret; + + genpd = dev_to_genpd_safe(dev); + if (!genpd) + return -ENODEV; + + if (WARN_ON(!dev->power.subsys_data || + !dev->power.subsys_data->domain_data)) + return -EINVAL; + + gpd_data = to_gpd_data(dev->power.subsys_data->domain_data); + if (!gpd_data->power_nb) + return -ENODEV; + + genpd_lock(genpd); + ret = raw_notifier_chain_unregister(&genpd->power_notifiers, + gpd_data->power_nb); + genpd_unlock(genpd); + + if (ret) { + dev_warn(dev, "failed to remove notifier for PM domain %s\n", + genpd->name); + return ret; + } + + gpd_data->power_nb = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(dev_pm_genpd_remove_notifier); + static int genpd_add_subdomain(struct generic_pm_domain *genpd, struct generic_pm_domain *subdomain) { @@ -1762,6 +1898,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd, INIT_LIST_HEAD(&genpd->parent_links); INIT_LIST_HEAD(&genpd->child_links); INIT_LIST_HEAD(&genpd->dev_list); + RAW_INIT_NOTIFIER_HEAD(&genpd->power_notifiers); genpd_lock_init(genpd); genpd->gov = gov; INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn); diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 66f3c5d64d81..db039da0aba2 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -68,6 +68,13 @@ enum gpd_status { GENPD_STATE_OFF, /* PM domain is off */ }; +enum genpd_notication { + GENPD_NOTIFY_PRE_OFF = 0, + GENPD_NOTIFY_OFF, + GENPD_NOTIFY_PRE_ON, + GENPD_NOTIFY_ON, +}; + struct dev_power_governor { bool (*power_down_ok)(struct dev_pm_domain *domain); bool (*suspend_ok)(struct device *dev); @@ -112,6 +119,7 @@ struct generic_pm_domain { cpumask_var_t cpus; /* A cpumask of the attached CPUs */ int (*power_off)(struct generic_pm_domain *domain); int (*power_on)(struct generic_pm_domain *domain); + struct raw_notifier_head power_notifiers; /* Power on/off notifiers */ struct opp_table *opp_table; /* OPP table of the genpd */ unsigned int (*opp_to_performance_state)(struct generic_pm_domain *genpd, struct dev_pm_opp *opp); @@ -178,6 +186,7 @@ struct generic_pm_domain_data { struct pm_domain_data base; struct gpd_timing_data td; struct notifier_block nb; + struct notifier_block *power_nb; int cpu; unsigned int performance_state; void *data; @@ -204,6 +213,8 @@ int pm_genpd_init(struct generic_pm_domain *genpd, struct dev_power_governor *gov, bool is_off); int pm_genpd_remove(struct generic_pm_domain *genpd); int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state); +int dev_pm_genpd_add_notifier(struct device *dev, struct notifier_block *nb); +int dev_pm_genpd_remove_notifier(struct device *dev); extern struct dev_power_governor simple_qos_governor; extern struct dev_power_governor pm_domain_always_on_gov; @@ -251,6 +262,17 @@ static inline int dev_pm_genpd_set_performance_state(struct device *dev, return -ENOTSUPP; } +static inline int dev_pm_genpd_add_notifier(struct device *dev, + struct notifier_block *nb) +{ + return -ENOTSUPP; +} + +static inline int dev_pm_genpd_remove_notifier(struct device *dev) +{ + return -ENOTSUPP; +} + #define simple_qos_governor (*(struct dev_power_governor *)(NULL)) #define pm_domain_always_on_gov (*(struct dev_power_governor *)(NULL)) #endif From 505a70b783debaa84c7ebafa44a69a9401db4499 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 13 Oct 2020 16:14:59 +0200 Subject: [PATCH 332/497] PM: domains: Add curly braces to delimit comment + 
statement block There is no strict need to group a comment and a single statement in an if block, as comments are stripped by the pre-processor. However, adding curly braces does make the code easier to read, and may avoid mistakes when changing the code later. Signed-off-by: Geert Uytterhoeven Acked-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index c2a8821bdb26..bc4243943940 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1311,13 +1311,14 @@ static int genpd_restore_noirq(struct device *dev) * first time for the given domain in the present cycle. */ genpd_lock(genpd); - if (genpd->suspended_count++ == 0) + if (genpd->suspended_count++ == 0) { /* * The boot kernel might put the domain into arbitrary state, * so make it appear as powered off to genpd_sync_power_on(), * so that it tries to power it on in case it was really off. */ genpd->status = GENPD_STATE_OFF; + } genpd_sync_power_on(genpd, true, 0); genpd_unlock(genpd); From c6a113b52302adcfadda63af81dc05f7a669fbc8 Mon Sep 17 00:00:00 2001 From: Lina Iyer Date: Thu, 15 Oct 2020 14:47:22 -0600 Subject: [PATCH 333/497] PM: domains: enable domain idle state accounting To enable better debugging of PM domains, keep track of successful and failed attempts to enter each domain idle state. These statistics are exported in debugfs when reading the idle_states node associated with each PM domain. Signed-off-by: Lina Iyer [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 10 +++++++--- include/linux/pm_domain.h | 2 ++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index bc4243943940..859cdb207010 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -591,11 +591,14 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on, return -EBUSY; ret = _genpd_power_off(genpd, true); - if (ret) + if (ret) { + genpd->states[genpd->state_idx].rejected++; return ret; + } genpd->status = GENPD_STATE_OFF; genpd_update_accounting(genpd); + genpd->states[genpd->state_idx].usage++; list_for_each_entry(link, &genpd->child_links, child_node) { genpd_sd_counter_dec(link->parent); @@ -3061,7 +3064,7 @@ static int idle_states_show(struct seq_file *s, void *data) if (ret) return -ERESTARTSYS; - seq_puts(s, "State Time Spent(ms)\n"); + seq_puts(s, "State Time Spent(ms) Usage Rejected\n"); for (i = 0; i < genpd->state_count; i++) { ktime_t delta = 0; @@ -3073,7 +3076,8 @@ static int idle_states_show(struct seq_file *s, void *data) msecs = ktime_to_ms( ktime_add(genpd->states[i].idle_time, delta)); - seq_printf(s, "S%-13i %lld\n", i, msecs); + seq_printf(s, "S%-13i %-14lld %-14llu %llu\n", i, msecs, + genpd->states[i].usage, genpd->states[i].rejected); } genpd_unlock(genpd); diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index db039da0aba2..1ad0ec481416 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -89,6 +89,8 @@ struct genpd_power_state { s64 power_off_latency_ns; s64 power_on_latency_ns; s64 residency_ns; + u64 usage; + u64 rejected; struct fwnode_handle *fwnode; ktime_t idle_time; void *data; From 0fada277147ffc6d694aa32162f51198d4f10d94 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Mon, 12 Oct 2020 14:04:46 +0100 Subject: [PATCH 334/497] ACPI: debug: don't allow
debugging when ACPI is disabled If ACPI is disabled then loading the acpi_dbg module will result in the following splat when lock debugging is enabled. DEBUG_LOCKS_WARN_ON(lock->magic != lock) WARNING: CPU: 0 PID: 1 at kernel/locking/mutex.c:938 __mutex_lock+0xa10/0x1290 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.9.0-rc8+ #103 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x4d8 show_stack+0x34/0x48 dump_stack+0x174/0x1f8 panic+0x360/0x7a0 __warn+0x244/0x2ec report_bug+0x240/0x398 bug_handler+0x50/0xc0 call_break_hook+0x160/0x1d8 brk_handler+0x30/0xc0 do_debug_exception+0x184/0x340 el1_dbg+0x48/0xb0 el1_sync_handler+0x170/0x1c8 el1_sync+0x80/0x100 __mutex_lock+0xa10/0x1290 mutex_lock_nested+0x6c/0xc0 acpi_register_debugger+0x40/0x88 acpi_aml_init+0xc4/0x114 do_one_initcall+0x24c/0xb10 kernel_init_freeable+0x690/0x728 kernel_init+0x20/0x1e8 ret_from_fork+0x10/0x18 This is because acpi_debugger.lock has not been initialized, as acpi_debugger_init() is not called when ACPI is disabled. Fail module loading to avoid this and any subsequent problems that might arise by trying to debug AML when ACPI is disabled. Fixes: 8cfb0cdf07e2 ("ACPI / debugger: Add IO interface to access debugger functionalities") Reviewed-by: Hanjun Guo Signed-off-by: Jamie Iles Cc: 4.10+ # 4.10+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_dbg.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/acpi/acpi_dbg.c b/drivers/acpi/acpi_dbg.c index 6041974c7627..fb7290338593 100644 --- a/drivers/acpi/acpi_dbg.c +++ b/drivers/acpi/acpi_dbg.c @@ -749,6 +749,9 @@ static int __init acpi_aml_init(void) { int ret; + if (acpi_disabled) + return -ENODEV; + /* Initialize AML IO interface */ mutex_init(&acpi_aml_io.lock); init_waitqueue_head(&acpi_aml_io.wait); From 9a4888888cc09b0ff3d0a1dd32df88742d29a293 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 13 Oct 2020 15:35:57 +0800 Subject: [PATCH 335/497] ACPI: reboot: Avoid racing after writing to ACPI RESET_REG According to the ACPI spec, "The system must reset immediately following the write to the ACPI RESET_REG register.", but there are cases where the system does not follow this and races with the subsequent reboot mechanism, which brings unexpected behavior. Fix this by adding a 15ms delay after writing to the ACPI RESET_REG. Reported-by: Ghorai, Sukumar Signed-off-by: Zhang Rui [ rjw: Edit comment in the code and subject ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/reboot.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/acpi/reboot.c b/drivers/acpi/reboot.c index ca707f5b521d..2a61f884e222 100644 --- a/drivers/acpi/reboot.c +++ b/drivers/acpi/reboot.c @@ -3,6 +3,7 @@ #include #include #include +#include #ifdef CONFIG_PCI static void acpi_pci_reboot(struct acpi_generic_address *rr, u8 reset_value) @@ -66,4 +67,14 @@ void acpi_reboot(void) acpi_reset(); break; } + + /* + * Some platforms do not shut down immediately after writing to the + * ACPI reset register, and this results in racing with the + * subsequent reboot mechanism. + * + * The 15ms delay has been found to be long enough for the system + * to reboot on the affected platforms.
+ */ + mdelay(15); } From d85cc6635a2a1338f7a45f652e97b02f9a69b9f5 Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Tue, 13 Oct 2020 19:03:40 -0600 Subject: [PATCH 336/497] ACPI: processor: remove comment regarding string _UID support ACPI 6.3 Errata A no longer allows _UID to return a string except for Itanium (for historical reasons) as stated in section 5.2.12: "From ACPI Specification 6.3 onward, all processor objects for all architectures except Itanium must now use Device() objects with an _HID of ACPI0007, and use only integer _UID values." Therefore, the "we don't handle string _UIDs yet" comment, which implies a missing feature, is redundant, so drop it. Signed-off-by: Alex Hung [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_processor.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 412a9725cc1e..2ee5e05a0d69 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -264,7 +264,6 @@ static int acpi_processor_get_info(struct acpi_device *device) } else { /* * Declared with "Device" statement; match _UID. - * Note that we don't handle string _UIDs yet. */ status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID, NULL, &value); From ff44fe3e67e41795cd2ef11b7d579a689ea57775 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 15 Oct 2020 18:58:43 +0200 Subject: [PATCH 337/497] ACPI: DPTF: Fix participant driver names Change the names of DPTF participant drivers to adhere to the sysfs file naming conventions (no spaces present in the name in particular). Fixes: 2ce6324eadb0 ("ACPI: DPTF: Add PCH FIVR participant driver") Fixes: 6256ebd5daf9 ("ACPI / DPTF: Add DPTF power participant driver") Signed-off-by: Rafael J. Wysocki Reviewed-by: Srinivas Pandruvada Acked-by: Borislav Petkov --- drivers/acpi/dptf/dptf_pch_fivr.c | 2 +- drivers/acpi/dptf/dptf_power.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/dptf/dptf_pch_fivr.c b/drivers/acpi/dptf/dptf_pch_fivr.c index 4ab288827747..4c1992fce150 100644 --- a/drivers/acpi/dptf/dptf_pch_fivr.c +++ b/drivers/acpi/dptf/dptf_pch_fivr.c @@ -114,7 +114,7 @@ static struct platform_driver pch_fivr_driver = { .probe = pch_fivr_add, .remove = pch_fivr_remove, .driver = { - .name = "DPTF PCH FIVR", + .name = "dptf_pch_fivr", .acpi_match_table = pch_fivr_device_ids, }, }; diff --git a/drivers/acpi/dptf/dptf_power.c b/drivers/acpi/dptf/dptf_power.c index 92b996a564d0..06741305fc77 100644 --- a/drivers/acpi/dptf/dptf_power.c +++ b/drivers/acpi/dptf/dptf_power.c @@ -237,7 +237,7 @@ static struct platform_driver dptf_power_driver = { .probe = dptf_power_add, .remove = dptf_power_remove, .driver = { - .name = "DPTF Platform Power", + .name = "dptf_power", .acpi_match_table = int3407_device_ids, }, }; From d7a4a85c9a34b8edc3e2f6e64caf5c97c8bdcce4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 15 Oct 2020 18:59:52 +0200 Subject: [PATCH 338/497] ACPI: DPTF: Add ACPI_DPTF Kconfig menu Add a Kconfig menu for Intel DPTF (Dynamic Platform and Thermal Framework), put both the existing participant drivers in it and set them to be built as modules by default. While at it, do a few assorted cleanups for a good measure. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Srinivas Pandruvada Acked-by: Borislav Petkov --- drivers/acpi/dptf/Kconfig | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/dptf/Kconfig b/drivers/acpi/dptf/Kconfig index 51f06f36cafa..1e8c7ce89bf1 100644 --- a/drivers/acpi/dptf/Kconfig +++ b/drivers/acpi/dptf/Kconfig @@ -1,7 +1,24 @@ # SPDX-License-Identifier: GPL-2.0 -config DPTF_POWER - tristate "DPTF Platform Power Participant" + +menuconfig ACPI_DPTF + bool "Intel DPTF (Dynamic Platform and Thermal Framework) Support" depends on X86 + help + Intel Dynamic Platform and Thermal Framework (DPTF) is a platform + level hardware/software solution for power and thermal management. + + As a container for multiple power/thermal technologies, DPTF provides + a coordinated approach for different policies to effect the hardware + state of a system. + + For more information see: + + +if ACPI_DPTF + +config DPTF_POWER + tristate "Platform Power DPTF Participant" + default m help This driver adds support for Dynamic Platform and Thermal Framework (DPTF) Platform Power Participant device (INT3407) support. @@ -16,15 +33,17 @@ config DPTF_POWER the module will be called dptf_power. config DPTF_PCH_FIVR - tristate "DPTF PCH FIVR Participant" - depends on X86 + tristate "PCH FIVR DPTF Participant" + default m help This driver adds support for Dynamic Platform and Thermal Framework (DPTF) PCH FIVR Participant device support. This driver allows to - switch PCH FIVR (Fully Integrated Voltage Regulator) frequency. + switch the PCH FIVR (Fully Integrated Voltage Regulator) frequency. This participant is responsible for exposing: freq_mhz_low_clock freq_mhz_high_clock To compile this driver as a module, choose M here: the module will be called dptf_pch_fivr. + +endif From e943c43b32ce15ef23cc6b4574567b045c96c23b Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 6 Oct 2020 18:05:14 +0200 Subject: [PATCH 339/497] PM: AVS: rockchip-io: Move the driver to the rockchip specific drivers The avs drivers are all SoC specific drivers that doesn't share any code. Instead they are located in a directory, mostly to keep similar functionality together. From a maintenance point of view, it makes better sense to collect SoC specific drivers like these, into the SoC specific directories. Therefore, let's move the rockchip-io driver to the rockchip directory. Signed-off-by: Ulf Hansson Acked-by: Heiko Stuebner Signed-off-by: Rafael J. Wysocki --- drivers/power/avs/Kconfig | 8 -------- drivers/power/avs/Makefile | 1 - drivers/soc/rockchip/Kconfig | 8 ++++++++ drivers/soc/rockchip/Makefile | 1 + .../avs/rockchip-io-domain.c => soc/rockchip/io-domain.c} | 0 5 files changed, 9 insertions(+), 9 deletions(-) rename drivers/{power/avs/rockchip-io-domain.c => soc/rockchip/io-domain.c} (100%) diff --git a/drivers/power/avs/Kconfig b/drivers/power/avs/Kconfig index cdb4237bfd02..e31215680771 100644 --- a/drivers/power/avs/Kconfig +++ b/drivers/power/avs/Kconfig @@ -27,11 +27,3 @@ config QCOM_CPR To compile this driver as a module, choose M here: the module will be called qcom-cpr - -config ROCKCHIP_IODOMAIN - tristate "Rockchip IO domain support" - depends on POWER_AVS && ARCH_ROCKCHIP && OF - help - Say y here to enable support io domains on Rockchip SoCs. It is - necessary for the io domain setting of the SoC to match the - voltage supplied by the regulators. 
diff --git a/drivers/power/avs/Makefile b/drivers/power/avs/Makefile index 9007d05853e2..d611a465cd42 100644 --- a/drivers/power/avs/Makefile +++ b/drivers/power/avs/Makefile @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_POWER_AVS_OMAP) += smartreflex.o obj-$(CONFIG_QCOM_CPR) += qcom-cpr.o -obj-$(CONFIG_ROCKCHIP_IODOMAIN) += rockchip-io-domain.o diff --git a/drivers/soc/rockchip/Kconfig b/drivers/soc/rockchip/Kconfig index b71b73bf5fc5..2c13bf4dd5db 100644 --- a/drivers/soc/rockchip/Kconfig +++ b/drivers/soc/rockchip/Kconfig @@ -14,6 +14,14 @@ config ROCKCHIP_GRF In a lot of cases there also need to be default settings initialized to make some of them conform to expectations of the kernel. +config ROCKCHIP_IODOMAIN + tristate "Rockchip IO domain support" + depends on OF + help + Say y here to enable support io domains on Rockchip SoCs. It is + necessary for the io domain setting of the SoC to match the + voltage supplied by the regulators. + config ROCKCHIP_PM_DOMAINS bool "Rockchip generic power domain" depends on PM diff --git a/drivers/soc/rockchip/Makefile b/drivers/soc/rockchip/Makefile index afca0a4c4b72..875032f7344e 100644 --- a/drivers/soc/rockchip/Makefile +++ b/drivers/soc/rockchip/Makefile @@ -3,4 +3,5 @@ # Rockchip Soc drivers # obj-$(CONFIG_ROCKCHIP_GRF) += grf.o +obj-$(CONFIG_ROCKCHIP_IODOMAIN) += io-domain.o obj-$(CONFIG_ROCKCHIP_PM_DOMAINS) += pm_domains.o diff --git a/drivers/power/avs/rockchip-io-domain.c b/drivers/soc/rockchip/io-domain.c similarity index 100% rename from drivers/power/avs/rockchip-io-domain.c rename to drivers/soc/rockchip/io-domain.c From bca815d620544c27288abf4841e39922d694425c Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 6 Oct 2020 18:05:15 +0200 Subject: [PATCH 340/497] PM: AVS: smartreflex Move driver to soc specific drivers The avs drivers are all SoC specific drivers that doesn't share any code. Instead they are located in a directory, mostly to keep similar functionality together. From a maintenance point of view, it makes better sense to collect SoC specific drivers like these, into the SoC specific directories. Therefore, let's move the smartreflex driver for OMAP to the ti directory. Signed-off-by: Ulf Hansson Reviewed-by: Nishanth Menon Signed-off-by: Rafael J. 
Wysocki --- MAINTAINERS | 4 ++-- arch/arm/plat-omap/Kconfig | 2 +- drivers/power/avs/Kconfig | 12 ------------ drivers/power/avs/Makefile | 1 - drivers/soc/ti/Makefile | 1 + drivers/{power/avs => soc/ti}/smartreflex.c | 0 6 files changed, 4 insertions(+), 16 deletions(-) rename drivers/{power/avs => soc/ti}/smartreflex.c (100%) diff --git a/MAINTAINERS b/MAINTAINERS index 8c5556df75a8..6c17687faa52 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5387,11 +5387,11 @@ F: include/linux/debugfs.h F: include/linux/kobj* F: lib/kobj* -DRIVERS FOR ADAPTIVE VOLTAGE SCALING (AVS) +DRIVERS FOR OMAP ADAPTIVE VOLTAGE SCALING (AVS) M: Nishanth Menon L: linux-pm@vger.kernel.org S: Maintained -F: drivers/power/avs/ +F: drivers/soc/ti/smartreflex.c F: include/linux/power/smartreflex.h DRM DRIVER FOR ALLWINNER DE2 AND DE3 ENGINE diff --git a/arch/arm/plat-omap/Kconfig b/arch/arm/plat-omap/Kconfig index 93fd7fc537cf..272670ef1e92 100644 --- a/arch/arm/plat-omap/Kconfig +++ b/arch/arm/plat-omap/Kconfig @@ -23,7 +23,7 @@ config OMAP_DEBUG_LEDS config POWER_AVS_OMAP bool "AVS(Adaptive Voltage Scaling) support for OMAP IP versions 1&2" - depends on POWER_AVS && (ARCH_OMAP3 || ARCH_OMAP4) && PM + depends on (ARCH_OMAP3 || ARCH_OMAP4) && PM select POWER_SUPPLY help Say Y to enable AVS(Adaptive Voltage Scaling) diff --git a/drivers/power/avs/Kconfig b/drivers/power/avs/Kconfig index e31215680771..d789509ae7e9 100644 --- a/drivers/power/avs/Kconfig +++ b/drivers/power/avs/Kconfig @@ -1,16 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -menuconfig POWER_AVS - bool "Adaptive Voltage Scaling class support" - help - AVS is a power management technique which finely controls the - operating voltage of a device in order to optimize (i.e. reduce) - its power consumption. - At a given operating point the voltage is adapted depending on - static factors (chip manufacturing process) and dynamic factors - (temperature depending performance). - AVS is also called SmartReflex on OMAP devices. - - Say Y here to enable Adaptive Voltage Scaling class support. config QCOM_CPR tristate "QCOM Core Power Reduction (CPR) support" diff --git a/drivers/power/avs/Makefile b/drivers/power/avs/Makefile index d611a465cd42..735832f47214 100644 --- a/drivers/power/avs/Makefile +++ b/drivers/power/avs/Makefile @@ -1,3 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_POWER_AVS_OMAP) += smartreflex.o obj-$(CONFIG_QCOM_CPR) += qcom-cpr.o diff --git a/drivers/soc/ti/Makefile b/drivers/soc/ti/Makefile index 1110e5c98685..5463431ec96c 100644 --- a/drivers/soc/ti/Makefile +++ b/drivers/soc/ti/Makefile @@ -12,3 +12,4 @@ obj-$(CONFIG_TI_SCI_PM_DOMAINS) += ti_sci_pm_domains.o obj-$(CONFIG_TI_SCI_INTA_MSI_DOMAIN) += ti_sci_inta_msi.o obj-$(CONFIG_TI_K3_RINGACC) += k3-ringacc.o obj-$(CONFIG_TI_K3_SOCINFO) += k3-socinfo.o +obj-$(CONFIG_POWER_AVS_OMAP) += smartreflex.o diff --git a/drivers/power/avs/smartreflex.c b/drivers/soc/ti/smartreflex.c similarity index 100% rename from drivers/power/avs/smartreflex.c rename to drivers/soc/ti/smartreflex.c From a97cbcd00f05a23146fdd8269011c40b7229242d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 16 Oct 2020 10:44:05 -0700 Subject: [PATCH 341/497] ALSA: sparc: dbri: fix repeated word 'the' Change the duplicated word "the" to "Then the". 
Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20201016174405.17745-1-rdunlap@infradead.org Signed-off-by: Takashi Iwai --- sound/sparc/dbri.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/sparc/dbri.c b/sound/sparc/dbri.c index 913adc8568d5..5a6fb66dd118 100644 --- a/sound/sparc/dbri.c +++ b/sound/sparc/dbri.c @@ -620,7 +620,7 @@ A circular command buffer is used here. A new command is being added while another can be executed. The scheme works by adding two WAIT commands after each sent batch of commands. When the next batch is prepared it is added after the WAIT commands then the WAITs are replaced with single JUMP -command to the new batch. The the DBRI is forced to reread the last WAIT +command to the new batch. Then the DBRI is forced to reread the last WAIT command (replaced by the JUMP by then). If the DBRI is still executing previous commands the request to reread the WAIT command is ignored. From 9ce88a13b3016436441fec0b8b00ce8116f91269 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 16 Oct 2020 23:49:13 +0100 Subject: [PATCH 342/497] ALSA: hda/ca0132: make some const arrays static, makes object smaller Don't populate const arrays on the stack but instead make them static. Makes the object code smaller by 57 bytes. Before: text data bss dec hex filename 173256 38016 192 211464 33a08 sound/pci/hda/patch_ca0132.o After: text data bss dec hex filename 172879 38336 192 211407 339cf sound/pci/hda/patch_ca0132.o (gcc version 10.2.0) Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20201016224913.687724-1-colin.king@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_ca0132.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sound/pci/hda/patch_ca0132.c b/sound/pci/hda/patch_ca0132.c index 2b38b2a716a1..e0c38f2735c6 100644 --- a/sound/pci/hda/patch_ca0132.c +++ b/sound/pci/hda/patch_ca0132.c @@ -7910,8 +7910,12 @@ static void ae7_post_dsp_asi_stream_setup(struct hda_codec *codec) static void ae7_post_dsp_pll_setup(struct hda_codec *codec) { - const unsigned int addr[] = { 0x41, 0x45, 0x40, 0x43, 0x51 }; - const unsigned int data[] = { 0xc8, 0xcc, 0xcb, 0xc7, 0x8d }; + static const unsigned int addr[] = { + 0x41, 0x45, 0x40, 0x43, 0x51 + }; + static const unsigned int data[] = { + 0xc8, 0xcc, 0xcb, 0xc7, 0x8d + }; unsigned int i; for (i = 0; i < ARRAY_SIZE(addr); i++) { @@ -7925,10 +7929,12 @@ static void ae7_post_dsp_pll_setup(struct hda_codec *codec) static void ae7_post_dsp_asi_setup_ports(struct hda_codec *codec) { struct ca0132_spec *spec = codec->spec; - const unsigned int target[] = { 0x0b, 0x04, 0x06, 0x0a, 0x0c, 0x11, - 0x12, 0x13, 0x14 }; - const unsigned int data[] = { 0x12, 0x00, 0x48, 0x05, 0x5f, 0xff, - 0xff, 0xff, 0x7f }; + static const unsigned int target[] = { + 0x0b, 0x04, 0x06, 0x0a, 0x0c, 0x11, 0x12, 0x13, 0x14 + }; + static const unsigned int data[] = { + 0x12, 0x00, 0x48, 0x05, 0x5f, 0xff, 0xff, 0xff, 0x7f + }; unsigned int i; mutex_lock(&spec->chipio_mutex); From 3c532798ec96b6c2d77706f04ed1d8b566a805df Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 3 Oct 2020 10:49:22 -0600 Subject: [PATCH 343/497] tracehook: clear TIF_NOTIFY_RESUME in tracehook_notify_resume() All the callers currently do this, clean it up and move the clearing into tracehook_notify_resume() instead. 
Reviewed-by: Oleg Nesterov Reviewed-by: Thomas Gleixner Signed-off-by: Jens Axboe --- arch/alpha/kernel/signal.c | 1 - arch/arc/kernel/signal.c | 2 +- arch/arm/kernel/signal.c | 1 - arch/arm64/kernel/signal.c | 1 - arch/c6x/kernel/signal.c | 4 +--- arch/csky/kernel/signal.c | 1 - arch/h8300/kernel/signal.c | 4 +--- arch/hexagon/kernel/process.c | 1 - arch/ia64/kernel/process.c | 2 +- arch/m68k/kernel/signal.c | 2 +- arch/microblaze/kernel/signal.c | 2 +- arch/mips/kernel/signal.c | 1 - arch/nds32/kernel/signal.c | 4 +--- arch/nios2/kernel/signal.c | 2 +- arch/openrisc/kernel/signal.c | 1 - arch/parisc/kernel/signal.c | 4 +--- arch/powerpc/kernel/signal.c | 1 - arch/riscv/kernel/signal.c | 4 +--- arch/s390/kernel/signal.c | 1 - arch/sh/kernel/signal_32.c | 4 +--- arch/sparc/kernel/signal_32.c | 4 +--- arch/sparc/kernel/signal_64.c | 4 +--- arch/um/kernel/process.c | 2 +- arch/xtensa/kernel/signal.c | 2 +- include/linux/tracehook.h | 4 ++-- kernel/entry/common.c | 1 - kernel/entry/kvm.c | 4 +--- 27 files changed, 18 insertions(+), 46 deletions(-) diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 15bc9d1e79f4..3739efce1ec0 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -531,7 +531,6 @@ do_work_pending(struct pt_regs *regs, unsigned long thread_flags, do_signal(regs, r0, r19); r0 = 0; } else { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); } } diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index 8222f8c54690..2be55fb96d87 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -394,6 +394,6 @@ void do_notify_resume(struct pt_regs *regs) * ASM glue gaurantees that this is only called when returning to * user mode */ - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) + if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); } diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index c1892f733f20..585edbfccf6d 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -669,7 +669,6 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) } else if (thread_flags & _TIF_UPROBE) { uprobe_notify_resume(regs); } else { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index bdcaaf091e1e..a8184cad8890 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -946,7 +946,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, do_signal(regs); if (thread_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index d05c78eace1b..a3f15b9a79da 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -316,8 +316,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags, if (thread_info_flags & (1 << TIF_SIGPENDING)) do_signal(regs, syscall); - if (thread_info_flags & (1 << TIF_NOTIFY_RESUME)) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_info_flags & (1 << TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); - } } diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c index 970895df75ec..8b068cf37447 100644 --- a/arch/csky/kernel/signal.c +++ b/arch/csky/kernel/signal.c @@ -261,7 +261,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, do_signal(regs); if 
(thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index 69e68949787f..75d9b7e626b2 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -282,8 +282,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags) if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - } } diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index dfd322c5ce83..5a0a95d93ddb 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -180,7 +180,6 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags) } if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); return 1; } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index f25f2f723196..6b61a703bcf5 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -176,7 +176,7 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) ia64_do_signal(scr, in_syscall); } - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) { + if (test_thread_flag(TIF_NOTIFY_RESUME)) { local_irq_enable(); /* force interrupt enable */ tracehook_notify_resume(&scr->pt); } diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index a98fca977073..29e174a80bf6 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -1134,6 +1134,6 @@ void do_notify_resume(struct pt_regs *regs) if (test_thread_flag(TIF_SIGPENDING)) do_signal(regs); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) + if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); } diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 4a96b59f0bee..f11a0ccccabc 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -316,6 +316,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, int in_syscall) if (test_thread_flag(TIF_SIGPENDING)) do_signal(regs, in_syscall); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) + if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); } diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index f44265025281..50d0515bea21 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -907,7 +907,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, do_signal(regs); if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/arch/nds32/kernel/signal.c b/arch/nds32/kernel/signal.c index 36e25a410bb0..2acb94812af9 100644 --- a/arch/nds32/kernel/signal.c +++ b/arch/nds32/kernel/signal.c @@ -379,8 +379,6 @@ do_notify_resume(struct pt_regs *regs, unsigned int thread_flags) if (thread_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - } } diff --git a/arch/nios2/kernel/signal.c b/arch/nios2/kernel/signal.c index d8a087cf2b42..cf2dca2ac7c3 100644 --- a/arch/nios2/kernel/signal.c +++ b/arch/nios2/kernel/signal.c @@ -317,7 
+317,7 @@ asmlinkage int do_notify_resume(struct pt_regs *regs) */ return restart; } - } else if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) + } else if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); return 0; diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index c779364f0cd0..af66f968dd45 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -311,7 +311,6 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) } syscall = 0; } else { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); } } diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 3c037fc96038..9f43eaeb0b0a 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -606,8 +606,6 @@ void do_notify_resume(struct pt_regs *regs, long in_syscall) if (test_thread_flag(TIF_SIGPENDING)) do_signal(regs, in_syscall); - if (test_thread_flag(TIF_NOTIFY_RESUME)) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); - } } diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index d15a98c758b8..74a94a125f0d 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -327,7 +327,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) } if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index e996e08f1061..bc6841867b51 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -313,8 +313,6 @@ asmlinkage __visible void do_notify_resume(struct pt_regs *regs, if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - } } diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index b295090e2ce6..9e900a8977bd 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -535,7 +535,6 @@ void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index 4fe3f00137bc..1add47fd31f6 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -502,8 +502,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int save_r0, if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs, save_r0); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - } } diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index d0e0025ee3ba..741d0701003a 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -523,10 +523,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, { if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs, orig_i0); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - } } asmlinkage int do_sys_sigstack(struct sigstack __user *ssptr, diff --git 
a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 255264bcb46a..f7ef7edcd5c1 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -551,10 +551,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long uprobe_notify_resume(regs); if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs, orig_i0); - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (thread_info_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - } user_enter(); } diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 26b5e243d3fc..3bed09538dd9 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -101,7 +101,7 @@ void interrupt_end(void) schedule(); if (test_thread_flag(TIF_SIGPENDING)) do_signal(regs); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) + if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); } diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index b3b17d6c50f0..1fb1047f905c 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -501,6 +501,6 @@ void do_notify_resume(struct pt_regs *regs) if (test_thread_flag(TIF_SIGPENDING)) do_signal(regs); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) + if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); } diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 36fb3bbed6b2..b480e1a07ed8 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -178,9 +178,9 @@ static inline void set_notify_resume(struct task_struct *task) */ static inline void tracehook_notify_resume(struct pt_regs *regs) { + clear_thread_flag(TIF_NOTIFY_RESUME); /* - * The caller just cleared TIF_NOTIFY_RESUME. This barrier - * pairs with task_work_add()->set_notify_resume() after + * This barrier pairs with task_work_add()->set_notify_resume() after * hlist_add_head(task->task_works); */ smp_mb__after_atomic(); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 145ab11b8318..971ef788b9ae 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -161,7 +161,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, arch_do_signal(regs); if (ti_work & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index eb1a8a4c867c..b6678a5e3cf6 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -16,10 +16,8 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ti_work & _TIF_NEED_RESCHED) schedule(); - if (ti_work & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (ti_work & _TIF_NOTIFY_RESUME) tracehook_notify_resume(NULL); - } ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); if (ret) From 91989c707884ecc7cd537281ab1a4b8fb7219da3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 16 Oct 2020 09:02:26 -0600 Subject: [PATCH 344/497] task_work: cleanup notification modes A previous commit changed the notification mode from true/false to an int, allowing notify-no, notify-yes, or signal-notify. This was backwards compatible in the sense that any existing true/false user would translate to either 0 (on notification sent) or 1, the latter which mapped to TWA_RESUME. TWA_SIGNAL was assigned a value of 2. Clean this up properly, and define a proper enum for the notification mode. Now we have: - TWA_NONE. 
This is 0, same as before the original change, meaning no notification requested. - TWA_RESUME. This is 1, same as before the original change, meaning that we use TIF_NOTIFY_RESUME. - TWA_SIGNAL. This uses TIF_SIGPENDING/JOBCTL_TASK_WORK for the notification. Clean up all the callers, switching their 0/1/false/true to using the appropriate TWA_* mode for notifications. Fixes: e91b48162332 ("task_work: teach task_work_add() to do signal_wake_up()") Reviewed-by: Thomas Gleixner Signed-off-by: Jens Axboe --- arch/x86/kernel/cpu/mce/core.c | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- drivers/acpi/apei/ghes.c | 2 +- drivers/android/binder.c | 2 +- fs/file_table.c | 2 +- fs/io_uring.c | 13 +++++------ fs/namespace.c | 2 +- include/linux/task_work.h | 11 +++++++--- kernel/events/uprobes.c | 2 +- kernel/irq/manage.c | 2 +- kernel/sched/fair.c | 2 +- kernel/task_work.c | 30 +++++++++++++++++--------- security/keys/keyctl.c | 2 +- security/yama/yama_lsm.c | 2 +- 14 files changed, 46 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 1c08cb9eb9f6..4102b866e7c0 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1277,7 +1277,7 @@ static void queue_task_work(struct mce *m, int kill_it) else current->mce_kill_me.func = kill_me_maybe; - task_work_add(current, ¤t->mce_kill_me, true); + task_work_add(current, ¤t->mce_kill_me, TWA_RESUME); } /* diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index b494187632b2..af323e2e3100 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -561,7 +561,7 @@ static int __rdtgroup_move_task(struct task_struct *tsk, * callback has been invoked. */ atomic_inc(&rdtgrp->waitcount); - ret = task_work_add(tsk, &callback->work, true); + ret = task_work_add(tsk, &callback->work, TWA_RESUME); if (ret) { /* * Task is exiting. Drop the refcount and free the callback. 
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 81bf71b10d44..8360f8d6be65 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -879,7 +879,7 @@ static void ghes_proc_in_irq(struct irq_work *irq_work) estatus_node->task_work.func = ghes_kick_task_work; estatus_node->task_work_cpu = smp_processor_id(); ret = task_work_add(current, &estatus_node->task_work, - true); + TWA_RESUME); if (ret) estatus_node->task_work.func = NULL; } diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 4b9476521da6..b5117576792b 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2229,7 +2229,7 @@ static void binder_deferred_fd_close(int fd) __close_fd_get_file(fd, &twcb->file); if (twcb->file) { filp_close(twcb->file, current->files); - task_work_add(current, &twcb->twork, true); + task_work_add(current, &twcb->twork, TWA_RESUME); } else { kfree(twcb); } diff --git a/fs/file_table.c b/fs/file_table.c index 656647f9575a..709ada3151da 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -339,7 +339,7 @@ void fput_many(struct file *file, unsigned int refs) if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_u.fu_rcuhead, ____fput); - if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) + if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME)) return; /* * After this task has run exit_task_work(), diff --git a/fs/io_uring.c b/fs/io_uring.c index e1726f457461..6b502885684a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1976,7 +1976,8 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; - int ret, notify; + enum task_work_notify_mode notify; + int ret; if (tsk->flags & PF_EXITING) return -ESRCH; @@ -1987,7 +1988,7 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) * processing task_work. There's no reliable way to tell if TWA_RESUME * will do the job. 
*/ - notify = 0; + notify = TWA_NONE; if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok) notify = TWA_SIGNAL; @@ -2056,7 +2057,7 @@ static void io_req_task_queue(struct io_kiocb *req) init_task_work(&req->task_work, io_req_task_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); + task_work_add(tsk, &req->task_work, TWA_NONE); wake_up_process(tsk); } } @@ -2177,7 +2178,7 @@ static void io_free_req_deferred(struct io_kiocb *req) struct task_struct *tsk; tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); + task_work_add(tsk, &req->task_work, TWA_NONE); wake_up_process(tsk); } } @@ -3291,7 +3292,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* queue just for cancelation */ init_task_work(&req->task_work, io_req_task_cancel); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); + task_work_add(tsk, &req->task_work, TWA_NONE); wake_up_process(tsk); } return 1; @@ -4857,7 +4858,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, 0); + task_work_add(tsk, &req->task_work, TWA_NONE); wake_up_process(tsk); } return 1; diff --git a/fs/namespace.c b/fs/namespace.c index 294e05a13d17..1a75336668a3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1191,7 +1191,7 @@ static void mntput_no_expire(struct mount *mnt) struct task_struct *task = current; if (likely(!(task->flags & PF_KTHREAD))) { init_task_work(&mnt->mnt_rcu, __cleanup_mnt); - if (!task_work_add(task, &mnt->mnt_rcu, true)) + if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME)) return; } if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 0fb93aafa478..0d848a1e9e62 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -13,9 +13,14 @@ init_task_work(struct callback_head *twork, task_work_func_t func) twork->func = func; } -#define TWA_RESUME 1 -#define TWA_SIGNAL 2 -int task_work_add(struct task_struct *task, struct callback_head *twork, int); +enum task_work_notify_mode { + TWA_NONE, + TWA_RESUME, + TWA_SIGNAL, +}; + +int task_work_add(struct task_struct *task, struct callback_head *twork, + enum task_work_notify_mode mode); struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t); void task_work_run(void); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0e18aaf23a7b..00b0358739ab 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1823,7 +1823,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) t->utask->dup_xol_addr = area->vaddr; init_task_work(&t->utask->dup_xol_work, dup_xol_work); - task_work_add(t, &t->utask->dup_xol_work, true); + task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME); } /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5df903fccb60..c460e0496006 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1162,7 +1162,7 @@ static int irq_thread(void *data) handler_fn = irq_thread_fn; init_task_work(&on_exit_work, irq_thread_dtor); - task_work_add(current, &on_exit_work, false); + task_work_add(current, &on_exit_work, TWA_NONE); irq_thread_check_affinity(desc, action); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa4c6227cd6d..e17012be4d14 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2928,7 +2928,7 @@ static void 
task_tick_numa(struct rq *rq, struct task_struct *curr) curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) - task_work_add(curr, work, true); + task_work_add(curr, work, TWA_RESUME); } } diff --git a/kernel/task_work.c b/kernel/task_work.c index 613b2d634af8..8d6e1217c451 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -9,23 +9,28 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback * @work: the callback to run - * @notify: send the notification if true + * @notify: how to notify the targeted task * - * Queue @work for task_work_run() below and notify the @task if @notify. - * Fails if the @task is exiting/exited and thus it can't process this @work. - * Otherwise @work->func() will be called when the @task returns from kernel - * mode or exits. + * Queue @work for task_work_run() below and notify the @task if @notify + * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the + * it will interrupt the targeted task and run the task_work. @TWA_RESUME + * work is run only when the task exits the kernel and returns to user mode, + * or before entering guest mode. Fails if the @task is exiting/exited and thus + * it can't process this @work. Otherwise @work->func() will be called when the + * @task goes through one of the aforementioned transitions, or exits. * - * This is like the signal handler which runs in kernel mode, but it doesn't - * try to wake up the @task. + * If the targeted task is exiting, then an error is returned and the work item + * is not queued. It's up to the caller to arrange for an alternative mechanism + * in that case. * - * Note: there is no ordering guarantee on works queued here. + * Note: there is no ordering guarantee on works queued here. The task_work + * list is LIFO. * * RETURNS: * 0 if succeeds or -ESRCH. 
*/ -int -task_work_add(struct task_struct *task, struct callback_head *work, int notify) +int task_work_add(struct task_struct *task, struct callback_head *work, + enum task_work_notify_mode notify) { struct callback_head *head; unsigned long flags; @@ -38,6 +43,8 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) } while (cmpxchg(&task->task_works, head, work) != head); switch (notify) { + case TWA_NONE: + break; case TWA_RESUME: set_notify_resume(task); break; @@ -54,6 +61,9 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) unlock_task_sighand(task, &flags); } break; + default: + WARN_ON_ONCE(1); + break; } return 0; diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index e26bbccda7cc..61a614c21b9b 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1693,7 +1693,7 @@ long keyctl_session_to_parent(void) /* the replacement session keyring is applied just prior to userspace * restarting */ - ret = task_work_add(parent, newwork, true); + ret = task_work_add(parent, newwork, TWA_RESUME); if (!ret) newwork = NULL; unlock: diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 536c99646f6a..06e226166aab 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -99,7 +99,7 @@ static void report_access(const char *access, struct task_struct *target, info->access = access; info->target = target; info->agent = agent; - if (task_work_add(current, &info->work, true) == 0) + if (task_work_add(current, &info->work, TWA_RESUME) == 0) return; /* success */ WARN(1, "report_access called from exiting task"); From 5368512abe08a28525d9b24abbfc2a72493e8dba Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Sun, 18 Oct 2020 22:57:41 -0500 Subject: [PATCH 346/497] acpi-cpufreq: Honor _PSD table setting on new AMD CPUs acpi-cpufreq has an old quirk that overrides the _PSD table supplied by BIOS on AMD CPUs. However, the _PSD table of new AMD CPUs (Family 19h+) now accurately reports the P-state dependency of CPU cores.
Hence this quirk needs to be fixed in order to support new CPUs' frequency control. Fixes: acd316248205 ("acpi-cpufreq: Add quirk to disable _PSD usage on all AMD CPUs") Signed-off-by: Wei Huang [ rjw: Subject edit ] Cc: 3.10+ # 3.10+ Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/acpi-cpufreq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index e4ff681faaaa..1e4fbb002a31 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -691,7 +691,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) cpumask_copy(policy->cpus, topology_core_cpumask(cpu)); } - if (check_amd_hwpstate_cpu(cpu) && !acpi_pstate_strict) { + if (check_amd_hwpstate_cpu(cpu) && boot_cpu_data.x86 < 0x19 && + !acpi_pstate_strict) { cpumask_clear(policy->cpus); cpumask_set_cpu(cpu, policy->cpus); cpumask_copy(data->freqdomain_cpus, From 0070ea29623904224c0f5fa279a16a4ac9223295 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 16 Oct 2020 11:17:22 -0700 Subject: [PATCH 347/497] cpufreq: schedutil: restore cached freq when next_f is not changed We have the raw cached freq to reduce the chance in calling cpufreq driver where it could be costly in some arch/SoC. Currently, the raw cached freq is reset in sugov_update_single() when it avoids frequency reduction (which is not desirable sometimes), but it is better to restore the previous value of it in that case, because it may not change in the next cycle and it is not necessary to change the CPU frequency then. Adapted from https://android-review.googlesource.com/1352810/ Signed-off-by: Wei Wang Acked-by: Viresh Kumar [ rjw: Subject edit and changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 5ae7b4e6e8d6..e254745a82cb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -441,6 +441,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned long util, max; unsigned int next_f; bool busy; + unsigned int cached_freq = sg_policy->cached_raw_freq; sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -464,8 +465,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (busy && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; - /* Reset cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = 0; + /* Restore cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = cached_freq; } /* From f8fee6e63e55a7fc0e53a460ae3523d9e4d9bd48 Mon Sep 17 00:00:00 2001 From: Hubert Jasudowicz Date: Sun, 18 Oct 2020 17:21:06 +0200 Subject: [PATCH 348/497] powercap: Fix typo in Kconfig "Plance" -> "Plane" Signed-off-by: Hubert Jasudowicz Signed-off-by: Rafael J. Wysocki --- drivers/powercap/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig index ebc4d4578339..bc228725346b 100644 --- a/drivers/powercap/Kconfig +++ b/drivers/powercap/Kconfig @@ -30,7 +30,7 @@ config INTEL_RAPL In RAPL, the platform level settings are divided into domains for fine grained control. These domains include processor package, DRAM - controller, CPU core (Power Plance 0), graphics uncore (Power Plane + controller, CPU core (Power Plane 0), graphics uncore (Power Plane 1), etc. 
config IDLE_INJECT From eda4a7bf5d75b8b579c54622f2795696a02883b9 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Sun, 18 Oct 2020 16:54:01 -0400 Subject: [PATCH 349/497] docs: fb: Add font_6x8 to available built-in fonts Recently we added a new 6x8 font in commit e2028c8e6bf9 ("lib/fonts: add font 6x8 for OLED display"). Add its name to the "compiled-in fonts" list. Signed-off-by: Peilin Ye Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20201018205401.698242-1-yepeilin.cs@gmail.com --- Documentation/fb/fbcon.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/fb/fbcon.rst b/Documentation/fb/fbcon.rst index a7b487cba307..71cade19d92c 100644 --- a/Documentation/fb/fbcon.rst +++ b/Documentation/fb/fbcon.rst @@ -81,7 +81,7 @@ C. Boot options 1. fbcon=font: Select the initial font to use. The value 'name' can be any of the - compiled-in fonts: 10x18, 6x10, 7x14, Acorn8x8, MINI4x6, + compiled-in fonts: 10x18, 6x10, 6x8, 7x14, Acorn8x8, MINI4x6, PEARL8x8, ProFont6x11, SUN12x22, SUN8x16, TER16x32, VGA8x16, VGA8x8. Note, not all drivers can handle font with widths not divisible by 8, From 272d70895113ef00c03ab325787d159ee51718c8 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Sun, 18 Oct 2020 14:12:04 -0400 Subject: [PATCH 350/497] Fonts: Support FONT_EXTRA_WORDS macros for font_6x8 Recently, in commit 6735b4632def ("Fonts: Support FONT_EXTRA_WORDS macros for built-in fonts"), we wrapped each of our built-in data buffers in a `font_data` structure, in order to use the following macros on them, see include/linux/font.h: #define REFCOUNT(fd) (((int *)(fd))[-1]) #define FNTSIZE(fd) (((int *)(fd))[-2]) #define FNTCHARCNT(fd) (((int *)(fd))[-3]) #define FNTSUM(fd) (((int *)(fd))[-4]) #define FONT_EXTRA_WORDS 4 Do the same thing to our new 6x8 font. For built-in fonts, currently we only use FNTSIZE(). Since this is only a temporary solution for an out-of-bounds issue in the framebuffer layer (see commit 5af08640795b ("fbcon: Fix global-out-of-bounds read in fbcon_get_font()")), all the three other fields are intentionally set to zero in order to discourage using these negative-indexing macros. Signed-off-by: Peilin Ye Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/926453876c92caac34cba8545716a491754d04d5.1603037079.git.yepeilin.cs@gmail.com --- lib/fonts/font_6x8.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/fonts/font_6x8.c b/lib/fonts/font_6x8.c index e06447788418..700039a9ceae 100644 --- a/lib/fonts/font_6x8.c +++ b/lib/fonts/font_6x8.c @@ -3,8 +3,8 @@ #define FONTDATAMAX 2048 -static const unsigned char fontdata_6x8[FONTDATAMAX] = { - +static struct font_data fontdata_6x8 = { + { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 000000 */ 0x00, /* 000000 */ @@ -2564,13 +2564,13 @@ static const unsigned char fontdata_6x8[FONTDATAMAX] = { 0x00, /* 000000 */ 0x00, /* 000000 */ 0x00, /* 000000 */ -}; +} }; const struct font_desc font_6x8 = { .idx = FONT6x8_IDX, .name = "6x8", .width = 6, .height = 8, - .data = fontdata_6x8, + .data = fontdata_6x8.data, .pref = 0, }; From 628ade2d0816b2675ab61ba6aadfc9a94e3e1589 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 5 Oct 2020 12:55:31 -0700 Subject: [PATCH 351/497] KVM: VMX: Fix x2APIC MSR intercept handling on !APICV platforms Fix an inverted flag for intercepting x2APIC MSRs and intercept writes by default, even when APICV is enabled. 
Fixes: 3eb900173c71 ("KVM: x86: VMX: Prevent MSR passthrough when MSR access is denied") Co-developed-by: Peter Xu [sean: added changelog] Signed-off-by: Sean Christopherson Message-Id: <20201005195532.8674-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f7b6677e2a51..4797ec92c88c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3787,9 +3787,10 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) int msr; for (msr = 0x800; msr <= 0x8ff; msr++) { - bool intercepted = !!(mode & MSR_BITMAP_MODE_X2APIC_APICV); + bool apicv = !!(mode & MSR_BITMAP_MODE_X2APIC_APICV); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_RW, intercepted); + vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, !apicv); + vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, true); } if (mode & MSR_BITMAP_MODE_X2APIC) { From 354842df3888d63dd0371358189cafde267b4a72 Mon Sep 17 00:00:00 2001 From: Sean Paul Date: Thu, 17 Sep 2020 20:28:42 -0400 Subject: [PATCH 352/497] drm/i915/dp: Tweak initial dpcd backlight.enabled value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 79946723092b ("drm/i915: Assume 100% brightness when not in DPCD control mode"), we fixed the brightness level when DPCD control was not active to max brightness. This is as good as we can guess since most backlights go on full when uncontrolled. However in doing so we changed the semantics of the initial 'backlight.enabled' value. At least on Pixelbooks, they were relying on the brightness level in DP_EDP_BACKLIGHT_BRIGHTNESS_MSB to be 0 on boot such that enabled would be false. This causes the device to be enabled when the brightness is set. Without this, brightness control doesn't work. So by changing brightness to max, we also flipped enabled to be true on boot. To fix this, make enabled a function of brightness and backlight control mechanism. 
Fixes: 79946723092b ("drm/i915: Assume 100% brightness when not in DPCD control mode") Cc: Lyude Paul Cc: Jani Nikula Cc: Juha-Pekka Heikkila Cc: "Ville Syrjälä" Cc: Rodrigo Vivi Cc: Kevin Chowski > Signed-off-by: Sean Paul Reviewed-by: Lyude Paul Signed-off-by: Lyude Paul Link: https://patchwork.freedesktop.org/patch/msgid/20200918002845.32766-1-sean@poorly.run (cherry picked from commit 4ade8f31c25bef7ce7ed4d7cbac17df7c4bad850) Signed-off-by: Rodrigo Vivi --- .../drm/i915/display/intel_dp_aux_backlight.c | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c index acbd7eb66cbe..036f504ac7db 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c +++ b/drivers/gpu/drm/i915/display/intel_dp_aux_backlight.c @@ -52,6 +52,25 @@ static void set_aux_backlight_enable(struct intel_dp *intel_dp, bool enable) } } +static bool intel_dp_aux_backlight_dpcd_mode(struct intel_connector *connector) +{ + struct intel_dp *intel_dp = intel_attached_dp(connector); + struct drm_i915_private *i915 = dp_to_i915(intel_dp); + u8 mode_reg; + + if (drm_dp_dpcd_readb(&intel_dp->aux, + DP_EDP_BACKLIGHT_MODE_SET_REGISTER, + &mode_reg) != 1) { + drm_dbg_kms(&i915->drm, + "Failed to read the DPCD register 0x%x\n", + DP_EDP_BACKLIGHT_MODE_SET_REGISTER); + return false; + } + + return (mode_reg & DP_EDP_BACKLIGHT_CONTROL_MODE_MASK) == + DP_EDP_BACKLIGHT_CONTROL_MODE_DPCD; +} + /* * Read the current backlight value from DPCD register(s) based * on if 8-bit(MSB) or 16-bit(MSB and LSB) values are supported @@ -61,24 +80,13 @@ static u32 intel_dp_aux_get_backlight(struct intel_connector *connector) struct intel_dp *intel_dp = intel_attached_dp(connector); struct drm_i915_private *i915 = dp_to_i915(intel_dp); u8 read_val[2] = { 0x0 }; - u8 mode_reg; u16 level = 0; - if (drm_dp_dpcd_readb(&intel_dp->aux, - DP_EDP_BACKLIGHT_MODE_SET_REGISTER, - &mode_reg) != 1) { - drm_dbg_kms(&i915->drm, - "Failed to read the DPCD register 0x%x\n", - DP_EDP_BACKLIGHT_MODE_SET_REGISTER); - return 0; - } - /* * If we're not in DPCD control mode yet, the programmed brightness * value is meaningless and we should assume max brightness */ - if ((mode_reg & DP_EDP_BACKLIGHT_CONTROL_MODE_MASK) != - DP_EDP_BACKLIGHT_CONTROL_MODE_DPCD) + if (!intel_dp_aux_backlight_dpcd_mode(connector)) return connector->panel.backlight.max; if (drm_dp_dpcd_read(&intel_dp->aux, DP_EDP_BACKLIGHT_BRIGHTNESS_MSB, @@ -319,7 +327,8 @@ static int intel_dp_aux_setup_backlight(struct intel_connector *connector, panel->backlight.min = 0; panel->backlight.level = intel_dp_aux_get_backlight(connector); - panel->backlight.enabled = panel->backlight.level != 0; + panel->backlight.enabled = intel_dp_aux_backlight_dpcd_mode(connector) && + panel->backlight.level != 0; return 0; } From 849c0fe9e831dcebea1b46e2237e13f274a8756a Mon Sep 17 00:00:00 2001 From: Ayaz A Siddiqui Date: Wed, 29 Jul 2020 15:55:39 +0530 Subject: [PATCH 353/497] drm/i915/gt: Initialize reserved and unspecified MOCS indices In order to avoid functional breakage of mis-programmed applications that have grown to depend on unused MOCS entries, we are programming those entries to be equal to fully cached ("L3 + LLC") entry. These reserved and unspecified entries should not be used as they may be changed to less performant variants with better coherency in the future if more entries are needed. 
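In other words (illustration only, using the macros from the hunk below): only index 1, I915_MOCS_PTE, is defined explicitly with the fully cached setting, and __init_mocs_table() programs every index the table leaves undefined with that same entry.

	/* index 1 == I915_MOCS_PTE; unused indices inherit this entry */
	MOCS_ENTRY(1, LE_3_WB | LE_TC_1_LLC | LE_LRUM(3),
		   L3_3_WB),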
v2: As suggested by Lucas De Marchi to utilise __init_mocs_table for programming default value, setting I915_MOCS_PTE index of tgl_mocs_table with desired value. Cc: Chris Wilson Cc: Lucas De Marchi Cc: Tomasz Lis Cc: Matt Roper Cc: Joonas Lahtinen Cc: Francisco Jerez Cc: Mathew Alwin Cc: Mcguire Russell W Cc: Spruit Neil R Cc: Zhou Cheng Cc: Benemelis Mike G Signed-off-by: Ayaz A Siddiqui Reviewed-by: Lucas De Marchi Acked-by: Joonas Lahtinen Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20200729102539.134731-2-ayaz.siddiqui@intel.com Cc: stable@vger.kernel.org (cherry picked from commit 4d8a5cfe3b131f60903949f998c5961cc922e0b0) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_mocs.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c index 632e08a4592b..b8f56e62158e 100644 --- a/drivers/gpu/drm/i915/gt/intel_mocs.c +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -234,11 +234,17 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = { L3_1_UC) static const struct drm_i915_mocs_entry tgl_mocs_table[] = { - /* Base - Error (Reserved for Non-Use) */ - MOCS_ENTRY(0, 0x0, 0x0), - /* Base - Reserved */ - MOCS_ENTRY(1, 0x0, 0x0), - + /* + * NOTE: + * Reserved and unspecified MOCS indices have been set to (L3 + LCC). + * These reserved entries should never be used, they may be changed + * to low performant variants with better coherency in the future if + * more entries are needed. We are programming index I915_MOCS_PTE(1) + * only, __init_mocs_table() take care to program unused index with + * this entry. + */ + MOCS_ENTRY(1, LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_3_WB), GEN11_MOCS_ENTRIES, /* Implicitly enable L1 - HDC:L1 + L3 + LLC */ From 1664ffee760a5d98952318fdd9b198fae396d660 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 15 Oct 2020 13:21:35 +0100 Subject: [PATCH 354/497] drm/i915: Mark ininitial fb obj as WT on eLLC machines to avoid rcu lockup during fbdev init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we leave the cache_level of the initial fb obj set to NONE. This means on eLLC machines the first pin_to_display() will try to switch it to WT which requires a vma unbind+bind. If that happens during the fbdev initialization rcu does not seem operational which causes the unbind to get stuck. To most appearances this looks like a dead machine on boot. Avoid the unbind by already marking the object cache_level as WT when creating it. We still do an excplicit ggtt pin which will rewrite the PTEs anyway, so they will match whatever cache level we set. 
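A minimal sketch of the idea (the identical call is added in the hunk below): pick the cache level when the object is created, so the later pin_to_display() finds nothing to switch and no unbind is needed.

	/* WT on eLLC machines, NONE otherwise, set before the first pin */
	i915_gem_object_set_cache_coherency(obj, HAS_WT(i915) ?
					    I915_CACHE_WT : I915_CACHE_NONE);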
Cc: # v5.7+ Suggested-by: Chris Wilson Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2381 Signed-off-by: Ville Syrjälä Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201007120329.17076-1-ville.syrjala@linux.intel.com Link: https://patchwork.freedesktop.org/patch/msgid/20201015122138.30161-1-chris@chris-wilson.co.uk (cherry picked from commit d46b60a2e8d246f1f0faa38e52f4f5a73858c338) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/intel_display.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index fac4657a286c..d9494fd7c305 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -3434,6 +3434,14 @@ initial_plane_vma(struct drm_i915_private *i915, if (IS_ERR(obj)) return NULL; + /* + * Mark it WT ahead of time to avoid changing the + * cache_level during fbdev initialization. The + * unbind there would get stuck waiting for rcu. + */ + i915_gem_object_set_cache_coherency(obj, HAS_WT(i915) ? + I915_CACHE_WT : I915_CACHE_NONE); + switch (plane_config->tiling) { case I915_TILING_NONE: break; From d5e8782129c22036425f29f9b6a254895482d7bd Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 15 Oct 2020 12:59:54 +0100 Subject: [PATCH 355/497] drm/i915/gem: Support parsing of oversize batches Matthew Auld noted that on more recent systems (such as the parser for gen9) we may have objects that are larger than expected by the GEM uAPI (i.e. greater than u32). These objects would have incorrect implicit batch lengths, causing the parser to reject them for being incomplete, or worse. Based on a patch by Matthew Auld. Reported-by: Matthew Auld Fixes: 435e8fc059db ("drm/i915: Allow parsing of unsized batches") Testcase: igt/gem_exec_params/larger-than-life-batch Signed-off-by: Chris Wilson Cc: Matthew Auld Cc: Mika Kuoppala Cc: Jon Bloomfield Reviewed-by: Matthew Auld Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20201015115954.871-1-chris@chris-wilson.co.uk (cherry picked from commit 57b2d834bf235daab388c3ba12d035c820ae09c6) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 4b09bcd70cf4..1904e6e5ea64 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -287,8 +287,8 @@ struct i915_execbuffer { u64 invalid_flags; /** Set of execobj.flags that are invalid */ u32 context_flags; /** Set of execobj.flags to insert from the ctx */ + u64 batch_len; /** Length of batch within object */ u32 batch_start_offset; /** Location within object of batch */ - u32 batch_len; /** Length of batch within object */ u32 batch_flags; /** Flags composed for emit_bb_start() */ struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */ @@ -871,6 +871,10 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) if (eb->batch_len == 0) eb->batch_len = eb->batch->vma->size - eb->batch_start_offset; + if (unlikely(eb->batch_len == 0)) { /* impossible! 
*/ + drm_dbg(&i915->drm, "Invalid batch length\n"); + return -EINVAL; + } return 0; @@ -2424,7 +2428,7 @@ static int eb_parse(struct i915_execbuffer *eb) struct drm_i915_private *i915 = eb->i915; struct intel_gt_buffer_pool_node *pool = eb->batch_pool; struct i915_vma *shadow, *trampoline, *batch; - unsigned int len; + unsigned long len; int err; if (!eb_use_cmdparser(eb)) { @@ -2449,6 +2453,8 @@ static int eb_parse(struct i915_execbuffer *eb) } else { len += I915_CMD_PARSER_TRAMPOLINE_SIZE; } + if (unlikely(len < eb->batch_len)) /* last paranoid check of overflow */ + return -EINVAL; if (!pool) { pool = intel_gt_get_buffer_pool(eb->engine->gt, len); From 9b99e5ba3e5d68039bd6b657e4bbe520a3521f4c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 15 Oct 2020 20:50:23 +0100 Subject: [PATCH 356/497] drm/i915/gt: Delay execlist processing for tgl When running gem_exec_nop, it floods the system with many requests (with the goal of userspace submitting faster than the HW can process a single empty batch). This causes the driver to continually resubmit new requests onto the end of an active context, a flood of lite-restore preemptions. If we time this just right, Tigerlake hangs. Inserting a small delay between the processing of CS events and submitting the next context, prevents the hang. Naturally it does not occur with debugging enabled. The suspicion then is that this is related to the issues with the CS event buffer, and inserting an mmio read of the CS pointer status appears to be very successful in preventing the hang. Other registers, or uncached reads, or plain mb, do not prevent the hang, suggesting that register is key -- but that the hang can be prevented by a simple udelay, suggests it is just a timing issue like that encountered by commit 233c1ae3c83f ("drm/i915/gt: Wait for CSB entries on Tigerlake"). Also note that the hang is not prevented by applying CTX_DESC_FORCE_RESTORE, or by inserting a delay on the GPU between requests. Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Bruce Chang Cc: Joonas Lahtinen Cc: stable@vger.kernel.org Acked-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201015195023.32346-1-chris@chris-wilson.co.uk (cherry picked from commit 6ca7217dffaf1abba91558e67a2efb655ac91405) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_lrc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 0412a44f25f2..bff237a8843a 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2649,6 +2649,9 @@ static void process_csb(struct intel_engine_cs *engine) smp_wmb(); /* complete the seqlock */ WRITE_ONCE(execlists->active, execlists->inflight); + /* XXX Magic delay for tgl */ + ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); + WRITE_ONCE(execlists->pending[0], NULL); } else { if (GEM_WARN_ON(!*execlists->active)) { From 64402570e12f7b63ab33fc4640d3709c9ce2b380 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 2 Oct 2020 09:34:25 +0100 Subject: [PATCH 357/497] drm/i915/gt: Undo forced context restores after trivial preemptions We may try to preempt the currently executing request, only to find that after unravelling all the dependencies that the original executing context is still the earliest in the topological sort and re-submitted back to HW (if we do detect some change in the ELSP that requires re-submission). 
However, due to the way we check for wrap-around during the unravelling, we mark any context that has been submitted just once (i.e. with the rq->wa_tail set, but the ring->tail earlier) as potentially wrapping and requiring a forced restore on resubmission. This was expected to be not a problem, as it was anticipated that most unwinding for preemption would result in a context switch and the few that did not would be lost in the noise. It did not take long for someone to find one particular workload where the cost of those extra context restores was measurable. However, since we know the wa_tail is of fixed size, and we know that a request must be larger than the wa_tail itself, we can safely maintain the check for request wrapping and check against a slightly future point in the ring that includes an expected wa_tail. (That is if the ring->tail is already set to rq->wa_tail, including another 8 bytes in the check does not invalidate the incremental wrap detection.) Fixes: 8ab3a3812aa9 ("drm/i915/gt: Incrementally check for rewinding") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Bruce Chang Cc: Ramalingam C Cc: Joonas Lahtinen Cc: # v5.4+ Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201002083425.4605-1-chris@chris-wilson.co.uk (cherry picked from commit bb65548e3c6e299175a9e8c3e24b2b9577656a5d) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_lrc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index bff237a8843a..341ff8e3af4c 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1140,9 +1140,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine) /* Check in case we rollback so far we wrap [size/2] */ if (intel_ring_direction(rq->ring, - intel_ring_wrap(rq->ring, - rq->tail), - rq->ring->tail) > 0) + rq->tail, + rq->ring->tail + 8) > 0) rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE; active = rq; From db9bc2d35f49fed248296d3216597b078c0bab37 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 16 Oct 2020 10:25:27 +0100 Subject: [PATCH 358/497] drm/i915: Use the active reference on the vma while capturing During error capture, we need to take a reference to the vma from before the reset in order to catpure the contents of the vma later. Currently we are using both an active reference and a kref, but due to nature of the i915_vma reference handling, that kref is on the vma->obj and not the vma itself. This means the vma may be destroyed as soon as it is idle, that is in between the i915_active_release(&vma->active) and the i915_vma_put(vma): <3> [197.866181] BUG: KASAN: use-after-free in intel_engine_coredump_add_vma+0x36c/0x4a0 [i915] <3> [197.866339] Read of size 8 at addr ffff8881258cb800 by task gem_exec_captur/1041 <3> [197.866467] <4> [197.866512] CPU: 2 PID: 1041 Comm: gem_exec_captur Not tainted 5.9.0-g5e4234f97efba-kasan_200+ #1 <4> [197.866521] Hardware name: Intel Corp. Broxton P/Apollolake RVP1A, BIOS APLKRVPA.X64.0150.B11.1608081044 08/08/2016 <4> [197.866530] Call Trace: <4> [197.866549] dump_stack+0x99/0xd0 <4> [197.866760] ? intel_engine_coredump_add_vma+0x36c/0x4a0 [i915] <4> [197.866783] print_address_description.constprop.8+0x3e/0x60 <4> [197.866797] ? kmsg_dump_rewind_nolock+0xd4/0xd4 <4> [197.866819] ? lockdep_hardirqs_off+0xd4/0x120 <4> [197.867037] ? intel_engine_coredump_add_vma+0x36c/0x4a0 [i915] <4> [197.867249] ? 
intel_engine_coredump_add_vma+0x36c/0x4a0 [i915] <4> [197.867270] kasan_report.cold.10+0x1f/0x37 <4> [197.867492] ? intel_engine_coredump_add_vma+0x36c/0x4a0 [i915] <4> [197.867710] intel_engine_coredump_add_vma+0x36c/0x4a0 [i915] <4> [197.867949] i915_gpu_coredump.part.29+0x150/0x7b0 [i915] <4> [197.868186] i915_capture_error_state+0x5e/0xc0 [i915] <4> [197.868396] intel_gt_handle_error+0x6eb/0xa20 [i915] <4> [197.868624] ? intel_gt_reset_global+0x370/0x370 [i915] <4> [197.868644] ? check_flags+0x50/0x50 <4> [197.868662] ? __lock_acquire+0xd59/0x6b00 <4> [197.868678] ? register_lock_class+0x1ad0/0x1ad0 <4> [197.868944] i915_wedged_set+0xcf/0x1b0 [i915] <4> [197.869147] ? i915_wedged_get+0x90/0x90 [i915] <4> [197.869371] ? i915_wedged_get+0x90/0x90 [i915] <4> [197.869398] simple_attr_write+0x153/0x1c0 <4> [197.869428] full_proxy_write+0xee/0x180 <4> [197.869442] ? __sb_start_write+0x1f3/0x310 <4> [197.869465] vfs_write+0x1a3/0x640 <4> [197.869492] ksys_write+0xec/0x1c0 <4> [197.869507] ? __ia32_sys_read+0xa0/0xa0 <4> [197.869525] ? lockdep_hardirqs_on_prepare+0x32b/0x4e0 <4> [197.869541] ? syscall_enter_from_user_mode+0x1c/0x50 <4> [197.869566] do_syscall_64+0x33/0x80 <4> [197.869579] entry_SYSCALL_64_after_hwframe+0x44/0xa9 <4> [197.869590] RIP: 0033:0x7fd8b7aee281 <4> [197.869604] Code: c3 0f 1f 84 00 00 00 00 00 48 8b 05 59 8d 20 00 c3 0f 1f 84 00 00 00 00 00 8b 05 8a d1 20 00 85 c0 75 16 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 57 f3 c3 0f 1f 44 00 00 41 54 55 49 89 d4 53 <4> [197.869613] RSP: 002b:00007ffea3b72008 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 <4> [197.869625] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fd8b7aee281 <4> [197.869633] RDX: 0000000000000002 RSI: 00007fd8b81a82e7 RDI: 000000000000000d <4> [197.869641] RBP: 0000000000000002 R08: 0000000000000000 R09: 0000000000000034 <4> [197.869650] R10: 0000000000000000 R11: 0000000000000246 R12: 00007fd8b81a82e7 <4> [197.869658] R13: 000000000000000d R14: 0000000000000000 R15: 0000000000000000 <3> [197.869707] <3> [197.869757] Allocated by task 1041: <4> [197.869833] kasan_save_stack+0x19/0x40 <4> [197.869843] __kasan_kmalloc.constprop.5+0xc1/0xd0 <4> [197.869853] kmem_cache_alloc+0x106/0x8e0 <4> [197.870059] i915_vma_instance+0x212/0x1930 [i915] <4> [197.870270] eb_lookup_vmas+0xe06/0x1d10 [i915] <4> [197.870475] i915_gem_do_execbuffer+0x131d/0x4080 [i915] <4> [197.870682] i915_gem_execbuffer2_ioctl+0x103/0x5d0 [i915] <4> [197.870701] drm_ioctl_kernel+0x1d2/0x270 <4> [197.870710] drm_ioctl+0x40d/0x85c <4> [197.870721] __x64_sys_ioctl+0x10d/0x170 <4> [197.870731] do_syscall_64+0x33/0x80 <4> [197.870740] entry_SYSCALL_64_after_hwframe+0x44/0xa9 <3> [197.870748] <3> [197.870798] Freed by task 22: <4> [197.870865] kasan_save_stack+0x19/0x40 <4> [197.870875] kasan_set_track+0x1c/0x30 <4> [197.870884] kasan_set_free_info+0x1b/0x30 <4> [197.870894] __kasan_slab_free+0x111/0x160 <4> [197.870903] kmem_cache_free+0xcd/0x710 <4> [197.871109] i915_vma_parked+0x618/0x800 [i915] <4> [197.871307] __gt_park+0xdb/0x1e0 [i915] <4> [197.871501] ____intel_wakeref_put_last+0xb1/0x190 [i915] <4> [197.871516] process_one_work+0x8dc/0x15d0 <4> [197.871525] worker_thread+0x82/0xb30 <4> [197.871535] kthread+0x36d/0x440 <4> [197.871545] ret_from_fork+0x22/0x30 <3> [197.871553] <3> [197.871602] The buggy address belongs to the object at ffff8881258cb740 which belongs to the cache i915_vma of size 968 Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2553 Fixes: 2850748ef876 ("drm/i915: Pull i915_vma_pin under the vm->mutex") 
Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Tvrtko Ursulin Cc: Joonas Lahtinen Cc: # v5.5+ Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20201016092527.29039-1-chris@chris-wilson.co.uk (cherry picked from commit 178536b8292ecd118f59d2fac4509c7e70b99854) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_gpu_error.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index a635ec8d0b94..cf6e47adfde6 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1312,7 +1312,7 @@ capture_vma(struct intel_engine_capture_vma *next, } strcpy(c->name, name); - c->vma = i915_vma_get(vma); + c->vma = vma; /* reference held while active */ c->next = next; return c; @@ -1402,7 +1402,6 @@ intel_engine_coredump_add_vma(struct intel_engine_coredump *ee, compress)); i915_active_release(&vma->active); - i915_vma_put(vma); capture = this->next; kfree(this); From ca05277e40216979d9976613322e64db23a850e0 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 14:49:20 +0100 Subject: [PATCH 359/497] drm/i915/gt: Widen CSB pointer to u64 for the parsers A CSB entry is 64b, and it is simpler for us to treat it as an array of 64b entries than as an array of pairs of 32b entries. Signed-off-by: Chris Wilson Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200915134923.30088-1-chris@chris-wilson.co.uk (cherry picked from commit f24a44e52fbc9881fc5f3bcef536831a15a439f3) (cherry picked from commit 3d4dbe0e0f0d04ebcea917b7279586817da8cf46) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_engine_types.h | 2 +- drivers/gpu/drm/i915/gt/intel_lrc.c | 33 ++++++++++---------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index c400aaa2287b..ee6312601c56 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -278,7 +278,7 @@ struct intel_engine_execlists { * * Note these register may be either mmio or HWSP shadow. 
*/ - u32 *csb_status; + u64 *csb_status; /** * @csb_size: context status buffer FIFO size diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 341ff8e3af4c..1a7bbbb16356 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2463,7 +2463,7 @@ cancel_port_requests(struct intel_engine_execlists * const execlists) } static inline void -invalidate_csb_entries(const u32 *first, const u32 *last) +invalidate_csb_entries(const u64 *first, const u64 *last) { clflush((void *)first); clflush((void *)last); @@ -2495,14 +2495,12 @@ invalidate_csb_entries(const u32 *first, const u32 *last) * bits 47-57: sw context id of the lrc the GT switched away from * bits 58-63: sw counter of the lrc the GT switched away from */ -static inline bool -gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) +static inline bool gen12_csb_parse(const u64 *csb) { - u32 lower_dw = csb[0]; - u32 upper_dw = csb[1]; - bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw); - bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw); - bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; + u64 entry = READ_ONCE(*csb); + bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry)); + bool new_queue = + lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; /* * The context switch detail is not guaranteed to be 5 when a preemption @@ -2512,7 +2510,7 @@ gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) * would require some extra handling, but we don't support that. */ if (!ctx_away_valid || new_queue) { - GEM_BUG_ON(!ctx_to_valid); + GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry))); return true; } @@ -2521,12 +2519,11 @@ gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) * context switch on an unsuccessful wait instruction since we always * use polling mode. 
*/ - GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw)); + GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry))); return false; } -static inline bool -gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) +static inline bool gen8_csb_parse(const u64 *csb) { return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); } @@ -2534,7 +2531,7 @@ gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) static void process_csb(struct intel_engine_cs *engine) { struct intel_engine_execlists * const execlists = &engine->execlists; - const u32 * const buf = execlists->csb_status; + const u64 * const buf = execlists->csb_status; const u8 num_entries = execlists->csb_size; u8 head, tail; @@ -2615,12 +2612,14 @@ static void process_csb(struct intel_engine_cs *engine) */ ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", - head, buf[2 * head + 0], buf[2 * head + 1]); + head, + upper_32_bits(buf[head]), + lower_32_bits(buf[head])); if (INTEL_GEN(engine->i915) >= 12) - promote = gen12_csb_parse(execlists, buf + 2 * head); + promote = gen12_csb_parse(buf + head); else - promote = gen8_csb_parse(execlists, buf + 2 * head); + promote = gen8_csb_parse(buf + head); if (promote) { struct i915_request * const *old = execlists->active; @@ -5159,7 +5158,7 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine) } execlists->csb_status = - &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; + (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; execlists->csb_write = &engine->status_page.addr[intel_hws_csb_write_index(i915)]; From 4a9bb58aba6db4eba2a8b3aa1edc415c94a669a8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 14:49:21 +0100 Subject: [PATCH 360/497] drm/i915/gt: Wait for CSB entries on Tigerlake On Tigerlake, we are seeing a repeat of commit d8f505311717 ("drm/i915/icl: Forcibly evict stale csb entries") where, presumably, due to a missing Global Observation Point synchronisation, the write pointer of the CSB ringbuffer is updated _prior_ to the contents of the ringbuffer. That is we see the GPU report more context-switch entries for us to parse, but those entries have not been written, leading us to process stale events, and eventually report a hung GPU. However, this effect appears to be much more severe than we previously saw on Icelake (though it might be best if we try the same approach there as well and measure), and Bruce suggested the good idea of resetting the CSB entry after use so that we can detect when it has been updated by the GPU. 
By instrumenting how long that may be, we can set a reliable upper bound for how long we should wait for: 513 late, avg of 61 retries (590 ns), max of 1061 retries (10099 ns) Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2045 References: d8f505311717 ("drm/i915/icl: Forcibly evict stale csb entries") References: HSDES#22011327657, HSDES#1508287568 Suggested-by: Bruce Chang Signed-off-by: Chris Wilson Cc: Bruce Chang Cc: Mika Kuoppala Cc: stable@vger.kernel.org # v5.4 Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200915134923.30088-2-chris@chris-wilson.co.uk (cherry picked from commit 233c1ae3c83f21046c6c4083da904163ece8f110) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_lrc.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 1a7bbbb16356..a32aabce7901 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2497,9 +2497,22 @@ invalidate_csb_entries(const u64 *first, const u64 *last) */ static inline bool gen12_csb_parse(const u64 *csb) { - u64 entry = READ_ONCE(*csb); - bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry)); - bool new_queue = + bool ctx_away_valid; + bool new_queue; + u64 entry; + + /* HSD#22011248461 */ + entry = READ_ONCE(*csb); + if (unlikely(entry == -1)) { + preempt_disable(); + if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50)) + GEM_WARN_ON("50us CSB timeout"); + preempt_enable(); + } + WRITE_ONCE(*(u64 *)csb, -1); + + ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry)); + new_queue = lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; /* @@ -4006,6 +4019,8 @@ static void reset_csb_pointers(struct intel_engine_cs *engine) WRITE_ONCE(*execlists->csb_write, reset_value); wmb(); /* Make sure this is visible to HW (paranoia?) */ + /* Check that the GPU does indeed update the CSB entries! */ + memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64)); invalidate_csb_entries(&execlists->csb_status[0], &execlists->csb_status[reset_value]); From 511ac89e591ab9affce17a8be4c45f6c2bb837f0 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 Oct 2020 00:14:47 -0500 Subject: [PATCH 361/497] smb3.1.1: print warning if server does not support requested encryption type If server does not support AES-256-GCM and it was required on mount, print warning message. Also log and return a different error message (EOPNOTSUPP) when encryption mechanism is not supported vs the case when an unknown unrequested encryption mechanism could be returned (EINVAL). Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/smb2pdu.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 8cfc3122ae5c..d504bc296349 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -610,8 +610,19 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server, return -EINVAL; } cifs_dbg(FYI, "SMB311 cipher type:%d\n", le16_to_cpu(ctxt->Ciphers[0])); - if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) && - (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM)) { + if (require_gcm_256) { + if (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES256_GCM) { + cifs_dbg(VFS, "Server does not support requested encryption type (AES256 GCM)\n"); + return -EOPNOTSUPP; + } + } else if (ctxt->Ciphers[0] == 0) { + /* e.g. 
if server only supported AES256_CCM (very unlikely) */ + cifs_dbg(VFS, "Server does not support requested encryption types\n"); + return -EOPNOTSUPP; + } else if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) && + (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM) && + (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES256_GCM)) { + /* server returned a cipher we didn't ask for */ pr_warn_once("Invalid SMB3.11 cipher returned\n"); return -EINVAL; } From fd08f2dbf0c2e95f8503e2c79339fe5711f1aa1d Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 Oct 2020 00:25:02 -0500 Subject: [PATCH 362/497] smb3.1.1: rename nonces used for GCM and CCM encryption Now that 256 bit encryption can be negotiated, update names of the nonces to match the updated official protocol documentation (e.g. AES_GCM_NONCE instead of AES_128GCM_NONCE) since they apply to both 128 bit and 256 bit encryption. Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/smb2ops.c | 8 ++++---- fs/cifs/smb2pdu.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 76d82a60a550..dd1edabec328 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3821,9 +3821,9 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len); tr_hdr->Flags = cpu_to_le16(0x01); if (cipher_type == SMB2_ENCRYPTION_AES128_GCM) - get_random_bytes(&tr_hdr->Nonce, SMB3_AES128GCM_NONCE); + get_random_bytes(&tr_hdr->Nonce, SMB3_AES_GCM_NONCE); else - get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CCM_NONCE); + get_random_bytes(&tr_hdr->Nonce, SMB3_AES_CCM_NONCE); memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); } @@ -3993,10 +3993,10 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, } if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) - memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES128GCM_NONCE); + memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE); else { iv[0] = 3; - memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CCM_NONCE); + memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE); } aead_request_set_crypt(req, sg, sg, crypt_len, iv); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 6f65f1cec8ad..05b010e5a061 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -128,8 +128,8 @@ struct smb2_sync_pdu { __le16 StructureSize2; /* size of wct area (varies, request specific) */ } __packed; -#define SMB3_AES128CCM_NONCE 11 -#define SMB3_AES128GCM_NONCE 12 +#define SMB3_AES_CCM_NONCE 11 +#define SMB3_AES_GCM_NONCE 12 /* Transform flags (for 3.0 dialect this flag indicates CCM */ #define TRANSFORM_FLAG_ENCRYPTED 0x0001 From 63ca5656350a9b798a20a8e5bd55be164a5abeb6 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 Oct 2020 23:41:40 -0500 Subject: [PATCH 363/497] smb3.1.1: set gcm256 when requested update smb encryption code to set 32 byte key length and to set gcm256 when requested on mount. 
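Condensed, the key-length selection this requires looks as follows (mirroring the crypt_message() hunk below; SMB3_GCM256_CRYPTKEY_SIZE is the new 32-byte constant, while SMB3_SIGN_KEY_SIZE stays at 16 bytes):

	if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)
		rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE);
	else
		rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE);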
Signed-off-by: Steve French --- fs/cifs/smb2glob.h | 1 + fs/cifs/smb2ops.c | 13 ++++++++++--- fs/cifs/smb2pdu.h | 1 + fs/cifs/smb2transport.c | 8 +++++--- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index cf20f0b5d836..99a1951a01ec 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -58,6 +58,7 @@ #define SMB2_HMACSHA256_SIZE (32) #define SMB2_CMACAES_SIZE (16) #define SMB3_SIGNKEY_SIZE (16) +#define SMB3_GCM256_CRYPTKEY_SIZE (32) /* Maximum buffer size value we can send with 1 credit */ #define SMB2_MAX_BUFFER_SIZE 65536 diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index dd1edabec328..48657ddbd75e 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3820,7 +3820,8 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len); tr_hdr->Flags = cpu_to_le16(0x01); - if (cipher_type == SMB2_ENCRYPTION_AES128_GCM) + if ((cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (cipher_type == SMB2_ENCRYPTION_AES256_GCM)) get_random_bytes(&tr_hdr->Nonce, SMB3_AES_GCM_NONCE); else get_random_bytes(&tr_hdr->Nonce, SMB3_AES_CCM_NONCE); @@ -3954,7 +3955,12 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, tfm = enc ? server->secmech.ccmaesencrypt : server->secmech.ccmaesdecrypt; - rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE); + + if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM) + rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE); + else + rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE); + if (rc) { cifs_server_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc); return rc; @@ -3992,7 +3998,8 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, goto free_sg; } - if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) + if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE); else { iv[0] = 3; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 05b010e5a061..851c6cd4742a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -352,6 +352,7 @@ struct smb2_preauth_neg_context { /* Encryption Algorithms Ciphers */ #define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) #define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) +/* we currently do not request AES256_CCM since presumably GCM faster */ #define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003) #define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004) diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index c0348e3b1695..ebccd71cc60a 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -849,12 +849,13 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) struct crypto_aead *tfm; if (!server->secmech.ccmaesencrypt) { - if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) + if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); else tfm = crypto_alloc_aead("ccm(aes)", 0, 0); if (IS_ERR(tfm)) { - cifs_server_dbg(VFS, "%s: Failed to alloc encrypt aead\n", + cifs_server_dbg(VFS, "%s: Failed alloc encrypt aead\n", __func__); return PTR_ERR(tfm); } @@ -862,7 +863,8 @@ smb3_crypto_aead_allocate(struct TCP_Server_Info *server) } if (!server->secmech.ccmaesdecrypt) { - if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) + if ((server->cipher_type == 
SMB2_ENCRYPTION_AES128_GCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) tfm = crypto_alloc_aead("gcm(aes)", 0, 0); else tfm = crypto_alloc_aead("ccm(aes)", 0, 0); From 0bd294b55a5de442370c29fa53bab17aef3ff318 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Thu, 15 Oct 2020 10:41:31 -0700 Subject: [PATCH 364/497] cifs: Return the error from crypt_message when enc/dec key not found. In crypt_message, when smb2_get_enc_key returns error, we need to return the error back to the caller. If not, we end up processing the message further, causing a kernel oops due to unwarranted access of memory. Call Trace: smb3_receive_transform+0x120/0x870 [cifs] cifs_demultiplex_thread+0xb53/0xc20 [cifs] ? cifs_handle_standard+0x190/0x190 [cifs] kthread+0x116/0x130 ? kthread_park+0x80/0x80 ret_from_fork+0x1f/0x30 Signed-off-by: Shyam Prasad N Reviewed-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg CC: Stable Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 48657ddbd75e..0dfa832a3de0 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3944,7 +3944,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (rc) { cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__, enc ? "en" : "de"); - return 0; + return rc; } rc = smb3_crypto_aead_allocate(server); From acf96fef46f271642b90aa658ba49e33ae34ddf0 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 17 Oct 2020 03:54:27 -0500 Subject: [PATCH 365/497] smb3.1.1: do not fail if no encryption required but server doesn't support it There are cases where the server can return a cipher type of 0 and it not be an error. For example server supported no encryption types (e.g. server completely disabled encryption), or the server and client didn't support any encryption types in common (e.g. if a server only supported AES256_CCM). In those cases encryption would not be supported, but that can be ok if the client did not require encryption on mount and it should not return an error. In the case in which mount requested encryption ("seal" on mount) then checks later on during tree connection will return the proper rc, but if seal was not requested by client, since server is allowed to return 0 to indicate no supported cipher, we should not fail mount. Reported-by: Pavel Shilovsky Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index d504bc296349..025db5e8c183 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -616,9 +616,19 @@ static int decode_encrypt_ctx(struct TCP_Server_Info *server, return -EOPNOTSUPP; } } else if (ctxt->Ciphers[0] == 0) { - /* e.g. if server only supported AES256_CCM (very unlikely) */ - cifs_dbg(VFS, "Server does not support requested encryption types\n"); - return -EOPNOTSUPP; + /* + * e.g. if server only supported AES256_CCM (very unlikely) + * or server supported no encryption types or had all disabled. 
+ * Since GLOBAL_CAP_ENCRYPTION will be not set, in the case + * in which mount requested encryption ("seal") checks later + * on during tree connection will return proper rc, but if + * seal not requested by client, since server is allowed to + * return 0 to indicate no supported cipher, we can't fail here + */ + server->cipher_type = 0; + server->capabilities &= ~SMB2_GLOBAL_CAP_ENCRYPTION; + pr_warn_once("Server does not support requested encryption types\n"); + return 0; } else if ((ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_CCM) && (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES128_GCM) && (ctxt->Ciphers[0] != SMB2_ENCRYPTION_AES256_GCM)) { From 79dce09ab02729a90cf6ce49c9aaaf17aa0d21db Mon Sep 17 00:00:00 2001 From: "longguang.yue" Date: Mon, 28 Sep 2020 10:49:38 +0800 Subject: [PATCH 366/497] ipvs: adjust the debug info in function set_tcp_state Outputting client,virtual,dst addresses info when tcp state changes, which makes the connection debug more clear Signed-off-by: longguang.yue Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_proto_tcp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index dc2e7da2742a..7da51390cea6 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -539,8 +539,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, if (new_state != cp->state) { struct ip_vs_dest *dest = cp->dest; - IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" - "%s:%d state: %s->%s conn->refcnt:%d\n", + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d " + "d:%s:%d state: %s->%s conn->refcnt:%d\n", pd->pp->name, ((state_off == TCP_DIR_OUTPUT) ? "output " : "input "), @@ -548,10 +548,12 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 'R' : '.', - IP_VS_DBG_ADDR(cp->daf, &cp->daddr), - ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), + ntohs(cp->dport), tcp_state_name(cp->state), tcp_state_name(new_state), refcount_read(&cp->refcnt)); From 4f25434bccc28cf8a07876ef5142a2869a674353 Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Wed, 7 Oct 2020 12:32:52 -0700 Subject: [PATCH 367/497] netfilter: conntrack: connection timeout after re-register If the first packet conntrack sees after a re-register is an outgoing keepalive packet with no data (SEG.SEQ = SND.NXT-1), td_end is set to SND.NXT-1. When the peer correctly acknowledges SND.NXT, tcp_in_window fails check III (Upper bound for valid (s)ack: sack <= receiver.td_end) and returns false, which cascades into nf_conntrack_in setting skb->_nfct = 0 and in later conntrack iptables rules not matching. In cases where iptables are dropping packets that do not match conntrack rules this can result in idle tcp connections to time out. v2: adjust td_end when getting the reply rather than when sending out the keepalive packet. 
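To make the failure concrete with made-up numbers: suppose SND.NXT is 1001 when conntrack re-registers. The empty keepalive goes out with SEG.SEQ = 1000, so td_end is recorded as 1000. The peer then acknowledges 1001; check III requires sack <= receiver.td_end, i.e. 1001 <= 1000, which fails and the connection entry stops matching. With this change, an ack of exactly td_end + 1 is treated as a likely keepalive reply and td_end is bumped to 1001, so the check passes.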
Fixes: f94e63801ab2 ("netfilter: conntrack: reset tcp maxwin on re-register") Signed-off-by: Francesco Ruggeri Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index e8c86ee4c1c4..c8fb2187ad4b 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -541,13 +541,20 @@ static bool tcp_in_window(const struct nf_conn *ct, swin = win << sender->td_scale; sender->td_maxwin = (swin == 0 ? 1 : swin); sender->td_maxend = end + sender->td_maxwin; - /* - * We haven't seen traffic in the other direction yet - * but we have to tweak window tracking to pass III - * and IV until that happens. - */ - if (receiver->td_maxwin == 0) + if (receiver->td_maxwin == 0) { + /* We haven't seen traffic in the other + * direction yet but we have to tweak window + * tracking to pass III and IV until that + * happens. + */ receiver->td_end = receiver->td_maxend = sack; + } else if (sack == receiver->td_end + 1) { + /* Likely a reply to a keepalive. + * Needed for III. + */ + receiver->td_end++; + } + } } else if (((state->state == TCP_CONNTRACK_SYN_SENT && dir == IP_CT_DIR_ORIGINAL) From 68f9f9c2c3b6a7259f6a92bc26cdc7bd22e7a982 Mon Sep 17 00:00:00 2001 From: Georg Kohmann Date: Tue, 13 Oct 2020 14:23:12 +0200 Subject: [PATCH 368/497] netfilter: Drop fragmented ndisc packets assembled in netfilter Fragmented ndisc packets assembled in netfilter not dropped as specified in RFC 6980, section 5. This behaviour breaks TAHI IPv6 Core Conformance Tests v6LC.2.1.22/23, V6LC.2.2.26/27 and V6LC.2.3.18. Setting IP6SKB_FRAGMENTED flag during reassembly. References: commit b800c3b966bc ("ipv6: drop fragmented ndisc packets by default (RFC 6980)") Signed-off-by: Georg Kohmann Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index fed9666a2f7d..054d287eb13d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -355,6 +355,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, ipv6_hdr(skb)->payload_len = htons(payload_len); ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; + IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ if (skb->ip_summed == CHECKSUM_COMPLETE) From 63137bc5882a1882c553d389fdeeeace86ee1741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20COCAULT?= Date: Wed, 14 Oct 2020 12:36:15 +0000 Subject: [PATCH 369/497] netfilter: ebtables: Fixes dropping of small packets in bridge nat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes an error causing small packets to get dropped. skb_ensure_writable expects the second parameter to be a length in the ethernet payload.=20 If we want to write the ethernet header (src, dst), we should pass 0. Otherwise, packets with small payloads (< ETH_ALEN) will get dropped. 
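A short sketch of the corrected call pattern (identical to the ebt_dnat hunk below): in this path the length argument counts bytes of Ethernet payload, and rewriting only the MAC addresses needs none of it, so 0 is passed.

	if (skb_ensure_writable(skb, 0))	/* only the Ethernet header is modified */
		return EBT_DROP;
	ether_addr_copy(eth_hdr(skb)->h_dest, info->mac);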
Fixes: c1a831167901 ("netfilter: bridge: convert skb_make_writable to skb_ensure_writable") Signed-off-by: Timothée COCAULT Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebt_dnat.c | 2 +- net/bridge/netfilter/ebt_redirect.c | 2 +- net/bridge/netfilter/ebt_snat.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index 12a4f4d93681..3fda71a8579d 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -21,7 +21,7 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; - if (skb_ensure_writable(skb, ETH_ALEN)) + if (skb_ensure_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_dest, info->mac); diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index 0cad62a4052b..307790562b49 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -21,7 +21,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_redirect_info *info = par->targinfo; - if (skb_ensure_writable(skb, ETH_ALEN)) + if (skb_ensure_writable(skb, 0)) return EBT_DROP; if (xt_hooknum(par) != NF_BR_BROUTING) diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c index 27443bf229a3..7dfbcdfc30e5 100644 --- a/net/bridge/netfilter/ebt_snat.c +++ b/net/bridge/netfilter/ebt_snat.c @@ -22,7 +22,7 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; - if (skb_ensure_writable(skb, ETH_ALEN * 2)) + if (skb_ensure_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_source, info->mac); From 64747d5ed19911a867af733f6679d2a859fb18ae Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Sun, 18 Oct 2020 16:30:19 +0100 Subject: [PATCH 370/497] docs: nf_flowtable: fix typo. "mailined" should be "mainlined." Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/nf_flowtable.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/nf_flowtable.rst b/Documentation/networking/nf_flowtable.rst index b6e1fa141aae..6cdf9a1724b6 100644 --- a/Documentation/networking/nf_flowtable.rst +++ b/Documentation/networking/nf_flowtable.rst @@ -109,7 +109,7 @@ More reading This documentation is based on the LWN.net articles [1]_\ [2]_. Rafal Milecki also made a very complete and comprehensive summary called "A state of network acceleration" that describes how things were before this infrastructure was -mailined [3]_ and it also makes a rough summary of this work [4]_. +mainlined [3]_ and it also makes a rough summary of this work [4]_. .. [1] https://lwn.net/Articles/738214/ .. [2] https://lwn.net/Articles/742164/ From 31cc578ae2de19c748af06d859019dced68e325d Mon Sep 17 00:00:00 2001 From: Saeed Mirzamohammadi Date: Tue, 20 Oct 2020 13:41:36 +0200 Subject: [PATCH 371/497] netfilter: nftables_offload: KASAN slab-out-of-bounds Read in nft_flow_rule_create This patch fixes the issue due to: BUG: KASAN: slab-out-of-bounds in nft_flow_rule_create+0x622/0x6a2 net/netfilter/nf_tables_offload.c:40 Read of size 8 at addr ffff888103910b58 by task syz-executor227/16244 The error happens when expr->ops is accessed early on before performing the boundary check and after nft_expr_next() moves the expr to go out-of-bounds. 
This patch checks the boundary condition before expr->ops that fixes the slab-out-of-bounds Read issue. Add nft_expr_more() and use it to fix this problem. Signed-off-by: Saeed Mirzamohammadi Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 ++++++ net/netfilter/nf_tables_api.c | 6 +++--- net/netfilter/nf_tables_offload.c | 4 ++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3f7e56b1171e..55b4cadf290a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -891,6 +891,12 @@ static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) return (struct nft_expr *)&rule->data[rule->dlen]; } +static inline bool nft_expr_more(const struct nft_rule *rule, + const struct nft_expr *expr) +{ + return expr != nft_expr_last(rule) && expr->ops; +} + static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) { return (void *)&rule->data[rule->dlen]; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9957e0ed8658..65cb8e3c13d9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -302,7 +302,7 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_expr *expr; expr = nft_expr_first(rule); - while (expr != nft_expr_last(rule) && expr->ops) { + while (nft_expr_more(rule, expr)) { if (expr->ops->activate) expr->ops->activate(ctx, expr); @@ -317,7 +317,7 @@ static void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_expr *expr; expr = nft_expr_first(rule); - while (expr != nft_expr_last(rule) && expr->ops) { + while (nft_expr_more(rule, expr)) { if (expr->ops->deactivate) expr->ops->deactivate(ctx, expr, phase); @@ -3080,7 +3080,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, * is called on error from nf_tables_newrule(). */ expr = nft_expr_first(rule); - while (expr != nft_expr_last(rule) && expr->ops) { + while (nft_expr_more(rule, expr)) { next = nft_expr_next(expr); nf_tables_expr_destroy(ctx, expr); expr = next; diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 7c7e06624dc3..9f625724a20f 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -37,7 +37,7 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, struct nft_expr *expr; expr = nft_expr_first(rule); - while (expr->ops && expr != nft_expr_last(rule)) { + while (nft_expr_more(rule, expr)) { if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION) num_actions++; @@ -61,7 +61,7 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, ctx->net = net; ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; - while (expr->ops && expr != nft_expr_last(rule)) { + while (nft_expr_more(rule, expr)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; From 7da4c510abff8ad47eb2d7cc9a97def5a411947f Mon Sep 17 00:00:00 2001 From: Lukasz Halman Date: Tue, 20 Oct 2020 08:14:09 +0200 Subject: [PATCH 372/497] ALSA: usb-audio: Line6 Pod Go interface requires static clock rate quirk Recently released Line6 Pod Go requires static clock rate quirk to make its usb audio interface working. Added its usb id to the list of similar line6 devices. 
Signed-off-by: Lukasz Halman Cc: Link: https://lore.kernel.org/r/20201020061409.GA24382@TAG009442538903 Signed-off-by: Takashi Iwai --- sound/usb/format.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/format.c b/sound/usb/format.c index 1b28d01d1f4c..3bfead393aa3 100644 --- a/sound/usb/format.c +++ b/sound/usb/format.c @@ -406,6 +406,7 @@ static int line6_parse_audio_format_rates_quirk(struct snd_usb_audio *chip, case USB_ID(0x0e41, 0x4242): /* Line6 Helix Rack */ case USB_ID(0x0e41, 0x4244): /* Line6 Helix LT */ case USB_ID(0x0e41, 0x4246): /* Line6 HX-Stomp */ + case USB_ID(0x0e41, 0x4247): /* Line6 Pod Go */ case USB_ID(0x0e41, 0x4248): /* Line6 Helix >= fw 2.82 */ case USB_ID(0x0e41, 0x4249): /* Line6 Helix Rack >= fw 2.82 */ case USB_ID(0x0e41, 0x424a): /* Line6 Helix LT >= fw 2.82 */ From 9eec21bfbe9096141f15c624d3d0c2142121f6cb Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 19 Oct 2020 18:18:15 -0500 Subject: [PATCH 373/497] smb3: add dynamic trace point to trace when credits obtained MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SMB3 crediting is used for flow control, and it can be useful to trace for problem determination how many credits were acquired and for which operation. Here is an example ("trace-cmd record -e *add_credits"): cifsd-9522    [010] ....  5995.202712: smb3_add_credits: server=localhost current_mid=0x12 credits=373 credits_to_add=10 cifsd-9522    [010] ....  5995.204040: smb3_add_credits: server=localhost current_mid=0x15 credits=400 credits_to_add=30 Reviewed-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 4 +++- fs/cifs/trace.h | 18 ++++++++++++------ fs/cifs/transport.c | 5 +++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 0dfa832a3de0..f085fe32c342 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -72,7 +72,7 @@ smb2_add_credits(struct TCP_Server_Info *server, /* eg found case where write overlapping reconnect messed up credits */ if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0)) trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, - server->hostname, *val); + server->hostname, *val, add); if ((instance == 0) || (instance == server->reconnect_instance)) *val += add; else @@ -121,6 +121,8 @@ smb2_add_credits(struct TCP_Server_Info *server, cifs_dbg(FYI, "disabling oplocks\n"); break; default: + trace_smb3_add_credits(server->CurrentMid, + server->hostname, rc, add); cifs_dbg(FYI, "add %u credits total=%d\n", add, rc); } } diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index eef4b08c7208..90e0fab69bb8 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -878,33 +878,39 @@ DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect); DECLARE_EVENT_CLASS(smb3_credit_class, TP_PROTO(__u64 currmid, char *hostname, - int credits), - TP_ARGS(currmid, hostname, credits), + int credits, + int credits_to_add), + TP_ARGS(currmid, hostname, credits, credits_to_add), TP_STRUCT__entry( __field(__u64, currmid) __field(char *, hostname) __field(int, credits) + __field(int, credits_to_add) ), TP_fast_assign( __entry->currmid = currmid; __entry->hostname = hostname; __entry->credits = credits; + __entry->credits_to_add = credits_to_add; ), - TP_printk("server=%s current_mid=0x%llx credits=%d", + TP_printk("server=%s current_mid=0x%llx credits=%d credits_to_add=%d", __entry->hostname, __entry->currmid, - __entry->credits) + __entry->credits, + __entry->credits_to_add) ) #define DEFINE_SMB3_CREDIT_EVENT(name) 
\ DEFINE_EVENT(smb3_credit_class, smb3_##name, \ TP_PROTO(__u64 currmid, \ char *hostname, \ - int credits), \ - TP_ARGS(currmid, hostname, credits)) + int credits, \ + int credits_to_add), \ + TP_ARGS(currmid, hostname, credits, credits_to_add)) DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits); DEFINE_SMB3_CREDIT_EVENT(credit_timeout); +DEFINE_SMB3_CREDIT_EVENT(add_credits); #endif /* _CIFS_TRACE_H */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index ac7632482736..e27e255d40dd 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -563,7 +563,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, cifs_num_waiters_dec(server); if (!rc) { trace_smb3_credit_timeout(server->CurrentMid, - server->hostname, num_credits); + server->hostname, num_credits, 0); cifs_server_dbg(VFS, "wait timed out after %d ms\n", timeout); return -ENOTSUPP; @@ -604,7 +604,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, if (!rc) { trace_smb3_credit_timeout( server->CurrentMid, - server->hostname, num_credits); + server->hostname, num_credits, + 0); cifs_server_dbg(VFS, "wait timed out after %d ms\n", timeout); return -ENOTSUPP; From 9934430e2178d5164eb1ac91a9b092f9e7e64745 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 20 Oct 2020 02:02:02 -0500 Subject: [PATCH 374/497] SMB3.1.1: Fix ids returned in POSIX query dir We were setting the uid/gid to the default in each dir entry in the parsing of the POSIX query dir response, rather than attempting to map the user and group SIDs returned by the server to well known SIDs (or upcall if not found). CC: Stable Reviewed-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 5 +++-- fs/cifs/cifsproto.h | 2 ++ fs/cifs/readdir.c | 5 ++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index fcff14ef1c70..23b21e943652 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -338,7 +338,7 @@ invalidate_key: goto out_key_put; } -static int +int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct cifs_fattr *fattr, uint sidtype) { @@ -359,7 +359,8 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, return -EIO; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) || + (cifs_sb_master_tcon(cifs_sb)->posix_extensions)) { uint32_t unix_id; bool is_group; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index bb68cbf81074..24c6f36177ba 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -209,6 +209,8 @@ extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs, extern int cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, const unsigned int xid); +extern int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, + struct cifs_fattr *fattr, uint sidtype); extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, bool get_mode_from_special_sid, diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 31a18aae5e64..5abf1ea21abe 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -267,9 +267,8 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info, if (reparse_file_needs_reval(fattr)) fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; - /* TODO map SIDs */ - fattr->cf_uid = cifs_sb->mnt_uid; - fattr->cf_gid = cifs_sb->mnt_gid; + sid_to_id(cifs_sb, &parsed.owner, fattr, SIDOWNER); + sid_to_id(cifs_sb, 
&parsed.group, fattr, SIDGROUP); } static void __dir_info_to_fattr(struct cifs_fattr *fattr, const void *info) From 3ece60e3e78e6066b4ea02dea6687e5d373c6a77 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 20 Oct 2020 15:19:36 +0100 Subject: [PATCH 375/497] cifs: make const array static, makes object smaller Don't populate const array smb3_create_tag_posix on the stack but instead make it static. Makes the object code smaller by 50 bytes. Before: text data bss dec hex filename 150184 47167 0 197351 302e7 fs/cifs/smb2pdu.o After: text data bss dec hex filename 150070 47231 0 197301 302b5 fs/cifs/smb2pdu.o (gcc version 10.2.0) Signed-off-by: Colin Ian King Reviewed-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 025db5e8c183..445e80862865 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1981,9 +1981,11 @@ smb2_parse_contexts(struct TCP_Server_Info *server, unsigned int next; unsigned int remaining; char *name; - const char smb3_create_tag_posix[] = {0x93, 0xAD, 0x25, 0x50, 0x9C, - 0xB4, 0x11, 0xE7, 0xB4, 0x23, 0x83, - 0xDE, 0x96, 0x8B, 0xCD, 0x7C}; + static const char smb3_create_tag_posix[] = { + 0x93, 0xAD, 0x25, 0x50, 0x9C, + 0xB4, 0x11, 0xE7, 0xB4, 0x23, 0x83, + 0xDE, 0x96, 0x8B, 0xCD, 0x7C + }; *oplock = 0; data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset); From 330e3932a4811e1628d962e47e6892e1e20eb9a7 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 20 Oct 2020 10:10:35 +0200 Subject: [PATCH 376/497] PM: domains: Fix build error for genpd notifiers The __raw_notifier_call_chain() was recently removed and replaced with raw_notifier_call_chain_robust(). Recent changes to genpd didn't take that into account, which causes a build error. Let's fix this by converting to the raw_notifier_call_chain_robust() in genpd. Reported-by: kernel test robot Reported-by: Lina Iyer Signed-off-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 859cdb207010..743268996336 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -413,15 +413,15 @@ static int _genpd_power_on(struct generic_pm_domain *genpd, bool timed) unsigned int state_idx = genpd->state_idx; ktime_t time_start; s64 elapsed_ns; - int ret, nr_calls = 0; + int ret; /* Notify consumers that we are about to power on. */ - ret = __raw_notifier_call_chain(&genpd->power_notifiers, - GENPD_NOTIFY_PRE_ON, NULL, -1, - &nr_calls); + ret = raw_notifier_call_chain_robust(&genpd->power_notifiers, + GENPD_NOTIFY_PRE_ON, + GENPD_NOTIFY_OFF, NULL); ret = notifier_to_errno(ret); if (ret) - goto err; + return ret; if (!genpd->power_on) goto out; @@ -462,15 +462,15 @@ static int _genpd_power_off(struct generic_pm_domain *genpd, bool timed) unsigned int state_idx = genpd->state_idx; ktime_t time_start; s64 elapsed_ns; - int ret, nr_calls = 0; + int ret; /* Notify consumers that we are about to power off. 
*/ - ret = __raw_notifier_call_chain(&genpd->power_notifiers, - GENPD_NOTIFY_PRE_OFF, NULL, -1, - &nr_calls); + ret = raw_notifier_call_chain_robust(&genpd->power_notifiers, + GENPD_NOTIFY_PRE_OFF, + GENPD_NOTIFY_ON, NULL); ret = notifier_to_errno(ret); if (ret) - goto busy; + return ret; if (!genpd->power_off) goto out; @@ -502,10 +502,7 @@ out: NULL); return 0; busy: - if (nr_calls) - __raw_notifier_call_chain(&genpd->power_notifiers, - GENPD_NOTIFY_ON, NULL, nr_calls - 1, - NULL); + raw_notifier_call_chain(&genpd->power_notifiers, GENPD_NOTIFY_ON, NULL); return ret; } From aa9c9b3f3f08cb0fda8a8139e6fb302c9a2e21ed Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Tue, 20 Oct 2020 17:00:27 +0200 Subject: [PATCH 377/497] PM: runtime: Fix typo in pm_runtime_set_active() helper comment This patch is to fix typo in the comment of helper pm_runtime_set_active(). Signed-off-by: Bean Huo [ rjw: Subject edit ] Signed-off-by: Rafael J. Wysocki --- include/linux/pm_runtime.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 6245caa18034..18b02dcc168e 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -479,7 +479,7 @@ static inline int pm_runtime_set_active(struct device *dev) } /** - * pm_runtime_set_suspended - Set runtime PM status to "active". + * pm_runtime_set_suspended - Set runtime PM status to "suspended". * @dev: Target device. * * Set the runtime PM status of @dev to %RPM_SUSPENDED and ensure that From fea456d82c19d201c21313864105876deabe148b Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 20 Oct 2020 08:22:53 +1000 Subject: [PATCH 378/497] drm/ttm: fix eviction valuable range check. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was adding size to start, but pfn and start are in pages, so it should be using num_pages. Not sure this fixes anything in the real world, just noticed it during refactoring. Signed-off-by: Dave Airlie Reviewed-by: Christian König Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20201019222257.1684769-2-airlied@gmail.com --- drivers/gpu/drm/ttm/ttm_bo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 70b3bee27850..eb4b7df02ca0 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -647,7 +647,7 @@ bool ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, /* Don't evict this BO if it's outside of the * requested placement range */ - if (place->fpfn >= (bo->mem.start + bo->mem.size) || + if (place->fpfn >= (bo->mem.start + bo->mem.num_pages) || (place->lpfn && place->lpfn <= bo->mem.start)) return false; From dbffb29dac6a8864bc026ca904a8cc361de71a1a Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 12 Oct 2020 15:04:20 -0500 Subject: [PATCH 379/497] gfs2: Fix comments to glock_hash_walk The comments before function glock_hash_walk had the wrong name and an extra parameter. This simply fixes the comments. 
Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c0d441228562..5f3cc68b50ec 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1842,10 +1842,9 @@ static struct shrinker glock_shrinker = { }; /** - * examine_bucket - Call a function for glock in a hash bucket + * glock_hash_walk - Call a function for glock in a hash bucket * @examiner: the function * @sdp: the filesystem - * @bucket: the bucket * * Note that the function can be called multiple times on the same * object. So the user must ensure that the function can cope with From 2ffed5290b3bff7562d29fd06621be4705704242 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 15 Oct 2020 11:16:48 -0500 Subject: [PATCH 380/497] gfs2: Only access gl_delete for iopen glocks Only initialize gl_delete for iopen glocks, but more importantly, only access it for iopen glocks in flush_delete_work: flush_delete_work is called for different types of glocks including rgrp glocks, and those use gl_vm which is in a union with gl_delete. Without this fix, we'll end up clobbering gl_vm, which results in general memory corruption. Fixes: a0e3cc65fa29 ("gfs2: Turn gl_delete into a delayed work") Cc: stable@vger.kernel.org # v5.8+ Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 5f3cc68b50ec..5441c17562c5 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1051,7 +1051,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_object = NULL; gl->gl_hold_time = GL_GLOCK_DFT_HOLD; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); - INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); + if (gl->gl_name.ln_type == LM_TYPE_IOPEN) + INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); mapping = gfs2_glock2aspace(gl); if (mapping) { @@ -1900,9 +1901,11 @@ bool gfs2_delete_work_queued(const struct gfs2_glock *gl) static void flush_delete_work(struct gfs2_glock *gl) { - if (cancel_delayed_work(&gl->gl_delete)) { - queue_delayed_work(gfs2_delete_workqueue, - &gl->gl_delete, 0); + if (gl->gl_name.ln_type == LM_TYPE_IOPEN) { + if (cancel_delayed_work(&gl->gl_delete)) { + queue_delayed_work(gfs2_delete_workqueue, + &gl->gl_delete, 0); + } } gfs2_glock_queue_work(gl, 0); } From 23cfb0c3d845ee0cb45732cd0ac2460115cb7c9c Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 15 Oct 2020 11:07:26 -0500 Subject: [PATCH 381/497] gfs2: Eliminate gl_vm The gfs2_glock structure has a gl_vm member, introduced in commit 7005c3e4ae428 ("GFS2: Use range based functions for rgrp sync/invalidation"), which stores the location of resource groups within their address space. This structure is in a union with iopen glock specific fields. It was introduced because at unmount time, the resource group objects were destroyed before flushing out any pending resource group glock work, and flushing out such work could require flushing / truncating the address space. Since commit b3422cacdd7e6 ("gfs2: Rework how rgrp buffer_heads are managed"), any pending resource group glock work is flushed out before destroying the resource group objects. So the resource group objects will now always exist in rgrp_go_sync and rgrp_go_inval, and we now simply compute the gl_vm values where needed instead of caching them. This also eliminates the union. 
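The aliasing hazard behind both of the gl_delete fixes above can be shown with a few lines of ordinary C. The sketch below is only an illustration: fake_glock and its members are simplified stand-ins for the old struct gfs2_glock layout (a plain integer in place of the delayed work item), not the real kernel structures. It shows how touching the iopen-only member of the anonymous union silently rewrites the cached rgrp range, which is the corruption the previous patch guards against and this patch removes entirely by no longer caching the range.

  /* Stand-alone sketch of the union aliasing described above. */
  #include <stdio.h>

  struct fake_glock {
  	union {
  		struct {			/* iopen glocks only */
  			long gl_delete;		/* stand-in for struct delayed_work */
  			unsigned long long gl_no_formal_ino;
  		};
  		struct {			/* rgrp glocks only */
  			long long start;
  			long long end;
  		} gl_vm;
  	};
  };

  int main(void)
  {
  	struct fake_glock gl = { .gl_vm = { .start = 4096, .end = 8191 } };

  	/* A flush path that pokes the iopen-only member ... */
  	gl.gl_delete = 0;

  	/* ... has clobbered the cached rgrp address range. */
  	printf("gl_vm.start=%lld gl_vm.end=%lld\n",
  	       gl.gl_vm.start, gl.gl_vm.end);
  	return 0;
  }

With the range computed on demand in rgrp_go_sync() and rgrp_go_inval(), there is no cached state left for such an accidental write to land on.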
Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glops.c | 27 ++++++++++++--------------- fs/gfs2/incore.h | 15 ++++----------- fs/gfs2/rgrp.c | 4 ---- 3 files changed, 16 insertions(+), 30 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c2c90747d79b..b8cd1da7499d 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -178,6 +178,9 @@ static int rgrp_go_sync(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + const unsigned bsize = sdp->sd_sb.sb_bsize; + loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; + loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; int error; if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -186,18 +189,13 @@ static int rgrp_go_sync(struct gfs2_glock *gl) gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_RGRP_GO_SYNC); - filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); - error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + filemap_fdatawrite_range(mapping, start, end); + error = filemap_fdatawait_range(mapping, start, end); WARN_ON_ONCE(error); mapping_set_error(mapping, error); if (!error) error = gfs2_ail_empty_gl(gl); - - spin_lock(&gl->gl_lockref.lock); - rgd = gl->gl_object; - if (rgd) - gfs2_free_clones(rgd); - spin_unlock(&gl->gl_lockref.lock); + gfs2_free_clones(rgd); return error; } @@ -216,15 +214,14 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + const unsigned bsize = sdp->sd_sb.sb_bsize; + loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; + loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; - if (rgd) - gfs2_rgrp_brelse(rgd); - + gfs2_rgrp_brelse(rgd); WARN_ON_ONCE(!(flags & DIO_METADATA)); - truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); - - if (rgd) - rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + truncate_inode_pages_range(mapping, start, end); + rgd->rd_flags &= ~GFS2_RDF_UPTODATE; } static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 346693faf049..c3ca9b8382ec 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -377,17 +377,10 @@ struct gfs2_glock { atomic_t gl_ail_count; atomic_t gl_revokes; struct delayed_work gl_work; - union { - /* For iopen glocks only */ - struct { - struct delayed_work gl_delete; - u64 gl_no_formal_ino; - }; - /* For rgrp glocks only */ - struct { - loff_t start; - loff_t end; - } gl_vm; + /* For iopen glocks only */ + struct { + struct delayed_work gl_delete; + u64 gl_no_formal_ino; }; struct rcu_head gl_rcu; struct rhash_head gl_node; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 9de676cfd3d7..ee491bb9c1cc 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -878,7 +878,6 @@ static int rgd_insert(struct gfs2_rgrpd *rgd) static int read_rindex_entry(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - const unsigned bsize = sdp->sd_sb.sb_bsize; loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); struct gfs2_rindex buf; int error; @@ -924,9 +923,6 @@ static int read_rindex_entry(struct gfs2_inode *ip) spin_unlock(&sdp->sd_rindex_spin); if (!error) { glock_set_object(rgd->rd_gl, rgd); - rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK; - rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + - 
rgd->rd_length) * bsize) - 1; return 0; } From ed3adb375b704662bf36d62d5611f304e2b56c7e Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 20 Oct 2020 14:18:24 +0200 Subject: [PATCH 382/497] gfs2: Ignore subsequent errors after withdraw in rgrp_go_sync Once a withdraw has occurred, ignore errors that are the consequence of the withdraw. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index b8cd1da7499d..aa3f5236befb 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -191,7 +191,7 @@ static int rgrp_go_sync(struct gfs2_glock *gl) GFS2_LFC_RGRP_GO_SYNC); filemap_fdatawrite_range(mapping, start, end); error = filemap_fdatawait_range(mapping, start, end); - WARN_ON_ONCE(error); + WARN_ON_ONCE(error && !gfs2_withdrawn(sdp)); mapping_set_error(mapping, error); if (!error) error = gfs2_ail_empty_gl(gl); From 730926982d770dc764b4282aecc82e0039c18f64 Mon Sep 17 00:00:00 2001 From: Abhi Das Date: Tue, 20 Oct 2020 15:58:03 -0500 Subject: [PATCH 383/497] gfs2: Add fields for statfs info in struct gfs2_log_header_host And read these in __get_log_header() from the log header. Also make gfs2_statfs_change_out() non-static so it can be used outside of super.c Signed-off-by: Abhi Das Signed-off-by: Andreas Gruenbacher --- fs/gfs2/incore.h | 4 ++++ fs/gfs2/recovery.c | 4 ++++ fs/gfs2/super.c | 2 +- fs/gfs2/super.h | 2 ++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c3ca9b8382ec..e34183e02a9e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -41,6 +41,10 @@ struct gfs2_log_header_host { u32 lh_flags; /* GFS2_LOG_HEAD_... */ u32 lh_tail; /* Block number of log tail */ u32 lh_blkno; + + s64 lh_local_total; + s64 lh_local_free; + s64 lh_local_dinodes; }; /* diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 390ea79d682c..a8bb17e355b8 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -144,6 +144,10 @@ int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, head->lh_tail = be32_to_cpu(lh->lh_tail); head->lh_blkno = be32_to_cpu(lh->lh_blkno); + head->lh_local_total = be64_to_cpu(lh->lh_local_total); + head->lh_local_free = be64_to_cpu(lh->lh_local_free); + head->lh_local_dinodes = be64_to_cpu(lh->lh_local_dinodes); + return 0; } /** diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 8e250ec42e91..e17961ea994d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -230,7 +230,7 @@ void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); } -static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) +void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) { struct gfs2_statfs_change *str = buf; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 51900554ed81..ed4f5cb29074 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -37,6 +37,8 @@ extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf); +extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, + void *buf); extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); From b8cff311a42df4f15d6432583573d828b5c7b12a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 19 Oct 
2020 09:34:44 +0100 Subject: [PATCH 384/497] drm/i915/gt: Onion unwind for scratch page allocation failure In switching to using objects for our ppGTT scratch pages, care was not taken to avoid trying to unref NULL objects on failure. And for gen6 ppGTT, it appears we forgot entirely to unwind after a partial allocation failure. Fixes: 89351925a477 ("drm/i915/gt: Switch to object allocations for page directories") Signed-off-by: Chris Wilson Cc: Matthew Auld Cc: Joonas Lahtinen Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20201019083444.1286-1-chris@chris-wilson.co.uk (cherry picked from commit fa812ce96a46efc27cae4dcad866aaee9cb25d28) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/gen6_ppgtt.c | 18 ++++++++++++------ drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 3 ++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c index fd0d24d28763..c30adc05fa98 100644 --- a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c @@ -239,18 +239,24 @@ static int gen6_ppgtt_init_scratch(struct gen6_ppgtt *ppgtt) I915_CACHE_NONE, PTE_READ_ONLY); vm->scratch[1] = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); - if (IS_ERR(vm->scratch[1])) - return PTR_ERR(vm->scratch[1]); + if (IS_ERR(vm->scratch[1])) { + ret = PTR_ERR(vm->scratch[1]); + goto err_scratch0; + } ret = pin_pt_dma(vm, vm->scratch[1]); - if (ret) { - i915_gem_object_put(vm->scratch[1]); - return ret; - } + if (ret) + goto err_scratch1; fill32_px(vm->scratch[1], vm->scratch[0]->encode); return 0; + +err_scratch1: + i915_gem_object_put(vm->scratch[1]); +err_scratch0: + i915_gem_object_put(vm->scratch[0]); + return ret; } static void gen6_ppgtt_free_pd(struct gen6_ppgtt *ppgtt) diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index eb64f474a78c..38c7069b7749 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -604,7 +604,8 @@ static int gen8_init_scratch(struct i915_address_space *vm) return 0; free_scratch: - free_scratch(vm); + while (i--) + i915_gem_object_put(vm->scratch[i]); return -ENOMEM; } From 3da3c5c1c9825c24168f27b021339e90af37e969 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 19 Oct 2020 17:50:05 +0100 Subject: [PATCH 385/497] drm/i915: Exclude low pages (128KiB) of stolen from use The GPU is trashing the low pages of its reserved memory upon reset. If we are using this memory for ringbuffers, then we will dutiful resubmit the trashed rings after the reset causing further resets, and worse. We must exclude this range from our own use. The value of 128KiB was found by empirical measurement (and verified now with a selftest) on gen9. 
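As a rough illustration of what the bias buys us, the user-space toy below models a bump allocator over a pretend stolen region; stolen_alloc(), next_free and the 16KiB ring size are invented for the example, and only STOLEN_BIAS mirrors the I915_GEM_STOLEN_BIAS value added by this patch. Without the bias the first ring lands in the low 128KiB that the GPU trashes on reset; with it, allocations start above the danger zone.

  /* Toy model, not i915 code: keep allocations out of the low 128KiB. */
  #include <stdint.h>
  #include <stdio.h>

  #define STOLEN_BIAS	(128 * 1024u)	/* mirrors I915_GEM_STOLEN_BIAS */

  static uint64_t next_free;		/* toy bump-allocator state */

  static uint64_t stolen_alloc(uint64_t size, uint64_t bias)
  {
  	uint64_t addr;

  	if (next_free < bias)
  		next_free = bias;	/* skip the reset-trashed low pages */
  	addr = next_free;
  	next_free += size;
  	return addr;
  }

  int main(void)
  {
  	printf("no bias:   ring buffer at %#llx (inside trashed region)\n",
  	       (unsigned long long)stolen_alloc(16 * 1024, 0));
  	next_free = 0;
  	printf("with bias: ring buffer at %#llx\n",
  	       (unsigned long long)stolen_alloc(16 * 1024, STOLEN_BIAS));
  	return 0;
  }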
Signed-off-by: Chris Wilson Cc: stable@vger.kernel.org Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201019165005.18128-2-chris@chris-wilson.co.uk (cherry picked from commit d3606757e611fbd48bb239e8c2fe9779b3f50035) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/Kconfig.debug | 1 + drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 6 +- drivers/gpu/drm/i915/gem/i915_gem_stolen.h | 2 + drivers/gpu/drm/i915/gt/selftest_reset.c | 196 +++++++++++++++++++++ 4 files changed, 203 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/Kconfig.debug b/drivers/gpu/drm/i915/Kconfig.debug index 1cb28c20807c..25cd9788a4d5 100644 --- a/drivers/gpu/drm/i915/Kconfig.debug +++ b/drivers/gpu/drm/i915/Kconfig.debug @@ -153,6 +153,7 @@ config DRM_I915_SELFTEST select DRM_EXPORT_FOR_TESTS if m select FAULT_INJECTION select PRIME_NUMBERS + select CRC32 help Choose this option to allow the driver to perform selftests upon loading; also requires the i915.selftest=1 module parameter. To diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index 0be5e8683337..84b2707d8b17 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -53,8 +53,10 @@ int i915_gem_stolen_insert_node(struct drm_i915_private *i915, struct drm_mm_node *node, u64 size, unsigned alignment) { - return i915_gem_stolen_insert_node_in_range(i915, node, size, - alignment, 0, U64_MAX); + return i915_gem_stolen_insert_node_in_range(i915, node, + size, alignment, + I915_GEM_STOLEN_BIAS, + U64_MAX); } void i915_gem_stolen_remove_node(struct drm_i915_private *i915, diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.h b/drivers/gpu/drm/i915/gem/i915_gem_stolen.h index e15c0adad8af..61e028063f9f 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.h @@ -30,4 +30,6 @@ i915_gem_object_create_stolen_for_preallocated(struct drm_i915_private *dev_priv resource_size_t stolen_offset, resource_size_t size); +#define I915_GEM_STOLEN_BIAS SZ_128K + #endif /* __I915_GEM_STOLEN_H__ */ diff --git a/drivers/gpu/drm/i915/gt/selftest_reset.c b/drivers/gpu/drm/i915/gt/selftest_reset.c index 35406ecdf0b2..ef5aeebbeeb0 100644 --- a/drivers/gpu/drm/i915/gt/selftest_reset.c +++ b/drivers/gpu/drm/i915/gt/selftest_reset.c @@ -3,9 +3,203 @@ * Copyright © 2018 Intel Corporation */ +#include + +#include "gem/i915_gem_stolen.h" + +#include "i915_memcpy.h" #include "i915_selftest.h" #include "selftests/igt_reset.h" #include "selftests/igt_atomic.h" +#include "selftests/igt_spinner.h" + +static int +__igt_reset_stolen(struct intel_gt *gt, + intel_engine_mask_t mask, + const char *msg) +{ + struct i915_ggtt *ggtt = >->i915->ggtt; + const struct resource *dsm = >->i915->dsm; + resource_size_t num_pages, page; + struct intel_engine_cs *engine; + intel_wakeref_t wakeref; + enum intel_engine_id id; + struct igt_spinner spin; + long max, count; + void *tmp; + u32 *crc; + int err; + + if (!drm_mm_node_allocated(&ggtt->error_capture)) + return 0; + + num_pages = resource_size(dsm) >> PAGE_SHIFT; + if (!num_pages) + return 0; + + crc = kmalloc_array(num_pages, sizeof(u32), GFP_KERNEL); + if (!crc) + return -ENOMEM; + + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto err_crc; + } + + igt_global_reset_lock(gt); + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + + err = igt_spinner_init(&spin, gt); + if (err) + goto err_lock; + + for_each_engine(engine, gt, id) { + struct 
intel_context *ce; + struct i915_request *rq; + + if (!(mask & engine->mask)) + continue; + + if (!intel_engine_can_store_dword(engine)) + continue; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto err_spin; + } + rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_spin; + } + i915_request_add(rq); + } + + for (page = 0; page < num_pages; page++) { + dma_addr_t dma = (dma_addr_t)dsm->start + (page << PAGE_SHIFT); + void __iomem *s; + void *in; + + ggtt->vm.insert_page(&ggtt->vm, dma, + ggtt->error_capture.start, + I915_CACHE_NONE, 0); + mb(); + + s = io_mapping_map_wc(&ggtt->iomap, + ggtt->error_capture.start, + PAGE_SIZE); + + if (!__drm_mm_interval_first(>->i915->mm.stolen, + page << PAGE_SHIFT, + ((page + 1) << PAGE_SHIFT) - 1)) + memset32(s, STACK_MAGIC, PAGE_SIZE / sizeof(u32)); + + in = s; + if (i915_memcpy_from_wc(tmp, s, PAGE_SIZE)) + in = tmp; + crc[page] = crc32_le(0, in, PAGE_SIZE); + + io_mapping_unmap(s); + } + mb(); + ggtt->vm.clear_range(&ggtt->vm, ggtt->error_capture.start, PAGE_SIZE); + + if (mask == ALL_ENGINES) { + intel_gt_reset(gt, mask, NULL); + } else { + for_each_engine(engine, gt, id) { + if (mask & engine->mask) + intel_engine_reset(engine, NULL); + } + } + + max = -1; + count = 0; + for (page = 0; page < num_pages; page++) { + dma_addr_t dma = (dma_addr_t)dsm->start + (page << PAGE_SHIFT); + void __iomem *s; + void *in; + u32 x; + + ggtt->vm.insert_page(&ggtt->vm, dma, + ggtt->error_capture.start, + I915_CACHE_NONE, 0); + mb(); + + s = io_mapping_map_wc(&ggtt->iomap, + ggtt->error_capture.start, + PAGE_SIZE); + + in = s; + if (i915_memcpy_from_wc(tmp, s, PAGE_SIZE)) + in = tmp; + x = crc32_le(0, in, PAGE_SIZE); + + if (x != crc[page] && + !__drm_mm_interval_first(>->i915->mm.stolen, + page << PAGE_SHIFT, + ((page + 1) << PAGE_SHIFT) - 1)) { + pr_debug("unused stolen page %pa modified by GPU reset\n", + &page); + if (count++ == 0) + igt_hexdump(in, PAGE_SIZE); + max = page; + } + + io_mapping_unmap(s); + } + mb(); + ggtt->vm.clear_range(&ggtt->vm, ggtt->error_capture.start, PAGE_SIZE); + + if (count > 0) { + pr_info("%s reset clobbered %ld pages of stolen, last clobber at page %ld\n", + msg, count, max); + } + if (max >= I915_GEM_STOLEN_BIAS >> PAGE_SHIFT) { + pr_err("%s reset clobbered unreserved area [above %x] of stolen; may cause severe faults\n", + msg, I915_GEM_STOLEN_BIAS); + err = -EINVAL; + } + +err_spin: + igt_spinner_fini(&spin); + +err_lock: + intel_runtime_pm_put(gt->uncore->rpm, wakeref); + igt_global_reset_unlock(gt); + + kfree(tmp); +err_crc: + kfree(crc); + return err; +} + +static int igt_reset_device_stolen(void *arg) +{ + return __igt_reset_stolen(arg, ALL_ENGINES, "device"); +} + +static int igt_reset_engines_stolen(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err; + + if (!intel_has_reset_engine(gt)) + return 0; + + for_each_engine(engine, gt, id) { + err = __igt_reset_stolen(gt, engine->mask, engine->name); + if (err) + return err; + } + + return 0; +} static int igt_global_reset(void *arg) { @@ -164,6 +358,8 @@ int intel_reset_live_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { SUBTEST(igt_global_reset), /* attempt to recover GPU first */ + SUBTEST(igt_reset_device_stolen), + SUBTEST(igt_reset_engines_stolen), SUBTEST(igt_wedged_reset), SUBTEST(igt_atomic_reset), SUBTEST(igt_atomic_engine_reset), From 
8195400f7ea95399f721ad21f4d663a62c65036f Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 19 Oct 2020 11:15:23 +0100 Subject: [PATCH 386/497] drm/i915: Force VT'd workarounds when running as a guest OS If i915.ko is being used as a passthrough device, it does not know if the host is using intel_iommu. Mixing the iommu and gfx causes a few issues (such as scanout overfetch) which we need to workaround inside the driver, so if we detect we are running under a hypervisor, also assume the device access is being virtualised. Reported-by: Stefan Fritsch Suggested-by: Stefan Fritsch Signed-off-by: Chris Wilson Cc: Zhenyu Wang Cc: Joonas Lahtinen Cc: Stefan Fritsch Cc: stable@vger.kernel.org Tested-by: Stefan Fritsch Reviewed-by: Zhenyu Wang Link: https://patchwork.freedesktop.org/patch/msgid/20201019101523.4145-1-chris@chris-wilson.co.uk (cherry picked from commit f566fdcd6cc49a9d5b5d782f56e3e7cb243f01b8) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_drv.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index eef9a821c49c..8426d5974669 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -33,6 +33,8 @@ #include #include +#include + #include #include #include @@ -1760,7 +1762,9 @@ static inline bool intel_vtd_active(void) if (intel_iommu_gfx_mapped) return true; #endif - return false; + + /* Running as a guest, we assume the host is enforcing VT'd */ + return !hypervisor_is_type(X86_HYPER_NATIVE); } static inline bool intel_scanout_needs_vtd_wa(struct drm_i915_private *dev_priv) From 5c6c13cd1102caf92d006a3cf4591c0229019daf Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 11 Aug 2020 10:25:32 +0100 Subject: [PATCH 387/497] drm/i915: Drop runtime-pm assert from vgpu io accessors The "mmio" writes into vgpu registers are simple memory traps from the guest into the host. We do not need to assert in the guest that the device is awake for the io as we do not write to the device itself. However, over time we have refactored all the mmio accessors with the result that the vgpu reuses the gen2 accessors and so inherits the assert for runtime-pm of the native device. The assert though has actually been there since commit 3be0bf5acca6 ("drm/i915: Create vGPU specific MMIO operations to reduce traps"). 
References: 3be0bf5acca6 ("drm/i915: Create vGPU specific MMIO operations to reduce traps") Signed-off-by: Chris Wilson Cc: Yan Zhao Cc: Zhenyu Wang Reviewed-by: Zhenyu Wang Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20200811092532.13753-1-chris@chris-wilson.co.uk (cherry picked from commit 0e65ce24a33c1d37da4bf43c34e080334ec6cb60) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_uncore.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index 263ffcb832b7..97ded2a59cf4 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -1209,6 +1209,18 @@ unclaimed_reg_debug(struct intel_uncore *uncore, spin_unlock(&uncore->debug->lock); } +#define __vgpu_read(x) \ +static u##x \ +vgpu_read##x(struct intel_uncore *uncore, i915_reg_t reg, bool trace) { \ + u##x val = __raw_uncore_read##x(uncore, reg); \ + trace_i915_reg_rw(false, reg, val, sizeof(val), trace); \ + return val; \ +} +__vgpu_read(8) +__vgpu_read(16) +__vgpu_read(32) +__vgpu_read(64) + #define GEN2_READ_HEADER(x) \ u##x val = 0; \ assert_rpm_wakelock_held(uncore->rpm); @@ -1414,6 +1426,16 @@ __gen_reg_write_funcs(gen8); #undef GEN6_WRITE_FOOTER #undef GEN6_WRITE_HEADER +#define __vgpu_write(x) \ +static void \ +vgpu_write##x(struct intel_uncore *uncore, i915_reg_t reg, u##x val, bool trace) { \ + trace_i915_reg_rw(true, reg, val, sizeof(val), trace); \ + __raw_uncore_write##x(uncore, reg, val); \ +} +__vgpu_write(8) +__vgpu_write(16) +__vgpu_write(32) + #define ASSIGN_RAW_WRITE_MMIO_VFUNCS(uncore, x) \ do { \ (uncore)->funcs.mmio_writeb = x##_write8; \ @@ -1735,7 +1757,10 @@ static void uncore_raw_init(struct intel_uncore *uncore) { GEM_BUG_ON(intel_uncore_has_forcewake(uncore)); - if (IS_GEN(uncore->i915, 5)) { + if (intel_vgpu_active(uncore->i915)) { + ASSIGN_RAW_WRITE_MMIO_VFUNCS(uncore, vgpu); + ASSIGN_RAW_READ_MMIO_VFUNCS(uncore, vgpu); + } else if (IS_GEN(uncore->i915, 5)) { ASSIGN_RAW_WRITE_MMIO_VFUNCS(uncore, gen5); ASSIGN_RAW_READ_MMIO_VFUNCS(uncore, gen5); } else { From 36bdcf318bc21af24de10b68e32cdea6b9a8d17f Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 17 Sep 2020 15:14:25 +0300 Subject: [PATCH 388/497] vdpa/mlx5: Make use of a specific 16 bit endianness API Introduce a dedicated function to be used for setting 16 bit fields per virio endianness requirements and use it to set the mtu field. Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20200917121425.GA98139@mtl-vdi-166.wap.labs.mlnx Signed-off-by: Michael S. 
Tsirkin Acked-by: Jason Wang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 74264e590695..56228467d7ec 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1522,6 +1522,11 @@ static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev) (mvdev->actual_features & (1ULL << VIRTIO_F_VERSION_1)); } +static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val) +{ + return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val); +} + static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); @@ -1535,8 +1540,7 @@ static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features) return err; ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features; - ndev->config.mtu = __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), - ndev->mtu); + ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu); return err; } From 36b02df2d204da9f7a571f16ed9e91a4d083f207 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 17 Sep 2020 15:15:40 +0300 Subject: [PATCH 389/497] vdpa/mlx5: Fix failure to bring link up Set VIRTIO_NET_S_LINK_UP in config status to allow the get the bring the net device's link up. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20200917121540.GA98184@mtl-vdi-166.wap.labs.mlnx Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 56228467d7ec..5ca309473c03 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1541,6 +1541,7 @@ static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features) ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features; ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu); + ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP); return err; } From bfec6c83077cd784fb049abbe2c5ff32629a7498 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Fri, 11 Sep 2020 22:35:07 +0200 Subject: [PATCH 390/497] virtio-balloon: Constify id_table id_table is not modified, so make it const to allow the compiler to put it in read-only memory. Signed-off-by: Rikard Falkeborn Link: https://lore.kernel.org/r/20200911203509.26505-2-rikard.falkeborn@gmail.com Signed-off-by: Michael S. Tsirkin Acked-by: David Hildenbrand --- drivers/virtio/virtio_balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 31cc97f2f515..481611c09dae 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -128,7 +128,7 @@ struct virtio_balloon { struct page_reporting_dev_info pr_dev_info; }; -static struct virtio_device_id id_table[] = { +static const struct virtio_device_id id_table[] = { { VIRTIO_ID_BALLOON, VIRTIO_DEV_ANY_ID }, { 0 }, }; From 7f90611693f08f6b29209bd8d704ee17cbe003a0 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Fri, 11 Sep 2020 22:35:08 +0200 Subject: [PATCH 391/497] virtio_input: Constify id_table id_table is not modified, so make it const to allow the compiler to put it in read-only memory. 
Signed-off-by: Rikard Falkeborn Link: https://lore.kernel.org/r/20200911203509.26505-3-rikard.falkeborn@gmail.com Signed-off-by: Michael S. Tsirkin Reviewed-by: David Hildenbrand --- drivers/virtio/virtio_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c index 877b2ea3ed05..f1f6208edcf5 100644 --- a/drivers/virtio/virtio_input.c +++ b/drivers/virtio/virtio_input.c @@ -363,7 +363,7 @@ static int virtinput_restore(struct virtio_device *vdev) static unsigned int features[] = { /* none */ }; -static struct virtio_device_id id_table[] = { +static const struct virtio_device_id id_table[] = { { VIRTIO_ID_INPUT, VIRTIO_DEV_ANY_ID }, { 0 }, }; From 7ab4de60028e0e9669b72a5f69f2080f98069b04 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Fri, 11 Sep 2020 22:35:09 +0200 Subject: [PATCH 392/497] virtio-mem: Constify mem_id_table mem_id_table is not modified, so make it const to allow the compiler to put it in read-only memory. Signed-off-by: Rikard Falkeborn Link: https://lore.kernel.org/r/20200911203509.26505-4-rikard.falkeborn@gmail.com Signed-off-by: Michael S. Tsirkin Acked-by: David Hildenbrand --- drivers/virtio/virtio_mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index c08512fcea90..8b6059dbc743 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1957,7 +1957,7 @@ static unsigned int virtio_mem_features[] = { #endif }; -static struct virtio_device_id virtio_mem_id_table[] = { +static const struct virtio_device_id virtio_mem_id_table[] = { { VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID }, { 0 }, }; From 5e5e8736ad2ee1be5d6162177317857460c857c2 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Tue, 15 Sep 2020 02:08:09 +0800 Subject: [PATCH 393/497] vhost: reduce stack usage in log_used Fix the warning: [-Werror=-Wframe-larger-than=] drivers/vhost/vhost.c: In function log_used: drivers/vhost/vhost.c:1906:1: warning: the frame size of 1040 bytes is larger than 1024 bytes Signed-off-by: Li Wang Link: https://lore.kernel.org/r/1600106889-25013-1-git-send-email-li.wang@windriver.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vhost.c | 2 +- drivers/vhost/vhost.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 9ad45e1d27f0..5c5edd4ff04b 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1897,7 +1897,7 @@ static int log_write_hva(struct vhost_virtqueue *vq, u64 hva, u64 len) static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len) { - struct iovec iov[64]; + struct iovec *iov = vq->log_iov; int i, ret; if (!vq->iotlb) diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 9032d3c2a9f4..5fe4b478c215 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -123,6 +123,7 @@ struct vhost_virtqueue { /* Log write descriptors */ void __user *log_base; struct vhost_log *log; + struct iovec log_iov[64]; /* Ring endianness. Defaults to legacy native endianness. * Set to true when starting a modern virtio device. */ From b9747fdf0c1dad088eb21d5191295999ae7b4b40 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Tue, 15 Sep 2020 08:51:42 +0800 Subject: [PATCH 394/497] vhost_vdpa: Fix duplicate included kernel.h linux/kernel.h is included more than once, Remove the one that isn't necessary. 
Signed-off-by: Tian Tao Link: https://lore.kernel.org/r/1600131102-24672-1-git-send-email-tiantao6@hisilicon.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vdpa.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 62a9bb0efc55..0bef5b10f044 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "vhost.h" From 0afa15e1a5294754066343cad24af5ec8edae96d Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Thu, 10 Sep 2020 10:53:49 +0200 Subject: [PATCH 395/497] virtio: let arch advertise guest's memory access restrictions An architecture may restrict host access to guest memory, e.g. IBM s390 Secure Execution or AMD SEV. Provide a new Kconfig entry the architecture can select, CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS, when it provides the arch_has_restricted_virtio_memory_access callback to advertise to VIRTIO common code when the architecture restricts memory access from the host. The common code can then fail the probe for any device where VIRTIO_F_ACCESS_PLATFORM is required, but not set. Signed-off-by: Pierre Morel Reviewed-by: Cornelia Huck Reviewed-by: Halil Pasic Link: https://lore.kernel.org/r/1599728030-17085-2-git-send-email-pmorel@linux.ibm.com Signed-off-by: Michael S. Tsirkin Acked-by: Christian Borntraeger --- drivers/virtio/Kconfig | 6 ++++++ drivers/virtio/virtio.c | 15 +++++++++++++++ include/linux/virtio_config.h | 10 ++++++++++ 3 files changed, 31 insertions(+) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 5c92e4a50882..ef2d49430800 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -6,6 +6,12 @@ config VIRTIO bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG or CONFIG_S390_GUEST. 
+config ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS + bool + help + This option is selected if the architecture may need to enforce + VIRTIO_F_ACCESS_PLATFORM + menuconfig VIRTIO_MENU bool "Virtio drivers" default y diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index a977e32a88f2..a2b3f12e10a2 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -176,6 +176,21 @@ int virtio_finalize_features(struct virtio_device *dev) if (ret) return ret; + ret = arch_has_restricted_virtio_memory_access(); + if (ret) { + if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) { + dev_warn(&dev->dev, + "device must provide VIRTIO_F_VERSION_1\n"); + return -ENODEV; + } + + if (!virtio_has_feature(dev, VIRTIO_F_ACCESS_PLATFORM)) { + dev_warn(&dev->dev, + "device must provide VIRTIO_F_ACCESS_PLATFORM\n"); + return -ENODEV; + } + } + if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) return 0; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 8fe857e27ef3..3f697c8c8205 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -540,4 +540,14 @@ static inline void virtio_cwrite64(struct virtio_device *vdev, virtio_cread_le((vdev), structname, member, ptr); \ _r; \ }) + +#ifdef CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS +int arch_has_restricted_virtio_memory_access(void); +#else +static inline int arch_has_restricted_virtio_memory_access(void) +{ + return 0; +} +#endif /* CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS */ + #endif /* _LINUX_VIRTIO_CONFIG_H */ From 4ce1cf7b02ed691acf0d9c664579cb6e52b1687b Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Thu, 10 Sep 2020 10:53:50 +0200 Subject: [PATCH 396/497] s390: virtio: PV needs VIRTIO I/O device protection If protected virtualization is active on s390, VIRTIO has only retricted access to the guest memory. Define CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS and export arch_has_restricted_virtio_memory_access to advertize VIRTIO if that's the case. Signed-off-by: Pierre Morel Reviewed-by: Cornelia Huck Reviewed-by: Halil Pasic Link: https://lore.kernel.org/r/1599728030-17085-3-git-send-email-pmorel@linux.ibm.com Signed-off-by: Michael S. Tsirkin Acked-by: Christian Borntraeger --- arch/s390/Kconfig | 1 + arch/s390/mm/init.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b29fcc66ec39..938246200d39 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -820,6 +820,7 @@ menu "Virtualization" config PROTECTED_VIRTUALIZATION_GUEST def_bool n prompt "Protected virtualization guest support" + select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS help Select this option, if you want to be able to run this kernel as a protected virtualization KVM guest. 
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 0d282081dc1f..e27f050cb516 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -45,6 +45,7 @@ #include #include #include +#include pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir); @@ -160,6 +161,16 @@ bool force_dma_unencrypted(struct device *dev) return is_prot_virt_guest(); } +#ifdef CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS + +int arch_has_restricted_virtio_memory_access(void) +{ + return is_prot_virt_guest(); +} +EXPORT_SYMBOL(arch_has_restricted_virtio_memory_access); + +#endif + /* protected virtualization */ static void pv_init(void) { From 1897f0b618b0af0eb9dca709ab6bdf9ef1969ef7 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 8 Sep 2020 15:33:46 +0300 Subject: [PATCH 397/497] vdpa/mlx5: Setup driver only if VIRTIO_CONFIG_S_DRIVER_OK set_map() is used by mlx5 vdpa to create a memory region based on the address map passed by the iotlb argument. If we get successive calls, we will destroy the current memory region and build another one based on the new address mapping. We also need to setup the hardware resources since they depend on the memory region. If these calls happen before DRIVER_OK, It means that driver VQs may also not been setup and we may not create them yet. In this case we want to avoid setting up the other resources and defer this till we get DRIVER OK. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20200908123346.GA169007@mtl-vdi-166.wap.labs.mlnx Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 5ca309473c03..1fa6fcac8299 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1658,6 +1658,9 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_net *ndev, struct vhost_iotlb * if (err) goto err_mr; + if (!(ndev->mvdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) + return 0; + restore_channels_info(ndev); err = setup_driver(ndev); if (err) From 5745bcfbbf89b158416075374254d3c013488f21 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 8 Oct 2020 22:42:56 +0200 Subject: [PATCH 398/497] vringh: fix __vringh_iov() when riov and wiov are different If riov and wiov are both defined and they point to different objects, only riov is initialized. If the wiov is not initialized by the caller, the function fails returning -EINVAL and printing "Readable desc 0x... after writable" error message. This issue happens when descriptors have both readable and writable buffers (eg. virtio-blk devices has virtio_blk_outhdr in the readable buffer and status as last byte of writable buffer) and we call __vringh_iov() to get both type of buffers in two different iovecs. Let's replace the 'else if' clause with 'if' to initialize both riov and wiov if they are not NULL. As checkpatch pointed out, we also avoid crashing the kernel when riov and wiov are both NULL, replacing BUG() with WARN_ON() and returning -EINVAL. Fixes: f87d0fbb5798 ("vringh: host-side implementation of virtio rings.") Cc: stable@vger.kernel.org Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201008204256.162292-1-sgarzare@redhat.com Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/vringh.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index e059a9a47cdf..8bd8b403f087 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -284,13 +284,14 @@ __vringh_iov(struct vringh *vrh, u16 i, desc_max = vrh->vring.num; up_next = -1; + /* You must want something! */ + if (WARN_ON(!riov && !wiov)) + return -EINVAL; + if (riov) riov->i = riov->used = 0; - else if (wiov) + if (wiov) wiov->i = wiov->used = 0; - else - /* You must want something! */ - BUG(); for (;;) { void *addr; From 86e182fe12ee5869022614457037097c70fe2ed1 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Wed, 9 Sep 2020 14:52:34 +0800 Subject: [PATCH 399/497] vhost_vdpa: remove unnecessary spin_lock in vhost_vring_call This commit removed unnecessary spin_locks in vhost_vring_call and related operations. Because we manipulate irq offloading contents in vhost_vdpa ioctl code path which is already protected by dev mutex and vq mutex. Signed-off-by: Zhu Lingshan Link: https://lore.kernel.org/r/20200909065234.3313-1-lingshan.zhu@intel.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vdpa.c | 8 +------- drivers/vhost/vhost.c | 3 --- drivers/vhost/vhost.h | 1 - 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 0bef5b10f044..a2dbc85e0b0d 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -96,26 +96,20 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid) return; irq = ops->get_vq_irq(vdpa, qid); - spin_lock(&vq->call_ctx.ctx_lock); irq_bypass_unregister_producer(&vq->call_ctx.producer); - if (!vq->call_ctx.ctx || irq < 0) { - spin_unlock(&vq->call_ctx.ctx_lock); + if (!vq->call_ctx.ctx || irq < 0) return; - } vq->call_ctx.producer.token = vq->call_ctx.ctx; vq->call_ctx.producer.irq = irq; ret = irq_bypass_register_producer(&vq->call_ctx.producer); - spin_unlock(&vq->call_ctx.ctx_lock); } static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid) { struct vhost_virtqueue *vq = &v->vqs[qid]; - spin_lock(&vq->call_ctx.ctx_lock); irq_bypass_unregister_producer(&vq->call_ctx.producer); - spin_unlock(&vq->call_ctx.ctx_lock); } static void vhost_vdpa_reset(struct vhost_vdpa *v) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 5c5edd4ff04b..5c835a292783 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -302,7 +302,6 @@ static void vhost_vring_call_reset(struct vhost_vring_call *call_ctx) { call_ctx->ctx = NULL; memset(&call_ctx->producer, 0x0, sizeof(struct irq_bypass_producer)); - spin_lock_init(&call_ctx->ctx_lock); } static void vhost_vq_reset(struct vhost_dev *dev, @@ -1650,9 +1649,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg break; } - spin_lock(&vq->call_ctx.ctx_lock); swap(ctx, vq->call_ctx.ctx); - spin_unlock(&vq->call_ctx.ctx_lock); break; case VHOST_SET_VRING_ERR: if (copy_from_user(&f, argp, sizeof f)) { diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 5fe4b478c215..e016cd3fa02f 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -64,7 +64,6 @@ enum vhost_uaddr_type { struct vhost_vring_call { struct eventfd_ctx *ctx; struct irq_bypass_producer producer; - spinlock_t ctx_lock; }; /* The virtqueue structure describes a queue attached to a device. 
*/ From 88a0d60c6445f315fbcfff3db792021bb3a67b28 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 17 Jun 2020 12:47:56 +0200 Subject: [PATCH 400/497] MAINTAINERS: add URL for virtio-mem Let's add the status/info page, which is still under construction, however, already contains valuable documentation/information. Cc: "Michael S. Tsirkin" Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20200617104756.6312-1-david@redhat.com Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 867157311dc8..7715945e35b5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18503,6 +18503,7 @@ VIRTIO MEM DRIVER M: David Hildenbrand L: virtualization@lists.linux-foundation.org S: Maintained +W: https://virtio-mem.gitlab.io/ F: drivers/virtio/virtio_mem.c F: include/uapi/linux/virtio_mem.h From 25219dbfa734e848fe4da84143f972d0301bb7c6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 9 Oct 2020 16:42:59 -0700 Subject: [PATCH 401/497] xfs: fix fallocate functions when rtextsize is larger than 1 In commit fe341eb151ec, I forgot that xfs_free_file_space isn't strictly a "remove mapped blocks" function. It is actually a function to zero file space by punching out the middle and writing zeroes to the unaligned ends of the specified range. Therefore, putting a rtextsize alignment check in that function is wrong because that breaks unaligned ZERO_RANGE on the realtime volume. Furthermore, xfs_file_fallocate already has alignment checks for the functions require the file range to be aligned to the size of a fundamental allocation unit (which is 1 FSB on the data volume and 1 rt extent on the realtime volume). Create a new helper to check fallocate arguments against the realtiem allocation unit size, fix the fallocate frontend to use it, fix free_file_space to delete the correct range, and remove a now redundant check from insert_file_space. NOTE: The realtime extent size is not required to be a power of two! Fixes: fe341eb151ec ("xfs: ensure that fpunch, fcollapse, and finsert operations are aligned to rt extent size") Signed-off-by: Darrick J. Wong Reviewed-by: Chandan Babu R --- fs/xfs/xfs_bmap_util.c | 18 +++++------------- fs/xfs/xfs_file.c | 40 +++++++++++++++++++++++++++++++++++----- fs/xfs/xfs_linux.h | 6 ++++++ 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index f2a8a0e75e1f..7371a7f7c652 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -947,11 +947,11 @@ xfs_free_file_space( endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* We can only free complete realtime extents. */ - if (XFS_IS_REALTIME_INODE(ip)) { - xfs_extlen_t extsz = xfs_get_extsz_hint(ip); - - if ((startoffset_fsb | endoffset_fsb) & (extsz - 1)) - return -EINVAL; + if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { + startoffset_fsb = roundup_64(startoffset_fsb, + mp->m_sb.sb_rextsize); + endoffset_fsb = rounddown_64(endoffset_fsb, + mp->m_sb.sb_rextsize); } /* @@ -1147,14 +1147,6 @@ xfs_insert_file_space( trace_xfs_insert_file_space(ip); - /* We can only insert complete realtime extents. 
*/ - if (XFS_IS_REALTIME_INODE(ip)) { - xfs_extlen_t extsz = xfs_get_extsz_hint(ip); - - if ((stop_fsb | shift_fsb) & (extsz - 1)) - return -EINVAL; - } - error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb); if (error) return error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 3d1b95124744..5b0f93f73837 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -32,6 +32,39 @@ static const struct vm_operations_struct xfs_file_vm_ops; +/* + * Decide if the given file range is aligned to the size of the fundamental + * allocation unit for the file. + */ +static bool +xfs_is_falloc_aligned( + struct xfs_inode *ip, + loff_t pos, + long long int len) +{ + struct xfs_mount *mp = ip->i_mount; + uint64_t mask; + + if (XFS_IS_REALTIME_INODE(ip)) { + if (!is_power_of_2(mp->m_sb.sb_rextsize)) { + u64 rextbytes; + u32 mod; + + rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); + div_u64_rem(pos, rextbytes, &mod); + if (mod) + return false; + div_u64_rem(len, rextbytes, &mod); + return mod == 0; + } + mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1; + } else { + mask = mp->m_sb.sb_blocksize - 1; + } + + return !((pos | len) & mask); +} + int xfs_update_prealloc_flags( struct xfs_inode *ip, @@ -850,9 +883,7 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { - unsigned int blksize_mask = i_blocksize(inode) - 1; - - if (offset & blksize_mask || len & blksize_mask) { + if (!xfs_is_falloc_aligned(ip, offset, len)) { error = -EINVAL; goto out_unlock; } @@ -872,10 +903,9 @@ xfs_file_fallocate( if (error) goto out_unlock; } else if (mode & FALLOC_FL_INSERT_RANGE) { - unsigned int blksize_mask = i_blocksize(inode) - 1; loff_t isize = i_size_read(inode); - if (offset & blksize_mask || len & blksize_mask) { + if (!xfs_is_falloc_aligned(ip, offset, len)) { error = -EINVAL; goto out_unlock; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index ad1009778d33..5b7a1e201559 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -175,6 +175,12 @@ static inline xfs_dev_t linux_to_xfs_dev_t(dev_t dev) #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) #define xfs_stack_trace() dump_stack() +static inline uint64_t rounddown_64(uint64_t x, uint32_t y) +{ + do_div(x, y); + return x * y; +} + static inline uint64_t roundup_64(uint64_t x, uint32_t y) { x += y - 1; From bb7a2c636257a26acf910acf38d13eae86d7e2c1 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Mon, 19 Oct 2020 01:05:57 +0800 Subject: [PATCH 402/497] docs/cpu-load: format the example code. format the example code. 
Signed-off-by: Hui Su Link: https://lore.kernel.org/r/20201018170557.GA7670@rlk Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/cpu-load.rst | 57 ++++++++++++++------------ 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/Documentation/admin-guide/cpu-load.rst b/Documentation/admin-guide/cpu-load.rst index ebdecf864080..f3ada90e9ca8 100644 --- a/Documentation/admin-guide/cpu-load.rst +++ b/Documentation/admin-guide/cpu-load.rst @@ -61,43 +61,46 @@ will lead to quite erratic information inside ``/proc/stat``:: static volatile sig_atomic_t stop; - static void sighandler (int signr) + static void sighandler(int signr) { - (void) signr; - stop = 1; + (void) signr; + stop = 1; } + static unsigned long hog (unsigned long niters) { - stop = 0; - while (!stop && --niters); - return niters; + stop = 0; + while (!stop && --niters); + return niters; } + int main (void) { - int i; - struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 }, - .it_value = { .tv_sec = 0, .tv_usec = 1 } }; - sigset_t set; - unsigned long v[HIST]; - double tmp = 0.0; - unsigned long n; - signal (SIGALRM, &sighandler); - setitimer (ITIMER_REAL, &it, NULL); + int i; + struct itimerval it = { + .it_interval = { .tv_sec = 0, .tv_usec = 1 }, + .it_value = { .tv_sec = 0, .tv_usec = 1 } }; + sigset_t set; + unsigned long v[HIST]; + double tmp = 0.0; + unsigned long n; + signal(SIGALRM, &sighandler); + setitimer(ITIMER_REAL, &it, NULL); - hog (ULONG_MAX); - for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX); - for (i = 0; i < HIST; ++i) tmp += v[i]; - tmp /= HIST; - n = tmp - (tmp / 3.0); + hog (ULONG_MAX); + for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog(ULONG_MAX); + for (i = 0; i < HIST; ++i) tmp += v[i]; + tmp /= HIST; + n = tmp - (tmp / 3.0); - sigemptyset (&set); - sigaddset (&set, SIGALRM); + sigemptyset(&set); + sigaddset(&set, SIGALRM); - for (;;) { - hog (n); - sigwait (&set, &i); - } - return 0; + for (;;) { + hog(n); + sigwait(&set, &i); + } + return 0; } From 27def953b63b43508021f31560b7d169c5f77857 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 15 Oct 2020 16:17:31 -0700 Subject: [PATCH 403/497] docs: deprecated.rst: Expand str*cpy() replacement notes The notes on replacing the deprecated str*cpy() functions didn't call enough attention to the change in return type. Add these details and clean up the language a bit more. Signed-off-by: Kees Cook Acked-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20201015231730.2138505-1-keescook@chromium.org Signed-off-by: Jonathan Corbet --- Documentation/process/deprecated.rst | 44 ++++++++++++++++------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/Documentation/process/deprecated.rst b/Documentation/process/deprecated.rst index ff71d802b53d..9d83b8db8874 100644 --- a/Documentation/process/deprecated.rst +++ b/Documentation/process/deprecated.rst @@ -106,23 +106,29 @@ NUL or newline terminated. strcpy() -------- -strcpy() performs no bounds checking on the destination -buffer. This could result in linear overflows beyond the -end of the buffer, leading to all kinds of misbehaviors. While -`CONFIG_FORTIFY_SOURCE=y` and various compiler flags help reduce the -risk of using this function, there is no good reason to add new uses of -this function. The safe replacement is strscpy(). +strcpy() performs no bounds checking on the destination buffer. This +could result in linear overflows beyond the end of the buffer, leading to +all kinds of misbehaviors. 
While `CONFIG_FORTIFY_SOURCE=y` and various +compiler flags help reduce the risk of using this function, there is +no good reason to add new uses of this function. The safe replacement +is strscpy(), though care must be given to any cases where the return +value of strcpy() was used, since strscpy() does not return a pointer to +the destination, but rather a count of non-NUL bytes copied (or negative +errno when it truncates). strncpy() on NUL-terminated strings ----------------------------------- -Use of strncpy() does not guarantee that the destination buffer -will be NUL terminated. This can lead to various linear read overflows -and other misbehavior due to the missing termination. It also NUL-pads the -destination buffer if the source contents are shorter than the destination -buffer size, which may be a needless performance penalty for callers using -only NUL-terminated strings. The safe replacement is strscpy(). -(Users of strscpy() still needing NUL-padding should instead -use strscpy_pad().) +Use of strncpy() does not guarantee that the destination buffer will +be NUL terminated. This can lead to various linear read overflows and +other misbehavior due to the missing termination. It also NUL-pads +the destination buffer if the source contents are shorter than the +destination buffer size, which may be a needless performance penalty +for callers using only NUL-terminated strings. The safe replacement is +strscpy(), though care must be given to any cases where the return value +of strncpy() was used, since strscpy() does not return a pointer to the +destination, but rather a count of non-NUL bytes copied (or negative +errno when it truncates). Any cases still needing NUL-padding should +instead use strscpy_pad(). If a caller is using non-NUL-terminated strings, strncpy() can still be used, but destinations should be marked with the `__nonstring @@ -131,10 +137,12 @@ attribute to avoid future compiler warnings. strlcpy() --------- -strlcpy() reads the entire source buffer first, possibly exceeding -the given limit of bytes to copy. This is inefficient and can lead to -linear read overflows if a source string is not NUL-terminated. The -safe replacement is strscpy(). +strlcpy() reads the entire source buffer first (since the return value +is meant to match that of strlen()). This read may exceed the destination +size limit. This is both inefficient and can lead to linear read overflows +if a source string is not NUL-terminated. The safe replacement is strscpy(), +though care must be given to any cases where the return value of strlcpy() +is used, since strscpy() will return negative errno values when it truncates. %p format specifier ------------------- From ac8bf0de6ad7fa399d016d6dfc4b9c2f17625a8b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 15 Oct 2020 15:45:59 -0700 Subject: [PATCH 404/497] docs: lkdtm: Modernize and improve details The details on using LKDTM were overly obscure. Modernize the details and expand examples to better illustrate how to use the interfaces. Additionally add missing SPDX header. 
Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201015224559.2137489-1-keescook@chromium.org Signed-off-by: Jonathan Corbet --- .../fault-injection/provoke-crashes.rst | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/Documentation/fault-injection/provoke-crashes.rst b/Documentation/fault-injection/provoke-crashes.rst index 9279a3e12278..a20ba5d93932 100644 --- a/Documentation/fault-injection/provoke-crashes.rst +++ b/Documentation/fault-injection/provoke-crashes.rst @@ -1,16 +1,19 @@ -=============== -Provoke crashes -=============== +.. SPDX-License-Identifier: GPL-2.0 -The lkdtm module provides an interface to crash or injure the kernel at -predefined crashpoints to evaluate the reliability of crash dumps obtained -using different dumping solutions. The module uses KPROBEs to instrument -crashing points, but can also crash the kernel directly without KRPOBE -support. +============================================================ +Provoking crashes with Linux Kernel Dump Test Module (LKDTM) +============================================================ +The lkdtm module provides an interface to disrupt (and usually crash) +the kernel at predefined code locations to evaluate the reliability of +the kernel's exception handling and to test crash dumps obtained using +different dumping solutions. The module uses KPROBEs to instrument the +trigger location, but can also trigger the kernel directly without KPROBE +support via debugfs. -You can provide the way either through module arguments when inserting -the module, or through a debugfs interface. +You can select the location of the trigger ("crash point name") and the +type of action ("crash point type") either through module arguments when +inserting the module, or through the debugfs interface. Usage:: @@ -18,31 +21,38 @@ Usage:: [cpoint_count={>0}] recur_count - Recursion level for the stack overflow test. Default is 10. + Recursion level for the stack overflow test. By default this is + dynamically calculated based on kernel configuration, with the + goal of being just large enough to exhaust the kernel stack. The + value can be seen at `/sys/module/lkdtm/parameters/recur_count`. cpoint_name - Crash point where the kernel is to be crashed. It can be + Where in the kernel to trigger the action. It can be one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY, FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD, - IDE_CORE_CP, DIRECT + IDE_CORE_CP, or DIRECT cpoint_type Indicates the action to be taken on hitting the crash point. - It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW, - CORRUPT_STACK, UNALIGNED_LOAD_STORE_WRITE, OVERWRITE_ALLOCATION, - WRITE_AFTER_FREE, + These are numerous, and best queried directly from debugfs. Some + of the common ones are PANIC, BUG, EXCEPTION, LOOP, and OVERFLOW. + See the contents of `/sys/kernel/debug/provoke-crash/DIRECT` for + a complete list. cpoint_count Indicates the number of times the crash point is to be hit - to trigger an action. The default is 10. + before triggering the action. The default is 10 (except for + DIRECT, which always fires immediately). You can also induce failures by mounting debugfs and writing the type to -/provoke-crash/. E.g.:: +/provoke-crash/. 
E.g.:: - mount -t debugfs debugfs /mnt - echo EXCEPTION > /mnt/provoke-crash/INT_HARDWARE_ENTRY + mount -t debugfs debugfs /sys/kernel/debug + echo EXCEPTION > /sys/kernel/debug/provoke-crash/INT_HARDWARE_ENTRY +The special file `DIRECT` will induce the action directly without KPROBE +instrumentation. This mode is the only one available when the module is +built for a kernel without KPROBEs support:: -A special file is `DIRECT` which will induce the crash directly without -KPROBE instrumentation. This mode is the only one available when the module -is built on a kernel without KPROBEs support. + # Instead of having a BUG kill your shell, have it kill "cat": + cat <(echo WRITE_RO) >/sys/kernel/debug/provoke-crash/DIRECT From d16eb0edf91760cac4d8cb09d8b9ab162424f0df Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 15 Oct 2020 11:12:06 +0200 Subject: [PATCH 405/497] docs: driver-api: remove a duplicated index entry The ipmb file was added twice at index.rst. That sounds to be because the same patch was applied twice, via different git trees: commit f6ae22d64433fd8e08654adad7966299da931bb9 Author: Mauro Carvalho Chehab Commit: Jonathan Corbet docs: ipmb: place it at driver-api and convert to ReST commit ac499fba98c3c65078fd84fa0a62cd6f6d5837ed Author: Mauro Carvalho Chehab Commit: Corey Minyard docs: ipmb: place it at driver-api and convert to ReST With Sphinx 4.0.0 development tree, a new warning is produced due to that: .../Documentation/driver-api/index.rst:14: WARNING: duplicated entry found in toctree: driver-api/ipmb The fix is trivial: just drop the duplicated line. Fixes: f6ae22d64433 ("docs: ipmb: place it at driver-api and convert to ReST") Fixes: ac499fba98c3 ("docs: ipmb: place it at driver-api and convert to ReST") Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/623fb26a8409a7b002e45bdbb6f517ac08fd508a.1602753121.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/driver-api/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 5ef2cfe3a16b..a48c61a043f9 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -79,7 +79,6 @@ available subsections can be seen below. console dcdbas eisa - ipmb isa isapnp io-mapping From d7a4c55b1376962a32708def0930ec5a72ba1578 Mon Sep 17 00:00:00 2001 From: Wei Lin Chang Date: Thu, 15 Oct 2020 14:22:42 +0800 Subject: [PATCH 406/497] Documentation: x86: fix a missing word in x86_64/mm.rst. This patch adds a missing word in x86/x86_64/mm.rst, without which the note reads awkwardly. Signed-off-by: Wei Lin Chang Link: https://lore.kernel.org/r/20201015062242.26296-1-r09922117@csie.ntu.edu.tw Signed-off-by: Jonathan Corbet --- Documentation/x86/x86_64/mm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/x86/x86_64/mm.rst b/Documentation/x86/x86_64/mm.rst index e5053404a1ae..ede1875719fb 100644 --- a/Documentation/x86/x86_64/mm.rst +++ b/Documentation/x86/x86_64/mm.rst @@ -19,7 +19,7 @@ Complete virtual memory map with 4-level page tables Note that as we get closer to the top of the address space, the notation changes from TB to GB and then MB/KB. - - "16M TB" might look weird at first sight, but it's an easier to visualize size + - "16M TB" might look weird at first sight, but it's an easier way to visualize size notation than "16 EB", which few will recognize at first sight as 16 exabytes. 
It also shows it nicely how incredibly large 64-bit address space is. From e0533dee522593c25a88b63bf730b2096f6d4122 Mon Sep 17 00:00:00 2001 From: Bailu Lin Date: Tue, 13 Oct 2020 19:20:03 -0700 Subject: [PATCH 407/497] Documentation: Chinese translation of Documentation/arm64/hugetlbpage.rst This is a Chinese translated version of Documentation/arm64/hugetlbpage.rst Signed-off-by: Bailu Lin Reviewed-by: Alex Shi Link: https://lore.kernel.org/r/20201014022003.43862-1-bailu.lin@vivo.com Signed-off-by: Jonathan Corbet --- Documentation/arm64/hugetlbpage.rst | 2 + .../translations/zh_CN/arm64/hugetlbpage.rst | 45 +++++++++++++++++++ .../translations/zh_CN/arm64/index.rst | 1 + 3 files changed, 48 insertions(+) create mode 100644 Documentation/translations/zh_CN/arm64/hugetlbpage.rst diff --git a/Documentation/arm64/hugetlbpage.rst b/Documentation/arm64/hugetlbpage.rst index b44f939e5210..a110124c11e3 100644 --- a/Documentation/arm64/hugetlbpage.rst +++ b/Documentation/arm64/hugetlbpage.rst @@ -1,3 +1,5 @@ +.. _hugetlbpage_index: + ==================== HugeTLBpage on ARM64 ==================== diff --git a/Documentation/translations/zh_CN/arm64/hugetlbpage.rst b/Documentation/translations/zh_CN/arm64/hugetlbpage.rst new file mode 100644 index 000000000000..13304d269d0b --- /dev/null +++ b/Documentation/translations/zh_CN/arm64/hugetlbpage.rst @@ -0,0 +1,45 @@ +.. include:: ../disclaimer-zh_CN.rst + +:Original: :ref:`Documentation/arm64/hugetlbpage.rst ` + +Translator: Bailu Lin + +===================== +ARM64中的 HugeTLBpage +===================== + +大页依靠有效利用 TLBs 来提高地址翻译的性能。这取决于以下 +两点 - + + - 大页的大小 + - TLBs 支持的条目大小 + +ARM64 接口支持2种大页方式。 + +1) pud/pmd 级别的块映射 +----------------------- + +这是常规大页,他们的 pmd 或 pud 页面表条目指向一个内存块。 +不管 TLB 中支持的条目大小如何,块映射可以减少翻译大页地址 +所需遍历的页表深度。 + +2) 使用连续位 +------------- + +架构中转换页表条目(D4.5.3, ARM DDI 0487C.a)中提供一个连续 +位告诉 MMU 这个条目是一个连续条目集的一员,它可以被缓存在单 +个 TLB 条目中。 + +在 Linux 中连续位用来增加 pmd 和 pte(最后一级)级别映射的大 +小。受支持的连续页表条目数量因页面大小和页表级别而异。 + + +支持以下大页尺寸配置 - + + ====== ======== ==== ======== === + - CONT PTE PMD CONT PMD PUD + ====== ======== ==== ======== === + 4K: 64K 2M 32M 1G + 16K: 2M 32M 1G + 64K: 2M 512M 16G + ====== ======== ==== ======== === diff --git a/Documentation/translations/zh_CN/arm64/index.rst b/Documentation/translations/zh_CN/arm64/index.rst index 646ed1f7aea3..e31a6090384d 100644 --- a/Documentation/translations/zh_CN/arm64/index.rst +++ b/Documentation/translations/zh_CN/arm64/index.rst @@ -14,3 +14,4 @@ ARM64 架构 :maxdepth: 2 amu + hugetlbpage From 030f066f677f297033772dcdce9538b968fbeb14 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 13 Oct 2020 18:27:25 +0200 Subject: [PATCH 408/497] docs: submitting-patches: describe preserving review/test tags From time to time, the novice kernel contributors do not add Reviewed-by or Tested-by tags to the next versions of the patches. Mostly because they are unaware that responsibility of adding these tags in next version is on submitter, not maintainer. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20201013162725.13572-1-krzk@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/process/submitting-patches.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst index 58586ffe2808..83d9a82055a7 100644 --- a/Documentation/process/submitting-patches.rst +++ b/Documentation/process/submitting-patches.rst @@ -527,6 +527,13 @@ done on the patch. 
Reviewed-by: tags, when supplied by reviewers known to understand the subject area and to perform thorough reviews, will normally increase the likelihood of your patch getting into the kernel. +Both Tested-by and Reviewed-by tags, once received on mailing list from tester +or reviewer, should be added by author to the applicable patches when sending +next versions. However if the patch has changed substantially in following +version, these tags might not be applicable anymore and thus should be removed. +Usually removal of someone's Tested-by or Reviewed-by tags should be mentioned +in the patch changelog (after the '---' separator). + A Suggested-by: tag indicates that the patch idea is suggested by the person named and ensures credit to the person for the idea. Please note that this tag should not be added without the reporter's permission, especially if the From 274c240c760ed4647ddae1f1b994e0dd3f71cbb1 Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Wed, 14 Oct 2020 14:05:18 +0800 Subject: [PATCH 409/497] drm/amdgpu: add function to program pbb mode for sienna cichlid Add function for sienna_cichlid to force PBB workload mode to zero by checking whether there have SE been harvested. Signed-off-by: Likun Gao Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.9.x --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 62 ++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 9792ec737029..f95a173c52f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -112,6 +112,22 @@ #define mmCP_HYP_ME_UCODE_DATA 0x5817 #define mmCP_HYP_ME_UCODE_DATA_BASE_IDX 1 +//CC_GC_SA_UNIT_DISABLE +#define mmCC_GC_SA_UNIT_DISABLE 0x0fe9 +#define mmCC_GC_SA_UNIT_DISABLE_BASE_IDX 0 +#define CC_GC_SA_UNIT_DISABLE__SA_DISABLE__SHIFT 0x8 +#define CC_GC_SA_UNIT_DISABLE__SA_DISABLE_MASK 0x0000FF00L +//GC_USER_SA_UNIT_DISABLE +#define mmGC_USER_SA_UNIT_DISABLE 0x0fea +#define mmGC_USER_SA_UNIT_DISABLE_BASE_IDX 0 +#define GC_USER_SA_UNIT_DISABLE__SA_DISABLE__SHIFT 0x8 +#define GC_USER_SA_UNIT_DISABLE__SA_DISABLE_MASK 0x0000FF00L +//PA_SC_ENHANCE_3 +#define mmPA_SC_ENHANCE_3 0x1085 +#define mmPA_SC_ENHANCE_3_BASE_IDX 0 +#define PA_SC_ENHANCE_3__FORCE_PBB_WORKLOAD_MODE_TO_ZERO__SHIFT 0x3 +#define PA_SC_ENHANCE_3__FORCE_PBB_WORKLOAD_MODE_TO_ZERO_MASK 0x00000008L + MODULE_FIRMWARE("amdgpu/navi10_ce.bin"); MODULE_FIRMWARE("amdgpu/navi10_pfp.bin"); MODULE_FIRMWARE("amdgpu/navi10_me.bin"); @@ -3188,6 +3204,8 @@ static int gfx_v10_0_wait_for_rlc_autoload_complete(struct amdgpu_device *adev); static void gfx_v10_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume); static void gfx_v10_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume); static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start, bool secure); +static u32 gfx_v10_3_get_disabled_sa(struct amdgpu_device *adev); +static void gfx_v10_3_program_pbb_mode(struct amdgpu_device *adev); static void gfx10_kiq_set_resources(struct amdgpu_ring *kiq_ring, uint64_t queue_mask) { @@ -6950,6 +6968,9 @@ static int gfx_v10_0_hw_init(void *handle) if (r) return r; + if (adev->asic_type == CHIP_SIENNA_CICHLID) + gfx_v10_3_program_pbb_mode(adev); + return r; } @@ -8797,6 +8818,47 @@ static int gfx_v10_0_get_cu_info(struct amdgpu_device *adev, return 0; } +static u32 gfx_v10_3_get_disabled_sa(struct amdgpu_device *adev) +{ + uint32_t efuse_setting, vbios_setting, disabled_sa, 
max_sa_mask; + + efuse_setting = RREG32_SOC15(GC, 0, mmCC_GC_SA_UNIT_DISABLE); + efuse_setting &= CC_GC_SA_UNIT_DISABLE__SA_DISABLE_MASK; + efuse_setting >>= CC_GC_SA_UNIT_DISABLE__SA_DISABLE__SHIFT; + + vbios_setting = RREG32_SOC15(GC, 0, mmGC_USER_SA_UNIT_DISABLE); + vbios_setting &= GC_USER_SA_UNIT_DISABLE__SA_DISABLE_MASK; + vbios_setting >>= GC_USER_SA_UNIT_DISABLE__SA_DISABLE__SHIFT; + + max_sa_mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_sh_per_se * + adev->gfx.config.max_shader_engines); + disabled_sa = efuse_setting | vbios_setting; + disabled_sa &= max_sa_mask; + + return disabled_sa; +} + +static void gfx_v10_3_program_pbb_mode(struct amdgpu_device *adev) +{ + uint32_t max_sa_per_se, max_sa_per_se_mask, max_shader_engines; + uint32_t disabled_sa_mask, se_index, disabled_sa_per_se; + + disabled_sa_mask = gfx_v10_3_get_disabled_sa(adev); + + max_sa_per_se = adev->gfx.config.max_sh_per_se; + max_sa_per_se_mask = (1 << max_sa_per_se) - 1; + max_shader_engines = adev->gfx.config.max_shader_engines; + + for (se_index = 0; max_shader_engines > se_index; se_index++) { + disabled_sa_per_se = disabled_sa_mask >> (se_index * max_sa_per_se); + disabled_sa_per_se &= max_sa_per_se_mask; + if (disabled_sa_per_se == max_sa_per_se_mask) { + WREG32_FIELD15(GC, 0, PA_SC_ENHANCE_3, FORCE_PBB_WORKLOAD_MODE_TO_ZERO, 1); + break; + } + } +} + const struct amdgpu_ip_block_version gfx_v10_0_ip_block = { .type = AMD_IP_BLOCK_TYPE_GFX, From 843c7eb2f7571aa092a8ea010c80e8d94c197f67 Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Wed, 30 Sep 2020 14:34:08 +0800 Subject: [PATCH 410/497] drm/amdgpu: add rlc iram and dram firmware support Support to load RLC iram and dram ucode when RLC firmware struct use v2.2 Signed-off-by: Likun Gao Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h | 4 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 10 ++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h | 11 +++++++ drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 39 +++++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h | 4 +-- 6 files changed, 66 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 18be544d8c1e..a9cae6d943c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1750,6 +1750,12 @@ static int psp_get_fw_type(struct amdgpu_firmware_info *ucode, case AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM: *type = GFX_FW_TYPE_RLC_RESTORE_LIST_SRM_MEM; break; + case AMDGPU_UCODE_ID_RLC_IRAM: + *type = GFX_FW_TYPE_RLC_IRAM; + break; + case AMDGPU_UCODE_ID_RLC_DRAM: + *type = GFX_FW_TYPE_RLC_DRAM_BOOT; + break; case AMDGPU_UCODE_ID_SMC: *type = GFX_FW_TYPE_SMU; break; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h index 60bb3e8b3118..aeaaae713c59 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h @@ -168,12 +168,16 @@ struct amdgpu_rlc { u32 save_restore_list_cntl_size_bytes; u32 save_restore_list_gpm_size_bytes; u32 save_restore_list_srm_size_bytes; + u32 rlc_iram_ucode_size_bytes; + u32 rlc_dram_ucode_size_bytes; u32 *register_list_format; u32 *register_restore; u8 *save_restore_list_cntl; u8 *save_restore_list_gpm; u8 *save_restore_list_srm; + u8 *rlc_iram_ucode; + u8 *rlc_dram_ucode; bool is_rlc_v2_1; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c 
index 55fe19a2f332..b313ce4c3e97 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c @@ -500,6 +500,8 @@ static int amdgpu_ucode_init_single_fw(struct amdgpu_device *adev, ucode->ucode_id != AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL && ucode->ucode_id != AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM && ucode->ucode_id != AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM && + ucode->ucode_id != AMDGPU_UCODE_ID_RLC_IRAM && + ucode->ucode_id != AMDGPU_UCODE_ID_RLC_DRAM && ucode->ucode_id != AMDGPU_UCODE_ID_DMCU_ERAM && ucode->ucode_id != AMDGPU_UCODE_ID_DMCU_INTV && ucode->ucode_id != AMDGPU_UCODE_ID_DMCUB)) { @@ -556,6 +558,14 @@ static int amdgpu_ucode_init_single_fw(struct amdgpu_device *adev, ucode->ucode_size = adev->gfx.rlc.save_restore_list_srm_size_bytes; memcpy(ucode->kaddr, adev->gfx.rlc.save_restore_list_srm, ucode->ucode_size); + } else if (ucode->ucode_id == AMDGPU_UCODE_ID_RLC_IRAM) { + ucode->ucode_size = adev->gfx.rlc.rlc_iram_ucode_size_bytes; + memcpy(ucode->kaddr, adev->gfx.rlc.rlc_iram_ucode, + ucode->ucode_size); + } else if (ucode->ucode_id == AMDGPU_UCODE_ID_RLC_DRAM) { + ucode->ucode_size = adev->gfx.rlc.rlc_dram_ucode_size_bytes; + memcpy(ucode->kaddr, adev->gfx.rlc.rlc_dram_ucode, + ucode->ucode_size); } else if (ucode->ucode_id == AMDGPU_UCODE_ID_CP_MES) { ucode->ucode_size = le32_to_cpu(mes_hdr->mes_ucode_size_bytes); memcpy(ucode->kaddr, (void *)((uint8_t *)adev->mes.fw->data + diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h index 3c23c6293ff9..0e43b46d3ab5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h @@ -222,6 +222,15 @@ struct rlc_firmware_header_v2_1 { uint32_t save_restore_list_srm_offset_bytes; }; +/* version_major=2, version_minor=1 */ +struct rlc_firmware_header_v2_2 { + struct rlc_firmware_header_v2_1 v2_1; + uint32_t rlc_iram_ucode_size_bytes; + uint32_t rlc_iram_ucode_offset_bytes; + uint32_t rlc_dram_ucode_size_bytes; + uint32_t rlc_dram_ucode_offset_bytes; +}; + /* version_major=1, version_minor=0 */ struct sdma_firmware_header_v1_0 { struct common_firmware_header header; @@ -339,6 +348,8 @@ enum AMDGPU_UCODE_ID { AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL, AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM, AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM, + AMDGPU_UCODE_ID_RLC_IRAM, + AMDGPU_UCODE_ID_RLC_DRAM, AMDGPU_UCODE_ID_RLC_G, AMDGPU_UCODE_ID_STORAGE, AMDGPU_UCODE_ID_SMC, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index f95a173c52f1..aff14e257ef5 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -3604,6 +3604,17 @@ static void gfx_v10_0_init_rlc_ext_microcode(struct amdgpu_device *adev) le32_to_cpu(rlc_hdr->reg_list_format_direct_reg_list_length); } +static void gfx_v10_0_init_rlc_iram_dram_microcode(struct amdgpu_device *adev) +{ + const struct rlc_firmware_header_v2_2 *rlc_hdr; + + rlc_hdr = (const struct rlc_firmware_header_v2_2 *)adev->gfx.rlc_fw->data; + adev->gfx.rlc.rlc_iram_ucode_size_bytes = le32_to_cpu(rlc_hdr->rlc_iram_ucode_size_bytes); + adev->gfx.rlc.rlc_iram_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->rlc_iram_ucode_offset_bytes); + adev->gfx.rlc.rlc_dram_ucode_size_bytes = le32_to_cpu(rlc_hdr->rlc_dram_ucode_size_bytes); + adev->gfx.rlc.rlc_dram_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->rlc_dram_ucode_offset_bytes); +} + static bool gfx_v10_0_navi10_gfxoff_should_enable(struct amdgpu_device *adev) { bool ret = 
false; @@ -3719,8 +3730,6 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) rlc_hdr = (const struct rlc_firmware_header_v2_0 *)adev->gfx.rlc_fw->data; version_major = le16_to_cpu(rlc_hdr->header.header_version_major); version_minor = le16_to_cpu(rlc_hdr->header.header_version_minor); - if (version_major == 2 && version_minor == 1) - adev->gfx.rlc.is_rlc_v2_1 = true; adev->gfx.rlc_fw_version = le32_to_cpu(rlc_hdr->header.ucode_version); adev->gfx.rlc_feature_version = le32_to_cpu(rlc_hdr->ucode_feature_version); @@ -3762,8 +3771,12 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) for (i = 0 ; i < (rlc_hdr->reg_list_size_bytes >> 2); i++) adev->gfx.rlc.register_restore[i] = le32_to_cpu(tmp[i]); - if (adev->gfx.rlc.is_rlc_v2_1) - gfx_v10_0_init_rlc_ext_microcode(adev); + if (version_major == 2) { + if (version_minor >= 1) + gfx_v10_0_init_rlc_ext_microcode(adev); + if (version_minor == 2) + gfx_v10_0_init_rlc_iram_dram_microcode(adev); + } } snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec%s.bin", chip_name, wks); @@ -3824,8 +3837,7 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) adev->firmware.fw_size += ALIGN(le32_to_cpu(header->ucode_size_bytes), PAGE_SIZE); } - if (adev->gfx.rlc.is_rlc_v2_1 && - adev->gfx.rlc.save_restore_list_cntl_size_bytes && + if (adev->gfx.rlc.save_restore_list_cntl_size_bytes && adev->gfx.rlc.save_restore_list_gpm_size_bytes && adev->gfx.rlc.save_restore_list_srm_size_bytes) { info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_RESTORE_LIST_CNTL]; @@ -3845,6 +3857,21 @@ static int gfx_v10_0_init_microcode(struct amdgpu_device *adev) info->fw = adev->gfx.rlc_fw; adev->firmware.fw_size += ALIGN(adev->gfx.rlc.save_restore_list_srm_size_bytes, PAGE_SIZE); + + if (adev->gfx.rlc.rlc_iram_ucode_size_bytes && + adev->gfx.rlc.rlc_dram_ucode_size_bytes) { + info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_IRAM]; + info->ucode_id = AMDGPU_UCODE_ID_RLC_IRAM; + info->fw = adev->gfx.rlc_fw; + adev->firmware.fw_size += + ALIGN(adev->gfx.rlc.rlc_iram_ucode_size_bytes, PAGE_SIZE); + + info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_DRAM]; + info->ucode_id = AMDGPU_UCODE_ID_RLC_DRAM; + info->fw = adev->gfx.rlc_fw; + adev->firmware.fw_size += + ALIGN(adev->gfx.rlc.rlc_dram_ucode_size_bytes, PAGE_SIZE); + } } info = &adev->firmware.ucode[AMDGPU_UCODE_ID_CP_MEC1]; diff --git a/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h b/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h index 1ef2f5b1d828..4137dc710aaf 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h +++ b/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h @@ -201,7 +201,7 @@ enum psp_gfx_fw_type { GFX_FW_TYPE_UVD1 = 23, /* UVD1 VG-20 */ GFX_FW_TYPE_TOC = 24, /* TOC NV-10 */ GFX_FW_TYPE_RLC_P = 25, /* RLC P NV */ - GFX_FW_TYPE_RLX6 = 26, /* RLX6 NV */ + GFX_FW_TYPE_RLC_IRAM = 26, /* RLC_IRAM NV */ GFX_FW_TYPE_GLOBAL_TAP_DELAYS = 27, /* GLOBAL TAP DELAYS NV */ GFX_FW_TYPE_SE0_TAP_DELAYS = 28, /* SE0 TAP DELAYS NV */ GFX_FW_TYPE_SE1_TAP_DELAYS = 29, /* SE1 TAP DELAYS NV */ @@ -223,7 +223,7 @@ enum psp_gfx_fw_type { GFX_FW_TYPE_ACCUM_CTRL_RAM = 45, /* ACCUM CTRL RAM NV */ GFX_FW_TYPE_RLCP_CAM = 46, /* RLCP CAM NV */ GFX_FW_TYPE_RLC_SPP_CAM_EXT = 47, /* RLC SPP CAM EXT NV */ - GFX_FW_TYPE_RLX6_DRAM_BOOT = 48, /* RLX6 DRAM BOOT NV */ + GFX_FW_TYPE_RLC_DRAM_BOOT = 48, /* RLC DRAM BOOT NV */ GFX_FW_TYPE_VCN0_RAM = 49, /* VCN_RAM NV + RN */ GFX_FW_TYPE_VCN1_RAM = 50, /* VCN_RAM NV + RN */ GFX_FW_TYPE_DMUB = 51, /* DMUB RN */ From 207ac684792560acdb9e06f9d707ebf63c84b0e0 Mon Sep 17 00:00:00 2001 From: Evan Quan 
Date: Thu, 15 Oct 2020 14:57:46 +0800 Subject: [PATCH 411/497] drm/amdgpu: correct the gpu reset handling for job != NULL case Current code wrongly treat all cases as job == NULL. Signed-off-by: Evan Quan Reviewed-and-tested-by: Jane Jian Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e8b41756c9f9..37da3537ba2e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4625,7 +4625,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { r = amdgpu_device_pre_asic_reset(tmp_adev, - NULL, + (tmp_adev == adev) ? job : NULL, &need_full_reset); /*TODO Should we stop ?*/ if (r) { From d48d7484d8dca1d4577fc53f1f826e68420d00eb Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Fri, 16 Oct 2020 11:07:47 +0800 Subject: [PATCH 412/497] drm/amd/swsmu: add missing feature map for sienna_cichlid it will cause smu sysfs node of "pp_features" show error. Signed-off-by: Kevin Wang Reviewed-by: Likun Gao Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.9.x --- drivers/gpu/drm/amd/pm/inc/smu_types.h | 1 + drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/inc/smu_types.h b/drivers/gpu/drm/amd/pm/inc/smu_types.h index 35fc46d3c9c0..cbf4a58b77d9 100644 --- a/drivers/gpu/drm/amd/pm/inc/smu_types.h +++ b/drivers/gpu/drm/amd/pm/inc/smu_types.h @@ -220,6 +220,7 @@ enum smu_clk_type { __SMU_DUMMY_MAP(DPM_MP0CLK), \ __SMU_DUMMY_MAP(DPM_LINK), \ __SMU_DUMMY_MAP(DPM_DCEFCLK), \ + __SMU_DUMMY_MAP(DPM_XGMI), \ __SMU_DUMMY_MAP(DS_GFXCLK), \ __SMU_DUMMY_MAP(DS_SOCCLK), \ __SMU_DUMMY_MAP(DS_LCLK), \ diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index c27806fd07e0..ca2abb2e5340 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -151,14 +151,17 @@ static struct cmn2asic_mapping sienna_cichlid_feature_mask_map[SMU_FEATURE_COUNT FEA_MAP(DPM_GFXCLK), FEA_MAP(DPM_GFX_GPO), FEA_MAP(DPM_UCLK), + FEA_MAP(DPM_FCLK), FEA_MAP(DPM_SOCCLK), FEA_MAP(DPM_MP0CLK), FEA_MAP(DPM_LINK), FEA_MAP(DPM_DCEFCLK), + FEA_MAP(DPM_XGMI), FEA_MAP(MEM_VDDCI_SCALING), FEA_MAP(MEM_MVDD_SCALING), FEA_MAP(DS_GFXCLK), FEA_MAP(DS_SOCCLK), + FEA_MAP(DS_FCLK), FEA_MAP(DS_LCLK), FEA_MAP(DS_DCEFCLK), FEA_MAP(DS_UCLK), From 0d142232d9436acab3578ee995472f58adcbf201 Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Thu, 15 Oct 2020 10:48:15 +0800 Subject: [PATCH 413/497] drm/amdgpu: update golden setting for sienna_cichlid Update golden setting for sienna_cichlid. 
Signed-off-by: Likun Gao Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.9.x --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index aff14e257ef5..1c98b248a7fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -3107,6 +3107,7 @@ static const struct soc15_reg_golden golden_settings_gc_10_3[] = SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2C_ADDR_MATCH_MASK, 0xffffffff, 0xffffffcf), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2C_CM_CTRL1, 0xff8fff0f, 0x580f1008), SOC15_REG_GOLDEN_VALUE(GC, 0, mmGL2C_CTRL3, 0xf7ffffff, 0x10f80988), + SOC15_REG_GOLDEN_VALUE(GC, 0, mmLDS_CONFIG, 0x00000020, 0x00000020), SOC15_REG_GOLDEN_VALUE(GC, 0, mmPA_CL_ENHANCE, 0xf17fffff, 0x01200007), SOC15_REG_GOLDEN_VALUE(GC, 0, mmPA_SC_BINNER_TIMEOUT_COUNTER, 0xffffffff, 0x00000800), SOC15_REG_GOLDEN_VALUE(GC, 0, mmPA_SC_ENHANCE_2, 0xffffffbf, 0x00000820), From 0d427f6c290c69827b2ca33c5f1386816992e4d8 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 14 Oct 2020 13:10:06 -0400 Subject: [PATCH 414/497] drm/amd/display: Revert "drm/amd/display: Fix a list corruption" This fixes regression on device unplug and/or driver unload. [ 65.681501 < 0.000004>] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 65.681504 < 0.000003>] #PF: supervisor write access in kernel mode [ 65.681506 < 0.000002>] #PF: error_code(0x0002) - not-present page [ 65.681507 < 0.000001>] PGD 7c9437067 P4D 7c9437067 PUD 7c9db7067 PMD 0 [ 65.681511 < 0.000004>] Oops: 0002 [#1] SMP NOPTI [ 65.681512 < 0.000001>] CPU: 8 PID: 127 Comm: kworker/8:1 Tainted: G W O 5.9.0-rc2-dev+ #59 [ 65.681514 < 0.000002>] Hardware name: System manufacturer System Product Name/PRIME X470-PRO, BIOS 4406 02/28/2019 [ 65.681525 < 0.000011>] Workqueue: events drm_connector_free_work_fn [drm] [ 65.681535 < 0.000010>] RIP: 0010:drm_atomic_private_obj_fini+0x11/0x60 [drm] [ 65.681537 < 0.000002>] Code: de 4c 89 e7 e8 70 f2 ba f8 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f 5d c3 90 0f 1f 44 00 00 48 8b 47 08 48 8b 17 55 48 89 e5 53 <48> 89 42 08 48 89 10 48 b8 00 01 00 00 00 00 ad de 48 89 fb 48 89 [ 65.681541 < 0.000004>] RSP: 0018:ffffa5fa805efdd8 EFLAGS: 00010246 [ 65.681542 < 0.000001>] RAX: 0000000000000000 RBX: ffff9a4b094654d8 RCX: 0000000000000000 [ 65.681544 < 0.000002>] RDX: 0000000000000000 RSI: ffffffffba197bc2 RDI: ffff9a4b094654d8 [ 65.681545 < 0.000001>] RBP: ffffa5fa805efde0 R08: ffffffffba197b82 R09: 0000000000000040 [ 65.681547 < 0.000002>] R10: ffffa5fa805efdc8 R11: 000000000000007f R12: ffff9a4b09465888 [ 65.681549 < 0.000002>] R13: ffff9a4b36f20010 R14: ffff9a4b36f20290 R15: ffff9a4b3a692840 [ 65.681551 < 0.000002>] FS: 0000000000000000(0000) GS:ffff9a4b3ea00000(0000) knlGS:0000000000000000 [ 65.681553 < 0.000002>] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 65.681554 < 0.000001>] CR2: 0000000000000008 CR3: 00000007c9c82000 CR4: 00000000003506e0 [ 65.681556 < 0.000002>] Call Trace: [ 65.681561 < 0.000005>] drm_dp_mst_topology_mgr_destroy+0xc4/0xe0 [drm_kms_helper] [ 65.681612 < 0.000051>] amdgpu_dm_connector_destroy+0x3d/0x110 [amdgpu] [ 65.681622 < 0.000010>] drm_connector_free_work_fn+0x78/0x90 [drm] [ 65.681624 < 0.000002>] process_one_work+0x164/0x410 [ 65.681626 < 0.000002>] worker_thread+0x4d/0x450 [ 65.681628 < 0.000002>] ? rescuer_thread+0x390/0x390 [ 65.681630 < 0.000002>] kthread+0x10a/0x140 [ 65.681632 < 0.000002>] ? 
kthread_unpark+0x70/0x70 [ 65.681634 < 0.000002>] ret_from_fork+0x22/0x30 This reverts commit 1545fbf97eafc1dbdc2923e58b4186b16a834784. Signed-off-by: Andrey Grodzovsky Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index bb1bc7f5d149..51223450918a 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -5063,7 +5063,6 @@ static void amdgpu_dm_connector_destroy(struct drm_connector *connector) struct amdgpu_device *adev = drm_to_adev(connector->dev); struct amdgpu_display_manager *dm = &adev->dm; - drm_atomic_private_obj_fini(&aconnector->mst_mgr.base); #if defined(CONFIG_BACKLIGHT_CLASS_DEVICE) ||\ defined(CONFIG_BACKLIGHT_CLASS_DEVICE_MODULE) From 5dff80bdce9e385af5716ed083f9e33e814484ab Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Wed, 14 Oct 2020 13:12:30 -0400 Subject: [PATCH 415/497] drm/amd/display: Avoid MST manager resource leak. On connector destruction call drm_dp_mst_topology_mgr_destroy to release resources allocated in drm_dp_mst_topology_mgr_init. Do it only if MST manager was initilized before otherwsie a crash is seen on driver unload/device unplug. Reviewed-by: Nicholas Kazlauskas Signed-off-by: Andrey Grodzovsky Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 51223450918a..e2b23486ba4c 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -5063,6 +5063,13 @@ static void amdgpu_dm_connector_destroy(struct drm_connector *connector) struct amdgpu_device *adev = drm_to_adev(connector->dev); struct amdgpu_display_manager *dm = &adev->dm; + /* + * Call only if mst_mgr was iniitalized before since it's not done + * for all connector types. 
+ */ + if (aconnector->mst_mgr.dev) + drm_dp_mst_topology_mgr_destroy(&aconnector->mst_mgr); + #if defined(CONFIG_BACKLIGHT_CLASS_DEVICE) ||\ defined(CONFIG_BACKLIGHT_CLASS_DEVICE_MODULE) From f1bcddffe46b349a82445a8d9efd5f5fcb72557f Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 16 Oct 2020 10:50:44 -0400 Subject: [PATCH 416/497] drm/amd/psp: Fix sysfs: cannot create duplicate filename psp sysfs not cleaned up on driver unload for sienna_cichlid Fixes: ce87c98db428e7 ("drm/amdgpu: Include sienna_cichlid in USBC PD FW support.") Signed-off-by: Andrey Grodzovsky Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.9.x --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index a9cae6d943c4..96a9699f87ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -208,7 +208,8 @@ static int psp_sw_fini(void *handle) adev->psp.ta_fw = NULL; } - if (adev->asic_type == CHIP_NAVI10) + if (adev->asic_type == CHIP_NAVI10 || + adev->asic_type == CHIP_SIENNA_CICHLID) psp_sysfs_fini(adev); return 0; From 9389b9d5d3566b5687829a4098e715f0016451c7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 5 Oct 2020 12:55:32 -0700 Subject: [PATCH 417/497] KVM: VMX: Ignore userspace MSR filters for x2APIC Rework the resetting of the MSR bitmap for x2APIC MSRs to ignore userspace filtering. Allowing userspace to intercept reads to x2APIC MSRs when APICV is fully enabled for the guest simply can't work; the LAPIC and thus virtual APIC is in-kernel and cannot be directly accessed by userspace. To keep things simple we will in fact forbid intercepting x2APIC MSRs altogether, independent of the default_allow setting. Cc: Alexander Graf Cc: Aaron Lewis Cc: Peter Xu Signed-off-by: Sean Christopherson Message-Id: <20201005195532.8674-3-sean.j.christopherson@intel.com> [Modified to operate even if APICv is disabled, adjust documentation. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 27 +++++++++++--------- arch/x86/kvm/vmx/vmx.c | 45 ++++++++++++++++++++++------------ arch/x86/kvm/x86.c | 4 +-- 3 files changed, 47 insertions(+), 29 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 425325ff4434..bd94105f2960 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4735,37 +4735,37 @@ KVM_PV_VM_VERIFY struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; }; -flags values for struct kvm_msr_filter_range: +flags values for ``struct kvm_msr_filter_range``: -KVM_MSR_FILTER_READ +``KVM_MSR_FILTER_READ`` Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap indicates that a read should immediately fail, while a 1 indicates that a read for a particular MSR should be handled regardless of the default filter action. -KVM_MSR_FILTER_WRITE +``KVM_MSR_FILTER_WRITE`` Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap indicates that a write should immediately fail, while a 1 indicates that a write for a particular MSR should be handled regardless of the default filter action. -KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE +``KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE`` Filter both read and write accesses to MSRs using the given bitmap. 
A 0 in the bitmap indicates that both reads and writes should immediately fail, while a 1 indicates that reads and writes for a particular MSR are not filtered by this range. -flags values for struct kvm_msr_filter: +flags values for ``struct kvm_msr_filter``: -KVM_MSR_FILTER_DEFAULT_ALLOW +``KVM_MSR_FILTER_DEFAULT_ALLOW`` If no filter range matches an MSR index that is getting accessed, KVM will fall back to allowing access to the MSR. -KVM_MSR_FILTER_DEFAULT_DENY +``KVM_MSR_FILTER_DEFAULT_DENY`` If no filter range matches an MSR index that is getting accessed, KVM will fall back to rejecting access to the MSR. In this mode, all MSRs that should @@ -4775,14 +4775,19 @@ This ioctl allows user space to define up to 16 bitmaps of MSR ranges to specify whether a certain MSR access should be explicitly filtered for or not. If this ioctl has never been invoked, MSR accesses are not guarded and the -old KVM in-kernel emulation behavior is fully preserved. +default KVM in-kernel emulation behavior is fully preserved. As soon as the filtering is in place, every MSR access is processed through -the filtering. If a bit is within one of the defined ranges, read and write +the filtering except for accesses to the x2APIC MSRs (from 0x800 to 0x8ff); +x2APIC MSRs are always allowed, independent of the ``default_allow`` setting, +and their behavior depends on the ``X2APIC_ENABLE`` bit of the APIC base +register. + +If a bit is within one of the defined ranges, read and write accesses are guarded by the bitmap's value for the MSR index. If it is not defined in any range, whether MSR access is rejected is determined by the flags -field in the kvm_msr_filter struct: KVM_MSR_FILTER_DEFAULT_ALLOW and -KVM_MSR_FILTER_DEFAULT_DENY. +field in the kvm_msr_filter struct: ``KVM_MSR_FILTER_DEFAULT_ALLOW`` and +``KVM_MSR_FILTER_DEFAULT_DENY``. Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR filtering. In that mode, KVM_MSR_FILTER_DEFAULT_DENY no longer has any effect. diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4797ec92c88c..132a8cc9f9a4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3782,28 +3782,41 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) return mode; } -static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) +static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) { + unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; + unsigned long read_intercept; int msr; - for (msr = 0x800; msr <= 0x8ff; msr++) { - bool apicv = !!(mode & MSR_BITMAP_MODE_X2APIC_APICV); + read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, !apicv); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, true); + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned int read_idx = msr / BITS_PER_LONG; + unsigned int write_idx = read_idx + (0x800 / sizeof(long)); + + msr_bitmap[read_idx] = read_intercept; + msr_bitmap[write_idx] = ~0ul; } +} - if (mode & MSR_BITMAP_MODE_X2APIC) { - /* - * TPR reads and writes can be virtualized even if virtual interrupt - * delivery is not in use. 
- */ - vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); - if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { - vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); - } +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) +{ + if (!cpu_has_vmx_msr_bitmap()) + return; + + vmx_reset_x2apic_msrs(vcpu, mode); + + /* + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. + */ + vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, + !(mode & MSR_BITMAP_MODE_X2APIC)); + + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { + vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c4015a43cc8a..08cfb5e4bd07 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1497,8 +1497,8 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) bool r = kvm->arch.msr_filter.default_allow; int idx; - /* MSR filtering not set up, allow everything */ - if (!count) + /* MSR filtering not set up or x2APIC enabled, allow everything */ + if (!count || (index >= 0x800 && index <= 0x8ff)) return true; /* Prevent collision with set_msr_filter */ From 043248b3280cefe286113525672327a4ddfecd3f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 20 Oct 2020 10:57:01 -0400 Subject: [PATCH 418/497] KVM: VMX: Forbid userspace MSR filters for x2APIC Allowing userspace to intercept reads to x2APIC MSRs when APICV is fully enabled for the guest simply can't work. But more in general, the LAPIC could be set to in-kernel after the MSR filter is setup and allowing accesses by userspace would be very confusing. We could in principle allow userspace to intercept reads and writes to TPR, and writes to EOI and SELF_IPI, but while that could be made it work, it would still be silly. Cc: Alexander Graf Cc: Aaron Lewis Cc: Peter Xu Cc: Sean Christopherson Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 18 ++++++++++-------- arch/x86/kvm/x86.c | 9 ++++++++- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index bd94105f2960..9ece9a827a58 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4777,20 +4777,22 @@ specify whether a certain MSR access should be explicitly filtered for or not. If this ioctl has never been invoked, MSR accesses are not guarded and the default KVM in-kernel emulation behavior is fully preserved. +Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR +filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes +an error. + As soon as the filtering is in place, every MSR access is processed through the filtering except for accesses to the x2APIC MSRs (from 0x800 to 0x8ff); x2APIC MSRs are always allowed, independent of the ``default_allow`` setting, and their behavior depends on the ``X2APIC_ENABLE`` bit of the APIC base register. -If a bit is within one of the defined ranges, read and write -accesses are guarded by the bitmap's value for the MSR index. 
If it is not -defined in any range, whether MSR access is rejected is determined by the flags -field in the kvm_msr_filter struct: ``KVM_MSR_FILTER_DEFAULT_ALLOW`` and -``KVM_MSR_FILTER_DEFAULT_DENY``. - -Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR -filtering. In that mode, KVM_MSR_FILTER_DEFAULT_DENY no longer has any effect. +If a bit is within one of the defined ranges, read and write accesses are +guarded by the bitmap's value for the MSR index if the kind of access +is included in the ``struct kvm_msr_filter_range`` flags. If no range +cover this particular access, the behavior is determined by the flags +field in the kvm_msr_filter struct: ``KVM_MSR_FILTER_DEFAULT_ALLOW`` +and ``KVM_MSR_FILTER_DEFAULT_DENY``. Each bitmap range specifies a range of MSRs to potentially allow access on. The range goes from MSR index [base .. base+nmsrs]. The flags field diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 08cfb5e4bd07..0f02d0fe3abb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5252,14 +5252,21 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) struct kvm_msr_filter filter; bool default_allow; int r = 0; + bool empty = true; u32 i; if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) return -EFAULT; - kvm_clear_msr_filter(kvm); + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) + empty &= !filter.ranges[i].nmsrs; default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); + if (empty && !default_allow) + return -EINVAL; + + kvm_clear_msr_filter(kvm); + kvm->arch.msr_filter.default_allow = default_allow; /* From 10f79ccaf3d767ecf724b5e04b077d28cbcbef57 Mon Sep 17 00:00:00 2001 From: Li Qiang Date: Thu, 1 Oct 2020 02:53:33 -0700 Subject: [PATCH 419/497] Documentation: kvm: fix a typo Fixes: e287d6de62f74 ("Documentation: kvm: Convert cpuid.txt to .rst") Signed-off-by: Li Qiang Message-Id: <20201001095333.7611-1-liq3ea@163.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/cpuid.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/cpuid.rst b/Documentation/virt/kvm/cpuid.rst index a7dff9186bed..ff2b38d3e108 100644 --- a/Documentation/virt/kvm/cpuid.rst +++ b/Documentation/virt/kvm/cpuid.rst @@ -62,7 +62,7 @@ KVM_FEATURE_PV_EOI 6 paravirtualized end of interrupt handler can be enabled by writing to msr 0x4b564d04 -KVM_FEATURE_PV_UNHAULT 7 guest checks this feature bit +KVM_FEATURE_PV_UNHALT 7 guest checks this feature bit before enabling paravirtualized spinlock support From 8f116a6c7320ce55e8e0885b79ff3518105775b5 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 1 Oct 2020 13:20:14 +0200 Subject: [PATCH 420/497] x86/kvm: hide KVM options from menuconfig when KVM is not compiled Let KVM_WERROR depend on KVM, so it doesn't show in menuconfig alone. 
Signed-off-by: Matteo Croce Message-Id: <20201001112014.9561-1-mcroce@linux.microsoft.com> Fixes: 4f337faf1c55e ("KVM: allow disabling -Werror") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index fbd5bd7a945a..f92dfd8ef10d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -66,6 +66,7 @@ config KVM_WERROR default y if X86_64 && !KASAN # We use the dependency on !COMPILE_TEST to not be enabled # blindly in allmodconfig or allyesconfig configurations + depends on KVM depends on (X86_64 && !KASAN) || !COMPILE_TEST depends on EXPERT help From 66af4f5cb1ee44c41a8433877c859d4b3f922f83 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 2 Oct 2020 17:43:13 +0200 Subject: [PATCH 421/497] x86/kvm: Update the comment about asynchronous page fault in exc_page_fault() KVM was switched to interrupt-based mechanism for 'page ready' event delivery in Linux-5.8 (see commit 2635b5c4a0e4 ("KVM: x86: interrupt based APF 'page ready' event delivery")) and #PF (ab)use for 'page ready' event delivery was removed. Linux guest switched to this new mechanism exclusively in 5.9 (see commit b1d405751cd5 ("KVM: x86: Switch KVM guest to using interrupts for page ready APF delivery")) so it is not possible to get #PF for a 'page ready' event even when the guest is running on top of an older KVM (APF mechanism won't be enabled). Update the comment in exc_page_fault() to reflect the new reality. Signed-off-by: Vitaly Kuznetsov Message-Id: <20201002154313.1505327-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/mm/fault.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6e3e8a124903..3cf77592ac54 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1446,11 +1446,14 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) prefetchw(¤t->mm->mmap_lock); /* - * KVM has two types of events that are, logically, interrupts, but - * are unfortunately delivered using the #PF vector. These events are - * "you just accessed valid memory, but the host doesn't have it right - * now, so I'll put you to sleep if you continue" and "that memory - * you tried to access earlier is available now." + * KVM uses #PF vector to deliver 'page not present' events to guests + * (asynchronous page fault mechanism). The event happens when a + * userspace task is trying to access some valid (from guest's point of + * view) memory which is not currently mapped by the host (e.g. the + * memory is swapped out). Note, the corresponding "page ready" event + * which is injected when the memory becomes available, is delived via + * an interrupt mechanism and not a #PF exception + * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * * We are relying on the interrupted context being sane (valid RSP, * relevant locks not held, etc.), which is fine as long as the From 5b9bb0ebbcdcf8d04bf44a1e73e23a89a6711f31 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 18 Aug 2020 15:24:26 +0000 Subject: [PATCH 422/497] kvm: x86: encapsulate wrmsr(MSR_KVM_SYSTEM_TIME) emulation in helper fn No functional change intended. 
Reviewed-by: Jim Mattson Reviewed-by: Peter Shier Reviewed-by: Wanpeng Li Signed-off-by: Oliver Upton Change-Id: I7cbe71069db98d1ded612fd2ef088b70e7618426 Message-Id: <20200818152429.1923996-2-oupton@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 58 +++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0f02d0fe3abb..a70733d2abf0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1948,6 +1948,34 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); } +static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, + bool old_msr, bool host_initiated) +{ + struct kvm_arch *ka = &vcpu->kvm->arch; + + if (vcpu->vcpu_id == 0 && !host_initiated) { + if (ka->boot_vcpu_runs_old_kvmclock && old_msr) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + + ka->boot_vcpu_runs_old_kvmclock = old_msr; + } + + vcpu->arch.time = system_time; + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + + /* we verify if the enable bit is set... */ + vcpu->arch.pv_time_enabled = false; + if (!(system_time & 1)) + return; + + if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.pv_time, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info))) + vcpu->arch.pv_time_enabled = true; + + return; +} + static uint32_t div_frac(uint32_t dividend, uint32_t divisor) { do_shl32_div32(dividend, divisor); @@ -3093,33 +3121,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_write_wall_clock(vcpu->kvm, data); break; case MSR_KVM_SYSTEM_TIME_NEW: - case MSR_KVM_SYSTEM_TIME: { - struct kvm_arch *ka = &vcpu->kvm->arch; - - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { - bool tmp = (msr == MSR_KVM_SYSTEM_TIME); - - if (ka->boot_vcpu_runs_old_kvmclock != tmp) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - - ka->boot_vcpu_runs_old_kvmclock = tmp; - } - - vcpu->arch.time = data; - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); - - /* we verify if the enable bit is set... */ - vcpu->arch.pv_time_enabled = false; - if (!(data & 1)) - break; - - if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, data & ~1ULL, - sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = true; - + kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); + break; + case MSR_KVM_SYSTEM_TIME: + kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; - } case MSR_KVM_ASYNC_PF_EN: if (kvm_pv_enable_async_pf(vcpu, data)) return 1; From 210dfd93ea3dc63e8c21b75ddd909447341f6382 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 18 Aug 2020 15:24:27 +0000 Subject: [PATCH 423/497] kvm: x86: set wall_clock in kvm_write_wall_clock() Small change to avoid meaningless duplication in the subsequent patch. No functional change intended. 
Reviewed-by: Jim Mattson Reviewed-by: Peter Shier Signed-off-by: Oliver Upton Change-Id: I77ab9cdad239790766b7a49d5cbae5e57a3005ea Message-Id: <20200818152429.1923996-3-oupton@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a70733d2abf0..b928e092da03 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1916,6 +1916,8 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) struct pvclock_wall_clock wc; u64 wall_nsec; + kvm->arch.wall_clock = wall_clock; + if (!wall_clock) return; @@ -3117,7 +3119,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_WALL_CLOCK_NEW: case MSR_KVM_WALL_CLOCK: - vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data); break; case MSR_KVM_SYSTEM_TIME_NEW: From 66570e966dd9cb4fd57811d0056c6472a14a2c41 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 18 Aug 2020 15:24:28 +0000 Subject: [PATCH 424/497] kvm: x86: only provide PV features if enabled in guest's CPUID KVM unconditionally provides PV features to the guest, regardless of the configured CPUID. An unwitting guest that doesn't check KVM_CPUID_FEATURES before use could access paravirt features that userspace did not intend to provide. Fix this by checking the guest's CPUID before performing any paravirtual operations. Introduce a capability, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, to gate the aforementioned enforcement. Migrating a VM from a host w/o this patch to a host with this patch could silently change the ABI exposed to the guest, warranting that we default to the old behavior and opt-in for the new one. Reviewed-by: Jim Mattson Reviewed-by: Peter Shier Signed-off-by: Oliver Upton Change-Id: I202a0926f65035b872bfe8ad15307c026de59a98 Message-Id: <20200818152429.1923996-4-oupton@google.com> Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 11 ++++++ arch/x86/include/asm/kvm_host.h | 15 ++++++++ arch/x86/kvm/cpuid.c | 7 ++++ arch/x86/kvm/cpuid.h | 10 +++++ arch/x86/kvm/x86.c | 67 ++++++++++++++++++++++++++++++--- include/uapi/linux/kvm.h | 1 + 6 files changed, 106 insertions(+), 5 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9ece9a827a58..76317221d29f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6380,3 +6380,14 @@ ranges that KVM should reject access to. In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to trap and emulate MSRs that are outside of the scope of KVM as well as limit the attack surface on KVM's MSR emulation code. + + +8.26 KVM_CAP_ENFORCE_PV_CPUID +----------------------------- + +Architectures: x86 + +When enabled, KVM will disable paravirtual features provided to the +guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf +(0x40000001). Otherwise, a guest may use the paravirtual features +regardless of what has actually been exposed through the CPUID leaf. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d0f77235da92..15e51343957e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -789,6 +789,21 @@ struct kvm_vcpu_arch { /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; + + /* pv related cpuid info */ + struct { + /* + * value of the eax register in the KVM_CPUID_FEATURES CPUID + * leaf. 
+ */ + u32 features; + + /* + * indicates whether pv emulation should be disabled if features + * are not present in the guest's cpuid + */ + bool enforce; + } pv_cpuid; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 37c3668a774f..d253c023ee76 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -107,6 +107,13 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + /* + * save the feature bitmap to avoid cpuid lookup for every PV + * operation + */ + if (best) + vcpu->arch.pv_cpuid.features = best->eax; + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = kvm_find_cpuid_entry(vcpu, 0x1, 0); if (best) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 1d2c4f2e4bb6..bf8577947ed2 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -5,6 +5,7 @@ #include "x86.h" #include #include +#include extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; void kvm_set_cpu_caps(void); @@ -313,4 +314,13 @@ static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); } +static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, + unsigned int kvm_feature) +{ + if (!vcpu->arch.pv_cpuid.enforce) + return true; + + return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); +} + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b928e092da03..ca940de53e18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2877,6 +2877,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) if (data & 0x30) return 1; + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) + return 1; + + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) + return 1; + if (!lapic_in_kernel(vcpu)) return data ? 1 : 0; @@ -2954,10 +2962,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. 
*/ - trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - st->preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_vcpu_flush_tlb_guest(vcpu); + if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + st->preempted & KVM_VCPU_FLUSH_TLB); + if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + kvm_vcpu_flush_tlb_guest(vcpu); + } vcpu->arch.st.preempted = 0; @@ -3118,30 +3128,54 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.smi_count = data; break; case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + + kvm_write_wall_clock(vcpu->kvm, data); + break; case MSR_KVM_WALL_CLOCK: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + kvm_write_wall_clock(vcpu->kvm, data); break; case MSR_KVM_SYSTEM_TIME_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); break; case MSR_KVM_SYSTEM_TIME: - kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + + kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; case MSR_KVM_ASYNC_PF_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + if (kvm_pv_enable_async_pf_int(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; if (data & 0x1) { vcpu->arch.apf.pageready_pending = false; kvm_check_async_pf_completion(vcpu); } break; case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return 1; if (unlikely(!sched_info_on())) return 1; @@ -3158,11 +3192,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return 1; + if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return 1; + /* only enable bit supported */ if (data & (-1ULL << 1)) return 1; @@ -3658,6 +3698,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: case KVM_CAP_X86_MSR_FILTER: + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -4528,6 +4569,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_x86_ops.enable_direct_tlbflush(vcpu); + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: + vcpu->arch.pv_cpuid.enforce = cap->args[0]; + + return 0; + default: return -EINVAL; } @@ -8000,11 +8046,16 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) goto out; } + ret = -KVM_ENOSYS; + switch (nr) { case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; case KVM_HC_KICK_CPU: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) + break; + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); kvm_sched_yield(vcpu->kvm, a1); ret = 0; @@ -8015,9 +8066,15 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; #endif case KVM_HC_SEND_IPI: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) + break; + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); break; case KVM_HC_SCHED_YIELD: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) + break; + kvm_sched_yield(vcpu->kvm, a0); ret = 0; break; diff --git a/include/uapi/linux/kvm.h 
b/include/uapi/linux/kvm.h index 58f43aa1fc21..ca41220b40b8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1052,6 +1052,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_STEAL_TIME 187 #define KVM_CAP_X86_USER_SPACE_MSR 188 #define KVM_CAP_X86_MSR_FILTER 189 +#define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 #ifdef KVM_CAP_IRQ_ROUTING From 3ee6fb4949aad0f2164829299934a77f62b79dcd Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 18 Aug 2020 15:24:29 +0000 Subject: [PATCH 425/497] Documentation: kvm: fix some typos in cpuid.rst Reviewed-by: Jim Mattson Reviewed-by: Peter Shier Signed-off-by: Oliver Upton Change-Id: I0c6355b09fedf8f9cc4cc5f51be418e2c1c82b7b Message-Id: <20200818152429.1923996-5-oupton@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/cpuid.rst | 88 ++++++++++++++++---------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/Documentation/virt/kvm/cpuid.rst b/Documentation/virt/kvm/cpuid.rst index ff2b38d3e108..f1583e682cc8 100644 --- a/Documentation/virt/kvm/cpuid.rst +++ b/Documentation/virt/kvm/cpuid.rst @@ -38,64 +38,64 @@ returns:: where ``flag`` is defined as below: -================================= =========== ================================ -flag value meaning -================================= =========== ================================ -KVM_FEATURE_CLOCKSOURCE 0 kvmclock available at msrs - 0x11 and 0x12 +================================== =========== ================================ +flag value meaning +================================== =========== ================================ +KVM_FEATURE_CLOCKSOURCE 0 kvmclock available at msrs + 0x11 and 0x12 -KVM_FEATURE_NOP_IO_DELAY 1 not necessary to perform delays - on PIO operations +KVM_FEATURE_NOP_IO_DELAY 1 not necessary to perform delays + on PIO operations -KVM_FEATURE_MMU_OP 2 deprecated +KVM_FEATURE_MMU_OP 2 deprecated -KVM_FEATURE_CLOCKSOURCE2 3 kvmclock available at msrs - 0x4b564d00 and 0x4b564d01 +KVM_FEATURE_CLOCKSOURCE2 3 kvmclock available at msrs + 0x4b564d00 and 0x4b564d01 -KVM_FEATURE_ASYNC_PF 4 async pf can be enabled by - writing to msr 0x4b564d02 +KVM_FEATURE_ASYNC_PF 4 async pf can be enabled by + writing to msr 0x4b564d02 -KVM_FEATURE_STEAL_TIME 5 steal time can be enabled by - writing to msr 0x4b564d03 +KVM_FEATURE_STEAL_TIME 5 steal time can be enabled by + writing to msr 0x4b564d03 -KVM_FEATURE_PV_EOI 6 paravirtualized end of interrupt - handler can be enabled by - writing to msr 0x4b564d04 +KVM_FEATURE_PV_EOI 6 paravirtualized end of interrupt + handler can be enabled by + writing to msr 0x4b564d04 -KVM_FEATURE_PV_UNHALT 7 guest checks this feature bit - before enabling paravirtualized - spinlock support +KVM_FEATURE_PV_UNHALT 7 guest checks this feature bit + before enabling paravirtualized + spinlock support -KVM_FEATURE_PV_TLB_FLUSH 9 guest checks this feature bit - before enabling paravirtualized - tlb flush +KVM_FEATURE_PV_TLB_FLUSH 9 guest checks this feature bit + before enabling paravirtualized + tlb flush -KVM_FEATURE_ASYNC_PF_VMEXIT 10 paravirtualized async PF VM EXIT - can be enabled by setting bit 2 - when writing to msr 0x4b564d02 +KVM_FEATURE_ASYNC_PF_VMEXIT 10 paravirtualized async PF VM EXIT + can be enabled by setting bit 2 + when writing to msr 0x4b564d02 -KVM_FEATURE_PV_SEND_IPI 11 guest checks this feature bit - before enabling paravirtualized - sebd IPIs +KVM_FEATURE_PV_SEND_IPI 11 guest checks this feature bit + before enabling paravirtualized + send IPIs -KVM_FEATURE_PV_POLL_CONTROL 12 host-side polling on 
HLT can - be disabled by writing - to msr 0x4b564d05. +KVM_FEATURE_PV_POLL_CONTROL 12 host-side polling on HLT can + be disabled by writing + to msr 0x4b564d05. -KVM_FEATURE_PV_SCHED_YIELD 13 guest checks this feature bit - before using paravirtualized - sched yield. +KVM_FEATURE_PV_SCHED_YIELD 13 guest checks this feature bit + before using paravirtualized + sched yield. -KVM_FEATURE_ASYNC_PF_INT 14 guest checks this feature bit - before using the second async - pf control msr 0x4b564d06 and - async pf acknowledgment msr - 0x4b564d07. +KVM_FEATURE_ASYNC_PF_INT 14 guest checks this feature bit + before using the second async + pf control msr 0x4b564d06 and + async pf acknowledgment msr + 0x4b564d07. -KVM_FEATURE_CLOCSOURCE_STABLE_BIT 24 host will warn if no guest-side - per-cpu warps are expeced in - kvmclock -================================= =========== ================================ +KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 host will warn if no guest-side + per-cpu warps are expected in + kvmclock +================================== =========== ================================ :: From f69858fcc727f8098419f3c595678e671bd2d8b7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 1 Oct 2020 15:05:39 +0200 Subject: [PATCH 426/497] KVM: x86: disconnect kvm_check_cpuid() from vcpu->arch.cpuid_entries As a preparatory step to allocating vcpu->arch.cpuid_entries dynamically make kvm_check_cpuid() check work with an arbitrary 'struct kvm_cpuid_entry2' array. Currently, when kvm_check_cpuid() fails we reset vcpu->arch.cpuid_nent to 0 and this is kind of weird, i.e. one would expect CPUIDs to remain unchanged when KVM_SET_CPUID[2] call fails. No functional change intended. It would've been possible to move the updated kvm_check_cpuid() in kvm_vcpu_ioctl_set_cpuid2() and check the supplied input before we start updating vcpu->arch.cpuid_entries/nent but we can't do the same in kvm_vcpu_ioctl_set_cpuid() as we'll have to copy 'struct kvm_cpuid_entry' entries first. The change will be made when vcpu->arch.cpuid_entries[] array becomes allocated dynamically. Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20201001130541.1398392-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index d253c023ee76..2e4228ccbe2a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -54,7 +54,24 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit -static int kvm_check_cpuid(struct kvm_vcpu *vcpu) +static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) +{ + struct kvm_cpuid_entry2 *e; + int i; + + for (i = 0; i < nent; i++) { + e = &entries[i]; + + if (e->function == function && (e->index == index || + !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX))) + return e; + } + + return NULL; +} + +static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) { struct kvm_cpuid_entry2 *best; @@ -62,7 +79,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu) * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. 
*/ - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + best = cpuid_entry2_find(entries, nent, 0x80000008, 0); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -227,7 +244,7 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_entries[i].padding[2] = 0; } vcpu->arch.cpuid_nent = cpuid->nent; - r = kvm_check_cpuid(vcpu); + r = kvm_check_cpuid(vcpu->arch.cpuid_entries, cpuid->nent); if (r) { vcpu->arch.cpuid_nent = 0; kvfree(cpuid_entries); @@ -257,7 +274,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; vcpu->arch.cpuid_nent = cpuid->nent; - r = kvm_check_cpuid(vcpu); + r = kvm_check_cpuid(vcpu->arch.cpuid_entries, cpuid->nent); if (r) { vcpu->arch.cpuid_nent = 0; goto out; @@ -947,17 +964,8 @@ out_free: struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index) { - struct kvm_cpuid_entry2 *e; - int i; - - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - - if (e->function == function && (e->index == index || - !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX))) - return e; - } - return NULL; + return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, index); } EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); From 255cbecfe0c9466ade041fe381dde18a61cca549 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 1 Oct 2020 15:05:40 +0200 Subject: [PATCH 427/497] KVM: x86: allocate vcpu->arch.cpuid_entries dynamically The current limit for guest CPUID leaves (KVM_MAX_CPUID_ENTRIES, 80) is reported to be insufficient but before we bump it let's switch to allocating vcpu->arch.cpuid_entries[] array dynamically. Currently, 'struct kvm_cpuid_entry2' is 40 bytes so vcpu->arch.cpuid_entries is 3200 bytes which accounts for 1/4 of the whole 'struct kvm_vcpu_arch' but having it pre-allocated (for all vCPUs which we also pre-allocate) gives us no real benefits. Another plus of the dynamic allocation is that we now do kvm_check_cpuid() check before we assign anything to vcpu->arch.cpuid_nent/cpuid_entries so no changes are made in case the check fails. Opportunistically remove unneeded 'out' labels from kvm_vcpu_ioctl_set_cpuid()/kvm_vcpu_ioctl_set_cpuid2() and return directly whenever possible. 
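The flow described here is validate-then-commit: the new entry array is built and checked first, and the vCPU's existing entries are replaced only once the check passes. A stand-alone sketch of that pattern (plain user-space C with invented names, not the kernel implementation):

	#include <stdlib.h>
	#include <string.h>

	struct entry { unsigned int function, index, flags; };

	/* Placeholder for kvm_check_cpuid()-style validation. */
	static int check_entries(const struct entry *e, int nent)
	{
		return e || nent == 0;
	}

	static int set_entries(struct entry **cur, int *cur_nent,
			       const struct entry *src, int nent)
	{
		struct entry *e2 = NULL;

		if (nent) {
			e2 = malloc(nent * sizeof(*e2));
			if (!e2)
				return -1;
			memcpy(e2, src, nent * sizeof(*e2));
		}

		if (!check_entries(e2, nent)) {	/* validate before touching *cur */
			free(e2);
			return -1;
		}

		free(*cur);			/* commit: old state replaced only on success */
		*cur = e2;
		*cur_nent = nent;
		return 0;
	}

	int main(void)
	{
		struct entry *cur = NULL;
		int cur_nent = 0;
		struct entry e = { .function = 1 };

		return set_entries(&cur, &cur_nent, &e, 1);
	}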
Signed-off-by: Vitaly Kuznetsov Message-Id: <20201001130541.1398392-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini Reviewed-by: Maxim Levitsky --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/cpuid.c | 91 +++++++++++++++++++-------------- arch/x86/kvm/x86.c | 1 + 3 files changed, 54 insertions(+), 40 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 15e51343957e..8c44a82abe54 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -637,7 +637,7 @@ struct kvm_vcpu_arch { int halt_request; /* real mode on Intel only */ int cpuid_nent; - struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + struct kvm_cpuid_entry2 *cpuid_entries; int maxphyaddr; int max_tdp_level; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2e4228ccbe2a..5a62fa4ee493 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -217,46 +217,53 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry __user *entries) { int r, i; - struct kvm_cpuid_entry *cpuid_entries = NULL; + struct kvm_cpuid_entry *e = NULL; + struct kvm_cpuid_entry2 *e2 = NULL; - r = -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; + return -E2BIG; + if (cpuid->nent) { - cpuid_entries = vmemdup_user(entries, - array_size(sizeof(struct kvm_cpuid_entry), - cpuid->nent)); - if (IS_ERR(cpuid_entries)) { - r = PTR_ERR(cpuid_entries); - goto out; + e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent)); + if (IS_ERR(e)) + return PTR_ERR(e); + + e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT); + if (!e2) { + r = -ENOMEM; + goto out_free_cpuid; } } for (i = 0; i < cpuid->nent; i++) { - vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; - vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; - vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; - vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; - vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; - vcpu->arch.cpuid_entries[i].index = 0; - vcpu->arch.cpuid_entries[i].flags = 0; - vcpu->arch.cpuid_entries[i].padding[0] = 0; - vcpu->arch.cpuid_entries[i].padding[1] = 0; - vcpu->arch.cpuid_entries[i].padding[2] = 0; + e2[i].function = e[i].function; + e2[i].eax = e[i].eax; + e2[i].ebx = e[i].ebx; + e2[i].ecx = e[i].ecx; + e2[i].edx = e[i].edx; + e2[i].index = 0; + e2[i].flags = 0; + e2[i].padding[0] = 0; + e2[i].padding[1] = 0; + e2[i].padding[2] = 0; } - vcpu->arch.cpuid_nent = cpuid->nent; - r = kvm_check_cpuid(vcpu->arch.cpuid_entries, cpuid->nent); + + r = kvm_check_cpuid(e2, cpuid->nent); if (r) { - vcpu->arch.cpuid_nent = 0; - kvfree(cpuid_entries); - goto out; + kvfree(e2); + goto out_free_cpuid; } + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); - kvfree(cpuid_entries); -out: +out_free_cpuid: + kvfree(e); + return r; } @@ -264,26 +271,32 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { + struct kvm_cpuid_entry2 *e2 = NULL; int r; - r = -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -EFAULT; - if (copy_from_user(&vcpu->arch.cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - vcpu->arch.cpuid_nent = cpuid->nent; - r = kvm_check_cpuid(vcpu->arch.cpuid_entries, cpuid->nent); - if (r) { - vcpu->arch.cpuid_nent = 0; - goto out; + 
return -E2BIG; + + if (cpuid->nent) { + e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent)); + if (IS_ERR(e2)) + return PTR_ERR(e2); } + r = kvm_check_cpuid(e2, cpuid->nent); + if (r) { + kvfree(e2); + return r; + } + + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = cpuid->nent; + kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); -out: - return r; + + return 0; } int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca940de53e18..92f85003b4bb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9948,6 +9948,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); + kvfree(vcpu->arch.cpuid_entries); if (!lapic_in_kernel(vcpu)) static_key_slow_dec(&kvm_no_apic_vcpu); } From 3f4e3eb417b10ef45caddc4e1d3a18a34b539440 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 1 Oct 2020 15:05:41 +0200 Subject: [PATCH 428/497] KVM: x86: bump KVM_MAX_CPUID_ENTRIES As vcpu->arch.cpuid_entries is now allocated dynamically, the only remaining use for KVM_MAX_CPUID_ENTRIES is to check KVM_SET_CPUID/ KVM_SET_CPUID2 input for sanity. Since it was reported that the current limit (80) is insufficient for some CPUs, bump KVM_MAX_CPUID_ENTRIES and use an arbitrary value '256' as the new limit. Signed-off-by: Vitaly Kuznetsov Message-Id: <20201001130541.1398392-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8c44a82abe54..69fbeadb7408 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -133,7 +133,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 80 +#define KVM_MAX_CPUID_ENTRIES 256 #define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 From d5d6c18dc454f0ee410d035429dd9e1412c01f8a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 3 Oct 2020 17:18:07 -0700 Subject: [PATCH 429/497] kvm x86/mmu: Make struct kernel_param_ops definitions const These should be const, so make it so. 
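As a general C illustration of why the const qualifier is wanted on such an ops table (ordinary user-space code, not the kernel definitions): a const table of function pointers can be placed in read-only memory and cannot be re-pointed at run time.

	#include <stdio.h>

	struct param_ops {
		int  (*get)(void);
		void (*set)(int);
	};

	static int  get_val(void)  { return 60; }
	static void set_val(int v) { (void)v; }

	/* const: the table lands in read-only data; any attempt to rewrite it faults. */
	static const struct param_ops my_ops = {
		.get = get_val,
		.set = set_val,
	};

	int main(void)
	{
		printf("%d\n", my_ops.get());
		return 0;
	}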
Signed-off-by: Joe Perches Message-Id: Reviewed-by: Ben Gardon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 32e0e5c0524e..08c5fb60fcce 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -64,12 +64,12 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60; static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); -static struct kernel_param_ops nx_huge_pages_ops = { +static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = param_get_bool, }; -static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { +static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { .set = set_nx_huge_pages_recovery_ratio, .get = param_get_uint, }; From 36385ccc9b185e6958e2911d41202dd0f386298d Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 1 Oct 2020 14:29:51 +0300 Subject: [PATCH 430/497] KVM: x86: xen_hvm_config: cleanup return values Return 1 on errors that are caused by wrong guest behavior (which will inject #GP to the guest) And return a negative error value on issues that are the kernel's fault (e.g -ENOMEM) Signed-off-by: Maxim Levitsky Message-Id: <20201001112954.6258-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92f85003b4bb..3f5d08f7a637 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2842,24 +2842,19 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) u32 page_num = data & ~PAGE_MASK; u64 page_addr = data & PAGE_MASK; u8 *page; - int r; - r = -E2BIG; if (page_num >= blob_size) - goto out; - r = -ENOMEM; + return 1; + page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); - if (IS_ERR(page)) { - r = PTR_ERR(page); - goto out; + if (IS_ERR(page)) + return PTR_ERR(page); + + if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { + kfree(page); + return 1; } - if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) - goto out_free; - r = 0; -out_free: - kfree(page); -out: - return r; + return 0; } static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) From 7dffecaf4eabb700e7aef3cc6da333517cfc242a Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 1 Oct 2020 14:29:52 +0300 Subject: [PATCH 431/497] KVM: x86: report negative values from wrmsr emulation to userspace This will allow the KVM to report such errors (e.g -ENOMEM) to the userspace. Signed-off-by: Maxim Levitsky Message-Id: <20201001112954.6258-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 ++-- arch/x86/kvm/x86.c | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0cc0db500f71..0d917eb70319 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3712,10 +3712,10 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt) if (r == X86EMUL_IO_NEEDED) return r; - if (r) + if (r > 0) return emulate_gp(ctxt, 0); - return X86EMUL_CONTINUE; + return r < 0 ? 
X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; } static int em_rdmsr(struct x86_emulate_ctxt *ctxt) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3f5d08f7a637..ecdeab8fa0b1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1737,13 +1737,16 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) r = kvm_set_msr(vcpu, ecx, data); /* MSR write failed? See if we should ask user space */ - if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) { + if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) /* Bounce to user space */ return 0; - } + + /* Signal all other negative errors to userspace */ + if (r < 0) + return r; /* MSR write failed? Inject a #GP */ - if (r) { + if (r > 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; From a6c42e8431657487b48fe5f57378517e16eef404 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Fri, 16 Oct 2020 16:59:25 +0800 Subject: [PATCH 432/497] drm/amd/swsmu: correct wrong feature bit mapping 1. when smc feature bit isn't mapped, the feature state isn't showed on sysfs node of pp_features. 2. add pp_features table title Signed-off-by: Kevin Wang Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 27 ++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c index c30d3338825f..92b2ea4c197b 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c @@ -431,10 +431,9 @@ size_t smu_cmn_get_pp_feature_mask(struct smu_context *smu, char *buf) { uint32_t feature_mask[2] = { 0 }; - int32_t feature_index = 0; + int feature_index = 0; uint32_t count = 0; - uint32_t sort_feature[SMU_FEATURE_COUNT]; - uint64_t hw_feature_count = 0; + int8_t sort_feature[SMU_FEATURE_COUNT]; size_t size = 0; int ret = 0, i; @@ -447,23 +446,31 @@ size_t smu_cmn_get_pp_feature_mask(struct smu_context *smu, size = sprintf(buf + size, "features high: 0x%08x low: 0x%08x\n", feature_mask[1], feature_mask[0]); + memset(sort_feature, -1, sizeof(sort_feature)); + for (i = 0; i < SMU_FEATURE_COUNT; i++) { feature_index = smu_cmn_to_asic_specific_index(smu, CMN2ASIC_MAPPING_FEATURE, i); if (feature_index < 0) continue; + sort_feature[feature_index] = i; - hw_feature_count++; } - for (i = 0; i < hw_feature_count; i++) { + size += sprintf(buf + size, "%-2s. %-20s %-3s : %-s\n", + "No", "Feature", "Bit", "State"); + + for (i = 0; i < SMU_FEATURE_COUNT; i++) { + if (sort_feature[i] < 0) + continue; + size += sprintf(buf + size, "%02d. %-20s (%2d) : %s\n", - count++, - smu_get_feature_name(smu, sort_feature[i]), - i, - !!smu_cmn_feature_is_enabled(smu, sort_feature[i]) ? - "enabled" : "disabled"); + count++, + smu_get_feature_name(smu, sort_feature[i]), + i, + !!smu_cmn_feature_is_enabled(smu, sort_feature[i]) ? + "enabled" : "disabled"); } return size; From 72f211ecaa80a001c062829894ae5d5effab2b49 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 1 Oct 2020 14:29:53 +0300 Subject: [PATCH 433/497] KVM: x86: allow kvm_x86_ops.set_efer to return an error value This will be used to signal an error to the userspace, in case the vendor code failed during handling of this msr. 
(e.g -ENOMEM) Signed-off-by: Maxim Levitsky Message-Id: <20201001112954.6258-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 3 ++- arch/x86/kvm/svm/svm.h | 2 +- arch/x86/kvm/vmx/vmx.c | 6 ++++-- arch/x86/kvm/vmx/vmx.h | 2 +- arch/x86/kvm/x86.c | 7 ++++++- 6 files changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 69fbeadb7408..8233991386a3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1101,7 +1101,7 @@ struct kvm_x86_ops { void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); - void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); + int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4f401fc6a05d..57e0f27ff7d2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -263,7 +263,7 @@ static int get_max_npt_level(void) #endif } -void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) +int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_svm *svm = to_svm(vcpu); vcpu->arch.efer = efer; @@ -283,6 +283,7 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) svm->vmcb->save.efer = efer | EFER_SVME; vmcb_mark_dirty(svm->vmcb, VMCB_CR); + return 0; } static int is_external_interrupt(u32 info) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a7f997459b87..e7af21e6fe1e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -350,7 +350,7 @@ static inline bool gif_set(struct vcpu_svm *svm) #define MSR_INVALID 0xffffffffU u32 svm_msrpm_offset(u32 msr); -void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); +int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void svm_flush_tlb(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 132a8cc9f9a4..a3d63ea5a090 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2815,13 +2815,14 @@ static void enter_rmode(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } -void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER); + /* Nothing to do if hardware doesn't support EFER. 
*/ if (!msr) - return; + return 0; vcpu->arch.efer = efer; if (efer & EFER_LMA) { @@ -2833,6 +2834,7 @@ void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) msr->data = efer & ~EFER_LME; } setup_msrs(vmx); + return 0; } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 5961cb897125..64ff2c0322c3 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -319,7 +319,7 @@ unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); -void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ecdeab8fa0b1..51f75c21c86f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1457,6 +1457,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 old_efer = vcpu->arch.efer; u64 efer = msr_info->data; + int r; if (efer & efer_reserved_bits) return 1; @@ -1473,7 +1474,11 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - kvm_x86_ops.set_efer(vcpu, efer); + r = kvm_x86_ops.set_efer(vcpu, efer); + if (r) { + WARN_ON(r > 0); + return r; + } /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) From 2fcf4876ada8a293d3b92a1033b8b990a7c613d3 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 1 Oct 2020 14:29:54 +0300 Subject: [PATCH 434/497] KVM: nSVM: implement on demand allocation of the nested state This way we don't waste memory on VMs which don't use nesting virtualization even when the host enabled it for them. 
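The allocation pattern the patch introduces (allocate on first use, guard with an "initialized" flag, free again when the feature is turned off) can be sketched in stand-alone C as follows; the sizes and names below are invented, not the SVM code itself:

	#include <stdlib.h>

	struct nested_state {
		void *hsave;
		void *msrpm;
		int   initialized;
	};

	static int allocate_nested(struct nested_state *n)
	{
		if (n->initialized)
			return 0;

		n->hsave = calloc(1, 4096);
		if (!n->hsave)
			return -1;

		n->msrpm = calloc(1, 8192);
		if (!n->msrpm) {
			free(n->hsave);
			n->hsave = NULL;
			return -1;
		}

		n->initialized = 1;
		return 0;
	}

	static void free_nested(struct nested_state *n)
	{
		if (!n->initialized)
			return;

		free(n->msrpm);
		n->msrpm = NULL;
		free(n->hsave);
		n->hsave = NULL;
		n->initialized = 0;
	}

	int main(void)
	{
		struct nested_state n = { 0 };
		int ret = allocate_nested(&n);

		free_nested(&n);
		return ret;
	}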
Signed-off-by: Maxim Levitsky Message-Id: <20201001112954.6258-5-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 42 +++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 61 +++++++++++++++++++++------------------ arch/x86/kvm/svm/svm.h | 8 +++++ 3 files changed, 83 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index ba50ff6e35c7..9e4c226dbf7d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -481,6 +481,9 @@ int nested_svm_vmrun(struct vcpu_svm *svm) vmcb12 = map.hva; + if (WARN_ON_ONCE(!svm->nested.initialized)) + return -EINVAL; + if (!nested_vmcb_checks(svm, vmcb12)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; @@ -698,6 +701,45 @@ int nested_svm_vmexit(struct vcpu_svm *svm) return 0; } +int svm_allocate_nested(struct vcpu_svm *svm) +{ + struct page *hsave_page; + + if (svm->nested.initialized) + return 0; + + hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!hsave_page) + return -ENOMEM; + svm->nested.hsave = page_address(hsave_page); + + svm->nested.msrpm = svm_vcpu_alloc_msrpm(); + if (!svm->nested.msrpm) + goto err_free_hsave; + svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm); + + svm->nested.initialized = true; + return 0; + +err_free_hsave: + __free_page(hsave_page); + return -ENOMEM; +} + +void svm_free_nested(struct vcpu_svm *svm) +{ + if (!svm->nested.initialized) + return; + + svm_vcpu_free_msrpm(svm->nested.msrpm); + svm->nested.msrpm = NULL; + + __free_page(virt_to_page(svm->nested.hsave)); + svm->nested.hsave = NULL; + + svm->nested.initialized = false; +} + /* * Forcibly leave nested mode in order to be able to reset the VCPU later on. */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 57e0f27ff7d2..dc4fe579d460 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -266,6 +266,7 @@ static int get_max_npt_level(void) int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_svm *svm = to_svm(vcpu); + u64 old_efer = vcpu->arch.efer; vcpu->arch.efer = efer; if (!npt_enabled) { @@ -276,9 +277,27 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) efer &= ~EFER_LME; } - if (!(efer & EFER_SVME)) { - svm_leave_nested(svm); - svm_set_gif(svm, true); + if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { + if (!(efer & EFER_SVME)) { + svm_leave_nested(svm); + svm_set_gif(svm, true); + + /* + * Free the nested guest state, unless we are in SMM. + * In this case we will return to the nested guest + * as soon as we leave SMM. 
+ */ + if (!is_smm(&svm->vcpu)) + svm_free_nested(svm); + + } else { + int ret = svm_allocate_nested(svm); + + if (ret) { + vcpu->arch.efer = old_efer; + return ret; + } + } } svm->vmcb->save.efer = efer | EFER_SVME; @@ -650,7 +669,7 @@ static void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, set_msr_interception_bitmap(vcpu, msrpm, msr, read, write); } -static u32 *svm_vcpu_alloc_msrpm(void) +u32 *svm_vcpu_alloc_msrpm(void) { struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); u32 *msrpm; @@ -664,7 +683,7 @@ static u32 *svm_vcpu_alloc_msrpm(void) return msrpm; } -static void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) +void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) { int i; @@ -675,7 +694,8 @@ static void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) } } -static void svm_vcpu_free_msrpm(u32 *msrpm) + +void svm_vcpu_free_msrpm(u32 *msrpm) { __free_pages(virt_to_page(msrpm), MSRPM_ALLOC_ORDER); } @@ -1268,7 +1288,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *vmcb_page; - struct page *hsave_page; int err; BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); @@ -1279,13 +1298,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (!vmcb_page) goto out; - hsave_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!hsave_page) - goto error_free_vmcb_page; - err = avic_init_vcpu(svm); if (err) - goto error_free_hsave_page; + goto error_free_vmcb_page; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. @@ -1293,21 +1308,12 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm)) svm->avic_is_running = true; - svm->nested.hsave = page_address(hsave_page); - svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) - goto error_free_hsave_page; + goto error_free_vmcb_page; svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm->nested.msrpm = svm_vcpu_alloc_msrpm(); - if (!svm->nested.msrpm) - goto error_free_msrpm; - - /* We only need the L1 pass-through MSR state, so leave vcpu as NULL */ - svm_vcpu_init_msrpm(vcpu, svm->nested.msrpm); - svm->vmcb = page_address(vmcb_page); svm->vmcb_pa = __sme_set(page_to_pfn(vmcb_page) << PAGE_SHIFT); svm->asid_generation = 0; @@ -1318,10 +1324,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) return 0; -error_free_msrpm: - svm_vcpu_free_msrpm(svm->msrpm); -error_free_hsave_page: - __free_page(hsave_page); error_free_vmcb_page: __free_page(vmcb_page); out: @@ -1347,10 +1349,10 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) */ svm_clear_current_vmcb(svm->vmcb); + svm_free_nested(svm); + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); - __free_page(virt_to_page(svm->nested.hsave)); - __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -4038,6 +4040,9 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL) return 1; + if (svm_allocate_nested(svm)) + return 1; + ret = enter_svm_guest_mode(svm, vmcb12_gpa, map.hva); kvm_vcpu_unmap(&svm->vcpu, &map, true); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e7af21e6fe1e..1d853fe4c778 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -97,6 +97,8 @@ struct svm_nested_state { /* cache for control fields of the guest */ 
struct vmcb_control_area ctl; + + bool initialized; }; struct vcpu_svm { @@ -350,6 +352,10 @@ static inline bool gif_set(struct vcpu_svm *svm) #define MSR_INVALID 0xffffffffU u32 svm_msrpm_offset(u32 msr); +u32 *svm_vcpu_alloc_msrpm(void); +void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); +void svm_vcpu_free_msrpm(u32 *msrpm); + int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); @@ -391,6 +397,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb); void svm_leave_nested(struct vcpu_svm *svm); +void svm_free_nested(struct vcpu_svm *svm); +int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct vcpu_svm *svm); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); From f6426ab9c957e97418ac5b0466538792767b1738 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Sat, 3 Oct 2020 23:27:07 +0000 Subject: [PATCH 435/497] KVM: SVM: Initialize prev_ga_tag before use The function amd_ir_set_vcpu_affinity makes use of the parameter struct amd_iommu_pi_data.prev_ga_tag to determine if it should delete struct amd_iommu_pi_data from a list when not running in AVIC mode. However, prev_ga_tag is initialized only when AVIC is enabled. The non-zero uninitialized value can cause unintended code path, which ends up making use of the struct vcpu_svm.ir_list and ir_list_lock without being initialized (since they are intended only for the AVIC case). This triggers NULL pointer dereference bug in the function vm_ir_list_del with the following call trace: svm_update_pi_irte+0x3c2/0x550 [kvm_amd] ? proc_create_single_data+0x41/0x50 kvm_arch_irq_bypass_add_producer+0x40/0x60 [kvm] __connect+0x5f/0xb0 [irqbypass] irq_bypass_register_producer+0xf8/0x120 [irqbypass] vfio_msi_set_vector_signal+0x1de/0x2d0 [vfio_pci] vfio_msi_set_block+0x77/0xe0 [vfio_pci] vfio_pci_set_msi_trigger+0x25c/0x2f0 [vfio_pci] vfio_pci_set_irqs_ioctl+0x88/0xb0 [vfio_pci] vfio_pci_ioctl+0x2ea/0xed0 [vfio_pci] ? alloc_file_pseudo+0xa5/0x100 vfio_device_fops_unl_ioctl+0x26/0x30 [vfio] ? vfio_device_fops_unl_ioctl+0x26/0x30 [vfio] __x64_sys_ioctl+0x96/0xd0 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Therefore, initialize prev_ga_tag to zero before use. This should be safe because ga_tag value 0 is invalid (see function avic_vm_init). Fixes: dfa20099e26e ("KVM: SVM: Refactor AVIC vcpu initialization into avic_init_vcpu()") Signed-off-by: Suravee Suthikulpanit Message-Id: <20201003232707.4662-1-suravee.suthikulpanit@amd.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index f73f84d56452..8c550999ace0 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -866,6 +866,7 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * - Tell IOMMU to use legacy mode for this interrupt. * - Retrieve ga_tag of prior interrupt remapping data. 
*/ + pi.prev_ga_tag = 0; pi.is_guest_mode = false; ret = irq_set_vcpu_affinity(host_irq, &pi); From 6e1d849fa3296526e64b75fa227b6377cd0fd3da Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 29 Sep 2020 21:16:55 -0700 Subject: [PATCH 436/497] KVM: x86: Intercept LA57 to inject #GP fault when it's reserved Unconditionally intercept changes to CR4.LA57 so that KVM correctly injects a #GP fault if the guest attempts to set CR4.LA57 when it's supported in hardware but not exposed to the guest. Long term, KVM needs to properly handle CR4 bits that can be under guest control but also may be reserved from the guest's perspective. But, KVM currently sets the CR4 guest/host mask only during vCPU creation, and reworking flows to change that will take a bit of elbow grease. Even if/when generic support for intercepting reserved bits exists, it's probably not worth letting the guest set CR4.LA57 directly. LA57 can't be toggled while long mode is enabled, thus it's all but guaranteed to be set once (maybe twice, e.g. by BIOS and kernel) during boot and never touched again. On the flip side, letting the guest own CR4.LA57 may incur extra VMREADs. In other words, this temporary "hack" is probably also the right long term fix. Fixes: fd8cb433734e ("KVM: MMU: Expose the LA57 feature to VM.") Cc: stable@vger.kernel.org Cc: Lai Jiangshan Signed-off-by: Lai Jiangshan [sean: rewrote changelog] Signed-off-by: Sean Christopherson Message-Id: <20200930041659.28181-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm_cache_regs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index cfe83d4ae625..ca0781b41df9 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -7,7 +7,7 @@ #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD) + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD) #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ From c44d9b34701dc19792339ae3764ac7b763cb175c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 29 Sep 2020 21:16:56 -0700 Subject: [PATCH 437/497] KVM: x86: Invoke vendor's vcpu_after_set_cpuid() after all common updates Move the call to kvm_x86_ops.vcpu_after_set_cpuid() to the very end of kvm_vcpu_after_set_cpuid() to allow the vendor implementation to react to changes made by the common code. In the near future, this will be used by VMX to update its CR4 guest/host masks to account for reserved bits. In the long term, SGX support will update the allowed XCR0 mask for enclaves based on the vCPU's allowed XCR0. vcpu_after_set_cpuid() (nee kvm_update_cpuid()) was originally added by commit 2acf923e38fb ("KVM: VMX: Enable XSAVE/XRSTOR for guest"), and was called separately after kvm_x86_ops.vcpu_after_set_cpuid() (nee kvm_x86_ops->cpuid_update()). There is no indication that the placement of the common code updates after the vendor updates was anything more than a "new function at the end" decision. Inspection of the current code reveals no dependency on kvm_x86_ops' vcpu_after_set_cpuid() in kvm_vcpu_after_set_cpuid() or any of its helpers. 
The bulk of the common code depends only on the guest's CPUID configuration, kvm_mmu_reset_context() does not consume dynamic vendor state, and there are no collisions between kvm_pmu_refresh() and VMX's update of PT state. Signed-off-by: Sean Christopherson Message-Id: <20200930041659.28181-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5a62fa4ee493..35223eb63242 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -145,8 +145,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; - kvm_x86_ops.vcpu_after_set_cpuid(vcpu); - best = kvm_find_cpuid_entry(vcpu, 1, 0); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) @@ -170,6 +168,9 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_pmu_refresh(vcpu); vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(guest_cpuid_has, vcpu); + + /* Invoke the vendor callback only after the above state is updated. */ + kvm_x86_ops.vcpu_after_set_cpuid(vcpu); kvm_x86_ops.update_exception_bitmap(vcpu); } From a6337a3542b152b35f47895b88ef1ac0dadf971d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 29 Sep 2020 21:16:57 -0700 Subject: [PATCH 438/497] KVM: x86: Move call to update_exception_bitmap() into VMX code Now that vcpu_after_set_cpuid() and update_exception_bitmap() are called back-to-back, subsume the exception bitmap update into the common CPUID update. Drop the SVM invocation entirely as SVM's exception bitmap doesn't vary with respect to guest CPUID. Signed-off-by: Sean Christopherson Message-Id: <20200930041659.28181-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 1 - arch/x86/kvm/vmx/vmx.c | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 35223eb63242..06a278b3701d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -171,7 +171,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* Invoke the vendor callback only after the above state is updated. */ kvm_x86_ops.vcpu_after_set_cpuid(vcpu); - kvm_x86_ops.update_exception_bitmap(vcpu); } static int is_efer_nx(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a3d63ea5a090..b74b59cba22c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7248,6 +7248,9 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); } } + + /* Refresh #PF interception to account for MAXPHYADDR changes. */ + update_exception_bitmap(vcpu); } static __init void vmx_set_cpu_caps(void) From 2ed41aa631fc0251cedea3ae98802cb72079d198 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 29 Sep 2020 21:16:58 -0700 Subject: [PATCH 439/497] KVM: VMX: Intercept guest reserved CR4 bits to inject #GP fault Intercept CR4 bits that are guest reserved so that KVM correctly injects a #GP fault if the guest attempts to set a reserved bit. If a feature is supported by the CPU but is not exposed to the guest, and its associated CR4 bit is not intercepted by KVM by default, then KVM will fail to inject a #GP if the guest sets the CR4 bit without triggering an exit, e.g. by toggling only the bit in question. 
Note, KVM doesn't give the guest direct access to any CR4 bits that are also dependent on guest CPUID. Yet. Signed-off-by: Sean Christopherson Message-Id: <20200930041659.28181-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b74b59cba22c..544a35b8d7fe 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4053,13 +4053,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS; + struct kvm_vcpu *vcpu = &vmx->vcpu; + + vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & + ~vcpu->arch.cr4_guest_rsvd_bits; if (!enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; if (is_guest_mode(&vmx->vcpu)) - vmx->vcpu.arch.cr4_guest_owned_bits &= - ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; - vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); + vcpu->arch.cr4_guest_owned_bits &= + ~get_vmcs12(vcpu)->cr4_guest_host_mask; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); } u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) @@ -7249,6 +7252,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) } } + set_cr4_guest_host_mask(vmx); + /* Refresh #PF interception to account for MAXPHYADDR changes. */ update_exception_bitmap(vcpu); } From 30031c2b0574f43cc6888532b715f639afd39196 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 29 Sep 2020 21:16:59 -0700 Subject: [PATCH 440/497] KVM: x86: Let the guest own CR4.FSGSBASE Add FSGSBASE to the set of possible guest-owned CR4 bits, i.e. let the guest own it on VMX. KVM never queries the guest's CR4.FSGSBASE value, thus there is no reason to force VM-Exit on FSGSBASE being toggled. Note, because FSGSBASE is conditionally available, this is dependent on recent changes to intercept reserved CR4 bits and to update the CR4 guest/host mask in response to guest CPUID changes. Cc: Paolo Bonzini Signed-off-by: Lai Jiangshan [sean: added justification in changelog] Signed-off-by: Sean Christopherson Message-Id: <20200930041659.28181-6-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm_cache_regs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index ca0781b41df9..a889563ad02d 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -7,7 +7,7 @@ #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD) + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ From a4f1d94e6bc1b00c4efa9655ad14e0d49b8f1e37 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 3 Oct 2020 17:18:06 -0700 Subject: [PATCH 441/497] KVM: PPC: Book3S HV: Make struct kernel_param_ops definition const This should be const, so make it so. 
Signed-off-by: Joe Perches Message-Id: Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 490a0f6a7285..52d85c2d4970 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -111,7 +111,7 @@ module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)"); #ifdef CONFIG_KVM_XICS -static struct kernel_param_ops module_param_ops = { +static const struct kernel_param_ops module_param_ops = { .set = param_set_int, .get = param_get_int, }; From cc4674d0ded069c1673fd6fec94a18e436828195 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Fri, 25 Sep 2020 14:22:48 -0700 Subject: [PATCH 442/497] kvm: mmu: Separate making non-leaf sptes from link_shadow_page The TDP MMU page fault handler will need to be able to create non-leaf SPTEs to build up the paging structures. Rather than re-implementing the function, factor the SPTE creation out of link_shadow_page. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20200925212302.3979661-9-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 08c5fb60fcce..60103fd07bd2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2573,6 +2573,21 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } +static u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) +{ + u64 spte; + + spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask | shadow_me_mask; + + if (ad_disabled) + spte |= SPTE_AD_DISABLED_MASK; + else + spte |= shadow_accessed_mask; + + return spte; +} + static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *sp) { @@ -2580,13 +2595,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_mask; - - if (sp_ad_disabled(sp)) - spte |= SPTE_AD_DISABLED_MASK; - else - spte |= shadow_accessed_mask; + spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); From 799a4190e7341b9bb24549245f2b8f7d11c65360 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 20:26:41 +0200 Subject: [PATCH 443/497] kvm: x86/mmu: Separate making SPTEs from set_spte Separate the functions for generating leaf page table entries from the function that inserts them into the paging structure. This refactoring will facilitate changes to the MMU sychronization model to use atomic compare / exchanges (which are not guaranteed to succeed) instead of a monolithic MMU lock. No functional change expected. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This commit introduced no new failures. 
This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Reviewed-by: Peter Shier Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 49 ++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 60103fd07bd2..ef4a63af8fce 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2996,20 +2996,15 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) #define SET_SPTE_SPURIOUS BIT(2) -static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, bool speculative, - bool can_unsync, bool host_writable) +static int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte) { u64 spte = 0; int ret = 0; - struct kvm_mmu_page *sp; - if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) - return 0; - - sp = sptep_to_sp(sptep); - if (sp_ad_disabled(sp)) + if (ad_disabled) spte |= SPTE_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) spte |= SPTE_AD_WRPROT_ONLY_MASK; @@ -3062,8 +3057,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * is responsibility of mmu_get_page / kvm_sync_page. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writable_pte(*sptep)) - goto set_pte; + if (!can_unsync && is_writable_pte(old_spte)) + goto out; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %llx, marking ro\n", @@ -3074,15 +3069,37 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } - if (pte_access & ACC_WRITE_MASK) { - kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (pte_access & ACC_WRITE_MASK) spte |= spte_shadow_dirty_mask(spte); - } if (speculative) spte = mark_spte_for_access_track(spte); -set_pte: +out: + *new_spte = spte; + return ret; +} + +static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, bool speculative, + bool can_unsync, bool host_writable) +{ + u64 spte; + struct kvm_mmu_page *sp; + int ret; + + if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) + return 0; + + sp = sptep_to_sp(sptep); + + ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, + can_unsync, host_writable, sp_ad_disabled(sp), &spte); + + if (spte & PT_WRITABLE_MASK) + kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (*sptep == spte) ret |= SET_SPTE_SPURIOUS; else if (mmu_spte_update(sptep, spte)) From cb3eedab453911ca177c1e2e44add0b7fe4a6f09 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 28 Sep 2020 10:17:17 -0400 Subject: [PATCH 444/497] KVM: mmu: Separate updating a PTE from kvm_set_pte_rmapp The TDP MMU's own function for the changed-PTE notifier will need to be update a PTE in the exact same way as the shadow MMU. Rather than re-implementing this logic, factor the SPTE creation out of kvm_set_pte_rmapp. Extracted out of a patch by Ben Gardon. 
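The extracted helper only rewrites bits of the old SPTE: it keeps the attribute bits, substitutes the new pfn, and clears the writable bits so the next guest write faults (the access-track step is omitted here). A stand-alone sketch using illustrative mask values, not the kernel's real constants:

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT        12
	#define BASE_ADDR_MASK    0x000ffffffffff000ULL	/* illustrative */
	#define WRITABLE_BIT      (1ULL << 1)		/* illustrative */
	#define HOST_WRITABLE_BIT (1ULL << 57)		/* illustrative */

	static uint64_t make_changed_pte(uint64_t old_spte, uint64_t new_pfn)
	{
		uint64_t new_spte = old_spte & ~BASE_ADDR_MASK;	/* keep attribute bits */

		new_spte |= new_pfn << PAGE_SHIFT;	/* point at the new page */
		new_spte &= ~WRITABLE_BIT;		/* force a write fault next time */
		new_spte &= ~HOST_WRITABLE_BIT;

		return new_spte;
	}

	int main(void)
	{
		uint64_t old = (0x1234ULL << PAGE_SHIFT) | WRITABLE_BIT | HOST_WRITABLE_BIT | 1;

		printf("0x%llx\n", (unsigned long long)make_changed_pte(old, 0x5678));
		return 0;
	}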
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ef4a63af8fce..3dec4744ab9c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1747,6 +1747,21 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return kvm_zap_rmapp(kvm, rmap_head); } +static u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) +{ + u64 new_spte; + + new_spte = old_spte & ~PT64_BASE_ADDR_MASK; + new_spte |= (u64)new_pfn << PAGE_SHIFT; + + new_spte &= ~PT_WRITABLE_MASK; + new_spte &= ~SPTE_HOST_WRITEABLE; + + new_spte = mark_spte_for_access_track(new_spte); + + return new_spte; +} + static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) @@ -1772,13 +1787,8 @@ restart: pte_list_remove(rmap_head, sptep); goto restart; } else { - new_spte = *sptep & ~PT64_BASE_ADDR_MASK; - new_spte |= (u64)new_pfn << PAGE_SHIFT; - - new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~SPTE_HOST_WRITEABLE; - - new_spte = mark_spte_for_access_track(new_spte); + new_spte = kvm_mmu_changed_pte_notifier_make_spte( + *sptep, new_pfn); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); From 5a9624affe7c7498fb395879d9bb613628e89e60 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 16 Oct 2020 10:29:37 -0400 Subject: [PATCH 445/497] KVM: mmu: extract spte.h and spte.c The SPTE format will be common to both the shadow and the TDP MMU. Extract code that implements the format to a separate module, as a first step towards adding the TDP MMU and putting mmu.c on a diet. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 3 +- arch/x86/kvm/mmu/mmu.c | 551 +------------------------------- arch/x86/kvm/mmu/mmu_internal.h | 31 +- arch/x86/kvm/mmu/spte.c | 318 ++++++++++++++++++ arch/x86/kvm/mmu/spte.h | 252 +++++++++++++++ 5 files changed, 607 insertions(+), 548 deletions(-) create mode 100644 arch/x86/kvm/mmu/spte.c create mode 100644 arch/x86/kvm/mmu/spte.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 7f86a14aed0e..66aa24f5e2db 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -15,7 +15,8 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o + hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ + mmu/spte.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3dec4744ab9c..02af304c168a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -23,6 +23,7 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" #include "cpuid.h" +#include "spte.h" #include #include @@ -45,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -104,45 +104,13 @@ enum { AUDIT_POST_SYNC }; -#undef MMU_DEBUG - #ifdef MMU_DEBUG -static bool dbg = 0; +bool dbg = 0; module_param(dbg, bool, 0644); - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) -#define MMU_WARN_ON(x) WARN_ON(x) -#else -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) 
do { } while (0) -#define MMU_WARN_ON(x) do { } while (0) #endif #define PTE_PREFETCH_NUM 8 -#define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 54 - -/* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. - */ -#define SPTE_SPECIAL_MASK (3ULL << 52) -#define SPTE_AD_ENABLED_MASK (0ULL << 52) -#define SPTE_AD_DISABLED_MASK (1ULL << 52) -#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) -#define SPTE_MMIO_MASK (3ULL << 52) - -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) - - #define PT32_LEVEL_BITS 10 #define PT32_LEVEL_SHIFT(level) \ @@ -156,18 +124,6 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) -#else -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#endif -#define PT64_LVL_ADDR_MASK(level) \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) - #define PT32_BASE_ADDR_MASK PAGE_MASK #define PT32_DIR_BASE_ADDR_MASK \ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) @@ -175,25 +131,8 @@ module_param(dbg, bool, 0644); (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ - | shadow_x_mask | shadow_nx_mask | shadow_me_mask) - -#define ACC_EXEC_MASK 1 -#define ACC_WRITE_MASK PT_WRITABLE_MASK -#define ACC_USER_MASK PT_USER_MASK -#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) - -/* The mask for the R/X bits in EPT PTEs */ -#define PT64_EPT_READABLE_MASK 0x1ull -#define PT64_EPT_EXECUTABLE_MASK 0x4ull - #include -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) -#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) - -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - /* make pte_list_desc fit well in cache line */ #define PTE_LIST_EXT 3 @@ -248,62 +187,7 @@ static struct kmem_cache *pte_list_desc_cache; static struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; -static u64 __read_mostly shadow_nx_mask; -static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ -static u64 __read_mostly shadow_user_mask; -static u64 __read_mostly shadow_accessed_mask; -static u64 __read_mostly shadow_dirty_mask; -static u64 __read_mostly shadow_mmio_value; -static u64 __read_mostly shadow_mmio_access_mask; -static u64 __read_mostly shadow_present_mask; -static u64 __read_mostly shadow_me_mask; - -/* - * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; - * shadow_acc_track_mask is the set of bits to be cleared in non-accessed - * pages. - */ -static u64 __read_mostly shadow_acc_track_mask; - -/* - * The mask/shift to use for saving the original R/X bits when marking the PTE - * as not-present for access tracking purposes. We do not save the W bit as the - * PTEs being access tracked also need to be dirty tracked, so the W bit will be - * restored only when a write is attempted to the page. 
- */ -static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | - PT64_EPT_EXECUTABLE_MASK; -static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; - -/* - * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order - * to guard against L1TF attacks. - */ -static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; - -/* - * The number of high-order 1 bits to use in the mask above. - */ -static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; - -/* - * In some cases, we need to preserve the GFN of a non-present or reserved - * SPTE when we usurp the upper five bits of the physical address space to - * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll - * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask - * left into the reserved bits, i.e. the GFN in the SPTE will be split into - * high and low parts. This mask covers the lower bits of the GFN. - */ -static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; - -/* - * The number of non-reserved physical address bits irrespective of features - * that repurpose legal bits, e.g. MKTME. - */ -static u8 __read_mostly shadow_phys_bits; - static void mmu_spte_set(u64 *sptep, u64 spte); -static bool is_executable_pte(u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); @@ -339,134 +223,11 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) -{ - BUG_ON((u64)(unsigned)access_mask != access_mask); - WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); - WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); - shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; - shadow_mmio_access_mask = access_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); - -static bool is_mmio_spte(u64 spte) -{ - return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; -} - -static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) -{ - return sp->role.ad_disabled; -} - -static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) -{ - /* - * When using the EPT page-modification log, the GPAs in the log - * would come from L2 rather than L1. Therefore, we need to rely - * on write protection to record dirty pages. This also bypasses - * PML, since writes now result in a vmexit. - */ - return vcpu->arch.mmu == &vcpu->arch.guest_mmu; -} - -static inline bool spte_ad_enabled(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; -} - -static inline bool spte_ad_need_write_protect(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; -} - -static bool is_nx_huge_page_enabled(void) +bool is_nx_huge_page_enabled(void) { return READ_ONCE(nx_huge_pages); } -static inline u64 spte_shadow_accessed_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; -} - -static inline u64 spte_shadow_dirty_mask(u64 spte) -{ - MMU_WARN_ON(is_mmio_spte(spte)); - return spte_ad_enabled(spte) ? 
shadow_dirty_mask : 0; -} - -static inline bool is_access_track_spte(u64 spte) -{ - return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; -} - -/* - * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of - * the memslots generation and is derived as follows: - * - * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 - * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 - * - * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in - * the MMIO generation number, as doing so would require stealing a bit from - * the "real" generation number and thus effectively halve the maximum number - * of MMIO generations that can be handled before encountering a wrap (which - * requires a full MMU zap). The flag is instead explicitly queried when - * checking for MMIO spte cache hits. - */ -#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) - -#define MMIO_SPTE_GEN_LOW_START 3 -#define MMIO_SPTE_GEN_LOW_END 11 -#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ - MMIO_SPTE_GEN_LOW_START) - -#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT -#define MMIO_SPTE_GEN_HIGH_END 62 -#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ - MMIO_SPTE_GEN_HIGH_START) - -static u64 generation_mmio_spte_mask(u64 gen) -{ - u64 mask; - - WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); - - mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; - mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; - return mask; -} - -static u64 get_mmio_spte_generation(u64 spte) -{ - u64 gen; - - gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; - gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; - return gen; -} - -static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) -{ - - u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; - u64 mask = generation_mmio_spte_mask(gen); - u64 gpa = gfn << PAGE_SHIFT; - - access &= shadow_mmio_access_mask; - mask |= shadow_mmio_value | access; - mask |= gpa | shadow_nonpresent_or_rsvd_mask; - mask |= (gpa & shadow_nonpresent_or_rsvd_mask) - << shadow_nonpresent_or_rsvd_mask_len; - - return mask; -} - static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { @@ -532,90 +293,6 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, return gpa; } -/* - * Sets the shadow PTE masks used by the MMU. 
- * - * Assumptions: - * - Setting either @accessed_mask or @dirty_mask requires setting both - * - At least one of @accessed_mask or @acc_track_mask must be set - */ -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask, u64 me_mask) -{ - BUG_ON(!dirty_mask != !accessed_mask); - BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); - - shadow_user_mask = user_mask; - shadow_accessed_mask = accessed_mask; - shadow_dirty_mask = dirty_mask; - shadow_nx_mask = nx_mask; - shadow_x_mask = x_mask; - shadow_present_mask = p_mask; - shadow_acc_track_mask = acc_track_mask; - shadow_me_mask = me_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); - -static u8 kvm_get_shadow_phys_bits(void) -{ - /* - * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected - * in CPU detection code, but the processor treats those reduced bits as - * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at - * the physical address bits reported by CPUID. - */ - if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) - return cpuid_eax(0x80000008) & 0xff; - - /* - * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with - * custom CPUID. Proceed with whatever the kernel found since these features - * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). - */ - return boot_cpu_data.x86_phys_bits; -} - -static void kvm_mmu_reset_all_pte_masks(void) -{ - u8 low_phys_bits; - - shadow_user_mask = 0; - shadow_accessed_mask = 0; - shadow_dirty_mask = 0; - shadow_nx_mask = 0; - shadow_x_mask = 0; - shadow_present_mask = 0; - shadow_acc_track_mask = 0; - - shadow_phys_bits = kvm_get_shadow_phys_bits(); - - /* - * If the CPU has 46 or less physical address bits, then set an - * appropriate mask to guard against L1TF attacks. Otherwise, it is - * assumed that the CPU is not vulnerable to L1TF. - * - * Some Intel CPUs address the L1 cache using more PA bits than are - * reported by CPUID. Use the PA width of the L1 cache when possible - * to achieve more effective mitigation, e.g. if system RAM overlaps - * the most significant bits of legal physical address space. 
- */ - shadow_nonpresent_or_rsvd_mask = 0; - low_phys_bits = boot_cpu_data.x86_phys_bits; - if (boot_cpu_has_bug(X86_BUG_L1TF) && - !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= - 52 - shadow_nonpresent_or_rsvd_mask_len)) { - low_phys_bits = boot_cpu_data.x86_cache_bits - - shadow_nonpresent_or_rsvd_mask_len; - shadow_nonpresent_or_rsvd_mask = - rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); - } - - shadow_nonpresent_or_rsvd_lower_gfn_mask = - GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); -} - static int is_cpuid_PSE36(void) { return 1; @@ -626,35 +303,6 @@ static int is_nx(struct kvm_vcpu *vcpu) return vcpu->arch.efer & EFER_NX; } -static int is_shadow_present_pte(u64 pte) -{ - return (pte != 0) && !is_mmio_spte(pte); -} - -static int is_large_pte(u64 pte) -{ - return pte & PT_PAGE_SIZE_MASK; -} - -static int is_last_spte(u64 pte, int level) -{ - if (level == PG_LEVEL_4K) - return 1; - if (is_large_pte(pte)) - return 1; - return 0; -} - -static bool is_executable_pte(u64 spte) -{ - return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; -} - -static kvm_pfn_t spte_to_pfn(u64 pte) -{ - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - static gfn_t pse36_gfn_delta(u32 gpte) { int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; @@ -799,12 +447,6 @@ retry: } #endif -static bool spte_can_locklessly_be_made_writable(u64 spte) -{ - return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == - (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); -} - static bool spte_has_volatile_bits(u64 spte) { if (!is_shadow_present_pte(spte)) @@ -829,21 +471,6 @@ static bool spte_has_volatile_bits(u64 spte) return false; } -static bool is_accessed_spte(u64 spte) -{ - u64 accessed_mask = spte_shadow_accessed_mask(spte); - - return accessed_mask ? spte & accessed_mask - : !is_access_track_spte(spte); -} - -static bool is_dirty_spte(u64 spte) -{ - u64 dirty_mask = spte_shadow_dirty_mask(spte); - - return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; -} - /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present @@ -979,34 +606,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -static u64 mark_spte_for_access_track(u64 spte) -{ - if (spte_ad_enabled(spte)) - return spte & ~shadow_accessed_mask; - - if (is_access_track_spte(spte)) - return spte; - - /* - * Making an Access Tracking PTE will result in removal of write access - * from the PTE. So, verify that we will be able to restore the write - * access in the fast page fault path later on. 
- */ - WARN_ONCE((spte & PT_WRITABLE_MASK) && - !spte_can_locklessly_be_made_writable(spte), - "kvm: Writable SPTE is not locklessly dirty-trackable\n"); - - WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << - shadow_acc_track_saved_bits_shift), - "kvm: Access Tracking saved bit locations are not zero\n"); - - spte |= (spte & shadow_acc_track_saved_bits_mask) << - shadow_acc_track_saved_bits_shift; - spte &= ~shadow_acc_track_mask; - - return spte; -} - /* Restore an acc-track PTE back to a regular PTE */ static u64 restore_acc_track_spte(u64 spte) { @@ -1747,21 +1346,6 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return kvm_zap_rmapp(kvm, rmap_head); } -static u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) -{ - u64 new_spte; - - new_spte = old_spte & ~PT64_BASE_ADDR_MASK; - new_spte |= (u64)new_pfn << PAGE_SHIFT; - - new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~SPTE_HOST_WRITEABLE; - - new_spte = mark_spte_for_access_track(new_spte); - - return new_spte; -} - static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long data) @@ -2583,21 +2167,6 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } -static u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) -{ - u64 spte; - - spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_me_mask; - - if (ad_disabled) - spte |= SPTE_AD_DISABLED_MASK; - else - spte |= shadow_accessed_mask; - - return spte; -} - static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *sp) { @@ -2919,8 +2488,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) +bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) { struct kvm_mmu_page *sp; @@ -2980,116 +2549,6 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return false; } -static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) -{ - if (pfn_valid(pfn)) - return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && - /* - * Some reserved pages, such as those from NVDIMM - * DAX devices, are not for MMIO, and can be mapped - * with cached memory type for better performance. - * However, the above check misconceives those pages - * as MMIO, and results in KVM mapping them with UC - * memory type, which would hurt the performance. - * Therefore, we check the host memory type in addition - * and only treat UC/UC-/WC pages as MMIO. 
- */ - (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); - - return !e820__mapped_raw_any(pfn_to_hpa(pfn), - pfn_to_hpa(pfn + 1) - 1, - E820_TYPE_RAM); -} - -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) -#define SET_SPTE_SPURIOUS BIT(2) - -static int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, - bool can_unsync, bool host_writable, bool ad_disabled, - u64 *new_spte) -{ - u64 spte = 0; - int ret = 0; - - if (ad_disabled) - spte |= SPTE_AD_DISABLED_MASK; - else if (kvm_vcpu_ad_need_write_protect(vcpu)) - spte |= SPTE_AD_WRPROT_ONLY_MASK; - - /* - * For the EPT case, shadow_present_mask is 0 if hardware - * supports exec-only page table entries. In that case, - * ACC_USER_MASK and shadow_user_mask are used to represent - * read access. See FNAME(gpte_access) in paging_tmpl.h. - */ - spte |= shadow_present_mask; - if (!speculative) - spte |= spte_shadow_accessed_mask(spte); - - if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { - pte_access &= ~ACC_EXEC_MASK; - } - - if (pte_access & ACC_EXEC_MASK) - spte |= shadow_x_mask; - else - spte |= shadow_nx_mask; - - if (pte_access & ACC_USER_MASK) - spte |= shadow_user_mask; - - if (level > PG_LEVEL_4K) - spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); - - if (host_writable) - spte |= SPTE_HOST_WRITEABLE; - else - pte_access &= ~ACC_WRITE_MASK; - - if (!kvm_is_mmio_pfn(pfn)) - spte |= shadow_me_mask; - - spte |= (u64)pfn << PAGE_SHIFT; - - if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; - - /* - * Optimization: for pte sync, if spte was writable the hash - * lookup is unnecessary (and expensive). Write protection - * is responsibility of mmu_get_page / kvm_sync_page. - * Same reasoning can be applied to dirty page accounting. - */ - if (!can_unsync && is_writable_pte(old_spte)) - goto out; - - if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { - pgprintk("%s: found shadow page for %llx, marking ro\n", - __func__, gfn); - ret |= SET_SPTE_WRITE_PROTECTED_PT; - pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); - } - } - - if (pte_access & ACC_WRITE_MASK) - spte |= spte_shadow_dirty_mask(spte); - - if (speculative) - spte = mark_spte_for_access_track(spte); - -out: - *new_spte = spte; - return ret; -} - static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned int pte_access, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 3acf3b8eb469..fc72f199eaa6 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -3,9 +3,23 @@ #define __KVM_X86_MMU_INTERNAL_H #include - +#include #include +#undef MMU_DEBUG + +#ifdef MMU_DEBUG +extern bool dbg; + +#define pgprintk(x...) do { if (dbg) printk(x); } while (0) +#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) +#define MMU_WARN_ON(x) WARN_ON(x) +#else +#define pgprintk(x...) do { } while (0) +#define rmap_printk(x...) 
do { } while (0) +#define MMU_WARN_ON(x) do { } while (0) +#endif + struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; @@ -55,6 +69,21 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) return to_shadow_page(__pa(sptep)); } +static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +{ + /* + * When using the EPT page-modification log, the GPAs in the log + * would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages. This also bypasses + * PML, since writes now result in a vmexit. + */ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + +bool is_nx_huge_page_enabled(void); +bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync); + void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c new file mode 100644 index 000000000000..d9c5665a55e9 --- /dev/null +++ b/arch/x86/kvm/mmu/spte.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kernel-based Virtual Machine driver for Linux + * + * Macros and functions to access KVM PTEs (also known as SPTEs) + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2020 Red Hat, Inc. and/or its affiliates. + */ + + +#include +#include "mmu.h" +#include "mmu_internal.h" +#include "x86.h" +#include "spte.h" + +#include + +u64 __read_mostly shadow_nx_mask; +u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +u64 __read_mostly shadow_user_mask; +u64 __read_mostly shadow_accessed_mask; +u64 __read_mostly shadow_dirty_mask; +u64 __read_mostly shadow_mmio_value; +u64 __read_mostly shadow_mmio_access_mask; +u64 __read_mostly shadow_present_mask; +u64 __read_mostly shadow_me_mask; +u64 __read_mostly shadow_acc_track_mask; + +u64 __read_mostly shadow_nonpresent_or_rsvd_mask; +u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + +u8 __read_mostly shadow_phys_bits; + +static u64 generation_mmio_spte_mask(u64 gen) +{ + u64 mask; + + WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); + BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); + + mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; + mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; + return mask; +} + +u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) +{ + u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; + u64 mask = generation_mmio_spte_mask(gen); + u64 gpa = gfn << PAGE_SHIFT; + + access &= shadow_mmio_access_mask; + mask |= shadow_mmio_value | access; + mask |= gpa | shadow_nonpresent_or_rsvd_mask; + mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + << shadow_nonpresent_or_rsvd_mask_len; + + return mask; +} + +static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) +{ + if (pfn_valid(pfn)) + return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && + /* + * Some reserved pages, such as those from NVDIMM + * DAX devices, are not for MMIO, and can be mapped + * with cached memory type for better performance. + * However, the above check misconceives those pages + * as MMIO, and results in KVM mapping them with UC + * memory type, which would hurt the performance. + * Therefore, we check the host memory type in addition + * and only treat UC/UC-/WC pages as MMIO. 
+ */ + (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); + + return !e820__mapped_raw_any(pfn_to_hpa(pfn), + pfn_to_hpa(pfn + 1) - 1, + E820_TYPE_RAM); +} + +int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte) +{ + u64 spte = 0; + int ret = 0; + + if (ad_disabled) + spte |= SPTE_AD_DISABLED_MASK; + else if (kvm_vcpu_ad_need_write_protect(vcpu)) + spte |= SPTE_AD_WRPROT_ONLY_MASK; + + /* + * For the EPT case, shadow_present_mask is 0 if hardware + * supports exec-only page table entries. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. + */ + spte |= shadow_present_mask; + if (!speculative) + spte |= spte_shadow_accessed_mask(spte); + + if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + + if (pte_access & ACC_EXEC_MASK) + spte |= shadow_x_mask; + else + spte |= shadow_nx_mask; + + if (pte_access & ACC_USER_MASK) + spte |= shadow_user_mask; + + if (level > PG_LEVEL_4K) + spte |= PT_PAGE_SIZE_MASK; + if (tdp_enabled) + spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); + + if (host_writable) + spte |= SPTE_HOST_WRITEABLE; + else + pte_access &= ~ACC_WRITE_MASK; + + if (!kvm_is_mmio_pfn(pfn)) + spte |= shadow_me_mask; + + spte |= (u64)pfn << PAGE_SHIFT; + + if (pte_access & ACC_WRITE_MASK) { + spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; + + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). Write protection + * is responsibility of mmu_get_page / kvm_sync_page. + * Same reasoning can be applied to dirty page accounting. + */ + if (!can_unsync && is_writable_pte(old_spte)) + goto out; + + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { + pgprintk("%s: found shadow page for %llx, marking ro\n", + __func__, gfn); + ret |= SET_SPTE_WRITE_PROTECTED_PT; + pte_access &= ~ACC_WRITE_MASK; + spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + } + } + + if (pte_access & ACC_WRITE_MASK) + spte |= spte_shadow_dirty_mask(spte); + + if (speculative) + spte = mark_spte_for_access_track(spte); + +out: + *new_spte = spte; + return ret; +} + +u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) +{ + u64 spte; + + spte = __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask | shadow_me_mask; + + if (ad_disabled) + spte |= SPTE_AD_DISABLED_MASK; + else + spte |= shadow_accessed_mask; + + return spte; +} + +u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) +{ + u64 new_spte; + + new_spte = old_spte & ~PT64_BASE_ADDR_MASK; + new_spte |= (u64)new_pfn << PAGE_SHIFT; + + new_spte &= ~PT_WRITABLE_MASK; + new_spte &= ~SPTE_HOST_WRITEABLE; + + new_spte = mark_spte_for_access_track(new_spte); + + return new_spte; +} + +static u8 kvm_get_shadow_phys_bits(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID. + */ + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; + + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. 
Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; +} + +u64 mark_spte_for_access_track(u64 spte) +{ + if (spte_ad_enabled(spte)) + return spte & ~shadow_accessed_mask; + + if (is_access_track_spte(spte)) + return spte; + + /* + * Making an Access Tracking PTE will result in removal of write access + * from the PTE. So, verify that we will be able to restore the write + * access in the fast page fault path later on. + */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + + return spte; +} + +void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) +{ + BUG_ON((u64)(unsigned)access_mask != access_mask); + WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); + WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; + shadow_mmio_access_mask = access_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); + +/* + * Sets the shadow PTE masks used by the MMU. + * + * Assumptions: + * - Setting either @accessed_mask or @dirty_mask requires setting both + * - At least one of @accessed_mask or @acc_track_mask must be set + */ +void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask, u64 me_mask) +{ + BUG_ON(!dirty_mask != !accessed_mask); + BUG_ON(!accessed_mask && !acc_track_mask); + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); + + shadow_user_mask = user_mask; + shadow_accessed_mask = accessed_mask; + shadow_dirty_mask = dirty_mask; + shadow_nx_mask = nx_mask; + shadow_x_mask = x_mask; + shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + shadow_me_mask = me_mask; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); + +void kvm_mmu_reset_all_pte_masks(void) +{ + u8 low_phys_bits; + + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; + + shadow_phys_bits = kvm_get_shadow_phys_bits(); + + /* + * If the CPU has 46 or less physical address bits, then set an + * appropriate mask to guard against L1TF attacks. Otherwise, it is + * assumed that the CPU is not vulnerable to L1TF. + * + * Some Intel CPUs address the L1 cache using more PA bits than are + * reported by CPUID. Use the PA width of the L1 cache when possible + * to achieve more effective mitigation, e.g. if system RAM overlaps + * the most significant bits of legal physical address space. 
+ */ + shadow_nonpresent_or_rsvd_mask = 0; + low_phys_bits = boot_cpu_data.x86_phys_bits; + if (boot_cpu_has_bug(X86_BUG_L1TF) && + !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= + 52 - shadow_nonpresent_or_rsvd_mask_len)) { + low_phys_bits = boot_cpu_data.x86_cache_bits + - shadow_nonpresent_or_rsvd_mask_len; + shadow_nonpresent_or_rsvd_mask = + rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); + } + + shadow_nonpresent_or_rsvd_lower_gfn_mask = + GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); +} diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h new file mode 100644 index 000000000000..4ecf40e0b8fe --- /dev/null +++ b/arch/x86/kvm/mmu/spte.h @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef KVM_X86_MMU_SPTE_H +#define KVM_X86_MMU_SPTE_H + +#include "mmu_internal.h" + +#define PT_FIRST_AVAIL_BITS_SHIFT 10 +#define PT64_SECOND_AVAIL_BITS_SHIFT 54 + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. + */ +#define SPTE_SPECIAL_MASK (3ULL << 52) +#define SPTE_AD_ENABLED_MASK (0ULL << 52) +#define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) +#define SPTE_MMIO_MASK (3ULL << 52) + +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK +#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#else +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#endif +#define PT64_LVL_ADDR_MASK(level) \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#define PT64_LVL_OFFSET_MASK(level) \ + (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) + +#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ + | shadow_x_mask | shadow_nx_mask | shadow_me_mask) + +#define ACC_EXEC_MASK 1 +#define ACC_WRITE_MASK PT_WRITABLE_MASK +#define ACC_USER_MASK PT_USER_MASK +#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) + +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + +#define PT64_LEVEL_BITS 9 + +#define PT64_LEVEL_SHIFT(level) \ + (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) + +#define PT64_INDEX(address, level)\ + (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + + +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) + +/* + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of + * the memslots generation and is derived as follows: + * + * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 + * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 + * + * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in + * the MMIO generation number, as doing so would require stealing a bit from + * the "real" generation number and thus effectively halve the maximum number + * of MMIO generations that can be handled before encountering a wrap (which + * requires a full MMU zap). The flag is instead explicitly queried when + * checking for MMIO spte cache hits. 
+ */ +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) + +#define MMIO_SPTE_GEN_LOW_START 3 +#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ + MMIO_SPTE_GEN_LOW_START) + +#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT +#define MMIO_SPTE_GEN_HIGH_END 62 +#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ + MMIO_SPTE_GEN_HIGH_START) + +extern u64 __read_mostly shadow_nx_mask; +extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ +extern u64 __read_mostly shadow_user_mask; +extern u64 __read_mostly shadow_accessed_mask; +extern u64 __read_mostly shadow_dirty_mask; +extern u64 __read_mostly shadow_mmio_value; +extern u64 __read_mostly shadow_mmio_access_mask; +extern u64 __read_mostly shadow_present_mask; +extern u64 __read_mostly shadow_me_mask; + +/* + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. + */ +extern u64 __read_mostly shadow_acc_track_mask; + +/* + * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order + * to guard against L1TF attacks. + */ +extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. + */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + +/* + * The number of high-order 1 bits to use in the mask above. + */ +static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; + +/* + * In some cases, we need to preserve the GFN of a non-present or reserved + * SPTE when we usurp the upper five bits of the physical address space to + * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll + * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask + * left into the reserved bits, i.e. the GFN in the SPTE will be split into + * high and low parts. This mask covers the lower bits of the GFN. + */ +extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; + +/* + * The number of non-reserved physical address bits irrespective of features + * that repurpose legal bits, e.g. MKTME. + */ +extern u8 __read_mostly shadow_phys_bits; + +static inline bool is_mmio_spte(u64 spte) +{ + return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK; +} + +static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) +{ + return sp->role.ad_disabled; +} + +static inline bool spte_ad_enabled(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; +} + +static inline bool spte_ad_need_write_protect(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; +} + +static inline u64 spte_shadow_accessed_mask(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; +} + +static inline u64 spte_shadow_dirty_mask(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return spte_ad_enabled(spte) ? 
shadow_dirty_mask : 0; +} + +static inline bool is_access_track_spte(u64 spte) +{ + return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; +} + +static inline int is_shadow_present_pte(u64 pte) +{ + return (pte != 0) && !is_mmio_spte(pte); +} + +static inline int is_large_pte(u64 pte) +{ + return pte & PT_PAGE_SIZE_MASK; +} + +static inline int is_last_spte(u64 pte, int level) +{ + if (level == PG_LEVEL_4K) + return 1; + if (is_large_pte(pte)) + return 1; + return 0; +} + +static inline bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + +static inline kvm_pfn_t spte_to_pfn(u64 pte) +{ + return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; +} + +static inline bool is_accessed_spte(u64 spte) +{ + u64 accessed_mask = spte_shadow_accessed_mask(spte); + + return accessed_mask ? spte & accessed_mask + : !is_access_track_spte(spte); +} + +static inline bool is_dirty_spte(u64 spte) +{ + u64 dirty_mask = spte_shadow_dirty_mask(spte); + + return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; +} + +static inline bool spte_can_locklessly_be_made_writable(u64 spte) +{ + return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == + (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); +} + +static inline u64 get_mmio_spte_generation(u64 spte) +{ + u64 gen; + + gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; + gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; + return gen; +} + +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) +#define SET_SPTE_SPURIOUS BIT(2) + +int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, + gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, + bool can_unsync, bool host_writable, bool ad_disabled, + u64 *new_spte); +u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); +u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); +u64 mark_spte_for_access_track(u64 spte); +u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn); + +void kvm_mmu_reset_all_pte_masks(void); + +#endif From c9180b7291cf13a746aaca907b9fdd499cce1e38 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 20:26:42 +0200 Subject: [PATCH 446/497] kvm: x86/mmu: Introduce tdp_iter The TDP iterator implements a pre-order traversal of a TDP paging structure. This iterator will be used in future patches to create an efficient implementation of the KVM MMU for the TDP case. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. 
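As a usage sketch (illustrative only; the zap loop and the root_pt, root_level, start_gfn and end_gfn variables are assumptions about how later patches will consume the iterator, not code added here), clearing every present leaf SPTE in a GFN range would look roughly like:

	struct tdp_iter iter;

	for_each_tdp_pte(iter, root_pt, root_level, start_gfn, end_gfn) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		/* Clear the leaf entry; bookkeeping for the change goes here. */
		WRITE_ONCE(*iter.sptep, 0);
	}
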
This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/mmu/tdp_iter.c | 177 ++++++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu/tdp_iter.h | 56 ++++++++++++ 3 files changed, 234 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kvm/mmu/tdp_iter.c create mode 100644 arch/x86/kvm/mmu/tdp_iter.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 66aa24f5e2db..a5dd4e5970f8 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -16,7 +16,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ - mmu/spte.o + mmu/spte.o mmu/tdp_iter.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c new file mode 100644 index 000000000000..ad2184cb054c --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "mmu_internal.h" +#include "tdp_iter.h" +#include "spte.h" + +/* + * Recalculates the pointer to the SPTE for the current GFN and level and + * reread the SPTE. + */ +static void tdp_iter_refresh_sptep(struct tdp_iter *iter) +{ + iter->sptep = iter->pt_path[iter->level - 1] + + SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + iter->old_spte = READ_ONCE(*iter->sptep); +} + +static gfn_t round_gfn_for_level(gfn_t gfn, int level) +{ + return gfn & -KVM_PAGES_PER_HPAGE(level); +} + +/* + * Sets a TDP iterator to walk a pre-order traversal of the paging structure + * rooted at root_pt, starting with the walk to translate goal_gfn. + */ +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + int min_level, gfn_t goal_gfn) +{ + WARN_ON(root_level < 1); + WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); + + iter->goal_gfn = goal_gfn; + iter->root_level = root_level; + iter->min_level = min_level; + iter->level = root_level; + iter->pt_path[iter->level - 1] = root_pt; + + iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + iter->valid = true; +} + +/* + * Given an SPTE and its level, returns a pointer containing the host virtual + * address of the child page table referenced by the SPTE. Returns null if + * there is no such entry. + */ +u64 *spte_to_child_pt(u64 spte, int level) +{ + /* + * There's no child entry if this entry isn't present or is a + * last-level entry. + */ + if (!is_shadow_present_pte(spte) || is_last_spte(spte, level)) + return NULL; + + return __va(spte_to_pfn(spte) << PAGE_SHIFT); +} + +/* + * Steps down one level in the paging structure towards the goal GFN. Returns + * true if the iterator was able to step down a level, false otherwise. + */ +static bool try_step_down(struct tdp_iter *iter) +{ + u64 *child_pt; + + if (iter->level == iter->min_level) + return false; + + /* + * Reread the SPTE before stepping down to avoid traversing into page + * tables that are no longer linked from this entry. 
+ */ + iter->old_spte = READ_ONCE(*iter->sptep); + + child_pt = spte_to_child_pt(iter->old_spte, iter->level); + if (!child_pt) + return false; + + iter->level--; + iter->pt_path[iter->level - 1] = child_pt; + iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + return true; +} + +/* + * Steps to the next entry in the current page table, at the current page table + * level. The next entry could point to a page backing guest memory or another + * page table, or it could be non-present. Returns true if the iterator was + * able to step to the next entry in the page table, false if the iterator was + * already at the end of the current page table. + */ +static bool try_step_side(struct tdp_iter *iter) +{ + /* + * Check if the iterator is already at the end of the current page + * table. + */ + if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == + (PT64_ENT_PER_PAGE - 1)) + return false; + + iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); + iter->goal_gfn = iter->gfn; + iter->sptep++; + iter->old_spte = READ_ONCE(*iter->sptep); + + return true; +} + +/* + * Tries to traverse back up a level in the paging structure so that the walk + * can continue from the next entry in the parent page table. Returns true on a + * successful step up, false if already in the root page. + */ +static bool try_step_up(struct tdp_iter *iter) +{ + if (iter->level == iter->root_level) + return false; + + iter->level++; + iter->gfn = round_gfn_for_level(iter->gfn, iter->level); + tdp_iter_refresh_sptep(iter); + + return true; +} + +/* + * Step to the next SPTE in a pre-order traversal of the paging structure. + * To get to the next SPTE, the iterator either steps down towards the goal + * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a + * highter GFN. + * + * The basic algorithm is as follows: + * 1. If the current SPTE is a non-last-level SPTE, step down into the page + * table it points to. + * 2. If the iterator cannot step down, it will try to step to the next SPTE + * in the current page of the paging structure. + * 3. If the iterator cannot step to the next entry in the current page, it will + * try to step up to the parent paging structure page. In this case, that + * SPTE will have already been visited, and so the iterator must also step + * to the side again. + */ +void tdp_iter_next(struct tdp_iter *iter) +{ + if (try_step_down(iter)) + return; + + do { + if (try_step_side(iter)) + return; + } while (try_step_up(iter)); + iter->valid = false; +} + +/* + * Restart the walk over the paging structure from the root, starting from the + * highest gfn the iterator had previously reached. Assumes that the entire + * paging structure, except the root page, may have been completely torn down + * and rebuilt. + */ +void tdp_iter_refresh_walk(struct tdp_iter *iter) +{ + gfn_t goal_gfn = iter->goal_gfn; + + if (iter->gfn > goal_gfn) + goal_gfn = iter->gfn; + + tdp_iter_start(iter, iter->pt_path[iter->root_level - 1], + iter->root_level, iter->min_level, goal_gfn); +} + diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h new file mode 100644 index 000000000000..d629a53e1b73 --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef __KVM_X86_MMU_TDP_ITER_H +#define __KVM_X86_MMU_TDP_ITER_H + +#include + +#include "mmu.h" + +/* + * A TDP iterator performs a pre-order walk over a TDP paging structure. 
+ */ +struct tdp_iter { + /* + * The iterator will traverse the paging structure towards the mapping + * for this GFN. + */ + gfn_t goal_gfn; + /* Pointers to the page tables traversed to reach the current SPTE */ + u64 *pt_path[PT64_ROOT_MAX_LEVEL]; + /* A pointer to the current SPTE */ + u64 *sptep; + /* The lowest GFN mapped by the current SPTE */ + gfn_t gfn; + /* The level of the root page given to the iterator */ + int root_level; + /* The lowest level the iterator should traverse to */ + int min_level; + /* The iterator's current level within the paging structure */ + int level; + /* A snapshot of the value at sptep */ + u64 old_spte; + /* + * Whether the iterator has a valid state. This will be false if the + * iterator walks off the end of the paging structure. + */ + bool valid; +}; + +/* + * Iterates over every SPTE mapping the GFN range [start, end) in a + * preorder traversal. + */ +#define for_each_tdp_pte(iter, root, root_level, start, end) \ + for (tdp_iter_start(&iter, root, root_level, PG_LEVEL_4K, start); \ + iter.valid && iter.gfn < end; \ + tdp_iter_next(&iter)) + +u64 *spte_to_child_pt(u64 pte, int level); + +void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, + int min_level, gfn_t goal_gfn); +void tdp_iter_next(struct tdp_iter *iter); +void tdp_iter_refresh_walk(struct tdp_iter *iter); + +#endif /* __KVM_X86_MMU_TDP_ITER_H */ From fe5db27d36017715827e9be7711332d701c6b7de Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:43 -0700 Subject: [PATCH 447/497] kvm: x86/mmu: Init / Uninit the TDP MMU The TDP MMU offers an alternative mode of operation to the x86 shadow paging based MMU, optimized for running an L1 guest with TDP. The TDP MMU will require new fields that need to be initialized and torn down. Add hooks into the existing KVM MMU initialization process to do that initialization / cleanup. Currently the initialization and cleanup fucntions do not do very much, however more operations will be added in future patches. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-4-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 9 +++++++++ arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/mmu/mmu.c | 5 +++++ arch/x86/kvm/mmu/tdp_mmu.c | 30 ++++++++++++++++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.h | 10 ++++++++++ 5 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kvm/mmu/tdp_mmu.c create mode 100644 arch/x86/kvm/mmu/tdp_mmu.h diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8233991386a3..f6d47ac74a52 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -995,6 +995,15 @@ struct kvm_arch { struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; + + /* + * Whether the TDP MMU is enabled for this VM. This contains a + * snapshot of the TDP MMU module parameter from when the VM was + * created and remains unchanged for the life of the VM. If this is + * true, TDP MMU handler functions will run for various MMU + * operations. 
+ */ + bool tdp_mmu_enabled; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a5dd4e5970f8..b804444e16d4 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -16,7 +16,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ - mmu/spte.o mmu/tdp_iter.o + mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/evmcs.o vmx/nested.o vmx/posted_intr.o diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 02af304c168a..2afaf17284bb 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -19,6 +19,7 @@ #include "ioapic.h" #include "mmu.h" #include "mmu_internal.h" +#include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" @@ -5377,6 +5378,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + kvm_mmu_init_tdp_mmu(kvm); + node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); @@ -5387,6 +5390,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; kvm_page_track_unregister_notifier(kvm, node); + + kvm_mmu_uninit_tdp_mmu(kvm); } void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c new file mode 100644 index 000000000000..e567e8aa61a1 --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "tdp_mmu.h" + +static bool __read_mostly tdp_mmu_enabled = false; + +static bool is_tdp_mmu_enabled(void) +{ +#ifdef CONFIG_X86_64 + return tdp_enabled && READ_ONCE(tdp_mmu_enabled); +#else + return false; +#endif /* CONFIG_X86_64 */ +} + +/* Initializes the TDP MMU for the VM, if enabled. */ +void kvm_mmu_init_tdp_mmu(struct kvm *kvm) +{ + if (!is_tdp_mmu_enabled()) + return; + + /* This should not be changed for the lifetime of the VM. */ + kvm->arch.tdp_mmu_enabled = true; +} + +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) +{ + if (!kvm->arch.tdp_mmu_enabled) + return; +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h new file mode 100644 index 000000000000..cd4a562a70e9 --- /dev/null +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef __KVM_X86_MMU_TDP_MMU_H +#define __KVM_X86_MMU_TDP_MMU_H + +#include + +void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); +#endif /* __KVM_X86_MMU_TDP_MMU_H */ From 02c00b3a2f7e86203d878ff432a5a19876049db6 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 20:26:44 +0200 Subject: [PATCH 448/497] kvm: x86/mmu: Allocate and free TDP MMU roots The TDP MMU must be able to allocate paging structure root pages and track the usage of those pages. Implement a similar, but separate system for root page allocation to that of the x86 shadow paging implementation. When future patches add synchronization model changes to allow for parallel page faults, these pages will need to be handled differently from the x86 shadow paging based MMU's root pages. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. 
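For illustration (a sketch of the intended usage under the assumption that callers pin a root across their walk, not code added by this patch), a consumer of a TDP MMU root takes a reference while it walks the structure and frees the root when the last reference is dropped:

	spin_lock(&kvm->mmu_lock);
	kvm_mmu_get_root(kvm, root);		/* pin: root_count++ */
	spin_unlock(&kvm->mmu_lock);

	/* ... walk the paging structure rooted at root->spt ... */

	spin_lock(&kvm->mmu_lock);
	if (kvm_mmu_put_root(kvm, root))	/* true once the last reference is gone */
		kvm_tdp_mmu_free_root(kvm, root);
	spin_unlock(&kvm->mmu_lock);
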
This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 24 ++++++-- arch/x86/kvm/mmu/mmu_internal.h | 20 +++++++ arch/x86/kvm/mmu/tdp_mmu.c | 102 ++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.h | 5 ++ 5 files changed, 146 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f6d47ac74a52..082684ce2d1b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1004,6 +1004,7 @@ struct kvm_arch { * operations. */ bool tdp_mmu_enabled; + struct list_head tdp_mmu_roots; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2afaf17284bb..017d37b19cf3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -185,7 +185,7 @@ struct kvm_shadow_walk_iterator { __shadow_walk_next(&(_walker), spte)) static struct kmem_cache *pte_list_desc_cache; -static struct kmem_cache *mmu_page_header_cache; +struct kmem_cache *mmu_page_header_cache; static struct percpu_counter kvm_total_used_mmu_pages; static void mmu_spte_set(u64 *sptep, u64 spte); @@ -3132,9 +3132,13 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, return; sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); - --sp->root_count; - if (!sp->root_count && sp->role.invalid) - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + + if (kvm_mmu_put_root(kvm, sp)) { + if (sp->tdp_mmu_page) + kvm_tdp_mmu_free_root(kvm, sp); + else if (sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + } *root_hpa = INVALID_PAGE; } @@ -3224,8 +3228,16 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) hpa_t root; unsigned i; - if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + if (vcpu->kvm->arch.tdp_mmu_enabled) { + root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); + + if (!VALID_PAGE(root)) + return -ENOSPC; + vcpu->arch.mmu->root_hpa = root; + } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, + true); + if (!VALID_PAGE(root)) return -ENOSPC; vcpu->arch.mmu->root_hpa = root; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index fc72f199eaa6..6665b10288ce 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -55,8 +55,12 @@ struct kvm_mmu_page { /* Number of writes since the last time traversal visited this page. 
*/ atomic_t write_flooding_count; + + bool tdp_mmu_page; }; +extern struct kmem_cache *mmu_page_header_cache; + static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); @@ -89,4 +93,20 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); +static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + BUG_ON(!sp->root_count); + lockdep_assert_held(&kvm->mmu_lock); + + ++sp->root_count; +} + +static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + lockdep_assert_held(&kvm->mmu_lock); + --sp->root_count; + + return !sp->root_count; +} + #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index e567e8aa61a1..76ebb5898dd7 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1,6 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 +#include "mmu.h" +#include "mmu_internal.h" #include "tdp_mmu.h" +#include "spte.h" static bool __read_mostly tdp_mmu_enabled = false; @@ -21,10 +24,109 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) /* This should not be changed for the lifetime of the VM. */ kvm->arch.tdp_mmu_enabled = true; + + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); } void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { if (!kvm->arch.tdp_mmu_enabled) return; + + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); +} + +#define for_each_tdp_mmu_root(_kvm, _root) \ + list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) + +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +{ + struct kvm_mmu_page *sp; + + sp = to_shadow_page(hpa); + + return sp->tdp_mmu_page && sp->root_count; +} + +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) +{ + lockdep_assert_held(&kvm->mmu_lock); + + WARN_ON(root->root_count); + WARN_ON(!root->tdp_mmu_page); + + list_del(&root->link); + + free_page((unsigned long)root->spt); + kmem_cache_free(mmu_page_header_cache, root); +} + +static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, + int level) +{ + union kvm_mmu_page_role role; + + role = vcpu->arch.mmu->mmu_role.base; + role.level = level; + role.direct = true; + role.gpte_is_8_bytes = true; + role.access = ACC_ALL; + + return role; +} + +static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn, + int level) +{ + struct kvm_mmu_page *sp; + + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + sp->role.word = page_role_for_level(vcpu, level).word; + sp->gfn = gfn; + sp->tdp_mmu_page = true; + + return sp; +} + +static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_page_role role; + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_page *root; + + role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level); + + spin_lock(&kvm->mmu_lock); + + /* Check for an existing root before allocating a new one. 
*/ + for_each_tdp_mmu_root(kvm, root) { + if (root->role.word == role.word) { + kvm_mmu_get_root(kvm, root); + spin_unlock(&kvm->mmu_lock); + return root; + } + } + + root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level); + root->root_count = 1; + + list_add(&root->link, &kvm->arch.tdp_mmu_roots); + + spin_unlock(&kvm->mmu_lock); + + return root; +} + +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *root; + + root = get_tdp_mmu_vcpu_root(vcpu); + if (!root) + return INVALID_PAGE; + + return __pa(root->spt); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index cd4a562a70e9..ac0ef9129442 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -7,4 +7,9 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); + +bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root); +hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); +void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); + #endif /* __KVM_X86_MMU_TDP_MMU_H */ From 2f2fad0897cbfda4e384a7b9eab73654974015ac Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 20:26:45 +0200 Subject: [PATCH 449/497] kvm: x86/mmu: Add functions to handle changed TDP SPTEs The existing bookkeeping done by KVM when a PTE is changed is spread around several functions. This makes it difficult to remember all the stats, bitmaps, and other subsystems that need to be updated whenever a PTE is modified. When a non-leaf PTE is marked non-present or becomes a leaf PTE, page table memory must also be freed. To simplify the MMU and facilitate the use of atomic operations on SPTEs in future patches, create functions to handle some of the bookkeeping required as a result of a change. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. 
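For illustration only (the actual SPTE setter arrives later in this series), every writer of a TDP SPTE is expected to capture the old value, install the new one, and then let the new helper do the bookkeeping:

    old_spte = READ_ONCE(*sptep);
    WRITE_ONCE(*sptep, new_spte);
    handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);

Here as_id, gfn and level describe where the SPTE sits in the paging structure, so the helper can update dirty state, free child page tables and flush TLBs where needed.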
This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/mmu_internal.h | 2 + arch/x86/kvm/mmu/tdp_mmu.c | 112 ++++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 017d37b19cf3..9c8f42e17f44 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -213,7 +213,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); } -static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, +void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages) { struct kvm_tlb_range range; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 6665b10288ce..564954c6b079 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -92,6 +92,8 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); +void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, + u64 start_gfn, u64 pages); static inline void kvm_mmu_get_root(struct kvm *kvm, struct kvm_mmu_page *sp) { diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 76ebb5898dd7..8accfae76bf6 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -2,6 +2,7 @@ #include "mmu.h" #include "mmu_internal.h" +#include "tdp_iter.h" #include "tdp_mmu.h" #include "spte.h" @@ -130,3 +131,114 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) return __pa(root->spt); } + +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level); + +/** + * handle_changed_spte - handle bookkeeping associated with an SPTE change + * @kvm: kvm instance + * @as_id: the address space of the paging structure the SPTE was a part of + * @gfn: the base GFN that was mapped by the SPTE + * @old_spte: The value of the SPTE before the change + * @new_spte: The value of the SPTE after the change + * @level: the level of the PT the SPTE is part of in the paging structure + * + * Handle bookkeeping that might result from the modification of a SPTE. + * This function must be called for all TDP SPTE modifications. + */ +static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + bool was_present = is_shadow_present_pte(old_spte); + bool is_present = is_shadow_present_pte(new_spte); + bool was_leaf = was_present && is_last_spte(old_spte, level); + bool is_leaf = is_present && is_last_spte(new_spte, level); + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + u64 *pt; + u64 old_child_spte; + int i; + + WARN_ON(level > PT64_ROOT_MAX_LEVEL); + WARN_ON(level < PG_LEVEL_4K); + WARN_ON(gfn % KVM_PAGES_PER_HPAGE(level)); + + /* + * If this warning were to trigger it would indicate that there was a + * missing MMU notifier or a race with some notifier handler. + * A present, leaf SPTE should never be directly replaced with another + * present leaf SPTE pointing to a differnt PFN. A notifier handler + * should be zapping the SPTE before the main MM's page table is + * changed, or the SPTE should be zeroed, and the TLBs flushed by the + * thread before replacement. 
+ */ + if (was_leaf && is_leaf && pfn_changed) { + pr_err("Invalid SPTE change: cannot replace a present leaf\n" + "SPTE with another present leaf SPTE mapping a\n" + "different PFN!\n" + "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", + as_id, gfn, old_spte, new_spte, level); + + /* + * Crash the host to prevent error propagation and guest data + * courruption. + */ + BUG(); + } + + if (old_spte == new_spte) + return; + + /* + * The only times a SPTE should be changed from a non-present to + * non-present state is when an MMIO entry is installed/modified/ + * removed. In that case, there is nothing to do here. + */ + if (!was_present && !is_present) { + /* + * If this change does not involve a MMIO SPTE, it is + * unexpected. Log the change, though it should not impact the + * guest since both the former and current SPTEs are nonpresent. + */ + if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte))) + pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" + "should not be replaced with another,\n" + "different nonpresent SPTE, unless one or both\n" + "are MMIO SPTEs.\n" + "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", + as_id, gfn, old_spte, new_spte, level); + return; + } + + + if (was_leaf && is_dirty_spte(old_spte) && + (!is_dirty_spte(new_spte) || pfn_changed)) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + + /* + * Recursively handle child PTs if the change removed a subtree from + * the paging structure. + */ + if (was_present && !was_leaf && (pfn_changed || !is_present)) { + pt = spte_to_child_pt(old_spte, level); + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + old_child_spte = READ_ONCE(*(pt + i)); + WRITE_ONCE(*(pt + i), 0); + handle_changed_spte(kvm, as_id, + gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)), + old_child_spte, 0, level - 1); + } + + kvm_flush_remote_tlbs_with_address(kvm, gfn, + KVM_PAGES_PER_HPAGE(level)); + + free_page((unsigned long)pt); + } +} + +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); +} From 9e9eb226b91225fc199bbafc06f3cd70bfce0100 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 14 Oct 2020 11:26:46 -0700 Subject: [PATCH 450/497] KVM: Cache as_id in kvm_memory_slot Cache the address space ID just like the slot ID. It will be used in order to fill in the dirty ring entries. Suggested-by: Paolo Bonzini Suggested-by: Sean Christopherson Reviewed-by: Sean Christopherson Signed-off-by: Peter Xu Message-Id: <20201014182700.2888246-7-bgardon@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 05e3c2fb3ef7..c6f45687ba89 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -346,6 +346,7 @@ struct kvm_memory_slot { unsigned long userspace_addr; u32 flags; short id; + u16 as_id; }; static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 68edd25dcb11..2e8539213125 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1247,6 +1247,11 @@ static int kvm_delete_memslot(struct kvm *kvm, memset(&new, 0, sizeof(new)); new.id = old->id; + /* + * This is only for debugging purpose; it should never be referenced + * for a removed memslot. 
+ */ + new.as_id = as_id; r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE); if (r) @@ -1313,6 +1318,7 @@ int __kvm_set_memory_region(struct kvm *kvm, if (!mem->memory_size) return kvm_delete_memslot(kvm, mem, &old, as_id); + new.as_id = as_id; new.id = id; new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; new.npages = mem->memory_size >> PAGE_SHIFT; From faaf05b00aecdb347ffd1d763d024394ec0329f8 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:47 -0700 Subject: [PATCH 451/497] kvm: x86/mmu: Support zapping SPTEs in the TDP MMU Add functions to zap SPTEs to the TDP MMU. These are needed to tear down TDP MMU roots properly and implement other MMU functions which require tearing down mappings. Future patches will add functions to populate the page tables, but as for this patch there will not be any work for these functions to do. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-8-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 15 +++++ arch/x86/kvm/mmu/tdp_iter.c | 5 ++ arch/x86/kvm/mmu/tdp_iter.h | 1 + arch/x86/kvm/mmu/tdp_mmu.c | 113 ++++++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.h | 2 + 5 files changed, 136 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9c8f42e17f44..dd15e519c361 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5371,6 +5371,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) kvm_reload_remote_mmus(kvm); kvm_zap_obsolete_pages(kvm); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_all(kvm); + spin_unlock(&kvm->mmu_lock); } @@ -5411,6 +5415,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i; + bool flush; spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { @@ -5430,6 +5435,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) } } + if (kvm->arch.tdp_mmu_enabled) { + flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end); + if (flush) + kvm_flush_remote_tlbs(kvm); + } + spin_unlock(&kvm->mmu_lock); } @@ -5596,6 +5607,10 @@ restart: } kvm_mmu_commit_zap_page(kvm, &invalid_list); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_all(kvm); + spin_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index ad2184cb054c..87b7e16911db 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -175,3 +175,8 @@ void tdp_iter_refresh_walk(struct tdp_iter *iter) iter->root_level, iter->min_level, goal_gfn); } +u64 *tdp_iter_root_pt(struct tdp_iter *iter) +{ + return iter->pt_path[iter->root_level - 1]; +} + diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index d629a53e1b73..884ed2c70bfe 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -52,5 +52,6 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, int min_level, gfn_t goal_gfn); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_refresh_walk(struct tdp_iter *iter); +u64 *tdp_iter_root_pt(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 8accfae76bf6..45a182475f68 100644 --- 
a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -49,8 +49,13 @@ bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) return sp->tdp_mmu_page && sp->root_count; } +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end); + void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) { + gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT); + lockdep_assert_held(&kvm->mmu_lock); WARN_ON(root->root_count); @@ -58,6 +63,8 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) list_del(&root->link); + zap_gfn_range(kvm, root, 0, max_gfn); + free_page((unsigned long)root->spt); kmem_cache_free(mmu_page_header_cache, root); } @@ -135,6 +142,11 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level); +static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return sp->role.smm ? 1 : 0; +} + /** * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance @@ -242,3 +254,104 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, { __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); } + +static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte) +{ + u64 *root_pt = tdp_iter_root_pt(iter); + struct kvm_mmu_page *root = sptep_to_sp(root_pt); + int as_id = kvm_mmu_page_as_id(root); + + *iter->sptep = new_spte; + + handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, + iter->level); +} + +#define tdp_root_for_each_pte(_iter, _root, _start, _end) \ + for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) + +/* + * Flush the TLB if the process should drop kvm->mmu_lock. + * Return whether the caller still needs to flush the tlb. + */ +static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +{ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_flush_remote_tlbs(kvm); + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); + return false; + } else { + return true; + } +} + +/* + * Tears down the mappings for the range of gfns, [start, end), and frees the + * non-root pages mapping GFNs strictly within that range. Returns true if + * SPTEs have been cleared and a TLB flush is needed before releasing the + * MMU lock. + */ +static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + bool flush_needed = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte)) + continue; + + /* + * If this is a non-last-level SPTE that covers a larger range + * than should be zapped, continue, and zap the mappings at a + * lower level. + */ + if ((iter.gfn < start || + iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && + !is_last_spte(iter.old_spte, iter.level)) + continue; + + tdp_mmu_set_spte(kvm, &iter, 0); + + flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + } + return flush_needed; +} + +/* + * Tears down the mappings for the range of gfns, [start, end), and frees the + * non-root pages mapping GFNs strictly within that range. Returns true if + * SPTEs have been cleared and a TLB flush is needed before releasing the + * MMU lock. 
+ */ +bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) +{ + struct kvm_mmu_page *root; + bool flush = false; + + for_each_tdp_mmu_root(kvm, root) { + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + flush |= zap_gfn_range(kvm, root, start, end); + + kvm_mmu_put_root(kvm, root); + } + + return flush; +} + +void kvm_tdp_mmu_zap_all(struct kvm *kvm) +{ + gfn_t max_gfn = 1ULL << (boot_cpu_data.x86_phys_bits - PAGE_SHIFT); + bool flush; + + flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn); + if (flush) + kvm_flush_remote_tlbs(kvm); +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index ac0ef9129442..6de2d007fc03 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -12,4 +12,6 @@ bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root); hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); +bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); +void kvm_tdp_mmu_zap_all(struct kvm *kvm); #endif /* __KVM_X86_MMU_TDP_MMU_H */ From 7d94531249a54b822f1a8b20d8a8f8d59ad1d985 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:49 -0700 Subject: [PATCH 452/497] kvm: x86/mmu: Remove disallowed_hugepage_adjust shadow_walk_iterator arg In order to avoid creating executable hugepages in the TDP MMU PF handler, remove the dependency between disallowed_hugepage_adjust and the shadow_walk_iterator. This will open the function up to being used by the TDP MMU PF handler in a future patch. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-10-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 13 +++++++------ arch/x86/kvm/mmu/paging_tmpl.h | 3 ++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index dd15e519c361..6f22c155381d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2818,13 +2818,12 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, - gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) +static void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *levelp) { int level = *levelp; - u64 spte = *it.sptep; - if (it.level == level && level > PG_LEVEL_4K && + if (cur_level == level && level > PG_LEVEL_4K && is_shadow_present_pte(spte) && !is_large_pte(spte)) { /* @@ -2834,7 +2833,8 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, * patching back for them into pfn the next 9 bits of * the address. */ - u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - + KVM_PAGES_PER_HPAGE(level - 1); *pfnp |= gfn & page_mask; (*levelp)--; } @@ -2867,7 +2867,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, * large page, as the leaf could be executable. 
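For illustration only (the TDP MMU page fault handler is added later in the series), a caller that walks with a tdp_iter instead of a shadow_walk_iterator can now pass its own current SPTE and level directly, along the lines of:

    if (nx_huge_page_workaround_enabled)
        disallowed_hugepage_adjust(iter.old_spte, gfn, iter.level,
                                   &pfn, &level);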
*/ if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(it, gfn, &pfn, &level); + disallowed_hugepage_adjust(*it.sptep, gfn, it.level, + &pfn, &level); base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 9a1a15f19beb..50e268eb8e1a 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -695,7 +695,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * large page, as the leaf could be executable. */ if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(it, gw->gfn, &pfn, &level); + disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level, + &pfn, &level); base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) From 62593011247c8a8cfeb0c86aff84688b196727c2 Mon Sep 17 00:00:00 2001 From: Rohith Surabattula Date: Thu, 8 Oct 2020 09:58:41 +0000 Subject: [PATCH 453/497] SMB3: Resolve data corruption of TCP server info fields TCP server info field server->total_read is modified in parallel by demultiplex thread and decrypt offload worker thread. server->total_read is used in calculation to discard the remaining data of PDU which is not read into memory. Because of parallel modification, server->total_read can get corrupted and can result in discarding the valid data of next PDU. Signed-off-by: Rohith Surabattula Reviewed-by: Aurelien Aptel Reviewed-by: Pavel Shilovsky CC: Stable #5.4+ Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f085fe32c342..2c3cfb2e8e72 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4131,7 +4131,8 @@ smb3_is_transform_hdr(void *buf) static int decrypt_raw_data(struct TCP_Server_Info *server, char *buf, unsigned int buf_data_size, struct page **pages, - unsigned int npages, unsigned int page_data_size) + unsigned int npages, unsigned int page_data_size, + bool is_offloaded) { struct kvec iov[2]; struct smb_rqst rqst = {NULL}; @@ -4157,7 +4158,8 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, memmove(buf, iov[1].iov_base, buf_data_size); - server->total_read = buf_data_size + page_data_size; + if (!is_offloaded) + server->total_read = buf_data_size + page_data_size; return rc; } @@ -4370,7 +4372,7 @@ static void smb2_decrypt_offload(struct work_struct *work) struct mid_q_entry *mid; rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size, - dw->ppages, dw->npages, dw->len); + dw->ppages, dw->npages, dw->len, true); if (rc) { cifs_dbg(VFS, "error decrypting rc=%d\n", rc); goto free_pages; @@ -4476,7 +4478,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, non_offloaded_decrypt: rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size, - pages, npages, len); + pages, npages, len, false); if (rc) goto free_pages; @@ -4532,7 +4534,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, server->total_read += length; buf_size = pdu_length - sizeof(struct smb2_transform_hdr); - length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0); + length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0, false); if (length) return length; From def6e1dc17816826fac94f6a5ce125fdee3231ae Mon Sep 17 00:00:00 2001 From: Samuel Cabrero Date: Fri, 16 Oct 2020 11:54:55 +0200 Subject: [PATCH 454/497] cifs: Print the address and port we are connecting to in generic_ip_connect() Can be 
helpful in debugging mount and reconnect issues Signed-off-by: Samuel Cabrero Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 1a3b7793095e..d096cfda56eb 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3902,13 +3902,21 @@ generic_ip_connect(struct TCP_Server_Info *server) saddr = (struct sockaddr *) &server->dstaddr; if (server->dstaddr.ss_family == AF_INET6) { - sport = ((struct sockaddr_in6 *) saddr)->sin6_port; + struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&server->dstaddr; + + sport = ipv6->sin6_port; slen = sizeof(struct sockaddr_in6); sfamily = AF_INET6; + cifs_dbg(FYI, "%s: connecting to [%pI6]:%d\n", __func__, &ipv6->sin6_addr, + ntohs(sport)); } else { - sport = ((struct sockaddr_in *) saddr)->sin_port; + struct sockaddr_in *ipv4 = (struct sockaddr_in *)&server->dstaddr; + + sport = ipv4->sin_port; slen = sizeof(struct sockaddr_in); sfamily = AF_INET; + cifs_dbg(FYI, "%s: connecting to %pI4:%d\n", __func__, &ipv4->sin_addr, + ntohs(sport)); } if (socket == NULL) { From 3c3317daef0afa0cd541fc9c1bfd6ce8bbf1129a Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 21 Oct 2020 13:12:08 -0500 Subject: [PATCH 455/497] smb3: fix stat when special device file and mounted with modefromsid When mounting with modefromsid mount option, it was possible to get the error on stat of a fifo or char or block device: "cannot stat : Operation not supported" Special devices can be stored as reparse points by some servers (e.g. Windows NFS server and when using the SMB3.1.1 POSIX Extensions) but when the modefromsid mount option is used the client attempts to get the ACL for the file which requires opening with OPEN_REPARSE_POINT create option. Signed-off-by: Steve French CC: Stable Reviewed-by: Ronnie Sahlberg Reviewed-by: Shyam Prasad N --- fs/cifs/smb2ops.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 2c3cfb2e8e72..3cde719ec41b 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3093,7 +3093,12 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, oparms.tcon = tcon; oparms.desired_access = READ_CONTROL; oparms.disposition = FILE_OPEN; - oparms.create_options = cifs_create_options(cifs_sb, 0); + /* + * When querying an ACL, even if the file is a symlink we want to open + * the source not the target, and so the protocol requires that the + * client specify this flag when opening a reparse point + */ + oparms.create_options = cifs_create_options(cifs_sb, 0) | OPEN_REPARSE_POINT; oparms.fid = &fid; oparms.reconnect = false; From 3c6e65e679182d55779ef6f8582f0945af4319b0 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 21 Oct 2020 00:15:42 -0500 Subject: [PATCH 456/497] smb3: do not try to cache root directory if dir leases not supported To servers which do not support directory leases (e.g. Samba) it is wasteful to try to open_shroot (ie attempt to cache the root directory handle). Skip attempt to open_shroot when server does not indicate support for directory leases. Cuts the number of requests on mount from 17 to 15, and cuts the number of requests on stat of the root directory from 4 to 3. 
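Concretely, tcon setup now only honours the nohandlecache mount option when the server advertises directory leasing, and otherwise disables root handle caching outright:

    if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)
        tcon->nohandlecache = volume_info->nohandlecache;
    else
        tcon->nohandlecache = 1;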
Signed-off-by: Steve French CC: Stable # v5.1+ --- fs/cifs/connect.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d096cfda56eb..3a9980bf0d6e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3608,7 +3608,10 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) */ tcon->retry = volume_info->retry; tcon->nocase = volume_info->nocase; - tcon->nohandlecache = volume_info->nohandlecache; + if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) + tcon->nohandlecache = volume_info->nohandlecache; + else + tcon->nohandlecache = 1; tcon->nodelete = volume_info->nodelete; tcon->local_lease = volume_info->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); From 2e76f188fd90d9ac29adbb82c30345f84d04bfa4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 19 Oct 2020 09:28:02 -0700 Subject: [PATCH 457/497] xfs: cancel intents immediately if process_intents fails If processing recovered log intent items fails, we need to cancel all the unprocessed recovered items immediately so that a subsequent AIL push in the bail out path won't get wedged on the pinned intent items that didn't get processed. This can happen if the log contains (1) an intent that gets and releases an inode, (2) an intent that cannot be recovered successfully, and (3) some third intent item. When recovery of (2) fails, we leave (3) pinned in memory. Inode reclamation is called in the error-out path of xfs_mountfs before xfs_log_cancel_mount. Reclamation calls xfs_ail_push_all_sync, which gets stuck waiting for (3). Therefore, call xlog_recover_cancel_intents if _process_intents fails. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_log_recover.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a8289adc1b29..87886b7f77da 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3446,6 +3446,14 @@ xlog_recover_finish( int error; error = xlog_recover_process_intents(log); if (error) { + /* + * Cancel all the unprocessed intent items now so that + * we don't leave them pinned in the AIL. This can + * cause the AIL to livelock on the pinned item if + * anyone tries to push the AIL (inode reclaim does + * this) before we get around to xfs_log_mount_cancel. + */ + xlog_recover_cancel_intents(log); xfs_alert(log->l_mp, "Failed to recover intents"); return error; } From d56b1980d7efe9ef08469e856fc0703d0cef65e4 Mon Sep 17 00:00:00 2001 From: Jay Cornwall Date: Sat, 17 Oct 2020 08:38:43 -0500 Subject: [PATCH 458/497] drm/amdkfd: Use same SQ prefetch setting as amdgpu 0 causes instruction fetch stall at cache line boundary under some conditions on Navi10. A non-zero prefetch is the preferred default in any case. Fixes soft hang in Luxmark. 
Signed-off-by: Jay Cornwall Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c index 72e4d61ac752..ad0593342333 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c @@ -58,8 +58,9 @@ static int update_qpd_v10(struct device_queue_manager *dqm, /* check if sh_mem_config register already configured */ if (qpd->sh_mem_config == 0) { qpd->sh_mem_config = - SH_MEM_ALIGNMENT_MODE_UNALIGNED << - SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; + (SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | + (3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT); #if 0 /* TODO: * This shouldn't be an issue with Navi10. Verify. From 9a2f408f5406df567a3515f4cb5c2ce1bde64501 Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Tue, 20 Oct 2020 16:29:30 +0800 Subject: [PATCH 459/497] drm/amd/pm: fix pcie information for sienna cichlid Fix the function used for sienna cichlid to get correct PCIE information by pp_dpm_pcie. Signed-off-by: Likun Gao Reviewed-by: Hawking Zhang Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.9.x --- drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index ca2abb2e5340..d708b383f83b 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -962,8 +962,8 @@ static int sienna_cichlid_print_clk_levels(struct smu_context *smu, } break; case SMU_PCIE: - gen_speed = smu_v11_0_get_current_pcie_link_speed(smu); - lane_width = smu_v11_0_get_current_pcie_link_width(smu); + gen_speed = smu_v11_0_get_current_pcie_link_speed_level(smu); + lane_width = smu_v11_0_get_current_pcie_link_width_level(smu); for (i = 0; i < NUM_LINK_LEVELS; i++) size += sprintf(buf + size, "%d: %s %s %dMhz %s\n", i, (dpm_context->dpm_tables.pcie_table.pcie_gen[i] == 0) ? "2.5GT/s," : From e4eeceb73cb06b8fa379b94cbba77e6a0a032e43 Mon Sep 17 00:00:00 2001 From: John Clements Date: Wed, 21 Oct 2020 16:20:42 +0800 Subject: [PATCH 460/497] Revert drm/amdgpu: disable sienna chichlid UMC RAS This reverts commit 265c280a4807419249644156654e5c40a235ea84. Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8bf6a7c056bc..4e36551ab50b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1986,7 +1986,8 @@ static int amdgpu_ras_check_asic_type(struct amdgpu_device *adev) { if (adev->asic_type != CHIP_VEGA10 && adev->asic_type != CHIP_VEGA20 && - adev->asic_type != CHIP_ARCTURUS) + adev->asic_type != CHIP_ARCTURUS && + adev->asic_type != CHIP_SIENNA_CICHLID) return 1; else return 0; @@ -2030,7 +2031,6 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, *supported = amdgpu_ras_enable == 0 ? 
0 : *hw_supported & amdgpu_ras_mask; - adev->ras_features = *supported; } From 392d256fa26d943fb0a019fea4be80382780d3b1 Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Wed, 21 Oct 2020 16:15:47 +0800 Subject: [PATCH 461/497] drm/amd/pm: fix pp_dpm_fclk fclk value is missing in pp_dpm_fclk. add this to correctly show the current value. Signed-off-by: Kenneth Feng Reviewed-by: Likun Gao Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.9.x --- drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index d708b383f83b..9ca3d93b1c95 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -455,6 +455,9 @@ static int sienna_cichlid_get_smu_metrics_data(struct smu_context *smu, case METRICS_CURR_DCEFCLK: *value = metrics->CurrClock[PPCLK_DCEFCLK]; break; + case METRICS_CURR_FCLK: + *value = metrics->CurrClock[PPCLK_FCLK]; + break; case METRICS_AVERAGE_GFXCLK: if (metrics->AverageGfxActivity <= SMU_11_0_7_GFX_BUSY_THRESHOLD) *value = metrics->AverageGfxclkFrequencyPostDs; From 0435d77cd9f4613e7c95ca208d252acf6d745c3f Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Wed, 21 Oct 2020 17:30:02 +0800 Subject: [PATCH 462/497] drm/amd/pm: remove the average clock value in sysfs if it's fine-grained clock dpm, remove the average clock value and reflects the real clock. Signed-off-by: Kenneth Feng Reviewed-by: Likun Gao Signed-off-by: Alex Deucher --- .../gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index 9ca3d93b1c95..685a8a3b25d4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -954,12 +954,16 @@ static int sienna_cichlid_print_clk_levels(struct smu_context *smu, freq_values[1] = cur_value; mark_index = cur_value == freq_values[0] ? 0 : cur_value == freq_values[2] ? 2 : 1; - if (mark_index != 1) - freq_values[1] = (freq_values[0] + freq_values[2]) / 2; - for (i = 0; i < 3; i++) { + count = 3; + if (mark_index != 1) { + count = 2; + freq_values[1] = freq_values[2]; + } + + for (i = 0; i < count; i++) { size += sprintf(buf + size, "%d: %uMhz %s\n", i, freq_values[i], - i == mark_index ? "*" : ""); + cur_value == freq_values[i] ? "*" : ""); } } From 687e79c0feb4243b141b1e9a20adba3c0ec66f7f Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Thu, 22 Oct 2020 00:50:07 +0800 Subject: [PATCH 463/497] drm/amdgpu: correct the cu and rb info for sienna cichlid Skip disabled sa to correct the cu_info and active_rbs for sienna cichlid. 
Signed-off-by: Likun Gao Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.9.x --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 1c98b248a7fb..56fdbe626d30 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -4582,12 +4582,17 @@ static void gfx_v10_0_setup_rb(struct amdgpu_device *adev) int i, j; u32 data; u32 active_rbs = 0; + u32 bitmap; u32 rb_bitmap_width_per_sh = adev->gfx.config.max_backends_per_se / adev->gfx.config.max_sh_per_se; mutex_lock(&adev->grbm_idx_mutex); for (i = 0; i < adev->gfx.config.max_shader_engines; i++) { for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) { + bitmap = i * adev->gfx.config.max_sh_per_se + j; + if ((adev->asic_type == CHIP_SIENNA_CICHLID) && + ((gfx_v10_3_get_disabled_sa(adev) >> bitmap) & 1)) + continue; gfx_v10_0_select_se_sh(adev, i, j, 0xffffffff); data = gfx_v10_0_get_rb_active_bitmap(adev); active_rbs |= data << ((i * adev->gfx.config.max_sh_per_se + j) * @@ -8812,6 +8817,10 @@ static int gfx_v10_0_get_cu_info(struct amdgpu_device *adev, mutex_lock(&adev->grbm_idx_mutex); for (i = 0; i < adev->gfx.config.max_shader_engines; i++) { for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) { + bitmap = i * adev->gfx.config.max_sh_per_se + j; + if ((adev->asic_type == CHIP_SIENNA_CICHLID) && + ((gfx_v10_3_get_disabled_sa(adev) >> bitmap) & 1)) + continue; mask = 1; ao_bitmap = 0; counter = 0; From 033e4040d453f1f7111e5957a54f3019eb089cc6 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Thu, 22 Oct 2020 11:02:21 +0800 Subject: [PATCH 464/497] ALSA: hda - Fix the return value if cb func is already registered If the cb function is already registered, should return the pointer of the structure hda_jack_callback which contains this cb func, but instead it returns the NULL. Now fix it by replacing func_is_already_in_callback_list() with find_callback_from_list(). 
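The resulting registration flow, slightly simplified, is:

    callback = find_callback_from_list(jack, func);
    if (func && !callback) {
        callback = kzalloc(sizeof(*callback), GFP_KERNEL);
        if (!callback)
            return ERR_PTR(-ENOMEM);
        /* ... fill in the new entry and link it into jack->callback ... */
    }

so the caller always gets back the callback structure that holds func, whether it was just allocated or had been registered earlier.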
Fixes: f4794c6064a8 ("ALSA: hda - Don't register a cb func if it is registered already") Reported-and-suggested-by: Dan Carpenter Cc: Signed-off-by: Hui Wang Link: https://lore.kernel.org/r/20201022030221.22393-1-hui.wang@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_jack.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/sound/pci/hda/hda_jack.c b/sound/pci/hda/hda_jack.c index ded4813f8b54..588059428d8f 100644 --- a/sound/pci/hda/hda_jack.c +++ b/sound/pci/hda/hda_jack.c @@ -275,16 +275,21 @@ int snd_hda_jack_detect_state_mst(struct hda_codec *codec, } EXPORT_SYMBOL_GPL(snd_hda_jack_detect_state_mst); -static bool func_is_already_in_callback_list(struct hda_jack_tbl *jack, - hda_jack_callback_fn func) +static struct hda_jack_callback * +find_callback_from_list(struct hda_jack_tbl *jack, + hda_jack_callback_fn func) { struct hda_jack_callback *cb; + if (!func) + return NULL; + for (cb = jack->callback; cb; cb = cb->next) { if (cb->func == func) - return true; + return cb; } - return false; + + return NULL; } /** @@ -309,7 +314,10 @@ snd_hda_jack_detect_enable_callback_mst(struct hda_codec *codec, hda_nid_t nid, jack = snd_hda_jack_tbl_new(codec, nid, dev_id); if (!jack) return ERR_PTR(-ENOMEM); - if (func && !func_is_already_in_callback_list(jack, func)) { + + callback = find_callback_from_list(jack, func); + + if (func && !callback) { callback = kzalloc(sizeof(*callback), GFP_KERNEL); if (!callback) return ERR_PTR(-ENOMEM); From c77761c8a59405cb7aa44188b30fffe13fbdd02d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 21 Oct 2020 12:55:52 +0200 Subject: [PATCH 465/497] netfilter: nf_fwd_netdev: clear timestamp in forwarding path Similar to 7980d2eabde8 ("ipvs: clear skb->tstamp in forwarding path"). fq qdisc requires tstamp to be cleared in forwarding path. Fixes: 8203e2d844d3 ("net: clear skb->tstamp in forwarding paths") Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_dup_netdev.c | 1 + net/netfilter/nft_fwd_netdev.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index 2b01a151eaa8..a579e59ee5c5 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -19,6 +19,7 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev) skb_push(skb, skb->mac_len); skb->dev = dev; + skb->tstamp = 0; dev_queue_xmit(skb); } diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 3087e23297db..b77985986b24 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -138,6 +138,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, return; skb->dev = dev; + skb->tstamp = 0; neigh_xmit(neigh_table, dev, addr, skb); out: regs->verdict.code = verdict; From 0a1754b2a97efa644aa6e84d1db5b17c42251483 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Mon, 19 Oct 2020 22:22:42 +0800 Subject: [PATCH 466/497] ring-buffer: Return 0 on success from ring_buffer_resize() We don't need to check the new buffer size, and the return value had confused resize_buffer_duplicate_size(). ... ret = ring_buffer_resize(trace_buf->buffer, per_cpu_ptr(size_buf->data,cpu_id)->entries, cpu_id); if (ret == 0) per_cpu_ptr(trace_buf->data, cpu_id)->entries = per_cpu_ptr(size_buf->data, cpu_id)->entries; ... 
Link: https://lkml.kernel.org/r/20201019142242.11560-1-hqjagain@gmail.com Cc: stable@vger.kernel.org Fixes: d60da506cbeb3 ("tracing: Add a resize function to make one buffer equivalent to another buffer") Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 15bf28b13e50..5c6a9c6a058f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1952,18 +1952,18 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, { struct ring_buffer_per_cpu *cpu_buffer; unsigned long nr_pages; - int cpu, err = 0; + int cpu, err; /* * Always succeed at resizing a non-existent buffer: */ if (!buffer) - return size; + return 0; /* Make sure the requested buffer exists */ if (cpu_id != RING_BUFFER_ALL_CPUS && !cpumask_test_cpu(cpu_id, buffer->cpumask)) - return size; + return 0; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); @@ -2119,7 +2119,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } mutex_unlock(&buffer->mutex); - return size; + return 0; out_err: for_each_buffer_cpu(buffer, cpu) { From e1981f75d398c0afe83c8ffa4e5864f037967409 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sat, 17 Oct 2020 17:52:46 +0800 Subject: [PATCH 467/497] ring-buffer: Update the description for ring_buffer_wait The function changed at some point, but the description was not updated. Link: https://lkml.kernel.org/r/20201017095246.5170-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5c6a9c6a058f..7f45fd9d5a45 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -793,7 +793,7 @@ static void rb_wake_up_waiters(struct irq_work *work) * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on - * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS + * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise From a7305e684fcfb33029fe3d0af6b7d8dc4c8ca7a1 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 6 Oct 2020 18:05:13 +0200 Subject: [PATCH 468/497] PM: AVS: qcom-cpr: Move the driver to the qcom specific drivers The avs drivers are all SoC specific drivers that doesn't share any code. Instead they are located in a directory, mostly to keep similar functionality together. From a maintenance point of view, it makes better sense to collect SoC specific drivers like these, into the SoC specific directories. Therefore, let's move the qcom-cpr driver to the qcom directory. Signed-off-by: Ulf Hansson Acked-by: Bjorn Andersson Acked-by: Niklas Cassel Signed-off-by: Rafael J. 
Wysocki --- MAINTAINERS | 2 +- drivers/power/avs/Kconfig | 16 ---------------- drivers/power/avs/Makefile | 1 - drivers/soc/qcom/Kconfig | 16 ++++++++++++++++ drivers/soc/qcom/Makefile | 1 + drivers/{power/avs/qcom-cpr.c => soc/qcom/cpr.c} | 0 6 files changed, 18 insertions(+), 18 deletions(-) rename drivers/{power/avs/qcom-cpr.c => soc/qcom/cpr.c} (100%) diff --git a/MAINTAINERS b/MAINTAINERS index 6c17687faa52..d08ff56e35b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14354,7 +14354,7 @@ L: linux-pm@vger.kernel.org L: linux-arm-msm@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/power/avs/qcom,cpr.txt -F: drivers/power/avs/qcom-cpr.c +F: drivers/soc/qcom/cpr.c QUALCOMM CPUFREQ DRIVER MSM8996/APQ8096 M: Ilia Lin diff --git a/drivers/power/avs/Kconfig b/drivers/power/avs/Kconfig index d789509ae7e9..a4e40e534e6a 100644 --- a/drivers/power/avs/Kconfig +++ b/drivers/power/avs/Kconfig @@ -1,17 +1 @@ # SPDX-License-Identifier: GPL-2.0-only - -config QCOM_CPR - tristate "QCOM Core Power Reduction (CPR) support" - depends on POWER_AVS && HAS_IOMEM - select PM_OPP - select REGMAP - help - Say Y here to enable support for the CPR hardware found on Qualcomm - SoCs like QCS404. - - This driver populates CPU OPPs tables and makes adjustments to the - tables based on feedback from the CPR hardware. If you want to do - CPUfrequency scaling say Y here. - - To compile this driver as a module, choose M here: the module will - be called qcom-cpr diff --git a/drivers/power/avs/Makefile b/drivers/power/avs/Makefile index 735832f47214..a4e40e534e6a 100644 --- a/drivers/power/avs/Makefile +++ b/drivers/power/avs/Makefile @@ -1,2 +1 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_QCOM_CPR) += qcom-cpr.o diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig index 3dc3e3d61ea3..6a3b69b43ad5 100644 --- a/drivers/soc/qcom/Kconfig +++ b/drivers/soc/qcom/Kconfig @@ -26,6 +26,22 @@ config QCOM_COMMAND_DB resource on a RPM-hardened platform must use this database to get SoC specific identifier and information for the shared resources. +config QCOM_CPR + tristate "QCOM Core Power Reduction (CPR) support" + depends on ARCH_QCOM && HAS_IOMEM + select PM_OPP + select REGMAP + help + Say Y here to enable support for the CPR hardware found on Qualcomm + SoCs like QCS404. + + This driver populates CPU OPPs tables and makes adjustments to the + tables based on feedback from the CPR hardware. If you want to do + CPUfrequency scaling say Y here. 
+ + To compile this driver as a module, choose M here: the module will + be called qcom-cpr + config QCOM_GENI_SE tristate "QCOM GENI Serial Engine Driver" depends on ARCH_QCOM || COMPILE_TEST diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile index 93392d9dc7f7..ad675a6593d0 100644 --- a/drivers/soc/qcom/Makefile +++ b/drivers/soc/qcom/Makefile @@ -3,6 +3,7 @@ CFLAGS_rpmh-rsc.o := -I$(src) obj-$(CONFIG_QCOM_AOSS_QMP) += qcom_aoss.o obj-$(CONFIG_QCOM_GENI_SE) += qcom-geni-se.o obj-$(CONFIG_QCOM_COMMAND_DB) += cmd-db.o +obj-$(CONFIG_QCOM_CPR) += cpr.o obj-$(CONFIG_QCOM_GSBI) += qcom_gsbi.o obj-$(CONFIG_QCOM_MDT_LOADER) += mdt_loader.o obj-$(CONFIG_QCOM_OCMEM) += ocmem.o diff --git a/drivers/power/avs/qcom-cpr.c b/drivers/soc/qcom/cpr.c similarity index 100% rename from drivers/power/avs/qcom-cpr.c rename to drivers/soc/qcom/cpr.c From 785b5bb41b0a9b1d9173192dcdebe6e994d1f71a Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 6 Oct 2020 18:05:16 +0200 Subject: [PATCH 469/497] PM: AVS: Drop the avs directory and the corresponding Kconfig All avs drivers have now been moved to their corresponding soc specific directories. Additionally, they don't depend on the POWER_AVS Kconfig anymore. Therefore, let's simply drop the drivers/power/avs directory altogether. Cc: Nishanth Menon Signed-off-by: Ulf Hansson Reviewed-by: Nishanth Menon Signed-off-by: Rafael J. Wysocki --- drivers/power/Kconfig | 1 - drivers/power/Makefile | 1 - drivers/power/avs/Kconfig | 1 - drivers/power/avs/Makefile | 1 - 4 files changed, 4 deletions(-) delete mode 100644 drivers/power/avs/Kconfig delete mode 100644 drivers/power/avs/Makefile diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index ff0350ca3b74..696bf77a7042 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -source "drivers/power/avs/Kconfig" source "drivers/power/reset/Kconfig" source "drivers/power/supply/Kconfig" diff --git a/drivers/power/Makefile b/drivers/power/Makefile index b7c2e372186b..effbf0377f32 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_POWER_AVS) += avs/ obj-$(CONFIG_POWER_RESET) += reset/ obj-$(CONFIG_POWER_SUPPLY) += supply/ diff --git a/drivers/power/avs/Kconfig b/drivers/power/avs/Kconfig deleted file mode 100644 index a4e40e534e6a..000000000000 --- a/drivers/power/avs/Kconfig +++ /dev/null @@ -1 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only diff --git a/drivers/power/avs/Makefile b/drivers/power/avs/Makefile deleted file mode 100644 index a4e40e534e6a..000000000000 --- a/drivers/power/avs/Makefile +++ /dev/null @@ -1 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only From d298787dbbab5f7ada97c292e19c5c6e55fda6cd Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 19 Oct 2020 13:03:30 -0700 Subject: [PATCH 470/497] PM: sleep: remove unreachable break A break following a return statement is pointless, so drop it. Signed-off-by: Tom Rix [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 205a06752ca9..c7ac49042cee 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -363,7 +363,6 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw; - break; case PM_EVENT_RESTORE: return ops->restore; #endif /* CONFIG_HIBERNATE_CALLBACKS */ From abcba2e135ec45a54580c80e5e14bbc2911ba231 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 19 Oct 2020 13:04:53 -0700 Subject: [PATCH 471/497] ACPI: utils: remove unreachable breaks A break following a return statement is pointless, so drop all of the breaks following return statements from this file. Signed-off-by: Tom Rix [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/utils.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 838b719ec7ce..d5411a166685 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -104,7 +104,6 @@ acpi_extract_package(union acpi_object *package, " [%c]\n", i, format_string[i]); return AE_BAD_DATA; - break; } break; @@ -129,7 +128,6 @@ acpi_extract_package(union acpi_object *package, " expecting [%c]\n", i, format_string[i]); return AE_BAD_DATA; - break; } break; case ACPI_TYPE_LOCAL_REFERENCE: @@ -144,7 +142,6 @@ acpi_extract_package(union acpi_object *package, " expecting [%c]\n", i, format_string[i]); return AE_BAD_DATA; - break; } break; @@ -155,7 +152,6 @@ acpi_extract_package(union acpi_object *package, i)); /* TBD: handle nested packages... */ return AE_SUPPORT; - break; } } From a6a9cffad0a28a4a7a3a91b6ee13dd1baae4dfcb Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 21 Oct 2020 10:22:33 +1000 Subject: [PATCH 472/497] cifs: add files to host new mount api This will make it easier in the future, but also will allow us to shrink connect.c which is getting too big, and harder to read Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/Makefile | 2 +- fs/cifs/fs_context.c | 8 ++++++++ fs/cifs/fs_context.h | 15 +++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 fs/cifs/fs_context.c create mode 100644 fs/cifs/fs_context.h diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 51bae9340842..cd17d0e50f2a 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -10,7 +10,7 @@ cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ - smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o + smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o cifs-$(CONFIG_CIFS_XATTR) += xattr.o diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c new file mode 100644 index 000000000000..f50dfc2d2e44 --- /dev/null +++ b/fs/cifs/fs_context.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020, Microsoft Corporation. + * + * Author(s): Steve French + * David Howells + */ + diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h new file mode 100644 index 000000000000..082f286f923e --- /dev/null +++ b/fs/cifs/fs_context.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020, Microsoft Corporation. 
+ * + * Author(s): Steve French + * David Howells + */ + +#ifndef _FS_CONTEXT_H +#define _FS_CONTEXT_H + + + +#endif + From 5c6e5aa496804451fc94d00a7cf9be2e3051ae29 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 21 Oct 2020 10:37:11 +1000 Subject: [PATCH 473/497] cifs: move security mount options into fs_context.ch This patch moves the parsing of security mount options into fs_context.ch. There are no changes to any logic. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/connect.c | 85 +------------------------------------------- fs/cifs/fs_context.c | 76 +++++++++++++++++++++++++++++++++++++++ fs/cifs/fs_context.h | 20 ++++++++++- 3 files changed, 96 insertions(+), 85 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 3a9980bf0d6e..c2a92e26fef4 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -61,6 +61,7 @@ #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif +#include "fs_context.h" extern mempool_t *cifs_req_poolp; extern bool disable_legacy_dialects; @@ -279,33 +280,6 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_err, NULL } }; -enum { - Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p, - Opt_sec_ntlmsspi, Opt_sec_ntlmssp, - Opt_ntlm, Opt_sec_ntlmi, Opt_sec_ntlmv2, - Opt_sec_ntlmv2i, Opt_sec_lanman, - Opt_sec_none, - - Opt_sec_err -}; - -static const match_table_t cifs_secflavor_tokens = { - { Opt_sec_krb5, "krb5" }, - { Opt_sec_krb5i, "krb5i" }, - { Opt_sec_krb5p, "krb5p" }, - { Opt_sec_ntlmsspi, "ntlmsspi" }, - { Opt_sec_ntlmssp, "ntlmssp" }, - { Opt_ntlm, "ntlm" }, - { Opt_sec_ntlmi, "ntlmi" }, - { Opt_sec_ntlmv2, "nontlm" }, - { Opt_sec_ntlmv2, "ntlmv2" }, - { Opt_sec_ntlmv2i, "ntlmv2i" }, - { Opt_sec_lanman, "lanman" }, - { Opt_sec_none, "none" }, - - { Opt_sec_err, NULL } -}; - /* cache flavors */ enum { Opt_cache_loose, @@ -1372,63 +1346,6 @@ static int get_option_gid(substring_t args[], kgid_t *result) return 0; } -static int cifs_parse_security_flavors(char *value, - struct smb_vol *vol) -{ - - substring_t args[MAX_OPT_ARGS]; - - /* - * With mount options, the last one should win. Reset any existing - * settings back to default. 
- */ - vol->sectype = Unspecified; - vol->sign = false; - - switch (match_token(value, cifs_secflavor_tokens, args)) { - case Opt_sec_krb5p: - cifs_dbg(VFS, "sec=krb5p is not supported!\n"); - return 1; - case Opt_sec_krb5i: - vol->sign = true; - fallthrough; - case Opt_sec_krb5: - vol->sectype = Kerberos; - break; - case Opt_sec_ntlmsspi: - vol->sign = true; - fallthrough; - case Opt_sec_ntlmssp: - vol->sectype = RawNTLMSSP; - break; - case Opt_sec_ntlmi: - vol->sign = true; - fallthrough; - case Opt_ntlm: - vol->sectype = NTLM; - break; - case Opt_sec_ntlmv2i: - vol->sign = true; - fallthrough; - case Opt_sec_ntlmv2: - vol->sectype = NTLMv2; - break; -#ifdef CONFIG_CIFS_WEAK_PW_HASH - case Opt_sec_lanman: - vol->sectype = LANMAN; - break; -#endif - case Opt_sec_none: - vol->nullauth = 1; - break; - default: - cifs_dbg(VFS, "bad security option: %s\n", value); - return 1; - } - - return 0; -} - static int cifs_parse_cache_flavor(char *value, struct smb_vol *vol) { diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index f50dfc2d2e44..dd9da734e346 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -6,3 +6,79 @@ * David Howells */ +#include "cifsglob.h" +#include "cifs_debug.h" +#include "fs_context.h" + +static const match_table_t cifs_secflavor_tokens = { + { Opt_sec_krb5, "krb5" }, + { Opt_sec_krb5i, "krb5i" }, + { Opt_sec_krb5p, "krb5p" }, + { Opt_sec_ntlmsspi, "ntlmsspi" }, + { Opt_sec_ntlmssp, "ntlmssp" }, + { Opt_ntlm, "ntlm" }, + { Opt_sec_ntlmi, "ntlmi" }, + { Opt_sec_ntlmv2, "nontlm" }, + { Opt_sec_ntlmv2, "ntlmv2" }, + { Opt_sec_ntlmv2i, "ntlmv2i" }, + { Opt_sec_lanman, "lanman" }, + { Opt_sec_none, "none" }, + + { Opt_sec_err, NULL } +}; + +int cifs_parse_security_flavors(char *value, struct smb_vol *vol) +{ + + substring_t args[MAX_OPT_ARGS]; + + /* + * With mount options, the last one should win. Reset any existing + * settings back to default. 
+ */ + vol->sectype = Unspecified; + vol->sign = false; + + switch (match_token(value, cifs_secflavor_tokens, args)) { + case Opt_sec_krb5p: + cifs_dbg(VFS, "sec=krb5p is not supported!\n"); + return 1; + case Opt_sec_krb5i: + vol->sign = true; + fallthrough; + case Opt_sec_krb5: + vol->sectype = Kerberos; + break; + case Opt_sec_ntlmsspi: + vol->sign = true; + fallthrough; + case Opt_sec_ntlmssp: + vol->sectype = RawNTLMSSP; + break; + case Opt_sec_ntlmi: + vol->sign = true; + fallthrough; + case Opt_ntlm: + vol->sectype = NTLM; + break; + case Opt_sec_ntlmv2i: + vol->sign = true; + fallthrough; + case Opt_sec_ntlmv2: + vol->sectype = NTLMv2; + break; +#ifdef CONFIG_CIFS_WEAK_PW_HASH + case Opt_sec_lanman: + vol->sectype = LANMAN; + break; +#endif + case Opt_sec_none: + vol->nullauth = 1; + break; + default: + cifs_dbg(VFS, "bad security option: %s\n", value); + return 1; + } + + return 0; +} diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index 082f286f923e..87747ef7a2d3 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -9,7 +9,25 @@ #ifndef _FS_CONTEXT_H #define _FS_CONTEXT_H +#include +#include "cifsglob.h" +enum cifs_sec_param { + Opt_sec_krb5, + Opt_sec_krb5i, + Opt_sec_krb5p, + Opt_sec_ntlmsspi, + Opt_sec_ntlmssp, + Opt_ntlm, + Opt_sec_ntlmi, + Opt_sec_ntlmv2, + Opt_sec_ntlmv2i, + Opt_sec_lanman, + Opt_sec_none, + + Opt_sec_err +}; + +int cifs_parse_security_flavors(char *value, struct smb_vol *vol); #endif - From 2f20f076865daed006459b39ba78c2fc23b5c8b4 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 21 Oct 2020 11:30:35 +1000 Subject: [PATCH 474/497] cifs: move cache mount options to fs_context.ch Helps to shrink connect.c and make it more readable by moving mount related code to fs_context.c and fs_context.h Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/connect.c | 62 -------------------------------------------- fs/cifs/fs_context.c | 52 +++++++++++++++++++++++++++++++++++++ fs/cifs/fs_context.h | 11 ++++++++ 3 files changed, 63 insertions(+), 62 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index c2a92e26fef4..f7ef38a9dafa 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -280,25 +280,6 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_err, NULL } }; -/* cache flavors */ -enum { - Opt_cache_loose, - Opt_cache_strict, - Opt_cache_none, - Opt_cache_ro, - Opt_cache_rw, - Opt_cache_err -}; - -static const match_table_t cifs_cacheflavor_tokens = { - { Opt_cache_loose, "loose" }, - { Opt_cache_strict, "strict" }, - { Opt_cache_none, "none" }, - { Opt_cache_ro, "ro" }, - { Opt_cache_rw, "singleclient" }, - { Opt_cache_err, NULL } -}; - static const match_table_t cifs_smb_version_tokens = { { Smb_1, SMB1_VERSION_STRING }, { Smb_20, SMB20_VERSION_STRING}, @@ -1346,49 +1327,6 @@ static int get_option_gid(substring_t args[], kgid_t *result) return 0; } -static int -cifs_parse_cache_flavor(char *value, struct smb_vol *vol) -{ - substring_t args[MAX_OPT_ARGS]; - - switch (match_token(value, cifs_cacheflavor_tokens, args)) { - case Opt_cache_loose: - vol->direct_io = false; - vol->strict_io = false; - vol->cache_ro = false; - vol->cache_rw = false; - break; - case Opt_cache_strict: - vol->direct_io = false; - vol->strict_io = true; - vol->cache_ro = false; - vol->cache_rw = false; - break; - case Opt_cache_none: - vol->direct_io = true; - vol->strict_io = false; - vol->cache_ro = false; - vol->cache_rw = false; - break; - case Opt_cache_ro: - vol->direct_io = false; - 
vol->strict_io = false; - vol->cache_ro = true; - vol->cache_rw = false; - break; - case Opt_cache_rw: - vol->direct_io = false; - vol->strict_io = false; - vol->cache_ro = false; - vol->cache_rw = true; - break; - default: - cifs_dbg(VFS, "bad cache= option: %s\n", value); - return 1; - } - return 0; -} - static int cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3) { diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index dd9da734e346..deb1168ed8af 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -82,3 +82,55 @@ int cifs_parse_security_flavors(char *value, struct smb_vol *vol) return 0; } + +static const match_table_t cifs_cacheflavor_tokens = { + { Opt_cache_loose, "loose" }, + { Opt_cache_strict, "strict" }, + { Opt_cache_none, "none" }, + { Opt_cache_ro, "ro" }, + { Opt_cache_rw, "singleclient" }, + { Opt_cache_err, NULL } +}; + +int +cifs_parse_cache_flavor(char *value, struct smb_vol *vol) +{ + substring_t args[MAX_OPT_ARGS]; + + switch (match_token(value, cifs_cacheflavor_tokens, args)) { + case Opt_cache_loose: + vol->direct_io = false; + vol->strict_io = false; + vol->cache_ro = false; + vol->cache_rw = false; + break; + case Opt_cache_strict: + vol->direct_io = false; + vol->strict_io = true; + vol->cache_ro = false; + vol->cache_rw = false; + break; + case Opt_cache_none: + vol->direct_io = true; + vol->strict_io = false; + vol->cache_ro = false; + vol->cache_rw = false; + break; + case Opt_cache_ro: + vol->direct_io = false; + vol->strict_io = false; + vol->cache_ro = true; + vol->cache_rw = false; + break; + case Opt_cache_rw: + vol->direct_io = false; + vol->strict_io = false; + vol->cache_ro = false; + vol->cache_rw = true; + break; + default: + cifs_dbg(VFS, "bad cache= option: %s\n", value); + return 1; + } + return 0; +} diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index 87747ef7a2d3..3e3f6e29e787 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -12,6 +12,17 @@ #include #include "cifsglob.h" +enum { + Opt_cache_loose, + Opt_cache_strict, + Opt_cache_none, + Opt_cache_ro, + Opt_cache_rw, + Opt_cache_err +}; + +int cifs_parse_cache_flavor(char *value, struct smb_vol *vol); + enum cifs_sec_param { Opt_sec_krb5, Opt_sec_krb5i, From 555782aa556af869d4f390996607abd356513ba4 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 21 Oct 2020 12:10:44 +1000 Subject: [PATCH 475/497] cifs: move smb version mount options into fs_context.c This and related patches which move mount related code to fs_context.c has the advantage of shriking the code in fs/cifs/connect.c (which had the second most lines of code of any of the files in cifs.ko and was getting harder to read due to its size) and will also make it easier to switch over to the new mount API in the future. 
Signed-off-by: Ronnie Sahlberg Reviewed-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 12 ------- fs/cifs/connect.c | 85 -------------------------------------------- fs/cifs/fs_context.c | 85 ++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/fs_context.h | 14 ++++++++ 4 files changed, 99 insertions(+), 97 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index a1a1a16acb38..b6925aeeb621 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -195,18 +195,6 @@ struct smb_rqst { unsigned int rq_tailsz; /* length of last page */ }; -enum smb_version { - Smb_1 = 1, - Smb_20, - Smb_21, - Smb_30, - Smb_302, - Smb_311, - Smb_3any, - Smb_default, - Smb_version_err -}; - struct mid_q_entry; struct TCP_Server_Info; struct cifsFileInfo; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f7ef38a9dafa..c38156f324dd 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -280,20 +280,6 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_err, NULL } }; -static const match_table_t cifs_smb_version_tokens = { - { Smb_1, SMB1_VERSION_STRING }, - { Smb_20, SMB20_VERSION_STRING}, - { Smb_21, SMB21_VERSION_STRING }, - { Smb_30, SMB30_VERSION_STRING }, - { Smb_302, SMB302_VERSION_STRING }, - { Smb_302, ALT_SMB302_VERSION_STRING }, - { Smb_311, SMB311_VERSION_STRING }, - { Smb_311, ALT_SMB311_VERSION_STRING }, - { Smb_3any, SMB3ANY_VERSION_STRING }, - { Smb_default, SMBDEFAULT_VERSION_STRING }, - { Smb_version_err, NULL } -}; - static int ip_connect(struct TCP_Server_Info *server); static int generic_ip_connect(struct TCP_Server_Info *server); static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); @@ -1327,77 +1313,6 @@ static int get_option_gid(substring_t args[], kgid_t *result) return 0; } -static int -cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3) -{ - substring_t args[MAX_OPT_ARGS]; - - switch (match_token(value, cifs_smb_version_tokens, args)) { -#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY - case Smb_1: - if (disable_legacy_dialects) { - cifs_dbg(VFS, "mount with legacy dialect disabled\n"); - return 1; - } - if (is_smb3) { - cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); - return 1; - } - cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); - vol->ops = &smb1_operations; - vol->vals = &smb1_values; - break; - case Smb_20: - if (disable_legacy_dialects) { - cifs_dbg(VFS, "mount with legacy dialect disabled\n"); - return 1; - } - if (is_smb3) { - cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n"); - return 1; - } - vol->ops = &smb20_operations; - vol->vals = &smb20_values; - break; -#else - case Smb_1: - cifs_dbg(VFS, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); - return 1; - case Smb_20: - cifs_dbg(VFS, "vers=2.0 mount not permitted when legacy dialects disabled\n"); - return 1; -#endif /* CIFS_ALLOW_INSECURE_LEGACY */ - case Smb_21: - vol->ops = &smb21_operations; - vol->vals = &smb21_values; - break; - case Smb_30: - vol->ops = &smb30_operations; - vol->vals = &smb30_values; - break; - case Smb_302: - vol->ops = &smb30_operations; /* currently identical with 3.0 */ - vol->vals = &smb302_values; - break; - case Smb_311: - vol->ops = &smb311_operations; - vol->vals = &smb311_values; - break; - case Smb_3any: - vol->ops = &smb30_operations; /* currently identical with 3.0 */ - vol->vals = &smb3any_values; - break; - case Smb_default: - vol->ops = 
&smb30_operations; /* currently identical with 3.0 */ - vol->vals = &smbdefault_values; - break; - default: - cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); - return 1; - } - return 0; -} - /* * Parse a devname into substrings and populate the vol->UNC and vol->prepath * fields with the result. Returns 0 on success and an error otherwise. diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index deb1168ed8af..ad6c2fed4055 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -10,6 +10,91 @@ #include "cifs_debug.h" #include "fs_context.h" +static const match_table_t cifs_smb_version_tokens = { + { Smb_1, SMB1_VERSION_STRING }, + { Smb_20, SMB20_VERSION_STRING}, + { Smb_21, SMB21_VERSION_STRING }, + { Smb_30, SMB30_VERSION_STRING }, + { Smb_302, SMB302_VERSION_STRING }, + { Smb_302, ALT_SMB302_VERSION_STRING }, + { Smb_311, SMB311_VERSION_STRING }, + { Smb_311, ALT_SMB311_VERSION_STRING }, + { Smb_3any, SMB3ANY_VERSION_STRING }, + { Smb_default, SMBDEFAULT_VERSION_STRING }, + { Smb_version_err, NULL } +}; + +int +cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3) +{ + substring_t args[MAX_OPT_ARGS]; + + switch (match_token(value, cifs_smb_version_tokens, args)) { +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY + case Smb_1: + if (disable_legacy_dialects) { + cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + return 1; + } + if (is_smb3) { + cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); + return 1; + } + cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); + vol->ops = &smb1_operations; + vol->vals = &smb1_values; + break; + case Smb_20: + if (disable_legacy_dialects) { + cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + return 1; + } + if (is_smb3) { + cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n"); + return 1; + } + vol->ops = &smb20_operations; + vol->vals = &smb20_values; + break; +#else + case Smb_1: + cifs_dbg(VFS, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); + return 1; + case Smb_20: + cifs_dbg(VFS, "vers=2.0 mount not permitted when legacy dialects disabled\n"); + return 1; +#endif /* CIFS_ALLOW_INSECURE_LEGACY */ + case Smb_21: + vol->ops = &smb21_operations; + vol->vals = &smb21_values; + break; + case Smb_30: + vol->ops = &smb30_operations; + vol->vals = &smb30_values; + break; + case Smb_302: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smb302_values; + break; + case Smb_311: + vol->ops = &smb311_operations; + vol->vals = &smb311_values; + break; + case Smb_3any: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smb3any_values; + break; + case Smb_default: + vol->ops = &smb30_operations; /* currently identical with 3.0 */ + vol->vals = &smbdefault_values; + break; + default: + cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); + return 1; + } + return 0; +} + static const match_table_t cifs_secflavor_tokens = { { Opt_sec_krb5, "krb5" }, { Opt_sec_krb5i, "krb5i" }, diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index 3e3f6e29e787..886208a1b0ef 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -12,6 +12,20 @@ #include #include "cifsglob.h" +enum smb_version { + Smb_1 = 1, + Smb_20, + Smb_21, + Smb_30, + Smb_302, + Smb_311, + Smb_3any, + Smb_default, + Smb_version_err +}; + +int cifs_parse_smb_version(char *value, struct smb_vol *vol, bool is_smb3); + enum { Opt_cache_loose, 
Opt_cache_strict, From 1af34fdd0799fed0b248fa2521ae9e2d69365742 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 21 Oct 2020 20:36:26 -0500 Subject: [PATCH 476/497] smb3.1.1: fix typo in compression flag Fix minor typo in new compression flag define Reported-by: Tom Talpey Signed-off-by: Steve French --- fs/cifs/smb2pdu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 851c6cd4742a..171f54965703 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -154,7 +154,7 @@ struct smb2_compression_transform_hdr { /* See MS-SMB2 2.2.42.1 */ #define SMB2_COMPRESSION_FLAG_NONE 0x0000 -#define SMB2_COMPRESSION_FLAG_CHAINDED 0x0001 +#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001 struct compression_payload_header { __le16 CompressionAlgorithm; From d367cb960ce88914898cbfa43645c2e43ede9465 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 16 Sep 2020 23:18:21 +0300 Subject: [PATCH 477/497] cifs: remove bogus debug code The "end" pointer is either NULL or it points to the next byte to parse. If there isn't a next byte then dereferencing "end" is an off-by-one out of bounds error. And, of course, if it's NULL that leads to an Oops. Printing "*end" doesn't seem very useful so let's delete this code. Also for the last debug statement, I noticed that it should be printing "sequence_end" instead of "end" so fix that as well. Reported-by: Dominik Maier Signed-off-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/asn1.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 689162e2e175..3150c19cdc2f 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -530,8 +530,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n", + cls, con, tag, end); return 0; } @@ -541,8 +541,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 1\n", + cls, con, tag, end); return 0; } @@ -552,8 +552,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n", + cls, con, tag, end); return 0; } @@ -564,8 +564,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d sequence_end = %p exit 1\n", + cls, con, tag, sequence_end); return 0; } From 13909d96c84afd409bf11aa6c8fbcb1efacb12eb Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 21 Oct 2020 23:54:19 -0500 Subject: [PATCH 478/497] SMB3: add support for recognizing WSL reparse tags The IO_REPARSE_TAG_LX_ tags originally were used by WSL but they are preferred by the Linux client in some cases since, unlike the NFS reparse tag (or EAs), they don't require an 
extra query to determine which type of special file they represent. Add support for readdir to recognize special file types of FIFO, SOCKET, CHAR, BLOCK and SYMLINK. This can be tested by creating these special files in WSL Linux and then sharing that location on the Windows server and mounting to the Windows server to access them. Prior to this patch all of the special files would show up as being of type 'file' but with this patch they can be seen with the correct file type as can be seen below: brwxr-xr-x 1 root root 0, 0 Oct 21 17:10 block crwxr-xr-x 1 root root 0, 0 Oct 21 17:46 char drwxr-xr-x 2 root root 0 Oct 21 18:27 dir prwxr-xr-x 1 root root 0 Oct 21 16:21 fifo -rwxr-xr-x 1 root root 0 Oct 21 15:48 file lrwxr-xr-x 1 root root 0 Oct 21 15:52 symlink-to-file TODO: go through all documented reparse tags to see if we can reasonably map some of them to directories vs. files vs. symlinks and also add support for device numbers for block and char devices. Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/readdir.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 5abf1ea21abe..799be3a5d25e 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -168,10 +168,33 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) fattr->cf_uid = cifs_sb->mnt_uid; fattr->cf_gid = cifs_sb->mnt_gid; + /* + * The IO_REPARSE_TAG_LX_ tags originally were used by WSL but they + * are preferred by the Linux client in some cases since, unlike + * the NFS reparse tag (or EAs), they don't require an extra query + * to determine which type of special file they represent. + * TODO: go through all documented reparse tags to see if we can + * reasonably map some of them to directories vs. files vs. symlinks + */ if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; fattr->cf_dtype = DT_DIR; - } else { + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_SYMLINK) { + fattr->cf_mode |= S_IFLNK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_LNK; + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_FIFO) { + fattr->cf_mode |= S_IFIFO | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_FIFO; + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_AF_UNIX) { + fattr->cf_mode |= S_IFSOCK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_SOCK; + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_CHR) { + fattr->cf_mode |= S_IFCHR | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_CHR; + } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_BLK) { + fattr->cf_mode |= S_IFBLK | cifs_sb->mnt_file_mode; + fattr->cf_dtype = DT_BLK; + } else { /* TODO: should we mark some other reparse points (like DFSR) as directories? */ fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; } From 8c42a5c02bec6c7eccf08957be3c6c8fccf9790b Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 22 Oct 2020 03:16:22 -0700 Subject: [PATCH 479/497] ARC: perf: redo the pct irq missing in device-tree handling commit feb92d7d3813456c11dce21 "(ARC: perf: don't bail setup if pct irq missing in device-tree)" introduced a silly brown-paper bag bug: The assignment and comparison in an if statement were not bracketed correctly leaving the order of evaluation undefined. 
| | if (has_interrupts && (irq = platform_get_irq(pdev, 0) >= 0)) { | ^^^ ^^^^ And given such a chance, the compiler will bite you hard, fully entitled to generating this piece of beauty: | | # if (has_interrupts && (irq = platform_get_irq(pdev, 0) >= 0)) { | | bl.d @platform_get_irq <-- irq returned in r0 | | setge r2, r0, 0 <-- r2 is bool 1 or 0 if irq >= 0 true/false | brlt.d r0, 0, @.L114 | | st_s r2,[sp] <-- irq saved is bool 1 or 0, not actual return val | st 1,[r3,160] # arc_pmu.18_29->irq <-- drops bool and assumes 1 | | # return __request_percpu_irq(irq, handler, 0, | | bl.d @__request_percpu_irq; | mov_s r0,1 <-- drops even bool and assumes 1 which fails With the snafu fixed, everything is as expected. | bl.d @platform_get_irq <-- returns irq in r0 | | mov_s r2,r0 | brlt.d r2, 0, @.L112 | | st_s r0,[sp] <-- irq isaved is actual return value above | st r0,[r13,160] #arc_pmu.18_27->irq | | bl.d @__request_percpu_irq <-- r0 unchanged so actual irq returned | add r4,r4,r12 #, tmp363, __ptr Cc: Signed-off-by: Vineet Gupta --- arch/arc/kernel/perf_event.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 79849f37e782..145722f80c9b 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -562,7 +562,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev) { struct arc_reg_pct_build pct_bcr; struct arc_reg_cc_build cc_bcr; - int i, has_interrupts, irq; + int i, has_interrupts, irq = -1; int counter_size; /* in bits */ union cc_name { @@ -637,19 +637,28 @@ static int arc_pmu_device_probe(struct platform_device *pdev) .attr_groups = arc_pmu->attr_groups, }; - if (has_interrupts && (irq = platform_get_irq(pdev, 0) >= 0)) { + if (has_interrupts) { + irq = platform_get_irq(pdev, 0); + if (irq >= 0) { + int ret; - arc_pmu->irq = irq; + arc_pmu->irq = irq; - /* intc map function ensures irq_set_percpu_devid() called */ - request_percpu_irq(irq, arc_pmu_intr, "ARC perf counters", - this_cpu_ptr(&arc_pmu_cpu)); + /* intc map function ensures irq_set_percpu_devid() called */ + ret = request_percpu_irq(irq, arc_pmu_intr, "ARC perf counters", + this_cpu_ptr(&arc_pmu_cpu)); + + if (!ret) + on_each_cpu(arc_cpu_pmu_irq_init, &irq, 1); + else + irq = -1; + } - on_each_cpu(arc_cpu_pmu_irq_init, &irq, 1); - } else { - arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; } + if (irq == -1) + arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + /* * perf parser doesn't really like '-' symbol in events name, so let's * use '_' in arc pct name as it goes to kernel PMU event prefix. From 700465fd338fe5df08a1b2e27fa16981f562547f Mon Sep 17 00:00:00 2001 From: Ke Li Date: Thu, 22 Oct 2020 02:41:46 -0400 Subject: [PATCH 480/497] net: Properly typecast int values to set sk_max_pacing_rate In setsockopt(SO_MAX_PACING_RATE) on 64bit systems, sk_max_pacing_rate, after extended from 'u32' to 'unsigned long', takes unintentionally hiked value whenever assigned from an 'int' value with MSB=1, due to binary sign extension in promoting s32 to u64, e.g. 0x80000000 becomes 0xFFFFFFFF80000000. Thus inflated sk_max_pacing_rate causes subsequent getsockopt to return ~0U unexpectedly. It may also result in increased pacing rate. Fix by explicitly casting the 'int' value to 'unsigned int' before assigning it to sk_max_pacing_rate, for zero extension to happen. 
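A minimal stand-alone sketch of the difference (not part of this patch; variable names are made up), assuming a 64-bit build where 'unsigned long' is 64 bits: assigning the raw 'int' sign-extends, while first casting to 'unsigned int' zero-extends, which is what the fix relies on:

	#include <stdio.h>

	int main(void)
	{
		int val = (int)0x80000000u;                  /* MSB set, negative as an int */
		unsigned long sign_ext = val;                /* s32 -> u64: 0xffffffff80000000 */
		unsigned long zero_ext = (unsigned int)val;  /* u32 -> u64: 0x0000000080000000 */

		printf("sign extended: %#lx\n", sign_ext);
		printf("zero extended: %#lx\n", zero_ext);
		return 0;
	}
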
Fixes: 76a9ebe811fb ("net: extend sk_pacing_rate to unsigned long") Signed-off-by: Ji Li Signed-off-by: Ke Li Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20201022064146.79873-1-keli@akamai.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 3 ++- net/core/sock.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 6d0fa65a4a46..2ca5eecebacf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4733,7 +4733,8 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); - sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val; + sk->sk_max_pacing_rate = (val == ~0U) ? + ~0UL : (unsigned int)val; sk->sk_pacing_rate = min(sk->sk_pacing_rate, sk->sk_max_pacing_rate); break; diff --git a/net/core/sock.c b/net/core/sock.c index 4e8729357122..727ea1cc633c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1163,7 +1163,7 @@ set_sndbuf: case SO_MAX_PACING_RATE: { - unsigned long ulval = (val == ~0U) ? ~0UL : val; + unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; if (sizeof(ulval) != sizeof(val) && optlen >= sizeof(ulval) && From 18ded910b589839e38a51623a179837ab4cc3789 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 22 Oct 2020 10:33:31 -0400 Subject: [PATCH 481/497] tcp: fix to update snd_wl1 in bulk receiver fast path In the header prediction fast path for a bulk data receiver, if no data is newly acknowledged then we do not call tcp_ack() and do not call tcp_ack_update_window(). This means that a bulk receiver that receives large amounts of data can have the incoming sequence numbers wrap, so that the check in tcp_may_update_window fails: after(ack_seq, tp->snd_wl1) If the incoming receive windows are zero in this state, and then the connection that was a bulk data receiver later wants to send data, that connection can find itself persistently rejecting the window updates in incoming ACKs. This means the connection can persistently fail to discover that the receive window has opened, which in turn means that the connection is unable to send anything, and the connection's sending process can get permanently "stuck". The fix is to update snd_wl1 in the header prediction fast path for a bulk data receiver, so that it keeps up and does not see wrapping problems. This fix is based on a very nice and thorough analysis and diagnosis by Apollon Oikonomopoulos (see link below). This is a stable candidate but there is no Fixes tag here since the bug predates current git history. Just for fun: looks like the bug dates back to when header prediction was added in Linux v2.1.8 in Nov 1996. In that version tcp_rcv_established() was added, and the code only updates snd_wl1 in tcp_ack(), and in the new "Bulk data transfer: receiver" code path it does not call tcp_ack(). This fix seems to apply cleanly at least as far back as v3.2. 
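For illustration only (not from the kernel sources; the helper and the numbers below are made up): TCP sequence comparisons use 32-bit serial-number arithmetic, so once snd_wl1 falls more than 2^31 behind the incoming sequence space, a check in the style of after(ack_seq, tp->snd_wl1) reports "not newer" even for genuinely newer ACKs, and the window update is dropped:

	#include <stdio.h>
	#include <stdint.h>

	/* serial-number compare in the style of the kernel's after()/before() */
	static int seq_after(uint32_t a, uint32_t b)
	{
		return (int32_t)(a - b) > 0;
	}

	int main(void)
	{
		uint32_t snd_wl1 = 1000;                     /* stale: never updated in the fast path */
		uint32_t ack_seq = 1000u + (1u << 31) + 1u;  /* > 2 GB of data received since then */

		/* prints 0: the newer ACK fails the check, so the window update is rejected */
		printf("after(ack_seq, snd_wl1) = %d\n", seq_after(ack_seq, snd_wl1));
		return 0;
	}
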
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Neal Cardwell Reported-by: Apollon Oikonomopoulos Tested-by: Apollon Oikonomopoulos Link: https://www.spinics.net/lists/netdev/msg692430.html Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20201022143331.1887495-1-ncardwell.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 67f10d3ec240..fc445833b5e5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5827,6 +5827,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; + } else { + tcp_update_wl(tp, TCP_SKB_CB(skb)->seq); } __tcp_ack_snd_check(sk, 0); From 94ebdd28fcab7ef1484cd98f4a8e8426fe207994 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 22 Oct 2020 15:26:53 +0100 Subject: [PATCH 482/497] docs/vm: trivial fixes to several spelling mistakes Fix several spelling mistakes in vm documentation. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20201022142653.254429-1-colin.king@canonical.com Signed-off-by: Jonathan Corbet --- Documentation/vm/mmu_notifier.rst | 2 +- Documentation/vm/page_migration.rst | 2 +- Documentation/vm/page_owner.rst | 2 +- Documentation/vm/slub.rst | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/vm/mmu_notifier.rst b/Documentation/vm/mmu_notifier.rst index 47baa1cf28c5..df5d7777fc6b 100644 --- a/Documentation/vm/mmu_notifier.rst +++ b/Documentation/vm/mmu_notifier.rst @@ -89,7 +89,7 @@ they are write protected for COW (other case of B apply too). So here because at time N+2 the clear page table entry was not pair with a notification to invalidate the secondary TLB, the device see the new value for -addrB before seing the new value for addrA. This break total memory ordering +addrB before seeing the new value for addrA. This break total memory ordering for the device. When changing a pte to write protect or to point to a new write protected page diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst index 91a98a6b43bb..db9d7e5539cb 100644 --- a/Documentation/vm/page_migration.rst +++ b/Documentation/vm/page_migration.rst @@ -99,7 +99,7 @@ Steps: 2. Ensure that writeback is complete. 3. Lock the new page that we want to move to. It is locked so that accesses to - this (not yet uptodate) page immediately block while the move is in progress. + this (not yet up-to-date) page immediately block while the move is in progress. 4. All the page table references to the page are converted to migration entries. This decreases the mapcount of a page. If the resulting diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 079f3f8c4784..02deac76673f 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -18,7 +18,7 @@ Although we already have tracepoint for tracing page allocation/free, using it for analyzing who allocate each page is rather complex. We need to enlarge the trace buffer for preventing overlapping until userspace program launched. And, launched program continually dump out the trace -buffer for later analysis and it would change system behviour with more +buffer for later analysis and it would change system behaviour with more possibility rather than just keeping it in memory, so bad for debugging. 
page owner can also be used for various purposes. For example, accurate diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst index 289d231cee97..03f294a638bd 100644 --- a/Documentation/vm/slub.rst +++ b/Documentation/vm/slub.rst @@ -378,7 +378,7 @@ c) Execute ``slabinfo-gnuplot.sh`` in '-t' mode, passing all of the can go unnoticed. To deal with that, ``slabinfo-gnuplot.sh`` has two options to 'zoom-in'/'zoom-out': - a) ``-s %d,%d`` -- overwrites the default image width and heigh + a) ``-s %d,%d`` -- overwrites the default image width and height b) ``-r %d,%d`` -- specifies a range of samples to use (for example, in ``slabinfo -X >> FOO_STATS; sleep 1;`` case, using a ``-r 40,60`` range will plot only samples collected between 40th and From 62af696471e58bdfcf416fd56f032a60853c2bae Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Thu, 22 Oct 2020 07:54:03 +0100 Subject: [PATCH 483/497] docs: Add two missing entries in vm sysctl index Both seem overlooked while adding the section in the main content. Signed-off-by: Fam Zheng Link: https://lore.kernel.org/r/20201022065403.3936070-1-fam@euphon.net Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/sysctl/vm.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 4b9d2e8e9142..f455fa00c00f 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -27,6 +27,7 @@ Currently, these files are in /proc/sys/vm: - admin_reserve_kbytes - block_dump - compact_memory +- compaction_proactiveness - compact_unevictable_allowed - dirty_background_bytes - dirty_background_ratio @@ -37,6 +38,7 @@ Currently, these files are in /proc/sys/vm: - dirty_writeback_centisecs - drop_caches - extfrag_threshold +- highmem_is_dirtyable - hugetlb_shm_group - laptop_mode - legacy_va_layout From bb18842e21111a979e2e0e1c5d85c09646f18d51 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:50 -0700 Subject: [PATCH 484/497] kvm: x86/mmu: Add TDP MMU PF handler Add functions to handle page faults in the TDP MMU. These page faults are currently handled in much the same way as the x86 shadow paging based MMU, however the ordering of some operations is slightly different. Future patches will add eager NX splitting, a fast page fault handler, and parallel page faults. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-11-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 53 +++++-------- arch/x86/kvm/mmu/mmu_internal.h | 32 ++++++++ arch/x86/kvm/mmu/mmutrace.h | 8 +- arch/x86/kvm/mmu/tdp_mmu.c | 134 ++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.h | 4 + 5 files changed, 194 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6f22c155381d..31d7ba716b44 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -137,23 +137,6 @@ module_param(dbg, bool, 0644); /* make pte_list_desc fit well in cache line */ #define PTE_LIST_EXT 3 -/* - * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). - * - * RET_PF_RETRY: let CPU fault again on the address. - * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. 
- * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. - * RET_PF_FIXED: The faulting entry has been fixed. - * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. - */ -enum { - RET_PF_RETRY = 0, - RET_PF_EMULATE, - RET_PF_INVALID, - RET_PF_FIXED, - RET_PF_SPURIOUS, -}; - struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; @@ -233,11 +216,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { u64 mask = make_mmio_spte(vcpu, gfn, access); - unsigned int gen = get_mmio_spte_generation(mask); - access = mask & ACC_ALL; - - trace_mark_mmio_spte(sptep, gfn, access, gen); + trace_mark_mmio_spte(sptep, gfn, mask); mmu_spte_set(sptep, mask); } @@ -2762,9 +2742,9 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp, - bool huge_page_disallowed, int *req_level) +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, + bool huge_page_disallowed, int *req_level) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; @@ -2818,10 +2798,10 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, - kvm_pfn_t *pfnp, int *levelp) +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *goal_levelp) { - int level = *levelp; + int level = *goal_levelp; if (cur_level == level && level > PG_LEVEL_4K && is_shadow_present_pte(spte) && @@ -2836,7 +2816,7 @@ static void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); *pfnp |= gfn & page_mask; - (*levelp)--; + (*goal_levelp)--; } } @@ -3643,9 +3623,11 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - r = fast_page_fault(vcpu, gpa, error_code); - if (r != RET_PF_INVALID) - return r; + if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; + } r = mmu_topup_memory_caches(vcpu, false); if (r) @@ -3667,8 +3649,13 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, - prefault, is_tdp); + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, + pfn, prefault); + else + r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, + prefault, is_tdp); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 564954c6b079..6db40ea85974 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -111,4 +111,36 @@ static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp) return !sp->root_count; } +/* + * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). + * + * RET_PF_RETRY: let CPU fault again on the address. + * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. 
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. + * RET_PF_FIXED: The faulting entry has been fixed. + * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + */ +enum { + RET_PF_RETRY = 0, + RET_PF_EMULATE, + RET_PF_INVALID, + RET_PF_FIXED, + RET_PF_SPURIOUS, +}; + +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) +#define SET_SPTE_SPURIOUS BIT(2) + +int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, + bool huge_page_disallowed, int *req_level); +void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, + kvm_pfn_t *pfnp, int *goal_levelp); + +bool is_nx_huge_page_enabled(void); + +void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); + #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 2080f9c32213..213699b27b44 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -202,8 +202,8 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TRACE_EVENT( mark_mmio_spte, - TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen), - TP_ARGS(sptep, gfn, access, gen), + TP_PROTO(u64 *sptep, gfn_t gfn, u64 spte), + TP_ARGS(sptep, gfn, spte), TP_STRUCT__entry( __field(void *, sptep) @@ -215,8 +215,8 @@ TRACE_EVENT( TP_fast_assign( __entry->sptep = sptep; __entry->gfn = gfn; - __entry->access = access; - __entry->gen = gen; + __entry->access = spte & ACC_ALL; + __entry->gen = get_mmio_spte_generation(spte); ), TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep, diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 45a182475f68..ae8ac15b5623 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -2,6 +2,7 @@ #include "mmu.h" #include "mmu_internal.h" +#include "mmutrace.h" #include "tdp_iter.h" #include "tdp_mmu.h" #include "spte.h" @@ -271,6 +272,10 @@ static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) +#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ + for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ + _mmu->shadow_root_level, _start, _end) + /* * Flush the TLB if the process should drop kvm->mmu_lock. * Return whether the caller still needs to flush the tlb. @@ -355,3 +360,132 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) if (flush) kvm_flush_remote_tlbs(kvm); } + +/* + * Installs a last-level SPTE to handle a TDP page fault. + * (NPT/EPT violation/misconfiguration) + */ +static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, + int map_writable, + struct tdp_iter *iter, + kvm_pfn_t pfn, bool prefault) +{ + u64 new_spte; + int ret = 0; + int make_spte_ret = 0; + + if (unlikely(is_noslot_pfn(pfn))) { + new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); + trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte); + } else + make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, + pfn, iter->old_spte, prefault, true, + map_writable, !shadow_accessed_mask, + &new_spte); + + if (new_spte == iter->old_spte) + ret = RET_PF_SPURIOUS; + else + tdp_mmu_set_spte(vcpu->kvm, iter, new_spte); + + /* + * If the page fault was caused by a write but the page is write + * protected, emulation is needed. 
If the emulation was skipped, + * the vCPU would have the same fault again. + */ + if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { + if (write) + ret = RET_PF_EMULATE; + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ + if (unlikely(is_mmio_spte(new_spte))) + ret = RET_PF_EMULATE; + + trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep); + if (!prefault) + vcpu->stat.pf_fixed++; + + return ret; +} + +/* + * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing + * page tables and SPTEs to translate the faulting guest physical address. + */ +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault) +{ + bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); + bool write = error_code & PFERR_WRITE_MASK; + bool exec = error_code & PFERR_FETCH_MASK; + bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; + struct kvm_mmu *mmu = vcpu->arch.mmu; + struct tdp_iter iter; + struct kvm_mmu_memory_cache *pf_pt_cache = + &vcpu->arch.mmu_shadow_page_cache; + u64 *child_pt; + u64 new_spte; + int ret; + gfn_t gfn = gpa >> PAGE_SHIFT; + int level; + int req_level; + + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) + return RET_PF_RETRY; + if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) + return RET_PF_RETRY; + + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, + huge_page_disallowed, &req_level); + + trace_kvm_mmu_spte_requested(gpa, level, pfn); + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + if (nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(iter.old_spte, gfn, + iter.level, &pfn, &level); + + if (iter.level == level) + break; + + /* + * If there is an SPTE mapping a large page at a higher level + * than the target, that SPTE must be cleared and replaced + * with a non-leaf SPTE. + */ + if (is_shadow_present_pte(iter.old_spte) && + is_large_pte(iter.old_spte)) { + tdp_mmu_set_spte(vcpu->kvm, &iter, 0); + + kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn, + KVM_PAGES_PER_HPAGE(iter.level)); + + /* + * The iter must explicitly re-read the spte here + * because the new value informs the !present + * path below. 
+ */ + iter.old_spte = READ_ONCE(*iter.sptep); + } + + if (!is_shadow_present_pte(iter.old_spte)) { + child_pt = kvm_mmu_memory_cache_alloc(pf_pt_cache); + clear_page(child_pt); + new_spte = make_nonleaf_spte(child_pt, + !shadow_accessed_mask); + + trace_kvm_mmu_get_page(sp, true); + tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); + } + } + + if (WARN_ON(iter.level != level)) + return RET_PF_RETRY; + + ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, + pfn, prefault); + + return ret; +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 6de2d007fc03..aed21a7a3bd6 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -14,4 +14,8 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); void kvm_tdp_mmu_zap_all(struct kvm *kvm); + +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault); #endif /* __KVM_X86_MMU_TDP_MMU_H */ From 89c0fd494af3912d32ba5765b7147f36a34d1fa3 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:51 -0700 Subject: [PATCH 485/497] kvm: x86/mmu: Allocate struct kvm_mmu_pages for all pages in TDP MMU Attach struct kvm_mmu_pages to every page in the TDP MMU to track metadata, facilitate NX reclaim, and enable inproved parallelism of MMU operations in future patches. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-12-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/mmu/tdp_mmu.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 082684ce2d1b..d44858b69353 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1004,7 +1004,11 @@ struct kvm_arch { * operations. 
*/ bool tdp_mmu_enabled; + + /* List of struct tdp_mmu_pages being used as roots */ struct list_head tdp_mmu_roots; + /* List of struct tdp_mmu_pages not being used as roots */ + struct list_head tdp_mmu_pages; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index ae8ac15b5623..f06802289c1f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -28,6 +28,7 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); + INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); } void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) @@ -169,6 +170,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); u64 *pt; + struct kvm_mmu_page *sp; u64 old_child_spte; int i; @@ -234,6 +236,9 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, */ if (was_present && !was_leaf && (pfn_changed || !is_present)) { pt = spte_to_child_pt(old_spte, level); + sp = sptep_to_sp(pt); + + list_del(&sp->link); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { old_child_spte = READ_ONCE(*(pt + i)); @@ -247,6 +252,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, KVM_PAGES_PER_HPAGE(level)); free_page((unsigned long)pt); + kmem_cache_free(mmu_page_header_cache, sp); } } @@ -424,8 +430,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu *mmu = vcpu->arch.mmu; struct tdp_iter iter; - struct kvm_mmu_memory_cache *pf_pt_cache = - &vcpu->arch.mmu_shadow_page_cache; + struct kvm_mmu_page *sp; u64 *child_pt; u64 new_spte; int ret; @@ -471,7 +476,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, } if (!is_shadow_present_pte(iter.old_spte)) { - child_pt = kvm_mmu_memory_cache_alloc(pf_pt_cache); + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); + list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages); + child_pt = sp->spt; clear_page(child_pt); new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); From 063afacd8730be3d9a3d50f9ea730f840265aba0 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:52 -0700 Subject: [PATCH 486/497] kvm: x86/mmu: Support invalidate range MMU notifier for TDP MMU In order to interoperate correctly with the rest of KVM and other Linux subsystems, the TDP MMU must correctly handle various MMU notifiers. Add hooks to handle the invalidate range family of MMU notifiers. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. 
This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-13-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 9 ++++- arch/x86/kvm/mmu/tdp_mmu.c | 80 +++++++++++++++++++++++++++++++++++--- arch/x86/kvm/mmu/tdp_mmu.h | 3 ++ 3 files changed, 86 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 31d7ba716b44..35c277ed6c78 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1497,7 +1497,14 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, unsigned flags) { - return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + int r; + + r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); + + if (kvm->arch.tdp_mmu_enabled) + r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end); + + return r; } int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index f06802289c1f..96bc6aa39628 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -52,7 +52,7 @@ bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) } static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end); + gfn_t start, gfn_t end, bool can_yield); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) { @@ -65,7 +65,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) list_del(&root->link); - zap_gfn_range(kvm, root, 0, max_gfn); + zap_gfn_range(kvm, root, 0, max_gfn, false); free_page((unsigned long)root->spt); kmem_cache_free(mmu_page_header_cache, root); @@ -303,9 +303,14 @@ static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *it * non-root pages mapping GFNs strictly within that range. Returns true if * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. + * If can_yield is true, will release the MMU lock and reschedule if the + * scheduler needs the CPU or there is contention on the MMU lock. If this + * function cannot yield, it will not release the MMU lock or reschedule and + * the caller must ensure it does not supply too large a GFN range, or the + * operation can cause a soft lockup. 
*/ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) + gfn_t start, gfn_t end, bool can_yield) { struct tdp_iter iter; bool flush_needed = false; @@ -326,7 +331,10 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, tdp_mmu_set_spte(kvm, &iter, 0); - flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + if (can_yield) + flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + else + flush_needed = true; } return flush_needed; } @@ -349,7 +357,7 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) */ kvm_mmu_get_root(kvm, root); - flush |= zap_gfn_range(kvm, root, start, end); + flush |= zap_gfn_range(kvm, root, start, end, true); kvm_mmu_put_root(kvm, root); } @@ -496,3 +504,65 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, return ret; } + +static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end, unsigned long data, + int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, + gfn_t end, unsigned long data)) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + struct kvm_mmu_page *root; + int ret = 0; + int as_id; + + for_each_tdp_mmu_root(kvm, root) { + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + as_id = kvm_mmu_page_as_id(root); + slots = __kvm_memslots(kvm, as_id); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn_start, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. + */ + gfn_start = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + ret |= handler(kvm, memslot, root, gfn_start, + gfn_end, data); + } + + kvm_mmu_put_root(kvm, root); + } + + return ret; +} + +static int zap_gfn_range_hva_wrapper(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, + gfn_t end, unsigned long unused) +{ + return zap_gfn_range(kvm, root, start, end, false); +} + +int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, + zap_gfn_range_hva_wrapper); +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index aed21a7a3bd6..af25d2462cb8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -18,4 +18,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int map_writable, int max_level, kvm_pfn_t pfn, bool prefault); + +int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end); #endif /* __KVM_X86_MMU_TDP_MMU_H */ From f8e144971c6834fa1e171be4cd8026f8bc537bca Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:53 -0700 Subject: [PATCH 487/497] kvm: x86/mmu: Add access tracking for tdp_mmu In order to interoperate correctly with the rest of KVM and other Linux subsystems, the TDP MMU must correctly handle various MMU notifiers. The main Linux MM uses the access tracking MMU notifiers for swap and other features. 
Add hooks to handle the test/flush HVA (range) family of MMU notifiers. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-14-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 16 +++++- arch/x86/kvm/mmu/tdp_mmu.c | 115 +++++++++++++++++++++++++++++++++++-- arch/x86/kvm/mmu/tdp_mmu.h | 4 ++ 3 files changed, 128 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 35c277ed6c78..33ec6c4c36d7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1558,12 +1558,24 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); + int young = false; + + young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); + if (kvm->arch.tdp_mmu_enabled) + young |= kvm_tdp_mmu_age_hva_range(kvm, start, end); + + return young; } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); + int young = false; + + young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); + if (kvm->arch.tdp_mmu_enabled) + young |= kvm_tdp_mmu_test_age_hva(kvm, hva); + + return young; } #ifdef MMU_DEBUG diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 96bc6aa39628..dd6b8a8f1c93 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -149,6 +149,18 @@ static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) return sp->role.smm ? 
1 : 0; } +static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) +{ + bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + + if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) + return; + + if (is_accessed_spte(old_spte) && + (!is_accessed_spte(new_spte) || pfn_changed)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); +} + /** * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance @@ -260,24 +272,48 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level) { __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); + handle_changed_spte_acc_track(old_spte, new_spte, level); } -static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, - u64 new_spte) +static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte, bool record_acc_track) { u64 *root_pt = tdp_iter_root_pt(iter); struct kvm_mmu_page *root = sptep_to_sp(root_pt); int as_id = kvm_mmu_page_as_id(root); - *iter->sptep = new_spte; + WRITE_ONCE(*iter->sptep, new_spte); - handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, - iter->level); + __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte, + iter->level); + if (record_acc_track) + handle_changed_spte_acc_track(iter->old_spte, new_spte, + iter->level); +} + +static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, true); +} + +static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, false); } #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end) +#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ + tdp_root_for_each_pte(_iter, _root, _start, _end) \ + if (!is_shadow_present_pte(_iter.old_spte) || \ + !is_last_spte(_iter.old_spte, _iter.level)) \ + continue; \ + else + #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \ _mmu->shadow_root_level, _start, _end) @@ -566,3 +602,72 @@ int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, zap_gfn_range_hva_wrapper); } + +/* + * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero + * if any of the GFNs in the range have been accessed. + */ +static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t start, gfn_t end, + unsigned long unused) +{ + struct tdp_iter iter; + int young = 0; + u64 new_spte = 0; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + /* + * If we have a non-accessed entry we don't need to change the + * pte. + */ + if (!is_accessed_spte(iter.old_spte)) + continue; + + new_spte = iter.old_spte; + + if (spte_ad_enabled(new_spte)) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)&new_spte); + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. 
+ */ + if (is_writable_pte(new_spte)) + kvm_set_pfn_dirty(spte_to_pfn(new_spte)); + + new_spte = mark_spte_for_access_track(new_spte); + } + + tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); + young = 1; + } + + return young; +} + +int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0, + age_gfn_range); +} + +static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, + unsigned long unused2) +{ + struct tdp_iter iter; + + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) + if (is_accessed_spte(iter.old_spte)) + return 1; + + return 0; +} + +int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0, + test_age_gfn); +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index af25d2462cb8..ddc1bf12d0fc 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -21,4 +21,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); + +int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, + unsigned long end); +int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva); #endif /* __KVM_X86_MMU_TDP_MMU_H */ From 1d8dd6b3f12b03f617820a9ebc19cc2fabf59ce9 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:54 -0700 Subject: [PATCH 488/497] kvm: x86/mmu: Support changed pte notifier in tdp MMU In order to interoperate correctly with the rest of KVM and other Linux subsystems, the TDP MMU must correctly handle various MMU notifiers. Add a hook and handle the change_pte MMU notifier. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-15-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 9 +++++- arch/x86/kvm/mmu/tdp_mmu.c | 56 ++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.h | 3 ++ 3 files changed, 67 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 33ec6c4c36d7..41f0354c7489 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1509,7 +1509,14 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { - return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + int r; + + r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); + + if (kvm->arch.tdp_mmu_enabled) + r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte); + + return r; } static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index dd6b8a8f1c93..64e640cfcff9 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -671,3 +671,59 @@ int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva) return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0, test_age_gfn); } + +/* + * Handle the changed_pte MMU notifier for the TDP MMU. + * data is a pointer to the new pte_t mapping the HVA specified by the MMU + * notifier. 
+ * Returns non-zero if a flush is needed before releasing the MMU lock. + */ +static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused, + unsigned long data) +{ + struct tdp_iter iter; + pte_t *ptep = (pte_t *)data; + kvm_pfn_t new_pfn; + u64 new_spte; + int need_flush = 0; + + WARN_ON(pte_huge(*ptep)); + + new_pfn = pte_pfn(*ptep); + + tdp_root_for_each_pte(iter, root, gfn, gfn + 1) { + if (iter.level != PG_LEVEL_4K) + continue; + + if (!is_shadow_present_pte(iter.old_spte)) + break; + + tdp_mmu_set_spte(kvm, &iter, 0); + + kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1); + + if (!pte_write(*ptep)) { + new_spte = kvm_mmu_changed_pte_notifier_make_spte( + iter.old_spte, new_pfn); + + tdp_mmu_set_spte(kvm, &iter, new_spte); + } + + need_flush = 1; + } + + if (need_flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); + + return 0; +} + +int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, + pte_t *host_ptep) +{ + return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1, + (unsigned long)host_ptep, + set_tdp_spte); +} + diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index ddc1bf12d0fc..aeee3ce7b3f4 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -25,4 +25,7 @@ int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva); + +int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, + pte_t *host_ptep); #endif /* __KVM_X86_MMU_TDP_MMU_H */ From a6a0b05da9f37ff56faa6b8351ed6e0b55032460 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:55 -0700 Subject: [PATCH 489/497] kvm: x86/mmu: Support dirty logging for the TDP MMU Dirty logging is a key feature of the KVM MMU and must be supported by the TDP MMU. Add support for both the write protection and PML dirty logging modes. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. 
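For illustration only (not part of this patch), a minimal user-space sketch of the core idea in clear_dirty_gfn_range() below: when A/D bits are in use the dirty bit itself is cleared, otherwise the SPTE is write-protected so the next write is trapped. The macro names and bit positions here are placeholders, not the real SPTE layout.

#include <stdint.h>
#include <stdio.h>

#define SPTE_WRITABLE	(1ULL << 1)	/* stand-in for PT_WRITABLE_MASK  */
#define SPTE_DIRTY	(1ULL << 9)	/* stand-in for shadow_dirty_mask */

/* Clear the dirty state of one leaf SPTE, mirroring the two cases above. */
static uint64_t clear_dirty(uint64_t spte, int ad_bits_enabled)
{
	if (ad_bits_enabled)
		return spte & ~SPTE_DIRTY;	/* PML-style: drop the dirty bit */
	return spte & ~SPTE_WRITABLE;		/* otherwise: write-protect */
}

int main(void)
{
	uint64_t spte = SPTE_WRITABLE | SPTE_DIRTY;

	printf("A/D enabled:  %#llx\n", (unsigned long long)clear_dirty(spte, 1));
	printf("A/D disabled: %#llx\n", (unsigned long long)clear_dirty(spte, 0));
	return 0;
}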
This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-16-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 14 ++ arch/x86/kvm/mmu/tdp_iter.h | 7 +- arch/x86/kvm/mmu/tdp_mmu.c | 299 +++++++++++++++++++++++++++++++++++- arch/x86/kvm/mmu/tdp_mmu.h | 10 ++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 6 +- 6 files changed, 328 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 41f0354c7489..0c64643819b9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1223,6 +1223,9 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, true); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1249,6 +1252,9 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_rmap_head *rmap_head; + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, + slot->base_gfn + gfn_offset, mask, false); while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -5473,6 +5479,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K); spin_unlock(&kvm->mmu_lock); /* @@ -5561,6 +5569,8 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); spin_unlock(&kvm->mmu_lock); /* @@ -5582,6 +5592,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M); spin_unlock(&kvm->mmu_lock); if (flush) @@ -5596,6 +5608,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); + if (kvm->arch.tdp_mmu_enabled) + flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot); spin_unlock(&kvm->mmu_lock); if (flush) diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index 884ed2c70bfe..47170d0dc98e 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -41,11 +41,14 @@ struct tdp_iter { * Iterates over every SPTE mapping the GFN range [start, end) in a * preorder traversal. 
*/ -#define for_each_tdp_pte(iter, root, root_level, start, end) \ - for (tdp_iter_start(&iter, root, root_level, PG_LEVEL_4K, start); \ +#define for_each_tdp_pte_min_level(iter, root, root_level, min_level, start, end) \ + for (tdp_iter_start(&iter, root, root_level, min_level, start); \ iter.valid && iter.gfn < end; \ tdp_iter_next(&iter)) +#define for_each_tdp_pte(iter, root, root_level, start, end) \ + for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end) + u64 *spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level, diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 64e640cfcff9..7181b4ab54d0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -161,6 +161,24 @@ static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } +static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level) +{ + bool pfn_changed; + struct kvm_memory_slot *slot; + + if (level > PG_LEVEL_4K) + return; + + pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + + if ((!is_writable_pte(old_spte) || pfn_changed) && + is_writable_pte(new_spte)) { + slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); + mark_page_dirty_in_slot(slot, gfn); + } +} + /** * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance @@ -273,10 +291,13 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, { __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level); handle_changed_spte_acc_track(old_spte, new_spte, level); + handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, + new_spte, level); } static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, - u64 new_spte, bool record_acc_track) + u64 new_spte, bool record_acc_track, + bool record_dirty_log) { u64 *root_pt = tdp_iter_root_pt(iter); struct kvm_mmu_page *root = sptep_to_sp(root_pt); @@ -289,19 +310,30 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, if (record_acc_track) handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); + if (record_dirty_log) + handle_changed_spte_dirty_log(kvm, as_id, iter->gfn, + iter->old_spte, new_spte, + iter->level); } static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { - __tdp_mmu_set_spte(kvm, iter, new_spte, true); + __tdp_mmu_set_spte(kvm, iter, new_spte, true, true); } static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { - __tdp_mmu_set_spte(kvm, iter, new_spte, false); + __tdp_mmu_set_spte(kvm, iter, new_spte, false, true); +} + +static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + __tdp_mmu_set_spte(kvm, iter, new_spte, true, false); } #define tdp_root_for_each_pte(_iter, _root, _start, _end) \ @@ -334,6 +366,14 @@ static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *it } } +static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter) +{ + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + cond_resched_lock(&kvm->mmu_lock); + tdp_iter_refresh_walk(iter); + } +} + /* * Tears down the mappings for the range of gfns, [start, end), and frees the * non-root pages mapping GFNs strictly within that range. 
Returns true if @@ -638,6 +678,7 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot, new_spte = mark_spte_for_access_track(new_spte); } + new_spte &= ~shadow_dirty_mask; tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte); young = 1; @@ -727,3 +768,255 @@ int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, set_tdp_spte); } +/* + * Remove write access from all the SPTEs mapping GFNs [start, end). If + * skip_4k is set, SPTEs that map 4k pages, will not be write-protected. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end, int min_level) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); + + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, start, end) { + if (!is_shadow_present_pte(iter.old_spte) || + !is_last_spte(iter.old_spte, iter.level)) + continue; + + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + return spte_set; +} + +/* + * Remove write access from all the SPTEs mapping GFNs in the memslot. Will + * only affect leaf SPTEs down to min_level. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, + int min_level) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages, min_level); + + kvm_mmu_put_root(kvm, root); + } + + return spte_set; +} + +/* + * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If + * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. + * If AD bits are not enabled, this will require clearing the writable bit on + * each SPTE. Returns true if an SPTE has been changed and the TLBs need to + * be flushed. + */ +static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_leaf_pte(iter, root, start, end) { + if (spte_ad_need_write_protect(iter.old_spte)) { + if (is_writable_pte(iter.old_spte)) + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + else + continue; + } else { + if (iter.old_spte & shadow_dirty_mask) + new_spte = iter.old_spte & ~shadow_dirty_mask; + else + continue; + } + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + return spte_set; +} + +/* + * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If + * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. + * If AD bits are not enabled, this will require clearing the writable bit on + * each SPTE. Returns true if an SPTE has been changed and the TLBs need to + * be flushed. 
+ */ +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } + + return spte_set; +} + +/* + * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is + * set in mask, starting at gfn. The given memslot is expected to contain all + * the GFNs represented by set bits in the mask. If AD bits are enabled, + * clearing the dirty status will involve clearing the dirty bit on each SPTE + * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + */ +static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t gfn, unsigned long mask, bool wrprot) +{ + struct tdp_iter iter; + u64 new_spte; + + tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), + gfn + BITS_PER_LONG) { + if (!mask) + break; + + if (iter.level > PG_LEVEL_4K || + !(mask & (1UL << (iter.gfn - gfn)))) + continue; + + if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { + if (is_writable_pte(iter.old_spte)) + new_spte = iter.old_spte & ~PT_WRITABLE_MASK; + else + continue; + } else { + if (iter.old_spte & shadow_dirty_mask) + new_spte = iter.old_spte & ~shadow_dirty_mask; + else + continue; + } + + tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + + mask &= ~(1UL << (iter.gfn - gfn)); + } +} + +/* + * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is + * set in mask, starting at gfn. The given memslot is expected to contain all + * the GFNs represented by set bits in the mask. If AD bits are enabled, + * clearing the dirty status will involve clearing the dirty bit on each SPTE + * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + */ +void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn, unsigned long mask, + bool wrprot) +{ + struct kvm_mmu_page *root; + int root_as_id; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); + } +} + +/* + * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is + * only used for PML, and so will involve setting the dirty bit on each SPTE. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. + */ +static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte)) + continue; + + new_spte = iter.old_spte | shadow_dirty_mask; + + tdp_mmu_set_spte(kvm, &iter, new_spte); + spte_set = true; + + tdp_mmu_iter_cond_resched(kvm, &iter); + } + + return spte_set; +} + +/* + * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is + * only used for PML, and so will involve setting the dirty bit on each SPTE. + * Returns true if an SPTE has been changed and the TLBs need to be flushed. 
+ */ +bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. + */ + kvm_mmu_get_root(kvm, root); + + spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } + return spte_set; +} + diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index aeee3ce7b3f4..ece66f10d85f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -28,4 +28,14 @@ int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva); int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address, pte_t *host_ptep); + +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, + int min_level); +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, + struct kvm_memory_slot *slot); +void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn, unsigned long mask, + bool wrprot); +bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); #endif /* __KVM_X86_MMU_TDP_MMU_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c6f45687ba89..7f2e2a09ebbd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -798,6 +798,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); +void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2e8539213125..2541a17ff1c4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -143,8 +143,6 @@ static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); - __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); @@ -2645,8 +2643,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } EXPORT_SYMBOL_GPL(kvm_clear_guest); -static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, - gfn_t gfn) +void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn) { if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; @@ -2654,6 +2651,7 @@ static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, set_bit_le(rel_gfn, memslot->dirty_bitmap); } } +EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot); void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { From 14881998566d2dc0703870bbe063e8d42d780eb9 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:56 -0700 Subject: [PATCH 490/497] kvm: x86/mmu: Support disabling dirty logging for the tdp MMU Dirty logging ultimately breaks down MMU mappings to 4k granularity. When dirty logging is no longer needed, these granular mappings represent a useless performance penalty.
When dirty logging is disabled, search the paging structure for mappings that could be re-constituted into a large page mapping. Zap those mappings so that they can be faulted in again at a higher mapping level. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-17-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 ++ arch/x86/kvm/mmu/tdp_mmu.c | 58 ++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.h | 2 ++ 3 files changed, 63 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0c64643819b9..cd1be200e2a3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5544,6 +5544,9 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, spin_lock(&kvm->mmu_lock); slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, kvm_mmu_zap_collapsible_spte, true); + + if (kvm->arch.tdp_mmu_enabled) + kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot); spin_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7181b4ab54d0..0f181f324455 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1020,3 +1020,61 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) return spte_set; } +/* + * Clear non-leaf entries (and free associated page tables) which could + * be replaced by large mappings, for GFNs within the slot. + */ +static void zap_collapsible_spte_range(struct kvm *kvm, + struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + struct tdp_iter iter; + kvm_pfn_t pfn; + bool spte_set = false; + + tdp_root_for_each_pte(iter, root, start, end) { + if (!is_shadow_present_pte(iter.old_spte) || + is_last_spte(iter.old_spte, iter.level)) + continue; + + pfn = spte_to_pfn(iter.old_spte); + if (kvm_is_reserved_pfn(pfn) || + !PageTransCompoundMap(pfn_to_page(pfn))) + continue; + + tdp_mmu_set_spte(kvm, &iter, 0); + + spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter); + } + + if (spte_set) + kvm_flush_remote_tlbs(kvm); +} + +/* + * Clear non-leaf entries (and free associated page tables) which could + * be replaced by large mappings, for GFNs within the slot. + */ +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + struct kvm_mmu_page *root; + int root_as_id; + + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + /* + * Take a reference on the root so that it cannot be freed if + * this thread releases the MMU lock and yields in this loop. 
+ */ + kvm_mmu_get_root(kvm, root); + + zap_collapsible_spte_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); + + kvm_mmu_put_root(kvm, root); + } +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index ece66f10d85f..8cc902b8b9f8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -38,4 +38,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, gfn_t gfn, unsigned long mask, bool wrprot); bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot); #endif /* __KVM_X86_MMU_TDP_MMU_H */ From 46044f72c3826b7528339f454fe8900bae6adaaa Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:57 -0700 Subject: [PATCH 491/497] kvm: x86/mmu: Support write protection for nesting in tdp MMU To support nested virtualization, KVM will sometimes need to write protect pages which are part of a shadowed paging structure or are not writable in the shadowed paging structure. Add a function to write protect GFN mappings for this purpose. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-18-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 +++ arch/x86/kvm/mmu/tdp_mmu.c | 50 ++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu/tdp_mmu.h | 3 +++ 3 files changed, 57 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index cd1be200e2a3..4c62ac8db169 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1299,6 +1299,10 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, write_protected |= __rmap_write_protect(kvm, rmap_head, true); } + if (kvm->arch.tdp_mmu_enabled) + write_protected |= + kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); + return write_protected; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 0f181f324455..1491e2f7a897 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1078,3 +1078,53 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, kvm_mmu_put_root(kvm, root); } } + +/* + * Removes write access on the last level SPTE mapping this GFN and unsets the + * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * Returns true if an SPTE was set and a TLB flush is needed. + */ +static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t gfn) +{ + struct tdp_iter iter; + u64 new_spte; + bool spte_set = false; + + tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { + if (!is_writable_pte(iter.old_spte)) + break; + + new_spte = iter.old_spte & + ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); + + tdp_mmu_set_spte(kvm, &iter, new_spte); + spte_set = true; + } + + return spte_set; +} + +/* + * Removes write access on the last level SPTE mapping this GFN and unsets the + * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted. + * Returns true if an SPTE was set and a TLB flush is needed. 
+ */ +bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) +{ + struct kvm_mmu_page *root; + int root_as_id; + bool spte_set = false; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_tdp_mmu_root(kvm, root) { + root_as_id = kvm_mmu_page_as_id(root); + if (root_as_id != slot->as_id) + continue; + + spte_set |= write_protect_gfn(kvm, root, gfn); + } + return spte_set; +} + diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 8cc902b8b9f8..6501dd2ef8e4 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -40,4 +40,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot); + +bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); #endif /* __KVM_X86_MMU_TDP_MMU_H */ From 95fb5b0258b7bd2d540102771e31cfd76b72aa7b Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:58 -0700 Subject: [PATCH 492/497] kvm: x86/mmu: Support MMIO in the TDP MMU In order to support MMIO, KVM must be able to walk the TDP paging structures to find mappings for a given GFN. Support this walk for the TDP MMU. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 v2: Thanks to Dan Carpenter and kernel test robot for finding that root was used uninitialized in get_mmio_spte. Signed-off-by: Ben Gardon Reported-by: kernel test robot Reported-by: Dan Carpenter Message-Id: <20201014182700.2888246-19-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 70 ++++++++++++++++++++++++++------------ arch/x86/kvm/mmu/tdp_mmu.c | 21 ++++++++++++ arch/x86/kvm/mmu/tdp_mmu.h | 2 ++ 3 files changed, 72 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4c62ac8db169..6a0941ccac34 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3479,54 +3479,82 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) return vcpu_match_mmio_gva(vcpu, addr); } -/* return true if reserved bit is detected on spte. */ -static bool -walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +/* + * Return the level of the lowest level SPTE added to sptes. + * That SPTE may be non-present. + */ +static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) { struct kvm_shadow_walk_iterator iterator; - u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull; - struct rsvd_bits_validate *rsvd_check; - int root, leaf; - bool reserved = false; + int leaf = vcpu->arch.mmu->root_level; + u64 spte; - rsvd_check = &vcpu->arch.mmu->shadow_zero_check; walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), - leaf = root = iterator.level; + for (shadow_walk_init(&iterator, vcpu, addr); shadow_walk_okay(&iterator); __shadow_walk_next(&iterator, spte)) { + leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf - 1] = spte; - leaf--; if (!is_shadow_present_pte(spte)) break; + } + + walk_shadow_page_lockless_end(vcpu); + + return leaf; +} + +/* return true if reserved bit is detected on spte. 
*/ +static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL]; + struct rsvd_bits_validate *rsvd_check; + int root = vcpu->arch.mmu->root_level; + int leaf; + int level; + bool reserved = false; + + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { + *sptep = 0ull; + return reserved; + } + + if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes); + else + leaf = get_walk(vcpu, addr, sptes); + + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; + + for (level = root; level >= leaf; level--) { + if (!is_shadow_present_pte(sptes[level - 1])) + break; /* * Use a bitwise-OR instead of a logical-OR to aggregate the * reserved bit and EPT's invalid memtype/XWR checks to avoid * adding a Jcc in the loop. */ - reserved |= __is_bad_mt_xwr(rsvd_check, spte) | - __is_rsvd_bits_set(rsvd_check, spte, iterator.level); + reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) | + __is_rsvd_bits_set(rsvd_check, sptes[level - 1], + level); } - walk_shadow_page_lockless_end(vcpu); - if (reserved) { pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", __func__, addr); - while (root > leaf) { + for (level = root; level >= leaf; level--) pr_err("------ spte 0x%llx level %d.\n", - sptes[root - 1], root); - root--; - } + sptes[level - 1], level); } - *sptep = spte; + *sptep = sptes[leaf - 1]; + return reserved; } @@ -3538,7 +3566,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (mmio_info_in_cache(vcpu, addr, direct)) return RET_PF_EMULATE; - reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); + reserved = get_mmio_spte(vcpu, addr, &spte); if (WARN_ON(reserved)) return -EINVAL; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1491e2f7a897..5158d02b8925 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -7,7 +7,10 @@ #include "tdp_mmu.h" #include "spte.h" +#ifdef CONFIG_X86_64 static bool __read_mostly tdp_mmu_enabled = false; +module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); +#endif static bool is_tdp_mmu_enabled(void) { @@ -1128,3 +1131,21 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, return spte_set; } +/* + * Return the level of the lowest level SPTE added to sptes. + * That SPTE may be non-present. + */ +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + int leaf = vcpu->arch.mmu->shadow_root_level; + gfn_t gfn = addr >> PAGE_SHIFT; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + leaf = iter.level; + sptes[leaf - 1] = iter.old_spte; + } + + return leaf; +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 6501dd2ef8e4..556e065503f6 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -43,4 +43,6 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); + +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes); #endif /* __KVM_X86_MMU_TDP_MMU_H */ From daa5b6c12337a0e6e269d022baa21b0549f507c3 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:26:59 -0700 Subject: [PATCH 493/497] kvm: x86/mmu: Don't clear write flooding count for direct roots Direct roots don't have a write flooding count because the guest can't affect that paging structure. 
Thus there's no need to clear the write flooding count on a fast CR3 switch for direct roots. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-20-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6a0941ccac34..7b52fa1f01b0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3892,7 +3892,13 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa)); + /* + * If this is a direct root page, it doesn't have a write flooding + * count. Otherwise, clear the write flooding count. + */ + if (!new_role.direct) + __clear_sp_write_flooding_count( + to_shadow_page(vcpu->arch.mmu->root_hpa)); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, From 29cf0f5007a215b51feb0ae25ca5353480d53ead Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 14 Oct 2020 11:27:00 -0700 Subject: [PATCH 494/497] kvm: x86/mmu: NX largepage recovery for TDP MMU When KVM maps a largepage backed region at a lower level in order to make it executable (i.e. NX large page shattering), it reduces the TLB performance of that region. In order to avoid making this degradation permanent, KVM must periodically reclaim shattered NX largepages by zapping them and allowing them to be rebuilt in the page fault handler. With this patch, the TDP MMU does not respect KVM's rate limiting on reclaim. It traverses the entire TDP structure every time. This will be addressed in a future patch. Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell machine. This series introduced no new failures. 
This series can be viewed in Gerrit at: https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538 Signed-off-by: Ben Gardon Message-Id: <20201014182700.2888246-21-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 13 +++++++++---- arch/x86/kvm/mmu/mmu_internal.h | 3 +++ arch/x86/kvm/mmu/tdp_mmu.c | 6 ++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7b52fa1f01b0..17587f496ec7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -776,7 +776,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } -static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { if (sp->lpage_disallowed) return; @@ -804,7 +804,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } -static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) { --kvm->stat.nx_lpage_splits; sp->lpage_disallowed = false; @@ -5988,8 +5988,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page, lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - WARN_ON_ONCE(sp->lpage_disallowed); + if (sp->tdp_mmu_page) + kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, + sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); + else { + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + } if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { kvm_mmu_commit_zap_page(kvm, &invalid_list); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 6db40ea85974..bfc6389edc28 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -143,4 +143,7 @@ bool is_nx_huge_page_enabled(void); void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); +void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); + #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 5158d02b8925..e246d71b8ea2 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -273,6 +273,9 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, list_del(&sp->link); + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { old_child_spte = READ_ONCE(*(pt + i)); WRITE_ONCE(*(pt + i), 0); @@ -571,6 +574,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, !shadow_accessed_mask); trace_kvm_mmu_get_page(sp, true); + if (huge_page_disallowed && req_level >= iter.level) + account_huge_nx_page(vcpu->kvm, sp); + tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte); } } From 97fd734ba17e32463742c569137f54f713c27fe0 Mon Sep 17 00:00:00 2001 From: Abhi Das Date: Tue, 20 Oct 2020 15:58:04 -0500 Subject: [PATCH 495/497] gfs2: lookup local statfs inodes prior to journal recovery We need to lookup the master statfs inode and the local statfs inodes earlier in the mount process (in init_journal) so journal recovery can use them when it attempts to recover the statfs info. 
We lookup all the local statfs inodes and store them in a linked list to allow a node to recover statfs info for other nodes in the cluster. Signed-off-by: Abhi Das Signed-off-by: Andreas Gruenbacher --- fs/gfs2/incore.h | 8 +++ fs/gfs2/ops_fstype.c | 133 +++++++++++++++++++++++++++++++------------ fs/gfs2/super.c | 31 +++++++++- fs/gfs2/super.h | 3 + 4 files changed, 139 insertions(+), 36 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e34183e02a9e..d7707307f4b1 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -697,6 +697,13 @@ struct gfs2_pcpu_lkstats { struct gfs2_lkstats lkstats[10]; }; +/* List of local (per node) statfs inodes */ +struct local_statfs_inode { + struct list_head si_list; + struct inode *si_sc_inode; + unsigned int si_jid; /* journal id this statfs inode corresponds to */ +}; + struct gfs2_sbd { struct super_block *sd_vfs; struct gfs2_pcpu_lkstats __percpu *sd_lkstats; @@ -748,6 +755,7 @@ struct gfs2_sbd { struct inode *sd_jindex; struct inode *sd_statfs_inode; struct inode *sd_sc_inode; + struct list_head sd_sc_inodes_list; struct inode *sd_qc_inode; struct inode *sd_rindex; struct inode *sd_quota_inode; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 03c33fc03c05..7a7e3c10a9a9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -110,6 +110,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_trunc_lock); spin_lock_init(&sdp->sd_bitmap_lock); + INIT_LIST_HEAD(&sdp->sd_sc_inodes_list); + mapping = &sdp->sd_aspace; address_space_init_once(mapping); @@ -608,6 +610,90 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) return error; } +/** + * init_statfs - look up and initialize master and local (per node) statfs inodes + * @sdp: The GFS2 superblock + * + * This should be called after the jindex is initialized in init_journal() and + * before gfs2_journal_recovery() is called because we need to be able to write + * to these inodes during recovery. + * + * Returns: errno + */ +static int init_statfs(struct gfs2_sbd *sdp) +{ + int error = 0; + struct inode *master = d_inode(sdp->sd_master_dir); + struct inode *pn = NULL; + char buf[30]; + struct gfs2_jdesc *jd; + struct gfs2_inode *ip; + + sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); + if (IS_ERR(sdp->sd_statfs_inode)) { + error = PTR_ERR(sdp->sd_statfs_inode); + fs_err(sdp, "can't read in statfs inode: %d\n", error); + goto fail; + } + + pn = gfs2_lookup_simple(master, "per_node"); + if (IS_ERR(pn)) { + error = PTR_ERR(pn); + fs_err(sdp, "can't find per_node directory: %d\n", error); + goto put_statfs; + } + + /* For each jid, lookup the corresponding local statfs inode in the + * per_node metafs directory and save it in the sdp->sd_sc_inodes_list. 
*/ + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + struct local_statfs_inode *lsi = + kmalloc(sizeof(struct local_statfs_inode), GFP_NOFS); + if (!lsi) { + error = -ENOMEM; + goto free_local; + } + sprintf(buf, "statfs_change%u", jd->jd_jid); + lsi->si_sc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(lsi->si_sc_inode)) { + error = PTR_ERR(lsi->si_sc_inode); + fs_err(sdp, "can't find local \"sc\" file#%u: %d\n", + jd->jd_jid, error); + goto free_local; + } + lsi->si_jid = jd->jd_jid; + if (jd->jd_jid == sdp->sd_jdesc->jd_jid) + sdp->sd_sc_inode = lsi->si_sc_inode; + + list_add_tail(&lsi->si_list, &sdp->sd_sc_inodes_list); + } + + iput(pn); + ip = GFS2_I(sdp->sd_sc_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + &sdp->sd_sc_gh); + if (error) { + fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); + goto free_local; + } + return 0; + +free_local: + free_local_statfs_inodes(sdp); + iput(pn); +put_statfs: + iput(sdp->sd_statfs_inode); +fail: + return error; +} + +/* Uninitialize and free up memory used by the list of statfs inodes */ +static void uninit_statfs(struct gfs2_sbd *sdp) +{ + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + free_local_statfs_inodes(sdp); + iput(sdp->sd_statfs_inode); +} + static int init_journal(struct gfs2_sbd *sdp, int undo) { struct inode *master = d_inode(sdp->sd_master_dir); @@ -694,6 +780,11 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) } trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); + /* Lookup statfs inodes here so journal recovery can use them. */ + error = init_statfs(sdp); + if (error) + goto fail_jinode_gh; + if (sdp->sd_lockstruct.ls_first) { unsigned int x; for (x = 0; x < sdp->sd_journals; x++) { @@ -702,14 +793,14 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (sdp->sd_args.ar_spectator) { error = check_journal_clean(sdp, jd, true); if (error) - goto fail_jinode_gh; + goto fail_statfs; continue; } error = gfs2_recover_journal(jd, true); if (error) { fs_err(sdp, "error recovering journal %u: %d\n", x, error); - goto fail_jinode_gh; + goto fail_statfs; } } @@ -718,7 +809,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) error = gfs2_recover_journal(sdp->sd_jdesc, true); if (error) { fs_err(sdp, "error recovering my journal: %d\n", error); - goto fail_jinode_gh; + goto fail_statfs; } } @@ -729,6 +820,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) INIT_WORK(&sdp->sd_freeze_work, gfs2_freeze_func); return 0; +fail_statfs: + uninit_statfs(sdp); fail_jinode_gh: /* A withdraw may have done dq/uninit so now we need to check it */ if (!sdp->sd_args.ar_spectator && @@ -762,20 +855,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) if (error) goto fail; - /* Read in the master statfs inode */ - sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); - if (IS_ERR(sdp->sd_statfs_inode)) { - error = PTR_ERR(sdp->sd_statfs_inode); - fs_err(sdp, "can't read in statfs inode: %d\n", error); - goto fail_journal; - } - /* Read in the resource index inode */ sdp->sd_rindex = gfs2_lookup_simple(master, "rindex"); if (IS_ERR(sdp->sd_rindex)) { error = PTR_ERR(sdp->sd_rindex); fs_err(sdp, "can't get resource index inode: %d\n", error); - goto fail_statfs; + goto fail_journal; } sdp->sd_rindex_uptodate = 0; @@ -804,8 +889,6 @@ fail_qinode: fail_rindex: gfs2_clear_rgrpd(sdp); iput(sdp->sd_rindex); -fail_statfs: - iput(sdp->sd_statfs_inode); fail_journal: init_journal(sdp, UNDO); fail: @@ -833,14 +916,6 @@ static int init_per_node(struct gfs2_sbd *sdp, int 
undo) return error; } - sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); - sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); - if (IS_ERR(sdp->sd_sc_inode)) { - error = PTR_ERR(sdp->sd_sc_inode); - fs_err(sdp, "can't find local \"sc\" file: %d\n", error); - goto fail; - } - sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf); if (IS_ERR(sdp->sd_qc_inode)) { @@ -852,33 +927,21 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) iput(pn); pn = NULL; - ip = GFS2_I(sdp->sd_sc_inode); - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, - &sdp->sd_sc_gh); - if (error) { - fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); - goto fail_qc_i; - } - ip = GFS2_I(sdp->sd_qc_inode); error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_qc_gh); if (error) { fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); - goto fail_ut_gh; + goto fail_qc_i; } return 0; fail_qc_gh: gfs2_glock_dq_uninit(&sdp->sd_qc_gh); -fail_ut_gh: - gfs2_glock_dq_uninit(&sdp->sd_sc_gh); fail_qc_i: iput(sdp->sd_qc_inode); fail_ut_i: - iput(sdp->sd_sc_inode); -fail: iput(pn); return error; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index e17961ea994d..b285192bd6b3 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -729,7 +729,7 @@ restart: gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); gfs2_glock_dq_uninit(&sdp->sd_sc_gh); gfs2_glock_dq_uninit(&sdp->sd_qc_gh); - iput(sdp->sd_sc_inode); + free_local_statfs_inodes(sdp); iput(sdp->sd_qc_inode); } @@ -1561,6 +1561,35 @@ static void gfs2_free_inode(struct inode *inode) kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode)); } +extern void free_local_statfs_inodes(struct gfs2_sbd *sdp) +{ + struct local_statfs_inode *lsi, *safe; + + /* Run through the statfs inodes list to iput and free memory */ + list_for_each_entry_safe(lsi, safe, &sdp->sd_sc_inodes_list, si_list) { + if (lsi->si_jid == sdp->sd_jdesc->jd_jid) + sdp->sd_sc_inode = NULL; /* belongs to this node */ + if (lsi->si_sc_inode) + iput(lsi->si_sc_inode); + list_del(&lsi->si_list); + kfree(lsi); + } +} + +extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, + unsigned int index) +{ + struct local_statfs_inode *lsi; + + /* Return the local (per node) statfs inode in the + * sdp->sd_sc_inodes_list corresponding to the 'index'. */ + list_for_each_entry(lsi, &sdp->sd_sc_inodes_list, si_list) { + if (lsi->si_jid == index) + return lsi->si_sc_inode; + } + return NULL; +} + const struct super_operations gfs2_super_ops = { .alloc_inode = gfs2_alloc_inode, .free_inode = gfs2_free_inode, diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index ed4f5cb29074..c9fb2a654181 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -44,6 +44,9 @@ extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, extern int gfs2_statfs_sync(struct super_block *sb, int type); extern void gfs2_freeze_func(struct work_struct *work); +extern void free_local_statfs_inodes(struct gfs2_sbd *sdp); +extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, + unsigned int index); extern void free_sbd(struct gfs2_sbd *sdp); extern struct file_system_type gfs2_fs_type; From bedb0f056faa94e953e7b3da5a77d25e0008364b Mon Sep 17 00:00:00 2001 From: Abhi Das Date: Tue, 20 Oct 2020 15:58:05 -0500 Subject: [PATCH 496/497] gfs2: Recover statfs info in journal head Apply the outstanding statfs changes in the journal head to the master statfs file. Zero out the local statfs file for good measure. 
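For illustration only (not part of this patch), a tiny user-space sketch of the arithmetic performed when the statfs deltas recorded in the journal head are folded into the master counts; the struct and field names are made up for the example, not the real gfs2 types.

#include <stdio.h>

struct counts { long long total, free, dinodes; };

/* Fold a set of per-journal deltas into the master statfs counters. */
static void fold_deltas(struct counts *master, const struct counts *delta)
{
	master->total   += delta->total;
	master->free    += delta->free;
	master->dinodes += delta->dinodes;
}

int main(void)
{
	struct counts master = { 1000, 400, 50 };
	struct counts head_delta = { 0, -3, 1 };	/* as read from the journal head */

	fold_deltas(&master, &head_delta);
	printf("total=%lld free=%lld dinodes=%lld\n",
	       master.total, master.free, master.dinodes);
	return 0;
}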
Previously, statfs updates would be read in from the local statfs inode and synced to the master statfs inode during recovery. We now use the statfs updates in the journal head to update the master statfs inode instead of reading in from the local statfs inode. To preserve backward compatibility with kernels that can't do this, we still need to keep the local statfs inode up to date by writing changes to it. At some point in the future, we can do away with the local statfs inodes altogether and keep the statfs changes solely in the journal. Signed-off-by: Abhi Das Signed-off-by: Andreas Gruenbacher --- fs/gfs2/lops.c | 2 +- fs/gfs2/lops.h | 1 + fs/gfs2/recovery.c | 104 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index ed1da4323967..ed69298dd824 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -823,7 +823,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, * */ -static void gfs2_meta_sync(struct gfs2_glock *gl) +void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 9c5e4e491e03..4a3d8aecdf82 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -27,6 +27,7 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); +extern void gfs2_meta_sync(struct gfs2_glock *gl); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index a8bb17e355b8..b5cbe21efdfb 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -296,6 +296,109 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); } +/** + * update_statfs_inode - Update the master statfs inode or zero out the local + * statfs inode for a given journal. + * @jd: The journal + * @head: If NULL, @inode is the local statfs inode and we need to zero it out. + * Otherwise, @head contains the statfs change info that needs to be + * synced to the master statfs inode (pointed to by @inode). + * @inode: statfs inode to update.
+ */ +static int update_statfs_inode(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct inode *inode) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_inode *ip; + struct buffer_head *bh; + struct gfs2_statfs_change_host sc; + int error = 0; + + BUG_ON(!inode); + ip = GFS2_I(inode); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out; + + spin_lock(&sdp->sd_statfs_spin); + + if (head) { /* Update the master statfs inode */ + gfs2_statfs_change_in(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + sc.sc_total += head->lh_local_total; + sc.sc_free += head->lh_local_free; + sc.sc_dinodes += head->lh_local_dinodes; + gfs2_statfs_change_out(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + + fs_info(sdp, "jid=%u: Updated master statfs Total:%lld, " + "Free:%lld, Dinodes:%lld after change " + "[%+lld,%+lld,%+lld]\n", jd->jd_jid, sc.sc_total, + sc.sc_free, sc.sc_dinodes, head->lh_local_total, + head->lh_local_free, head->lh_local_dinodes); + } else { /* Zero out the local statfs inode */ + memset(bh->b_data + sizeof(struct gfs2_dinode), 0, + sizeof(struct gfs2_statfs_change)); + /* If it's our own journal, reset any in-memory changes too */ + if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { + memset(&sdp->sd_statfs_local, 0, + sizeof(struct gfs2_statfs_change_host)); + } + } + spin_unlock(&sdp->sd_statfs_spin); + + mark_buffer_dirty(bh); + brelse(bh); + gfs2_meta_sync(ip->i_gl); + +out: + return error; +} + +/** + * recover_local_statfs - Update the master and local statfs changes for this + * journal. + * + * Previously, statfs updates would be read in from the local statfs inode and + * synced to the master statfs inode during recovery. + * + * We now use the statfs updates in the journal head to update the master statfs + * inode instead of reading in from the local statfs inode. To preserve backward + * compatibility with kernels that can't do this, we still need to keep the + * local statfs inode up to date by writing changes to it. At some point in the + * future, we can do away with the local statfs inodes altogether and keep the + * statfs changes solely in the journal. + * + * @jd: the journal + * @head: the journal head + * + * Returns: errno + */ +static void recover_local_statfs(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head) +{ + int error; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (!head->lh_local_total && !head->lh_local_free + && !head->lh_local_dinodes) /* No change */ + goto zero_local; + + /* First update the master statfs inode with the changes we + * found in the journal. */ + error = update_statfs_inode(jd, head, sdp->sd_statfs_inode); + if (error) + goto out; + +zero_local: + /* Zero out the local statfs inode so any changes in there + * are not re-recovered. 
+	error = update_statfs_inode(jd, NULL,
+				    find_local_statfs_inode(sdp, jd->jd_jid));
+out:
+	return;
+}
+
 void gfs2_recover_func(struct work_struct *work)
 {
 	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
@@ -415,6 +518,7 @@ void gfs2_recover_func(struct work_struct *work)
 			goto fail_gunlock_thaw;
 	}
 
+	recover_local_statfs(jd, &head);
 	clean_journal(jd, &head);
 
 	up_read(&sdp->sd_log_flush_lock);

From 9c5743dff415a7384669229d327702ea9bd45560 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes
Date: Fri, 23 Oct 2020 22:31:54 +0200
Subject: [PATCH 497/497] x86/uaccess: fix code generation in put_user()

Quoting https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html:

You can define a local register variable and associate it with a
specified register...

The only supported use for this feature is to specify registers for
input and output operands when calling Extended asm (see Extended Asm).
This may be necessary if the constraints for a particular machine don't
provide sufficient control to select the desired register.

On 32-bit x86, this is used to ensure that gcc will put an 8-byte value
into the %edx:%eax pair, while all other cases will just use the single
register %eax (%rax on x86-64).

While the _ASM_AX actually just expands to "%eax", note this comment
next to get_user() which does something very similar:

 * The use of _ASM_DX as the register specifier is a bit of a
 * simplification, as gcc only cares about it as the starting point
 * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
 * (%ecx being the next register in gcc's x86 register sequence), and
 * %rdx on 64 bits.

However, getting this to work requires that there is no code between
the assignment to the local register variable and its use as an input
to the asm() which can possibly clobber any of the registers involved -
including evaluation of the expressions making up other inputs.

In the current code, the ptr expression used directly as an input may
cause such code to be emitted. For example, Sean Christopherson
observed that with KASAN enabled and ptr being current->set_child_tid
(from schedule_tail()), the load of current->set_child_tid causes a
call to __asan_load8() to be emitted immediately prior to the
__put_user_4 call, and Naresh Kamboju reports that various mmstress
tests fail on KASAN-enabled builds.

It's also possible to synthesize a broken case without KASAN if one
uses "foo()" as the ptr argument, with foo being some
"extern u64 __user *foo(void);" (though I don't know if that appears in
real code).

Fix it by making sure ptr gets evaluated before the assignment to
__val_pu, and add a comment that __val_pu must be the last thing
computed before the asm() is entered.
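
To make the ordering rule concrete, here is a minimal user-space sketch
(an editor's illustration, not code from this patch), assuming GCC or
Clang on x86-64; demo_put() and noisy_ptr() are invented names standing
in for do_put_user_call() and a KASAN-instrumented pointer expression:

#include <stdio.h>

static long *noisy_ptr(long *p)
{
	/* Stand-in for KASAN instrumentation or any other helper call
	 * that may clobber caller-saved registers such as %rax. */
	printf("evaluating the ptr expression\n");
	return p;
}

/*
 * Evaluate ptr into an ordinary temporary first; bind the value to the
 * register variable last, immediately before the asm() that consumes
 * it, mirroring the __ptr_pu/__val_pu ordering introduced below.
 */
#define demo_put(x, ptr)						\
({									\
	long *__p = (ptr);						\
	register long __val asm("rax") = (x);				\
	asm volatile("movq %1, (%0)"					\
		     : : "r" (__p), "r" (__val) : "memory");		\
})

int main(void)
{
	long dst = 0;

	demo_put(42L, noisy_ptr(&dst));
	printf("dst = %ld\n", dst);	/* prints 42 */
	return 0;
}

Swapping the first two lines of the macro body recreates the hazard:
the call generated for the ptr expression can then land between the
binding of __val to %rax and the asm() that uses it, which is exactly
the clobbering scenario described above.
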
Cc: Sean Christopherson
Reported-by: Naresh Kamboju
Tested-by: Naresh Kamboju
Fixes: d55564cfc222 ("x86: Make __put_user() generate an out-of-line call")
Signed-off-by: Rasmus Villemoes
Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/uaccess.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index f13659523108..c9fa7be3df82 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -208,16 +208,24 @@ extern void __put_user_nocheck_2(void);
 extern void __put_user_nocheck_4(void);
 extern void __put_user_nocheck_8(void);
 
+/*
+ * ptr must be evaluated and assigned to the temporary __ptr_pu before
+ * the assignment of x to __val_pu, to avoid any function calls
+ * involved in the ptr expression (possibly implicitly generated due
+ * to KASAN) from clobbering %ax.
+ */
 #define do_put_user_call(fn,x,ptr)					\
 ({									\
 	int __ret_pu;							\
+	void __user *__ptr_pu;						\
 	register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX);		\
 	__chk_user_ptr(ptr);						\
+	__ptr_pu = (ptr);						\
 	__val_pu = (x);							\
 	asm volatile("call __" #fn "_%P[size]"				\
 		     : "=c" (__ret_pu),					\
 			ASM_CALL_CONSTRAINT				\
-		     : "0" (ptr),					\
+		     : "0" (__ptr_pu),					\
 		       "r" (__val_pu),					\
 		       [size] "i" (sizeof(*(ptr)))			\
 		     :"ebx");						\
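
With the patch applied, one way to sanity-check the result (an editor's
suggestion, not part of the patch) is to build a KASAN-enabled kernel and
disassemble a caller of put_user(), for example with objdump -d: no call
instruction (such as a call to __asan_load8()) should appear between the
final move of the value into %eax/%rax and the __put_user_* call. The same
kind of check can be run on the user-space sketch above with gcc -O2 -S.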