From 778fbf4179991e7652e97d7f1ca1f657ef828422 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Wed, 1 Apr 2020 14:23:29 -0700 Subject: [PATCH 001/300] HID: wacom: Read HID_DG_CONTACTMAX directly for non-generic devices We've recently switched from extracting the value of HID_DG_CONTACTMAX at a fixed offset (which may not be correct for all tablets) to injecting the report into the driver for the generic codepath to handle. Unfortunately, this change was made for *all* tablets, even those which aren't generic. Because `wacom_wac_report` ignores reports from non- generic devices, the contact count never gets initialized. Ultimately this results in the touch device itself failing to probe, and thus the loss of touch input. This commit adds back the fixed-offset extraction for non-generic devices. Link: https://github.com/linuxwacom/input-wacom/issues/155 Fixes: 184eccd40389 ("HID: wacom: generic: read HID_DG_CONTACTMAX from any feature report") Signed-off-by: Jason Gerecke Reviewed-by: Aaron Armstrong Skomra CC: stable@vger.kernel.org # 5.3+ Signed-off-by: Benjamin Tissoires --- drivers/hid/wacom_sys.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 5ded94b7bf68..cd71e7133944 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -319,9 +319,11 @@ static void wacom_feature_mapping(struct hid_device *hdev, data[0] = field->report->id; ret = wacom_get_report(hdev, HID_FEATURE_REPORT, data, n, WAC_CMD_RETRIES); - if (ret == n) { + if (ret == n && features->type == HID_GENERIC) { ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT, data, n, 0); + } else if (ret == 2 && features->type != HID_GENERIC) { + features->touch_max = data[1]; } else { features->touch_max = 16; hid_warn(hdev, "wacom_feature_mapping: " From 8d97fb393c5cbae23389317615f2bf30a559ed17 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Apr 2020 00:53:47 -0700 Subject: [PATCH 002/300] gcc-plugins/stackleak: Avoid assignment for unused macro argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With GCC version >= 8, the cgraph_create_edge() macro argument using "frequency" goes unused. Instead of assigning a temporary variable for the argument, pass the compute_call_stmt_bb_frequency() call directly as the macro argument so that it will just not be called when it is not wanted by the macros. Silences the warning: scripts/gcc-plugins/stackleak_plugin.c:54:6: warning: variable ‘frequency’ set but not used [-Wunused-but-set-variable] Now builds cleanly with gcc-7 and gcc-9. Both boot and pass STACKLEAK_ERASING LKDTM test. Signed-off-by: Kees Cook --- scripts/gcc-plugins/stackleak_plugin.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c index dbd37460c573..cc75eeba0be1 100644 --- a/scripts/gcc-plugins/stackleak_plugin.c +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -51,7 +51,6 @@ static void stackleak_add_track_stack(gimple_stmt_iterator *gsi, bool after) gimple stmt; gcall *stackleak_track_stack; cgraph_node_ptr node; - int frequency; basic_block bb; /* Insert call to void stackleak_track_stack(void) */ @@ -68,9 +67,9 @@ static void stackleak_add_track_stack(gimple_stmt_iterator *gsi, bool after) bb = gimple_bb(stackleak_track_stack); node = cgraph_get_create_node(track_function_decl); gcc_assert(node); - frequency = compute_call_stmt_bb_frequency(current_function_decl, bb); cgraph_create_edge(cgraph_get_node(current_function_decl), node, - stackleak_track_stack, bb->count, frequency); + stackleak_track_stack, bb->count, + compute_call_stmt_bb_frequency(current_function_decl, bb)); } static bool is_alloca(gimple stmt) From c7527373fe28f97d8a196ab562db5589be0d34b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Pierret=20=28fepitre=29?= Date: Tue, 7 Apr 2020 13:32:59 +0200 Subject: [PATCH 003/300] gcc-common.h: Update for GCC 10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove "params.h" include, which has been dropped in GCC 10. Remove is_a_helper() macro, which is now defined in gimple.h, as seen when running './scripts/gcc-plugin.sh g++ g++ gcc': In file included from :1: ./gcc-plugins/gcc-common.h:852:13: error: redefinition of ‘static bool is_a_helper::test(U*) [with U = const gimple; T = const ggoto*]’ 852 | inline bool is_a_helper::test(const_gimple gs) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from ./gcc-plugins/gcc-common.h:125, from :1: /usr/lib/gcc/x86_64-redhat-linux/10/plugin/include/gimple.h:1037:1: note: ‘static bool is_a_helper::test(U*) [with U = const gimple; T = const ggoto*]’ previously declared here 1037 | is_a_helper ::test (const gimple *gs) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ Add -Wno-format-diag to scripts/gcc-plugins/Makefile to avoid meaningless warnings from error() formats used by plugins: scripts/gcc-plugins/structleak_plugin.c: In function ‘int plugin_init(plugin_name_args*, plugin_gcc_version*)’: scripts/gcc-plugins/structleak_plugin.c:253:12: warning: unquoted sequence of 2 consecutive punctuation characters ‘'-’ in format [-Wformat-diag] 253 | error(G_("unknown option '-fplugin-arg-%s-%s'"), plugin_name, argv[i].key); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Frédéric Pierret (fepitre) Link: https://lore.kernel.org/r/20200407113259.270172-1-frederic.pierret@qubes-os.org [kees: include -Wno-format-diag for plugin builds] Signed-off-by: Kees Cook --- scripts/gcc-plugins/Makefile | 1 + scripts/gcc-plugins/gcc-common.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/scripts/gcc-plugins/Makefile b/scripts/gcc-plugins/Makefile index f22858b2c3d6..80f354289eeb 100644 --- a/scripts/gcc-plugins/Makefile +++ b/scripts/gcc-plugins/Makefile @@ -4,6 +4,7 @@ GCC_PLUGINS_DIR := $(shell $(CC) -print-file-name=plugin) HOST_EXTRACXXFLAGS += -I$(GCC_PLUGINS_DIR)/include -I$(src) -std=gnu++98 -fno-rtti HOST_EXTRACXXFLAGS += -fno-exceptions -fasynchronous-unwind-tables -ggdb HOST_EXTRACXXFLAGS += -Wno-narrowing -Wno-unused-variable -Wno-c++11-compat +HOST_EXTRACXXFLAGS += -Wno-format-diag $(obj)/randomize_layout_plugin.o: $(objtree)/$(obj)/randomize_layout_seed.h quiet_cmd_create_randomize_layout_seed = GENSEED $@ diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h index 17f06079a712..9ad76b7f3f10 100644 --- a/scripts/gcc-plugins/gcc-common.h +++ b/scripts/gcc-plugins/gcc-common.h @@ -35,7 +35,9 @@ #include "ggc.h" #include "timevar.h" +#if BUILDING_GCC_VERSION < 10000 #include "params.h" +#endif #if BUILDING_GCC_VERSION <= 4009 #include "pointer-set.h" @@ -847,6 +849,7 @@ static inline gimple gimple_build_assign_with_ops(enum tree_code subcode, tree l return gimple_build_assign(lhs, subcode, op1, op2 PASS_MEM_STAT); } +#if BUILDING_GCC_VERSION < 10000 template <> template <> inline bool is_a_helper::test(const_gimple gs) @@ -860,6 +863,7 @@ inline bool is_a_helper::test(const_gimple gs) { return gs->code == GIMPLE_RETURN; } +#endif static inline gasm *as_a_gasm(gimple stmt) { From f9e82295eec141a0569649d400d249333d74aa91 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Mon, 13 Apr 2020 18:02:37 +0200 Subject: [PATCH 004/300] HID: multitouch: add eGalaxTouch P80H84 support Add support for P80H84 touchscreen from eGalaxy: idVendor 0x0eef D-WAV Scientific Co., Ltd idProduct 0xc002 iManufacturer 1 eGalax Inc. iProduct 2 eGalaxTouch P80H84 2019 vDIVA_1204_T01 k4.02.146 Signed-off-by: Sebastian Reichel Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-multitouch.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index b18b13147a6f..68d46e818dc2 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -385,6 +385,7 @@ #define USB_DEVICE_ID_DWAV_EGALAX_MULTITOUCH_7349 0x7349 #define USB_DEVICE_ID_DWAV_EGALAX_MULTITOUCH_73F7 0x73f7 #define USB_DEVICE_ID_DWAV_EGALAX_MULTITOUCH_A001 0xa001 +#define USB_DEVICE_ID_DWAV_EGALAX_MULTITOUCH_C002 0xc002 #define USB_VENDOR_ID_ELAN 0x04f3 #define USB_DEVICE_ID_TOSHIBA_CLICK_L9W 0x0401 diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 362805ddf377..03c720b47306 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1922,6 +1922,9 @@ static const struct hid_device_id mt_devices[] = { { .driver_data = MT_CLS_EGALAX_SERIAL, MT_USB_DEVICE(USB_VENDOR_ID_DWAV, USB_DEVICE_ID_DWAV_EGALAX_MULTITOUCH_A001) }, + { .driver_data = MT_CLS_EGALAX, + MT_USB_DEVICE(USB_VENDOR_ID_DWAV, + USB_DEVICE_ID_DWAV_EGALAX_MULTITOUCH_C002) }, /* Elitegroup panel */ { .driver_data = MT_CLS_SERIAL, From b1bd0f75288f60e8d142a1b3e979ed0192c04931 Mon Sep 17 00:00:00 2001 From: Fabian Schindlatz Date: Mon, 13 Apr 2020 18:46:28 +0200 Subject: [PATCH 005/300] HID: logitech: Add support for Logitech G11 extra keys The Logitech G11 keyboard is a cheap variant of the G15 without the LCD screen. It uses the same layout for its extra and macro keys (G1 - G18, M1-M3, MR) and - from the input subsystem's perspective - behaves just like the G15, so we can treat it as such. Tested it with my own keyboard. Signed-off-by: Fabian Schindlatz Reviewed-by: Hans de Goede Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-lg-g15.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 68d46e818dc2..7ccbd58a324e 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -760,6 +760,7 @@ #define USB_DEVICE_ID_LOGITECH_RUMBLEPAD2 0xc218 #define USB_DEVICE_ID_LOGITECH_RUMBLEPAD2_2 0xc219 #define USB_DEVICE_ID_LOGITECH_G15_LCD 0xc222 +#define USB_DEVICE_ID_LOGITECH_G11 0xc225 #define USB_DEVICE_ID_LOGITECH_G15_V2_LCD 0xc227 #define USB_DEVICE_ID_LOGITECH_G510 0xc22d #define USB_DEVICE_ID_LOGITECH_G510_USB_AUDIO 0xc22e diff --git a/drivers/hid/hid-lg-g15.c b/drivers/hid/hid-lg-g15.c index ad4b5412a9f4..ef0cbcd7540d 100644 --- a/drivers/hid/hid-lg-g15.c +++ b/drivers/hid/hid-lg-g15.c @@ -872,6 +872,10 @@ error_hw_stop: } static const struct hid_device_id lg_g15_devices[] = { + /* The G11 is a G15 without the LCD, treat it as a G15 */ + { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, + USB_DEVICE_ID_LOGITECH_G11), + .driver_data = LG_G15 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G15_LCD), .driver_data = LG_G15 }, From 640e403b1fd24e7f31ac6f29f0b6a21d285ed729 Mon Sep 17 00:00:00 2001 From: Artem Borisov Date: Mon, 6 Apr 2020 03:55:15 +0400 Subject: [PATCH 006/300] HID: alps: Add AUI1657 device ID This device is used on Lenovo V130-15IKB variants and uses the same registers as U1. Signed-off-by: Artem Borisov Signed-off-by: Jiri Kosina --- drivers/hid/hid-alps.c | 1 + drivers/hid/hid-ids.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-alps.c b/drivers/hid/hid-alps.c index fa704153cb00..c2a2bd528890 100644 --- a/drivers/hid/hid-alps.c +++ b/drivers/hid/hid-alps.c @@ -802,6 +802,7 @@ static int alps_probe(struct hid_device *hdev, const struct hid_device_id *id) break; case HID_DEVICE_ID_ALPS_U1_DUAL: case HID_DEVICE_ID_ALPS_U1: + case HID_DEVICE_ID_ALPS_1657: data->dev_type = U1; break; default: diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 7ccbd58a324e..e3e2fa6733fb 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -81,7 +81,7 @@ #define HID_DEVICE_ID_ALPS_U1 0x1215 #define HID_DEVICE_ID_ALPS_T4_BTNLESS 0x120C #define HID_DEVICE_ID_ALPS_1222 0x1222 - +#define HID_DEVICE_ID_ALPS_1657 0x121E #define USB_VENDOR_ID_AMI 0x046b #define USB_DEVICE_ID_AMI_VIRT_KEYBOARD_AND_MOUSE 0xff10 From 185af3e775b693f773d9a4b5a8c3cda69fc8ca0f Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 15 Apr 2020 14:51:42 +0200 Subject: [PATCH 007/300] HID: alps: ALPS_1657 is too specific; use U1_UNICORN_LEGACY instead HID_DEVICE_ID_ALPS_1657 PID is too specific, as there are many other ALPS hardware IDs using this particular touchpad. Rename the identifier to HID_DEVICE_ID_ALPS_U1_UNICORN_LEGACY in order to describe reality better. Fixes: 640e403b1fd24 ("HID: alps: Add AUI1657 device ID") Reported-by: Xiaojian Cao Signed-off-by: Jiri Kosina --- drivers/hid/hid-alps.c | 2 +- drivers/hid/hid-ids.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-alps.c b/drivers/hid/hid-alps.c index c2a2bd528890..b2ad319a74b9 100644 --- a/drivers/hid/hid-alps.c +++ b/drivers/hid/hid-alps.c @@ -802,7 +802,7 @@ static int alps_probe(struct hid_device *hdev, const struct hid_device_id *id) break; case HID_DEVICE_ID_ALPS_U1_DUAL: case HID_DEVICE_ID_ALPS_U1: - case HID_DEVICE_ID_ALPS_1657: + case HID_DEVICE_ID_ALPS_U1_UNICORN_LEGACY: data->dev_type = U1; break; default: diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index e3e2fa6733fb..6eb25b9e8575 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -79,9 +79,9 @@ #define HID_DEVICE_ID_ALPS_U1_DUAL_PTP 0x121F #define HID_DEVICE_ID_ALPS_U1_DUAL_3BTN_PTP 0x1220 #define HID_DEVICE_ID_ALPS_U1 0x1215 +#define HID_DEVICE_ID_ALPS_U1_UNICORN_LEGACY 0x121E #define HID_DEVICE_ID_ALPS_T4_BTNLESS 0x120C #define HID_DEVICE_ID_ALPS_1222 0x1222 -#define HID_DEVICE_ID_ALPS_1657 0x121E #define USB_VENDOR_ID_AMI 0x046b #define USB_DEVICE_ID_AMI_VIRT_KEYBOARD_AND_MOUSE 0xff10 From 1c32ca5dc6d00012f0c964e5fdd7042fcc71efb1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 14 Apr 2020 15:10:08 +0100 Subject: [PATCH 008/300] KVM: arm: vgic: Fix limit condition when writing to GICD_I[CS]ACTIVER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When deciding whether a guest has to be stopped we check whether this is a private interrupt or not. Unfortunately, there's an off-by-one bug here, and we fail to recognize a whole range of interrupts as being global (GICv2 SPIs 32-63). Fix the condition from > to be >=. Cc: stable@vger.kernel.org Fixes: abd7229626b93 ("KVM: arm/arm64: Simplify active_change_prepare and plug race") Reported-by: André Przywara Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 2199302597fa..d085e047953f 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -444,7 +444,7 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) { if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || - intid > VGIC_NR_PRIVATE_IRQS) + intid >= VGIC_NR_PRIVATE_IRQS) kvm_arm_halt_guest(vcpu->kvm); } @@ -452,7 +452,7 @@ static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) { if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || - intid > VGIC_NR_PRIVATE_IRQS) + intid >= VGIC_NR_PRIVATE_IRQS) kvm_arm_resume_guest(vcpu->kvm); } From b43f977dd281945960c26b3ef67bba0fa07d39d9 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Wed, 8 Apr 2020 07:58:37 -0700 Subject: [PATCH 009/300] Revert "HID: wacom: generic: read the number of expected touches on a per collection basis" This reverts commit 15893fa40109f5e7c67eeb8da62267d0fdf0be9d. The referenced commit broke pen and touch input for a variety of devices such as the Cintiq Pro 32. Affected devices may appear to work normally for a short amount of time, but eventually loose track of actual touch state and can leave touch arbitration enabled which prevents the pen from working. The commit is not itself required for any currently-available Bluetooth device, and so we revert it to correct the behavior of broken devices. This breakage occurs due to a mismatch between the order of collections and the order of usages on some devices. This commit tries to read the contact count before processing events, but will fail if the contact count does not occur prior to the first logical finger collection. This is the case for devices like the Cintiq Pro 32 which place the contact count at the very end of the report. Without the contact count set, touches will only be partially processed. The `wacom_wac_finger_slot` function will not open any slots since the number of contacts seen is greater than the expectation of 0, but we will still end up calling `input_mt_sync_frame` for each finger anyway. This can cause problems for userspace separate from the issue currently taking place in the kernel. Only once all of the individual finger collections have been processed do we finally get to the enclosing collection which contains the contact count. The value ends up being used for the *next* report, however. This delayed use of the contact count can cause the driver to loose track of the actual touch state and believe that there are contacts down when there aren't. This leaves touch arbitration enabled and prevents the pen from working. It can also cause userspace to incorrectly treat single- finger input as gestures. Link: https://github.com/linuxwacom/input-wacom/issues/146 Signed-off-by: Jason Gerecke Reviewed-by: Aaron Armstrong Skomra Fixes: 15893fa40109 ("HID: wacom: generic: read the number of expected touches on a per collection basis") Cc: stable@vger.kernel.org # 5.3+ Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 79 +++++++++-------------------------------- 1 file changed, 16 insertions(+), 63 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index d99a9d407671..96d00eba99c0 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -2637,9 +2637,25 @@ static void wacom_wac_finger_pre_report(struct hid_device *hdev, case HID_DG_TIPSWITCH: hid_data->last_slot_field = equivalent_usage; break; + case HID_DG_CONTACTCOUNT: + hid_data->cc_report = report->id; + hid_data->cc_index = i; + hid_data->cc_value_index = j; + break; } } } + + if (hid_data->cc_report != 0 && + hid_data->cc_index >= 0) { + struct hid_field *field = report->field[hid_data->cc_index]; + int value = field->value[hid_data->cc_value_index]; + if (value) + hid_data->num_expected = value; + } + else { + hid_data->num_expected = wacom_wac->features.touch_max; + } } static void wacom_wac_finger_report(struct hid_device *hdev, @@ -2649,7 +2665,6 @@ static void wacom_wac_finger_report(struct hid_device *hdev, struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct input_dev *input = wacom_wac->touch_input; unsigned touch_max = wacom_wac->features.touch_max; - struct hid_data *hid_data = &wacom_wac->hid_data; /* If more packets of data are expected, give us a chance to * process them rather than immediately syncing a partial @@ -2663,7 +2678,6 @@ static void wacom_wac_finger_report(struct hid_device *hdev, input_sync(input); wacom_wac->hid_data.num_received = 0; - hid_data->num_expected = 0; /* keep touch state for pen event */ wacom_wac->shared->touch_down = wacom_wac_finger_count_touches(wacom_wac); @@ -2738,73 +2752,12 @@ static void wacom_report_events(struct hid_device *hdev, } } -static void wacom_set_num_expected(struct hid_device *hdev, - struct hid_report *report, - int collection_index, - struct hid_field *field, - int field_index) -{ - struct wacom *wacom = hid_get_drvdata(hdev); - struct wacom_wac *wacom_wac = &wacom->wacom_wac; - struct hid_data *hid_data = &wacom_wac->hid_data; - unsigned int original_collection_level = - hdev->collection[collection_index].level; - bool end_collection = false; - int i; - - if (hid_data->num_expected) - return; - - // find the contact count value for this segment - for (i = field_index; i < report->maxfield && !end_collection; i++) { - struct hid_field *field = report->field[i]; - unsigned int field_level = - hdev->collection[field->usage[0].collection_index].level; - unsigned int j; - - if (field_level != original_collection_level) - continue; - - for (j = 0; j < field->maxusage; j++) { - struct hid_usage *usage = &field->usage[j]; - - if (usage->collection_index != collection_index) { - end_collection = true; - break; - } - if (wacom_equivalent_usage(usage->hid) == HID_DG_CONTACTCOUNT) { - hid_data->cc_report = report->id; - hid_data->cc_index = i; - hid_data->cc_value_index = j; - - if (hid_data->cc_report != 0 && - hid_data->cc_index >= 0) { - - struct hid_field *field = - report->field[hid_data->cc_index]; - int value = - field->value[hid_data->cc_value_index]; - - if (value) - hid_data->num_expected = value; - } - } - } - } - - if (hid_data->cc_report == 0 || hid_data->cc_index < 0) - hid_data->num_expected = wacom_wac->features.touch_max; -} - static int wacom_wac_collection(struct hid_device *hdev, struct hid_report *report, int collection_index, struct hid_field *field, int field_index) { struct wacom *wacom = hid_get_drvdata(hdev); - if (WACOM_FINGER_FIELD(field)) - wacom_set_num_expected(hdev, report, collection_index, field, - field_index); wacom_report_events(hdev, report, collection_index, field_index); /* From 2890ac993daa7bdcc57a92da2f07bcdec243666b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 1 Apr 2020 12:25:05 +0100 Subject: [PATCH 010/300] KVM: arm64: PSCI: Narrow input registers when using 32bit functions When a guest delibarately uses an SMC32 function number (which is allowed), we should make sure we drop the top 32bits from the input arguments, as they could legitimately be junk. Reported-by: Christoffer Dall Reviewed-by: Christoffer Dall Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- virt/kvm/arm/psci.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c index 14a162e295a9..3772717efe3e 100644 --- a/virt/kvm/arm/psci.c +++ b/virt/kvm/arm/psci.c @@ -186,6 +186,18 @@ static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET); } +static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) +{ + int i; + + /* + * Zero the input registers' upper 32 bits. They will be fully + * zeroed on exit, so we're fine changing them in place. + */ + for (i = 1; i < 4; i++) + vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i))); +} + static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; @@ -210,12 +222,16 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) val = PSCI_RET_SUCCESS; break; case PSCI_0_2_FN_CPU_ON: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; case PSCI_0_2_FN64_CPU_ON: mutex_lock(&kvm->lock); val = kvm_psci_vcpu_on(vcpu); mutex_unlock(&kvm->lock); break; case PSCI_0_2_FN_AFFINITY_INFO: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; case PSCI_0_2_FN64_AFFINITY_INFO: val = kvm_psci_vcpu_affinity_info(vcpu); break; From fdc9999e20cd038d9d5bf54496c1f0226bc2b64e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 1 Apr 2020 12:38:49 +0100 Subject: [PATCH 011/300] KVM: arm64: PSCI: Forbid 64bit functions for 32bit guests Implementing (and even advertising) 64bit PSCI functions to 32bit guests is at least a bit odd, if not altogether violating the spec which says ("5.2.1 Register usage in arguments and return values"): "Adherence to the SMC Calling Conventions implies that any AArch32 caller of an SMC64 function will get a return code of 0xFFFFFFFF(int32). This matches the NOT_SUPPORTED error code used in PSCI" Tighten the implementation by pretending these functions are not there for 32bit guests. Reviewed-by: Christoffer Dall Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier --- virt/kvm/arm/psci.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c index 3772717efe3e..ae364716ee40 100644 --- a/virt/kvm/arm/psci.c +++ b/virt/kvm/arm/psci.c @@ -198,6 +198,21 @@ static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i))); } +static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn) +{ + switch(fn) { + case PSCI_0_2_FN64_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_ON: + case PSCI_0_2_FN64_AFFINITY_INFO: + /* Disallow these functions for 32bit guests */ + if (vcpu_mode_is_32bit(vcpu)) + return PSCI_RET_NOT_SUPPORTED; + break; + } + + return 0; +} + static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; @@ -205,6 +220,10 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) unsigned long val; int ret = 1; + val = kvm_psci_check_allowed_function(vcpu, psci_fn); + if (val) + goto out; + switch (psci_fn) { case PSCI_0_2_FN_PSCI_VERSION: /* @@ -272,6 +291,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) break; } +out: smccc_set_retval(vcpu, val, 0, 0, 0); return ret; } @@ -289,6 +309,10 @@ static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu) break; case PSCI_1_0_FN_PSCI_FEATURES: feature = smccc_get_arg1(vcpu); + val = kvm_psci_check_allowed_function(vcpu, feature); + if (val) + break; + switch(feature) { case PSCI_0_2_FN_PSCI_VERSION: case PSCI_0_2_FN_CPU_SUSPEND: From 4dbccb873f2b35ad1b26419ff88c80509e2d4cbb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 7 Apr 2020 12:30:52 +0300 Subject: [PATCH 012/300] platform/x86: surface3_power: Fix a NULL vs IS_ERR() check in probe The i2c_acpi_new_device() function never returns NULL, it returns error pointers. Fixes: b1f81b496b0d ("platform/x86: surface3_power: MSHW0011 rev-eng implementation") Signed-off-by: Dan Carpenter Signed-off-by: Andy Shevchenko --- drivers/platform/x86/surface3_power.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/surface3_power.c b/drivers/platform/x86/surface3_power.c index 946ac2dc08ae..cc4f9cba6856 100644 --- a/drivers/platform/x86/surface3_power.c +++ b/drivers/platform/x86/surface3_power.c @@ -522,8 +522,8 @@ static int mshw0011_probe(struct i2c_client *client) strlcpy(board_info.type, "MSHW0011-bat0", I2C_NAME_SIZE); bat0 = i2c_acpi_new_device(dev, 1, &board_info); - if (!bat0) - return -ENOMEM; + if (IS_ERR(bat0)) + return PTR_ERR(bat0); data->bat0 = bat0; i2c_set_clientdata(bat0, data); From 713df99a9ef0133c09ad05bac10cf7d0163cd272 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 2 Apr 2020 15:15:49 +0800 Subject: [PATCH 013/300] platform/x86: wmi: Make two functions static Fix sparse warnings: drivers/platform/x86/xiaomi-wmi.c:26:5: warning: symbol 'xiaomi_wmi_probe' was not declared. Should it be static? drivers/platform/x86/xiaomi-wmi.c:51:6: warning: symbol 'xiaomi_wmi_notify' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Andy Shevchenko --- drivers/platform/x86/xiaomi-wmi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/xiaomi-wmi.c b/drivers/platform/x86/xiaomi-wmi.c index 601cbb282f54..54a2546bb93b 100644 --- a/drivers/platform/x86/xiaomi-wmi.c +++ b/drivers/platform/x86/xiaomi-wmi.c @@ -23,7 +23,7 @@ struct xiaomi_wmi { unsigned int key_code; }; -int xiaomi_wmi_probe(struct wmi_device *wdev, const void *context) +static int xiaomi_wmi_probe(struct wmi_device *wdev, const void *context) { struct xiaomi_wmi *data; @@ -48,7 +48,7 @@ int xiaomi_wmi_probe(struct wmi_device *wdev, const void *context) return input_register_device(data->input_dev); } -void xiaomi_wmi_notify(struct wmi_device *wdev, union acpi_object *dummy) +static void xiaomi_wmi_notify(struct wmi_device *wdev, union acpi_object *dummy) { struct xiaomi_wmi *data; From f585c9d5436a2dbedcd6c581bcabd41d7e372e21 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 8 Apr 2020 10:42:00 +0800 Subject: [PATCH 014/300] platform/x86/intel-uncore-freq: make uncore_root_kobj static Fix the following sparse warning: drivers/platform/x86/intel-uncore-frequency.c:56:16: warning: symbol 'uncore_root_kobj' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Jason Yan Acked-by: Srinivas Pandruvada Signed-off-by: Andy Shevchenko --- drivers/platform/x86/intel-uncore-frequency.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel-uncore-frequency.c b/drivers/platform/x86/intel-uncore-frequency.c index b96d172eb2c1..12d5ab7e1f5d 100644 --- a/drivers/platform/x86/intel-uncore-frequency.c +++ b/drivers/platform/x86/intel-uncore-frequency.c @@ -53,7 +53,7 @@ static int uncore_max_entries __read_mostly; /* Storage for uncore data for all instances */ static struct uncore_data *uncore_instances; /* Root of the all uncore sysfs kobjs */ -struct kobject *uncore_root_kobj; +static struct kobject *uncore_root_kobj; /* Stores the CPU mask of the target CPUs to use during uncore read/write */ static cpumask_t uncore_cpu_mask; /* CPU online callback register instance */ From fd0c42c4dea54335967c5a86f15fc064235a2797 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Sun, 8 Mar 2020 09:44:59 -0400 Subject: [PATCH 015/300] batman-adv: fix batadv_nc_random_weight_tq and change to pseudorandom numbers, as this is a traffic dithering operation that doesn't need crypto-grade. The previous code operated in 4 steps: 1. Generate a random byte 0 <= rand_tq <= 255 2. Multiply it by BATADV_TQ_MAX_VALUE - tq 3. Divide by 255 (= BATADV_TQ_MAX_VALUE) 4. Return BATADV_TQ_MAX_VALUE - rand_tq This would apperar to scale (BATADV_TQ_MAX_VALUE - tq) by a random value between 0/255 and 255/255. But! The intermediate value between steps 3 and 4 is stored in a u8 variable. So it's truncated, and most of the time, is less than 255, after which the division produces 0. Specifically, if tq is odd, the product is always even, and can never be 255. If tq is even, there's exactly one random byte value that will produce a product byte of 255. Thus, the return value is 255 (511/512 of the time) or 254 (1/512 of the time). If we assume that the truncation is a bug, and the code is meant to scale the input, a simpler way of looking at it is that it's returning a random value between tq and BATADV_TQ_MAX_VALUE, inclusive. Well, we have an optimized function for doing just that. Fixes: 3c12de9a5c75 ("batman-adv: network coding - code and transmit packets if possible") Signed-off-by: George Spelvin Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/network-coding.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 8f0717c3f7b5..b0469d15da0e 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1009,15 +1009,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, */ static u8 batadv_nc_random_weight_tq(u8 tq) { - u8 rand_val, rand_tq; - - get_random_bytes(&rand_val, sizeof(rand_val)); - /* randomize the estimated packet loss (max TQ - estimated TQ) */ - rand_tq = rand_val * (BATADV_TQ_MAX_VALUE - tq); - - /* normalize the randomized packet loss */ - rand_tq /= BATADV_TQ_MAX_VALUE; + u8 rand_tq = prandom_u32_max(BATADV_TQ_MAX_VALUE + 1 - tq); /* convert to (randomized) estimated tq again */ return BATADV_TQ_MAX_VALUE - rand_tq; From f872de8185acf1b48b954ba5bd8f9bc0a0d14016 Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Wed, 15 Apr 2020 16:31:50 +0800 Subject: [PATCH 016/300] batman-adv: Fix refcnt leak in batadv_show_throughput_override batadv_show_throughput_override() invokes batadv_hardif_get_by_netdev(), which gets a batadv_hard_iface object from net_dev with increased refcnt and its reference is assigned to a local pointer 'hard_iface'. When batadv_show_throughput_override() returns, "hard_iface" becomes invalid, so the refcount should be decreased to keep refcount balanced. The issue happens in the normal path of batadv_show_throughput_override(), which forgets to decrease the refcnt increased by batadv_hardif_get_by_netdev() before the function returns, causing a refcnt leak. Fix this issue by calling batadv_hardif_put() before the batadv_show_throughput_override() returns in the normal path. Fixes: 0b5ecc6811bd ("batman-adv: add throughput override attribute to hard_ifaces") Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index c45962d8527b..c0b00268aac4 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1190,6 +1190,7 @@ static ssize_t batadv_show_throughput_override(struct kobject *kobj, tp_override = atomic_read(&hard_iface->bat_v.throughput_override); + batadv_hardif_put(hard_iface); return sprintf(buff, "%u.%u MBit\n", tp_override / 10, tp_override % 10); } From 6107c5da0fca8b50b4d3215e94d619d38cc4a18c Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Wed, 15 Apr 2020 16:35:21 +0800 Subject: [PATCH 017/300] batman-adv: Fix refcnt leak in batadv_store_throughput_override batadv_show_throughput_override() invokes batadv_hardif_get_by_netdev(), which gets a batadv_hard_iface object from net_dev with increased refcnt and its reference is assigned to a local pointer 'hard_iface'. When batadv_store_throughput_override() returns, "hard_iface" becomes invalid, so the refcount should be decreased to keep refcount balanced. The issue happens in one error path of batadv_store_throughput_override(). When batadv_parse_throughput() returns NULL, the refcnt increased by batadv_hardif_get_by_netdev() is not decreased, causing a refcnt leak. Fix this issue by jumping to "out" label when batadv_parse_throughput() returns NULL. Fixes: 0b5ecc6811bd ("batman-adv: add throughput override attribute to hard_ifaces") Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index c0b00268aac4..0f962dcd239e 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1150,7 +1150,7 @@ static ssize_t batadv_store_throughput_override(struct kobject *kobj, ret = batadv_parse_throughput(net_dev, buff, "throughput_override", &tp_override); if (!ret) - return count; + goto out; old_tp_override = atomic_read(&hard_iface->bat_v.throughput_override); if (old_tp_override == tp_override) From 6f91a3f7af4186099dd10fa530dd7e0d9c29747d Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Mon, 20 Apr 2020 13:37:20 +0800 Subject: [PATCH 018/300] batman-adv: Fix refcnt leak in batadv_v_ogm_process batadv_v_ogm_process() invokes batadv_hardif_neigh_get(), which returns a reference of the neighbor object to "hardif_neigh" with increased refcount. When batadv_v_ogm_process() returns, "hardif_neigh" becomes invalid, so the refcount should be decreased to keep refcount balanced. The reference counting issue happens in one exception handling paths of batadv_v_ogm_process(). When batadv_v_ogm_orig_get() fails to get the orig node and returns NULL, the refcnt increased by batadv_hardif_neigh_get() is not decreased, causing a refcnt leak. Fix this issue by jumping to "out" label when batadv_v_ogm_orig_get() fails to get the orig node. Fixes: 9323158ef9f4 ("batman-adv: OGMv2 - implement originators logic") Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_ogm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 969466218999..80b87b1f4e3a 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -893,7 +893,7 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, orig_node = batadv_v_ogm_orig_get(bat_priv, ogm_packet->orig); if (!orig_node) - return; + goto out; neigh_node = batadv_neigh_node_get_or_create(orig_node, if_incoming, ethhdr->h_source); From e9b3c610a05c1cdf8e959a6d89c38807ff758ee6 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 15 Apr 2020 16:03:04 +0200 Subject: [PATCH 019/300] USB: serial: garmin_gps: add sanity checking for data length We must not process packets shorter than a packet ID Signed-off-by: Oliver Neukum Reported-and-tested-by: syzbot+d29e9263e13ce0b9f4fd@syzkaller.appspotmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/garmin_gps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c index ffd984142171..d63072fee099 100644 --- a/drivers/usb/serial/garmin_gps.c +++ b/drivers/usb/serial/garmin_gps.c @@ -1138,8 +1138,8 @@ static void garmin_read_process(struct garmin_data *garmin_data_p, send it directly to the tty port */ if (garmin_data_p->flags & FLAGS_QUEUING) { pkt_add(garmin_data_p, data, data_length); - } else if (bulk_data || - getLayerId(data) == GARMIN_LAYERID_APPL) { + } else if (bulk_data || (data_length >= sizeof(u32) && + getLayerId(data) == GARMIN_LAYERID_APPL)) { spin_lock_irqsave(&garmin_data_p->lock, flags); garmin_data_p->flags |= APP_RESP_SEEN; From 9a50ebbffa9862db7604345f5fd763122b0f6fed Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 6 Apr 2020 16:21:20 +0100 Subject: [PATCH 020/300] KVM: arm: vgic: Synchronize the whole guest on GIC{D,R}_I{S,C}ACTIVER read When a guest tries to read the active state of its interrupts, we currently just return whatever state we have in memory. This means that if such an interrupt lives in a List Register on another CPU, we fail to obsertve the latest active state for this interrupt. In order to remedy this, stop all the other vcpus so that they exit and we can observe the most recent value for the state. This is similar to what we are doing for the write side of the same registers, and results in new MMIO handlers for userspace (which do not need to stop the guest, as it is supposed to be stopped already). Reported-by: Julien Grall Reviewed-by: Andre Przywara Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio-v2.c | 4 +- virt/kvm/arm/vgic/vgic-mmio-v3.c | 12 ++-- virt/kvm/arm/vgic/vgic-mmio.c | 100 ++++++++++++++++++++----------- virt/kvm/arm/vgic/vgic-mmio.h | 3 + 4 files changed, 75 insertions(+), 44 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index 5945f062d749..d63881f60e1a 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -422,11 +422,11 @@ static const struct vgic_register_region vgic_v2_dist_registers[] = { VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET, vgic_mmio_read_active, vgic_mmio_write_sactive, - NULL, vgic_mmio_uaccess_write_sactive, 1, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR, vgic_mmio_read_active, vgic_mmio_write_cactive, - NULL, vgic_mmio_uaccess_write_cactive, 1, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI, vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index e72dcc454247..f2b37a081f26 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -553,11 +553,11 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = { VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER, vgic_mmio_read_active, vgic_mmio_write_sactive, - NULL, vgic_mmio_uaccess_write_sactive, 1, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER, vgic_mmio_read_active, vgic_mmio_write_cactive, - NULL, vgic_mmio_uaccess_write_cactive, + vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR, vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL, @@ -625,12 +625,12 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = { VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0, vgic_mmio_read_active, vgic_mmio_write_sactive, - NULL, vgic_mmio_uaccess_write_sactive, - 4, VGIC_ACCESS_32bit), + vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4, + VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0, vgic_mmio_read_active, vgic_mmio_write_cactive, - NULL, vgic_mmio_uaccess_write_cactive, - 4, VGIC_ACCESS_32bit), + vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4, + VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0, vgic_mmio_read_priority, vgic_mmio_write_priority, 32, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit), diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index d085e047953f..b38e94e8f74a 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -348,8 +348,39 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, } } -unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, - gpa_t addr, unsigned int len) + +/* + * If we are fiddling with an IRQ's active state, we have to make sure the IRQ + * is not queued on some running VCPU's LRs, because then the change to the + * active state can be overwritten when the VCPU's state is synced coming back + * from the guest. + * + * For shared interrupts as well as GICv3 private interrupts, we have to + * stop all the VCPUs because interrupts can be migrated while we don't hold + * the IRQ locks and we don't want to be chasing moving targets. + * + * For GICv2 private interrupts we don't have to do anything because + * userspace accesses to the VGIC state already require all VCPUs to be + * stopped, and only the VCPU itself can modify its private interrupts + * active state, which guarantees that the VCPU is not running. + */ +static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid) +{ + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid >= VGIC_NR_PRIVATE_IRQS) + kvm_arm_halt_guest(vcpu->kvm); +} + +/* See vgic_access_active_prepare */ +static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid) +{ + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid >= VGIC_NR_PRIVATE_IRQS) + kvm_arm_resume_guest(vcpu->kvm); +} + +static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); u32 value = 0; @@ -359,6 +390,10 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + /* + * Even for HW interrupts, don't evaluate the HW state as + * all the guest is interested in is the virtual state. + */ if (irq->active) value |= (1U << i); @@ -368,6 +403,29 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, return value; } +unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + u32 val; + + mutex_lock(&vcpu->kvm->lock); + vgic_access_active_prepare(vcpu, intid); + + val = __vgic_mmio_read_active(vcpu, addr, len); + + vgic_access_active_finish(vcpu, intid); + mutex_unlock(&vcpu->kvm->lock); + + return val; +} + +unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + return __vgic_mmio_read_active(vcpu, addr, len); +} + /* Must be called with irq->irq_lock held */ static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool active, bool is_uaccess) @@ -426,36 +484,6 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, raw_spin_unlock_irqrestore(&irq->irq_lock, flags); } -/* - * If we are fiddling with an IRQ's active state, we have to make sure the IRQ - * is not queued on some running VCPU's LRs, because then the change to the - * active state can be overwritten when the VCPU's state is synced coming back - * from the guest. - * - * For shared interrupts, we have to stop all the VCPUs because interrupts can - * be migrated while we don't hold the IRQ locks and we don't want to be - * chasing moving targets. - * - * For private interrupts we don't have to do anything because userspace - * accesses to the VGIC state already require all VCPUs to be stopped, and - * only the VCPU itself can modify its private interrupts active state, which - * guarantees that the VCPU is not running. - */ -static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) -{ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || - intid >= VGIC_NR_PRIVATE_IRQS) - kvm_arm_halt_guest(vcpu->kvm); -} - -/* See vgic_change_active_prepare */ -static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) -{ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || - intid >= VGIC_NR_PRIVATE_IRQS) - kvm_arm_resume_guest(vcpu->kvm); -} - static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) @@ -477,11 +505,11 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, u32 intid = VGIC_ADDR_TO_INTID(addr, 1); mutex_lock(&vcpu->kvm->lock); - vgic_change_active_prepare(vcpu, intid); + vgic_access_active_prepare(vcpu, intid); __vgic_mmio_write_cactive(vcpu, addr, len, val); - vgic_change_active_finish(vcpu, intid); + vgic_access_active_finish(vcpu, intid); mutex_unlock(&vcpu->kvm->lock); } @@ -514,11 +542,11 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, u32 intid = VGIC_ADDR_TO_INTID(addr, 1); mutex_lock(&vcpu->kvm->lock); - vgic_change_active_prepare(vcpu, intid); + vgic_access_active_prepare(vcpu, intid); __vgic_mmio_write_sactive(vcpu, addr, len, val); - vgic_change_active_finish(vcpu, intid); + vgic_access_active_finish(vcpu, intid); mutex_unlock(&vcpu->kvm->lock); } diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 5af2aefad435..30713a44e3fa 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -152,6 +152,9 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len); +unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len); + void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val); From 41ee52ecbcdc98eb2a03241923b1a7d46f476bb3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 9 Apr 2020 13:05:26 +0100 Subject: [PATCH 021/300] KVM: arm: vgic: Only use the virtual state when userspace accesses enable bits There is no point in accessing the HW when writing to any of the ISENABLER/ICENABLER registers from userspace, as only the guest should be allowed to change the HW state. Introduce new userspace-specific accessors that deal solely with the virtual state. Reported-by: James Morse Tested-by: James Morse Reviewed-by: James Morse Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio-v2.c | 6 +++-- virt/kvm/arm/vgic/vgic-mmio-v3.c | 16 +++++++----- virt/kvm/arm/vgic/vgic-mmio.c | 42 ++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic-mmio.h | 8 ++++++ 4 files changed, 64 insertions(+), 8 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index d63881f60e1a..f51c6e939c76 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -409,10 +409,12 @@ static const struct vgic_register_region vgic_v2_dist_registers[] = { NULL, vgic_mmio_uaccess_write_v2_group, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET, - vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1, + vgic_mmio_read_enable, vgic_mmio_write_senable, + NULL, vgic_uaccess_write_senable, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR, - vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, NULL, 1, + vgic_mmio_read_enable, vgic_mmio_write_cenable, + NULL, vgic_uaccess_write_cenable, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET, vgic_mmio_read_pending, vgic_mmio_write_spending, NULL, NULL, 1, diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index f2b37a081f26..416613f2400c 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -538,10 +538,12 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = { vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER, - vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1, + vgic_mmio_read_enable, vgic_mmio_write_senable, + NULL, vgic_uaccess_write_senable, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER, - vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, NULL, 1, + vgic_mmio_read_enable, vgic_mmio_write_cenable, + NULL, vgic_uaccess_write_cenable, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR, vgic_mmio_read_pending, vgic_mmio_write_spending, @@ -609,11 +611,13 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = { REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0, vgic_mmio_read_group, vgic_mmio_write_group, 4, VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ISENABLER0, - vgic_mmio_read_enable, vgic_mmio_write_senable, 4, + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0, + vgic_mmio_read_enable, vgic_mmio_write_senable, + NULL, vgic_uaccess_write_senable, 4, VGIC_ACCESS_32bit), - REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICENABLER0, - vgic_mmio_read_enable, vgic_mmio_write_cenable, 4, + REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0, + vgic_mmio_read_enable, vgic_mmio_write_cenable, + NULL, vgic_uaccess_write_cenable, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0, vgic_mmio_read_pending, vgic_mmio_write_spending, diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index b38e94e8f74a..6e30034d1464 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -184,6 +184,48 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, } } +int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->enabled = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->enabled = false; + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 30713a44e3fa..327d0a6938e4 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -138,6 +138,14 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val); +int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len); From ba1ed9e17b581c9a204ec1d72d40472dd8557edd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 9 Apr 2020 13:05:26 +0100 Subject: [PATCH 022/300] KVM: arm: vgic-v2: Only use the virtual state when userspace accesses pending bits There is no point in accessing the HW when writing to any of the ISPENDR/ICPENDR registers from userspace, as only the guest should be allowed to change the HW state. Introduce new userspace-specific accessors that deal solely with the virtual state. Note that the API differs from that of GICv3, where userspace exclusively uses ISPENDR to set the state. Too bad we can't reuse it. Fixes: 82e40f558de56 ("KVM: arm/arm64: vgic-v2: Handle SGI bits in GICD_I{S,C}PENDR0 as WI") Reviewed-by: James Morse Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio-v2.c | 6 ++- virt/kvm/arm/vgic/vgic-mmio.c | 86 ++++++++++++++++++++++++-------- virt/kvm/arm/vgic/vgic-mmio.h | 8 +++ 3 files changed, 76 insertions(+), 24 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index f51c6e939c76..a016f07adc28 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -417,10 +417,12 @@ static const struct vgic_register_region vgic_v2_dist_registers[] = { NULL, vgic_uaccess_write_cenable, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET, - vgic_mmio_read_pending, vgic_mmio_write_spending, NULL, NULL, 1, + vgic_mmio_read_pending, vgic_mmio_write_spending, + NULL, vgic_uaccess_write_spending, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR, - vgic_mmio_read_pending, vgic_mmio_write_cpending, NULL, NULL, 1, + vgic_mmio_read_pending, vgic_mmio_write_cpending, + NULL, vgic_uaccess_write_cpending, 1, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET, vgic_mmio_read_active, vgic_mmio_write_sactive, diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 6e30034d1464..b2d73fc0d1ef 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -261,17 +261,6 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, return value; } -/* Must be called with irq->irq_lock held */ -static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - bool is_uaccess) -{ - if (is_uaccess) - return; - - irq->pending_latch = true; - vgic_irq_set_phys_active(irq, true); -} - static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq) { return (vgic_irq_is_sgi(irq->intid) && @@ -282,7 +271,6 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { - bool is_uaccess = !kvm_get_running_vcpu(); u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; @@ -312,22 +300,48 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, continue; } + irq->pending_latch = true; if (irq->hw) - vgic_hw_irq_spending(vcpu, irq, is_uaccess); - else - irq->pending_latch = true; + vgic_irq_set_phys_active(irq, true); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } } -/* Must be called with irq->irq_lock held */ -static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - bool is_uaccess) +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) { - if (is_uaccess) - return; + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = true; + + /* + * GICv2 SGIs are terribly broken. We can't restore + * the source of the interrupt, so just pick the vcpu + * itself as the source... + */ + if (is_vgic_v2_sgi(vcpu, irq)) + irq->source |= BIT(vcpu->vcpu_id); + + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} + +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq) +{ irq->pending_latch = false; /* @@ -350,7 +364,6 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { - bool is_uaccess = !kvm_get_running_vcpu(); u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; @@ -381,7 +394,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, } if (irq->hw) - vgic_hw_irq_cpending(vcpu, irq, is_uaccess); + vgic_hw_irq_cpending(vcpu, irq); else irq->pending_latch = false; @@ -390,6 +403,35 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, } } +int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 intid = VGIC_ADDR_TO_INTID(addr, 1); + int i; + unsigned long flags; + + for_each_set_bit(i, &val, len * 8) { + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + + raw_spin_lock_irqsave(&irq->irq_lock, flags); + /* + * More fun with GICv2 SGIs! If we're clearing one of them + * from userspace, which source vcpu to clear? Let's not + * even think of it, and blow the whole set. + */ + if (is_vgic_v2_sgi(vcpu, irq)) + irq->source = 0; + + irq->pending_latch = false; + + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); + + vgic_put_irq(vcpu->kvm, irq); + } + + return 0; +} /* * If we are fiddling with an IRQ's active state, we have to make sure the IRQ diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 327d0a6938e4..fefcca2b14dc 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -157,6 +157,14 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val); +int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + +int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val); + unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len); From 969ce8b5260d8ec01e6f1949d2927a86419663ce Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 14 Apr 2020 11:03:47 +0800 Subject: [PATCH 023/300] KVM: arm64: vgic-v3: Retire all pending LPIs on vcpu destroy It's likely that the vcpu fails to handle all virtual interrupts if userspace decides to destroy it, leaving the pending ones stay in the ap_list. If the un-handled one is a LPI, its vgic_irq structure will be eventually leaked because of an extra refcount increment in vgic_queue_irq_unlock(). This was detected by kmemleak on almost every guest destroy, the backtrace is as follows: unreferenced object 0xffff80725aed5500 (size 128): comm "CPU 5/KVM", pid 40711, jiffies 4298024754 (age 166366.512s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 08 01 a9 73 6d 80 ff ff ...........sm... c8 61 ee a9 00 20 ff ff 28 1e 55 81 6c 80 ff ff .a... ..(.U.l... backtrace: [<000000004bcaa122>] kmem_cache_alloc_trace+0x2dc/0x418 [<0000000069c7dabb>] vgic_add_lpi+0x88/0x418 [<00000000bfefd5c5>] vgic_its_cmd_handle_mapi+0x4dc/0x588 [<00000000cf993975>] vgic_its_process_commands.part.5+0x484/0x1198 [<000000004bd3f8e3>] vgic_its_process_commands+0x50/0x80 [<00000000b9a65b2b>] vgic_mmio_write_its_cwriter+0xac/0x108 [<0000000009641ebb>] dispatch_mmio_write+0xd0/0x188 [<000000008f79d288>] __kvm_io_bus_write+0x134/0x240 [<00000000882f39ac>] kvm_io_bus_write+0xe0/0x150 [<0000000078197602>] io_mem_abort+0x484/0x7b8 [<0000000060954e3c>] kvm_handle_guest_abort+0x4cc/0xa58 [<00000000e0d0cd65>] handle_exit+0x24c/0x770 [<00000000b44a7fad>] kvm_arch_vcpu_ioctl_run+0x460/0x1988 [<0000000025fb897c>] kvm_vcpu_ioctl+0x4f8/0xee0 [<000000003271e317>] do_vfs_ioctl+0x160/0xcd8 [<00000000e7f39607>] ksys_ioctl+0x98/0xd8 Fix it by retiring all pending LPIs in the ap_list on the destroy path. p.s. I can also reproduce it on a normal guest shutdown. It is because userspace still send LPIs to vcpu (through KVM_SIGNAL_MSI ioctl) while the guest is being shutdown and unable to handle it. A little strange though and haven't dig further... Reviewed-by: James Morse Signed-off-by: Zenghui Yu [maz: moved the distributor deallocation down to avoid an UAF splat] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200414030349.625-2-yuzenghui@huawei.com --- virt/kvm/arm/vgic/vgic-init.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index a963b9d766b7..30dbec9fe0b4 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -348,6 +348,12 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + /* + * Retire all pending LPIs on this vcpu anyway as we're + * going to destroy it. + */ + vgic_flush_pending_lpis(vcpu); + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); } @@ -359,10 +365,10 @@ static void __kvm_vgic_destroy(struct kvm *kvm) vgic_debug_destroy(kvm); - kvm_vgic_dist_destroy(kvm); - kvm_for_each_vcpu(i, vcpu, kvm) kvm_vgic_vcpu_destroy(vcpu); + + kvm_vgic_dist_destroy(kvm); } void kvm_vgic_destroy(struct kvm *kvm) From 57bdb436ce869a45881d8aa4bc5dac8e072dd2b6 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 14 Apr 2020 11:03:48 +0800 Subject: [PATCH 024/300] KVM: arm64: vgic-its: Fix memory leak on the error path of vgic_add_lpi() If we're going to fail out the vgic_add_lpi(), let's make sure the allocated vgic_irq memory is also freed. Though it seems that both cases are unlikely to fail. Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200414030349.625-3-yuzenghui@huawei.com --- virt/kvm/arm/vgic/vgic-its.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index d53d34a33e35..c012a52b19f5 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -96,14 +96,21 @@ out_unlock: * We "cache" the configuration table entries in our struct vgic_irq's. * However we only have those structs for mapped IRQs, so we read in * the respective config data from memory here upon mapping the LPI. + * + * Should any of these fail, behave as if we couldn't create the LPI + * by dropping the refcount and returning the error. */ ret = update_lpi_config(kvm, irq, NULL, false); - if (ret) + if (ret) { + vgic_put_irq(kvm, irq); return ERR_PTR(ret); + } ret = vgic_v3_lpi_sync_pending_status(kvm, irq); - if (ret) + if (ret) { + vgic_put_irq(kvm, irq); return ERR_PTR(ret); + } return irq; } From b61ad5c0e21cc54107ca65b0b14bb57c2d39a3b6 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 30 Mar 2020 12:10:38 +0200 Subject: [PATCH 025/300] phy: tegra: Select USB_COMMON for usb_get_maximum_speed() The usb_get_maximum_speed() function is part of the usb-common module, so enable it by selecting the corresponding Kconfig symbol. While at it, also make sure to depend on USB_SUPPORT because USB_PHY requires that. This can lead to Kconfig conflicts if USB_SUPPORT is not enabled while attempting to enable PHY_TEGRA_XUSB. Reported-by: kbuild test robot Suggested-by: Nathan Chancellor Signed-off-by: Thierry Reding Signed-off-by: Kishon Vijay Abraham I --- drivers/phy/tegra/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/phy/tegra/Kconfig b/drivers/phy/tegra/Kconfig index a208aca4ba7b..c591c958f1eb 100644 --- a/drivers/phy/tegra/Kconfig +++ b/drivers/phy/tegra/Kconfig @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only config PHY_TEGRA_XUSB tristate "NVIDIA Tegra XUSB pad controller driver" - depends on ARCH_TEGRA + depends on ARCH_TEGRA && USB_SUPPORT + select USB_COMMON select USB_CONN_GPIO select USB_PHY help From 45a76264c26fd8cfd0c9746196892d9b7e2657ee Mon Sep 17 00:00:00 2001 From: Arun Easi Date: Tue, 31 Mar 2020 03:40:14 -0700 Subject: [PATCH 026/300] scsi: qla2xxx: Fix hang when issuing nvme disconnect-all in NPIV In NPIV environment, a NPIV host may use a queue pair created by base host or other NPIVs, so the check for a queue pair created by this NPIV is not correct, and can cause an abort to fail, which in turn means the NVME command not returned. This leads to hang in nvme_fc layer in nvme_fc_delete_association() which waits for all I/Os to be returned, which is seen as hang in the application. Link: https://lore.kernel.org/r/20200331104015.24868-3-njavali@marvell.com Reviewed-by: Himanshu Madhani Signed-off-by: Arun Easi Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_mbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 4ed90437e8c4..d6c991bd1bde 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -3153,7 +3153,7 @@ qla24xx_abort_command(srb_t *sp) ql_dbg(ql_dbg_mbx + ql_dbg_verbose, vha, 0x108c, "Entered %s.\n", __func__); - if (vha->flags.qpairs_available && sp->qpair) + if (sp->qpair) req = sp->qpair->req; else return QLA_FUNCTION_FAILED; From c48f849d3f7a4ec1025105f446e29d395c4dcc2f Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Tue, 31 Mar 2020 03:40:15 -0700 Subject: [PATCH 027/300] scsi: qla2xxx: Delete all sessions before unregister local nvme port Delete all sessions before unregistering local nvme port. This allows nvme layer to decrement all active rport count down to zero. Once the count is down to zero, nvme would call qla to continue with the npiv port deletion. PID: 27448 TASK: ffff9e34b777c1c0 CPU: 0 COMMAND: "qaucli" 0 [ffff9e25e84abbd8] __schedule at ffffffff977858ca 1 [ffff9e25e84abc68] schedule at ffffffff97785d79 2 [ffff9e25e84abc78] schedule_timeout at ffffffff97783881 3 [ffff9e25e84abd28] wait_for_completion at ffffffff9778612d 4 [ffff9e25e84abd88] qla_nvme_delete at ffffffffc0e3024e [qla2xxx] 5 [ffff9e25e84abda8] qla24xx_vport_delete at ffffffffc0e024b9 [qla2xxx] 6 [ffff9e25e84abdf0] fc_vport_terminate at ffffffffc011c247 [scsi_transport_fc] 7 [ffff9e25e84abe28] store_fc_host_vport_delete at ffffffffc011cd94 [scsi_transport_fc] 8 [ffff9e25e84abe70] dev_attr_store at ffffffff974b376b 9 [ffff9e25e84abe80] sysfs_kf_write at ffffffff972d9a92 10 [ffff9e25e84abe90] kernfs_fop_write at ffffffff972d907b 11 [ffff9e25e84abec8] vfs_write at ffffffff9724c790 12 [ffff9e25e84abf08] sys_write at ffffffff9724d55f 13 [ffff9e25e84abf50] system_call_fastpath at ffffffff97792ed2 RIP: 00007fc0bd81a6fd RSP: 00007ffff78d9648 RFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000022 RCX: 00007ffff78d96e0 RDX: 0000000000000022 RSI: 00007ffff78d94e0 RDI: 0000000000000008 RBP: 00007ffff78d9440 R8: 0000000000000000 R9: 00007fc0bd48b2cd R10: 0000000000000017 R11: 0000000000000293 R12: 0000000000000000 R13: 00005624e4dac840 R14: 00005624e4da9a10 R15: 0000000000000000 ORIG_RAX: 0000000000000001 CS: 0033 SS: 002b Link: https://lore.kernel.org/r/20200331104015.24868-4-njavali@marvell.com Reviewed-by: Himanshu Madhani Signed-off-by: Quinn Tran Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_attr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index 97cabd7e0014..33255968f03a 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -3031,11 +3031,11 @@ qla24xx_vport_delete(struct fc_vport *fc_vport) test_bit(FCPORT_UPDATE_NEEDED, &vha->dpc_flags)) msleep(1000); - qla_nvme_delete(vha); qla24xx_disable_vp(vha); qla2x00_wait_for_sess_deletion(vha); + qla_nvme_delete(vha); vha->flags.delete_progress = 1; qlt_remove_target(ha, vha); From d8dd25a461e4eec7190cb9d66616aceacc5110ad Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 25 Apr 2020 05:03:00 -0500 Subject: [PATCH 028/300] objtool: Fix stack offset tracking for indirect CFAs When the current frame address (CFA) is stored on the stack (i.e., cfa->base == CFI_SP_INDIRECT), objtool neglects to adjust the stack offset when there are subsequent pushes or pops. This results in bad ORC data at the end of the ENTER_IRQ_STACK macro, when it puts the previous stack pointer on the stack and does a subsequent push. This fixes the following unwinder warning: WARNING: can't dereference registers at 00000000f0a6bdba for ip interrupt_entry+0x9f/0xa0 Fixes: 627fce14809b ("objtool: Add ORC unwind table generation") Reported-by: Vince Weaver Reported-by: Dave Jones Reported-by: Steven Rostedt Reported-by: Vegard Nossum Reported-by: Joe Mario Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Jann Horn Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/r/853d5d691b29e250333332f09b8e27410b2d9924.1587808742.git.jpoimboe@redhat.com --- tools/objtool/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4b170fd08a28..e7184641a40c 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1449,7 +1449,7 @@ static int update_insn_state_regs(struct instruction *insn, struct insn_state *s struct cfi_reg *cfa = &state->cfa; struct stack_op *op = &insn->stack_op; - if (cfa->base != CFI_SP) + if (cfa->base != CFI_SP && cfa->base != CFI_SP_INDIRECT) return 0; /* push */ From 06a9750edcffa808494d56da939085c35904e618 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 25 Apr 2020 05:03:01 -0500 Subject: [PATCH 029/300] x86/entry/64: Fix unwind hints in register clearing code The PUSH_AND_CLEAR_REGS macro zeroes each register immediately after pushing it. If an NMI or exception hits after a register is cleared, but before the UNWIND_HINT_REGS annotation, the ORC unwinder will wrongly think the previous value of the register was zero. This can confuse the unwinding process and cause it to exit early. Because ORC is simpler than DWARF, there are a limited number of unwind annotation states, so it's not possible to add an individual unwind hint after each push/clear combination. Instead, the register clearing instructions need to be consolidated and moved to after the UNWIND_HINT_REGS annotation. Fixes: 3f01daecd545 ("x86/entry/64: Introduce the PUSH_AND_CLEAN_REGS macro") Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Dave Jones Cc: Jann Horn Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lore.kernel.org/r/68fd3d0bc92ae2d62ff7879d15d3684217d51f08.1587808742.git.jpoimboe@redhat.com --- arch/x86/entry/calling.h | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 0789e13ece90..1c7f13bb6728 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -98,13 +98,6 @@ For 32-bit we have the following conventions - kernel is built with #define SIZEOF_PTREGS 21*8 .macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0 - /* - * Push registers and sanitize registers of values that a - * speculation attack might otherwise want to exploit. The - * lower registers are likely clobbered well before they - * could be put to use in a speculative execution gadget. - * Interleave XOR with PUSH for better uop scheduling: - */ .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ @@ -114,34 +107,43 @@ For 32-bit we have the following conventions - kernel is built with pushq %rsi /* pt_regs->si */ .endif pushq \rdx /* pt_regs->dx */ - xorl %edx, %edx /* nospec dx */ pushq %rcx /* pt_regs->cx */ - xorl %ecx, %ecx /* nospec cx */ pushq \rax /* pt_regs->ax */ pushq %r8 /* pt_regs->r8 */ - xorl %r8d, %r8d /* nospec r8 */ pushq %r9 /* pt_regs->r9 */ - xorl %r9d, %r9d /* nospec r9 */ pushq %r10 /* pt_regs->r10 */ - xorl %r10d, %r10d /* nospec r10 */ pushq %r11 /* pt_regs->r11 */ - xorl %r11d, %r11d /* nospec r11*/ pushq %rbx /* pt_regs->rbx */ - xorl %ebx, %ebx /* nospec rbx*/ pushq %rbp /* pt_regs->rbp */ - xorl %ebp, %ebp /* nospec rbp*/ pushq %r12 /* pt_regs->r12 */ - xorl %r12d, %r12d /* nospec r12*/ pushq %r13 /* pt_regs->r13 */ - xorl %r13d, %r13d /* nospec r13*/ pushq %r14 /* pt_regs->r14 */ - xorl %r14d, %r14d /* nospec r14*/ pushq %r15 /* pt_regs->r15 */ - xorl %r15d, %r15d /* nospec r15*/ UNWIND_HINT_REGS + .if \save_ret pushq %rsi /* return address on top of stack */ .endif + + /* + * Sanitize registers of values that a speculation attack might + * otherwise want to exploit. The lower registers are likely clobbered + * well before they could be put to use in a speculative execution + * gadget. + */ + xorl %edx, %edx /* nospec dx */ + xorl %ecx, %ecx /* nospec cx */ + xorl %r8d, %r8d /* nospec r8 */ + xorl %r9d, %r9d /* nospec r9 */ + xorl %r10d, %r10d /* nospec r10 */ + xorl %r11d, %r11d /* nospec r11 */ + xorl %ebx, %ebx /* nospec rbx */ + xorl %ebp, %ebp /* nospec rbp */ + xorl %r12d, %r12d /* nospec r12 */ + xorl %r13d, %r13d /* nospec r13 */ + xorl %r14d, %r14d /* nospec r14 */ + xorl %r15d, %r15d /* nospec r15 */ + .endm .macro POP_REGS pop_rdi=1 skip_r11rcx=0 From 1fb143634a38095b641a3a21220774799772dc4c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 25 Apr 2020 05:03:02 -0500 Subject: [PATCH 030/300] x86/entry/64: Fix unwind hints in kernel exit path In swapgs_restore_regs_and_return_to_usermode, after the stack is switched to the trampoline stack, the existing UNWIND_HINT_REGS hint is no longer valid, which can result in the following ORC unwinder warning: WARNING: can't dereference registers at 000000003aeb0cdd for ip swapgs_restore_regs_and_return_to_usermode+0x93/0xa0 For full correctness, we could try to add complicated unwind hints so the unwinder could continue to find the registers, but when when it's this close to kernel exit, unwind hints aren't really needed anymore and it's fine to just use an empty hint which tells the unwinder to stop. For consistency, also move the UNWIND_HINT_EMPTY in entry_SYSCALL_64_after_hwframe to a similar location. Fixes: 3e3b9293d392 ("x86/entry/64: Return to userspace from the trampoline stack") Reported-by: Vince Weaver Reported-by: Dave Jones Reported-by: Dr. David Alan Gilbert Reported-by: Joe Mario Reported-by: Jann Horn Reported-by: Linus Torvalds Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/r/60ea8f562987ed2d9ace2977502fe481c0d7c9a0.1587808742.git.jpoimboe@redhat.com --- arch/x86/entry/entry_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 0e9504fabe52..6b0d679efd6b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -249,7 +249,6 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ - UNWIND_HINT_EMPTY POP_REGS pop_rdi=0 skip_r11rcx=1 /* @@ -258,6 +257,7 @@ syscall_return_via_sysret: */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + UNWIND_HINT_EMPTY pushq RSP-RDI(%rdi) /* RSP */ pushq (%rdi) /* RDI */ @@ -637,6 +637,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + UNWIND_HINT_EMPTY /* Copy the IRET frame to the trampoline stack. */ pushq 6*8(%rdi) /* SS */ From 96c64806b4bf35f5edb465cafa6cec490e424a30 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 25 Apr 2020 05:03:03 -0500 Subject: [PATCH 031/300] x86/entry/64: Fix unwind hints in __switch_to_asm() UNWIND_HINT_FUNC has some limitations: specifically, it doesn't reset all the registers to undefined. This causes objtool to get confused about the RBP push in __switch_to_asm(), resulting in bad ORC data. While __switch_to_asm() does do some stack magic, it's otherwise a normal callable-from-C function, so just annotate it as a function, which makes objtool happy and allows it to produces the correct hints automatically. Fixes: 8c1f75587a18 ("x86/entry/64: Add unwind hint annotations") Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Dave Jones Cc: Jann Horn Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lore.kernel.org/r/03d0411920d10f7418f2e909210d8e9a3b2ab081.1587808742.git.jpoimboe@redhat.com --- arch/x86/entry/entry_64.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 6b0d679efd6b..34a588950fe1 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -279,8 +279,7 @@ SYM_CODE_END(entry_SYSCALL_64) * %rdi: prev task * %rsi: next task */ -SYM_CODE_START(__switch_to_asm) - UNWIND_HINT_FUNC +SYM_FUNC_START(__switch_to_asm) /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -321,7 +320,7 @@ SYM_CODE_START(__switch_to_asm) popq %rbp jmp __switch_to -SYM_CODE_END(__switch_to_asm) +SYM_FUNC_END(__switch_to_asm) /* * A newly forked process directly context switches into this address. From f977df7b7ca45a4ac4b66d30a8931d0434c394b1 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 25 Apr 2020 05:03:04 -0500 Subject: [PATCH 032/300] x86/entry/64: Fix unwind hints in rewind_stack_do_exit() The LEAQ instruction in rewind_stack_do_exit() moves the stack pointer directly below the pt_regs at the top of the task stack before calling do_exit(). Tell the unwinder to expect pt_regs. Fixes: 8c1f75587a18 ("x86/entry/64: Add unwind hint annotations") Reviewed-by: Miroslav Benes Signed-off-by: Jann Horn Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Dave Jones Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lore.kernel.org/r/68c33e17ae5963854916a46f522624f8e1d264f2.1587808742.git.jpoimboe@redhat.com --- arch/x86/entry/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 34a588950fe1..9fe0d5cad8e4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1739,7 +1739,7 @@ SYM_CODE_START(rewind_stack_do_exit) movq PER_CPU_VAR(cpu_current_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp - UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE + UNWIND_HINT_REGS call do_exit SYM_CODE_END(rewind_stack_do_exit) From 153eb2223c794251b28400f3f74862e090d23f16 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 25 Apr 2020 05:03:05 -0500 Subject: [PATCH 033/300] x86/unwind/orc: Convert global variables to static These variables aren't used outside of unwind_orc.c, make them static. Also annotate some of them with '__ro_after_init', as applicable. Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Dave Jones Cc: Jann Horn Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lore.kernel.org/r/43ae310bf7822b9862e571f36ae3474cfde8f301.1587808742.git.jpoimboe@redhat.com --- arch/x86/kernel/unwind_orc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index e9cc182aa97e..64889da666f4 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -15,12 +15,12 @@ extern int __stop_orc_unwind_ip[]; extern struct orc_entry __start_orc_unwind[]; extern struct orc_entry __stop_orc_unwind[]; -static DEFINE_MUTEX(sort_mutex); -int *cur_orc_ip_table = __start_orc_unwind_ip; -struct orc_entry *cur_orc_table = __start_orc_unwind; +static bool orc_init __ro_after_init; +static unsigned int lookup_num_blocks __ro_after_init; -unsigned int lookup_num_blocks; -bool orc_init; +static DEFINE_MUTEX(sort_mutex); +static int *cur_orc_ip_table = __start_orc_unwind_ip; +static struct orc_entry *cur_orc_table = __start_orc_unwind; static inline unsigned long orc_ip(const int *ip) { From b08418b54831255a7e3700d6bf7dfc2bdae25cd7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 25 Apr 2020 05:03:06 -0500 Subject: [PATCH 034/300] x86/unwind: Prevent false warnings for non-current tasks There's some daring kernel code out there which dumps the stack of another task without first making sure the task is inactive. If the task happens to be running while the unwinder is reading the stack, unusual unwinder warnings can result. There's no race-free way for the unwinder to know whether such a warning is legitimate, so just disable unwinder warnings for all non-current tasks. Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Dave Jones Cc: Jann Horn Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lore.kernel.org/r/ec424a2aea1d461eb30cab48a28c6433de2ab784.1587808742.git.jpoimboe@redhat.com --- arch/x86/kernel/dumpstack_64.c | 3 ++- arch/x86/kernel/unwind_frame.c | 3 +++ arch/x86/kernel/unwind_orc.c | 40 +++++++++++++++++++--------------- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 87b97897a881..460ae7f66818 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -183,7 +183,8 @@ recursion_check: */ if (visit_mask) { if (*visit_mask & (1UL << info->type)) { - printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); + if (task == current) + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; } *visit_mask |= 1UL << info->type; diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index a224b5ab103f..54226110bc7f 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -344,6 +344,9 @@ bad_address: if (IS_ENABLED(CONFIG_X86_32)) goto the_end; + if (state->task != current) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 64889da666f4..45166fd50be3 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -8,7 +8,13 @@ #include #define orc_warn(fmt, ...) \ - printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) + printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) + +#define orc_warn_current(args...) \ +({ \ + if (state->task == current) \ + orc_warn(args); \ +}) extern int __start_orc_unwind_ip[]; extern int __stop_orc_unwind_ip[]; @@ -446,8 +452,8 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_REG_R10: if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg R10 at ip %pB\n", - (void *)state->ip); + orc_warn_current("missing R10 value at %pB\n", + (void *)state->ip); goto err; } sp = state->regs->r10; @@ -455,8 +461,8 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_REG_R13: if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg R13 at ip %pB\n", - (void *)state->ip); + orc_warn_current("missing R13 value at %pB\n", + (void *)state->ip); goto err; } sp = state->regs->r13; @@ -464,8 +470,8 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_REG_DI: if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg DI at ip %pB\n", - (void *)state->ip); + orc_warn_current("missing RDI value at %pB\n", + (void *)state->ip); goto err; } sp = state->regs->di; @@ -473,15 +479,15 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_REG_DX: if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg DX at ip %pB\n", - (void *)state->ip); + orc_warn_current("missing DX value at %pB\n", + (void *)state->ip); goto err; } sp = state->regs->dx; break; default: - orc_warn("unknown SP base reg %d for ip %pB\n", + orc_warn("unknown SP base reg %d at %pB\n", orc->sp_reg, (void *)state->ip); goto err; } @@ -509,8 +515,8 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_TYPE_REGS: if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { - orc_warn("can't dereference registers at %p for ip %pB\n", - (void *)sp, (void *)orig_ip); + orc_warn_current("can't access registers at %pB\n", + (void *)orig_ip); goto err; } @@ -521,8 +527,8 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_TYPE_REGS_IRET: if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { - orc_warn("can't dereference iret registers at %p for ip %pB\n", - (void *)sp, (void *)orig_ip); + orc_warn_current("can't access iret registers at %pB\n", + (void *)orig_ip); goto err; } @@ -532,7 +538,7 @@ bool unwind_next_frame(struct unwind_state *state) break; default: - orc_warn("unknown .orc_unwind entry type %d for ip %pB\n", + orc_warn("unknown .orc_unwind entry type %d at %pB\n", orc->type, (void *)orig_ip); break; } @@ -564,8 +570,8 @@ bool unwind_next_frame(struct unwind_state *state) if (state->stack_info.type == prev_type && on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && state->sp <= prev_sp) { - orc_warn("stack going in the wrong direction? ip=%pB\n", - (void *)orig_ip); + orc_warn_current("stack going in the wrong direction? at %pB\n", + (void *)orig_ip); goto err; } From f1d9a2abff66aa8156fbc1493abed468db63ea48 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Sat, 25 Apr 2020 05:03:07 -0500 Subject: [PATCH 035/300] x86/unwind/orc: Don't skip the first frame for inactive tasks When unwinding an inactive task, the ORC unwinder skips the first frame by default. If both the 'regs' and 'first_frame' parameters of unwind_start() are NULL, 'state->sp' and 'first_frame' are later initialized to the same value for an inactive task. Given there is a "less than or equal to" comparison used at the end of __unwind_start() for skipping stack frames, the first frame is skipped. Drop the equal part of the comparison and make the behavior equivalent to the frame pointer unwinder. Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder") Reviewed-by: Miroslav Benes Signed-off-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Dave Jones Cc: Jann Horn Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lore.kernel.org/r/7f08db872ab59e807016910acdbe82f744de7065.1587808742.git.jpoimboe@redhat.com --- arch/x86/kernel/unwind_orc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 45166fd50be3..e9f5a20c69c6 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -657,7 +657,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, /* Otherwise, skip ahead to the user-specified starting frame: */ while (!unwind_done(state) && (!on_stack(&state->stack_info, first_frame, sizeof(long)) || - state->sp <= (unsigned long)first_frame)) + state->sp < (unsigned long)first_frame)) unwind_next_frame(state); return; From 98d0c8ebf77e0ba7c54a9ae05ea588f0e9e3f46e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 25 Apr 2020 05:03:08 -0500 Subject: [PATCH 036/300] x86/unwind/orc: Prevent unwinding before ORC initialization If the unwinder is called before the ORC data has been initialized, orc_find() returns NULL, and it tries to fall back to using frame pointers. This can cause some unexpected warnings during boot. Move the 'orc_init' check from orc_find() to __unwind_init(), so that it doesn't even try to unwind from an uninitialized state. Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder") Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Dave Jones Cc: Jann Horn Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lore.kernel.org/r/069d1499ad606d85532eb32ce39b2441679667d5.1587808742.git.jpoimboe@redhat.com --- arch/x86/kernel/unwind_orc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index e9f5a20c69c6..cb11567361cc 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -148,9 +148,6 @@ static struct orc_entry *orc_find(unsigned long ip) { static struct orc_entry *orc; - if (!orc_init) - return NULL; - if (ip == 0) return &null_orc_entry; @@ -591,6 +588,9 @@ EXPORT_SYMBOL_GPL(unwind_next_frame); void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { + if (!orc_init) + goto done; + memset(state, 0, sizeof(*state)); state->task = task; From a0f81bf26888048100bf017fadf438a5bdffa8d8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 25 Apr 2020 05:06:13 -0500 Subject: [PATCH 037/300] x86/unwind/orc: Fix error path for bad ORC entry type If the ORC entry type is unknown, nothing else can be done other than reporting an error. Exit the function instead of breaking out of the switch statement. Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder") Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Dave Jones Cc: Jann Horn Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lore.kernel.org/r/a7fa668ca6eabbe81ab18b2424f15adbbfdc810a.1587808742.git.jpoimboe@redhat.com --- arch/x86/kernel/unwind_orc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index cb11567361cc..33b80a7f998f 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -537,7 +537,7 @@ bool unwind_next_frame(struct unwind_state *state) default: orc_warn("unknown .orc_unwind entry type %d at %pB\n", orc->type, (void *)orig_ip); - break; + goto err; } /* Find BP: */ From 81b67439d147677d844d492fcbd03712ea438f42 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 25 Apr 2020 05:06:14 -0500 Subject: [PATCH 038/300] x86/unwind/orc: Fix premature unwind stoppage due to IRET frames The following execution path is possible: fsnotify() [ realign the stack and store previous SP in R10 ] [ only IRET regs saved ] common_interrupt() interrupt_entry() [ full pt_regs saved ] ... [ unwind stack ] When the unwinder goes through the NMI and the IRQ on the stack, and then sees fsnotify(), it doesn't have access to the value of R10, because it only has the five IRET registers. So the unwind stops prematurely. However, because the interrupt_entry() code is careful not to clobber R10 before saving the full regs, the unwinder should be able to read R10 from the previously saved full pt_regs associated with the NMI. Handle this case properly. When encountering an IRET regs frame immediately after a full pt_regs frame, use the pt_regs as a backup which can be used to get the C register values. Also, note that a call frame resets the 'prev_regs' value, because a function is free to clobber the registers. For this fix to work, the IRET and full regs frames must be adjacent, with no FUNC frames in between. So replace the FUNC hint in interrupt_entry() with an IRET_REGS hint. Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder") Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Dave Jones Cc: Jann Horn Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lore.kernel.org/r/97a408167cc09f1cfa0de31a7b70dd88868d743f.1587808742.git.jpoimboe@redhat.com --- arch/x86/entry/entry_64.S | 4 +-- arch/x86/include/asm/unwind.h | 2 +- arch/x86/kernel/unwind_orc.c | 51 +++++++++++++++++++++++++++-------- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9fe0d5cad8e4..3063aa9090f9 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -511,7 +511,7 @@ SYM_CODE_END(spurious_entries_start) * +----------------------------------------------------+ */ SYM_CODE_START(interrupt_entry) - UNWIND_HINT_FUNC + UNWIND_HINT_IRET_REGS offset=16 ASM_CLAC cld @@ -543,9 +543,9 @@ SYM_CODE_START(interrupt_entry) pushq 5*8(%rdi) /* regs->eflags */ pushq 4*8(%rdi) /* regs->cs */ pushq 3*8(%rdi) /* regs->ip */ + UNWIND_HINT_IRET_REGS pushq 2*8(%rdi) /* regs->orig_ax */ pushq 8(%rdi) /* return address */ - UNWIND_HINT_FUNC movq (%rdi), %rdi jmp 2f diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 499578f7e6d7..70fc159ebe69 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -19,7 +19,7 @@ struct unwind_state { #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; - struct pt_regs *regs; + struct pt_regs *regs, *prev_regs; #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 33b80a7f998f..0ebc11a8bb45 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -384,9 +384,38 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr return true; } +/* + * If state->regs is non-NULL, and points to a full pt_regs, just get the reg + * value from state->regs. + * + * Otherwise, if state->regs just points to IRET regs, and the previous frame + * had full regs, it's safe to get the value from the previous regs. This can + * happen when early/late IRQ entry code gets interrupted by an NMI. + */ +static bool get_reg(struct unwind_state *state, unsigned int reg_off, + unsigned long *val) +{ + unsigned int reg = reg_off/8; + + if (!state->regs) + return false; + + if (state->full_regs) { + *val = ((unsigned long *)state->regs)[reg]; + return true; + } + + if (state->prev_regs) { + *val = ((unsigned long *)state->prev_regs)[reg]; + return true; + } + + return false; +} + bool unwind_next_frame(struct unwind_state *state) { - unsigned long ip_p, sp, orig_ip = state->ip, prev_sp = state->sp; + unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp; enum stack_type prev_type = state->stack_info.type; struct orc_entry *orc; bool indirect = false; @@ -448,39 +477,35 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_REG_R10: - if (!state->regs || !state->full_regs) { + if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) { orc_warn_current("missing R10 value at %pB\n", (void *)state->ip); goto err; } - sp = state->regs->r10; break; case ORC_REG_R13: - if (!state->regs || !state->full_regs) { + if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) { orc_warn_current("missing R13 value at %pB\n", (void *)state->ip); goto err; } - sp = state->regs->r13; break; case ORC_REG_DI: - if (!state->regs || !state->full_regs) { + if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) { orc_warn_current("missing RDI value at %pB\n", (void *)state->ip); goto err; } - sp = state->regs->di; break; case ORC_REG_DX: - if (!state->regs || !state->full_regs) { + if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) { orc_warn_current("missing DX value at %pB\n", (void *)state->ip); goto err; } - sp = state->regs->dx; break; default: @@ -507,6 +532,7 @@ bool unwind_next_frame(struct unwind_state *state) state->sp = sp; state->regs = NULL; + state->prev_regs = NULL; state->signal = false; break; @@ -518,6 +544,7 @@ bool unwind_next_frame(struct unwind_state *state) } state->regs = (struct pt_regs *)sp; + state->prev_regs = NULL; state->full_regs = true; state->signal = true; break; @@ -529,6 +556,8 @@ bool unwind_next_frame(struct unwind_state *state) goto err; } + if (state->full_regs) + state->prev_regs = state->regs; state->regs = (void *)sp - IRET_FRAME_OFFSET; state->full_regs = false; state->signal = true; @@ -543,8 +572,8 @@ bool unwind_next_frame(struct unwind_state *state) /* Find BP: */ switch (orc->bp_reg) { case ORC_REG_UNDEFINED: - if (state->regs && state->full_regs) - state->bp = state->regs->bp; + if (get_reg(state, offsetof(struct pt_regs, bp), &tmp)) + state->bp = tmp; break; case ORC_REG_PREV_SP: From 1200832c6e850a17f36631f6492f953a1b39e6b8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 24 Apr 2020 13:15:21 +0200 Subject: [PATCH 039/300] mptcp: fix race in msk status update Currently subflow_finish_connect() changes unconditionally any msk socket status other than TCP_ESTABLISHED. If an unblocking connect() races with close(), we can end-up triggering: IPv4: Attempt to release TCP socket in state 1 00000000e32b8b7e when the msk socket is disposed. Be sure to enter the established status only from SYN_SENT. Fixes: c3c123d16c0e ("net: mptcp: don't hang in mptcp_sendmsg() after TCP fallback") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index fabd06f2ff45..71256f03707f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -225,7 +225,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); - if (inet_sk_state_load(parent) != TCP_ESTABLISHED) { + if (inet_sk_state_load(parent) == TCP_SYN_SENT) { inet_sk_state_store(parent, TCP_ESTABLISHED); parent->sk_state_change(parent); } From 52a90612fa6108d20cffd3cf6a2c228e2f3619f7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 Apr 2020 18:39:29 -0700 Subject: [PATCH 040/300] net: remove obsolete comment Commit b656722906ef ("net: Increase the size of skb_frag_t") removed the 16bit limitation of a frag on some 32bit arches. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index 90509c37d291..b714162213ae 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2364,7 +2364,6 @@ static void sk_leave_memory_pressure(struct sock *sk) } } -/* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); From 53fb6e990d782ded62d7c76d566e107c03393b74 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 25 Apr 2020 14:19:01 -0500 Subject: [PATCH 041/300] objtool: Fix infinite loop in for_offset_range() Randy reported that objtool got stuck in an infinite loop when processing drivers/i2c/busses/i2c-parport.o. It was caused by the following code: 00000000000001fd : 1fd: 48 b8 00 00 00 00 00 movabs $0x0,%rax 204: 00 00 00 1ff: R_X86_64_64 .rodata-0x8 207: 41 55 push %r13 209: 41 89 f5 mov %esi,%r13d 20c: 41 54 push %r12 20e: 49 89 fc mov %rdi,%r12 211: 55 push %rbp 212: 48 89 d5 mov %rdx,%rbp 215: 53 push %rbx 216: 0f b6 5a 01 movzbl 0x1(%rdx),%ebx 21a: 48 8d 34 dd 00 00 00 lea 0x0(,%rbx,8),%rsi 221: 00 21e: R_X86_64_32S .rodata 222: 48 89 f1 mov %rsi,%rcx 225: 48 29 c1 sub %rax,%rcx find_jump_table() saw the .rodata reference and tried to find a jump table associated with it (though there wasn't one). The -0x8 rela addend is unusual. It caused find_jump_table() to send a negative table_offset (unsigned 0xfffffffffffffff8) to find_rela_by_dest(). The negative offset should have been harmless, but it actually threw for_offset_range() for a loop... literally. When the mask value got incremented past the end value, it also wrapped to zero, causing the loop exit condition to remain true forever. Prevent this scenario from happening by ensuring the incremented value is always >= the starting value. Fixes: 74b873e49d92 ("objtool: Optimize find_rela_by_dest_range()") Reported-by: Randy Dunlap Tested-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Julien Thierry Cc: Miroslav Benes Cc: Peter Zijlstra Link: https://lore.kernel.org/r/02b719674b031800b61e33c30b2e823183627c19.1587842122.git.jpoimboe@redhat.com --- tools/objtool/elf.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h index ebbb10c61e24..c227a2e55751 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/elf.h @@ -87,9 +87,10 @@ struct elf { #define OFFSET_STRIDE (1UL << OFFSET_STRIDE_BITS) #define OFFSET_STRIDE_MASK (~(OFFSET_STRIDE - 1)) -#define for_offset_range(_offset, _start, _end) \ - for (_offset = ((_start) & OFFSET_STRIDE_MASK); \ - _offset <= ((_end) & OFFSET_STRIDE_MASK); \ +#define for_offset_range(_offset, _start, _end) \ + for (_offset = ((_start) & OFFSET_STRIDE_MASK); \ + _offset >= ((_start) & OFFSET_STRIDE_MASK) && \ + _offset <= ((_end) & OFFSET_STRIDE_MASK); \ _offset += OFFSET_STRIDE) static inline u32 sec_offset_hash(struct section *sec, unsigned long offset) From ea64d8d6c675c0bb712689b13810301de9d8f77a Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 21 Apr 2020 02:42:19 +0200 Subject: [PATCH 042/300] netfilter: nat: never update the UDP checksum when it's 0 If the UDP header of a local VXLAN endpoint is NAT-ed, and the VXLAN device has disabled UDP checksums and enabled Tx checksum offloading, then the skb passed to udp_manip_pkt() has hdr->check == 0 (outer checksum disabled) and skb->ip_summed == CHECKSUM_PARTIAL (inner packet checksum offloaded). Because of the ->ip_summed value, udp_manip_pkt() tries to update the outer checksum with the new address and port, leading to an invalid checksum sent on the wire, as the original null checksum obviously didn't take the old address and port into account. So, we can't take ->ip_summed into account in udp_manip_pkt(), as it might not refer to the checksum we're acting on. Instead, we can base the decision to update the UDP checksum entirely on the value of hdr->check, because it's null if and only if checksum is disabled: * A fully computed checksum can't be 0, since a 0 checksum is represented by the CSUM_MANGLED_0 value instead. * A partial checksum can't be 0, since the pseudo-header always adds at least one non-zero value (the UDP protocol type 0x11) and adding more values to the sum can't make it wrap to 0 as the carry is then added to the wrapped number. * A disabled checksum uses the special value 0. The problem seems to be there from day one, although it was probably not visible before UDP tunnels were implemented. Fixes: 5b1158e909ec ("[NETFILTER]: Add NAT support for nf_conntrack") Signed-off-by: Guillaume Nault Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_proto.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 3d816a1e5442..59151dc07fdc 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -68,15 +68,13 @@ static bool udp_manip_pkt(struct sk_buff *skb, enum nf_nat_manip_type maniptype) { struct udphdr *hdr; - bool do_csum; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); - do_csum = hdr->check || skb->ip_summed == CHECKSUM_PARTIAL; + __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check); - __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, do_csum); return true; } From 8aebfffacfa379ba400da573a5bf9e49634e38cb Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Sat, 25 Apr 2020 20:52:26 +0800 Subject: [PATCH 043/300] configfs: fix config_item refcnt leak in configfs_rmdir() configfs_rmdir() invokes configfs_get_config_item(), which returns a reference of the specified config_item object to "parent_item" with increased refcnt. When configfs_rmdir() returns, local variable "parent_item" becomes invalid, so the refcount should be decreased to keep refcount balanced. The reference counting issue happens in one exception handling path of configfs_rmdir(). When down_write_killable() fails, the function forgets to decrease the refcnt increased by configfs_get_config_item(), causing a refcnt leak. Fix this issue by calling config_item_put() when down_write_killable() fails. Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Signed-off-by: Christoph Hellwig --- fs/configfs/dir.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index cf7b7e1d5bd7..cb733652ecca 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1519,6 +1519,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) spin_lock(&configfs_dirent_lock); configfs_detach_rollback(dentry); spin_unlock(&configfs_dirent_lock); + config_item_put(parent_item); return -EINTR; } frag->frag_dead = true; From 67321e02fb2da91a6a6c3fb059bf89d10ccda8ad Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 21 Apr 2020 04:18:15 +0000 Subject: [PATCH 044/300] phy: qcom-qusb2: Re add "qcom,sdm845-qusb2-phy" compat string This patch fixes a regression in 5.7-rc1+ In commit 8fe75cd4cddf ("phy: qcom-qusb2: Add generic QUSB2 V2 PHY support"), the change was made to add "qcom,qusb2-v2-phy" as a generic compat string. However the change also removed the "qcom,sdm845-qusb2-phy" compat string, which is documented in the binding and already in use. This patch re-adds the "qcom,sdm845-qusb2-phy" compat string which allows the driver to continue to work with existing dts entries such as found on the db845c. Cc: Andy Gross Cc: Bjorn Andersson Cc: Rob Herring Cc: Mark Rutland Cc: Doug Anderson Cc: Manu Gautam Cc: Sandeep Maheswaram Cc: Matthias Kaehlcke Cc: Stephen Boyd Cc: Kishon Vijay Abraham I Cc: linux-arm-msm@vger.kernel.org Cc: devicetree@vger.kernel.org Reviewed-by: Stephen Boyd Reviewed-by: Douglas Anderson Reviewed-by: Bjorn Andersson Fixes: 8fe75cd4cddf ("phy: qcom-qusb2: Add generic QUSB2 V2 PHY support") Reported-by: YongQin Liu Signed-off-by: John Stultz Signed-off-by: Kishon Vijay Abraham I --- drivers/phy/qualcomm/phy-qcom-qusb2.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/phy/qualcomm/phy-qcom-qusb2.c b/drivers/phy/qualcomm/phy-qcom-qusb2.c index 3708d43b7508..393011a05b48 100644 --- a/drivers/phy/qualcomm/phy-qcom-qusb2.c +++ b/drivers/phy/qualcomm/phy-qcom-qusb2.c @@ -815,6 +815,13 @@ static const struct of_device_id qusb2_phy_of_match_table[] = { }, { .compatible = "qcom,msm8998-qusb2-phy", .data = &msm8998_phy_cfg, + }, { + /* + * Deprecated. Only here to support legacy device + * trees that didn't include "qcom,qusb2-v2-phy" + */ + .compatible = "qcom,sdm845-qusb2-phy", + .data = &qusb2_v2_phy_cfg, }, { .compatible = "qcom,qusb2-v2-phy", .data = &qusb2_v2_phy_cfg, From 107bc0766b9feb5113074c753735a3f115c2141f Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 24 Apr 2020 17:08:29 +0200 Subject: [PATCH 045/300] vhost/vsock: fix packet delivery order to monitoring devices We want to deliver packets to monitoring devices before it is put in the virtqueue, to avoid that replies can appear in the packet capture before the transmitted packet. Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- drivers/vhost/vsock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index e36aaf9ba7bd..4f50dcb89ac8 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -181,14 +181,14 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, break; } - vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len); - added = true; - - /* Deliver to monitoring devices all correctly transmitted - * packets. + /* Deliver to monitoring devices all packets that we + * will transmit. */ virtio_transport_deliver_tap_pkt(pkt); + vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len); + added = true; + pkt->off += payload_len; total_len += payload_len; From a78d163978567adc2733465289293dad479d842a Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 24 Apr 2020 17:08:30 +0200 Subject: [PATCH 046/300] vsock/virtio: fix multiple packet delivery to monitoring devices In virtio_transport.c, if the virtqueue is full, the transmitting packet is queued up and it will be sent in the next iteration. This causes the same packet to be delivered multiple times to monitoring devices. We want to continue to deliver packets to monitoring devices before it is put in the virtqueue, to avoid that replies can appear in the packet capture before the transmitted packet. This patch fixes the issue, adding a new flag (tap_delivered) in struct virtio_vsock_pkt, to check if the packet is already delivered to monitoring devices. In vhost/vsock.c, we are splitting packets, so we must set 'tap_delivered' to false when we queue up the same virtio_vsock_pkt to handle the remaining bytes. Signed-off-by: Stefano Garzarella Signed-off-by: David S. Miller --- drivers/vhost/vsock.c | 6 ++++++ include/linux/virtio_vsock.h | 1 + net/vmw_vsock/virtio_transport_common.c | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 4f50dcb89ac8..31a98c74f678 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -196,6 +196,12 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, * to send it with the next available buffer. */ if (pkt->off < pkt->len) { + /* We are queueing the same virtio_vsock_pkt to handle + * the remaining bytes, and we want to deliver it + * to monitoring devices in the next iteration. + */ + pkt->tap_delivered = false; + spin_lock_bh(&vsock->send_pkt_list_lock); list_add(&pkt->list, &vsock->send_pkt_list); spin_unlock_bh(&vsock->send_pkt_list_lock); diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 71c81e0dc8f2..dc636b727179 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -48,6 +48,7 @@ struct virtio_vsock_pkt { u32 len; u32 off; bool reply; + bool tap_delivered; }; struct virtio_vsock_pkt_info { diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 709038a4783e..69efc891885f 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -157,7 +157,11 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt) { + if (pkt->tap_delivered) + return; + vsock_deliver_tap(virtio_transport_build_skb, pkt); + pkt->tap_delivered = true; } EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); From 6de556c31061e3b9c36546ffaaac5fdb679a2f14 Mon Sep 17 00:00:00 2001 From: Richard Clark Date: Sat, 25 Apr 2020 08:58:11 +0800 Subject: [PATCH 047/300] aquantia: Fix the media type of AQC100 ethernet controller in the driver The Aquantia AQC100 controller enables a SFP+ port, so the driver should configure the media type as '_TYPE_FIBRE' instead of '_TYPE_TP'. Signed-off-by: Richard Clark Cc: Igor Russkikh Cc: "David S. Miller" Acked-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index 2edf137a7030..8a70ffe1d326 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -57,7 +57,7 @@ static const struct aq_board_revision_s hw_atl_boards[] = { { AQ_DEVICE_ID_D108, AQ_HWREV_2, &hw_atl_ops_b0, &hw_atl_b0_caps_aqc108, }, { AQ_DEVICE_ID_D109, AQ_HWREV_2, &hw_atl_ops_b0, &hw_atl_b0_caps_aqc109, }, - { AQ_DEVICE_ID_AQC100, AQ_HWREV_ANY, &hw_atl_ops_b1, &hw_atl_b0_caps_aqc107, }, + { AQ_DEVICE_ID_AQC100, AQ_HWREV_ANY, &hw_atl_ops_b1, &hw_atl_b0_caps_aqc100, }, { AQ_DEVICE_ID_AQC107, AQ_HWREV_ANY, &hw_atl_ops_b1, &hw_atl_b0_caps_aqc107, }, { AQ_DEVICE_ID_AQC108, AQ_HWREV_ANY, &hw_atl_ops_b1, &hw_atl_b0_caps_aqc108, }, { AQ_DEVICE_ID_AQC109, AQ_HWREV_ANY, &hw_atl_ops_b1, &hw_atl_b0_caps_aqc109, }, From 095f5614bfe16e5b3e191b34ea41b10d6fdd4ced Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Sat, 25 Apr 2020 20:54:37 +0800 Subject: [PATCH 048/300] net/tls: Fix sk_psock refcnt leak in bpf_exec_tx_verdict() bpf_exec_tx_verdict() invokes sk_psock_get(), which returns a reference of the specified sk_psock object to "psock" with increased refcnt. When bpf_exec_tx_verdict() returns, local variable "psock" becomes invalid, so the refcount should be decreased to keep refcount balanced. The reference counting issue happens in one exception handling path of bpf_exec_tx_verdict(). When "policy" equals to NULL but "psock" is not NULL, the function forgets to decrease the refcnt increased by sk_psock_get(), causing a refcnt leak. Fix this issue by calling sk_psock_put() on this error path before bpf_exec_tx_verdict() returns. Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index c98e602a1a2d..704313dd082f 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -800,6 +800,8 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, *copied -= sk_msg_free(sk, msg); tls_free_open_rec(sk); } + if (psock) + sk_psock_put(sk, psock); return err; } more_data: From 4becb7ee5b3d2829ed7b9261a245a77d5b7de902 Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Sat, 25 Apr 2020 21:06:25 +0800 Subject: [PATCH 049/300] net/x25: Fix x25_neigh refcnt leak when x25 disconnect x25_connect() invokes x25_get_neigh(), which returns a reference of the specified x25_neigh object to "x25->neighbour" with increased refcnt. When x25 connect success and returns, the reference still be hold by "x25->neighbour", so the refcount should be decreased in x25_disconnect() to keep refcount balanced. The reference counting issue happens in x25_disconnect(), which forgets to decrease the refcnt increased by x25_get_neigh() in x25_connect(), causing a refcnt leak. Fix this issue by calling x25_neigh_put() before x25_disconnect() returns. Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Signed-off-by: David S. Miller --- net/x25/x25_subr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 8aa415a38814..8b1b06cabcbf 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -357,6 +357,10 @@ void x25_disconnect(struct sock *sk, int reason, unsigned char cause, sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } + read_lock_bh(&x25_list_lock); + x25_neigh_put(x25->neighbour); + x25->neighbour = NULL; + read_unlock_bh(&x25_list_lock); } /* From 62b4011fa7bef9fa00a6aeec26e69685dc1cc21e Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Sat, 25 Apr 2020 21:10:23 +0800 Subject: [PATCH 050/300] net/tls: Fix sk_psock refcnt leak when in tls_data_ready() tls_data_ready() invokes sk_psock_get(), which returns a reference of the specified sk_psock object to "psock" with increased refcnt. When tls_data_ready() returns, local variable "psock" becomes invalid, so the refcount should be decreased to keep refcount balanced. The reference counting issue happens in one exception handling path of tls_data_ready(). When "psock->ingress_msg" is empty but "psock" is not NULL, the function forgets to decrease the refcnt increased by sk_psock_get(), causing a refcnt leak. Fix this issue by calling sk_psock_put() on all paths when "psock" is not NULL. Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 704313dd082f..e23f94a5549b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2083,8 +2083,9 @@ static void tls_data_ready(struct sock *sk) strp_data_ready(&ctx->strp); psock = sk_psock_get(sk); - if (psock && !list_empty(&psock->ingress_msg)) { - ctx->saved_data_ready(sk); + if (psock) { + if (!list_empty(&psock->ingress_msg)) + ctx->saved_data_ready(sk); sk_psock_put(sk, psock); } } From 14695212d4cd8b0c997f6121b6df8520038ce076 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 25 Apr 2020 12:40:25 -0700 Subject: [PATCH 051/300] fq_codel: fix TCA_FQ_CODEL_DROP_BATCH_SIZE sanity checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My intent was to not let users set a zero drop_batch_size, it seems I once again messed with min()/max(). Fixes: 9d18562a2278 ("fq_codel: add batch ability to fq_codel_drop()") Signed-off-by: Eric Dumazet Acked-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_fq_codel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 968519ff36e9..436160be9c18 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -416,7 +416,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) - q->drop_batch_size = min(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); + q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); if (tb[TCA_FQ_CODEL_MEMORY_LIMIT]) q->memory_limit = min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT])); From 8738c85c72b3108c9b9a369a39868ba5f8e10ae0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 25 Apr 2020 15:19:51 -0700 Subject: [PATCH 052/300] sch_choke: avoid potential panic in choke_reset() If choke_init() could not allocate q->tab, we would crash later in choke_reset(). BUG: KASAN: null-ptr-deref in memset include/linux/string.h:366 [inline] BUG: KASAN: null-ptr-deref in choke_reset+0x208/0x340 net/sched/sch_choke.c:326 Write of size 8 at addr 0000000000000000 by task syz-executor822/7022 CPU: 1 PID: 7022 Comm: syz-executor822 Not tainted 5.7.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x188/0x20d lib/dump_stack.c:118 __kasan_report.cold+0x5/0x4d mm/kasan/report.c:515 kasan_report+0x33/0x50 mm/kasan/common.c:625 check_memory_region_inline mm/kasan/generic.c:187 [inline] check_memory_region+0x141/0x190 mm/kasan/generic.c:193 memset+0x20/0x40 mm/kasan/common.c:85 memset include/linux/string.h:366 [inline] choke_reset+0x208/0x340 net/sched/sch_choke.c:326 qdisc_reset+0x6b/0x520 net/sched/sch_generic.c:910 dev_deactivate_queue.constprop.0+0x13c/0x240 net/sched/sch_generic.c:1138 netdev_for_each_tx_queue include/linux/netdevice.h:2197 [inline] dev_deactivate_many+0xe2/0xba0 net/sched/sch_generic.c:1195 dev_deactivate+0xf8/0x1c0 net/sched/sch_generic.c:1233 qdisc_graft+0xd25/0x1120 net/sched/sch_api.c:1051 tc_modify_qdisc+0xbab/0x1a00 net/sched/sch_api.c:1670 rtnetlink_rcv_msg+0x44e/0xad0 net/core/rtnetlink.c:5454 netlink_rcv_skb+0x15a/0x410 net/netlink/af_netlink.c:2469 netlink_unicast_kernel net/netlink/af_netlink.c:1303 [inline] netlink_unicast+0x537/0x740 net/netlink/af_netlink.c:1329 netlink_sendmsg+0x882/0xe10 net/netlink/af_netlink.c:1918 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:672 ____sys_sendmsg+0x6bf/0x7e0 net/socket.c:2362 ___sys_sendmsg+0x100/0x170 net/socket.c:2416 __sys_sendmsg+0xec/0x1b0 net/socket.c:2449 do_syscall_64+0xf6/0x7d0 arch/x86/entry/common.c:295 Fixes: 77e62da6e60c ("sch_choke: drop all packets in queue during reset") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_choke.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index a36974e9c601..1bcf8fbfd40e 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -323,7 +323,8 @@ static void choke_reset(struct Qdisc *sch) sch->q.qlen = 0; sch->qstats.backlog = 0; - memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *)); + if (q->tab) + memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *)); q->head = q->tail = 0; red_restart(&q->vars); } From c3e302edca2457bbd0c958c445a7538fbf6a6ac8 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sun, 26 Apr 2020 09:22:06 +0300 Subject: [PATCH 053/300] net: phy: marvell10g: fix temperature sensor on 2110 Read the temperature sensor register from the correct location for the 88E2110 PHY. There is no enable/disable bit on 2110, so make mv3310_hwmon_config() run on 88X3310 only. Fixes: 62d01535474b61 ("net: phy: marvell10g: add support for the 88x2110 PHY") Cc: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: Baruch Siach Reviewed-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/marvell10g.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index ff12492771ab..1f1a01c98e44 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -66,6 +66,9 @@ enum { MV_PCS_CSSR1_SPD2_2500 = 0x0004, MV_PCS_CSSR1_SPD2_10000 = 0x0000, + /* Temperature read register (88E2110 only) */ + MV_PCS_TEMP = 0x8042, + /* These registers appear at 0x800X and 0xa00X - the 0xa00X control * registers appear to set themselves to the 0x800X when AN is * restarted, but status registers appear readable from either. @@ -77,6 +80,7 @@ enum { MV_V2_PORT_CTRL = 0xf001, MV_V2_PORT_CTRL_SWRST = BIT(15), MV_V2_PORT_CTRL_PWRDOWN = BIT(11), + /* Temperature control/read registers (88X3310 only) */ MV_V2_TEMP_CTRL = 0xf08a, MV_V2_TEMP_CTRL_MASK = 0xc000, MV_V2_TEMP_CTRL_SAMPLE = 0x0000, @@ -104,6 +108,24 @@ static umode_t mv3310_hwmon_is_visible(const void *data, return 0; } +static int mv3310_hwmon_read_temp_reg(struct phy_device *phydev) +{ + return phy_read_mmd(phydev, MDIO_MMD_VEND2, MV_V2_TEMP); +} + +static int mv2110_hwmon_read_temp_reg(struct phy_device *phydev) +{ + return phy_read_mmd(phydev, MDIO_MMD_PCS, MV_PCS_TEMP); +} + +static int mv10g_hwmon_read_temp_reg(struct phy_device *phydev) +{ + if (phydev->drv->phy_id == MARVELL_PHY_ID_88X3310) + return mv3310_hwmon_read_temp_reg(phydev); + else /* MARVELL_PHY_ID_88E2110 */ + return mv2110_hwmon_read_temp_reg(phydev); +} + static int mv3310_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *value) { @@ -116,7 +138,7 @@ static int mv3310_hwmon_read(struct device *dev, enum hwmon_sensor_types type, } if (type == hwmon_temp && attr == hwmon_temp_input) { - temp = phy_read_mmd(phydev, MDIO_MMD_VEND2, MV_V2_TEMP); + temp = mv10g_hwmon_read_temp_reg(phydev); if (temp < 0) return temp; @@ -169,6 +191,9 @@ static int mv3310_hwmon_config(struct phy_device *phydev, bool enable) u16 val; int ret; + if (phydev->drv->phy_id != MARVELL_PHY_ID_88X3310) + return 0; + ret = phy_write_mmd(phydev, MDIO_MMD_VEND2, MV_V2_TEMP, MV_V2_TEMP_UNKNOWN); if (ret < 0) From c71c4e49afe173823a2a85b0cabc9b3f1176ffa2 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 26 Apr 2020 16:24:38 -0400 Subject: [PATCH 054/300] bnxt_en: Fix VF anti-spoof filter setup. Fix the logic that sets the enable/disable flag for the source MAC filter according to firmware spec 1.7.1. In the original firmware spec. before 1.7.1, the VF spoof check flags were not latched after making the HWRM_FUNC_CFG call, so there was a need to keep the func_flags so that subsequent calls would perserve the VF spoof check setting. A change was made in the 1.7.1 spec so that the flags became latched. So we now set or clear the anti- spoof setting directly without retrieving the old settings in the stored vf->func_flags which are no longer valid. We also remove the unneeded vf->func_flags. Fixes: 8eb992e876a8 ("bnxt_en: Update firmware interface spec to 1.7.6.2.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 - drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c | 10 ++-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index f2caa2756f5b..f6a3250ef1c5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1066,7 +1066,6 @@ struct bnxt_vf_info { #define BNXT_VF_LINK_FORCED 0x4 #define BNXT_VF_LINK_UP 0x8 #define BNXT_VF_TRUST 0x10 - u32 func_flags; /* func cfg flags */ u32 min_tx_rate; u32 max_tx_rate; void *hwrm_cmd_req_addr; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index 6ea3df6da18c..cea2f9958a1d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -85,11 +85,10 @@ int bnxt_set_vf_spoofchk(struct net_device *dev, int vf_id, bool setting) if (old_setting == setting) return 0; - func_flags = vf->func_flags; if (setting) - func_flags |= FUNC_CFG_REQ_FLAGS_SRC_MAC_ADDR_CHECK_ENABLE; + func_flags = FUNC_CFG_REQ_FLAGS_SRC_MAC_ADDR_CHECK_ENABLE; else - func_flags |= FUNC_CFG_REQ_FLAGS_SRC_MAC_ADDR_CHECK_DISABLE; + func_flags = FUNC_CFG_REQ_FLAGS_SRC_MAC_ADDR_CHECK_DISABLE; /*TODO: if the driver supports VLAN filter on guest VLAN, * the spoof check should also include vlan anti-spoofing */ @@ -98,7 +97,6 @@ int bnxt_set_vf_spoofchk(struct net_device *dev, int vf_id, bool setting) req.flags = cpu_to_le32(func_flags); rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); if (!rc) { - vf->func_flags = func_flags; if (setting) vf->flags |= BNXT_VF_SPOOFCHK; else @@ -228,7 +226,6 @@ int bnxt_set_vf_mac(struct net_device *dev, int vf_id, u8 *mac) memcpy(vf->mac_addr, mac, ETH_ALEN); bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1); req.fid = cpu_to_le16(vf->fw_fid); - req.flags = cpu_to_le32(vf->func_flags); req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_DFLT_MAC_ADDR); memcpy(req.dflt_mac_addr, mac, ETH_ALEN); return hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); @@ -266,7 +263,6 @@ int bnxt_set_vf_vlan(struct net_device *dev, int vf_id, u16 vlan_id, u8 qos, bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1); req.fid = cpu_to_le16(vf->fw_fid); - req.flags = cpu_to_le32(vf->func_flags); req.dflt_vlan = cpu_to_le16(vlan_tag); req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_DFLT_VLAN); rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT); @@ -305,7 +301,6 @@ int bnxt_set_vf_bw(struct net_device *dev, int vf_id, int min_tx_rate, return 0; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1); req.fid = cpu_to_le16(vf->fw_fid); - req.flags = cpu_to_le32(vf->func_flags); req.enables = cpu_to_le32(FUNC_CFG_REQ_ENABLES_MAX_BW); req.max_bw = cpu_to_le32(max_tx_rate); req.enables |= cpu_to_le32(FUNC_CFG_REQ_ENABLES_MIN_BW); @@ -477,7 +472,6 @@ static void __bnxt_set_vf_params(struct bnxt *bp, int vf_id) vf = &bp->pf.vf[vf_id]; bnxt_hwrm_cmd_hdr_init(bp, &req, HWRM_FUNC_CFG, -1, -1); req.fid = cpu_to_le16(vf->fw_fid); - req.flags = cpu_to_le32(vf->func_flags); if (is_valid_ether_addr(vf->mac_addr)) { req.enables |= cpu_to_le32(FUNC_CFG_REQ_ENABLES_DFLT_MAC_ADDR); From 9e68cb0359b20f99c7b070f1d3305e5e0a9fae6d Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Sun, 26 Apr 2020 16:24:39 -0400 Subject: [PATCH 055/300] bnxt_en: Reduce BNXT_MSIX_VEC_MAX value to supported CQs per PF. Broadcom adapters support only maximum of 512 CQs per PF. If user sets MSIx vectors more than supported CQs, firmware is setting incorrect value for msix_vec_per_pf_max parameter. Fix it by reducing the BNXT_MSIX_VEC_MAX value to 512, even though the maximum # of MSIx vectors supported by adapter are 1280. Fixes: f399e8497826 ("bnxt_en: Use msix_vec_per_pf_max and msix_vec_per_pf_min devlink params.") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h index 95f893f2a74d..d5c8bd49383a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h @@ -43,7 +43,7 @@ static inline void bnxt_link_bp_to_dl(struct bnxt *bp, struct devlink *dl) #define BNXT_NVM_CFG_VER_BITS 24 #define BNXT_NVM_CFG_VER_BYTES 4 -#define BNXT_MSIX_VEC_MAX 1280 +#define BNXT_MSIX_VEC_MAX 512 #define BNXT_MSIX_VEC_MIN_MAX 128 enum bnxt_nvm_dir_type { From bae361c54fb6ac6eba3b4762f49ce14beb73ef13 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 26 Apr 2020 16:24:40 -0400 Subject: [PATCH 056/300] bnxt_en: Improve AER slot reset. Improve the slot reset sequence by disabling the device to prevent bad DMAs if slot reset fails. Return the proper result instead of always PCI_ERS_RESULT_RECOVERED to the caller. Fixes: 6316ea6db93d ("bnxt_en: Enable AER support.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index fead64f1ad90..d8db08ec8693 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -12212,12 +12212,15 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev) bnxt_ulp_start(bp, err); } - if (result != PCI_ERS_RESULT_RECOVERED && netif_running(netdev)) - dev_close(netdev); + if (result != PCI_ERS_RESULT_RECOVERED) { + if (netif_running(netdev)) + dev_close(netdev); + pci_disable_device(pdev); + } rtnl_unlock(); - return PCI_ERS_RESULT_RECOVERED; + return result; } /** From bbf211b1ecb891c7e0cc7888834504183fc8b534 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 26 Apr 2020 16:24:41 -0400 Subject: [PATCH 057/300] bnxt_en: Return error when allocating zero size context memory. bnxt_alloc_ctx_pg_tbls() should return error when the memory size of the context memory to set up is zero. By returning success (0), the caller may proceed normally and may crash later when it tries to set up the memory. Fixes: 08fe9d181606 ("bnxt_en: Add Level 2 context memory paging support.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index d8db08ec8693..070c42d3318f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6642,7 +6642,7 @@ static int bnxt_alloc_ctx_pg_tbls(struct bnxt *bp, int rc; if (!mem_size) - return 0; + return -EINVAL; ctx_pg->nr_pages = DIV_ROUND_UP(mem_size, BNXT_PAGE_SIZE); if (ctx_pg->nr_pages > MAX_CTX_TOTAL_PAGES) { From c72cb303aa6c2ae7e4184f0081c6d11bf03fb96b Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 26 Apr 2020 16:24:42 -0400 Subject: [PATCH 058/300] bnxt_en: Fix VLAN acceleration handling in bnxt_fix_features(). The current logic in bnxt_fix_features() will inadvertently turn on both CTAG and STAG VLAN offload if the user tries to disable both. Fix it by checking that the user is trying to enable CTAG or STAG before enabling both. The logic is supposed to enable or disable both CTAG and STAG together. Fixes: 5a9f6b238e59 ("bnxt_en: Enable and disable RX CTAG and RX STAG VLAN acceleration together.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 070c42d3318f..d1a83716d934 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9780,6 +9780,7 @@ static netdev_features_t bnxt_fix_features(struct net_device *dev, netdev_features_t features) { struct bnxt *bp = netdev_priv(dev); + netdev_features_t vlan_features; if ((features & NETIF_F_NTUPLE) && !bnxt_rfs_capable(bp)) features &= ~NETIF_F_NTUPLE; @@ -9796,12 +9797,14 @@ static netdev_features_t bnxt_fix_features(struct net_device *dev, /* Both CTAG and STAG VLAN accelaration on the RX side have to be * turned on or off together. */ - if ((features & (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX)) != - (NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX)) { + vlan_features = features & (NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_STAG_RX); + if (vlan_features != (NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_STAG_RX)) { if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) features &= ~(NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX); - else + else if (vlan_features) features |= NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_STAG_RX; } From df4953e4e997e273501339f607b77953772e3559 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 26 Apr 2020 18:19:07 -0700 Subject: [PATCH 059/300] sch_sfq: validate silly quantum values syzbot managed to set up sfq so that q->scaled_quantum was zero, triggering an infinite loop in sfq_dequeue() More generally, we must only accept quantum between 1 and 2^18 - 7, meaning scaled_quantum must be in [1, 0x7FFF] range. Otherwise, we also could have a loop in sfq_dequeue() if scaled_quantum happens to be 0x8000, since slot->allot could indefinitely switch between 0 and 0x8000. Fixes: eeaeb068f139 ("sch_sfq: allow big packets and be fair") Signed-off-by: Eric Dumazet Reported-by: syzbot+0251e883fe39e7a0cb0a@syzkaller.appspotmail.com Cc: Jason A. Donenfeld Signed-off-by: David S. Miller --- net/sched/sch_sfq.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index c787d4d46017..5a6def5e4e6d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -637,6 +637,15 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) if (ctl->divisor && (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; + + /* slot->allot is a short, make sure quantum is not too big. */ + if (ctl->quantum) { + unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum); + + if (scaled <= 0 || scaled > SHRT_MAX) + return -EINVAL; + } + if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog)) return -EINVAL; From 4b5b71f770e2edefbfe74203777264bfe6a9927c Mon Sep 17 00:00:00 2001 From: Anthony Felice Date: Sun, 26 Apr 2020 22:00:59 -0400 Subject: [PATCH 060/300] net: tc35815: Fix phydev supported/advertising mask Commit 3c1bcc8614db ("net: ethernet: Convert phydev advertize and supported from u32 to link mode") updated ethernet drivers to use a linkmode bitmap. It mistakenly dropped a bitwise negation in the tc35815 ethernet driver on a bitmask to set the supported/advertising flags. Found by Anthony via code inspection, not tested as I do not have the required hardware. Fixes: 3c1bcc8614db ("net: ethernet: Convert phydev advertize and supported from u32 to link mode") Signed-off-by: Anthony Felice Reviewed-by: Akshay Bhat Reviewed-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/toshiba/tc35815.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/toshiba/tc35815.c b/drivers/net/ethernet/toshiba/tc35815.c index b50c3ec3495b..6bcda20ed7e7 100644 --- a/drivers/net/ethernet/toshiba/tc35815.c +++ b/drivers/net/ethernet/toshiba/tc35815.c @@ -643,7 +643,7 @@ static int tc_mii_probe(struct net_device *dev) linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, mask); linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, mask); } - linkmode_and(phydev->supported, phydev->supported, mask); + linkmode_andnot(phydev->supported, phydev->supported, mask); linkmode_copy(phydev->advertising, phydev->supported); lp->link = 0; From 10e3cc180e64385edc9890c6855acf5ed9ca1339 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 27 Apr 2020 08:18:03 +0200 Subject: [PATCH 061/300] net/sonic: Fix a resource leak in an error handling path in 'jazz_sonic_probe()' A call to 'dma_alloc_coherent()' is hidden in 'sonic_alloc_descriptors()', called from 'sonic_probe1()'. This is correctly freed in the remove function, but not in the error handling path of the probe function. Fix it and add the missing 'dma_free_coherent()' call. While at it, rename a label in order to be slightly more informative. Fixes: efcce839360f ("[PATCH] macsonic/jazzsonic network drivers update") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/natsemi/jazzsonic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/natsemi/jazzsonic.c b/drivers/net/ethernet/natsemi/jazzsonic.c index bfa0c0d39600..8b018ed37b1b 100644 --- a/drivers/net/ethernet/natsemi/jazzsonic.c +++ b/drivers/net/ethernet/natsemi/jazzsonic.c @@ -208,11 +208,13 @@ static int jazz_sonic_probe(struct platform_device *pdev) err = register_netdev(dev); if (err) - goto out1; + goto undo_probe1; return 0; -out1: +undo_probe1: + dma_free_coherent(lp->device, SIZEOF_SONIC_DESC * SONIC_BUS_SCALE(lp->dma_bitmode), + lp->descriptors, lp->descriptors_laddr); release_mem_region(dev->base_addr, SONIC_MEM_SIZE); out: free_netdev(dev); From 66bb7fa81e28514ffd2cee9d07526fbb7484c029 Mon Sep 17 00:00:00 2001 From: Brian King Date: Mon, 27 Apr 2020 16:48:24 -0500 Subject: [PATCH 062/300] scsi: ibmvfc: Don't send implicit logouts prior to NPIV login Commit ed830385a2b1 ("scsi: ibmvfc: Avoid loss of all paths during SVC node reboot") introduced a regression where when the client resets or re-enables its CRQ with the hypervisor there is a chance that if the server side doesn't issue its INIT handshake quick enough the client can issue an Implicit Logout prior to doing an NPIV Login. The server treats this scenario as a protocol violation and closes the CRQ on its end forcing the client through a reset that gets the client host state and next host action out of agreement leading to a BUG assert. ibmvfc 30000003: Partner initialization complete ibmvfc 30000002: Partner initialization complete ibmvfc 30000002: Host partner adapter deregistered or failed (rc=2) ibmvfc 30000002: Partner initialized ------------[ cut here ]------------ kernel BUG at ../drivers/scsi/ibmvscsi/ibmvfc.c:4489! Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries Supported: No, Unreleased kernel CPU: 16 PID: 1290 Comm: ibmvfc_0 Tainted: G OE X 5.3.18-12-default NIP: c00800000d84a2b4 LR: c00800000d84a040 CTR: c00800000d84a2a0 REGS: c00000000cb57a00 TRAP: 0700 Tainted: G OE X (5.3.18-12-default) MSR: 800000000282b033 CR: 24000848 XER: 00000001 CFAR: c00800000d84a070 IRQMASK: 1 GPR00: c00800000d84a040 c00000000cb57c90 c00800000d858e00 0000000000000000 GPR04: 0000000000000000 0000000000000000 0000000000000000 00000000000000a0 GPR08: c00800000d84a074 0000000000000001 0000000000000014 c00800000d84d7d0 GPR12: 0000000000000000 c00000001ea28200 c00000000016cd98 0000000000000000 GPR16: c00800000d84b7b8 0000000000000000 0000000000000000 c00000542c706d68 GPR20: 0000000000000005 c00000542c706d88 5deadbeef0000100 5deadbeef0000122 GPR24: 000000000000000c 000000000000000b c00800000d852180 0000000000000001 GPR28: 0000000000000000 c00000542c706da0 c00000542c706860 c00000542c706828 NIP [c00800000d84a2b4] ibmvfc_work+0x3ac/0xc90 [ibmvfc] LR [c00800000d84a040] ibmvfc_work+0x138/0xc90 [ibmvfc] This scenario can be prevented by rejecting any attempt to send an Implicit Logout if the client adapter is not logged in yet. Link: https://lore.kernel.org/r/20200427214824.6890-1-tyreld@linux.ibm.com Fixes: ed830385a2b1 ("scsi: ibmvfc: Avoid loss of all paths during SVC node reboot") Signed-off-by: Brian King Signed-off-by: Tyrel Datwyler Signed-off-by: Martin K. Petersen --- drivers/scsi/ibmvscsi/ibmvfc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 7da9e060b270..635f6f9cffc4 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -3640,6 +3640,11 @@ static void ibmvfc_tgt_implicit_logout_and_del(struct ibmvfc_target *tgt) struct ibmvfc_host *vhost = tgt->vhost; struct ibmvfc_event *evt; + if (!vhost->logged_in) { + ibmvfc_set_tgt_action(tgt, IBMVFC_TGT_ACTION_DEL_RPORT); + return; + } + if (vhost->discovery_threads >= disc_threads) return; From 522587e7c008c24f19ef5ef34806268992c5e5a6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 7 Apr 2020 12:31:33 +0300 Subject: [PATCH 063/300] bus: mhi: core: Fix a NULL vs IS_ERR check in mhi_create_devices() The mhi_alloc_device() function never returns NULL, it returns error pointers. Fixes: da1c4f856924 ("bus: mhi: core: Add support for creating and destroying MHI devices") Signed-off-by: Dan Carpenter Acked-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20200407093133.GM68494@mwanda Signed-off-by: Greg Kroah-Hartman --- drivers/bus/mhi/core/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c index eb4256b81406..55928feea0c9 100644 --- a/drivers/bus/mhi/core/main.c +++ b/drivers/bus/mhi/core/main.c @@ -294,7 +294,7 @@ void mhi_create_devices(struct mhi_controller *mhi_cntrl) !(mhi_chan->ee_mask & BIT(mhi_cntrl->ee))) continue; mhi_dev = mhi_alloc_device(mhi_cntrl); - if (!mhi_dev) + if (IS_ERR(mhi_dev)) return; mhi_dev->dev_type = MHI_DEVICE_XFER; From 5e56bc06e18dfc8a66180fa369384b36e2ab621a Mon Sep 17 00:00:00 2001 From: Christian Gromm Date: Fri, 24 Apr 2020 17:16:34 +0200 Subject: [PATCH 064/300] most: core: use function subsys_initcall() This patch replaces function module_init() with subsys_initcall(). It is needed to ensure that the core module of the driver is initialized before a component tries to register with the core. This leads to a NULL pointer dereference if the driver is configured as in-tree. Signed-off-by: Christian Gromm Reported-by: kernel test robot Link: https://lore.kernel.org/r/1587741394-22021-1-git-send-email-christian.gromm@microchip.com Signed-off-by: Greg Kroah-Hartman --- drivers/most/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/most/core.c b/drivers/most/core.c index 06426fc5c990..f781c46cd4af 100644 --- a/drivers/most/core.c +++ b/drivers/most/core.c @@ -1483,7 +1483,7 @@ static void __exit most_exit(void) ida_destroy(&mdev_id); } -module_init(most_init); +subsys_initcall(most_init); module_exit(most_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Christian Gromm "); From 9495b7e92f716ab2bd6814fab5e97ab4a39adfdd Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 22 Apr 2020 12:09:54 +0200 Subject: [PATCH 065/300] driver core: platform: Initialize dma_parms for platform devices It's currently the platform driver's responsibility to initialize the pointer, dma_parms, for its corresponding struct device. The benefit with this approach allows us to avoid the initialization and to not waste memory for the struct device_dma_parameters, as this can be decided on a case by case basis. However, it has turned out that this approach is not very practical. Not only does it lead to open coding, but also to real errors. In principle callers of dma_set_max_seg_size() doesn't check the error code, but just assumes it succeeds. For these reasons, let's do the initialization from the common platform bus at the device registration point. This also follows the way the PCI devices are being managed, see pci_device_add(). Suggested-by: Christoph Hellwig Cc: Tested-by: Haibo Chen Reviewed-by: Arnd Bergmann Signed-off-by: Ulf Hansson Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20200422100954.31211-1-ulf.hansson@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 2 ++ include/linux/platform_device.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 5255550b7c34..b27d0f6c18c9 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -380,6 +380,8 @@ struct platform_object { */ static void setup_pdev_dma_masks(struct platform_device *pdev) { + pdev->dev.dma_parms = &pdev->dma_parms; + if (!pdev->dev.coherent_dma_mask) pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32); if (!pdev->dev.dma_mask) { diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index bdc35753ef7c..77a2aada106d 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -25,6 +25,7 @@ struct platform_device { bool id_auto; struct device dev; u64 platform_dma_mask; + struct device_dma_parameters dma_parms; u32 num_resources; struct resource *resource; From f458488425f1cc9a396aa1d09bb00c48783936da Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 22 Apr 2020 12:10:13 +0200 Subject: [PATCH 066/300] amba: Initialize dma_parms for amba devices It's currently the amba driver's responsibility to initialize the pointer, dma_parms, for its corresponding struct device. The benefit with this approach allows us to avoid the initialization and to not waste memory for the struct device_dma_parameters, as this can be decided on a case by case basis. However, it has turned out that this approach is not very practical. Not only does it lead to open coding, but also to real errors. In principle callers of dma_set_max_seg_size() doesn't check the error code, but just assumes it succeeds. For these reasons, let's do the initialization from the common amba bus at the device registration point. This also follows the way the PCI devices are being managed, see pci_device_add(). Suggested-by: Christoph Hellwig Cc: Russell King Cc: Tested-by: Haibo Chen Reviewed-by: Arnd Bergmann Signed-off-by: Ulf Hansson Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20200422101013.31267-1-ulf.hansson@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/amba/bus.c | 1 + include/linux/amba/bus.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index fe1523664816..8558b629880b 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -645,6 +645,7 @@ static void amba_device_initialize(struct amba_device *dev, const char *name) dev->dev.release = amba_device_release; dev->dev.bus = &amba_bustype; dev->dev.dma_mask = &dev->dev.coherent_dma_mask; + dev->dev.dma_parms = &dev->dma_parms; dev->res.name = dev_name(&dev->dev); } diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 26f0ecf401ea..0bbfd647f5c6 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -65,6 +65,7 @@ struct amba_device { struct device dev; struct resource res; struct clk *pclk; + struct device_dma_parameters dma_parms; unsigned int periphid; unsigned int cid; struct amba_cs_uci_id uci; From 3740d93e37902b31159a82da2d5c8812ed825404 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Thu, 16 Apr 2020 16:28:59 +0000 Subject: [PATCH 067/300] coredump: fix crash when umh is disabled Commit 64e90a8acb859 ("Introduce STATIC_USERMODEHELPER to mediate call_usermodehelper()") added the optiont to disable all call_usermodehelper() calls by setting STATIC_USERMODEHELPER_PATH to an empty string. When this is done, and crashdump is triggered, it will crash on null pointer dereference, since we make assumptions over what call_usermodehelper_exec() did. This has been reported by Sergey when one triggers a a coredump with the following configuration: ``` CONFIG_STATIC_USERMODEHELPER=y CONFIG_STATIC_USERMODEHELPER_PATH="" kernel.core_pattern = |/usr/lib/systemd/systemd-coredump %P %u %g %s %t %c %h %e ``` The way disabling the umh was designed was that call_usermodehelper_exec() would just return early, without an error. But coredump assumes certain variables are set up for us when this happens, and calls ile_start_write(cprm.file) with a NULL file. [ 2.819676] BUG: kernel NULL pointer dereference, address: 0000000000000020 [ 2.819859] #PF: supervisor read access in kernel mode [ 2.820035] #PF: error_code(0x0000) - not-present page [ 2.820188] PGD 0 P4D 0 [ 2.820305] Oops: 0000 [#1] SMP PTI [ 2.820436] CPU: 2 PID: 89 Comm: a Not tainted 5.7.0-rc1+ #7 [ 2.820680] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190711_202441-buildvm-armv7-10.arm.fedoraproject.org-2.fc31 04/01/2014 [ 2.821150] RIP: 0010:do_coredump+0xd80/0x1060 [ 2.821385] Code: e8 95 11 ed ff 48 c7 c6 cc a7 b4 81 48 8d bd 28 ff ff ff 89 c2 e8 70 f1 ff ff 41 89 c2 85 c0 0f 84 72 f7 ff ff e9 b4 fe ff ff <48> 8b 57 20 0f b7 02 66 25 00 f0 66 3d 00 8 0 0f 84 9c 01 00 00 44 [ 2.822014] RSP: 0000:ffffc9000029bcb8 EFLAGS: 00010246 [ 2.822339] RAX: 0000000000000000 RBX: ffff88803f860000 RCX: 000000000000000a [ 2.822746] RDX: 0000000000000009 RSI: 0000000000000282 RDI: 0000000000000000 [ 2.823141] RBP: ffffc9000029bde8 R08: 0000000000000000 R09: ffffc9000029bc00 [ 2.823508] R10: 0000000000000001 R11: ffff88803dec90be R12: ffffffff81c39da0 [ 2.823902] R13: ffff88803de84400 R14: 0000000000000000 R15: 0000000000000000 [ 2.824285] FS: 00007fee08183540(0000) GS:ffff88803e480000(0000) knlGS:0000000000000000 [ 2.824767] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2.825111] CR2: 0000000000000020 CR3: 000000003f856005 CR4: 0000000000060ea0 [ 2.825479] Call Trace: [ 2.825790] get_signal+0x11e/0x720 [ 2.826087] do_signal+0x1d/0x670 [ 2.826361] ? force_sig_info_to_task+0xc1/0xf0 [ 2.826691] ? force_sig_fault+0x3c/0x40 [ 2.826996] ? do_trap+0xc9/0x100 [ 2.827179] exit_to_usermode_loop+0x49/0x90 [ 2.827359] prepare_exit_to_usermode+0x77/0xb0 [ 2.827559] ? invalid_op+0xa/0x30 [ 2.827747] ret_from_intr+0x20/0x20 [ 2.827921] RIP: 0033:0x55e2c76d2129 [ 2.828107] Code: 2d ff ff ff e8 68 ff ff ff 5d c6 05 18 2f 00 00 01 c3 0f 1f 80 00 00 00 00 c3 0f 1f 80 00 00 00 00 e9 7b ff ff ff 55 48 89 e5 <0f> 0b b8 00 00 00 00 5d c3 66 2e 0f 1f 84 0 0 00 00 00 00 0f 1f 40 [ 2.828603] RSP: 002b:00007fffeba5e080 EFLAGS: 00010246 [ 2.828801] RAX: 000055e2c76d2125 RBX: 0000000000000000 RCX: 00007fee0817c718 [ 2.829034] RDX: 00007fffeba5e188 RSI: 00007fffeba5e178 RDI: 0000000000000001 [ 2.829257] RBP: 00007fffeba5e080 R08: 0000000000000000 R09: 00007fee08193c00 [ 2.829482] R10: 0000000000000009 R11: 0000000000000000 R12: 000055e2c76d2040 [ 2.829727] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 2.829964] CR2: 0000000000000020 [ 2.830149] ---[ end trace ceed83d8c68a1bf1 ]--- ``` Cc: # v4.11+ Fixes: 64e90a8acb85 ("Introduce STATIC_USERMODEHELPER to mediate call_usermodehelper()") BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=199795 Reported-by: Tony Vroon Reported-by: Sergey Kvachonok Tested-by: Sergei Trofimovich Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20200416162859.26518-1-mcgrof@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/coredump.c | 8 ++++++++ kernel/umh.c | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/fs/coredump.c b/fs/coredump.c index 408418e6aa13..478a0d810136 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -788,6 +788,14 @@ void do_coredump(const kernel_siginfo_t *siginfo) if (displaced) put_files_struct(displaced); if (!dump_interrupted()) { + /* + * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would + * have this set to NULL. + */ + if (!cprm.file) { + pr_info("Core dump to |%s disabled\n", cn.corename); + goto close_fail; + } file_start_write(cprm.file); core_dumped = binfmt->core_dump(&cprm); file_end_write(cprm.file); diff --git a/kernel/umh.c b/kernel/umh.c index 7f255b5a8845..11bf5eea474c 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -544,6 +544,11 @@ EXPORT_SYMBOL_GPL(fork_usermode_blob); * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of system workqueues. * (ie. it runs with full root capabilities and optimized affinity). + * + * Note: successful return value does not guarantee the helper was called at + * all. You can't rely on sub_info->{init,cleanup} being called even for + * UMH_WAIT_* wait modes as STATIC_USERMODEHELPER_PATH="" turns all helpers + * into a successful no-op. */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { From 00b247557858bc0651a646710d90aba186bfeed4 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Mon, 30 Mar 2020 19:28:32 -0700 Subject: [PATCH 068/300] driver core: Fix handling of fw_devlink=permissive When commit 8375e74f2bca ("driver core: Add fw_devlink kernel commandline option") added fw_devlink, it didn't implement "permissive" mode correctly. That commit got the device links flags correct to make sure unprobed suppliers don't block the probing of a consumer. However, if a consumer is waiting for mandatory suppliers to register, that could still block a consumer from probing. This commit fixes that by making sure in permissive mode, all suppliers to a consumer are treated as a optional suppliers. So, even if a consumer is waiting for suppliers to register and link itself (using the DL_FLAG_SYNC_STATE_ONLY flag) to the supplier, the consumer is never blocked from probing. Fixes: 8375e74f2bca ("driver core: Add fw_devlink kernel commandline option") Reported-by: Marek Szyprowski Signed-off-by: Saravana Kannan Tested-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200331022832.209618-1-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 139cdf7e7327..073045cb214e 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2370,6 +2370,11 @@ u32 fw_devlink_get_flags(void) return fw_devlink_flags; } +static bool fw_devlink_is_permissive(void) +{ + return fw_devlink_flags == DL_FLAG_SYNC_STATE_ONLY; +} + /** * device_add - add device to device hierarchy. * @dev: device. @@ -2524,7 +2529,7 @@ int device_add(struct device *dev) if (fw_devlink_flags && is_fwnode_dev && fwnode_has_op(dev->fwnode, add_links)) { fw_ret = fwnode_call_int_op(dev->fwnode, add_links, dev); - if (fw_ret == -ENODEV) + if (fw_ret == -ENODEV && !fw_devlink_is_permissive()) device_link_wait_for_mandatory_supplier(dev); else if (fw_ret) device_link_wait_for_optional_supplier(dev); From 7706b0a76a9697021e2bf395f3f065c18f51043d Mon Sep 17 00:00:00 2001 From: James Hilliard Date: Sat, 11 Apr 2020 13:02:41 -0600 Subject: [PATCH 069/300] component: Silence bind error on -EPROBE_DEFER If a component fails to bind due to -EPROBE_DEFER we should not log an error as this is not a real failure. Fixes messages like: vc4-drm soc:gpu: failed to bind 3f902000.hdmi (ops vc4_hdmi_ops): -517 vc4-drm soc:gpu: master bind failed: -517 Signed-off-by: James Hilliard Link: https://lore.kernel.org/r/20200411190241.89404-1-james.hilliard1@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/component.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/base/component.c b/drivers/base/component.c index e97704104784..dcfbe7251dc4 100644 --- a/drivers/base/component.c +++ b/drivers/base/component.c @@ -256,7 +256,8 @@ static int try_to_bring_up_master(struct master *master, ret = master->ops->bind(master->dev); if (ret < 0) { devres_release_group(master->dev, NULL); - dev_info(master->dev, "master bind failed: %d\n", ret); + if (ret != -EPROBE_DEFER) + dev_info(master->dev, "master bind failed: %d\n", ret); return ret; } @@ -611,8 +612,9 @@ static int component_bind(struct component *component, struct master *master, devres_release_group(component->dev, NULL); devres_release_group(master->dev, NULL); - dev_err(master->dev, "failed to bind %s (ops %ps): %d\n", - dev_name(component->dev), component->ops, ret); + if (ret != -EPROBE_DEFER) + dev_err(master->dev, "failed to bind %s (ops %ps): %d\n", + dev_name(component->dev), component->ops, ret); } return ret; From ce68929f07de0780c1afbbeef9aa6cf3550163d6 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 22 Apr 2020 20:32:43 +0000 Subject: [PATCH 070/300] driver core: Revert default driver_deferred_probe_timeout value to 0 This patch addresses a regression in 5.7-rc1+ In commit c8c43cee29f6 ("driver core: Fix driver_deferred_probe_check_state() logic"), we both cleaned up the logic and also set the default driver_deferred_probe_timeout value to 30 seconds to allow for drivers that are missing dependencies to have some time so that the dependency may be loaded from userland after initcalls_done is set. However, Yoshihiro Shimoda reported that on his device that expects to have unmet dependencies (due to "optional links" in its devicetree), was failing to mount the NFS root. In digging further, it seemed the problem was that while the device properly probes after waiting 30 seconds for any missing modules to load, the ip_auto_config() had already failed, resulting in NFS to fail. This was due to ip_auto_config() calling wait_for_device_probe() which doesn't wait for the driver_deferred_probe_timeout to fire. Fixing that issue is possible, but could also introduce 30 second delays in bootups for users who don't have any missing dependencies, which is not ideal. So I think the best solution to avoid any regressions is to revert back to a default timeout value of zero, and allow systems that need to utilize the timeout in order for userland to load any modules that supply misisng dependencies in the dts to specify the timeout length via the exiting documented boot argument. Thanks to Geert for chasing down that ip_auto_config was why NFS was failing in this case! Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: Jakub Kicinski Cc: Rafael J. Wysocki Cc: Rob Herring Cc: Geert Uytterhoeven Cc: Yoshihiro Shimoda Cc: Robin Murphy Cc: Andy Shevchenko Cc: Sudeep Holla Cc: Andy Shevchenko Cc: Naresh Kamboju Cc: Basil Eljuse Cc: Ferry Toth Cc: Arnd Bergmann Cc: Anders Roxell Reported-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Tested-by: Geert Uytterhoeven Fixes: c8c43cee29f6 ("driver core: Fix driver_deferred_probe_check_state() logic") Signed-off-by: John Stultz Link: https://lore.kernel.org/r/20200422203245.83244-2-john.stultz@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 06ec0e851fa1..908ae4d7805e 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -224,16 +224,7 @@ static int deferred_devs_show(struct seq_file *s, void *data) } DEFINE_SHOW_ATTRIBUTE(deferred_devs); -#ifdef CONFIG_MODULES -/* - * In the case of modules, set the default probe timeout to - * 30 seconds to give userland some time to load needed modules - */ -int driver_deferred_probe_timeout = 30; -#else -/* In the case of !modules, no probe timeout needed */ -int driver_deferred_probe_timeout = -1; -#endif +int driver_deferred_probe_timeout; EXPORT_SYMBOL_GPL(driver_deferred_probe_timeout); static int __init deferred_probe_timeout_setup(char *str) @@ -266,7 +257,7 @@ int driver_deferred_probe_check_state(struct device *dev) return -ENODEV; } - if (!driver_deferred_probe_timeout) { + if (!driver_deferred_probe_timeout && initcalls_done) { dev_WARN(dev, "deferred probe timeout, ignoring dependency"); return -ETIMEDOUT; } From 4ccc03e28ec309173f434fa38b48b6ad65ff5d1b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 22 Apr 2020 20:32:44 +0000 Subject: [PATCH 071/300] driver core: Use dev_warn() instead of dev_WARN() for deferred_probe_timeout warnings In commit c8c43cee29f6 ("driver core: Fix driver_deferred_probe_check_state() logic") and following changes the logic was changes slightly so that if there is no driver to match whats found in the dtb, we wait the sepcified seconds for modules to be loaded by userland, and then timeout, where as previously we'd print "ignoring dependency for device, assuming no driver" and immediately return -ENODEV after initcall_done. However, in the timeout case (which previously existed but was practicaly un-used without a boot argument), the timeout message uses dev_WARN(). This means folks are now seeing a big backtrace in their boot logs if there a entry in their dts that doesn't have a driver. To fix this, lets use dev_warn(), instead of dev_WARN() to match the previous error path. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: Jakub Kicinski Cc: Rafael J. Wysocki Cc: Rob Herring Cc: Geert Uytterhoeven Cc: Yoshihiro Shimoda Cc: Robin Murphy Cc: Andy Shevchenko Cc: Sudeep Holla Cc: Andy Shevchenko Cc: Naresh Kamboju Cc: Basil Eljuse Cc: Ferry Toth Cc: Arnd Bergmann Cc: Anders Roxell Cc: linux-pm@vger.kernel.org Reviewed-by: Yoshihiro Shimoda Fixes: c8c43cee29f6 ("driver core: Fix driver_deferred_probe_check_state() logic") Signed-off-by: John Stultz Link: https://lore.kernel.org/r/20200422203245.83244-3-john.stultz@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 908ae4d7805e..9c88afa5c74a 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -258,7 +258,7 @@ int driver_deferred_probe_check_state(struct device *dev) } if (!driver_deferred_probe_timeout && initcalls_done) { - dev_WARN(dev, "deferred probe timeout, ignoring dependency"); + dev_warn(dev, "deferred probe timeout, ignoring dependency"); return -ETIMEDOUT; } From 35a672363ab3e8dfe4ebcadb4dd0b2d06bb85ebe Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 22 Apr 2020 20:32:45 +0000 Subject: [PATCH 072/300] driver core: Ensure wait_for_device_probe() waits until the deferred_probe_timeout fires In commit c8c43cee29f6 ("driver core: Fix driver_deferred_probe_check_state() logic"), we set the default driver_deferred_probe_timeout value to 30 seconds to allow for drivers that are missing dependencies to have some time so that the dependency may be loaded from userland after initcalls_done is set. However, Yoshihiro Shimoda reported that on his device that expects to have unmet dependencies (due to "optional links" in its devicetree), was failing to mount the NFS root. In digging further, it seemed the problem was that while the device properly probes after waiting 30 seconds for any missing modules to load, the ip_auto_config() had already failed, resulting in NFS to fail. This was due to ip_auto_config() calling wait_for_device_probe() which doesn't wait for the driver_deferred_probe_timeout to fire. This patch tries to fix the issue by creating a waitqueue for the driver_deferred_probe_timeout, and calling wait_event() to make sure driver_deferred_probe_timeout is zero in wait_for_device_probe() to make sure all the probing is finished. The downside to this solution is that kernel functionality that uses wait_for_device_probe(), will block until the driver_deferred_probe_timeout fires, regardless of if there is any missing dependencies. However, the previous patch reverts the default timeout value to zero, so this side-effect will only affect users who specify a driver_deferred_probe_timeout= value as a boot argument, where the additional delay would be beneficial to allow modules to load later during boot. Thanks to Geert for chasing down that ip_auto_config was why NFS was failing in this case! Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: Jakub Kicinski Cc: Rafael J. Wysocki Cc: Rob Herring Cc: Geert Uytterhoeven Cc: Yoshihiro Shimoda Cc: Robin Murphy Cc: Andy Shevchenko Cc: Sudeep Holla Cc: Andy Shevchenko Cc: Naresh Kamboju Cc: Basil Eljuse Cc: Ferry Toth Cc: Arnd Bergmann Cc: Anders Roxell Cc: linux-pm@vger.kernel.org Reported-by: Yoshihiro Shimoda Tested-by: Geert Uytterhoeven Tested-by: Yoshihiro Shimoda Fixes: c8c43cee29f6 ("driver core: Fix driver_deferred_probe_check_state() logic") Signed-off-by: John Stultz Link: https://lore.kernel.org/r/20200422203245.83244-4-john.stultz@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 9c88afa5c74a..94037be7f5d7 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -226,6 +226,7 @@ DEFINE_SHOW_ATTRIBUTE(deferred_devs); int driver_deferred_probe_timeout; EXPORT_SYMBOL_GPL(driver_deferred_probe_timeout); +static DECLARE_WAIT_QUEUE_HEAD(probe_timeout_waitqueue); static int __init deferred_probe_timeout_setup(char *str) { @@ -275,6 +276,7 @@ static void deferred_probe_timeout_work_func(struct work_struct *work) list_for_each_entry_safe(private, p, &deferred_probe_pending_list, deferred_probe) dev_info(private->device, "deferred probe pending"); + wake_up(&probe_timeout_waitqueue); } static DECLARE_DELAYED_WORK(deferred_probe_timeout_work, deferred_probe_timeout_work_func); @@ -649,6 +651,9 @@ int driver_probe_done(void) */ void wait_for_device_probe(void) { + /* wait for probe timeout */ + wait_event(probe_timeout_waitqueue, !driver_deferred_probe_timeout); + /* wait for the deferred probe workqueue to finish */ flush_work(&deferred_probe_work); From c3bf9930921b33edb31909006607e478751a6f5e Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 9 Apr 2020 10:18:10 +0300 Subject: [PATCH 073/300] thunderbolt: Check return value of tb_sw_read() in usb4_switch_op() The function misses checking return value of tb_sw_read() before it accesses the value that was read. Fix this by checking the return value first. Fixes: b04079837b20 ("thunderbolt: Add initial support for USB4") Signed-off-by: Mika Westerberg Reviewed-by: Yehezkel Bernat Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/thunderbolt/usb4.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c index 3d084cec136f..50c7534ba31e 100644 --- a/drivers/thunderbolt/usb4.c +++ b/drivers/thunderbolt/usb4.c @@ -182,6 +182,9 @@ static int usb4_switch_op(struct tb_switch *sw, u16 opcode, u8 *status) return ret; ret = tb_sw_read(sw, &val, TB_CFG_SWITCH, ROUTER_CS_26, 1); + if (ret) + return ret; + if (val & ROUTER_CS_26_ONS) return -EOPNOTSUPP; From caec66198d137c26f0d234abc498866a58c64150 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 28 Apr 2020 14:49:45 +1000 Subject: [PATCH 074/300] net/ena: Fix build warning in ena_xdp_set() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following build warning in ena_xdp_set(), which is observed on aarch64 with 64KB page size. In file included from ./include/net/inet_sock.h:19, from ./include/net/ip.h:27, from drivers/net/ethernet/amazon/ena/ena_netdev.c:46: drivers/net/ethernet/amazon/ena/ena_netdev.c: In function \ ‘ena_xdp_set’: \ drivers/net/ethernet/amazon/ena/ena_netdev.c:557:6: warning: \ format ‘%lu’ \ expects argument of type ‘long unsigned int’, but argument 4 \ has type ‘int’ \ [-Wformat=] "Failed to set xdp program, the current MTU (%d) is \ larger than the maximum allowed MTU (%lu) while xdp is on", Signed-off-by: Gavin Shan Acked-by: Shay Agroskin Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h index 97dfd0c67e84..9e1860d81908 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.h +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h @@ -69,7 +69,7 @@ * 16kB. */ #if PAGE_SIZE > SZ_16K -#define ENA_PAGE_SIZE SZ_16K +#define ENA_PAGE_SIZE (_AC(SZ_16K, UL)) #else #define ENA_PAGE_SIZE PAGE_SIZE #endif From 8999dc89497ab1c80d0718828e838c7cd5f6bffe Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 28 Apr 2020 16:12:08 +0800 Subject: [PATCH 075/300] net/x25: Fix null-ptr-deref in x25_disconnect We should check null before do x25_neigh_put in x25_disconnect, otherwise may cause null-ptr-deref like this: #include #include int main() { int sck_x25; sck_x25 = socket(AF_X25, SOCK_SEQPACKET, 0); close(sck_x25); return 0; } BUG: kernel NULL pointer dereference, address: 00000000000000d8 CPU: 0 PID: 4817 Comm: t2 Not tainted 5.7.0-rc3+ #159 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3- RIP: 0010:x25_disconnect+0x91/0xe0 Call Trace: x25_release+0x18a/0x1b0 __sock_release+0x3d/0xc0 sock_close+0x13/0x20 __fput+0x107/0x270 ____fput+0x9/0x10 task_work_run+0x6d/0xb0 exit_to_usermode_loop+0x102/0x110 do_syscall_64+0x23c/0x260 entry_SYSCALL_64_after_hwframe+0x49/0xb3 Reported-by: syzbot+6db548b615e5aeefdce2@syzkaller.appspotmail.com Fixes: 4becb7ee5b3d ("net/x25: Fix x25_neigh refcnt leak when x25 disconnect") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/x25/x25_subr.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index 8b1b06cabcbf..0285aaa1e93c 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -357,10 +357,12 @@ void x25_disconnect(struct sock *sk, int reason, unsigned char cause, sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } - read_lock_bh(&x25_list_lock); - x25_neigh_put(x25->neighbour); - x25->neighbour = NULL; - read_unlock_bh(&x25_list_lock); + if (x25->neighbour) { + read_lock_bh(&x25_list_lock); + x25_neigh_put(x25->neighbour); + x25->neighbour = NULL; + read_unlock_bh(&x25_list_lock); + } } /* From b36522150e5b85045f868768d46fbaaa034174b2 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Mon, 27 Apr 2020 15:49:53 -0700 Subject: [PATCH 076/300] scsi: ibmvscsi: Fix WARN_ON during event pool release While removing an ibmvscsi client adapter a WARN_ON like the following is seen in the kernel log: drmgr: drmgr: -r -c slot -s U9080.M9S.783AEC8-V11-C11 -w 5 -d 1 WARNING: CPU: 9 PID: 24062 at ../kernel/dma/mapping.c:311 dma_free_attrs+0x78/0x110 Supported: No, Unreleased kernel CPU: 9 PID: 24062 Comm: drmgr Kdump: loaded Tainted: G X 5.3.18-12-default NIP: c0000000001fa758 LR: c0000000001fa744 CTR: c0000000001fa6e0 REGS: c0000002173375d0 TRAP: 0700 Tainted: G X (5.3.18-12-default) MSR: 8000000000029033 CR: 28088282 XER: 20000000 CFAR: c0000000001fbf0c IRQMASK: 1 GPR00: c0000000001fa744 c000000217337860 c00000000161ab00 0000000000000000 GPR04: 0000000000000000 c000011e12250000 0000000018010000 0000000000000000 GPR08: 0000000000000000 0000000000000001 0000000000000001 c0080000190f4fa8 GPR12: c0000000001fa6e0 c000000007fc2a00 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: 000000011420e310 0000000000000000 0000000000000000 0000000018010000 GPR28: c00000000159de50 c000011e12250000 0000000000006600 c000011e5c994848 NIP [c0000000001fa758] dma_free_attrs+0x78/0x110 LR [c0000000001fa744] dma_free_attrs+0x64/0x110 Call Trace: [c000000217337860] [000000011420e310] 0x11420e310 (unreliable) [c0000002173378b0] [c0080000190f0280] release_event_pool+0xd8/0x120 [ibmvscsi] [c000000217337930] [c0080000190f3f74] ibmvscsi_remove+0x6c/0x160 [ibmvscsi] [c000000217337960] [c0000000000f3cac] vio_bus_remove+0x5c/0x100 [c0000002173379a0] [c00000000087a0a4] device_release_driver_internal+0x154/0x280 [c0000002173379e0] [c0000000008777cc] bus_remove_device+0x11c/0x220 [c000000217337a60] [c000000000870fc4] device_del+0x1c4/0x470 [c000000217337b10] [c0000000008712a0] device_unregister+0x30/0xa0 [c000000217337b80] [c0000000000f39ec] vio_unregister_device+0x2c/0x60 [c000000217337bb0] [c00800001a1d0964] dlpar_remove_slot+0x14c/0x250 [rpadlpar_io] [c000000217337c50] [c00800001a1d0bcc] remove_slot_store+0xa4/0x110 [rpadlpar_io] [c000000217337cd0] [c000000000c091a0] kobj_attr_store+0x30/0x50 [c000000217337cf0] [c00000000057c934] sysfs_kf_write+0x64/0x90 [c000000217337d10] [c00000000057be10] kernfs_fop_write+0x1b0/0x290 [c000000217337d60] [c000000000488c4c] __vfs_write+0x3c/0x70 [c000000217337d80] [c00000000048c648] vfs_write+0xd8/0x260 [c000000217337dd0] [c00000000048ca8c] ksys_write+0xdc/0x130 [c000000217337e20] [c00000000000b488] system_call+0x5c/0x70 Instruction dump: 7c840074 f8010010 f821ffb1 20840040 eb830218 7c8407b4 48002019 60000000 2fa30000 409e003c 892d0988 792907e0 <0b090000> 2fbd0000 419e0028 2fbc0000 ---[ end trace 5955b3c0cc079942 ]--- rpadlpar_io: slot U9080.M9S.783AEC8-V11-C11 removed This is tripped as a result of irqs being disabled during the call to dma_free_coherent() by release_event_pool(). At this point in the code path we have quiesced the adapter and it is overly paranoid to be holding the host lock. [mkp: fixed build warning reported by sfr] Link: https://lore.kernel.org/r/1588027793-17952-1-git-send-email-tyreld@linux.ibm.com Signed-off-by: Tyrel Datwyler Signed-off-by: Martin K. Petersen --- drivers/scsi/ibmvscsi/ibmvscsi.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index 7f66a7783209..59f0f1030c54 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -2320,16 +2320,12 @@ static int ibmvscsi_probe(struct vio_dev *vdev, const struct vio_device_id *id) static int ibmvscsi_remove(struct vio_dev *vdev) { struct ibmvscsi_host_data *hostdata = dev_get_drvdata(&vdev->dev); - unsigned long flags; srp_remove_host(hostdata->host); scsi_remove_host(hostdata->host); purge_requests(hostdata, DID_ERROR); - - spin_lock_irqsave(hostdata->host->host_lock, flags); release_event_pool(&hostdata->pool, hostdata); - spin_unlock_irqrestore(hostdata->host->host_lock, flags); ibmvscsi_release_crq_queue(&hostdata->queue, hostdata, max_events); From 0c1d3f2c9a0dc49d96754148b74a44eaa37bfbd8 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 29 Apr 2020 06:21:16 +0200 Subject: [PATCH 077/300] MAINTAINERS: remove entry after hp100 driver removal Commit a10079c66290 ("staging: remove hp100 driver") removed all files from ./drivers/staging/hp/, but missed to adjust MAINTAINERS. Since then, ./scripts/get_maintainer.pl --self-test=patterns complains: warning: no file matches F: drivers/staging/hp/hp100.* So, drop HP100 Driver entry in MAINTAINERS now. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20200429042116.29126-1-lukas.bulwahn@gmail.com Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 5 ----- 1 file changed, 5 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 26f281d9f32a..41e2b577488f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7746,11 +7746,6 @@ L: platform-driver-x86@vger.kernel.org S: Orphan F: drivers/platform/x86/tc1100-wmi.c -HP100: Driver for HP 10/100 Mbit/s Voice Grade Network Adapter Series -M: Jaroslav Kysela -S: Obsolete -F: drivers/staging/hp/hp100.* - HPET: High Precision Event Timers driver M: Clemens Ladisch S: Maintained From 5409e0cca53af8d2529f8398c9b05c36e25f9d2f Mon Sep 17 00:00:00 2001 From: ChenTao Date: Wed, 29 Apr 2020 13:19:04 +0300 Subject: [PATCH 078/300] interconnect: qcom: Move the static keyword to the front of declaration Fix the following warning: Move the static keyword to the front of declaration of sdm845_icc_osm_l3 sdm845_aggre1_noc sc7180_icc_osm_l3 sdm845_aggre2_noc sdm845_config_noc sdm845_dc_noc sdm845_gladiator_noc sdm845_mem_noc sdm845_mmss_noc and sdm845_system_noc, resolve the following compiler warning that can be when building with warnings enabled (W=1): drivers/interconnect/qcom/osm-l3.c:81:1: warning: const static struct qcom_icc_desc sdm845_icc_osm_l3 = { drivers/interconnect/qcom/osm-l3.c:94:1: warning: const static struct qcom_icc_desc sc7180_icc_osm_l3 = { drivers/interconnect/qcom/sdm845.c:195:1: warning: const static struct qcom_icc_desc sdm845_aggre1_noc = { drivers/interconnect/qcom/sdm845.c:223:1: warning: const static struct qcom_icc_desc sdm845_aggre2_noc = { drivers/interconnect/qcom/sdm845.c:284:1: warning: const static struct qcom_icc_desc sdm845_config_noc = { drivers/interconnect/qcom/sdm845.c:300:1: warning: const static struct qcom_icc_desc sdm845_dc_noc = { drivers/interconnect/qcom/sdm845.c:318:1: warning: const static struct qcom_icc_desc sdm845_gladiator_noc = { drivers/interconnect/qcom/sdm845.c:353:1: warning: const static struct qcom_icc_desc sdm845_mem_noc = { drivers/interconnect/qcom/sdm845.c:387:1: warning: const static struct qcom_icc_desc sdm845_mmss_noc = { drivers/interconnect/qcom/sdm845.c:433:1: warning: const static struct qcom_icc_desc sdm845_system_noc = { Reported-by: Hulk Robot Signed-off-by: ChenTao Link: https://lore.kernel.org/r/20200423132142.45174-1-chentao107@huawei.com Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/20200429101904.5771-2-georgi.djakov@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/interconnect/qcom/osm-l3.c | 4 ++-- drivers/interconnect/qcom/sdm845.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/interconnect/qcom/osm-l3.c b/drivers/interconnect/qcom/osm-l3.c index a03c6d6833df..96fb9ff5ff2e 100644 --- a/drivers/interconnect/qcom/osm-l3.c +++ b/drivers/interconnect/qcom/osm-l3.c @@ -78,7 +78,7 @@ static struct qcom_icc_node *sdm845_osm_l3_nodes[] = { [SLAVE_OSM_L3] = &sdm845_osm_l3, }; -const static struct qcom_icc_desc sdm845_icc_osm_l3 = { +static const struct qcom_icc_desc sdm845_icc_osm_l3 = { .nodes = sdm845_osm_l3_nodes, .num_nodes = ARRAY_SIZE(sdm845_osm_l3_nodes), }; @@ -91,7 +91,7 @@ static struct qcom_icc_node *sc7180_osm_l3_nodes[] = { [SLAVE_OSM_L3] = &sc7180_osm_l3, }; -const static struct qcom_icc_desc sc7180_icc_osm_l3 = { +static const struct qcom_icc_desc sc7180_icc_osm_l3 = { .nodes = sc7180_osm_l3_nodes, .num_nodes = ARRAY_SIZE(sc7180_osm_l3_nodes), }; diff --git a/drivers/interconnect/qcom/sdm845.c b/drivers/interconnect/qcom/sdm845.c index b013b80caa45..f6c7b969520d 100644 --- a/drivers/interconnect/qcom/sdm845.c +++ b/drivers/interconnect/qcom/sdm845.c @@ -192,7 +192,7 @@ static struct qcom_icc_node *aggre1_noc_nodes[] = { [SLAVE_ANOC_PCIE_A1NOC_SNOC] = &qns_pcie_a1noc_snoc, }; -const static struct qcom_icc_desc sdm845_aggre1_noc = { +static const struct qcom_icc_desc sdm845_aggre1_noc = { .nodes = aggre1_noc_nodes, .num_nodes = ARRAY_SIZE(aggre1_noc_nodes), .bcms = aggre1_noc_bcms, @@ -220,7 +220,7 @@ static struct qcom_icc_node *aggre2_noc_nodes[] = { [SLAVE_SERVICE_A2NOC] = &srvc_aggre2_noc, }; -const static struct qcom_icc_desc sdm845_aggre2_noc = { +static const struct qcom_icc_desc sdm845_aggre2_noc = { .nodes = aggre2_noc_nodes, .num_nodes = ARRAY_SIZE(aggre2_noc_nodes), .bcms = aggre2_noc_bcms, @@ -281,7 +281,7 @@ static struct qcom_icc_node *config_noc_nodes[] = { [SLAVE_SERVICE_CNOC] = &srvc_cnoc, }; -const static struct qcom_icc_desc sdm845_config_noc = { +static const struct qcom_icc_desc sdm845_config_noc = { .nodes = config_noc_nodes, .num_nodes = ARRAY_SIZE(config_noc_nodes), .bcms = config_noc_bcms, @@ -297,7 +297,7 @@ static struct qcom_icc_node *dc_noc_nodes[] = { [SLAVE_MEM_NOC_CFG] = &qhs_memnoc, }; -const static struct qcom_icc_desc sdm845_dc_noc = { +static const struct qcom_icc_desc sdm845_dc_noc = { .nodes = dc_noc_nodes, .num_nodes = ARRAY_SIZE(dc_noc_nodes), .bcms = dc_noc_bcms, @@ -315,7 +315,7 @@ static struct qcom_icc_node *gladiator_noc_nodes[] = { [SLAVE_SERVICE_GNOC] = &srvc_gnoc, }; -const static struct qcom_icc_desc sdm845_gladiator_noc = { +static const struct qcom_icc_desc sdm845_gladiator_noc = { .nodes = gladiator_noc_nodes, .num_nodes = ARRAY_SIZE(gladiator_noc_nodes), .bcms = gladiator_noc_bcms, @@ -350,7 +350,7 @@ static struct qcom_icc_node *mem_noc_nodes[] = { [SLAVE_EBI1] = &ebi, }; -const static struct qcom_icc_desc sdm845_mem_noc = { +static const struct qcom_icc_desc sdm845_mem_noc = { .nodes = mem_noc_nodes, .num_nodes = ARRAY_SIZE(mem_noc_nodes), .bcms = mem_noc_bcms, @@ -384,7 +384,7 @@ static struct qcom_icc_node *mmss_noc_nodes[] = { [SLAVE_CAMNOC_UNCOMP] = &qns_camnoc_uncomp, }; -const static struct qcom_icc_desc sdm845_mmss_noc = { +static const struct qcom_icc_desc sdm845_mmss_noc = { .nodes = mmss_noc_nodes, .num_nodes = ARRAY_SIZE(mmss_noc_nodes), .bcms = mmss_noc_bcms, @@ -430,7 +430,7 @@ static struct qcom_icc_node *system_noc_nodes[] = { [SLAVE_TCU] = &xs_sys_tcu_cfg, }; -const static struct qcom_icc_desc sdm845_system_noc = { +static const struct qcom_icc_desc sdm845_system_noc = { .nodes = system_noc_nodes, .num_nodes = ARRAY_SIZE(system_noc_nodes), .bcms = system_noc_bcms, From 0ed08faded1da03eb3def61502b27f81aef2e615 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 22 Apr 2020 16:18:48 -0400 Subject: [PATCH 079/300] HID: usbhid: Fix race between usbhid_close() and usbhid_stop() The syzbot fuzzer discovered a bad race between in the usbhid driver between usbhid_stop() and usbhid_close(). In particular, usbhid_stop() does: usb_free_urb(usbhid->urbin); ... usbhid->urbin = NULL; /* don't mess up next start */ and usbhid_close() does: usb_kill_urb(usbhid->urbin); with no mutual exclusion. If the two routines happen to run concurrently so that usb_kill_urb() is called in between the usb_free_urb() and the NULL assignment, it will access the deallocated urb structure -- a use-after-free bug. This patch adds a mutex to the usbhid private structure and uses it to enforce mutual exclusion of the usbhid_start(), usbhid_stop(), usbhid_open() and usbhid_close() callbacks. Reported-and-tested-by: syzbot+7bf5a7b0f0a1f9446f4c@syzkaller.appspotmail.com Signed-off-by: Alan Stern CC: Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 37 +++++++++++++++++++++++++++-------- drivers/hid/usbhid/usbhid.h | 1 + 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index c7bc9db5b192..17a638f15082 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -682,16 +682,21 @@ static int usbhid_open(struct hid_device *hid) struct usbhid_device *usbhid = hid->driver_data; int res; + mutex_lock(&usbhid->mutex); + set_bit(HID_OPENED, &usbhid->iofl); - if (hid->quirks & HID_QUIRK_ALWAYS_POLL) - return 0; + if (hid->quirks & HID_QUIRK_ALWAYS_POLL) { + res = 0; + goto Done; + } res = usb_autopm_get_interface(usbhid->intf); /* the device must be awake to reliably request remote wakeup */ if (res < 0) { clear_bit(HID_OPENED, &usbhid->iofl); - return -EIO; + res = -EIO; + goto Done; } usbhid->intf->needs_remote_wakeup = 1; @@ -725,6 +730,9 @@ static int usbhid_open(struct hid_device *hid) msleep(50); clear_bit(HID_RESUME_RUNNING, &usbhid->iofl); + + Done: + mutex_unlock(&usbhid->mutex); return res; } @@ -732,6 +740,8 @@ static void usbhid_close(struct hid_device *hid) { struct usbhid_device *usbhid = hid->driver_data; + mutex_lock(&usbhid->mutex); + /* * Make sure we don't restart data acquisition due to * a resumption we no longer care about by avoiding racing @@ -743,12 +753,13 @@ static void usbhid_close(struct hid_device *hid) clear_bit(HID_IN_POLLING, &usbhid->iofl); spin_unlock_irq(&usbhid->lock); - if (hid->quirks & HID_QUIRK_ALWAYS_POLL) - return; + if (!(hid->quirks & HID_QUIRK_ALWAYS_POLL)) { + hid_cancel_delayed_stuff(usbhid); + usb_kill_urb(usbhid->urbin); + usbhid->intf->needs_remote_wakeup = 0; + } - hid_cancel_delayed_stuff(usbhid); - usb_kill_urb(usbhid->urbin); - usbhid->intf->needs_remote_wakeup = 0; + mutex_unlock(&usbhid->mutex); } /* @@ -1057,6 +1068,8 @@ static int usbhid_start(struct hid_device *hid) unsigned int n, insize = 0; int ret; + mutex_lock(&usbhid->mutex); + clear_bit(HID_DISCONNECTED, &usbhid->iofl); usbhid->bufsize = HID_MIN_BUFFER_SIZE; @@ -1177,6 +1190,8 @@ static int usbhid_start(struct hid_device *hid) usbhid_set_leds(hid); device_set_wakeup_enable(&dev->dev, 1); } + + mutex_unlock(&usbhid->mutex); return 0; fail: @@ -1187,6 +1202,7 @@ fail: usbhid->urbout = NULL; usbhid->urbctrl = NULL; hid_free_buffers(dev, hid); + mutex_unlock(&usbhid->mutex); return ret; } @@ -1202,6 +1218,8 @@ static void usbhid_stop(struct hid_device *hid) usbhid->intf->needs_remote_wakeup = 0; } + mutex_lock(&usbhid->mutex); + clear_bit(HID_STARTED, &usbhid->iofl); spin_lock_irq(&usbhid->lock); /* Sync with error and led handlers */ set_bit(HID_DISCONNECTED, &usbhid->iofl); @@ -1222,6 +1240,8 @@ static void usbhid_stop(struct hid_device *hid) usbhid->urbout = NULL; hid_free_buffers(hid_to_usb_dev(hid), hid); + + mutex_unlock(&usbhid->mutex); } static int usbhid_power(struct hid_device *hid, int lvl) @@ -1382,6 +1402,7 @@ static int usbhid_probe(struct usb_interface *intf, const struct usb_device_id * INIT_WORK(&usbhid->reset_work, hid_reset); timer_setup(&usbhid->io_retry, hid_retry_timeout, 0); spin_lock_init(&usbhid->lock); + mutex_init(&usbhid->mutex); ret = hid_add_device(hid); if (ret) { diff --git a/drivers/hid/usbhid/usbhid.h b/drivers/hid/usbhid/usbhid.h index 8620408bd7af..75fe85d3d27a 100644 --- a/drivers/hid/usbhid/usbhid.h +++ b/drivers/hid/usbhid/usbhid.h @@ -80,6 +80,7 @@ struct usbhid_device { dma_addr_t outbuf_dma; /* Output buffer dma */ unsigned long last_out; /* record of last output for timeouts */ + struct mutex mutex; /* start/stop/open/close */ spinlock_t lock; /* fifo spinlock */ unsigned long iofl; /* I/O flags (CTRL_RUNNING, OUT_RUNNING) */ struct timer_list io_retry; /* Retry timer */ From 2a15483b401c0b07e44b43b95414e36f32c02f32 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 29 Apr 2020 17:23:49 +0000 Subject: [PATCH 080/300] regulator: Revert "Use driver_deferred_probe_timeout for regulator_init_complete_work" This reverts commit dca0b44957e5 ("regulator: Use driver_deferred_probe_timeout for regulator_init_complete_work"), as we ended up reverting the default deferred_probe_timeout value back to zero, to preserve behavior with 5.6 we need to decouple the regulator timeout which was previously 30 seconds. This avoids breaking some systems that depend on the regulator timeout but don't require the deferred probe timeout. Cc: linux-pm@vger.kernel.org Cc: Linus Walleij Cc: Thierry Reding Cc: Liam Girdwood Cc: Bjorn Andersson Cc: Saravana Kannan Cc: Todd Kjos Cc: Len Brown Cc: Pavel Machek Cc: Ulf Hansson Cc: Kevin Hilman Cc: "Rafael J. Wysocki" Cc: Rob Herring Reported-by: Marek Szyprowski Suggested-by: Mark Brown Signed-off-by: John Stultz Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20200429172349.55979-1-john.stultz@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/regulator/core.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index c340505150b6..7486f6e4e613 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -5754,10 +5754,6 @@ static DECLARE_DELAYED_WORK(regulator_init_complete_work, static int __init regulator_init_complete(void) { - int delay = driver_deferred_probe_timeout; - - if (delay < 0) - delay = 0; /* * Since DT doesn't provide an idiomatic mechanism for * enabling full constraints and since it's much more natural @@ -5768,17 +5764,18 @@ static int __init regulator_init_complete(void) has_full_constraints = true; /* - * If driver_deferred_probe_timeout is set, we punt - * completion for that many seconds since systems like - * distros will load many drivers from userspace so consumers - * might not always be ready yet, this is particularly an - * issue with laptops where this might bounce the display off - * then on. Ideally we'd get a notification from userspace - * when this happens but we don't so just wait a bit and hope - * we waited long enough. It'd be better if we'd only do - * this on systems that need it. + * We punt completion for an arbitrary amount of time since + * systems like distros will load many drivers from userspace + * so consumers might not always be ready yet, this is + * particularly an issue with laptops where this might bounce + * the display off then on. Ideally we'd get a notification + * from userspace when this happens but we don't so just wait + * a bit and hope we waited long enough. It'd be better if + * we'd only do this on systems that need it, and a kernel + * command line option might be useful. */ - schedule_delayed_work(®ulator_init_complete_work, delay * HZ); + schedule_delayed_work(®ulator_init_complete_work, + msecs_to_jiffies(30000)); return 0; } From 9812307491231974f8eef1329237ce3d27da7462 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Wed, 29 Apr 2020 22:10:01 +0800 Subject: [PATCH 081/300] net: dsa: mv88e6xxx: remove duplicate assignment of struct members These struct members named 'phylink_validate' was assigned twice: static const struct mv88e6xxx_ops mv88e6190_ops = { ...... .phylink_validate = mv88e6390_phylink_validate, ...... .phylink_validate = mv88e6390_phylink_validate, }; static const struct mv88e6xxx_ops mv88e6190x_ops = { ...... .phylink_validate = mv88e6390_phylink_validate, ...... .phylink_validate = mv88e6390x_phylink_validate, }; static const struct mv88e6xxx_ops mv88e6191_ops = { ...... .phylink_validate = mv88e6390_phylink_validate, ...... .phylink_validate = mv88e6390_phylink_validate, }; static const struct mv88e6xxx_ops mv88e6290_ops = { ...... .phylink_validate = mv88e6390_phylink_validate, ...... .phylink_validate = mv88e6390_phylink_validate, }; Remove all the first one and leave the second one which are been used in fact. Be aware that for 'mv88e6190x_ops' the assignment functions is different while the others are all the same. This fixes the following coccicheck warning: drivers/net/dsa/mv88e6xxx/chip.c:3911:48-49: phylink_validate: first occurrence line 3965, second occurrence line 3967 drivers/net/dsa/mv88e6xxx/chip.c:3970:49-50: phylink_validate: first occurrence line 4024, second occurrence line 4026 drivers/net/dsa/mv88e6xxx/chip.c:4029:48-49: phylink_validate: first occurrence line 4082, second occurrence line 4085 drivers/net/dsa/mv88e6xxx/chip.c:4184:48-49: phylink_validate: first occurrence line 4238, second occurrence line 4242 Fixes: 4262c38dc42e ("net: dsa: mv88e6xxx: Add SERDES stats counters to all 6390 family members") Signed-off-by: Jason Yan Reviewed-by: Russell King Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index dd8a5666a584..2b4a723c8306 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3962,7 +3962,6 @@ static const struct mv88e6xxx_ops mv88e6190_ops = { .serdes_get_stats = mv88e6390_serdes_get_stats, .serdes_get_regs_len = mv88e6390_serdes_get_regs_len, .serdes_get_regs = mv88e6390_serdes_get_regs, - .phylink_validate = mv88e6390_phylink_validate, .gpio_ops = &mv88e6352_gpio_ops, .phylink_validate = mv88e6390_phylink_validate, }; @@ -4021,7 +4020,6 @@ static const struct mv88e6xxx_ops mv88e6190x_ops = { .serdes_get_stats = mv88e6390_serdes_get_stats, .serdes_get_regs_len = mv88e6390_serdes_get_regs_len, .serdes_get_regs = mv88e6390_serdes_get_regs, - .phylink_validate = mv88e6390_phylink_validate, .gpio_ops = &mv88e6352_gpio_ops, .phylink_validate = mv88e6390x_phylink_validate, }; @@ -4079,7 +4077,6 @@ static const struct mv88e6xxx_ops mv88e6191_ops = { .serdes_get_stats = mv88e6390_serdes_get_stats, .serdes_get_regs_len = mv88e6390_serdes_get_regs_len, .serdes_get_regs = mv88e6390_serdes_get_regs, - .phylink_validate = mv88e6390_phylink_validate, .avb_ops = &mv88e6390_avb_ops, .ptp_ops = &mv88e6352_ptp_ops, .phylink_validate = mv88e6390_phylink_validate, @@ -4235,7 +4232,6 @@ static const struct mv88e6xxx_ops mv88e6290_ops = { .serdes_get_stats = mv88e6390_serdes_get_stats, .serdes_get_regs_len = mv88e6390_serdes_get_regs_len, .serdes_get_regs = mv88e6390_serdes_get_regs, - .phylink_validate = mv88e6390_phylink_validate, .gpio_ops = &mv88e6352_gpio_ops, .avb_ops = &mv88e6390_avb_ops, .ptp_ops = &mv88e6352_ptp_ops, From c165d57b552aaca607fa5daf3fb524a6efe3c5a3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 29 Apr 2020 21:00:41 +0200 Subject: [PATCH 082/300] netfilter: nf_osf: avoid passing pointer to local var gcc-10 points out that a code path exists where a pointer to a stack variable may be passed back to the caller: net/netfilter/nfnetlink_osf.c: In function 'nf_osf_hdr_ctx_init': cc1: warning: function may return address of local variable [-Wreturn-local-addr] net/netfilter/nfnetlink_osf.c:171:16: note: declared here 171 | struct tcphdr _tcph; | ^~~~~ I am not sure whether this can happen in practice, but moving the variable declaration into the callers avoids the problem. Fixes: 31a9c29210e2 ("netfilter: nf_osf: add struct nf_osf_hdr_ctx") Signed-off-by: Arnd Bergmann Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_osf.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 9f5dea0064ea..916a3c7f9eaf 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -165,12 +165,12 @@ static bool nf_osf_match_one(const struct sk_buff *skb, static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, const struct sk_buff *skb, const struct iphdr *ip, - unsigned char *opts) + unsigned char *opts, + struct tcphdr *_tcph) { const struct tcphdr *tcp; - struct tcphdr _tcph; - tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), _tcph); if (!tcp) return NULL; @@ -205,10 +205,11 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, int fmatch = FMATCH_WRONG; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; + struct tcphdr _tcph; memset(&ctx, 0, sizeof(ctx)); - tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); + tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; @@ -265,10 +266,11 @@ bool nf_osf_find(const struct sk_buff *skb, const struct nf_osf_finger *kf; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; + struct tcphdr _tcph; memset(&ctx, 0, sizeof(ctx)); - tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts); + tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; From 42c556fef92361bbc58be22f91b1c49db0963c34 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 29 Apr 2020 20:43:20 +0200 Subject: [PATCH 083/300] mptcp: replace mptcp_disconnect with a stub Paolo points out that mptcp_disconnect is bogus: "lock_sock(sk); looks suspicious (lock should be already held by the caller) And call to: tcp_disconnect(sk, flags); too, sk is not a tcp socket". ->disconnect() gets called from e.g. inet_stream_connect when one tries to disassociate a connected socket again (to re-connect without closing the socket first). MPTCP however uses mptcp_stream_connect, not inet_stream_connect, for the mptcp-socket connect call. inet_stream_connect only gets called indirectly, for the tcp socket, so any ->disconnect() calls end up calling tcp_disconnect for that tcp subflow sk. This also explains why syzkaller has not yet reported a problem here. So for now replace this with a stub that doesn't do anything. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/14 Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b22a63ba2348..6e0188f5d3f3 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1316,11 +1316,12 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) static int mptcp_disconnect(struct sock *sk, int flags) { - lock_sock(sk); - __mptcp_clear_xmit(sk); - release_sock(sk); - mptcp_cancel_work(sk); - return tcp_disconnect(sk, flags); + /* Should never be called. + * inet_stream_connect() calls ->disconnect, but that + * refers to the subflow socket, not the mptcp one. + */ + WARN_ON_ONCE(1); + return 0; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) From dcce8ef8f70a8e38e6c47c1bae8b312376c04420 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Fri, 24 Apr 2020 14:04:00 -0700 Subject: [PATCH 084/300] HID: wacom: Report 2nd-gen Intuos Pro S center button status over BT The state of the center button was not reported to userspace for the 2nd-gen Intuos Pro S when used over Bluetooth due to the pad handling code not being updated to support its reduced number of buttons. This patch uses the actual number of buttons present on the tablet to assemble a button state bitmap. Link: https://github.com/linuxwacom/xf86-input-wacom/issues/112 Fixes: cd47de45b855 ("HID: wacom: Add 2nd gen Intuos Pro Small support") Signed-off-by: Jason Gerecke Cc: stable@vger.kernel.org # v5.3+ Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 96d00eba99c0..1c96809b51c9 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -1427,11 +1427,13 @@ static void wacom_intuos_pro2_bt_pad(struct wacom_wac *wacom) { struct input_dev *pad_input = wacom->pad_input; unsigned char *data = wacom->data; + int nbuttons = wacom->features.numbered_buttons; - int buttons = data[282] | ((data[281] & 0x40) << 2); + int expresskeys = data[282]; + int center = (data[281] & 0x40) >> 6; int ring = data[285] & 0x7F; bool ringstatus = data[285] & 0x80; - bool prox = buttons || ringstatus; + bool prox = expresskeys || center || ringstatus; /* Fix touchring data: userspace expects 0 at left and increasing clockwise */ ring = 71 - ring; @@ -1439,7 +1441,8 @@ static void wacom_intuos_pro2_bt_pad(struct wacom_wac *wacom) if (ring > 71) ring -= 72; - wacom_report_numbered_buttons(pad_input, 9, buttons); + wacom_report_numbered_buttons(pad_input, nbuttons, + expresskeys | (center << (nbuttons - 1))); input_report_abs(pad_input, ABS_WHEEL, ringstatus ? ring : 0); From 538f67407e2c0e5ed2a46e7d7ffa52f2e30c7ef8 Mon Sep 17 00:00:00 2001 From: Daniel Playfair Cal Date: Sat, 25 Apr 2020 20:58:17 +1000 Subject: [PATCH 085/300] HID: i2c-hid: reset Synaptics SYNA2393 on resume On the Dell XPS 9570, the Synaptics SYNA2393 touchpad generates spurious interrupts after resuming from suspend until it receives some input or is reset. Add it to the quirk I2C_HID_QUIRK_RESET_ON_RESUME so that it is reset when resuming from suspend. More information about the bug can be found in this mailing list discussion: https://www.spinics.net/lists/linux-input/msg59530.html Signed-off-by: Daniel Playfair Cal Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 3 +++ drivers/hid/i2c-hid/i2c-hid-core.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 6eb25b9e8575..7354fa79cb67 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -1099,6 +1099,9 @@ #define USB_DEVICE_ID_SYMBOL_SCANNER_2 0x1300 #define USB_DEVICE_ID_SYMBOL_SCANNER_3 0x1200 +#define I2C_VENDOR_ID_SYNAPTICS 0x06cb +#define I2C_PRODUCT_ID_SYNAPTICS_SYNA2393 0x7a13 + #define USB_VENDOR_ID_SYNAPTICS 0x06cb #define USB_DEVICE_ID_SYNAPTICS_TP 0x0001 #define USB_DEVICE_ID_SYNAPTICS_INT_TP 0x0002 diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c index 009000c5d55c..294c84e136d7 100644 --- a/drivers/hid/i2c-hid/i2c-hid-core.c +++ b/drivers/hid/i2c-hid/i2c-hid-core.c @@ -177,6 +177,8 @@ static const struct i2c_hid_quirks { I2C_HID_QUIRK_BOGUS_IRQ }, { USB_VENDOR_ID_ALPS_JP, HID_ANY_ID, I2C_HID_QUIRK_RESET_ON_RESUME }, + { I2C_VENDOR_ID_SYNAPTICS, I2C_PRODUCT_ID_SYNAPTICS_SYNA2393, + I2C_HID_QUIRK_RESET_ON_RESUME }, { USB_VENDOR_ID_ITE, I2C_DEVICE_ID_ITE_LENOVO_LEGION_Y720, I2C_HID_QUIRK_BAD_INPUT_SIZE }, { 0, 0 } From 2465f0d5c9e36d4c99eb424d38f72ea250f0ee92 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 Apr 2020 23:30:17 +0200 Subject: [PATCH 086/300] HID: mcp2221: add gpiolib dependency Without gpiolib, this driver fails to link: arm-linux-gnueabi-ld: drivers/hid/hid-mcp2221.o: in function `mcp2221_probe': hid-mcp2221.c:(.text+0x1b0): undefined reference to `devm_gpiochip_add_data' arm-linux-gnueabi-ld: drivers/hid/hid-mcp2221.o: in function `mcp_gpio_get': hid-mcp2221.c:(.text+0x30c): undefined reference to `gpiochip_get_data' Fixes: 328de1c519c5 ("HID: mcp2221: add GPIO functionality support") Signed-off-by: Arnd Bergmann Reviewed-by: Rishi Gupta Signed-off-by: Jiri Kosina --- drivers/hid/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 7c89edbd6c5a..34f07371716d 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -1155,6 +1155,7 @@ config HID_ALPS config HID_MCP2221 tristate "Microchip MCP2221 HID USB-to-I2C/SMbus host support" depends on USB_HID && I2C + depends on GPIOLIB ---help--- Provides I2C and SMBUS host adapter functionality over USB-HID through MCP2221 device. From b31d1d2b1c3a8452f425b09ebd374ecd3ddd5179 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Mon, 27 Apr 2020 15:59:02 -0700 Subject: [PATCH 087/300] platform/chrome: cros_ec_sensorhub: Allocate sensorhub resource before claiming sensors Allocate callbacks array before enumerating the sensors: The probe routine for these sensors (for instance cros_ec_sensors_probe) can be called within the sensorhub probe routine (cros_ec_sensors_probe()) Fixes: 145d59baff594 ("platform/chrome: cros_ec_sensorhub: Add FIFO support") Signed-off-by: Gwendal Grignou Reported-by: Douglas Anderson Tested-by: Douglas Anderson Signed-off-by: Enric Balletbo i Serra --- drivers/platform/chrome/cros_ec_sensorhub.c | 80 +++++++++++-------- .../platform/chrome/cros_ec_sensorhub_ring.c | 75 ++++++++++------- .../linux/platform_data/cros_ec_sensorhub.h | 1 + 3 files changed, 94 insertions(+), 62 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_sensorhub.c b/drivers/platform/chrome/cros_ec_sensorhub.c index b7f2c00db5e1..9c4af76a9956 100644 --- a/drivers/platform/chrome/cros_ec_sensorhub.c +++ b/drivers/platform/chrome/cros_ec_sensorhub.c @@ -52,28 +52,15 @@ static int cros_ec_sensorhub_register(struct device *dev, int sensor_type[MOTIONSENSE_TYPE_MAX] = { 0 }; struct cros_ec_command *msg = sensorhub->msg; struct cros_ec_dev *ec = sensorhub->ec; - int ret, i, sensor_num; + int ret, i; char *name; - sensor_num = cros_ec_get_sensor_count(ec); - if (sensor_num < 0) { - dev_err(dev, - "Unable to retrieve sensor information (err:%d)\n", - sensor_num); - return sensor_num; - } - - sensorhub->sensor_num = sensor_num; - if (sensor_num == 0) { - dev_err(dev, "Zero sensors reported.\n"); - return -EINVAL; - } msg->version = 1; msg->insize = sizeof(struct ec_response_motion_sense); msg->outsize = sizeof(struct ec_params_motion_sense); - for (i = 0; i < sensor_num; i++) { + for (i = 0; i < sensorhub->sensor_num; i++) { sensorhub->params->cmd = MOTIONSENSE_CMD_INFO; sensorhub->params->info.sensor_num = i; @@ -140,8 +127,7 @@ static int cros_ec_sensorhub_probe(struct platform_device *pdev) struct cros_ec_dev *ec = dev_get_drvdata(dev->parent); struct cros_ec_sensorhub *data; struct cros_ec_command *msg; - int ret; - int i; + int ret, i, sensor_num; msg = devm_kzalloc(dev, sizeof(struct cros_ec_command) + max((u16)sizeof(struct ec_params_motion_sense), @@ -166,10 +152,52 @@ static int cros_ec_sensorhub_probe(struct platform_device *pdev) dev_set_drvdata(dev, data); /* Check whether this EC is a sensor hub. */ - if (cros_ec_check_features(data->ec, EC_FEATURE_MOTION_SENSE)) { + if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE)) { + sensor_num = cros_ec_get_sensor_count(ec); + if (sensor_num < 0) { + dev_err(dev, + "Unable to retrieve sensor information (err:%d)\n", + sensor_num); + return sensor_num; + } + if (sensor_num == 0) { + dev_err(dev, "Zero sensors reported.\n"); + return -EINVAL; + } + data->sensor_num = sensor_num; + + /* + * Prepare the ring handler before enumering the + * sensors. + */ + if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE_FIFO)) { + ret = cros_ec_sensorhub_ring_allocate(data); + if (ret) + return ret; + } + + /* Enumerate the sensors.*/ ret = cros_ec_sensorhub_register(dev, data); if (ret) return ret; + + /* + * When the EC does not have a FIFO, the sensors will query + * their data themselves via sysfs or a software trigger. + */ + if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE_FIFO)) { + ret = cros_ec_sensorhub_ring_add(data); + if (ret) + return ret; + /* + * The msg and its data is not under the control of the + * ring handler. + */ + return devm_add_action_or_reset(dev, + cros_ec_sensorhub_ring_remove, + data); + } + } else { /* * If the device has sensors but does not claim to @@ -184,22 +212,6 @@ static int cros_ec_sensorhub_probe(struct platform_device *pdev) } } - /* - * If the EC does not have a FIFO, the sensors will query their data - * themselves via sysfs or a software trigger. - */ - if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE_FIFO)) { - ret = cros_ec_sensorhub_ring_add(data); - if (ret) - return ret; - /* - * The msg and its data is not under the control of the ring - * handler. - */ - return devm_add_action_or_reset(dev, - cros_ec_sensorhub_ring_remove, - data); - } return 0; } diff --git a/drivers/platform/chrome/cros_ec_sensorhub_ring.c b/drivers/platform/chrome/cros_ec_sensorhub_ring.c index c48e5b38a441..24e48d96ed76 100644 --- a/drivers/platform/chrome/cros_ec_sensorhub_ring.c +++ b/drivers/platform/chrome/cros_ec_sensorhub_ring.c @@ -956,6 +956,53 @@ static int cros_ec_sensorhub_event(struct notifier_block *nb, return NOTIFY_OK; } +/** + * cros_ec_sensorhub_ring_allocate() - Prepare the FIFO functionality if the EC + * supports it. + * + * @sensorhub : Sensor Hub object. + * + * Return: 0 on success. + */ +int cros_ec_sensorhub_ring_allocate(struct cros_ec_sensorhub *sensorhub) +{ + int fifo_info_length = + sizeof(struct ec_response_motion_sense_fifo_info) + + sizeof(u16) * sensorhub->sensor_num; + + /* Allocate the array for lost events. */ + sensorhub->fifo_info = devm_kzalloc(sensorhub->dev, fifo_info_length, + GFP_KERNEL); + if (!sensorhub->fifo_info) + return -ENOMEM; + + /* + * Allocate the callback area based on the number of sensors. + * Add one for the sensor ring. + */ + sensorhub->push_data = devm_kcalloc(sensorhub->dev, + sensorhub->sensor_num, + sizeof(*sensorhub->push_data), + GFP_KERNEL); + if (!sensorhub->push_data) + return -ENOMEM; + + sensorhub->tight_timestamps = cros_ec_check_features( + sensorhub->ec, + EC_FEATURE_MOTION_SENSE_TIGHT_TIMESTAMPS); + + if (sensorhub->tight_timestamps) { + sensorhub->batch_state = devm_kcalloc(sensorhub->dev, + sensorhub->sensor_num, + sizeof(*sensorhub->batch_state), + GFP_KERNEL); + if (!sensorhub->batch_state) + return -ENOMEM; + } + + return 0; +} + /** * cros_ec_sensorhub_ring_add() - Add the FIFO functionality if the EC * supports it. @@ -972,12 +1019,6 @@ int cros_ec_sensorhub_ring_add(struct cros_ec_sensorhub *sensorhub) sizeof(struct ec_response_motion_sense_fifo_info) + sizeof(u16) * sensorhub->sensor_num; - /* Allocate the array for lost events. */ - sensorhub->fifo_info = devm_kzalloc(sensorhub->dev, fifo_info_length, - GFP_KERNEL); - if (!sensorhub->fifo_info) - return -ENOMEM; - /* Retrieve FIFO information */ sensorhub->msg->version = 2; sensorhub->params->cmd = MOTIONSENSE_CMD_FIFO_INFO; @@ -998,31 +1039,9 @@ int cros_ec_sensorhub_ring_add(struct cros_ec_sensorhub *sensorhub) if (!sensorhub->ring) return -ENOMEM; - /* - * Allocate the callback area based on the number of sensors. - */ - sensorhub->push_data = devm_kcalloc( - sensorhub->dev, sensorhub->sensor_num, - sizeof(*sensorhub->push_data), - GFP_KERNEL); - if (!sensorhub->push_data) - return -ENOMEM; - sensorhub->fifo_timestamp[CROS_EC_SENSOR_LAST_TS] = cros_ec_get_time_ns(); - sensorhub->tight_timestamps = cros_ec_check_features( - ec, EC_FEATURE_MOTION_SENSE_TIGHT_TIMESTAMPS); - - if (sensorhub->tight_timestamps) { - sensorhub->batch_state = devm_kcalloc(sensorhub->dev, - sensorhub->sensor_num, - sizeof(*sensorhub->batch_state), - GFP_KERNEL); - if (!sensorhub->batch_state) - return -ENOMEM; - } - /* Register the notifier that will act as a top half interrupt. */ sensorhub->notifier.notifier_call = cros_ec_sensorhub_event; ret = blocking_notifier_chain_register(&ec->ec_dev->event_notifier, diff --git a/include/linux/platform_data/cros_ec_sensorhub.h b/include/linux/platform_data/cros_ec_sensorhub.h index c588be843f61..0ecce6aa69d5 100644 --- a/include/linux/platform_data/cros_ec_sensorhub.h +++ b/include/linux/platform_data/cros_ec_sensorhub.h @@ -185,6 +185,7 @@ int cros_ec_sensorhub_register_push_data(struct cros_ec_sensorhub *sensorhub, void cros_ec_sensorhub_unregister_push_data(struct cros_ec_sensorhub *sensorhub, u8 sensor_num); +int cros_ec_sensorhub_ring_allocate(struct cros_ec_sensorhub *sensorhub); int cros_ec_sensorhub_ring_add(struct cros_ec_sensorhub *sensorhub); void cros_ec_sensorhub_ring_remove(void *arg); int cros_ec_sensorhub_ring_fifo_enable(struct cros_ec_sensorhub *sensorhub, From d6833e42786e050e7522d6a91a9361e54085897d Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 29 Apr 2020 14:59:20 -0600 Subject: [PATCH 088/300] wireguard: send: remove errant newline from packet_encrypt_worker This commit removes a useless newline at the end of a scope, which doesn't add anything in the way of organization or readability. Signed-off-by: Sultan Alsawaf Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/send.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index 7348c10cbae3..3e030d614df5 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -304,7 +304,6 @@ void wg_packet_encrypt_worker(struct work_struct *work) } wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first, state); - } } From 130c58606171326c81841a49cc913cd354113dd9 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 29 Apr 2020 14:59:21 -0600 Subject: [PATCH 089/300] wireguard: queueing: cleanup ptr_ring in error path of packet_queue_init Prior, if the alloc_percpu of packet_percpu_multicore_worker_alloc failed, the previously allocated ptr_ring wouldn't be freed. This commit adds the missing call to ptr_ring_cleanup in the error case. Reported-by: Sultan Alsawaf Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/queueing.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireguard/queueing.c b/drivers/net/wireguard/queueing.c index 5c964fcb994e..71b8e80b58e1 100644 --- a/drivers/net/wireguard/queueing.c +++ b/drivers/net/wireguard/queueing.c @@ -35,8 +35,10 @@ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, if (multicore) { queue->worker = wg_packet_percpu_multicore_worker_alloc( function, queue); - if (!queue->worker) + if (!queue->worker) { + ptr_ring_cleanup(&queue->ring, NULL); return -ENOMEM; + } } else { INIT_WORK(&queue->work, function); } From eebabcb26ea1e3295704477c6cd4e772c96a9559 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 29 Apr 2020 14:59:22 -0600 Subject: [PATCH 090/300] wireguard: receive: use tunnel helpers for decapsulating ECN markings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WireGuard currently only propagates ECN markings on tunnel decap according to the old RFC3168 specification. However, the spec has since been updated in RFC6040 to recommend slightly different decapsulation semantics. This was implemented in the kernel as a set of common helpers for ECN decapsulation, so let's just switch over WireGuard to using those, so it can benefit from this enhancement and any future tweaks. We do not drop packets with invalid ECN marking combinations, because WireGuard is frequently used to work around broken ISPs, which could be doing that. Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Reported-by: Olivier Tilmans Cc: Dave Taht Cc: Rodney W. Grimes Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/receive.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index da3b782ab7d3..267f202f1931 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -393,13 +393,11 @@ static void wg_packet_consume_data_done(struct wg_peer *peer, len = ntohs(ip_hdr(skb)->tot_len); if (unlikely(len < sizeof(struct iphdr))) goto dishonest_packet_size; - if (INET_ECN_is_ce(PACKET_CB(skb)->ds)) - IP_ECN_set_ce(ip_hdr(skb)); + INET_ECN_decapsulate(skb, PACKET_CB(skb)->ds, ip_hdr(skb)->tos); } else if (skb->protocol == htons(ETH_P_IPV6)) { len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); - if (INET_ECN_is_ce(PACKET_CB(skb)->ds)) - IP6_ECN_set_ce(skb, ipv6_hdr(skb)); + INET_ECN_decapsulate(skb, PACKET_CB(skb)->ds, ipv6_get_dsfield(ipv6_hdr(skb))); } else { goto dishonest_packet_type; } From 706024a52c614b478b63f7728d202532ce6591a9 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 22 Apr 2020 17:18:53 -0600 Subject: [PATCH 091/300] crypto: arch/lib - limit simd usage to 4k chunks The initial Zinc patchset, after some mailing list discussion, contained code to ensure that kernel_fpu_enable would not be kept on for more than a 4k chunk, since it disables preemption. The choice of 4k isn't totally scientific, but it's not a bad guess either, and it's what's used in both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form of PAGE_SIZE, which this commit corrects to be explicitly 4k for the former two). Ard did some back of the envelope calculations and found that at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k means we have a maximum preemption disabling of 20us, which Sebastian confirmed was probably a good limit. Unfortunately the chunking appears to have been left out of the final patchset that added the glue code. So, this commit adds it back in. Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function") Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function") Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function") Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel") Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation") Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation") Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation") Cc: Eric Biggers Cc: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Signed-off-by: Jason A. Donenfeld Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/chacha-glue.c | 14 +++++++++++--- arch/arm/crypto/poly1305-glue.c | 15 +++++++++++---- arch/arm64/crypto/chacha-neon-glue.c | 14 +++++++++++--- arch/arm64/crypto/poly1305-glue.c | 15 +++++++++++---- arch/x86/crypto/blake2s-glue.c | 10 ++++------ arch/x86/crypto/chacha_glue.c | 14 +++++++++++--- arch/x86/crypto/poly1305_glue.c | 13 ++++++------- 7 files changed, 65 insertions(+), 30 deletions(-) diff --git a/arch/arm/crypto/chacha-glue.c b/arch/arm/crypto/chacha-glue.c index 6fdb0ac62b3d..59da6c0b63b6 100644 --- a/arch/arm/crypto/chacha-glue.c +++ b/arch/arm/crypto/chacha-glue.c @@ -91,9 +91,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, return; } - kernel_neon_begin(); - chacha_doneon(state, dst, src, bytes, nrounds); - kernel_neon_end(); + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); } EXPORT_SYMBOL(chacha_crypt_arch); diff --git a/arch/arm/crypto/poly1305-glue.c b/arch/arm/crypto/poly1305-glue.c index ceec04ec2f40..13cfef4ae22e 100644 --- a/arch/arm/crypto/poly1305-glue.c +++ b/arch/arm/crypto/poly1305-glue.c @@ -160,13 +160,20 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); if (static_branch_likely(&have_neon) && do_neon) { - kernel_neon_begin(); - poly1305_blocks_neon(&dctx->h, src, len, 1); - kernel_neon_end(); + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(&dctx->h, src, todo, 1); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); } else { poly1305_blocks_arm(&dctx->h, src, len, 1); + src += len; } - src += len; nbytes %= POLY1305_BLOCK_SIZE; } diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c index 37ca3e889848..af2bbca38e70 100644 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -87,9 +87,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, !crypto_simd_usable()) return chacha_crypt_generic(state, dst, src, bytes, nrounds); - kernel_neon_begin(); - chacha_doneon(state, dst, src, bytes, nrounds); - kernel_neon_end(); + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_neon_begin(); + chacha_doneon(state, dst, src, todo, nrounds); + kernel_neon_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); } EXPORT_SYMBOL(chacha_crypt_arch); diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c index e97b092f56b8..f33ada70c4ed 100644 --- a/arch/arm64/crypto/poly1305-glue.c +++ b/arch/arm64/crypto/poly1305-glue.c @@ -143,13 +143,20 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); if (static_branch_likely(&have_neon) && crypto_simd_usable()) { - kernel_neon_begin(); - poly1305_blocks_neon(&dctx->h, src, len, 1); - kernel_neon_end(); + do { + unsigned int todo = min_t(unsigned int, len, SZ_4K); + + kernel_neon_begin(); + poly1305_blocks_neon(&dctx->h, src, todo, 1); + kernel_neon_end(); + + len -= todo; + src += todo; + } while (len); } else { poly1305_blocks(&dctx->h, src, len, 1); + src += len; } - src += len; nbytes %= POLY1305_BLOCK_SIZE; } diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c index 06ef2d4a4701..6737bcea1fa1 100644 --- a/arch/x86/crypto/blake2s-glue.c +++ b/arch/x86/crypto/blake2s-glue.c @@ -32,16 +32,16 @@ void blake2s_compress_arch(struct blake2s_state *state, const u32 inc) { /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8); + BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { blake2s_compress_generic(state, block, nblocks, inc); return; } - for (;;) { + do { const size_t blocks = min_t(size_t, nblocks, - PAGE_SIZE / BLAKE2S_BLOCK_SIZE); + SZ_4K / BLAKE2S_BLOCK_SIZE); kernel_fpu_begin(); if (IS_ENABLED(CONFIG_AS_AVX512) && @@ -52,10 +52,8 @@ void blake2s_compress_arch(struct blake2s_state *state, kernel_fpu_end(); nblocks -= blocks; - if (!nblocks) - break; block += blocks * BLAKE2S_BLOCK_SIZE; - } + } while (nblocks); } EXPORT_SYMBOL(blake2s_compress_arch); diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index b412c21ee06e..22250091cdbe 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -153,9 +153,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, bytes <= CHACHA_BLOCK_SIZE) return chacha_crypt_generic(state, dst, src, bytes, nrounds); - kernel_fpu_begin(); - chacha_dosimd(state, dst, src, bytes, nrounds); - kernel_fpu_end(); + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + kernel_fpu_begin(); + chacha_dosimd(state, dst, src, todo, nrounds); + kernel_fpu_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); } EXPORT_SYMBOL(chacha_crypt_arch); diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 6dfec19f7d57..dfe921efa9b2 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -91,8 +91,8 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, struct poly1305_arch_internal *state = ctx; /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE || - PAGE_SIZE % POLY1305_BLOCK_SIZE); + BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || + SZ_4K % POLY1305_BLOCK_SIZE); if (!static_branch_likely(&poly1305_use_avx) || (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || @@ -102,8 +102,8 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, return; } - for (;;) { - const size_t bytes = min_t(size_t, len, PAGE_SIZE); + do { + const size_t bytes = min_t(size_t, len, SZ_4K); kernel_fpu_begin(); if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) @@ -113,11 +113,10 @@ static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, else poly1305_blocks_avx(ctx, inp, bytes, padbit); kernel_fpu_end(); + len -= bytes; - if (!len) - break; inp += bytes; - } + } while (len); } static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], From a9a8ba90fa5857c2c8a0e32eef2159cec717da11 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 22 Apr 2020 17:18:54 -0600 Subject: [PATCH 092/300] crypto: arch/nhpoly1305 - process in explicit 4k chunks Rather than chunking via PAGE_SIZE, this commit changes the arch implementations to chunk in explicit 4k parts, so that calculations on maximum acceptable latency don't suddenly become invalid on platforms where PAGE_SIZE isn't 4k, such as arm64. Fixes: 0f961f9f670e ("crypto: x86/nhpoly1305 - add AVX2 accelerated NHPoly1305") Fixes: 012c82388c03 ("crypto: x86/nhpoly1305 - add SSE2 accelerated NHPoly1305") Fixes: a00fa0c88774 ("crypto: arm64/nhpoly1305 - add NEON-accelerated NHPoly1305") Fixes: 16aae3595a9d ("crypto: arm/nhpoly1305 - add NEON-accelerated NHPoly1305") Cc: stable@vger.kernel.org Signed-off-by: Jason A. Donenfeld Reviewed-by: Eric Biggers Signed-off-by: Herbert Xu --- arch/arm/crypto/nhpoly1305-neon-glue.c | 2 +- arch/arm64/crypto/nhpoly1305-neon-glue.c | 2 +- arch/x86/crypto/nhpoly1305-avx2-glue.c | 2 +- arch/x86/crypto/nhpoly1305-sse2-glue.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/crypto/nhpoly1305-neon-glue.c b/arch/arm/crypto/nhpoly1305-neon-glue.c index ae5aefc44a4d..ffa8d73fe722 100644 --- a/arch/arm/crypto/nhpoly1305-neon-glue.c +++ b/arch/arm/crypto/nhpoly1305-neon-glue.c @@ -30,7 +30,7 @@ static int nhpoly1305_neon_update(struct shash_desc *desc, return crypto_nhpoly1305_update(desc, src, srclen); do { - unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_neon_begin(); crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon); diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c index 895d3727c1fb..c5405e6a6db7 100644 --- a/arch/arm64/crypto/nhpoly1305-neon-glue.c +++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c @@ -30,7 +30,7 @@ static int nhpoly1305_neon_update(struct shash_desc *desc, return crypto_nhpoly1305_update(desc, src, srclen); do { - unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_neon_begin(); crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon); diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c index f7567cbd35b6..80fcb85736e1 100644 --- a/arch/x86/crypto/nhpoly1305-avx2-glue.c +++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c @@ -29,7 +29,7 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc, return crypto_nhpoly1305_update(desc, src, srclen); do { - unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_fpu_begin(); crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2); diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c index a661ede3b5cf..cc6b7c1a2705 100644 --- a/arch/x86/crypto/nhpoly1305-sse2-glue.c +++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c @@ -29,7 +29,7 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc, return crypto_nhpoly1305_update(desc, src, srclen); do { - unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + unsigned int n = min_t(unsigned int, srclen, SZ_4K); kernel_fpu_begin(); crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2); From 6f8280cec1c9dfcd69c98e124ae6e0b61115158b Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Fri, 24 Apr 2020 11:26:38 +0530 Subject: [PATCH 093/300] MAINTAINERS: Add Vinod Koul as Generic PHY co-maintainer Add Vinod Koul as Generic PHY Subsystem co-maintainer and move the linux-phy to a shared repository. Cc: Vinod Koul Acked-By: Vinod Koul Signed-off-by: Kishon Vijay Abraham I --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e64e5db31497..6c7adc14c389 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7115,9 +7115,10 @@ F: include/uapi/asm-generic/ GENERIC PHY FRAMEWORK M: Kishon Vijay Abraham I +M: Vinod Koul L: linux-kernel@vger.kernel.org S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/kishon/linux-phy.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/phy/linux-phy.git F: Documentation/devicetree/bindings/phy/ F: drivers/phy/ F: include/linux/phy/ From 820eeb9de62f1f479c04fc6575d874611b1e2095 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 7 Apr 2020 18:28:54 -0700 Subject: [PATCH 094/300] phy: qualcomm: usb-hs-28nm: Prepare clocks in init The AHB clock must be on for qcom_snps_hsphy_init() to be able to write the initialization sequence to the hardware, so move the clock enablement to phy init and exit. Fixes: 67b27dbeac4d ("phy: qualcomm: Add Synopsys 28nm Hi-Speed USB PHY driver") Signed-off-by: Bjorn Andersson Reviewed-by: Bryan O'Donoghue Signed-off-by: Vinod Koul Signed-off-by: Kishon Vijay Abraham I --- drivers/phy/qualcomm/phy-qcom-usb-hs-28nm.c | 32 ++++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/phy/qualcomm/phy-qcom-usb-hs-28nm.c b/drivers/phy/qualcomm/phy-qcom-usb-hs-28nm.c index d998e65c89c8..a52a9bf13b75 100644 --- a/drivers/phy/qualcomm/phy-qcom-usb-hs-28nm.c +++ b/drivers/phy/qualcomm/phy-qcom-usb-hs-28nm.c @@ -160,18 +160,11 @@ static int qcom_snps_hsphy_power_on(struct phy *phy) ret = regulator_bulk_enable(VREG_NUM, priv->vregs); if (ret) return ret; - ret = clk_bulk_prepare_enable(priv->num_clks, priv->clks); - if (ret) - goto err_disable_regulator; + qcom_snps_hsphy_disable_hv_interrupts(priv); qcom_snps_hsphy_exit_retention(priv); return 0; - -err_disable_regulator: - regulator_bulk_disable(VREG_NUM, priv->vregs); - - return ret; } static int qcom_snps_hsphy_power_off(struct phy *phy) @@ -180,7 +173,6 @@ static int qcom_snps_hsphy_power_off(struct phy *phy) qcom_snps_hsphy_enter_retention(priv); qcom_snps_hsphy_enable_hv_interrupts(priv); - clk_bulk_disable_unprepare(priv->num_clks, priv->clks); regulator_bulk_disable(VREG_NUM, priv->vregs); return 0; @@ -266,21 +258,39 @@ static int qcom_snps_hsphy_init(struct phy *phy) struct hsphy_priv *priv = phy_get_drvdata(phy); int ret; - ret = qcom_snps_hsphy_reset(priv); + ret = clk_bulk_prepare_enable(priv->num_clks, priv->clks); if (ret) return ret; + ret = qcom_snps_hsphy_reset(priv); + if (ret) + goto disable_clocks; + qcom_snps_hsphy_init_sequence(priv); ret = qcom_snps_hsphy_por_reset(priv); if (ret) - return ret; + goto disable_clocks; + + return 0; + +disable_clocks: + clk_bulk_disable_unprepare(priv->num_clks, priv->clks); + return ret; +} + +static int qcom_snps_hsphy_exit(struct phy *phy) +{ + struct hsphy_priv *priv = phy_get_drvdata(phy); + + clk_bulk_disable_unprepare(priv->num_clks, priv->clks); return 0; } static const struct phy_ops qcom_snps_hsphy_ops = { .init = qcom_snps_hsphy_init, + .exit = qcom_snps_hsphy_exit, .power_on = qcom_snps_hsphy_power_on, .power_off = qcom_snps_hsphy_power_off, .set_mode = qcom_snps_hsphy_set_mode, From 9f04db234af691007bb785342a06abab5fb34474 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 29 Apr 2020 17:52:18 +0200 Subject: [PATCH 095/300] USB: uas: add quirk for LaCie 2Big Quadra MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This device needs US_FL_NO_REPORT_OPCODES to avoid going through prolonged error handling on enumeration. Signed-off-by: Oliver Neukum Reported-by: Julian Groß Cc: stable Link: https://lore.kernel.org/r/20200429155218.7308-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_uas.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h index 1b23741036ee..37157ed9a881 100644 --- a/drivers/usb/storage/unusual_uas.h +++ b/drivers/usb/storage/unusual_uas.h @@ -28,6 +28,13 @@ * and don't forget to CC: the USB development list */ +/* Reported-by: Julian Groß */ +UNUSUAL_DEV(0x059f, 0x105f, 0x0000, 0x9999, + "LaCie", + "2Big Quadra USB3", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_NO_REPORT_OPCODES), + /* * Apricorn USB3 dongle sometimes returns "USBSUSBSUSBS" in response to SCSI * commands in UAS mode. Observed with the 1.28 firmware; are there others? From 6aea9e050394e83ac9fbd9fb0cb77c173e5bcae1 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 13 Apr 2020 16:10:16 -0700 Subject: [PATCH 096/300] KVM: arm64: Delete duplicated label in invalid_vector SYM_CODE_START defines \label , so it is redundant to define \label again. A redefinition at the same place is accepted by GNU as (https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=159fbb6088f17a341bcaaac960623cab881b4981) but rejected by the clang integrated assembler. Fixes: 617a2f392c92 ("arm64: kvm: Annotate assembly using modern annoations") Signed-off-by: Fangrui Song Signed-off-by: Marc Zyngier Tested-by: Nick Desaulniers Reviewed-by: Nick Desaulniers Link: https://github.com/ClangBuiltLinux/linux/issues/988 Link: https://lore.kernel.org/r/20200413231016.250737-1-maskray@google.com --- arch/arm64/kvm/hyp/hyp-entry.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index c2a13ab3c471..9c5cfb04170e 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -198,7 +198,6 @@ SYM_CODE_END(__hyp_panic) .macro invalid_vector label, target = __hyp_panic .align 2 SYM_CODE_START(\label) -\label: b \target SYM_CODE_END(\label) .endm From 6e977984f6d8e5689e079de1fd2e337cd17dcca5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 24 Apr 2020 14:24:34 +0100 Subject: [PATCH 097/300] KVM: arm64: Save/restore sp_el0 as part of __guest_enter We currently save/restore sp_el0 in C code. This is a bit unsafe, as a lot of the C code expects 'current' to be accessible from there (and the opportunity to run kernel code in HYP is specially great with VHE). Instead, let's move the save/restore of sp_el0 to the assembly code (in __guest_enter), making sure that sp_el0 is correct very early on when we exit the guest, and is preserved as long as possible to its host value when we enter the guest. Reviewed-by: Andrew Jones Acked-by: Mark Rutland Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/entry.S | 23 +++++++++++++++++++++++ arch/arm64/kvm/hyp/sysreg-sr.c | 17 +++-------------- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index d22d0534dd60..90186cf6473e 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -18,6 +18,7 @@ #define CPU_GP_REG_OFFSET(x) (CPU_GP_REGS + x) #define CPU_XREG_OFFSET(x) CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x) +#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8) .text .pushsection .hyp.text, "ax" @@ -47,6 +48,16 @@ ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)] .endm +.macro save_sp_el0 ctxt, tmp + mrs \tmp, sp_el0 + str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] +.endm + +.macro restore_sp_el0 ctxt, tmp + ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET] + msr sp_el0, \tmp +.endm + /* * u64 __guest_enter(struct kvm_vcpu *vcpu, * struct kvm_cpu_context *host_ctxt); @@ -60,6 +71,9 @@ SYM_FUNC_START(__guest_enter) // Store the host regs save_callee_saved_regs x1 + // Save the host's sp_el0 + save_sp_el0 x1, x2 + // Now the host state is stored if we have a pending RAS SError it must // affect the host. If any asynchronous exception is pending we defer // the guest entry. The DSB isn't necessary before v8.2 as any SError @@ -83,6 +97,9 @@ alternative_else_nop_endif // when this feature is enabled for kernel code. ptrauth_switch_to_guest x29, x0, x1, x2 + // Restore the guest's sp_el0 + restore_sp_el0 x29, x0 + // Restore guest regs x0-x17 ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)] ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] @@ -130,6 +147,9 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // Store the guest regs x18-x29, lr save_callee_saved_regs x1 + // Store the guest's sp_el0 + save_sp_el0 x1, x2 + get_host_ctxt x2, x3 // Macro ptrauth_switch_to_guest format: @@ -139,6 +159,9 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // when this feature is enabled for kernel code. ptrauth_switch_to_host x1, x2, x3, x4, x5 + // Restore the hosts's sp_el0 + restore_sp_el0 x2, x3 + // Now restore the host regs restore_callee_saved_regs x2 diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index 75b1925763f1..6d2df9fe0b5d 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -15,8 +15,9 @@ /* * Non-VHE: Both host and guest must save everything. * - * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and pstate, - * which are handled as part of the el2 return state) on every switch. + * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and + * pstate, which are handled as part of the el2 return state) on every + * switch (sp_el0 is being dealt with in the assembly code). * tpidr_el0 and tpidrro_el0 only need to be switched when going * to host userspace or a different VCPU. EL1 registers only need to be * switched when potentially going to run a different VCPU. The latter two @@ -26,12 +27,6 @@ static void __hyp_text __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { ctxt->sys_regs[MDSCR_EL1] = read_sysreg(mdscr_el1); - - /* - * The host arm64 Linux uses sp_el0 to point to 'current' and it must - * therefore be saved/restored on every entry/exit to/from the guest. - */ - ctxt->gp_regs.regs.sp = read_sysreg(sp_el0); } static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt) @@ -99,12 +94,6 @@ NOKPROBE_SYMBOL(sysreg_save_guest_state_vhe); static void __hyp_text __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) { write_sysreg(ctxt->sys_regs[MDSCR_EL1], mdscr_el1); - - /* - * The host arm64 Linux uses sp_el0 to point to 'current' and it must - * therefore be saved/restored on every entry/exit to/from the guest. - */ - write_sysreg(ctxt->gp_regs.regs.sp, sp_el0); } static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt) From 958e8e14fd24c0aa1fe6a6beb2b985cb638577d6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 24 Apr 2020 15:30:30 +0100 Subject: [PATCH 098/300] KVM: arm64: vgic-v4: Initialize GICv4.1 even in the absence of a virtual ITS KVM now expects to be able to use HW-accelerated delivery of vSGIs as soon as the guest has enabled thm. Unfortunately, we only initialize the GICv4 context if we have a virtual ITS exposed to the guest. Fix it by always initializing the GICv4.1 context if it is available on the host. Fixes: 2291ff2f2a56 ("KVM: arm64: GICv4.1: Plumb SGI implementation selection in the distributor") Reviewed-by: Zenghui Yu Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-init.c | 9 ++++++++- virt/kvm/arm/vgic/vgic-mmio-v3.c | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index 30dbec9fe0b4..32e32d67a127 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -294,8 +294,15 @@ int vgic_init(struct kvm *kvm) } } - if (vgic_has_its(kvm)) { + if (vgic_has_its(kvm)) vgic_lpi_translation_cache_init(kvm); + + /* + * If we have GICv4.1 enabled, unconditionnaly request enable the + * v4 support so that we get HW-accelerated vSGIs. Otherwise, only + * enable it if we present a virtual ITS to the guest. + */ + if (vgic_supports_direct_msis(kvm)) { ret = vgic_v4_init(kvm); if (ret) goto out; diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 416613f2400c..89a14ec8b33b 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -50,7 +50,8 @@ bool vgic_has_its(struct kvm *kvm) bool vgic_supports_direct_msis(struct kvm *kvm) { - return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm); + return (kvm_vgic_global_state.has_gicv4_1 || + (kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm))); } /* From 799499850ae956c3ec3aceb84223bfb33b2bdcfc Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 20 Apr 2020 04:20:41 -0500 Subject: [PATCH 099/300] net/mlx5: E-switch, Fix error unwinding flow for steering init failure Error unwinding is done incorrectly in the cited commit. When steering init fails, there is no need to perform steering cleanup. When vport error exists, error cleanup should be mirror of the setup routine, i.e. to perform steering cleanup before metadata cleanup. This avoids the call trace in accessing uninitialized objects which are skipped during steering_init() due to failure in steering_init(). Call trace: mlx5_cmd_modify_header_alloc:805:(pid 21128): too many modify header actions 1, max supported 0 E-Switch: Failed to create restore mod header BUG: kernel NULL pointer dereference, address: 00000000000000d0 [ 677.263079] mlx5_destroy_flow_group+0x13/0x80 [mlx5_core] [ 677.268921] esw_offloads_steering_cleanup+0x51/0xf0 [mlx5_core] [ 677.275281] esw_offloads_enable+0x1a5/0x800 [mlx5_core] [ 677.280949] mlx5_eswitch_enable_locked+0x155/0x860 [mlx5_core] [ 677.287227] mlx5_devlink_eswitch_mode_set+0x1af/0x320 [ 677.293741] devlink_nl_cmd_eswitch_set_doit+0x41/0xb0 [ 677.299217] genl_rcv_msg+0x1eb/0x430 Fixes: 7983a675ba65 ("net/mlx5: E-Switch, Enable chains only if regs loopback is enabled") Signed-off-by: Parav Pandit Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index b2e38e0cde97..94d6c91a8612 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -2377,9 +2377,9 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) err_vports: esw_offloads_unload_rep(esw, MLX5_VPORT_UPLINK); err_uplink: - esw_set_passing_vport_metadata(esw, false); -err_steering_init: esw_offloads_steering_cleanup(esw); +err_steering_init: + esw_set_passing_vport_metadata(esw, false); err_vport_metadata: mlx5_rdma_disable_roce(esw->dev); mutex_destroy(&esw->offloads.termtbl_mutex); From e9864539053ae15c2d6475833f62d7383f9271ce Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 20 Apr 2020 04:32:48 -0500 Subject: [PATCH 100/300] net/mlx5: E-switch, Fix printing wrong error value When mlx5_modify_header_alloc() fails, instead of printing the error value returned, current error log prints 0. Fix by printing correct error value returned by mlx5_modify_header_alloc(). Fixes: 6724e66b90ee ("net/mlx5: E-Switch, Get reg_c1 value on miss") Signed-off-by: Parav Pandit Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 94d6c91a8612..8289af360e8d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1550,9 +1550,9 @@ static int esw_create_restore_table(struct mlx5_eswitch *esw) MLX5_FLOW_NAMESPACE_KERNEL, 1, modact); if (IS_ERR(mod_hdr)) { + err = PTR_ERR(mod_hdr); esw_warn(dev, "Failed to create restore mod header, err: %d\n", err); - err = PTR_ERR(mod_hdr); goto err_mod_hdr; } From f8d1eddaf94abdc459ccfb881aa7233cb9f7f39a Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 21 Apr 2020 05:36:07 -0500 Subject: [PATCH 101/300] net/mlx5: E-switch, Fix mutex init order In cited patch mutex is initialized after its used. Below call trace is observed. Fix the order to initialize the mutex early enough. Similarly follow mirror sequence during cleanup. kernel: DEBUG_LOCKS_WARN_ON(lock->magic != lock) kernel: WARNING: CPU: 5 PID: 45916 at kernel/locking/mutex.c:938 __mutex_lock+0x7d6/0x8a0 kernel: Call Trace: kernel: ? esw_vport_tbl_get+0x3b/0x250 [mlx5_core] kernel: ? mark_held_locks+0x55/0x70 kernel: ? __slab_free+0x274/0x400 kernel: ? lockdep_hardirqs_on+0x140/0x1d0 kernel: esw_vport_tbl_get+0x3b/0x250 [mlx5_core] kernel: ? mlx5_esw_chains_create_fdb_prio+0xa57/0xc20 [mlx5_core] kernel: mlx5_esw_vport_tbl_get+0x88/0xf0 [mlx5_core] kernel: mlx5_esw_chains_create+0x2f3/0x3e0 [mlx5_core] kernel: esw_create_offloads_fdb_tables+0x11d/0x580 [mlx5_core] kernel: esw_offloads_enable+0x26d/0x540 [mlx5_core] kernel: mlx5_eswitch_enable_locked+0x155/0x860 [mlx5_core] kernel: mlx5_devlink_eswitch_mode_set+0x1af/0x320 [mlx5_core] kernel: devlink_nl_cmd_eswitch_set_doit+0x41/0xb0 Fixes: 96e326878fa5 ("net/mlx5e: Eswitch, Use per vport tables for mirroring") Signed-off-by: Parav Pandit Reviewed-by: Roi Dayan Reviewed-by: Eli Cohen Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 8289af360e8d..5d9def18ae3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -2219,10 +2219,12 @@ static int esw_offloads_steering_init(struct mlx5_eswitch *esw) total_vports = num_vfs + MLX5_SPECIAL_VPORTS(esw->dev); memset(&esw->fdb_table.offloads, 0, sizeof(struct offloads_fdb)); + mutex_init(&esw->fdb_table.offloads.vports.lock); + hash_init(esw->fdb_table.offloads.vports.table); err = esw_create_uplink_offloads_acl_tables(esw); if (err) - return err; + goto create_acl_err; err = esw_create_offloads_table(esw, total_vports); if (err) @@ -2240,9 +2242,6 @@ static int esw_offloads_steering_init(struct mlx5_eswitch *esw) if (err) goto create_fg_err; - mutex_init(&esw->fdb_table.offloads.vports.lock); - hash_init(esw->fdb_table.offloads.vports.table); - return 0; create_fg_err: @@ -2253,18 +2252,19 @@ create_restore_err: esw_destroy_offloads_table(esw); create_offloads_err: esw_destroy_uplink_offloads_acl_tables(esw); - +create_acl_err: + mutex_destroy(&esw->fdb_table.offloads.vports.lock); return err; } static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw) { - mutex_destroy(&esw->fdb_table.offloads.vports.lock); esw_destroy_vport_rx_group(esw); esw_destroy_offloads_fdb_tables(esw); esw_destroy_restore_table(esw); esw_destroy_offloads_table(esw); esw_destroy_uplink_offloads_acl_tables(esw); + mutex_destroy(&esw->fdb_table.offloads.vports.lock); } static void From 8075411d93b6efe143d9f606f6531077795b7fbf Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Wed, 25 Mar 2020 17:19:43 +0200 Subject: [PATCH 102/300] net/mlx5: DR, On creation set CQ's arm_db member to right value In polling mode, set arm_db member to a value that will avoid CQ event recovery by the HW. Otherwise we might get event without completion function. In addition,empty completion function to was added to protect from unexpected events. Fixes: 297cccebdc5a ("net/mlx5: DR, Expose an internal API to issue RDMA operations") Signed-off-by: Erez Shitrit Reviewed-by: Tariq Toukan Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/steering/dr_send.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index c0ab9cf74929..18719acb7e54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -695,6 +695,12 @@ static void dr_cq_event(struct mlx5_core_cq *mcq, pr_info("CQ event %u on CQ #%u\n", event, mcq->cqn); } +static void dr_cq_complete(struct mlx5_core_cq *mcq, + struct mlx5_eqe *eqe) +{ + pr_err("CQ completion CQ: #%u\n", mcq->cqn); +} + static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, struct mlx5_uars_page *uar, size_t ncqe) @@ -756,6 +762,7 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas); cq->mcq.event = dr_cq_event; + cq->mcq.comp = dr_cq_complete; err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); kvfree(in); @@ -767,7 +774,12 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, cq->mcq.set_ci_db = cq->wq_ctrl.db.db; cq->mcq.arm_db = cq->wq_ctrl.db.db + 1; *cq->mcq.set_ci_db = 0; - *cq->mcq.arm_db = 0; + + /* set no-zero value, in order to avoid the HW to run db-recovery on + * CQ that used in polling mode. + */ + *cq->mcq.arm_db = cpu_to_be32(2 << 28); + cq->mcq.vector = 0; cq->mcq.irqn = irqn; cq->mcq.uar = uar; From f3cb3cebe26ed4c8036adbd9448b372129d3c371 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 21 Jul 2019 08:40:13 +0300 Subject: [PATCH 103/300] net/mlx5: Fix forced completion access non initialized command entry mlx5_cmd_flush() will trigger forced completions to all valid command entries. Triggered by an asynch event such as fast teardown it can happen at any stage of the command, including command initialization. It will trigger forced completion and that can lead to completion on an uninitialized command entry. Setting MLX5_CMD_ENT_STATE_PENDING_COMP only after command entry is initialized will ensure force completion is treated only if command entry is initialized. Fixes: 73dd3a4839c1 ("net/mlx5: Avoid using pending command interface slots") Signed-off-by: Moshe Shemesh Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 34cba97f7bf4..d7470f8d355e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -888,7 +888,6 @@ static void cmd_work_handler(struct work_struct *work) } cmd->ent_arr[ent->idx] = ent; - set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); lay = get_inst(cmd, ent->idx); ent->lay = lay; memset(lay, 0, sizeof(*lay)); @@ -910,6 +909,7 @@ static void cmd_work_handler(struct work_struct *work) if (ent->callback) schedule_delayed_work(&ent->cb_timeout_work, cb_timeout); + set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); /* Skip sending command to fw if internal error */ if (pci_channel_offline(dev->pdev) || From cece6f432cca9f18900463ed01b97a152a03600a Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 23 Feb 2020 03:27:41 +0200 Subject: [PATCH 104/300] net/mlx5: Fix command entry leak in Internal Error State Processing commands by cmd_work_handler() while already in Internal Error State will result in entry leak, since the handler process force completion without doorbell. Forced completion doesn't release the entry and event completion will never arrive, so entry should be released. Fixes: 73dd3a4839c1 ("net/mlx5: Avoid using pending command interface slots") Signed-off-by: Moshe Shemesh Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index d7470f8d355e..cede5bdfd598 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -922,6 +922,10 @@ static void cmd_work_handler(struct work_struct *work) MLX5_SET(mbox_out, ent->out, syndrome, drv_synd); mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); + /* no doorbell, no need to keep the entry */ + free_ent(cmd, ent->idx); + if (ent->callback) + free_cmd(ent); return; } From 67b38de646894c9a94fe4d6d17719e70cc6028eb Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Thu, 23 Apr 2020 12:37:21 +0300 Subject: [PATCH 105/300] net/mlx5e: Fix q counters on uplink representors Need to allocate the q counters before init_rx which needs them when creating the rq. Fixes: 8520fa57a4e9 ("net/mlx5e: Create q counters on uplink representors") Signed-off-by: Roi Dayan Reviewed-by: Vlad Buslov Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 55457f268495..f372e94948fd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1773,19 +1773,14 @@ static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv) static int mlx5e_init_ul_rep_rx(struct mlx5e_priv *priv) { - int err = mlx5e_init_rep_rx(priv); - - if (err) - return err; - mlx5e_create_q_counters(priv); - return 0; + return mlx5e_init_rep_rx(priv); } static void mlx5e_cleanup_ul_rep_rx(struct mlx5e_priv *priv) { - mlx5e_destroy_q_counters(priv); mlx5e_cleanup_rep_rx(priv); + mlx5e_destroy_q_counters(priv); } static int mlx5e_init_uplink_rep_tx(struct mlx5e_rep_priv *rpriv) From ab5130186d7476dcee0d4e787d19a521ca552ce9 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Wed, 22 Apr 2020 20:13:55 -0700 Subject: [PATCH 106/300] x86/mm/cpa: Flush direct map alias during cpa As an optimization, cpa_flush() was changed to optionally only flush the range in @cpa if it was small enough. However, this range does not include any direct map aliases changed in cpa_process_alias(). So small set_memory_() calls that touch that alias don't get the direct map changes flushed. This situation can happen when the virtual address taking variants are passed an address in vmalloc or modules space. In these cases, force a full TLB flush. Note this issue does not extend to cases where the set_memory_() calls are passed a direct map address, or page array, etc, as the primary target. In those cases the direct map would be flushed. Fixes: 935f5839827e ("x86/mm/cpa: Optimize cpa_flush_array() TLB invalidation") Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200424105343.GA20730@hirez.programming.kicks-ass.net --- arch/x86/mm/pat/set_memory.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 59eca6a94ce7..b8c55a2e402d 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -43,7 +43,8 @@ struct cpa_data { unsigned long pfn; unsigned int flags; unsigned int force_split : 1, - force_static_prot : 1; + force_static_prot : 1, + force_flush_all : 1; struct page **pages; }; @@ -355,10 +356,10 @@ static void cpa_flush(struct cpa_data *data, int cache) return; } - if (cpa->numpages <= tlb_single_page_flush_ceiling) - on_each_cpu(__cpa_flush_tlb, cpa, 1); - else + if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) flush_tlb_all(); + else + on_each_cpu(__cpa_flush_tlb, cpa, 1); if (!cache) return; @@ -1598,6 +1599,8 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; + cpa->force_flush_all = 1; + ret = __change_page_attr_set_clr(&alias_cpa, 0); if (ret) return ret; @@ -1618,6 +1621,7 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; + cpa->force_flush_all = 1; /* * The high mapping range is imprecise, so ignore the * return value. From 263e1201a2c324b60b15ecda5de9ebf1e7293e31 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 30 Apr 2020 15:01:51 +0200 Subject: [PATCH 107/300] mptcp: consolidate synack processing. Currently the MPTCP code uses 2 hooks to process syn-ack packets, mptcp_rcv_synsent() and the sk_rx_dst_set() callback. We can drop the first, moving the relevant code into the latter, reducing the hooking into the TCP code. This is also needed by the next patch. v1 -> v2: - use local tcp sock ptr instead of casting the sk variable several times - DaveM Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/mptcp.h | 1 - net/ipv4/tcp_input.c | 3 --- net/mptcp/options.c | 22 ---------------------- net/mptcp/subflow.c | 27 ++++++++++++++++++++++++--- 4 files changed, 24 insertions(+), 29 deletions(-) diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 0e7c5471010b..4ecfa7d5e0c7 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -72,7 +72,6 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts); -void mptcp_rcv_synsent(struct sock *sk); bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf4ced9273e8..81425542da44 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5990,9 +5990,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); - if (sk_is_mptcp(sk)) - mptcp_rcv_synsent(sk); - /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 4a7c467b99db..8fea686a5562 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -344,28 +344,6 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, return false; } -void mptcp_rcv_synsent(struct sock *sk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct tcp_sock *tp = tcp_sk(sk); - - if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { - subflow->mp_capable = 1; - subflow->can_ack = 1; - subflow->remote_key = tp->rx_opt.mptcp.sndr_key; - pr_debug("subflow=%p, remote_key=%llu", subflow, - subflow->remote_key); - } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) { - subflow->mp_join = 1; - subflow->thmac = tp->rx_opt.mptcp.thmac; - subflow->remote_nonce = tp->rx_opt.mptcp.nonce; - pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, - subflow->thmac, subflow->remote_nonce); - } else if (subflow->request_mptcp) { - tcp_sk(sk)->is_mptcp = 0; - } -} - /* MP_JOIN client subflow must wait for 4th ack before sending any data: * TCP can't schedule delack timer before the subflow is fully established. * MPTCP uses the delack timer to do 3rd ack retransmissions diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 71256f03707f..84f6408594c9 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -222,6 +222,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; + struct tcp_sock *tp = tcp_sk(sk); subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); @@ -230,14 +231,35 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) parent->sk_state_change(parent); } - if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp) + /* be sure no special action on any packet other than syn-ack */ + if (subflow->conn_finished) + return; + + subflow->conn_finished = 1; + + if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { + subflow->mp_capable = 1; + subflow->can_ack = 1; + subflow->remote_key = tp->rx_opt.mptcp.sndr_key; + pr_debug("subflow=%p, remote_key=%llu", subflow, + subflow->remote_key); + } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) { + subflow->mp_join = 1; + subflow->thmac = tp->rx_opt.mptcp.thmac; + subflow->remote_nonce = tp->rx_opt.mptcp.nonce; + pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, + subflow->thmac, subflow->remote_nonce); + } else if (subflow->request_mptcp) { + tp->is_mptcp = 0; + } + + if (!tp->is_mptcp) return; if (subflow->mp_capable) { pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), subflow->remote_key); mptcp_finish_connect(sk); - subflow->conn_finished = 1; if (skb) { pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); @@ -264,7 +286,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (!mptcp_finish_join(sk)) goto do_reset; - subflow->conn_finished = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); } else { do_reset: From cfde141ea3faa30e362bbdb5c28001bbbdb0b8e0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 30 Apr 2020 15:01:52 +0200 Subject: [PATCH 108/300] mptcp: move option parsing into mptcp_incoming_options() The mptcp_options_received structure carries several per packet flags (mp_capable, mp_join, etc.). Such fields must be cleared on each packet, even on dropped ones or packet not carrying any MPTCP options, but the current mptcp code clears them only on TCP option reset. On several races/corner cases we end-up with stray bits in incoming options, leading to WARN_ON splats. e.g.: [ 171.164906] Bad mapping: ssn=32714 map_seq=1 map_data_len=32713 [ 171.165006] WARNING: CPU: 1 PID: 5026 at net/mptcp/subflow.c:533 warn_bad_map (linux-mptcp/net/mptcp/subflow.c:533 linux-mptcp/net/mptcp/subflow.c:531) [ 171.167632] Modules linked in: ip6_vti ip_vti ip_gre ipip sit tunnel4 ip_tunnel geneve ip6_udp_tunnel udp_tunnel macsec macvtap tap ipvlan macvlan 8021q garp mrp xfrm_interface veth netdevsim nlmon dummy team bonding vcan bridge stp llc ip6_gre gre ip6_tunnel tunnel6 tun binfmt_misc intel_rapl_msr intel_rapl_common rfkill kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel joydev virtio_balloon pcspkr i2c_piix4 sunrpc ip_tables xfs libcrc32c crc32c_intel serio_raw virtio_console ata_generic virtio_blk virtio_net net_failover failover ata_piix libata [ 171.199464] CPU: 1 PID: 5026 Comm: repro Not tainted 5.7.0-rc1.mptcp_f227fdf5d388+ #95 [ 171.200886] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014 [ 171.202546] RIP: 0010:warn_bad_map (linux-mptcp/net/mptcp/subflow.c:533 linux-mptcp/net/mptcp/subflow.c:531) [ 171.206537] Code: c1 ea 03 0f b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 04 84 d2 75 1d 8b 55 3c 44 89 e6 48 c7 c7 20 51 13 95 e8 37 8b 22 fe <0f> 0b 48 83 c4 08 5b 5d 41 5c c3 89 4c 24 04 e8 db d6 94 fe 8b 4c [ 171.220473] RSP: 0018:ffffc90000150560 EFLAGS: 00010282 [ 171.221639] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 171.223108] RDX: 0000000000000000 RSI: 0000000000000008 RDI: fffff5200002a09e [ 171.224388] RBP: ffff8880aa6e3c00 R08: 0000000000000001 R09: fffffbfff2ec9955 [ 171.225706] R10: ffffffff9764caa7 R11: fffffbfff2ec9954 R12: 0000000000007fca [ 171.227211] R13: ffff8881066f4a7f R14: ffff8880aa6e3c00 R15: 0000000000000020 [ 171.228460] FS: 00007f8623719740(0000) GS:ffff88810be00000(0000) knlGS:0000000000000000 [ 171.230065] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 171.231303] CR2: 00007ffdab190a50 CR3: 00000001038ea006 CR4: 0000000000160ee0 [ 171.232586] Call Trace: [ 171.233109] [ 171.233531] get_mapping_status (linux-mptcp/net/mptcp/subflow.c:691) [ 171.234371] mptcp_subflow_data_available (linux-mptcp/net/mptcp/subflow.c:736 linux-mptcp/net/mptcp/subflow.c:832) [ 171.238181] subflow_state_change (linux-mptcp/net/mptcp/subflow.c:1085 (discriminator 1)) [ 171.239066] tcp_fin (linux-mptcp/net/ipv4/tcp_input.c:4217) [ 171.240123] tcp_data_queue (linux-mptcp/./include/linux/compiler.h:199 linux-mptcp/net/ipv4/tcp_input.c:4822) [ 171.245083] tcp_rcv_established (linux-mptcp/./include/linux/skbuff.h:1785 linux-mptcp/./include/net/tcp.h:1774 linux-mptcp/./include/net/tcp.h:1847 linux-mptcp/net/ipv4/tcp_input.c:5238 linux-mptcp/net/ipv4/tcp_input.c:5730) [ 171.254089] tcp_v4_rcv (linux-mptcp/./include/linux/spinlock.h:393 linux-mptcp/net/ipv4/tcp_ipv4.c:2009) [ 171.258969] ip_protocol_deliver_rcu (linux-mptcp/net/ipv4/ip_input.c:204 (discriminator 1)) [ 171.260214] ip_local_deliver_finish (linux-mptcp/./include/linux/rcupdate.h:651 linux-mptcp/net/ipv4/ip_input.c:232) [ 171.261389] ip_local_deliver (linux-mptcp/./include/linux/netfilter.h:307 linux-mptcp/./include/linux/netfilter.h:301 linux-mptcp/net/ipv4/ip_input.c:252) [ 171.265884] ip_rcv (linux-mptcp/./include/linux/netfilter.h:307 linux-mptcp/./include/linux/netfilter.h:301 linux-mptcp/net/ipv4/ip_input.c:539) [ 171.273666] process_backlog (linux-mptcp/./include/linux/rcupdate.h:651 linux-mptcp/net/core/dev.c:6135) [ 171.275328] net_rx_action (linux-mptcp/net/core/dev.c:6572 linux-mptcp/net/core/dev.c:6640) [ 171.280472] __do_softirq (linux-mptcp/./arch/x86/include/asm/jump_label.h:25 linux-mptcp/./include/linux/jump_label.h:200 linux-mptcp/./include/trace/events/irq.h:142 linux-mptcp/kernel/softirq.c:293) [ 171.281379] do_softirq_own_stack (linux-mptcp/arch/x86/entry/entry_64.S:1083) [ 171.282358] We could address the issue clearing explicitly the relevant fields in several places - tcp_parse_option, tcp_fast_parse_options, possibly others. Instead we move the MPTCP option parsing into the already existing mptcp ingress hook, so that we need to clear the fields in a single place. This allows us dropping an MPTCP hook from the TCP code and removing the quite large mptcp_options_received from the tcp_sock struct. On the flip side, the MPTCP sockets will traverse the option space twice (in tcp_parse_option() and in mptcp_incoming_options(). That looks acceptable: we already do that for syn and 3rd ack packets, plain TCP socket will benefit from it, and even MPTCP sockets will experience better code locality, reducing the jumps between TCP and MPTCP code. v1 -> v2: - rebased on current '-net' tree Fixes: 648ef4b88673 ("mptcp: Implement MPTCP receive path") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/tcp.h | 51 ---------------------------------- include/net/mptcp.h | 2 -- net/ipv4/tcp_input.c | 4 --- net/mptcp/options.c | 66 +++++++++++++++++++++++++------------------- net/mptcp/protocol.c | 6 ++-- net/mptcp/protocol.h | 43 +++++++++++++++++++++++++++-- net/mptcp/subflow.c | 65 ++++++++++++++++++++++--------------------- 7 files changed, 115 insertions(+), 122 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 421c99c12291..4f8159e90ce1 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -78,47 +78,6 @@ struct tcp_sack_block { #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ -#if IS_ENABLED(CONFIG_MPTCP) -struct mptcp_options_received { - u64 sndr_key; - u64 rcvr_key; - u64 data_ack; - u64 data_seq; - u32 subflow_seq; - u16 data_len; - u16 mp_capable : 1, - mp_join : 1, - dss : 1, - add_addr : 1, - rm_addr : 1, - family : 4, - echo : 1, - backup : 1; - u32 token; - u32 nonce; - u64 thmac; - u8 hmac[20]; - u8 join_id; - u8 use_map:1, - dsn64:1, - data_fin:1, - use_ack:1, - ack64:1, - mpc_map:1, - __unused:2; - u8 addr_id; - u8 rm_id; - union { - struct in_addr addr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - struct in6_addr addr6; -#endif - }; - u64 ahmac; - u16 port; -}; -#endif - struct tcp_options_received { /* PAWS/RTTM data */ int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ @@ -136,9 +95,6 @@ struct tcp_options_received { u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ -#if IS_ENABLED(CONFIG_MPTCP) - struct mptcp_options_received mptcp; -#endif }; static inline void tcp_clear_options(struct tcp_options_received *rx_opt) @@ -148,13 +104,6 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) #if IS_ENABLED(CONFIG_SMC) rx_opt->smc_ok = 0; #endif -#if IS_ENABLED(CONFIG_MPTCP) - rx_opt->mptcp.mp_capable = 0; - rx_opt->mptcp.mp_join = 0; - rx_opt->mptcp.add_addr = 0; - rx_opt->mptcp.rm_addr = 0; - rx_opt->mptcp.dss = 0; -#endif } /* This is the max number of SACKS that we'll generate and process. It's safe diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 4ecfa7d5e0c7..3bce2019e4da 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -68,8 +68,6 @@ static inline bool rsk_is_mptcp(const struct request_sock *req) return tcp_rsk(req)->is_mptcp; } -void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, - int opsize, struct tcp_options_received *opt_rx); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 81425542da44..b996dc1069c5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3926,10 +3926,6 @@ void tcp_parse_options(const struct net *net, */ break; #endif - case TCPOPT_MPTCP: - mptcp_parse_option(skb, ptr, opsize, opt_rx); - break; - case TCPOPT_FASTOPEN: tcp_parse_fastopen_option( opsize - TCPOLEN_FASTOPEN_BASE, diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 8fea686a5562..eadbd59586e4 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -16,10 +16,10 @@ static bool mptcp_cap_flag_sha256(u8 flags) return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; } -void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, - int opsize, struct tcp_options_received *opt_rx) +static void mptcp_parse_option(const struct sk_buff *skb, + const unsigned char *ptr, int opsize, + struct mptcp_options_received *mp_opt) { - struct mptcp_options_received *mp_opt = &opt_rx->mptcp; u8 subtype = *ptr >> 4; int expected_opsize; u8 version; @@ -283,12 +283,20 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, } void mptcp_get_options(const struct sk_buff *skb, - struct tcp_options_received *opt_rx) + struct mptcp_options_received *mp_opt) { - const unsigned char *ptr; const struct tcphdr *th = tcp_hdr(skb); - int length = (th->doff * 4) - sizeof(struct tcphdr); + const unsigned char *ptr; + int length; + /* initialize option status */ + mp_opt->mp_capable = 0; + mp_opt->mp_join = 0; + mp_opt->add_addr = 0; + mp_opt->rm_addr = 0; + mp_opt->dss = 0; + + length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); while (length > 0) { @@ -308,7 +316,7 @@ void mptcp_get_options(const struct sk_buff *skb, if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_MPTCP) - mptcp_parse_option(skb, ptr, opsize, opt_rx); + mptcp_parse_option(skb, ptr, opsize, mp_opt); ptr += opsize - 2; length -= opsize; } @@ -797,41 +805,41 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - struct mptcp_options_received *mp_opt; + struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; - mp_opt = &opt_rx->mptcp; - if (!check_fully_established(msk, sk, subflow, skb, mp_opt)) + mptcp_get_options(skb, &mp_opt); + if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return; - if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) { + if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) { struct mptcp_addr_info addr; - addr.port = htons(mp_opt->port); - addr.id = mp_opt->addr_id; - if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { + addr.port = htons(mp_opt.port); + addr.id = mp_opt.addr_id; + if (mp_opt.family == MPTCP_ADDR_IPVERSION_4) { addr.family = AF_INET; - addr.addr = mp_opt->addr; + addr.addr = mp_opt.addr; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (mp_opt->family == MPTCP_ADDR_IPVERSION_6) { + else if (mp_opt.family == MPTCP_ADDR_IPVERSION_6) { addr.family = AF_INET6; - addr.addr6 = mp_opt->addr6; + addr.addr6 = mp_opt.addr6; } #endif - if (!mp_opt->echo) + if (!mp_opt.echo) mptcp_pm_add_addr_received(msk, &addr); - mp_opt->add_addr = 0; + mp_opt.add_addr = 0; } - if (!mp_opt->dss) + if (!mp_opt.dss) return; /* we can't wait for recvmsg() to update the ack_seq, otherwise * monodirectional flows will stuck */ - if (mp_opt->use_ack) - update_una(msk, mp_opt); + if (mp_opt.use_ack) + update_una(msk, &mp_opt); mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) @@ -839,8 +847,8 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, memset(mpext, 0, sizeof(*mpext)); - if (mp_opt->use_map) { - if (mp_opt->mpc_map) { + if (mp_opt.use_map) { + if (mp_opt.mpc_map) { /* this is an MP_CAPABLE carrying MPTCP data * we know this map the first chunk of data */ @@ -851,12 +859,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, mpext->dsn64 = 1; mpext->mpc_map = 1; } else { - mpext->data_seq = mp_opt->data_seq; - mpext->subflow_seq = mp_opt->subflow_seq; - mpext->dsn64 = mp_opt->dsn64; - mpext->data_fin = mp_opt->data_fin; + mpext->data_seq = mp_opt.data_seq; + mpext->subflow_seq = mp_opt.subflow_seq; + mpext->dsn64 = mp_opt.dsn64; + mpext->data_fin = mp_opt.data_fin; } - mpext->data_len = mp_opt->data_len; + mpext->data_len = mp_opt.data_len; mpext->use_map = 1; } } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 6e0188f5d3f3..e1f23016ed3f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1334,7 +1334,7 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) #endif struct sock *mptcp_sk_clone(const struct sock *sk, - const struct tcp_options_received *opt_rx, + const struct mptcp_options_received *mp_opt, struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); @@ -1373,9 +1373,9 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->write_seq = subflow_req->idsn + 1; atomic64_set(&msk->snd_una, msk->write_seq); - if (opt_rx->mptcp.mp_capable) { + if (mp_opt->mp_capable) { msk->can_ack = true; - msk->remote_key = opt_rx->mptcp.sndr_key; + msk->remote_key = mp_opt->sndr_key; mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); ack_seq++; msk->ack_seq = ack_seq; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a2b3048037d0..e4ca6320ce76 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -91,6 +91,45 @@ #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 +struct mptcp_options_received { + u64 sndr_key; + u64 rcvr_key; + u64 data_ack; + u64 data_seq; + u32 subflow_seq; + u16 data_len; + u16 mp_capable : 1, + mp_join : 1, + dss : 1, + add_addr : 1, + rm_addr : 1, + family : 4, + echo : 1, + backup : 1; + u32 token; + u32 nonce; + u64 thmac; + u8 hmac[20]; + u8 join_id; + u8 use_map:1, + dsn64:1, + data_fin:1, + use_ack:1, + ack64:1, + mpc_map:1, + __unused:2; + u8 addr_id; + u8 rm_id; + union { + struct in_addr addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct in6_addr addr6; +#endif + }; + u64 ahmac; + u16 port; +}; + static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) { return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) | @@ -331,10 +370,10 @@ int mptcp_proto_v6_init(void); #endif struct sock *mptcp_sk_clone(const struct sock *sk, - const struct tcp_options_received *opt_rx, + const struct mptcp_options_received *mp_opt, struct request_sock *req); void mptcp_get_options(const struct sk_buff *skb, - struct tcp_options_received *opt_rx); + struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 84f6408594c9..bad998529767 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -124,12 +124,11 @@ static void subflow_init_req(struct request_sock *req, { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - struct tcp_options_received rx_opt; + struct mptcp_options_received mp_opt; pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); - memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp)); - mptcp_get_options(skb, &rx_opt); + mptcp_get_options(skb, &mp_opt); subflow_req->mp_capable = 0; subflow_req->mp_join = 0; @@ -142,16 +141,16 @@ static void subflow_init_req(struct request_sock *req, return; #endif - if (rx_opt.mptcp.mp_capable) { + if (mp_opt.mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); - if (rx_opt.mptcp.mp_join) + if (mp_opt.mp_join) return; - } else if (rx_opt.mptcp.mp_join) { + } else if (mp_opt.mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); } - if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { + if (mp_opt.mp_capable && listener->request_mptcp) { int err; err = mptcp_token_new_request(req); @@ -159,13 +158,13 @@ static void subflow_init_req(struct request_sock *req, subflow_req->mp_capable = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; - } else if (rx_opt.mptcp.mp_join && listener->request_mptcp) { + } else if (mp_opt.mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; subflow_req->mp_join = 1; - subflow_req->backup = rx_opt.mptcp.backup; - subflow_req->remote_id = rx_opt.mptcp.join_id; - subflow_req->token = rx_opt.mptcp.token; - subflow_req->remote_nonce = rx_opt.mptcp.nonce; + subflow_req->backup = mp_opt.backup; + subflow_req->remote_id = mp_opt.join_id; + subflow_req->token = mp_opt.token; + subflow_req->remote_nonce = mp_opt.nonce; pr_debug("token=%u, remote_nonce=%u", subflow_req->token, subflow_req->remote_nonce); if (!subflow_token_join_request(req, skb)) { @@ -221,6 +220,7 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_options_received mp_opt; struct sock *parent = subflow->conn; struct tcp_sock *tp = tcp_sk(sk); @@ -237,16 +237,17 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->conn_finished = 1; - if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { + mptcp_get_options(skb, &mp_opt); + if (subflow->request_mptcp && mp_opt.mp_capable) { subflow->mp_capable = 1; subflow->can_ack = 1; - subflow->remote_key = tp->rx_opt.mptcp.sndr_key; + subflow->remote_key = mp_opt.sndr_key; pr_debug("subflow=%p, remote_key=%llu", subflow, subflow->remote_key); - } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) { + } else if (subflow->request_join && mp_opt.mp_join) { subflow->mp_join = 1; - subflow->thmac = tp->rx_opt.mptcp.thmac; - subflow->remote_nonce = tp->rx_opt.mptcp.nonce; + subflow->thmac = mp_opt.thmac; + subflow->remote_nonce = mp_opt.nonce; pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, subflow->thmac, subflow->remote_nonce); } else if (subflow->request_mptcp) { @@ -343,7 +344,7 @@ drop: /* validate hmac received in third ACK */ static bool subflow_hmac_valid(const struct request_sock *req, - const struct tcp_options_received *rx_opt) + const struct mptcp_options_received *mp_opt) { const struct mptcp_subflow_request_sock *subflow_req; u8 hmac[MPTCPOPT_HMAC_LEN]; @@ -360,7 +361,7 @@ static bool subflow_hmac_valid(const struct request_sock *req, subflow_req->local_nonce, hmac); ret = true; - if (crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac))) + if (crypto_memneq(hmac, mp_opt->hmac, sizeof(hmac))) ret = false; sock_put((struct sock *)msk); @@ -416,7 +417,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; - struct tcp_options_received opt_rx; + struct mptcp_options_received mp_opt; bool fallback_is_fatal = false; struct sock *new_msk = NULL; bool fallback = false; @@ -424,7 +425,10 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); - opt_rx.mptcp.mp_capable = 0; + /* we need later a valid 'mp_capable' value even when options are not + * parsed + */ + mp_opt.mp_capable = 0; if (tcp_rsk(req)->is_mptcp == 0) goto create_child; @@ -439,22 +443,21 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, goto create_msk; } - mptcp_get_options(skb, &opt_rx); - if (!opt_rx.mptcp.mp_capable) { + mptcp_get_options(skb, &mp_opt); + if (!mp_opt.mp_capable) { fallback = true; goto create_child; } create_msk: - new_msk = mptcp_sk_clone(listener->conn, &opt_rx, req); + new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req); if (!new_msk) fallback = true; } else if (subflow_req->mp_join) { fallback_is_fatal = true; - opt_rx.mptcp.mp_join = 0; - mptcp_get_options(skb, &opt_rx); - if (!opt_rx.mptcp.mp_join || - !subflow_hmac_valid(req, &opt_rx)) { + mptcp_get_options(skb, &mp_opt); + if (!mp_opt.mp_join || + !subflow_hmac_valid(req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); return NULL; } @@ -494,9 +497,9 @@ create_child: /* with OoO packets we can reach here without ingress * mpc option */ - ctx->remote_key = opt_rx.mptcp.sndr_key; - ctx->fully_established = opt_rx.mptcp.mp_capable; - ctx->can_ack = opt_rx.mptcp.mp_capable; + ctx->remote_key = mp_opt.sndr_key; + ctx->fully_established = mp_opt.mp_capable; + ctx->can_ack = mp_opt.mp_capable; } else if (ctx->mp_join) { struct mptcp_sock *owner; From d6085fe19b8ec53d8ea2d8a10390b7ab6978ac44 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 30 Apr 2020 15:01:53 +0200 Subject: [PATCH 109/300] mptcp: avoid a WARN on bad input. Syzcaller has found a way to trigger the WARN_ON_ONCE condition in check_fully_established(). The root cause is a legit fallback to TCP scenario, so replace the WARN with a plain message on a more strict condition. Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/options.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index eadbd59586e4..ecf41d52d2fc 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -703,8 +703,6 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, goto fully_established; } - WARN_ON_ONCE(subflow->can_ack); - /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP */ @@ -714,6 +712,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, return false; } + if (unlikely(!READ_ONCE(msk->pm.server_side))) + pr_warn_once("bogus mpc option on established client sk"); subflow->fully_established = 1; subflow->remote_key = mp_opt->sndr_key; subflow->can_ack = 1; From 5a91e32b40af86c790c4348aa3a5926a5c5fbc6d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 30 Apr 2020 15:01:54 +0200 Subject: [PATCH 110/300] mptcp: fix 'use_ack' option access. The mentioned RX option field is initialized only for DSS packet, we must access it only if 'dss' is set too, or the subflow will end-up in a bad status, leading to RFC violations. Fixes: d22f4988ffec ("mptcp: process MP_CAPABLE data option") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index ecf41d52d2fc..9486720c3256 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -695,7 +695,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) return subflow->mp_capable; - if (mp_opt->use_ack) { + if (mp_opt->dss && mp_opt->use_ack) { /* subflows are fully established as soon as we get any * additional ack. */ From a77895dbc0ad0400e377e8f26567820b3658e30e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 30 Apr 2020 15:01:55 +0200 Subject: [PATCH 111/300] mptcp: initialize the data_fin field for mpc packets When parsing MPC+data packets we set the dss field, so we must also initialize the data_fin, or we can find stray value there. Fixes: 9a19371bf029 ("mptcp: fix data_fin handing in RX path") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/options.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 9486720c3256..45497af23906 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -858,6 +858,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, mpext->subflow_seq = 1; mpext->dsn64 = 1; mpext->mpc_map = 1; + mpext->data_fin = 0; } else { mpext->data_seq = mp_opt.data_seq; mpext->subflow_seq = mp_opt.subflow_seq; From ac2b47fb92c50682d89d7350a491a6a392bac5dd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 30 Apr 2020 15:03:22 +0200 Subject: [PATCH 112/300] mptcp: fix uninitialized value access tcp_v{4,6}_syn_recv_sock() set 'own_req' only when returning a not NULL 'child', let's check 'own_req' only if child is available to avoid an - unharmful - UBSAN splat. v1 -> v2: - reference the correct hash Fixes: 4c8941de781c ("mptcp: avoid flipping mp_capable field in syn_recv_sock()") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bad998529767..67a4e35d4838 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -523,7 +523,7 @@ out: /* check for expected invariant - should never trigger, just help * catching eariler subtle bugs */ - WARN_ON_ONCE(*own_req && child && tcp_sk(child)->is_mptcp && + WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp && (!mptcp_subflow_ctx(child) || !mptcp_subflow_ctx(child)->conn)); return child; From f9c6cea0b38518741c8dcf26ac056d26ee2fd61d Mon Sep 17 00:00:00 2001 From: Juliet Kim Date: Thu, 30 Apr 2020 13:22:11 -0500 Subject: [PATCH 113/300] ibmvnic: Skip fatal error reset after passive init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During MTU change, the following events may happen. Client-driven CRQ initialization fails due to partner’s CRQ closed, causing client to enqueue a reset task for FATAL_ERROR. Then passive (server-driven) CRQ initialization succeeds, causing client to release CRQ and enqueue a reset task for failover. If the passive CRQ initialization occurs before the FATAL reset task is processed, the FATAL error reset task would try to access a CRQ message queue that was freed, causing an oops. The problem may be most likely to occur during DLPAR add vNIC with a non-default MTU, because the DLPAR process will automatically issue a change MTU request. Fix this by not processing fatal error reset if CRQ is passively initialized after client-driven CRQ initialization fails. Signed-off-by: Juliet Kim Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 4bd33245bad6..3de549c6c693 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2189,7 +2189,8 @@ static void __ibmvnic_reset(struct work_struct *work) rc = do_hard_reset(adapter, rwi, reset_state); rtnl_unlock(); } - } else { + } else if (!(rwi->reset_reason == VNIC_RESET_FATAL && + adapter->from_passive_init)) { rc = do_reset(adapter, rwi, reset_state); } kfree(rwi); From 7979457b1d3a069cd857f5bd69e070e30223dd0c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 30 Apr 2020 22:38:45 +0300 Subject: [PATCH 114/300] net: bridge: vlan: Add a schedule point during VLAN processing User space can request to delete a range of VLANs from a bridge slave in one netlink request. For each deleted VLAN the FDB needs to be traversed in order to flush all the affected entries. If a large range of VLANs is deleted and the number of FDB entries is large or the FDB lock is contented, it is possible for the kernel to loop through the deleted VLANs for a long time. In case preemption is disabled, this can result in a soft lockup. Fix this by adding a schedule point after each VLAN is deleted to yield the CPU, if needed. This is safe because the VLANs are traversed in process context. Fixes: bdced7ef7838 ("bridge: support for multiple vlans and vlan ranges in setlink and dellink requests") Signed-off-by: Ido Schimmel Reported-by: Stefan Priebe - Profihost AG Tested-by: Stefan Priebe - Profihost AG Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 43dab4066f91..a0f5dbee8f9c 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -612,6 +612,7 @@ int br_process_vlan_info(struct net_bridge *br, v - 1, rtm_cmd); v_change_start = 0; } + cond_resched(); } /* v_change_start is set only if the last/whole range changed */ if (v_change_start) From 865308373ed49c9fb05720d14cbf1315349b32a9 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 30 Apr 2020 21:51:32 +0200 Subject: [PATCH 115/300] dp83640: reverse arguments to list_add_tail In this code, it appears that phyter_clocks is a list head, based on the previous list_for_each, and that clock->list is intended to be a list element, given that it has just been initialized in dp83640_clock_init. Accordingly, switch the arguments to list_add_tail, which takes the list head as the second argument. Fixes: cb646e2b02b27 ("ptp: Added a clock driver for the National Semiconductor PHYTER.") Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- drivers/net/phy/dp83640.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index 415c27310982..ecbd5e0d685c 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -1120,7 +1120,7 @@ static struct dp83640_clock *dp83640_clock_get_bus(struct mii_bus *bus) goto out; } dp83640_clock_init(clock, bus); - list_add_tail(&phyter_clocks, &clock->list); + list_add_tail(&clock->list, &phyter_clocks); out: mutex_unlock(&phyter_clocks_lock); From 16f3fd3dccaa167525018a2b9f8140f0fe5903d5 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 30 Apr 2020 14:33:41 -0700 Subject: [PATCH 116/300] ionic: no link check until after probe Don't bother with the link check during probe, let the watchdog notice the first link-up. This allows probe to finish cleanly without any interruptions from over excited user programs opening the device as soon as it is registered. Fixes: c672412f6172 ("ionic: remove lifs on fw reset") Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 5acf4f46c268..bbc6c79e0f04 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -2549,8 +2549,6 @@ int ionic_lifs_register(struct ionic *ionic) dev_err(ionic->dev, "Cannot register net device, aborting\n"); return err; } - - ionic_link_status_check_request(ionic->master_lif); ionic->master_lif->registered = true; return 0; From 1d53aedcf98485d9911a524d1f66c66f4d53f4cf Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 30 Apr 2020 14:33:42 -0700 Subject: [PATCH 117/300] ionic: refresh devinfo after fw-upgrade Make sure we can report the new FW version after a fw-upgrade has finished by re-reading the device's fw version information. Fixes: c672412f6172 ("ionic: remove lifs on fw reset") Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index bbc6c79e0f04..6fcfb634d809 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -2116,6 +2116,7 @@ static void ionic_lif_handle_fw_up(struct ionic_lif *lif) dev_info(ionic->dev, "FW Up: restarting LIFs\n"); + ionic_init_devinfo(ionic); err = ionic_qcqs_alloc(lif); if (err) goto err_out; From 6bc977faa0af88dfdb278f16b171d49d51f43733 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 30 Apr 2020 14:33:43 -0700 Subject: [PATCH 118/300] ionic: add device reset to fw upgrade down Doing a device reset addresses an obscure FW timing issue in the FW upgrade process. Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 6fcfb634d809..d5293bfded29 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -2101,6 +2101,7 @@ static void ionic_lif_handle_fw_down(struct ionic_lif *lif) ionic_txrx_free(lif); } ionic_lifs_deinit(ionic); + ionic_reset(ionic); ionic_qcqs_free(lif); dev_info(ionic->dev, "FW Down: LIFs stopped\n"); From 713b6ebb4c376b3fb65fdceb3b59e401c93248f9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 30 Apr 2020 16:35:10 -0500 Subject: [PATCH 119/300] net: ipa: fix a bug in ipa_endpoint_stop() In ipa_endpoint_stop(), for TX endpoints we set the number of retries to 0. When we break out of the loop, retries being 0 means we return EIO rather than the value of ret (which should be 0). Fix this by using a non-zero retry count for both RX and TX channels, and just break out of the loop after calling gsi_channel_stop() for TX channels. This way only RX channels will retry, and the retry count will be non-zero at the end for TX channels (so the proper value gets returned). Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_endpoint.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 6de03be28784..a21534f1462f 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -1283,7 +1283,7 @@ static int ipa_endpoint_stop_rx_dma(struct ipa *ipa) */ int ipa_endpoint_stop(struct ipa_endpoint *endpoint) { - u32 retries = endpoint->toward_ipa ? 0 : IPA_ENDPOINT_STOP_RX_RETRIES; + u32 retries = IPA_ENDPOINT_STOP_RX_RETRIES; int ret; do { @@ -1291,12 +1291,9 @@ int ipa_endpoint_stop(struct ipa_endpoint *endpoint) struct gsi *gsi = &ipa->gsi; ret = gsi_channel_stop(gsi, endpoint->channel_id); - if (ret != -EAGAIN) + if (ret != -EAGAIN || endpoint->toward_ipa) break; - if (endpoint->toward_ipa) - continue; - /* For IPA v3.5.1, send a DMA read task and check again */ if (ipa->version == IPA_VERSION_3_5_1) { ret = ipa_endpoint_stop_rx_dma(ipa); From 0721999f157006367965f4c4e7d5594bd4c9bf15 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 30 Apr 2020 16:35:11 -0500 Subject: [PATCH 120/300] net: ipa: fix an error message in gsi_channel_init_one() An error message about limiting the number of TREs used prints the wrong value. Fix this bug. Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/gsi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index 845478a19a4f..b4206fda0b22 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -1798,9 +1798,9 @@ static int gsi_channel_init_one(struct gsi *gsi, /* Worst case we need an event for every outstanding TRE */ if (data->channel.tre_count > data->channel.event_count) { - dev_warn(gsi->dev, "channel %u limited to %u TREs\n", - data->channel_id, data->channel.tre_count); tre_count = data->channel.event_count; + dev_warn(gsi->dev, "channel %u limited to %u TREs\n", + data->channel_id, tre_count); } else { tre_count = data->channel.tre_count; } From 0b1ba18aec3e599c16cca7d99969282fb9db7e92 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 30 Apr 2020 16:35:12 -0500 Subject: [PATCH 121/300] net: ipa: zero return code before issuing generic EE command Zero the result code stored in a field of the scratch 0 register before issuing a generic EE command. This just guarantees that the value we read later was actually written as a result of the command. Also add the definitions of two more possible result codes that can be returned when issuing flow control enable or disable commands: INCORRECT_CHANNEL_STATE: - channel must be in started state INCORRECT_DIRECTION - flow control is only valid for TX channels Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/gsi.c | 7 +++++++ drivers/net/ipa/gsi_reg.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index b4206fda0b22..b671bea0aa7c 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -1041,6 +1041,7 @@ static void gsi_isr_gp_int1(struct gsi *gsi) complete(&gsi->completion); } + /* Inter-EE interrupt handler */ static void gsi_isr_glob_ee(struct gsi *gsi) { @@ -1493,6 +1494,12 @@ static int gsi_generic_command(struct gsi *gsi, u32 channel_id, struct completion *completion = &gsi->completion; u32 val; + /* First zero the result code field */ + val = ioread32(gsi->virt + GSI_CNTXT_SCRATCH_0_OFFSET); + val &= ~GENERIC_EE_RESULT_FMASK; + iowrite32(val, gsi->virt + GSI_CNTXT_SCRATCH_0_OFFSET); + + /* Now issue the command */ val = u32_encode_bits(opcode, GENERIC_OPCODE_FMASK); val |= u32_encode_bits(channel_id, GENERIC_CHID_FMASK); val |= u32_encode_bits(GSI_EE_MODEM, GENERIC_EE_FMASK); diff --git a/drivers/net/ipa/gsi_reg.h b/drivers/net/ipa/gsi_reg.h index 7613b9cc7cf6..acc9e744c67d 100644 --- a/drivers/net/ipa/gsi_reg.h +++ b/drivers/net/ipa/gsi_reg.h @@ -410,6 +410,8 @@ #define INTER_EE_RESULT_FMASK GENMASK(2, 0) #define GENERIC_EE_RESULT_FMASK GENMASK(7, 5) #define GENERIC_EE_SUCCESS_FVAL 1 +#define GENERIC_EE_INCORRECT_DIRECTION_FVAL 3 +#define GENERIC_EE_INCORRECT_CHANNEL_FVAL 5 #define GENERIC_EE_NO_RESOURCES_FVAL 7 #define USB_MAX_PACKET_FMASK GENMASK(15, 15) /* 0: HS; 1: SS */ #define MHI_BASE_CHANNEL_FMASK GENMASK(31, 24) From ab046a5d4be4c90a3952a0eae75617b49c0cb01b Mon Sep 17 00:00:00 2001 From: Scott Dial Date: Fri, 24 Apr 2020 18:51:08 -0400 Subject: [PATCH 122/300] net: macsec: preserve ingress frame ordering MACsec decryption always occurs in a softirq context. Since the FPU may not be usable in the softirq context, the call to decrypt may be scheduled on the cryptd work queue. The cryptd work queue does not provide ordering guarantees. Therefore, preserving order requires masking out ASYNC implementations of gcm(aes). For instance, an Intel CPU with AES-NI makes available the generic-gcm-aesni driver from the aesni_intel module to implement gcm(aes). However, this implementation requires the FPU, so it is not always available to use from a softirq context, and will fallback to the cryptd work queue, which does not preserve frame ordering. With this change, such a system would select gcm_base(ctr(aes-aesni),ghash-generic). While the aes-aesni implementation prefers to use the FPU, it will fallback to the aes-asm implementation if unavailable. By using a synchronous version of gcm(aes), the decryption will complete before returning from crypto_aead_decrypt(). Therefore, the macsec_decrypt_done() callback will be called before returning from macsec_decrypt(). Thus, the order of calls to macsec_post_decrypt() for the frames is preserved. While it's presumable that the pure AES-NI version of gcm(aes) is more performant, the hybrid solution is capable of gigabit speeds on modest hardware. Regardless, preserving the order of frames is paramount for many network protocols (e.g., triggering TCP retries). Within the MACsec driver itself, the replay protection is tripped by the out-of-order frames, and can cause frames to be dropped. This bug has been present in this code since it was added in v4.6, however it may not have been noticed since not all CPUs have FPU offload available. Additionally, the bug manifests as occasional out-of-order packets that are easily misattributed to other network phenomena. When this code was added in v4.6, the crypto/gcm.c code did not restrict selection of the ghash function based on the ASYNC flag. For instance, x86 CPUs with PCLMULQDQ would select the ghash-clmulni driver instead of ghash-generic, which submits to the cryptd work queue if the FPU is busy. However, this bug was was corrected in v4.8 by commit b30bdfa86431afbafe15284a3ad5ac19b49b88e3, and was backported all the way back to the v3.14 stable branch, so this patch should be applicable back to the v4.6 stable branch. Signed-off-by: Scott Dial Signed-off-by: David S. Miller --- drivers/net/macsec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 758baf7cb8a1..d4034025c87c 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1305,7 +1305,8 @@ static struct crypto_aead *macsec_alloc_tfm(char *key, int key_len, int icv_len) struct crypto_aead *tfm; int ret; - tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + /* Pick a sync gcm(aes) cipher to ensure order is preserved. */ + tfm = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) return tfm; From ee8d2267f0e39a1bfd95532da3a6405004114b27 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 26 Apr 2020 22:59:21 +0200 Subject: [PATCH 123/300] net: moxa: Fix a potential double 'free_irq()' Should an irq requested with 'devm_request_irq' be released explicitly, it should be done by 'devm_free_irq()', not 'free_irq()'. Fixes: 6c821bd9edc9 ("net: Add MOXA ART SoCs ethernet driver") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/moxa/moxart_ether.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/moxa/moxart_ether.c b/drivers/net/ethernet/moxa/moxart_ether.c index e1651756bf9d..f70bb81e1ed6 100644 --- a/drivers/net/ethernet/moxa/moxart_ether.c +++ b/drivers/net/ethernet/moxa/moxart_ether.c @@ -564,7 +564,7 @@ static int moxart_remove(struct platform_device *pdev) struct net_device *ndev = platform_get_drvdata(pdev); unregister_netdev(ndev); - free_irq(ndev->irq, ndev); + devm_free_irq(&pdev->dev, ndev->irq, ndev); moxart_mac_free_memory(ndev); free_netdev(ndev); From 0ce205d4660c312cdeb4a81066616dcc6f3799c4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 27 Apr 2020 13:51:20 +0300 Subject: [PATCH 124/300] net: macb: Fix runtime PM refcounting The commit e6a41c23df0d, while trying to fix an issue, ("net: macb: ensure interface is not suspended on at91rm9200") introduced a refcounting regression, because in error case refcounter must be balanced. Fix it by calling pm_runtime_put_noidle() in error case. While here, fix the same mistake in other couple of places. Fixes: e6a41c23df0d ("net: macb: ensure interface is not suspended on at91rm9200") Cc: Alexandre Belloni Cc: Claudiu Beznea Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index a0e8c5bbabc0..f739d16d29b1 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -334,8 +334,10 @@ static int macb_mdio_read(struct mii_bus *bus, int mii_id, int regnum) int status; status = pm_runtime_get_sync(&bp->pdev->dev); - if (status < 0) + if (status < 0) { + pm_runtime_put_noidle(&bp->pdev->dev); goto mdio_pm_exit; + } status = macb_mdio_wait_for_idle(bp); if (status < 0) @@ -386,8 +388,10 @@ static int macb_mdio_write(struct mii_bus *bus, int mii_id, int regnum, int status; status = pm_runtime_get_sync(&bp->pdev->dev); - if (status < 0) + if (status < 0) { + pm_runtime_put_noidle(&bp->pdev->dev); goto mdio_pm_exit; + } status = macb_mdio_wait_for_idle(bp); if (status < 0) @@ -3816,8 +3820,10 @@ static int at91ether_open(struct net_device *dev) int ret; ret = pm_runtime_get_sync(&lp->pdev->dev); - if (ret < 0) + if (ret < 0) { + pm_runtime_put_noidle(&lp->pdev->dev); return ret; + } /* Clear internal statistics */ ctl = macb_readl(lp, NCR); From 54261af473be4c5481f6196064445d2945f2bdab Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 30 Apr 2020 17:52:40 +0200 Subject: [PATCH 125/300] security: Fix the default value of fs_context_parse_param hook security_fs_context_parse_param is called by vfs_parse_fs_param and a succussful return value (i.e 0) implies that a parameter will be consumed by the LSM framework. This stops all further parsing of the parmeter by VFS. Furthermore, if an LSM hook returns a success, the remaining LSM hooks are not invoked for the parameter. The current default behavior of returning success means that all the parameters are expected to be parsed by the LSM hook and none of them end up being populated by vfs in fs_context This was noticed when lsm=bpf is supplied on the command line before any other LSM. As the bpf lsm uses this default value to implement a default hook, this resulted in a failure to parse any fs_context parameters and a failure to mount the root filesystem. Fixes: 98e828a0650f ("security: Refactor declaration of LSM hooks") Reported-by: Mikko Ylinen Signed-off-by: KP Singh Signed-off-by: James Morris --- include/linux/lsm_hook_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 9cd4455528e5..1bdd027766d4 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -55,7 +55,7 @@ LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm) LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm) LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc, struct fs_context *src_sc) -LSM_HOOK(int, 0, fs_context_parse_param, struct fs_context *fc, +LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc, struct fs_parameter *param) LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb) LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb) From b723748750ece7d844cdf2f52c01d37f83387208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 27 Apr 2020 16:11:05 +0200 Subject: [PATCH 126/300] tunnel: Propagate ECT(1) when decapsulating as recommended by RFC6040 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RFC 6040 recommends propagating an ECT(1) mark from an outer tunnel header to the inner header if that inner header is already marked as ECT(0). When RFC 6040 decapsulation was implemented, this case of propagation was not added. This simply appears to be an oversight, so let's fix that. Fixes: eccc1bb8d4b4 ("tunnel: drop packet if ECN present with not-ECT") Reported-by: Bob Briscoe Reported-by: Olivier Tilmans Cc: Dave Taht Cc: Stephen Hemminger Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/net/inet_ecn.h | 57 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index c8e2bebd8d93..0f0d1efe06dd 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -99,6 +99,20 @@ static inline int IP_ECN_set_ce(struct iphdr *iph) return 1; } +static inline int IP_ECN_set_ect1(struct iphdr *iph) +{ + u32 check = (__force u32)iph->check; + + if ((iph->tos & INET_ECN_MASK) != INET_ECN_ECT_0) + return 0; + + check += (__force u16)htons(0x100); + + iph->check = (__force __sum16)(check + (check>=0xFFFF)); + iph->tos ^= INET_ECN_MASK; + return 1; +} + static inline void IP_ECN_clear(struct iphdr *iph) { iph->tos &= ~INET_ECN_MASK; @@ -134,6 +148,22 @@ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) return 1; } +static inline int IP6_ECN_set_ect1(struct sk_buff *skb, struct ipv6hdr *iph) +{ + __be32 from, to; + + if ((ipv6_get_dsfield(iph) & INET_ECN_MASK) != INET_ECN_ECT_0) + return 0; + + from = *(__be32 *)iph; + to = from ^ htonl(INET_ECN_MASK << 20); + *(__be32 *)iph = to; + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), + (__force __wsum)to); + return 1; +} + static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) { dscp &= ~INET_ECN_MASK; @@ -159,6 +189,25 @@ static inline int INET_ECN_set_ce(struct sk_buff *skb) return 0; } +static inline int INET_ECN_set_ect1(struct sk_buff *skb) +{ + switch (skb->protocol) { + case cpu_to_be16(ETH_P_IP): + if (skb_network_header(skb) + sizeof(struct iphdr) <= + skb_tail_pointer(skb)) + return IP_ECN_set_ect1(ip_hdr(skb)); + break; + + case cpu_to_be16(ETH_P_IPV6): + if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= + skb_tail_pointer(skb)) + return IP6_ECN_set_ect1(skb, ipv6_hdr(skb)); + break; + } + + return 0; +} + /* * RFC 6040 4.2 * To decapsulate the inner header at the tunnel egress, a compliant @@ -208,8 +257,12 @@ static inline int INET_ECN_decapsulate(struct sk_buff *skb, int rc; rc = __INET_ECN_decapsulate(outer, inner, &set_ce); - if (!rc && set_ce) - INET_ECN_set_ce(skb); + if (!rc) { + if (set_ce) + INET_ECN_set_ce(skb); + else if ((outer & INET_ECN_MASK) == INET_ECN_ECT_1) + INET_ECN_set_ect1(skb); + } return rc; } From 6ef4889fc0b3aa6ab928e7565935ac6f762cee6e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 27 Apr 2020 18:05:47 +0300 Subject: [PATCH 127/300] mlxsw: spectrum_acl_tcam: Position vchunk in a vregion list properly Vregion helpers to get min and max priority depend on the correct ordering of vchunks in the vregion list. However, the current code always adds new chunk to the end of the list, no matter what the priority is. Fix this by finding the correct place in the list and put vchunk there. Fixes: 22a677661f56 ("mlxsw: spectrum: Introduce ACL core with simple TCAM implementation") Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c index 430da69003d8..a6e30e020b5c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c @@ -986,8 +986,9 @@ mlxsw_sp_acl_tcam_vchunk_create(struct mlxsw_sp *mlxsw_sp, unsigned int priority, struct mlxsw_afk_element_usage *elusage) { + struct mlxsw_sp_acl_tcam_vchunk *vchunk, *vchunk2; struct mlxsw_sp_acl_tcam_vregion *vregion; - struct mlxsw_sp_acl_tcam_vchunk *vchunk; + struct list_head *pos; int err; if (priority == MLXSW_SP_ACL_TCAM_CATCHALL_PRIO) @@ -1025,7 +1026,14 @@ mlxsw_sp_acl_tcam_vchunk_create(struct mlxsw_sp *mlxsw_sp, } mlxsw_sp_acl_tcam_rehash_ctx_vregion_changed(vregion); - list_add_tail(&vchunk->list, &vregion->vchunk_list); + + /* Position the vchunk inside the list according to priority */ + list_for_each(pos, &vregion->vchunk_list) { + vchunk2 = list_entry(pos, typeof(*vchunk2), list); + if (vchunk2->priority > priority) + break; + } + list_add_tail(&vchunk->list, pos); mutex_unlock(&vregion->lock); return vchunk; From ab1c637cc6d80c0ee1d5d15f7ec8977f49feee75 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 29 Apr 2020 18:09:32 +0300 Subject: [PATCH 128/300] stmmac: intel: Fix kernel crash due to wrong error path Unfortunately sometimes ->probe() may fail. The commit b9663b7ca6ff ("net: stmmac: Enable SERDES power up/down sequence") messed up with error handling and thus: [ 12.811311] ------------[ cut here ]------------ [ 12.811993] kernel BUG at net/core/dev.c:9937! Fix this by properly crafted error path. Fixes: b9663b7ca6ff ("net: stmmac: Enable SERDES power up/down sequence") Cc: Voon Weifeng Cc: Ong Boon Leong Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 565da6498c84..ff22f274aa43 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4991,7 +4991,7 @@ int stmmac_dvr_probe(struct device *device, priv->plat->bsp_priv); if (ret < 0) - return ret; + goto error_serdes_powerup; } #ifdef CONFIG_DEBUG_FS @@ -5000,6 +5000,8 @@ int stmmac_dvr_probe(struct device *device, return ret; +error_serdes_powerup: + unregister_netdev(ndev); error_netdev_register: phylink_destroy(priv->phylink); error_phy_setup: From 69422a7e5d578aab277091f4ebb7c1b387f3e355 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Thu, 30 Apr 2020 00:22:19 +0530 Subject: [PATCH 129/300] cxgb4: fix EOTID leak when disabling TC-MQPRIO offload Under heavy load, the EOTID termination FLOWC request fails to get enqueued to the end of the Tx ring due to lack of credits. This results in EOTID leak. When disabling TC-MQPRIO offload, the link is already brought down to cleanup EOTIDs. So, flush any pending enqueued skbs that can't be sent outside the wire, to make room for FLOWC request. Also, move the FLOWC descriptor consumption logic closer to when the FLOWC request is actually posted to hardware. Fixes: 0e395b3cb1fb ("cxgb4: add FLOWC based QoS offload") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/sge.c | 39 ++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index f5dd34db4b54..2cfb1f691bf2 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -2207,6 +2207,9 @@ static void ethofld_hard_xmit(struct net_device *dev, if (unlikely(skip_eotx_wr)) { start = (u64 *)wr; eosw_txq->state = next_state; + eosw_txq->cred -= wrlen16; + eosw_txq->ncompl++; + eosw_txq->last_compl = 0; goto write_wr_headers; } @@ -2365,6 +2368,34 @@ netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev) return cxgb4_eth_xmit(skb, dev); } +static void eosw_txq_flush_pending_skbs(struct sge_eosw_txq *eosw_txq) +{ + int pktcount = eosw_txq->pidx - eosw_txq->last_pidx; + int pidx = eosw_txq->pidx; + struct sk_buff *skb; + + if (!pktcount) + return; + + if (pktcount < 0) + pktcount += eosw_txq->ndesc; + + while (pktcount--) { + pidx--; + if (pidx < 0) + pidx += eosw_txq->ndesc; + + skb = eosw_txq->desc[pidx].skb; + if (skb) { + dev_consume_skb_any(skb); + eosw_txq->desc[pidx].skb = NULL; + eosw_txq->inuse--; + } + } + + eosw_txq->pidx = eosw_txq->last_pidx + 1; +} + /** * cxgb4_ethofld_send_flowc - Send ETHOFLD flowc request to bind eotid to tc. * @dev - netdevice @@ -2440,9 +2471,11 @@ int cxgb4_ethofld_send_flowc(struct net_device *dev, u32 eotid, u32 tc) FW_FLOWC_MNEM_EOSTATE_CLOSING : FW_FLOWC_MNEM_EOSTATE_ESTABLISHED); - eosw_txq->cred -= len16; - eosw_txq->ncompl++; - eosw_txq->last_compl = 0; + /* Free up any pending skbs to ensure there's room for + * termination FLOWC. + */ + if (tc == FW_SCHED_CLS_NONE) + eosw_txq_flush_pending_skbs(eosw_txq); ret = eosw_txq_enqueue(eosw_txq, skb); if (ret) { From 0225fd5e0a6a32af7af0aefac45c8ebf19dc5183 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 29 Apr 2020 11:21:55 +0100 Subject: [PATCH 130/300] KVM: arm64: Fix 32bit PC wrap-around In the unlikely event that a 32bit vcpu traps into the hypervisor on an instruction that is located right at the end of the 32bit range, the emulation of that instruction is going to increment PC past the 32bit range. This isn't great, as userspace can then observe this value and get a bit confused. Conversly, userspace can do things like (in the context of a 64bit guest that is capable of 32bit EL0) setting PSTATE to AArch64-EL0, set PC to a 64bit value, change PSTATE to AArch32-USR, and observe that PC hasn't been truncated. More confusion. Fix both by: - truncating PC increments for 32bit guests - sanitizing all 32bit regs every time a core reg is changed by userspace, and that PSTATE indicates a 32bit mode. Cc: stable@vger.kernel.org Acked-by: Will Deacon Signed-off-by: Marc Zyngier --- arch/arm64/kvm/guest.c | 7 +++++++ virt/kvm/arm/hyp/aarch32.c | 8 ++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 23ebe51410f0..50a279d3ddd7 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -200,6 +200,13 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) } memcpy((u32 *)regs + off, valp, KVM_REG_SIZE(reg->id)); + + if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) { + int i; + + for (i = 0; i < 16; i++) + *vcpu_reg32(vcpu, i) = (u32)*vcpu_reg32(vcpu, i); + } out: return err; } diff --git a/virt/kvm/arm/hyp/aarch32.c b/virt/kvm/arm/hyp/aarch32.c index d31f267961e7..25c0e47d57cb 100644 --- a/virt/kvm/arm/hyp/aarch32.c +++ b/virt/kvm/arm/hyp/aarch32.c @@ -125,12 +125,16 @@ static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu) */ void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) { + u32 pc = *vcpu_pc(vcpu); bool is_thumb; is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT); if (is_thumb && !is_wide_instr) - *vcpu_pc(vcpu) += 2; + pc += 2; else - *vcpu_pc(vcpu) += 4; + pc += 4; + + *vcpu_pc(vcpu) = pc; + kvm_adjust_itstate(vcpu); } From c457a273e118bb96e1db8d1825f313e6cafe4258 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Fri, 24 Apr 2020 15:32:41 +0800 Subject: [PATCH 131/300] drm/amdgpu: move kfd suspend after ip_suspend_phase1 This sequence change should be safe as what did in ip_suspend_phase1 is to suspend DCE only. And this is a prerequisite for coming redundant cg/pg ungate dropping. Fixes: 487eca11a321ef ("drm/amdgpu: fix gfx hang during suspend with video playback (v2)") Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f84f9e35a73b..957d4872a855 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3375,12 +3375,12 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); - amdgpu_amdkfd_suspend(adev, !fbcon); - amdgpu_ras_suspend(adev); r = amdgpu_device_ip_suspend_phase1(adev); + amdgpu_amdkfd_suspend(adev, !fbcon); + /* evict vram memory */ amdgpu_bo_evict_vram(adev); From f7b52890daba570bc8162d43c96b5583bbdd4edd Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Fri, 24 Apr 2020 15:36:22 +0800 Subject: [PATCH 132/300] drm/amdgpu: drop redundant cg/pg ungate on runpm enter CG/PG ungate is already performed in ip_suspend_phase1. Otherwise, the CG/PG ungate will be performed twice. That will cause gfxoff disablement is performed twice also on runpm enter while gfxoff enablemnt once on rump exit. That will put gfxoff into disabled state. Fixes: b2a7e9735ab286 ("drm/amdgpu: fix the hw hang during perform system reboot and reset") Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 957d4872a855..affde2de2a0d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3372,9 +3372,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) } } - amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); - amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); - amdgpu_ras_suspend(adev); r = amdgpu_device_ip_suspend_phase1(adev); From f33a6dec4e12fda128fde8f6b2fcc2252b4b59d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= Date: Wed, 29 Apr 2020 18:28:01 +0200 Subject: [PATCH 133/300] drm/amdgpu/dc: Use WARN_ON_ONCE for ASSERT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Once should generally be enough for diagnosing what lead up to it, repeating it over and over can be pretty annoying. Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/os_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/os_types.h b/drivers/gpu/drm/amd/display/dc/os_types.h index c34eba19860a..6d7bca562eec 100644 --- a/drivers/gpu/drm/amd/display/dc/os_types.h +++ b/drivers/gpu/drm/amd/display/dc/os_types.h @@ -108,7 +108,7 @@ #define ASSERT(expr) ASSERT_CRITICAL(expr) #else -#define ASSERT(expr) WARN_ON(!(expr)) +#define ASSERT(expr) WARN_ON_ONCE(!(expr)) #endif #define BREAK_TO_DEBUGGER() ASSERT(0) From 59dfb0c64d3853d20dc84f4561f28d4f5a2ddc7d Mon Sep 17 00:00:00 2001 From: Daniel Kolesa Date: Wed, 29 Apr 2020 17:02:36 +0200 Subject: [PATCH 134/300] drm/amd/display: work around fp code being emitted outside of DC_FP_START/END The dcn20_validate_bandwidth function would have code touching the incorrect registers emitted outside of the boundaries of the DC_FP_START/END macros, at least on ppc64le. Work around the problem by wrapping the whole function instead. Signed-off-by: Daniel Kolesa Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.6.x --- .../drm/amd/display/dc/dcn20/dcn20_resource.c | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c index c3535bd3965b..e4348e3b6389 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c @@ -3068,25 +3068,32 @@ validate_out: return out; } - -bool dcn20_validate_bandwidth(struct dc *dc, struct dc_state *context, - bool fast_validate) +/* + * This must be noinline to ensure anything that deals with FP registers + * is contained within this call; previously our compiling with hard-float + * would result in fp instructions being emitted outside of the boundaries + * of the DC_FP_START/END macros, which makes sense as the compiler has no + * idea about what is wrapped and what is not + * + * This is largely just a workaround to avoid breakage introduced with 5.6, + * ideally all fp-using code should be moved into its own file, only that + * should be compiled with hard-float, and all code exported from there + * should be strictly wrapped with DC_FP_START/END + */ +static noinline bool dcn20_validate_bandwidth_fp(struct dc *dc, + struct dc_state *context, bool fast_validate) { bool voltage_supported = false; bool full_pstate_supported = false; bool dummy_pstate_supported = false; double p_state_latency_us; - DC_FP_START(); p_state_latency_us = context->bw_ctx.dml.soc.dram_clock_change_latency_us; context->bw_ctx.dml.soc.disable_dram_clock_change_vactive_support = dc->debug.disable_dram_clock_change_vactive_support; if (fast_validate) { - voltage_supported = dcn20_validate_bandwidth_internal(dc, context, true); - - DC_FP_END(); - return voltage_supported; + return dcn20_validate_bandwidth_internal(dc, context, true); } // Best case, we support full UCLK switch latency @@ -3115,7 +3122,15 @@ bool dcn20_validate_bandwidth(struct dc *dc, struct dc_state *context, restore_dml_state: context->bw_ctx.dml.soc.dram_clock_change_latency_us = p_state_latency_us; + return voltage_supported; +} +bool dcn20_validate_bandwidth(struct dc *dc, struct dc_state *context, + bool fast_validate) +{ + bool voltage_supported = false; + DC_FP_START(); + voltage_supported = dcn20_validate_bandwidth_fp(dc, context, fast_validate); DC_FP_END(); return voltage_supported; } From 57c4cfd4a2eef8f94052bd7c0fce0981f74fb213 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 19 Feb 2020 09:33:29 +0000 Subject: [PATCH 135/300] ftrace/selftests: workaround cgroup RT scheduling issues wakeup_rt.tc and wakeup.tc tests in tracers/ subdirectory fail due to the chrt command returning: chrt: failed to set pid 0's policy: Operation not permitted. To work around this, temporarily disable grout RT scheduling during ftracetest execution. Restore original value on test run completion. With these changes in place, both tests consistently pass. Fixes: c575dea2c1a5 ("selftests/ftrace: Add wakeup_rt tracer testcase") Fixes: c1edd060b413 ("selftests/ftrace: Add wakeup tracer testcase") Signed-off-by: Alan Maguire Acked-by: Steven Rostedt (VMware) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/ftracetest | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index 063ecb290a5a..144308a757b7 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -29,8 +29,25 @@ err_ret=1 # kselftest skip code is 4 err_skip=4 +# cgroup RT scheduling prevents chrt commands from succeeding, which +# induces failures in test wakeup tests. Disable for the duration of +# the tests. + +readonly sched_rt_runtime=/proc/sys/kernel/sched_rt_runtime_us + +sched_rt_runtime_orig=$(cat $sched_rt_runtime) + +setup() { + echo -1 > $sched_rt_runtime +} + +cleanup() { + echo $sched_rt_runtime_orig > $sched_rt_runtime +} + errexit() { # message echo "Error: $1" 1>&2 + cleanup exit $err_ret } @@ -39,6 +56,8 @@ if [ `id -u` -ne 0 ]; then errexit "this must be run by root user" fi +setup + # Utilities absdir() { # file_path (cd `dirname $1`; pwd) @@ -235,6 +254,7 @@ TOTAL_RESULT=0 INSTANCE= CASENO=0 + testcase() { # testfile CASENO=$((CASENO+1)) desc=`grep "^#[ \t]*description:" $1 | cut -f2 -d:` @@ -406,5 +426,7 @@ prlog "# of unsupported: " `echo $UNSUPPORTED_CASES | wc -w` prlog "# of xfailed: " `echo $XFAILED_CASES | wc -w` prlog "# of undefined(test bug): " `echo $UNDEFINED_CASES | wc -w` +cleanup + # if no error, return 0 exit $TOTAL_RESULT From b730d668138cb3dd9ce78f8003986d1adae5523a Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 19 Feb 2020 09:33:30 +0000 Subject: [PATCH 136/300] ftrace/selftest: make unresolved cases cause failure if --fail-unresolved set Currently, ftracetest will return 1 (failure) if any unresolved cases are encountered. The unresolved status results from modules and programs not being available, and as such does not indicate any issues with ftrace itself. As such, change the behaviour of ftracetest in line with unsupported cases; if unsupported cases happen, ftracetest still returns 0 unless --fail-unsupported. Here --fail-unresolved is added and the default is to return 0 if unresolved results occur. Signed-off-by: Alan Maguire Acked-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/ftracetest | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index 144308a757b7..19e9236dec5e 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -17,6 +17,7 @@ echo " -v|--verbose Increase verbosity of test messages" echo " -vv Alias of -v -v (Show all results in stdout)" echo " -vvv Alias of -v -v -v (Show all commands immediately)" echo " --fail-unsupported Treat UNSUPPORTED as a failure" +echo " --fail-unresolved Treat UNRESOLVED as a failure" echo " -d|--debug Debug mode (trace all shell commands)" echo " -l|--logdir Save logs on the " echo " If is -, all logs output in console only" @@ -112,6 +113,10 @@ parse_opts() { # opts UNSUPPORTED_RESULT=1 shift 1 ;; + --fail-unresolved) + UNRESOLVED_RESULT=1 + shift 1 + ;; --logdir|-l) LOG_DIR=$2 shift 2 @@ -176,6 +181,7 @@ KEEP_LOG=0 DEBUG=0 VERBOSE=0 UNSUPPORTED_RESULT=0 +UNRESOLVED_RESULT=0 STOP_FAILURE=0 # Parse command-line options parse_opts $* @@ -280,7 +286,7 @@ eval_result() { # sigval $UNRESOLVED) prlog " [${color_blue}UNRESOLVED${color_reset}]" UNRESOLVED_CASES="$UNRESOLVED_CASES $CASENO" - return 1 # this is a kind of bug.. something happened. + return $UNRESOLVED_RESULT # depends on use case ;; $UNTESTED) prlog " [${color_blue}UNTESTED${color_reset}]" From 6734d211feaec0c24f90da77313dbe704437d8a8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 1 May 2020 22:37:41 +0900 Subject: [PATCH 137/300] selftests/ftrace: Make XFAIL green color Since XFAIL (Expected Failure) is expected to fail the test, which means that test case works as we expected. IOW, XFAIL is same as PASS. So make it green. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/ftracetest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index 19e9236dec5e..a4605b5ee66d 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -299,7 +299,7 @@ eval_result() { # sigval return $UNSUPPORTED_RESULT # depends on use case ;; $XFAIL) - prlog " [${color_red}XFAIL${color_reset}]" + prlog " [${color_green}XFAIL${color_reset}]" XFAILED_CASES="$XFAILED_CASES $CASENO" return 0 ;; From 66d69e081b526b6a6031f0d3ca8ddff71e5707a5 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Mon, 27 Apr 2020 18:11:07 -0600 Subject: [PATCH 138/300] selftests: fix kvm relocatable native/cross builds and installs kvm test Makefile doesn't fully support cross-builds and installs. UNAME_M = $(shell uname -m) variable is used to define the target programs and libraries to be built from arch specific sources in sub-directories. For cross-builds to work, UNAME_M has to map to ARCH and arch specific directories and targets in this Makefile. UNAME_M variable to used to run the compiles pointing to the right arch directories and build the right targets for these supported architectures. TEST_GEN_PROGS and LIBKVM are set using UNAME_M variable. LINUX_TOOL_ARCH_INCLUDE is set using ARCH variable. x86_64 targets are named to include x86_64 as a suffix and directories for includes are in x86_64 sub-directory. s390x and aarch64 follow the same convention. "uname -m" doesn't result in the correct mapping for s390x and aarch64. Fix it to set UNAME_M correctly for s390x and aarch64 cross-builds. In addition, Makefile doesn't create arch sub-directories in the case of relocatable builds and test programs under s390x and x86_64 directories fail to build. This is a problem for native and cross-builds. Fix it to create all necessary directories keying off of TEST_GEN_PROGS. The following use-cases work with this change: Native x86_64: make O=/tmp/kselftest -C tools/testing/selftests TARGETS=kvm install \ INSTALL_PATH=$HOME/x86_64 arm64 cross-build: make O=$HOME/arm64_build/ ARCH=arm64 HOSTCC=gcc \ CROSS_COMPILE=aarch64-linux-gnu- defconfig make O=$HOME/arm64_build/ ARCH=arm64 HOSTCC=gcc \ CROSS_COMPILE=aarch64-linux-gnu- all make kselftest-install TARGETS=kvm O=$HOME/arm64_build ARCH=arm64 \ HOSTCC=gcc CROSS_COMPILE=aarch64-linux-gnu- s390x cross-build: make O=$HOME/s390x_build/ ARCH=s390 HOSTCC=gcc \ CROSS_COMPILE=s390x-linux-gnu- defconfig make O=$HOME/s390x_build/ ARCH=s390 HOSTCC=gcc \ CROSS_COMPILE=s390x-linux-gnu- all make kselftest-install TARGETS=kvm O=$HOME/s390x_build/ ARCH=s390 \ HOSTCC=gcc CROSS_COMPILE=s390x-linux-gnu- all No regressions in the following use-cases: make -C tools/testing/selftests TARGETS=kvm make kselftest-all TARGETS=kvm Signed-off-by: Shuah Khan --- tools/testing/selftests/kvm/Makefile | 29 +++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 712a2ddd2a27..b728c0a0f9b2 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -5,8 +5,34 @@ all: top_srcdir = ../../../.. KSFT_KHDR_INSTALL := 1 + +# For cross-builds to work, UNAME_M has to map to ARCH and arch specific +# directories and targets in this Makefile. "uname -m" doesn't map to +# arch specific sub-directory names. +# +# UNAME_M variable to used to run the compiles pointing to the right arch +# directories and build the right targets for these supported architectures. +# +# TEST_GEN_PROGS and LIBKVM are set using UNAME_M variable. +# LINUX_TOOL_ARCH_INCLUDE is set using ARCH variable. +# +# x86_64 targets are named to include x86_64 as a suffix and directories +# for includes are in x86_64 sub-directory. s390x and aarch64 follow the +# same convention. "uname -m" doesn't result in the correct mapping for +# s390x and aarch64. +# +# No change necessary for x86_64 UNAME_M := $(shell uname -m) +# Set UNAME_M for arm64 compile/install to work +ifeq ($(ARCH),arm64) + UNAME_M := aarch64 +endif +# Set UNAME_M s390x compile/install to work +ifeq ($(ARCH),s390) + UNAME_M := s390x +endif + LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c @@ -53,7 +79,7 @@ LIBKVM += $(LIBKVM_$(UNAME_M)) INSTALL_HDR_PATH = $(top_srcdir)/usr LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include -LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/x86/include +LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ @@ -84,6 +110,7 @@ $(LIBKVM_OBJ): $(OUTPUT)/%.o: %.c $(OUTPUT)/libkvm.a: $(LIBKVM_OBJ) $(AR) crs $@ $^ +x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS)))) all: $(STATIC_LIBS) $(TEST_GEN_PROGS): $(STATIC_LIBS) From fdc63ff0e49c588884992b4b2656345a5e878b32 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 8 Apr 2020 21:13:10 +0300 Subject: [PATCH 139/300] ftrace/x86: Fix trace event registration for syscalls without arguments The refactoring of SYSCALL_DEFINE0() macros removed the ABI stubs and simply defines __abi_sys_$NAME as alias of __do_sys_$NAME. As a result kallsyms_lookup() returns "__do_sys_$NAME" which does not match with the declared trace event name. See also commit 1c758a2202a6 ("tracing/x86: Update syscall trace events to handle new prefixed syscall func names"). Add __do_sys_ to the valid prefixes which are checked in arch_syscall_match_sym_name(). Fixes: d2b5de495ee9 ("x86/entry: Refactor SYSCALL_DEFINE0 macros") Signed-off-by: Konstantin Khlebnikov Signed-off-by: Thomas Gleixner Acked-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/158636958997.7900.16485049455470033557.stgit@buzz --- arch/x86/include/asm/ftrace.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 85be2f506272..70b96cae5b42 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -61,11 +61,12 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name { /* * Compare the symbol name with the system call name. Skip the - * "__x64_sys", "__ia32_sys" or simple "sys" prefix. + * "__x64_sys", "__ia32_sys", "__do_sys" or simple "sys" prefix. */ return !strcmp(sym + 3, name + 3) || (!strncmp(sym, "__x64_", 6) && !strcmp(sym + 9, name + 3)) || - (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3)); + (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3)) || + (!strncmp(sym, "__do_sys", 8) && !strcmp(sym + 8, name + 3)); } #ifndef COMPILE_OFFSETS From c84cb3735fd53c91101ccdb191f2e3331a9262cb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Apr 2020 16:55:57 +0200 Subject: [PATCH 140/300] x86/apic: Move TSC deadline timer debug printk Leon reported that the printk_once() in __setup_APIC_LVTT() triggers a lockdep splat due to a lock order violation between hrtimer_base::lock and console_sem, when the 'once' condition is reset via /sys/kernel/debug/clear_warn_once after boot. The initial printk cannot trigger this because that happens during boot when the local APIC timer is set up on the boot CPU. Prevent it by moving the printk to a place which is guaranteed to be only called once during boot. Mark the deadline timer check related functions and data __init while at it. Reported-by: Leon Romanovsky Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/87y2qhoshi.fsf@nanos.tec.linutronix.de --- arch/x86/kernel/apic/apic.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 81b9c63dae1b..e53dda210cd7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -352,8 +352,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) * According to Intel, MFENCE can do the serialization here. */ asm volatile("mfence" : : : "memory"); - - printk_once(KERN_DEBUG "TSC deadline timer enabled\n"); return; } @@ -546,7 +544,7 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -static u32 hsx_deadline_rev(void) +static __init u32 hsx_deadline_rev(void) { switch (boot_cpu_data.x86_stepping) { case 0x02: return 0x3a; /* EP */ @@ -556,7 +554,7 @@ static u32 hsx_deadline_rev(void) return ~0U; } -static u32 bdx_deadline_rev(void) +static __init u32 bdx_deadline_rev(void) { switch (boot_cpu_data.x86_stepping) { case 0x02: return 0x00000011; @@ -568,7 +566,7 @@ static u32 bdx_deadline_rev(void) return ~0U; } -static u32 skx_deadline_rev(void) +static __init u32 skx_deadline_rev(void) { switch (boot_cpu_data.x86_stepping) { case 0x03: return 0x01000136; @@ -581,7 +579,7 @@ static u32 skx_deadline_rev(void) return ~0U; } -static const struct x86_cpu_id deadline_match[] = { +static const struct x86_cpu_id deadline_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL( HASWELL_X, &hsx_deadline_rev), X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_D, &bdx_deadline_rev), @@ -603,18 +601,19 @@ static const struct x86_cpu_id deadline_match[] = { {}, }; -static void apic_check_deadline_errata(void) +static __init bool apic_validate_deadline_timer(void) { const struct x86_cpu_id *m; u32 rev; - if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) || - boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return; + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return false; + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return true; m = x86_match_cpu(deadline_match); if (!m) - return; + return true; /* * Function pointers will have the MSB set due to address layout, @@ -626,11 +625,12 @@ static void apic_check_deadline_errata(void) rev = (u32)m->driver_data; if (boot_cpu_data.microcode >= rev) - return; + return true; setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; " "please update microcode to version: 0x%x (or later)\n", rev); + return false; } /* @@ -2092,7 +2092,8 @@ void __init init_apic_mappings(void) { unsigned int new_apicid; - apic_check_deadline_errata(); + if (apic_validate_deadline_timer()) + pr_debug("TSC deadline timer available\n"); if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); From 8f34e53b60b337e559f1ea19e2780ff95ab2fa65 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 1 May 2020 08:53:08 -0600 Subject: [PATCH 141/300] ipv6: Use global sernum for dst validation with nexthop objects Nik reported a bug with pcpu dst cache when nexthop objects are used illustrated by the following: $ ip netns add foo $ ip -netns foo li set lo up $ ip -netns foo addr add 2001:db8:11::1/128 dev lo $ ip netns exec foo sysctl net.ipv6.conf.all.forwarding=1 $ ip li add veth1 type veth peer name veth2 $ ip li set veth1 up $ ip addr add 2001:db8:10::1/64 dev veth1 $ ip li set dev veth2 netns foo $ ip -netns foo li set veth2 up $ ip -netns foo addr add 2001:db8:10::2/64 dev veth2 $ ip -6 nexthop add id 100 via 2001:db8:10::2 dev veth1 $ ip -6 route add 2001:db8:11::1/128 nhid 100 Create a pcpu entry on cpu 0: $ taskset -a -c 0 ip -6 route get 2001:db8:11::1 Re-add the route entry: $ ip -6 ro del 2001:db8:11::1 $ ip -6 route add 2001:db8:11::1/128 nhid 100 Route get on cpu 0 returns the stale pcpu: $ taskset -a -c 0 ip -6 route get 2001:db8:11::1 RTNETLINK answers: Network is unreachable While cpu 1 works: $ taskset -a -c 1 ip -6 route get 2001:db8:11::1 2001:db8:11::1 from :: via 2001:db8:10::2 dev veth1 src 2001:db8:10::1 metric 1024 pref medium Conversion of FIB entries to work with external nexthop objects missed an important difference between IPv4 and IPv6 - how dst entries are invalidated when the FIB changes. IPv4 has a per-network namespace generation id (rt_genid) that is bumped on changes to the FIB. Checking if a dst_entry is still valid means comparing rt_genid in the rtable to the current value of rt_genid for the namespace. IPv6 also has a per network namespace counter, fib6_sernum, but the count is saved per fib6_node. With the per-node counter only dst_entries based on fib entries under the node are invalidated when changes are made to the routes - limiting the scope of invalidations. IPv6 uses a reference in the rt6_info, 'from', to track the corresponding fib entry used to create the dst_entry. When validating a dst_entry, the 'from' is used to backtrack to the fib6_node and check the sernum of it to the cookie passed to the dst_check operation. With the inline format (nexthop definition inline with the fib6_info), dst_entries cached in the fib6_nh have a 1:1 correlation between fib entries, nexthop data and dst_entries. With external nexthops, IPv6 looks more like IPv4 which means multiple fib entries across disparate fib6_nodes can all reference the same fib6_nh. That means validation of dst_entries based on external nexthops needs to use the IPv4 format - the per-network namespace counter. Add sernum to rt6_info and set it when creating a pcpu dst entry. Update rt6_get_cookie to return sernum if it is set and update dst_check for IPv6 to look for sernum set and based the check on it if so. Finally, rt6_get_pcpu_route needs to validate the cached entry before returning a pcpu entry (similar to the rt_cache_valid calls in __mkroute_input and __mkroute_output for IPv4). This problem only affects routes using the new, external nexthops. Thanks to the kbuild test robot for catching the IS_ENABLED needed around rt_genid_ipv6 before I sent this out. Fixes: 5b98324ebe29 ("ipv6: Allow routes to use nexthop objects") Reported-by: Nikolay Aleksandrov Signed-off-by: David Ahern Reviewed-by: Nikolay Aleksandrov Tested-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 4 ++++ include/net/net_namespace.h | 7 +++++++ net/ipv6/route.c | 25 +++++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 80262d2980f5..1d98828c6649 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -203,6 +203,7 @@ struct fib6_info { struct rt6_info { struct dst_entry dst; struct fib6_info __rcu *from; + int sernum; struct rt6key rt6i_dst; struct rt6key rt6i_src; @@ -291,6 +292,9 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) struct fib6_info *from; u32 cookie = 0; + if (rt->sernum) + return rt->sernum; + rcu_read_lock(); from = rcu_dereference(rt->from); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index ab96fb59131c..8e001e049497 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -437,6 +437,13 @@ static inline int rt_genid_ipv4(const struct net *net) return atomic_read(&net->ipv4.rt_genid); } +#if IS_ENABLED(CONFIG_IPV6) +static inline int rt_genid_ipv6(const struct net *net) +{ + return atomic_read(&net->ipv6.fib6_sernum); +} +#endif + static inline void rt_genid_bump_ipv4(struct net *net) { atomic_inc(&net->ipv4.rt_genid); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 310cbddaa533..8d418038fe32 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1385,9 +1385,18 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) } ip6_rt_copy_init(pcpu_rt, res); pcpu_rt->rt6i_flags |= RTF_PCPU; + + if (f6i->nh) + pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev)); + return pcpu_rt; } +static bool rt6_is_valid(const struct rt6_info *rt6) +{ + return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev)); +} + /* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { @@ -1395,6 +1404,19 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); + if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) { + struct rt6_info *prev, **p; + + p = this_cpu_ptr(res->nh->rt6i_pcpu); + prev = xchg(p, NULL); + if (prev) { + dst_dev_put(&prev->dst); + dst_release(&prev->dst); + } + + pcpu_rt = NULL; + } + return pcpu_rt; } @@ -2593,6 +2615,9 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt = container_of(dst, struct rt6_info, dst); + if (rt->sernum) + return rt6_is_valid(rt) ? dst : NULL; + rcu_read_lock(); /* All IPV6 dsts are created with ->obsolete set to the value From 600ac36b5327c949f6754be106e2a08e5d81e3a0 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Tue, 28 Apr 2020 11:03:53 -0500 Subject: [PATCH 142/300] net: phy: DP83822: Fix WoL in config init to be disabled The WoL feature should be disabled when config_init is called and the feature should turned on or off when set_wol is called. In addition updated the calls to modify the registers to use the set_bit and clear_bit function calls. Fixes: 3b427751a9d0 ("net: phy: DP83822 initial driver submission") Signed-off-by: Dan Murphy Signed-off-by: David S. Miller --- drivers/net/phy/dp83822.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index fe9aa3ad52a7..1dd19d0cb269 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -137,19 +137,18 @@ static int dp83822_set_wol(struct phy_device *phydev, value &= ~DP83822_WOL_SECURE_ON; } - value |= (DP83822_WOL_EN | DP83822_WOL_INDICATION_SEL | - DP83822_WOL_CLR_INDICATION); - phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG, - value); - } else { - value = phy_read_mmd(phydev, DP83822_DEVADDR, - MII_DP83822_WOL_CFG); - value &= ~DP83822_WOL_EN; - phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG, - value); - } + /* Clear any pending WoL interrupt */ + phy_read(phydev, MII_DP83822_MISR2); - return 0; + value |= DP83822_WOL_EN | DP83822_WOL_INDICATION_SEL | + DP83822_WOL_CLR_INDICATION; + + return phy_write_mmd(phydev, DP83822_DEVADDR, + MII_DP83822_WOL_CFG, value); + } else { + return phy_clear_bits_mmd(phydev, DP83822_DEVADDR, + MII_DP83822_WOL_CFG, DP83822_WOL_EN); + } } static void dp83822_get_wol(struct phy_device *phydev, @@ -258,12 +257,11 @@ static int dp83822_config_intr(struct phy_device *phydev) static int dp83822_config_init(struct phy_device *phydev) { - int value; + int value = DP83822_WOL_EN | DP83822_WOL_MAGIC_EN | + DP83822_WOL_SECURE_ON; - value = DP83822_WOL_MAGIC_EN | DP83822_WOL_SECURE_ON | DP83822_WOL_EN; - - return phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG, - value); + return phy_clear_bits_mmd(phydev, DP83822_DEVADDR, + MII_DP83822_WOL_CFG, value); } static int dp83822_phy_reset(struct phy_device *phydev) From 6c599044b0c1c6668c5132ec86e11f3d06816ab8 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Tue, 28 Apr 2020 11:03:54 -0500 Subject: [PATCH 143/300] net: phy: DP83TC811: Fix WoL in config init to be disabled The WoL feature should be disabled when config_init is called and the feature should turned on or off when set_wol is called. In addition updated the calls to modify the registers to use the set_bit and clear_bit function calls. Fixes: 6d749428788b ("net: phy: DP83TC811: Introduce support for the DP83TC811 phy") Signed-off-by: Dan Murphy Signed-off-by: David S. Miller --- drivers/net/phy/dp83tc811.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/net/phy/dp83tc811.c b/drivers/net/phy/dp83tc811.c index 06f08832ebcd..d73725312c7c 100644 --- a/drivers/net/phy/dp83tc811.c +++ b/drivers/net/phy/dp83tc811.c @@ -139,16 +139,19 @@ static int dp83811_set_wol(struct phy_device *phydev, value &= ~DP83811_WOL_SECURE_ON; } - value |= (DP83811_WOL_EN | DP83811_WOL_INDICATION_SEL | - DP83811_WOL_CLR_INDICATION); - phy_write_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_CFG, - value); + /* Clear any pending WoL interrupt */ + phy_read(phydev, MII_DP83811_INT_STAT1); + + value |= DP83811_WOL_EN | DP83811_WOL_INDICATION_SEL | + DP83811_WOL_CLR_INDICATION; + + return phy_write_mmd(phydev, DP83811_DEVADDR, + MII_DP83811_WOL_CFG, value); } else { - phy_clear_bits_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_CFG, - DP83811_WOL_EN); + return phy_clear_bits_mmd(phydev, DP83811_DEVADDR, + MII_DP83811_WOL_CFG, DP83811_WOL_EN); } - return 0; } static void dp83811_get_wol(struct phy_device *phydev, @@ -292,8 +295,8 @@ static int dp83811_config_init(struct phy_device *phydev) value = DP83811_WOL_MAGIC_EN | DP83811_WOL_SECURE_ON | DP83811_WOL_EN; - return phy_write_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_CFG, - value); + return phy_clear_bits_mmd(phydev, DP83811_DEVADDR, MII_DP83811_WOL_CFG, + value); } static int dp83811_phy_reset(struct phy_device *phydev) From 7fdc66debebc6a7170a37c8c9b0d9585a9788fb4 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 28 Apr 2020 10:54:56 -0700 Subject: [PATCH 144/300] hv_netvsc: Fix netvsc_start_xmit's return type netvsc_start_xmit is used as a callback function for the ndo_start_xmit function pointer. ndo_start_xmit's return type is netdev_tx_t but netvsc_start_xmit's return type is int. This causes a failure with Control Flow Integrity (CFI), which requires function pointer prototypes and callback function definitions to match exactly. When CFI is in enforcing, the kernel panics. When booting a CFI kernel with WSL 2, the VM is immediately terminated because of this. The splat when CONFIG_CFI_PERMISSIVE is used: [ 5.916765] CFI failure (target: netvsc_start_xmit+0x0/0x10): [ 5.916771] WARNING: CPU: 8 PID: 0 at kernel/cfi.c:29 __cfi_check_fail+0x2e/0x40 [ 5.916772] Modules linked in: [ 5.916774] CPU: 8 PID: 0 Comm: swapper/8 Not tainted 5.7.0-rc3-next-20200424-microsoft-cbl-00001-ged4eb37d2c69-dirty #1 [ 5.916776] RIP: 0010:__cfi_check_fail+0x2e/0x40 [ 5.916777] Code: 48 c7 c7 70 98 63 a9 48 c7 c6 11 db 47 a9 e8 69 55 59 00 85 c0 75 02 5b c3 48 c7 c7 73 c6 43 a9 48 89 de 31 c0 e8 12 2d f0 ff <0f> 0b 5b c3 00 00 cc cc 00 00 cc cc 00 00 cc cc 00 00 85 f6 74 25 [ 5.916778] RSP: 0018:ffffa803c0260b78 EFLAGS: 00010246 [ 5.916779] RAX: 712a1af25779e900 RBX: ffffffffa8cf7950 RCX: ffffffffa962cf08 [ 5.916779] RDX: ffffffffa9c36b60 RSI: 0000000000000082 RDI: ffffffffa9c36b5c [ 5.916780] RBP: ffff8ffc4779c2c0 R08: 0000000000000001 R09: ffffffffa9c3c300 [ 5.916781] R10: 0000000000000151 R11: ffffffffa9c36b60 R12: ffff8ffe39084000 [ 5.916782] R13: ffffffffa8cf7950 R14: ffffffffa8d12cb0 R15: ffff8ffe39320140 [ 5.916784] FS: 0000000000000000(0000) GS:ffff8ffe3bc00000(0000) knlGS:0000000000000000 [ 5.916785] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 5.916786] CR2: 00007ffef5749408 CR3: 00000002f4f5e000 CR4: 0000000000340ea0 [ 5.916787] Call Trace: [ 5.916788] [ 5.916790] __cfi_check+0x3ab58/0x450e0 [ 5.916793] ? dev_hard_start_xmit+0x11f/0x160 [ 5.916795] ? sch_direct_xmit+0xf2/0x230 [ 5.916796] ? __dev_queue_xmit.llvm.11471227737707190958+0x69d/0x8e0 [ 5.916797] ? neigh_resolve_output+0xdf/0x220 [ 5.916799] ? neigh_connected_output.cfi_jt+0x8/0x8 [ 5.916801] ? ip6_finish_output2+0x398/0x4c0 [ 5.916803] ? nf_nat_ipv6_out+0x10/0xa0 [ 5.916804] ? nf_hook_slow+0x84/0x100 [ 5.916807] ? ip6_input_finish+0x8/0x8 [ 5.916807] ? ip6_output+0x6f/0x110 [ 5.916808] ? __ip6_local_out.cfi_jt+0x8/0x8 [ 5.916810] ? mld_sendpack+0x28e/0x330 [ 5.916811] ? ip_rt_bug+0x8/0x8 [ 5.916813] ? mld_ifc_timer_expire+0x2db/0x400 [ 5.916814] ? neigh_proxy_process+0x8/0x8 [ 5.916816] ? call_timer_fn+0x3d/0xd0 [ 5.916817] ? __run_timers+0x2a9/0x300 [ 5.916819] ? rcu_core_si+0x8/0x8 [ 5.916820] ? run_timer_softirq+0x14/0x30 [ 5.916821] ? __do_softirq+0x154/0x262 [ 5.916822] ? native_x2apic_icr_write+0x8/0x8 [ 5.916824] ? irq_exit+0xba/0xc0 [ 5.916825] ? hv_stimer0_vector_handler+0x99/0xe0 [ 5.916826] ? hv_stimer0_callback_vector+0xf/0x20 [ 5.916826] [ 5.916828] ? hv_stimer_global_cleanup.cfi_jt+0x8/0x8 [ 5.916829] ? raw_setsockopt+0x8/0x8 [ 5.916830] ? default_idle+0xe/0x10 [ 5.916832] ? do_idle.llvm.10446269078108580492+0xb7/0x130 [ 5.916833] ? raw_setsockopt+0x8/0x8 [ 5.916833] ? cpu_startup_entry+0x15/0x20 [ 5.916835] ? cpu_hotplug_enable.cfi_jt+0x8/0x8 [ 5.916836] ? start_secondary+0x188/0x190 [ 5.916837] ? secondary_startup_64+0xa5/0xb0 [ 5.916838] ---[ end trace f2683fa869597ba5 ]--- Avoid this by using the right return type for netvsc_start_xmit. Fixes: fceaf24a943d8 ("Staging: hv: add the Hyper-V virtual network driver") Link: https://github.com/ClangBuiltLinux/linux/issues/1009 Signed-off-by: Nathan Chancellor Reviewed-by: Haiyang Zhang Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d8e86bdbfba1..ebcfbae05690 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -707,7 +707,8 @@ no_memory: goto drop; } -static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *ndev) +static netdev_tx_t netvsc_start_xmit(struct sk_buff *skb, + struct net_device *ndev) { return netvsc_xmit(skb, ndev, false); } From 610a9346c138b9c2c93d38bf5f3728e74ae9cbd5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 28 Apr 2020 19:01:58 -0700 Subject: [PATCH 145/300] devlink: fix return value after hitting end in region read Commit d5b90e99e1d5 ("devlink: report 0 after hitting end in region read") fixed region dump, but region read still returns a spurious error: $ devlink region read netdevsim/netdevsim1/dummy snapshot 0 addr 0 len 128 0000000000000000 a6 f4 c4 1c 21 35 95 a6 9d 34 c3 5b 87 5b 35 79 0000000000000010 f3 a0 d7 ee 4f 2f 82 7f c6 dd c4 f6 a5 c3 1b ae 0000000000000020 a4 fd c8 62 07 59 48 03 70 3b c7 09 86 88 7f 68 0000000000000030 6f 45 5d 6d 7d 0e 16 38 a9 d0 7a 4b 1e 1e 2e a6 0000000000000040 e6 1d ae 06 d6 18 00 85 ca 62 e8 7e 11 7e f6 0f 0000000000000050 79 7e f7 0f f3 94 68 bd e6 40 22 85 b6 be 6f b1 0000000000000060 af db ef 5e 34 f0 98 4b 62 9a e3 1b 8b 93 fc 17 devlink answers: Invalid argument 0000000000000070 61 e8 11 11 66 10 a5 f7 b1 ea 8d 40 60 53 ed 12 This is a minimal fix, I'll follow up with a restructuring so we don't have two checks for the same condition. Fixes: fdd41ec21e15 ("devlink: Return right error code in case of errors for region read") Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/core/devlink.c b/net/core/devlink.c index 80f97722f31f..1ec2e9fd8898 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4283,6 +4283,11 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); dump = false; + + if (start_offset == end_offset) { + err = 0; + goto nla_put_failure; + } } err = devlink_nl_region_read_snapshot_fill(skb, devlink, From b6d49cab44b567b3e0a5544b3d61e516a7355fad Mon Sep 17 00:00:00 2001 From: Clay McClure Date: Wed, 29 Apr 2020 00:59:00 -0700 Subject: [PATCH 146/300] net: Make PTP-specific drivers depend on PTP_1588_CLOCK Commit d1cbfd771ce8 ("ptp_clock: Allow for it to be optional") changed all PTP-capable Ethernet drivers from `select PTP_1588_CLOCK` to `imply PTP_1588_CLOCK`, "in order to break the hard dependency between the PTP clock subsystem and ethernet drivers capable of being clock providers." As a result it is possible to build PTP-capable Ethernet drivers without the PTP subsystem by deselecting PTP_1588_CLOCK. Drivers are required to handle the missing dependency gracefully. Some PTP-capable Ethernet drivers (e.g., TI_CPSW) factor their PTP code out into separate drivers (e.g., TI_CPTS_MOD). The above commit also changed these PTP-specific drivers to `imply PTP_1588_CLOCK`, making it possible to build them without the PTP subsystem. But as Grygorii Strashko noted in [1]: On Wed, Apr 22, 2020 at 02:16:11PM +0300, Grygorii Strashko wrote: > Another question is that CPTS completely nonfunctional in this case and > it was never expected that somebody will even try to use/run such > configuration (except for random build purposes). In my view, enabling a PTP-specific driver without the PTP subsystem is a configuration error made possible by the above commit. Kconfig should not allow users to create a configuration with missing dependencies that results in "completely nonfunctional" drivers. I audited all network drivers that call ptp_clock_register() but merely `imply PTP_1588_CLOCK` and found five PTP-specific drivers that are likely nonfunctional without PTP_1588_CLOCK: NET_DSA_MV88E6XXX_PTP NET_DSA_SJA1105_PTP MACB_USE_HWSTAMP CAVIUM_PTP TI_CPTS_MOD Note how these symbols all reference PTP or timestamping in their name; this is a clue that they depend on PTP_1588_CLOCK. Change them from `imply PTP_1588_CLOCK` [2] to `depends on PTP_1588_CLOCK`. I'm not using `select PTP_1588_CLOCK` here because PTP_1588_CLOCK has its own dependencies, which `select` would not transitively apply. Additionally, remove the `select NET_PTP_CLASSIFY` from CPTS_TI_MOD; PTP_1588_CLOCK already selects that. [1]: https://lore.kernel.org/lkml/c04458ed-29ee-1797-3a11-7f3f560553e6@ti.com/ [2]: NET_DSA_SJA1105_PTP had never declared any type of dependency on PTP_1588_CLOCK (`imply` or otherwise); adding a `depends on PTP_1588_CLOCK` here seems appropriate. Cc: Arnd Bergmann Cc: Richard Cochran Cc: Nicolas Pitre Cc: Grygorii Strashko Cc: Geert Uytterhoeven Fixes: d1cbfd771ce8 ("ptp_clock: Allow for it to be optional") Signed-off-by: Clay McClure Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/Kconfig | 2 +- drivers/net/dsa/sja1105/Kconfig | 1 + drivers/net/ethernet/cadence/Kconfig | 2 +- drivers/net/ethernet/cavium/Kconfig | 2 +- drivers/net/ethernet/ti/Kconfig | 3 +-- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/Kconfig b/drivers/net/dsa/mv88e6xxx/Kconfig index 6435020d690d..51185e4d7d15 100644 --- a/drivers/net/dsa/mv88e6xxx/Kconfig +++ b/drivers/net/dsa/mv88e6xxx/Kconfig @@ -24,8 +24,8 @@ config NET_DSA_MV88E6XXX_PTP bool "PTP support for Marvell 88E6xxx" default n depends on NET_DSA_MV88E6XXX_GLOBAL2 + depends on PTP_1588_CLOCK imply NETWORK_PHY_TIMESTAMPING - imply PTP_1588_CLOCK help Say Y to enable PTP hardware timestamping on Marvell 88E6xxx switch chips that support it. diff --git a/drivers/net/dsa/sja1105/Kconfig b/drivers/net/dsa/sja1105/Kconfig index 0fe1ae173aa1..68c3086af9af 100644 --- a/drivers/net/dsa/sja1105/Kconfig +++ b/drivers/net/dsa/sja1105/Kconfig @@ -20,6 +20,7 @@ tristate "NXP SJA1105 Ethernet switch family support" config NET_DSA_SJA1105_PTP bool "Support for the PTP clock on the NXP SJA1105 Ethernet switch" depends on NET_DSA_SJA1105 + depends on PTP_1588_CLOCK help This enables support for timestamping and PTP clock manipulations in the SJA1105 DSA driver. diff --git a/drivers/net/ethernet/cadence/Kconfig b/drivers/net/ethernet/cadence/Kconfig index 53b50c24d9c9..2c4c12b03502 100644 --- a/drivers/net/ethernet/cadence/Kconfig +++ b/drivers/net/ethernet/cadence/Kconfig @@ -35,8 +35,8 @@ config MACB config MACB_USE_HWSTAMP bool "Use IEEE 1588 hwstamp" depends on MACB + depends on PTP_1588_CLOCK default y - imply PTP_1588_CLOCK ---help--- Enable IEEE 1588 Precision Time Protocol (PTP) support for MACB. diff --git a/drivers/net/ethernet/cavium/Kconfig b/drivers/net/ethernet/cavium/Kconfig index 6a700d34019e..4520e7ee00fe 100644 --- a/drivers/net/ethernet/cavium/Kconfig +++ b/drivers/net/ethernet/cavium/Kconfig @@ -54,7 +54,7 @@ config THUNDER_NIC_RGX config CAVIUM_PTP tristate "Cavium PTP coprocessor as PTP clock" depends on 64BIT && PCI - imply PTP_1588_CLOCK + depends on PTP_1588_CLOCK ---help--- This driver adds support for the Precision Time Protocol Clocks and Timestamping coprocessor (PTP) found on Cavium processors. diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig index 89cec778cf2d..8e348780efb6 100644 --- a/drivers/net/ethernet/ti/Kconfig +++ b/drivers/net/ethernet/ti/Kconfig @@ -90,9 +90,8 @@ config TI_CPTS config TI_CPTS_MOD tristate depends on TI_CPTS + depends on PTP_1588_CLOCK default y if TI_CPSW=y || TI_KEYSTONE_NETCP=y || TI_CPSW_SWITCHDEV=y - select NET_PTP_CLASSIFY - imply PTP_1588_CLOCK default m config TI_K3_AM65_CPSW_NUSS From 709e7158f0991ec26b2819ef944c276a19b910db Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 29 Apr 2020 13:59:50 -0700 Subject: [PATCH 147/300] ice: cleanup language in ice.rst for fw.app The documentation for the ice driver around "fw.app" has a spelling mistake in variation. Additionally, the language of "shall have a unique name" sounds like a requirement. Reword this to read more like a description or property. Reported-by: Benjamin Fisher Signed-off-by: Jacob Keller Acked-by: Jakub Kicinski Acked-by: Jeff Kirsher Signed-off-by: David S. Miller --- Documentation/networking/devlink/ice.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/devlink/ice.rst b/Documentation/networking/devlink/ice.rst index 5b58fc4e1268..4574352d6ff4 100644 --- a/Documentation/networking/devlink/ice.rst +++ b/Documentation/networking/devlink/ice.rst @@ -61,8 +61,8 @@ The ``ice`` driver reports the following versions - running - ICE OS Default Package - The name of the DDP package that is active in the device. The DDP - package is loaded by the driver during initialization. Each varation - of DDP package shall have a unique name. + package is loaded by the driver during initialization. Each + variation of the DDP package has a unique name. * - ``fw.app`` - running - 1.3.1.0 From cae9566acb1a4533ca85b63f7ac0cc7ebe4a0c30 Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Wed, 29 Apr 2020 23:57:22 +0100 Subject: [PATCH 148/300] cxgb4: Add missing annotation for service_ofldq() Sparse reports a warning at service_ofldq() warning: context imbalance in service_ofldq() - unexpected unlock The root cause is the missing annotation at service_ofldq() Add the missing __must_hold(&q->sendq.lock) annotation Signed-off-by: Jules Irenge Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/sge.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 2cfb1f691bf2..6516c45864b3 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -2728,6 +2728,7 @@ static void ofldtxq_stop(struct sge_uld_txq *q, struct fw_wr_hdr *wr) * is ever running at a time ... */ static void service_ofldq(struct sge_uld_txq *q) + __must_hold(&q->sendq.lock) { u64 *pos, *before, *end; int credits; From 846c68f7f1ac82c797a2f1db3344a2966c0fe2e1 Mon Sep 17 00:00:00 2001 From: Yoshiyuki Kurauchi Date: Thu, 30 Apr 2020 14:01:36 +0900 Subject: [PATCH 149/300] gtp: set NLM_F_MULTI flag in gtp_genl_dump_pdp() In drivers/net/gtp.c, gtp_genl_dump_pdp() should set NLM_F_MULTI flag since it returns multipart message. This patch adds a new arg "flags" in gtp_genl_fill_info() so that flags can be set by the callers. Signed-off-by: Yoshiyuki Kurauchi Signed-off-by: David S. Miller --- drivers/net/gtp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 672cd2caf2fb..21640a035d7d 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1169,11 +1169,11 @@ out_unlock: static struct genl_family gtp_genl_family; static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, - u32 type, struct pdp_ctx *pctx) + int flags, u32 type, struct pdp_ctx *pctx) { void *genlh; - genlh = genlmsg_put(skb, snd_portid, snd_seq, >p_genl_family, 0, + genlh = genlmsg_put(skb, snd_portid, snd_seq, >p_genl_family, flags, type); if (genlh == NULL) goto nlmsg_failure; @@ -1227,8 +1227,8 @@ static int gtp_genl_get_pdp(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } - err = gtp_genl_fill_info(skb2, NETLINK_CB(skb).portid, - info->snd_seq, info->nlhdr->nlmsg_type, pctx); + err = gtp_genl_fill_info(skb2, NETLINK_CB(skb).portid, info->snd_seq, + 0, info->nlhdr->nlmsg_type, pctx); if (err < 0) goto err_unlock_free; @@ -1271,6 +1271,7 @@ static int gtp_genl_dump_pdp(struct sk_buff *skb, gtp_genl_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + NLM_F_MULTI, cb->nlh->nlmsg_type, pctx)) { cb->args[0] = i; cb->args[1] = j; From dc30b4059f6e2abf3712ab537c8718562b21c45d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 30 Apr 2020 23:30:49 +0200 Subject: [PATCH 150/300] drop_monitor: work around gcc-10 stringop-overflow warning The current gcc-10 snapshot produces a false-positive warning: net/core/drop_monitor.c: In function 'trace_drop_common.constprop': cc1: error: writing 8 bytes into a region of size 0 [-Werror=stringop-overflow=] In file included from net/core/drop_monitor.c:23: include/uapi/linux/net_dropmon.h:36:8: note: at offset 0 to object 'entries' with size 4 declared here 36 | __u32 entries; | ^~~~~~~ I reported this in the gcc bugzilla, but in case it does not get fixed in the release, work around it by using a temporary variable. Fixes: 9a8afc8d3962 ("Network Drop Monitor: Adding drop monitor implementation & Netlink protocol") Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94881 Signed-off-by: Arnd Bergmann Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 8e33cec9fc4e..2ee7bc4c9e03 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -213,6 +213,7 @@ static void sched_send_work(struct timer_list *t) static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; + struct net_dm_drop_point *point; struct nlmsghdr *nlh; struct nlattr *nla; int i; @@ -231,11 +232,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location) nlh = (struct nlmsghdr *)dskb->data; nla = genlmsg_data(nlmsg_data(nlh)); msg = nla_data(nla); + point = msg->points; for (i = 0; i < msg->entries; i++) { - if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) { - msg->points[i].count++; + if (!memcmp(&location, &point->pc, sizeof(void *))) { + point->count++; goto out; } + point++; } if (msg->entries == dm_hit_limit) goto out; @@ -244,8 +247,8 @@ static void trace_drop_common(struct sk_buff *skb, void *location) */ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point)); nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); - memcpy(msg->points[msg->entries].pc, &location, sizeof(void *)); - msg->points[msg->entries].count = 1; + memcpy(point->pc, &location, sizeof(void *)); + point->count = 1; msg->entries++; if (!timer_pending(&data->send_timer)) { From 90b5feb8c4bebc76c27fcaf3e1a0e5ca2d319e9e Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 30 Apr 2020 15:04:42 +0100 Subject: [PATCH 151/300] virtio-blk: handle block_device_operations callbacks after hot unplug A userspace process holding a file descriptor to a virtio_blk device can still invoke block_device_operations after hot unplug. This leads to a use-after-free accessing vblk->vdev in virtblk_getgeo() when ioctl(HDIO_GETGEO) is invoked: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 IP: [] virtio_check_driver_offered_feature+0x10/0x90 [virtio] PGD 800000003a92f067 PUD 3a930067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 1310 Comm: hdio-getgeo Tainted: G OE ------------ 3.10.0-1062.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 task: ffff9be5fbfb8000 ti: ffff9be5fa890000 task.ti: ffff9be5fa890000 RIP: 0010:[] [] virtio_check_driver_offered_feature+0x10/0x90 [virtio] RSP: 0018:ffff9be5fa893dc8 EFLAGS: 00010246 RAX: ffff9be5fc3f3400 RBX: ffff9be5fa893e30 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffff9be5fbc10b40 RBP: ffff9be5fa893dc8 R08: 0000000000000301 R09: 0000000000000301 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9be5fdc24680 R13: ffff9be5fbc10b40 R14: ffff9be5fbc10480 R15: 0000000000000000 FS: 00007f1bfb968740(0000) GS:ffff9be5ffc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000090 CR3: 000000003a894000 CR4: 0000000000360ff0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [] virtblk_getgeo+0x47/0x110 [virtio_blk] [] ? handle_mm_fault+0x39d/0x9b0 [] blkdev_ioctl+0x1f5/0xa20 [] block_ioctl+0x41/0x50 [] do_vfs_ioctl+0x3a0/0x5a0 [] SyS_ioctl+0xa1/0xc0 A related problem is that virtblk_remove() leaks the vd_index_ida index when something still holds a reference to vblk->disk during hot unplug. This causes virtio-blk device names to be lost (vda, vdb, etc). Fix these issues by protecting vblk->vdev with a mutex and reference counting vblk so the vd_index_ida index can be removed in all cases. Fixes: 48e4043d4529 ("virtio: add virtio disk geometry feature") Reported-by: Lance Digby Signed-off-by: Stefan Hajnoczi Link: https://lore.kernel.org/r/20200430140442.171016-1-stefanha@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- drivers/block/virtio_blk.c | 86 ++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 8 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 93468b7c6701..9d21bf0f155e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -33,6 +33,15 @@ struct virtio_blk_vq { } ____cacheline_aligned_in_smp; struct virtio_blk { + /* + * This mutex must be held by anything that may run after + * virtblk_remove() sets vblk->vdev to NULL. + * + * blk-mq, virtqueue processing, and sysfs attribute code paths are + * shut down before vblk->vdev is set to NULL and therefore do not need + * to hold this mutex. + */ + struct mutex vdev_mutex; struct virtio_device *vdev; /* The disk structure for the kernel. */ @@ -44,6 +53,13 @@ struct virtio_blk { /* Process context for config space updates */ struct work_struct config_work; + /* + * Tracks references from block_device_operations open/release and + * virtio_driver probe/remove so this object can be freed once no + * longer in use. + */ + refcount_t refs; + /* What host tells us, plus 2 for header & tailer. */ unsigned int sg_elems; @@ -295,10 +311,55 @@ out: return err; } +static void virtblk_get(struct virtio_blk *vblk) +{ + refcount_inc(&vblk->refs); +} + +static void virtblk_put(struct virtio_blk *vblk) +{ + if (refcount_dec_and_test(&vblk->refs)) { + ida_simple_remove(&vd_index_ida, vblk->index); + mutex_destroy(&vblk->vdev_mutex); + kfree(vblk); + } +} + +static int virtblk_open(struct block_device *bd, fmode_t mode) +{ + struct virtio_blk *vblk = bd->bd_disk->private_data; + int ret = 0; + + mutex_lock(&vblk->vdev_mutex); + + if (vblk->vdev) + virtblk_get(vblk); + else + ret = -ENXIO; + + mutex_unlock(&vblk->vdev_mutex); + return ret; +} + +static void virtblk_release(struct gendisk *disk, fmode_t mode) +{ + struct virtio_blk *vblk = disk->private_data; + + virtblk_put(vblk); +} + /* We provide getgeo only to please some old bootloader/partitioning tools */ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) { struct virtio_blk *vblk = bd->bd_disk->private_data; + int ret = 0; + + mutex_lock(&vblk->vdev_mutex); + + if (!vblk->vdev) { + ret = -ENXIO; + goto out; + } /* see if the host passed in geometry config */ if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) { @@ -314,11 +375,15 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) geo->sectors = 1 << 5; geo->cylinders = get_capacity(bd->bd_disk) >> 11; } - return 0; +out: + mutex_unlock(&vblk->vdev_mutex); + return ret; } static const struct block_device_operations virtblk_fops = { .owner = THIS_MODULE, + .open = virtblk_open, + .release = virtblk_release, .getgeo = virtblk_getgeo, }; @@ -655,6 +720,10 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_free_index; } + /* This reference is dropped in virtblk_remove(). */ + refcount_set(&vblk->refs, 1); + mutex_init(&vblk->vdev_mutex); + vblk->vdev = vdev; vblk->sg_elems = sg_elems; @@ -820,8 +889,6 @@ out: static void virtblk_remove(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; - int index = vblk->index; - int refc; /* Make sure no work handler is accessing the device. */ flush_work(&vblk->config_work); @@ -831,18 +898,21 @@ static void virtblk_remove(struct virtio_device *vdev) blk_mq_free_tag_set(&vblk->tag_set); + mutex_lock(&vblk->vdev_mutex); + /* Stop all the virtqueues. */ vdev->config->reset(vdev); - refc = kref_read(&disk_to_dev(vblk->disk)->kobj.kref); + /* Virtqueues are stopped, nothing can use vblk->vdev anymore. */ + vblk->vdev = NULL; + put_disk(vblk->disk); vdev->config->del_vqs(vdev); kfree(vblk->vqs); - kfree(vblk); - /* Only free device id if we don't have any users */ - if (refc == 1) - ida_simple_remove(&vd_index_ida, index); + mutex_unlock(&vblk->vdev_mutex); + + virtblk_put(vblk); } #ifdef CONFIG_PM_SLEEP From 0b841030625cde5f784dd62aec72d6a766faae70 Mon Sep 17 00:00:00 2001 From: Jia He Date: Fri, 1 May 2020 12:38:40 +0800 Subject: [PATCH 152/300] vhost: vsock: kick send_pkt worker once device is started Ning Bo reported an abnormal 2-second gap when booting Kata container [1]. The unconditional timeout was caused by VSOCK_DEFAULT_CONNECT_TIMEOUT of connecting from the client side. The vhost vsock client tries to connect an initializing virtio vsock server. The abnormal flow looks like: host-userspace vhost vsock guest vsock ============== =========== ============ connect() --------> vhost_transport_send_pkt_work() initializing | vq->private_data==NULL | will not be queued V schedule_timeout(2s) vhost_vsock_start() <--------- device ready set vq->private_data wait for 2s and failed connect() again vq->private_data!=NULL recv connecting pkt Details: 1. Host userspace sends a connect pkt, at that time, guest vsock is under initializing, hence the vhost_vsock_start has not been called. So vq->private_data==NULL, and the pkt is not been queued to send to guest 2. Then it sleeps for 2s 3. After guest vsock finishes initializing, vq->private_data is set 4. When host userspace wakes up after 2s, send connecting pkt again, everything is fine. As suggested by Stefano Garzarella, this fixes it by additional kicking the send_pkt worker in vhost_vsock_start once the virtio device is started. This makes the pending pkt sent again. After this patch, kata-runtime (with vsock enabled) boot time is reduced from 3s to 1s on a ThunderX2 arm64 server. [1] https://github.com/kata-containers/runtime/issues/1917 Reported-by: Ning Bo Suggested-by: Stefano Garzarella Signed-off-by: Jia He Link: https://lore.kernel.org/r/20200501043840.186557-1-justin.he@arm.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- drivers/vhost/vsock.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index e36aaf9ba7bd..0716a9cdffee 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -543,6 +543,11 @@ static int vhost_vsock_start(struct vhost_vsock *vsock) mutex_unlock(&vq->mutex); } + /* Some packets may have been queued before the device was started, + * let's kick the send worker to send them. + */ + vhost_work_queue(&vsock->dev, &vsock->send_pkt_work); + mutex_unlock(&vsock->dev.mutex); return 0; From fb9cbbc895eb6e986dc90c928a35c793d75f435a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 28 Apr 2020 02:16:40 -0500 Subject: [PATCH 153/300] x86/unwind/orc: Move ORC sorting variables under !CONFIG_MODULES Fix the following warnings seen with !CONFIG_MODULES: arch/x86/kernel/unwind_orc.c:29:26: warning: 'cur_orc_table' defined but not used [-Wunused-variable] 29 | static struct orc_entry *cur_orc_table = __start_orc_unwind; | ^~~~~~~~~~~~~ arch/x86/kernel/unwind_orc.c:28:13: warning: 'cur_orc_ip_table' defined but not used [-Wunused-variable] 28 | static int *cur_orc_ip_table = __start_orc_unwind_ip; | ^~~~~~~~~~~~~~~~ Fixes: 153eb2223c79 ("x86/unwind/orc: Convert global variables to static") Reported-by: Stephen Rothwell Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: H. Peter Anvin Cc: Linux Next Mailing List Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20200428071640.psn5m7eh3zt2in4v@treble --- arch/x86/kernel/unwind_orc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 0ebc11a8bb45..5b0bd8581fe6 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -24,10 +24,6 @@ extern struct orc_entry __stop_orc_unwind[]; static bool orc_init __ro_after_init; static unsigned int lookup_num_blocks __ro_after_init; -static DEFINE_MUTEX(sort_mutex); -static int *cur_orc_ip_table = __start_orc_unwind_ip; -static struct orc_entry *cur_orc_table = __start_orc_unwind; - static inline unsigned long orc_ip(const int *ip) { return (unsigned long)ip + *ip; @@ -192,6 +188,10 @@ static struct orc_entry *orc_find(unsigned long ip) #ifdef CONFIG_MODULES +static DEFINE_MUTEX(sort_mutex); +static int *cur_orc_ip_table = __start_orc_unwind_ip; +static struct orc_entry *cur_orc_table = __start_orc_unwind; + static void orc_sort_swap(void *_a, void *_b, int size) { struct orc_entry *orc_a, *orc_b; From 2761121af87de45951989a0adada917837d8fa82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2020 20:09:25 -0700 Subject: [PATCH 154/300] net_sched: sch_skbprio: add message validation to skbprio_change() Do not assume the attribute has the right size. Fixes: aea5f654e6b7 ("net/sched: add skbprio scheduler") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/sched/sch_skbprio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c index 0fb10abf7579..7a5e4c454715 100644 --- a/net/sched/sch_skbprio.c +++ b/net/sched/sch_skbprio.c @@ -169,6 +169,9 @@ static int skbprio_change(struct Qdisc *sch, struct nlattr *opt, { struct tc_skbprio_qopt *ctl = nla_data(opt); + if (opt->nla_len != nla_attr_size(sizeof(*ctl))) + return -EINVAL; + sch->limit = ctl->limit; return 0; } From 57c7f2bd758eed867295c81d3527fff4fab1ed74 Mon Sep 17 00:00:00 2001 From: Matt Jolly Date: Sun, 3 May 2020 01:52:28 +1000 Subject: [PATCH 155/300] net: usb: qmi_wwan: add support for DW5816e MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Dell Wireless 5816e to drivers/net/usb/qmi_wwan.c Signed-off-by: Matt Jolly Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 6c738a271257..4bb8552a00d3 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1359,6 +1359,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x413c, 0x81b3, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */ {QMI_FIXED_INTF(0x413c, 0x81b6, 8)}, /* Dell Wireless 5811e */ {QMI_FIXED_INTF(0x413c, 0x81b6, 10)}, /* Dell Wireless 5811e */ + {QMI_FIXED_INTF(0x413c, 0x81cc, 8)}, /* Dell Wireless 5816e */ {QMI_FIXED_INTF(0x413c, 0x81d7, 0)}, /* Dell Wireless 5821e */ {QMI_FIXED_INTF(0x413c, 0x81d7, 1)}, /* Dell Wireless 5821e preproduction config */ {QMI_FIXED_INTF(0x413c, 0x81e0, 0)}, /* Dell Wireless 5821e with eSIM support*/ From b959c77dac09348955f344104c6a921ebe104753 Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Sun, 3 May 2020 20:32:26 +0800 Subject: [PATCH 156/300] net: macb: fix an issue about leak related system resources A call of the function macb_init() can fail in the function fu540_c000_init. The related system resources were not released then. use devm_platform_ioremap_resource() to replace ioremap() to fix it. Fixes: c218ad559020ff9 ("macb: Add support for SiFive FU540-C000") Cc: Andy Shevchenko Reviewed-by: Yash Shah Suggested-by: Nicolas Ferre Suggested-by: Andy Shevchenko Signed-off-by: Dejin Zheng Acked-by: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index f739d16d29b1..36290a8e2a84 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4178,15 +4178,9 @@ static int fu540_c000_clk_init(struct platform_device *pdev, struct clk **pclk, static int fu540_c000_init(struct platform_device *pdev) { - struct resource *res; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 1); - if (!res) - return -ENODEV; - - mgmt->reg = ioremap(res->start, resource_size(res)); - if (!mgmt->reg) - return -ENOMEM; + mgmt->reg = devm_platform_ioremap_resource(pdev, 1); + if (IS_ERR(mgmt->reg)) + return PTR_ERR(mgmt->reg); return macb_init(pdev); } From c3e2850a9b68a62e7949633102d0351773846136 Mon Sep 17 00:00:00 2001 From: Gurchetan Singh Date: Fri, 1 May 2020 11:55:57 -0700 Subject: [PATCH 157/300] drm/virtio: create context before RESOURCE_CREATE_2D in 3D mode If 3D is enabled, but userspace requests a dumb buffer, we will call CTX_ATTACH_RESOURCE before actually creating the context. Fixes: 72b48ae800da ("drm/virtio: enqueue virtio_gpu_create_context after the first 3D ioctl") Signed-off-by: Gurchetan Singh Link: http://patchwork.freedesktop.org/patch/msgid/20200501185557.740-1-gurchetansingh@chromium.org Signed-off-by: Gerd Hoffmann --- drivers/gpu/drm/virtio/virtgpu_drv.h | 1 + drivers/gpu/drm/virtio/virtgpu_gem.c | 3 +++ drivers/gpu/drm/virtio/virtgpu_ioctl.c | 3 +-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h b/drivers/gpu/drm/virtio/virtgpu_drv.h index c1824bdf2418..7879ff58236f 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drv.h +++ b/drivers/gpu/drm/virtio/virtgpu_drv.h @@ -221,6 +221,7 @@ struct virtio_gpu_fpriv { /* virtio_ioctl.c */ #define DRM_VIRTIO_NUM_IOCTLS 10 extern struct drm_ioctl_desc virtio_gpu_ioctls[DRM_VIRTIO_NUM_IOCTLS]; +void virtio_gpu_create_context(struct drm_device *dev, struct drm_file *file); /* virtio_kms.c */ int virtio_gpu_init(struct drm_device *dev); diff --git a/drivers/gpu/drm/virtio/virtgpu_gem.c b/drivers/gpu/drm/virtio/virtgpu_gem.c index 0d6152c99a27..f0d5a8974677 100644 --- a/drivers/gpu/drm/virtio/virtgpu_gem.c +++ b/drivers/gpu/drm/virtio/virtgpu_gem.c @@ -39,6 +39,9 @@ int virtio_gpu_gem_create(struct drm_file *file, int ret; u32 handle; + if (vgdev->has_virgl_3d) + virtio_gpu_create_context(dev, file); + ret = virtio_gpu_object_create(vgdev, params, &obj, NULL); if (ret < 0) return ret; diff --git a/drivers/gpu/drm/virtio/virtgpu_ioctl.c b/drivers/gpu/drm/virtio/virtgpu_ioctl.c index 336cc9143205..70ba13e8e58a 100644 --- a/drivers/gpu/drm/virtio/virtgpu_ioctl.c +++ b/drivers/gpu/drm/virtio/virtgpu_ioctl.c @@ -33,8 +33,7 @@ #include "virtgpu_drv.h" -static void virtio_gpu_create_context(struct drm_device *dev, - struct drm_file *file) +void virtio_gpu_create_context(struct drm_device *dev, struct drm_file *file) { struct virtio_gpu_device *vgdev = dev->dev_private; struct virtio_gpu_fpriv *vfpriv = file->driver_priv; From 1e189f267015a098bdcb82cc652d13fbf2203fa0 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 2 May 2020 20:18:42 +0200 Subject: [PATCH 158/300] HID: quirks: Add HID_QUIRK_NO_INIT_REPORTS quirk for Dell K12A keyboard-dock Add a HID_QUIRK_NO_INIT_REPORTS quirk for the Dell K12A keyboard-dock, which can be used with various Dell Venue 11 models. Without this quirk the keyboard/touchpad combo works fine when connected at boot, but when hotplugged 9 out of 10 times it will not work properly. Adding the quirk fixes this. Cc: Mario Limonciello Signed-off-by: Hans de Goede Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-quirks.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 7354fa79cb67..1c71a1aa76b2 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -1116,6 +1116,7 @@ #define USB_DEVICE_ID_SYNAPTICS_LTS2 0x1d10 #define USB_DEVICE_ID_SYNAPTICS_HD 0x0ac3 #define USB_DEVICE_ID_SYNAPTICS_QUAD_HD 0x1ac3 +#define USB_DEVICE_ID_SYNAPTICS_DELL_K12A 0x2819 #define USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5_012 0x2968 #define USB_DEVICE_ID_SYNAPTICS_TP_V103 0x5710 #define USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5 0x81a7 diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index ebec818344af..e4cb543de0cd 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -163,6 +163,7 @@ static const struct hid_device_id hid_quirks[] = { { HID_USB_DEVICE(USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_LTS2), HID_QUIRK_NO_INIT_REPORTS }, { HID_USB_DEVICE(USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_QUAD_HD), HID_QUIRK_NO_INIT_REPORTS }, { HID_USB_DEVICE(USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_TP_V103), HID_QUIRK_NO_INIT_REPORTS }, + { HID_USB_DEVICE(USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_DELL_K12A), HID_QUIRK_NO_INIT_REPORTS }, { HID_USB_DEVICE(USB_VENDOR_ID_TOPMAX, USB_DEVICE_ID_TOPMAX_COBRAPAD), HID_QUIRK_BADPAD }, { HID_USB_DEVICE(USB_VENDOR_ID_TOUCHPACK, USB_DEVICE_ID_TOUCHPACK_RTS), HID_QUIRK_MULTI_INPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_TPV, USB_DEVICE_ID_TPV_OPTICAL_TOUCHSCREEN_8882), HID_QUIRK_NOGET }, From 092a9f59bc05904d4555fa012db12e768734ba1a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 30 Apr 2020 18:39:04 -0700 Subject: [PATCH 159/300] Revert "tty: serial: bcm63xx: fix missing clk_put() in bcm63xx_uart" This reverts commit 580d952e44de5509c69c8f9346180ecaa78ebeec ("tty: serial: bcm63xx: fix missing clk_put() in bcm63xx_uart") because we should not be doing a clk_put() if we were not successful in getting a valid clock reference via clk_get() in the first place. Fixes: 580d952e44de ("tty: serial: bcm63xx: fix missing clk_put() in bcm63xx_uart") Signed-off-by: Florian Fainelli Cc: stable Link: https://lore.kernel.org/r/20200501013904.1394-1-f.fainelli@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/bcm63xx_uart.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/tty/serial/bcm63xx_uart.c b/drivers/tty/serial/bcm63xx_uart.c index ed0aa5c0d9b7..5674da2b76f0 100644 --- a/drivers/tty/serial/bcm63xx_uart.c +++ b/drivers/tty/serial/bcm63xx_uart.c @@ -843,10 +843,8 @@ static int bcm_uart_probe(struct platform_device *pdev) if (IS_ERR(clk) && pdev->dev.of_node) clk = of_clk_get(pdev->dev.of_node, 0); - if (IS_ERR(clk)) { - clk_put(clk); + if (IS_ERR(clk)) return -ENODEV; - } port->iotype = UPIO_MEM; port->irq = res_irq->start; From 57d38f26d81e4275748b69372f31df545dcd9b71 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Sat, 2 May 2020 11:01:07 -0400 Subject: [PATCH 160/300] vt: fix unicode console freeing with a common interface By directly using kfree() in different places we risk missing one if it is switched to using vfree(), especially if the corresponding vmalloc() is hidden away within a common abstraction. Oh wait, that's exactly what happened here. So let's fix this by creating a common abstraction for the free case as well. Signed-off-by: Nicolas Pitre Reported-by: syzbot+0bfda3ade1ee9288a1be@syzkaller.appspotmail.com Fixes: 9a98e7a80f95 ("vt: don't use kmalloc() for the unicode screen buffer") Cc: Reviewed-by: Sam Ravnborg Link: https://lore.kernel.org/r/nycvar.YSQ.7.76.2005021043110.2671@knanqh.ubzr Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index e5ffed795e4c..48a8199f7845 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -365,9 +365,14 @@ static struct uni_screen *vc_uniscr_alloc(unsigned int cols, unsigned int rows) return uniscr; } +static void vc_uniscr_free(struct uni_screen *uniscr) +{ + vfree(uniscr); +} + static void vc_uniscr_set(struct vc_data *vc, struct uni_screen *new_uniscr) { - vfree(vc->vc_uni_screen); + vc_uniscr_free(vc->vc_uni_screen); vc->vc_uni_screen = new_uniscr; } @@ -1230,7 +1235,7 @@ static int vc_do_resize(struct tty_struct *tty, struct vc_data *vc, err = resize_screen(vc, new_cols, new_rows, user); if (err) { kfree(newscreen); - kfree(new_uniscr); + vc_uniscr_free(new_uniscr); return err; } From c59359a02d14a7256cd508a4886b7d2012df2363 Mon Sep 17 00:00:00 2001 From: "H. Nikolaus Schaller" Date: Mon, 4 May 2020 08:35:12 +0200 Subject: [PATCH 161/300] drm: ingenic-drm: add MODULE_DEVICE_TABLE so that the driver can load by matching the device tree if compiled as module. Cc: stable@vger.kernel.org # v5.3+ Fixes: 90b86fcc47b4 ("DRM: Add KMS driver for the Ingenic JZ47xx SoCs") Signed-off-by: H. Nikolaus Schaller Signed-off-by: Paul Cercueil Link: https://patchwork.freedesktop.org/patch/msgid/1694a29b7a3449b6b662cec33d1b33f2ee0b174a.1588574111.git.hns@goldelico.com --- drivers/gpu/drm/ingenic/ingenic-drm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/ingenic/ingenic-drm.c b/drivers/gpu/drm/ingenic/ingenic-drm.c index 9dfe7cb530e1..1754c0547069 100644 --- a/drivers/gpu/drm/ingenic/ingenic-drm.c +++ b/drivers/gpu/drm/ingenic/ingenic-drm.c @@ -843,6 +843,7 @@ static const struct of_device_id ingenic_drm_of_match[] = { { .compatible = "ingenic,jz4770-lcd", .data = &jz4770_soc_info }, { /* sentinel */ }, }; +MODULE_DEVICE_TABLE(of, ingenic_drm_of_match); static struct platform_driver ingenic_drm_driver = { .driver = { From 3a3a71f97c30983f1627c2c550d43566e9b634d2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 Apr 2020 23:50:51 +0200 Subject: [PATCH 162/300] sun6i: dsi: fix gcc-4.8 Older compilers warn about initializers with incorrect curly braces: drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c: In function 'sun6i_dsi_encoder_enable': drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c:720:8: error: missing braces around initializer [-Werror=missing-braces] union phy_configure_opts opts = { 0 }; ^ drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c:720:8: error: (near initialization for 'opts.mipi_dphy') [-Werror=missing-braces] Use the GNU empty initializer extension to avoid this. Fixes: bb3b6fcb6849 ("sun6i: dsi: Convert to generic phy handling") Reviewed-by: Paul Kocialkowski Signed-off-by: Arnd Bergmann Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20200428215105.3928459-1-arnd@arndb.de --- drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c b/drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c index 059939789730..3eb89f1eb0e1 100644 --- a/drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c +++ b/drivers/gpu/drm/sun4i/sun6i_mipi_dsi.c @@ -717,7 +717,7 @@ static void sun6i_dsi_encoder_enable(struct drm_encoder *encoder) struct drm_display_mode *mode = &encoder->crtc->state->adjusted_mode; struct sun6i_dsi *dsi = encoder_to_sun6i_dsi(encoder); struct mipi_dsi_device *device = dsi->device; - union phy_configure_opts opts = { 0 }; + union phy_configure_opts opts = { }; struct phy_configure_opts_mipi_dphy *cfg = &opts.mipi_dphy; u16 delay; int err; From d8f1b9716cfd1a1f74c0fedad40c5f65a25aa208 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Sun, 26 Apr 2020 15:54:43 +0800 Subject: [PATCH 163/300] io_uring: fix mismatched finish_wait() calls in io_uring_cancel_files() The prepare_to_wait() and finish_wait() calls in io_uring_cancel_files() are mismatched. Currently I don't see any issues related this bug, just find it by learning codes. Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0b91b0631173..6d52ff98279d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7360,11 +7360,9 @@ static int io_uring_release(struct inode *inode, struct file *file) static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { - struct io_kiocb *req; - DEFINE_WAIT(wait); - while (!list_empty_careful(&ctx->inflight_list)) { - struct io_kiocb *cancel_req = NULL; + struct io_kiocb *cancel_req = NULL, *req; + DEFINE_WAIT(wait); spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { @@ -7404,6 +7402,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, */ if (refcount_sub_and_test(2, &cancel_req->refs)) { io_put_req(cancel_req); + finish_wait(&ctx->inflight_wait, &wait); continue; } } @@ -7411,8 +7410,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, io_wq_cancel_work(ctx->io_wq, &cancel_req->work); io_put_req(cancel_req); schedule(); + finish_wait(&ctx->inflight_wait, &wait); } - finish_wait(&ctx->inflight_wait, &wait); } static int io_uring_flush(struct file *file, void *data) From f9336e3281880b683137bc18f91848ac34af84c3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 4 May 2020 08:35:06 -0700 Subject: [PATCH 164/300] KVM: nVMX: Replace a BUG_ON(1) with BUG() to squash clang warning Use BUG() in the impossible-to-hit default case when switching on the scope of INVEPT to squash a warning with clang 11 due to clang treating the BUG_ON() as conditional. >> arch/x86/kvm/vmx/nested.c:5246:3: warning: variable 'roots_to_free' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized] BUG_ON(1); Reported-by: kbuild test robot Fixes: ce8fe7b77bd8 ("KVM: nVMX: Free only the affected contexts when emulating INVEPT") Signed-off-by: Sean Christopherson Message-Id: <20200504153506.28898-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index fd78ffbde644..e44f33c82332 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5165,7 +5165,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) */ break; default: - BUG_ON(1); + BUG(); break; } From dee919d15dcf70f4ce84b7da9b77bdc1c307454c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 4 May 2020 09:34:10 -0400 Subject: [PATCH 165/300] KVM: SVM: fill in kvm_run->debug.arch.dr[67] The corresponding code was added for VMX in commit 42dbaa5a057 ("KVM: x86: Virtualize debug registers, 2008-12-15) but never for AMD. Fix this. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2f379bacbb26..38f6aeefeb55 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1752,6 +1752,8 @@ static int db_interception(struct vcpu_svm *svm) if (svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; + kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6; + kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; kvm_run->debug.arch.exception = DB_VECTOR; From 9d82973e032e246ff5663c9805fbb5407ae932e3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 4 May 2020 09:16:37 -0700 Subject: [PATCH 166/300] gcc-10 warnings: fix low-hanging fruit Due to a bug-report that was compiler-dependent, I updated one of my machines to gcc-10. That shows a lot of new warnings. Happily they seem to be mostly the valid kind, but it's going to cause a round of churn for getting rid of them.. This is the really low-hanging fruit of removing a couple of zero-sized arrays in some core code. We have had a round of these patches before, and we'll have many more coming, and there is nothing special about these except that they were particularly trivial, and triggered more warnings than most. Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- include/linux/tty.h | 2 +- scripts/kallsyms.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..45cc10cdf6dd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -983,7 +983,7 @@ struct file_handle { __u32 handle_bytes; int handle_type; /* file identifier */ - unsigned char f_handle[0]; + unsigned char f_handle[]; }; static inline struct file *get_file(struct file *f) diff --git a/include/linux/tty.h b/include/linux/tty.h index bd5fe0e907e8..a99e9b8e4e31 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -66,7 +66,7 @@ struct tty_buffer { int read; int flags; /* Data points here */ - unsigned long data[0]; + unsigned long data[]; }; /* Values for .flags field of tty_buffer */ diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 3e8dea6e0a95..6dc3078649fa 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -34,7 +34,7 @@ struct sym_entry { unsigned int len; unsigned int start_pos; unsigned int percpu_absolute; - unsigned char sym[0]; + unsigned char sym[]; }; struct addr_range { From 637543a8d61c6afe4e9be64bfb43c78701a83375 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 7 Apr 2020 01:13:09 -0500 Subject: [PATCH 167/300] KVM: x86: Fixes posted interrupt check for IRQs delivery modes Current logic incorrectly uses the enum ioapic_irq_destination_types to check the posted interrupt destination types. However, the value was set using APIC_DM_XXX macros, which are left-shifted by 8 bits. Fixes by using the APIC_DM_FIXED and APIC_DM_LOWEST instead. Fixes: (fdcf75621375 'KVM: x86: Disable posted interrupts for non-standard IRQs delivery modes') Cc: Alexander Graf Signed-off-by: Suravee Suthikulpanit Message-Id: <1586239989-58305-1-git-send-email-suravee.suthikulpanit@amd.com> Reviewed-by: Maxim Levitsky Tested-by: Maxim Levitsky Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 42a2d0d3984a..0dea9f122bb9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1663,8 +1663,8 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) { /* We can only post Fixed and LowPrio IRQs */ - return (irq->delivery_mode == dest_Fixed || - irq->delivery_mode == dest_LowestPrio); + return (irq->delivery_mode == APIC_DM_FIXED || + irq->delivery_mode == APIC_DM_LOWEST); } static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) From 78d6de3cfbd342918d31cf68d0d2eda401338aef Mon Sep 17 00:00:00 2001 From: Matt Jolly Date: Sun, 3 May 2020 01:03:47 +1000 Subject: [PATCH 168/300] USB: serial: qcserial: Add DW5816e support Add support for Dell Wireless 5816e to drivers/usb/serial/qcserial.c Signed-off-by: Matt Jolly Cc: stable Signed-off-by: Johan Hovold --- drivers/usb/serial/qcserial.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c index 613f91add03d..ce0401d3137f 100644 --- a/drivers/usb/serial/qcserial.c +++ b/drivers/usb/serial/qcserial.c @@ -173,6 +173,7 @@ static const struct usb_device_id id_table[] = { {DEVICE_SWI(0x413c, 0x81b3)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */ {DEVICE_SWI(0x413c, 0x81b5)}, /* Dell Wireless 5811e QDL */ {DEVICE_SWI(0x413c, 0x81b6)}, /* Dell Wireless 5811e QDL */ + {DEVICE_SWI(0x413c, 0x81cc)}, /* Dell Wireless 5816e */ {DEVICE_SWI(0x413c, 0x81cf)}, /* Dell Wireless 5819 */ {DEVICE_SWI(0x413c, 0x81d0)}, /* Dell Wireless 5819 */ {DEVICE_SWI(0x413c, 0x81d1)}, /* Dell Wireless 5818 */ From 8be8f932e3db5fe4ed178b8892eeffeab530273a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 4 May 2020 12:19:45 -0400 Subject: [PATCH 169/300] kvm: ioapic: Restrict lazy EOI update to edge-triggered interrupts Commit f458d039db7e ("kvm: ioapic: Lazy update IOAPIC EOI") introduces the following infinite loop: BUG: stack guard page was hit at 000000008f595917 \ (stack is 00000000bdefe5a4..00000000ae2b06f5) kernel stack overflow (double-fault): 0000 [#1] SMP NOPTI RIP: 0010:kvm_set_irq+0x51/0x160 [kvm] Call Trace: irqfd_resampler_ack+0x32/0x90 [kvm] kvm_notify_acked_irq+0x62/0xd0 [kvm] kvm_ioapic_update_eoi_one.isra.0+0x30/0x120 [kvm] ioapic_set_irq+0x20e/0x240 [kvm] kvm_ioapic_set_irq+0x5c/0x80 [kvm] kvm_set_irq+0xbb/0x160 [kvm] ? kvm_hv_set_sint+0x20/0x20 [kvm] irqfd_resampler_ack+0x32/0x90 [kvm] kvm_notify_acked_irq+0x62/0xd0 [kvm] kvm_ioapic_update_eoi_one.isra.0+0x30/0x120 [kvm] ioapic_set_irq+0x20e/0x240 [kvm] kvm_ioapic_set_irq+0x5c/0x80 [kvm] kvm_set_irq+0xbb/0x160 [kvm] ? kvm_hv_set_sint+0x20/0x20 [kvm] .... The re-entrancy happens because the irq state is the OR of the interrupt state and the resamplefd state. That is, we don't want to show the state as 0 until we've had a chance to set the resamplefd. But if the interrupt has _not_ gone low then ioapic_set_irq is invoked again, causing an infinite loop. This can only happen for a level-triggered interrupt, otherwise irqfd_inject would immediately set the KVM_USERSPACE_IRQ_SOURCE_ID high and then low. Fortunately, in the case of level-triggered interrupts the VMEXIT already happens because TMR is set. Thus, fix the bug by restricting the lazy invocation of the ack notifier to edge-triggered interrupts, the only ones that need it. Tested-by: Suravee Suthikulpanit Reported-by: borisvk@bstnet.org Suggested-by: Paolo Bonzini Link: https://www.spinics.net/lists/kvm/msg213512.html Fixes: f458d039db7e ("kvm: ioapic: Lazy update IOAPIC EOI") Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=207489 Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 750ff0b29404..d057376bd3d3 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -225,12 +225,12 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, } /* - * AMD SVM AVIC accelerate EOI write and do not trap, - * in-kernel IOAPIC will not be able to receive the EOI. - * In this case, we do lazy update of the pending EOI when - * trying to set IOAPIC irq. + * AMD SVM AVIC accelerate EOI write iff the interrupt is edge + * triggered, in which case the in-kernel IOAPIC will not be able + * to receive the EOI. In this case, we do a lazy update of the + * pending EOI when trying to set IOAPIC irq. */ - if (kvm_apicv_activated(ioapic->kvm)) + if (edge && kvm_apicv_activated(ioapic->kvm)) ioapic_lazy_update_eoi(ioapic, irq); /* From 1e6e9d0f4859ec698d55381ea26f4136eff3afe1 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 24 Apr 2020 10:50:00 -0500 Subject: [PATCH 170/300] uapi: revert flexible-array conversions These structures can get embedded in other structures in user-space and cause all sorts of warnings and problems. So, we better don't take any chances and keep the zero-length arrays in place for now. Signed-off-by: Gustavo A. R. Silva --- include/uapi/linux/bpf.h | 2 +- include/uapi/linux/dlm_device.h | 4 ++-- include/uapi/linux/fiemap.h | 2 +- include/uapi/linux/if_arcnet.h | 6 +++--- include/uapi/linux/mmc/ioctl.h | 2 +- include/uapi/linux/net_dropmon.h | 4 ++-- include/uapi/linux/netfilter_bridge/ebt_among.h | 2 +- include/uapi/scsi/scsi_bsg_fc.h | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7bbf1b65be10..f9b7fdd951e4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -73,7 +73,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[]; /* Arbitrary size */ + __u8 data[0]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { diff --git a/include/uapi/linux/dlm_device.h b/include/uapi/linux/dlm_device.h index e83954c69fff..f880d2831160 100644 --- a/include/uapi/linux/dlm_device.h +++ b/include/uapi/linux/dlm_device.h @@ -45,13 +45,13 @@ struct dlm_lock_params { void __user *bastaddr; struct dlm_lksb __user *lksb; char lvb[DLM_USER_LVB_LEN]; - char name[]; + char name[0]; }; struct dlm_lspace_params { __u32 flags; __u32 minor; - char name[]; + char name[0]; }; struct dlm_purge_params { diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 7a900b2377b6..8c0bc24d5d95 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -34,7 +34,7 @@ struct fiemap { __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ __u32 fm_extent_count; /* size of fm_extents array (in) */ __u32 fm_reserved; - struct fiemap_extent fm_extents[]; /* array of mapped extents (out) */ + struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ }; #define FIEMAP_MAX_OFFSET (~0ULL) diff --git a/include/uapi/linux/if_arcnet.h b/include/uapi/linux/if_arcnet.h index b122cfac7128..683878036d76 100644 --- a/include/uapi/linux/if_arcnet.h +++ b/include/uapi/linux/if_arcnet.h @@ -60,7 +60,7 @@ struct arc_rfc1201 { __u8 proto; /* protocol ID field - varies */ __u8 split_flag; /* for use with split packets */ __be16 sequence; /* sequence number */ - __u8 payload[]; /* space remaining in packet (504 bytes)*/ + __u8 payload[0]; /* space remaining in packet (504 bytes)*/ }; #define RFC1201_HDR_SIZE 4 @@ -69,7 +69,7 @@ struct arc_rfc1201 { */ struct arc_rfc1051 { __u8 proto; /* ARC_P_RFC1051_ARP/RFC1051_IP */ - __u8 payload[]; /* 507 bytes */ + __u8 payload[0]; /* 507 bytes */ }; #define RFC1051_HDR_SIZE 1 @@ -80,7 +80,7 @@ struct arc_rfc1051 { struct arc_eth_encap { __u8 proto; /* Always ARC_P_ETHER */ struct ethhdr eth; /* standard ethernet header (yuck!) */ - __u8 payload[]; /* 493 bytes */ + __u8 payload[0]; /* 493 bytes */ }; #define ETH_ENCAP_HDR_SIZE 14 diff --git a/include/uapi/linux/mmc/ioctl.h b/include/uapi/linux/mmc/ioctl.h index 98e29e7f54ac..00c08120f3ba 100644 --- a/include/uapi/linux/mmc/ioctl.h +++ b/include/uapi/linux/mmc/ioctl.h @@ -57,7 +57,7 @@ struct mmc_ioc_cmd { */ struct mmc_ioc_multi_cmd { __u64 num_of_cmds; - struct mmc_ioc_cmd cmds[]; + struct mmc_ioc_cmd cmds[0]; }; #define MMC_IOC_CMD _IOWR(MMC_BLOCK_MAJOR, 0, struct mmc_ioc_cmd) diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 67e31f329190..66048cc5d7b3 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -29,12 +29,12 @@ struct net_dm_config_entry { struct net_dm_config_msg { __u32 entries; - struct net_dm_config_entry options[]; + struct net_dm_config_entry options[0]; }; struct net_dm_alert_msg { __u32 entries; - struct net_dm_drop_point points[]; + struct net_dm_drop_point points[0]; }; struct net_dm_user_msg { diff --git a/include/uapi/linux/netfilter_bridge/ebt_among.h b/include/uapi/linux/netfilter_bridge/ebt_among.h index 73b26a280c4f..9acf757bc1f7 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_among.h +++ b/include/uapi/linux/netfilter_bridge/ebt_among.h @@ -40,7 +40,7 @@ struct ebt_mac_wormhash_tuple { struct ebt_mac_wormhash { int table[257]; int poolsize; - struct ebt_mac_wormhash_tuple pool[]; + struct ebt_mac_wormhash_tuple pool[0]; }; #define ebt_mac_wormhash_size(x) ((x) ? sizeof(struct ebt_mac_wormhash) \ diff --git a/include/uapi/scsi/scsi_bsg_fc.h b/include/uapi/scsi/scsi_bsg_fc.h index 7f5930801f72..3ae65e93235c 100644 --- a/include/uapi/scsi/scsi_bsg_fc.h +++ b/include/uapi/scsi/scsi_bsg_fc.h @@ -209,7 +209,7 @@ struct fc_bsg_host_vendor { __u64 vendor_id; /* start of vendor command area */ - __u32 vendor_cmd[]; + __u32 vendor_cmd[0]; }; /* Response: From 2ae11c46d5fdc46cb396e35911c713d271056d35 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 4 May 2020 16:27:28 +0200 Subject: [PATCH 171/300] tty: xilinx_uartps: Fix missing id assignment to the console When serial console has been assigned to ttyPS1 (which is serial1 alias) console index is not updated property and pointing to index -1 (statically initialized) which ends up in situation where nothing has been printed on the port. The commit 18cc7ac8a28e ("Revert "serial: uartps: Register own uart console and driver structures"") didn't contain this line which was removed by accident. Fixes: 18cc7ac8a28e ("Revert "serial: uartps: Register own uart console and driver structures"") Signed-off-by: Shubhrajyoti Datta Cc: stable Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/ed3111533ef5bd342ee5ec504812240b870f0853.1588602446.git.michal.simek@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/xilinx_uartps.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c index ac137b6a1dc1..35e9e8faf8de 100644 --- a/drivers/tty/serial/xilinx_uartps.c +++ b/drivers/tty/serial/xilinx_uartps.c @@ -1459,6 +1459,7 @@ static int cdns_uart_probe(struct platform_device *pdev) cdns_uart_uart_driver.nr = CDNS_UART_NR_PORTS; #ifdef CONFIG_SERIAL_XILINX_PS_UART_CONSOLE cdns_uart_uart_driver.cons = &cdns_uart_console; + cdns_uart_console.index = id; #endif rc = uart_register_driver(&cdns_uart_uart_driver); From 0fa8263367db9287aa0632f96c1a5f93cc478150 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 28 Apr 2020 08:10:22 -0400 Subject: [PATCH 172/300] ceph: fix endianness bug when handling MDS session feature bits Eduard reported a problem mounting cephfs on s390 arch. The feature mask sent by the MDS is little-endian, so we need to convert it before storing and testing against it. Cc: stable@vger.kernel.org Reported-and-Tested-by: Eduard Shishkin Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 486f91f9685b..7c63abf5bea9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3251,8 +3251,7 @@ static void handle_session(struct ceph_mds_session *session, void *end = p + msg->front.iov_len; struct ceph_mds_session_head *h; u32 op; - u64 seq; - unsigned long features = 0; + u64 seq, features = 0; int wake = 0; bool blacklisted = false; @@ -3271,9 +3270,8 @@ static void handle_session(struct ceph_mds_session *session, goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_need(&p, end, len, bad); - memcpy(&features, p, min_t(size_t, len, sizeof(features))); - p += len; + ceph_decode_64_safe(&p, end, features, bad); + p += len - sizeof(features); } mutex_lock(&mdsc->mutex); From 7d8976afad18d4548ee472e526b126ab74012807 Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Wed, 29 Apr 2020 10:01:55 +0800 Subject: [PATCH 173/300] ceph: fix special error code in ceph_try_get_caps() There are 3 speical error codes: -EAGAIN/-EFBIG/-ESTALE. After calling try_get_cap_refs, ceph_try_get_caps test for the -EAGAIN twice. Ensure that it tests for -ESTALE instead. Signed-off-by: Wu Bo Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 185db76300b3..1a8e20ef35bf 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2749,7 +2749,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, ret = try_get_cap_refs(inode, need, want, 0, flags, got); /* three special error codes */ - if (ret == -EAGAIN || ret == -EFBIG || ret == -EAGAIN) + if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE) ret = 0; return ret; } From 4d8e28ff3106b093d98bfd2eceb9b430c70a8758 Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Thu, 30 Apr 2020 14:12:49 +0800 Subject: [PATCH 174/300] ceph: fix double unlock in handle_cap_export() If the ceph_mdsc_open_export_target_session() return fails, it will do a "goto retry", but the session mutex has already been unlocked. Re-lock the mutex in that case to ensure that we don't unlock it twice. Signed-off-by: Wu Bo Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1a8e20ef35bf..5f3aa4d607de 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3746,6 +3746,7 @@ retry: WARN_ON(1); tsession = NULL; target = -1; + mutex_lock(&session->s_mutex); } goto retry; From 86f8b1c01a0a537a73d2996615133be63cdf75db Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sun, 3 May 2020 20:50:57 -0700 Subject: [PATCH 175/300] net: dsa: Do not make user port errors fatal Prior to 1d27732f411d ("net: dsa: setup and teardown ports"), we would not treat failures to set-up an user port as fatal, but after this commit we would, which is a regression for some systems where interfaces may be declared in the Device Tree, but the underlying hardware may not be present (pluggable daughter cards for instance). Fixes: 1d27732f411d ("net: dsa: setup and teardown ports") Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 9a271a58a41d..d90665b465b8 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -459,7 +459,7 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) list_for_each_entry(dp, &dst->ports, list) { err = dsa_port_setup(dp); if (err) - goto teardown; + continue; } return 0; From 3a5ccecd9af71220e8b303f439ebccbc48385305 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 20 Apr 2020 13:44:25 +0000 Subject: [PATCH 176/300] MAINTAINERS: remove myself as ceph co-maintainer Jeff, Ilya, and Dongsheng are doing all of the Ceph maintainance these days. [ idryomov: Remove Sage's git tree too, it hasn't been pushed to in years. ] Signed-off-by: Sage Weil Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- MAINTAINERS | 6 ------ 1 file changed, 6 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2926327e4976..db4cc08583dd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3936,11 +3936,9 @@ F: arch/powerpc/platforms/cell/ CEPH COMMON CODE (LIBCEPH) M: Ilya Dryomov M: Jeff Layton -M: Sage Weil L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git T: git git://github.com/ceph/ceph-client.git F: include/linux/ceph/ F: include/linux/crush/ @@ -3948,12 +3946,10 @@ F: net/ceph/ CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH) M: Jeff Layton -M: Sage Weil M: Ilya Dryomov L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git T: git git://github.com/ceph/ceph-client.git F: Documentation/filesystems/ceph.rst F: fs/ceph/ @@ -14102,12 +14098,10 @@ F: drivers/media/radio/radio-tea5777.c RADOS BLOCK DEVICE (RBD) M: Ilya Dryomov -M: Sage Weil R: Dongsheng Yang L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git T: git git://github.com/ceph/ceph-client.git F: Documentation/ABI/testing/sysfs-bus-rbd F: drivers/block/rbd.c From 980d69276f3048af43a045be2925dacfb898a7be Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Mon, 4 May 2020 11:15:54 +0700 Subject: [PATCH 177/300] tipc: fix partial topology connection closure When an application connects to the TIPC topology server and subscribes to some services, a new connection is created along with some objects - 'tipc_subscription' to store related data correspondingly... However, there is one omission in the connection handling that when the connection or application is orderly shutdown (e.g. via SIGQUIT, etc.), the connection is not closed in kernel, the 'tipc_subscription' objects are not freed too. This results in: - The maximum number of subscriptions (65535) will be reached soon, new subscriptions will be rejected; - TIPC module cannot be removed (unless the objects are somehow forced to release first); The commit fixes the issue by closing the connection if the 'recvmsg()' returns '0' i.e. when the peer is shutdown gracefully. It also includes the other unexpected cases. Acked-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/topsrv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 3a12fc18239b..73dbed0c4b6b 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -402,10 +402,11 @@ static int tipc_conn_rcv_from_sock(struct tipc_conn *con) read_lock_bh(&sk->sk_callback_lock); ret = tipc_conn_rcv_sub(srv, con, &s); read_unlock_bh(&sk->sk_callback_lock); + if (!ret) + return 0; } - if (ret < 0) - tipc_conn_close(con); + tipc_conn_close(con); return ret; } From f42234ffd531ca6b13d9da02faa60b72eccf8334 Mon Sep 17 00:00:00 2001 From: Maxim Petrov Date: Mon, 4 May 2020 09:26:43 +0300 Subject: [PATCH 178/300] stmmac: fix pointer check after utilization in stmmac_interrupt The paranoidal pointer check in IRQ handler looks very strange - it really protects us only against bogus drivers which request IRQ line with null pointer dev_id. However, the code fragment is incorrect because the dev pointer is used before the actual check which leads to undefined behavior. Remove the check to avoid confusing people with incorrect code. Signed-off-by: Maxim Petrov Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index ff22f274aa43..a999d6b33a64 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4060,7 +4060,7 @@ static int stmmac_set_features(struct net_device *netdev, /** * stmmac_interrupt - main ISR * @irq: interrupt number. - * @dev_id: to pass the net device pointer. + * @dev_id: to pass the net device pointer (must be valid). * Description: this is the main driver interrupt service routine. * It can call: * o DMA service routine (to manage incoming frame reception and transmission @@ -4084,11 +4084,6 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id) if (priv->irq_wake) pm_wakeup_event(priv->device, 0); - if (unlikely(!dev)) { - netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); - return IRQ_NONE; - } - /* Check if adapter is up */ if (test_bit(STMMAC_DOWN, &priv->state)) return IRQ_HANDLED; From bea0c5c942d3b4e9fb6ed45f6a7de74c6b112437 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 4 May 2020 11:27:46 +0300 Subject: [PATCH 179/300] devlink: Fix reporter's recovery condition Devlink health core conditions the reporter's recovery with the expiration of the grace period. This is not relevant for the first recovery. Explicitly demand that the grace period will only apply to recoveries other than the first. Fixes: c8e1da0bf923 ("devlink: Add health report functionality") Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/core/devlink.c b/net/core/devlink.c index 1ec2e9fd8898..899edcee7dab 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -5368,6 +5368,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter, { enum devlink_health_reporter_state prev_health_state; struct devlink *devlink = reporter->devlink; + unsigned long recover_ts_threshold; /* write a log message of the current error */ WARN_ON(!msg); @@ -5378,10 +5379,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); /* abort if the previous error wasn't recovered */ + recover_ts_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->graceful_period); if (reporter->auto_recover && (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY || - jiffies - reporter->last_recovery_ts < - msecs_to_jiffies(reporter->graceful_period))) { + (reporter->last_recovery_ts && reporter->recovery_count && + time_is_after_jiffies(recover_ts_threshold)))) { trace_devlink_health_recover_aborted(devlink, reporter->ops->name, reporter->health_state, From 40e473071dbad04316ddc3613c3a3d1c75458299 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 4 May 2020 11:36:02 +0300 Subject: [PATCH 180/300] net/mlx4_core: Fix use of ENOSPC around mlx4_counter_alloc() When ENOSPC is set the idx is still valid and gets set to the global MLX4_SINK_COUNTER_INDEX. However gcc's static analysis cannot tell that ENOSPC is impossible from mlx4_cmd_imm() and gives this warning: drivers/net/ethernet/mellanox/mlx4/main.c:2552:28: warning: 'idx' may be used uninitialized in this function [-Wmaybe-uninitialized] 2552 | priv->def_counter[port] = idx; Also, when ENOSPC is returned mlx4_allocate_default_counters should not fail. Fixes: 6de5f7f6a1fa ("net/mlx4_core: Allocate default counter per port") Signed-off-by: Jason Gunthorpe Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 5716c3d2bb86..c72c4e1ea383 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2550,6 +2550,7 @@ static int mlx4_allocate_default_counters(struct mlx4_dev *dev) if (!err || err == -ENOSPC) { priv->def_counter[port] = idx; + err = 0; } else if (err == -ENOENT) { err = 0; continue; @@ -2600,7 +2601,8 @@ int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx, u8 usage) MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (!err) *idx = get_param_l(&out_param); - + if (WARN_ON(err == -ENOSPC)) + err = -EINVAL; return err; } return __mlx4_counter_alloc(dev, idx); From d975cb7ea915e64a3ebcfef8a33051f3e6bf22a8 Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Mon, 4 May 2020 20:01:27 +0800 Subject: [PATCH 181/300] net: enetc: fix an issue about leak system resources the related system resources were not released when enetc_hw_alloc() return error in the enetc_pci_mdio_probe(), add iounmap() for error handling label "err_hw_alloc" to fix it. Fixes: 6517798dd3432a ("enetc: Make MDIO accessors more generic and export to include/linux/fsl") Cc: Andy Shevchenko Signed-off-by: Dejin Zheng Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc_pci_mdio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pci_mdio.c b/drivers/net/ethernet/freescale/enetc/enetc_pci_mdio.c index ebc635f8a4cc..15f37c5b8dc1 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pci_mdio.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pci_mdio.c @@ -74,8 +74,8 @@ err_pci_mem_reg: pci_disable_device(pdev); err_pci_enable: err_mdiobus_alloc: - iounmap(port_regs); err_hw_alloc: + iounmap(port_regs); err_ioremap: return err; } From c649c41d5d0f7d153bd5d5d9c093fec16ee03e00 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 4 May 2020 19:39:42 +0200 Subject: [PATCH 182/300] s390/qeth: fix cancelling of TX timer on dev_close() With the introduction of TX coalescing, .ndo_start_xmit now potentially starts the TX completion timer. So only kill the timer _after_ TX has been disabled. Fixes: ee1e52d1e4bb ("s390/qeth: add TX IRQ coalescing support for IQD devices") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index f7689461c242..569966bdc513 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -6717,17 +6717,17 @@ int qeth_stop(struct net_device *dev) unsigned int i; /* Quiesce the NAPI instances: */ - qeth_for_each_output_queue(card, queue, i) { + qeth_for_each_output_queue(card, queue, i) napi_disable(&queue->napi); - del_timer_sync(&queue->timer); - } /* Stop .ndo_start_xmit, might still access queue->napi. */ netif_tx_disable(dev); - /* Queues may get re-allocated, so remove the NAPIs here. */ - qeth_for_each_output_queue(card, queue, i) + qeth_for_each_output_queue(card, queue, i) { + del_timer_sync(&queue->timer); + /* Queues may get re-allocated, so remove the NAPIs. */ netif_napi_del(&queue->napi); + } } else { netif_tx_disable(dev); } From 071a43e660996d1517d02ea7724491cc5b93bb03 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 30 Apr 2020 12:39:02 +0200 Subject: [PATCH 183/300] cxgb4/chcr: avoid -Wreturn-local-addr warning gcc-10 warns about functions that return a pointer to a stack variable. In chcr_write_cpl_set_tcb_ulp(), this does not actually happen, but it's too hard to see for the compiler: drivers/crypto/chelsio/chcr_ktls.c: In function 'chcr_write_cpl_set_tcb_ulp.constprop': drivers/crypto/chelsio/chcr_ktls.c:760:9: error: function may return address of local variable [-Werror=return-local-addr] 760 | return pos; | ^~~ drivers/crypto/chelsio/chcr_ktls.c:712:5: note: declared here 712 | u8 buf[48] = {0}; | ^~~ Split the middle part of the function out into a helper to make it easier to understand by both humans and compilers, which avoids the warning. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chcr_ktls.c | 83 +++++++++++++++++------------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_ktls.c b/drivers/crypto/chelsio/chcr_ktls.c index e92b352fb0ad..43d9e2420110 100644 --- a/drivers/crypto/chelsio/chcr_ktls.c +++ b/drivers/crypto/chelsio/chcr_ktls.c @@ -673,41 +673,14 @@ int chcr_ktls_cpl_set_tcb_rpl(struct adapter *adap, unsigned char *input) return 0; } -/* - * chcr_write_cpl_set_tcb_ulp: update tcb values. - * TCB is responsible to create tcp headers, so all the related values - * should be correctly updated. - * @tx_info - driver specific tls info. - * @q - tx queue on which packet is going out. - * @tid - TCB identifier. - * @pos - current index where should we start writing. - * @word - TCB word. - * @mask - TCB word related mask. - * @val - TCB word related value. - * @reply - set 1 if looking for TP response. - * return - next position to write. - */ -static void *chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, - struct sge_eth_txq *q, u32 tid, - void *pos, u16 word, u64 mask, +static void *__chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, + u32 tid, void *pos, u16 word, u64 mask, u64 val, u32 reply) { struct cpl_set_tcb_field_core *cpl; struct ulptx_idata *idata; struct ulp_txpkt *txpkt; - void *save_pos = NULL; - u8 buf[48] = {0}; - int left; - left = (void *)q->q.stat - pos; - if (unlikely(left < CHCR_SET_TCB_FIELD_LEN)) { - if (!left) { - pos = q->q.desc; - } else { - save_pos = pos; - pos = buf; - } - } /* ULP_TXPKT */ txpkt = pos; txpkt->cmd_dest = htonl(ULPTX_CMD_V(ULP_TX_PKT) | ULP_TXPKT_DEST_V(0)); @@ -732,18 +705,54 @@ static void *chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, idata = (struct ulptx_idata *)(cpl + 1); idata->cmd_more = htonl(ULPTX_CMD_V(ULP_TX_SC_NOOP)); idata->len = htonl(0); + pos = idata + 1; - if (save_pos) { - pos = chcr_copy_to_txd(buf, &q->q, save_pos, - CHCR_SET_TCB_FIELD_LEN); - } else { - /* check again if we are at the end of the queue */ - if (left == CHCR_SET_TCB_FIELD_LEN) + return pos; +} + + +/* + * chcr_write_cpl_set_tcb_ulp: update tcb values. + * TCB is responsible to create tcp headers, so all the related values + * should be correctly updated. + * @tx_info - driver specific tls info. + * @q - tx queue on which packet is going out. + * @tid - TCB identifier. + * @pos - current index where should we start writing. + * @word - TCB word. + * @mask - TCB word related mask. + * @val - TCB word related value. + * @reply - set 1 if looking for TP response. + * return - next position to write. + */ +static void *chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, + struct sge_eth_txq *q, u32 tid, + void *pos, u16 word, u64 mask, + u64 val, u32 reply) +{ + int left = (void *)q->q.stat - pos; + + if (unlikely(left < CHCR_SET_TCB_FIELD_LEN)) { + if (!left) { pos = q->q.desc; - else - pos = idata + 1; + } else { + u8 buf[48] = {0}; + + __chcr_write_cpl_set_tcb_ulp(tx_info, tid, buf, word, + mask, val, reply); + + return chcr_copy_to_txd(buf, &q->q, pos, + CHCR_SET_TCB_FIELD_LEN); + } } + pos = __chcr_write_cpl_set_tcb_ulp(tx_info, tid, pos, word, + mask, val, reply); + + /* check again if we are at the end of the queue */ + if (left == CHCR_SET_TCB_FIELD_LEN) + pos = q->q.desc; + return pos; } From a7df4870d79b00742da6cc93ca2f336a71db77f7 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 30 Apr 2020 20:53:49 -0700 Subject: [PATCH 184/300] net_sched: fix tcm_parent in tc filter dump When we tell kernel to dump filters from root (ffff:ffff), those filters on ingress (ffff:0000) are matched, but their true parents must be dumped as they are. However, kernel dumps just whatever we tell it, that is either ffff:ffff or ffff:0000: $ nl-cls-list --dev=dummy0 --parent=root cls basic dev dummy0 id none parent root prio 49152 protocol ip match-all cls basic dev dummy0 id :1 parent root prio 49152 protocol ip match-all $ nl-cls-list --dev=dummy0 --parent=ffff: cls basic dev dummy0 id none parent ffff: prio 49152 protocol ip match-all cls basic dev dummy0 id :1 parent ffff: prio 49152 protocol ip match-all This is confusing and misleading, more importantly this is a regression since 4.15, so the old behavior must be restored. And, when tc filters are installed on a tc class, the parent should be the classid, rather than the qdisc handle. Commit edf6711c9840 ("net: sched: remove classid and q fields from tcf_proto") removed the classid we save for filters, we can just restore this classid in tcf_block. Steps to reproduce this: ip li set dev dummy0 up tc qd add dev dummy0 ingress tc filter add dev dummy0 parent ffff: protocol arp basic action pass tc filter show dev dummy0 root Before this patch: filter protocol arp pref 49152 basic filter protocol arp pref 49152 basic handle 0x1 action order 1: gact action pass random type none pass val 0 index 1 ref 1 bind 1 After this patch: filter parent ffff: protocol arp pref 49152 basic filter parent ffff: protocol arp pref 49152 basic handle 0x1 action order 1: gact action pass random type none pass val 0 index 1 ref 1 bind 1 Fixes: a10fa20101ae ("net: sched: propagate q and parent from caller down to tcf_fill_node") Fixes: edf6711c9840 ("net: sched: remove classid and q fields from tcf_proto") Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + net/sched/cls_api.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 25d2ec4c8f00..8428aa614265 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -407,6 +407,7 @@ struct tcf_block { struct mutex lock; struct list_head chain_list; u32 index; /* block index for shared blocks */ + u32 classid; /* which class this block belongs to */ refcount_t refcnt; struct net *net; struct Qdisc *q; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 55bd1429678f..c0e5b64b3caf 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2070,6 +2070,7 @@ replay: err = PTR_ERR(block); goto errout; } + block->classid = parent; chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; if (chain_index > TC_ACT_EXT_VAL_MASK) { @@ -2612,12 +2613,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; parent = tcm->tcm_parent; - if (!parent) { + if (!parent) q = dev->qdisc; - parent = q->handle; - } else { + else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); - } if (!q) goto out; cops = q->ops->cl_ops; @@ -2633,6 +2632,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) block = cops->tcf_block(q, cl, NULL); if (!block) goto out; + parent = block->classid; if (tcf_block_shared(block)) q = NULL; } From 44d95cc6b10ff7439d45839c96c581cb4368c088 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 1 May 2020 15:10:16 +0100 Subject: [PATCH 185/300] net: stmmac: gmac5+: fix potential integer overflow on 32 bit multiply The multiplication of cfg->ctr[1] by 1000000000 is performed using a 32 bit multiplication (since cfg->ctr[1] is a u32) and this can lead to a potential overflow. Fix this by making the constant a ULL to ensure a 64 bit multiply occurs. Fixes: 504723af0d85 ("net: stmmac: Add basic EST support for GMAC5+") Addresses-Coverity: ("Unintentional integer overflow") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c index 494c859b4ade..67ba67ed0cb9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c @@ -624,7 +624,7 @@ int dwmac5_est_configure(void __iomem *ioaddr, struct stmmac_est *cfg, total_offset += offset; } - total_ctr = cfg->ctr[0] + cfg->ctr[1] * 1000000000; + total_ctr = cfg->ctr[0] + cfg->ctr[1] * 1000000000ULL; total_ctr += total_offset; ctr_low = do_div(total_ctr, 1000000000); From 93a2014afbace907178afc3c9c1e62c9a338595a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 1 May 2020 11:11:08 -0700 Subject: [PATCH 186/300] atm: fix a UAF in lec_arp_clear_vccs() Gengming reported a UAF in lec_arp_clear_vccs(), where we add a vcc socket to an entry in a per-device list but free the socket without removing it from the list when vcc->dev is NULL. We need to call lec_vcc_close() to search and remove those entries contain the vcc being destroyed. This can be done by calling vcc->push(vcc, NULL) unconditionally in vcc_destroy_socket(). Another issue discovered by Gengming's reproducer is the vcc->dev may point to the static device lecatm_dev, for which we don't need to register/unregister device, so we can just check for vcc->dev->ops->owner. Reported-by: Gengming Liu Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/atm/common.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/net/atm/common.c b/net/atm/common.c index 0ce530af534d..8575f5d52087 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -177,18 +177,18 @@ static void vcc_destroy_socket(struct sock *sk) set_bit(ATM_VF_CLOSE, &vcc->flags); clear_bit(ATM_VF_READY, &vcc->flags); - if (vcc->dev) { - if (vcc->dev->ops->close) - vcc->dev->ops->close(vcc); - if (vcc->push) - vcc->push(vcc, NULL); /* atmarpd has no push */ - module_put(vcc->owner); + if (vcc->dev && vcc->dev->ops->close) + vcc->dev->ops->close(vcc); + if (vcc->push) + vcc->push(vcc, NULL); /* atmarpd has no push */ + module_put(vcc->owner); - while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { - atm_return(vcc, skb->truesize); - kfree_skb(skb); - } + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + atm_return(vcc, skb->truesize); + kfree_skb(skb); + } + if (vcc->dev && vcc->dev->ops->owner) { module_put(vcc->dev->ops->owner); atm_dev_put(vcc->dev); } From 8d9f73c0ad2f20e9fed5380de0a3097825859d03 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 1 May 2020 11:11:09 -0700 Subject: [PATCH 187/300] atm: fix a memory leak of vcc->user_back In lec_arp_clear_vccs() only entry->vcc is freed, but vcc could be installed on entry->recv_vcc too in lec_vcc_added(). This fixes the following memory leak: unreferenced object 0xffff8880d9266b90 (size 16): comm "atm2", pid 425, jiffies 4294907980 (age 23.488s) hex dump (first 16 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 6b 6b 6b a5 ............kkk. backtrace: [<(____ptrval____)>] kmem_cache_alloc_trace+0x10e/0x151 [<(____ptrval____)>] lane_ioctl+0x4b3/0x569 [<(____ptrval____)>] do_vcc_ioctl+0x1ea/0x236 [<(____ptrval____)>] svc_ioctl+0x17d/0x198 [<(____ptrval____)>] sock_do_ioctl+0x47/0x12f [<(____ptrval____)>] sock_ioctl+0x2f9/0x322 [<(____ptrval____)>] vfs_ioctl+0x1e/0x2b [<(____ptrval____)>] ksys_ioctl+0x61/0x80 [<(____ptrval____)>] __x64_sys_ioctl+0x16/0x19 [<(____ptrval____)>] do_syscall_64+0x57/0x65 [<(____ptrval____)>] entry_SYSCALL_64_after_hwframe+0x49/0xb3 Cc: Gengming Liu Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/atm/lec.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/atm/lec.c b/net/atm/lec.c index 25fa3a7b72bd..ca37f5a71f5e 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1264,6 +1264,12 @@ static void lec_arp_clear_vccs(struct lec_arp_table *entry) entry->vcc = NULL; } if (entry->recv_vcc) { + struct atm_vcc *vcc = entry->recv_vcc; + struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); + + kfree(vpriv); + vcc->user_back = NULL; + entry->recv_vcc->push = entry->old_recv_push; vcc_release_async(entry->recv_vcc, -EPIPE); entry->recv_vcc = NULL; From bd4af432cc71b5fbfe4833510359a6ad3ada250d Mon Sep 17 00:00:00 2001 From: Qiushi Wu Date: Sat, 2 May 2020 17:42:59 -0500 Subject: [PATCH 188/300] nfp: abm: fix a memory leak bug In function nfp_abm_vnic_set_mac, pointer nsp is allocated by nfp_nsp_open. But when nfp_nsp_has_hwinfo_lookup fail, the pointer is not released, which can lead to a memory leak bug. Fix this issue by adding nfp_nsp_close(nsp) in the error path. Fixes: f6e71efdf9fb1 ("nfp: abm: look up MAC addresses via management FW") Signed-off-by: Qiushi Wu Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/abm/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c index 9183b3e85d21..354efffac0f9 100644 --- a/drivers/net/ethernet/netronome/nfp/abm/main.c +++ b/drivers/net/ethernet/netronome/nfp/abm/main.c @@ -283,6 +283,7 @@ nfp_abm_vnic_set_mac(struct nfp_pf *pf, struct nfp_abm *abm, struct nfp_net *nn, if (!nfp_nsp_has_hwinfo_lookup(nsp)) { nfp_warn(pf->cpp, "NSP doesn't support PF MAC generation\n"); eth_hw_addr_random(nn->dp.netdev); + nfp_nsp_close(nsp); return; } From 7391efa48d88c8555a802bac562d02a38567127c Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Fri, 24 Apr 2020 10:29:26 +0530 Subject: [PATCH 189/300] RISC-V: Export riscv_cpuid_to_hartid_mask() API The riscv_cpuid_to_hartid_mask() API should be exported to allow building KVM RISC-V as loadable module. Signed-off-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/smp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index e0a6293093f1..a65a8fa0c22d 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -63,6 +64,7 @@ void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out) for_each_cpu(cpu, in) cpumask_set_cpu(cpuid_to_hartid_map(cpu), out); } +EXPORT_SYMBOL_GPL(riscv_cpuid_to_hartid_mask); bool arch_match_cpu_phys_id(int cpu, u64 phys_id) { From 6bcff51539ccae5431a01f60293419dbae21100f Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Fri, 24 Apr 2020 10:29:27 +0530 Subject: [PATCH 190/300] RISC-V: Add bitmap reprensenting ISA features common across CPUs This patch adds riscv_isa bitmap which represents Host ISA features common across all Host CPUs. The riscv_isa is not same as elf_hwcap because elf_hwcap will only have ISA features relevant for user-space apps whereas riscv_isa will have ISA features relevant to both kernel and user-space apps. One of the use-case for riscv_isa bitmap is in KVM hypervisor where we will use it to do following operations: 1. Check whether hypervisor extension is available 2. Find ISA features that need to be virtualized (e.g. floating point support, vector extension, etc.) Signed-off-by: Anup Patel Signed-off-by: Atish Patra Reviewed-by: Alexander Graf Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/hwcap.h | 22 +++++++++ arch/riscv/kernel/cpufeature.c | 83 ++++++++++++++++++++++++++++++++-- 2 files changed, 102 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 1bb0cd04aec3..5ce50468aff1 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -8,6 +8,7 @@ #ifndef _ASM_RISCV_HWCAP_H #define _ASM_RISCV_HWCAP_H +#include #include #ifndef __ASSEMBLY__ @@ -22,6 +23,27 @@ enum { }; extern unsigned long elf_hwcap; + +#define RISCV_ISA_EXT_a ('a' - 'a') +#define RISCV_ISA_EXT_c ('c' - 'a') +#define RISCV_ISA_EXT_d ('d' - 'a') +#define RISCV_ISA_EXT_f ('f' - 'a') +#define RISCV_ISA_EXT_h ('h' - 'a') +#define RISCV_ISA_EXT_i ('i' - 'a') +#define RISCV_ISA_EXT_m ('m' - 'a') +#define RISCV_ISA_EXT_s ('s' - 'a') +#define RISCV_ISA_EXT_u ('u' - 'a') + +#define RISCV_ISA_EXT_MAX 64 + +unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap); + +#define riscv_isa_extension_mask(ext) BIT_MASK(RISCV_ISA_EXT_##ext) + +bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, int bit); +#define riscv_isa_extension_available(isa_bitmap, ext) \ + __riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_##ext) + #endif #endif /* _ASM_RISCV_HWCAP_H */ diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index a5ad00043104..ac202f44a670 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -6,6 +6,7 @@ * Copyright (C) 2017 SiFive */ +#include #include #include #include @@ -13,15 +14,57 @@ #include unsigned long elf_hwcap __read_mostly; + +/* Host ISA bitmap */ +static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) __read_mostly; + #ifdef CONFIG_FPU bool has_fpu __read_mostly; #endif +/** + * riscv_isa_extension_base() - Get base extension word + * + * @isa_bitmap: ISA bitmap to use + * Return: base extension word as unsigned long value + * + * NOTE: If isa_bitmap is NULL then Host ISA bitmap will be used. + */ +unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap) +{ + if (!isa_bitmap) + return riscv_isa[0]; + return isa_bitmap[0]; +} +EXPORT_SYMBOL_GPL(riscv_isa_extension_base); + +/** + * __riscv_isa_extension_available() - Check whether given extension + * is available or not + * + * @isa_bitmap: ISA bitmap to use + * @bit: bit position of the desired extension + * Return: true or false + * + * NOTE: If isa_bitmap is NULL then Host ISA bitmap will be used. + */ +bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, int bit) +{ + const unsigned long *bmap = (isa_bitmap) ? isa_bitmap : riscv_isa; + + if (bit >= RISCV_ISA_EXT_MAX) + return false; + + return test_bit(bit, bmap) ? true : false; +} +EXPORT_SYMBOL_GPL(__riscv_isa_extension_available); + void riscv_fill_hwcap(void) { struct device_node *node; const char *isa; - size_t i; + char print_str[BITS_PER_LONG + 1]; + size_t i, j, isa_len; static unsigned long isa2hwcap[256] = {0}; isa2hwcap['i'] = isa2hwcap['I'] = COMPAT_HWCAP_ISA_I; @@ -33,8 +76,11 @@ void riscv_fill_hwcap(void) elf_hwcap = 0; + bitmap_zero(riscv_isa, RISCV_ISA_EXT_MAX); + for_each_of_cpu_node(node) { unsigned long this_hwcap = 0; + unsigned long this_isa = 0; if (riscv_of_processor_hartid(node) < 0) continue; @@ -44,8 +90,24 @@ void riscv_fill_hwcap(void) continue; } - for (i = 0; i < strlen(isa); ++i) + i = 0; + isa_len = strlen(isa); +#if IS_ENABLED(CONFIG_32BIT) + if (!strncmp(isa, "rv32", 4)) + i += 4; +#elif IS_ENABLED(CONFIG_64BIT) + if (!strncmp(isa, "rv64", 4)) + i += 4; +#endif + for (; i < isa_len; ++i) { this_hwcap |= isa2hwcap[(unsigned char)(isa[i])]; + /* + * TODO: X, Y and Z extension parsing for Host ISA + * bitmap will be added in-future. + */ + if ('a' <= isa[i] && isa[i] < 'x') + this_isa |= (1UL << (isa[i] - 'a')); + } /* * All "okay" hart should have same isa. Set HWCAP based on @@ -56,6 +118,11 @@ void riscv_fill_hwcap(void) elf_hwcap &= this_hwcap; else elf_hwcap = this_hwcap; + + if (riscv_isa[0]) + riscv_isa[0] &= this_isa; + else + riscv_isa[0] = this_isa; } /* We don't support systems with F but without D, so mask those out @@ -65,7 +132,17 @@ void riscv_fill_hwcap(void) elf_hwcap &= ~COMPAT_HWCAP_ISA_F; } - pr_info("elf_hwcap is 0x%lx\n", elf_hwcap); + memset(print_str, 0, sizeof(print_str)); + for (i = 0, j = 0; i < BITS_PER_LONG; i++) + if (riscv_isa[0] & BIT_MASK(i)) + print_str[j++] = (char)('a' + i); + pr_info("riscv: ISA extensions %s\n", print_str); + + memset(print_str, 0, sizeof(print_str)); + for (i = 0, j = 0; i < BITS_PER_LONG; i++) + if (elf_hwcap & BIT_MASK(i)) + print_str[j++] = (char)('a' + i); + pr_info("riscv: ELF capabilities %s\n", print_str); #ifdef CONFIG_FPU if (elf_hwcap & (COMPAT_HWCAP_ISA_F | COMPAT_HWCAP_ISA_D)) From a2da5b181f888b3cdb4727b6c60a8755cedce272 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Fri, 24 Apr 2020 10:29:28 +0530 Subject: [PATCH 191/300] RISC-V: Remove N-extension related defines The RISC-V N-extension is still in draft state hence remove N-extension related defines from asm/csr.h. Signed-off-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/csr.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 8e18d2c64399..cec462e198ce 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -51,13 +51,10 @@ #define CAUSE_IRQ_FLAG (_AC(1, UL) << (__riscv_xlen - 1)) /* Interrupt causes (minus the high bit) */ -#define IRQ_U_SOFT 0 #define IRQ_S_SOFT 1 #define IRQ_M_SOFT 3 -#define IRQ_U_TIMER 4 #define IRQ_S_TIMER 5 #define IRQ_M_TIMER 7 -#define IRQ_U_EXT 8 #define IRQ_S_EXT 9 #define IRQ_M_EXT 11 From c749bb2d554825e007cbc43b791f54e124dadfce Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 27 Apr 2020 14:59:24 +0800 Subject: [PATCH 192/300] riscv: set max_pfn to the PFN of the last page The current max_pfn equals to zero. In this case, I found it caused users cannot get some page information through /proc such as kpagecount in v5.6 kernel because of new sanity checks. The following message is displayed by stress-ng test suite with the command "stress-ng --verbose --physpage 1 -t 1" on HiFive unleashed board. # stress-ng --verbose --physpage 1 -t 1 stress-ng: debug: [109] 4 processors online, 4 processors configured stress-ng: info: [109] dispatching hogs: 1 physpage stress-ng: debug: [109] cache allocate: reducing cache level from L3 (too high) to L0 stress-ng: debug: [109] get_cpu_cache: invalid cache_level: 0 stress-ng: info: [109] cache allocate: using built-in defaults as no suitable cache found stress-ng: debug: [109] cache allocate: default cache size: 2048K stress-ng: debug: [109] starting stressors stress-ng: debug: [109] 1 stressor spawned stress-ng: debug: [110] stress-ng-physpage: started [110] (instance 0) stress-ng: error: [110] stress-ng-physpage: cannot read page count for address 0x3fd34de000 in /proc/kpagecount, errno=0 (Success) stress-ng: error: [110] stress-ng-physpage: cannot read page count for address 0x3fd32db078 in /proc/kpagecount, errno=0 (Success) ... stress-ng: error: [110] stress-ng-physpage: cannot read page count for address 0x3fd32db078 in /proc/kpagecount, errno=0 (Success) stress-ng: debug: [110] stress-ng-physpage: exited [110] (instance 0) stress-ng: debug: [109] process [110] terminated stress-ng: info: [109] successful run completed in 1.00s # After applying this patch, the kernel can pass the test. # stress-ng --verbose --physpage 1 -t 1 stress-ng: debug: [104] 4 processors online, 4 processors configured stress-ng: info: [104] dispatching hogs: 1 physpage stress-ng: info: [104] cache allocate: using defaults, can't determine cache details from sysfs stress-ng: debug: [104] cache allocate: default cache size: 2048K stress-ng: debug: [104] starting stressors stress-ng: debug: [104] 1 stressor spawned stress-ng: debug: [105] stress-ng-physpage: started [105] (instance 0) stress-ng: debug: [105] stress-ng-physpage: exited [105] (instance 0) stress-ng: debug: [104] process [105] terminated stress-ng: info: [104] successful run completed in 1.01s # Cc: stable@vger.kernel.org Signed-off-by: Vincent Chen Reviewed-by: Anup Patel Reviewed-by: Yash Shah Tested-by: Yash Shah Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index b55be44ff9bd..5b813532db59 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -150,7 +150,8 @@ void __init setup_bootmem(void) memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); set_max_mapnr(PFN_DOWN(mem_size)); - max_low_pfn = PFN_DOWN(memblock_end_of_DRAM()); + max_pfn = PFN_DOWN(memblock_end_of_DRAM()); + max_low_pfn = max_pfn; #ifdef CONFIG_BLK_DEV_INITRD setup_initrd(); From 0a9f2a6161dcd6be057f6c501453d28b3c4a3b0c Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Mon, 27 Apr 2020 17:13:34 +0200 Subject: [PATCH 193/300] riscv: add Linux note to vdso The Linux note in the vdso allows glibc to check the running kernel version without having to issue the uname syscall. Signed-off-by: Andreas Schwab Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vdso/Makefile | 2 +- arch/riscv/kernel/vdso/note.S | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/kernel/vdso/note.S diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index a4ee3a0e7d20..4c8b2a4a6a70 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -12,7 +12,7 @@ vdso-syms += getcpu vdso-syms += flush_icache # Files to link into the vdso -obj-vdso = $(patsubst %, %.o, $(vdso-syms)) +obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o # Build rules targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds vdso-dummy.o diff --git a/arch/riscv/kernel/vdso/note.S b/arch/riscv/kernel/vdso/note.S new file mode 100644 index 000000000000..2a956c942211 --- /dev/null +++ b/arch/riscv/kernel/vdso/note.S @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include +#include + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END From d6d5161280b3d57163f5310f0a0007cdeb729984 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Mon, 4 May 2020 11:54:48 +0800 Subject: [PATCH 194/300] riscv: force __cpu_up_ variables to put in data section Put __cpu_up_stack_pointer and __cpu_up_task_pointer in data section. Currently, these two variables are put in bss section, there is a potential risk that secondary harts get the uninitialized value before main hart finishing the bss clearing. In this case, all secondary harts would pass the waiting loop and enable the MMU before main hart set up the page table. This issue happens on random booting of multiple harts, which means it will manifest for BBL and OpenSBI v0.6 (or older version). In OpenSBI v0.7 (or higher version), we have HSM extension so all the secondary harts are brought-up by Linux kernel in an orderly fashion. This means we don't need this change for OpenSBI v0.7 (or higher version). Signed-off-by: Zong Li Reviewed-by: Greentime Hu Reviewed-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu_ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c index c4c33bf02369..0ec22354018c 100644 --- a/arch/riscv/kernel/cpu_ops.c +++ b/arch/riscv/kernel/cpu_ops.c @@ -15,8 +15,8 @@ const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; -void *__cpu_up_stack_pointer[NR_CPUS]; -void *__cpu_up_task_pointer[NR_CPUS]; +void *__cpu_up_stack_pointer[NR_CPUS] __section(.data); +void *__cpu_up_task_pointer[NR_CPUS] __section(.data); extern const struct cpu_operations cpu_ops_sbi; extern const struct cpu_operations cpu_ops_spinwait; From 5615e74f48dcc982655543e979b6c3f3f877e6f6 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 5 May 2020 09:27:15 +0200 Subject: [PATCH 195/300] KVM: s390: Remove false WARN_ON_ONCE for the PQAP instruction In LPAR we will only get an intercept for FC==3 for the PQAP instruction. Running nested under z/VM can result in other intercepts as well as ECA_APIE is an effective bit: If one hypervisor layer has turned this bit off, the end result will be that we will get intercepts for all function codes. Usually the first one will be a query like PQAP(QCI). So the WARN_ON_ONCE is not right. Let us simply remove it. Cc: Pierre Morel Cc: Tony Krowiak Cc: stable@vger.kernel.org # v5.3+ Fixes: e5282de93105 ("s390: ap: kvm: add PQAP interception for AQIC") Link: https://lore.kernel.org/kvm/20200505083515.2720-1-borntraeger@de.ibm.com Reported-by: Qian Cai Signed-off-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/priv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 69a824f9ef0b..893893642415 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -626,10 +626,12 @@ static int handle_pqap(struct kvm_vcpu *vcpu) * available for the guest are AQIC and TAPQ with the t bit set * since we do not set IC.3 (FIII) we currently will only intercept * the AQIC function code. + * Note: running nested under z/VM can result in intercepts for other + * function codes, e.g. PQAP(QCI). We do not support this and bail out. */ reg0 = vcpu->run->s.regs.gprs[0]; fc = (reg0 >> 24) & 0xff; - if (WARN_ON_ONCE(fc != 0x03)) + if (fc != 0x03) return -EOPNOTSUPP; /* PQAP instruction is allowed for guest kernel only */ From 3c5c0805ac925f2f8c68eeb1ba0f8a796a7b4525 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 2 May 2020 13:26:44 +0200 Subject: [PATCH 196/300] staging: ks7010: remove me from CC list I lost interest in this driver years ago because I could't keep up with testing the incoming janitorial patches. So, drop me from CC. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20200502112648.6942-1-wsa@the-dreams.de Signed-off-by: Greg Kroah-Hartman --- drivers/staging/ks7010/TODO | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/staging/ks7010/TODO b/drivers/staging/ks7010/TODO index 87a6dac4890d..ab6f39175d99 100644 --- a/drivers/staging/ks7010/TODO +++ b/drivers/staging/ks7010/TODO @@ -30,5 +30,4 @@ Now the TODOs: Please send any patches to: Greg Kroah-Hartman -Wolfram Sang Linux Driver Project Developer List From 769acc3656d93aaacada814939743361d284fd87 Mon Sep 17 00:00:00 2001 From: Oscar Carter Date: Fri, 1 May 2020 17:51:18 +0200 Subject: [PATCH 197/300] staging: gasket: Check the return value of gasket_get_bar_index() Check the return value of gasket_get_bar_index function as it can return a negative one (-EINVAL). If this happens, a negative index is used in the "gasket_dev->bar_data" array. Addresses-Coverity-ID: 1438542 ("Negative array index read") Fixes: 9a69f5087ccc2 ("drivers/staging: Gasket driver framework + Apex driver") Signed-off-by: Oscar Carter Cc: stable Reviewed-by: Richard Yeh Link: https://lore.kernel.org/r/20200501155118.13380-1-oscar.carter@gmx.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/gasket/gasket_core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/staging/gasket/gasket_core.c b/drivers/staging/gasket/gasket_core.c index 8e0575fcb4c8..67325fbaf760 100644 --- a/drivers/staging/gasket/gasket_core.c +++ b/drivers/staging/gasket/gasket_core.c @@ -925,6 +925,10 @@ do_map_region(const struct gasket_dev *gasket_dev, struct vm_area_struct *vma, gasket_get_bar_index(gasket_dev, (vma->vm_pgoff << PAGE_SHIFT) + driver_desc->legacy_mmap_address_offset); + + if (bar_index < 0) + return DO_MAP_REGION_INVALID; + phys_base = gasket_dev->bar_data[bar_index].phys_base + phys_offset; while (mapped_bytes < map_length) { /* From ac854131d9844f79e2fdcef67a7707227538d78a Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 1 May 2020 16:07:28 -0400 Subject: [PATCH 198/300] USB: core: Fix misleading driver bug report The syzbot fuzzer found a race between URB submission to endpoint 0 and device reset. Namely, during the reset we call usb_ep0_reinit() because the characteristics of ep0 may have changed (if the reset follows a firmware update, for example). While usb_ep0_reinit() is running there is a brief period during which the pointers stored in udev->ep_in[0] and udev->ep_out[0] are set to NULL, and if an URB is submitted to ep0 during that period, usb_urb_ep_type_check() will report it as a driver bug. In the absence of those pointers, the routine thinks that the endpoint doesn't exist. The log message looks like this: ------------[ cut here ]------------ usb 2-1: BOGUS urb xfer, pipe 2 != type 2 WARNING: CPU: 0 PID: 9241 at drivers/usb/core/urb.c:478 usb_submit_urb+0x1188/0x1460 drivers/usb/core/urb.c:478 Now, although submitting an URB while the device is being reset is a questionable thing to do, it shouldn't count as a driver bug as severe as submitting an URB for an endpoint that doesn't exist. Indeed, endpoint 0 always exists, even while the device is in its unconfigured state. To prevent these misleading driver bug reports, this patch updates usb_disable_endpoint() to avoid clearing the ep_in[] and ep_out[] pointers when the endpoint being disabled is ep0. There's no danger of leaving a stale pointer in place, because the usb_host_endpoint structure being pointed to is stored permanently in udev->ep0; it doesn't get deallocated until the entire usb_device structure does. Reported-and-tested-by: syzbot+db339689b2101f6f6071@syzkaller.appspotmail.com Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/Pine.LNX.4.44L0.2005011558590.903-100000@netrider.rowland.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index a48678a0c83a..6197938dcc2d 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -1144,11 +1144,11 @@ void usb_disable_endpoint(struct usb_device *dev, unsigned int epaddr, if (usb_endpoint_out(epaddr)) { ep = dev->ep_out[epnum]; - if (reset_hardware) + if (reset_hardware && epnum != 0) dev->ep_out[epnum] = NULL; } else { ep = dev->ep_in[epnum]; - if (reset_hardware) + if (reset_hardware && epnum != 0) dev->ep_in[epnum] = NULL; } if (ep) { From e283f5e89f44a80ca536e4a12903c64e9e9a82e4 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Thu, 30 Apr 2020 16:56:57 +0300 Subject: [PATCH 199/300] usb: typec: intel_pmc_mux: Fix the property names The device property names for the port index number are "usb2-port-number" and "usb3-port-number", not "usb2-port" and "usb3-port". Fixes: 6701adfa9693 ("usb: typec: driver for Intel PMC mux control") Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20200430135657.45169-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/mux/intel_pmc_mux.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/mux/intel_pmc_mux.c b/drivers/usb/typec/mux/intel_pmc_mux.c index f5c5e0aef66f..bb23886c1768 100644 --- a/drivers/usb/typec/mux/intel_pmc_mux.c +++ b/drivers/usb/typec/mux/intel_pmc_mux.c @@ -298,11 +298,11 @@ static int pmc_usb_register_port(struct pmc_usb *pmc, int index, struct typec_mux_desc mux_desc = { }; int ret; - ret = fwnode_property_read_u8(fwnode, "usb2-port", &port->usb2_port); + ret = fwnode_property_read_u8(fwnode, "usb2-port-number", &port->usb2_port); if (ret) return ret; - ret = fwnode_property_read_u8(fwnode, "usb3-port", &port->usb3_port); + ret = fwnode_property_read_u8(fwnode, "usb3-port-number", &port->usb3_port); if (ret) return ret; From 2bef9aed6f0e22391c8d4570749b1acc9bc3981e Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Mon, 4 May 2020 15:13:48 -0500 Subject: [PATCH 200/300] usb: usbfs: correct kernel->user page attribute mismatch On some architectures (e.g. arm64) requests for IO coherent memory may use non-cachable attributes if the relevant device isn't cache coherent. If these pages are then remapped into userspace as cacheable, they may not be coherent with the non-cacheable mappings. In particular this happens with libusb, when it attempts to create zero-copy buffers for use by rtl-sdr (https://github.com/osmocom/rtl-sdr/). On low end arm devices with non-coherent USB ports, the application will be unexpectedly killed, while continuing to work fine on arm machines with coherent USB controllers. This bug has been discovered/reported a few times over the last few years. In the case of rtl-sdr a compile time option to enable/disable zero copy was implemented to work around it. Rather than relaying on application specific workarounds, dma_mmap_coherent() can be used instead of remap_pfn_range(). The page cache/etc attributes will then be correctly set in userspace to match the kernel mapping. Signed-off-by: Jeremy Linton Cc: stable Link: https://lore.kernel.org/r/20200504201348.1183246-1-jeremy.linton@arm.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 6833c918abce..b9db9812d6c5 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -217,6 +217,7 @@ static int usbdev_mmap(struct file *file, struct vm_area_struct *vma) { struct usb_memory *usbm = NULL; struct usb_dev_state *ps = file->private_data; + struct usb_hcd *hcd = bus_to_hcd(ps->dev->bus); size_t size = vma->vm_end - vma->vm_start; void *mem; unsigned long flags; @@ -250,9 +251,7 @@ static int usbdev_mmap(struct file *file, struct vm_area_struct *vma) usbm->vma_use_count = 1; INIT_LIST_HEAD(&usbm->memlist); - if (remap_pfn_range(vma, vma->vm_start, - virt_to_phys(usbm->mem) >> PAGE_SHIFT, - size, vma->vm_page_prot) < 0) { + if (dma_mmap_coherent(hcd->self.sysdev, vma, mem, dma_handle, size)) { dec_usb_memory_use_count(usbm, &usbm->vma_use_count); return -EAGAIN; } From 7990be48ef4d87163940d6c04c349c93f0bd9ae7 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Tue, 28 Apr 2020 22:44:28 -0700 Subject: [PATCH 201/300] usb: typec: mux: intel: Handle alt mode HPD_HIGH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the PMC Type C Subsystem (TCSS) Mux programming guide rev 0.6, when a device is transitioning to DP Alternate Mode state, if the HPD_STATE (bit 7) field in the status update command VDO is set to HPD_HIGH, the HPD_HIGH field in the Alternate Mode request “mode_data” field (bit 14) should also be set. Ensure the bit is correctly handled while issuing the Alternate Mode request. Signed-off-by: Prashant Malani Fixes: 6701adfa9693 ("usb: typec: driver for Intel PMC mux control") Acked-by: Heikki Krogerus Link: https://lore.kernel.org/r/20200429054432.134178-1-pmalani@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/mux/intel_pmc_mux.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/typec/mux/intel_pmc_mux.c b/drivers/usb/typec/mux/intel_pmc_mux.c index bb23886c1768..67c5139cfa0d 100644 --- a/drivers/usb/typec/mux/intel_pmc_mux.c +++ b/drivers/usb/typec/mux/intel_pmc_mux.c @@ -157,6 +157,10 @@ pmc_usb_mux_dp(struct pmc_usb_port *port, struct typec_mux_state *state) req.mode_data |= (state->mode - TYPEC_STATE_MODAL) << PMC_USB_ALTMODE_DP_MODE_SHIFT; + if (data->status & DP_STATUS_HPD_STATE) + req.mode_data |= PMC_USB_DP_HPD_LVL << + PMC_USB_ALTMODE_DP_MODE_SHIFT; + return pmc_usb_command(port, (void *)&req, sizeof(req)); } From e87fa339d413c540c065c280ba9e7cc9a8dbcfd1 Mon Sep 17 00:00:00 2001 From: Archana Patni Date: Tue, 21 Apr 2020 14:10:19 +0530 Subject: [PATCH 202/300] platform/x86: intel_pmc_core: Change Jasper Lake S0ix debug reg map back to ICL Jasper Lake uses Icelake PCH IPs and the S0ix debug interfaces are same as Icelake. It uses SLP_S0_DBG register latch/read interface from Icelake generation. It doesn't use Tiger Lake LPM debug registers. Change the Jasper Lake S0ix debug interface to use the ICL reg map. Fixes: 16292bed9c56 ("platform/x86: intel_pmc_core: Add Atom based Jasper Lake (JSL) platform support") Signed-off-by: Archana Patni Acked-by: David E. Box Tested-by: Divagar Mohandass Signed-off-by: Andy Shevchenko --- drivers/platform/x86/intel_pmc_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/intel_pmc_core.c b/drivers/platform/x86/intel_pmc_core.c index d2a5d4c36715..a130859ec49e 100644 --- a/drivers/platform/x86/intel_pmc_core.c +++ b/drivers/platform/x86/intel_pmc_core.c @@ -255,7 +255,7 @@ static const struct pmc_bit_map *ext_cnp_pfear_map[] = { }; static const struct pmc_bit_map icl_pfear_map[] = { - /* Ice Lake generation onwards only */ + /* Ice Lake and Jasper Lake generation onwards only */ {"RES_65", BIT(0)}, {"RES_66", BIT(1)}, {"RES_67", BIT(2)}, @@ -274,7 +274,7 @@ static const struct pmc_bit_map *ext_icl_pfear_map[] = { }; static const struct pmc_bit_map tgl_pfear_map[] = { - /* Tiger Lake, Elkhart Lake and Jasper Lake generation onwards only */ + /* Tiger Lake and Elkhart Lake generation onwards only */ {"PSF9", BIT(0)}, {"RES_66", BIT(1)}, {"RES_67", BIT(2)}, @@ -1156,7 +1156,7 @@ static const struct x86_cpu_id intel_pmc_core_ids[] = { X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_reg_map), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_reg_map), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &tgl_reg_map), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &tgl_reg_map), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &icl_reg_map), {} }; From 3bd12da7f50b8bc191fcb3bab1f55c582234df59 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 23 Apr 2020 00:05:59 +0200 Subject: [PATCH 203/300] platform/x86: asus-nb-wmi: Do not load on Asus T100TA and T200TA asus-nb-wmi does not add any extra functionality on these Asus Transformer books. They have detachable keyboards, so the hotkeys are send through a HID device (and handled by the hid-asus driver) and also the rfkill functionality is not used on these devices. Besides not adding any extra functionality, initializing the WMI interface on these devices actually has a negative side-effect. For some reason the \_SB.ATKD.INIT() function which asus_wmi_platform_init() calls drives GPO2 (INT33FC:02) pin 8, which is connected to the front facing webcam LED, high and there is no (WMI or other) interface to drive this low again causing the LED to be permanently on, even during suspend. This commit adds a blacklist of DMI system_ids on which not to load the asus-nb-wmi and adds these Transformer books to this list. This fixes the webcam LED being permanently on under Linux. Signed-off-by: Hans de Goede Signed-off-by: Andy Shevchenko --- drivers/platform/x86/asus-nb-wmi.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index 6f12747a359a..c4404d9c1de4 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -515,9 +515,33 @@ static struct asus_wmi_driver asus_nb_wmi_driver = { .detect_quirks = asus_nb_wmi_quirks, }; +static const struct dmi_system_id asus_nb_wmi_blacklist[] __initconst = { + { + /* + * asus-nb-wm adds no functionality. The T100TA has a detachable + * USB kbd, so no hotkeys and it has no WMI rfkill; and loading + * asus-nb-wm causes the camera LED to turn and _stay_ on. + */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "T100TA"), + }, + }, + { + /* The Asus T200TA has the same issue as the T100TA */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "T200TA"), + }, + }, + {} /* Terminating entry */ +}; static int __init asus_nb_wmi_init(void) { + if (dmi_check_system(asus_nb_wmi_blacklist)) + return -ENODEV; + return asus_wmi_register_driver(&asus_nb_wmi_driver); } From 01f259f3720c3965443109d82a0f12611ea8bb58 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 Apr 2020 23:36:38 +0200 Subject: [PATCH 204/300] platform/x86: intel_pmc_core: avoid unused-function warnings When both CONFIG_DEBUG_FS and CONFIG_PM_SLEEP are disabled, the functions that got moved out of the #ifdef section now cause a warning: drivers/platform/x86/intel_pmc_core.c:654:13: error: 'pmc_core_lpm_display' defined but not used [-Werror=unused-function] 654 | static void pmc_core_lpm_display(struct pmc_dev *pmcdev, struct device *dev, | ^~~~~~~~~~~~~~~~~~~~ drivers/platform/x86/intel_pmc_core.c:617:13: error: 'pmc_core_slps0_display' defined but not used [-Werror=unused-function] 617 | static void pmc_core_slps0_display(struct pmc_dev *pmcdev, struct device *dev, | ^~~~~~~~~~~~~~~~~~~~~~ Rather than add even more #ifdefs here, remove them entirely and let the compiler work it out, it can actually get rid of all the debugfs calls without problems as long as the struct member is there. The two PM functions just need a __maybe_unused annotations to avoid another warning instead of the #ifdef. Fixes: aae43c2bcdc1 ("platform/x86: intel_pmc_core: Relocate pmc_core_*_display() to outside of CONFIG_DEBUG_FS") Signed-off-by: Arnd Bergmann Signed-off-by: Andy Shevchenko --- drivers/platform/x86/intel_pmc_core.c | 18 ++---------------- drivers/platform/x86/intel_pmc_core.h | 2 -- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/platform/x86/intel_pmc_core.c b/drivers/platform/x86/intel_pmc_core.c index a130859ec49e..7c8bdab078cf 100644 --- a/drivers/platform/x86/intel_pmc_core.c +++ b/drivers/platform/x86/intel_pmc_core.c @@ -692,7 +692,6 @@ static void pmc_core_lpm_display(struct pmc_dev *pmcdev, struct device *dev, kfree(lpm_regs); } -#if IS_ENABLED(CONFIG_DEBUG_FS) static bool slps0_dbg_latch; static inline u8 pmc_core_reg_read_byte(struct pmc_dev *pmcdev, int offset) @@ -1133,15 +1132,6 @@ static void pmc_core_dbgfs_register(struct pmc_dev *pmcdev) &pmc_core_substate_l_sts_regs_fops); } } -#else -static inline void pmc_core_dbgfs_register(struct pmc_dev *pmcdev) -{ -} - -static inline void pmc_core_dbgfs_unregister(struct pmc_dev *pmcdev) -{ -} -#endif /* CONFIG_DEBUG_FS */ static const struct x86_cpu_id intel_pmc_core_ids[] = { X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &spt_reg_map), @@ -1260,13 +1250,11 @@ static int pmc_core_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM_SLEEP - static bool warn_on_s0ix_failures; module_param(warn_on_s0ix_failures, bool, 0644); MODULE_PARM_DESC(warn_on_s0ix_failures, "Check and warn for S0ix failures"); -static int pmc_core_suspend(struct device *dev) +static __maybe_unused int pmc_core_suspend(struct device *dev) { struct pmc_dev *pmcdev = dev_get_drvdata(dev); @@ -1318,7 +1306,7 @@ static inline bool pmc_core_is_s0ix_failed(struct pmc_dev *pmcdev) return false; } -static int pmc_core_resume(struct device *dev) +static __maybe_unused int pmc_core_resume(struct device *dev) { struct pmc_dev *pmcdev = dev_get_drvdata(dev); const struct pmc_bit_map **maps = pmcdev->map->lpm_sts; @@ -1348,8 +1336,6 @@ static int pmc_core_resume(struct device *dev) return 0; } -#endif - static const struct dev_pm_ops pmc_core_pm_ops = { SET_LATE_SYSTEM_SLEEP_PM_OPS(pmc_core_suspend, pmc_core_resume) }; diff --git a/drivers/platform/x86/intel_pmc_core.h b/drivers/platform/x86/intel_pmc_core.h index 0d50b2402abe..5eae55d80226 100644 --- a/drivers/platform/x86/intel_pmc_core.h +++ b/drivers/platform/x86/intel_pmc_core.h @@ -282,9 +282,7 @@ struct pmc_dev { u32 base_addr; void __iomem *regbase; const struct pmc_reg_map *map; -#if IS_ENABLED(CONFIG_DEBUG_FS) struct dentry *dbgfs_dir; -#endif /* CONFIG_DEBUG_FS */ int pmc_xram_read_bit; struct mutex lock; /* generic mutex lock for PMC Core */ From f8a31eca47bec197f5ec5dc40ad675450c2058a5 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Wed, 29 Apr 2020 16:59:40 +0800 Subject: [PATCH 205/300] platform/x86: thinkpad_acpi: Remove always false 'value < 0' statement Since 'value' is declared as unsigned long, the following statement is always false. value < 0 So let's remove it. Reported-by: Hulk Robot Signed-off-by: Xiongfeng Wang Signed-off-by: Andy Shevchenko --- drivers/platform/x86/thinkpad_acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 8eaadbaf8ffa..0f704484ae1d 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -9548,7 +9548,7 @@ static ssize_t tpacpi_battery_store(int what, if (!battery_info.batteries[battery].start_support) return -ENODEV; /* valid values are [0, 99] */ - if (value < 0 || value > 99) + if (value > 99) return -EINVAL; if (value > battery_info.batteries[battery].charge_stop) return -EINVAL; From eb791aa70b90c559eeb371d807c8813d569393f0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2020 14:54:09 +0200 Subject: [PATCH 206/300] iommu/amd: Fix race in increase_address_space()/fetch_pte() The 'pt_root' and 'mode' struct members of 'struct protection_domain' need to be get/set atomically, otherwise the page-table of the domain can get corrupted. Merge the fields into one atomic64_t struct member which can be get/set atomically. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reported-by: Qian Cai Signed-off-by: Joerg Roedel Tested-by: Qian Cai Link: https://lore.kernel.org/r/20200504125413.16798-2-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 142 ++++++++++++++++++++++++-------- drivers/iommu/amd_iommu_types.h | 9 +- 2 files changed, 113 insertions(+), 38 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 20cce366e951..28229a38af4d 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -151,6 +151,26 @@ static struct protection_domain *to_pdomain(struct iommu_domain *dom) return container_of(dom, struct protection_domain, domain); } +static void amd_iommu_domain_get_pgtable(struct protection_domain *domain, + struct domain_pgtable *pgtable) +{ + u64 pt_root = atomic64_read(&domain->pt_root); + + pgtable->root = (u64 *)(pt_root & PAGE_MASK); + pgtable->mode = pt_root & 7; /* lowest 3 bits encode pgtable mode */ +} + +static u64 amd_iommu_domain_encode_pgtable(u64 *root, int mode) +{ + u64 pt_root; + + /* lowest 3 bits encode pgtable mode */ + pt_root = mode & 7; + pt_root |= (u64)root; + + return pt_root; +} + static struct iommu_dev_data *alloc_dev_data(u16 devid) { struct iommu_dev_data *dev_data; @@ -1397,13 +1417,18 @@ static struct page *free_sub_pt(unsigned long root, int mode, static void free_pagetable(struct protection_domain *domain) { - unsigned long root = (unsigned long)domain->pt_root; + struct domain_pgtable pgtable; struct page *freelist = NULL; + unsigned long root; - BUG_ON(domain->mode < PAGE_MODE_NONE || - domain->mode > PAGE_MODE_6_LEVEL); + amd_iommu_domain_get_pgtable(domain, &pgtable); + atomic64_set(&domain->pt_root, 0); - freelist = free_sub_pt(root, domain->mode, freelist); + BUG_ON(pgtable.mode < PAGE_MODE_NONE || + pgtable.mode > PAGE_MODE_6_LEVEL); + + root = (unsigned long)pgtable.root; + freelist = free_sub_pt(root, pgtable.mode, freelist); free_page_list(freelist); } @@ -1417,24 +1442,28 @@ static bool increase_address_space(struct protection_domain *domain, unsigned long address, gfp_t gfp) { + struct domain_pgtable pgtable; unsigned long flags; bool ret = false; - u64 *pte; + u64 *pte, root; spin_lock_irqsave(&domain->lock, flags); - if (address <= PM_LEVEL_SIZE(domain->mode) || - WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL)) + amd_iommu_domain_get_pgtable(domain, &pgtable); + + if (address <= PM_LEVEL_SIZE(pgtable.mode) || + WARN_ON_ONCE(pgtable.mode == PAGE_MODE_6_LEVEL)) goto out; pte = (void *)get_zeroed_page(gfp); if (!pte) goto out; - *pte = PM_LEVEL_PDE(domain->mode, - iommu_virt_to_phys(domain->pt_root)); - domain->pt_root = pte; - domain->mode += 1; + *pte = PM_LEVEL_PDE(pgtable.mode, iommu_virt_to_phys(pgtable.root)); + + root = amd_iommu_domain_encode_pgtable(pte, pgtable.mode + 1); + + atomic64_set(&domain->pt_root, root); ret = true; @@ -1451,16 +1480,22 @@ static u64 *alloc_pte(struct protection_domain *domain, gfp_t gfp, bool *updated) { + struct domain_pgtable pgtable; int level, end_lvl; u64 *pte, *page; BUG_ON(!is_power_of_2(page_size)); - while (address > PM_LEVEL_SIZE(domain->mode)) - *updated = increase_address_space(domain, address, gfp) || *updated; + amd_iommu_domain_get_pgtable(domain, &pgtable); - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + while (address > PM_LEVEL_SIZE(pgtable.mode)) { + *updated = increase_address_space(domain, address, gfp) || *updated; + amd_iommu_domain_get_pgtable(domain, &pgtable); + } + + + level = pgtable.mode - 1; + pte = &pgtable.root[PM_LEVEL_INDEX(level, address)]; address = PAGE_SIZE_ALIGN(address, page_size); end_lvl = PAGE_SIZE_LEVEL(page_size); @@ -1536,16 +1571,19 @@ static u64 *fetch_pte(struct protection_domain *domain, unsigned long address, unsigned long *page_size) { + struct domain_pgtable pgtable; int level; u64 *pte; *page_size = 0; - if (address > PM_LEVEL_SIZE(domain->mode)) + amd_iommu_domain_get_pgtable(domain, &pgtable); + + if (address > PM_LEVEL_SIZE(pgtable.mode)) return NULL; - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + level = pgtable.mode - 1; + pte = &pgtable.root[PM_LEVEL_INDEX(level, address)]; *page_size = PTE_LEVEL_PAGE_SIZE(level); while (level > 0) { @@ -1806,6 +1844,7 @@ static void dma_ops_domain_free(struct protection_domain *domain) static struct protection_domain *dma_ops_domain_alloc(void) { struct protection_domain *domain; + u64 *pt_root, root; domain = kzalloc(sizeof(struct protection_domain), GFP_KERNEL); if (!domain) @@ -1814,12 +1853,14 @@ static struct protection_domain *dma_ops_domain_alloc(void) if (protection_domain_init(domain)) goto free_domain; - domain->mode = PAGE_MODE_3_LEVEL; - domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); - domain->flags = PD_DMA_OPS_MASK; - if (!domain->pt_root) + pt_root = (void *)get_zeroed_page(GFP_KERNEL); + if (!pt_root) goto free_domain; + root = amd_iommu_domain_encode_pgtable(pt_root, PAGE_MODE_3_LEVEL); + atomic64_set(&domain->pt_root, root); + domain->flags = PD_DMA_OPS_MASK; + if (iommu_get_dma_cookie(&domain->domain) == -ENOMEM) goto free_domain; @@ -1843,14 +1884,17 @@ static bool dma_ops_domain(struct protection_domain *domain) static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats, bool ppr) { + struct domain_pgtable pgtable; u64 pte_root = 0; u64 flags = 0; u32 old_domid; - if (domain->mode != PAGE_MODE_NONE) - pte_root = iommu_virt_to_phys(domain->pt_root); + amd_iommu_domain_get_pgtable(domain, &pgtable); - pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) + if (pgtable.mode != PAGE_MODE_NONE) + pte_root = iommu_virt_to_phys(pgtable.root); + + pte_root |= (pgtable.mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V | DTE_FLAG_TV; @@ -2375,6 +2419,7 @@ out_err: static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) { struct protection_domain *pdomain; + u64 *pt_root, root; switch (type) { case IOMMU_DOMAIN_UNMANAGED: @@ -2382,13 +2427,15 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) if (!pdomain) return NULL; - pdomain->mode = PAGE_MODE_3_LEVEL; - pdomain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); - if (!pdomain->pt_root) { + pt_root = (void *)get_zeroed_page(GFP_KERNEL); + if (!pt_root) { protection_domain_free(pdomain); return NULL; } + root = amd_iommu_domain_encode_pgtable(pt_root, PAGE_MODE_3_LEVEL); + atomic64_set(&pdomain->pt_root, root); + pdomain->domain.geometry.aperture_start = 0; pdomain->domain.geometry.aperture_end = ~0ULL; pdomain->domain.geometry.force_aperture = true; @@ -2406,7 +2453,7 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) if (!pdomain) return NULL; - pdomain->mode = PAGE_MODE_NONE; + atomic64_set(&pdomain->pt_root, PAGE_MODE_NONE); break; default: return NULL; @@ -2418,6 +2465,7 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) static void amd_iommu_domain_free(struct iommu_domain *dom) { struct protection_domain *domain; + struct domain_pgtable pgtable; domain = to_pdomain(dom); @@ -2435,7 +2483,9 @@ static void amd_iommu_domain_free(struct iommu_domain *dom) dma_ops_domain_free(domain); break; default: - if (domain->mode != PAGE_MODE_NONE) + amd_iommu_domain_get_pgtable(domain, &pgtable); + + if (pgtable.mode != PAGE_MODE_NONE) free_pagetable(domain); if (domain->flags & PD_IOMMUV2_MASK) @@ -2518,10 +2568,12 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, gfp_t gfp) { struct protection_domain *domain = to_pdomain(dom); + struct domain_pgtable pgtable; int prot = 0; int ret; - if (domain->mode == PAGE_MODE_NONE) + amd_iommu_domain_get_pgtable(domain, &pgtable); + if (pgtable.mode == PAGE_MODE_NONE) return -EINVAL; if (iommu_prot & IOMMU_READ) @@ -2541,8 +2593,10 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, struct iommu_iotlb_gather *gather) { struct protection_domain *domain = to_pdomain(dom); + struct domain_pgtable pgtable; - if (domain->mode == PAGE_MODE_NONE) + amd_iommu_domain_get_pgtable(domain, &pgtable); + if (pgtable.mode == PAGE_MODE_NONE) return 0; return iommu_unmap_page(domain, iova, page_size); @@ -2553,9 +2607,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, { struct protection_domain *domain = to_pdomain(dom); unsigned long offset_mask, pte_pgsize; + struct domain_pgtable pgtable; u64 *pte, __pte; - if (domain->mode == PAGE_MODE_NONE) + amd_iommu_domain_get_pgtable(domain, &pgtable); + if (pgtable.mode == PAGE_MODE_NONE) return iova; pte = fetch_pte(domain, iova, &pte_pgsize); @@ -2708,16 +2764,26 @@ EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier); void amd_iommu_domain_direct_map(struct iommu_domain *dom) { struct protection_domain *domain = to_pdomain(dom); + struct domain_pgtable pgtable; unsigned long flags; + u64 pt_root; spin_lock_irqsave(&domain->lock, flags); + /* First save pgtable configuration*/ + amd_iommu_domain_get_pgtable(domain, &pgtable); + /* Update data structure */ - domain->mode = PAGE_MODE_NONE; + pt_root = amd_iommu_domain_encode_pgtable(NULL, PAGE_MODE_NONE); + atomic64_set(&domain->pt_root, pt_root); /* Make changes visible to IOMMUs */ update_domain(domain); + /* Restore old pgtable in domain->ptroot to free page-table */ + pt_root = amd_iommu_domain_encode_pgtable(pgtable.root, pgtable.mode); + atomic64_set(&domain->pt_root, pt_root); + /* Page-table is not visible to IOMMU anymore, so free it */ free_pagetable(domain); @@ -2908,9 +2974,11 @@ static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc) static int __set_gcr3(struct protection_domain *domain, int pasid, unsigned long cr3) { + struct domain_pgtable pgtable; u64 *pte; - if (domain->mode != PAGE_MODE_NONE) + amd_iommu_domain_get_pgtable(domain, &pgtable); + if (pgtable.mode != PAGE_MODE_NONE) return -EINVAL; pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true); @@ -2924,9 +2992,11 @@ static int __set_gcr3(struct protection_domain *domain, int pasid, static int __clear_gcr3(struct protection_domain *domain, int pasid) { + struct domain_pgtable pgtable; u64 *pte; - if (domain->mode != PAGE_MODE_NONE) + amd_iommu_domain_get_pgtable(domain, &pgtable); + if (pgtable.mode != PAGE_MODE_NONE) return -EINVAL; pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false); diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index ca8c4522045b..7a8fdec138bd 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -468,8 +468,7 @@ struct protection_domain { iommu core code */ spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ - int mode; /* paging mode (0-6 levels) */ - u64 *pt_root; /* page table root pointer */ + atomic64_t pt_root; /* pgtable root and pgtable mode */ int glx; /* Number of levels for GCR3 table */ u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ @@ -477,6 +476,12 @@ struct protection_domain { unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ }; +/* For decocded pt_root */ +struct domain_pgtable { + int mode; + u64 *root; +}; + /* * Structure where we save information about one hardware AMD IOMMU in the * system. From 5b8a9a047b6cad361405c7900c1e1cdd378c4589 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2020 14:54:10 +0200 Subject: [PATCH 207/300] iommu/amd: Do not loop forever when trying to increase address space When increase_address_space() fails to allocate memory, alloc_pte() will call it again until it succeeds. Do not loop forever while trying to increase the address space and just return an error instead. Signed-off-by: Joerg Roedel Tested-by: Qian Cai Link: https://lore.kernel.org/r/20200504125413.16798-3-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 28229a38af4d..68da484a69dd 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1489,8 +1489,19 @@ static u64 *alloc_pte(struct protection_domain *domain, amd_iommu_domain_get_pgtable(domain, &pgtable); while (address > PM_LEVEL_SIZE(pgtable.mode)) { - *updated = increase_address_space(domain, address, gfp) || *updated; + bool upd = increase_address_space(domain, address, gfp); + + /* Read new values to check if update was successful */ amd_iommu_domain_get_pgtable(domain, &pgtable); + + /* + * Return an error if there is no memory to update the + * page-table. + */ + if (!upd && (address > PM_LEVEL_SIZE(pgtable.mode))) + return NULL; + + *updated = *updated || upd; } From f44a4d7e4f1cdef73c90b1dc749c4d8a7372a8eb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2020 14:54:11 +0200 Subject: [PATCH 208/300] iommu/amd: Call domain_flush_complete() in update_domain() The update_domain() function is expected to also inform the hardware about domain changes. This needs a COMPLETION_WAIT command to be sent to all IOMMUs which use the domain. Signed-off-by: Joerg Roedel Tested-by: Qian Cai Link: https://lore.kernel.org/r/20200504125413.16798-4-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 68da484a69dd..d2499c86d395 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2321,6 +2321,7 @@ static void update_domain(struct protection_domain *domain) domain_flush_devices(domain); domain_flush_tlb_pde(domain); + domain_flush_complete(domain); } int __init amd_iommu_init_api(void) From 19c6978fba68a2cdedee7d55fb8c3063d47982d9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2020 14:54:12 +0200 Subject: [PATCH 209/300] iommu/amd: Update Device Table in increase_address_space() The Device Table needs to be updated before the new page-table root can be published in domain->pt_root. Otherwise a concurrent call to fetch_pte might fetch a PTE which is not reachable through the Device Table Entry. Fixes: 92d420ec028d ("iommu/amd: Relax locking in dma_ops path") Reported-by: Qian Cai Signed-off-by: Joerg Roedel Tested-by: Qian Cai Link: https://lore.kernel.org/r/20200504125413.16798-5-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 49 ++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index d2499c86d395..2ae1daac888a 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -101,6 +101,8 @@ struct kmem_cache *amd_iommu_irq_cache; static void update_domain(struct protection_domain *domain); static int protection_domain_init(struct protection_domain *domain); static void detach_device(struct device *dev); +static void update_and_flush_device_table(struct protection_domain *domain, + struct domain_pgtable *pgtable); /**************************************************************************** * @@ -1461,8 +1463,16 @@ static bool increase_address_space(struct protection_domain *domain, *pte = PM_LEVEL_PDE(pgtable.mode, iommu_virt_to_phys(pgtable.root)); - root = amd_iommu_domain_encode_pgtable(pte, pgtable.mode + 1); + pgtable.root = pte; + pgtable.mode += 1; + update_and_flush_device_table(domain, &pgtable); + domain_flush_complete(domain); + /* + * Device Table needs to be updated and flushed before the new root can + * be published. + */ + root = amd_iommu_domain_encode_pgtable(pte, pgtable.mode); atomic64_set(&domain->pt_root, root); ret = true; @@ -1893,19 +1903,17 @@ static bool dma_ops_domain(struct protection_domain *domain) } static void set_dte_entry(u16 devid, struct protection_domain *domain, + struct domain_pgtable *pgtable, bool ats, bool ppr) { - struct domain_pgtable pgtable; u64 pte_root = 0; u64 flags = 0; u32 old_domid; - amd_iommu_domain_get_pgtable(domain, &pgtable); + if (pgtable->mode != PAGE_MODE_NONE) + pte_root = iommu_virt_to_phys(pgtable->root); - if (pgtable.mode != PAGE_MODE_NONE) - pte_root = iommu_virt_to_phys(pgtable.root); - - pte_root |= (pgtable.mode & DEV_ENTRY_MODE_MASK) + pte_root |= (pgtable->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V | DTE_FLAG_TV; @@ -1978,6 +1986,7 @@ static void clear_dte_entry(u16 devid) static void do_attach(struct iommu_dev_data *dev_data, struct protection_domain *domain) { + struct domain_pgtable pgtable; struct amd_iommu *iommu; bool ats; @@ -1993,7 +2002,9 @@ static void do_attach(struct iommu_dev_data *dev_data, domain->dev_cnt += 1; /* Update device table */ - set_dte_entry(dev_data->devid, domain, ats, dev_data->iommu_v2); + amd_iommu_domain_get_pgtable(domain, &pgtable); + set_dte_entry(dev_data->devid, domain, &pgtable, + ats, dev_data->iommu_v2); clone_aliases(dev_data->pdev); device_flush_dte(dev_data); @@ -2304,22 +2315,34 @@ static int amd_iommu_domain_get_attr(struct iommu_domain *domain, * *****************************************************************************/ -static void update_device_table(struct protection_domain *domain) +static void update_device_table(struct protection_domain *domain, + struct domain_pgtable *pgtable) { struct iommu_dev_data *dev_data; list_for_each_entry(dev_data, &domain->dev_list, list) { - set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled, - dev_data->iommu_v2); + set_dte_entry(dev_data->devid, domain, pgtable, + dev_data->ats.enabled, dev_data->iommu_v2); clone_aliases(dev_data->pdev); } } +static void update_and_flush_device_table(struct protection_domain *domain, + struct domain_pgtable *pgtable) +{ + update_device_table(domain, pgtable); + domain_flush_devices(domain); +} + static void update_domain(struct protection_domain *domain) { - update_device_table(domain); + struct domain_pgtable pgtable; - domain_flush_devices(domain); + /* Update device table */ + amd_iommu_domain_get_pgtable(domain, &pgtable); + update_and_flush_device_table(domain, &pgtable); + + /* Flush domain TLB(s) and wait for completion */ domain_flush_tlb_pde(domain); domain_flush_complete(domain); } From 119b2b2c3e256f4bcff7439acc25ac1e9f1aaa4e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 4 May 2020 14:54:13 +0200 Subject: [PATCH 210/300] iommu/amd: Do not flush Device Table in iommu_map_page() The flush of the Device Table Entries for the domain has already happened in increase_address_space(), if necessary. Do no flush them again in iommu_map_page(). Signed-off-by: Joerg Roedel Tested-by: Qian Cai Link: https://lore.kernel.org/r/20200504125413.16798-6-joro@8bytes.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 2ae1daac888a..1dc3718560d0 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1446,15 +1446,18 @@ static bool increase_address_space(struct protection_domain *domain, { struct domain_pgtable pgtable; unsigned long flags; - bool ret = false; + bool ret = true; u64 *pte, root; spin_lock_irqsave(&domain->lock, flags); amd_iommu_domain_get_pgtable(domain, &pgtable); - if (address <= PM_LEVEL_SIZE(pgtable.mode) || - WARN_ON_ONCE(pgtable.mode == PAGE_MODE_6_LEVEL)) + if (address <= PM_LEVEL_SIZE(pgtable.mode)) + goto out; + + ret = false; + if (WARN_ON_ONCE(pgtable.mode == PAGE_MODE_6_LEVEL)) goto out; pte = (void *)get_zeroed_page(gfp); @@ -1499,19 +1502,15 @@ static u64 *alloc_pte(struct protection_domain *domain, amd_iommu_domain_get_pgtable(domain, &pgtable); while (address > PM_LEVEL_SIZE(pgtable.mode)) { - bool upd = increase_address_space(domain, address, gfp); - - /* Read new values to check if update was successful */ - amd_iommu_domain_get_pgtable(domain, &pgtable); - /* * Return an error if there is no memory to update the * page-table. */ - if (!upd && (address > PM_LEVEL_SIZE(pgtable.mode))) + if (!increase_address_space(domain, address, gfp)) return NULL; - *updated = *updated || upd; + /* Read new values to check if update was successful */ + amd_iommu_domain_get_pgtable(domain, &pgtable); } @@ -1719,7 +1718,13 @@ out: unsigned long flags; spin_lock_irqsave(&dom->lock, flags); - update_domain(dom); + /* + * Flush domain TLB(s) and wait for completion. Any Device-Table + * Updates and flushing already happened in + * increase_address_space(). + */ + domain_flush_tlb_pde(dom); + domain_flush_complete(dom); spin_unlock_irqrestore(&dom->lock, flags); } From d76bc8200f9cf8b6746e66b37317ba477eda25c4 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Wed, 29 Apr 2020 00:12:00 +0300 Subject: [PATCH 211/300] mei: me: disable mei interface on LBG servers. Disable the MEI driver on LBG SPS (server) platforms, some corner flows such as recovery mode does not work, and the driver doesn't have working use cases. Cc: Signed-off-by: Tomas Winkler Link: https://lore.kernel.org/r/20200428211200.12200-1-tomas.winkler@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/hw-me.c | 8 ++++++++ drivers/misc/mei/hw-me.h | 4 ++++ drivers/misc/mei/pci-me.c | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/misc/mei/hw-me.c b/drivers/misc/mei/hw-me.c index 668418d7ea77..f620442addf5 100644 --- a/drivers/misc/mei/hw-me.c +++ b/drivers/misc/mei/hw-me.c @@ -1465,6 +1465,13 @@ static const struct mei_cfg mei_me_pch12_cfg = { MEI_CFG_DMA_128, }; +/* LBG with quirk for SPS Firmware exclusion */ +static const struct mei_cfg mei_me_pch12_sps_cfg = { + MEI_CFG_PCH8_HFS, + MEI_CFG_FW_VER_SUPP, + MEI_CFG_FW_SPS, +}; + /* Tiger Lake and newer devices */ static const struct mei_cfg mei_me_pch15_cfg = { MEI_CFG_PCH8_HFS, @@ -1487,6 +1494,7 @@ static const struct mei_cfg *const mei_cfg_list[] = { [MEI_ME_PCH8_CFG] = &mei_me_pch8_cfg, [MEI_ME_PCH8_SPS_CFG] = &mei_me_pch8_sps_cfg, [MEI_ME_PCH12_CFG] = &mei_me_pch12_cfg, + [MEI_ME_PCH12_SPS_CFG] = &mei_me_pch12_sps_cfg, [MEI_ME_PCH15_CFG] = &mei_me_pch15_cfg, }; diff --git a/drivers/misc/mei/hw-me.h b/drivers/misc/mei/hw-me.h index 4a8d4dcd5a91..b6b94e211464 100644 --- a/drivers/misc/mei/hw-me.h +++ b/drivers/misc/mei/hw-me.h @@ -80,6 +80,9 @@ struct mei_me_hw { * servers platforms with quirk for * SPS firmware exclusion. * @MEI_ME_PCH12_CFG: Platform Controller Hub Gen12 and newer + * @MEI_ME_PCH12_SPS_CFG: Platform Controller Hub Gen12 and newer + * servers platforms with quirk for + * SPS firmware exclusion. * @MEI_ME_PCH15_CFG: Platform Controller Hub Gen15 and newer * @MEI_ME_NUM_CFG: Upper Sentinel. */ @@ -93,6 +96,7 @@ enum mei_cfg_idx { MEI_ME_PCH8_CFG, MEI_ME_PCH8_SPS_CFG, MEI_ME_PCH12_CFG, + MEI_ME_PCH12_SPS_CFG, MEI_ME_PCH15_CFG, MEI_ME_NUM_CFG, }; diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index 0c390fe421ad..a1ed375fed37 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -70,7 +70,7 @@ static const struct pci_device_id mei_me_pci_tbl[] = { {MEI_PCI_DEVICE(MEI_DEV_ID_SPT_2, MEI_ME_PCH8_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_SPT_H, MEI_ME_PCH8_SPS_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_SPT_H_2, MEI_ME_PCH8_SPS_CFG)}, - {MEI_PCI_DEVICE(MEI_DEV_ID_LBG, MEI_ME_PCH12_CFG)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_LBG, MEI_ME_PCH12_SPS_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_BXT_M, MEI_ME_PCH8_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_APL_I, MEI_ME_PCH8_CFG)}, From 115f32512f13c0280161908e9de45a97a87673bb Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Fri, 1 May 2020 00:35:50 +0530 Subject: [PATCH 212/300] bus: mhi: Fix parsing of mhi_flags With the current parsing of mhi_flags, the following statement always return false: eob = !!(flags & MHI_EOB); This is due to the fact that 'enum mhi_flags' starts with index 0 and we are using direct AND operation to extract each bit. Fix this by using BIT() macros for defining the flags so that the reset of the code need not be touched. Fixes: 189ff97cca53 ("bus: mhi: core: Add support for data transfer") Reported-by: Dan Carpenter Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20200430190555.32741-2-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/mhi.h b/include/linux/mhi.h index ad1996001965..5642806360f3 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -53,9 +53,9 @@ enum mhi_callback { * @MHI_CHAIN: Linked transfer */ enum mhi_flags { - MHI_EOB, - MHI_EOT, - MHI_CHAIN, + MHI_EOB = BIT(0), + MHI_EOT = BIT(1), + MHI_CHAIN = BIT(2), }; /** From ce312258084ea0c828e11c831c7b9e8b42b02ef8 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Fri, 1 May 2020 00:35:51 +0530 Subject: [PATCH 213/300] bus: mhi: core: Make sure to powerdown if mhi_sync_power_up fails Powerdown is necessary if mhi_sync_power_up fails due to a timeout, to clean up the resources. Otherwise a BUG could be triggered when attempting to clean up MSIs because the IRQ is still active from a request_irq(). Signed-off-by: Jeffrey Hugo Reviewed-by: Hemant Kumar Reviewed-by: Manivannan Sadhasivam Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20200430190555.32741-3-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/bus/mhi/core/pm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/bus/mhi/core/pm.c b/drivers/bus/mhi/core/pm.c index 52690cb5c89c..dc83d65f7784 100644 --- a/drivers/bus/mhi/core/pm.c +++ b/drivers/bus/mhi/core/pm.c @@ -902,7 +902,11 @@ int mhi_sync_power_up(struct mhi_controller *mhi_cntrl) MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state), msecs_to_jiffies(mhi_cntrl->timeout_ms)); - return (MHI_IN_MISSION_MODE(mhi_cntrl->ee)) ? 0 : -EIO; + ret = (MHI_IN_MISSION_MODE(mhi_cntrl->ee)) ? 0 : -ETIMEDOUT; + if (ret) + mhi_power_down(mhi_cntrl, false); + + return ret; } EXPORT_SYMBOL(mhi_sync_power_up); From 85a087df4a719ebab940efa3c79625e68161f57b Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Fri, 1 May 2020 00:35:52 +0530 Subject: [PATCH 214/300] bus: mhi: core: Remove link_status() callback If the MHI core detects invalid data due to a PCI read, it calls into the controller via link_status() to double check that the link is infact down. All in all, this is pretty pointless, and racy. There are no good reasons for this, and only drawbacks. Its pointless because chances are, the controller is going to do the same thing to determine if the link is down - attempt a PCI access and compare the result. This does not make the link status decision any smarter. Its racy because its possible that the link was down at the time of the MHI core access, but then recovered before the controller access. In this case, the controller will indicate the link is not down, and the MHI core will precede to use a bad value as the MHI core does not attempt to retry the access. Retrying the access in the MHI core is a bad idea because again, it is racy - what if the link is down again? Furthermore, there may be some higher level state associated with the link status, that is now invalid because the link went down. The only reason why the MHI core could see "invalid" data when doing a PCI access, that is actually valid, is if the register actually contained the PCI spec defined sentinel for an invalid access. In this case, it is arguable that the MHI implementation broken, and should be fixed, not worked around. Therefore, remove the link_status() callback before anyone attempts to implement it. Signed-off-by: Jeffrey Hugo Reviewed-by: Manivannan Sadhasivam Reviewed-by: Hemant Kumar Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20200430190555.32741-4-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/bus/mhi/core/init.c | 6 ++---- drivers/bus/mhi/core/main.c | 5 ++--- include/linux/mhi.h | 2 -- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/bus/mhi/core/init.c b/drivers/bus/mhi/core/init.c index b38359c480ea..2af08d57ec28 100644 --- a/drivers/bus/mhi/core/init.c +++ b/drivers/bus/mhi/core/init.c @@ -812,10 +812,8 @@ int mhi_register_controller(struct mhi_controller *mhi_cntrl, if (!mhi_cntrl) return -EINVAL; - if (!mhi_cntrl->runtime_get || !mhi_cntrl->runtime_put) - return -EINVAL; - - if (!mhi_cntrl->status_cb || !mhi_cntrl->link_status) + if (!mhi_cntrl->runtime_get || !mhi_cntrl->runtime_put || + !mhi_cntrl->status_cb) return -EINVAL; ret = parse_config(mhi_cntrl, config); diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c index 55928feea0c9..f8401535e61a 100644 --- a/drivers/bus/mhi/core/main.c +++ b/drivers/bus/mhi/core/main.c @@ -20,9 +20,8 @@ int __must_check mhi_read_reg(struct mhi_controller *mhi_cntrl, { u32 tmp = readl(base + offset); - /* If there is any unexpected value, query the link status */ - if (PCI_INVALID_READ(tmp) && - mhi_cntrl->link_status(mhi_cntrl)) + /* If the value is invalid, the link is down */ + if (PCI_INVALID_READ(tmp)) return -EIO; *out = tmp; diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 5642806360f3..c80ba559face 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -335,7 +335,6 @@ struct mhi_controller_config { * @syserr_worker: System error worker * @state_event: State change event * @status_cb: CB function to notify power states of the device (required) - * @link_status: CB function to query link status of the device (required) * @wake_get: CB function to assert device wake (optional) * @wake_put: CB function to de-assert device wake (optional) * @wake_toggle: CB function to assert and de-assert device wake (optional) @@ -417,7 +416,6 @@ struct mhi_controller { void (*status_cb)(struct mhi_controller *mhi_cntrl, enum mhi_callback cb); - int (*link_status)(struct mhi_controller *mhi_cntrl); void (*wake_get)(struct mhi_controller *mhi_cntrl, bool override); void (*wake_put)(struct mhi_controller *mhi_cntrl, bool override); void (*wake_toggle)(struct mhi_controller *mhi_cntrl); From 45723a44845c90c8e859fd0e2b0bb492322b5d0b Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Fri, 1 May 2020 00:35:53 +0530 Subject: [PATCH 215/300] bus: mhi: core: Offload register accesses to the controller When reading or writing MHI registers, the core assumes that the physical link is a memory mapped PCI link. This assumption may not hold for all MHI devices. The controller knows what is the physical link (ie PCI, I2C, SPI, etc), and therefore knows the proper methods to access that link. The controller can also handle link specific error scenarios, such as reading -1 when the PCI link went down. Therefore, it is appropriate that the MHI core requests the controller to make register accesses on behalf of the core, which abstracts the core from link specifics, and end up removing an unnecessary assumption. Signed-off-by: Jeffrey Hugo Reviewed-by: Hemant Kumar Reviewed-by: Manivannan Sadhasivam Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20200430190555.32741-5-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/bus/mhi/core/init.c | 3 ++- drivers/bus/mhi/core/internal.h | 3 --- drivers/bus/mhi/core/main.c | 12 ++---------- include/linux/mhi.h | 6 ++++++ 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/bus/mhi/core/init.c b/drivers/bus/mhi/core/init.c index 2af08d57ec28..eb2ab058a01d 100644 --- a/drivers/bus/mhi/core/init.c +++ b/drivers/bus/mhi/core/init.c @@ -813,7 +813,8 @@ int mhi_register_controller(struct mhi_controller *mhi_cntrl, return -EINVAL; if (!mhi_cntrl->runtime_get || !mhi_cntrl->runtime_put || - !mhi_cntrl->status_cb) + !mhi_cntrl->status_cb || !mhi_cntrl->read_reg || + !mhi_cntrl->write_reg) return -EINVAL; ret = parse_config(mhi_cntrl, config); diff --git a/drivers/bus/mhi/core/internal.h b/drivers/bus/mhi/core/internal.h index 5deadfaa053a..095d95bc0e37 100644 --- a/drivers/bus/mhi/core/internal.h +++ b/drivers/bus/mhi/core/internal.h @@ -11,9 +11,6 @@ extern struct bus_type mhi_bus_type; -/* MHI MMIO register mapping */ -#define PCI_INVALID_READ(val) (val == U32_MAX) - #define MHIREGLEN (0x0) #define MHIREGLEN_MHIREGLEN_MASK (0xFFFFFFFF) #define MHIREGLEN_MHIREGLEN_SHIFT (0) diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c index f8401535e61a..2aceb69f6ce8 100644 --- a/drivers/bus/mhi/core/main.c +++ b/drivers/bus/mhi/core/main.c @@ -18,15 +18,7 @@ int __must_check mhi_read_reg(struct mhi_controller *mhi_cntrl, void __iomem *base, u32 offset, u32 *out) { - u32 tmp = readl(base + offset); - - /* If the value is invalid, the link is down */ - if (PCI_INVALID_READ(tmp)) - return -EIO; - - *out = tmp; - - return 0; + return mhi_cntrl->read_reg(mhi_cntrl, base + offset, out); } int __must_check mhi_read_reg_field(struct mhi_controller *mhi_cntrl, @@ -48,7 +40,7 @@ int __must_check mhi_read_reg_field(struct mhi_controller *mhi_cntrl, void mhi_write_reg(struct mhi_controller *mhi_cntrl, void __iomem *base, u32 offset, u32 val) { - writel(val, base + offset); + mhi_cntrl->write_reg(mhi_cntrl, base + offset, val); } void mhi_write_reg_field(struct mhi_controller *mhi_cntrl, void __iomem *base, diff --git a/include/linux/mhi.h b/include/linux/mhi.h index c80ba559face..84a6c9e72f52 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -342,6 +342,8 @@ struct mhi_controller_config { * @runtimet_put: CB function to decrement pm usage (required) * @map_single: CB function to create TRE buffer * @unmap_single: CB function to destroy TRE buffer + * @read_reg: Read a MHI register via the physical link (required) + * @write_reg: Write a MHI register via the physical link (required) * @buffer_len: Bounce buffer length * @bounce_buf: Use of bounce buffer * @fbc_download: MHI host needs to do complete image transfer (optional) @@ -425,6 +427,10 @@ struct mhi_controller { struct mhi_buf_info *buf); void (*unmap_single)(struct mhi_controller *mhi_cntrl, struct mhi_buf_info *buf); + int (*read_reg)(struct mhi_controller *mhi_cntrl, void __iomem *addr, + u32 *out); + void (*write_reg)(struct mhi_controller *mhi_cntrl, void __iomem *addr, + u32 val); size_t buffer_len; bool bounce_buf; From af2e58818082ac0db29539444ca17eb1e77f6000 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Fri, 1 May 2020 00:35:54 +0530 Subject: [PATCH 216/300] bus: mhi: core: Fix typo in comment There is a typo - "runtimet" should be "runtime". Fix it. Signed-off-by: Jeffrey Hugo Reviewed-by: Hemant Kumar Reviewed-by: Manivannan Sadhasivam Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20200430190555.32741-6-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mhi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 84a6c9e72f52..3d7c3c26eeb9 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -339,7 +339,7 @@ struct mhi_controller_config { * @wake_put: CB function to de-assert device wake (optional) * @wake_toggle: CB function to assert and de-assert device wake (optional) * @runtime_get: CB function to controller runtime resume (required) - * @runtimet_put: CB function to decrement pm usage (required) + * @runtime_put: CB function to decrement pm usage (required) * @map_single: CB function to create TRE buffer * @unmap_single: CB function to destroy TRE buffer * @read_reg: Read a MHI register via the physical link (required) From f0e1d3ac2d7c16a5d2c9d67f5a61133db7681af8 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Fri, 1 May 2020 00:35:55 +0530 Subject: [PATCH 217/300] bus: mhi: core: Fix channel device name conflict When multiple instances of the same MHI product are present in a system, we can see a splat from mhi_create_devices() - "sysfs: cannot create duplicate filename". This is because the device names assigned to the MHI channel devices are non-unique. They consist of the channel's name, and the channel's pipe id. For identical products, each instance is going to have the same set of channel (both in name and pipe id). To fix this, we prepend the device name of the parent device that the MHI channels belong to. Since different instances of the same product should have unique device names, this makes the MHI channel devices for each product also unique. Additionally, remove the pipe id from the MHI channel device name. This is an internal detail to the MHI product that provides little value, and imposes too much device specific internal details to userspace. It is expected that channel with a specific name (ie "SAHARA") has a specific client, and it does not matter what pipe id that channel is enumerated on. The pipe id is an internal detail between the MHI bus, and the hardware. The client is not expected to make decisions based on the pipe id, and to do so would require the client to have intimate knowledge of the hardware, which is inappropiate as it may violate the layering provided by the MHI bus. The limitation of doing this is that each product may only have one instance of a channel by a unique name. This limitation is appropriate given the usecases of MHI channels. Signed-off-by: Jeffrey Hugo Reviewed-by: Hemant Kumar Reviewed-by: Manivannan Sadhasivam Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20200430190555.32741-7-manivannan.sadhasivam@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/bus/mhi/core/main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c index 2aceb69f6ce8..97e06cc586e4 100644 --- a/drivers/bus/mhi/core/main.c +++ b/drivers/bus/mhi/core/main.c @@ -327,7 +327,8 @@ void mhi_create_devices(struct mhi_controller *mhi_cntrl) /* Channel name is same for both UL and DL */ mhi_dev->chan_name = mhi_chan->name; - dev_set_name(&mhi_dev->dev, "%04x_%s", mhi_chan->chan, + dev_set_name(&mhi_dev->dev, "%s_%s", + dev_name(mhi_cntrl->cntrl_dev), mhi_dev->chan_name); /* Init wakeup source if available */ From 0b80f9866e6bbfb905140ed8787ff2af03652c0c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 May 2020 19:27:54 -0400 Subject: [PATCH 218/300] iocost: protect iocg->abs_vdebt with iocg->waitq.lock abs_vdebt is an atomic_64 which tracks how much over budget a given cgroup is and controls the activation of use_delay mechanism. Once a cgroup goes over budget from forced IOs, it has to pay it back with its future budget. The progress guarantee on debt paying comes from the iocg being active - active iocgs are processed by the periodic timer, which ensures that as time passes the debts dissipate and the iocg returns to normal operation. However, both iocg activation and vdebt handling are asynchronous and a sequence like the following may happen. 1. The iocg is in the process of being deactivated by the periodic timer. 2. A bio enters ioc_rqos_throttle(), calls iocg_activate() which returns without anything because it still sees that the iocg is already active. 3. The iocg is deactivated. 4. The bio from #2 is over budget but needs to be forced. It increases abs_vdebt and goes over the threshold and enables use_delay. 5. IO control is enabled for the iocg's subtree and now IOs are attributed to the descendant cgroups and the iocg itself no longer issues IOs. This leaves the iocg with stuck abs_vdebt - it has debt but inactive and no further IOs which can activate it. This can end up unduly punishing all the descendants cgroups. The usual throttling path has the same issue - the iocg must be active while throttled to ensure that future event will wake it up - and solves the problem by synchronizing the throttling path with a spinlock. abs_vdebt handling is another form of overage handling and shares a lot of characteristics including the fact that it isn't in the hottest path. This patch fixes the above and other possible races by strictly synchronizing abs_vdebt and use_delay handling with iocg->waitq.lock. Signed-off-by: Tejun Heo Reported-by: Vlad Dmitriev Cc: stable@vger.kernel.org # v5.4+ Fixes: e1518f63f246 ("blk-iocost: Don't let merges push vtime into the future") Signed-off-by: Jens Axboe --- block/blk-iocost.c | 117 ++++++++++++++++++++------------- tools/cgroup/iocost_monitor.py | 7 +- 2 files changed, 77 insertions(+), 47 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 3ab0c1c704b6..7c1fe605d0d6 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -466,7 +466,7 @@ struct ioc_gq { */ atomic64_t vtime; atomic64_t done_vtime; - atomic64_t abs_vdebt; + u64 abs_vdebt; u64 last_vtime; /* @@ -1142,7 +1142,7 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) struct iocg_wake_ctx ctx = { .iocg = iocg }; u64 margin_ns = (u64)(ioc->period_us * WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; - u64 abs_vdebt, vdebt, vshortage, expires, oexpires; + u64 vdebt, vshortage, expires, oexpires; s64 vbudget; u32 hw_inuse; @@ -1152,18 +1152,15 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) vbudget = now->vnow - atomic64_read(&iocg->vtime); /* pay off debt */ - abs_vdebt = atomic64_read(&iocg->abs_vdebt); - vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse); + vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); if (vdebt && vbudget > 0) { u64 delta = min_t(u64, vbudget, vdebt); u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), - abs_vdebt); + iocg->abs_vdebt); atomic64_add(delta, &iocg->vtime); atomic64_add(delta, &iocg->done_vtime); - atomic64_sub(abs_delta, &iocg->abs_vdebt); - if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0)) - atomic64_set(&iocg->abs_vdebt, 0); + iocg->abs_vdebt -= abs_delta; } /* @@ -1219,12 +1216,18 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) u64 expires, oexpires; u32 hw_inuse; + lockdep_assert_held(&iocg->waitq.lock); + /* debt-adjust vtime */ current_hweight(iocg, NULL, &hw_inuse); - vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse); + vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); - /* clear or maintain depending on the overage */ - if (time_before_eq64(vtime, now->vnow)) { + /* + * Clear or maintain depending on the overage. Non-zero vdebt is what + * guarantees that @iocg is online and future iocg_kick_delay() will + * clear use_delay. Don't leave it on when there's no vdebt. + */ + if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { blkcg_clear_delay(blkg); return false; } @@ -1258,9 +1261,12 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) { struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); struct ioc_now now; + unsigned long flags; + spin_lock_irqsave(&iocg->waitq.lock, flags); ioc_now(iocg->ioc, &now); iocg_kick_delay(iocg, &now, 0); + spin_unlock_irqrestore(&iocg->waitq.lock, flags); return HRTIMER_NORESTART; } @@ -1368,14 +1374,13 @@ static void ioc_timer_fn(struct timer_list *timer) * should have woken up in the last period and expire idle iocgs. */ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { - if (!waitqueue_active(&iocg->waitq) && - !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg)) + if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt && + !iocg_is_idle(iocg)) continue; spin_lock(&iocg->waitq.lock); - if (waitqueue_active(&iocg->waitq) || - atomic64_read(&iocg->abs_vdebt)) { + if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, &now); iocg_kick_delay(iocg, &now, 0); @@ -1718,28 +1723,49 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) * tests are racy but the races aren't systemic - we only miss once * in a while which is fine. */ - if (!waitqueue_active(&iocg->waitq) && - !atomic64_read(&iocg->abs_vdebt) && + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && time_before_eq64(vtime + cost, now.vnow)) { iocg_commit_bio(iocg, bio, cost); return; } /* - * We're over budget. If @bio has to be issued regardless, - * remember the abs_cost instead of advancing vtime. - * iocg_kick_waitq() will pay off the debt before waking more IOs. + * We activated above but w/o any synchronization. Deactivation is + * synchronized with waitq.lock and we won't get deactivated as long + * as we're waiting or has debt, so we're good if we're activated + * here. In the unlikely case that we aren't, just issue the IO. + */ + spin_lock_irq(&iocg->waitq.lock); + + if (unlikely(list_empty(&iocg->active_list))) { + spin_unlock_irq(&iocg->waitq.lock); + iocg_commit_bio(iocg, bio, cost); + return; + } + + /* + * We're over budget. If @bio has to be issued regardless, remember + * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay + * off the debt before waking more IOs. + * * This way, the debt is continuously paid off each period with the - * actual budget available to the cgroup. If we just wound vtime, - * we would incorrectly use the current hw_inuse for the entire - * amount which, for example, can lead to the cgroup staying - * blocked for a long time even with substantially raised hw_inuse. + * actual budget available to the cgroup. If we just wound vtime, we + * would incorrectly use the current hw_inuse for the entire amount + * which, for example, can lead to the cgroup staying blocked for a + * long time even with substantially raised hw_inuse. + * + * An iocg with vdebt should stay online so that the timer can keep + * deducting its vdebt and [de]activate use_delay mechanism + * accordingly. We don't want to race against the timer trying to + * clear them and leave @iocg inactive w/ dangling use_delay heavily + * penalizing the cgroup and its descendants. */ if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { - atomic64_add(abs_cost, &iocg->abs_vdebt); + iocg->abs_vdebt += abs_cost; if (iocg_kick_delay(iocg, &now, cost)) blkcg_schedule_throttle(rqos->q, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); + spin_unlock_irq(&iocg->waitq.lock); return; } @@ -1756,20 +1782,6 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) * All waiters are on iocg->waitq and the wait states are * synchronized using waitq.lock. */ - spin_lock_irq(&iocg->waitq.lock); - - /* - * We activated above but w/o any synchronization. Deactivation is - * synchronized with waitq.lock and we won't get deactivated as - * long as we're waiting, so we're good if we're activated here. - * In the unlikely case that we are deactivated, just issue the IO. - */ - if (unlikely(list_empty(&iocg->active_list))) { - spin_unlock_irq(&iocg->waitq.lock); - iocg_commit_bio(iocg, bio, cost); - return; - } - init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); wait.wait.private = current; wait.bio = bio; @@ -1801,6 +1813,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, struct ioc_now now; u32 hw_inuse; u64 abs_cost, cost; + unsigned long flags; /* bypass if disabled or for root cgroup */ if (!ioc->enabled || !iocg->level) @@ -1820,15 +1833,28 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, iocg->cursor = bio_end; /* - * Charge if there's enough vtime budget and the existing request - * has cost assigned. Otherwise, account it as debt. See debt - * handling in ioc_rqos_throttle() for details. + * Charge if there's enough vtime budget and the existing request has + * cost assigned. */ if (rq->bio && rq->bio->bi_iocost_cost && - time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) + time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { iocg_commit_bio(iocg, bio, cost); - else - atomic64_add(abs_cost, &iocg->abs_vdebt); + return; + } + + /* + * Otherwise, account it as debt if @iocg is online, which it should + * be for the vast majority of cases. See debt handling in + * ioc_rqos_throttle() for details. + */ + spin_lock_irqsave(&iocg->waitq.lock, flags); + if (likely(!list_empty(&iocg->active_list))) { + iocg->abs_vdebt += abs_cost; + iocg_kick_delay(iocg, &now, cost); + } else { + iocg_commit_bio(iocg, bio, cost); + } + spin_unlock_irqrestore(&iocg->waitq.lock, flags); } static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) @@ -1998,7 +2024,6 @@ static void ioc_pd_init(struct blkg_policy_data *pd) iocg->ioc = ioc; atomic64_set(&iocg->vtime, now.vnow); atomic64_set(&iocg->done_vtime, now.vnow); - atomic64_set(&iocg->abs_vdebt, 0); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); iocg->hweight_active = HWEIGHT_WHOLE; diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index 7427a5ee761b..9d8e9613008a 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -159,7 +159,12 @@ class IocgStat: else: self.inflight_pct = 0 - self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 + # vdebt used to be an atomic64_t and is now u64, support both + try: + self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 + except: + self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 + self.use_delay = blkg.use_delay.counter.value_() self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 From 5fe89a6acd668cbd1817fcdef5caa9fee568c2e8 Mon Sep 17 00:00:00 2001 From: Sean Paul Date: Tue, 14 Apr 2020 15:02:55 -0400 Subject: [PATCH 219/300] drm: Fix HDCP failures when SRM fw is missing The SRM cleanup in 79643fddd6eb2 ("drm/hdcp: optimizing the srm handling") inadvertently altered the behavior of HDCP auth when the SRM firmware is missing. Before that patch, missing SRM was interpreted as the device having no revoked keys. With that patch, if the SRM fw file is missing we reject _all_ keys. This patch fixes that regression by returning success if the file cannot be found. It also checks the return value from request_srm such that we won't end up trying to parse the ksv list if there is an error fetching it. Fixes: 79643fddd6eb ("drm/hdcp: optimizing the srm handling") Cc: stable@vger.kernel.org Cc: Ramalingam C Cc: Sean Paul Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Reviewed-by: Ramalingam C Signed-off-by: Sean Paul Link: https://patchwork.freedesktop.org/patch/msgid/20200414190258.38873-1-sean@poorly.run Changes in v2: -Noticed a couple other things to clean up Reviewed-by: Ramalingam C --- drivers/gpu/drm/drm_hdcp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_hdcp.c b/drivers/gpu/drm/drm_hdcp.c index 7f386adcf872..910108ccaae1 100644 --- a/drivers/gpu/drm/drm_hdcp.c +++ b/drivers/gpu/drm/drm_hdcp.c @@ -241,8 +241,12 @@ static int drm_hdcp_request_srm(struct drm_device *drm_dev, ret = request_firmware_direct(&fw, (const char *)fw_name, drm_dev->dev); - if (ret < 0) + if (ret < 0) { + *revoked_ksv_cnt = 0; + *revoked_ksv_list = NULL; + ret = 0; goto exit; + } if (fw->size && fw->data) ret = drm_hdcp_srm_update(fw->data, fw->size, revoked_ksv_list, @@ -287,6 +291,8 @@ int drm_hdcp_check_ksvs_revoked(struct drm_device *drm_dev, u8 *ksvs, ret = drm_hdcp_request_srm(drm_dev, &revoked_ksv_list, &revoked_ksv_cnt); + if (ret) + return ret; /* revoked_ksv_cnt will be zero when above function failed */ for (i = 0; i < revoked_ksv_cnt; i++) From 755f5738ff981769211a0bfac709d514ef5b9f86 Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Tue, 5 May 2020 10:03:29 +0800 Subject: [PATCH 220/300] net: broadcom: fix a mistake about ioremap resource Commit d7a5502b0bb8b ("net: broadcom: convert to devm_platform_ioremap_resource_byname()") will broke this driver. idm_base and nicpm_base were optional, after this change, they are mandatory. it will probe fails with -22 when the dtb doesn't have them defined. so revert part of this commit and make idm_base and nicpm_base as optional. Fixes: d7a5502b0bb8bde ("net: broadcom: convert to devm_platform_ioremap_resource_byname()") Reported-by: Jonathan Richardson Cc: Scott Branden Cc: Ray Jui Cc: Florian Fainelli Cc: David S. Miller Signed-off-by: Dejin Zheng Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- .../net/ethernet/broadcom/bgmac-platform.c | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c b/drivers/net/ethernet/broadcom/bgmac-platform.c index a5d1a6cb9ce3..6795b6d95f54 100644 --- a/drivers/net/ethernet/broadcom/bgmac-platform.c +++ b/drivers/net/ethernet/broadcom/bgmac-platform.c @@ -172,6 +172,7 @@ static int bgmac_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct bgmac *bgmac; + struct resource *regs; const u8 *mac_addr; bgmac = bgmac_alloc(&pdev->dev); @@ -206,16 +207,21 @@ static int bgmac_probe(struct platform_device *pdev) if (IS_ERR(bgmac->plat.base)) return PTR_ERR(bgmac->plat.base); - bgmac->plat.idm_base = - devm_platform_ioremap_resource_byname(pdev, "idm_base"); - if (IS_ERR(bgmac->plat.idm_base)) - return PTR_ERR(bgmac->plat.idm_base); - bgmac->feature_flags &= ~BGMAC_FEAT_IDM_MASK; + regs = platform_get_resource_byname(pdev, IORESOURCE_MEM, "idm_base"); + if (regs) { + bgmac->plat.idm_base = devm_ioremap_resource(&pdev->dev, regs); + if (IS_ERR(bgmac->plat.idm_base)) + return PTR_ERR(bgmac->plat.idm_base); + bgmac->feature_flags &= ~BGMAC_FEAT_IDM_MASK; + } - bgmac->plat.nicpm_base = - devm_platform_ioremap_resource_byname(pdev, "nicpm_base"); - if (IS_ERR(bgmac->plat.nicpm_base)) - return PTR_ERR(bgmac->plat.nicpm_base); + regs = platform_get_resource_byname(pdev, IORESOURCE_MEM, "nicpm_base"); + if (regs) { + bgmac->plat.nicpm_base = devm_ioremap_resource(&pdev->dev, + regs); + if (IS_ERR(bgmac->plat.nicpm_base)) + return PTR_ERR(bgmac->plat.nicpm_base); + } bgmac->read = platform_bgmac_read; bgmac->write = platform_bgmac_write; From 7f13657d141346125f4d0bb93eab4777f40c406e Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Tue, 5 May 2020 16:28:53 +0800 Subject: [PATCH 221/300] io_uring: handle -EFAULT properly in io_uring_setup() If copy_to_user() in io_uring_setup() failed, we'll leak many kernel resources, which will be recycled until process terminates. This bug can be reproduced by using mprotect to set params to PROT_READ. To fix this issue, refactor io_uring_create() a bit to add a new 'struct io_uring_params __user *params' parameter and move the copy_to_user() in io_uring_setup() to io_uring_setup(), if copy_to_user() failed, we can free kernel resource properly. Suggested-by: Jens Axboe Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6d52ff98279d..dd680eb153cb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7760,7 +7760,8 @@ err: return ret; } -static int io_uring_create(unsigned entries, struct io_uring_params *p) +static int io_uring_create(unsigned entries, struct io_uring_params *p, + struct io_uring_params __user *params) { struct user_struct *user = NULL; struct io_ring_ctx *ctx; @@ -7852,6 +7853,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; + + if (copy_to_user(params, p, sizeof(*p))) { + ret = -EFAULT; + goto err; + } /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup @@ -7860,9 +7869,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret < 0) goto err; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: @@ -7878,7 +7884,6 @@ err: static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { struct io_uring_params p; - long ret; int i; if (copy_from_user(&p, params, sizeof(p))) @@ -7893,14 +7898,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) return -EINVAL; - ret = io_uring_create(entries, &p); - if (ret < 0) - return ret; - - if (copy_to_user(params, &p, sizeof(p))) - return -EFAULT; - - return ret; + return io_uring_create(entries, &p, params); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, From b95e51eb9f2ee7b6d6c3203a2f75122349aa77be Mon Sep 17 00:00:00 2001 From: Sung Lee Date: Mon, 20 Apr 2020 11:38:30 -0400 Subject: [PATCH 222/300] drm/amd/display: Update DCN2.1 DV Code Revision [WHY & HOW] There is a problem in hscale_pixel_rate, the bug causes DCN to be more optimistic (more likely to underflow) in upscale cases during prefetch. This commit ports the fix from DV code to address these issues. Signed-off-by: Sung Lee Reviewed-by: Dmytro Laktyushkin Acked-by: Aurabindo Pillai Signed-off-by: Alex Deucher --- .../drm/amd/display/dc/dml/dcn21/display_rq_dlg_calc_21.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn21/display_rq_dlg_calc_21.c b/drivers/gpu/drm/amd/display/dc/dml/dcn21/display_rq_dlg_calc_21.c index a38baa73d484..b8ec08e3b7a3 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn21/display_rq_dlg_calc_21.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn21/display_rq_dlg_calc_21.c @@ -1200,7 +1200,7 @@ static void dml_rq_dlg_get_dlg_params( min_hratio_fact_l = 1.0; min_hratio_fact_c = 1.0; - if (htaps_l <= 1) + if (hratio_l <= 1) min_hratio_fact_l = 2.0; else if (htaps_l <= 6) { if ((hratio_l * 2.0) > 4.0) @@ -1216,7 +1216,7 @@ static void dml_rq_dlg_get_dlg_params( hscale_pixel_rate_l = min_hratio_fact_l * dppclk_freq_in_mhz; - if (htaps_c <= 1) + if (hratio_c <= 1) min_hratio_fact_c = 2.0; else if (htaps_c <= 6) { if ((hratio_c * 2.0) > 4.0) @@ -1522,8 +1522,8 @@ static void dml_rq_dlg_get_dlg_params( disp_dlg_regs->refcyc_per_vm_group_vblank = get_refcyc_per_vm_group_vblank(mode_lib, e2e_pipe_param, num_pipes, pipe_idx) * refclk_freq_in_mhz; disp_dlg_regs->refcyc_per_vm_group_flip = get_refcyc_per_vm_group_flip(mode_lib, e2e_pipe_param, num_pipes, pipe_idx) * refclk_freq_in_mhz; - disp_dlg_regs->refcyc_per_vm_req_vblank = get_refcyc_per_vm_req_vblank(mode_lib, e2e_pipe_param, num_pipes, pipe_idx) * refclk_freq_in_mhz; - disp_dlg_regs->refcyc_per_vm_req_flip = get_refcyc_per_vm_req_flip(mode_lib, e2e_pipe_param, num_pipes, pipe_idx) * refclk_freq_in_mhz; + disp_dlg_regs->refcyc_per_vm_req_vblank = get_refcyc_per_vm_req_vblank(mode_lib, e2e_pipe_param, num_pipes, pipe_idx) * refclk_freq_in_mhz * dml_pow(2, 10); + disp_dlg_regs->refcyc_per_vm_req_flip = get_refcyc_per_vm_req_flip(mode_lib, e2e_pipe_param, num_pipes, pipe_idx) * refclk_freq_in_mhz * dml_pow(2, 10); // Clamp to max for now if (disp_dlg_regs->refcyc_per_vm_group_vblank >= (unsigned int)dml_pow(2, 23)) From 80797dd6f1a525d1160c463d6a9f9d29af182cbb Mon Sep 17 00:00:00 2001 From: Roman Li Date: Wed, 26 Feb 2020 17:30:29 -0500 Subject: [PATCH 223/300] drm/amd/display: fix counter in wait_for_no_pipes_pending [Why] Wait counter is not being reset for each pipe. [How] Move counter reset into pipe loop scope. Signed-off-by: Roman Li Reviewed-by: Zhan Liu Acked-by: Aurabindo Pillai Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 8489f1e56892..47431ca6986d 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -834,11 +834,10 @@ static void disable_dangling_plane(struct dc *dc, struct dc_state *context) static void wait_for_no_pipes_pending(struct dc *dc, struct dc_state *context) { int i; - int count = 0; - struct pipe_ctx *pipe; PERF_TRACE(); for (i = 0; i < MAX_PIPES; i++) { - pipe = &context->res_ctx.pipe_ctx[i]; + int count = 0; + struct pipe_ctx *pipe = &context->res_ctx.pipe_ctx[i]; if (!pipe->plane_state) continue; From e6142dd511425cb827b5db869f489eb81f5f994d Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai Date: Wed, 22 Apr 2020 14:37:33 -0400 Subject: [PATCH 224/300] drm/amd/display: Prevent dpcd reads with passive dongles [why] During hotplug, a DP port may be connected to the sink through passive adapter which does not support DPCD reads. Issuing reads without checking for this condition will result in errors [how] Ensure the link is in aux_mode before initiating operation that result in a DPCD read. Signed-off-by: Aurabindo Pillai Reviewed-by: Harry Wentland Acked-by: Aurabindo Pillai Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 94c29b74c086..9c83c1303f08 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2008,17 +2008,22 @@ void amdgpu_dm_update_connector_after_detect( dc_sink_retain(aconnector->dc_sink); if (sink->dc_edid.length == 0) { aconnector->edid = NULL; - drm_dp_cec_unset_edid(&aconnector->dm_dp_aux.aux); + if (aconnector->dc_link->aux_mode) { + drm_dp_cec_unset_edid( + &aconnector->dm_dp_aux.aux); + } } else { aconnector->edid = - (struct edid *) sink->dc_edid.raw_edid; - + (struct edid *)sink->dc_edid.raw_edid; drm_connector_update_edid_property(connector, - aconnector->edid); - drm_dp_cec_set_edid(&aconnector->dm_dp_aux.aux, - aconnector->edid); + aconnector->edid); + + if (aconnector->dc_link->aux_mode) + drm_dp_cec_set_edid(&aconnector->dm_dp_aux.aux, + aconnector->edid); } + amdgpu_dm_update_freesync_caps(connector, aconnector->edid); update_connector_ext_caps(aconnector); } else { From 38212bb31fe923d0a2c6299bd2adfbb84cddef2a Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Fri, 1 May 2020 21:34:18 -0400 Subject: [PATCH 225/300] neigh: send protocol value in neighbor create notification When a new neighbor entry has been added, event is generated but it does not include protocol, because its value is assigned after the event notification routine has run, so move protocol assignment code earlier. Fixes: df9b0e30d44c ("neighbor: Add protocol attribute") Cc: David Ahern Signed-off-by: Roman Mashak Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 39d37d0ef575..116139233d57 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1956,6 +1956,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, NEIGH_UPDATE_F_OVERRIDE_ISROUTER); } + if (protocol) + neigh->protocol = protocol; + if (ndm->ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; @@ -1969,9 +1972,6 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); - if (protocol) - neigh->protocol = protocol; - neigh_release(neigh); out: From 73cb8e2a5863ccc5215660f5123db621bd57dff7 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Sun, 3 May 2020 21:03:19 -0700 Subject: [PATCH 226/300] RISC-V: Remove unused code from STRICT_KERNEL_RWX This patch removes the unused functions set_kernel_text_rw/ro. Currently, it is not being invoked from anywhere and no other architecture (except arm) uses this code. Even in ARM, these functions are not invoked from anywhere currently. Fixes: d27c3c90817e ("riscv: add STRICT_KERNEL_RWX support") Signed-off-by: Atish Patra Reviewed-by: Zong Li Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/set_memory.h | 8 -------- arch/riscv/mm/init.c | 16 ---------------- 2 files changed, 24 deletions(-) diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index c38df4771c09..4c5bae7ca01c 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -22,14 +22,6 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif -#ifdef CONFIG_STRICT_KERNEL_RWX -void set_kernel_text_ro(void); -void set_kernel_text_rw(void); -#else -static inline void set_kernel_text_ro(void) { } -static inline void set_kernel_text_rw(void) { } -#endif - int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 5b813532db59..27a334106708 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -502,22 +502,6 @@ static inline void setup_vm_final(void) #endif /* CONFIG_MMU */ #ifdef CONFIG_STRICT_KERNEL_RWX -void set_kernel_text_rw(void) -{ - unsigned long text_start = (unsigned long)_text; - unsigned long text_end = (unsigned long)_etext; - - set_memory_rw(text_start, (text_end - text_start) >> PAGE_SHIFT); -} - -void set_kernel_text_ro(void) -{ - unsigned long text_start = (unsigned long)_text; - unsigned long text_end = (unsigned long)_etext; - - set_memory_ro(text_start, (text_end - text_start) >> PAGE_SHIFT); -} - void mark_rodata_ro(void) { unsigned long text_start = (unsigned long)_text; From 27abe57770ff1c4c23a60993816f02657ba94bd1 Mon Sep 17 00:00:00 2001 From: Kashyap Chamarthy Date: Tue, 5 May 2020 13:28:39 +0200 Subject: [PATCH 227/300] docs/virt/kvm: Document configuring and running nested guests This is a rewrite of this[1] Wiki page with further enhancements. The doc also includes a section on debugging problems in nested environments, among other improvements. [1] https://www.linux-kvm.org/page/Nested_Guests Signed-off-by: Kashyap Chamarthy Message-Id: <20200505112839.30534-1-kchamart@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/index.rst | 2 + .../virt/kvm/running-nested-guests.rst | 276 ++++++++++++++++++ 2 files changed, 278 insertions(+) create mode 100644 Documentation/virt/kvm/running-nested-guests.rst diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index dcc252634cf9..b6833c7bb474 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -28,3 +28,5 @@ KVM arm/index devices/index + + running-nested-guests diff --git a/Documentation/virt/kvm/running-nested-guests.rst b/Documentation/virt/kvm/running-nested-guests.rst new file mode 100644 index 000000000000..d0a1fc754c84 --- /dev/null +++ b/Documentation/virt/kvm/running-nested-guests.rst @@ -0,0 +1,276 @@ +============================== +Running nested guests with KVM +============================== + +A nested guest is the ability to run a guest inside another guest (it +can be KVM-based or a different hypervisor). The straightforward +example is a KVM guest that in turn runs on a KVM guest (the rest of +this document is built on this example):: + + .----------------. .----------------. + | | | | + | L2 | | L2 | + | (Nested Guest) | | (Nested Guest) | + | | | | + |----------------'--'----------------| + | | + | L1 (Guest Hypervisor) | + | KVM (/dev/kvm) | + | | + .------------------------------------------------------. + | L0 (Host Hypervisor) | + | KVM (/dev/kvm) | + |------------------------------------------------------| + | Hardware (with virtualization extensions) | + '------------------------------------------------------' + +Terminology: + +- L0 – level-0; the bare metal host, running KVM + +- L1 – level-1 guest; a VM running on L0; also called the "guest + hypervisor", as it itself is capable of running KVM. + +- L2 – level-2 guest; a VM running on L1, this is the "nested guest" + +.. note:: The above diagram is modelled after the x86 architecture; + s390x, ppc64 and other architectures are likely to have + a different design for nesting. + + For example, s390x always has an LPAR (LogicalPARtition) + hypervisor running on bare metal, adding another layer and + resulting in at least four levels in a nested setup — L0 (bare + metal, running the LPAR hypervisor), L1 (host hypervisor), L2 + (guest hypervisor), L3 (nested guest). + + This document will stick with the three-level terminology (L0, + L1, and L2) for all architectures; and will largely focus on + x86. + + +Use Cases +--------- + +There are several scenarios where nested KVM can be useful, to name a +few: + +- As a developer, you want to test your software on different operating + systems (OSes). Instead of renting multiple VMs from a Cloud + Provider, using nested KVM lets you rent a large enough "guest + hypervisor" (level-1 guest). This in turn allows you to create + multiple nested guests (level-2 guests), running different OSes, on + which you can develop and test your software. + +- Live migration of "guest hypervisors" and their nested guests, for + load balancing, disaster recovery, etc. + +- VM image creation tools (e.g. ``virt-install``, etc) often run + their own VM, and users expect these to work inside a VM. + +- Some OSes use virtualization internally for security (e.g. to let + applications run safely in isolation). + + +Enabling "nested" (x86) +----------------------- + +From Linux kernel v4.19 onwards, the ``nested`` KVM parameter is enabled +by default for Intel and AMD. (Though your Linux distribution might +override this default.) + +In case you are running a Linux kernel older than v4.19, to enable +nesting, set the ``nested`` KVM module parameter to ``Y`` or ``1``. To +persist this setting across reboots, you can add it in a config file, as +shown below: + +1. On the bare metal host (L0), list the kernel modules and ensure that + the KVM modules:: + + $ lsmod | grep -i kvm + kvm_intel 133627 0 + kvm 435079 1 kvm_intel + +2. Show information for ``kvm_intel`` module:: + + $ modinfo kvm_intel | grep -i nested + parm: nested:bool + +3. For the nested KVM configuration to persist across reboots, place the + below in ``/etc/modprobed/kvm_intel.conf`` (create the file if it + doesn't exist):: + + $ cat /etc/modprobe.d/kvm_intel.conf + options kvm-intel nested=y + +4. Unload and re-load the KVM Intel module:: + + $ sudo rmmod kvm-intel + $ sudo modprobe kvm-intel + +5. Verify if the ``nested`` parameter for KVM is enabled:: + + $ cat /sys/module/kvm_intel/parameters/nested + Y + +For AMD hosts, the process is the same as above, except that the module +name is ``kvm-amd``. + + +Additional nested-related kernel parameters (x86) +------------------------------------------------- + +If your hardware is sufficiently advanced (Intel Haswell processor or +higher, which has newer hardware virt extensions), the following +additional features will also be enabled by default: "Shadow VMCS +(Virtual Machine Control Structure)", APIC Virtualization on your bare +metal host (L0). Parameters for Intel hosts:: + + $ cat /sys/module/kvm_intel/parameters/enable_shadow_vmcs + Y + + $ cat /sys/module/kvm_intel/parameters/enable_apicv + Y + + $ cat /sys/module/kvm_intel/parameters/ept + Y + +.. note:: If you suspect your L2 (i.e. nested guest) is running slower, + ensure the above are enabled (particularly + ``enable_shadow_vmcs`` and ``ept``). + + +Starting a nested guest (x86) +----------------------------- + +Once your bare metal host (L0) is configured for nesting, you should be +able to start an L1 guest with:: + + $ qemu-kvm -cpu host [...] + +The above will pass through the host CPU's capabilities as-is to the +gues); or for better live migration compatibility, use a named CPU +model supported by QEMU. e.g.:: + + $ qemu-kvm -cpu Haswell-noTSX-IBRS,vmx=on + +then the guest hypervisor will subsequently be capable of running a +nested guest with accelerated KVM. + + +Enabling "nested" (s390x) +------------------------- + +1. On the host hypervisor (L0), enable the ``nested`` parameter on + s390x:: + + $ rmmod kvm + $ modprobe kvm nested=1 + +.. note:: On s390x, the kernel parameter ``hpage`` is mutually exclusive + with the ``nested`` paramter — i.e. to be able to enable + ``nested``, the ``hpage`` parameter *must* be disabled. + +2. The guest hypervisor (L1) must be provided with the ``sie`` CPU + feature — with QEMU, this can be done by using "host passthrough" + (via the command-line ``-cpu host``). + +3. Now the KVM module can be loaded in the L1 (guest hypervisor):: + + $ modprobe kvm + + +Live migration with nested KVM +------------------------------ + +Migrating an L1 guest, with a *live* nested guest in it, to another +bare metal host, works as of Linux kernel 5.3 and QEMU 4.2.0 for +Intel x86 systems, and even on older versions for s390x. + +On AMD systems, once an L1 guest has started an L2 guest, the L1 guest +should no longer be migrated or saved (refer to QEMU documentation on +"savevm"/"loadvm") until the L2 guest shuts down. Attempting to migrate +or save-and-load an L1 guest while an L2 guest is running will result in +undefined behavior. You might see a ``kernel BUG!`` entry in ``dmesg``, a +kernel 'oops', or an outright kernel panic. Such a migrated or loaded L1 +guest can no longer be considered stable or secure, and must be restarted. +Migrating an L1 guest merely configured to support nesting, while not +actually running L2 guests, is expected to function normally even on AMD +systems but may fail once guests are started. + +Migrating an L2 guest is always expected to succeed, so all the following +scenarios should work even on AMD systems: + +- Migrating a nested guest (L2) to another L1 guest on the *same* bare + metal host. + +- Migrating a nested guest (L2) to another L1 guest on a *different* + bare metal host. + +- Migrating a nested guest (L2) to a bare metal host. + +Reporting bugs from nested setups +----------------------------------- + +Debugging "nested" problems can involve sifting through log files across +L0, L1 and L2; this can result in tedious back-n-forth between the bug +reporter and the bug fixer. + +- Mention that you are in a "nested" setup. If you are running any kind + of "nesting" at all, say so. Unfortunately, this needs to be called + out because when reporting bugs, people tend to forget to even + *mention* that they're using nested virtualization. + +- Ensure you are actually running KVM on KVM. Sometimes people do not + have KVM enabled for their guest hypervisor (L1), which results in + them running with pure emulation or what QEMU calls it as "TCG", but + they think they're running nested KVM. Thus confusing "nested Virt" + (which could also mean, QEMU on KVM) with "nested KVM" (KVM on KVM). + +Information to collect (generic) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following is not an exhaustive list, but a very good starting point: + + - Kernel, libvirt, and QEMU version from L0 + + - Kernel, libvirt and QEMU version from L1 + + - QEMU command-line of L1 -- when using libvirt, you'll find it here: + ``/var/log/libvirt/qemu/instance.log`` + + - QEMU command-line of L2 -- as above, when using libvirt, get the + complete libvirt-generated QEMU command-line + + - ``cat /sys/cpuinfo`` from L0 + + - ``cat /sys/cpuinfo`` from L1 + + - ``lscpu`` from L0 + + - ``lscpu`` from L1 + + - Full ``dmesg`` output from L0 + + - Full ``dmesg`` output from L1 + +x86-specific info to collect +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Both the below commands, ``x86info`` and ``dmidecode``, should be +available on most Linux distributions with the same name: + + - Output of: ``x86info -a`` from L0 + + - Output of: ``x86info -a`` from L1 + + - Output of: ``dmidecode`` from L0 + + - Output of: ``dmidecode`` from L1 + +s390x-specific info to collect +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Along with the earlier mentioned generic details, the below is +also recommended: + + - ``/proc/sysinfo`` from L1; this will also include the info from L0 From c7cb2d650c9e78c03bd2d1c0db89891825f8c0f4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 May 2020 20:53:55 -0700 Subject: [PATCH 228/300] KVM: VMX: Explicitly clear RFLAGS.CF and RFLAGS.ZF in VM-Exit RSB path Clear CF and ZF in the VM-Exit path after doing __FILL_RETURN_BUFFER so that KVM doesn't interpret clobbered RFLAGS as a VM-Fail. Filling the RSB has always clobbered RFLAGS, its current incarnation just happens clear CF and ZF in the processs. Relying on the macro to clear CF and ZF is extremely fragile, e.g. commit 089dd8e53126e ("x86/speculation: Change FILL_RETURN_BUFFER to work with objtool") tweaks the loop such that the ZF flag is always set. Reported-by: Qian Cai Cc: Rick Edgecombe Cc: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: stable@vger.kernel.org Fixes: f2fde6a5bcfcf ("KVM: VMX: Move RSB stuffing to before the first RET after VM-Exit") Signed-off-by: Sean Christopherson Message-Id: <20200506035355.2242-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 87f3f24fef37..51d1a82742fd 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -82,6 +82,9 @@ SYM_FUNC_START(vmx_vmexit) /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE + /* Clear RFLAGS.CF and RFLAGS.ZF to preserve VM-Exit, i.e. !VM-Fail. */ + or $1, %_ASM_AX + pop %_ASM_AX .Lvmexit_skip_rsb: #endif From 139f7425fdf54f054463e7524b9f54c41af8407f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 5 May 2020 09:40:46 -0400 Subject: [PATCH 229/300] kvm: x86: Use KVM CPU capabilities to determine CR4 reserved bits Using CPUID data can be useful for the processor compatibility check, but that's it. Using it to compute guest-reserved bits can have both false positives (such as LA57 and UMIP which we are already handling) and false negatives: in particular, with this patch we don't allow anymore a KVM guest to set CR4.PKE when CR4.PKE is clear on the host. Fixes: b9dd21e104bc ("KVM: x86: simplify handling of PKRU") Reported-by: Jim Mattson Tested-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c5835f9cb9ad..8d296e3d0d56 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -926,19 +926,6 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr); __reserved_bits; \ }) -static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) -{ - u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); - - if (kvm_cpu_cap_has(X86_FEATURE_LA57)) - reserved_bits &= ~X86_CR4_LA57; - - if (kvm_cpu_cap_has(X86_FEATURE_UMIP)) - reserved_bits &= ~X86_CR4_UMIP; - - return reserved_bits; -} - static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) @@ -9675,7 +9662,9 @@ int kvm_arch_hardware_setup(void *opaque) if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; - cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) + cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); +#undef __kvm_cpu_cap_has if (kvm_has_tsc_control) { /* @@ -9707,7 +9696,8 @@ int kvm_arch_check_processor_compat(void *opaque) WARN_ON(!irqs_disabled()); - if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits) + if (__cr4_reserved_bits(cpu_has, c) != + __cr4_reserved_bits(cpu_has, &boot_cpu_data)) return -EIO; return ops->check_processor_compatibility(); From 8ffdaf9155ebe517cdec5edbcca19ba6e7ee9c3c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 4 May 2020 18:06:07 -0400 Subject: [PATCH 230/300] KVM: selftests: Fix build for evmcs.h I got this error when building kvm selftests: /usr/bin/ld: /home/xz/git/linux/tools/testing/selftests/kvm/libkvm.a(vmx.o):/home/xz/git/linux/tools/testing/selftests/kvm/include/evmcs.h:222: multiple definition of `current_evmcs'; /tmp/cco1G48P.o:/home/xz/git/linux/tools/testing/selftests/kvm/include/evmcs.h:222: first defined here /usr/bin/ld: /home/xz/git/linux/tools/testing/selftests/kvm/libkvm.a(vmx.o):/home/xz/git/linux/tools/testing/selftests/kvm/include/evmcs.h:223: multiple definition of `current_vp_assist'; /tmp/cco1G48P.o:/home/xz/git/linux/tools/testing/selftests/kvm/include/evmcs.h:223: first defined here I think it's because evmcs.h is included both in a test file and a lib file so the structs have multiple declarations when linking. After all it's not a good habit to declare structs in the header files. Cc: Vitaly Kuznetsov Signed-off-by: Peter Xu Message-Id: <20200504220607.99627-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/evmcs.h | 4 ++-- tools/testing/selftests/kvm/lib/x86_64/vmx.c | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/evmcs.h index d8f4d6bfe05d..a034438b6266 100644 --- a/tools/testing/selftests/kvm/include/evmcs.h +++ b/tools/testing/selftests/kvm/include/evmcs.h @@ -219,8 +219,8 @@ struct hv_enlightened_vmcs { #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) -struct hv_enlightened_vmcs *current_evmcs; -struct hv_vp_assist_page *current_vp_assist; +extern struct hv_enlightened_vmcs *current_evmcs; +extern struct hv_vp_assist_page *current_vp_assist; int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id); diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 6f17f69394be..4ae104f6ce69 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -17,6 +17,9 @@ bool enable_evmcs; +struct hv_enlightened_vmcs *current_evmcs; +struct hv_vp_assist_page *current_vp_assist; + struct eptPageTableEntry { uint64_t readable:1; uint64_t writable:1; From 495907ec36def1d28a44e2d1b5a51affe716aacf Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 May 2020 11:47:50 -0400 Subject: [PATCH 231/300] KVM: X86: Declare KVM_CAP_SET_GUEST_DEBUG properly KVM_CAP_SET_GUEST_DEBUG should be supported for x86 however it's not declared as supported. My wild guess is that userspaces like QEMU are using "#ifdef KVM_CAP_SET_GUEST_DEBUG" to check for the capability instead, but that could be wrong because the compilation host may not be the runtime host. The userspace might still want to keep the old "#ifdef" though to not break the guest debug on old kernels. Signed-off-by: Peter Xu Message-Id: <20200505154750.126300-1-peterx@redhat.com> [Do the same for PPC and s390. - Paolo] Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 1 + arch/s390/kvm/kvm-s390.c | 1 + arch/x86/kvm/x86.c | 1 + 3 files changed, 3 insertions(+) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index e15166b0a16d..ad2f172c26a6 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -521,6 +521,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_SET_GUEST_DEBUG: r = 1; break; case KVM_CAP_PPC_GUEST_DEBUG_SSTEP: diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5dcf9ff12828..d05bb040fd42 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -545,6 +545,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_AIS: case KVM_CAP_S390_AIS_MIGRATION: case KVM_CAP_S390_VCPU_RESETS: + case KVM_CAP_SET_GUEST_DEBUG: r = 1; break; case KVM_CAP_S390_HPAGE_1M: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8d296e3d0d56..d786c7d27ce5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3372,6 +3372,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_SET_GUEST_DEBUG: r = 1; break; case KVM_CAP_SYNC_REGS: From de462e5f10718517bacf2f84c8aa2804567ef7df Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 26 Apr 2020 15:53:30 +0900 Subject: [PATCH 232/300] bootconfig: Fix to remove bootconfig data from initrd while boot If there is a bootconfig data in the tail of initrd/initramfs, initrd image sanity check caused an error while decompression stage as follows. [ 0.883882] Unpacking initramfs... [ 2.696429] Initramfs unpacking failed: invalid magic at start of compressed archive This error will be ignored if CONFIG_BLK_DEV_RAM=n, but CONFIG_BLK_DEV_RAM=y the kernel failed to mount rootfs and causes a panic. To fix this issue, shrink down the initrd_end for removing tailing bootconfig data while boot the kernel. Link: http://lkml.kernel.org/r/158788401014.24243.17424755854115077915.stgit@devnote2 Cc: Borislav Petkov Cc: Kees Cook Cc: Ingo Molnar Cc: Andrew Morton Cc: stable@vger.kernel.org Fixes: 7684b8582c24 ("bootconfig: Load boot config from the tail of initrd") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/main.c | 69 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/init/main.c b/init/main.c index a48617f2e5e5..1a5da2c2660c 100644 --- a/init/main.c +++ b/init/main.c @@ -257,6 +257,47 @@ static int __init loglevel(char *str) early_param("loglevel", loglevel); +#ifdef CONFIG_BLK_DEV_INITRD +static void * __init get_boot_config_from_initrd(u32 *_size, u32 *_csum) +{ + u32 size, csum; + char *data; + u32 *hdr; + + if (!initrd_end) + return NULL; + + data = (char *)initrd_end - BOOTCONFIG_MAGIC_LEN; + if (memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN)) + return NULL; + + hdr = (u32 *)(data - 8); + size = hdr[0]; + csum = hdr[1]; + + data = ((void *)hdr) - size; + if ((unsigned long)data < initrd_start) { + pr_err("bootconfig size %d is greater than initrd size %ld\n", + size, initrd_end - initrd_start); + return NULL; + } + + /* Remove bootconfig from initramfs/initrd */ + initrd_end = (unsigned long)data; + if (_size) + *_size = size; + if (_csum) + *_csum = csum; + + return data; +} +#else +static void * __init get_boot_config_from_initrd(u32 *_size, u32 *_csum) +{ + return NULL; +} +#endif + #ifdef CONFIG_BOOT_CONFIG char xbc_namebuf[XBC_KEYLEN_MAX] __initdata; @@ -357,9 +398,12 @@ static void __init setup_boot_config(const char *cmdline) int pos; u32 size, csum; char *data, *copy; - u32 *hdr; int ret; + data = get_boot_config_from_initrd(&size, &csum); + if (!data) + goto not_found; + strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL, bootconfig_params); @@ -367,27 +411,12 @@ static void __init setup_boot_config(const char *cmdline) if (!bootconfig_found) return; - if (!initrd_end) - goto not_found; - - data = (char *)initrd_end - BOOTCONFIG_MAGIC_LEN; - if (memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN)) - goto not_found; - - hdr = (u32 *)(data - 8); - size = hdr[0]; - csum = hdr[1]; - if (size >= XBC_DATA_MAX) { pr_err("bootconfig size %d greater than max size %d\n", size, XBC_DATA_MAX); return; } - data = ((void *)hdr) - size; - if ((unsigned long)data < initrd_start) - goto not_found; - if (boot_config_checksum((unsigned char *)data, size) != csum) { pr_err("bootconfig checksum failed\n"); return; @@ -420,8 +449,14 @@ static void __init setup_boot_config(const char *cmdline) not_found: pr_err("'bootconfig' found on command line, but no bootconfig found\n"); } + #else -#define setup_boot_config(cmdline) do { } while (0) + +static void __init setup_boot_config(const char *cmdline) +{ + /* Remove bootconfig data from initrd */ + get_boot_config_from_initrd(NULL, NULL); +} static int __init warn_bootconfig(char *str) { From dcbd21c9fca5e954fd4e3d91884907eb6d47187e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 25 Apr 2020 14:49:09 +0900 Subject: [PATCH 233/300] tracing/kprobes: Fix a double initialization typo Fix a typo that resulted in an unnecessary double initialization to addr. Link: http://lkml.kernel.org/r/158779374968.6082.2337484008464939919.stgit@devnote2 Cc: Tom Zanussi Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: c7411a1a126f ("tracing/kprobe: Check whether the non-suffixed symbol is notrace") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d0568af4a0ef..0d9300c3b084 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -453,7 +453,7 @@ static bool __within_notrace_func(unsigned long addr) static bool within_notrace_func(struct trace_kprobe *tk) { - unsigned long addr = addr = trace_kprobe_address(tk); + unsigned long addr = trace_kprobe_address(tk); char symname[KSYM_NAME_LEN], *p; if (!__within_notrace_func(addr)) From da0f1f4167e3af69e1d8b32d6d65195ddd2bfb64 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 25 Apr 2020 14:49:17 +0900 Subject: [PATCH 234/300] tracing/boottime: Fix kprobe event API usage Fix boottime kprobe events to use API correctly for multiple events. For example, when we set a multiprobe kprobe events in bootconfig like below, ftrace.event.kprobes.myevent { probes = "vfs_read $arg1 $arg2", "vfs_write $arg1 $arg2" } This cause an error; trace_boot: Failed to add probe: p:kprobes/myevent (null) vfs_read $arg1 $arg2 vfs_write $arg1 $arg2 This shows the 1st argument becomes NULL and multiprobes are merged to 1 probe. Link: http://lkml.kernel.org/r/158779375766.6082.201939936008972838.stgit@devnote2 Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 29a154810546 ("tracing: Change trace_boot to use kprobe_event interface") Reviewed-by: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 06d7feb5255f..9de29bb45a27 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -95,23 +95,19 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) struct xbc_node *anode; char buf[MAX_BUF_LEN]; const char *val; - int ret; - - kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN); - - ret = kprobe_event_gen_cmd_start(&cmd, event, NULL); - if (ret) - return ret; + int ret = 0; xbc_node_for_each_array_value(node, "probes", anode, val) { - ret = kprobe_event_add_field(&cmd, val); - if (ret) - return ret; - } + kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN); - ret = kprobe_event_gen_cmd_end(&cmd); - if (ret) - pr_err("Failed to add probe: %s\n", buf); + ret = kprobe_event_gen_cmd_start(&cmd, event, val); + if (ret) + break; + + ret = kprobe_event_gen_cmd_end(&cmd); + if (ret) + pr_err("Failed to add probe: %s\n", buf); + } return ret; } From 5b4dcd2d201a395ad4054067bfae4a07554fbd65 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 25 Apr 2020 14:49:26 +0900 Subject: [PATCH 235/300] tracing/kprobes: Reject new event if loc is NULL Reject the new event which has NULL location for kprobes. For kprobes, user must specify at least the location. Link: http://lkml.kernel.org/r/158779376597.6082.1411212055469099461.stgit@devnote2 Cc: Tom Zanussi Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 2a588dd1d5d6 ("tracing: Add kprobe event command generation functions") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0d9300c3b084..35989383ae11 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -940,6 +940,9 @@ EXPORT_SYMBOL_GPL(kprobe_event_cmd_init); * complete command or only the first part of it; in the latter case, * kprobe_event_add_fields() can be used to add more fields following this. * + * Unlikely the synth_event_gen_cmd_start(), @loc must be specified. This + * returns -EINVAL if @loc == NULL. + * * Return: 0 if successful, error otherwise. */ int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe, @@ -953,6 +956,9 @@ int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe, if (cmd->type != DYNEVENT_TYPE_KPROBE) return -EINVAL; + if (!loc) + return -EINVAL; + if (kretprobe) snprintf(buf, MAX_EVENT_NAME_LEN, "r:kprobes/%s", name); else From bf5525f3a8e3248be5aa5defe5aaadd60e1c1ba1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 May 2020 20:51:06 -0700 Subject: [PATCH 236/300] selftests: net: tcp_mmap: clear whole tcp_zerocopy_receive struct We added fields in tcp_zerocopy_receive structure, so make sure to clear all fields to not pass garbage to the kernel. We were lucky because recent additions added 'out' parameters, still we need to clean our reference implementation, before folks copy/paste it. Fixes: c8856c051454 ("tcp-zerocopy: Return inq along with tcp receive zerocopy.") Fixes: 33946518d493 ("tcp-zerocopy: Return sk_err (if set) along with tcp receive zerocopy.") Signed-off-by: Eric Dumazet Cc: Arjun Roy Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- tools/testing/selftests/net/tcp_mmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c index 35505b31e5cc..62171fd638c8 100644 --- a/tools/testing/selftests/net/tcp_mmap.c +++ b/tools/testing/selftests/net/tcp_mmap.c @@ -165,9 +165,10 @@ void *child_thread(void *arg) socklen_t zc_len = sizeof(zc); int res; + memset(&zc, 0, sizeof(zc)); zc.address = (__u64)((unsigned long)addr); zc.length = chunk_size; - zc.recv_skip_hint = 0; + res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len); if (res == -1) From 39bd16df7c31bb8cf5dfd0c88e42abd5ae10029d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 May 2020 13:16:22 +0300 Subject: [PATCH 237/300] net: mvpp2: prevent buffer overflow in mvpp22_rss_ctx() The "rss_context" variable comes from the user via ethtool_get_rxfh(). It can be any u32 value except zero. Eventually it gets passed to mvpp22_rss_ctx() and if it is over MVPP22_N_RSS_TABLES (8) then it results in an array overflow. Fixes: 895586d5dc32 ("net: mvpp2: cls: Use RSS contexts to handle RSS tables") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 1fa60e985b43..2b5dad2ec650 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4329,6 +4329,8 @@ static int mvpp2_ethtool_get_rxfh_context(struct net_device *dev, u32 *indir, if (!mvpp22_rss_is_supported()) return -EOPNOTSUPP; + if (rss_context >= MVPP22_N_RSS_TABLES) + return -EINVAL; if (hfunc) *hfunc = ETH_RSS_HASH_CRC32; From 722c0f00d4feea77475a5dc943b53d60824a1e4e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 May 2020 13:16:56 +0300 Subject: [PATCH 238/300] net: mvpp2: cls: Prevent buffer overflow in mvpp2_ethtool_cls_rule_del() The "info->fs.location" is a u32 that comes from the user via the ethtool_set_rxnfc() function. We need to check for invalid values to prevent a buffer overflow. I copy and pasted this check from the mvpp2_ethtool_cls_rule_ins() function. Fixes: 90b509b39ac9 ("net: mvpp2: cls: Add Classification offload support") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c index 8972cdd559e8..7352244c5e68 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c @@ -1428,6 +1428,9 @@ int mvpp2_ethtool_cls_rule_del(struct mvpp2_port *port, struct mvpp2_ethtool_fs *efs; int ret; + if (info->fs.location >= MVPP2_N_RFS_ENTRIES_PER_FLOW) + return -EINVAL; + efs = port->rfs_rules[info->fs.location]; if (!efs) return -EINVAL; From 29ca3cdfe13b2792b8624e6f769777e8cb387f9c Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 6 May 2020 15:58:30 +0200 Subject: [PATCH 239/300] net: macsec: fix rtnl locking issue netdev_update_features() must be called with the rtnl lock taken. Not doing so triggers a warning, as ASSERT_RTNL() is used in __netdev_update_features(), the first function called by netdev_update_features(). Fix this. Fixes: c850240b6c41 ("net: macsec: report real_dev features when HW offloading is enabled") Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/macsec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index d4034025c87c..d0d31cb99180 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -2641,11 +2641,12 @@ static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) if (ret) goto rollback; - rtnl_unlock(); /* Force features update, since they are different for SW MACSec and * HW offloading cases. */ netdev_update_features(dev); + + rtnl_unlock(); return 0; rollback: From f5dda315b63775fe00f5470687d40ce801ddfb38 Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Wed, 6 May 2020 11:41:07 -0400 Subject: [PATCH 240/300] net: hsr: fix incorrect type usage for protocol variable Fix following sparse checker warning:- net/hsr/hsr_slave.c:38:18: warning: incorrect type in assignment (different base types) net/hsr/hsr_slave.c:38:18: expected unsigned short [unsigned] [usertype] protocol net/hsr/hsr_slave.c:38:18: got restricted __be16 [usertype] h_proto net/hsr/hsr_slave.c:39:25: warning: restricted __be16 degrades to integer net/hsr/hsr_slave.c:39:57: warning: restricted __be16 degrades to integer Signed-off-by: Murali Karicheri Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/hsr/hsr_slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index f4b9f7a3ce51..25b6ffba26cd 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -18,7 +18,7 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct hsr_port *port; - u16 protocol; + __be16 protocol; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: skb invalid", __func__); From a84724178bd7081cf3bd5b558616dd6a9a4ca63b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 May 2020 09:21:15 -0700 Subject: [PATCH 241/300] selftests: net: tcp_mmap: fix SO_RCVLOWAT setting Since chunk_size is no longer an integer, we can not use it directly as an argument of setsockopt(). This patch should fix tcp_mmap for Big Endian kernels. Fixes: 597b01edafac ("selftests: net: avoid ptl lock contention in tcp_mmap") Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Arjun Roy Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- tools/testing/selftests/net/tcp_mmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c index 62171fd638c8..4555f88252ba 100644 --- a/tools/testing/selftests/net/tcp_mmap.c +++ b/tools/testing/selftests/net/tcp_mmap.c @@ -282,12 +282,14 @@ static void setup_sockaddr(int domain, const char *str_addr, static void do_accept(int fdlisten) { pthread_attr_t attr; + int rcvlowat; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + rcvlowat = chunk_size; if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT, - &chunk_size, sizeof(chunk_size)) == -1) { + &rcvlowat, sizeof(rcvlowat)) == -1) { perror("setsockopt SO_RCVLOWAT"); } From 0ba83aa037da456bb73868e5cc20558c5a644414 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 6 May 2020 20:48:13 +0300 Subject: [PATCH 242/300] net: dsa: sja1105: the PTP_CLK extts input reacts on both edges It looks like the sja1105 external timestamping input is not as generic as we thought. When fed a signal with 50% duty cycle, it will timestamp both the rising and the falling edge. When fed a short pulse signal, only the timestamp of the falling edge will be seen in the PTPSYNCTS register, because that of the rising edge had been overwritten. So the moral is: don't feed it short pulse inputs. Luckily this is not a complete deal breaker, as we can still work with 1 Hz square waves. But the problem is that the extts polling period was not dimensioned enough for this input signal. If we leave the period at half a second, we risk losing timestamps due to jitter in the measuring process. So we need to increase it to 4 times per second. Also, the very least we can do to inform the user is to deny any other flags combination than with PTP_RISING_EDGE and PTP_FALLING_EDGE both set. Fixes: 747e5eb31d59 ("net: dsa: sja1105: configure the PTP_CLK pin as EXT_TS or PER_OUT") Signed-off-by: Vladimir Oltean Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_ptp.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index a22f8e3fc06b..bc0e47c1dbb9 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -16,14 +16,15 @@ /* PTPSYNCTS has no interrupt or update mechanism, because the intended * hardware use case is for the timestamp to be collected synchronously, - * immediately after the CAS_MASTER SJA1105 switch has triggered a CASSYNC - * pulse on the PTP_CLK pin. When used as a generic extts source, it needs - * polling and a comparison with the old value. The polling interval is just - * the Nyquist rate of a canonical PPS input (e.g. from a GPS module). - * Anything of higher frequency than 1 Hz will be lost, since there is no - * timestamp FIFO. + * immediately after the CAS_MASTER SJA1105 switch has performed a CASSYNC + * one-shot toggle (no return to level) on the PTP_CLK pin. When used as a + * generic extts source, the PTPSYNCTS register needs polling and a comparison + * with the old value. The polling interval is configured as the Nyquist rate + * of a signal with 50% duty cycle and 1Hz frequency, which is sadly all that + * this hardware can do (but may be enough for some setups). Anything of higher + * frequency than 1 Hz will be lost, since there is no timestamp FIFO. */ -#define SJA1105_EXTTS_INTERVAL (HZ / 2) +#define SJA1105_EXTTS_INTERVAL (HZ / 4) /* This range is actually +/- SJA1105_MAX_ADJ_PPB * divided by 1000 (ppb -> ppm) and with a 16-bit @@ -754,7 +755,16 @@ static int sja1105_extts_enable(struct sja1105_private *priv, return -EOPNOTSUPP; /* Reject requests with unsupported flags */ - if (extts->flags) + if (extts->flags & ~(PTP_ENABLE_FEATURE | + PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS)) + return -EOPNOTSUPP; + + /* We can only enable time stamping on both edges, sadly. */ + if ((extts->flags & PTP_STRICT_FLAGS) && + (extts->flags & PTP_ENABLE_FEATURE) && + (extts->flags & PTP_EXTTS_EDGES) != PTP_EXTTS_EDGES) return -EOPNOTSUPP; rc = sja1105_change_ptp_clk_pin_func(priv, PTP_PF_EXTTS); From 8101b5a1531f3390b3a69fa7934c70a8fd6566ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Apr 2020 11:07:22 +0200 Subject: [PATCH 243/300] ARM: futex: Address build warning Stephen reported the following build warning on a ARM multi_v7_defconfig build with GCC 9.2.1: kernel/futex.c: In function 'do_futex': kernel/futex.c:1676:17: warning: 'oldval' may be used uninitialized in this function [-Wmaybe-uninitialized] 1676 | return oldval == cmparg; | ~~~~~~~^~~~~~~~~ kernel/futex.c:1652:6: note: 'oldval' was declared here 1652 | int oldval, ret; | ^~~~~~ introduced by commit a08971e9488d ("futex: arch_futex_atomic_op_inuser() calling conventions change"). While that change should not make any difference it confuses GCC which fails to work out that oldval is not referenced when the return value is not zero. GCC fails to properly analyze arch_futex_atomic_op_inuser(). It's not the early return, the issue is with the assembly macros. GCC fails to detect that those either set 'ret' to 0 and set oldval or set 'ret' to -EFAULT which makes oldval uninteresting. The store to the callsite supplied oldval pointer is conditional on ret == 0. The straight forward way to solve this is to make the store unconditional. Aside of addressing the build warning this makes sense anyway because it removes the conditional from the fastpath. In the error case the stored value is uninteresting and the extra store does not matter at all. Reported-by: Stephen Rothwell Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/87pncao2ph.fsf@nanos.tec.linutronix.de --- arch/arm/include/asm/futex.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index e133da303a98..a9151884bc85 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -165,8 +165,13 @@ arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) preempt_enable(); #endif - if (!ret) - *oval = oldval; + /* + * Store unconditionally. If ret != 0 the extra store is the least + * of the worries but GCC cannot figure out that __futex_atomic_op() + * is either setting ret to -EFAULT or storing the old value in + * oldval which results in a uninitialized warning at the call site. + */ + *oval = oldval; return ret; } From 21ce7f3e16fbf89faaf149cfe0f730edfc553914 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 4 May 2020 01:20:26 +0300 Subject: [PATCH 244/300] net: dsa: ocelot: the MAC table on Felix is twice as large When running 'bridge fdb dump' on Felix, sometimes learnt and static MAC addresses would appear, sometimes they wouldn't. Turns out, the MAC table has 4096 entries on VSC7514 (Ocelot) and 8192 entries on VSC9959 (Felix), so the existing code from the Ocelot common library only dumped half of Felix's MAC table. They are both organized as a 4-way set-associative TCAM, so we just need a single variable indicating the correct number of rows. Fixes: 56051948773e ("net: dsa: ocelot: add driver for Felix switch family") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 1 + drivers/net/dsa/ocelot/felix.h | 1 + drivers/net/dsa/ocelot/felix_vsc9959.c | 1 + drivers/net/ethernet/mscc/ocelot.c | 6 ++---- drivers/net/ethernet/mscc/ocelot_regs.c | 1 + include/soc/mscc/ocelot.h | 1 + 6 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index d0a3764ff0cf..e2c6bf0e430e 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -400,6 +400,7 @@ static int felix_init_structs(struct felix *felix, int num_phys_ports) ocelot->stats_layout = felix->info->stats_layout; ocelot->num_stats = felix->info->num_stats; ocelot->shared_queue_sz = felix->info->shared_queue_sz; + ocelot->num_mact_rows = felix->info->num_mact_rows; ocelot->vcap_is2_keys = felix->info->vcap_is2_keys; ocelot->vcap_is2_actions= felix->info->vcap_is2_actions; ocelot->vcap = felix->info->vcap; diff --git a/drivers/net/dsa/ocelot/felix.h b/drivers/net/dsa/ocelot/felix.h index 82d46f260041..9af106513e53 100644 --- a/drivers/net/dsa/ocelot/felix.h +++ b/drivers/net/dsa/ocelot/felix.h @@ -15,6 +15,7 @@ struct felix_info { const u32 *const *map; const struct ocelot_ops *ops; int shared_queue_sz; + int num_mact_rows; const struct ocelot_stat_layout *stats_layout; unsigned int num_stats; int num_ports; diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index b4078f3c5c38..8bf395f12b47 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1220,6 +1220,7 @@ struct felix_info felix_info_vsc9959 = { .vcap_is2_actions = vsc9959_vcap_is2_actions, .vcap = vsc9959_vcap_props, .shared_queue_sz = 128 * 1024, + .num_mact_rows = 2048, .num_ports = 6, .switch_pci_bar = 4, .imdio_pci_bar = 0, diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index a8c48a4a708f..887b3cc88354 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1031,10 +1031,8 @@ int ocelot_fdb_dump(struct ocelot *ocelot, int port, { int i, j; - /* Loop through all the mac tables entries. There are 1024 rows of 4 - * entries. - */ - for (i = 0; i < 1024; i++) { + /* Loop through all the mac tables entries. */ + for (i = 0; i < ocelot->num_mact_rows; i++) { for (j = 0; j < 4; j++) { struct ocelot_mact_entry entry; bool is_static; diff --git a/drivers/net/ethernet/mscc/ocelot_regs.c b/drivers/net/ethernet/mscc/ocelot_regs.c index b88b5899b227..7d4fd1b6adda 100644 --- a/drivers/net/ethernet/mscc/ocelot_regs.c +++ b/drivers/net/ethernet/mscc/ocelot_regs.c @@ -431,6 +431,7 @@ int ocelot_chip_init(struct ocelot *ocelot, const struct ocelot_ops *ops) ocelot->stats_layout = ocelot_stats_layout; ocelot->num_stats = ARRAY_SIZE(ocelot_stats_layout); ocelot->shared_queue_sz = 224 * 1024; + ocelot->num_mact_rows = 1024; ocelot->ops = ops; ret = ocelot_regfields_init(ocelot, ocelot_regfields); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 6d6a3947c8b7..efc8b613d486 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -502,6 +502,7 @@ struct ocelot { unsigned int num_stats; int shared_queue_sz; + int num_mact_rows; struct net_device *hw_bridge_dev; u16 bridge_mask; From c0d7eccbc76115b7eb337956c03d47d6a889cf8c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 4 May 2020 01:20:27 +0300 Subject: [PATCH 245/300] net: mscc: ocelot: ANA_AUTOAGE_AGE_PERIOD holds a value in seconds, not ms One may notice that automatically-learnt entries 'never' expire, even though the bridge configures the address age period at 300 seconds. Actually the value written to hardware corresponds to a time interval 1000 times higher than intended, i.e. 83 hours. Fixes: a556c76adc05 ("net: mscc: Add initial Ocelot switch support") Signed-off-by: Vladimir Oltean Reviewed-by: Florian Faineli Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 887b3cc88354..02350c3d9d01 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1451,8 +1451,15 @@ static void ocelot_port_attr_stp_state_set(struct ocelot *ocelot, int port, void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs) { - ocelot_write(ocelot, ANA_AUTOAGE_AGE_PERIOD(msecs / 2), - ANA_AUTOAGE); + unsigned int age_period = ANA_AUTOAGE_AGE_PERIOD(msecs / 2000); + + /* Setting AGE_PERIOD to zero effectively disables automatic aging, + * which is clearly not what our intention is. So avoid that. + */ + if (!age_period) + age_period = 1; + + ocelot_rmw(ocelot, age_period, ANA_AUTOAGE_AGE_PERIOD_M, ANA_AUTOAGE); } EXPORT_SYMBOL(ocelot_set_ageing_time); From 0cb7498f234e4e7d75187a8cae6c7c2053f2488a Mon Sep 17 00:00:00 2001 From: Ahmed Abdelsalam Date: Mon, 4 May 2020 14:42:11 +0000 Subject: [PATCH 246/300] seg6: fix SRH processing to comply with RFC8754 The Segment Routing Header (SRH) which defines the SRv6 dataplane is defined in RFC8754. RFC8754 (section 4.1) defines the SR source node behavior which encapsulates packets into an outer IPv6 header and SRH. The SR source node encodes the full list of Segments that defines the packet path in the SRH. Then, the first segment from list of Segments is copied into the Destination address of the outer IPv6 header and the packet is sent to the first hop in its path towards the destination. If the Segment list has only one segment, the SR source node can omit the SRH as he only segment is added in the destination address. RFC8754 (section 4.1.1) defines the Reduced SRH, when a source does not require the entire SID list to be preserved in the SRH. A reduced SRH does not contain the first segment of the related SR Policy (the first segment is the one already in the DA of the IPv6 header), and the Last Entry field is set to n-2, where n is the number of elements in the SR Policy. RFC8754 (section 4.3.1.1) defines the SRH processing and the logic to validate the SRH (S09, S10, S11) which works for both reduced and non-reduced behaviors. This patch updates seg6_validate_srh() to validate the SRH as per RFC8754. Signed-off-by: Ahmed Abdelsalam Signed-off-by: David S. Miller --- net/ipv6/seg6.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 4c7e0a27fa9c..37b434293bda 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -27,8 +27,9 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len) { - int trailing; unsigned int tlv_offset; + int max_last_entry; + int trailing; if (srh->type != IPV6_SRCRT_TYPE_4) return false; @@ -36,7 +37,12 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len) if (((srh->hdrlen + 1) << 3) != len) return false; - if (srh->segments_left > srh->first_segment) + max_last_entry = (srh->hdrlen / 2) - 1; + + if (srh->first_segment > max_last_entry) + return false; + + if (srh->segments_left > srh->first_segment + 1) return false; tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4); From 9274124f023b5c56dc4326637d4f787968b03607 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 4 May 2020 12:48:54 -0400 Subject: [PATCH 247/300] net: stricter validation of untrusted gso packets Syzkaller again found a path to a kernel crash through bad gso input: a packet with transport header extending beyond skb_headlen(skb). Tighten validation at kernel entry: - Verify that the transport header lies within the linear section. To avoid pulling linux/tcp.h, verify just sizeof tcphdr. tcp_gso_segment will call pskb_may_pull (th->doff * 4) before use. - Match the gso_type against the ip_proto found by the flow dissector. Fixes: bfd5f4a3d605 ("packet: Add GSO/csum offload support.") Reported-by: syzbot Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 0d1fe9297ac6..6f6ade63b04c 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -3,6 +3,8 @@ #define _LINUX_VIRTIO_NET_H #include +#include +#include #include static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, @@ -28,17 +30,25 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, bool little_endian) { unsigned int gso_type = 0; + unsigned int thlen = 0; + unsigned int ip_proto; if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: gso_type = SKB_GSO_TCPV4; + ip_proto = IPPROTO_TCP; + thlen = sizeof(struct tcphdr); break; case VIRTIO_NET_HDR_GSO_TCPV6: gso_type = SKB_GSO_TCPV6; + ip_proto = IPPROTO_TCP; + thlen = sizeof(struct tcphdr); break; case VIRTIO_NET_HDR_GSO_UDP: gso_type = SKB_GSO_UDP; + ip_proto = IPPROTO_UDP; + thlen = sizeof(struct udphdr); break; default: return -EINVAL; @@ -57,16 +67,22 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; + + if (skb_transport_offset(skb) + thlen > skb_headlen(skb)) + return -EINVAL; } else { /* gso packets without NEEDS_CSUM do not set transport_offset. * probe and drop if does not match one of the above types. */ if (gso_type && skb->network_header) { + struct flow_keys_basic keys; + if (!skb->protocol) virtio_net_hdr_set_proto(skb, hdr); retry: - skb_probe_transport_header(skb); - if (!skb_transport_header_was_set(skb)) { + if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, + NULL, 0, 0, 0, + 0)) { /* UFO does not specify ipv4 or 6: try both */ if (gso_type & SKB_GSO_UDP && skb->protocol == htons(ETH_P_IP)) { @@ -75,6 +91,12 @@ retry: } return -EINVAL; } + + if (keys.control.thoff + thlen > skb_headlen(skb) || + keys.basic.ip_proto != ip_proto) + return -EINVAL; + + skb_set_transport_header(skb, keys.control.thoff); } } From 657221598f820ff860972b67583fb91d9ab7caf4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 4 May 2020 22:58:56 +0300 Subject: [PATCH 248/300] net: dsa: remove duplicate assignment in dsa_slave_add_cls_matchall_mirred This was caused by a poor merge conflict resolution on my side. The "act = &cls->rule->action.entries[0];" assignment was already present in the code prior to the patch mentioned below. Fixes: e13c2075280e ("net: dsa: refactor matchall mirred action to separate function") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/slave.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d1068803cd11..62f4ee3da172 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -856,20 +856,18 @@ dsa_slave_add_cls_matchall_mirred(struct net_device *dev, struct dsa_port *to_dp; int err; - act = &cls->rule->action.entries[0]; - if (!ds->ops->port_mirror_add) return -EOPNOTSUPP; - if (!act->dev) - return -EINVAL; - if (!flow_action_basic_hw_stats_check(&cls->rule->action, cls->common.extack)) return -EOPNOTSUPP; act = &cls->rule->action.entries[0]; + if (!act->dev) + return -EINVAL; + if (!dsa_slave_dev_check(act->dev)) return -EOPNOTSUPP; From 050569fc8384c8056bacefcc246bcb2dfe574936 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 4 May 2020 13:18:06 -0700 Subject: [PATCH 249/300] net: dsa: Do not leave DSA master with NULL netdev_ops When ndo_get_phys_port_name() for the CPU port was added we introduced an early check for when the DSA master network device in dsa_master_ndo_setup() already implements ndo_get_phys_port_name(). When we perform the teardown operation in dsa_master_ndo_teardown() we would not be checking that cpu_dp->orig_ndo_ops was successfully allocated and non-NULL initialized. With network device drivers such as virtio_net, this leads to a NPD as soon as the DSA switch hanging off of it gets torn down because we are now assigning the virtio_net device's netdev_ops a NULL pointer. Fixes: da7b9e9b00d4 ("net: dsa: Add ndo_get_phys_port_name() for CPU port") Reported-by: Allen Pais Signed-off-by: Florian Fainelli Tested-by: Allen Pais Signed-off-by: David S. Miller --- net/dsa/master.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/dsa/master.c b/net/dsa/master.c index b5c535af63a3..a621367c6e8c 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -289,7 +289,8 @@ static void dsa_master_ndo_teardown(struct net_device *dev) { struct dsa_port *cpu_dp = dev->dsa_ptr; - dev->netdev_ops = cpu_dp->orig_ndo_ops; + if (cpu_dp->orig_ndo_ops) + dev->netdev_ops = cpu_dp->orig_ndo_ops; cpu_dp->orig_ndo_ops = NULL; } From 0735ccc9d9a4ce09154395feb70b804de7cc6ebf Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 5 May 2020 15:28:09 +0200 Subject: [PATCH 250/300] ionic: Use debugfs_create_bool() to export bool Currently bool ionic_cq.done_color is exported using debugfs_create_u8(), which requires a cast, preventing further compiler checks. Fix this by switching to debugfs_create_bool(), and dropping the cast. Signed-off-by: Geert Uytterhoeven Acked-by: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_debugfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c b/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c index 5f8fc58d42b3..11621ccc1faf 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_debugfs.c @@ -170,8 +170,7 @@ void ionic_debugfs_add_qcq(struct ionic_lif *lif, struct ionic_qcq *qcq) debugfs_create_x64("base_pa", 0400, cq_dentry, &cq->base_pa); debugfs_create_u32("num_descs", 0400, cq_dentry, &cq->num_descs); debugfs_create_u32("desc_size", 0400, cq_dentry, &cq->desc_size); - debugfs_create_u8("done_color", 0400, cq_dentry, - (u8 *)&cq->done_color); + debugfs_create_bool("done_color", 0400, cq_dentry, &cq->done_color); debugfs_create_file("tail", 0400, cq_dentry, cq, &cq_tail_fops); From 6f5c27f9c6f89226f5b97476797e5abedd61912c Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 5 May 2020 19:31:26 +0300 Subject: [PATCH 251/300] net: ethernet: ti: am65-cpsw-nuss: fix irqs type The K3 INTA driver, which is source TX/RX IRQs for CPSW NUSS, defines IRQs triggering type as EDGE by default, but triggering type for CPSW NUSS TX/RX IRQs has to be LEVEL as the EDGE triggering type may cause unnecessary IRQs triggering and NAPI scheduling for empty queues. It was discovered with RT-kernel. Fix it by explicitly specifying CPSW NUSS TX/RX IRQ type as IRQF_TRIGGER_HIGH. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Signed-off-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 2bf56733ba94..2517ffba8178 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1719,7 +1719,8 @@ static int am65_cpsw_nuss_ndev_add_napi_2g(struct am65_cpsw_common *common) ret = devm_request_irq(dev, tx_chn->irq, am65_cpsw_nuss_tx_irq, - 0, tx_chn->tx_chn_name, tx_chn); + IRQF_TRIGGER_HIGH, + tx_chn->tx_chn_name, tx_chn); if (ret) { dev_err(dev, "failure requesting tx%u irq %u, %d\n", tx_chn->id, tx_chn->irq, ret); @@ -1744,7 +1745,7 @@ static int am65_cpsw_nuss_ndev_reg_2g(struct am65_cpsw_common *common) ret = devm_request_irq(dev, common->rx_chns.irq, am65_cpsw_nuss_rx_irq, - 0, dev_name(dev), common); + IRQF_TRIGGER_HIGH, dev_name(dev), common); if (ret) { dev_err(dev, "failure requesting rx irq %u, %d\n", common->rx_chns.irq, ret); From a0fd7cc87a018df1a17f9d3f0bd994c1f22c6b34 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 6 May 2020 15:33:02 -0600 Subject: [PATCH 252/300] wireguard: selftests: use normal kernel stack size on ppc64 While at some point it might have made sense to be running these tests on ppc64 with 4k stacks, the kernel hasn't actually used 4k stacks on 64-bit powerpc in a long time, and more interesting things that we test don't really work when we deviate from the default (16k). So, we stop pushing our luck in this commit, and return to the default instead of the minimum. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config b/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config index 990c510a9cfa..f52f1e2bc7f6 100644 --- a/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config +++ b/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config @@ -10,3 +10,4 @@ CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="console=hvc0 wg.success=hvc1" CONFIG_SECTION_MISMATCH_WARN_ONLY=y CONFIG_FRAME_WARN=1280 +CONFIG_THREAD_SHIFT=14 From b673e24aad36981f327a6570412ffa7754de8911 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 6 May 2020 15:33:03 -0600 Subject: [PATCH 253/300] wireguard: socket: remove errant restriction on looping to self It's already possible to create two different interfaces and loop packets between them. This has always been possible with tunnels in the kernel, and isn't specific to wireguard. Therefore, the networking stack already needs to deal with that. At the very least, the packet winds up exceeding the MTU and is discarded at that point. So, since this is already something that happens, there's no need to forbid the not very exceptional case of routing a packet back to the same interface; this loop is no different than others, and we shouldn't special case it, but rather rely on generic handling of loops in general. This also makes it easier to do interesting things with wireguard such as onion routing. At the same time, we add a selftest for this, ensuring that both onion routing works and infinite routing loops do not crash the kernel. We also add a test case for wireguard interfaces nesting packets and sending traffic between each other, as well as the loop in this case too. We make sure to send some throughput-heavy traffic for this use case, to stress out any possible recursion issues with the locks around workqueues. Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/socket.c | 12 ----- tools/testing/selftests/wireguard/netns.sh | 54 ++++++++++++++++++++-- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index b0d6541582d3..f9018027fc13 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -76,12 +76,6 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n", wg->dev->name, &endpoint->addr, ret); goto err; - } else if (unlikely(rt->dst.dev == skb->dev)) { - ip_rt_put(rt); - ret = -ELOOP; - net_dbg_ratelimited("%s: Avoiding routing loop to %pISpfsc\n", - wg->dev->name, &endpoint->addr); - goto err; } if (cache) dst_cache_set_ip4(cache, &rt->dst, fl.saddr); @@ -149,12 +143,6 @@ static int send6(struct wg_device *wg, struct sk_buff *skb, net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n", wg->dev->name, &endpoint->addr, ret); goto err; - } else if (unlikely(dst->dev == skb->dev)) { - dst_release(dst); - ret = -ELOOP; - net_dbg_ratelimited("%s: Avoiding routing loop to %pISpfsc\n", - wg->dev->name, &endpoint->addr); - goto err; } if (cache) dst_cache_set_ip6(cache, dst, &fl.saddr); diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index 936e1ca9410e..17a1f53ceba0 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -48,8 +48,11 @@ cleanup() { exec 2>/dev/null printf "$orig_message_cost" > /proc/sys/net/core/message_cost ip0 link del dev wg0 + ip0 link del dev wg1 ip1 link del dev wg0 + ip1 link del dev wg1 ip2 link del dev wg0 + ip2 link del dev wg1 local to_kill="$(ip netns pids $netns0) $(ip netns pids $netns1) $(ip netns pids $netns2)" [[ -n $to_kill ]] && kill $to_kill pp ip netns del $netns1 @@ -77,18 +80,20 @@ ip0 link set wg0 netns $netns2 key1="$(pp wg genkey)" key2="$(pp wg genkey)" key3="$(pp wg genkey)" +key4="$(pp wg genkey)" pub1="$(pp wg pubkey <<<"$key1")" pub2="$(pp wg pubkey <<<"$key2")" pub3="$(pp wg pubkey <<<"$key3")" +pub4="$(pp wg pubkey <<<"$key4")" psk="$(pp wg genpsk)" [[ -n $key1 && -n $key2 && -n $psk ]] configure_peers() { ip1 addr add 192.168.241.1/24 dev wg0 - ip1 addr add fd00::1/24 dev wg0 + ip1 addr add fd00::1/112 dev wg0 ip2 addr add 192.168.241.2/24 dev wg0 - ip2 addr add fd00::2/24 dev wg0 + ip2 addr add fd00::2/112 dev wg0 n1 wg set wg0 \ private-key <(echo "$key1") \ @@ -230,9 +235,38 @@ n1 ping -W 1 -c 1 192.168.241.2 n1 wg set wg0 private-key <(echo "$key3") n2 wg set wg0 peer "$pub3" preshared-key <(echo "$psk") allowed-ips 192.168.241.1/32 peer "$pub1" remove n1 ping -W 1 -c 1 192.168.241.2 +n2 wg set wg0 peer "$pub3" remove -ip1 link del wg0 +# Test that we can route wg through wg +ip1 addr flush dev wg0 +ip2 addr flush dev wg0 +ip1 addr add fd00::5:1/112 dev wg0 +ip2 addr add fd00::5:2/112 dev wg0 +n1 wg set wg0 private-key <(echo "$key1") peer "$pub2" preshared-key <(echo "$psk") allowed-ips fd00::5:2/128 endpoint 127.0.0.1:2 +n2 wg set wg0 private-key <(echo "$key2") listen-port 2 peer "$pub1" preshared-key <(echo "$psk") allowed-ips fd00::5:1/128 endpoint 127.212.121.99:9998 +ip1 link add wg1 type wireguard +ip2 link add wg1 type wireguard +ip1 addr add 192.168.241.1/24 dev wg1 +ip1 addr add fd00::1/112 dev wg1 +ip2 addr add 192.168.241.2/24 dev wg1 +ip2 addr add fd00::2/112 dev wg1 +ip1 link set mtu 1340 up dev wg1 +ip2 link set mtu 1340 up dev wg1 +n1 wg set wg1 listen-port 5 private-key <(echo "$key3") peer "$pub4" allowed-ips 192.168.241.2/32,fd00::2/128 endpoint [fd00::5:2]:5 +n2 wg set wg1 listen-port 5 private-key <(echo "$key4") peer "$pub3" allowed-ips 192.168.241.1/32,fd00::1/128 endpoint [fd00::5:1]:5 +tests +# Try to set up a routing loop between the two namespaces +ip1 link set netns $netns0 dev wg1 +ip0 addr add 192.168.241.1/24 dev wg1 +ip0 link set up dev wg1 +n0 ping -W 1 -c 1 192.168.241.2 +n1 wg set wg0 peer "$pub2" endpoint 192.168.241.2:7 ip2 link del wg0 +ip2 link del wg1 +! n0 ping -W 1 -c 10 -f 192.168.241.2 || false # Should not crash kernel + +ip0 link del wg1 +ip1 link del wg0 # Test using NAT. We now change the topology to this: # ┌────────────────────────────────────────┐ ┌────────────────────────────────────────────────┐ ┌────────────────────────────────────────┐ @@ -282,6 +316,20 @@ pp sleep 3 n2 ping -W 1 -c 1 192.168.241.1 n1 wg set wg0 peer "$pub2" persistent-keepalive 0 +# Test that onion routing works, even when it loops +n1 wg set wg0 peer "$pub3" allowed-ips 192.168.242.2/32 endpoint 192.168.241.2:5 +ip1 addr add 192.168.242.1/24 dev wg0 +ip2 link add wg1 type wireguard +ip2 addr add 192.168.242.2/24 dev wg1 +n2 wg set wg1 private-key <(echo "$key3") listen-port 5 peer "$pub1" allowed-ips 192.168.242.1/32 +ip2 link set wg1 up +n1 ping -W 1 -c 1 192.168.242.2 +ip2 link del wg1 +n1 wg set wg0 peer "$pub3" endpoint 192.168.242.2:5 +! n1 ping -W 1 -c 1 192.168.242.2 || false # Should not crash kernel +n1 wg set wg0 peer "$pub3" remove +ip1 addr del 192.168.242.1/24 dev wg0 + # Do a wg-quick(8)-style policy routing for the default route, making sure vethc has a v6 address to tease out bugs. ip1 -6 addr add fc00::9/96 dev vethc ip1 -6 route add default via fc00::1 From 4005f5c3c9d006157ba716594e0d70c88a235c5e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 6 May 2020 15:33:04 -0600 Subject: [PATCH 254/300] wireguard: send/receive: cond_resched() when processing worker ringbuffers Users with pathological hardware reported CPU stalls on CONFIG_ PREEMPT_VOLUNTARY=y, because the ringbuffers would stay full, meaning these workers would never terminate. That turned out not to be okay on systems without forced preemption, which Sultan observed. This commit adds a cond_resched() to the bottom of each loop iteration, so that these workers don't hog the core. Note that we don't need this on the napi poll worker, since that terminates after its budget is expended. Suggested-by: Sultan Alsawaf Reported-by: Wang Jian Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/receive.c | 2 ++ drivers/net/wireguard/send.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index 267f202f1931..2566e13a292d 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -516,6 +516,8 @@ void wg_packet_decrypt_worker(struct work_struct *work) &PACKET_CB(skb)->keypair->receiving)) ? PACKET_STATE_CRYPTED : PACKET_STATE_DEAD; wg_queue_enqueue_per_peer_napi(skb, state); + if (need_resched()) + cond_resched(); } } diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index 3e030d614df5..dc3079e17c7f 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -281,6 +281,8 @@ void wg_packet_tx_worker(struct work_struct *work) wg_noise_keypair_put(keypair, false); wg_peer_put(peer); + if (need_resched()) + cond_resched(); } } @@ -304,6 +306,8 @@ void wg_packet_encrypt_worker(struct work_struct *work) } wg_queue_enqueue_per_peer(&PACKET_PEER(first)->tx_queue, first, state); + if (need_resched()) + cond_resched(); } } From 4fed818ef54b08d4b29200e416cce65546ad5312 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 6 May 2020 15:33:05 -0600 Subject: [PATCH 255/300] wireguard: selftests: initalize ipv6 members to NULL to squelch clang warning Without setting these to NULL, clang complains in certain configurations that have CONFIG_IPV6=n: In file included from drivers/net/wireguard/ratelimiter.c:223: drivers/net/wireguard/selftest/ratelimiter.c:173:34: error: variable 'skb6' is uninitialized when used here [-Werror,-Wuninitialized] ret = timings_test(skb4, hdr4, skb6, hdr6, &test_count); ^~~~ drivers/net/wireguard/selftest/ratelimiter.c:123:29: note: initialize the variable 'skb6' to silence this warning struct sk_buff *skb4, *skb6; ^ = NULL drivers/net/wireguard/selftest/ratelimiter.c:173:40: error: variable 'hdr6' is uninitialized when used here [-Werror,-Wuninitialized] ret = timings_test(skb4, hdr4, skb6, hdr6, &test_count); ^~~~ drivers/net/wireguard/selftest/ratelimiter.c:125:22: note: initialize the variable 'hdr6' to silence this warning struct ipv6hdr *hdr6; ^ We silence this warning by setting the variables to NULL as the warning suggests. Reported-by: Arnd Bergmann Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/selftest/ratelimiter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/selftest/ratelimiter.c b/drivers/net/wireguard/selftest/ratelimiter.c index bcd6462e4540..007cd4457c5f 100644 --- a/drivers/net/wireguard/selftest/ratelimiter.c +++ b/drivers/net/wireguard/selftest/ratelimiter.c @@ -120,9 +120,9 @@ bool __init wg_ratelimiter_selftest(void) enum { TRIALS_BEFORE_GIVING_UP = 5000 }; bool success = false; int test = 0, trials; - struct sk_buff *skb4, *skb6; + struct sk_buff *skb4, *skb6 = NULL; struct iphdr *hdr4; - struct ipv6hdr *hdr6; + struct ipv6hdr *hdr6 = NULL; if (IS_ENABLED(CONFIG_KASAN) || IS_ENABLED(CONFIG_UBSAN)) return true; From 243f2148937adc72bcaaa590d482d599c936efde Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Wed, 6 May 2020 15:33:06 -0600 Subject: [PATCH 256/300] wireguard: send/receive: use explicit unlikely branch instead of implicit coalescing It's very unlikely that send will become true. It's nearly always false between 0 and 120 seconds of a session, and in most cases becomes true only between 120 and 121 seconds before becoming false again. So, unlikely(send) is clearly the right option here. What happened before was that we had this complex boolean expression with multiple likely and unlikely clauses nested. Since this is evaluated left-to-right anyway, the whole thing got converted to unlikely. So, we can clean this up to better represent what's going on. The generated code is the same. Suggested-by: Sultan Alsawaf Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/receive.c | 13 ++++++------- drivers/net/wireguard/send.c | 15 ++++++--------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index 2566e13a292d..3bb5b9ae7cd1 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -226,21 +226,20 @@ void wg_packet_handshake_receive_worker(struct work_struct *work) static void keep_key_fresh(struct wg_peer *peer) { struct noise_keypair *keypair; - bool send = false; + bool send; if (peer->sent_lastminute_handshake) return; rcu_read_lock_bh(); keypair = rcu_dereference_bh(peer->keypairs.current_keypair); - if (likely(keypair && READ_ONCE(keypair->sending.is_valid)) && - keypair->i_am_the_initiator && - unlikely(wg_birthdate_has_expired(keypair->sending.birthdate, - REJECT_AFTER_TIME - KEEPALIVE_TIMEOUT - REKEY_TIMEOUT))) - send = true; + send = keypair && READ_ONCE(keypair->sending.is_valid) && + keypair->i_am_the_initiator && + wg_birthdate_has_expired(keypair->sending.birthdate, + REJECT_AFTER_TIME - KEEPALIVE_TIMEOUT - REKEY_TIMEOUT); rcu_read_unlock_bh(); - if (send) { + if (unlikely(send)) { peer->sent_lastminute_handshake = true; wg_packet_send_queued_handshake_initiation(peer, false); } diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index dc3079e17c7f..6687db699803 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -124,20 +124,17 @@ void wg_packet_send_handshake_cookie(struct wg_device *wg, static void keep_key_fresh(struct wg_peer *peer) { struct noise_keypair *keypair; - bool send = false; + bool send; rcu_read_lock_bh(); keypair = rcu_dereference_bh(peer->keypairs.current_keypair); - if (likely(keypair && READ_ONCE(keypair->sending.is_valid)) && - (unlikely(atomic64_read(&keypair->sending.counter.counter) > - REKEY_AFTER_MESSAGES) || - (keypair->i_am_the_initiator && - unlikely(wg_birthdate_has_expired(keypair->sending.birthdate, - REKEY_AFTER_TIME))))) - send = true; + send = keypair && READ_ONCE(keypair->sending.is_valid) && + (atomic64_read(&keypair->sending.counter.counter) > REKEY_AFTER_MESSAGES || + (keypair->i_am_the_initiator && + wg_birthdate_has_expired(keypair->sending.birthdate, REKEY_AFTER_TIME))); rcu_read_unlock_bh(); - if (send) + if (unlikely(send)) wg_packet_send_queued_handshake_initiation(peer, false); } From b0956956bbd7b8c93112269a1e9c8309b0ceb0e7 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 6 May 2020 22:29:06 +0200 Subject: [PATCH 257/300] MAINTAINERS: put DYNAMIC INTERRUPT MODERATION in proper order Commit 9b038086f06b ("docs: networking: convert DIM to RST") added a new file entry to DYNAMIC INTERRUPT MODERATION to the end, and not following alphabetical order. So, ./scripts/checkpatch.pl -f MAINTAINERS complains: WARNING: Misordered MAINTAINERS entry - list file patterns in alphabetic order #5966: FILE: MAINTAINERS:5966: +F: lib/dim/ +F: Documentation/networking/net_dim.rst Reorder the file entries to keep MAINTAINERS nicely ordered. Signed-off-by: Lukas Bulwahn Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 453fe0713e68..232ca4cbbf8b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5936,9 +5936,9 @@ F: lib/dynamic_debug.c DYNAMIC INTERRUPT MODERATION M: Tal Gilboa S: Maintained +F: Documentation/networking/net_dim.rst F: include/linux/dim.h F: lib/dim/ -F: Documentation/networking/net_dim.rst DZ DECSTATION DZ11 SERIAL DRIVER M: "Maciej W. Rozycki" From 16f8036086a929694c3c62f577bb5925fe4fd607 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 6 May 2020 20:34:50 +0200 Subject: [PATCH 258/300] net: flow_offload: skip hw stats check for FLOW_ACTION_HW_STATS_DONT_CARE This patch adds FLOW_ACTION_HW_STATS_DONT_CARE which tells the driver that the frontend does not need counters, this hw stats type request never fails. The FLOW_ACTION_HW_STATS_DISABLED type explicitly requests the driver to disable the stats, however, if the driver cannot disable counters, it bails out. TCA_ACT_HW_STATS_* maintains the 1:1 mapping with FLOW_ACTION_HW_STATS_* except by disabled which is mapped to FLOW_ACTION_HW_STATS_DISABLED (this is 0 in tc). Add tc_act_hw_stats() to perform the mapping between TCA_ACT_HW_STATS_* and FLOW_ACTION_HW_STATS_*. Fixes: 319a1d19471e ("flow_offload: check for basic action hw stats type") Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_flower.c | 3 ++- include/net/flow_offload.h | 9 ++++++++- net/sched/cls_api.c | 14 ++++++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 51117a5a6bbf..890b078851c9 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -36,7 +36,8 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, err = mlxsw_sp_acl_rulei_act_count(mlxsw_sp, rulei, extack); if (err) return err; - } else if (act->hw_stats != FLOW_ACTION_HW_STATS_DISABLED) { + } else if (act->hw_stats != FLOW_ACTION_HW_STATS_DISABLED && + act->hw_stats != FLOW_ACTION_HW_STATS_DONT_CARE) { NL_SET_ERR_MSG_MOD(extack, "Unsupported action HW stats type"); return -EOPNOTSUPP; } diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 3619c6acf60f..efc8350b42fb 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -166,15 +166,18 @@ enum flow_action_mangle_base { enum flow_action_hw_stats_bit { FLOW_ACTION_HW_STATS_IMMEDIATE_BIT, FLOW_ACTION_HW_STATS_DELAYED_BIT, + FLOW_ACTION_HW_STATS_DISABLED_BIT, }; enum flow_action_hw_stats { - FLOW_ACTION_HW_STATS_DISABLED = 0, + FLOW_ACTION_HW_STATS_DONT_CARE = 0, FLOW_ACTION_HW_STATS_IMMEDIATE = BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT), FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT), FLOW_ACTION_HW_STATS_ANY = FLOW_ACTION_HW_STATS_IMMEDIATE | FLOW_ACTION_HW_STATS_DELAYED, + FLOW_ACTION_HW_STATS_DISABLED = + BIT(FLOW_ACTION_HW_STATS_DISABLED_BIT), }; typedef void (*action_destr)(void *priv); @@ -325,7 +328,11 @@ __flow_action_hw_stats_check(const struct flow_action *action, return true; if (!flow_action_mixed_hw_stats_check(action, extack)) return false; + action_entry = flow_action_first_entry_get(action); + if (action_entry->hw_stats == FLOW_ACTION_HW_STATS_DONT_CARE) + return true; + if (!check_allow_bit && action_entry->hw_stats != FLOW_ACTION_HW_STATS_ANY) { NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\""); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c0e5b64b3caf..0a7ecc292bd3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3523,6 +3523,16 @@ static void tcf_sample_get_group(struct flow_action_entry *entry, #endif } +static enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats) +{ + if (WARN_ON_ONCE(hw_stats > TCA_ACT_HW_STATS_ANY)) + return FLOW_ACTION_HW_STATS_DONT_CARE; + else if (!hw_stats) + return FLOW_ACTION_HW_STATS_DISABLED; + + return hw_stats; +} + int tc_setup_flow_action(struct flow_action *flow_action, const struct tcf_exts *exts) { @@ -3546,7 +3556,7 @@ int tc_setup_flow_action(struct flow_action *flow_action, if (err) goto err_out_locked; - entry->hw_stats = act->hw_stats; + entry->hw_stats = tc_act_hw_stats(act->hw_stats); if (is_tcf_gact_ok(act)) { entry->id = FLOW_ACTION_ACCEPT; @@ -3614,7 +3624,7 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->mangle.mask = tcf_pedit_mask(act, k); entry->mangle.val = tcf_pedit_val(act, k); entry->mangle.offset = tcf_pedit_offset(act, k); - entry->hw_stats = act->hw_stats; + entry->hw_stats = tc_act_hw_stats(act->hw_stats); entry = &flow_action->entries[++j]; } } else if (is_tcf_csum(act)) { From 91edf63d5022bd0464788ffb4acc3d5febbaf81d Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Thu, 7 May 2020 08:49:18 +0800 Subject: [PATCH 259/300] usb: chipidea: msm: Ensure proper controller reset using role switch API Currently we check to make sure there is no error state on the extcon handle for VBUS when writing to the HS_PHY_GENCONFIG_2 register. When using the USB role-switch API we still need to write to this register absent an extcon handle. This patch makes the appropriate update to ensure the write happens if role-switching is true. Fixes: 05559f10ed79 ("usb: chipidea: add role switch class support") Cc: stable Cc: Greg Kroah-Hartman Cc: Philipp Zabel Cc: linux-usb@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Stephen Boyd Signed-off-by: Bryan O'Donoghue Signed-off-by: Peter Chen Link: https://lore.kernel.org/r/20200507004918.25975-2-peter.chen@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/ci_hdrc_msm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/chipidea/ci_hdrc_msm.c b/drivers/usb/chipidea/ci_hdrc_msm.c index af648ba6544d..46105457e1ca 100644 --- a/drivers/usb/chipidea/ci_hdrc_msm.c +++ b/drivers/usb/chipidea/ci_hdrc_msm.c @@ -114,7 +114,7 @@ static int ci_hdrc_msm_notify_event(struct ci_hdrc *ci, unsigned event) hw_write_id_reg(ci, HS_PHY_GENCONFIG_2, HS_PHY_ULPI_TX_PKT_EN_CLR_FIX, 0); - if (!IS_ERR(ci->platdata->vbus_extcon.edev)) { + if (!IS_ERR(ci->platdata->vbus_extcon.edev) || ci->role_switch) { hw_write_id_reg(ci, HS_PHY_GENCONFIG_2, HS_PHY_SESS_VLD_CTRL_EN, HS_PHY_SESS_VLD_CTRL_EN); From 027d0c7101f50cf03aeea9eebf484afd4920c8d3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 5 May 2020 13:59:30 +0100 Subject: [PATCH 260/300] arm64: hugetlb: avoid potential NULL dereference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The static analyzer in GCC 10 spotted that in huge_pte_alloc() we may pass a NULL pmdp into pte_alloc_map() when pmd_alloc() returns NULL: | CC arch/arm64/mm/pageattr.o | CC arch/arm64/mm/hugetlbpage.o | from arch/arm64/mm/hugetlbpage.c:10: | arch/arm64/mm/hugetlbpage.c: In function ‘huge_pte_alloc’: | ./arch/arm64/include/asm/pgtable-types.h:28:24: warning: dereference of NULL ‘pmdp’ [CWE-690] [-Wanalyzer-null-dereference] | ./arch/arm64/include/asm/pgtable.h:436:26: note: in expansion of macro ‘pmd_val’ | arch/arm64/mm/hugetlbpage.c:242:10: note: in expansion of macro ‘pte_alloc_map’ | |arch/arm64/mm/hugetlbpage.c:232:10: | |./arch/arm64/include/asm/pgtable-types.h:28:24: | ./arch/arm64/include/asm/pgtable.h:436:26: note: in expansion of macro ‘pmd_val’ | arch/arm64/mm/hugetlbpage.c:242:10: note: in expansion of macro ‘pte_alloc_map’ This can only occur when the kernel cannot allocate a page, and so is unlikely to happen in practice before other systems start failing. We can avoid this by bailing out if pmd_alloc() fails, as we do earlier in the function if pud_alloc() fails. Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") Signed-off-by: Mark Rutland Reported-by: Kyrill Tkachov Cc: # 4.5.x- Cc: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/mm/hugetlbpage.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index bbeb6a5a6ba6..0be3355e3499 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -230,6 +230,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, ptep = (pte_t *)pudp; } else if (sz == (CONT_PTE_SIZE)) { pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + return NULL; WARN_ON(addr & (sz - 1)); /* From 156c75737255b8db0aa887abcb66b709856cf453 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:47:53 +0200 Subject: [PATCH 261/300] vboxsf: don't use the source name in the bdi name Simplify the bdi name to mirror what we are doing elsewhere, and drop them name in favor of just using a number. This avoids a potentially very long bdi name. Signed-off-by: Christoph Hellwig Reviewed-by: Hans de Goede Signed-off-by: Jens Axboe --- fs/vboxsf/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 675e26989376..8fe03b4a0d2b 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -164,7 +164,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) goto fail_free; } - err = super_setup_bdi_name(sb, "vboxsf-%s.%d", fc->source, sbi->bdi_id); + err = super_setup_bdi_name(sb, "vboxsf-%d", sbi->bdi_id); if (err) goto fail_free; From eb7ae5e06bb6e6ac6bb86872d27c43ebab92f6b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:47:54 +0200 Subject: [PATCH 262/300] bdi: move bdi_dev_name out of line bdi_dev_name is not a fast path function, move it out of line. This prepares for using it from modular callers without having to export an implementation detail like bdi_unknown_name. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 9 +-------- mm/backing-dev.c | 10 +++++++++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f88197c1ffc2..c9ad5c3b7b4b 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -505,13 +505,6 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi) (1 << WB_async_congested)); } -extern const char *bdi_unknown_name; - -static inline const char *bdi_dev_name(struct backing_dev_info *bdi) -{ - if (!bdi || !bdi->dev) - return bdi_unknown_name; - return dev_name(bdi->dev); -} +const char *bdi_dev_name(struct backing_dev_info *bdi); #endif /* _LINUX_BACKING_DEV_H */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c81b4f3a7268..c2c44c89ee5d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -21,7 +21,7 @@ struct backing_dev_info noop_backing_dev_info = { EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; -const char *bdi_unknown_name = "(unknown)"; +static const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU @@ -1043,6 +1043,14 @@ void bdi_put(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_put); +const char *bdi_dev_name(struct backing_dev_info *bdi) +{ + if (!bdi || !bdi->dev) + return bdi_unknown_name; + return dev_name(bdi->dev); +} +EXPORT_SYMBOL_GPL(bdi_dev_name); + static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) From 1119d265bc20226c241e5540fc8a246d9e30b272 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 28 Apr 2020 16:45:16 -0500 Subject: [PATCH 263/300] objtool: Fix infinite loop in find_jump_table() Kristen found a hang in objtool when building with -ffunction-sections. It was caused by evergreen_pcie_gen2_enable.cold() being laid out immediately before evergreen_pcie_gen2_enable(). Since their "pfunc" is always the same, find_jump_table() got into an infinite loop because it didn't recognize the boundary between the two functions. Fix that with a new prev_insn_same_sym() helper, which doesn't cross subfunction boundaries. Reported-by: Kristen Carlson Accardi Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/378b51c9d9c894dc3294bc460b4b0869e950b7c5.1588110291.git.jpoimboe@redhat.com --- tools/objtool/check.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4b170fd08a28..0e8f9a32e4d1 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -72,6 +72,17 @@ static struct instruction *next_insn_same_func(struct objtool_file *file, return find_insn(file, func->cfunc->sec, func->cfunc->offset); } +static struct instruction *prev_insn_same_sym(struct objtool_file *file, + struct instruction *insn) +{ + struct instruction *prev = list_prev_entry(insn, list); + + if (&prev->list != &file->insn_list && prev->func == insn->func) + return prev; + + return NULL; +} + #define func_for_each_insn(file, func, insn) \ for (insn = find_insn(file, func->sec, func->offset); \ insn; \ @@ -1050,8 +1061,8 @@ static struct rela *find_jump_table(struct objtool_file *file, * it. */ for (; - &insn->list != &file->insn_list && insn->func && insn->func->pfunc == func; - insn = insn->first_jump_src ?: list_prev_entry(insn, list)) { + insn && insn->func && insn->func->pfunc == func; + insn = insn->first_jump_src ?: prev_insn_same_sym(file, insn)) { if (insn != orig_insn && insn->type == INSN_JUMP_DYNAMIC) break; From 90da2e3f25c8b4d742b2687b8fed8fc4eb8851da Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 4 May 2020 22:39:35 +0300 Subject: [PATCH 264/300] splice: move f_mode checks to do_{splice,tee}() do_splice() is used by io_uring, as will be do_tee(). Move f_mode checks from sys_{splice,tee}() to do_{splice,tee}(), so they're enforced for io_uring as well. Fixes: 7d67af2c0134 ("io_uring: add splice(2) support") Reported-by: Jann Horn Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/splice.c | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index 4735defc46ee..fd0a1e7e5959 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1118,6 +1118,10 @@ long do_splice(struct file *in, loff_t __user *off_in, loff_t offset; long ret; + if (unlikely(!(in->f_mode & FMODE_READ) || + !(out->f_mode & FMODE_WRITE))) + return -EBADF; + ipipe = get_pipe_info(in); opipe = get_pipe_info(out); @@ -1125,12 +1129,6 @@ long do_splice(struct file *in, loff_t __user *off_in, if (off_in || off_out) return -ESPIPE; - if (!(in->f_mode & FMODE_READ)) - return -EBADF; - - if (!(out->f_mode & FMODE_WRITE)) - return -EBADF; - /* Splicing to self would be fun, but... */ if (ipipe == opipe) return -EINVAL; @@ -1153,9 +1151,6 @@ long do_splice(struct file *in, loff_t __user *off_in, offset = out->f_pos; } - if (unlikely(!(out->f_mode & FMODE_WRITE))) - return -EBADF; - if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; @@ -1440,15 +1435,11 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, error = -EBADF; in = fdget(fd_in); if (in.file) { - if (in.file->f_mode & FMODE_READ) { - out = fdget(fd_out); - if (out.file) { - if (out.file->f_mode & FMODE_WRITE) - error = do_splice(in.file, off_in, - out.file, off_out, - len, flags); - fdput(out); - } + out = fdget(fd_out); + if (out.file) { + error = do_splice(in.file, off_in, out.file, off_out, + len, flags); + fdput(out); } fdput(in); } @@ -1770,6 +1761,10 @@ static long do_tee(struct file *in, struct file *out, size_t len, struct pipe_inode_info *opipe = get_pipe_info(out); int ret = -EINVAL; + if (unlikely(!(in->f_mode & FMODE_READ) || + !(out->f_mode & FMODE_WRITE))) + return -EBADF; + /* * Duplicate the contents of ipipe to opipe without actually * copying the data. @@ -1795,7 +1790,7 @@ static long do_tee(struct file *in, struct file *out, size_t len, SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { - struct fd in; + struct fd in, out; int error; if (unlikely(flags & ~SPLICE_F_ALL)) @@ -1807,14 +1802,10 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) error = -EBADF; in = fdget(fdin); if (in.file) { - if (in.file->f_mode & FMODE_READ) { - struct fd out = fdget(fdout); - if (out.file) { - if (out.file->f_mode & FMODE_WRITE) - error = do_tee(in.file, out.file, - len, flags); - fdput(out); - } + out = fdget(fdout); + if (out.file) { + error = do_tee(in.file, out.file, len, flags); + fdput(out); } fdput(in); } From d16a8c31077e75ecb9427fbfea59b74eed00f698 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 6 May 2020 10:20:10 -0400 Subject: [PATCH 265/300] tracing: Wait for preempt irq delay thread to finish Running on a slower machine, it is possible that the preempt delay kernel thread may still be executing if the module was immediately removed after added, and this can cause the kernel to crash as the kernel thread might be executing after its code has been removed. There's no reason that the caller of the code shouldn't just wait for the delay thread to finish, as the thread can also be created by a trigger in the sysfs code, which also has the same issues. Link: http://lore.kernel.org/r/5EA2B0C8.2080706@cn.fujitsu.com Cc: stable@vger.kernel.org Fixes: 793937236d1ee ("lib: Add module for testing preemptoff/irqsoff latency tracers") Reported-by: Xiao Yang Reviewed-by: Xiao Yang Reviewed-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/preemptirq_delay_test.c | 30 ++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index 31c0fad4cb9e..c4c86de63cf9 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -113,22 +113,42 @@ static int preemptirq_delay_run(void *data) for (i = 0; i < s; i++) (testfuncs[i])(i); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + __set_current_state(TASK_RUNNING); + return 0; } -static struct task_struct *preemptirq_start_test(void) +static int preemptirq_run_test(void) { + struct task_struct *task; + char task_name[50]; snprintf(task_name, sizeof(task_name), "%s_test", test_mode); - return kthread_run(preemptirq_delay_run, NULL, task_name); + task = kthread_run(preemptirq_delay_run, NULL, task_name); + if (IS_ERR(task)) + return PTR_ERR(task); + if (task) + kthread_stop(task); + return 0; } static ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - preemptirq_start_test(); + ssize_t ret; + + ret = preemptirq_run_test(); + if (ret) + return ret; return count; } @@ -148,11 +168,9 @@ static struct kobject *preemptirq_delay_kobj; static int __init preemptirq_delay_init(void) { - struct task_struct *test_task; int retval; - test_task = preemptirq_start_test(); - retval = PTR_ERR_OR_ZERO(test_task); + retval = preemptirq_run_test(); if (retval != 0) return retval; From 11f5efc3ab66284f7aaacc926e9351d658e2577b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 6 May 2020 10:36:18 -0400 Subject: [PATCH 266/300] tracing: Add a vmalloc_sync_mappings() for safe measure x86_64 lazily maps in the vmalloc pages, and the way this works with per_cpu areas can be complex, to say the least. Mappings may happen at boot up, and if nothing synchronizes the page tables, those page mappings may not be synced till they are used. This causes issues for anything that might touch one of those mappings in the path of the page fault handler. When one of those unmapped mappings is touched in the page fault handler, it will cause another page fault, which in turn will cause a page fault, and leave us in a loop of page faults. Commit 763802b53a42 ("x86/mm: split vmalloc_sync_all()") split vmalloc_sync_all() into vmalloc_sync_unmappings() and vmalloc_sync_mappings(), as on system exit, it did not need to do a full sync on x86_64 (although it still needed to be done on x86_32). By chance, the vmalloc_sync_all() would synchronize the page mappings done at boot up and prevent the per cpu area from being a problem for tracing in the page fault handler. But when that synchronization in the exit of a task became a nop, it caused the problem to appear. Link: https://lore.kernel.org/r/20200429054857.66e8e333@oasis.local.home Cc: stable@vger.kernel.org Fixes: 737223fbca3b1 ("tracing: Consolidate buffer allocation code") Reported-by: "Tzvetomir Stoyanov (VMware)" Suggested-by: Joerg Roedel Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d2b98812625..9ed6d92768af 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8525,6 +8525,19 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) */ allocate_snapshot = false; #endif + + /* + * Because of some magic with the way alloc_percpu() works on + * x86_64, we need to synchronize the pgd of all the tables, + * otherwise the trace events that happen in x86_64 page fault + * handlers can't cope with accessing the chance that a + * alloc_percpu()'d memory might be touched in the page fault trace + * event. Oh, and we need to audit all other alloc_percpu() and vmalloc() + * calls in tracing, because something might get triggered within a + * page fault trace event! + */ + vmalloc_sync_mappings(); + return 0; } From 386c82a70319d42dba4f1b30e5e7076f2b4d8c2f Mon Sep 17 00:00:00 2001 From: Yiwei Zhang Date: Tue, 28 Apr 2020 15:08:25 -0700 Subject: [PATCH 267/300] gpu/trace: Minor comment updates for gpu_mem_total tracepoint This change updates the improper comment for the 'size' attribute in the tracepoint definition. Most gfx drivers pre-fault in physical pages instead of making virtual allocations. So we drop the 'Virtual' keyword here and leave this to the implementations. Link: http://lkml.kernel.org/r/20200428220825.169606-1-zzyiwei@google.com Signed-off-by: Yiwei Zhang Signed-off-by: Steven Rostedt (VMware) --- include/trace/events/gpu_mem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/gpu_mem.h b/include/trace/events/gpu_mem.h index 1897822a9150..26d871f96e94 100644 --- a/include/trace/events/gpu_mem.h +++ b/include/trace/events/gpu_mem.h @@ -24,7 +24,7 @@ * * @pid: Put 0 for global total, while positive pid for process total. * - * @size: Virtual size of the allocation in bytes. + * @size: Size of the allocation in bytes. * */ TRACE_EVENT(gpu_mem_total, From f094a233e1d5b1c61cc797d204aa28b611058827 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 28 Apr 2020 21:49:59 +0000 Subject: [PATCH 268/300] tracing: Fix doc mistakes in trace sample As the example below shows, DECLARE_EVENT_CLASS() is used instead of DEFINE_EVENT_CLASS(). Link: http://lkml.kernel.org/r/20200428214959.11259-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Signed-off-by: Steven Rostedt (VMware) --- samples/trace_events/trace-events-sample.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 80b4a70315b6..13a35f7cbe66 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -416,7 +416,7 @@ TRACE_EVENT_FN(foo_bar_with_fn, * Note, TRACE_EVENT() itself is simply defined as: * * #define TRACE_EVENT(name, proto, args, tstruct, assign, printk) \ - * DEFINE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \ + * DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, printk); \ * DEFINE_EVENT(name, name, proto, args) * * The DEFINE_EVENT() also can be declared with conditions and reg functions: From 192b7993b3ff92b62b687e940e5e88fa0218d764 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Thu, 23 Apr 2020 12:08:25 +0800 Subject: [PATCH 269/300] tracing: Make tracing_snapshot_instance_cond() static Fix the following sparse warning: kernel/trace/trace.c:950:6: warning: symbol 'tracing_snapshot_instance_cond' was not declared. Should it be static? Link: http://lkml.kernel.org/r/1587614905-48692-1-git-send-email-zou_wei@huawei.com Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ed6d92768af..29615f15a820 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -947,7 +947,8 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data) +static void tracing_snapshot_instance_cond(struct trace_array *tr, + void *cond_data) { struct tracer *tracer = tr->current_trace; unsigned long flags; From 8842604446d1f005abcbf8c63c12eabdb5695094 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Thu, 7 May 2020 17:23:36 +0800 Subject: [PATCH 270/300] tools/bootconfig: Fix resource leak in apply_xbc() Fix the @data and @fd allocations that are leaked in the error path of apply_xbc(). Link: http://lkml.kernel.org/r/583a49c9-c27a-931d-e6c2-6f63a4b18bea@huawei.com Acked-by: Masami Hiramatsu Signed-off-by: Yunfeng Ye Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 16b9a420e6fd..001076c51712 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -314,6 +314,7 @@ int apply_xbc(const char *path, const char *xbc_path) ret = delete_xbc(path); if (ret < 0) { pr_err("Failed to delete previous boot config: %d\n", ret); + free(data); return ret; } @@ -321,24 +322,26 @@ int apply_xbc(const char *path, const char *xbc_path) fd = open(path, O_RDWR | O_APPEND); if (fd < 0) { pr_err("Failed to open %s: %d\n", path, fd); + free(data); return fd; } /* TODO: Ensure the @path is initramfs/initrd image */ ret = write(fd, data, size + 8); if (ret < 0) { pr_err("Failed to apply a boot config: %d\n", ret); - return ret; + goto out; } /* Write a magic word of the bootconfig */ ret = write(fd, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN); if (ret < 0) { pr_err("Failed to apply a boot config magic: %d\n", ret); - return ret; + goto out; } +out: close(fd); free(data); - return 0; + return ret; } int usage(void) From 63ff822358b276137059520cf16e587e8073e80f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 May 2020 14:56:15 -0600 Subject: [PATCH 271/300] io_uring: don't use 'fd' for openat/openat2/statx We currently make some guesses as when to open this fd, but in reality we have no business (or need) to do so at all. In fact, it makes certain things fail, like O_PATH. Remove the fd lookup from these opcodes, we're just passing the 'fd' to generic helpers anyway. With that, we can also remove the special casing of fd values in io_req_needs_file(), and the 'fd_non_neg' check that we have. And we can ensure that we only read sqe->fd once. This fixes O_PATH usage with openat/openat2, and ditto statx path side oddities. Cc: stable@vger.kernel.org: # v5.6 Reported-by: Max Kellermann Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index dd680eb153cb..979d9f977409 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -680,8 +680,6 @@ struct io_op_def { unsigned needs_mm : 1; /* needs req->file assigned */ unsigned needs_file : 1; - /* needs req->file assigned IFF fd is >= 0 */ - unsigned fd_non_neg : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ @@ -784,8 +782,6 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, }, [IORING_OP_OPENAT] = { - .needs_file = 1, - .fd_non_neg = 1, .file_table = 1, .needs_fs = 1, }, @@ -799,8 +795,6 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_STATX] = { .needs_mm = 1, - .needs_file = 1, - .fd_non_neg = 1, .needs_fs = 1, .file_table = 1, }, @@ -837,8 +831,6 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, }, [IORING_OP_OPENAT2] = { - .needs_file = 1, - .fd_non_neg = 1, .file_table = 1, .needs_fs = 1, }, @@ -5368,15 +5360,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_steal_work(req, workptr); } -static int io_req_needs_file(struct io_kiocb *req, int fd) -{ - if (!io_op_defs[req->opcode].needs_file) - return 0; - if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg) - return 0; - return 1; -} - static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, int index) { @@ -5414,14 +5397,11 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, - int fd, unsigned int flags) + int fd) { bool fixed; - if (!io_req_needs_file(req, fd)) - return 0; - - fixed = (flags & IOSQE_FIXED_FILE); + fixed = (req->flags & REQ_F_FIXED_FILE) != 0; if (unlikely(!fixed && req->needs_fixed_file)) return -EBADF; @@ -5798,7 +5778,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, struct io_submit_state *state, bool async) { unsigned int sqe_flags; - int id, fd; + int id; /* * All io need record the previous position, if LINK vs DARIN, @@ -5850,8 +5830,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, IOSQE_ASYNC | IOSQE_FIXED_FILE | IOSQE_BUFFER_SELECT | IOSQE_IO_LINK); - fd = READ_ONCE(sqe->fd); - return io_req_set_file(state, req, fd, sqe_flags); + if (!io_op_defs[req->opcode].needs_file) + return 0; + + return io_req_set_file(state, req, READ_ONCE(sqe->fd)); } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, From b5f2006144c6ae941726037120fa1001ddede784 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 7 May 2020 18:35:39 -0700 Subject: [PATCH 272/300] ipc/mqueue.c: change __do_notify() to bypass check_kill_permission() Commit cc731525f26a ("signal: Remove kernel interal si_code magic") changed the value of SI_FROMUSER(SI_MESGQ), this means that mq_notify() no longer works if the sender doesn't have rights to send a signal. Change __do_notify() to use do_send_sig_info() instead of kill_pid_info() to avoid check_kill_permission(). This needs the additional notify.sigev_signo != 0 check, shouldn't we change do_mq_notify() to deny sigev_signo == 0 ? Test-case: #include #include #include #include #include static int notified; static void sigh(int sig) { notified = 1; } int main(void) { signal(SIGIO, sigh); int fd = mq_open("/mq", O_RDWR|O_CREAT, 0666, NULL); assert(fd >= 0); struct sigevent se = { .sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGIO, }; assert(mq_notify(fd, &se) == 0); if (!fork()) { assert(setuid(1) == 0); mq_send(fd, "",1,0); return 0; } wait(NULL); mq_unlink("/mq"); assert(notified); return 0; } [manfred@colorfullife.com: 1) Add self_exec_id evaluation so that the implementation matches do_notify_parent 2) use PIDTYPE_TGID everywhere] Fixes: cc731525f26a ("signal: Remove kernel interal si_code magic") Reported-by: Yoji Signed-off-by: Oleg Nesterov Signed-off-by: Manfred Spraul Signed-off-by: Andrew Morton Acked-by: "Eric W. Biederman" Cc: Davidlohr Bueso Cc: Markus Elfring Cc: <1vier1@web.de> Cc: Link: http://lkml.kernel.org/r/e2a782e4-eab9-4f5c-c749-c07a8f7a4e66@colorfullife.com Signed-off-by: Linus Torvalds --- ipc/mqueue.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index dc8307bf2d74..beff0cfcd1e8 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -142,6 +142,7 @@ struct mqueue_inode_info { struct sigevent notify; struct pid *notify_owner; + u32 notify_self_exec_id; struct user_namespace *notify_user_ns; struct user_struct *user; /* user who created, for accounting */ struct sock *notify_sock; @@ -773,28 +774,44 @@ static void __do_notify(struct mqueue_inode_info *info) * synchronously. */ if (info->notify_owner && info->attr.mq_curmsgs == 1) { - struct kernel_siginfo sig_i; switch (info->notify.sigev_notify) { case SIGEV_NONE: break; - case SIGEV_SIGNAL: - /* sends signal */ + case SIGEV_SIGNAL: { + struct kernel_siginfo sig_i; + struct task_struct *task; + + /* do_mq_notify() accepts sigev_signo == 0, why?? */ + if (!info->notify.sigev_signo) + break; clear_siginfo(&sig_i); sig_i.si_signo = info->notify.sigev_signo; sig_i.si_errno = 0; sig_i.si_code = SI_MESGQ; sig_i.si_value = info->notify.sigev_value; - /* map current pid/uid into info->owner's namespaces */ rcu_read_lock(); + /* map current pid/uid into info->owner's namespaces */ sig_i.si_pid = task_tgid_nr_ns(current, ns_of_pid(info->notify_owner)); - sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid()); + sig_i.si_uid = from_kuid_munged(info->notify_user_ns, + current_uid()); + /* + * We can't use kill_pid_info(), this signal should + * bypass check_kill_permission(). It is from kernel + * but si_fromuser() can't know this. + * We do check the self_exec_id, to avoid sending + * signals to programs that don't expect them. + */ + task = pid_task(info->notify_owner, PIDTYPE_TGID); + if (task && task->self_exec_id == + info->notify_self_exec_id) { + do_send_sig_info(info->notify.sigev_signo, + &sig_i, task, PIDTYPE_TGID); + } rcu_read_unlock(); - - kill_pid_info(info->notify.sigev_signo, - &sig_i, info->notify_owner); break; + } case SIGEV_THREAD: set_cookie(info->notify_cookie, NOTIFY_WOKENUP); netlink_sendskb(info->notify_sock, info->notify_cookie); @@ -1383,6 +1400,7 @@ retry: info->notify.sigev_signo = notification->sigev_signo; info->notify.sigev_value = notification->sigev_value; info->notify.sigev_notify = SIGEV_SIGNAL; + info->notify_self_exec_id = current->self_exec_id; break; } From 11d6761218d19ca06ae5387f4e3692c4fa9e7493 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Thu, 7 May 2020 18:35:43 -0700 Subject: [PATCH 273/300] mm, memcg: fix error return value of mem_cgroup_css_alloc() When I run my memcg testcase which creates lots of memcgs, I found there're unexpected out of memory logs while there're still enough available free memory. The error log is mkdir: cannot create directory 'foo.65533': Cannot allocate memory The reason is when we try to create more than MEM_CGROUP_ID_MAX memcgs, an -ENOMEM errno will be set by mem_cgroup_css_alloc(), but the right errno should be -ENOSPC "No space left on device", which is an appropriate errno for userspace's failed mkdir. As the errno really misled me, we should make it right. After this patch, the error log will be mkdir: cannot create directory 'foo.65533': No space left on device [akpm@linux-foundation.org: s/EBUSY/ENOSPC/, per Michal] [akpm@linux-foundation.org: s/EBUSY/ENOSPC/, per Michal] Fixes: 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure after many small jobs") Suggested-by: Matthew Wilcox Signed-off-by: Yafang Shao Signed-off-by: Andrew Morton Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Vladimir Davydov Link: http://lkml.kernel.org/r/20200407063621.GA18914@dhcp22.suse.cz Link: http://lkml.kernel.org/r/1586192163-20099-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5beea03dd58a..a3b97f103966 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4990,19 +4990,22 @@ static struct mem_cgroup *mem_cgroup_alloc(void) unsigned int size; int node; int __maybe_unused i; + long error = -ENOMEM; size = sizeof(struct mem_cgroup); size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); memcg = kzalloc(size, GFP_KERNEL); if (!memcg) - return NULL; + return ERR_PTR(error); memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX, GFP_KERNEL); - if (memcg->id.id < 0) + if (memcg->id.id < 0) { + error = memcg->id.id; goto fail; + } memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu); if (!memcg->vmstats_local) @@ -5046,7 +5049,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) fail: mem_cgroup_id_remove(memcg); __mem_cgroup_free(memcg); - return NULL; + return ERR_PTR(error); } static struct cgroup_subsys_state * __ref @@ -5057,8 +5060,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) long error = -ENOMEM; memcg = mem_cgroup_alloc(); - if (!memcg) - return ERR_PTR(error); + if (IS_ERR(memcg)) + return ERR_CAST(memcg); WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; @@ -5108,7 +5111,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) fail: mem_cgroup_id_remove(memcg); mem_cgroup_free(memcg); - return ERR_PTR(-ENOMEM); + return ERR_PTR(error); } static int mem_cgroup_css_online(struct cgroup_subsys_state *css) From e84fe99b68ce353c37ceeecc95dce9696c976556 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 7 May 2020 18:35:46 -0700 Subject: [PATCH 274/300] mm/page_alloc: fix watchdog soft lockups during set_zone_contiguous() Without CONFIG_PREEMPT, it can happen that we get soft lockups detected, e.g., while booting up. watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [swapper/0:1] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.6.0-next-20200331+ #4 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 RIP: __pageblock_pfn_to_page+0x134/0x1c0 Call Trace: set_zone_contiguous+0x56/0x70 page_alloc_init_late+0x166/0x176 kernel_init_freeable+0xfa/0x255 kernel_init+0xa/0x106 ret_from_fork+0x35/0x40 The issue becomes visible when having a lot of memory (e.g., 4TB) assigned to a single NUMA node - a system that can easily be created using QEMU. Inside VMs on a hypervisor with quite some memory overcommit, this is fairly easy to trigger. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Pavel Tatashin Reviewed-by: Pankaj Gupta Reviewed-by: Baoquan He Reviewed-by: Shile Zhang Acked-by: Michal Hocko Cc: Kirill Tkhai Cc: Shile Zhang Cc: Pavel Tatashin Cc: Daniel Jordan Cc: Michal Hocko Cc: Alexander Duyck Cc: Baoquan He Cc: Oscar Salvador Cc: Link: http://lkml.kernel.org/r/20200416073417.5003-1-david@redhat.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 69827d4fa052..f31eda080823 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1607,6 +1607,7 @@ void set_zone_contiguous(struct zone *zone) if (!__pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone)) return; + cond_resched(); } /* We confirm that there is no hole */ From 324cfb19567c80ed71d7a02f1d5ff4621902f4c3 Mon Sep 17 00:00:00 2001 From: Maciej Grochowski Date: Thu, 7 May 2020 18:35:49 -0700 Subject: [PATCH 275/300] kernel/kcov.c: fix typos in kcov_remote_start documentation Signed-off-by: Maciej Grochowski Signed-off-by: Andrew Morton Reviewed-by: Andrey Konovalov Link: http://lkml.kernel.org/r/20200420030259.31674-1-maciek.grochowski@gmail.com Signed-off-by: Linus Torvalds --- kernel/kcov.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index f50354202dbe..8accc9722a81 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -740,8 +740,8 @@ static const struct file_operations kcov_fops = { * kcov_remote_handle() with KCOV_SUBSYSTEM_COMMON as the subsystem id and an * arbitrary 4-byte non-zero number as the instance id). This common handle * then gets saved into the task_struct of the process that issued the - * KCOV_REMOTE_ENABLE ioctl. When this proccess issues system calls that spawn - * kernel threads, the common handle must be retrived via kcov_common_handle() + * KCOV_REMOTE_ENABLE ioctl. When this process issues system calls that spawn + * kernel threads, the common handle must be retrieved via kcov_common_handle() * and passed to the spawned threads via custom annotations. Those kernel * threads must in turn be annotated with kcov_remote_start(common_handle) and * kcov_remote_stop(). All of the threads that are spawned by the same process From e08df079b23e2e982df15aa340bfbaf50f297504 Mon Sep 17 00:00:00 2001 From: Ivan Delalande Date: Thu, 7 May 2020 18:35:53 -0700 Subject: [PATCH 276/300] scripts/decodecode: fix trapping instruction formatting If the trapping instruction contains a ':', for a memory access through segment registers for example, the sed substitution will insert the '*' marker in the middle of the instruction instead of the line address: 2b: 65 48 0f c7 0f cmpxchg16b %gs:*(%rdi) <-- trapping instruction I started to think I had forgotten some quirk of the assembly syntax before noticing that it was actually coming from the script. Fix it to add the address marker at the right place for these instructions: 28: 49 8b 06 mov (%r14),%rax 2b:* 65 48 0f c7 0f cmpxchg16b %gs:(%rdi) <-- trapping instruction 30: 0f 94 c0 sete %al Fixes: 18ff44b189e2 ("scripts/decodecode: make faulting insn ptr more robust") Signed-off-by: Ivan Delalande Signed-off-by: Andrew Morton Reviewed-by: Borislav Petkov Link: http://lkml.kernel.org/r/20200419223653.GA31248@visor Signed-off-by: Linus Torvalds --- scripts/decodecode | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/decodecode b/scripts/decodecode index ba8b8d5834e6..fbdb325cdf4f 100755 --- a/scripts/decodecode +++ b/scripts/decodecode @@ -126,7 +126,7 @@ faultlinenum=$(( $(wc -l $T.oo | cut -d" " -f1) - \ faultline=`cat $T.dis | head -1 | cut -d":" -f2-` faultline=`echo "$faultline" | sed -e 's/\[/\\\[/g; s/\]/\\\]/g'` -cat $T.oo | sed -e "${faultlinenum}s/^\(.*:\)\(.*\)/\1\*\2\t\t<-- trapping instruction/" +cat $T.oo | sed -e "${faultlinenum}s/^\([^:]*:\)\(.*\)/\1\*\2\t\t<-- trapping instruction/" echo cat $T.aa cleanup From 996ed22c7a5251d76dcdfe5026ef8230e90066d9 Mon Sep 17 00:00:00 2001 From: Janakarajan Natarajan Date: Thu, 7 May 2020 18:35:56 -0700 Subject: [PATCH 277/300] arch/x86/kvm/svm/sev.c: change flag passed to GUP fast in sev_pin_memory() When trying to lock read-only pages, sev_pin_memory() fails because FOLL_WRITE is used as the flag for get_user_pages_fast(). Commit 73b0140bf0fe ("mm/gup: change GUP fast to use flags rather than a write 'bool'") updated the get_user_pages_fast() call sites to use flags, but incorrectly updated the call in sev_pin_memory(). As the original coding of this call was correct, revert the change made by that commit. Fixes: 73b0140bf0fe ("mm/gup: change GUP fast to use flags rather than a write 'bool'") Signed-off-by: Janakarajan Natarajan Signed-off-by: Andrew Morton Reviewed-by: Ira Weiny Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Wanpeng Li Cc: Jim Mattson Cc: Joerg Roedel Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H . Peter Anvin" Cc: Mike Marshall Cc: Brijesh Singh Link: http://lkml.kernel.org/r/20200423152419.87202-1-Janakarajan.Natarajan@amd.com Signed-off-by: Linus Torvalds --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index cf912b4aaba8..89f7f3aebd31 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -345,7 +345,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return NULL; /* Pin the user virtual address. */ - npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages); + npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); goto err; From 0c54a6a44bf3d41e76ce3f583a6ece267618df2e Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Thu, 7 May 2020 18:35:59 -0700 Subject: [PATCH 278/300] eventpoll: fix missing wakeup for ovflist in ep_poll_callback In the event that we add to ovflist, before commit 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll") we would be woken up by ep_scan_ready_list, and did no wakeup in ep_poll_callback. With that wakeup removed, if we add to ovflist here, we may never wake up. Rather than adding back the ep_scan_ready_list wakeup - which was resulting in unnecessary wakeups, trigger a wake-up in ep_poll_callback. We noticed that one of our workloads was missing wakeups starting with 339ddb53d373 and upon manual inspection, this wakeup seemed missing to me. With this patch added, we no longer see missing wakeups. I haven't yet tried to make a small reproducer, but the existing kselftests in filesystem/epoll passed for me with this patch. [khazhy@google.com: use if/elif instead of goto + cleanup suggested by Roman] Link: http://lkml.kernel.org/r/20200424190039.192373-1-khazhy@google.com Fixes: 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll") Signed-off-by: Khazhismel Kumykov Signed-off-by: Andrew Morton Reviewed-by: Roman Penyaev Cc: Alexander Viro Cc: Roman Penyaev Cc: Heiher Cc: Jason Baron Cc: Link: http://lkml.kernel.org/r/20200424025057.118641-1-khazhy@google.com Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8c596641a72b..d6ba0e52439b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1171,6 +1171,10 @@ static inline bool chain_epi_lockless(struct epitem *epi) { struct eventpoll *ep = epi->ep; + /* Fast preliminary check */ + if (epi->next != EP_UNACTIVE_PTR) + return false; + /* Check that the same epi has not been just chained from another CPU */ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) return false; @@ -1237,16 +1241,12 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { - if (epi->next == EP_UNACTIVE_PTR && - chain_epi_lockless(epi)) + if (chain_epi_lockless(epi)) + ep_pm_stay_awake_rcu(epi); + } else if (!ep_is_linked(epi)) { + /* In the usual case, add event to ready list. */ + if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) ep_pm_stay_awake_rcu(epi); - goto out_unlock; - } - - /* If this file is already in the ready list we exit soon */ - if (!ep_is_linked(epi) && - list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) { - ep_pm_stay_awake_rcu(epi); } /* From 50e36be1fb9572b2e4f2753340bdce3116bf2ce7 Mon Sep 17 00:00:00 2001 From: Aymeric Agon-Rambosson Date: Thu, 7 May 2020 18:36:03 -0700 Subject: [PATCH 279/300] scripts/gdb: repair rb_first() and rb_last() The current implementations of the rb_first() and rb_last() gdb functions have a variable that references itself in its instanciation, which causes the function to throw an error if a specific condition on the argument is met. The original author rather intended to reference the argument and made a typo. Referring the argument instead makes the function work as intended. Signed-off-by: Aymeric Agon-Rambosson Signed-off-by: Andrew Morton Reviewed-by: Stephen Boyd Cc: Jan Kiszka Cc: Kieran Bingham Cc: Douglas Anderson Cc: Nikolay Borisov Cc: Jackie Liu Cc: Jason Wessel Link: http://lkml.kernel.org/r/20200427051029.354840-1-aymeric.agon@yandex.com Signed-off-by: Linus Torvalds --- scripts/gdb/linux/rbtree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gdb/linux/rbtree.py b/scripts/gdb/linux/rbtree.py index 39db889b874c..c4b991607917 100644 --- a/scripts/gdb/linux/rbtree.py +++ b/scripts/gdb/linux/rbtree.py @@ -12,7 +12,7 @@ rb_node_type = utils.CachedType("struct rb_node") def rb_first(root): if root.type == rb_root_type.get_type(): - node = node.address.cast(rb_root_type.get_type().pointer()) + node = root.address.cast(rb_root_type.get_type().pointer()) elif root.type != rb_root_type.get_type().pointer(): raise gdb.GdbError("Must be struct rb_root not {}".format(root.type)) @@ -28,7 +28,7 @@ def rb_first(root): def rb_last(root): if root.type == rb_root_type.get_type(): - node = node.address.cast(rb_root_type.get_type().pointer()) + node = root.address.cast(rb_root_type.get_type().pointer()) elif root.type != rb_root_type.get_type().pointer(): raise gdb.GdbError("Must be struct rb_root not {}".format(root.type)) From cbfc35a48609ceac978791e3ab9dde0c01f8cb20 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 7 May 2020 18:36:06 -0700 Subject: [PATCH 280/300] mm/slub: fix incorrect interpretation of s->offset In a couple of places in the slub memory allocator, the code uses "s->offset" as a check to see if the free pointer is put right after the object. That check is no longer true with commit 3202fa62fb43 ("slub: relocate freelist pointer to middle of object"). As a result, echoing "1" into the validate sysfs file, e.g. of dentry, may cause a bunch of "Freepointer corrupt" error reports like the following to appear with the system in panic afterwards. ============================================================================= BUG dentry(666:pmcd.service) (Tainted: G B): Freepointer corrupt ----------------------------------------------------------------------------- To fix it, use the check "s->offset == s->inuse" in the new helper function freeptr_outside_object() instead. Also add another helper function get_info_end() to return the end of info block (inuse + free pointer if not overlapping with object). Fixes: 3202fa62fb43 ("slub: relocate freelist pointer to middle of object") Signed-off-by: Waiman Long Signed-off-by: Andrew Morton Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Kees Cook Acked-by: Rafael Aquini Cc: Christoph Lameter Cc: Vitaly Nikolenko Cc: Silvio Cesare Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Markus Elfring Cc: Changbin Du Link: http://lkml.kernel.org/r/20200429135328.26976-1-longman@redhat.com Signed-off-by: Linus Torvalds --- mm/slub.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 9bf44955c4f1..b762450fc9f0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -551,15 +551,32 @@ static void print_section(char *level, char *text, u8 *addr, metadata_access_disable(); } +/* + * See comment in calculate_sizes(). + */ +static inline bool freeptr_outside_object(struct kmem_cache *s) +{ + return s->offset >= s->inuse; +} + +/* + * Return offset of the end of info block which is inuse + free pointer if + * not overlapping with object. + */ +static inline unsigned int get_info_end(struct kmem_cache *s) +{ + if (freeptr_outside_object(s)) + return s->inuse + sizeof(void *); + else + return s->inuse; +} + static struct track *get_track(struct kmem_cache *s, void *object, enum track_item alloc) { struct track *p; - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + p = object + get_info_end(s); return p + alloc; } @@ -686,10 +703,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); - if (s->offset) - off = s->offset + sizeof(void *); - else - off = s->inuse; + off = get_info_end(s); if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); @@ -782,7 +796,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * object address * Bytes of the object to be managed. * If the freepointer may overlay the object then the free - * pointer is the first word of the object. + * pointer is at the middle of the object. * * Poisoning uses 0x6b (POISON_FREE) and the last byte is * 0xa5 (POISON_END) @@ -816,11 +830,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) { - unsigned long off = s->inuse; /* The end of info */ - - if (s->offset) - /* Freepointer is placed after the object. */ - off += sizeof(void *); + unsigned long off = get_info_end(s); /* The end of info */ if (s->flags & SLAB_STORE_USER) /* We also have user information there */ @@ -907,7 +917,7 @@ static int check_object(struct kmem_cache *s, struct page *page, check_pad_bytes(s, page, p); } - if (!s->offset && val == SLUB_RED_ACTIVE) + if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE) /* * Object and freepointer overlap. Cannot check * freepointer while object is allocated. @@ -3587,6 +3597,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * * This is the case if we do RCU, have a constructor or * destructor or are poisoning the objects. + * + * The assumption that s->offset >= s->inuse means free + * pointer is outside of the object is used in the + * freeptr_outside_object() function. If that is no + * longer true, the function needs to be modified. */ s->offset = size; size += sizeof(void *); From 28307d938fb2e4056ed4c982c06d1503d7719813 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 7 May 2020 18:36:10 -0700 Subject: [PATCH 281/300] percpu: make pcpu_alloc() aware of current gfp context Since 5.7-rc1, on btrfs we have a percpu counter initialization for which we always pass a GFP_KERNEL gfp_t argument (this happens since commit 2992df73268f78 ("btrfs: Implement DREW lock")). That is safe in some contextes but not on others where allowing fs reclaim could lead to a deadlock because we are either holding some btrfs lock needed for a transaction commit or holding a btrfs transaction handle open. Because of that we surround the call to the function that initializes the percpu counter with a NOFS context using memalloc_nofs_save() (this is done at btrfs_init_fs_root()). However it turns out that this is not enough to prevent a possible deadlock because percpu_alloc() determines if it is in an atomic context by looking exclusively at the gfp flags passed to it (GFP_KERNEL in this case) and it is not aware that a NOFS context is set. Because percpu_alloc() thinks it is in a non atomic context it locks the pcpu_alloc_mutex. This can result in a btrfs deadlock when pcpu_balance_workfn() is running, has acquired that mutex and is waiting for reclaim, while the btrfs task that called percpu_counter_init() (and therefore percpu_alloc()) is holding either the btrfs commit_root semaphore or a transaction handle (done fs/btrfs/backref.c: iterate_extent_inodes()), which prevents reclaim from finishing as an attempt to commit the current btrfs transaction will deadlock. Lockdep reports this issue with the following trace: ====================================================== WARNING: possible circular locking dependency detected 5.6.0-rc7-btrfs-next-77 #1 Not tainted ------------------------------------------------------ kswapd0/91 is trying to acquire lock: ffff8938a3b3fdc8 (&delayed_node->mutex){+.+.}, at: __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs] but task is already holding lock: ffffffffb4f0dbc0 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #4 (fs_reclaim){+.+.}: fs_reclaim_acquire.part.0+0x25/0x30 __kmalloc+0x5f/0x3a0 pcpu_create_chunk+0x19/0x230 pcpu_balance_workfn+0x56a/0x680 process_one_work+0x235/0x5f0 worker_thread+0x50/0x3b0 kthread+0x120/0x140 ret_from_fork+0x3a/0x50 -> #3 (pcpu_alloc_mutex){+.+.}: __mutex_lock+0xa9/0xaf0 pcpu_alloc+0x480/0x7c0 __percpu_counter_init+0x50/0xd0 btrfs_drew_lock_init+0x22/0x70 [btrfs] btrfs_get_fs_root+0x29c/0x5c0 [btrfs] resolve_indirect_refs+0x120/0xa30 [btrfs] find_parent_nodes+0x50b/0xf30 [btrfs] btrfs_find_all_leafs+0x60/0xb0 [btrfs] iterate_extent_inodes+0x139/0x2f0 [btrfs] iterate_inodes_from_logical+0xa1/0xe0 [btrfs] btrfs_ioctl_logical_to_ino+0xb4/0x190 [btrfs] btrfs_ioctl+0x165a/0x3130 [btrfs] ksys_ioctl+0x87/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5c/0x260 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #2 (&fs_info->commit_root_sem){++++}: down_write+0x38/0x70 btrfs_cache_block_group+0x2ec/0x500 [btrfs] find_free_extent+0xc6a/0x1600 [btrfs] btrfs_reserve_extent+0x9b/0x180 [btrfs] btrfs_alloc_tree_block+0xc1/0x350 [btrfs] alloc_tree_block_no_bg_flush+0x4a/0x60 [btrfs] __btrfs_cow_block+0x122/0x5a0 [btrfs] btrfs_cow_block+0x106/0x240 [btrfs] commit_cowonly_roots+0x55/0x310 [btrfs] btrfs_commit_transaction+0x509/0xb20 [btrfs] sync_filesystem+0x74/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x93/0xc0 exit_to_usermode_loop+0xf9/0x100 do_syscall_64+0x20d/0x260 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #1 (&space_info->groups_sem){++++}: down_read+0x3c/0x140 find_free_extent+0xef6/0x1600 [btrfs] btrfs_reserve_extent+0x9b/0x180 [btrfs] btrfs_alloc_tree_block+0xc1/0x350 [btrfs] alloc_tree_block_no_bg_flush+0x4a/0x60 [btrfs] __btrfs_cow_block+0x122/0x5a0 [btrfs] btrfs_cow_block+0x106/0x240 [btrfs] btrfs_search_slot+0x50c/0xd60 [btrfs] btrfs_lookup_inode+0x3a/0xc0 [btrfs] __btrfs_update_delayed_inode+0x90/0x280 [btrfs] __btrfs_commit_inode_delayed_items+0x81f/0x870 [btrfs] __btrfs_run_delayed_items+0x8e/0x180 [btrfs] btrfs_commit_transaction+0x31b/0xb20 [btrfs] iterate_supers+0x87/0xf0 ksys_sync+0x60/0xb0 __ia32_sys_sync+0xa/0x10 do_syscall_64+0x5c/0x260 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (&delayed_node->mutex){+.+.}: __lock_acquire+0xef0/0x1c80 lock_acquire+0xa2/0x1d0 __mutex_lock+0xa9/0xaf0 __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs] btrfs_evict_inode+0x40d/0x560 [btrfs] evict+0xd9/0x1c0 dispose_list+0x48/0x70 prune_icache_sb+0x54/0x80 super_cache_scan+0x124/0x1a0 do_shrink_slab+0x176/0x440 shrink_slab+0x23a/0x2c0 shrink_node+0x188/0x6e0 balance_pgdat+0x31d/0x7f0 kswapd+0x238/0x550 kthread+0x120/0x140 ret_from_fork+0x3a/0x50 other info that might help us debug this: Chain exists of: &delayed_node->mutex --> pcpu_alloc_mutex --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(pcpu_alloc_mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/91: #0: (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30 #1: (shrinker_rwsem){++++}, at: shrink_slab+0x12f/0x2c0 #2: (&type->s_umount_key#43){++++}, at: trylock_super+0x16/0x50 stack backtrace: CPU: 1 PID: 91 Comm: kswapd0 Not tainted 5.6.0-rc7-btrfs-next-77 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8f/0xd0 check_noncircular+0x170/0x190 __lock_acquire+0xef0/0x1c80 lock_acquire+0xa2/0x1d0 __mutex_lock+0xa9/0xaf0 __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs] btrfs_evict_inode+0x40d/0x560 [btrfs] evict+0xd9/0x1c0 dispose_list+0x48/0x70 prune_icache_sb+0x54/0x80 super_cache_scan+0x124/0x1a0 do_shrink_slab+0x176/0x440 shrink_slab+0x23a/0x2c0 shrink_node+0x188/0x6e0 balance_pgdat+0x31d/0x7f0 kswapd+0x238/0x550 kthread+0x120/0x140 ret_from_fork+0x3a/0x50 This could be fixed by making btrfs pass GFP_NOFS instead of GFP_KERNEL to percpu_counter_init() in contextes where it is not reclaim safe, however that type of approach is discouraged since memalloc_[nofs|noio]_save() were introduced. Therefore this change makes pcpu_alloc() look up into an existing nofs/noio context before deciding whether it is in an atomic context or not. Signed-off-by: Filipe Manana Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Acked-by: Tejun Heo Acked-by: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Link: http://lkml.kernel.org/r/20200430164356.15543-1-fdmanana@kernel.org Signed-off-by: Linus Torvalds --- mm/percpu.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index d7e3bc649f4e..7da7d7737dab 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -80,6 +80,7 @@ #include #include #include +#include #include #include @@ -1557,10 +1558,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, gfp_t gfp) { - /* whitelisted flags that can be passed to the backing allocators */ - gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); - bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; - bool do_warn = !(gfp & __GFP_NOWARN); + gfp_t pcpu_gfp; + bool is_atomic; + bool do_warn; static int warn_limit = 10; struct pcpu_chunk *chunk, *next; const char *err; @@ -1569,6 +1569,12 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, void __percpu *ptr; size_t bits, bit_align; + gfp = current_gfp_context(gfp); + /* whitelisted flags that can be passed to the backing allocators */ + pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); + is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; + do_warn = !(gfp & __GFP_NOWARN); + /* * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, * therefore alignment must be a minimum of that many bytes. From 474328c06e3ee75bb6b92826fec90fdc8ef3c573 Mon Sep 17 00:00:00 2001 From: Roman Penyaev Date: Thu, 7 May 2020 18:36:13 -0700 Subject: [PATCH 282/300] kselftests: introduce new epoll60 testcase for catching lost wakeups This test case catches lost wake up introduced by commit 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll") The test is simple: we have 10 threads and 10 event fds. Each thread can harvest only 1 event. 1 producer fires all 10 events at once and waits that all 10 events will be observed by 10 threads. In case of lost wakeup epoll_wait() will timeout and 0 will be returned. Test case catches two sort of problems: forgotten wakeup on event, which hits the ->ovflist list, this problem was fixed by: 5a2513239750 ("eventpoll: fix missing wakeup for ovflist in ep_poll_callback") the other problem is when several sequential events hit the same waiting thread, thus other waiters get no wakeups. Problem is fixed in the following patch. Signed-off-by: Roman Penyaev Signed-off-by: Andrew Morton Cc: Khazhismel Kumykov Cc: Alexander Viro Cc: Heiher Cc: Jason Baron Link: http://lkml.kernel.org/r/20200430130326.1368509-1-rpenyaev@suse.de Signed-off-by: Linus Torvalds --- .../filesystems/epoll/epoll_wakeup_test.c | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c index 11eee0b60040..d979ff14775a 100644 --- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c +++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c @@ -3,6 +3,7 @@ #define _GNU_SOURCE #include #include +#include #include #include #include @@ -3136,4 +3137,149 @@ TEST(epoll59) close(ctx.sfd[0]); } +enum { + EPOLL60_EVENTS_NR = 10, +}; + +struct epoll60_ctx { + volatile int stopped; + int ready; + int waiters; + int epfd; + int evfd[EPOLL60_EVENTS_NR]; +}; + +static void *epoll60_wait_thread(void *ctx_) +{ + struct epoll60_ctx *ctx = ctx_; + struct epoll_event e; + sigset_t sigmask; + uint64_t v; + int ret; + + /* Block SIGUSR1 */ + sigemptyset(&sigmask); + sigaddset(&sigmask, SIGUSR1); + sigprocmask(SIG_SETMASK, &sigmask, NULL); + + /* Prepare empty mask for epoll_pwait() */ + sigemptyset(&sigmask); + + while (!ctx->stopped) { + /* Mark we are ready */ + __atomic_fetch_add(&ctx->ready, 1, __ATOMIC_ACQUIRE); + + /* Start when all are ready */ + while (__atomic_load_n(&ctx->ready, __ATOMIC_ACQUIRE) && + !ctx->stopped); + + /* Account this waiter */ + __atomic_fetch_add(&ctx->waiters, 1, __ATOMIC_ACQUIRE); + + ret = epoll_pwait(ctx->epfd, &e, 1, 2000, &sigmask); + if (ret != 1) { + /* We expect only signal delivery on stop */ + assert(ret < 0 && errno == EINTR && "Lost wakeup!\n"); + assert(ctx->stopped); + break; + } + + ret = read(e.data.fd, &v, sizeof(v)); + /* Since we are on ET mode, thus each thread gets its own fd. */ + assert(ret == sizeof(v)); + + __atomic_fetch_sub(&ctx->waiters, 1, __ATOMIC_RELEASE); + } + + return NULL; +} + +static inline unsigned long long msecs(void) +{ + struct timespec ts; + unsigned long long msecs; + + clock_gettime(CLOCK_REALTIME, &ts); + msecs = ts.tv_sec * 1000ull; + msecs += ts.tv_nsec / 1000000ull; + + return msecs; +} + +static inline int count_waiters(struct epoll60_ctx *ctx) +{ + return __atomic_load_n(&ctx->waiters, __ATOMIC_ACQUIRE); +} + +TEST(epoll60) +{ + struct epoll60_ctx ctx = { 0 }; + pthread_t waiters[ARRAY_SIZE(ctx.evfd)]; + struct epoll_event e; + int i, n, ret; + + signal(SIGUSR1, signal_handler); + + ctx.epfd = epoll_create1(0); + ASSERT_GE(ctx.epfd, 0); + + /* Create event fds */ + for (i = 0; i < ARRAY_SIZE(ctx.evfd); i++) { + ctx.evfd[i] = eventfd(0, EFD_NONBLOCK); + ASSERT_GE(ctx.evfd[i], 0); + + e.events = EPOLLIN | EPOLLET; + e.data.fd = ctx.evfd[i]; + ASSERT_EQ(epoll_ctl(ctx.epfd, EPOLL_CTL_ADD, ctx.evfd[i], &e), 0); + } + + /* Create waiter threads */ + for (i = 0; i < ARRAY_SIZE(waiters); i++) + ASSERT_EQ(pthread_create(&waiters[i], NULL, + epoll60_wait_thread, &ctx), 0); + + for (i = 0; i < 300; i++) { + uint64_t v = 1, ms; + + /* Wait for all to be ready */ + while (__atomic_load_n(&ctx.ready, __ATOMIC_ACQUIRE) != + ARRAY_SIZE(ctx.evfd)) + ; + + /* Steady, go */ + __atomic_fetch_sub(&ctx.ready, ARRAY_SIZE(ctx.evfd), + __ATOMIC_ACQUIRE); + + /* Wait all have gone to kernel */ + while (count_waiters(&ctx) != ARRAY_SIZE(ctx.evfd)) + ; + + /* 1ms should be enough to schedule away */ + usleep(1000); + + /* Quickly signal all handles at once */ + for (n = 0; n < ARRAY_SIZE(ctx.evfd); n++) { + ret = write(ctx.evfd[n], &v, sizeof(v)); + ASSERT_EQ(ret, sizeof(v)); + } + + /* Busy loop for 1s and wait for all waiters to wake up */ + ms = msecs(); + while (count_waiters(&ctx) && msecs() < ms + 1000) + ; + + ASSERT_EQ(count_waiters(&ctx), 0); + } + ctx.stopped = 1; + /* Stop waiters */ + for (i = 0; i < ARRAY_SIZE(waiters); i++) + ret = pthread_kill(waiters[i], SIGUSR1); + for (i = 0; i < ARRAY_SIZE(waiters); i++) + pthread_join(waiters[i], NULL); + + for (i = 0; i < ARRAY_SIZE(waiters); i++) + close(ctx.evfd[i]); + close(ctx.epfd); +} + TEST_HARNESS_MAIN From 412895f03cbf9633298111cb4dfde13b7720e2c5 Mon Sep 17 00:00:00 2001 From: Roman Penyaev Date: Thu, 7 May 2020 18:36:16 -0700 Subject: [PATCH 283/300] epoll: atomically remove wait entry on wake up This patch does two things: - fixes a lost wakeup introduced by commit 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll") - improves performance for events delivery. The description of the problem is the following: if N (>1) threads are waiting on ep->wq for new events and M (>1) events come, it is quite likely that >1 wakeups hit the same wait queue entry, because there is quite a big window between __add_wait_queue_exclusive() and the following __remove_wait_queue() calls in ep_poll() function. This can lead to lost wakeups, because thread, which was woken up, can handle not all the events in ->rdllist. (in better words the problem is described here: https://lkml.org/lkml/2019/10/7/905) The idea of the current patch is to use init_wait() instead of init_waitqueue_entry(). Internally init_wait() sets autoremove_wake_function as a callback, which removes the wait entry atomically (under the wq locks) from the list, thus the next coming wakeup hits the next wait entry in the wait queue, thus preventing lost wakeups. Problem is very well reproduced by the epoll60 test case [1]. Wait entry removal on wakeup has also performance benefits, because there is no need to take a ep->lock and remove wait entry from the queue after the successful wakeup. Here is the timing output of the epoll60 test case: With explicit wakeup from ep_scan_ready_list() (the state of the code prior 339ddb53d373): real 0m6.970s user 0m49.786s sys 0m0.113s After this patch: real 0m5.220s user 0m36.879s sys 0m0.019s The other testcase is the stress-epoll [2], where one thread consumes all the events and other threads produce many events: With explicit wakeup from ep_scan_ready_list() (the state of the code prior 339ddb53d373): threads events/ms run-time ms 8 5427 1474 16 6163 2596 32 6824 4689 64 7060 9064 128 6991 18309 After this patch: threads events/ms run-time ms 8 5598 1429 16 7073 2262 32 7502 4265 64 7640 8376 128 7634 16767 (number of "events/ms" represents event bandwidth, thus higher is better; number of "run-time ms" represents overall time spent doing the benchmark, thus lower is better) [1] tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c [2] https://github.com/rouming/test-tools/blob/master/stress-epoll.c Signed-off-by: Roman Penyaev Signed-off-by: Andrew Morton Reviewed-by: Jason Baron Cc: Khazhismel Kumykov Cc: Alexander Viro Cc: Heiher Cc: Link: http://lkml.kernel.org/r/20200430130326.1368509-2-rpenyaev@suse.de Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index d6ba0e52439b..aba03ee749f8 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1822,7 +1822,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, { int res = 0, eavail, timed_out = 0; u64 slack = 0; - bool waiter = false; wait_queue_entry_t wait; ktime_t expires, *to = NULL; @@ -1867,21 +1866,23 @@ fetch_events: */ ep_reset_busy_poll_napi_id(ep); - /* - * We don't have any available event to return to the caller. We need - * to sleep here, and we will be woken by ep_poll_callback() when events - * become available. - */ - if (!waiter) { - waiter = true; - init_waitqueue_entry(&wait, current); - + do { + /* + * Internally init_wait() uses autoremove_wake_function(), + * thus wait entry is removed from the wait queue on each + * wakeup. Why it is important? In case of several waiters + * each new wakeup will hit the next waiter, giving it the + * chance to harvest new event. Otherwise wakeup can be + * lost. This is also good performance-wise, because on + * normal wakeup path no need to call __remove_wait_queue() + * explicitly, thus ep->lock is not taken, which halts the + * event delivery. + */ + init_wait(&wait); write_lock_irq(&ep->lock); __add_wait_queue_exclusive(&ep->wq, &wait); write_unlock_irq(&ep->lock); - } - for (;;) { /* * We don't want to sleep if the ep_poll_callback() sends us * a wakeup in between. That's why we set the task state @@ -1911,10 +1912,20 @@ fetch_events: timed_out = 1; break; } - } + + /* We were woken up, thus go and try to harvest some events */ + eavail = 1; + + } while (0); __set_current_state(TASK_RUNNING); + if (!list_empty_careful(&wait.entry)) { + write_lock_irq(&ep->lock); + __remove_wait_queue(&ep->wq, &wait); + write_unlock_irq(&ep->lock); + } + send_events: /* * Try to transfer events to user space. In case we get 0 events and @@ -1925,12 +1936,6 @@ send_events: !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events; - if (waiter) { - write_lock_irq(&ep->lock); - __remove_wait_queue(&ep->wq, &wait); - write_unlock_irq(&ep->lock); - } - return res; } From 17e34526f0a8f81a214d1ee6f7d8ad2a9c9bae33 Mon Sep 17 00:00:00 2001 From: Qiwu Chen Date: Thu, 7 May 2020 18:36:20 -0700 Subject: [PATCH 284/300] mm/vmscan: remove unnecessary argument description of isolate_lru_pages() Since commit a9e7c39fa9fd9 ("mm/vmscan.c: remove 7th argument of isolate_lru_pages()"), the explanation of 'mode' argument has been unnecessary. Let's remove it. Signed-off-by: Qiwu Chen Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/20200501090346.2894-1-chenqiwu@xiaomi.com Signed-off-by: Linus Torvalds --- mm/vmscan.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index b06868fc4926..a37c87b5aee2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1625,7 +1625,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. * @sc: The scan_control struct for this reclaim session - * @mode: One of the LRU isolation modes * @lru: LRU list id for isolating * * returns how many pages were moved onto *@dst. From 8d58f222e85f01da0c0e1fc1e77986c86de889e2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 7 May 2020 18:36:23 -0700 Subject: [PATCH 285/300] ubsan: disable UBSAN_ALIGNMENT under COMPILE_TEST The documentation for UBSAN_ALIGNMENT already mentions that it should not be used on all*config builds (and for efficient-unaligned-access architectures), so just refactor the Kconfig to correctly implement this so randconfigs will stop creating insane images that freak out objtool under CONFIG_UBSAN_TRAP (due to the false positives producing functions that never return, etc). Link: http://lkml.kernel.org/r/202005011433.C42EA3E2D@keescook Fixes: 0887a7ebc977 ("ubsan: add trap instrumentation option") Signed-off-by: Kees Cook Reported-by: Randy Dunlap Link: https://lore.kernel.org/linux-next/202004231224.D6B3B650@keescook/ Signed-off-by: Andrew Morton Cc: Josh Poimboeuf Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: Andrey Ryabinin Signed-off-by: Linus Torvalds --- lib/Kconfig.ubsan | 17 +++++++---------- .../selftests/wireguard/qemu/debug.config | 1 - 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 48469c95d78e..929211039bac 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -60,17 +60,14 @@ config UBSAN_SANITIZE_ALL Enabling this option will get kernel image size increased significantly. -config UBSAN_NO_ALIGNMENT - bool "Disable checking of pointers alignment" - default y if HAVE_EFFICIENT_UNALIGNED_ACCESS - help - This option disables the check of unaligned memory accesses. - This option should be used when building allmodconfig. - Disabling this option on architectures that support unaligned - accesses may produce a lot of false positives. - config UBSAN_ALIGNMENT - def_bool !UBSAN_NO_ALIGNMENT + bool "Enable checks for pointers alignment" + default !HAVE_EFFICIENT_UNALIGNED_ACCESS + depends on !X86 || !COMPILE_TEST + help + This option enables the check of unaligned memory accesses. + Enabling this option on architectures that support unaligned + accesses may produce a lot of false positives. config TEST_UBSAN tristate "Module for testing for undefined behavior detection" diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index 5909e7ef2a5c..9803dbb54181 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -25,7 +25,6 @@ CONFIG_KASAN=y CONFIG_KASAN_INLINE=y CONFIG_UBSAN=y CONFIG_UBSAN_SANITIZE_ALL=y -CONFIG_UBSAN_NO_ALIGNMENT=y CONFIG_UBSAN_NULL=y CONFIG_DEBUG_KMEMLEAK=y CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE=8192 From 14f69140ff9c92a0928547ceefb153a842e8492c Mon Sep 17 00:00:00 2001 From: Henry Willard Date: Thu, 7 May 2020 18:36:27 -0700 Subject: [PATCH 286/300] mm: limit boost_watermark on small zones Commit 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs") adds a boost_watermark() function which increases the min watermark in a zone by at least pageblock_nr_pages or the number of pages in a page block. On Arm64, with 64K pages and 512M huge pages, this is 8192 pages or 512M. It does this regardless of the number of managed pages managed in the zone or the likelihood of success. This can put the zone immediately under water in terms of allocating pages from the zone, and can cause a small machine to fail immediately due to OoM. Unlike set_recommended_min_free_kbytes(), which substantially increases min_free_kbytes and is tied to THP, boost_watermark() can be called even if THP is not active. The problem is most likely to appear on architectures such as Arm64 where pageblock_nr_pages is very large. It is desirable to run the kdump capture kernel in as small a space as possible to avoid wasting memory. In some architectures, such as Arm64, there are restrictions on where the capture kernel can run, and therefore, the space available. A capture kernel running in 768M can fail due to OoM immediately after boost_watermark() sets the min in zone DMA32, where most of the memory is, to 512M. It fails even though there is over 500M of free memory. With boost_watermark() suppressed, the capture kernel can run successfully in 448M. This patch limits boost_watermark() to boosting a zone's min watermark only when there are enough pages that the boost will produce positive results. In this case that is estimated to be four times as many pages as pageblock_nr_pages. Mel said: : There is no harm in marking it stable. Clearly it does not happen very : often but it's not impossible. 32-bit x86 is a lot less common now : which would previously have been vulnerable to triggering this easily. : ppc64 has a larger base page size but typically only has one zone. : arm64 is likely the most vulnerable, particularly when CMA is : configured with a small movable zone. Fixes: 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs") Signed-off-by: Henry Willard Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: Link: http://lkml.kernel.org/r/1588294148-6586-1-git-send-email-henry.willard@oracle.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f31eda080823..13cc653122b7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2401,6 +2401,14 @@ static inline void boost_watermark(struct zone *zone) if (!watermark_boost_factor) return; + /* + * Don't bother in zones that are unlikely to produce results. + * On small machines, including kdump capture kernels running + * in a small area, boosting the watermark can cause an out of + * memory situation immediately. + */ + if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) + return; max_boost = mult_frac(zone->_watermark[WMARK_HIGH], watermark_boost_factor, 10000); From fb3637a113349f53830f7d6ca45891b7192cd28f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 5 May 2020 20:47:47 +0200 Subject: [PATCH 287/300] iommu/virtio: Reverse arguments to list_add Elsewhere in the file, there is a list_for_each_entry with &vdev->resv_regions as the second argument, suggesting that &vdev->resv_regions is the list head. So exchange the arguments on the list_add call to put the list head in the second argument. Fixes: 2a5a31487445 ("iommu/virtio: Add probe request") Signed-off-by: Julia Lawall Reviewed-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/1588704467-13431-1-git-send-email-Julia.Lawall@inria.fr Signed-off-by: Joerg Roedel --- drivers/iommu/virtio-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index d5cac4f46ca5..4e1d11af23c8 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -453,7 +453,7 @@ static int viommu_add_resv_mem(struct viommu_endpoint *vdev, if (!region) return -ENOMEM; - list_add(&vdev->resv_regions, ®ion->list); + list_add(®ion->list, &vdev->resv_regions); return 0; } From 12ae44a40a1be891bdc6463f8c7072b4ede746ef Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 5 May 2020 13:59:02 +0100 Subject: [PATCH 288/300] ceph: demote quotarealm lookup warning to a debug message A misconfigured cephx can easily result in having the kernel client flooding the logs with: ceph: Can't lookup inode 1 (err: -13) Change this message to debug level. Cc: stable@vger.kernel.org URL: https://tracker.ceph.com/issues/44546 Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index de56dee60540..19507e2fdb57 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -159,8 +159,8 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc, } if (IS_ERR(in)) { - pr_warn("Can't lookup inode %llx (err: %ld)\n", - realm->ino, PTR_ERR(in)); + dout("Can't lookup inode %llx (err: %ld)\n", + realm->ino, PTR_ERR(in)); qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */ } else { qri->timeout = 0; From 78a5255ffb6a1af189a83e493d916ba1c54d8c75 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 9 May 2020 13:57:10 -0700 Subject: [PATCH 289/300] Stop the ad-hoc games with -Wno-maybe-initialized We have some rather random rules about when we accept the "maybe-initialized" warnings, and when we don't. For example, we consider it unreliable for gcc versions < 4.9, but also if -O3 is enabled, or if optimizing for size. And then various kernel config options disabled it, because they know that they trigger that warning by confusing gcc sufficiently (ie PROFILE_ALL_BRANCHES). And now gcc-10 seems to be introducing a lot of those warnings too, so it falls under the same heading as 4.9 did. At the same time, we have a very straightforward way to _enable_ that warning when wanted: use "W=2" to enable more warnings. So stop playing these ad-hoc games, and just disable that warning by default, with the known and straight-forward "if you want to work on the extra compiler warnings, use W=123". Would it be great to have code that is always so obvious that it never confuses the compiler whether a variable is used initialized or not? Yes, it would. In a perfect world, the compilers would be smarter, and our source code would be simpler. That's currently not the world we live in, though. Signed-off-by: Linus Torvalds --- Makefile | 7 +++---- init/Kconfig | 18 ------------------ kernel/trace/Kconfig | 1 - 3 files changed, 3 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index 3512f7b243dc..1e94f4f15d92 100644 --- a/Makefile +++ b/Makefile @@ -729,10 +729,6 @@ else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os endif -ifdef CONFIG_CC_DISABLE_WARN_MAYBE_UNINITIALIZED -KBUILD_CFLAGS += -Wno-maybe-uninitialized -endif - # Tell gcc to never replace conditional load with a non-conditional one KBUILD_CFLAGS += $(call cc-option,--param=allow-store-data-races=0) KBUILD_CFLAGS += $(call cc-option,-fno-allow-store-data-races) @@ -881,6 +877,9 @@ KBUILD_CFLAGS += -Wno-pointer-sign # disable stringop warnings in gcc 8+ KBUILD_CFLAGS += $(call cc-disable-warning, stringop-truncation) +# Enabled with W=2, disabled by default as noisy +KBUILD_CFLAGS += $(call cc-disable-warning, maybe-uninitialized) + # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow) diff --git a/init/Kconfig b/init/Kconfig index 9e22ee8fbd75..9278a603d399 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -39,22 +39,6 @@ config TOOLS_SUPPORT_RELR config CC_HAS_ASM_INLINE def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null) -config CC_HAS_WARN_MAYBE_UNINITIALIZED - def_bool $(cc-option,-Wmaybe-uninitialized) - help - GCC >= 4.7 supports this option. - -config CC_DISABLE_WARN_MAYBE_UNINITIALIZED - bool - depends on CC_HAS_WARN_MAYBE_UNINITIALIZED - default CC_IS_GCC && GCC_VERSION < 40900 # unreliable for GCC < 4.9 - help - GCC's -Wmaybe-uninitialized is not reliable by definition. - Lots of false positive warnings are produced in some cases. - - If this option is enabled, -Wno-maybe-uninitialzed is passed - to the compiler to suppress maybe-uninitialized warnings. - config CONSTRUCTORS bool depends on !UML @@ -1257,14 +1241,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE config CC_OPTIMIZE_FOR_PERFORMANCE_O3 bool "Optimize more for performance (-O3)" depends on ARC - imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives help Choosing this option will pass "-O3" to your compiler to optimize the kernel yet more for performance. config CC_OPTIMIZE_FOR_SIZE bool "Optimize for size (-Os)" - imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives help Choosing this option will pass "-Os" to your compiler resulting in a smaller kernel. diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 402eef84c859..743647005f64 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -466,7 +466,6 @@ config PROFILE_ANNOTATED_BRANCHES config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING - imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. From 5c45de21a2223fe46cf9488c99a7fbcf01527670 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 9 May 2020 14:30:29 -0700 Subject: [PATCH 290/300] gcc-10: disable 'zero-length-bounds' warning for now This is a fine warning, but we still have a number of zero-length arrays in the kernel that come from the traditional gcc extension. Yes, they are getting converted to flexible arrays, but in the meantime the gcc-10 warning about zero-length bounds is very verbose, and is hiding other issues. I missed one actual build failure because it was hidden among hundreds of lines of warning. Thankfully I caught it on the second go before pushing things out, but it convinced me that I really need to disable the new warnings for now. We'll hopefully be all done with our conversion to flexible arrays in the not too distant future, and we can then re-enable this warning. Signed-off-by: Linus Torvalds --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index 1e94f4f15d92..68a781326dd7 100644 --- a/Makefile +++ b/Makefile @@ -877,6 +877,9 @@ KBUILD_CFLAGS += -Wno-pointer-sign # disable stringop warnings in gcc 8+ KBUILD_CFLAGS += $(call cc-disable-warning, stringop-truncation) +# We'll want to enable this eventually, but it's not going away for 5.7 at least +KBUILD_CFLAGS += $(call cc-disable-warning, zero-length-bounds) + # Enabled with W=2, disabled by default as noisy KBUILD_CFLAGS += $(call cc-disable-warning, maybe-uninitialized) From 44720996e2d79e47d508b0abe99b931a726a3197 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 9 May 2020 14:52:44 -0700 Subject: [PATCH 291/300] gcc-10: disable 'array-bounds' warning for now This is another fine warning, related to the 'zero-length-bounds' one, but hitting the same historical code in the kernel. Because C didn't historically support flexible array members, we have code that instead uses a one-sized array, the same way we have cases of zero-sized arrays. The one-sized arrays come from either not wanting to use the gcc zero-sized array extension, or from a slight convenience-feature, where particularly for strings, the size of the structure now includes the allocation for the final NUL character. So with a "char name[1];" at the end of a structure, you can do things like v = my_malloc(sizeof(struct vendor) + strlen(name)); and avoid the "+1" for the terminator. Yes, the modern way to do that is with a flexible array, and using 'offsetof()' instead of 'sizeof()', and adding the "+1" by hand. That also technically gets the size "more correct" in that it avoids any alignment (and thus padding) issues, but this is another long-term cleanup thing that will not happen for 5.7. So disable the warning for now, even though it's potentially quite useful. Having a slew of warnings that then hide more urgent new issues is not an improvement. Signed-off-by: Linus Torvalds --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 68a781326dd7..b4c88f0eb6fe 100644 --- a/Makefile +++ b/Makefile @@ -879,6 +879,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning, stringop-truncation) # We'll want to enable this eventually, but it's not going away for 5.7 at least KBUILD_CFLAGS += $(call cc-disable-warning, zero-length-bounds) +KBUILD_CFLAGS += $(call cc-disable-warning, array-bounds) # Enabled with W=2, disabled by default as noisy KBUILD_CFLAGS += $(call cc-disable-warning, maybe-uninitialized) From d51cfc53ade3189455a1b88ec7a2ff0c24597cf8 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Mon, 4 May 2020 14:47:55 +0200 Subject: [PATCH 292/300] bdi: use bdi_dev_name() to get device name Use the common interface bdi_dev_name() to get device name. Signed-off-by: Yufen Yu Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Bart Van Assche Add missing include BFQ Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 6 ++++-- block/blk-cgroup.c | 2 +- fs/ceph/debugfs.c | 2 +- include/trace/events/wbt.h | 8 ++++---- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 78ba57efd16b..3d411716d7ee 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -123,6 +123,7 @@ #include #include #include +#include #include "blk.h" #include "blk-mq.h" @@ -4976,8 +4977,9 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); switch (ioprio_class) { default: - dev_err(bfqq->bfqd->queue->backing_dev_info->dev, - "bfq: bad prio class %d\n", ioprio_class); + pr_err("bdi %s: bfq: bad prio class %d\n", + bdi_dev_name(bfqq->bfqd->queue->backing_dev_info), + ioprio_class); /* fall through */ case IOPRIO_CLASS_NONE: /* diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c5dc833212e1..930212c1a512 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -496,7 +496,7 @@ const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info->dev) - return dev_name(blkg->q->backing_dev_info->dev); + return bdi_dev_name(blkg->q->backing_dev_info); return NULL; } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 481ac97b4d25..dcaed75de9e6 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -271,7 +271,7 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) &congestion_kb_fops); snprintf(name, sizeof(name), "../../bdi/%s", - dev_name(fsc->sb->s_bdi->dev)); + bdi_dev_name(fsc->sb->s_bdi)); fsc->debugfs_bdi = debugfs_create_symlink("bdi", fsc->client->debugfs_dir, diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h index 784814160197..9c66e59d859c 100644 --- a/include/trace/events/wbt.h +++ b/include/trace/events/wbt.h @@ -33,7 +33,7 @@ TRACE_EVENT(wbt_stat, ), TP_fast_assign( - strlcpy(__entry->name, dev_name(bdi->dev), + strlcpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->rmean = stat[0].mean; __entry->rmin = stat[0].min; @@ -68,7 +68,7 @@ TRACE_EVENT(wbt_lat, ), TP_fast_assign( - strlcpy(__entry->name, dev_name(bdi->dev), + strlcpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->lat = div_u64(lat, 1000); ), @@ -105,7 +105,7 @@ TRACE_EVENT(wbt_step, ), TP_fast_assign( - strlcpy(__entry->name, dev_name(bdi->dev), + strlcpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->msg = msg; __entry->step = step; @@ -141,7 +141,7 @@ TRACE_EVENT(wbt_timer, ), TP_fast_assign( - strlcpy(__entry->name, dev_name(bdi->dev), + strlcpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->status = status; __entry->step = step; From 6bd87eec23cbc9ed222bed0f5b5b02bf300e9a8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:47:56 +0200 Subject: [PATCH 293/300] bdi: add a ->dev_name field to struct backing_dev_info Cache a copy of the name for the life time of the backing_dev_info structure so that we can reference it even after unregistering. Fixes: 68f23b89067f ("memcg: fix a crash in wb_workfn when a device disappears") Reported-by: Yufen Yu Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + mm/backing-dev.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index ee577a83cfe6..7367150f962a 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -219,6 +219,7 @@ struct backing_dev_info { wait_queue_head_t wb_waitq; struct device *dev; + char dev_name[64]; struct device *owner; struct timer_list laptop_mode_wb_timer; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c2c44c89ee5d..efc5b83acd2d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -938,7 +938,8 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; - dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args); + vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); + dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); @@ -1047,7 +1048,7 @@ const char *bdi_dev_name(struct backing_dev_info *bdi) { if (!bdi || !bdi->dev) return bdi_unknown_name; - return dev_name(bdi->dev); + return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name); From a8de6639169b90e3dc4f27e752a3c5abac5e90da Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 May 2020 23:07:04 +0300 Subject: [PATCH 294/300] nvme-pci: fix "slimmer CQ head update" Pre-incrementing ->cq_head can't be done in memory because OOB value can be observed by another context. This devalues space savings compared to original code :-\ $ ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux add/remove: 0/0 grow/shrink: 0/4 up/down: 0/-32 (-32) Function old new delta nvme_poll_irqdisable 464 456 -8 nvme_poll 455 447 -8 nvme_irq 388 380 -8 nvme_dev_disable 955 947 -8 But the code is minimal now: one read for head, one read for q_depth, one increment, one comparison, single instruction phase bit update and one write for new head. Signed-off-by: Alexey Dobriyan Reported-by: John Garry Tested-by: John Garry Fixes: e2a366a4b0feaeb ("nvme-pci: slimmer CQ head update") Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4e79e412b276..e13c370de830 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -973,9 +973,13 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) { - if (++nvmeq->cq_head == nvmeq->q_depth) { + u16 tmp = nvmeq->cq_head + 1; + + if (tmp == nvmeq->q_depth) { nvmeq->cq_head = 0; nvmeq->cq_phase ^= 1; + } else { + nvmeq->cq_head = tmp; } } From 59c7c3caaaf8750df4ec3255082f15eb4e371514 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 6 May 2020 15:44:02 -0700 Subject: [PATCH 295/300] nvme: fix possible hang when ns scanning fails during error recovery When the controller is reconnecting, the host fails I/O and admin commands as the host cannot reach the controller. ns scanning may revalidate namespaces during that period and it is wrong to remove namespaces due to these failures as we may hang (see 205da2434301). One command that may fail is nvme_identify_ns_descs. Since we return success due to having ns identify descriptor list optional, we continue to compare ns identifiers in nvme_revalidate_disk, obviously fail and return -ENODEV to nvme_validate_ns, which will remove the namespace. Exactly what we don't want to happen. Fixes: 22802bf742c2 ("nvme: Namepace identification descriptor list is optional") Tested-by: Anton Eidelman Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f2adea96b04c..f3c037f5a9ba 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1110,7 +1110,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, * Don't treat an error as fatal, as we potentially already * have a NGUID or EUI-64. */ - if (status > 0) + if (status > 0 && !(status & NVME_SC_DNR)) status = 0; goto free_data; } From 5a76021c2eff7fcf2f0918a08fd8a37ce7922921 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 9 May 2020 15:40:52 -0700 Subject: [PATCH 296/300] gcc-10: disable 'stringop-overflow' warning for now This is the final array bounds warning removal for gcc-10 for now. Again, the warning is good, and we should re-enable all these warnings when we have converted all the legacy array declaration cases to flexible arrays. But in the meantime, it's just noise. Signed-off-by: Linus Torvalds --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index b4c88f0eb6fe..abdb355d3f46 100644 --- a/Makefile +++ b/Makefile @@ -880,6 +880,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning, stringop-truncation) # We'll want to enable this eventually, but it's not going away for 5.7 at least KBUILD_CFLAGS += $(call cc-disable-warning, zero-length-bounds) KBUILD_CFLAGS += $(call cc-disable-warning, array-bounds) +KBUILD_CFLAGS += $(call cc-disable-warning, stringop-overflow) # Enabled with W=2, disabled by default as noisy KBUILD_CFLAGS += $(call cc-disable-warning, maybe-uninitialized) From adc71920969870dfa54e8f40dac8616284832d02 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 9 May 2020 15:45:21 -0700 Subject: [PATCH 297/300] gcc-10: disable 'restrict' warning for now gcc-10 now warns about passing aliasing pointers to functions that take restricted pointers. That's actually a great warning, and if we ever start using 'restrict' in the kernel, it might be quite useful. But right now we don't, and it turns out that the only thing this warns about is an idiom where we have declared a few functions to be "printf-like" (which seems to make gcc pick up the restricted pointer thing), and then we print to the same buffer that we also use as an input. And people do that as an odd concatenation pattern, with code like this: #define sysfs_show_gen_prop(buffer, fmt, ...) \ snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) where we have 'buffer' as both the destination of the final result, and as the initial argument. Yes, it's a bit questionable. And outside of the kernel, people do have standard declarations like int snprintf( char *restrict buffer, size_t bufsz, const char *restrict format, ... ); where that output buffer is marked as a restrict pointer that cannot alias with any other arguments. But in the context of the kernel, that 'use snprintf() to concatenate to the end result' does work, and the pattern shows up in multiple places. And we have not marked our own version of snprintf() as taking restrict pointers, so the warning is incorrect for now, and gcc picks it up on its own. If we do start using 'restrict' in the kernel (and it might be a good idea if people find places where it matters), we'll need to figure out how to avoid this issue for snprintf and friends. But in the meantime, this warning is not useful. Signed-off-by: Linus Torvalds --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index abdb355d3f46..a45d8d09f354 100644 --- a/Makefile +++ b/Makefile @@ -882,6 +882,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, zero-length-bounds) KBUILD_CFLAGS += $(call cc-disable-warning, array-bounds) KBUILD_CFLAGS += $(call cc-disable-warning, stringop-overflow) +# Another good warning that we'll want to enable eventually +KBUILD_CFLAGS += $(call cc-disable-warning, restrict) + # Enabled with W=2, disabled by default as noisy KBUILD_CFLAGS += $(call cc-disable-warning, maybe-uninitialized) From 1a263ae60b04de959d9ce9caea4889385eefcc7b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 9 May 2020 15:58:04 -0700 Subject: [PATCH 298/300] gcc-10: avoid shadowing standard library 'free()' in crypto MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc-10 has started warning about conflicting types for a few new built-in functions, particularly 'free()'. This results in warnings like: crypto/xts.c:325:13: warning: conflicting types for built-in function ‘free’; expected ‘void(void *)’ [-Wbuiltin-declaration-mismatch] because the crypto layer had its local freeing functions called 'free()'. Gcc-10 is in the wrong here, since that function is marked 'static', and thus there is no chance of confusion with any standard library function namespace. But the simplest thing to do is to just use a different name here, and avoid this gcc mis-feature. [ Side note: gcc knowing about 'free()' is in itself not the mis-feature: the semantics of 'free()' are special enough that a compiler can validly do special things when seeing it. So the mis-feature here is that gcc thinks that 'free()' is some restricted name, and you can't shadow it as a local static function. Making the special 'free()' semantics be a function attribute rather than tied to the name would be the much better model ] Signed-off-by: Linus Torvalds --- crypto/lrw.c | 6 +++--- crypto/xts.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/crypto/lrw.c b/crypto/lrw.c index 376d7ed3f1f8..3c734b81b3a2 100644 --- a/crypto/lrw.c +++ b/crypto/lrw.c @@ -287,7 +287,7 @@ static void exit_tfm(struct crypto_skcipher *tfm) crypto_free_skcipher(ctx->child); } -static void free(struct skcipher_instance *inst) +static void free_inst(struct skcipher_instance *inst) { crypto_drop_skcipher(skcipher_instance_ctx(inst)); kfree(inst); @@ -400,12 +400,12 @@ static int create(struct crypto_template *tmpl, struct rtattr **tb) inst->alg.encrypt = encrypt; inst->alg.decrypt = decrypt; - inst->free = free; + inst->free = free_inst; err = skcipher_register_instance(tmpl, inst); if (err) { err_free_inst: - free(inst); + free_inst(inst); } return err; } diff --git a/crypto/xts.c b/crypto/xts.c index dbdd8af629e6..6d8cea94b3cf 100644 --- a/crypto/xts.c +++ b/crypto/xts.c @@ -322,7 +322,7 @@ static void exit_tfm(struct crypto_skcipher *tfm) crypto_free_cipher(ctx->tweak); } -static void free(struct skcipher_instance *inst) +static void free_inst(struct skcipher_instance *inst) { crypto_drop_skcipher(skcipher_instance_ctx(inst)); kfree(inst); @@ -434,12 +434,12 @@ static int create(struct crypto_template *tmpl, struct rtattr **tb) inst->alg.encrypt = encrypt; inst->alg.decrypt = decrypt; - inst->free = free; + inst->free = free_inst; err = skcipher_register_instance(tmpl, inst); if (err) { err_free_inst: - free(inst); + free_inst(inst); } return err; } From e99332e7b4cda6e60f5b5916cf9943a79dbef902 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 9 May 2020 17:50:03 -0700 Subject: [PATCH 299/300] gcc-10: mark more functions __init to avoid section mismatch warnings It seems that for whatever reason, gcc-10 ends up not inlining a couple of functions that used to be inlined before. Even if they only have one single callsite - it looks like gcc may have decided that the code was unlikely, and not worth inlining. The code generation difference is harmless, but caused a few new section mismatch errors, since the (now no longer inlined) function wasn't in the __init section, but called other init functions: Section mismatch in reference from the function kexec_free_initrd() to the function .init.text:free_initrd_mem() Section mismatch in reference from the function tpm2_calc_event_log_size() to the function .init.text:early_memremap() Section mismatch in reference from the function tpm2_calc_event_log_size() to the function .init.text:early_memunmap() So add the appropriate __init annotation to make modpost not complain. In both cases there were trivially just a single callsite from another __init function. Signed-off-by: Linus Torvalds --- drivers/firmware/efi/tpm.c | 2 +- init/initramfs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/tpm.c b/drivers/firmware/efi/tpm.c index 31f9f0e369b9..55b031d2c989 100644 --- a/drivers/firmware/efi/tpm.c +++ b/drivers/firmware/efi/tpm.c @@ -16,7 +16,7 @@ int efi_tpm_final_log_size; EXPORT_SYMBOL(efi_tpm_final_log_size); -static int tpm2_calc_event_log_size(void *data, int count, void *size_info) +static int __init tpm2_calc_event_log_size(void *data, int count, void *size_info) { struct tcg_pcr_event2_head *header; int event_size, size = 0; diff --git a/init/initramfs.c b/init/initramfs.c index 8ec1be4d7d51..7a38012e1af7 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -542,7 +542,7 @@ void __weak free_initrd_mem(unsigned long start, unsigned long end) } #ifdef CONFIG_KEXEC_CORE -static bool kexec_free_initrd(void) +static bool __init kexec_free_initrd(void) { unsigned long crashk_start = (unsigned long)__va(crashk_res.start); unsigned long crashk_end = (unsigned long)__va(crashk_res.end); From 2ef96a5bb12be62ef75b5828c0aab838ebb29cb8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 10 May 2020 15:16:58 -0700 Subject: [PATCH 300/300] Linux 5.7-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a45d8d09f354..11fe9b1535de 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 7 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Kleptomaniac Octopus # *DOCUMENTATION*