diff --git a/Documentation/ABI/testing/sysfs-platform-dptf b/Documentation/ABI/testing/sysfs-platform-dptf index eeed81ca6949..2cbc660d163b 100644 --- a/Documentation/ABI/testing/sysfs-platform-dptf +++ b/Documentation/ABI/testing/sysfs-platform-dptf @@ -92,3 +92,19 @@ Contact: linux-acpi@vger.kernel.org Description: (RO) The battery discharge current capability obtained from battery fuel gauge in milli Amps. + +What: /sys/bus/platform/devices/INTC1045:00/pch_fivr_switch_frequency/freq_mhz_low_clock +Date: November, 2020 +KernelVersion: v5.10 +Contact: linux-acpi@vger.kernel.org +Description: + (RW) The PCH FIVR (Fully Integrated Voltage Regulator) switching frequency in MHz, + when FIVR clock is 19.2MHz or 24MHz. + +What: /sys/bus/platform/devices/INTC1045:00/pch_fivr_switch_frequency/freq_mhz_high_clock +Date: November, 2020 +KernelVersion: v5.10 +Contact: linux-acpi@vger.kernel.org +Description: + (RW) The PCH FIVR (Fully Integrated Voltage Regulator) switching frequency in MHz, + when FIVR clock is 38.4MHz. diff --git a/Documentation/admin-guide/mm/numaperf.rst b/Documentation/admin-guide/mm/numaperf.rst index 4d69ef1de830..86f2a3c4b638 100644 --- a/Documentation/admin-guide/mm/numaperf.rst +++ b/Documentation/admin-guide/mm/numaperf.rst @@ -56,6 +56,11 @@ nodes' access characteristics share the same performance relative to other linked initiator nodes. Each target within an initiator's access class, though, do not necessarily perform the same as each other. +The access class "1" is used to allow differentiation between initiators +that are CPUs and hence suitable for generic task scheduling, and +IO initiators such as GPUs and NICs. Unlike access class 0, only +nodes containing CPUs are considered. + ================ NUMA Performance ================ @@ -88,6 +93,9 @@ The latency attributes are provided in nanoseconds. The values reported here correspond to the rated latency and bandwidth for the platform. 
+Access class 1 takes the same form but only includes values for CPU to +memory activity. + ========== NUMA Cache ========== diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index 6ebe163f9dfe..37940a0584ec 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -528,6 +528,10 @@ object corresponding to it, as follows: Total number of times the hardware has been asked by the given CPU to enter this idle state. +``rejected`` + Total number of times a request to enter this idle state on the given + CPU was rejected. + The :file:`desc` and :file:`name` files both contain strings. The difference between them is that the name is expected to be more concise, while the description may be longer and it may contain white space or special characters. @@ -572,6 +576,11 @@ particular case. For these reasons, the only reliable way to find out how much time has been spent by the hardware in different idle states supported by it is to use idle state residency counters in the hardware, if available. +Generally, an interrupt received when trying to enter an idle state causes the +idle state entry request to be rejected, in which case the ``CPUIdle`` driver +may return an error code to indicate that this was the case. The :file:`usage` +and :file:`rejected` files report the number of times the given idle state +was entered successfully or rejected, respectively. .. _cpu-pm-qos: diff --git a/Documentation/admin-guide/pnp.rst b/Documentation/admin-guide/pnp.rst index bab2d10631f0..3eda08191d13 100644 --- a/Documentation/admin-guide/pnp.rst +++ b/Documentation/admin-guide/pnp.rst @@ -281,10 +281,6 @@ ISAPNP drivers. They should serve as a temporary solution only. 
They are as follows:: - struct pnp_card *pnp_find_card(unsigned short vendor, - unsigned short device, - struct pnp_card *from) - struct pnp_dev *pnp_find_dev(struct pnp_card *card, unsigned short vendor, unsigned short function, diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.txt b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.txt index 33856947c561..9299028ee712 100644 --- a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.txt +++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.txt @@ -8,7 +8,7 @@ Properties: - compatible Usage: required Value type: <string> - Definition: must be "qcom,cpufreq-hw". + Definition: must be "qcom,cpufreq-hw" or "qcom,cpufreq-epss". - clocks Usage: required diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt index 9d16d417e9be..9847dfeeffcb 100644 --- a/Documentation/devicetree/bindings/opp/opp.txt +++ b/Documentation/devicetree/bindings/opp/opp.txt @@ -154,25 +154,27 @@ Optional properties: - opp-suspend: Marks the OPP to be used during device suspend. If multiple OPPs in the table have this, the OPP with highest opp-hz will be used. -- opp-supported-hw: This enables us to select only a subset of OPPs from the - larger OPP table, based on what version of the hardware we are running on. We - still can't have multiple nodes with the same opp-hz value in OPP table. +- opp-supported-hw: This property allows a platform to enable only a subset of + the OPPs from the larger set present in the OPP table, based on the current + version of the hardware (already known to the operating system). - It's a user defined array containing a hierarchy of hardware version numbers, - supported by the OPP. For example: a platform with hierarchy of three levels - of versions (A, B and C), this field should be like <X Y Z>, where X - corresponds to Version hierarchy A, Y corresponds to version hierarchy B and Z - corresponds to version hierarchy C. 
+ Each block present in the array of blocks in this property, represents a + sub-group of hardware versions supported by the OPP. i.e. <sub-group A>, + <sub-group B>, etc. The OPP will be enabled if _any_ of these sub-groups match + the hardware's version. - Each level of hierarchy is represented by a 32 bit value, and so there can be - only 32 different supported version per hierarchy. i.e. 1 bit per version. A - value of 0xFFFFFFFF will enable the OPP for all versions for that hierarchy - level. And a value of 0x00000000 will disable the OPP completely, and so we - never want that to happen. + Each sub-group is a platform defined array representing the hierarchy of + hardware versions supported by the platform. For a platform with three + hierarchical levels of version (X.Y.Z), this field shall look like - If 32 values aren't sufficient for a version hierarchy, than that version - hierarchy can be contained in multiple 32 bit values. i.e. <X Y Z1 Z2> in the - above example, Z1 & Z2 refer to the version hierarchy Z. + opp-supported-hw = <X1 Y1 Z1>, <X2 Y2 Z2>, <X3 Y3 Z3>. + + Each level (eg. X1) in version hierarchy is represented by a 32 bit value, one + bit per version and so there can be maximum 32 versions per level. Logical AND + (&) operation is performed for each level with the hardware's level version + and a non-zero output for _all_ the levels in a sub-group means the OPP is + supported by hardware. A value of 0xFFFFFFFF for each level in the sub-group + will enable the OPP for all versions for the hardware. - status: Marks the node enabled/disabled. @@ -503,7 +505,6 @@ Example 5: opp-supported-hw */ opp-supported-hw = <0xF 0xFFFFFFFF 0xFFFFFFFF> opp-hz = /bits/ 64 <600000000>; - opp-microvolt = <915000 900000 925000>; ... }; @@ -516,7 +517,17 @@ Example 5: opp-supported-hw */ opp-supported-hw = <0x20 0xff0000ff 0x0000f4f0> opp-hz = /bits/ 64 <800000000>; - opp-microvolt = <915000 900000 925000>; + ... + }; + + opp-900000000 { + /* + * Supports: + * - All cuts and substrate where process version is 0x2. 
+ * - All cuts and process where substrate version is 0x2. + */ + opp-supported-hw = <0xFFFFFFFF 0xFFFFFFFF 0x02>, <0xFFFFFFFF 0x01 0xFFFFFFFF> + opp-hz = /bits/ 64 <900000000>; ... }; }; diff --git a/MAINTAINERS b/MAINTAINERS index 57c6346606cf..6976e311034a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -405,7 +405,7 @@ F: drivers/platform/x86/i2c-multi-instantiate.c ACPI PMIC DRIVERS M: "Rafael J. Wysocki" M: Len Brown -R: Andy Shevchenko +R: Andy Shevchenko R: Mika Westerberg L: linux-acpi@vger.kernel.org S: Supported @@ -5388,7 +5388,6 @@ F: include/linux/kobj* F: lib/kobj* DRIVERS FOR ADAPTIVE VOLTAGE SCALING (AVS) -M: Kevin Hilman M: Nishanth Menon L: linux-pm@vger.kernel.org S: Maintained @@ -8947,8 +8946,8 @@ F: arch/x86/include/asm/intel_punit_ipc.h F: drivers/platform/x86/intel_punit_ipc.c INTEL PMC CORE DRIVER -M: Rajneesh Bhardwaj -M: Vishwanath Somayaji +M: Rajneesh Bhardwaj +M: David E Box L: platform-driver-x86@vger.kernel.org S: Maintained F: drivers/platform/x86/intel_pmc_core* @@ -8961,7 +8960,7 @@ F: drivers/gpio/gpio-*cove.c F: drivers/gpio/gpio-msic.c INTEL PMIC MULTIFUNCTION DEVICE DRIVERS -R: Andy Shevchenko +M: Andy Shevchenko S: Maintained F: drivers/mfd/intel_msic.c F: drivers/mfd/intel_soc_pmic* @@ -12528,6 +12527,7 @@ M: Josh Poimboeuf M: Peter Zijlstra S: Supported F: tools/objtool/ +F: include/linux/objtool.h OCELOT ETHERNET SWITCH DRIVER M: Microchip Linux Driver Support @@ -18933,7 +18933,7 @@ M: Hans de Goede M: Mark Gross L: platform-driver-x86@vger.kernel.org S: Maintained -T: git git://git.infradead.org/linux-platform-drivers-x86.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86.git F: drivers/platform/olpc/ F: drivers/platform/x86/ diff --git a/arch/arm/boot/dts/tegra20-cpu-opp-microvolt.dtsi b/arch/arm/boot/dts/tegra20-cpu-opp-microvolt.dtsi index dce85d39480d..6f3e8c5fc5f0 100644 --- a/arch/arm/boot/dts/tegra20-cpu-opp-microvolt.dtsi +++ b/arch/arm/boot/dts/tegra20-cpu-opp-microvolt.dtsi @@ 
-26,14 +26,6 @@ opp-microvolt = <800000 800000 1125000>; }; - opp@456000000,800,2,2 { - opp-microvolt = <800000 800000 1125000>; - }; - - opp@456000000,800,3,2 { - opp-microvolt = <800000 800000 1125000>; - }; - opp@456000000,825 { opp-microvolt = <825000 825000 1125000>; }; @@ -46,10 +38,6 @@ opp-microvolt = <800000 800000 1125000>; }; - opp@608000000,800,3,2 { - opp-microvolt = <800000 800000 1125000>; - }; - opp@608000000,825 { opp-microvolt = <825000 825000 1125000>; }; @@ -78,18 +66,6 @@ opp-microvolt = <875000 875000 1125000>; }; - opp@760000000,875,1,1 { - opp-microvolt = <875000 875000 1125000>; - }; - - opp@760000000,875,0,2 { - opp-microvolt = <875000 875000 1125000>; - }; - - opp@760000000,875,1,2 { - opp-microvolt = <875000 875000 1125000>; - }; - opp@760000000,900 { opp-microvolt = <900000 900000 1125000>; }; @@ -134,14 +110,6 @@ opp-microvolt = <950000 950000 1125000>; }; - opp@912000000,950,0,2 { - opp-microvolt = <950000 950000 1125000>; - }; - - opp@912000000,950,2,2 { - opp-microvolt = <950000 950000 1125000>; - }; - opp@912000000,1000 { opp-microvolt = <1000000 1000000 1125000>; }; @@ -170,10 +138,6 @@ opp-microvolt = <1000000 1000000 1125000>; }; - opp@1000000000,1000,0,2 { - opp-microvolt = <1000000 1000000 1125000>; - }; - opp@1000000000,1025 { opp-microvolt = <1025000 1025000 1125000>; }; diff --git a/arch/arm/boot/dts/tegra20-cpu-opp.dtsi b/arch/arm/boot/dts/tegra20-cpu-opp.dtsi index 9b8fedb57a1b..702a635e88e7 100644 --- a/arch/arm/boot/dts/tegra20-cpu-opp.dtsi +++ b/arch/arm/boot/dts/tegra20-cpu-opp.dtsi @@ -37,19 +37,8 @@ opp@456000000,800 { clock-latency-ns = <400000>; - opp-supported-hw = <0x03 0x0006>; - opp-hz = /bits/ 64 <456000000>; - }; - - opp@456000000,800,2,2 { - clock-latency-ns = <400000>; - opp-supported-hw = <0x04 0x0004>; - opp-hz = /bits/ 64 <456000000>; - }; - - opp@456000000,800,3,2 { - clock-latency-ns = <400000>; - opp-supported-hw = <0x08 0x0004>; + opp-supported-hw = <0x03 0x0006>, <0x04 0x0004>, + <0x08 0x0004>; 
opp-hz = /bits/ 64 <456000000>; }; @@ -67,13 +56,7 @@ opp@608000000,800 { clock-latency-ns = <400000>; - opp-supported-hw = <0x04 0x0006>; - opp-hz = /bits/ 64 <608000000>; - }; - - opp@608000000,800,3,2 { - clock-latency-ns = <400000>; - opp-supported-hw = <0x08 0x0004>; + opp-supported-hw = <0x04 0x0006>, <0x08 0x0004>; opp-hz = /bits/ 64 <608000000>; }; @@ -115,25 +98,8 @@ opp@760000000,875 { clock-latency-ns = <400000>; - opp-supported-hw = <0x04 0x0001>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,875,1,1 { - clock-latency-ns = <400000>; - opp-supported-hw = <0x02 0x0002>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,875,0,2 { - clock-latency-ns = <400000>; - opp-supported-hw = <0x01 0x0004>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,875,1,2 { - clock-latency-ns = <400000>; - opp-supported-hw = <0x02 0x0004>; + opp-supported-hw = <0x04 0x0001>, <0x02 0x0002>, + <0x01 0x0004>, <0x02 0x0004>; opp-hz = /bits/ 64 <760000000>; }; @@ -199,19 +165,8 @@ opp@912000000,950 { clock-latency-ns = <400000>; - opp-supported-hw = <0x02 0x0006>; - opp-hz = /bits/ 64 <912000000>; - }; - - opp@912000000,950,0,2 { - clock-latency-ns = <400000>; - opp-supported-hw = <0x01 0x0004>; - opp-hz = /bits/ 64 <912000000>; - }; - - opp@912000000,950,2,2 { - clock-latency-ns = <400000>; - opp-supported-hw = <0x04 0x0004>; + opp-supported-hw = <0x02 0x0006>, <0x01 0x0004>, + <0x04 0x0004>; opp-hz = /bits/ 64 <912000000>; }; @@ -253,13 +208,7 @@ opp@1000000000,1000 { clock-latency-ns = <400000>; - opp-supported-hw = <0x02 0x0006>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,1000,0,2 { - clock-latency-ns = <400000>; - opp-supported-hw = <0x01 0x0004>; + opp-supported-hw = <0x02 0x0006>, <0x01 0x0004>; opp-hz = /bits/ 64 <1000000000>; }; diff --git a/arch/arm/boot/dts/tegra30-cpu-opp-microvolt.dtsi b/arch/arm/boot/dts/tegra30-cpu-opp-microvolt.dtsi index d682f7437146..1be715d2a442 100644 --- 
a/arch/arm/boot/dts/tegra30-cpu-opp-microvolt.dtsi +++ b/arch/arm/boot/dts/tegra30-cpu-opp-microvolt.dtsi @@ -74,22 +74,6 @@ opp-microvolt = <850000 850000 1250000>; }; - opp@475000000,850,0,1 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@475000000,850,0,4 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@475000000,850,0,7 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@475000000,850,0,8 { - opp-microvolt = <850000 850000 1250000>; - }; - opp@608000000,850 { opp-microvolt = <850000 850000 1250000>; }; @@ -106,62 +90,6 @@ opp-microvolt = <850000 850000 1250000>; }; - opp@640000000,850,1,1 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,2,1 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,3,1 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,1,4 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,2,4 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,3,4 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,1,7 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,2,7 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,3,7 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,4,7 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,1,8 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,2,8 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,3,8 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@640000000,850,4,8 { - opp-microvolt = <850000 850000 1250000>; - }; - opp@640000000,900 { opp-microvolt = <900000 900000 1250000>; }; @@ -170,94 +98,10 @@ opp-microvolt = <850000 850000 1250000>; }; - opp@760000000,850,3,1 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@760000000,850,3,2 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@760000000,850,3,3 { - opp-microvolt = <850000 
850000 1250000>; - }; - - opp@760000000,850,3,4 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@760000000,850,3,7 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@760000000,850,4,7 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@760000000,850,3,8 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@760000000,850,4,8 { - opp-microvolt = <850000 850000 1250000>; - }; - - opp@760000000,850,0,10 { - opp-microvolt = <850000 850000 1250000>; - }; - opp@760000000,900 { opp-microvolt = <900000 900000 1250000>; }; - opp@760000000,900,1,1 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@760000000,900,2,1 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@760000000,900,1,2 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@760000000,900,2,2 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@760000000,900,1,3 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@760000000,900,2,3 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@760000000,900,1,4 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@760000000,900,2,4 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@760000000,900,1,7 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@760000000,900,2,7 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@760000000,900,1,8 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@760000000,900,2,8 { - opp-microvolt = <900000 900000 1250000>; - }; - opp@760000000,912 { opp-microvolt = <912000 912000 1250000>; }; @@ -282,90 +126,10 @@ opp-microvolt = <900000 900000 1250000>; }; - opp@860000000,900,2,1 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,3,1 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,2,2 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,3,2 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,2,3 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,3,3 { - 
opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,2,4 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,3,4 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,2,7 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,3,7 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,4,7 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,2,8 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,3,8 { - opp-microvolt = <900000 900000 1250000>; - }; - - opp@860000000,900,4,8 { - opp-microvolt = <900000 900000 1250000>; - }; - opp@860000000,975 { opp-microvolt = <975000 975000 1250000>; }; - opp@860000000,975,1,1 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@860000000,975,1,2 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@860000000,975,1,3 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@860000000,975,1,4 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@860000000,975,1,7 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@860000000,975,1,8 { - opp-microvolt = <975000 975000 1250000>; - }; - opp@860000000,1000 { opp-microvolt = <1000000 1000000 1250000>; }; @@ -382,62 +146,6 @@ opp-microvolt = <975000 975000 1250000>; }; - opp@1000000000,975,2,1 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1000000000,975,3,1 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1000000000,975,2,2 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1000000000,975,3,2 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1000000000,975,2,3 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1000000000,975,3,3 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1000000000,975,2,4 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1000000000,975,3,4 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1000000000,975,2,7 { - opp-microvolt = <975000 975000 1250000>; - }; 
- - opp@1000000000,975,3,7 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1000000000,975,4,7 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1000000000,975,2,8 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1000000000,975,3,8 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1000000000,975,4,8 { - opp-microvolt = <975000 975000 1250000>; - }; - opp@1000000000,1000 { opp-microvolt = <1000000 1000000 1250000>; }; @@ -454,66 +162,10 @@ opp-microvolt = <975000 975000 1250000>; }; - opp@1100000000,975,3,1 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1100000000,975,3,2 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1100000000,975,3,3 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1100000000,975,3,4 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1100000000,975,3,7 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1100000000,975,4,7 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1100000000,975,3,8 { - opp-microvolt = <975000 975000 1250000>; - }; - - opp@1100000000,975,4,8 { - opp-microvolt = <975000 975000 1250000>; - }; - opp@1100000000,1000 { opp-microvolt = <1000000 1000000 1250000>; }; - opp@1100000000,1000,2,1 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - opp@1100000000,1000,2,2 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - opp@1100000000,1000,2,3 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - opp@1100000000,1000,2,4 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - opp@1100000000,1000,2,7 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - opp@1100000000,1000,2,8 { - opp-microvolt = <1000000 1000000 1250000>; - }; - opp@1100000000,1025 { opp-microvolt = <1025000 1025000 1250000>; }; @@ -534,66 +186,10 @@ opp-microvolt = <1000000 1000000 1250000>; }; - opp@1200000000,1000,3,1 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - opp@1200000000,1000,3,2 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - 
opp@1200000000,1000,3,3 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - opp@1200000000,1000,3,4 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - opp@1200000000,1000,3,7 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - opp@1200000000,1000,4,7 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - opp@1200000000,1000,3,8 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - opp@1200000000,1000,4,8 { - opp-microvolt = <1000000 1000000 1250000>; - }; - opp@1200000000,1025 { opp-microvolt = <1025000 1025000 1250000>; }; - opp@1200000000,1025,2,1 { - opp-microvolt = <1025000 1025000 1250000>; - }; - - opp@1200000000,1025,2,2 { - opp-microvolt = <1025000 1025000 1250000>; - }; - - opp@1200000000,1025,2,3 { - opp-microvolt = <1025000 1025000 1250000>; - }; - - opp@1200000000,1025,2,4 { - opp-microvolt = <1025000 1025000 1250000>; - }; - - opp@1200000000,1025,2,7 { - opp-microvolt = <1025000 1025000 1250000>; - }; - - opp@1200000000,1025,2,8 { - opp-microvolt = <1025000 1025000 1250000>; - }; - opp@1200000000,1050 { opp-microvolt = <1050000 1050000 1250000>; }; @@ -610,90 +206,18 @@ opp-microvolt = <1000000 1000000 1250000>; }; - opp@1300000000,1000,4,7 { - opp-microvolt = <1000000 1000000 1250000>; - }; - - opp@1300000000,1000,4,8 { - opp-microvolt = <1000000 1000000 1250000>; - }; - opp@1300000000,1025 { opp-microvolt = <1025000 1025000 1250000>; }; - opp@1300000000,1025,3,1 { - opp-microvolt = <1025000 1025000 1250000>; - }; - - opp@1300000000,1025,3,7 { - opp-microvolt = <1025000 1025000 1250000>; - }; - - opp@1300000000,1025,3,8 { - opp-microvolt = <1025000 1025000 1250000>; - }; - opp@1300000000,1050 { opp-microvolt = <1050000 1050000 1250000>; }; - opp@1300000000,1050,2,1 { - opp-microvolt = <1050000 1050000 1250000>; - }; - - opp@1300000000,1050,3,2 { - opp-microvolt = <1050000 1050000 1250000>; - }; - - opp@1300000000,1050,3,3 { - opp-microvolt = <1050000 1050000 1250000>; - }; - - opp@1300000000,1050,3,4 { - opp-microvolt = <1050000 
1050000 1250000>; - }; - - opp@1300000000,1050,3,5 { - opp-microvolt = <1050000 1050000 1250000>; - }; - - opp@1300000000,1050,3,6 { - opp-microvolt = <1050000 1050000 1250000>; - }; - - opp@1300000000,1050,2,7 { - opp-microvolt = <1050000 1050000 1250000>; - }; - - opp@1300000000,1050,2,8 { - opp-microvolt = <1050000 1050000 1250000>; - }; - - opp@1300000000,1050,3,12 { - opp-microvolt = <1050000 1050000 1250000>; - }; - - opp@1300000000,1050,3,13 { - opp-microvolt = <1050000 1050000 1250000>; - }; - opp@1300000000,1075 { opp-microvolt = <1075000 1075000 1250000>; }; - opp@1300000000,1075,2,2 { - opp-microvolt = <1075000 1075000 1250000>; - }; - - opp@1300000000,1075,2,3 { - opp-microvolt = <1075000 1075000 1250000>; - }; - - opp@1300000000,1075,2,4 { - opp-microvolt = <1075000 1075000 1250000>; - }; - opp@1300000000,1100 { opp-microvolt = <1100000 1100000 1250000>; }; @@ -722,10 +246,6 @@ opp-microvolt = <1150000 1150000 1250000>; }; - opp@1400000000,1150,2,4 { - opp-microvolt = <1150000 1150000 1250000>; - }; - opp@1400000000,1175 { opp-microvolt = <1175000 1175000 1250000>; }; @@ -738,42 +258,10 @@ opp-microvolt = <1125000 1125000 1250000>; }; - opp@1500000000,1125,4,5 { - opp-microvolt = <1125000 1125000 1250000>; - }; - - opp@1500000000,1125,4,6 { - opp-microvolt = <1125000 1125000 1250000>; - }; - - opp@1500000000,1125,4,12 { - opp-microvolt = <1125000 1125000 1250000>; - }; - - opp@1500000000,1125,4,13 { - opp-microvolt = <1125000 1125000 1250000>; - }; - opp@1500000000,1150 { opp-microvolt = <1150000 1150000 1250000>; }; - opp@1500000000,1150,3,5 { - opp-microvolt = <1150000 1150000 1250000>; - }; - - opp@1500000000,1150,3,6 { - opp-microvolt = <1150000 1150000 1250000>; - }; - - opp@1500000000,1150,3,12 { - opp-microvolt = <1150000 1150000 1250000>; - }; - - opp@1500000000,1150,3,13 { - opp-microvolt = <1150000 1150000 1250000>; - }; - opp@1500000000,1200 { opp-microvolt = <1200000 1200000 1250000>; }; diff --git a/arch/arm/boot/dts/tegra30-cpu-opp.dtsi 
b/arch/arm/boot/dts/tegra30-cpu-opp.dtsi index 8e434f6713cd..0f7135006d19 100644 --- a/arch/arm/boot/dts/tegra30-cpu-opp.dtsi +++ b/arch/arm/boot/dts/tegra30-cpu-opp.dtsi @@ -109,31 +109,9 @@ opp@475000000,850 { clock-latency-ns = <100000>; - opp-supported-hw = <0x0F 0x0001>; - opp-hz = /bits/ 64 <475000000>; - }; - - opp@475000000,850,0,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x01 0x0002>; - opp-hz = /bits/ 64 <475000000>; - }; - - opp@475000000,850,0,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x01 0x0010>; - opp-hz = /bits/ 64 <475000000>; - }; - - opp@475000000,850,0,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x01 0x0080>; - opp-hz = /bits/ 64 <475000000>; - }; - - opp@475000000,850,0,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x01 0x0100>; + opp-supported-hw = <0x0F 0x0001>, <0x01 0x0002>, + <0x01 0x0010>, <0x01 0x0080>, + <0x01 0x0100>; opp-hz = /bits/ 64 <475000000>; }; @@ -157,91 +135,14 @@ opp@640000000,850 { clock-latency-ns = <100000>; - opp-supported-hw = <0x0F 0x0001>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,1,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0002>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,2,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0002>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,3,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0002>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,1,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0010>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,2,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0010>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,3,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0010>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,1,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0080>; - 
opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,2,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0080>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,3,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0080>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,4,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0080>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,1,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0100>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,2,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0100>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,3,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0100>; - opp-hz = /bits/ 64 <640000000>; - }; - - opp@640000000,850,4,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0100>; + opp-supported-hw = <0x0F 0x0001>, <0x02 0x0002>, + <0x04 0x0002>, <0x08 0x0002>, + <0x02 0x0010>, <0x04 0x0010>, + <0x08 0x0010>, <0x02 0x0080>, + <0x04 0x0080>, <0x08 0x0080>, + <0x10 0x0080>, <0x02 0x0100>, + <0x04 0x0100>, <0x08 0x0100>, + <0x10 0x0100>; opp-hz = /bits/ 64 <640000000>; }; @@ -253,139 +154,23 @@ opp@760000000,850 { clock-latency-ns = <100000>; - opp-supported-hw = <0x1E 0x3461>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,850,3,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0002>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,850,3,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0004>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,850,3,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0008>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,850,3,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0010>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,850,3,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 
0x0080>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,850,4,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0080>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,850,3,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0100>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,850,4,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0100>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,850,0,10 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x01 0x0400>; + opp-supported-hw = <0x1E 0x3461>, <0x08 0x0002>, + <0x08 0x0004>, <0x08 0x0008>, + <0x08 0x0010>, <0x08 0x0080>, + <0x10 0x0080>, <0x08 0x0100>, + <0x10 0x0100>, <0x01 0x0400>; opp-hz = /bits/ 64 <760000000>; }; opp@760000000,900 { clock-latency-ns = <100000>; - opp-supported-hw = <0x01 0x0001>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,900,1,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0002>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,900,2,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0002>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,900,1,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0004>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,900,2,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0004>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,900,1,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0008>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,900,2,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0008>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,900,1,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0010>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,900,2,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0010>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,900,1,7 { - 
clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0080>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,900,2,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0080>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,900,1,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0100>; - opp-hz = /bits/ 64 <760000000>; - }; - - opp@760000000,900,2,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0100>; + opp-supported-hw = <0x01 0x0001>, <0x02 0x0002>, + <0x04 0x0002>, <0x02 0x0004>, + <0x04 0x0004>, <0x02 0x0008>, + <0x04 0x0008>, <0x02 0x0010>, + <0x04 0x0010>, <0x02 0x0080>, + <0x04 0x0080>, <0x02 0x0100>, + <0x04 0x0100>; opp-hz = /bits/ 64 <760000000>; }; @@ -421,133 +206,23 @@ opp@860000000,900 { clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0001>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,2,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0002>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,3,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0002>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,2,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0004>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,3,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0004>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,2,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0008>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,3,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0008>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,2,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0010>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,3,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0010>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,2,7 { - clock-latency-ns = 
<100000>; - opp-supported-hw = <0x04 0x0080>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,3,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0080>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,4,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0080>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,2,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0100>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,3,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0100>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,900,4,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0100>; + opp-supported-hw = <0x02 0x0001>, <0x04 0x0002>, + <0x08 0x0002>, <0x04 0x0004>, + <0x08 0x0004>, <0x04 0x0008>, + <0x08 0x0008>, <0x04 0x0010>, + <0x08 0x0010>, <0x04 0x0080>, + <0x08 0x0080>, <0x10 0x0080>, + <0x04 0x0100>, <0x08 0x0100>, + <0x10 0x0100>; opp-hz = /bits/ 64 <860000000>; }; opp@860000000,975 { clock-latency-ns = <100000>; - opp-supported-hw = <0x01 0x0001>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,975,1,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0002>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,975,1,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0004>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,975,1,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0008>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,975,1,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0010>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,975,1,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0080>; - opp-hz = /bits/ 64 <860000000>; - }; - - opp@860000000,975,1,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0100>; + opp-supported-hw = <0x01 0x0001>, <0x02 0x0002>, + <0x02 0x0004>, <0x02 0x0008>, + <0x02 0x0010>, 
<0x02 0x0080>, + <0x02 0x0100>; opp-hz = /bits/ 64 <860000000>; }; @@ -571,91 +246,14 @@ opp@1000000000,975 { clock-latency-ns = <100000>; - opp-supported-hw = <0x03 0x0001>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,2,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0002>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,3,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0002>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,2,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0004>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,3,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0004>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,2,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0008>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,3,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0008>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,2,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0010>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,3,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0010>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,2,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0080>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,3,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0080>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,4,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0080>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,2,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0100>; - opp-hz = /bits/ 64 <1000000000>; - }; - - opp@1000000000,975,3,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0100>; - opp-hz = /bits/ 64 <1000000000>; - }; - - 
opp@1000000000,975,4,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0100>; + opp-supported-hw = <0x03 0x0001>, <0x04 0x0002>, + <0x08 0x0002>, <0x04 0x0004>, + <0x08 0x0004>, <0x04 0x0008>, + <0x08 0x0008>, <0x04 0x0010>, + <0x08 0x0010>, <0x04 0x0080>, + <0x08 0x0080>, <0x10 0x0080>, + <0x04 0x0100>, <0x08 0x0100>, + <0x10 0x0100>; opp-hz = /bits/ 64 <1000000000>; }; @@ -679,97 +277,20 @@ opp@1100000000,975 { clock-latency-ns = <100000>; - opp-supported-hw = <0x06 0x0001>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,975,3,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0002>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,975,3,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0004>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,975,3,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0008>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,975,3,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0010>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,975,3,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0080>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,975,4,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0080>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,975,3,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0100>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,975,4,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0100>; + opp-supported-hw = <0x06 0x0001>, <0x08 0x0002>, + <0x08 0x0004>, <0x08 0x0008>, + <0x08 0x0010>, <0x08 0x0080>, + <0x10 0x0080>, <0x08 0x0100>, + <0x10 0x0100>; opp-hz = /bits/ 64 <1100000000>; }; opp@1100000000,1000 { clock-latency-ns = <100000>; - opp-supported-hw = <0x01 0x0001>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,1000,2,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 
0x0002>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,1000,2,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0004>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,1000,2,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0008>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,1000,2,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0010>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,1000,2,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0080>; - opp-hz = /bits/ 64 <1100000000>; - }; - - opp@1100000000,1000,2,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0100>; + opp-supported-hw = <0x01 0x0001>, <0x04 0x0002>, + <0x04 0x0004>, <0x04 0x0008>, + <0x04 0x0010>, <0x04 0x0080>, + <0x04 0x0100>; opp-hz = /bits/ 64 <1100000000>; }; @@ -799,97 +320,20 @@ opp@1200000000,1000 { clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0001>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1000,3,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0002>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1000,3,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0004>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1000,3,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0008>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1000,3,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0010>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1000,3,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0080>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1000,4,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0080>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1000,3,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0100>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1000,4,8 { 
- clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0100>; + opp-supported-hw = <0x04 0x0001>, <0x08 0x0002>, + <0x08 0x0004>, <0x08 0x0008>, + <0x08 0x0010>, <0x08 0x0080>, + <0x10 0x0080>, <0x08 0x0100>, + <0x10 0x0100>; opp-hz = /bits/ 64 <1200000000>; }; opp@1200000000,1025 { clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0001>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1025,2,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0002>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1025,2,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0004>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1025,2,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0008>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1025,2,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0010>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1025,2,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0080>; - opp-hz = /bits/ 64 <1200000000>; - }; - - opp@1200000000,1025,2,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0100>; + opp-supported-hw = <0x02 0x0001>, <0x04 0x0002>, + <0x04 0x0004>, <0x04 0x0008>, + <0x04 0x0010>, <0x04 0x0080>, + <0x04 0x0100>; opp-hz = /bits/ 64 <1200000000>; }; @@ -913,133 +357,33 @@ opp@1300000000,1000 { clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0001>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1000,4,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0080>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1000,4,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0100>; + opp-supported-hw = <0x08 0x0001>, <0x10 0x0080>, + <0x10 0x0100>; opp-hz = /bits/ 64 <1300000000>; }; opp@1300000000,1025 { clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0001>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1025,3,1 { - 
clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0002>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1025,3,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0080>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1025,3,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0100>; + opp-supported-hw = <0x04 0x0001>, <0x08 0x0002>, + <0x08 0x0080>, <0x08 0x0100>; opp-hz = /bits/ 64 <1300000000>; }; opp@1300000000,1050 { clock-latency-ns = <100000>; - opp-supported-hw = <0x12 0x3061>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1050,2,1 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0002>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1050,3,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0004>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1050,3,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0008>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1050,3,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0010>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1050,3,5 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0020>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1050,3,6 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0040>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1050,2,7 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0080>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1050,2,8 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0100>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1050,3,12 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x1000>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1050,3,13 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x2000>; + opp-supported-hw = <0x12 0x3061>, <0x04 0x0002>, + <0x08 0x0004>, <0x08 0x0008>, 
+ <0x08 0x0010>, <0x08 0x0020>, + <0x08 0x0040>, <0x04 0x0080>, + <0x04 0x0100>, <0x08 0x1000>, + <0x08 0x2000>; opp-hz = /bits/ 64 <1300000000>; }; opp@1300000000,1075 { clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x0182>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1075,2,2 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0004>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1075,2,3 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0008>; - opp-hz = /bits/ 64 <1300000000>; - }; - - opp@1300000000,1075,2,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0010>; + opp-supported-hw = <0x02 0x0182>, <0x04 0x0004>, + <0x04 0x0008>, <0x04 0x0010>; opp-hz = /bits/ 64 <1300000000>; }; @@ -1081,13 +425,7 @@ opp@1400000000,1150 { clock-latency-ns = <100000>; - opp-supported-hw = <0x02 0x000C>; - opp-hz = /bits/ 64 <1400000000>; - }; - - opp@1400000000,1150,2,4 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0010>; + opp-supported-hw = <0x02 0x000C>, <0x04 0x0010>; opp-hz = /bits/ 64 <1400000000>; }; @@ -1105,61 +443,17 @@ opp@1500000000,1125 { clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0010>; - opp-hz = /bits/ 64 <1500000000>; - }; - - opp@1500000000,1125,4,5 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0020>; - opp-hz = /bits/ 64 <1500000000>; - }; - - opp@1500000000,1125,4,6 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x0040>; - opp-hz = /bits/ 64 <1500000000>; - }; - - opp@1500000000,1125,4,12 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x1000>; - opp-hz = /bits/ 64 <1500000000>; - }; - - opp@1500000000,1125,4,13 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x10 0x2000>; + opp-supported-hw = <0x08 0x0010>, <0x10 0x0020>, + <0x10 0x0040>, <0x10 0x1000>, + <0x10 0x2000>; opp-hz = /bits/ 64 <1500000000>; }; opp@1500000000,1150 { clock-latency-ns = <100000>; - opp-supported-hw = <0x04 0x0010>; - opp-hz = /bits/ 
64 <1500000000>; - }; - - opp@1500000000,1150,3,5 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0020>; - opp-hz = /bits/ 64 <1500000000>; - }; - - opp@1500000000,1150,3,6 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x0040>; - opp-hz = /bits/ 64 <1500000000>; - }; - - opp@1500000000,1150,3,12 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x1000>; - opp-hz = /bits/ 64 <1500000000>; - }; - - opp@1500000000,1150,3,13 { - clock-latency-ns = <100000>; - opp-supported-hw = <0x08 0x2000>; + opp-supported-hw = <0x04 0x0010>, <0x08 0x0020>, + <0x08 0x0040>, <0x08 0x1000>, + <0x08 0x2000>; opp-hz = /bits/ 64 <1500000000>; }; diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index e0593cf095d0..470299ee2fba 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -7,8 +7,13 @@ #include #include +/* big.LITTLE switcher is incompatible with frequency invariance */ +#ifndef CONFIG_BL_SWITCHER /* Replace task scheduler's default frequency-invariant accounting */ +#define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale +#define arch_scale_freq_invariant topology_scale_freq_invariant +#endif /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index 31bbc803cecb..dc7f6e91aafa 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -1 +1,6 @@ #include + +static inline bool xen_kernel_unmapped_at_usr(void) +{ + return false; +} diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 4e36bc001749..b81231ea8d4a 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -159,7 +159,8 @@ static int xen_starting_cpu(unsigned int cpu) BUG_ON(err); per_cpu(xen_vcpu, cpu) = vcpup; - xen_setup_runstate_info(cpu); + if 
(!xen_kernel_unmapped_at_usr()) + xen_setup_runstate_info(cpu); after_register_vcpu_info: enable_percpu_irq(xen_events_irq, 0); @@ -395,7 +396,8 @@ static int __init xen_guest_init(void) return -EINVAL; } - xen_time_setup_guest(); + if (!xen_kernel_unmapped_at_usr()) + xen_time_setup_guest(); if (xen_initial_domain()) pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index e042f6527981..11a465243f66 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -26,7 +26,9 @@ void topology_scale_freq_tick(void); #endif /* CONFIG_ARM64_AMU_EXTN */ /* Replace task scheduler's default frequency-invariant accounting */ +#define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale +#define arch_scale_freq_invariant topology_scale_freq_invariant /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale diff --git a/arch/arm64/include/asm/xen/page.h b/arch/arm64/include/asm/xen/page.h index 31bbc803cecb..dffdc773221b 100644 --- a/arch/arm64/include/asm/xen/page.h +++ b/arch/arm64/include/asm/xen/page.h @@ -1 +1,7 @@ #include +#include + +static inline bool xen_kernel_unmapped_at_usr(void) +{ + return arm64_kernel_unmapped_at_el0(); +} diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index ff1dd1dbfe64..543c67cae02f 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -248,6 +248,13 @@ static int __init init_amu_fie(void) static_branch_enable(&amu_fie_key); } + /* + * If the system is not fully invariant after AMU init, disable + * partial use of counters for frequency invariance. 
+ */ + if (!topology_scale_freq_invariant()) + static_branch_disable(&amu_fie_key); + free_valid_mask: free_cpumask_var(valid_cpus); @@ -255,7 +262,7 @@ free_valid_mask: } late_initcall_sync(init_amu_fie); -bool arch_freq_counters_available(struct cpumask *cpus) +bool arch_freq_counters_available(const struct cpumask *cpus) { return amu_freq_invariant() && cpumask_subset(cpus, amu_fie_cpus); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 255084c65138..f6946b81f74a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1523,6 +1523,7 @@ config AMD_MEM_ENCRYPT select DYNAMIC_PHYSICAL_MASK select ARCH_USE_MEMREMAP_PROT select ARCH_HAS_FORCE_DMA_UNENCRYPTED + select INSTRUCTION_DECODER help Say yes to enable support for the encryption of system memory. This requires an AMD processor that supports Secure Memory diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 4fb989ef5665..ee249088cbfe 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -32,7 +32,7 @@ KBUILD_CFLAGS := -m$(BITS) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIE KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 -cflags-$(CONFIG_X86_64) := -mcmodel=small +cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += -ffreestanding @@ -47,6 +47,11 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h +# sev-es.c indirectly includes inat-table.h which is generated during +# compilation and stored in $(objtree). Add the directory to the includes so +# that the compiler finds it even with out-of-tree builds (make O=/some/path). 
+CFLAGS_sev-es.o += -I$(objtree)/arch/x86/lib/ + KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n UBSAN_SANITIZE :=n @@ -81,9 +86,11 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 - vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o + vmlinux-objs-y += $(obj)/ident_map_64.o + vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o vmlinux-objs-y += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev-es.o endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c index 6448a8196d32..0cc1323896d1 100644 --- a/arch/x86/boot/compressed/cpuflags.c +++ b/arch/x86/boot/compressed/cpuflags.c @@ -1,6 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_RANDOMIZE_BASE - #include "../cpuflags.c" bool has_cpuflag(int flag) @@ -9,5 +7,3 @@ bool has_cpuflag(int flag) return test_bit(flag, cpu.flags); } - -#endif diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 9e46729cf162..1c80f1738fd9 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -33,6 +33,7 @@ #include #include #include +#include #include "pgtable.h" /* @@ -415,6 +416,10 @@ SYM_CODE_START(startup_64) .Lon_kernel_cs: + pushq %rsi + call load_stage1_idt + popq %rsi + /* * paging_prepare() sets up the trampoline and checks if we need to * enable 5-level paging. @@ -527,6 +532,21 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) shrq $3, %rcx rep stosq +/* + * If running as an SEV guest, the encryption mask is required in the + * page-table setup code below. 
When the guest also has SEV-ES enabled + * set_sev_encryption_mask() will cause #VC exceptions, but the stage2 + * handler can't map its GHCB because the page-table is not set up yet. + * So set up the encryption mask here while still on the stage1 #VC + * handler. Then load stage2 IDT and switch to the kernel's own + * page-table. + */ + pushq %rsi + call set_sev_encryption_mask + call load_stage2_idt + call initialize_identity_maps + popq %rsi + /* * Do the extraction, and jump to the new kernel.. */ @@ -659,10 +679,21 @@ SYM_DATA_START_LOCAL(gdt) .quad 0x0000000000000000 /* TS continued */ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) +SYM_DATA_START(boot_idt_desc) + .word boot_idt_end - boot_idt - 1 + .quad 0 +SYM_DATA_END(boot_idt_desc) + .balign 8 +SYM_DATA_START(boot_idt) + .rept BOOT_IDT_ENTRIES + .quad 0 + .quad 0 + .endr +SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) + #ifdef CONFIG_EFI_STUB SYM_DATA(image_offset, .long 0) #endif - #ifdef CONFIG_EFI_MIXED SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0) SYM_DATA(efi_is64, .byte 1) diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c new file mode 100644 index 000000000000..063a60edcf99 --- /dev/null +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This code is used on x86_64 to create page table identity mappings on + * demand by building up a new set of page tables (or appending to the + * existing ones), and then switching over to them when ready. + * + * Copyright (C) 2015-2016 Yinghai Lu + * Copyright (C) 2016 Kees Cook + */ + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. 
+ */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +/* No PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_PAGE_TABLE_ISOLATION + +#include "error.h" +#include "misc.h" + +/* These actually do the work of building the kernel identity maps. */ +#include +#include +#include +#include +#include +/* Use the static base for this part of the boot process */ +#undef __PAGE_OFFSET +#define __PAGE_OFFSET __PAGE_OFFSET_BASE +#include "../../mm/ident_map.c" + +#ifdef CONFIG_X86_5LEVEL +unsigned int __pgtable_l5_enabled; +unsigned int pgdir_shift = 39; +unsigned int ptrs_per_p4d = 1; +#endif + +/* Used by PAGE_KERN* macros: */ +pteval_t __default_kernel_pte_mask __read_mostly = ~0; + +/* Used to track our page table allocation area. */ +struct alloc_pgt_data { + unsigned char *pgt_buf; + unsigned long pgt_buf_size; + unsigned long pgt_buf_offset; +}; + +/* + * Allocates space for a page table entry, using struct alloc_pgt_data + * above. Besides the local callers, this is used as the allocation + * callback in mapping_info below. + */ +static void *alloc_pgt_page(void *context) +{ + struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; + unsigned char *entry; + + /* Validate there is space available for a new page. */ + if (pages->pgt_buf_offset >= pages->pgt_buf_size) { + debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + return NULL; + } + + entry = pages->pgt_buf + pages->pgt_buf_offset; + pages->pgt_buf_offset += PAGE_SIZE; + + return entry; +} + +/* Used to track our allocated page tables. */ +static struct alloc_pgt_data pgt_data; + +/* The top level page table entry pointer. */ +static unsigned long top_level_pgt; + +phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; + +/* + * Mapping information structure passed to kernel_ident_mapping_init(). 
+ * Due to relocation, pointers must be assigned at run time not build time. + */ +static struct x86_mapping_info mapping_info; + +/* + * Adds the specified range to the identity mappings. + */ +static void add_identity_map(unsigned long start, unsigned long end) +{ + int ret; + + /* Align boundary to 2M. */ + start = round_down(start, PMD_SIZE); + end = round_up(end, PMD_SIZE); + if (start >= end) + return; + + /* Build the mapping. */ + ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end); + if (ret) + error("Error: kernel_ident_mapping_init() failed\n"); +} + +/* Locates and clears a region for a new top level page table. */ +void initialize_identity_maps(void) +{ + /* Exclude the encryption mask from __PHYSICAL_MASK */ + physical_mask &= ~sme_me_mask; + + /* Init mapping_info with run-time function/buffer pointers. */ + mapping_info.alloc_pgt_page = alloc_pgt_page; + mapping_info.context = &pgt_data; + mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; + mapping_info.kernpg_flag = _KERNPG_TABLE; + + /* + * It should be impossible for this not to already be true, + * but since calling this a second time would rewind the other + * counters, let's just make sure this is reset too. + */ + pgt_data.pgt_buf_offset = 0; + + /* + * If we came here via startup_32(), cr3 will be _pgtable already + * and we must append to the existing area instead of entirely + * overwriting it. + * + * With 5-level paging, we use '_pgtable' to allocate the p4d page table, + * the top-level page table is allocated separately. + * + * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level + * cases. On 4-level paging it's equal to 'top_level_pgt'. 
+ */ + top_level_pgt = read_cr3_pa(); + if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { + pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + } else { + pgt_data.pgt_buf = _pgtable; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); + } + + /* + * New page-table is set up - map the kernel image and load it + * into cr3. + */ + add_identity_map((unsigned long)_head, (unsigned long)_end); + write_cr3(top_level_pgt); +} + +/* + * This switches the page tables to the new level4 that has been built + * via calls to add_identity_map() above. If booted via startup_32(), + * this is effectively a no-op. + */ +void finalize_identity_maps(void) +{ + write_cr3(top_level_pgt); +} + +static pte_t *split_large_pmd(struct x86_mapping_info *info, + pmd_t *pmdp, unsigned long __address) +{ + unsigned long page_flags; + unsigned long address; + pte_t *pte; + pmd_t pmd; + int i; + + pte = (pte_t *)info->alloc_pgt_page(info->context); + if (!pte) + return NULL; + + address = __address & PMD_MASK; + /* No large page - clear PSE flag */ + page_flags = info->page_flag & ~_PAGE_PSE; + + /* Populate the PTEs */ + for (i = 0; i < PTRS_PER_PMD; i++) { + set_pte(&pte[i], __pte(address | page_flags)); + address += PAGE_SIZE; + } + + /* + * Ideally we need to clear the large PMD first and do a TLB + * flush before we write the new PMD. But the 2M range of the + * PMD might contain the code we execute and/or the stack + * we are on, so we can't do that. But that should be safe here + * because we are going from large to small mappings and we are + * also the only user of the page-table, so there is no chance + * of a TLB multihit. 
+ */ + pmd = __pmd((unsigned long)pte | info->kernpg_flag); + set_pmd(pmdp, pmd); + /* Flush TLB to establish the new PMD */ + write_cr3(top_level_pgt); + + return pte + pte_index(__address); +} + +static void clflush_page(unsigned long address) +{ + unsigned int flush_size; + char *cl, *start, *end; + + /* + * Hardcode cl-size to 64 - CPUID can't be used here because that might + * cause another #VC exception and the GHCB is not ready to use yet. + */ + flush_size = 64; + start = (char *)(address & PAGE_MASK); + end = start + PAGE_SIZE; + + /* + * First make sure there are no pending writes on the cache-lines to + * flush. + */ + asm volatile("mfence" : : : "memory"); + + for (cl = start; cl != end; cl += flush_size) + clflush(cl); +} + +static int set_clr_page_flags(struct x86_mapping_info *info, + unsigned long address, + pteval_t set, pteval_t clr) +{ + pgd_t *pgdp = (pgd_t *)top_level_pgt; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep, pte; + + /* + * First make sure there is a PMD mapping for 'address'. + * It should already exist, but keep things generic. + * + * To map the page just read from it and fault it in if there is no + * mapping yet. add_identity_map() can't be called here because that + * would unconditionally map the address on PMD level, destroying any + * PTE-level mappings that might already exist. Use assembly here so + * the access won't be optimized away. + */ + asm volatile("mov %[address], %%r9" + :: [address] "g" (*(unsigned long *)address) + : "r9", "memory"); + + /* + * The page is mapped at least with PMD size - so skip checks and walk + * directly to the PMD. + */ + p4dp = p4d_offset(pgdp, address); + pudp = pud_offset(p4dp, address); + pmdp = pmd_offset(pudp, address); + + if (pmd_large(*pmdp)) + ptep = split_large_pmd(info, pmdp, address); + else + ptep = pte_offset_kernel(pmdp, address); + + if (!ptep) + return -ENOMEM; + + /* + * Changing encryption attributes of a page requires to flush it from + * the caches. 
+ */ + if ((set | clr) & _PAGE_ENC) + clflush_page(address); + + /* Update PTE */ + pte = *ptep; + pte = pte_set_flags(pte, set); + pte = pte_clear_flags(pte, clr); + set_pte(ptep, pte); + + /* Flush TLB after changing encryption attribute */ + write_cr3(top_level_pgt); + + return 0; +} + +int set_page_decrypted(unsigned long address) +{ + return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC); +} + +int set_page_encrypted(unsigned long address) +{ + return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0); +} + +int set_page_non_present(unsigned long address) +{ + return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT); +} + +static void do_pf_error(const char *msg, unsigned long error_code, + unsigned long address, unsigned long ip) +{ + error_putstr(msg); + + error_putstr("\nError Code: "); + error_puthex(error_code); + error_putstr("\nCR2: 0x"); + error_puthex(address); + error_putstr("\nRIP relative to _head: 0x"); + error_puthex(ip - (unsigned long)_head); + error_putstr("\n"); + + error("Stopping.\n"); +} + +void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + unsigned long address = native_read_cr2(); + unsigned long end; + bool ghcb_fault; + + ghcb_fault = sev_es_check_ghcb_fault(address); + + address &= PMD_MASK; + end = address + PMD_SIZE; + + /* + * Check for unexpected error codes. Unexpected are: + * - Faults on present pages + * - User faults + * - Reserved bits set + */ + if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD)) + do_pf_error("Unexpected page-fault:", error_code, address, regs->ip); + else if (ghcb_fault) + do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip); + + /* + * Error code is sane - now identity map the 2M region around + * the faulting address. 
+ */ + add_identity_map(address, end); +} diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c new file mode 100644 index 000000000000..804a502ee0d2 --- /dev/null +++ b/arch/x86/boot/compressed/idt_64.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include "misc.h" + +static void set_idt_entry(int vector, void (*handler)(void)) +{ + unsigned long address = (unsigned long)handler; + gate_desc entry; + + memset(&entry, 0, sizeof(entry)); + + entry.offset_low = (u16)(address & 0xffff); + entry.segment = __KERNEL_CS; + entry.bits.type = GATE_TRAP; + entry.bits.p = 1; + entry.offset_middle = (u16)((address >> 16) & 0xffff); + entry.offset_high = (u32)(address >> 32); + + memcpy(&boot_idt[vector], &entry, sizeof(entry)); +} + +/* Have this here so we don't need to include */ +static void load_boot_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +/* Setup IDT before kernel jumping to .Lrelocated */ +void load_stage1_idt(void) +{ + boot_idt_desc.address = (unsigned long)boot_idt; + + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + set_idt_entry(X86_TRAP_VC, boot_stage1_vc); + + load_boot_idt(&boot_idt_desc); +} + +/* Setup IDT after kernel jumping to .Lrelocated */ +void load_stage2_idt(void) +{ + boot_idt_desc.address = (unsigned long)boot_idt; + + set_idt_entry(X86_TRAP_PF, boot_page_fault); + +#ifdef CONFIG_AMD_MEM_ENCRYPT + set_idt_entry(X86_TRAP_VC, boot_stage2_vc); +#endif + + load_boot_idt(&boot_idt_desc); +} diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S new file mode 100644 index 000000000000..22890e199f5b --- /dev/null +++ b/arch/x86/boot/compressed/idt_handlers_64.S @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Early IDT handler entry points + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel + */ + +#include + +/* For ORIG_RAX */ +#include "../../entry/calling.h" + +.macro 
EXCEPTION_HANDLER name function error_code=0 +SYM_FUNC_START(\name) + + /* Build pt_regs */ + .if \error_code == 0 + pushq $0 + .endif + + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + /* Call handler with pt_regs */ + movq %rsp, %rdi + /* Error code is second parameter */ + movq ORIG_RAX(%rsp), %rsi + call \function + + /* Restore regs */ + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + + /* Remove error code and return */ + addq $8, %rsp + + iretq +SYM_FUNC_END(\name) + .endm + + .text + .code64 + +EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1 + +#ifdef CONFIG_AMD_MEM_ENCRYPT +EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1 +EXCEPTION_HANDLER boot_stage2_vc do_boot_stage2_vc error_code=1 +#endif diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 877970d76249..b59547ce5b19 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -40,17 +40,8 @@ #include /* For COMMAND_LINE_SIZE */ #undef _SETUP -#ifdef CONFIG_X86_5LEVEL -unsigned int __pgtable_l5_enabled; -unsigned int pgdir_shift __ro_after_init = 39; -unsigned int ptrs_per_p4d __ro_after_init = 1; -#endif - extern unsigned long get_cmd_line_ptr(void); -/* Used by PAGE_KERN* macros: */ -pteval_t __default_kernel_pte_mask __read_mostly = ~0; - /* Simplified build-specific string for starting entropy. 
*/ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; @@ -406,8 +397,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, */ mem_avoid[MEM_AVOID_ZO_RANGE].start = input; mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; - add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start, - mem_avoid[MEM_AVOID_ZO_RANGE].size); /* Avoid initrd. */ initrd_start = (u64)boot_params->ext_ramdisk_image << 32; @@ -425,15 +414,11 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1; mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; - add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start, - mem_avoid[MEM_AVOID_CMDLINE].size); } /* Avoid boot parameters. */ mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); - add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start, - mem_avoid[MEM_AVOID_BOOTPARAMS].size); /* We don't need to set a mapping for setup_data. */ @@ -442,11 +427,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* Enumerate the immovable memory regions */ num_immovable_mem = count_immovable_mem_regions(); - -#ifdef CONFIG_X86_VERBOSE_BOOTUP - /* Make sure video RAM can be used. */ - add_identity_map(0, PMD_SIZE); -#endif } /* @@ -870,9 +850,6 @@ void choose_random_location(unsigned long input, boot_params->hdr.loadflags |= KASLR_FLAG; - /* Prepare to add new identity pagetables on demand. */ - initialize_identity_maps(); - if (IS_ENABLED(CONFIG_X86_32)) mem_limit = KERNEL_IMAGE_SIZE; else @@ -896,19 +873,8 @@ void choose_random_location(unsigned long input, warn("Physical KASLR disabled: no suitable memory region!"); } else { /* Update the new physical address location. 
*/ - if (*output != random_addr) { - add_identity_map(random_addr, output_size); + if (*output != random_addr) *output = random_addr; - } - - /* - * This loads the identity mapping page table. - * This should only be done if a new physical address - * is found for the kernel, otherwise we should keep - * the old page table to make it be like the "nokaslr" - * case. - */ - finalize_identity_maps(); } diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c deleted file mode 100644 index f9c5c13d979b..000000000000 --- a/arch/x86/boot/compressed/kaslr_64.c +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This code is used on x86_64 to create page table identity mappings on - * demand by building up a new set of page tables (or appending to the - * existing ones), and then switching over to them when ready. - * - * Copyright (C) 2015-2016 Yinghai Lu - * Copyright (C) 2016 Kees Cook - */ - -/* - * Since we're dealing with identity mappings, physical and virtual - * addresses are the same, so override these defines which are ultimately - * used by the headers in misc.h. - */ -#define __pa(x) ((unsigned long)(x)) -#define __va(x) ((void *)((unsigned long)(x))) - -/* No PAGE_TABLE_ISOLATION support needed either: */ -#undef CONFIG_PAGE_TABLE_ISOLATION - -#include "misc.h" - -/* These actually do the work of building the kernel identity maps. */ -#include -#include -/* Use the static base for this part of the boot process */ -#undef __PAGE_OFFSET -#define __PAGE_OFFSET __PAGE_OFFSET_BASE -#include "../../mm/ident_map.c" - -/* Used to track our page table allocation area. */ -struct alloc_pgt_data { - unsigned char *pgt_buf; - unsigned long pgt_buf_size; - unsigned long pgt_buf_offset; -}; - -/* - * Allocates space for a page table entry, using struct alloc_pgt_data - * above. Besides the local callers, this is used as the allocation - * callback in mapping_info below. 
- */ -static void *alloc_pgt_page(void *context) -{ - struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; - unsigned char *entry; - - /* Validate there is space available for a new page. */ - if (pages->pgt_buf_offset >= pages->pgt_buf_size) { - debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); - debug_putaddr(pages->pgt_buf_offset); - debug_putaddr(pages->pgt_buf_size); - return NULL; - } - - entry = pages->pgt_buf + pages->pgt_buf_offset; - pages->pgt_buf_offset += PAGE_SIZE; - - return entry; -} - -/* Used to track our allocated page tables. */ -static struct alloc_pgt_data pgt_data; - -/* The top level page table entry pointer. */ -static unsigned long top_level_pgt; - -phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; - -/* - * Mapping information structure passed to kernel_ident_mapping_init(). - * Due to relocation, pointers must be assigned at run time not build time. - */ -static struct x86_mapping_info mapping_info; - -/* Locates and clears a region for a new top level page table. */ -void initialize_identity_maps(void) -{ - /* If running as an SEV guest, the encryption mask is required. */ - set_sev_encryption_mask(); - - /* Exclude the encryption mask from __PHYSICAL_MASK */ - physical_mask &= ~sme_me_mask; - - /* Init mapping_info with run-time function/buffer pointers. */ - mapping_info.alloc_pgt_page = alloc_pgt_page; - mapping_info.context = &pgt_data; - mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; - mapping_info.kernpg_flag = _KERNPG_TABLE; - - /* - * It should be impossible for this not to already be true, - * but since calling this a second time would rewind the other - * counters, let's just make sure this is reset too. - */ - pgt_data.pgt_buf_offset = 0; - - /* - * If we came here via startup_32(), cr3 will be _pgtable already - * and we must append to the existing area instead of entirely - * overwriting it. 
- * - * With 5-level paging, we use '_pgtable' to allocate the p4d page table, - * the top-level page table is allocated separately. - * - * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level - * cases. On 4-level paging it's equal to 'top_level_pgt'. - */ - top_level_pgt = read_cr3_pa(); - if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { - debug_putstr("booted via startup_32()\n"); - pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; - pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; - memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); - } else { - debug_putstr("booted via startup_64()\n"); - pgt_data.pgt_buf = _pgtable; - pgt_data.pgt_buf_size = BOOT_PGT_SIZE; - memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); - top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); - } -} - -/* - * Adds the specified range to what will become the new identity mappings. - * Once all ranges have been added, the new mapping is activated by calling - * finalize_identity_maps() below. - */ -void add_identity_map(unsigned long start, unsigned long size) -{ - unsigned long end = start + size; - - /* Align boundary to 2M. */ - start = round_down(start, PMD_SIZE); - end = round_up(end, PMD_SIZE); - if (start >= end) - return; - - /* Build the mapping. */ - kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, - start, end); -} - -/* - * This switches the page tables to the new level4 that has been built - * via calls to add_identity_map() above. If booted via startup_32(), - * this is effectively a no-op. 
- */ -void finalize_identity_maps(void) -{ - write_cr3(top_level_pgt); -} diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e478e40fbe5a..267e7f93050e 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -442,6 +442,13 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, parse_elf(output); handle_relocations(output, output_len, virt_addr); debug_putstr("done.\nBooting the kernel.\n"); + + /* + * Flush GHCB from cache and map it encrypted again when running as + * SEV-ES guest. + */ + sev_es_shutdown_ghcb(); + return output; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3efce27ba35c..6d31f1b4c4d1 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -23,6 +23,7 @@ #include #include #include +#include #define BOOT_CTYPE_H #include @@ -36,6 +37,9 @@ #define memptr unsigned #endif +/* boot/compressed/vmlinux start and end markers */ +extern char _head[], _end[]; + /* misc.c */ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; @@ -81,8 +85,6 @@ void choose_random_location(unsigned long input, unsigned long *output, unsigned long output_size, unsigned long *virt_addr); -/* cpuflags.c */ -bool has_cpuflag(int flag); #else static inline void choose_random_location(unsigned long input, unsigned long input_size, @@ -93,18 +95,14 @@ static inline void choose_random_location(unsigned long input, } #endif +/* cpuflags.c */ +bool has_cpuflag(int flag); + #ifdef CONFIG_X86_64 -void initialize_identity_maps(void); -void add_identity_map(unsigned long start, unsigned long size); -void finalize_identity_maps(void); +extern int set_page_decrypted(unsigned long address); +extern int set_page_encrypted(unsigned long address); +extern int set_page_non_present(unsigned long address); extern unsigned char _pgtable[]; -#else -static inline void initialize_identity_maps(void) -{ } -static inline void 
add_identity_map(unsigned long start, unsigned long size) -{ } -static inline void finalize_identity_maps(void) -{ } #endif #ifdef CONFIG_EARLY_PRINTK @@ -119,6 +117,17 @@ static inline void console_init(void) void set_sev_encryption_mask(void); +#ifdef CONFIG_AMD_MEM_ENCRYPT +void sev_es_shutdown_ghcb(void); +extern bool sev_es_check_ghcb_fault(unsigned long address); +#else +static inline void sev_es_shutdown_ghcb(void) { } +static inline bool sev_es_check_ghcb_fault(unsigned long address) +{ + return false; +} +#endif + /* acpi.c */ #ifdef CONFIG_ACPI acpi_physical_address get_rsdp_addr(void); @@ -133,4 +142,21 @@ int count_immovable_mem_regions(void); static inline int count_immovable_mem_regions(void) { return 0; } #endif +/* ident_map_64.c */ +#ifdef CONFIG_X86_5LEVEL +extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d; +#endif + +/* Used by PAGE_KERN* macros: */ +extern pteval_t __default_kernel_pte_mask; + +/* idt_64.c */ +extern gate_desc boot_idt[BOOT_IDT_ENTRIES]; +extern struct desc_ptr boot_idt_desc; + +/* IDT Entry Points */ +void boot_page_fault(void); +void boot_stage1_vc(void); +void boot_stage2_vc(void); + #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c new file mode 100644 index 000000000000..954cb2702e23 --- /dev/null +++ b/arch/x86/boot/compressed/sev-es.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel + */ + +/* + * misc.h needs to be first because it knows how to include the other kernel + * headers in the pre-decompression code in a way that does not break + * compilation. 
+ */ +#include "misc.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "error.h" + +struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); +struct ghcb *boot_ghcb; + +/* + * Copy a version of this function here - insn-eval.c can't be used in + * pre-decompression code. + */ +static bool insn_has_rep_prefix(struct insn *insn) +{ + int i; + + insn_get_prefixes(insn); + + for (i = 0; i < insn->prefixes.nbytes; i++) { + insn_byte_t p = insn->prefixes.bytes[i]; + + if (p == 0xf2 || p == 0xf3) + return true; + } + + return false; +} + +/* + * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and + * doesn't use segments. + */ +static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ + return 0UL; +} + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + unsigned long low, high; + + asm volatile("rdmsr" : "=a" (low), "=d" (high) : + "c" (MSR_AMD64_SEV_ES_GHCB)); + + return ((high << 32) | low); +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = val & 0xffffffffUL; + high = val >> 32; + + asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB), + "a"(low), "d" (high) : "memory"); +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + enum es_result ret; + + memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); + + insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1); + insn_get_length(&ctxt->insn); + + ret = ctxt->insn.immediate.got ? 
ES_OK : ES_DECODE_FAILED; + + return ret; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + void *dst, char *buf, size_t size) +{ + memcpy(dst, buf, size); + + return ES_OK; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + void *src, char *buf, size_t size) +{ + memcpy(buf, src, size); + + return ES_OK; +} + +#undef __init +#undef __pa +#define __init +#define __pa(x) ((unsigned long)(x)) + +#define __BOOT_COMPRESSED + +/* Basic instruction decoding support needed */ +#include "../../lib/inat.c" +#include "../../lib/insn.c" + +/* Include code for early handlers */ +#include "../../kernel/sev-es-shared.c" + +static bool early_setup_sev_es(void) +{ + if (!sev_es_negotiate_protocol()) + sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED); + + if (set_page_decrypted((unsigned long)&boot_ghcb_page)) + return false; + + /* Page is now mapped decrypted, clear it */ + memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page)); + + boot_ghcb = &boot_ghcb_page; + + /* Initialize lookup tables for the instruction decoder */ + inat_init_tables(); + + return true; +} + +void sev_es_shutdown_ghcb(void) +{ + if (!boot_ghcb) + return; + + if (!sev_es_check_cpu_features()) + error("SEV-ES CPU Features missing."); + + /* + * GHCB Page must be flushed from the cache and mapped encrypted again. + * Otherwise the running kernel will see strange cache effects when + * trying to use that page. + */ + if (set_page_encrypted((unsigned long)&boot_ghcb_page)) + error("Can't map GHCB page encrypted"); + + /* + * GHCB page is mapped encrypted again and flushed from the cache. + * Mark it non-present now to catch bugs when #VC exceptions trigger + * after this point. 
+ */ + if (set_page_non_present((unsigned long)&boot_ghcb_page)) + error("Can't unmap GHCB page"); +} + +bool sev_es_check_ghcb_fault(unsigned long address) +{ + /* Check whether the fault was on the GHCB page */ + return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page); +} + +void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) +{ + struct es_em_ctxt ctxt; + enum es_result result; + + if (!boot_ghcb && !early_setup_sev_es()) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + vc_ghcb_invalidate(boot_ghcb); + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result != ES_OK) + goto finish; + + switch (exit_code) { + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(boot_ghcb, &ctxt); + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(boot_ghcb, &ctxt); + break; + default: + result = ES_UNSUPPORTED; + break; + } + +finish: + if (result == ES_OK) { + vc_finish_insn(&ctxt); + } else if (result != ES_RETRY) { + /* + * For now, just halt the machine. That makes debugging easier, + * later we just call sev_es_terminate() here. 
+ */ + while (true) + asm volatile("hlt\n"); + } +} diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 826e73488308..cad08703c4ad 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -101,6 +101,8 @@ SYM_CODE_START(entry_SYSCALL_64) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) + /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ @@ -446,6 +448,84 @@ _ASM_NOKPROBE(\asmsym) SYM_CODE_END(\asmsym) .endm +#ifdef CONFIG_AMD_MEM_ENCRYPT +/** + * idtentry_vc - Macro to generate entry stub for #VC + * @vector: Vector number + * @asmsym: ASM symbol for the entry point + * @cfunc: C function to be called + * + * The macro emits code to set up the kernel context for #VC. The #VC handler + * runs on an IST stack and needs to be able to cause nested #VC exceptions. + * + * To make this work the #VC entry code tries its best to pretend it doesn't use + * an IST stack by switching to the task stack if coming from user-space (which + * includes early SYSCALL entry path) or back to the stack in the IRET frame if + * entered from kernel-mode. + * + * If entered from kernel-mode the return stack is validated first, and if it is + * not safe to use (e.g. because it points to the entry stack) the #VC handler + * will switch to a fall-back stack (VC2) and call a special handler function. + * + * The macro is only used for one vector, but it is planned to be extended in + * the future for the #HV exception. + */ +.macro idtentry_vc vector asmsym cfunc +SYM_CODE_START(\asmsym) + UNWIND_HINT_IRET_REGS + ASM_CLAC + + /* + * If the entry is from userspace, switch stacks and treat it as + * a normal entry. + */ + testb $3, CS-ORIG_RAX(%rsp) + jnz .Lfrom_usermode_switch_stack_\@ + + /* + * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. 
+ * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS + */ + call paranoid_entry + + UNWIND_HINT_REGS + + /* + * Switch off the IST stack to make it free for nested exceptions. The + * vc_switch_off_ist() function will switch back to the interrupted + * stack if it is safe to do so. If not it switches to the VC fall-back + * stack. + */ + movq %rsp, %rdi /* pt_regs pointer */ + call vc_switch_off_ist + movq %rax, %rsp /* Switch to new stack */ + + UNWIND_HINT_REGS + + /* Update pt_regs */ + movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + + movq %rsp, %rdi /* pt_regs pointer */ + + call \cfunc + + /* + * No need to switch back to the IST stack. The current stack is either + * identical to the stack in the IRET frame or the VC fall-back stack, + * so it is definitely mapped even with PTI enabled. + */ + jmp paranoid_exit + + /* Switch to the regular task stack */ +.Lfrom_usermode_switch_stack_\@: + idtentry_body safe_stack_\cfunc, has_error_code=1 + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm +#endif + /* * Double fault entry. Straight paranoid. 
No checks from which context * this comes because for the espfix induced #DF this would do the wrong diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 8902fdb7de13..3d52b094850a 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -11,25 +11,29 @@ #ifdef CONFIG_X86_64 /* Macro to enforce the same ordering and stack sizes */ -#define ESTACKS_MEMBERS(guardsize) \ - char DF_stack_guard[guardsize]; \ - char DF_stack[EXCEPTION_STKSZ]; \ - char NMI_stack_guard[guardsize]; \ - char NMI_stack[EXCEPTION_STKSZ]; \ - char DB_stack_guard[guardsize]; \ - char DB_stack[EXCEPTION_STKSZ]; \ - char MCE_stack_guard[guardsize]; \ - char MCE_stack[EXCEPTION_STKSZ]; \ - char IST_top_guard[guardsize]; \ +#define ESTACKS_MEMBERS(guardsize, optional_stack_size) \ + char DF_stack_guard[guardsize]; \ + char DF_stack[EXCEPTION_STKSZ]; \ + char NMI_stack_guard[guardsize]; \ + char NMI_stack[EXCEPTION_STKSZ]; \ + char DB_stack_guard[guardsize]; \ + char DB_stack[EXCEPTION_STKSZ]; \ + char MCE_stack_guard[guardsize]; \ + char MCE_stack[EXCEPTION_STKSZ]; \ + char VC_stack_guard[guardsize]; \ + char VC_stack[optional_stack_size]; \ + char VC2_stack_guard[guardsize]; \ + char VC2_stack[optional_stack_size]; \ + char IST_top_guard[guardsize]; \ /* The exception stacks' physical storage. No guard pages required */ struct exception_stacks { - ESTACKS_MEMBERS(0) + ESTACKS_MEMBERS(0, 0) }; /* The effective cpu entry area mapping with guard pages. 
*/ struct cea_exception_stacks { - ESTACKS_MEMBERS(PAGE_SIZE) + ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ) }; /* @@ -40,6 +44,8 @@ enum exception_stack_ordering { ESTACK_NMI, ESTACK_DB, ESTACK_MCE, + ESTACK_VC, + ESTACK_VC2, N_EXCEPTION_STACKS }; @@ -139,4 +145,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu) #define __this_cpu_ist_top_va(name) \ CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name) +#define __this_cpu_ist_bottom_va(name) \ + CEA_ESTACK_BOT(__this_cpu_read(cea_exception_stacks), name) + #endif diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 7b0afd5e6c57..dad350d42ecf 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -236,6 +236,7 @@ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 1ced11d31932..476082a83d1c 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -383,6 +383,33 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) void alloc_intr_gate(unsigned int n, const void *addr); +static inline void init_idt_data(struct idt_data *data, unsigned int n, + const void *addr) +{ + BUG_ON(n > 0xFF); + + memset(data, 0, sizeof(*data)); + data->vector = n; + data->addr = addr; + data->segment = __KERNEL_CS; + data->bits.type = GATE_INTERRUPT; + data->bits.p = 1; +} + +static inline void idt_init_desc(gate_desc *gate, const struct 
idt_data *d) +{ + unsigned long addr = (unsigned long) d->addr; + + gate->offset_low = (u16) addr; + gate->segment = (u16) d->segment; + gate->bits = d->bits; + gate->offset_middle = (u16) (addr >> 16); +#ifdef CONFIG_X86_64 + gate->offset_high = (u32) (addr >> 32); + gate->reserved = 0; +#endif +} + extern unsigned long system_vectors[]; extern void load_current_idt(void); diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index a91f3b6e4f2a..f7e7099af595 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -74,6 +74,13 @@ struct idt_bits { p : 1; } __attribute__((packed)); +struct idt_data { + unsigned int vector; + unsigned int segment; + struct idt_bits bits; + const void *addr; +}; + struct gate_struct { u16 offset_low; u16 segment; @@ -109,6 +116,9 @@ struct desc_ptr { #endif /* !__ASSEMBLY__ */ +/* Boot IDT definitions */ +#define BOOT_IDT_ENTRIES 32 + /* Access rights as returned by LAR */ #define AR_TYPE_RODATA (0 * (1 << 9)) #define AR_TYPE_RWDATA (1 * (1 << 9)) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index eb1ed3bd8d96..8d33ad80704f 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -592,33 +593,4 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) update_pasid(); } -/* - * MXCSR and XCR definitions: - */ - -static inline void ldmxcsr(u32 mxcsr) -{ - asm volatile("ldmxcsr %0" :: "m" (mxcsr)); -} - -extern unsigned int mxcsr_feature_mask; - -#define XCR_XFEATURE_ENABLED_MASK 0x00000000 - -static inline u64 xgetbv(u32 index) -{ - u32 eax, edx; - - asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); - return eax + ((u64)edx << 32); -} - -static inline void xsetbv(u32 index, u64 value) -{ - u32 eax = value; - u32 edx = value >> 32; - - asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); -} - #endif /* 
_ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h new file mode 100644 index 000000000000..1c7ab8d95da5 --- /dev/null +++ b/arch/x86/include/asm/fpu/xcr.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_FPU_XCR_H +#define _ASM_X86_FPU_XCR_H + +/* + * MXCSR and XCR definitions: + */ + +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + +extern unsigned int mxcsr_feature_mask; + +#define XCR_XFEATURE_ENABLED_MASK 0x00000000 + +static inline u64 xgetbv(u32 index) +{ + u32 eax, edx; + + asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); + return eax + ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ + u32 eax = value; + u32 edx = value >> 32; + + asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); +} + +#endif /* _ASM_X86_FPU_XCR_H */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index cdd41d039cd1..b2442eb0ac2f 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -308,6 +308,19 @@ static __always_inline void __##func(struct pt_regs *regs) DECLARE_IDTENTRY_RAW(vector, func); \ __visible void noist_##func(struct pt_regs *regs) +/** + * DECLARE_IDTENTRY_VC - Declare functions for the VC entry point + * @vector: Vector number (ignored for C) + * @func: Function name of the entry point + * + * Maps to DECLARE_IDTENTRY_RAW_ERRORCODE, but declares also the + * safe_stack C handler. 
+ */ +#define DECLARE_IDTENTRY_VC(vector, func) \ + DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func); \ + __visible noinstr void ist_##func(struct pt_regs *regs, unsigned long error_code); \ + __visible noinstr void safe_stack_##func(struct pt_regs *regs, unsigned long error_code) + /** * DEFINE_IDTENTRY_IST - Emit code for IST entry points * @func: Function name of the entry point @@ -347,6 +360,35 @@ static __always_inline void __##func(struct pt_regs *regs) #define DEFINE_IDTENTRY_DF(func) \ DEFINE_IDTENTRY_RAW_ERRORCODE(func) +/** + * DEFINE_IDTENTRY_VC_SAFE_STACK - Emit code for VMM communication handler + which runs on a safe stack. + * @func: Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE + */ +#define DEFINE_IDTENTRY_VC_SAFE_STACK(func) \ + DEFINE_IDTENTRY_RAW_ERRORCODE(safe_stack_##func) + +/** + * DEFINE_IDTENTRY_VC_IST - Emit code for VMM communication handler + which runs on the VC fall-back stack + * @func: Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE + */ +#define DEFINE_IDTENTRY_VC_IST(func) \ + DEFINE_IDTENTRY_RAW_ERRORCODE(ist_##func) + +/** + * DEFINE_IDTENTRY_VC - Emit code for VMM communication handler + * @func: Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE + */ +#define DEFINE_IDTENTRY_VC(func) \ + DEFINE_IDTENTRY_RAW_ERRORCODE(func) + #else /* CONFIG_X86_64 */ /** @@ -433,6 +475,9 @@ __visible noinstr void func(struct pt_regs *regs, \ # define DECLARE_IDTENTRY_XENCB(vector, func) \ DECLARE_IDTENTRY(vector, func) +# define DECLARE_IDTENTRY_VC(vector, func) \ + idtentry_vc vector asm_##func func + #else # define DECLARE_IDTENTRY_MCE(vector, func) \ DECLARE_IDTENTRY(vector, func) @@ -564,6 +609,11 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); /* #DF */ DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); +/* #VC */ +#ifdef CONFIG_AMD_MEM_ENCRYPT +DECLARE_IDTENTRY_VC(X86_TRAP_VC, exc_vmm_communication); +#endif + #ifdef CONFIG_XEN_PV 
DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); #endif diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index 2b6ccf2c49f1..a0f839aa144d 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -15,9 +15,15 @@ #define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf) #define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4)) +bool insn_has_rep_prefix(struct insn *insn); void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs); int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs); +int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs); unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); int insn_get_code_seg_params(struct pt_regs *regs); +int insn_fetch_from_user(struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE]); +bool insn_decode(struct insn *insn, struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE], int buf_size); #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 5049f6c22683..c9f5df0a1c10 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -19,6 +19,7 @@ #ifdef CONFIG_AMD_MEM_ENCRYPT extern u64 sme_me_mask; +extern u64 sev_status; extern bool sev_enabled; void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, @@ -48,8 +49,10 @@ void __init mem_encrypt_free_decrypted_mem(void); /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); +void __init sev_es_init_vc_handling(void); bool sme_active(void); bool sev_active(void); +bool sev_es_active(void); #define __bss_decrypted __attribute__((__section__(".bss..decrypted"))) @@ -70,8 +73,10 @@ static inline void __init sme_early_init(void) { } static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } +static inline void 
sev_es_init_vc_handling(void) { } static inline bool sme_active(void) { return false; } static inline bool sev_active(void) { return false; } +static inline bool sev_es_active(void) { return false; } static inline int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; } diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index c07a70ce7ffd..972a34d93505 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -470,9 +470,12 @@ #define MSR_AMD64_ICIBSEXTDCTL 0xc001103c #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 +#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e7752b4038ff..86651e86289d 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -4,7 +4,7 @@ #define _ASM_X86_NOSPEC_BRANCH_H_ #include -#include +#include #include #include diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 0aecc0b629e0..e3bae2b60a0d 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -63,12 +63,14 @@ extern void numa_clear_node(int cpu); extern void __init init_cpu_to_node(void); extern void numa_add_cpu(int cpu); extern void numa_remove_cpu(int cpu); +extern void init_gi_nodes(void); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } static inline void init_cpu_to_node(void) { } static inline void numa_add_cpu(int cpu) { } static inline void numa_remove_cpu(int cpu) { } +static inline 
void init_gi_nodes(void) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEBUG_PER_CPU_MAPS diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index d25534940bde..fdbffec4cfde 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -39,27 +39,6 @@ #define ORC_REG_SP_INDIRECT 9 #define ORC_REG_MAX 15 -/* - * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the - * caller's SP right before it made the call). Used for all callable - * functions, i.e. all C code and all callable asm functions. - * - * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points - * to a fully populated pt_regs from a syscall, interrupt, or exception. - * - * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset - * points to the iret return frame. - * - * The UNWIND_HINT macros are used only for the unwind_hint struct. They - * aren't used in struct orc_entry due to size and complexity constraints. - * Objtool converts them to real types when it converts the hints to orc - * entries. - */ -#define ORC_TYPE_CALL 0 -#define ORC_TYPE_REGS 1 -#define ORC_TYPE_REGS_IRET 2 -#define UNWIND_HINT_TYPE_RET_OFFSET 3 - #ifndef __ASSEMBLY__ /* * This struct is more or less a vastly simplified version of the DWARF Call @@ -78,19 +57,6 @@ struct orc_entry { unsigned end:1; } __packed; -/* - * This struct is used by asm and inline asm code to manually annotate the - * location of registers on the stack for the ORC unwinder. - * - * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. 
- */ -struct unwind_hint { - u32 ip; - s16 sp_offset; - u8 sp_reg; - u8 type; - u8 end; -}; #endif /* __ASSEMBLY__ */ #endif /* _ORC_TYPES_H */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 288b065955b7..d0c6c10c18a0 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -28,6 +28,7 @@ #define IST_INDEX_NMI 1 #define IST_INDEX_DB 2 #define IST_INDEX_MCE 3 +#define IST_INDEX_VC 4 /* * Set __PAGE_OFFSET to the most negative possible address + diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5e0dcc20614d..a02c67291cfc 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -28,7 +28,7 @@ #include extern pgd_t early_top_pgt[PTRS_PER_PGD]; -int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); +bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d8a82e650810..5ac507586769 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -696,6 +696,7 @@ extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void load_percpu_segment(int); extern void cpu_init(void); +extern void cpu_init_exception_handling(void); extern void cr4_init(void); static inline unsigned long get_debugctlmsr(void) diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 28996fe19301..2c35f1c01a2d 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -10,6 +10,7 @@ void syscall_init(void); #ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); +void entry_SYSCALL_64_safe_stack(void); long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif diff --git 
a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index b35030eeec36..5db5d083c873 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -21,6 +21,9 @@ struct real_mode_header { /* SMP trampoline */ u32 trampoline_start; u32 trampoline_header; +#ifdef CONFIG_AMD_MEM_ENCRYPT + u32 sev_es_trampoline_start; +#endif #ifdef CONFIG_X86_64 u32 trampoline_pgd; #endif @@ -57,6 +60,9 @@ extern unsigned char real_mode_blob_end[]; extern unsigned long initial_code; extern unsigned long initial_gs; extern unsigned long initial_stack; +#ifdef CONFIG_AMD_MEM_ENCRYPT +extern unsigned long initial_vc_handler; +#endif extern unsigned char real_mode_blob[]; extern unsigned char real_mode_relocs[]; @@ -66,6 +72,7 @@ extern unsigned char startup_32_smp[]; extern unsigned char boot_gdt[]; #else extern unsigned char secondary_startup_64[]; +extern unsigned char secondary_startup_64_no_verify[]; #endif static inline size_t real_mode_size_needed(void) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 517920928989..7fdd4facfce7 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -226,7 +226,7 @@ #define NUM_EXCEPTION_VECTORS 32 /* Bitmask of exception vectors which push an error code on the stack: */ -#define EXCEPTION_ERRCODE_MASK 0x00027d00 +#define EXCEPTION_ERRCODE_MASK 0x20027d00 #define GDT_SIZE (GDT_ENTRIES*8) #define GDT_ENTRY_TLS_ENTRIES 3 diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 84b645cc8bc9..7d7a064af6ff 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -39,6 +39,8 @@ void vsmp_init(void); static inline void vsmp_init(void) { } #endif +struct pt_regs; + void setup_bios_corruption_check(void); void early_platform_quirks(void); @@ -48,7 +50,9 @@ extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long physaddr, struct 
boot_params *bp); extern unsigned long __startup_secondary_64(void); -extern int early_make_pgtable(unsigned long address); +extern void startup_64_setup_env(unsigned long physbase); +extern void early_setup_idt(void); +extern void __init do_early_exception(struct pt_regs *regs, int trapnr); #ifdef CONFIG_X86_INTEL_MID extern void x86_intel_mid_early_setup(void); diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev-es.h new file mode 100644 index 000000000000..cf1d957c7091 --- /dev/null +++ b/arch/x86/include/asm/sev-es.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel + */ + +#ifndef __ASM_ENCRYPTED_STATE_H +#define __ASM_ENCRYPTED_STATE_H + +#include +#include + +#define GHCB_SEV_INFO 0x001UL +#define GHCB_SEV_INFO_REQ 0x002UL +#define GHCB_INFO(v) ((v) & 0xfffUL) +#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL) +#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL) +#define GHCB_PROTO_OUR 0x0001UL +#define GHCB_SEV_CPUID_REQ 0x004UL +#define GHCB_CPUID_REQ_EAX 0 +#define GHCB_CPUID_REQ_EBX 1 +#define GHCB_CPUID_REQ_ECX 2 +#define GHCB_CPUID_REQ_EDX 3 +#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \ + (((unsigned long)(reg) & 3) << 30) | \ + (((unsigned long)(fn)) << 32)) /* bits 31:30 select the register, bits 63:32 carry fn */ + +#define GHCB_PROTOCOL_MAX 0x0001UL +#define GHCB_DEFAULT_USAGE 0x0000UL + +#define GHCB_SEV_CPUID_RESP 0x005UL +#define GHCB_SEV_TERMINATE 0x100UL +#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \ + (((((u64)(reason_set)) & 0x7) << 12) | \ + ((((u64)(reason_val)) & 0xff) << 16)) /* bits 14:12 = reason_set, bits 23:16 = reason_val */ +#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 +#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 + +#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff) +#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } + +enum es_result { + ES_OK, /* All good */ + ES_UNSUPPORTED, /* Requested operation not supported */ + ES_VMM_ERROR, /* Unexpected state from the VMM */ + ES_DECODE_FAILED, /* Instruction decoding failed
*/ + ES_EXCEPTION, /* Instruction caused exception */ + ES_RETRY, /* Retry instruction emulation */ +}; + +struct es_fault_info { + unsigned long vector; + unsigned long error_code; + unsigned long cr2; +}; + +struct pt_regs; + +/* ES instruction emulation context */ +struct es_em_ctxt { + struct pt_regs *regs; + struct insn insn; + struct es_fault_info fi; +}; + +void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code); + +static inline u64 lower_bits(u64 val, unsigned int bits) +{ + u64 mask = bits < 64 ? (1ULL << bits) - 1 : ~0ULL; /* a shift by >= 64 is undefined; keep every bit of val instead */ + + return (val & mask); +} + +struct real_mode_header; +enum stack_type; + +/* Early IDT entry points for #VC handler */ +extern void vc_no_ghcb(void); +extern void vc_boot_ghcb(void); +extern bool handle_vc_boot_ghcb(struct pt_regs *regs); + +#ifdef CONFIG_AMD_MEM_ENCRYPT +extern struct static_key_false sev_es_enable_key; +extern void __sev_es_ist_enter(struct pt_regs *regs); +extern void __sev_es_ist_exit(void); +static __always_inline void sev_es_ist_enter(struct pt_regs *regs) +{ + if (static_branch_unlikely(&sev_es_enable_key)) + __sev_es_ist_enter(regs); +} +static __always_inline void sev_es_ist_exit(void) +{ + if (static_branch_unlikely(&sev_es_enable_key)) + __sev_es_ist_exit(); +} +extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh); +extern void __sev_es_nmi_complete(void); +static __always_inline void sev_es_nmi_complete(void) +{ + if (static_branch_unlikely(&sev_es_enable_key)) + __sev_es_nmi_complete(); +} +extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); +#else +static inline void sev_es_ist_enter(struct pt_regs *regs) { } +static inline void sev_es_ist_exit(void) { } +static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } +static inline void sev_es_nmi_complete(void) { } +static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +#endif + +#endif diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 5ae5a68e469d..49600643faba 100644
--- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -35,6 +35,8 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); +bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, + struct stack_info *info); const char *stack_type_name(enum stack_type type); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 8a1f5382a4ea..cf13f9e78585 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -150,14 +150,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_NESTED_CTL_NP_ENABLE BIT(0) #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) -struct __attribute__ ((__packed__)) vmcb_seg { +struct vmcb_seg { u16 selector; u16 attrib; u32 limit; u64 base; -}; +} __packed; -struct __attribute__ ((__packed__)) vmcb_save_area { +struct vmcb_save_area { struct vmcb_seg es; struct vmcb_seg cs; struct vmcb_seg ss; @@ -200,20 +200,67 @@ struct __attribute__ ((__packed__)) vmcb_save_area { u64 br_to; u64 last_excp_from; u64 last_excp_to; -}; + /* + * The following part of the save area is valid only for + * SEV-ES guests when referenced through the GHCB. 
+ */ + u8 reserved_7[104]; + u64 reserved_8; /* rax already available at 0x01f8 */ + u64 rcx; + u64 rdx; + u64 rbx; + u64 reserved_9; /* rsp already available at 0x01d8 */ + u64 rbp; + u64 rsi; + u64 rdi; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; + u8 reserved_10[16]; + u64 sw_exit_code; + u64 sw_exit_info_1; + u64 sw_exit_info_2; + u64 sw_scratch; + u8 reserved_11[56]; + u64 xcr0; + u8 valid_bitmap[16]; + u64 x87_state_gpa; +} __packed; + +struct ghcb { + struct vmcb_save_area save; + u8 reserved_save[2048 - sizeof(struct vmcb_save_area)]; + + u8 shared_buffer[2032]; + + u8 reserved_1[10]; + u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */ + u32 ghcb_usage; +} __packed; + + +#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032 +#define EXPECTED_VMCB_CONTROL_AREA_SIZE 256 +#define EXPECTED_GHCB_SIZE PAGE_SIZE static inline void __unused_size_checks(void) { - BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298); - BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256); + BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); + BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); + BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); } -struct __attribute__ ((__packed__)) vmcb { +struct vmcb { struct vmcb_control_area control; u8 reserved_control[1024 - sizeof(struct vmcb_control_area)]; struct vmcb_save_area save; -}; +} __packed; #define SVM_CPUID_FUNC 0x8000000a @@ -298,4 +345,47 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) +/* GHCB Accessor functions */ + +#define GHCB_BITMAP_IDX(field) \ + (offsetof(struct vmcb_save_area, field) / sizeof(u64)) + +#define DEFINE_GHCB_ACCESSORS(field) \ + static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ + { \ + return test_bit(GHCB_BITMAP_IDX(field), \ + (unsigned long *)&ghcb->save.valid_bitmap); \ + } \ + \ + static inline void 
ghcb_set_##field(struct ghcb *ghcb, u64 value) \ + { \ + __set_bit(GHCB_BITMAP_IDX(field), \ + (unsigned long *)&ghcb->save.valid_bitmap); \ + ghcb->save.field = value; \ + } + +DEFINE_GHCB_ACCESSORS(cpl) +DEFINE_GHCB_ACCESSORS(rip) +DEFINE_GHCB_ACCESSORS(rsp) +DEFINE_GHCB_ACCESSORS(rax) +DEFINE_GHCB_ACCESSORS(rcx) +DEFINE_GHCB_ACCESSORS(rdx) +DEFINE_GHCB_ACCESSORS(rbx) +DEFINE_GHCB_ACCESSORS(rbp) +DEFINE_GHCB_ACCESSORS(rsi) +DEFINE_GHCB_ACCESSORS(rdi) +DEFINE_GHCB_ACCESSORS(r8) +DEFINE_GHCB_ACCESSORS(r9) +DEFINE_GHCB_ACCESSORS(r10) +DEFINE_GHCB_ACCESSORS(r11) +DEFINE_GHCB_ACCESSORS(r12) +DEFINE_GHCB_ACCESSORS(r13) +DEFINE_GHCB_ACCESSORS(r14) +DEFINE_GHCB_ACCESSORS(r15) +DEFINE_GHCB_ACCESSORS(sw_exit_code) +DEFINE_GHCB_ACCESSORS(sw_exit_info_1) +DEFINE_GHCB_ACCESSORS(sw_exit_info_2) +DEFINE_GHCB_ACCESSORS(sw_scratch) +DEFINE_GHCB_ACCESSORS(xcr0) + #endif diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h new file mode 100644 index 000000000000..305bc1214aef --- /dev/null +++ b/arch/x86/include/asm/trap_pf.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_TRAP_PF_H +#define _ASM_X86_TRAP_PF_H + +/* + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch + * bit 5 == 1: protection keys block access + */ +enum x86_pf_error_code { + X86_PF_PROT = 1 << 0, + X86_PF_WRITE = 1 << 1, + X86_PF_USER = 1 << 2, + X86_PF_RSVD = 1 << 3, + X86_PF_INSTR = 1 << 4, + X86_PF_PK = 1 << 5, +}; + +#endif /* _ASM_X86_TRAP_PF_H */ diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h index 082f45631fa9..f5d2325aa0b7 100644 --- a/arch/x86/include/asm/trapnr.h +++ b/arch/x86/include/asm/trapnr.h @@ -26,6 +26,7 @@ #define X86_TRAP_XF 19 /* SIMD Floating-Point Exception */ #define X86_TRAP_VE 20 /* 
Virtualization Exception */ #define X86_TRAP_CP 21 /* Control Protection Exception */ +#define X86_TRAP_VC 29 /* VMM Communication Exception */ #define X86_TRAP_IRET 32 /* IRET Exception */ #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index df0b7bfc1234..7f7200021bd1 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -8,12 +8,14 @@ #include #include #include /* TRAP_TRACE, ... */ +#include #ifdef CONFIG_X86_64 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s); void __init trap_init(void); +asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs); #endif #ifdef CONFIG_X86_F00F_BUG @@ -43,22 +45,4 @@ void __noreturn handle_stack_overflow(const char *message, unsigned long fault_address); #endif -/* - * Page fault error code bits: - * - * bit 0 == 0: no page found 1: protection fault - * bit 1 == 0: read access 1: write access - * bit 2 == 0: kernel-mode access 1: user-mode access - * bit 3 == 1: use of reserved bit detected - * bit 4 == 1: fault was an instruction fetch - * bit 5 == 1: protection keys block access - */ -enum x86_pf_error_code { - X86_PF_PROT = 1 << 0, - X86_PF_WRITE = 1 << 1, - X86_PF_USER = 1 << 2, - X86_PF_RSVD = 1 << 3, - X86_PF_INSTR = 1 << 4, - X86_PF_PK = 1 << 5, -}; #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 7d903fdb3f43..664d4610d700 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -1,51 +1,17 @@ #ifndef _ASM_X86_UNWIND_HINTS_H #define _ASM_X86_UNWIND_HINTS_H +#include + #include "orc_types.h" #ifdef __ASSEMBLY__ -/* - * In asm, there are two kinds of code: normal C-type callable functions and - * the rest. 
The normal callable functions can be called by other code, and - * don't do anything unusual with the stack. Such normal callable functions - * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this - * category. In this case, no special debugging annotations are needed because - * objtool can automatically generate the ORC data for the ORC unwinder to read - * at runtime. - * - * Anything which doesn't fall into the above category, such as syscall and - * interrupt handlers, tends to not be called directly by other functions, and - * often does unusual non-C-function-type things with the stack pointer. Such - * code needs to be annotated such that objtool can understand it. The - * following CFI hint macros are for this type of code. - * - * These macros provide hints to objtool about the state of the stack at each - * instruction. Objtool starts from the hints and follows the code flow, - * making automatic CFI adjustments when it sees pushes and pops, filling out - * the debuginfo as necessary. It will also warn if it sees any - * inconsistencies. - */ -.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL end=0 -#ifdef CONFIG_STACK_VALIDATION -.Lunwind_hint_ip_\@: - .pushsection .discard.unwind_hints - /* struct unwind_hint */ - .long .Lunwind_hint_ip_\@ - . 
- .short \sp_offset - .byte \sp_reg - .byte \type - .byte \end - .balign 4 - .popsection -#endif -.endm - .macro UNWIND_HINT_EMPTY - UNWIND_HINT sp_reg=ORC_REG_UNDEFINED end=1 + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1 .endm -.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 +.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 .if \base == %rsp .if \indirect .set sp_reg, ORC_REG_SP_INDIRECT @@ -66,24 +32,24 @@ .set sp_offset, \offset - .if \iret - .set type, ORC_TYPE_REGS_IRET + .if \partial + .set type, UNWIND_HINT_TYPE_REGS_PARTIAL .elseif \extra == 0 - .set type, ORC_TYPE_REGS_IRET + .set type, UNWIND_HINT_TYPE_REGS_PARTIAL .set sp_offset, \offset + (16*8) .else - .set type, ORC_TYPE_REGS + .set type, UNWIND_HINT_TYPE_REGS .endif UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type .endm .macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 - UNWIND_HINT_REGS base=\base offset=\offset iret=1 + UNWIND_HINT_REGS base=\base offset=\offset partial=1 .endm .macro UNWIND_HINT_FUNC sp_offset=8 - UNWIND_HINT sp_offset=\sp_offset + UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL .endm /* @@ -92,7 +58,7 @@ * initial_func_cfi. 
*/ .macro UNWIND_HINT_RET_OFFSET sp_offset=8 - UNWIND_HINT type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset + UNWIND_HINT sp_reg=ORC_REG_SP type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset .endm #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 397196fae24d..dde5b3f1e7cd 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -4,8 +4,10 @@ #include +struct ghcb; struct mpc_bus; struct mpc_cpu; +struct pt_regs; struct mpc_table; struct cpuinfo_x86; struct irq_domain; @@ -229,10 +231,22 @@ struct x86_legacy_features { /** * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks * - * @pin_vcpu: pin current vcpu to specified physical cpu (run rarely) + * @pin_vcpu: pin current vcpu to specified physical + * cpu (run rarely) + * @sev_es_hcall_prepare: Load additional hypervisor-specific + * state into the GHCB when doing a VMMCALL under + * SEV-ES. Called from the #VC exception handler. + * @sev_es_hcall_finish: Copies state from the GHCB back into the + * processor (or pt_regs). Also runs checks on the + * state returned from the hypervisor after a + * VMMCALL under SEV-ES. Needs to return 'false' + * if the checks fail. Called from the #VC + * exception handler. 
*/ struct x86_hyper_runtime { void (*pin_vcpu)(int cpu); + void (*sev_es_hcall_prepare)(struct ghcb *ghcb, struct pt_regs *regs); + bool (*sev_es_hcall_finish)(struct ghcb *ghcb, struct pt_regs *regs); }; /** diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 2e8a30f06c74..a7a3403645e5 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -29,6 +29,7 @@ #define SVM_EXIT_WRITE_DR6 0x036 #define SVM_EXIT_WRITE_DR7 0x037 #define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_LAST_EXCP 0x05f #define SVM_EXIT_INTR 0x060 #define SVM_EXIT_NMI 0x061 #define SVM_EXIT_SMI 0x062 @@ -80,6 +81,16 @@ #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 +/* SEV-ES software-defined VMGEXIT events */ +#define SVM_VMGEXIT_MMIO_READ 0x80000001 +#define SVM_VMGEXIT_MMIO_WRITE 0x80000002 +#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003 +#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004 +#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005 +#define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 +#define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 +#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff + #define SVM_EXIT_ERR -1 #define SVM_EXIT_REASONS \ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index de09af019e23..04ceea8f4a89 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -20,6 +20,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg +CFLAGS_REMOVE_sev-es.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n @@ -27,6 +28,7 @@ KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n +KASAN_SANITIZE_sev-es.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. 
@@ -146,6 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev-es.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index dcc3d943c68f..6062ce586b95 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -614,7 +614,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) * If BIOS has not enabled SME then don't advertise the * SME feature (set in scattered.c). * For SEV: If BIOS has not enabled SEV then don't advertise the - * SEV feature (set in scattered.c). + * SEV and SEV_ES feature (set in scattered.c). * * In all cases, since support for SME and SEV requires long mode, * don't advertise the feature under CONFIG_X86_32. @@ -645,6 +645,7 @@ clear_all: setup_clear_cpu_cap(X86_FEATURE_SME); clear_sev: setup_clear_cpu_cap(X86_FEATURE_SEV); + setup_clear_cpu_cap(X86_FEATURE_SEV_ES); } } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c51158914ea2..35ad8480c464 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1876,6 +1876,8 @@ static inline void tss_setup_ist(struct tss_struct *tss) tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); + /* Only mapped when SEV-ES is active */ + tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC); } #else /* CONFIG_X86_64 */ @@ -1907,6 +1909,29 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss) #endif } +/* + * Setup everything needed to handle exceptions from the IDT, including the IST + * exceptions which use paranoid_entry(). 
+ */ +void cpu_init_exception_handling(void) +{ + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + int cpu = raw_smp_processor_id(); + + /* paranoid_entry() gets the CPU number from the GDT */ + setup_getcpu(cpu); + + /* IST vectors need TSS to be set up. */ + tss_setup_ist(tss); + tss_setup_io_bitmap(tss); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + + load_TR_desc(); + + /* Finally load the IDT */ + load_current_idt(); +} + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 2eb0a8c44b35..866c9a9bcdee 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -42,6 +42,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, + { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 }, { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 9b6fafa69be9..924571fe5864 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -33,6 +33,7 @@ #include #include #include +#include #undef pr_fmt #define pr_fmt(fmt) "vmware: " fmt @@ -476,10 +477,49 @@ static bool __init vmware_legacy_x2apic_available(void) (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0; } +#ifdef CONFIG_AMD_MEM_ENCRYPT +static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb, + struct pt_regs *regs) +{ + /* Copy VMWARE specific Hypercall parameters to the GHCB */ + ghcb_set_rip(ghcb, regs->ip); + ghcb_set_rbx(ghcb, regs->bx); + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_rsi(ghcb, regs->si); + ghcb_set_rdi(ghcb, regs->di); + ghcb_set_rbp(ghcb, regs->bp); +} + +static bool 
vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + if (!(ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb) && + ghcb_rsi_is_valid(ghcb) && + ghcb_rdi_is_valid(ghcb) && + ghcb_rbp_is_valid(ghcb))) + return false; + + regs->bx = ghcb->save.rbx; + regs->cx = ghcb->save.rcx; + regs->dx = ghcb->save.rdx; + regs->si = ghcb->save.rsi; + regs->di = ghcb->save.rdi; + regs->bp = ghcb->save.rbp; + + return true; +} +#endif + const __initconst struct hypervisor_x86 x86_hyper_vmware = { - .name = "VMware", - .detect = vmware_platform, - .type = X86_HYPER_VMWARE, - .init.init_platform = vmware_platform_setup, - .init.x2apic_available = vmware_legacy_x2apic_available, + .name = "VMware", + .detect = vmware_platform, + .type = X86_HYPER_VMWARE, + .init.init_platform = vmware_platform_setup, + .init.x2apic_available = vmware_legacy_x2apic_available, +#ifdef CONFIG_AMD_MEM_ENCRYPT + .runtime.sev_es_hcall_prepare = vmware_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = vmware_sev_es_hcall_finish, +#endif }; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ea8d51ec251b..25c06b67e7e0 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -29,8 +29,8 @@ static int die_counter; static struct pt_regs exec_summary_regs; -bool in_task_stack(unsigned long *stack, struct task_struct *task, - struct stack_info *info) +bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info) { unsigned long *begin = task_stack_page(task); unsigned long *end = task_stack_page(task) + THREAD_SIZE; @@ -46,7 +46,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, return true; } -bool in_entry_stack(unsigned long *stack, struct stack_info *info) +/* Called from get_stack_info_noinstr - so must be noinstr too */ +bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info) { struct entry_stack *ss = 
cpu_entry_stack(smp_processor_id()); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 4a94d38cd141..1dd851397bd9 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -24,11 +24,13 @@ static const char * const exception_stack_names[] = { [ ESTACK_NMI ] = "NMI", [ ESTACK_DB ] = "#DB", [ ESTACK_MCE ] = "#MC", + [ ESTACK_VC ] = "#VC", + [ ESTACK_VC2 ] = "#VC2", }; const char *stack_type_name(enum stack_type type) { - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); if (type == STACK_TYPE_IRQ) return "IRQ"; @@ -79,16 +81,18 @@ struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { EPAGERANGE(NMI), EPAGERANGE(DB), EPAGERANGE(MCE), + EPAGERANGE(VC), + EPAGERANGE(VC2), }; -static bool in_exception_stack(unsigned long *stack, struct stack_info *info) +static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info) { unsigned long begin, end, stk = (unsigned long)stack; const struct estack_pages *ep; struct pt_regs *regs; unsigned int k; - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); /* @@ -122,7 +126,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) return true; } -static bool in_irq_stack(unsigned long *stack, struct stack_info *info) +static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); @@ -147,32 +151,38 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) return true; } +bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, + struct stack_info *info) +{ + if (in_task_stack(stack, task, info)) + return true; + + if (task != current) + return false; + + if 
(in_exception_stack(stack, info)) + return true; + + if (in_irq_stack(stack, info)) + return true; + + if (in_entry_stack(stack, info)) + return true; + + return false; +} + int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask) { + task = task ? : current; + if (!stack) goto unknown; - task = task ? : current; - - if (in_task_stack(stack, task, info)) - goto recursion_check; - - if (task != current) + if (!get_stack_info_noinstr(stack, task, info)) goto unknown; - if (in_exception_stack(stack, info)) - goto recursion_check; - - if (in_irq_stack(stack, info)) - goto recursion_check; - - if (in_entry_stack(stack, info)) - goto recursion_check; - - goto unknown; - -recursion_check: /* * Make sure we don't iterate through any given stack more than once. * If it comes up a second time then there's something wrong going on: diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index cbb71c1b574f..4199f25c0063 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -36,6 +36,11 @@ #include #include #include +#include +#include +#include +#include +#include /* * Manage page tables very early on. @@ -61,6 +66,24 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; EXPORT_SYMBOL(vmemmap_base); #endif +/* + * GDT used on the boot CPU before switching to virtual addresses. + */ +static struct desc_struct startup_gdt[GDT_ENTRIES] = { + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), +}; + +/* + * Address needs to be set at runtime because it references the startup_gdt + * while the kernel still uses a direct mapping. 
+ */ +static struct desc_ptr startup_gdt_descr = { + .size = sizeof(startup_gdt), + .address = 0, +}; + #define __head __section(.head.text) static void __head *fixup_pointer(void *ptr, unsigned long physaddr) @@ -297,7 +320,7 @@ static void __init reset_early_page_tables(void) } /* Create a new PMD entry */ -int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) +bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd) { unsigned long physaddr = address - __PAGE_OFFSET; pgdval_t pgd, *pgd_p; @@ -307,7 +330,7 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) /* Invalid address or early pgt is done ? */ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) - return -1; + return false; again: pgd_p = &early_top_pgt[pgd_index(address)].pgd; @@ -364,10 +387,10 @@ again: } pmd_p[pmd_index(address)] = pmd; - return 0; + return true; } -int __init early_make_pgtable(unsigned long address) +static bool __init early_make_pgtable(unsigned long address) { unsigned long physaddr = address - __PAGE_OFFSET; pmdval_t pmd; @@ -377,6 +400,19 @@ int __init early_make_pgtable(unsigned long address) return __early_make_pgtable(address, pmd); } +void __init do_early_exception(struct pt_regs *regs, int trapnr) +{ + if (trapnr == X86_TRAP_PF && + early_make_pgtable(native_read_cr2())) + return; + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT) && + trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs)) + return; + + early_fixup_exception(regs, trapnr); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) @@ -489,3 +525,81 @@ void __init x86_64_start_reservations(char *real_mode_data) start_kernel(); } + +/* + * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is + * used until the idt_table takes over. On the boot CPU this happens in + * x86_64_start_kernel(), on secondary CPUs in start_secondary(). 
In both cases + * this happens in the functions called from head_64.S. + * + * The idt_table can't be used that early because all the code modifying it is + * in idt.c and can be instrumented by tracing or KASAN, which both don't work + * during early CPU bringup. Also the idt_table has the runtime vectors + * configured which require certain CPU state to be setup already (like TSS), + * which also hasn't happened yet in early CPU bringup. + */ +static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; + +static struct desc_ptr bringup_idt_descr = { + .size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1, + .address = 0, /* Set at runtime */ +}; + +static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT + struct idt_data data; + gate_desc desc; + + init_idt_data(&data, n, handler); + idt_init_desc(&desc, &data); + native_write_idt_entry(idt, n, &desc); +#endif +} + +/* This runs while still in the direct mapping */ +static void startup_64_load_idt(unsigned long physbase) +{ + struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase); + gate_desc *idt = fixup_pointer(bringup_idt_table, physbase); + + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + void *handler; + + /* VMM Communication Exception */ + handler = fixup_pointer(vc_no_ghcb, physbase); + set_bringup_idt_handler(idt, X86_TRAP_VC, handler); + } + + desc->address = (unsigned long)idt; + native_load_idt(desc); +} + +/* This is used when running on kernel addresses */ +void early_setup_idt(void) +{ + /* VMM Communication Exception */ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + + bringup_idt_descr.address = (unsigned long)bringup_idt_table; + native_load_idt(&bringup_idt_descr); +} + +/* + * Setup boot CPU state needed before kernel switches to virtual addresses. 
+ */ +void __head startup_64_setup_env(unsigned long physbase) +{ + /* Load GDT */ + startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase); + native_load_gdt(&startup_gdt_descr); + + /* New GDT is live - reload data segment registers */ + asm volatile("movl %%eax, %%ds\n" + "movl %%eax, %%ss\n" + "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); + + startup_64_load_idt(physbase); +} diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 16da4ac01597..7eb2a1c87969 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -73,6 +73,20 @@ SYM_CODE_START_NOALIGN(startup_64) /* Set up the stack for verify_cpu(), similar to initial_stack below */ leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp + leaq _text(%rip), %rdi + pushq %rsi + call startup_64_setup_env + popq %rsi + + /* Now switch to __KERNEL_CS so IRET works reliably */ + pushq $__KERNEL_CS + leaq .Lon_kernel_cs(%rip), %rax + pushq %rax + lretq + +.Lon_kernel_cs: + UNWIND_HINT_EMPTY + /* Sanitize CPU configuration */ call verify_cpu @@ -111,6 +125,18 @@ SYM_CODE_START(secondary_startup_64) /* Sanitize CPU configuration */ call verify_cpu + /* + * The secondary_startup_64_no_verify entry point is only used by + * SEV-ES guests. In those guests the call to verify_cpu() would cause + * #VC exceptions which can not be handled at this stage of secondary + * CPU bringup. + * + * All non SEV-ES systems, especially Intel systems, need to execute + * verify_cpu() above to make sure NX is enabled. + */ +SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + /* * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. 
@@ -144,33 +170,6 @@ SYM_CODE_START(secondary_startup_64) 1: UNWIND_HINT_EMPTY - /* Check if nx is implemented */ - movl $0x80000001, %eax - cpuid - movl %edx,%edi - - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_SCE, %eax /* Enable System Call */ - btl $20,%edi /* No Execute supported? */ - jnc 1f - btsl $_EFER_NX, %eax - btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) -1: wrmsr /* Make changes effective */ - - /* Setup cr0 */ - movl $CR0_STATE, %eax - /* Make changes effective */ - movq %rax, %cr0 - - /* Setup a boot time stack */ - movq initial_stack(%rip), %rsp - - /* zero EFLAGS after setting rsp */ - pushq $0 - popfq - /* * We must switch to a new descriptor in kernel space for the GDT * because soon the kernel won't have access anymore to the userspace @@ -205,6 +204,41 @@ SYM_CODE_START(secondary_startup_64) movl initial_gs+4(%rip),%edx wrmsr + /* + * Setup a boot time stack - Any secondary CPU will have lost its stack + * by now because the cr3-switch above unmaps the real-mode stack + */ + movq initial_stack(%rip), %rsp + + /* Setup and Load IDT */ + pushq %rsi + call early_setup_idt + popq %rsi + + /* Check if nx is implemented */ + movl $0x80000001, %eax + cpuid + movl %edx,%edi + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_SCE, %eax /* Enable System Call */ + btl $20,%edi /* No Execute supported? */ + jnc 1f + btsl $_EFER_NX, %eax + btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) +1: wrmsr /* Make changes effective */ + + /* Setup cr0 */ + movl $CR0_STATE, %eax + /* Make changes effective */ + movq %rax, %cr0 + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + /* rsi is pointer to real mode structure with interesting info. 
pass it to C */ movq %rsi, %rdi @@ -257,6 +291,39 @@ SYM_CODE_START(start_cpu0) movq initial_stack(%rip), %rsp jmp .Ljump_to_C_code SYM_CODE_END(start_cpu0) +#endif + +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * VC Exception handler used during early boot when running on kernel + * addresses, but before the switch to the idt_table can be made. + * The early_idt_handler_array can't be used here because it calls into a lot + * of __init code and this handler is also used during CPU offlining/onlining. + * Therefore this handler ends up in the .text section so that it stays around + * when .init.text is freed. + */ +SYM_CODE_START_NOALIGN(vc_boot_ghcb) + UNWIND_HINT_IRET_REGS offset=8 + + /* Build pt_regs */ + PUSH_AND_CLEAR_REGS + + /* Call C handler */ + movq %rsp, %rdi + movq ORIG_RAX(%rsp), %rsi + movq initial_vc_handler(%rip), %rax + ANNOTATE_RETPOLINE_SAFE + call *%rax + + /* Unwind pt_regs */ + POP_REGS + + /* Remove Error Code */ + addq $8, %rsp + + /* Pure iret required here - don't use INTERRUPT_RETURN */ + iretq +SYM_CODE_END(vc_boot_ghcb) #endif /* Both SMP bootup and ACPI suspend change these variables */ @@ -264,6 +331,9 @@ SYM_CODE_END(start_cpu0) .balign 8 SYM_DATA(initial_code, .quad x86_64_start_kernel) SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data)) +#ifdef CONFIG_AMD_MEM_ENCRYPT +SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb) +#endif /* * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder @@ -319,22 +389,43 @@ SYM_CODE_START_LOCAL(early_idt_handler_common) pushq %r15 /* pt_regs->r15 */ UNWIND_HINT_REGS - cmpq $14,%rsi /* Page fault? 
*/ - jnz 10f - GET_CR2_INTO(%rdi) /* can clobber %rax if pv */ - call early_make_pgtable - andl %eax,%eax - jz 20f /* All good */ - -10: movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ - call early_fixup_exception + call do_early_exception -20: decl early_recursion_flag(%rip) jmp restore_regs_and_return_to_kernel SYM_CODE_END(early_idt_handler_common) +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * VC Exception handler used during very early boot. The + * early_idt_handler_array can't be used because it returns via the + * paravirtualized INTERRUPT_RETURN and pv-ops don't work that early. + * + * This handler will end up in the .init.text section and not be + * available to boot secondary CPUs. + */ +SYM_CODE_START_NOALIGN(vc_no_ghcb) + UNWIND_HINT_IRET_REGS offset=8 + + /* Build pt_regs */ + PUSH_AND_CLEAR_REGS + + /* Call C handler */ + movq %rsp, %rdi + movq ORIG_RAX(%rsp), %rsi + call do_vc_no_ghcb + + /* Unwind pt_regs */ + POP_REGS + + /* Remove Error Code */ + addq $8, %rsp + + /* Pure iret required here - don't use INTERRUPT_RETURN */ + iretq +SYM_CODE_END(vc_no_ghcb) +#endif #define SYM_DATA_START_PAGE_ALIGNED(name) \ SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 1bffb87dcfdc..ee1a283f8e96 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -11,13 +11,6 @@ #include #include -struct idt_data { - unsigned int vector; - unsigned int segment; - struct idt_bits bits; - const void *addr; -}; - #define DPL0 0x0 #define DPL3 0x3 @@ -175,20 +168,6 @@ bool idt_is_f00f_address(unsigned long address) } #endif -static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) -{ - unsigned long addr = (unsigned long) d->addr; - - gate->offset_low = (u16) addr; - gate->segment = (u16) d->segment; - gate->bits = d->bits; - gate->offset_middle = (u16) (addr >> 16); -#ifdef CONFIG_X86_64 - gate->offset_high = (u32) (addr >> 32); - gate->reserved = 0; -#endif -} - static __init void 
idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) { @@ -206,14 +185,7 @@ static __init void set_intr_gate(unsigned int n, const void *addr) { struct idt_data data; - BUG_ON(n > 0xFF); - - memset(&data, 0, sizeof(data)); - data.vector = n; - data.addr = addr; - data.segment = __KERNEL_CS; - data.bits.type = GATE_INTERRUPT; - data.bits.p = 1; + init_idt_data(&data, n, addr); idt_setup_from_table(idt_table, &data, 1, false); } @@ -254,11 +226,14 @@ static const __initconst struct idt_data early_pf_idts[] = { * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), - ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), - ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), + ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), + ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), +#endif +#ifdef CONFIG_AMD_MEM_ENCRYPT + ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC), #endif }; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index db8f8693ab8d..547c7abb39f5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -38,9 +38,9 @@ #include #include #include -#include #include #include +#include #include #include diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 15e06408f6ba..041f0b50bc27 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9663ba31347c..1c0f2560a41c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include 
DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); @@ -744,13 +746,34 @@ static void __init kvm_init_platform(void) x86_platform.apic_post_init = kvm_apic_init; } +#if defined(CONFIG_AMD_MEM_ENCRYPT) +static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* RAX and CPL are already in the GHCB */ + ghcb_set_rbx(ghcb, regs->bx); + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_rsi(ghcb, regs->si); +} + +static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* No checking of the return state needed */ + return true; +} +#endif + const __initconst struct hypervisor_x86 x86_hyper_kvm = { - .name = "KVM", - .detect = kvm_detect, - .type = X86_HYPER_KVM, - .init.guest_late_init = kvm_guest_init, - .init.x2apic_available = kvm_para_available, - .init.init_platform = kvm_init_platform, + .name = "KVM", + .detect = kvm_detect, + .type = X86_HYPER_KVM, + .init.guest_late_init = kvm_guest_init, + .init.x2apic_available = kvm_para_available, + .init.init_platform = kvm_init_platform, +#if defined(CONFIG_AMD_MEM_ENCRYPT) + .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish, +#endif }; static __init int activate_jump_labels(void) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 47381666d6a5..4bc77aaf1303 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -33,6 +33,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -476,6 +477,12 @@ DEFINE_IDTENTRY_RAW(exc_nmi) { bool irq_state; + /* + * Re-enable NMIs right here when running as an SEV-ES guest. This might + * cause nested NMIs, but those can be handled safely. 
+ */ + sev_es_nmi_complete(); + if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) return; @@ -487,6 +494,12 @@ DEFINE_IDTENTRY_RAW(exc_nmi) this_cpu_write(nmi_cr2, read_cr2()); nmi_restart: + /* + * Needs to happen before DR7 is accessed, because the hypervisor can + * intercept DR7 reads/writes, turning those into #VC exceptions. + */ + sev_es_ist_enter(regs); + this_cpu_write(nmi_dr7, local_db_save()); irq_state = idtentry_enter_nmi(regs); @@ -500,6 +513,8 @@ nmi_restart: local_db_restore(this_cpu_read(nmi_dr7)); + sev_es_ist_exit(); + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) write_cr2(this_cpu_read(nmi_cr2)); if (this_cpu_dec_return(nmi_state)) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a515e2d230b7..db115943e8bd 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 210e878c4c0d..b16caee53bea 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1199,6 +1199,7 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); init_cpu_to_node(); + init_gi_nodes(); io_apic_init_mappings(); diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c new file mode 100644 index 000000000000..5f83ccaab877 --- /dev/null +++ b/arch/x86/kernel/sev-es-shared.c @@ -0,0 +1,507 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel + * + * This file is not compiled stand-alone. It contains code shared + * between the pre-decompression boot code and the running Linux kernel + * and is included directly into both code-bases. 
+ */ + +#ifndef __BOOT_COMPRESSED +#define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) +#endif + +static bool __init sev_es_check_cpu_features(void) +{ + if (!has_cpuflag(X86_FEATURE_RDRAND)) { + error("RDRAND instruction not supported - no trusted source of randomness available\n"); + return false; + } + + return true; +} + +static void sev_es_terminate(unsigned int reason) +{ + u64 val = GHCB_SEV_TERMINATE; + + /* + * Tell the hypervisor what went wrong - only reason-set 0 is + * currently supported. + */ + val |= GHCB_SEV_TERMINATE_REASON(0, reason); + + /* Request Guest Termination from Hypervisor */ + sev_es_wr_ghcb_msr(val); + VMGEXIT(); + + while (true) + asm volatile("hlt\n" : : : "memory"); +} + +static bool sev_es_negotiate_protocol(void) +{ + u64 val; + + /* Do the GHCB protocol version negotiation */ + sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + + if (GHCB_INFO(val) != GHCB_SEV_INFO) + return false; + + if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR || + GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR) + return false; + + return true; +} + +static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) +{ + memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +static bool vc_decoding_needed(unsigned long exit_code) +{ + /* Exceptions don't require decoding the instruction */ + return !(exit_code >= SVM_EXIT_EXCP_BASE && + exit_code <= SVM_EXIT_LAST_EXCP); +} + +static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, + struct pt_regs *regs, + unsigned long exit_code) +{ + enum es_result ret = ES_OK; + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->regs = regs; + + if (vc_decoding_needed(exit_code)) + ret = vc_decode_insn(ctxt); + + return ret; +} + +static void vc_finish_insn(struct es_em_ctxt *ctxt) +{ + ctxt->regs->ip += ctxt->insn.length; +} + +static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) +{
+ enum es_result ret; + + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = GHCB_PROTOCOL_MAX; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) { + u64 info = ghcb->save.sw_exit_info_2; + unsigned long v; + + info = ghcb->save.sw_exit_info_2; + v = info & SVM_EVTINJ_VEC_MASK; + + /* Check if exception information from hypervisor is sane. */ + if ((info & SVM_EVTINJ_VALID) && + ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && + ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { + ctxt->fi.vector = v; + if (info & SVM_EVTINJ_VALID_ERR) + ctxt->fi.error_code = info >> 32; + ret = ES_EXCEPTION; + } else { + ret = ES_VMM_ERROR; + } + } else { + ret = ES_OK; + } + + return ret; +} + +/* + * Boot VC Handler - This is the first VC handler during boot, there is no GHCB + * page yet, so it only supports the MSR based communication with the + * hypervisor and only the CPUID exit-code. 
+ */ +void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +{ + unsigned int fn = lower_bits(regs->ax, 32); + unsigned long val; + + /* Only CPUID is supported via MSR protocol */ + if (exit_code != SVM_EXIT_CPUID) + goto fail; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->ax = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->bx = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->cx = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->dx = val >> 32; + + /* Skip over the CPUID two-byte opcode */ + regs->ip += 2; + + return; + +fail: + sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE); + VMGEXIT(); + + /* Shouldn't get here - if we do halt the machine */ + while (true) + asm volatile("hlt\n"); +} + +static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, + void *src, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, b = backwards ? -1 : 1; + enum es_result ret = ES_OK; + + for (i = 0; i < count; i++) { + void *s = src + (i * data_size * b); + char *d = buf + (i * data_size); + + ret = vc_read_mem(ctxt, s, d, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, + void *dst, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, s = backwards ? 
-1 : 1; + enum es_result ret = ES_OK; + + for (i = 0; i < count; i++) { + void *d = dst + (i * data_size * s); + char *b = buf + (i * data_size); + + ret = vc_write_mem(ctxt, d, b, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +#define IOIO_TYPE_STR BIT(2) +#define IOIO_TYPE_IN 1 +#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) +#define IOIO_TYPE_OUT 0 +#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) + +#define IOIO_REP BIT(3) + +#define IOIO_ADDR_64 BIT(9) +#define IOIO_ADDR_32 BIT(8) +#define IOIO_ADDR_16 BIT(7) + +#define IOIO_DATA_32 BIT(6) +#define IOIO_DATA_16 BIT(5) +#define IOIO_DATA_8 BIT(4) + +#define IOIO_SEG_ES (0 << 10) +#define IOIO_SEG_DS (3 << 10) + +static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) +{ + struct insn *insn = &ctxt->insn; + *exitinfo = 0; + + switch (insn->opcode.bytes[0]) { + /* INS opcodes */ + case 0x6c: + case 0x6d: + *exitinfo |= IOIO_TYPE_INS; + *exitinfo |= IOIO_SEG_ES; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* OUTS opcodes */ + case 0x6e: + case 0x6f: + *exitinfo |= IOIO_TYPE_OUTS; + *exitinfo |= IOIO_SEG_DS; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* IN immediate opcodes */ + case 0xe4: + case 0xe5: + *exitinfo |= IOIO_TYPE_IN; + *exitinfo |= (u64)insn->immediate.value << 16; + break; + + /* OUT immediate opcodes */ + case 0xe6: + case 0xe7: + *exitinfo |= IOIO_TYPE_OUT; + *exitinfo |= (u64)insn->immediate.value << 16; + break; + + /* IN register opcodes */ + case 0xec: + case 0xed: + *exitinfo |= IOIO_TYPE_IN; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* OUT register opcodes */ + case 0xee: + case 0xef: + *exitinfo |= IOIO_TYPE_OUT; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + default: + return ES_DECODE_FAILED; + } + + switch (insn->opcode.bytes[0]) { + case 0x6c: + case 0x6e: + case 0xe4: + case 0xe6: + case 0xec: + case 0xee: + /* Single byte opcodes */ + *exitinfo |= IOIO_DATA_8; + 
break; + default: + /* Length determined by instruction parsing */ + *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 + : IOIO_DATA_32; + } + switch (insn->addr_bytes) { + case 2: + *exitinfo |= IOIO_ADDR_16; + break; + case 4: + *exitinfo |= IOIO_ADDR_32; + break; + case 8: + *exitinfo |= IOIO_ADDR_64; + break; + } + + if (insn_has_rep_prefix(insn)) + *exitinfo |= IOIO_REP; + + return ES_OK; +} + +static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u64 exit_info_1, exit_info_2; + enum es_result ret; + + ret = vc_ioio_exitinfo(ctxt, &exit_info_1); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_STR) { + + /* (REP) INS/OUTS */ + + bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); + unsigned int io_bytes, exit_bytes; + unsigned int ghcb_count, op_count; + unsigned long es_base; + u64 sw_scratch; + + /* + * For the string variants with rep prefix the amount of in/out + * operations per #VC exception is limited so that the kernel + * has a chance to take interrupts and re-schedule while the + * instruction is emulated. + */ + io_bytes = (exit_info_1 >> 4) & 0x7; + ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; + + op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1; + exit_info_2 = min(op_count, ghcb_count); + exit_bytes = exit_info_2 * io_bytes; + + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + /* Read bytes of OUTS into the shared buffer */ + if (!(exit_info_1 & IOIO_TYPE_IN)) { + ret = vc_insn_string_read(ctxt, + (void *)(es_base + regs->si), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + } + + /* + * Issue an VMGEXIT to the HV to consume the bytes from the + * shared buffer or to have it write them into the shared buffer + * depending on the instruction: OUTS or INS. 
+ */ + sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); + ghcb_set_sw_scratch(ghcb, sw_scratch); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, + exit_info_1, exit_info_2); + if (ret != ES_OK) + return ret; + + /* Read bytes from shared buffer into the guest's destination. */ + if (exit_info_1 & IOIO_TYPE_IN) { + ret = vc_insn_string_write(ctxt, + (void *)(es_base + regs->di), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + + if (df) + regs->di -= exit_bytes; + else + regs->di += exit_bytes; + } else { + if (df) + regs->si -= exit_bytes; + else + regs->si += exit_bytes; + } + + if (exit_info_1 & IOIO_REP) + regs->cx -= exit_info_2; + + ret = regs->cx ? ES_RETRY : ES_OK; + + } else { + + /* IN/OUT into/from rAX */ + + int bits = (exit_info_1 & 0x70) >> 1; + u64 rax = 0; + + if (!(exit_info_1 & IOIO_TYPE_IN)) + rax = lower_bits(regs->ax, bits); + + ghcb_set_rax(ghcb, rax); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_IN) { + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + regs->ax = lower_bits(ghcb->save.rax, bits); + } + } + + return ret; +} + +static enum es_result vc_handle_cpuid(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u32 cr4 = native_read_cr4(); + enum es_result ret; + + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rcx(ghcb, regs->cx); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #GP - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + regs->ax = ghcb->save.rax; + regs->bx = ghcb->save.rbx; + regs->cx = ghcb->save.rcx; 
+ regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); + enum es_result ret; + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && + (!rdtscp || ghcb_rcx_is_valid(ghcb)))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + if (rdtscp) + ctxt->regs->cx = ghcb->save.rcx; + + return ES_OK; +} diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c new file mode 100644 index 000000000000..4a96726fbaf8 --- /dev/null +++ b/arch/x86/kernel/sev-es.c @@ -0,0 +1,1404 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel + */ + +#define pr_fmt(fmt) "SEV-ES: " fmt + +#include /* For show_regs() */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DR7_RESET_VALUE 0x400 + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +static struct ghcb __initdata *boot_ghcb; + +/* #VC handler runtime per-CPU data */ +struct sev_es_runtime_data { + struct ghcb ghcb_page; + + /* Physical storage for the per-CPU IST stack of the #VC handler */ + char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); + + /* + * Physical storage for the per-CPU fall-back stack of the #VC handler. + * The fall-back stack is used when it is not safe to switch back to the + * interrupted stack in the #VC entry code. 
+ */ + char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); + + /* + * Reserve one page per CPU as backup storage for the unencrypted GHCB. + * It is needed when an NMI happens while the #VC handler uses the real + * GHCB, and the NMI handler itself is causing another #VC exception. In + * that case the GHCB content of the first handler needs to be backed up + * and restored. + */ + struct ghcb backup_ghcb; + + /* + * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. + * There is no need for it to be atomic, because nothing is written to + * the GHCB between the read and the write of ghcb_active. So it is safe + * to use it when a nested #VC exception happens before the write. + * + * This is necessary for example in the #VC->NMI->#VC case when the NMI + * happens while the first #VC handler uses the GHCB. When the NMI code + * raises a second #VC handler it might overwrite the contents of the + * GHCB written by the first handler. To avoid this the content of the + * GHCB is saved and restored when the GHCB is detected to be in use + * already. + */ + bool ghcb_active; + bool backup_ghcb_active; + + /* + * Cached DR7 value - write it on DR7 writes and return it on reads. + * That value will never make it to the real hardware DR7 as debugging + * is currently unsupported in SEV-ES guests. 
+ */ + unsigned long dr7; +}; + +struct ghcb_state { + struct ghcb *ghcb; +}; + +static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); + +/* Needed in vc_early_forward_exception */ +void do_early_exception(struct pt_regs *regs, int trapnr); + +static void __init setup_vc_stacks(int cpu) +{ + struct sev_es_runtime_data *data; + struct cpu_entry_area *cea; + unsigned long vaddr; + phys_addr_t pa; + + data = per_cpu(runtime_data, cpu); + cea = get_cpu_entry_area(cpu); + + /* Map #VC IST stack */ + vaddr = CEA_ESTACK_BOT(&cea->estacks, VC); + pa = __pa(data->ist_stack); + cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); + + /* Map VC fall-back stack */ + vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2); + pa = __pa(data->fallback_stack); + cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); +} + +static __always_inline bool on_vc_stack(unsigned long sp) +{ + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); +} + +/* + * This function handles the case when an NMI is raised in the #VC exception + * handler entry code. In this case, the IST entry for #VC must be adjusted, so + * that any subsequent #VC exception will not overwrite the stack contents of the + * interrupted #VC handler. + * + * The IST entry is adjusted unconditionally so that it can also be + * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested + * sev_es_ist_exit() call may adjust back the IST entry too early.
+ */ +void noinstr __sev_es_ist_enter(struct pt_regs *regs) +{ + unsigned long old_ist, new_ist; + + /* Read old IST entry */ + old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + /* Make room on the IST stack */ + if (on_vc_stack(regs->sp)) + new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); + else + new_ist = old_ist - sizeof(old_ist); + + /* Store old IST entry */ + *(unsigned long *)new_ist = old_ist; + + /* Set new IST entry */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); +} + +void noinstr __sev_es_ist_exit(void) +{ + unsigned long ist; + + /* Read IST entry */ + ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) + return; + + /* Read back old IST entry and write it to the TSS */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); +} + +static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) + return NULL; + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + data->ghcb_active = false; + } +} + +/* Needed in vc_early_forward_exception */ +void do_early_exception(struct pt_regs *regs, int trapnr); + 
+static inline u64 sev_es_rd_ghcb_msr(void) +{ + return __rdmsr(MSR_AMD64_SEV_ES_GHCB); +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = (u32)(val); + high = (u32)(val >> 32); + + native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); +} + +static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, + unsigned char *buffer) +{ + return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + enum es_result ret; + int res; + + if (user_mode(ctxt->regs)) { + res = insn_fetch_from_user(ctxt->regs, buffer); + if (!res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + if (!insn_decode(&ctxt->insn, ctxt->regs, buffer, res)) + return ES_DECODE_FAILED; + } else { + res = vc_fetch_insn_kernel(ctxt, buffer); + if (res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1); + insn_get_length(&ctxt->insn); + } + + ret = ctxt->insn.immediate.got ? 
ES_OK : ES_DECODE_FAILED; + + return ret; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + char *dst, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; + char __user *target = (char __user *)dst; + u64 d8; + u32 d4; + u16 d2; + u8 d1; + + switch (size) { + case 1: + memcpy(&d1, buf, 1); + if (put_user(d1, target)) + goto fault; + break; + case 2: + memcpy(&d2, buf, 2); + if (put_user(d2, target)) + goto fault; + break; + case 4: + memcpy(&d4, buf, 4); + if (put_user(d4, target)) + goto fault; + break; + case 8: + memcpy(&d8, buf, 8); + if (put_user(d8, target)) + goto fault; + break; + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)dst; + + return ES_EXCEPTION; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + char *src, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT; + char __user *s = (char __user *)src; + u64 d8; + u32 d4; + u16 d2; + u8 d1; + + switch (size) { + case 1: + if (get_user(d1, s)) + goto fault; + memcpy(buf, &d1, 1); + break; + case 2: + if (get_user(d2, s)) + goto fault; + memcpy(buf, &d2, 2); + break; + case 4: + if (get_user(d4, s)) + goto fault; + memcpy(buf, &d4, 4); + break; + case 8: + if (get_user(d8, s)) + goto fault; + memcpy(buf, &d8, 8); + break; + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)src; + + return ES_EXCEPTION; +} + +static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned long vaddr, phys_addr_t *paddr) +{ + unsigned long va = 
(unsigned long)vaddr; + unsigned int level; + phys_addr_t pa; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd = &pgd[pgd_index(va)]; + pte = lookup_address_in_pgd(pgd, va, &level); + if (!pte) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.cr2 = vaddr; + ctxt->fi.error_code = 0; + + if (user_mode(ctxt->regs)) + ctxt->fi.error_code |= X86_PF_USER; + + return false; + } + + pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; + pa |= va & ~page_level_mask(level); + + *paddr = pa; + + return true; +} + +/* Include code shared with pre-decompression boot stage */ +#include "sev-es-shared.c" + +void noinstr __sev_es_nmi_complete(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = sev_es_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); + VMGEXIT(); + + sev_es_put_ghcb(&state); +} + +static u64 get_jump_table_addr(void) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u64 ret = 0; + + local_irq_save(flags); + + ghcb = sev_es_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (ghcb_sw_exit_info_1_is_valid(ghcb) && + ghcb_sw_exit_info_2_is_valid(ghcb)) + ret = ghcb->save.sw_exit_info_2; + + sev_es_put_ghcb(&state); + + local_irq_restore(flags); + + return ret; +} + +int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) +{ + u16 startup_cs, startup_ip; + phys_addr_t jump_table_pa; + u64 jump_table_addr; + u16 __iomem *jump_table; + + jump_table_addr = get_jump_table_addr(); + + /* On UP guests there is no jump table so this is not a failure */ + if (!jump_table_addr) + return 0; + + /* Check if AP Jump Table is page-aligned */ + if 
(jump_table_addr & ~PAGE_MASK) + return -EINVAL; + + jump_table_pa = jump_table_addr & PAGE_MASK; + + startup_cs = (u16)(rmh->trampoline_start >> 4); + startup_ip = (u16)(rmh->sev_es_trampoline_start - + rmh->trampoline_start); + + jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); + if (!jump_table) + return -EIO; + + writew(startup_ip, &jump_table[0]); + writew(startup_cs, &jump_table[1]); + + iounmap(jump_table); + + return 0; +} + +/* + * This is needed by the OVMF UEFI firmware which will use whatever it finds in + * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu + * runtime GHCBs used by the kernel are also mapped in the EFI page-table. + */ +int __init sev_es_efi_map_ghcbs(pgd_t *pgd) +{ + struct sev_es_runtime_data *data; + unsigned long address, pflags; + int cpu; + u64 pfn; + + if (!sev_es_active()) + return 0; + + pflags = _PAGE_NX | _PAGE_RW; + + for_each_possible_cpu(cpu) { + data = per_cpu(runtime_data, cpu); + + address = __pa(&data->ghcb_page); + pfn = address >> PAGE_SHIFT; + + if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) + return 1; + } + + return 0; +} + +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + enum es_result ret; + u64 exit_info_1; + + /* Is it a WRMSR? */ + exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; + + ghcb_set_rcx(ghcb, regs->cx); + if (exit_info_1) { + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rdx(ghcb, regs->dx); + } + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); + + if ((ret == ES_OK) && (!exit_info_1)) { + regs->ax = ghcb->save.rax; + regs->dx = ghcb->save.rdx; + } + + return ret; +} + +/* + * This function runs on the first #VC exception after the kernel + * switched to virtual addresses. + */ +static bool __init sev_es_setup_ghcb(void) +{ + /* First make sure the hypervisor talks a supported protocol. 
*/ + if (!sev_es_negotiate_protocol()) + return false; + + /* + * Clear the boot_ghcb. The first exception comes in before the bss + * section is cleared. + */ + memset(&boot_ghcb_page, 0, PAGE_SIZE); + + /* Alright - Make the boot-ghcb public */ + boot_ghcb = &boot_ghcb_page; + + return true; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void sev_es_ap_hlt_loop(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = sev_es_get_ghcb(&state); + + while (true) { + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + /* Wakeup signal? */ + if (ghcb_sw_exit_info_2_is_valid(ghcb) && + ghcb->save.sw_exit_info_2) + break; + } + + sev_es_put_ghcb(&state); +} + +/* + * Play_dead handler when running under SEV-ES. This is needed because + * the hypervisor can't deliver an SIPI request to restart the AP. + * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the + * hypervisor wakes it up again. + */ +static void sev_es_play_dead(void) +{ + play_dead_common(); + + /* IRQs now disabled */ + + sev_es_ap_hlt_loop(); + + /* + * If we get here, the VCPU was woken up again. Jump to CPU + * startup code to get it back online. 
+ */ + start_cpu0(); +} +#else /* CONFIG_HOTPLUG_CPU */ +#define sev_es_play_dead native_play_dead +#endif /* CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_SMP +static void __init sev_es_setup_play_dead(void) +{ + smp_ops.play_dead = sev_es_play_dead; +} +#else +static inline void sev_es_setup_play_dead(void) { } +#endif + +static void __init alloc_runtime_data(int cpu) +{ + struct sev_es_runtime_data *data; + + data = memblock_alloc(sizeof(*data), PAGE_SIZE); + if (!data) + panic("Can't allocate SEV-ES runtime data"); + + per_cpu(runtime_data, cpu) = data; +} + +static void __init init_ghcb(int cpu) +{ + struct sev_es_runtime_data *data; + int err; + + data = per_cpu(runtime_data, cpu); + + err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, + sizeof(data->ghcb_page)); + if (err) + panic("Can't map GHCBs unencrypted"); + + memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); + + data->ghcb_active = false; + data->backup_ghcb_active = false; +} + +void __init sev_es_init_vc_handling(void) +{ + int cpu; + + BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); + + if (!sev_es_active()) + return; + + if (!sev_es_check_cpu_features()) + panic("SEV-ES CPU Features missing"); + + /* Enable SEV-ES special handling */ + static_branch_enable(&sev_es_enable_key); + + /* Initialize per-cpu GHCB pages */ + for_each_possible_cpu(cpu) { + alloc_runtime_data(cpu); + init_ghcb(cpu); + setup_vc_stacks(cpu); + } + + sev_es_setup_play_dead(); + + /* Secondary CPUs use the runtime #VC handler */ + initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication; +} + +static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) +{ + int trapnr = ctxt->fi.vector; + + if (trapnr == X86_TRAP_PF) + native_write_cr2(ctxt->fi.cr2); + + ctxt->regs->orig_ax = ctxt->fi.error_code; + do_early_exception(ctxt->regs, trapnr); +} + +static long *vc_insn_get_reg(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long 
*)ctxt->regs; + offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} + +static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} +static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned int bytes, bool read) +{ + u64 exit_code, exit_info_1, exit_info_2; + unsigned long ghcb_pa = __pa(ghcb); + phys_addr_t paddr; + void __user *ref; + + ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); + if (ref == (void __user *)-1L) + return ES_UNSUPPORTED; + + exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; + + if (!vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr)) { + if (!read) + ctxt->fi.error_code |= X86_PF_WRITE; + + return ES_EXCEPTION; + } + + exit_info_1 = paddr; + /* Can never be greater than 8 */ + exit_info_2 = bytes; + + ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); + + return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); +} + +static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + unsigned int bytes = 0; + enum es_result ret; + int sign_byte; + long *reg_data; + + switch (insn->opcode.bytes[1]) { + /* MMIO Read w/ zero-extension */ + case 0xb6: + bytes = 1; + fallthrough; + case 0xb7: + if (!bytes) + bytes = 2; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero extend based on operand size */ + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + memset(reg_data, 0, insn->opnd_bytes); + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + /* MMIO Read w/ sign-extension */ + case 0xbe: + bytes 
= 1; + fallthrough; + case 0xbf: + if (!bytes) + bytes = 2; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Sign extend based on operand size */ + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + if (bytes == 1) { + u8 *val = (u8 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x80) ? 0xff : 0x00; + } else { + u16 *val = (u16 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x8000) ? 0xff : 0x00; + } + memset(reg_data, sign_byte, insn->opnd_bytes); + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + default: + ret = ES_UNSUPPORTED; + } + + return ret; +} + +/* + * The MOVS instruction has two memory operands, which raises the + * problem that it is not known whether the access to the source or the + * destination caused the #VC exception (and hence whether an MMIO read + * or write operation needs to be emulated). + * + * Instead of playing games with walking page-tables and trying to guess + * whether the source or destination is an MMIO range, split the move + * into two operations, a read and a write with only one memory operand. + * This will cause a nested #VC exception on the MMIO address which can + * then be handled. + * + * This implementation has the benefit that it also supports MOVS where + * source _and_ destination are MMIO regions. + * + * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a + * rare operation. If it turns out to be a performance problem the split + * operations can be moved to memcpy_fromio() and memcpy_toio(). 
+ */ +static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, + unsigned int bytes) +{ + unsigned long ds_base, es_base; + unsigned char *src, *dst; + unsigned char buffer[8]; + enum es_result ret; + bool rep; + int off; + + ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + if (ds_base == -1L || es_base == -1L) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + src = ds_base + (unsigned char *)ctxt->regs->si; + dst = es_base + (unsigned char *)ctxt->regs->di; + + ret = vc_read_mem(ctxt, src, buffer, bytes); + if (ret != ES_OK) + return ret; + + ret = vc_write_mem(ctxt, dst, buffer, bytes); + if (ret != ES_OK) + return ret; + + if (ctxt->regs->flags & X86_EFLAGS_DF) + off = -bytes; + else + off = bytes; + + ctxt->regs->si += off; + ctxt->regs->di += off; + + rep = insn_has_rep_prefix(&ctxt->insn); + if (rep) + ctxt->regs->cx -= 1; + + if (!rep || ctxt->regs->cx == 0) + return ES_OK; + else + return ES_RETRY; +} + +static enum es_result vc_handle_mmio(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + unsigned int bytes = 0; + enum es_result ret; + long *reg_data; + + switch (insn->opcode.bytes[0]) { + /* MMIO Write */ + case 0x88: + bytes = 1; + fallthrough; + case 0x89: + if (!bytes) + bytes = insn->opnd_bytes; + + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + memcpy(ghcb->shared_buffer, reg_data, bytes); + + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + + case 0xc6: + bytes = 1; + fallthrough; + case 0xc7: + if (!bytes) + bytes = insn->opnd_bytes; + + memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); + + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + + /* MMIO Read */ + case 0x8a: + bytes = 1; + fallthrough; + case 0x8b: + if (!bytes) + bytes = insn->opnd_bytes; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + reg_data = 
vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + /* Zero-extend for 32-bit operation */ + if (bytes == 4) + *reg_data = 0; + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + /* MOVS instruction */ + case 0xa4: + bytes = 1; + fallthrough; + case 0xa5: + if (!bytes) + bytes = insn->opnd_bytes; + + ret = vc_handle_mmio_movs(ctxt, bytes); + break; + /* Two-Byte Opcodes */ + case 0x0f: + ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt); + break; + default: + ret = ES_UNSUPPORTED; + } + + return ret; +} + +static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long val, *reg = vc_insn_get_rm(ctxt); + enum es_result ret; + + if (!reg) + return ES_DECODE_FAILED; + + val = *reg; + + /* Upper 32 bits must be written as zeroes */ + if (val >> 32) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + /* Clear out other reserved bits and set bit 10 */ + val = (val & 0xffff23ffL) | BIT(10); + + /* Early non-zero writes to DR7 are not supported */ + if (!data && (val & ~DR7_RESET_VALUE)) + return ES_UNSUPPORTED; + + /* Using a value of 0 for ExitInfo1 means RAX holds the value */ + ghcb_set_rax(ghcb, val); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + if (ret != ES_OK) + return ret; + + if (data) + data->dr7 = val; + + return ES_OK; +} + +static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long *reg = vc_insn_get_rm(ctxt); + + if (!reg) + return ES_DECODE_FAILED; + + if (data) + *reg = data->dr7; + else + *reg = DR7_RESET_VALUE; + + return ES_OK; +} + +static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); +} + +static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt 
*ctxt) +{ + enum es_result ret; + + ghcb_set_rcx(ghcb, ctxt->regs->cx); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_monitor(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Treat it as a NOP and do not leak a physical address to the + * hypervisor. + */ + return ES_OK; +} + +static enum es_result vc_handle_mwait(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* Treat the same as MONITOR/MONITORX */ + return ES_OK; +} + +static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rax(ghcb, ctxt->regs->ax); + ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); + + if (x86_platform.hyper.sev_es_hcall_prepare) + x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); + if (ret != ES_OK) + return ret; + + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + + /* + * Call sev_es_hcall_finish() after regs->ax is already set. + * This allows the hypervisor handler to overwrite it again if + * necessary. + */ + if (x86_platform.hyper.sev_es_hcall_finish && + !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) + return ES_VMM_ERROR; + + return ES_OK; +} + +static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Calling ecx_alignment_check() directly does not work, because it + * enables IRQs and the GHCB is active. Forward the exception and call + * it later from vc_forward_exception(). 
+ */ + ctxt->fi.vector = X86_TRAP_AC; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; +} + +static __always_inline void vc_handle_trap_db(struct pt_regs *regs) +{ + if (user_mode(regs)) + noist_exc_debug(regs); + else + exc_debug(regs); +} + +static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, + struct ghcb *ghcb, + unsigned long exit_code) +{ + enum es_result result; + + switch (exit_code) { + case SVM_EXIT_READ_DR7: + result = vc_handle_dr7_read(ghcb, ctxt); + break; + case SVM_EXIT_WRITE_DR7: + result = vc_handle_dr7_write(ghcb, ctxt); + break; + case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: + result = vc_handle_trap_ac(ghcb, ctxt); + break; + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(ghcb, ctxt, exit_code); + break; + case SVM_EXIT_RDPMC: + result = vc_handle_rdpmc(ghcb, ctxt); + break; + case SVM_EXIT_INVD: + pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); + result = ES_UNSUPPORTED; + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(ghcb, ctxt); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(ghcb, ctxt); + break; + case SVM_EXIT_MSR: + result = vc_handle_msr(ghcb, ctxt); + break; + case SVM_EXIT_VMMCALL: + result = vc_handle_vmmcall(ghcb, ctxt); + break; + case SVM_EXIT_WBINVD: + result = vc_handle_wbinvd(ghcb, ctxt); + break; + case SVM_EXIT_MONITOR: + result = vc_handle_monitor(ghcb, ctxt); + break; + case SVM_EXIT_MWAIT: + result = vc_handle_mwait(ghcb, ctxt); + break; + case SVM_EXIT_NPF: + result = vc_handle_mmio(ghcb, ctxt); + break; + default: + /* + * Unexpected #VC exception + */ + result = ES_UNSUPPORTED; + } + + return result; +} + +static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) +{ + long error_code = ctxt->fi.error_code; + int trapnr = ctxt->fi.vector; + + ctxt->regs->orig_ax = ctxt->fi.error_code; + + switch (trapnr) { + case X86_TRAP_GP: + exc_general_protection(ctxt->regs, error_code); + break; + case X86_TRAP_UD: + exc_invalid_op(ctxt->regs); + 
break; + case X86_TRAP_AC: + exc_alignment_check(ctxt->regs, error_code); + break; + default: + pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); + BUG(); + } +} + +static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) +{ + unsigned long sp = (unsigned long)regs; + + return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); +} + +/* + * Main #VC exception handler. It is called when the entry code was able to + * switch off the IST to a safe kernel stack. + * + * With the current implementation it is always possible to switch to a safe + * stack because #VC exceptions only happen at known places, like intercepted + * instructions or accesses to MMIO areas/IO ports. They can also happen with + * code instrumentation when the hypervisor intercepts #DB, but the critical + * paths are forbidden to be instrumented, so #DB exceptions currently also + * only happen in safe places. + */ +DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result result; + struct ghcb *ghcb; + + lockdep_assert_irqs_disabled(); + + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) { + vc_handle_trap_db(regs); + return; + } + + instrumentation_begin(); + + /* + * This is invoked through an interrupt gate, so IRQs are disabled. The + * code below might walk page-tables for user or kernel addresses, so + * keep the IRQs disabled to protect us against concurrent TLB flushes. + */ + + ghcb = sev_es_get_ghcb(&state); + if (!ghcb) { + /* + * Mark GHCBs inactive so that panic() is able to print the + * message. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + panic("Unable to handle #VC exception! 
GHCB and Backup GHCB are already in use"); + } + + vc_ghcb_invalidate(ghcb); + result = vc_init_em_ctxt(&ctxt, regs, error_code); + + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, ghcb, error_code); + + sev_es_put_ghcb(&state); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + error_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + pr_emerg("Unknown result in %s():%d\n", __func__, result); + /* + * Emulating the instruction which caused the #VC exception + * failed - can't continue so print debug information + */ + BUG(); + } + +out: + instrumentation_end(); + + return; + +fail: + if (user_mode(regs)) { + /* + * Do not kill the machine if user-space triggered the + * exception. Send SIGBUS instead and let user-space deal with + * it. + */ + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); + } else { + pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n", + result); + + /* Show some debug info */ + show_regs(regs); + + /* Ask hypervisor to sev_es_terminate */ + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + /* If that fails and we get here - just panic */ + panic("Returned from Terminate-Request to Hypervisor\n"); + } + + goto out; +} + +/* This handler runs on the #VC fall-back stack. 
It can cause further #VC exceptions */ +DEFINE_IDTENTRY_VC_IST(exc_vmm_communication) +{ + instrumentation_begin(); + panic("Can't handle #VC exception from unsupported context\n"); + instrumentation_end(); +} + +DEFINE_IDTENTRY_VC(exc_vmm_communication) +{ + if (likely(!on_vc_fallback_stack(regs))) + safe_stack_exc_vmm_communication(regs, error_code); + else + ist_exc_vmm_communication(regs, error_code); +} + +bool __init handle_vc_boot_ghcb(struct pt_regs *regs) +{ + unsigned long exit_code = regs->orig_ax; + struct es_em_ctxt ctxt; + enum es_result result; + + /* Do initial setup or terminate the guest */ + if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + vc_ghcb_invalidate(boot_ghcb); + + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_early_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + BUG(); + } + + return true; + +fail: + show_regs(regs); + + while (true) + halt(); +} diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f5ef689dd62a..de776b2e6046 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -227,7 +227,7 @@ static void notrace start_secondary(void *unused) load_cr3(swapper_pg_dir); __flush_tlb_all(); #endif - load_current_idt(); + 
cpu_init_exception_handling(); cpu_init(); x86_cpuinit.early_percpu_clock_init(); preempt_disable(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ec3a2572843f..3c70fb34028b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -673,6 +674,50 @@ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) return regs; } +#ifdef CONFIG_AMD_MEM_ENCRYPT +asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs) +{ + unsigned long sp, *stack; + struct stack_info info; + struct pt_regs *regs_ret; + + /* + * In the SYSCALL entry path the RSP value comes from user-space - don't + * trust it and switch to the current kernel stack + */ + if (regs->ip >= (unsigned long)entry_SYSCALL_64 && + regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) { + sp = this_cpu_read(cpu_current_top_of_stack); + goto sync; + } + + /* + * From here on the RSP value is trusted. Now check whether entry + * happened from a safe stack. Not safe are the entry or unknown stacks, + * use the fall-back stack instead in this case. + */ + sp = regs->sp; + stack = (unsigned long *)sp; + + if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || + info.type >= STACK_TYPE_EXCEPTION_LAST) + sp = __this_cpu_ist_top_va(VC2); + +sync: + /* + * Found a safe stack - switch to it as if the entry didn't happen via + * IST stack. The code below only copies pt_regs, the real switch happens + * in assembly code. 
+ */ + sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret); + + regs_ret = (struct pt_regs *)sp; + *regs_ret = *regs; + + return regs_ret; +} +#endif + struct bad_iret_stack { void *error_entry_ret; struct pt_regs regs; @@ -1082,6 +1127,9 @@ void __init trap_init(void) /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); + /* Init GHCB memory pages when running as an SEV-ES guest */ + sev_es_init_vc_handling(); + idt_setup_traps(); /* diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 2c304fd0bb1a..f6225bf22c02 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -335,63 +335,28 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) */ bool fixup_umip_exception(struct pt_regs *regs) { - int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst; - unsigned long seg_base = 0, *reg_addr; + int nr_copied, reg_offset, dummy_data_size, umip_inst; /* 10 bytes is the maximum size of the result of UMIP instructions */ unsigned char dummy_data[10] = { 0 }; unsigned char buf[MAX_INSN_SIZE]; + unsigned long *reg_addr; void __user *uaddr; struct insn insn; - int seg_defs; if (!regs) return false; - /* - * If not in user-space long mode, a custom code segment could be in - * use. This is true in protected mode (if the process defined a local - * descriptor table), or virtual-8086 mode. In most of the cases - * seg_base will be zero as in USER_CS. - */ - if (!user_64bit_mode(regs)) - seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); - - if (seg_base == -1L) - return false; - - not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), - sizeof(buf)); - nr_copied = sizeof(buf) - not_copied; + nr_copied = insn_fetch_from_user(regs, buf); /* - * The copy_from_user above could have failed if user code is protected - * by a memory protection key. Give up on emulation in such a case. - * Should we issue a page fault? 
+ * The insn_fetch_from_user above could have failed if user code + * is protected by a memory protection key. Give up on emulation + * in such a case. Should we issue a page fault? */ if (!nr_copied) return false; - insn_init(&insn, buf, nr_copied, user_64bit_mode(regs)); - - /* - * Override the default operand and address sizes with what is specified - * in the code segment descriptor. The instruction decoder only sets - * the address size it to either 4 or 8 address bytes and does nothing - * for the operand bytes. This OK for most of the cases, but we could - * have special cases where, for instance, a 16-bit code segment - * descriptor is used. - * If there is an address override prefix, the instruction decoder - * correctly updates these values, even for 16-bit defaults. - */ - seg_defs = insn_get_code_seg_params(regs); - if (seg_defs == -EINVAL) - return false; - - insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs); - insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs); - - insn_get_length(&insn); - if (nr_copied < insn.length) + if (!insn_decode(&insn, regs, buf, nr_copied)) return false; umip_inst = identify_insn(&insn); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index ec88bbe08a32..6a339ce328e0 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include @@ -127,12 +128,12 @@ static struct orc_entry null_orc_entry = { .sp_offset = sizeof(long), .sp_reg = ORC_REG_SP, .bp_reg = ORC_REG_UNDEFINED, - .type = ORC_TYPE_CALL + .type = UNWIND_HINT_TYPE_CALL }; /* Fake frame pointer entry -- used as a fallback for generated code */ static struct orc_entry orc_fp_entry = { - .type = ORC_TYPE_CALL, + .type = UNWIND_HINT_TYPE_CALL, .sp_reg = ORC_REG_BP, .sp_offset = 16, .bp_reg = ORC_REG_PREV_SP, @@ -531,7 +532,7 @@ bool unwind_next_frame(struct unwind_state *state) /* Find IP, SP and possibly regs: */ switch (orc->type) { - case 
ORC_TYPE_CALL: + case UNWIND_HINT_TYPE_CALL: ip_p = sp - sizeof(long); if (!deref_stack_reg(state, ip_p, &state->ip)) @@ -546,7 +547,7 @@ bool unwind_next_frame(struct unwind_state *state) state->signal = false; break; - case ORC_TYPE_REGS: + case UNWIND_HINT_TYPE_REGS: if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { orc_warn_current("can't access registers at %pB\n", (void *)orig_ip); @@ -559,7 +560,7 @@ bool unwind_next_frame(struct unwind_state *state) state->signal = true; break; - case ORC_TYPE_REGS_IRET: + case UNWIND_HINT_TYPE_REGS_PARTIAL: if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { orc_warn_current("can't access iret registers at %pB\n", (void *)orig_ip); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index e90bc436f584..598a769f1961 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1062,10 +1062,14 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, struct vmcb *hsave = svm->nested.hsave; struct vmcb __user *user_vmcb = (struct vmcb __user *) &user_kvm_nested_state->data.svm[0]; - struct vmcb_control_area ctl; - struct vmcb_save_area save; + struct vmcb_control_area *ctl; + struct vmcb_save_area *save; + int ret; u32 cr0; + BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) > + KVM_STATE_NESTED_SVM_VMCB_SIZE); + if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM) return -EINVAL; @@ -1097,13 +1101,22 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE) return -EINVAL; - if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl))) - return -EFAULT; - if (copy_from_user(&save, &user_vmcb->save, sizeof(save))) - return -EFAULT; - if (!nested_vmcb_check_controls(&ctl)) - return -EINVAL; + ret = -ENOMEM; + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + save = kzalloc(sizeof(*save), GFP_KERNEL); + if (!ctl || !save) + goto out_free; + + ret = -EFAULT; + if 
(copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl))) + goto out_free; + if (copy_from_user(save, &user_vmcb->save, sizeof(*save))) + goto out_free; + + ret = -EINVAL; + if (!nested_vmcb_check_controls(ctl)) + goto out_free; /* * Processor state contains L2 state. Check that it is @@ -1111,15 +1124,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, */ cr0 = kvm_read_cr0(vcpu); if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) - return -EINVAL; + goto out_free; /* * Validate host state saved from before VMRUN (see * nested_svm_check_permissions). * TODO: validate reserved bits for all saved state. */ - if (!(save.cr0 & X86_CR0_PG)) - return -EINVAL; + if (!(save->cr0 & X86_CR0_PG)) + goto out_free; /* * All checks done, we can enter guest mode. L1 control fields @@ -1128,10 +1141,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * contains saved L1 state. */ copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); - hsave->save = save; + hsave->save = *save; svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa; - load_nested_vmcb_control(svm, &ctl); + load_nested_vmcb_control(svm, ctl); nested_prepare_vmcb_control(svm); if (!nested_svm_vmrun_msrpm(svm)) @@ -1139,7 +1152,13 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, out_set_gif: svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); - return 0; + + ret = 0; +out_free: + kfree(save); + kfree(ctl); + + return ret; } struct kvm_x86_nested_ops svm_nested_ops = { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 91ea74ae71b8..9709c98d0d6c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -4176,6 +4176,8 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static int __init svm_init(void) { + __unused_size_checks(); + return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), THIS_MODULE); } diff --git 
a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1bb6b31eb646..19e2265956ba 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 96979c09ebd1..f0a9954c49db 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -13,7 +13,6 @@ * Yaniv Kamay */ -#include #include #include #include @@ -22,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 5e69603ff63f..58f7fb95c7f4 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -20,6 +20,7 @@ enum reg_type { REG_TYPE_RM = 0, + REG_TYPE_REG, REG_TYPE_INDEX, REG_TYPE_BASE, }; @@ -52,6 +53,30 @@ static bool is_string_insn(struct insn *insn) } } +/** + * insn_has_rep_prefix() - Determine if instruction has a REP prefix + * @insn: Instruction containing the prefix to inspect + * + * Returns: + * + * true if the instruction has a REP prefix, false if not. 
+ */ +bool insn_has_rep_prefix(struct insn *insn) +{ + int i; + + insn_get_prefixes(insn); + + for (i = 0; i < insn->prefixes.nbytes; i++) { + insn_byte_t p = insn->prefixes.bytes[i]; + + if (p == 0xf2 || p == 0xf3) + return true; + } + + return false; +} + /** * get_seg_reg_override_idx() - obtain segment register override index * @insn: Valid instruction with segment override prefixes @@ -439,6 +464,13 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, regno += 8; break; + case REG_TYPE_REG: + regno = X86_MODRM_REG(insn->modrm.value); + + if (X86_REX_R(insn->rex_prefix.value)) + regno += 8; + break; + case REG_TYPE_INDEX: regno = X86_SIB_INDEX(insn->sib.value); if (X86_REX_X(insn->rex_prefix.value)) @@ -807,6 +839,21 @@ int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs) return get_reg_offset(insn, regs, REG_TYPE_RM); } +/** + * insn_get_modrm_reg_off() - Obtain register in reg part of the ModRM byte + * @insn: Instruction containing the ModRM byte + * @regs: Register values as seen when entering kernel mode + * + * Returns: + * + * The register indicated by the reg part of the ModRM byte. The + * register is obtained as an offset from the base of pt_regs. + */ +int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs) +{ + return get_reg_offset(insn, regs, REG_TYPE_REG); +} + /** * get_seg_base_limit() - obtain base address and limit of a segment * @insn: Instruction. Must be valid. @@ -1367,3 +1414,86 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) return (void __user *)-1L; } } + +/** + * insn_fetch_from_user() - Copy instruction bytes from user-space memory + * @regs: Structure with register values as seen when entering kernel mode + * @buf: Array to store the fetched instruction + * + * Gets the linear address of the instruction and copies the instruction bytes + * to the buf. + * + * Returns: + * + * Number of instruction bytes copied. + * + * 0 if nothing was copied. 
+ */ +int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) +{ + unsigned long seg_base = 0; + int not_copied; + + /* + * If not in user-space long mode, a custom code segment could be in + * use. This is true in protected mode (if the process defined a local + * descriptor table), or virtual-8086 mode. In most of the cases + * seg_base will be zero as in USER_CS. + */ + if (!user_64bit_mode(regs)) { + seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); + if (seg_base == -1L) + return 0; + } + + + not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), + MAX_INSN_SIZE); + + return MAX_INSN_SIZE - not_copied; +} + +/** + * insn_decode() - Decode an instruction + * @insn: Structure to store decoded instruction + * @regs: Structure with register values as seen when entering kernel mode + * @buf: Buffer containing the instruction bytes + * @buf_size: Number of instruction bytes available in buf + * + * Decodes the instruction provided in buf and stores the decoding results in + * insn. Also determines the correct address and operand sizes. + * + * Returns: + * + * True if instruction was decoded, False otherwise. + */ +bool insn_decode(struct insn *insn, struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE], int buf_size) +{ + int seg_defs; + + insn_init(insn, buf, buf_size, user_64bit_mode(regs)); + + /* + * Override the default operand and address sizes with what is specified + * in the code segment descriptor. The instruction decoder only sets + * the address size it to either 4 or 8 address bytes and does nothing + * for the operand bytes. This OK for most of the cases, but we could + * have special cases where, for instance, a 16-bit code segment + * descriptor is used. + * If there is an address override prefix, the instruction decoder + * correctly updates these values, even for 16-bit defaults. 
+ */ + seg_defs = insn_get_code_seg_params(regs); + if (seg_defs == -EINVAL) + return false; + + insn->addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs); + insn->opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs); + + insn_get_length(insn); + if (buf_size < insn->length) + return false; + + return true; +} diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 770b613790b3..f5e1e60c9095 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -21,7 +21,8 @@ DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); #endif -struct cpu_entry_area *get_cpu_entry_area(int cpu) +/* Is called from entry code, so must be noinstr */ +noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu) { unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 5829457f7ca3..b93d6cd08a7f 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -5,6 +5,7 @@ #include #include +#include #include #include diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 9f1177edc2e7..ebb7edc8bc0a 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -38,6 +38,7 @@ * section is later cleared. 
*/ u64 sme_me_mask __section(.data) = 0; +u64 sev_status __section(.data) = 0; EXPORT_SYMBOL(sme_me_mask); DEFINE_STATIC_KEY_FALSE(sev_enable_key); EXPORT_SYMBOL_GPL(sev_enable_key); @@ -347,7 +348,13 @@ bool sme_active(void) bool sev_active(void) { - return sme_me_mask && sev_enabled; + return sev_status & MSR_AMD64_SEV_ENABLED; +} + +/* Needs to be called from non-instrumentable code */ +bool noinstr sev_es_active(void) +{ + return sev_status & MSR_AMD64_SEV_ES_ENABLED; } /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ @@ -400,6 +407,31 @@ void __init mem_encrypt_free_decrypted_mem(void) free_init_pages("unused decrypted", vaddr, vaddr_end); } +static void print_mem_encrypt_feature_info(void) +{ + pr_info("AMD Memory Encryption Features active:"); + + /* Secure Memory Encryption */ + if (sme_active()) { + /* + * SME is mutually exclusive with any of the SEV + * features below. + */ + pr_cont(" SME\n"); + return; + } + + /* Secure Encrypted Virtualization */ + if (sev_active()) + pr_cont(" SEV"); + + /* Encrypted Register State */ + if (sev_es_active()) + pr_cont(" SEV-ES"); + + pr_cont("\n"); +} + /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void) { @@ -415,8 +447,6 @@ void __init mem_encrypt_init(void) if (sev_active()) static_branch_enable(&sev_enable_key); - pr_info("AMD %s active\n", - sev_active() ? 
"Secure Encrypted Virtualization (SEV)" - : "Secure Memory Encryption (SME)"); + print_mem_encrypt_feature_info(); } diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index e2b0e2ac07bb..68d75379e06a 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -540,6 +540,9 @@ void __init sme_enable(struct boot_params *bp) if (!(msr & MSR_AMD64_SEV_ENABLED)) return; + /* Save SEV_STATUS to avoid reading MSR again */ + sev_status = msr; + /* SEV state cannot be controlled by a command line option */ sme_me_mask = me_mask; sev_enabled = true; diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 9df94e0aaee1..44148691d78b 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -745,6 +745,27 @@ static void __init init_memory_less_node(int nid) */ } +/* + * A node may exist which has one or more Generic Initiators but no CPUs and no + * memory. + * + * This function must be called after init_cpu_to_node(), to ensure that any + * memoryless CPU nodes have already been brought online, and before the + * node_data[nid] is needed for zone list setup in build_all_zonelists(). + * + * When this function is called, any nodes containing either memory and/or CPUs + * will already be online and there is no need to do anything extra, even if + * they also contain one or more Generic Initiators. + */ +void __init init_gi_nodes(void) +{ + int nid; + + for_each_node_state(nid, N_GENERIC_INITIATOR) + if (!node_online(nid)) + init_memory_less_node(nid); +} + /* * Setup early cpu_to_node. * diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index c313d784efab..11c0e80b9ed4 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -15,7 +15,6 @@ #include #define STA2X11_SWIOTLB_SIZE (4*1024*1024) -extern int swiotlb_late_init_with_default_size(size_t default_size); /* * We build a list of bus numbers that are under the ConneXt. 
The diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 6af4da1149ba..8f5759df7776 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -47,6 +47,7 @@ #include #include #include +#include /* * We allocate runtime services regions top-down, starting from -4G, i.e. @@ -229,6 +230,15 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 1; } + /* + * When SEV-ES is active, the GHCB as set by the kernel will be used + * by firmware. Create a 1:1 unencrypted mapping for each GHCB. + */ + if (sev_es_efi_map_ghcbs(pgd)) { + pr_err("Failed to create 1:1 mapping for the GHCBs!\n"); + return 1; + } + /* * When making calls to the firmware everything needs to be 1:1 * mapped and addressable with 32-bit pointers. Map the kernel diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 1ed1208931e0..22fda7d99159 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -9,6 +9,7 @@ #include #include #include +#include struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; @@ -38,6 +39,25 @@ void __init reserve_real_mode(void) crash_reserve_low_1M(); } +static void sme_sev_setup_real_mode(struct trampoline_header *th) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT + if (sme_active()) + th->flags |= TH_FLAGS_SME_ACTIVE; + + if (sev_es_active()) { + /* + * Skip the call to verify_cpu() in secondary_startup_64 as it + * will cause #VC exceptions when the AP can't handle them yet. 
+ */ + th->start = (u64) secondary_startup_64_no_verify; + + if (sev_es_setup_ap_jump_table(real_mode_header)) + panic("Failed to get/update SEV-ES AP Jump Table"); + } +#endif +} + static void __init setup_real_mode(void) { u16 real_mode_seg; @@ -104,13 +124,13 @@ static void __init setup_real_mode(void) *trampoline_cr4_features = mmu_cr4_features; trampoline_header->flags = 0; - if (sme_active()) - trampoline_header->flags |= TH_FLAGS_SME_ACTIVE; trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = trampoline_pgd_entry.pgd; trampoline_pgd[511] = init_top_pgt[511].pgd; #endif + + sme_sev_setup_real_mode(trampoline_header); } /* diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index af04512c02d9..8c1db5bf5d78 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -20,6 +20,9 @@ SYM_DATA_START(real_mode_header) /* SMP trampoline */ .long pa_trampoline_start .long pa_trampoline_header +#ifdef CONFIG_AMD_MEM_ENCRYPT + .long pa_sev_es_trampoline_start +#endif #ifdef CONFIG_X86_64 .long pa_trampoline_pgd; #endif diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 251758ed7443..84c5d1b33d10 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -56,6 +56,7 @@ SYM_CODE_START(trampoline_start) testl %eax, %eax # Check for return code jnz no_longmode +.Lswitch_to_protected: /* * GDT tables in non default location kernel can be beyond 16MB and * lgdt will not be able to load the address as in real mode default @@ -80,6 +81,25 @@ no_longmode: jmp no_longmode SYM_CODE_END(trampoline_start) +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* SEV-ES supports non-zero IP for entry points - no alignment needed */ +SYM_CODE_START(sev_es_trampoline_start) + cli # We should be safe anyway + + LJMPW_RM(1f) +1: + mov %cs, %ax # Code and data in the same place + mov %ax, %ds + mov %ax, %es + mov %ax, %ss + + # Setup stack + movl 
$rm_stack_end, %esp + + jmp .Lswitch_to_protected +SYM_CODE_END(sev_es_trampoline_start) +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + #include "../kernel/verify_cpu.S" .section ".text32","ax" diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index a42015b305f4..af38469afd14 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -362,6 +362,9 @@ function convert_operands(count,opnd, i,j,imm,mod) END { if (awkchecked != "") exit 1 + + print "#ifndef __BOOT_COMPRESSED\n" + # print escape opcode map's array print "/* Escape opcode map array */" print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \ @@ -388,6 +391,51 @@ END { for (j = 0; j < max_lprefix; j++) if (atable[i,j]) print " ["i"]["j"] = "atable[i,j]"," - print "};" + print "};\n" + + print "#else /* !__BOOT_COMPRESSED */\n" + + print "/* Escape opcode map array */" + print "static const insn_attr_t *inat_escape_tables[INAT_ESC_MAX + 1]" \ + "[INAT_LSTPFX_MAX + 1];" + print "" + + print "/* Group opcode map array */" + print "static const insn_attr_t *inat_group_tables[INAT_GRP_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1];" + print "" + + print "/* AVX opcode map array */" + print "static const insn_attr_t *inat_avx_tables[X86_VEX_M_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1];" + print "" + + print "static void inat_init_tables(void)" + print "{" + + # print escape opcode map's array + print "\t/* Print Escape opcode map array */" + for (i = 0; i < geid; i++) + for (j = 0; j < max_lprefix; j++) + if (etable[i,j]) + print "\tinat_escape_tables["i"]["j"] = "etable[i,j]";" + print "" + + # print group opcode map's array + print "\t/* Print Group opcode map array */" + for (i = 0; i < ggid; i++) + for (j = 0; j < max_lprefix; j++) + if (gtable[i,j]) + print "\tinat_group_tables["i"]["j"] = "gtable[i,j]";" + print "" + # print AVX opcode map's array + print "\t/* Print AVX opcode map array */" + for (i = 0; i < gaid; i++) + for (j = 0; j < 
max_lprefix; j++) + if (atable[i,j]) + print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + + print "}" + print "#endif" } diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index b1418a6c0e90..4409306364dc 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include @@ -1370,6 +1370,15 @@ asmlinkage __visible void __init xen_start_kernel(void) x86_init.mpparse.get_smp_config = x86_init_uint_noop; xen_boot_params_init_edd(); + +#ifdef CONFIG_ACPI + /* + * Disable selecting "Firmware First mode" for correctable + * memory errors, as this is the duty of the hypervisor to + * decide. + */ + acpi_disable_cmcff = 1; +#endif } if (!boot_params.screen_info.orig_video_isVGA) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index eda78144c000..cf2ade864c30 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1142,7 +1142,7 @@ static void __init xen_pagetable_p2m_free(void) * We could be in __ka space. * We roundup to the PMD, which means that if anybody at this stage is * using the __ka address of xen_start_info or - * xen_start_info->shared_info they are in going to crash. Fortunatly + * xen_start_info->shared_info they are in going to crash. Fortunately * we have already revectored in xen_setup_kernel_pagetable. */ size = roundup(size, PMD_SIZE); diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 7540a5179a47..edf1558c1105 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -504,55 +504,6 @@ config ACPI_EXTLOG config ACPI_ADXL bool -menuconfig PMIC_OPREGION - bool "PMIC (Power Management Integrated Circuit) operation region support" - help - Select this option to enable support for ACPI operation - region of the PMIC chip. The operation region can be used - to control power rails and sensor reading/writing on the - PMIC chip. 
- -if PMIC_OPREGION -config BYTCRC_PMIC_OPREGION - bool "ACPI operation region support for Bay Trail Crystal Cove PMIC" - depends on INTEL_SOC_PMIC - help - This config adds ACPI operation region support for the Bay Trail - version of the Crystal Cove PMIC. - -config CHTCRC_PMIC_OPREGION - bool "ACPI operation region support for Cherry Trail Crystal Cove PMIC" - depends on INTEL_SOC_PMIC - help - This config adds ACPI operation region support for the Cherry Trail - version of the Crystal Cove PMIC. - -config XPOWER_PMIC_OPREGION - bool "ACPI operation region support for XPower AXP288 PMIC" - depends on MFD_AXP20X_I2C && IOSF_MBI=y - help - This config adds ACPI operation region support for XPower AXP288 PMIC. - -config BXT_WC_PMIC_OPREGION - bool "ACPI operation region support for BXT WhiskeyCove PMIC" - depends on INTEL_SOC_PMIC_BXTWC - help - This config adds ACPI operation region support for BXT WhiskeyCove PMIC. - -config CHT_WC_PMIC_OPREGION - bool "ACPI operation region support for CHT Whiskey Cove PMIC" - depends on INTEL_SOC_PMIC_CHTWC - help - This config adds ACPI operation region support for CHT Whiskey Cove PMIC. - -config CHT_DC_TI_PMIC_OPREGION - bool "ACPI operation region support for Dollar Cove TI PMIC" - depends on INTEL_SOC_PMIC_CHTDC_TI - help - This config adds ACPI operation region support for Dollar Cove TI PMIC. - -endif - config ACPI_CONFIGFS tristate "ACPI configfs support" select CONFIGFS_FS @@ -568,21 +519,7 @@ config ACPI_PPTT bool endif -config TPS68470_PMIC_OPREGION - bool "ACPI operation region support for TPS68470 PMIC" - depends on MFD_TPS68470 - help - This config adds ACPI operation region support for TI TPS68470 PMIC. - TPS68470 device is an advanced power management unit that powers - a Compact Camera Module (CCM), generates clocks for image sensors, - drives a dual LED for flash and incorporates two LED drivers for - general purpose indicators. 
- This driver enables ACPI operation region support control voltage - regulators and clocks. - - This option is a bool as it provides an ACPI operation - region, which must be available before any of the devices - using this, are probed. +source "drivers/acpi/pmic/Kconfig" endif # ACPI diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 9a957544e357..44e412506317 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -107,17 +107,9 @@ obj-$(CONFIG_ACPI_APEI) += apei/ obj-$(CONFIG_ACPI_EXTLOG) += acpi_extlog.o -obj-$(CONFIG_PMIC_OPREGION) += pmic/intel_pmic.o -obj-$(CONFIG_BYTCRC_PMIC_OPREGION) += pmic/intel_pmic_bytcrc.o -obj-$(CONFIG_CHTCRC_PMIC_OPREGION) += pmic/intel_pmic_chtcrc.o -obj-$(CONFIG_XPOWER_PMIC_OPREGION) += pmic/intel_pmic_xpower.o -obj-$(CONFIG_BXT_WC_PMIC_OPREGION) += pmic/intel_pmic_bxtwc.o -obj-$(CONFIG_CHT_WC_PMIC_OPREGION) += pmic/intel_pmic_chtwc.o -obj-$(CONFIG_CHT_DC_TI_PMIC_OPREGION) += pmic/intel_pmic_chtdc_ti.o - obj-$(CONFIG_ACPI_CONFIGFS) += acpi_configfs.o -obj-$(CONFIG_TPS68470_PMIC_OPREGION) += pmic/tps68470_pmic.o +obj-y += pmic/ video-objs += acpi_video.o video_detect.o obj-y += dptf/ diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c index 806b8ce05624..39359ce0eb2c 100644 --- a/drivers/acpi/acpi_apd.c +++ b/drivers/acpi/acpi_apd.c @@ -7,39 +7,28 @@ * Wu, Jeff */ -#include -#include -#include -#include -#include #include +#include +#include #include #include -#include +#include +#include #include "internal.h" -ACPI_MODULE_NAME("acpi_apd"); struct apd_private_data; -/** - * ACPI_APD_SYSFS : add device attributes in sysfs - * ACPI_APD_PM : attach power domain to device - */ -#define ACPI_APD_SYSFS BIT(0) -#define ACPI_APD_PM BIT(1) - /** * struct apd_device_desc - a descriptor for apd device - * @flags: device flags like %ACPI_APD_SYSFS, %ACPI_APD_PM * @fixed_clk_rate: fixed rate input clock source for acpi device; * 0 means no fixed rate input clock source + * @properties: build-in properties of 
the device such as UART * @setup: a hook routine to set device resource during create platform device * * Device description defined as acpi_device_id.driver_data */ struct apd_device_desc { - unsigned int flags; unsigned int fixed_clk_rate; struct property_entry *properties; int (*setup)(struct apd_private_data *pdata); @@ -71,7 +60,6 @@ static int acpi_apd_setup(struct apd_private_data *pdata) } #ifdef CONFIG_X86_AMD_PLATFORM_DEVICE - static int misc_check_res(struct acpi_resource *ares, void *data) { struct resource res; @@ -142,7 +130,7 @@ static const struct apd_device_desc cz_uart_desc = { static const struct apd_device_desc fch_misc_desc = { .setup = fch_misc_setup, }; -#endif +#endif /* CONFIG_X86_AMD_PLATFORM_DEVICE */ #ifdef CONFIG_ARM64 static const struct apd_device_desc xgene_i2c_desc = { @@ -184,14 +172,10 @@ static const struct apd_device_desc hip08_spi_desc = { .setup = acpi_apd_setup, .fixed_clk_rate = 250000000, }; +#endif /* CONFIG_ARM64 */ + #endif -#else - -#define APD_ADDR(desc) (0UL) - -#endif /* CONFIG_X86_AMD_PLATFORM_DEVICE */ - /** * Create platform device during acpi scan attach handle. * Return value > 0 on success of creating device. 
diff --git a/drivers/acpi/acpi_cmos_rtc.c b/drivers/acpi/acpi_cmos_rtc.c index 33ac6cb428fe..67f1d33d15c4 100644 --- a/drivers/acpi/acpi_cmos_rtc.c +++ b/drivers/acpi/acpi_cmos_rtc.c @@ -15,8 +15,6 @@ #include "internal.h" -ACPI_MODULE_NAME("cmos rtc"); - static const struct acpi_device_id acpi_cmos_rtc_ids[] = { { "PNP0B00" }, { "PNP0B01" }, diff --git a/drivers/acpi/acpi_configfs.c b/drivers/acpi/acpi_configfs.c index 88c8af455ea3..cf91f49101ea 100644 --- a/drivers/acpi/acpi_configfs.c +++ b/drivers/acpi/acpi_configfs.c @@ -228,6 +228,7 @@ static void acpi_table_drop_item(struct config_group *group, ACPI_INFO(("Host-directed Dynamic ACPI Table Unload")); acpi_unload_table(table->index); + config_item_put(cfg); } static struct configfs_group_operations acpi_table_group_ops = { diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index f138e12b7b82..72f1fb77abcd 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -222,9 +222,9 @@ static int __init extlog_init(void) u64 cap; int rc; - rdmsrl(MSR_IA32_MCG_CAP, cap); - - if (!(cap & MCG_ELOG_P) || !extlog_get_l1addr()) + if (rdmsrl_safe(MSR_IA32_MCG_CAP, &cap) || + !(cap & MCG_ELOG_P) || + !extlog_get_l1addr()) return -ENODEV; rc = -EINVAL; diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 5e2bfbcf526f..46e307ea0f78 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -26,8 +26,6 @@ #include "internal.h" -ACPI_MODULE_NAME("acpi_lpss"); - #ifdef CONFIG_X86_INTEL_LPSS #include diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index e294f44a7850..ad6e90fbc813 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -22,13 +22,6 @@ #define ACPI_MEMORY_DEVICE_HID "PNP0C80" #define ACPI_MEMORY_DEVICE_NAME "Hotplug Mem Device" -#define _COMPONENT ACPI_MEMORY_DEVICE_COMPONENT - -#undef PREFIX -#define PREFIX "ACPI:memory_hp:" - -ACPI_MODULE_NAME("acpi_memhotplug"); - static const struct 
acpi_device_id memory_device_ids[] = { {ACPI_MEMORY_DEVICE_HID, 0}, {"", 0}, @@ -36,11 +29,6 @@ static const struct acpi_device_id memory_device_ids[] = { #ifdef CONFIG_ACPI_HOTPLUG_MEMORY -/* Memory Device States */ -#define MEMORY_INVALID_STATE 0 -#define MEMORY_POWER_ON_STATE 1 -#define MEMORY_POWER_OFF_STATE 2 - static int acpi_memory_device_add(struct acpi_device *device, const struct acpi_device_id *not_used); static void acpi_memory_device_remove(struct acpi_device *device); @@ -64,8 +52,7 @@ struct acpi_memory_info { }; struct acpi_memory_device { - struct acpi_device * device; - unsigned int state; /* State of the memory device */ + struct acpi_device *device; struct list_head res_list; }; @@ -233,7 +220,6 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) } if (!num_enabled) { dev_err(&mem_device->device->dev, "add_memory failed\n"); - mem_device->state = MEMORY_INVALID_STATE; return -EINVAL; } /* @@ -304,9 +290,6 @@ static int acpi_memory_device_add(struct acpi_device *device, return result; } - /* Set the device state */ - mem_device->state = MEMORY_POWER_ON_STATE; - result = acpi_memory_check_device(mem_device); if (result) { acpi_memory_device_free(mem_device); diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index c05050f474cd..78d621290a35 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -19,8 +19,6 @@ #include "internal.h" -ACPI_MODULE_NAME("platform"); - static const struct acpi_device_id forbidden_id_list[] = { {"PNP0000", 0}, /* PIC */ {"PNP0100", 0}, /* Timer */ diff --git a/drivers/acpi/acpi_pnp.c b/drivers/acpi/acpi_pnp.c index f3039b93ff61..4ed755a963aa 100644 --- a/drivers/acpi/acpi_pnp.c +++ b/drivers/acpi/acpi_pnp.c @@ -11,6 +11,8 @@ #include #include +#include "internal.h" + static const struct acpi_device_id acpi_pnp_device_ids[] = { /* pata_isapnp */ {"PNP0600"}, /* Generic ESDI/IDE/ATA compatible hard disk controller */ diff --git 
a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index b51ddf3bb616..412a9725cc1e 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -798,22 +798,34 @@ int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, memset(&cx, 0, sizeof(cx)); element = &cst->package.elements[i]; - if (element->type != ACPI_TYPE_PACKAGE) + if (element->type != ACPI_TYPE_PACKAGE) { + acpi_handle_info(handle, "_CST C%d type(%x) is not package, skip...\n", + i, element->type); continue; + } - if (element->package.count != 4) + if (element->package.count != 4) { + acpi_handle_info(handle, "_CST C%d package count(%d) is not 4, skip...\n", + i, element->package.count); continue; + } obj = &element->package.elements[0]; - if (obj->type != ACPI_TYPE_BUFFER) + if (obj->type != ACPI_TYPE_BUFFER) { + acpi_handle_info(handle, "_CST C%d package element[0] type(%x) is not buffer, skip...\n", + i, obj->type); continue; + } reg = (struct acpi_power_register *)obj->buffer.pointer; obj = &element->package.elements[1]; - if (obj->type != ACPI_TYPE_INTEGER) + if (obj->type != ACPI_TYPE_INTEGER) { + acpi_handle_info(handle, "_CST C[%d] package element[1] type(%x) is not integer, skip...\n", + i, obj->type); continue; + } cx.type = obj->integer.value; /* @@ -850,6 +862,8 @@ int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, cx.entry_method = ACPI_CSTATE_HALT; snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); } else { + acpi_handle_info(handle, "_CST C%d declares FIXED_HARDWARE C-state but not supported in hardware, skip...\n", + i); continue; } } else if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { @@ -857,6 +871,8 @@ int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI IOPORT 0x%x", cx.address); } else { + acpi_handle_info(handle, "_CST C%d space_id(%x) neither FIXED_HARDWARE nor SYSTEM_IO, skip...\n", + i, reg->space_id); continue; } @@ -864,14 +880,20 @@ int acpi_processor_evaluate_cst(acpi_handle 
handle, u32 cpu, cx.valid = 1; obj = &element->package.elements[2]; - if (obj->type != ACPI_TYPE_INTEGER) + if (obj->type != ACPI_TYPE_INTEGER) { + acpi_handle_info(handle, "_CST C%d package element[2] type(%x) not integer, skip...\n", + i, obj->type); continue; + } cx.latency = obj->integer.value; obj = &element->package.elements[3]; - if (obj->type != ACPI_TYPE_INTEGER) + if (obj->type != ACPI_TYPE_INTEGER) { + acpi_handle_info(handle, "_CST C%d package element[3] type(%x) not integer, skip...\n", + i, obj->type); continue; + } memcpy(&info->states[++last_index], &cx, sizeof(cx)); } diff --git a/drivers/acpi/acpica/acdebug.h b/drivers/acpi/acpica/acdebug.h index a676daaa2da5..f8a3abdfe250 100644 --- a/drivers/acpi/acpica/acdebug.h +++ b/drivers/acpi/acpica/acdebug.h @@ -37,12 +37,14 @@ struct acpi_db_argument_info { struct acpi_db_execute_walk { u32 count; u32 max_count; + char name_seg[ACPI_NAMESEG_SIZE + 1]; }; #define PARAM_LIST(pl) pl #define EX_NO_SINGLE_STEP 1 #define EX_SINGLE_STEP 2 +#define EX_ALL 4 /* * dbxface - external debugger interfaces @@ -124,6 +126,8 @@ void acpi_db_disassemble_aml(char *statements, union acpi_parse_object *op); void acpi_db_evaluate_predefined_names(void); +void acpi_db_evaluate_all(char *name_seg); + /* * dbnames - namespace commands */ diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h index 1030a0ce1599..2fee91f57b21 100644 --- a/drivers/acpi/acpica/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -42,6 +42,12 @@ ACPI_GLOBAL(struct acpi_generic_address, acpi_gbl_xpm1a_enable); ACPI_GLOBAL(struct acpi_generic_address, acpi_gbl_xpm1b_status); ACPI_GLOBAL(struct acpi_generic_address, acpi_gbl_xpm1b_enable); +#ifdef ACPI_GPE_USE_LOGICAL_ADDRESSES +ACPI_GLOBAL(unsigned long, acpi_gbl_xgpe0_block_logical_address); +ACPI_GLOBAL(unsigned long, acpi_gbl_xgpe1_block_logical_address); + +#endif /* ACPI_GPE_USE_LOGICAL_ADDRESSES */ + /* * Handle both ACPI 1.0 and ACPI 2.0+ Integer widths. 
The integer width is * determined by the revision of the DSDT: If the DSDT revision is less than diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h index ebf6453d0e21..6ab92e28330d 100644 --- a/drivers/acpi/acpica/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -73,9 +73,15 @@ acpi_status acpi_hw_read_port(acpi_io_address address, u32 *value, u32 width); acpi_status acpi_hw_write_port(acpi_io_address address, u32 value, u32 width); +acpi_status acpi_hw_validate_io_block(u64 address, u32 bit_width, u32 count); + /* * hwgpe - GPE support */ +acpi_status acpi_hw_gpe_read(u64 *value, struct acpi_gpe_address *reg); + +acpi_status acpi_hw_gpe_write(u64 value, struct acpi_gpe_address *reg); + u32 acpi_hw_get_gpe_register_bit(struct acpi_gpe_event_info *gpe_event_info); acpi_status diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index af58cd2dc9d3..f83b98fa13ac 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -454,11 +454,18 @@ struct acpi_gpe_event_info { u8 disable_for_dispatch; /* Masked during dispatching */ }; +/* GPE register address */ + +struct acpi_gpe_address { + u8 space_id; /* Address space where the register exists */ + u64 address; /* 64-bit address of the register */ +}; + /* Information about a GPE register pair, one per each status/enable pair in an array */ struct acpi_gpe_register_info { - struct acpi_generic_address status_address; /* Address of status reg */ - struct acpi_generic_address enable_address; /* Address of enable reg */ + struct acpi_gpe_address status_address; /* Address of status reg */ + struct acpi_gpe_address enable_address; /* Address of enable reg */ u16 base_gpe_number; /* Base GPE number for this register */ u8 enable_for_wake; /* GPEs to keep enabled when sleeping */ u8 enable_for_run; /* GPEs to keep enabled when running */ diff --git a/drivers/acpi/acpica/acpredef.h b/drivers/acpi/acpica/acpredef.h index 2cbb56652f1c..57ea2276790f 100644 --- 
a/drivers/acpi/acpica/acpredef.h +++ b/drivers/acpi/acpica/acpredef.h @@ -101,7 +101,7 @@ enum acpi_return_package_types { /* Support macros for users of the predefined info table */ -#define METHOD_PREDEF_ARGS_MAX 4 +#define METHOD_PREDEF_ARGS_MAX 5 #define METHOD_ARG_BIT_WIDTH 3 #define METHOD_ARG_MASK 0x0007 #define ARG_COUNT_IS_MINIMUM 0x8000 @@ -117,6 +117,7 @@ enum acpi_return_package_types { #define METHOD_2ARGS(a1,a2) (2 | (a1 << 3) | (a2 << 6)) #define METHOD_3ARGS(a1,a2,a3) (3 | (a1 << 3) | (a2 << 6) | (a3 << 9)) #define METHOD_4ARGS(a1,a2,a3,a4) (4 | (a1 << 3) | (a2 << 6) | (a3 << 9) | (a4 << 12)) +#define METHOD_5ARGS(a1,a2,a3,a4,a5) (5 | (a1 << 3) | (a2 << 6) | (a3 << 9) | (a4 << 12) | (a5 << 15)) #define METHOD_RETURNS(type) (type) #define METHOD_NO_RETURN_VALUE 0 @@ -902,9 +903,39 @@ const union acpi_predefined_info acpi_gbl_predefined_methods[] = { {{"_S4W", METHOD_0ARGS, METHOD_RETURNS(ACPI_RTYPE_INTEGER)}}, + {{"_SBA", METHOD_0ARGS, + METHOD_RETURNS(ACPI_RTYPE_PACKAGE)}}, /* Fixed-length (4 Int) */ + PACKAGE_INFO(ACPI_PTYPE1_FIXED, ACPI_RTYPE_INTEGER, 4, 0, 0, 0), + + {{"_SBI", METHOD_0ARGS, + METHOD_RETURNS(ACPI_RTYPE_PACKAGE)}}, /* Fixed-length (1 Int, 1 Buf) */ + PACKAGE_INFO(ACPI_PTYPE1_FIXED, ACPI_RTYPE_INTEGER, 1, + ACPI_RTYPE_BUFFER, 1, 0), + + {{"_SBR", + METHOD_3ARGS(ACPI_TYPE_INTEGER, ACPI_TYPE_INTEGER, + ACPI_TYPE_INTEGER), + METHOD_RETURNS(ACPI_RTYPE_PACKAGE)}}, /* Fixed-length (2 Int) */ + PACKAGE_INFO(ACPI_PTYPE1_FIXED, ACPI_RTYPE_INTEGER, 2, + ACPI_RTYPE_BUFFER | ACPI_RTYPE_INTEGER, 1, 0), + {{"_SBS", METHOD_0ARGS, METHOD_RETURNS(ACPI_RTYPE_INTEGER)}}, + {{"_SBT", + METHOD_4ARGS(ACPI_TYPE_INTEGER, ACPI_TYPE_INTEGER, ACPI_TYPE_INTEGER, + ACPI_TYPE_ANY), + METHOD_RETURNS(ACPI_RTYPE_PACKAGE)}}, /* Fixed-length (2 Int, 1 Buf | Int) */ + PACKAGE_INFO(ACPI_PTYPE1_FIXED, ACPI_RTYPE_INTEGER, 2, + ACPI_RTYPE_BUFFER | ACPI_RTYPE_INTEGER, 1, 0), + + {{"_SBW", + METHOD_5ARGS(ACPI_TYPE_INTEGER, ACPI_TYPE_INTEGER, ACPI_TYPE_INTEGER, + 
ACPI_TYPE_INTEGER, ACPI_TYPE_ANY), + METHOD_RETURNS(ACPI_RTYPE_PACKAGE)}}, + PACKAGE_INFO(ACPI_PTYPE1_FIXED, ACPI_RTYPE_BUFFER | ACPI_RTYPE_INTEGER, + 1, 0, 0, 0), + {{"_SCP", METHOD_1ARGS(ACPI_TYPE_INTEGER) | ARG_COUNT_IS_MINIMUM, METHOD_NO_RETURN_VALUE}}, /* Acpi 1.0 allowed 1 integer arg. Acpi 3.0 expanded to 3 args. Allow both. */ diff --git a/drivers/acpi/acpica/dbexec.c b/drivers/acpi/acpica/dbexec.c index 4027eaab18a4..d3a9521e2dc8 100644 --- a/drivers/acpi/acpica/dbexec.c +++ b/drivers/acpi/acpica/dbexec.c @@ -86,7 +86,8 @@ void acpi_db_delete_objects(u32 count, union acpi_object *objects) * * RETURN: Status * - * DESCRIPTION: Execute a control method. + * DESCRIPTION: Execute a control method. Used to evaluate objects via the + * "EXECUTE" or "EVALUATE" commands. * ******************************************************************************/ @@ -314,11 +315,12 @@ acpi_db_execution_walk(acpi_handle obj_handle, status = acpi_evaluate_object(node, NULL, NULL, &return_obj); + acpi_gbl_method_executing = FALSE; + acpi_os_printf("Evaluation of [%4.4s] returned %s\n", acpi_ut_get_node_name(node), acpi_format_exception(status)); - acpi_gbl_method_executing = FALSE; return (AE_OK); } @@ -334,7 +336,8 @@ acpi_db_execution_walk(acpi_handle obj_handle, * RETURN: None * * DESCRIPTION: Execute a control method. Name is relative to the current - * scope. + * scope. 
Function used for the "EXECUTE", "EVALUATE", and + * "ALL" commands * ******************************************************************************/ @@ -372,6 +375,12 @@ acpi_db_execute(char *name, char **args, acpi_object_type *types, u32 flags) return; } + if ((flags & EX_ALL) && (strlen(name) > 4)) { + acpi_os_printf("Input name (%s) must be a 4-char NameSeg\n", + name); + return; + } + name_string = ACPI_ALLOCATE(strlen(name) + 1); if (!name_string) { return; @@ -389,13 +398,24 @@ acpi_db_execute(char *name, char **args, acpi_object_type *types, u32 flags) return; } - acpi_gbl_db_method_info.name = name_string; - acpi_gbl_db_method_info.args = args; - acpi_gbl_db_method_info.types = types; - acpi_gbl_db_method_info.flags = flags; + /* Command (ALL ) to execute all methods of a particular name */ - return_obj.pointer = NULL; - return_obj.length = ACPI_ALLOCATE_BUFFER; + else if (flags & EX_ALL) { + acpi_gbl_db_method_info.name = name_string; + return_obj.pointer = NULL; + return_obj.length = ACPI_ALLOCATE_BUFFER; + acpi_db_evaluate_all(name_string); + ACPI_FREE(name_string); + return; + } else { + acpi_gbl_db_method_info.name = name_string; + acpi_gbl_db_method_info.args = args; + acpi_gbl_db_method_info.types = types; + acpi_gbl_db_method_info.flags = flags; + + return_obj.pointer = NULL; + return_obj.length = ACPI_ALLOCATE_BUFFER; + } status = acpi_db_execute_setup(&acpi_gbl_db_method_info); if (ACPI_FAILURE(status)) { @@ -450,6 +470,7 @@ acpi_db_execute(char *name, char **args, acpi_object_type *types, u32 flags) (u32)return_obj.length); acpi_db_dump_external_object(return_obj.pointer, 1); + acpi_os_printf("\n"); /* Dump a _PLD buffer if present */ diff --git a/drivers/acpi/acpica/dbinput.c b/drivers/acpi/acpica/dbinput.c index ee6a1b77af3f..2952856b8a67 100644 --- a/drivers/acpi/acpica/dbinput.c +++ b/drivers/acpi/acpica/dbinput.c @@ -37,6 +37,7 @@ acpi_db_match_command_help(const char *command, enum acpi_ex_debugger_commands { CMD_NOT_FOUND = 0, CMD_NULL, 
+ CMD_ALL, CMD_ALLOCATIONS, CMD_ARGS, CMD_ARGUMENTS, @@ -115,6 +116,7 @@ enum acpi_ex_debugger_commands { static const struct acpi_db_command_info acpi_gbl_db_commands[] = { {"", 0}, {"", 0}, + {"ALL", 1}, {"ALLOCATIONS", 0}, {"ARGS", 0}, {"ARGUMENTS", 0}, @@ -222,6 +224,7 @@ static const struct acpi_db_command_help acpi_gbl_db_command_help[] = { {1, " Type ", "Display object type\n"}, {0, "\nControl Method Execution:", "\n"}, + {1, " All ", "Evaluate all objects named NameSeg\n"}, {1, " Evaluate [Arguments]", "Evaluate object or control method\n"}, {1, " Execute [Arguments]", "Synonym for Evaluate\n"}, @@ -436,7 +439,7 @@ static void acpi_db_display_help(char *command) acpi_os_printf("\n"); } else { - /* Display help for all commands that match the subtring */ + /* Display help for all commands that match the substring */ acpi_db_display_command_info(command, TRUE); } @@ -740,6 +743,15 @@ acpi_db_command_dispatch(char *input_buffer, } break; + case CMD_ALL: + + acpi_os_printf("Executing all objects with NameSeg: %s\n", + acpi_gbl_db_args[1]); + acpi_db_execute(acpi_gbl_db_args[1], &acpi_gbl_db_args[2], + &acpi_gbl_db_arg_types[2], + EX_NO_SINGLE_STEP | EX_ALL); + break; + case CMD_ALLOCATIONS: #ifdef ACPI_DBG_TRACK_ALLOCATIONS diff --git a/drivers/acpi/acpica/dbmethod.c b/drivers/acpi/acpica/dbmethod.c index 4e48a7de7413..889d13828e49 100644 --- a/drivers/acpi/acpica/dbmethod.c +++ b/drivers/acpi/acpica/dbmethod.c @@ -21,6 +21,8 @@ static acpi_status acpi_db_walk_for_execute(acpi_handle obj_handle, u32 nesting_level, void *context, void **return_value); +static acpi_status acpi_db_evaluate_object(struct acpi_namespace_node *node); + /******************************************************************************* * * FUNCTION: acpi_db_set_method_breakpoint @@ -346,42 +348,26 @@ acpi_status acpi_db_disassemble_method(char *name) /******************************************************************************* * - * FUNCTION: acpi_db_walk_for_execute + * FUNCTION: 
acpi_db_evaluate_object * - * PARAMETERS: Callback from walk_namespace + * PARAMETERS: node - Namespace node for the object * * RETURN: Status * - * DESCRIPTION: Batch execution module. Currently only executes predefined - * ACPI names. + * DESCRIPTION: Main execution function for the Evaluate/Execute/All debugger + * commands. * ******************************************************************************/ -static acpi_status -acpi_db_walk_for_execute(acpi_handle obj_handle, - u32 nesting_level, void *context, void **return_value) +static acpi_status acpi_db_evaluate_object(struct acpi_namespace_node *node) { - struct acpi_namespace_node *node = - (struct acpi_namespace_node *)obj_handle; - struct acpi_db_execute_walk *info = - (struct acpi_db_execute_walk *)context; - struct acpi_buffer return_obj; - acpi_status status; char *pathname; u32 i; struct acpi_device_info *obj_info; struct acpi_object_list param_objects; union acpi_object params[ACPI_METHOD_NUM_ARGS]; - const union acpi_predefined_info *predefined; - - predefined = acpi_ut_match_predefined_method(node->name.ascii); - if (!predefined) { - return (AE_OK); - } - - if (node->type == ACPI_TYPE_LOCAL_SCOPE) { - return (AE_OK); - } + struct acpi_buffer return_obj; + acpi_status status; pathname = acpi_ns_get_external_pathname(node); if (!pathname) { @@ -390,7 +376,7 @@ acpi_db_walk_for_execute(acpi_handle obj_handle, /* Get the object info for number of method parameters */ - status = acpi_get_object_info(obj_handle, &obj_info); + status = acpi_get_object_info(node, &obj_info); if (ACPI_FAILURE(status)) { ACPI_FREE(pathname); return (status); @@ -421,14 +407,67 @@ acpi_db_walk_for_execute(acpi_handle obj_handle, acpi_gbl_method_executing = TRUE; status = acpi_evaluate_object(node, NULL, &param_objects, &return_obj); + acpi_gbl_method_executing = FALSE; acpi_os_printf("%-32s returned %s\n", pathname, acpi_format_exception(status)); - acpi_gbl_method_executing = FALSE; + if (return_obj.length) { +
acpi_os_printf("Evaluation of %s returned object %p, " + "external buffer length %X\n", + pathname, return_obj.pointer, + (u32)return_obj.length); + + acpi_db_dump_external_object(return_obj.pointer, 1); + acpi_os_printf("\n"); + } + ACPI_FREE(pathname); /* Ignore status from method execution */ + return (AE_OK); + + /* Update count, check if we have executed enough methods */ + +} + +/******************************************************************************* + * + * FUNCTION: acpi_db_walk_for_execute + * + * PARAMETERS: Callback from walk_namespace + * + * RETURN: Status + * + * DESCRIPTION: Batch execution function. Evaluates all "predefined" objects -- + * the nameseg begins with an underscore. + * + ******************************************************************************/ + +static acpi_status +acpi_db_walk_for_execute(acpi_handle obj_handle, + u32 nesting_level, void *context, void **return_value) +{ + struct acpi_namespace_node *node = + (struct acpi_namespace_node *)obj_handle; + struct acpi_db_execute_walk *info = + (struct acpi_db_execute_walk *)context; + acpi_status status; + const union acpi_predefined_info *predefined; + + predefined = acpi_ut_match_predefined_method(node->name.ascii); + if (!predefined) { + return (AE_OK); + } + + if (node->type == ACPI_TYPE_LOCAL_SCOPE) { + return (AE_OK); + } + + acpi_db_evaluate_object(node); + + /* Ignore status from object evaluation */ + status = AE_OK; /* Update count, check if we have executed enough methods */ @@ -441,6 +480,52 @@ acpi_db_walk_for_execute(acpi_handle obj_handle, return (status); } +/******************************************************************************* + * + * FUNCTION: acpi_db_walk_for_execute_all + * + * PARAMETERS: Callback from walk_namespace + * + * RETURN: Status + * + * DESCRIPTION: Batch execution function. Evaluates all objects whose path ends + * with the nameseg "Info->NameSeg". Used for the "ALL" command. 
+ * + ******************************************************************************/ + +static acpi_status +acpi_db_walk_for_execute_all(acpi_handle obj_handle, + u32 nesting_level, + void *context, void **return_value) +{ + struct acpi_namespace_node *node = + (struct acpi_namespace_node *)obj_handle; + struct acpi_db_execute_walk *info = + (struct acpi_db_execute_walk *)context; + acpi_status status; + + if (!ACPI_COMPARE_NAMESEG(node->name.ascii, info->name_seg)) { + return (AE_OK); + } + + if (node->type == ACPI_TYPE_LOCAL_SCOPE) { + return (AE_OK); + } + + /* Now evaluate the input object (node) */ + + acpi_db_evaluate_object(node); + + /* Ignore status from method execution */ + + status = AE_OK; + + /* Update count of executed methods/objects */ + + info->count++; + return (status); +} + /******************************************************************************* * * FUNCTION: acpi_db_evaluate_predefined_names @@ -470,3 +555,35 @@ void acpi_db_evaluate_predefined_names(void) acpi_os_printf("Evaluated %u predefined names in the namespace\n", info.count); } + +/******************************************************************************* + * + * FUNCTION: acpi_db_evaluate_all + * + * PARAMETERS: name_seg - NameSeg to match against each namespace node + * + * RETURN: None + * + * DESCRIPTION: Namespace batch execution. Implements the "ALL" command. + * Execute all namepaths whose final nameseg matches the + * input nameseg.
+ * + ******************************************************************************/ + +void acpi_db_evaluate_all(char *name_seg) +{ + struct acpi_db_execute_walk info; + + info.count = 0; + info.max_count = ACPI_UINT32_MAX; + ACPI_COPY_NAMESEG(info.name_seg, name_seg); + info.name_seg[ACPI_NAMESEG_SIZE] = 0; + + /* Search all nodes in namespace */ + + (void)acpi_walk_namespace(ACPI_TYPE_ANY, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, acpi_db_walk_for_execute_all, + NULL, (void *)&info, NULL); + + acpi_os_printf("Evaluated %u names in the namespace\n", info.count); +} diff --git a/drivers/acpi/acpica/evgpe.c b/drivers/acpi/acpica/evgpe.c index 3e39907fedd9..06b9c8dd11c9 100644 --- a/drivers/acpi/acpica/evgpe.c +++ b/drivers/acpi/acpica/evgpe.c @@ -656,14 +656,14 @@ acpi_ev_detect_gpe(struct acpi_namespace_node *gpe_device, /* GPE currently enabled (enable bit == 1)? */ - status = acpi_hw_read(&enable_reg, &gpe_register_info->enable_address); + status = acpi_hw_gpe_read(&enable_reg, &gpe_register_info->enable_address); if (ACPI_FAILURE(status)) { goto error_exit; } /* GPE currently active (status bit == 1)? 
*/ - status = acpi_hw_read(&status_reg, &gpe_register_info->status_address); + status = acpi_hw_gpe_read(&status_reg, &gpe_register_info->status_address); if (ACPI_FAILURE(status)) { goto error_exit; } diff --git a/drivers/acpi/acpica/evgpeblk.c b/drivers/acpi/acpica/evgpeblk.c index 132adff1e131..f5298be4273a 100644 --- a/drivers/acpi/acpica/evgpeblk.c +++ b/drivers/acpi/acpica/evgpeblk.c @@ -233,12 +233,6 @@ acpi_ev_create_gpe_info_blocks(struct acpi_gpe_block_info *gpe_block) this_register->status_address.space_id = gpe_block->space_id; this_register->enable_address.space_id = gpe_block->space_id; - this_register->status_address.bit_width = - ACPI_GPE_REGISTER_WIDTH; - this_register->enable_address.bit_width = - ACPI_GPE_REGISTER_WIDTH; - this_register->status_address.bit_offset = 0; - this_register->enable_address.bit_offset = 0; /* Init the event_info for each GPE within this register */ @@ -251,14 +245,14 @@ acpi_ev_create_gpe_info_blocks(struct acpi_gpe_block_info *gpe_block) /* Disable all GPEs within this register */ - status = acpi_hw_write(0x00, &this_register->enable_address); + status = acpi_hw_gpe_write(0x00, &this_register->enable_address); if (ACPI_FAILURE(status)) { goto error_exit; } /* Clear any pending GPE events within this register */ - status = acpi_hw_write(0xFF, &this_register->status_address); + status = acpi_hw_gpe_write(0xFF, &this_register->status_address); if (ACPI_FAILURE(status)) { goto error_exit; } @@ -317,6 +311,23 @@ acpi_ev_create_gpe_block(struct acpi_namespace_node *gpe_device, return_ACPI_STATUS(AE_OK); } + /* Validate the space_ID */ + + if ((space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) && + (space_id != ACPI_ADR_SPACE_SYSTEM_IO)) { + ACPI_ERROR((AE_INFO, + "Unsupported address space: 0x%X", space_id)); + return_ACPI_STATUS(AE_SUPPORT); + } + + if (space_id == ACPI_ADR_SPACE_SYSTEM_IO) { + status = acpi_hw_validate_io_block(address, + ACPI_GPE_REGISTER_WIDTH, + register_count); + if (ACPI_FAILURE(status)) + 
return_ACPI_STATUS(status); + } + /* Allocate a new GPE block */ gpe_block = ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_gpe_block_info)); diff --git a/drivers/acpi/acpica/evgpeinit.c b/drivers/acpi/acpica/evgpeinit.c index 6effd8076dcc..6d82d30d8f7b 100644 --- a/drivers/acpi/acpica/evgpeinit.c +++ b/drivers/acpi/acpica/evgpeinit.c @@ -32,6 +32,16 @@ ACPI_MODULE_NAME("evgpeinit") * kernel boot time as well. */ +#ifdef ACPI_GPE_USE_LOGICAL_ADDRESSES +#define ACPI_FADT_GPE_BLOCK_ADDRESS(N) \ + acpi_gbl_FADT.xgpe##N##_block.space_id == \ + ACPI_ADR_SPACE_SYSTEM_MEMORY ? \ + (u64)acpi_gbl_xgpe##N##_block_logical_address : \ + acpi_gbl_FADT.xgpe##N##_block.address +#else +#define ACPI_FADT_GPE_BLOCK_ADDRESS(N) acpi_gbl_FADT.xgpe##N##_block.address +#endif /* ACPI_GPE_USE_LOGICAL_ADDRESSES */ + /******************************************************************************* * * FUNCTION: acpi_ev_gpe_initialize @@ -49,6 +59,7 @@ acpi_status acpi_ev_gpe_initialize(void) u32 register_count1 = 0; u32 gpe_number_max = 0; acpi_status status; + u64 address; ACPI_FUNCTION_TRACE(ev_gpe_initialize); @@ -85,8 +96,9 @@ acpi_status acpi_ev_gpe_initialize(void) * If EITHER the register length OR the block address are zero, then that * particular block is not supported. */ - if (acpi_gbl_FADT.gpe0_block_length && - acpi_gbl_FADT.xgpe0_block.address) { + address = ACPI_FADT_GPE_BLOCK_ADDRESS(0); + + if (acpi_gbl_FADT.gpe0_block_length && address) { /* GPE block 0 exists (has both length and address > 0) */ @@ -97,7 +109,6 @@ acpi_status acpi_ev_gpe_initialize(void) /* Install GPE Block 0 */ status = acpi_ev_create_gpe_block(acpi_gbl_fadt_gpe_device, - acpi_gbl_FADT.xgpe0_block. address, acpi_gbl_FADT.xgpe0_block. 
space_id, register_count0, 0, @@ -110,8 +121,9 @@ acpi_status acpi_ev_gpe_initialize(void) } } - if (acpi_gbl_FADT.gpe1_block_length && - acpi_gbl_FADT.xgpe1_block.address) { + address = ACPI_FADT_GPE_BLOCK_ADDRESS(1); + + if (acpi_gbl_FADT.gpe1_block_length && address) { /* GPE block 1 exists (has both length and address > 0) */ @@ -137,7 +149,6 @@ acpi_status acpi_ev_gpe_initialize(void) status = acpi_ev_create_gpe_block(acpi_gbl_fadt_gpe_device, - acpi_gbl_FADT.xgpe1_block. address, acpi_gbl_FADT.xgpe1_block. space_id, register_count1, diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index 49c46d4dd070..37bb67ef3232 100644 --- a/drivers/acpi/acpica/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -24,6 +24,76 @@ static acpi_status acpi_hw_gpe_enable_write(u8 enable_mask, struct acpi_gpe_register_info *gpe_register_info); +/****************************************************************************** + * + * FUNCTION: acpi_hw_gpe_read + * + * PARAMETERS: value - Where the value is returned + * reg - GPE register structure + * + * RETURN: Status + * + * DESCRIPTION: Read from a GPE register in either memory or IO space. + * + * LIMITATIONS: + * space_ID must be system_memory or system_IO. 
+ * + ******************************************************************************/ + +acpi_status acpi_hw_gpe_read(u64 *value, struct acpi_gpe_address *reg) +{ + acpi_status status; + u32 value32; + + if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) { +#ifdef ACPI_GPE_USE_LOGICAL_ADDRESSES + *value = (u64)ACPI_GET8(reg->address); + return_ACPI_STATUS(AE_OK); +#else + return acpi_os_read_memory((acpi_physical_address)reg->address, + value, ACPI_GPE_REGISTER_WIDTH); +#endif + } + + status = acpi_os_read_port((acpi_io_address)reg->address, + &value32, ACPI_GPE_REGISTER_WIDTH); + if (ACPI_FAILURE(status)) + return_ACPI_STATUS(status); + + *value = (u64)value32; + + return_ACPI_STATUS(AE_OK); +} + +/****************************************************************************** + * + * FUNCTION: acpi_hw_gpe_write + * + * PARAMETERS: value - Value to be written + * reg - GPE register structure + * + * RETURN: Status + * + * DESCRIPTION: Write to a GPE register in either memory or IO space. 
+ * + ******************************************************************************/ + +acpi_status acpi_hw_gpe_write(u64 value, struct acpi_gpe_address *reg) +{ + if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) { +#ifdef ACPI_GPE_USE_LOGICAL_ADDRESSES + ACPI_SET8(reg->address, value); + return_ACPI_STATUS(AE_OK); +#else + return acpi_os_write_memory((acpi_physical_address)reg->address, + value, ACPI_GPE_REGISTER_WIDTH); +#endif + } + + return acpi_os_write_port((acpi_io_address)reg->address, (u32)value, + ACPI_GPE_REGISTER_WIDTH); +} + /****************************************************************************** * * FUNCTION: acpi_hw_get_gpe_register_bit @@ -79,7 +149,8 @@ acpi_hw_low_set_gpe(struct acpi_gpe_event_info *gpe_event_info, u32 action) /* Get current value of the enable register that contains this GPE */ - status = acpi_hw_read(&enable_mask, &gpe_register_info->enable_address); + status = acpi_hw_gpe_read(&enable_mask, + &gpe_register_info->enable_address); if (ACPI_FAILURE(status)) { return (status); } @@ -118,9 +189,8 @@ acpi_hw_low_set_gpe(struct acpi_gpe_event_info *gpe_event_info, u32 action) /* Write the updated enable mask */ - status = - acpi_hw_write(enable_mask, - &gpe_register_info->enable_address); + status = acpi_hw_gpe_write(enable_mask, + &gpe_register_info->enable_address); } return (status); } @@ -158,8 +228,8 @@ acpi_status acpi_hw_clear_gpe(struct acpi_gpe_event_info *gpe_event_info) */ register_bit = acpi_hw_get_gpe_register_bit(gpe_event_info); - status = - acpi_hw_write(register_bit, &gpe_register_info->status_address); + status = acpi_hw_gpe_write(register_bit, + &gpe_register_info->status_address); return (status); } @@ -227,7 +297,7 @@ acpi_hw_get_gpe_status(struct acpi_gpe_event_info *gpe_event_info, /* GPE currently enabled (enable bit == 1)? 
*/ - status = acpi_hw_read(&in_byte, &gpe_register_info->enable_address); + status = acpi_hw_gpe_read(&in_byte, &gpe_register_info->enable_address); if (ACPI_FAILURE(status)) { return (status); } @@ -238,7 +308,7 @@ acpi_hw_get_gpe_status(struct acpi_gpe_event_info *gpe_event_info, /* GPE currently active (status bit == 1)? */ - status = acpi_hw_read(&in_byte, &gpe_register_info->status_address); + status = acpi_hw_gpe_read(&in_byte, &gpe_register_info->status_address); if (ACPI_FAILURE(status)) { return (status); } @@ -274,7 +344,8 @@ acpi_hw_gpe_enable_write(u8 enable_mask, gpe_register_info->enable_mask = enable_mask; - status = acpi_hw_write(enable_mask, &gpe_register_info->enable_address); + status = acpi_hw_gpe_write(enable_mask, + &gpe_register_info->enable_address); return (status); } @@ -341,9 +412,8 @@ acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, /* Clear status on all GPEs in this register */ - status = - acpi_hw_write(0xFF, - &gpe_block->register_info[i].status_address); + status = acpi_hw_gpe_write(0xFF, + &gpe_block->register_info[i].status_address); if (ACPI_FAILURE(status)) { return (status); } @@ -481,14 +551,14 @@ acpi_hw_get_gpe_block_status(struct acpi_gpe_xrupt_info *gpe_xrupt_info, for (i = 0; i < gpe_block->register_count; i++) { gpe_register_info = &gpe_block->register_info[i]; - status = acpi_hw_read(&in_enable, - &gpe_register_info->enable_address); + status = acpi_hw_gpe_read(&in_enable, + &gpe_register_info->enable_address); if (ACPI_FAILURE(status)) { continue; } - status = acpi_hw_read(&in_status, - &gpe_register_info->status_address); + status = acpi_hw_gpe_read(&in_status, + &gpe_register_info->status_address); if (ACPI_FAILURE(status)) { continue; } diff --git a/drivers/acpi/acpica/hwvalid.c b/drivers/acpi/acpica/hwvalid.c index 4d94861e6093..b2ca7dfd3fc9 100644 --- a/drivers/acpi/acpica/hwvalid.c +++ b/drivers/acpi/acpica/hwvalid.c @@ -292,3 +292,33 @@ acpi_status acpi_hw_write_port(acpi_io_address address, 
u32 value, u32 width) return (AE_OK); } + +/****************************************************************************** + * + * FUNCTION: acpi_hw_validate_io_block + * + * PARAMETERS: Address Address of I/O port/register block + * bit_width Number of bits (8,16,32) in each register + * count Number of registers in the block + * + * RETURN: Status + * + * DESCRIPTION: Validates a block of I/O ports/registers. + * + ******************************************************************************/ + +acpi_status acpi_hw_validate_io_block(u64 address, u32 bit_width, u32 count) +{ + acpi_status status; + + while (count--) { + status = acpi_hw_validate_io_request((acpi_io_address)address, + bit_width); + if (ACPI_FAILURE(status)) + return_ACPI_STATUS(status); + + address += ACPI_DIV_8(bit_width); + } + + return_ACPI_STATUS(AE_OK); +} diff --git a/drivers/acpi/acpica/nsalloc.c b/drivers/acpi/acpica/nsalloc.c index fe9b3639a87d..83d26abcf448 100644 --- a/drivers/acpi/acpica/nsalloc.c +++ b/drivers/acpi/acpica/nsalloc.c @@ -294,7 +294,7 @@ void acpi_ns_delete_children(struct acpi_namespace_node *parent_node) node_to_delete = next_node; next_node = next_node->peer; acpi_ns_delete_node(node_to_delete); - }; + } /* Clear the parent's child pointer */ diff --git a/drivers/acpi/acpica/nsarguments.c b/drivers/acpi/acpica/nsarguments.c index d5e8405e9d8f..6bbc7d350a16 100644 --- a/drivers/acpi/acpica/nsarguments.c +++ b/drivers/acpi/acpica/nsarguments.c @@ -55,7 +55,9 @@ void acpi_ns_check_argument_types(struct acpi_evaluate_info *info) arg_type = METHOD_GET_NEXT_TYPE(arg_type_list); user_arg_type = info->parameters[i]->common.type; - if (user_arg_type != arg_type) { + /* No typechecking for ACPI_TYPE_ANY */ + + if ((user_arg_type != arg_type) && (arg_type != ACPI_TYPE_ANY)) { ACPI_WARN_PREDEFINED((AE_INFO, info->full_pathname, ACPI_WARN_ALWAYS, "Argument #%u type mismatch - " diff --git a/drivers/acpi/acpica/nsxfobj.c b/drivers/acpi/acpica/nsxfobj.c index
c022bef263e5..324269481160 100644 --- a/drivers/acpi/acpica/nsxfobj.c +++ b/drivers/acpi/acpica/nsxfobj.c @@ -24,7 +24,8 @@ ACPI_MODULE_NAME("nsxfobj") * * RETURN: Status * - * DESCRIPTION: This routine returns the type associatd with a particular handle + * DESCRIPTION: This routine returns the type associated with a particular + * handle * ******************************************************************************/ acpi_status acpi_get_type(acpi_handle handle, acpi_object_type *ret_type) diff --git a/drivers/acpi/acpica/psparse.c b/drivers/acpi/acpica/psparse.c index c780046bf294..bd3caf735be3 100644 --- a/drivers/acpi/acpica/psparse.c +++ b/drivers/acpi/acpica/psparse.c @@ -508,8 +508,8 @@ acpi_status acpi_ps_parse_aml(struct acpi_walk_state *walk_state) } /* - * If the transfer to the new method method call worked - *, a new walk state was created -- get it + * If the transfer to the new method call worked, + * a new walk state was created -- get it */ walk_state = acpi_ds_get_current_walk_state(thread); continue; diff --git a/drivers/acpi/acpica/utpredef.c b/drivers/acpi/acpica/utpredef.c index 05fe3470fb93..dd277f7e9f10 100644 --- a/drivers/acpi/acpica/utpredef.c +++ b/drivers/acpi/acpica/utpredef.c @@ -151,7 +151,7 @@ static u32 acpi_ut_get_argument_types(char *buffer, u16 argument_types); static const char *ut_external_type_names[] = /* Indexed by ACPI_TYPE_* */ { - ", UNSUPPORTED-TYPE", + ", Type_ANY", ", Integer", ", String", ", Buffer", @@ -311,8 +311,7 @@ static u32 acpi_ut_get_argument_types(char *buffer, u16 argument_types) for (i = 0; i < arg_count; i++) { this_argument_type = METHOD_GET_NEXT_TYPE(argument_types); - if (!this_argument_type - || (this_argument_type > METHOD_MAX_ARG_TYPE)) { + if (this_argument_type > METHOD_MAX_ARG_TYPE) { printf("**** Invalid argument type (%u) " "in predefined info structure\n", this_argument_type); diff --git a/drivers/acpi/acpica/utstrsuppt.c b/drivers/acpi/acpica/utstrsuppt.c index
05ff20049b87..2d91003fcf26 100644 --- a/drivers/acpi/acpica/utstrsuppt.c +++ b/drivers/acpi/acpica/utstrsuppt.c @@ -45,10 +45,15 @@ acpi_status acpi_ut_convert_octal_string(char *string, u64 *return_value_ptr) /* Convert each ASCII byte in the input string */ while (*string) { - - /* Character must be ASCII 0-7, otherwise terminate with no error */ - + /* + * Character must be ASCII 0-7, otherwise: + * 1) Runtime: terminate with no error, per the ACPI spec + * 2) Compiler: return an error + */ if (!(ACPI_IS_OCTAL_DIGIT(*string))) { +#ifdef ACPI_ASL_COMPILER + status = AE_BAD_OCTAL_CONSTANT; +#endif break; } @@ -94,10 +99,15 @@ acpi_status acpi_ut_convert_decimal_string(char *string, u64 *return_value_ptr) /* Convert each ASCII byte in the input string */ while (*string) { - - /* Character must be ASCII 0-9, otherwise terminate with no error */ - + /* + * Character must be ASCII 0-9, otherwise: + * 1) Runtime: terminate with no error, per the ACPI spec + * 2) Compiler: return an error + */ if (!isdigit(*string)) { +#ifdef ACPI_ASL_COMPILER + status = AE_BAD_DECIMAL_CONSTANT; +#endif break; } @@ -143,10 +153,15 @@ acpi_status acpi_ut_convert_hex_string(char *string, u64 *return_value_ptr) /* Convert each ASCII byte in the input string */ while (*string) { - - /* Must be ASCII A-F, a-f, or 0-9, otherwise terminate with no error */ - + /* + * Character must be ASCII A-F, a-f, or 0-9, otherwise: + * 1) Runtime: terminate with no error, per the ACPI spec + * 2) Compiler: return an error + */ if (!isxdigit(*string)) { +#ifdef ACPI_ASL_COMPILER + status = AE_BAD_HEX_CONSTANT; +#endif break; } diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c index e358d0046494..552fd9ffaca4 100644 --- a/drivers/acpi/apei/apei-base.c +++ b/drivers/acpi/apei/apei-base.c @@ -632,7 +632,15 @@ int apei_map_generic_address(struct acpi_generic_address *reg) rc = apei_check_gar(reg, &address, &access_bit_width); if (rc) return rc; - return acpi_os_map_generic_address(reg);
+ + /* Only system memory gets mapped; nothing to do for other spaces */ + if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) + return 0; + + if (!acpi_os_map_generic_address(reg)) + return -ENXIO; + + return 0; } EXPORT_SYMBOL_GPL(apei_map_generic_address); diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index e670785a6201..6f89c16f45f3 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1329,7 +1329,7 @@ static int __init arm_smmu_v3_set_proximity(struct device *dev, smmu = (struct acpi_iort_smmu_v3 *)node->node_data; if (smmu->flags & ACPI_IORT_SMMU_V3_PXM_VALID) { - int dev_node = acpi_map_pxm_to_node(smmu->pxm); + int dev_node = pxm_to_node(smmu->pxm); if (dev_node != NUMA_NO_NODE && !node_online(dev_node)) return -EINVAL; diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 54002670cb7a..113c661eb848 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -303,7 +303,11 @@ static void acpi_bus_osc_support(void) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT; +#ifdef CONFIG_ARM64 + capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; +#endif #ifdef CONFIG_X86 + capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; if (boot_cpu_has(X86_FEATURE_HWP)) { capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPCV2_SUPPORT; diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index a4eda7fe50d3..da4b125ab4c3 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -153,6 +153,7 @@ struct acpi_button { int last_state; ktime_t last_time; bool suspended; + bool lid_state_initialized; }; static struct acpi_device *lid_device; @@ -383,6 +384,8 @@ static int acpi_lid_update_state(struct acpi_device *device, static void acpi_lid_initialize_state(struct acpi_device *device) { + struct acpi_button *button = acpi_driver_data(device); + switch (lid_init_state) { case ACPI_BUTTON_LID_INIT_OPEN: (void)acpi_lid_notify_state(device, 1); @@ -394,13 +397,14 @@ static void acpi_lid_initialize_state(struct acpi_device *device) default: break; } +
+ button->lid_state_initialized = true; } static void acpi_button_notify(struct acpi_device *device, u32 event) { struct acpi_button *button = acpi_driver_data(device); struct input_dev *input; - int users; switch (event) { case ACPI_FIXED_HARDWARE_EVENT: @@ -409,10 +413,7 @@ static void acpi_button_notify(struct acpi_device *device, u32 event) case ACPI_BUTTON_NOTIFY_STATUS: input = button->input; if (button->type == ACPI_BUTTON_TYPE_LID) { - mutex_lock(&button->input->mutex); - users = button->input->users; - mutex_unlock(&button->input->mutex); - if (users) + if (button->lid_state_initialized) acpi_lid_update_state(device, true); } else { int keycode; @@ -457,7 +458,7 @@ static int acpi_button_resume(struct device *dev) struct acpi_button *button = acpi_driver_data(device); button->suspended = false; - if (button->type == ACPI_BUTTON_TYPE_LID && button->input->users) { + if (button->type == ACPI_BUTTON_TYPE_LID) { button->last_state = !!acpi_lid_evaluate_state(device); button->last_time = ktime_get(); acpi_lid_initialize_state(device); diff --git a/drivers/acpi/container.c b/drivers/acpi/container.c index 9ea5f55d97e3..ccaa647ac3d4 100644 --- a/drivers/acpi/container.c +++ b/drivers/acpi/container.c @@ -14,9 +14,6 @@ #include "internal.h" -#define _COMPONENT ACPI_CONTAINER_COMPONENT -ACPI_MODULE_NAME("container"); - static const struct acpi_device_id container_device_ids[] = { {"ACPI0004", 0}, {"PNP0A05", 0}, diff --git a/drivers/acpi/custom_method.c b/drivers/acpi/custom_method.c index b097ef209313..7b54dc95d36b 100644 --- a/drivers/acpi/custom_method.c +++ b/drivers/acpi/custom_method.c @@ -13,8 +13,6 @@ #include "internal.h" -#define _COMPONENT ACPI_SYSTEM_COMPONENT -ACPI_MODULE_NAME("custom_method"); MODULE_LICENSE("GPL"); static struct dentry *cm_dentry; diff --git a/drivers/acpi/debugfs.c b/drivers/acpi/debugfs.c index d5ecea3715f8..074eb98d213e 100644 --- a/drivers/acpi/debugfs.c +++ b/drivers/acpi/debugfs.c @@ -10,9 +10,6 @@ #include "internal.h" 
-#define _COMPONENT ACPI_SYSTEM_COMPONENT -ACPI_MODULE_NAME("debugfs"); - struct dentry *acpi_debugfs_dir; EXPORT_SYMBOL_GPL(acpi_debugfs_dir); diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c index 9bd72c26ef46..45d4b7b69de8 100644 --- a/drivers/acpi/dock.c +++ b/drivers/acpi/dock.c @@ -20,8 +20,6 @@ #include "internal.h" -ACPI_MODULE_NAME("dock"); - static bool immediate_undock = 1; module_param(immediate_undock, bool, 0644); MODULE_PARM_DESC(immediate_undock, "1 (default) will cause the driver to " diff --git a/drivers/acpi/dptf/Kconfig b/drivers/acpi/dptf/Kconfig index 90a2fd979282..51f06f36cafa 100644 --- a/drivers/acpi/dptf/Kconfig +++ b/drivers/acpi/dptf/Kconfig @@ -14,3 +14,17 @@ config DPTF_POWER To compile this driver as a module, choose M here: the module will be called dptf_power. + +config DPTF_PCH_FIVR + tristate "DPTF PCH FIVR Participant" + depends on X86 + help + This driver adds support for Dynamic Platform and Thermal Framework + (DPTF) PCH FIVR Participant device support. This driver allows to + switch PCH FIVR (Fully Integrated Voltage Regulator) frequency. + This participant is responsible for exposing: + freq_mhz_low_clock + freq_mhz_high_clock + + To compile this driver as a module, choose M here: + the module will be called dptf_pch_fivr. diff --git a/drivers/acpi/dptf/Makefile b/drivers/acpi/dptf/Makefile index 1a9b0a2b25bf..297340682f66 100644 --- a/drivers/acpi/dptf/Makefile +++ b/drivers/acpi/dptf/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_ACPI) += int340x_thermal.o obj-$(CONFIG_DPTF_POWER) += dptf_power.o +obj-$(CONFIG_DPTF_PCH_FIVR) += dptf_pch_fivr.o diff --git a/drivers/acpi/dptf/dptf_pch_fivr.c b/drivers/acpi/dptf/dptf_pch_fivr.c new file mode 100644 index 000000000000..4ab288827747 --- /dev/null +++ b/drivers/acpi/dptf/dptf_pch_fivr.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * dptf_pch_fivr: DPTF PCH FIVR Participant driver + * Copyright (c) 2020, Intel Corporation. 
+ */ + +#include <linux/acpi.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/platform_device.h> + +/* + * Presentation of attributes which are defined for INTC1045 + * They are: + * freq_mhz_low_clock : Set PCH FIVR switching freq for + * FIVR clock 19.2MHz and 24MHz + * freq_mhz_high_clock : Set PCH FIVR switching freq for + * FIVR clock 38.4MHz + */ +#define PCH_FIVR_SHOW(name, method) \ +static ssize_t name##_show(struct device *dev,\ + struct device_attribute *attr,\ + char *buf)\ +{\ + struct acpi_device *acpi_dev = dev_get_drvdata(dev);\ + unsigned long long val;\ + acpi_status status;\ +\ + status = acpi_evaluate_integer(acpi_dev->handle, #method,\ + NULL, &val);\ + if (ACPI_SUCCESS(status))\ + return sprintf(buf, "%d\n", (int)val);\ + else\ + return -EINVAL;\ +} + +#define PCH_FIVR_STORE(name, method) \ +static ssize_t name##_store(struct device *dev,\ + struct device_attribute *attr,\ + const char *buf, size_t count)\ +{\ + struct acpi_device *acpi_dev = dev_get_drvdata(dev);\ + acpi_status status;\ + u32 val;\ +\ + if (kstrtouint(buf, 0, &val) < 0)\ + return -EINVAL;\ +\ + status = acpi_execute_simple_method(acpi_dev->handle, #method, val);\ + if (ACPI_SUCCESS(status))\ + return count;\ +\ + return -EINVAL;\ +} + +PCH_FIVR_SHOW(freq_mhz_low_clock, GFC0) +PCH_FIVR_SHOW(freq_mhz_high_clock, GFC1) +PCH_FIVR_STORE(freq_mhz_low_clock, RFC0) +PCH_FIVR_STORE(freq_mhz_high_clock, RFC1) + +static DEVICE_ATTR_RW(freq_mhz_low_clock); +static DEVICE_ATTR_RW(freq_mhz_high_clock); + +static struct attribute *fivr_attrs[] = { + &dev_attr_freq_mhz_low_clock.attr, + &dev_attr_freq_mhz_high_clock.attr, + NULL +}; + +static const struct attribute_group pch_fivr_attribute_group = { + .attrs = fivr_attrs, + .name = "pch_fivr_switch_frequency" +}; + +static int pch_fivr_add(struct platform_device *pdev) +{ + struct acpi_device *acpi_dev; + unsigned long long ptype; + acpi_status status; + int result; + + acpi_dev = ACPI_COMPANION(&(pdev->dev)); + if (!acpi_dev) + return -ENODEV; + + status =
acpi_evaluate_integer(acpi_dev->handle, "PTYP", NULL, &ptype); + if (ACPI_FAILURE(status) || ptype != 0x05) + return -ENODEV; + + /* Set drvdata first: show/store read it once the group is visible */ + platform_set_drvdata(pdev, acpi_dev); + result = sysfs_create_group(&pdev->dev.kobj, + &pch_fivr_attribute_group); + if (result) + return result; + + return 0; +} + +static int pch_fivr_remove(struct platform_device *pdev) +{ + sysfs_remove_group(&pdev->dev.kobj, &pch_fivr_attribute_group); + + return 0; +} + +static const struct acpi_device_id pch_fivr_device_ids[] = { + {"INTC1045", 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, pch_fivr_device_ids); + +static struct platform_driver pch_fivr_driver = { + .probe = pch_fivr_add, + .remove = pch_fivr_remove, + .driver = { + .name = "DPTF PCH FIVR", + .acpi_match_table = pch_fivr_device_ids, + }, +}; + +module_platform_driver(pch_fivr_driver); + +MODULE_AUTHOR("Srinivas Pandruvada "); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("ACPI DPTF PCH FIVR driver"); diff --git a/drivers/acpi/dptf/int340x_thermal.c b/drivers/acpi/dptf/int340x_thermal.c index bc71a6a60334..8d420c7e7178 100644 --- a/drivers/acpi/dptf/int340x_thermal.c +++ b/drivers/acpi/dptf/int340x_thermal.c @@ -27,6 +27,7 @@ static const struct acpi_device_id int340x_thermal_device_ids[] = { {"INTC1040"}, {"INTC1043"}, {"INTC1044"}, + {"INTC1045"}, {"INTC1047"}, {""}, }; diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index fcddda3d6712..e0cb1bcfffb2 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -2011,20 +2011,16 @@ bool acpi_ec_dispatch_gpe(void) if (acpi_any_gpe_status_set(first_ec->gpe)) return true; - if (ec_no_wakeup) - return false; - /* * Dispatch the EC GPE in-band, but do not report wakeup in any case * to allow the caller to process events properly after that. */ ret = acpi_dispatch_gpe(NULL, first_ec->gpe); - if (ret == ACPI_INTERRUPT_HANDLED) { + if (ret == ACPI_INTERRUPT_HANDLED) pm_pr_dbg("ACPI EC GPE dispatched\n"); - /* Flush the event and query workqueues.
*/ - acpi_ec_flush_work(); - } + /* Flush the event and query workqueues. */ + acpi_ec_flush_work(); return false; } diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index 47f21599f2ab..170643927044 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -19,9 +19,6 @@ #include "internal.h" -#define _COMPONENT ACPI_SYSTEM_COMPONENT -ACPI_MODULE_NAME("event"); - /* ACPI notifier chain */ static BLOCKING_NOTIFIER_HEAD(acpi_chain_head); diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 26dd208a0d63..756227837b3b 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1389,7 +1389,7 @@ static bool ars_supported(struct nvdimm_bus *nvdimm_bus) static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); if (a == &dev_attr_scrub.attr) @@ -1679,7 +1679,7 @@ static struct attribute *acpi_nfit_dimm_attributes[] = { static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj, struct attribute *a, int n) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); @@ -3006,10 +3006,8 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, ndr_desc->provider_data = nfit_spa; ndr_desc->attr_groups = acpi_nfit_region_attribute_groups; if (spa->flags & ACPI_NFIT_PROXIMITY_VALID) { - ndr_desc->numa_node = acpi_map_pxm_to_online_node( - spa->proximity_domain); - ndr_desc->target_node = acpi_map_pxm_to_node( - spa->proximity_domain); + ndr_desc->numa_node = pxm_to_online_node(spa->proximity_domain); + ndr_desc->target_node = pxm_to_node(spa->proximity_domain); } else { ndr_desc->numa_node = NUMA_NO_NODE; ndr_desc->target_node = NUMA_NO_NODE; diff --git a/drivers/acpi/numa/hmat.c 
b/drivers/acpi/numa/hmat.c index 134bcb40b2af..cb73a5d6ea76 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -63,7 +63,7 @@ struct memory_target { unsigned int memory_pxm; unsigned int processor_pxm; struct resource memregions; - struct node_hmem_attrs hmem_attrs; + struct node_hmem_attrs hmem_attrs[2]; struct list_head caches; struct node_cache_attrs cache_attrs; bool registered; @@ -72,6 +72,7 @@ struct memory_target { struct memory_initiator { struct list_head node; unsigned int processor_pxm; + bool has_cpu; }; struct memory_locality { @@ -115,6 +116,7 @@ static __init void alloc_memory_initiator(unsigned int cpu_pxm) return; initiator->processor_pxm = cpu_pxm; + initiator->has_cpu = node_state(pxm_to_node(cpu_pxm), N_CPU); list_add_tail(&initiator->node, &initiators); } @@ -222,28 +224,28 @@ static u32 hmat_normalize(u16 entry, u64 base, u8 type) } static void hmat_update_target_access(struct memory_target *target, - u8 type, u32 value) + u8 type, u32 value, int access) { switch (type) { case ACPI_HMAT_ACCESS_LATENCY: - target->hmem_attrs.read_latency = value; - target->hmem_attrs.write_latency = value; + target->hmem_attrs[access].read_latency = value; + target->hmem_attrs[access].write_latency = value; break; case ACPI_HMAT_READ_LATENCY: - target->hmem_attrs.read_latency = value; + target->hmem_attrs[access].read_latency = value; break; case ACPI_HMAT_WRITE_LATENCY: - target->hmem_attrs.write_latency = value; + target->hmem_attrs[access].write_latency = value; break; case ACPI_HMAT_ACCESS_BANDWIDTH: - target->hmem_attrs.read_bandwidth = value; - target->hmem_attrs.write_bandwidth = value; + target->hmem_attrs[access].read_bandwidth = value; + target->hmem_attrs[access].write_bandwidth = value; break; case ACPI_HMAT_READ_BANDWIDTH: - target->hmem_attrs.read_bandwidth = value; + target->hmem_attrs[access].read_bandwidth = value; break; case ACPI_HMAT_WRITE_BANDWIDTH: - target->hmem_attrs.write_bandwidth = value; + 
target->hmem_attrs[access].write_bandwidth = value; break; default: break; @@ -336,8 +338,12 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, if (mem_hier == ACPI_HMAT_MEMORY) { target = find_mem_target(targs[targ]); - if (target && target->processor_pxm == inits[init]) - hmat_update_target_access(target, type, value); + if (target && target->processor_pxm == inits[init]) { + hmat_update_target_access(target, type, value, 0); + /* If the node has a CPU, update access 1 */ + if (node_state(pxm_to_node(inits[init]), N_CPU)) + hmat_update_target_access(target, type, value, 1); + } } } } @@ -431,7 +437,8 @@ static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *heade pr_info("HMAT: Memory Flags:%04x Processor Domain:%u Memory Domain:%u\n", p->flags, p->processor_PD, p->memory_PD); - if (p->flags & ACPI_HMAT_MEMORY_PD_VALID && hmat_revision == 1) { + if ((hmat_revision == 1 && p->flags & ACPI_HMAT_MEMORY_PD_VALID) || + hmat_revision > 1) { target = find_mem_target(p->memory_PD); if (!target) { pr_debug("HMAT: Memory Domain missing from SRAT\n"); @@ -573,6 +580,7 @@ static void hmat_register_target_initiators(struct memory_target *target) unsigned int mem_nid, cpu_nid; struct memory_locality *loc = NULL; u32 best = 0; + bool access0done = false; int i; mem_nid = pxm_to_node(target->memory_pxm); @@ -584,7 +592,11 @@ static void hmat_register_target_initiators(struct memory_target *target) if (target->processor_pxm != PXM_INVAL) { cpu_nid = pxm_to_node(target->processor_pxm); register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); - return; + access0done = true; + if (node_state(cpu_nid, N_CPU)) { + register_memory_node_under_compute_node(mem_nid, cpu_nid, 1); + return; + } } if (list_empty(&localities)) @@ -598,6 +610,41 @@ static void hmat_register_target_initiators(struct memory_target *target) */ bitmap_zero(p_nodes, MAX_NUMNODES); list_sort(p_nodes, &initiators, initiator_cmp); + if (!access0done) { + for (i = 
WRITE_LATENCY; i <= READ_BANDWIDTH; i++) { + loc = localities_types[i]; + if (!loc) + continue; + + best = 0; + list_for_each_entry(initiator, &initiators, node) { + u32 value; + + if (!test_bit(initiator->processor_pxm, p_nodes)) + continue; + + value = hmat_initiator_perf(target, initiator, + loc->hmat_loc); + if (hmat_update_best(loc->hmat_loc->data_type, value, &best)) + bitmap_clear(p_nodes, 0, initiator->processor_pxm); + if (value != best) + clear_bit(initiator->processor_pxm, p_nodes); + } + if (best) + hmat_update_target_access(target, loc->hmat_loc->data_type, + best, 0); + } + + for_each_set_bit(i, p_nodes, MAX_NUMNODES) { + cpu_nid = pxm_to_node(i); + register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); + } + } + + /* Access 1 ignores Generic Initiators */ + bitmap_zero(p_nodes, MAX_NUMNODES); + list_sort(p_nodes, &initiators, initiator_cmp); + best = 0; for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) { loc = localities_types[i]; if (!loc) @@ -607,6 +654,10 @@ static void hmat_register_target_initiators(struct memory_target *target) list_for_each_entry(initiator, &initiators, node) { u32 value; + if (!initiator->has_cpu) { + clear_bit(initiator->processor_pxm, p_nodes); + continue; + } if (!test_bit(initiator->processor_pxm, p_nodes)) continue; @@ -617,12 +668,11 @@ static void hmat_register_target_initiators(struct memory_target *target) clear_bit(initiator->processor_pxm, p_nodes); } if (best) - hmat_update_target_access(target, loc->hmat_loc->data_type, best); + hmat_update_target_access(target, loc->hmat_loc->data_type, best, 1); } - for_each_set_bit(i, p_nodes, MAX_NUMNODES) { cpu_nid = pxm_to_node(i); - register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); + register_memory_node_under_compute_node(mem_nid, cpu_nid, 1); } } @@ -635,10 +685,10 @@ static void hmat_register_target_cache(struct memory_target *target) node_add_cache(mem_nid, &tcache->cache_attrs); } -static void hmat_register_target_perf(struct memory_target *target) 
+static void hmat_register_target_perf(struct memory_target *target, int access) { unsigned mem_nid = pxm_to_node(target->memory_pxm); - node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0); + node_set_perf_attrs(mem_nid, &target->hmem_attrs[access], access); } static void hmat_register_target_devices(struct memory_target *target) @@ -653,7 +703,7 @@ static void hmat_register_target_devices(struct memory_target *target) return; for (res = target->memregions.child; res; res = res->sibling) { - int target_nid = acpi_map_pxm_to_node(target->memory_pxm); + int target_nid = pxm_to_node(target->memory_pxm); hmem_register_device(target_nid, res); } @@ -683,7 +733,8 @@ static void hmat_register_target(struct memory_target *target) if (!target->registered) { hmat_register_target_initiators(target); hmat_register_target_cache(target); - hmat_register_target_perf(target); + hmat_register_target_perf(target, 0); + hmat_register_target_perf(target, 1); target->registered = true; } mutex_unlock(&target_lock); diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 1b0ae0a1959b..6021a1013442 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -36,7 +36,7 @@ void __init disable_srat(void) int pxm_to_node(int pxm) { - if (pxm < 0) + if (pxm < 0 || pxm >= MAX_PXM_DOMAINS || numa_off) return NUMA_NO_NODE; return pxm_to_node_map[pxm]; } @@ -135,6 +135,36 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) } break; + case ACPI_SRAT_TYPE_GENERIC_AFFINITY: + { + struct acpi_srat_generic_affinity *p = + (struct acpi_srat_generic_affinity *)header; + + if (p->device_handle_type == 0) { + /* + * For pci devices this may be the only place they + * are assigned a proximity domain + */ + pr_debug("SRAT Generic Initiator(Seg:%u BDF:%u) in proximity domain %d %s\n", + *(u16 *)(&p->device_handle[0]), + *(u16 *)(&p->device_handle[2]), + p->proximity_domain, + (p->flags & ACPI_SRAT_GENERIC_AFFINITY_ENABLED) ? 
+ "enabled" : "disabled"); + } else { + /* + * In this case we can rely on the device having a + * proximity domain reference + */ + pr_debug("SRAT Generic Initiator(HID=%.8s UID=%.4s) in proximity domain %d %s\n", + (char *)(&p->device_handle[0]), + (char *)(&p->device_handle[8]), + p->proximity_domain, + (p->flags & ACPI_SRAT_GENERIC_AFFINITY_ENABLED) ? + "enabled" : "disabled"); + } + } + break; default: pr_warn("Found unsupported SRAT entry (type = 0x%x)\n", header->type); @@ -337,6 +367,41 @@ acpi_parse_gicc_affinity(union acpi_subtable_headers *header, return 0; } +#if defined(CONFIG_X86) || defined(CONFIG_ARM64) +static int __init +acpi_parse_gi_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_generic_affinity *gi_affinity; + int node; + + gi_affinity = (struct acpi_srat_generic_affinity *)header; + if (!gi_affinity) + return -EINVAL; + acpi_table_print_srat_entry(&header->common); + + if (!(gi_affinity->flags & ACPI_SRAT_GENERIC_AFFINITY_ENABLED)) + return -EINVAL; + + node = acpi_map_pxm_to_node(gi_affinity->proximity_domain); + if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) { + pr_err("SRAT: Too many proximity domains.\n"); + return -EINVAL; + } + node_set(node, numa_nodes_parsed); + node_set_state(node, N_GENERIC_INITIATOR); + + return 0; +} +#else +static int __init +acpi_parse_gi_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + return 0; +} +#endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */ + static int __initdata parsed_numa_memblks; static int __init @@ -390,7 +455,7 @@ int __init acpi_numa_init(void) /* SRAT: System Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { - struct acpi_subtable_proc srat_proc[3]; + struct acpi_subtable_proc srat_proc[4]; memset(srat_proc, 0, sizeof(srat_proc)); srat_proc[0].id = ACPI_SRAT_TYPE_CPU_AFFINITY; @@ -399,6 +464,8 @@ int __init acpi_numa_init(void) srat_proc[1].handler = 
acpi_parse_x2apic_affinity; srat_proc[2].id = ACPI_SRAT_TYPE_GICC_AFFINITY; srat_proc[2].handler = acpi_parse_gicc_affinity; + srat_proc[3].id = ACPI_SRAT_TYPE_GENERIC_AFFINITY; + srat_proc[3].handler = acpi_parse_gi_affinity; acpi_table_parse_entries_array(ACPI_SIG_SRAT, sizeof(struct acpi_table_srat), @@ -441,6 +508,6 @@ int acpi_get_node(acpi_handle handle) pxm = acpi_get_pxm(handle); - return acpi_map_pxm_to_node(pxm); + return pxm_to_node(pxm); } EXPORT_SYMBOL(acpi_get_node); diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 4a0b07792233..0418febc5cf2 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -447,24 +447,19 @@ void __ref acpi_os_unmap_memory(void *virt, acpi_size size) } EXPORT_SYMBOL_GPL(acpi_os_unmap_memory); -int acpi_os_map_generic_address(struct acpi_generic_address *gas) +void __iomem *acpi_os_map_generic_address(struct acpi_generic_address *gas) { u64 addr; - void __iomem *virt; if (gas->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) - return 0; + return NULL; /* Handle possible alignment issues */ memcpy(&addr, &gas->address, sizeof(addr)); if (!addr || !gas->bit_width) - return -EINVAL; + return NULL; - virt = acpi_os_map_iomem(addr, gas->bit_width / 8); - if (!virt) - return -EIO; - - return 0; + return acpi_os_map_iomem(addr, gas->bit_width / 8); } EXPORT_SYMBOL(acpi_os_map_generic_address); @@ -1749,17 +1744,22 @@ acpi_status __init acpi_os_initialize(void) { acpi_os_map_generic_address(&acpi_gbl_FADT.xpm1a_event_block); acpi_os_map_generic_address(&acpi_gbl_FADT.xpm1b_event_block); - acpi_os_map_generic_address(&acpi_gbl_FADT.xgpe0_block); - acpi_os_map_generic_address(&acpi_gbl_FADT.xgpe1_block); + + acpi_gbl_xgpe0_block_logical_address = + (unsigned long)acpi_os_map_generic_address(&acpi_gbl_FADT.xgpe0_block); + acpi_gbl_xgpe1_block_logical_address = + (unsigned long)acpi_os_map_generic_address(&acpi_gbl_FADT.xgpe1_block); + if (acpi_gbl_FADT.flags & ACPI_FADT_RESET_REGISTER) { /* * Use acpi_os_map_generic_address to 
pre-map the reset * register if it's in system memory. */ - int rv; + void *rv; rv = acpi_os_map_generic_address(&acpi_gbl_FADT.reset_register); - pr_debug(PREFIX "%s: map reset_reg status %d\n", __func__, rv); + pr_debug(PREFIX "%s: map reset_reg %s\n", __func__, + rv ? "successful" : "failed"); } acpi_os_initialized = true; @@ -1787,8 +1787,12 @@ acpi_status acpi_os_terminate(void) acpi_os_unmap_generic_address(&acpi_gbl_FADT.xgpe1_block); acpi_os_unmap_generic_address(&acpi_gbl_FADT.xgpe0_block); + acpi_gbl_xgpe0_block_logical_address = 0UL; + acpi_gbl_xgpe1_block_logical_address = 0UL; + acpi_os_unmap_generic_address(&acpi_gbl_FADT.xpm1b_event_block); acpi_os_unmap_generic_address(&acpi_gbl_FADT.xpm1a_event_block); + if (acpi_gbl_FADT.flags & ACPI_FADT_RESET_REGISTER) acpi_os_unmap_generic_address(&acpi_gbl_FADT.reset_register); diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index f90e841c59f5..c12b5fb3e8fb 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -24,8 +24,6 @@ #include "internal.h" -#define _COMPONENT ACPI_PCI_COMPONENT -ACPI_MODULE_NAME("pci_root"); #define ACPI_PCI_ROOT_CLASS "pci_bridge" #define ACPI_PCI_ROOT_DEVICE_NAME "PCI Root Bridge" static int acpi_pci_root_add(struct acpi_device *device, @@ -62,7 +60,7 @@ static DEFINE_MUTEX(osc_lock); /** * acpi_is_root_bridge - determine whether an ACPI CA node is a PCI root bridge - * @handle - the ACPI CA node in question. + * @handle: the ACPI CA node in question. * * Note: we could make this API take a struct acpi_device * instead, but * for now, it's more convenient to operate on an acpi_handle. 
diff --git a/drivers/acpi/pci_slot.c b/drivers/acpi/pci_slot.c index ca2461d1bf14..d6cb2c27a23b 100644 --- a/drivers/acpi/pci_slot.c +++ b/drivers/acpi/pci_slot.c @@ -28,9 +28,6 @@ static int check_sta_before_sun; -#define _COMPONENT ACPI_PCI_COMPONENT -ACPI_MODULE_NAME("pci_slot"); - #define SLOT_NAME_SIZE 21 /* Inspired by #define in acpiphp.h */ struct acpi_pci_slot { diff --git a/drivers/acpi/pmic/Kconfig b/drivers/acpi/pmic/Kconfig new file mode 100644 index 000000000000..56bbcb2ce61b --- /dev/null +++ b/drivers/acpi/pmic/Kconfig @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: GPL-2.0 + +menuconfig PMIC_OPREGION + bool "PMIC (Power Management Integrated Circuit) operation region support" + help + Select this option to enable support for ACPI operation + region of the PMIC chip. The operation region can be used + to control power rails and sensor reading/writing on the + PMIC chip. + +if PMIC_OPREGION + +config BYTCRC_PMIC_OPREGION + bool "ACPI operation region support for Bay Trail Crystal Cove PMIC" + depends on INTEL_SOC_PMIC + help + This config adds ACPI operation region support for the Bay Trail + version of the Crystal Cove PMIC. + +config CHTCRC_PMIC_OPREGION + bool "ACPI operation region support for Cherry Trail Crystal Cove PMIC" + depends on INTEL_SOC_PMIC + help + This config adds ACPI operation region support for the Cherry Trail + version of the Crystal Cove PMIC. + +config XPOWER_PMIC_OPREGION + bool "ACPI operation region support for XPower AXP288 PMIC" + depends on MFD_AXP20X_I2C && IOSF_MBI=y + help + This config adds ACPI operation region support for XPower AXP288 PMIC. + +config BXT_WC_PMIC_OPREGION + bool "ACPI operation region support for BXT WhiskeyCove PMIC" + depends on INTEL_SOC_PMIC_BXTWC + help + This config adds ACPI operation region support for BXT WhiskeyCove PMIC. 
+ +config CHT_WC_PMIC_OPREGION + bool "ACPI operation region support for CHT Whiskey Cove PMIC" + depends on INTEL_SOC_PMIC_CHTWC + help + This config adds ACPI operation region support for CHT Whiskey Cove PMIC. + +config CHT_DC_TI_PMIC_OPREGION + bool "ACPI operation region support for Dollar Cove TI PMIC" + depends on INTEL_SOC_PMIC_CHTDC_TI + help + This config adds ACPI operation region support for Dollar Cove TI PMIC. + +endif # PMIC_OPREGION + +config TPS68470_PMIC_OPREGION + bool "ACPI operation region support for TPS68470 PMIC" + depends on MFD_TPS68470 + help + This config adds ACPI operation region support for TI TPS68470 PMIC. + TPS68470 device is an advanced power management unit that powers + a Compact Camera Module (CCM), generates clocks for image sensors, + drives a dual LED for flash and incorporates two LED drivers for + general purpose indicators. + This driver enables ACPI operation region support to control voltage + regulators and clocks. + + This option is a bool as it provides an ACPI operation + region, which must be available before any of the devices + using it are probed.
diff --git a/drivers/acpi/pmic/Makefile b/drivers/acpi/pmic/Makefile new file mode 100644 index 000000000000..cd072c64920c --- /dev/null +++ b/drivers/acpi/pmic/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_PMIC_OPREGION) += intel_pmic.o +obj-$(CONFIG_BYTCRC_PMIC_OPREGION) += intel_pmic_bytcrc.o +obj-$(CONFIG_CHTCRC_PMIC_OPREGION) += intel_pmic_chtcrc.o +obj-$(CONFIG_XPOWER_PMIC_OPREGION) += intel_pmic_xpower.o +obj-$(CONFIG_BXT_WC_PMIC_OPREGION) += intel_pmic_bxtwc.o +obj-$(CONFIG_CHT_WC_PMIC_OPREGION) += intel_pmic_chtwc.o +obj-$(CONFIG_CHT_DC_TI_PMIC_OPREGION) += intel_pmic_chtdc_ti.o +obj-$(CONFIG_TPS68470_PMIC_OPREGION) += tps68470_pmic.o diff --git a/drivers/acpi/proc.c b/drivers/acpi/proc.c index 7892980b3ce4..0cca7991f186 100644 --- a/drivers/acpi/proc.c +++ b/drivers/acpi/proc.c @@ -10,15 +10,11 @@ #include "sleep.h" #include "internal.h" -#define _COMPONENT ACPI_SYSTEM_COMPONENT - /* * this file provides support for: * /proc/acpi/wakeup */ -ACPI_MODULE_NAME("sleep") - static int acpi_system_wakeup_device_seq_show(struct seq_file *seq, void *offset) { diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index f32beb7d7882..2ac48cda5b20 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -14,9 +14,6 @@ #include #include -#define _COMPONENT ACPI_PROCESSOR_COMPONENT -ACPI_MODULE_NAME("processor_core"); - static struct acpi_table_madt *get_madt_table(void) { static struct acpi_table_madt *madt; diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c index 41feb88ee92d..6c7d05b37c98 100644 --- a/drivers/acpi/processor_thermal.c +++ b/drivers/acpi/processor_thermal.c @@ -20,8 +20,6 @@ #define PREFIX "ACPI: " #define ACPI_PROCESSOR_CLASS "processor" -#define _COMPONENT ACPI_PROCESSOR_COMPONENT -ACPI_MODULE_NAME("processor_thermal"); #ifdef CONFIG_CPU_FREQ diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 2142f1554761..684c726828e1 100644 --- 
a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -898,8 +898,7 @@ static void acpi_bus_get_wakeup_device_flags(struct acpi_device *device) */ err = acpi_device_sleep_wake(device, 0, 0, 0); if (err) - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "error in _DSW or _PSW evaluation\n")); + pr_debug("error in _DSW or _PSW evaluation\n"); } static void acpi_bus_init_power_state(struct acpi_device *device, int state) diff --git a/drivers/acpi/tiny-power-button.c b/drivers/acpi/tiny-power-button.c index 6273d73c0b59..420e61b8eaae 100644 --- a/drivers/acpi/tiny-power-button.c +++ b/drivers/acpi/tiny-power-button.c @@ -4,7 +4,6 @@ #include #include -ACPI_MODULE_NAME("tiny-power-button"); MODULE_AUTHOR("Josh Triplett"); MODULE_DESCRIPTION("ACPI Tiny Power Button Driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index 2499d7e3c710..3a032afd9d05 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -35,9 +35,6 @@ #include #include -ACPI_MODULE_NAME("video"); -#define _COMPONENT ACPI_VIDEO_COMPONENT - void acpi_video_unregister_backlight(void); static bool backlight_notifier_registered; @@ -282,6 +279,15 @@ static const struct dmi_system_id video_detect_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "530U4E/540U4E"), }, }, + /* https://bugs.launchpad.net/bugs/1894667 */ + { + .callback = video_detect_force_video, + .ident = "HP 635 Notebook", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP 635 Notebook PC"), + }, + }, /* Non win8 machines which need native backlight nevertheless */ { diff --git a/drivers/acpi/wakeup.c b/drivers/acpi/wakeup.c index 0b2e42530adf..f89dd9a99e6e 100644 --- a/drivers/acpi/wakeup.c +++ b/drivers/acpi/wakeup.c @@ -26,8 +26,6 @@ static DEFINE_MUTEX(acpi_wakeup_handler_mutex); * suspend/resume and isn't really required as this is called in S-state. 
At * that time, there is no device hotplug **/ -#define _COMPONENT ACPI_SYSTEM_COMPONENT -ACPI_MODULE_NAME("wakeup_devices") /** * acpi_enable_wakeup_devices - Enable wake-up device GPEs. diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index a7bce08a11e2..13316122a990 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -22,18 +22,27 @@ #include #include -__weak bool arch_freq_counters_available(struct cpumask *cpus) +bool topology_scale_freq_invariant(void) +{ + return cpufreq_supports_freq_invariance() || + arch_freq_counters_available(cpu_online_mask); +} + +__weak bool arch_freq_counters_available(const struct cpumask *cpus) { return false; } DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; -void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, - unsigned long max_freq) +void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, + unsigned long max_freq) { unsigned long scale; int i; + if (WARN_ON_ONCE(!cur_freq || !max_freq)) + return; + /* * If the use of counters for FIE is enabled, just return as we don't * want to update the scale factor with information from CPUFREQ. 
diff --git a/drivers/base/node.c b/drivers/base/node.c index 50af16e68d98..5ca6f8905db2 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -1005,6 +1005,8 @@ static struct node_attr node_state_attr[] = { #endif [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY), [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), + [N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator, + N_GENERIC_INITIATOR), }; static struct attribute *node_state_attrs[] = { @@ -1016,6 +1018,7 @@ static struct attribute *node_state_attrs[] = { #endif &node_state_attr[N_MEMORY].attr.attr, &node_state_attr[N_CPU].attr.attr, + &node_state_attr[N_GENERIC_INITIATOR].attr.attr, NULL }; diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 2cb5e04cf86c..05bb4d4401b2 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -123,7 +123,7 @@ static const struct genpd_lock_ops genpd_spin_ops = { #define genpd_lock_interruptible(p) p->lock_ops->lock_interruptible(p) #define genpd_unlock(p) p->lock_ops->unlock(p) -#define genpd_status_on(genpd) (genpd->status == GPD_STATE_ACTIVE) +#define genpd_status_on(genpd) (genpd->status == GENPD_STATE_ON) #define genpd_is_irq_safe(genpd) (genpd->flags & GENPD_FLAG_IRQ_SAFE) #define genpd_is_always_on(genpd) (genpd->flags & GENPD_FLAG_ALWAYS_ON) #define genpd_is_active_wakeup(genpd) (genpd->flags & GENPD_FLAG_ACTIVE_WAKEUP) @@ -222,7 +222,7 @@ static void genpd_update_accounting(struct generic_pm_domain *genpd) * out of off and so update the idle time and vice * versa. 
*/ - if (genpd->status == GPD_STATE_ACTIVE) { + if (genpd->status == GENPD_STATE_ON) { int state_idx = genpd->state_idx; genpd->states[state_idx].idle_time = @@ -497,6 +497,7 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on, struct pm_domain_data *pdd; struct gpd_link *link; unsigned int not_suspended = 0; + int ret; /* * Do not try to power off the domain in the following situations: @@ -544,26 +545,15 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on, if (!genpd->gov) genpd->state_idx = 0; - if (genpd->power_off) { - int ret; + /* Don't power off, if a child domain is waiting to power on. */ + if (atomic_read(&genpd->sd_count) > 0) + return -EBUSY; - if (atomic_read(&genpd->sd_count) > 0) - return -EBUSY; + ret = _genpd_power_off(genpd, true); + if (ret) + return ret; - /* - * If sd_count > 0 at this point, one of the subdomains hasn't - * managed to call genpd_power_on() for the parent yet after - * incrementing it. In that case genpd_power_on() will wait - * for us to drop the lock, so we can call .power_off() and let - * the genpd_power_on() restore power for us (this shouldn't - * happen very often). 
- */ - ret = _genpd_power_off(genpd, true); - if (ret) - return ret; - } - - genpd->status = GPD_STATE_POWER_OFF; + genpd->status = GENPD_STATE_OFF; genpd_update_accounting(genpd); list_for_each_entry(link, &genpd->child_links, child_node) { @@ -616,7 +606,7 @@ static int genpd_power_on(struct generic_pm_domain *genpd, unsigned int depth) if (ret) goto err; - genpd->status = GPD_STATE_ACTIVE; + genpd->status = GENPD_STATE_ON; genpd_update_accounting(genpd); return 0; @@ -961,7 +951,7 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock, if (_genpd_power_off(genpd, false)) return; - genpd->status = GPD_STATE_POWER_OFF; + genpd->status = GENPD_STATE_OFF; list_for_each_entry(link, &genpd->child_links, child_node) { genpd_sd_counter_dec(link->parent); @@ -1007,8 +997,7 @@ static void genpd_sync_power_on(struct generic_pm_domain *genpd, bool use_lock, } _genpd_power_on(genpd, false); - - genpd->status = GPD_STATE_ACTIVE; + genpd->status = GENPD_STATE_ON; } /** @@ -1287,7 +1276,7 @@ static int genpd_restore_noirq(struct device *dev) * so make it appear as powered off to genpd_sync_power_on(), * so that it tries to power it on in case it was really off. */ - genpd->status = GPD_STATE_POWER_OFF; + genpd->status = GENPD_STATE_OFF; genpd_sync_power_on(genpd, true, 0); genpd_unlock(genpd); @@ -1777,7 +1766,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd, genpd->gov = gov; INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn); atomic_set(&genpd->sd_count, 0); - genpd->status = is_off ? GPD_STATE_POWER_OFF : GPD_STATE_ACTIVE; + genpd->status = is_off ? 
GENPD_STATE_OFF : GENPD_STATE_ON; genpd->device_count = 0; genpd->max_off_time_ns = -1; genpd->max_off_time_changed = true; @@ -2044,8 +2033,9 @@ int of_genpd_add_provider_simple(struct device_node *np, if (genpd->set_performance_state) { ret = dev_pm_opp_of_add_table(&genpd->dev); if (ret) { - dev_err(&genpd->dev, "Failed to add OPP table: %d\n", - ret); + if (ret != -EPROBE_DEFER) + dev_err(&genpd->dev, "Failed to add OPP table: %d\n", + ret); goto unlock; } @@ -2054,7 +2044,7 @@ int of_genpd_add_provider_simple(struct device_node *np, * state. */ genpd->opp_table = dev_pm_opp_get_opp_table(&genpd->dev); - WARN_ON(!genpd->opp_table); + WARN_ON(IS_ERR(genpd->opp_table)); } ret = genpd_add_provider(np, genpd_xlate_simple, genpd); @@ -2111,8 +2101,9 @@ int of_genpd_add_provider_onecell(struct device_node *np, if (genpd->set_performance_state) { ret = dev_pm_opp_of_add_table_indexed(&genpd->dev, i); if (ret) { - dev_err(&genpd->dev, "Failed to add OPP table for index %d: %d\n", - i, ret); + if (ret != -EPROBE_DEFER) + dev_err(&genpd->dev, "Failed to add OPP table for index %d: %d\n", + i, ret); goto error; } @@ -2121,7 +2112,7 @@ int of_genpd_add_provider_onecell(struct device_node *np, * performance state. 
*/ genpd->opp_table = dev_pm_opp_get_opp_table_indexed(&genpd->dev, i); - WARN_ON(!genpd->opp_table); + WARN_ON(IS_ERR(genpd->opp_table)); } genpd->provider = &np->fwnode; @@ -2802,8 +2793,8 @@ static int genpd_summary_one(struct seq_file *s, struct generic_pm_domain *genpd) { static const char * const status_lookup[] = { - [GPD_STATE_ACTIVE] = "on", - [GPD_STATE_POWER_OFF] = "off" + [GENPD_STATE_ON] = "on", + [GENPD_STATE_OFF] = "off" }; struct pm_domain_data *pm_data; const char *kobj_path; @@ -2881,8 +2872,8 @@ static int summary_show(struct seq_file *s, void *data) static int status_show(struct seq_file *s, void *data) { static const char * const status_lookup[] = { - [GPD_STATE_ACTIVE] = "on", - [GPD_STATE_POWER_OFF] = "off" + [GENPD_STATE_ON] = "on", + [GENPD_STATE_OFF] = "off" }; struct generic_pm_domain *genpd = s->private; @@ -2895,7 +2886,7 @@ static int status_show(struct seq_file *s, void *data) if (WARN_ON_ONCE(genpd->status >= ARRAY_SIZE(status_lookup))) goto exit; - if (genpd->status == GPD_STATE_POWER_OFF) + if (genpd->status == GENPD_STATE_OFF) seq_printf(s, "%s-%u\n", status_lookup[genpd->status], genpd->state_idx); else @@ -2938,7 +2929,7 @@ static int idle_states_show(struct seq_file *s, void *data) ktime_t delta = 0; s64 msecs; - if ((genpd->status == GPD_STATE_POWER_OFF) && + if ((genpd->status == GENPD_STATE_OFF) && (genpd->state_idx == i)) delta = ktime_sub(ktime_get(), genpd->accounting_time); @@ -2961,7 +2952,7 @@ static int active_time_show(struct seq_file *s, void *data) if (ret) return -ERESTARTSYS; - if (genpd->status == GPD_STATE_ACTIVE) + if (genpd->status == GENPD_STATE_ON) delta = ktime_sub(ktime_get(), genpd->accounting_time); seq_printf(s, "%lld ms\n", ktime_to_ms( @@ -2984,7 +2975,7 @@ static int total_idle_time_show(struct seq_file *s, void *data) for (i = 0; i < genpd->state_count; i++) { - if ((genpd->status == GPD_STATE_POWER_OFF) && + if ((genpd->status == GENPD_STATE_OFF) && (genpd->state_idx == i)) delta = 
ktime_sub(ktime_get(), genpd->accounting_time); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 8143210a5c54..6f605f7820bb 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -291,8 +291,7 @@ static int rpm_get_suppliers(struct device *dev) device_links_read_lock_held()) { int retval; - if (!(link->flags & DL_FLAG_PM_RUNTIME) || - READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND) + if (!(link->flags & DL_FLAG_PM_RUNTIME)) continue; retval = pm_runtime_get_sync(link->supplier); @@ -312,8 +311,6 @@ static void rpm_put_suppliers(struct device *dev) list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { - if (READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND) - continue; while (refcount_dec_not_one(&link->rpm_active)) pm_runtime_put(link->supplier); diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 09aa44cb8a91..ba04cb381cd3 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -341,7 +341,7 @@ static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) return read_hv_clock_tsc(); } -static u64 read_hv_sched_clock_tsc(void) +static u64 notrace read_hv_sched_clock_tsc(void) { return (read_hv_clock_tsc() - hv_sched_clock_offset) * (NSEC_PER_SEC / HV_CLOCK_HZ); @@ -404,7 +404,7 @@ static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) return read_hv_clock_msr(); } -static u64 read_hv_sched_clock_msr(void) +static u64 notrace read_hv_sched_clock_msr(void) { return (read_hv_clock_msr() - hv_sched_clock_offset) * (NSEC_PER_SEC / HV_CLOCK_HZ); diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index cb72fb507d57..bf5830eb664f 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -283,7 +283,7 @@ config ARM_SPEAR_CPUFREQ config ARM_STI_CPUFREQ tristate "STi CPUFreq support" - depends on SOC_STIH407 + depends on CPUFREQ_DT && SOC_STIH407 
help This driver uses the generic OPP framework to match the running platform with a predefined set of suitable values. If not provided diff --git a/drivers/cpufreq/armada-37xx-cpufreq.c b/drivers/cpufreq/armada-37xx-cpufreq.c index df1c941260d1..b4af4094309b 100644 --- a/drivers/cpufreq/armada-37xx-cpufreq.c +++ b/drivers/cpufreq/armada-37xx-cpufreq.c @@ -484,6 +484,12 @@ remove_opp: /* late_initcall, to guarantee the driver is loaded after A37xx clock driver */ late_initcall(armada37xx_cpufreq_driver_init); +static const struct of_device_id __maybe_unused armada37xx_cpufreq_of_match[] = { + { .compatible = "marvell,armada-3700-nb-pm" }, + { }, +}; +MODULE_DEVICE_TABLE(of, armada37xx_cpufreq_of_match); + MODULE_AUTHOR("Gregory CLEMENT "); MODULE_DESCRIPTION("Armada 37xx cpufreq driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 7d01df7bfa6c..3776d960f405 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -137,6 +137,7 @@ static const struct of_device_id blacklist[] __initconst = { { .compatible = "st,stih407", }, { .compatible = "st,stih410", }, + { .compatible = "st,stih418", }, { .compatible = "sigma,tango4", }, diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 944d7b45afe9..e363ae04aac6 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -24,32 +25,41 @@ #include "cpufreq-dt.h" struct private_data { - struct opp_table *opp_table; + struct list_head node; + + cpumask_var_t cpus; struct device *cpu_dev; - const char *reg_name; + struct opp_table *opp_table; + struct opp_table *reg_opp_table; bool have_static_opps; }; +static LIST_HEAD(priv_list); + static struct freq_attr *cpufreq_dt_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, NULL, /* Extra space for boost-attr if required */ NULL, }; +static 
struct private_data *cpufreq_dt_find_data(int cpu) +{ + struct private_data *priv; + + list_for_each_entry(priv, &priv_list, node) { + if (cpumask_test_cpu(cpu, priv->cpus)) + return priv; + } + + return NULL; +} + static int set_target(struct cpufreq_policy *policy, unsigned int index) { struct private_data *priv = policy->driver_data; unsigned long freq = policy->freq_table[index].frequency; - int ret; - ret = dev_pm_opp_set_rate(priv->cpu_dev, freq * 1000); - - if (!ret) { - arch_set_freq_scale(policy->related_cpus, freq, - policy->cpuinfo.max_freq); - } - - return ret; + return dev_pm_opp_set_rate(priv->cpu_dev, freq * 1000); } /* @@ -90,83 +100,24 @@ node_put: return name; } -static int resources_available(void) -{ - struct device *cpu_dev; - struct regulator *cpu_reg; - struct clk *cpu_clk; - int ret = 0; - const char *name; - - cpu_dev = get_cpu_device(0); - if (!cpu_dev) { - pr_err("failed to get cpu0 device\n"); - return -ENODEV; - } - - cpu_clk = clk_get(cpu_dev, NULL); - ret = PTR_ERR_OR_ZERO(cpu_clk); - if (ret) { - /* - * If cpu's clk node is present, but clock is not yet - * registered, we should try defering probe. - */ - if (ret == -EPROBE_DEFER) - dev_dbg(cpu_dev, "clock not ready, retry\n"); - else - dev_err(cpu_dev, "failed to get clock: %d\n", ret); - - return ret; - } - - clk_put(cpu_clk); - - ret = dev_pm_opp_of_find_icc_paths(cpu_dev, NULL); - if (ret) - return ret; - - name = find_supply_name(cpu_dev); - /* Platform doesn't require regulator */ - if (!name) - return 0; - - cpu_reg = regulator_get_optional(cpu_dev, name); - ret = PTR_ERR_OR_ZERO(cpu_reg); - if (ret) { - /* - * If cpu's regulator supply node is present, but regulator is - * not yet registered, we should try defering probe. 
- */ - if (ret == -EPROBE_DEFER) - dev_dbg(cpu_dev, "cpu0 regulator not ready, retry\n"); - else - dev_dbg(cpu_dev, "no regulator for cpu0: %d\n", ret); - - return ret; - } - - regulator_put(cpu_reg); - return 0; -} - static int cpufreq_init(struct cpufreq_policy *policy) { struct cpufreq_frequency_table *freq_table; - struct opp_table *opp_table = NULL; struct private_data *priv; struct device *cpu_dev; struct clk *cpu_clk; unsigned int transition_latency; - bool fallback = false; - const char *name; int ret; - cpu_dev = get_cpu_device(policy->cpu); - if (!cpu_dev) { - pr_err("failed to get cpu%d device\n", policy->cpu); + priv = cpufreq_dt_find_data(policy->cpu); + if (!priv) { + pr_err("failed to find data for cpu%d\n", policy->cpu); return -ENODEV; } + cpu_dev = priv->cpu_dev; + cpumask_copy(policy->cpus, priv->cpus); + cpu_clk = clk_get(cpu_dev, NULL); if (IS_ERR(cpu_clk)) { ret = PTR_ERR(cpu_clk); @@ -174,45 +125,6 @@ static int cpufreq_init(struct cpufreq_policy *policy) return ret; } - /* Get OPP-sharing information from "operating-points-v2" bindings */ - ret = dev_pm_opp_of_get_sharing_cpus(cpu_dev, policy->cpus); - if (ret) { - if (ret != -ENOENT) - goto out_put_clk; - - /* - * operating-points-v2 not supported, fallback to old method of - * finding shared-OPPs for backward compatibility if the - * platform hasn't set sharing CPUs. - */ - if (dev_pm_opp_get_sharing_cpus(cpu_dev, policy->cpus)) - fallback = true; - } - - /* - * OPP layer will be taking care of regulators now, but it needs to know - * the name of the regulator first. 
- */ - name = find_supply_name(cpu_dev); - if (name) { - opp_table = dev_pm_opp_set_regulators(cpu_dev, &name, 1); - if (IS_ERR(opp_table)) { - ret = PTR_ERR(opp_table); - dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n", - policy->cpu, ret); - goto out_put_clk; - } - } - - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (!priv) { - ret = -ENOMEM; - goto out_put_regulator; - } - - priv->reg_name = name; - priv->opp_table = opp_table; - /* * Initialize OPP tables for all policy->cpus. They will be shared by * all CPUs which have marked their CPUs shared with OPP bindings. @@ -232,31 +144,17 @@ static int cpufreq_init(struct cpufreq_policy *policy) */ ret = dev_pm_opp_get_opp_count(cpu_dev); if (ret <= 0) { - dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n"); - ret = -EPROBE_DEFER; + dev_err(cpu_dev, "OPP table can't be empty\n"); + ret = -ENODEV; goto out_free_opp; } - if (fallback) { - cpumask_setall(policy->cpus); - - /* - * OPP tables are initialized only for policy->cpu, do it for - * others as well. 
- */ - ret = dev_pm_opp_set_sharing_cpus(cpu_dev, policy->cpus); - if (ret) - dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n", - __func__, ret); - } - ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table); if (ret) { dev_err(cpu_dev, "failed to init cpufreq table: %d\n", ret); goto out_free_opp; } - priv->cpu_dev = cpu_dev; policy->driver_data = priv; policy->clk = cpu_clk; policy->freq_table = freq_table; @@ -288,11 +186,6 @@ out_free_cpufreq_table: out_free_opp: if (priv->have_static_opps) dev_pm_opp_of_cpumask_remove_table(policy->cpus); - kfree(priv); -out_put_regulator: - if (name) - dev_pm_opp_put_regulators(opp_table); -out_put_clk: clk_put(cpu_clk); return ret; @@ -320,12 +213,7 @@ static int cpufreq_exit(struct cpufreq_policy *policy) dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table); if (priv->have_static_opps) dev_pm_opp_of_cpumask_remove_table(policy->related_cpus); - if (priv->reg_name) - dev_pm_opp_put_regulators(priv->opp_table); - clk_put(policy->clk); - kfree(priv); - return 0; } @@ -344,21 +232,119 @@ static struct cpufreq_driver dt_cpufreq_driver = { .suspend = cpufreq_generic_suspend, }; +static int dt_cpufreq_early_init(struct device *dev, int cpu) +{ + struct private_data *priv; + struct device *cpu_dev; + const char *reg_name; + int ret; + + /* Check if this CPU is already covered by some other policy */ + if (cpufreq_dt_find_data(cpu)) + return 0; + + cpu_dev = get_cpu_device(cpu); + if (!cpu_dev) + return -EPROBE_DEFER; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + if (!alloc_cpumask_var(&priv->cpus, GFP_KERNEL)) + return -ENOMEM; + + priv->cpu_dev = cpu_dev; + + /* Try to get OPP table early to ensure resources are available */ + priv->opp_table = dev_pm_opp_get_opp_table(cpu_dev); + if (IS_ERR(priv->opp_table)) { + ret = PTR_ERR(priv->opp_table); + if (ret != -EPROBE_DEFER) + dev_err(cpu_dev, "failed to get OPP table: %d\n", ret); + goto free_cpumask; + } + + /* 
+ * OPP layer will be taking care of regulators now, but it needs to know + * the name of the regulator first. + */ + reg_name = find_supply_name(cpu_dev); + if (reg_name) { + priv->reg_opp_table = dev_pm_opp_set_regulators(cpu_dev, + &reg_name, 1); + if (IS_ERR(priv->reg_opp_table)) { + ret = PTR_ERR(priv->reg_opp_table); + if (ret != -EPROBE_DEFER) + dev_err(cpu_dev, "failed to set regulators: %d\n", + ret); + goto put_table; + } + } + + /* Find OPP sharing information so we can fill priv->cpus here */ + /* Get OPP-sharing information from "operating-points-v2" bindings */ + ret = dev_pm_opp_of_get_sharing_cpus(cpu_dev, priv->cpus); + if (ret) { + if (ret != -ENOENT) + goto put_reg; + + /* + * operating-points-v2 not supported, fallback to all CPUs share + * OPP for backward compatibility if the platform hasn't set + * sharing CPUs. + */ + if (dev_pm_opp_get_sharing_cpus(cpu_dev, priv->cpus)) { + cpumask_setall(priv->cpus); + + /* + * OPP tables are initialized only for cpu, do it for + * others as well. 
+ */ + ret = dev_pm_opp_set_sharing_cpus(cpu_dev, priv->cpus); + if (ret) + dev_err(cpu_dev, "%s: failed to mark OPPs as shared: %d\n", + __func__, ret); + } + } + + list_add(&priv->node, &priv_list); + return 0; + +put_reg: + if (priv->reg_opp_table) + dev_pm_opp_put_regulators(priv->reg_opp_table); +put_table: + dev_pm_opp_put_opp_table(priv->opp_table); +free_cpumask: + free_cpumask_var(priv->cpus); + return ret; +} + +static void dt_cpufreq_release(void) +{ + struct private_data *priv, *tmp; + + list_for_each_entry_safe(priv, tmp, &priv_list, node) { + if (priv->reg_opp_table) + dev_pm_opp_put_regulators(priv->reg_opp_table); + dev_pm_opp_put_opp_table(priv->opp_table); + free_cpumask_var(priv->cpus); + list_del(&priv->node); + } +} + static int dt_cpufreq_probe(struct platform_device *pdev) { struct cpufreq_dt_platform_data *data = dev_get_platdata(&pdev->dev); - int ret; + int ret, cpu; - /* - * All per-cluster (CPUs sharing clock/voltages) initialization is done - * from ->init(). In probe(), we just need to make sure that clk and - * regulators are available. Else defer probe and retry. - * - * FIXME: Is checking this only for CPU0 sufficient ? 
- */ - ret = resources_available(); - if (ret) - return ret; + /* Request resources early so we can return in case of -EPROBE_DEFER */ + for_each_possible_cpu(cpu) { + ret = dt_cpufreq_early_init(&pdev->dev, cpu); + if (ret) + goto err; + } if (data) { if (data->have_governor_per_policy) @@ -374,15 +360,21 @@ static int dt_cpufreq_probe(struct platform_device *pdev) } ret = cpufreq_register_driver(&dt_cpufreq_driver); - if (ret) + if (ret) { dev_err(&pdev->dev, "failed register driver: %d\n", ret); + goto err; + } + return 0; +err: + dt_cpufreq_release(); return ret; } static int dt_cpufreq_remove(struct platform_device *pdev) { cpufreq_unregister_driver(&dt_cpufreq_driver); + dt_cpufreq_release(); return 0; } diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 9160c8222b8a..fd6205994401 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -62,6 +62,12 @@ static struct cpufreq_driver *cpufreq_driver; static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data); static DEFINE_RWLOCK(cpufreq_driver_lock); +static DEFINE_STATIC_KEY_FALSE(cpufreq_freq_invariance); +bool cpufreq_supports_freq_invariance(void) +{ + return static_branch_likely(&cpufreq_freq_invariance); +} + /* Flag to suspend/resume CPUFreq governors */ static bool cpufreq_suspended; @@ -155,12 +161,6 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy) } EXPORT_SYMBOL_GPL(get_cpu_idle_time); -__weak void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, - unsigned long max_freq) -{ -} -EXPORT_SYMBOL_GPL(arch_set_freq_scale); - /* * This is a generic cpufreq init() routine which can be used by cpufreq * drivers of SMP systems. 
It will do following: @@ -448,6 +448,10 @@ void cpufreq_freq_transition_end(struct cpufreq_policy *policy, cpufreq_notify_post_transition(policy, freqs, transition_failed); + arch_set_freq_scale(policy->related_cpus, + policy->cur, + policy->cpuinfo.max_freq); + policy->transition_ongoing = false; policy->transition_task = NULL; @@ -2059,15 +2063,26 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier); unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq) { - int ret; + unsigned int freq; + int cpu; target_freq = clamp_val(target_freq, policy->min, policy->max); + freq = cpufreq_driver->fast_switch(policy, target_freq); - ret = cpufreq_driver->fast_switch(policy, target_freq); - if (ret) - cpufreq_times_record_transition(policy, ret); + if (!freq) + return 0; - return ret; + policy->cur = freq; + arch_set_freq_scale(policy->related_cpus, freq, + policy->cpuinfo.max_freq); + cpufreq_stats_record_transition(policy, freq); + + if (trace_cpu_frequency_enabled()) { + for_each_cpu(cpu, policy->cpus) + trace_cpu_frequency(freq, cpu); + } + + return freq; } EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); @@ -2718,6 +2733,15 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) cpufreq_driver = driver_data; write_unlock_irqrestore(&cpufreq_driver_lock, flags); + /* + * Mark support for the scheduler's frequency invariance engine for + * drivers that implement target(), target_index() or fast_switch(). 
+ */ + if (!cpufreq_driver->setpolicy) { + static_branch_enable_cpuslocked(&cpufreq_freq_invariance); + pr_debug("supports frequency invariance"); + } + if (driver_data->setpolicy) driver_data->flags |= CPUFREQ_CONST_LOOPS; @@ -2787,6 +2811,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) cpus_read_lock(); subsys_interface_unregister(&cpufreq_interface); remove_boost_sysfs_file(); + static_branch_disable_cpuslocked(&cpufreq_freq_invariance); cpuhp_remove_state_nocalls_cpuslocked(hp_online); write_lock_irqsave(&cpufreq_driver_lock, flags); diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 94d959a8e954..6cd5c8ab5d49 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -19,64 +19,104 @@ struct cpufreq_stats { unsigned int state_num; unsigned int last_index; u64 *time_in_state; - spinlock_t lock; unsigned int *freq_table; unsigned int *trans_table; + + /* Deferred reset */ + unsigned int reset_pending; + unsigned long long reset_time; }; -static void cpufreq_stats_update(struct cpufreq_stats *stats) +static void cpufreq_stats_update(struct cpufreq_stats *stats, + unsigned long long time) { unsigned long long cur_time = get_jiffies_64(); - stats->time_in_state[stats->last_index] += cur_time - stats->last_time; + stats->time_in_state[stats->last_index] += cur_time - time; stats->last_time = cur_time; } -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) +static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) { unsigned int count = stats->max_state; - spin_lock(&stats->lock); memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); stats->last_time = get_jiffies_64(); stats->total_trans = 0; - spin_unlock(&stats->lock); + + /* Adjust for the time elapsed since reset was requested */ + WRITE_ONCE(stats->reset_pending, 0); + /* + * Prevent the reset_time read from being reordered before the + * 
reset_pending accesses in cpufreq_stats_record_transition(). + */ + smp_rmb(); + cpufreq_stats_update(stats, READ_ONCE(stats->reset_time)); } static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) { - return sprintf(buf, "%d\n", policy->stats->total_trans); + struct cpufreq_stats *stats = policy->stats; + + if (READ_ONCE(stats->reset_pending)) + return sprintf(buf, "%d\n", 0); + else + return sprintf(buf, "%u\n", stats->total_trans); } cpufreq_freq_attr_ro(total_trans); static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) { struct cpufreq_stats *stats = policy->stats; + bool pending = READ_ONCE(stats->reset_pending); + unsigned long long time; ssize_t len = 0; int i; - if (policy->fast_switch_enabled) - return 0; - - spin_lock(&stats->lock); - cpufreq_stats_update(stats); - spin_unlock(&stats->lock); - for (i = 0; i < stats->state_num; i++) { + if (pending) { + if (i == stats->last_index) { + /* + * Prevent the reset_time read from occurring + * before the reset_pending read above. + */ + smp_rmb(); + time = get_jiffies_64() - READ_ONCE(stats->reset_time); + } else { + time = 0; + } + } else { + time = stats->time_in_state[i]; + if (i == stats->last_index) + time += get_jiffies_64() - stats->last_time; + } + len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], - (unsigned long long) - jiffies_64_to_clock_t(stats->time_in_state[i])); + jiffies_64_to_clock_t(time)); } return len; } cpufreq_freq_attr_ro(time_in_state); +/* We don't care what is written to the attribute */ static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, size_t count) { - /* We don't care what is written to the attribute. */ - cpufreq_stats_clear_table(policy->stats); + struct cpufreq_stats *stats = policy->stats; + + /* + * Defer resetting of stats to cpufreq_stats_record_transition() to + * avoid races. 
+ */ + WRITE_ONCE(stats->reset_time, get_jiffies_64()); + /* + * The memory barrier below is to prevent the readers of reset_time from + * seeing a stale or partially updated value. + */ + smp_wmb(); + WRITE_ONCE(stats->reset_pending, 1); + return count; } cpufreq_freq_attr_wo(reset); @@ -84,11 +124,9 @@ cpufreq_freq_attr_wo(reset); static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { struct cpufreq_stats *stats = policy->stats; + bool pending = READ_ONCE(stats->reset_pending); ssize_t len = 0; - int i, j; - - if (policy->fast_switch_enabled) - return 0; + int i, j, count; len += scnprintf(buf + len, PAGE_SIZE - len, " From : To\n"); len += scnprintf(buf + len, PAGE_SIZE - len, " : "); @@ -113,8 +151,13 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) for (j = 0; j < stats->state_num; j++) { if (len >= PAGE_SIZE) break; - len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", - stats->trans_table[i*stats->max_state+j]); + + if (pending) + count = 0; + else + count = stats->trans_table[i * stats->max_state + j]; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", count); } if (len >= PAGE_SIZE) break; @@ -208,7 +251,6 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->state_num = i; stats->last_time = get_jiffies_64(); stats->last_index = freq_table_get_index(stats, policy->cur); - spin_lock_init(&stats->lock); policy->stats = stats; ret = sysfs_create_group(&policy->kobj, &stats_attr_group); @@ -228,23 +270,22 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, struct cpufreq_stats *stats = policy->stats; int old_index, new_index; - if (!stats) { - pr_debug("%s: No stats found\n", __func__); + if (unlikely(!stats)) return; - } + + if (unlikely(READ_ONCE(stats->reset_pending))) + cpufreq_stats_reset_table(stats); old_index = stats->last_index; new_index = freq_table_get_index(stats, new_freq); /* We can't do stats->time_in_state[-1]= .. 
*/ - if (old_index == -1 || new_index == -1 || old_index == new_index) + if (unlikely(old_index == -1 || new_index == -1 || old_index == new_index)) return; - spin_lock(&stats->lock); - cpufreq_stats_update(stats); + cpufreq_stats_update(stats, stats->last_time); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; stats->total_trans++; - spin_unlock(&stats->lock); } diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index ef7b34c1fd2b..5bf5fc759881 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -48,7 +48,6 @@ static struct clk_bulk_data clks[] = { }; static struct device *cpu_dev; -static bool free_opp; static struct cpufreq_frequency_table *freq_table; static unsigned int max_freq; static unsigned int transition_latency; @@ -390,9 +389,6 @@ static int imx6q_cpufreq_probe(struct platform_device *pdev) goto put_reg; } - /* Because we have added the OPPs here, we must free them */ - free_opp = true; - if (of_machine_is_compatible("fsl,imx6ul") || of_machine_is_compatible("fsl,imx6ull")) { ret = imx6ul_opp_check_speed_grading(cpu_dev); @@ -507,8 +503,7 @@ soc_opp_out: free_freq_table: dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table); out_free_opp: - if (free_opp) - dev_pm_opp_of_remove_table(cpu_dev); + dev_pm_opp_of_remove_table(cpu_dev); put_reg: if (!IS_ERR(arm_reg)) regulator_put(arm_reg); @@ -528,8 +523,7 @@ static int imx6q_cpufreq_remove(struct platform_device *pdev) { cpufreq_unregister_driver(&imx6q_cpufreq_driver); dev_pm_opp_free_cpufreq_table(cpu_dev, &freq_table); - if (free_opp) - dev_pm_opp_of_remove_table(cpu_dev); + dev_pm_opp_of_remove_table(cpu_dev); regulator_put(arm_reg); if (!IS_ERR(pu_reg)) regulator_put(pu_reg); diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 3fb044b907a8..9ed5341dc515 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -19,18 +19,23 @@ 
#define LUT_L_VAL GENMASK(7, 0) #define LUT_CORE_COUNT GENMASK(18, 16) #define LUT_VOLT GENMASK(11, 0) -#define LUT_ROW_SIZE 32 #define CLK_HW_DIV 2 #define LUT_TURBO_IND 1 -/* Register offsets */ -#define REG_ENABLE 0x0 -#define REG_FREQ_LUT 0x110 -#define REG_VOLT_LUT 0x114 -#define REG_PERF_STATE 0x920 +struct qcom_cpufreq_soc_data { + u32 reg_enable; + u32 reg_freq_lut; + u32 reg_volt_lut; + u32 reg_perf_state; + u8 lut_row_size; +}; + +struct qcom_cpufreq_data { + void __iomem *base; + const struct qcom_cpufreq_soc_data *soc_data; +}; static unsigned long cpu_hw_rate, xo_rate; -static struct platform_device *global_pdev; static bool icc_scaling_enabled; static int qcom_cpufreq_set_bw(struct cpufreq_policy *policy, @@ -77,22 +82,22 @@ static int qcom_cpufreq_update_opp(struct device *cpu_dev, static int qcom_cpufreq_hw_target_index(struct cpufreq_policy *policy, unsigned int index) { - void __iomem *perf_state_reg = policy->driver_data; + struct qcom_cpufreq_data *data = policy->driver_data; + const struct qcom_cpufreq_soc_data *soc_data = data->soc_data; unsigned long freq = policy->freq_table[index].frequency; - writel_relaxed(index, perf_state_reg); + writel_relaxed(index, data->base + soc_data->reg_perf_state); if (icc_scaling_enabled) qcom_cpufreq_set_bw(policy, freq); - arch_set_freq_scale(policy->related_cpus, freq, - policy->cpuinfo.max_freq); return 0; } static unsigned int qcom_cpufreq_hw_get(unsigned int cpu) { - void __iomem *perf_state_reg; + struct qcom_cpufreq_data *data; + const struct qcom_cpufreq_soc_data *soc_data; struct cpufreq_policy *policy; unsigned int index; @@ -100,9 +105,10 @@ static unsigned int qcom_cpufreq_hw_get(unsigned int cpu) if (!policy) return 0; - perf_state_reg = policy->driver_data; + data = policy->driver_data; + soc_data = data->soc_data; - index = readl_relaxed(perf_state_reg); + index = readl_relaxed(data->base + soc_data->reg_perf_state); index = min(index, LUT_MAX_ENTRIES - 1); return 
policy->freq_table[index].frequency; @@ -111,23 +117,18 @@ static unsigned int qcom_cpufreq_hw_get(unsigned int cpu) static unsigned int qcom_cpufreq_hw_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq) { - void __iomem *perf_state_reg = policy->driver_data; + struct qcom_cpufreq_data *data = policy->driver_data; + const struct qcom_cpufreq_soc_data *soc_data = data->soc_data; unsigned int index; - unsigned long freq; index = policy->cached_resolved_idx; - writel_relaxed(index, perf_state_reg); + writel_relaxed(index, data->base + soc_data->reg_perf_state); - freq = policy->freq_table[index].frequency; - arch_set_freq_scale(policy->related_cpus, freq, - policy->cpuinfo.max_freq); - - return freq; + return policy->freq_table[index].frequency; } static int qcom_cpufreq_hw_read_lut(struct device *cpu_dev, - struct cpufreq_policy *policy, - void __iomem *base) + struct cpufreq_policy *policy) { u32 data, src, lval, i, core_count, prev_freq = 0, freq; u32 volt; @@ -135,6 +136,8 @@ static int qcom_cpufreq_hw_read_lut(struct device *cpu_dev, struct dev_pm_opp *opp; unsigned long rate; int ret; + struct qcom_cpufreq_data *drv_data = policy->driver_data; + const struct qcom_cpufreq_soc_data *soc_data = drv_data->soc_data; table = kcalloc(LUT_MAX_ENTRIES + 1, sizeof(*table), GFP_KERNEL); if (!table) @@ -161,14 +164,14 @@ static int qcom_cpufreq_hw_read_lut(struct device *cpu_dev, } for (i = 0; i < LUT_MAX_ENTRIES; i++) { - data = readl_relaxed(base + REG_FREQ_LUT + - i * LUT_ROW_SIZE); + data = readl_relaxed(drv_data->base + soc_data->reg_freq_lut + + i * soc_data->lut_row_size); src = FIELD_GET(LUT_SRC, data); lval = FIELD_GET(LUT_L_VAL, data); core_count = FIELD_GET(LUT_CORE_COUNT, data); - data = readl_relaxed(base + REG_VOLT_LUT + - i * LUT_ROW_SIZE); + data = readl_relaxed(drv_data->base + soc_data->reg_volt_lut + + i * soc_data->lut_row_size); volt = FIELD_GET(LUT_VOLT, data) * 1000; if (src) @@ -177,10 +180,15 @@ static int 
qcom_cpufreq_hw_read_lut(struct device *cpu_dev, freq = cpu_hw_rate / 1000; if (freq != prev_freq && core_count != LUT_TURBO_IND) { - table[i].frequency = freq; - qcom_cpufreq_update_opp(cpu_dev, freq, volt); - dev_dbg(cpu_dev, "index=%d freq=%d, core_count %d\n", i, + if (!qcom_cpufreq_update_opp(cpu_dev, freq, volt)) { + table[i].frequency = freq; + dev_dbg(cpu_dev, "index=%d freq=%d, core_count %d\n", i, freq, core_count); + } else { + dev_warn(cpu_dev, "failed to update OPP for freq=%d\n", freq); + table[i].frequency = CPUFREQ_ENTRY_INVALID; + } + } else if (core_count == LUT_TURBO_IND) { table[i].frequency = CPUFREQ_ENTRY_INVALID; } @@ -197,9 +205,13 @@ static int qcom_cpufreq_hw_read_lut(struct device *cpu_dev, * as the boost frequency */ if (prev->frequency == CPUFREQ_ENTRY_INVALID) { - prev->frequency = prev_freq; - prev->flags = CPUFREQ_BOOST_FREQ; - qcom_cpufreq_update_opp(cpu_dev, prev_freq, volt); + if (!qcom_cpufreq_update_opp(cpu_dev, prev_freq, volt)) { + prev->frequency = prev_freq; + prev->flags = CPUFREQ_BOOST_FREQ; + } else { + dev_warn(cpu_dev, "failed to update OPP for freq=%d\n", + freq); + } } break; @@ -238,14 +250,38 @@ static void qcom_get_related_cpus(int index, struct cpumask *m) } } +static const struct qcom_cpufreq_soc_data qcom_soc_data = { + .reg_enable = 0x0, + .reg_freq_lut = 0x110, + .reg_volt_lut = 0x114, + .reg_perf_state = 0x920, + .lut_row_size = 32, +}; + +static const struct qcom_cpufreq_soc_data epss_soc_data = { + .reg_enable = 0x0, + .reg_freq_lut = 0x100, + .reg_volt_lut = 0x200, + .reg_perf_state = 0x320, + .lut_row_size = 4, +}; + +static const struct of_device_id qcom_cpufreq_hw_match[] = { + { .compatible = "qcom,cpufreq-hw", .data = &qcom_soc_data }, + { .compatible = "qcom,cpufreq-epss", .data = &epss_soc_data }, + {} +}; +MODULE_DEVICE_TABLE(of, qcom_cpufreq_hw_match); + static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) { - struct device *dev = &global_pdev->dev; + struct platform_device *pdev = 
cpufreq_get_driver_data(); + struct device *dev = &pdev->dev; struct of_phandle_args args; struct device_node *cpu_np; struct device *cpu_dev; - struct resource *res; void __iomem *base; + struct qcom_cpufreq_data *data; int ret, index; cpu_dev = get_cpu_device(policy->cpu); @@ -267,16 +303,21 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) index = args.args[0]; - res = platform_get_resource(global_pdev, IORESOURCE_MEM, index); - if (!res) - return -ENODEV; + base = devm_platform_ioremap_resource(pdev, index); + if (IS_ERR(base)) + return PTR_ERR(base); - base = devm_ioremap(dev, res->start, resource_size(res)); - if (!base) - return -ENOMEM; + data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto error; + } + + data->soc_data = of_device_get_match_data(&pdev->dev); + data->base = base; /* HW should be in enabled state to proceed */ - if (!(readl_relaxed(base + REG_ENABLE) & 0x1)) { + if (!(readl_relaxed(base + data->soc_data->reg_enable) & 0x1)) { dev_err(dev, "Domain-%d cpufreq hardware not enabled\n", index); ret = -ENODEV; goto error; @@ -289,9 +330,9 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) goto error; } - policy->driver_data = base + REG_PERF_STATE; + policy->driver_data = data; - ret = qcom_cpufreq_hw_read_lut(cpu_dev, policy, base); + ret = qcom_cpufreq_hw_read_lut(cpu_dev, policy); if (ret) { dev_err(dev, "Domain-%d failed to read LUT\n", index); goto error; @@ -315,12 +356,13 @@ error: static int qcom_cpufreq_hw_cpu_exit(struct cpufreq_policy *policy) { struct device *cpu_dev = get_cpu_device(policy->cpu); - void __iomem *base = policy->driver_data - REG_PERF_STATE; + struct qcom_cpufreq_data *data = policy->driver_data; + struct platform_device *pdev = cpufreq_get_driver_data(); dev_pm_opp_remove_all_dynamic(cpu_dev); dev_pm_opp_of_cpumask_remove_table(policy->related_cpus); kfree(policy->freq_table); - devm_iounmap(&global_pdev->dev, base); + 
devm_iounmap(&pdev->dev, data->base); return 0; } @@ -365,7 +407,7 @@ static int qcom_cpufreq_hw_driver_probe(struct platform_device *pdev) cpu_hw_rate = clk_get_rate(clk) / CLK_HW_DIV; clk_put(clk); - global_pdev = pdev; + cpufreq_qcom_hw_driver.driver_data = pdev; /* Check for optional interconnect paths on CPU0 */ cpu_dev = get_cpu_device(0); @@ -390,12 +432,6 @@ static int qcom_cpufreq_hw_driver_remove(struct platform_device *pdev) return cpufreq_unregister_driver(&cpufreq_qcom_hw_driver); } -static const struct of_device_id qcom_cpufreq_hw_match[] = { - { .compatible = "qcom,cpufreq-hw" }, - {} -}; -MODULE_DEVICE_TABLE(of, qcom_cpufreq_hw_match); - static struct platform_driver qcom_cpufreq_hw_driver = { .probe = qcom_cpufreq_hw_driver_probe, .remove = qcom_cpufreq_hw_driver_remove, diff --git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c index e84281e2561d..bed496cf8d24 100644 --- a/drivers/cpufreq/s5pv210-cpufreq.c +++ b/drivers/cpufreq/s5pv210-cpufreq.c @@ -590,6 +590,7 @@ static struct notifier_block s5pv210_cpufreq_reboot_notifier = { static int s5pv210_cpufreq_probe(struct platform_device *pdev) { + struct device *dev = &pdev->dev; struct device_node *np; int id, result = 0; @@ -602,28 +603,20 @@ static int s5pv210_cpufreq_probe(struct platform_device *pdev) * cpufreq-dt driver. 
*/ arm_regulator = regulator_get(NULL, "vddarm"); - if (IS_ERR(arm_regulator)) { - if (PTR_ERR(arm_regulator) == -EPROBE_DEFER) - pr_debug("vddarm regulator not ready, defer\n"); - else - pr_err("failed to get regulator vddarm\n"); - return PTR_ERR(arm_regulator); - } + if (IS_ERR(arm_regulator)) + return dev_err_probe(dev, PTR_ERR(arm_regulator), + "failed to get regulator vddarm\n"); int_regulator = regulator_get(NULL, "vddint"); if (IS_ERR(int_regulator)) { - if (PTR_ERR(int_regulator) == -EPROBE_DEFER) - pr_debug("vddint regulator not ready, defer\n"); - else - pr_err("failed to get regulator vddint\n"); - result = PTR_ERR(int_regulator); + result = dev_err_probe(dev, PTR_ERR(int_regulator), + "failed to get regulator vddint\n"); goto err_int_regulator; } np = of_find_compatible_node(NULL, NULL, "samsung,s5pv210-clock"); if (!np) { - pr_err("%s: failed to find clock controller DT node\n", - __func__); + dev_err(dev, "failed to find clock controller DT node\n"); result = -ENODEV; goto err_clock; } @@ -631,7 +624,7 @@ static int s5pv210_cpufreq_probe(struct platform_device *pdev) clk_base = of_iomap(np, 0); of_node_put(np); if (!clk_base) { - pr_err("%s: failed to map clock registers\n", __func__); + dev_err(dev, "failed to map clock registers\n"); result = -EFAULT; goto err_clock; } @@ -639,8 +632,7 @@ static int s5pv210_cpufreq_probe(struct platform_device *pdev) for_each_compatible_node(np, NULL, "samsung,s5pv210-dmc") { id = of_alias_get_id(np, "dmc"); if (id < 0 || id >= ARRAY_SIZE(dmc_base)) { - pr_err("%s: failed to get alias of dmc node '%pOFn'\n", - __func__, np); + dev_err(dev, "failed to get alias of dmc node '%pOFn'\n", np); of_node_put(np); result = id; goto err_clk_base; @@ -648,8 +640,7 @@ static int s5pv210_cpufreq_probe(struct platform_device *pdev) dmc_base[id] = of_iomap(np, 0); if (!dmc_base[id]) { - pr_err("%s: failed to map dmc%d registers\n", - __func__, id); + dev_err(dev, "failed to map dmc%d registers\n", id); of_node_put(np); result = 
-EFAULT; goto err_dmc; @@ -658,7 +649,7 @@ static int s5pv210_cpufreq_probe(struct platform_device *pdev) for (id = 0; id < ARRAY_SIZE(dmc_base); ++id) { if (!dmc_base[id]) { - pr_err("%s: failed to find dmc%d node\n", __func__, id); + dev_err(dev, "failed to find dmc%d node\n", id); result = -ENODEV; goto err_dmc; } diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index fb42e3390377..6dd1311660b5 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -48,16 +48,11 @@ static unsigned int scmi_cpufreq_get_rate(unsigned int cpu) static int scmi_cpufreq_set_target(struct cpufreq_policy *policy, unsigned int index) { - int ret; struct scmi_data *priv = policy->driver_data; struct scmi_perf_ops *perf_ops = handle->perf_ops; u64 freq = policy->freq_table[index].frequency; - ret = perf_ops->freq_set(handle, priv->domain_id, freq * 1000, false); - if (!ret) - arch_set_freq_scale(policy->related_cpus, freq, - policy->cpuinfo.max_freq); - return ret; + return perf_ops->freq_set(handle, priv->domain_id, freq * 1000, false); } static unsigned int scmi_cpufreq_fast_switch(struct cpufreq_policy *policy, @@ -67,11 +62,8 @@ static unsigned int scmi_cpufreq_fast_switch(struct cpufreq_policy *policy, struct scmi_perf_ops *perf_ops = handle->perf_ops; if (!perf_ops->freq_set(handle, priv->domain_id, - target_freq * 1000, true)) { - arch_set_freq_scale(policy->related_cpus, target_freq, - policy->cpuinfo.max_freq); + target_freq * 1000, true)) return target_freq; - } return 0; } diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c index b0f5388b8854..43db05b949d9 100644 --- a/drivers/cpufreq/scpi-cpufreq.c +++ b/drivers/cpufreq/scpi-cpufreq.c @@ -47,9 +47,8 @@ static unsigned int scpi_cpufreq_get_rate(unsigned int cpu) static int scpi_cpufreq_set_target(struct cpufreq_policy *policy, unsigned int index) { - unsigned long freq = policy->freq_table[index].frequency; + u64 rate = 
policy->freq_table[index].frequency * 1000; struct scpi_data *priv = policy->driver_data; - u64 rate = freq * 1000; int ret; ret = clk_set_rate(priv->clk, rate); @@ -60,9 +59,6 @@ scpi_cpufreq_set_target(struct cpufreq_policy *policy, unsigned int index) if (clk_get_rate(priv->clk) != rate) return -EIO; - arch_set_freq_scale(policy->related_cpus, freq, - policy->cpuinfo.max_freq); - return 0; } diff --git a/drivers/cpufreq/sti-cpufreq.c b/drivers/cpufreq/sti-cpufreq.c index a5ad96d29adc..4ac6fb23792a 100644 --- a/drivers/cpufreq/sti-cpufreq.c +++ b/drivers/cpufreq/sti-cpufreq.c @@ -141,7 +141,8 @@ static const struct reg_field sti_stih407_dvfs_regfields[DVFS_MAX_REGFIELDS] = { static const struct reg_field *sti_cpufreq_match(void) { if (of_machine_is_compatible("st,stih407") || - of_machine_is_compatible("st,stih410")) + of_machine_is_compatible("st,stih410") || + of_machine_is_compatible("st,stih418")) return sti_stih407_dvfs_regfields; return NULL; @@ -258,7 +259,8 @@ static int sti_cpufreq_init(void) int ret; if ((!of_machine_is_compatible("st,stih407")) && - (!of_machine_is_compatible("st,stih410"))) + (!of_machine_is_compatible("st,stih410")) && + (!of_machine_is_compatible("st,stih418"))) return -ENODEV; ddata.cpu = get_cpu_device(0); diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c index 01e1f58ba422..4b4079f51559 100644 --- a/drivers/cpufreq/tegra186-cpufreq.c +++ b/drivers/cpufreq/tegra186-cpufreq.c @@ -14,6 +14,7 @@ #define EDVD_CORE_VOLT_FREQ(core) (0x20 + (core) * 0x4) #define EDVD_CORE_VOLT_FREQ_F_SHIFT 0 +#define EDVD_CORE_VOLT_FREQ_F_MASK 0xffff #define EDVD_CORE_VOLT_FREQ_V_SHIFT 16 struct tegra186_cpufreq_cluster_info { @@ -91,10 +92,39 @@ static int tegra186_cpufreq_set_target(struct cpufreq_policy *policy, return 0; } +static unsigned int tegra186_cpufreq_get(unsigned int cpu) +{ + struct cpufreq_frequency_table *tbl; + struct cpufreq_policy *policy; + void __iomem *edvd_reg; + unsigned int i, freq = 0; + u32 
ndiv; + + policy = cpufreq_cpu_get(cpu); + if (!policy) + return 0; + + tbl = policy->freq_table; + edvd_reg = policy->driver_data; + ndiv = readl(edvd_reg) & EDVD_CORE_VOLT_FREQ_F_MASK; + + for (i = 0; tbl[i].frequency != CPUFREQ_TABLE_END; i++) { + if ((tbl[i].driver_data & EDVD_CORE_VOLT_FREQ_F_MASK) == ndiv) { + freq = tbl[i].frequency; + break; + } + } + + cpufreq_cpu_put(policy); + + return freq; +} + static struct cpufreq_driver tegra186_cpufreq_driver = { .name = "tegra186", .flags = CPUFREQ_STICKY | CPUFREQ_HAVE_GOVERNOR_PER_POLICY | CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .get = tegra186_cpufreq_get, .verify = cpufreq_generic_frequency_table_verify, .target_index = tegra186_cpufreq_set_target, .init = tegra186_cpufreq_init, diff --git a/drivers/cpufreq/vexpress-spc-cpufreq.c b/drivers/cpufreq/vexpress-spc-cpufreq.c index 4e8b1dee7c9a..e89b905754d2 100644 --- a/drivers/cpufreq/vexpress-spc-cpufreq.c +++ b/drivers/cpufreq/vexpress-spc-cpufreq.c @@ -182,7 +182,6 @@ static int ve_spc_cpufreq_set_target(struct cpufreq_policy *policy, { u32 cpu = policy->cpu, cur_cluster, new_cluster, actual_cluster; unsigned int freqs_new; - int ret; cur_cluster = cpu_to_cluster(cpu); new_cluster = actual_cluster = per_cpu(physical_cluster, cpu); @@ -197,15 +196,8 @@ static int ve_spc_cpufreq_set_target(struct cpufreq_policy *policy, new_cluster = A15_CLUSTER; } - ret = ve_spc_cpufreq_set_rate(cpu, actual_cluster, new_cluster, - freqs_new); - - if (!ret) { - arch_set_freq_scale(policy->related_cpus, freqs_new, - policy->cpuinfo.max_freq); - } - - return ret; + return ve_spc_cpufreq_set_rate(cpu, actual_cluster, new_cluster, + freqs_new); } static inline u32 get_table_count(struct cpufreq_frequency_table *table) diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c index b6e9649ab0da..4a031c62f92a 100644 --- a/drivers/cpuidle/cpuidle-psci-domain.c +++ b/drivers/cpuidle/cpuidle-psci-domain.c @@ -105,7 +105,7 @@ static void 
psci_pd_free_states(struct genpd_power_state *states, kfree(states); } -static int psci_pd_init(struct device_node *np) +static int psci_pd_init(struct device_node *np, bool use_osi) { struct generic_pm_domain *pd; struct psci_pd_provider *pd_provider; @@ -135,11 +135,16 @@ static int psci_pd_init(struct device_node *np) pd->free_states = psci_pd_free_states; pd->name = kbasename(pd->name); - pd->power_off = psci_pd_power_off; pd->states = states; pd->state_count = state_count; pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN; + /* Allow power off when OSI has been successfully enabled. */ + if (use_osi) + pd->power_off = psci_pd_power_off; + else + pd->flags |= GENPD_FLAG_ALWAYS_ON; + /* Use governor for CPU PM domains if it has some states to manage. */ pd_gov = state_count > 0 ? &pm_domain_cpu_gov : NULL; @@ -190,7 +195,7 @@ static void psci_pd_remove(void) } } -static int psci_pd_init_topology(struct device_node *np, bool add) +static int psci_pd_init_topology(struct device_node *np) { struct device_node *node; struct of_phandle_args child, parent; @@ -203,9 +208,7 @@ static int psci_pd_init_topology(struct device_node *np, bool add) child.np = node; child.args_count = 0; - - ret = add ? 
of_genpd_add_subdomain(&parent, &child) : - of_genpd_remove_subdomain(&parent, &child); + ret = of_genpd_add_subdomain(&parent, &child); of_node_put(parent.np); if (ret) { of_node_put(node); @@ -216,14 +219,20 @@ static int psci_pd_init_topology(struct device_node *np, bool add) return 0; } -static int psci_pd_add_topology(struct device_node *np) +static bool psci_pd_try_set_osi_mode(void) { - return psci_pd_init_topology(np, true); -} + int ret; -static void psci_pd_remove_topology(struct device_node *np) -{ - psci_pd_init_topology(np, false); + if (!psci_has_osi_support()) + return false; + + ret = psci_set_osi_mode(true); + if (ret) { + pr_warn("failed to enable OSI mode: %d\n", ret); + return false; + } + + return true; } static void psci_cpuidle_domain_sync_state(struct device *dev) @@ -244,14 +253,14 @@ static int psci_cpuidle_domain_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct device_node *node; + bool use_osi; int ret = 0, pd_count = 0; if (!np) return -ENODEV; - /* Currently limit the hierarchical topology to be used in OSI mode. */ - if (!psci_has_osi_support()) - return 0; + /* If OSI mode is supported, let's try to enable it. */ + use_osi = psci_pd_try_set_osi_mode(); /* * Parse child nodes for the "#power-domain-cells" property and @@ -261,7 +270,7 @@ static int psci_cpuidle_domain_probe(struct platform_device *pdev) if (!of_find_property(node, "#power-domain-cells", NULL)) continue; - ret = psci_pd_init(node); + ret = psci_pd_init(node, use_osi); if (ret) goto put_node; @@ -270,30 +279,24 @@ static int psci_cpuidle_domain_probe(struct platform_device *pdev) /* Bail out if not using the hierarchical CPU topology. */ if (!pd_count) - return 0; + goto no_pd; /* Link genpd masters/subdomains to model the CPU topology. */ - ret = psci_pd_add_topology(np); + ret = psci_pd_init_topology(np); if (ret) goto remove_pd; - /* Try to enable OSI mode. 
*/ - ret = psci_set_osi_mode(); - if (ret) { - pr_warn("failed to enable OSI mode: %d\n", ret); - psci_pd_remove_topology(np); - goto remove_pd; - } - pr_info("Initialized CPU PM domain topology\n"); return 0; put_node: of_node_put(node); remove_pd: - if (pd_count) - psci_pd_remove(); + psci_pd_remove(); pr_err("failed to create CPU PM domains ret=%d\n", ret); +no_pd: + if (use_osi) + psci_set_osi_mode(false); return ret; } diff --git a/drivers/cpuidle/cpuidle-tegra.c b/drivers/cpuidle/cpuidle-tegra.c index a12fb141875a..e8956706a291 100644 --- a/drivers/cpuidle/cpuidle-tegra.c +++ b/drivers/cpuidle/cpuidle-tegra.c @@ -172,7 +172,7 @@ static int tegra_cpuidle_coupled_barrier(struct cpuidle_device *dev) static int tegra_cpuidle_state_enter(struct cpuidle_device *dev, int index, unsigned int cpu) { - int ret; + int err; /* * CC6 state is the "CPU cluster power-off" state. In order to @@ -183,9 +183,9 @@ static int tegra_cpuidle_state_enter(struct cpuidle_device *dev, * CPU cores, GIC and L2 cache). 
*/ if (index == TEGRA_CC6) { - ret = tegra_cpuidle_coupled_barrier(dev); - if (ret) - return ret; + err = tegra_cpuidle_coupled_barrier(dev); + if (err) + return err; } local_fiq_disable(); @@ -194,15 +194,15 @@ static int tegra_cpuidle_state_enter(struct cpuidle_device *dev, switch (index) { case TEGRA_C7: - ret = tegra_cpuidle_c7_enter(); + err = tegra_cpuidle_c7_enter(); break; case TEGRA_CC6: - ret = tegra_cpuidle_cc6_enter(cpu); + err = tegra_cpuidle_cc6_enter(cpu); break; default: - ret = -EINVAL; + err = -EINVAL; break; } @@ -210,7 +210,7 @@ static int tegra_cpuidle_state_enter(struct cpuidle_device *dev, tegra_pm_clear_cpu_in_lp2(); local_fiq_enable(); - return ret; + return err ?: index; } static int tegra_cpuidle_adjust_state_index(int index, unsigned int cpu) @@ -236,21 +236,27 @@ static int tegra_cpuidle_enter(struct cpuidle_device *dev, int index) { unsigned int cpu = cpu_logical_map(dev->cpu); - int err; + int ret; index = tegra_cpuidle_adjust_state_index(index, cpu); if (dev->states_usage[index].disable) return -1; if (index == TEGRA_C1) - err = arm_cpuidle_simple_enter(dev, drv, index); + ret = arm_cpuidle_simple_enter(dev, drv, index); else - err = tegra_cpuidle_state_enter(dev, index, cpu); + ret = tegra_cpuidle_state_enter(dev, index, cpu); - if (err && (err != -EINTR || index != TEGRA_CC6)) - pr_err_once("failed to enter state %d err: %d\n", index, err); + if (ret < 0) { + if (ret != -EINTR || index != TEGRA_CC6) + pr_err_once("failed to enter state %d err: %d\n", + index, ret); + index = -1; + } else { + index = ret; + } - return err ? 
-1 : index; + return index; } static int tegra114_enter_s2idle(struct cpuidle_device *dev, diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 180c6eae684b..fa0024b4549b 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -300,6 +300,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, } } else { dev->last_residency_ns = 0; + dev->states_usage[index].rejected++; } return entered_state; diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 091d1caceb41..53ec9585ccd4 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -256,6 +256,7 @@ define_show_state_time_function(exit_latency) define_show_state_time_function(target_residency) define_show_state_function(power_usage) define_show_state_ull_function(usage) +define_show_state_ull_function(rejected) define_show_state_str_function(name) define_show_state_str_function(desc) define_show_state_ull_function(above) @@ -312,6 +313,7 @@ define_one_state_ro(latency, show_state_exit_latency); define_one_state_ro(residency, show_state_target_residency); define_one_state_ro(power, show_state_power_usage); define_one_state_ro(usage, show_state_usage); +define_one_state_ro(rejected, show_state_rejected); define_one_state_ro(time, show_state_time); define_one_state_rw(disable, show_state_disable, store_state_disable); define_one_state_ro(above, show_state_above); @@ -325,6 +327,7 @@ static struct attribute *cpuidle_state_default_attrs[] = { &attr_residency.attr, &attr_power.attr, &attr_usage.attr, + &attr_rejected.attr, &attr_time.attr, &attr_disable.attr, &attr_above.attr, diff --git a/drivers/devfreq/devfreq-event.c b/drivers/devfreq/devfreq-event.c index 56efbeb7851e..6765c03334bc 100644 --- a/drivers/devfreq/devfreq-event.c +++ b/drivers/devfreq/devfreq-event.c @@ -213,20 +213,21 @@ EXPORT_SYMBOL_GPL(devfreq_event_reset_event); * devfreq_event_get_edev_by_phandle() - Get the devfreq-event dev from * devicetree. 
* @dev : the pointer to the given device + * @phandle_name: name of property holding a phandle value * @index : the index into list of devfreq-event device * * Note that this function return the pointer of devfreq-event device. */ struct devfreq_event_dev *devfreq_event_get_edev_by_phandle(struct device *dev, - int index) + const char *phandle_name, int index) { struct device_node *node; struct devfreq_event_dev *edev; - if (!dev->of_node) + if (!dev->of_node || !phandle_name) return ERR_PTR(-EINVAL); - node = of_parse_phandle(dev->of_node, "devfreq-events", index); + node = of_parse_phandle(dev->of_node, phandle_name, index); if (!node) return ERR_PTR(-ENODEV); @@ -258,19 +259,20 @@ EXPORT_SYMBOL_GPL(devfreq_event_get_edev_by_phandle); /** * devfreq_event_get_edev_count() - Get the count of devfreq-event dev * @dev : the pointer to the given device + * @phandle_name: name of property holding a phandle value * * Note that this function return the count of devfreq-event devices. */ -int devfreq_event_get_edev_count(struct device *dev) +int devfreq_event_get_edev_count(struct device *dev, const char *phandle_name) { int count; - if (!dev->of_node) { + if (!dev->of_node || !phandle_name) { dev_err(dev, "device does not have a device node entry\n"); return -EINVAL; } - count = of_property_count_elems_of_size(dev->of_node, "devfreq-events", + count = of_property_count_elems_of_size(dev->of_node, phandle_name, sizeof(u32)); if (count < 0) { dev_err(dev, diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 071b59fe84d2..861c100f9fac 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -984,47 +984,74 @@ EXPORT_SYMBOL(devm_devfreq_add_device); #ifdef CONFIG_OF /* - * devfreq_get_devfreq_by_phandle - Get the devfreq device from devicetree - * @dev - instance to the given device - * @index - index into list of devfreq + * devfreq_get_devfreq_by_node - Get the devfreq device from devicetree + * @node - pointer to device_node * * return 
the instance of devfreq device */ -struct devfreq *devfreq_get_devfreq_by_phandle(struct device *dev, int index) +struct devfreq *devfreq_get_devfreq_by_node(struct device_node *node) { - struct device_node *node; struct devfreq *devfreq; - if (!dev) - return ERR_PTR(-EINVAL); - - if (!dev->of_node) - return ERR_PTR(-EINVAL); - - node = of_parse_phandle(dev->of_node, "devfreq", index); if (!node) - return ERR_PTR(-ENODEV); + return ERR_PTR(-EINVAL); mutex_lock(&devfreq_list_lock); list_for_each_entry(devfreq, &devfreq_list, node) { if (devfreq->dev.parent && devfreq->dev.parent->of_node == node) { mutex_unlock(&devfreq_list_lock); - of_node_put(node); return devfreq; } } mutex_unlock(&devfreq_list_lock); + + return ERR_PTR(-ENODEV); +} + +/* + * devfreq_get_devfreq_by_phandle - Get the devfreq device from devicetree + * @dev - instance to the given device + * @phandle_name - name of property holding a phandle value + * @index - index into list of devfreq + * + * return the instance of devfreq device + */ +struct devfreq *devfreq_get_devfreq_by_phandle(struct device *dev, + const char *phandle_name, int index) +{ + struct device_node *node; + struct devfreq *devfreq; + + if (!dev || !phandle_name) + return ERR_PTR(-EINVAL); + + if (!dev->of_node) + return ERR_PTR(-EINVAL); + + node = of_parse_phandle(dev->of_node, phandle_name, index); + if (!node) + return ERR_PTR(-ENODEV); + + devfreq = devfreq_get_devfreq_by_node(node); of_node_put(node); - return ERR_PTR(-EPROBE_DEFER); + return devfreq; } + #else -struct devfreq *devfreq_get_devfreq_by_phandle(struct device *dev, int index) +struct devfreq *devfreq_get_devfreq_by_node(struct device_node *node) +{ + return ERR_PTR(-ENODEV); +} + +struct devfreq *devfreq_get_devfreq_by_phandle(struct device *dev, + const char *phandle_name, int index) { return ERR_PTR(-ENODEV); } #endif /* CONFIG_OF */ +EXPORT_SYMBOL_GPL(devfreq_get_devfreq_by_node); EXPORT_SYMBOL_GPL(devfreq_get_devfreq_by_phandle); /** diff --git 
a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c index 8fa8eb541373..1e684a448c9e 100644 --- a/drivers/devfreq/exynos-bus.c +++ b/drivers/devfreq/exynos-bus.c @@ -193,7 +193,7 @@ static int exynos_bus_parent_parse_of(struct device_node *np, * Get the devfreq-event devices to get the current utilization of * buses. This raw data will be used in devfreq ondemand governor. */ - count = devfreq_event_get_edev_count(dev); + count = devfreq_event_get_edev_count(dev, "devfreq-events"); if (count < 0) { dev_err(dev, "failed to get the count of devfreq-event dev\n"); ret = count; @@ -209,7 +209,8 @@ static int exynos_bus_parent_parse_of(struct device_node *np, } for (i = 0; i < count; i++) { - bus->edev[i] = devfreq_event_get_edev_by_phandle(dev, i); + bus->edev[i] = devfreq_event_get_edev_by_phandle(dev, + "devfreq-events", i); if (IS_ERR(bus->edev[i])) { ret = -EPROBE_DEFER; goto err_regulator; @@ -360,7 +361,7 @@ static int exynos_bus_profile_init_passive(struct exynos_bus *bus, profile->exit = exynos_bus_passive_exit; /* Get the instance of parent devfreq device */ - parent_devfreq = devfreq_get_devfreq_by_phandle(dev, 0); + parent_devfreq = devfreq_get_devfreq_by_phandle(dev, "devfreq", 0); if (IS_ERR(parent_devfreq)) return -EPROBE_DEFER; diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c index 027769e39f9b..2e912166a993 100644 --- a/drivers/devfreq/rk3399_dmc.c +++ b/drivers/devfreq/rk3399_dmc.c @@ -341,7 +341,7 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev) return PTR_ERR(data->dmc_clk); } - data->edev = devfreq_event_get_edev_by_phandle(dev, 0); + data->edev = devfreq_event_get_edev_by_phandle(dev, "devfreq-events", 0); if (IS_ERR(data->edev)) return -EPROBE_DEFER; diff --git a/drivers/devfreq/tegra30-devfreq.c b/drivers/devfreq/tegra30-devfreq.c index dedd39de7367..f5e74c2ede85 100644 --- a/drivers/devfreq/tegra30-devfreq.c +++ b/drivers/devfreq/tegra30-devfreq.c @@ -822,8 +822,6 @@ static int 
tegra_devfreq_probe(struct platform_device *pdev) return err; } - reset_control_assert(tegra->reset); - err = clk_prepare_enable(tegra->clock); if (err) { dev_err(&pdev->dev, @@ -831,7 +829,11 @@ static int tegra_devfreq_probe(struct platform_device *pdev) return err; } - reset_control_deassert(tegra->reset); + err = reset_control_reset(tegra->reset); + if (err) { + dev_err(&pdev->dev, "Failed to reset hardware: %d\n", err); + goto disable_clk; + } rate = clk_round_rate(tegra->emc_clock, ULONG_MAX); if (rate < 0) { diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index ecff93ede65a..e0427042eb9b 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -151,12 +151,15 @@ static u32 psci_get_version(void) return invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0); } -int psci_set_osi_mode(void) +int psci_set_osi_mode(bool enable) { + unsigned long suspend_mode; int err; - err = invoke_psci_fn(PSCI_1_0_FN_SET_SUSPEND_MODE, - PSCI_1_0_SUSPEND_MODE_OSI, 0, 0); + suspend_mode = enable ? PSCI_1_0_SUSPEND_MODE_OSI : + PSCI_1_0_SUSPEND_MODE_PC; + + err = invoke_psci_fn(PSCI_1_0_FN_SET_SUSPEND_MODE, suspend_mode, 0, 0); return psci_to_linux_errno(err); } @@ -554,8 +557,7 @@ static int __init psci_1_0_init(struct device_node *np) pr_info("OSI mode supported.\n"); /* Default to PC mode. 
*/ - invoke_psci_fn(PSCI_1_0_FN_SET_SUSPEND_MODE, - PSCI_1_0_SUSPEND_MODE_PC, 0, 0); + psci_set_osi_mode(false); } return 0; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_msg.c b/drivers/gpu/drm/vmwgfx/vmwgfx_msg.c index e9f448a5ebb3..15b5bde69324 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_msg.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_msg.c @@ -24,7 +24,7 @@ * */ -#include +#include #include #include #include @@ -599,4 +599,3 @@ out_open: return -EINVAL; } - diff --git a/drivers/hid/hid-hyperv.c b/drivers/hid/hid-hyperv.c index 0b6ee1dee625..978ee2aab2d4 100644 --- a/drivers/hid/hid-hyperv.c +++ b/drivers/hid/hid-hyperv.c @@ -104,8 +104,8 @@ struct synthhid_input_report { #pragma pack(pop) -#define INPUTVSC_SEND_RING_BUFFER_SIZE (40 * 1024) -#define INPUTVSC_RECV_RING_BUFFER_SIZE (40 * 1024) +#define INPUTVSC_SEND_RING_BUFFER_SIZE VMBUS_RING_SIZE(36 * 1024) +#define INPUTVSC_RECV_RING_BUFFER_SIZE VMBUS_RING_SIZE(36 * 1024) enum pipe_prot_msg_type { diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 3ebda7707e46..fbdda9938039 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -22,20 +22,97 @@ #include "hyperv_vmbus.h" -#define NUM_PAGES_SPANNED(addr, len) \ -((PAGE_ALIGN(addr + len) >> PAGE_SHIFT) - (addr >> PAGE_SHIFT)) - -static unsigned long virt_to_hvpfn(void *addr) +/* + * hv_gpadl_size - Return the real size of a gpadl, the size that Hyper-V uses + * + * For BUFFER gpadl, Hyper-V uses the exact same size as the guest does. + * + * For RING gpadl, in each ring, the guest uses one PAGE_SIZE as the header + * (because of the alignment requirement), however, the hypervisor only + * uses the first HV_HYP_PAGE_SIZE as the header, therefore leaving a + * (PAGE_SIZE - HV_HYP_PAGE_SIZE) gap. And since there are two rings in a + * ringbuffer, the total size for a RING gpadl that Hyper-V uses is the + * total size that the guest uses minus twice of the gap size. 
+ */ +static inline u32 hv_gpadl_size(enum hv_gpadl_type type, u32 size) { - phys_addr_t paddr; + switch (type) { + case HV_GPADL_BUFFER: + return size; + case HV_GPADL_RING: + /* The size of a ringbuffer must be page-aligned */ + BUG_ON(size % PAGE_SIZE); + /* + * Two things to notice here: + * 1) We're processing two ring buffers as a unit + * 2) We're skipping any space larger than HV_HYP_PAGE_SIZE in + * the first guest-size page of each of the two ring buffers. + * So we effectively subtract out two guest-size pages, and add + * back two Hyper-V size pages. + */ + return size - 2 * (PAGE_SIZE - HV_HYP_PAGE_SIZE); + } + BUG(); + return 0; +} - if (is_vmalloc_addr(addr)) - paddr = page_to_phys(vmalloc_to_page(addr)) + - offset_in_page(addr); - else - paddr = __pa(addr); +/* + * hv_ring_gpadl_send_hvpgoffset - Calculate the send offset (in unit of + * HV_HYP_PAGE) in a ring gpadl based on the + * offset in the guest + * + * @offset: the offset (in bytes) where the send ringbuffer starts in the + * virtual address space of the guest + */ +static inline u32 hv_ring_gpadl_send_hvpgoffset(u32 offset) +{ - return paddr >> PAGE_SHIFT; + /* + * For RING gpadl, in each ring, the guest uses one PAGE_SIZE as the + * header (because of the alignment requirement), however, the + * hypervisor only uses the first HV_HYP_PAGE_SIZE as the header, + * therefore leaving a (PAGE_SIZE - HV_HYP_PAGE_SIZE) gap. + * + * And to calculate the effective send offset in gpadl, we need to + * subtract this gap. 
+ */ + return (offset - (PAGE_SIZE - HV_HYP_PAGE_SIZE)) >> HV_HYP_PAGE_SHIFT; +} + +/* + * hv_gpadl_hvpfn - Return the Hyper-V page PFN of the @i th Hyper-V page in + * the gpadl + * + * @type: the type of the gpadl + * @kbuffer: the pointer to the gpadl in the guest + * @size: the total size (in bytes) of the gpadl + * @send_offset: the offset (in bytes) where the send ringbuffer starts in the + * virtual address space of the guest + * @i: the index + */ +static inline u64 hv_gpadl_hvpfn(enum hv_gpadl_type type, void *kbuffer, + u32 size, u32 send_offset, int i) +{ + int send_idx = hv_ring_gpadl_send_hvpgoffset(send_offset); + unsigned long delta = 0UL; + + switch (type) { + case HV_GPADL_BUFFER: + break; + case HV_GPADL_RING: + if (i == 0) + delta = 0; + else if (i <= send_idx) + delta = PAGE_SIZE - HV_HYP_PAGE_SIZE; + else + delta = 2 * (PAGE_SIZE - HV_HYP_PAGE_SIZE); + break; + default: + BUG(); + break; + } + + return virt_to_hvpfn(kbuffer + delta + (HV_HYP_PAGE_SIZE * i)); } /* @@ -112,6 +189,320 @@ int vmbus_alloc_ring(struct vmbus_channel *newchannel, } EXPORT_SYMBOL_GPL(vmbus_alloc_ring); +/* Used for Hyper-V Socket: a guest client's connect() to the host */ +int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, + const guid_t *shv_host_servie_id) +{ + struct vmbus_channel_tl_connect_request conn_msg; + int ret; + + memset(&conn_msg, 0, sizeof(conn_msg)); + conn_msg.header.msgtype = CHANNELMSG_TL_CONNECT_REQUEST; + conn_msg.guest_endpoint_id = *shv_guest_servie_id; + conn_msg.host_service_id = *shv_host_servie_id; + + ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true); + + trace_vmbus_send_tl_connect_request(&conn_msg, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request); + +/* + * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt. + * + * CHANNELMSG_MODIFYCHANNEL messages are asynchronous. Also, Hyper-V does not + * ACK such messages. 
IOW we can't know when the host will stop interrupting + * the "old" vCPU and start interrupting the "new" vCPU for the given channel. + * + * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version + * VERSION_WIN10_V4_1. + */ +int vmbus_send_modifychannel(u32 child_relid, u32 target_vp) +{ + struct vmbus_channel_modifychannel conn_msg; + int ret; + + memset(&conn_msg, 0, sizeof(conn_msg)); + conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL; + conn_msg.child_relid = child_relid; + conn_msg.target_vp = target_vp; + + ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true); + + trace_vmbus_send_modifychannel(&conn_msg, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(vmbus_send_modifychannel); + +/* + * create_gpadl_header - Creates a gpadl for the specified buffer + */ +static int create_gpadl_header(enum hv_gpadl_type type, void *kbuffer, + u32 size, u32 send_offset, + struct vmbus_channel_msginfo **msginfo) +{ + int i; + int pagecount; + struct vmbus_channel_gpadl_header *gpadl_header; + struct vmbus_channel_gpadl_body *gpadl_body; + struct vmbus_channel_msginfo *msgheader; + struct vmbus_channel_msginfo *msgbody = NULL; + u32 msgsize; + + int pfnsum, pfncount, pfnleft, pfncurr, pfnsize; + + pagecount = hv_gpadl_size(type, size) >> HV_HYP_PAGE_SHIFT; + + /* do we need a gpadl body msg */ + pfnsize = MAX_SIZE_CHANNEL_MESSAGE - + sizeof(struct vmbus_channel_gpadl_header) - + sizeof(struct gpa_range); + pfncount = pfnsize / sizeof(u64); + + if (pagecount > pfncount) { + /* we need a gpadl body */ + /* fill in the header */ + msgsize = sizeof(struct vmbus_channel_msginfo) + + sizeof(struct vmbus_channel_gpadl_header) + + sizeof(struct gpa_range) + pfncount * sizeof(u64); + msgheader = kzalloc(msgsize, GFP_KERNEL); + if (!msgheader) + goto nomem; + + INIT_LIST_HEAD(&msgheader->submsglist); + msgheader->msgsize = msgsize; + + gpadl_header = (struct vmbus_channel_gpadl_header *) + msgheader->msg; + gpadl_header->rangecount = 1; + 
gpadl_header->range_buflen = sizeof(struct gpa_range) + + pagecount * sizeof(u64); + gpadl_header->range[0].byte_offset = 0; + gpadl_header->range[0].byte_count = hv_gpadl_size(type, size); + for (i = 0; i < pfncount; i++) + gpadl_header->range[0].pfn_array[i] = hv_gpadl_hvpfn( + type, kbuffer, size, send_offset, i); + *msginfo = msgheader; + + pfnsum = pfncount; + pfnleft = pagecount - pfncount; + + /* how many pfns can we fit */ + pfnsize = MAX_SIZE_CHANNEL_MESSAGE - + sizeof(struct vmbus_channel_gpadl_body); + pfncount = pfnsize / sizeof(u64); + + /* fill in the body */ + while (pfnleft) { + if (pfnleft > pfncount) + pfncurr = pfncount; + else + pfncurr = pfnleft; + + msgsize = sizeof(struct vmbus_channel_msginfo) + + sizeof(struct vmbus_channel_gpadl_body) + + pfncurr * sizeof(u64); + msgbody = kzalloc(msgsize, GFP_KERNEL); + + if (!msgbody) { + struct vmbus_channel_msginfo *pos = NULL; + struct vmbus_channel_msginfo *tmp = NULL; + /* + * Free up all the allocated messages. + */ + list_for_each_entry_safe(pos, tmp, + &msgheader->submsglist, + msglistentry) { + + list_del(&pos->msglistentry); + kfree(pos); + } + + goto nomem; + } + + msgbody->msgsize = msgsize; + gpadl_body = + (struct vmbus_channel_gpadl_body *)msgbody->msg; + + /* + * Gpadl is u32 and we are using a pointer which could + * be 64-bit + * This is governed by the guest/host protocol and + * so the hypervisor guarantees that this is ok. 
+ */ + for (i = 0; i < pfncurr; i++) + gpadl_body->pfn[i] = hv_gpadl_hvpfn(type, + kbuffer, size, send_offset, pfnsum + i); + + /* add to msg header */ + list_add_tail(&msgbody->msglistentry, + &msgheader->submsglist); + pfnsum += pfncurr; + pfnleft -= pfncurr; + } + } else { + /* everything fits in a header */ + msgsize = sizeof(struct vmbus_channel_msginfo) + + sizeof(struct vmbus_channel_gpadl_header) + + sizeof(struct gpa_range) + pagecount * sizeof(u64); + msgheader = kzalloc(msgsize, GFP_KERNEL); + if (msgheader == NULL) + goto nomem; + + INIT_LIST_HEAD(&msgheader->submsglist); + msgheader->msgsize = msgsize; + + gpadl_header = (struct vmbus_channel_gpadl_header *) + msgheader->msg; + gpadl_header->rangecount = 1; + gpadl_header->range_buflen = sizeof(struct gpa_range) + + pagecount * sizeof(u64); + gpadl_header->range[0].byte_offset = 0; + gpadl_header->range[0].byte_count = hv_gpadl_size(type, size); + for (i = 0; i < pagecount; i++) + gpadl_header->range[0].pfn_array[i] = hv_gpadl_hvpfn( + type, kbuffer, size, send_offset, i); + + *msginfo = msgheader; + } + + return 0; +nomem: + kfree(msgheader); + kfree(msgbody); + return -ENOMEM; +} + +/* + * __vmbus_establish_gpadl - Establish a GPADL for a buffer or ringbuffer + * + * @channel: a channel + * @type: the type of the corresponding GPADL, only meaningful for the guest. 
+ * @kbuffer: from kmalloc or vmalloc + * @size: page-size multiple + * @send_offset: the offset (in bytes) where the send ring buffer starts, + * should be 0 for BUFFER type gpadl + * @gpadl_handle: some funky thing + */ +static int __vmbus_establish_gpadl(struct vmbus_channel *channel, + enum hv_gpadl_type type, void *kbuffer, + u32 size, u32 send_offset, + u32 *gpadl_handle) +{ + struct vmbus_channel_gpadl_header *gpadlmsg; + struct vmbus_channel_gpadl_body *gpadl_body; + struct vmbus_channel_msginfo *msginfo = NULL; + struct vmbus_channel_msginfo *submsginfo, *tmp; + struct list_head *curr; + u32 next_gpadl_handle; + unsigned long flags; + int ret = 0; + + next_gpadl_handle = + (atomic_inc_return(&vmbus_connection.next_gpadl_handle) - 1); + + ret = create_gpadl_header(type, kbuffer, size, send_offset, &msginfo); + if (ret) + return ret; + + init_completion(&msginfo->waitevent); + msginfo->waiting_channel = channel; + + gpadlmsg = (struct vmbus_channel_gpadl_header *)msginfo->msg; + gpadlmsg->header.msgtype = CHANNELMSG_GPADL_HEADER; + gpadlmsg->child_relid = channel->offermsg.child_relid; + gpadlmsg->gpadl = next_gpadl_handle; + + + spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + list_add_tail(&msginfo->msglistentry, + &vmbus_connection.chn_msg_list); + + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + + if (channel->rescind) { + ret = -ENODEV; + goto cleanup; + } + + ret = vmbus_post_msg(gpadlmsg, msginfo->msgsize - + sizeof(*msginfo), true); + + trace_vmbus_establish_gpadl_header(gpadlmsg, ret); + + if (ret != 0) + goto cleanup; + + list_for_each(curr, &msginfo->submsglist) { + submsginfo = (struct vmbus_channel_msginfo *)curr; + gpadl_body = + (struct vmbus_channel_gpadl_body *)submsginfo->msg; + + gpadl_body->header.msgtype = + CHANNELMSG_GPADL_BODY; + gpadl_body->gpadl = next_gpadl_handle; + + ret = vmbus_post_msg(gpadl_body, + submsginfo->msgsize - sizeof(*submsginfo), + true); + + 
trace_vmbus_establish_gpadl_body(gpadl_body, ret); + + if (ret != 0) + goto cleanup; + + } + wait_for_completion(&msginfo->waitevent); + + if (msginfo->response.gpadl_created.creation_status != 0) { + pr_err("Failed to establish GPADL: err = 0x%x\n", + msginfo->response.gpadl_created.creation_status); + + ret = -EDQUOT; + goto cleanup; + } + + if (channel->rescind) { + ret = -ENODEV; + goto cleanup; + } + + /* At this point, we received the gpadl created msg */ + *gpadl_handle = gpadlmsg->gpadl; + +cleanup: + spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + list_del(&msginfo->msglistentry); + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, + msglistentry) { + kfree(submsginfo); + } + + kfree(msginfo); + return ret; +} + +/* + * vmbus_establish_gpadl - Establish a GPADL for the specified buffer + * + * @channel: a channel + * @kbuffer: from kmalloc or vmalloc + * @size: page-size multiple + * @gpadl_handle: some funky thing + */ +int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer, + u32 size, u32 *gpadl_handle) +{ + return __vmbus_establish_gpadl(channel, HV_GPADL_BUFFER, kbuffer, size, + 0U, gpadl_handle); +} +EXPORT_SYMBOL_GPL(vmbus_establish_gpadl); + static int __vmbus_open(struct vmbus_channel *newchannel, void *userdata, u32 userdatalen, void (*onchannelcallback)(void *context), void *context) @@ -148,10 +539,11 @@ static int __vmbus_open(struct vmbus_channel *newchannel, /* Establish the gpadl for the ring buffer */ newchannel->ringbuffer_gpadlhandle = 0; - err = vmbus_establish_gpadl(newchannel, - page_address(newchannel->ringbuffer_page), - (send_pages + recv_pages) << PAGE_SHIFT, - &newchannel->ringbuffer_gpadlhandle); + err = __vmbus_establish_gpadl(newchannel, HV_GPADL_RING, + page_address(newchannel->ringbuffer_page), + (send_pages + recv_pages) << PAGE_SHIFT, + newchannel->ringbuffer_send_offset << PAGE_SHIFT, + 
&newchannel->ringbuffer_gpadlhandle); if (err) goto error_clean_ring; @@ -172,7 +564,13 @@ static int __vmbus_open(struct vmbus_channel *newchannel, open_msg->openid = newchannel->offermsg.child_relid; open_msg->child_relid = newchannel->offermsg.child_relid; open_msg->ringbuffer_gpadlhandle = newchannel->ringbuffer_gpadlhandle; - open_msg->downstream_ringbuffer_pageoffset = newchannel->ringbuffer_send_offset; + /* + * The unit of ->downstream_ringbuffer_pageoffset is HV_HYP_PAGE and + * the unit of ->ringbuffer_send_offset (i.e. send_pages) is PAGE, so + * here we calculate it into HV_HYP_PAGE. + */ + open_msg->downstream_ringbuffer_pageoffset = + hv_ring_gpadl_send_hvpgoffset(send_pages << PAGE_SHIFT); open_msg->target_vp = hv_cpu_number_to_vp_number(newchannel->target_cpu); if (userdatalen) @@ -266,299 +664,6 @@ int vmbus_open(struct vmbus_channel *newchannel, } EXPORT_SYMBOL_GPL(vmbus_open); -/* Used for Hyper-V Socket: a guest client's connect() to the host */ -int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, - const guid_t *shv_host_servie_id) -{ - struct vmbus_channel_tl_connect_request conn_msg; - int ret; - - memset(&conn_msg, 0, sizeof(conn_msg)); - conn_msg.header.msgtype = CHANNELMSG_TL_CONNECT_REQUEST; - conn_msg.guest_endpoint_id = *shv_guest_servie_id; - conn_msg.host_service_id = *shv_host_servie_id; - - ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true); - - trace_vmbus_send_tl_connect_request(&conn_msg, ret); - - return ret; -} -EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request); - -/* - * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt. - * - * CHANNELMSG_MODIFYCHANNEL messages are aynchronous. Also, Hyper-V does not - * ACK such messages. IOW we can't know when the host will stop interrupting - * the "old" vCPU and start interrupting the "new" vCPU for the given channel. - * - * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version - * VERSION_WIN10_V4_1. 
- */ -int vmbus_send_modifychannel(u32 child_relid, u32 target_vp) -{ - struct vmbus_channel_modifychannel conn_msg; - int ret; - - memset(&conn_msg, 0, sizeof(conn_msg)); - conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL; - conn_msg.child_relid = child_relid; - conn_msg.target_vp = target_vp; - - ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true); - - trace_vmbus_send_modifychannel(&conn_msg, ret); - - return ret; -} -EXPORT_SYMBOL_GPL(vmbus_send_modifychannel); - -/* - * create_gpadl_header - Creates a gpadl for the specified buffer - */ -static int create_gpadl_header(void *kbuffer, u32 size, - struct vmbus_channel_msginfo **msginfo) -{ - int i; - int pagecount; - struct vmbus_channel_gpadl_header *gpadl_header; - struct vmbus_channel_gpadl_body *gpadl_body; - struct vmbus_channel_msginfo *msgheader; - struct vmbus_channel_msginfo *msgbody = NULL; - u32 msgsize; - - int pfnsum, pfncount, pfnleft, pfncurr, pfnsize; - - pagecount = size >> PAGE_SHIFT; - - /* do we need a gpadl body msg */ - pfnsize = MAX_SIZE_CHANNEL_MESSAGE - - sizeof(struct vmbus_channel_gpadl_header) - - sizeof(struct gpa_range); - pfncount = pfnsize / sizeof(u64); - - if (pagecount > pfncount) { - /* we need a gpadl body */ - /* fill in the header */ - msgsize = sizeof(struct vmbus_channel_msginfo) + - sizeof(struct vmbus_channel_gpadl_header) + - sizeof(struct gpa_range) + pfncount * sizeof(u64); - msgheader = kzalloc(msgsize, GFP_KERNEL); - if (!msgheader) - goto nomem; - - INIT_LIST_HEAD(&msgheader->submsglist); - msgheader->msgsize = msgsize; - - gpadl_header = (struct vmbus_channel_gpadl_header *) - msgheader->msg; - gpadl_header->rangecount = 1; - gpadl_header->range_buflen = sizeof(struct gpa_range) + - pagecount * sizeof(u64); - gpadl_header->range[0].byte_offset = 0; - gpadl_header->range[0].byte_count = size; - for (i = 0; i < pfncount; i++) - gpadl_header->range[0].pfn_array[i] = virt_to_hvpfn( - kbuffer + PAGE_SIZE * i); - *msginfo = msgheader; - - pfnsum = pfncount; - 
pfnleft = pagecount - pfncount; - - /* how many pfns can we fit */ - pfnsize = MAX_SIZE_CHANNEL_MESSAGE - - sizeof(struct vmbus_channel_gpadl_body); - pfncount = pfnsize / sizeof(u64); - - /* fill in the body */ - while (pfnleft) { - if (pfnleft > pfncount) - pfncurr = pfncount; - else - pfncurr = pfnleft; - - msgsize = sizeof(struct vmbus_channel_msginfo) + - sizeof(struct vmbus_channel_gpadl_body) + - pfncurr * sizeof(u64); - msgbody = kzalloc(msgsize, GFP_KERNEL); - - if (!msgbody) { - struct vmbus_channel_msginfo *pos = NULL; - struct vmbus_channel_msginfo *tmp = NULL; - /* - * Free up all the allocated messages. - */ - list_for_each_entry_safe(pos, tmp, - &msgheader->submsglist, - msglistentry) { - - list_del(&pos->msglistentry); - kfree(pos); - } - - goto nomem; - } - - msgbody->msgsize = msgsize; - gpadl_body = - (struct vmbus_channel_gpadl_body *)msgbody->msg; - - /* - * Gpadl is u32 and we are using a pointer which could - * be 64-bit - * This is governed by the guest/host protocol and - * so the hypervisor guarantees that this is ok. 
- */ - for (i = 0; i < pfncurr; i++) - gpadl_body->pfn[i] = virt_to_hvpfn( - kbuffer + PAGE_SIZE * (pfnsum + i)); - - /* add to msg header */ - list_add_tail(&msgbody->msglistentry, - &msgheader->submsglist); - pfnsum += pfncurr; - pfnleft -= pfncurr; - } - } else { - /* everything fits in a header */ - msgsize = sizeof(struct vmbus_channel_msginfo) + - sizeof(struct vmbus_channel_gpadl_header) + - sizeof(struct gpa_range) + pagecount * sizeof(u64); - msgheader = kzalloc(msgsize, GFP_KERNEL); - if (msgheader == NULL) - goto nomem; - - INIT_LIST_HEAD(&msgheader->submsglist); - msgheader->msgsize = msgsize; - - gpadl_header = (struct vmbus_channel_gpadl_header *) - msgheader->msg; - gpadl_header->rangecount = 1; - gpadl_header->range_buflen = sizeof(struct gpa_range) + - pagecount * sizeof(u64); - gpadl_header->range[0].byte_offset = 0; - gpadl_header->range[0].byte_count = size; - for (i = 0; i < pagecount; i++) - gpadl_header->range[0].pfn_array[i] = virt_to_hvpfn( - kbuffer + PAGE_SIZE * i); - - *msginfo = msgheader; - } - - return 0; -nomem: - kfree(msgheader); - kfree(msgbody); - return -ENOMEM; -} - -/* - * vmbus_establish_gpadl - Establish a GPADL for the specified buffer - * - * @channel: a channel - * @kbuffer: from kmalloc or vmalloc - * @size: page-size multiple - * @gpadl_handle: some funky thing - */ -int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer, - u32 size, u32 *gpadl_handle) -{ - struct vmbus_channel_gpadl_header *gpadlmsg; - struct vmbus_channel_gpadl_body *gpadl_body; - struct vmbus_channel_msginfo *msginfo = NULL; - struct vmbus_channel_msginfo *submsginfo, *tmp; - struct list_head *curr; - u32 next_gpadl_handle; - unsigned long flags; - int ret = 0; - - next_gpadl_handle = - (atomic_inc_return(&vmbus_connection.next_gpadl_handle) - 1); - - ret = create_gpadl_header(kbuffer, size, &msginfo); - if (ret) - return ret; - - init_completion(&msginfo->waitevent); - msginfo->waiting_channel = channel; - - gpadlmsg = (struct 
vmbus_channel_gpadl_header *)msginfo->msg; - gpadlmsg->header.msgtype = CHANNELMSG_GPADL_HEADER; - gpadlmsg->child_relid = channel->offermsg.child_relid; - gpadlmsg->gpadl = next_gpadl_handle; - - - spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); - list_add_tail(&msginfo->msglistentry, - &vmbus_connection.chn_msg_list); - - spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); - - if (channel->rescind) { - ret = -ENODEV; - goto cleanup; - } - - ret = vmbus_post_msg(gpadlmsg, msginfo->msgsize - - sizeof(*msginfo), true); - - trace_vmbus_establish_gpadl_header(gpadlmsg, ret); - - if (ret != 0) - goto cleanup; - - list_for_each(curr, &msginfo->submsglist) { - submsginfo = (struct vmbus_channel_msginfo *)curr; - gpadl_body = - (struct vmbus_channel_gpadl_body *)submsginfo->msg; - - gpadl_body->header.msgtype = - CHANNELMSG_GPADL_BODY; - gpadl_body->gpadl = next_gpadl_handle; - - ret = vmbus_post_msg(gpadl_body, - submsginfo->msgsize - sizeof(*submsginfo), - true); - - trace_vmbus_establish_gpadl_body(gpadl_body, ret); - - if (ret != 0) - goto cleanup; - - } - wait_for_completion(&msginfo->waitevent); - - if (msginfo->response.gpadl_created.creation_status != 0) { - pr_err("Failed to establish GPADL: err = 0x%x\n", - msginfo->response.gpadl_created.creation_status); - - ret = -EDQUOT; - goto cleanup; - } - - if (channel->rescind) { - ret = -ENODEV; - goto cleanup; - } - - /* At this point, we received the gpadl created msg */ - *gpadl_handle = gpadlmsg->gpadl; - -cleanup: - spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); - list_del(&msginfo->msglistentry); - spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); - list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, - msglistentry) { - kfree(submsginfo); - } - - kfree(msginfo); - return ret; -} -EXPORT_SYMBOL_GPL(vmbus_establish_gpadl); - /* * vmbus_teardown_gpadl -Teardown the specified GPADL handle */ diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 
da69338f92f5..410f1fab519c 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -165,7 +165,7 @@ void hv_synic_enable_regs(unsigned int cpu) hv_get_simp(simp.as_uint64); simp.simp_enabled = 1; simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page) - >> PAGE_SHIFT; + >> HV_HYP_PAGE_SHIFT; hv_set_simp(simp.as_uint64); @@ -173,7 +173,7 @@ void hv_synic_enable_regs(unsigned int cpu) hv_get_siefp(siefp.as_uint64); siefp.siefp_enabled = 1; siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page) - >> PAGE_SHIFT; + >> HV_HYP_PAGE_SHIFT; hv_set_siefp(siefp.as_uint64); diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index a4e8d96513c2..05566ecdbe4b 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -500,6 +500,9 @@ static void heartbeat_onchannelcallback(void *context) } } +#define HV_UTIL_RING_SEND_SIZE VMBUS_RING_SIZE(3 * HV_HYP_PAGE_SIZE) +#define HV_UTIL_RING_RECV_SIZE VMBUS_RING_SIZE(3 * HV_HYP_PAGE_SIZE) + static int util_probe(struct hv_device *dev, const struct hv_vmbus_device_id *dev_id) { @@ -530,8 +533,8 @@ static int util_probe(struct hv_device *dev, hv_set_drvdata(dev, srv); - ret = vmbus_open(dev->channel, 4 * HV_HYP_PAGE_SIZE, - 4 * HV_HYP_PAGE_SIZE, NULL, 0, srv->util_cb, + ret = vmbus_open(dev->channel, HV_UTIL_RING_SEND_SIZE, + HV_UTIL_RING_RECV_SIZE, NULL, 0, srv->util_cb, dev->channel); if (ret) goto error; @@ -590,8 +593,8 @@ static int util_resume(struct hv_device *dev) return ret; } - ret = vmbus_open(dev->channel, 4 * HV_HYP_PAGE_SIZE, - 4 * HV_HYP_PAGE_SIZE, NULL, 0, srv->util_cb, + ret = vmbus_open(dev->channel, HV_UTIL_RING_SEND_SIZE, + HV_UTIL_RING_RECV_SIZE, NULL, 0, srv->util_cb, dev->channel); return ret; } diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 946d0aba101f..7b8816c2e5f9 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -83,7 +83,7 @@ static int hyperv_panic_event(struct notifier_block *nb, unsigned long val, static int hyperv_die_event(struct notifier_block *nb, 
unsigned long val, void *args) { - struct die_args *die = (struct die_args *)args; + struct die_args *die = args; struct pt_regs *regs = die->regs; /* Don't notify Hyper-V if the die event is other than oops */ diff --git a/drivers/input/serio/hyperv-keyboard.c b/drivers/input/serio/hyperv-keyboard.c index df4e9f6f4529..1a7b72a9016d 100644 --- a/drivers/input/serio/hyperv-keyboard.c +++ b/drivers/input/serio/hyperv-keyboard.c @@ -75,8 +75,8 @@ struct synth_kbd_keystroke { #define HK_MAXIMUM_MESSAGE_SIZE 256 -#define KBD_VSC_SEND_RING_BUFFER_SIZE (40 * 1024) -#define KBD_VSC_RECV_RING_BUFFER_SIZE (40 * 1024) +#define KBD_VSC_SEND_RING_BUFFER_SIZE VMBUS_RING_SIZE(36 * 1024) +#define KBD_VSC_RECV_RING_BUFFER_SIZE VMBUS_RING_SIZE(36 * 1024) #define XTKBD_EMUL0 0xe0 #define XTKBD_EMUL1 0xe1 diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index a8fb82c166eb..df50a00ccfe8 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -476,7 +476,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg) rhsa = (struct acpi_dmar_rhsa *)header; for_each_drhd_unit(drhd) { if (drhd->reg_base_addr == rhsa->base_address) { - int node = acpi_map_pxm_to_node(rhsa->proximity_domain); + int node = pxm_to_node(rhsa->proximity_domain); if (!node_online(node)) node = NUMA_NO_NODE; diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 46d885575601..0fec31931e11 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -5269,7 +5269,12 @@ static int __init gic_acpi_parse_srat_its(union acpi_subtable_headers *header, return -EINVAL; } - node = acpi_map_pxm_to_node(its_affinity->proximity_domain); + /* + * Note that in theory a new proximity node could be created by this + * entry as it is an SRAT resource allocation structure. + * We do not currently support doing so. 
+ */ + node = pxm_to_node(its_affinity->proximity_domain); if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) { pr_err("SRAT: Invalid NUMA node %d in ITS affinity\n", node); diff --git a/drivers/memory/samsung/exynos5422-dmc.c b/drivers/memory/samsung/exynos5422-dmc.c index b9c7956e5031..714d1f6f077c 100644 --- a/drivers/memory/samsung/exynos5422-dmc.c +++ b/drivers/memory/samsung/exynos5422-dmc.c @@ -1293,7 +1293,8 @@ static int exynos5_performance_counters_init(struct exynos5_dmc *dmc) int counters_size; int ret, i; - dmc->num_counters = devfreq_event_get_edev_count(dmc->dev); + dmc->num_counters = devfreq_event_get_edev_count(dmc->dev, + "devfreq-events"); if (dmc->num_counters < 0) { dev_err(dmc->dev, "could not get devfreq-event counters\n"); return dmc->num_counters; @@ -1306,7 +1307,8 @@ static int exynos5_performance_counters_init(struct exynos5_dmc *dmc) for (i = 0; i < dmc->num_counters; i++) { dmc->counter[i] = - devfreq_event_get_edev_by_phandle(dmc->dev, i); + devfreq_event_get_edev_by_phandle(dmc->dev, + "devfreq-events", i); if (IS_ERR_OR_NULL(dmc->counter[i])) return -EPROBE_DEFER; } diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 5a57d1985bae..0c3de94b5178 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -846,7 +846,7 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device, } for (i = 0; i < page_count; i++) { - char *src = phys_to_virt(pb[i].pfn << PAGE_SHIFT); + char *src = phys_to_virt(pb[i].pfn << HV_HYP_PAGE_SHIFT); u32 offset = pb[i].offset; u32 len = pb[i].len; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 9869e390875e..261e6e55a907 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -373,32 +373,29 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, return txq; } -static u32 fill_pg_buf(struct page *page, u32 offset, u32 len, +static u32 fill_pg_buf(unsigned long hvpfn, 
u32 offset, u32 len, struct hv_page_buffer *pb) { int j = 0; - /* Deal with compound pages by ignoring unused part - * of the page. - */ - page += (offset >> PAGE_SHIFT); - offset &= ~PAGE_MASK; + hvpfn += offset >> HV_HYP_PAGE_SHIFT; + offset = offset & ~HV_HYP_PAGE_MASK; while (len > 0) { unsigned long bytes; - bytes = PAGE_SIZE - offset; + bytes = HV_HYP_PAGE_SIZE - offset; if (bytes > len) bytes = len; - pb[j].pfn = page_to_pfn(page); + pb[j].pfn = hvpfn; pb[j].offset = offset; pb[j].len = bytes; offset += bytes; len -= bytes; - if (offset == PAGE_SIZE && len) { - page++; + if (offset == HV_HYP_PAGE_SIZE && len) { + hvpfn++; offset = 0; j++; } @@ -421,23 +418,26 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, * 2. skb linear data * 3. skb fragment data */ - slots_used += fill_pg_buf(virt_to_page(hdr), - offset_in_page(hdr), - len, &pb[slots_used]); + slots_used += fill_pg_buf(virt_to_hvpfn(hdr), + offset_in_hvpage(hdr), + len, + &pb[slots_used]); packet->rmsg_size = len; packet->rmsg_pgcnt = slots_used; - slots_used += fill_pg_buf(virt_to_page(data), - offset_in_page(data), - skb_headlen(skb), &pb[slots_used]); + slots_used += fill_pg_buf(virt_to_hvpfn(data), + offset_in_hvpage(data), + skb_headlen(skb), + &pb[slots_used]); for (i = 0; i < frags; i++) { skb_frag_t *frag = skb_shinfo(skb)->frags + i; - slots_used += fill_pg_buf(skb_frag_page(frag), - skb_frag_off(frag), - skb_frag_size(frag), &pb[slots_used]); + slots_used += fill_pg_buf(page_to_hvpfn(skb_frag_page(frag)), + skb_frag_off(frag), + skb_frag_size(frag), + &pb[slots_used]); } return slots_used; } @@ -453,8 +453,8 @@ static int count_skb_frag_slots(struct sk_buff *skb) unsigned long offset = skb_frag_off(frag); /* Skip unused frames from start of page */ - offset &= ~PAGE_MASK; - pages += PFN_UP(offset + size); + offset &= ~HV_HYP_PAGE_MASK; + pages += HVPFN_UP(offset + size); } return pages; } @@ -462,12 +462,12 @@ static int count_skb_frag_slots(struct sk_buff *skb) static 
int netvsc_get_slots(struct sk_buff *skb) { char *data = skb->data; - unsigned int offset = offset_in_page(data); + unsigned int offset = offset_in_hvpage(data); unsigned int len = skb_headlen(skb); int slots; int frag_slots; - slots = DIV_ROUND_UP(offset + len, PAGE_SIZE); + slots = DIV_ROUND_UP(offset + len, HV_HYP_PAGE_SIZE); frag_slots = count_skb_frag_slots(skb); return slots + frag_slots; } diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 12ad471ac5e1..b22e47bcfeca 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -25,7 +25,7 @@ static void rndis_set_multicast(struct work_struct *w); -#define RNDIS_EXT_LEN PAGE_SIZE +#define RNDIS_EXT_LEN HV_HYP_PAGE_SIZE struct rndis_request { struct list_head list_ent; struct completion wait_event; @@ -215,18 +215,17 @@ static int rndis_filter_send_request(struct rndis_device *dev, packet->page_buf_cnt = 1; pb[0].pfn = virt_to_phys(&req->request_msg) >> - PAGE_SHIFT; + HV_HYP_PAGE_SHIFT; pb[0].len = req->request_msg.msg_len; - pb[0].offset = - (unsigned long)&req->request_msg & (PAGE_SIZE - 1); + pb[0].offset = offset_in_hvpage(&req->request_msg); /* Add one page_buf when request_msg crossing page boundary */ - if (pb[0].offset + pb[0].len > PAGE_SIZE) { + if (pb[0].offset + pb[0].len > HV_HYP_PAGE_SIZE) { packet->page_buf_cnt++; - pb[0].len = PAGE_SIZE - + pb[0].len = HV_HYP_PAGE_SIZE - pb[0].offset; pb[1].pfn = virt_to_phys((void *)&req->request_msg - + pb[0].len) >> PAGE_SHIFT; + + pb[0].len) >> HV_HYP_PAGE_SHIFT; pb[1].offset = 0; pb[1].len = req->request_msg.msg_len - pb[0].len; diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 3ca7543142bf..2483e765318a 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -703,12 +703,10 @@ static int _generic_set_opp_regulator(struct opp_table *opp_table, * Enable the regulator after setting its voltages, otherwise it breaks * some boot-enabled regulators. 
*/ - if (unlikely(!opp_table->regulator_enabled)) { + if (unlikely(!opp_table->enabled)) { ret = regulator_enable(reg); if (ret < 0) dev_warn(dev, "Failed to enable regulator: %d", ret); - else - opp_table->regulator_enabled = true; } return 0; @@ -781,29 +779,39 @@ static int _set_opp_custom(const struct opp_table *opp_table, return opp_table->set_opp(data); } +static int _set_required_opp(struct device *dev, struct device *pd_dev, + struct dev_pm_opp *opp, int i) +{ + unsigned int pstate = likely(opp) ? opp->required_opps[i]->pstate : 0; + int ret; + + if (!pd_dev) + return 0; + + ret = dev_pm_genpd_set_performance_state(pd_dev, pstate); + if (ret) { + dev_err(dev, "Failed to set performance rate of %s: %d (%d)\n", + dev_name(pd_dev), pstate, ret); + } + + return ret; +} + /* This is only called for PM domain for now */ static int _set_required_opps(struct device *dev, struct opp_table *opp_table, - struct dev_pm_opp *opp) + struct dev_pm_opp *opp, bool up) { struct opp_table **required_opp_tables = opp_table->required_opp_tables; struct device **genpd_virt_devs = opp_table->genpd_virt_devs; - unsigned int pstate; int i, ret = 0; if (!required_opp_tables) return 0; /* Single genpd case */ - if (!genpd_virt_devs) { - pstate = likely(opp) ? opp->required_opps[0]->pstate : 0; - ret = dev_pm_genpd_set_performance_state(dev, pstate); - if (ret) { - dev_err(dev, "Failed to set performance state of %s: %d (%d)\n", - dev_name(dev), pstate, ret); - } - return ret; - } + if (!genpd_virt_devs) + return _set_required_opp(dev, dev, opp, 0); /* Multiple genpd case */ @@ -813,19 +821,21 @@ static int _set_required_opps(struct device *dev, */ mutex_lock(&opp_table->genpd_virt_dev_lock); - for (i = 0; i < opp_table->required_opp_count; i++) { - pstate = likely(opp) ? 
opp->required_opps[i]->pstate : 0; - - if (!genpd_virt_devs[i]) - continue; - - ret = dev_pm_genpd_set_performance_state(genpd_virt_devs[i], pstate); - if (ret) { - dev_err(dev, "Failed to set performance rate of %s: %d (%d)\n", - dev_name(genpd_virt_devs[i]), pstate, ret); - break; + /* Scaling up? Set required OPPs in normal order, else reverse */ + if (up) { + for (i = 0; i < opp_table->required_opp_count; i++) { + ret = _set_required_opp(dev, genpd_virt_devs[i], opp, i); + if (ret) + break; + } + } else { + for (i = opp_table->required_opp_count - 1; i >= 0; i--) { + ret = _set_required_opp(dev, genpd_virt_devs[i], opp, i); + if (ret) + break; } } + mutex_unlock(&opp_table->genpd_virt_dev_lock); return ret; @@ -862,6 +872,34 @@ int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp) } EXPORT_SYMBOL_GPL(dev_pm_opp_set_bw); +static int _opp_set_rate_zero(struct device *dev, struct opp_table *opp_table) +{ + int ret; + + if (!opp_table->enabled) + return 0; + + /* + * Some drivers need to support cases where some platforms may + * have OPP table for the device, while others don't and + * opp_set_rate() just needs to behave like clk_set_rate(). + */ + if (!_get_opp_count(opp_table)) + return 0; + + ret = _set_opp_bw(opp_table, NULL, dev, true); + if (ret) + return ret; + + if (opp_table->regulators) + regulator_disable(opp_table->regulators[0]); + + ret = _set_required_opps(dev, opp_table, NULL, false); + + opp_table->enabled = false; + return ret; +} + /** * dev_pm_opp_set_rate() - Configure new OPP based on frequency * @dev: device for which we do this operation @@ -888,33 +926,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) } if (unlikely(!target_freq)) { - /* - * Some drivers need to support cases where some platforms may - * have OPP table for the device, while others don't and - * opp_set_rate() just needs to behave like clk_set_rate(). 
- */ - if (!_get_opp_count(opp_table)) { - ret = 0; - goto put_opp_table; - } - - if (!opp_table->required_opp_tables && !opp_table->regulators && - !opp_table->paths) { - dev_err(dev, "target frequency can't be 0\n"); - ret = -EINVAL; - goto put_opp_table; - } - - ret = _set_opp_bw(opp_table, NULL, dev, true); - if (ret) - goto put_opp_table; - - if (opp_table->regulator_enabled) { - regulator_disable(opp_table->regulators[0]); - opp_table->regulator_enabled = false; - } - - ret = _set_required_opps(dev, opp_table, NULL); + ret = _opp_set_rate_zero(dev, opp_table); goto put_opp_table; } @@ -933,14 +945,11 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) old_freq = clk_get_rate(clk); /* Return early if nothing to do */ - if (old_freq == freq) { - if (!opp_table->required_opp_tables && !opp_table->regulators && - !opp_table->paths) { - dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n", - __func__, freq); - ret = 0; - goto put_opp_table; - } + if (opp_table->enabled && old_freq == freq) { + dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n", + __func__, freq); + ret = 0; + goto put_opp_table; } /* @@ -976,7 +985,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) /* Scaling up? Configure required OPPs before frequency */ if (freq >= old_freq) { - ret = _set_required_opps(dev, opp_table, opp); + ret = _set_required_opps(dev, opp_table, opp, true); if (ret) goto put_opp; } @@ -996,13 +1005,16 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) /* Scaling down? 
Configure required OPPs after frequency */ if (!ret && freq < old_freq) { - ret = _set_required_opps(dev, opp_table, opp); + ret = _set_required_opps(dev, opp_table, opp, false); if (ret) dev_err(dev, "Failed to set required opps: %d\n", ret); } - if (!ret) + if (!ret) { ret = _set_opp_bw(opp_table, opp, dev, false); + if (!ret) + opp_table->enabled = true; + } put_opp: dev_pm_opp_put(opp); @@ -1068,7 +1080,7 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) */ opp_table = kzalloc(sizeof(*opp_table), GFP_KERNEL); if (!opp_table) - return NULL; + return ERR_PTR(-ENOMEM); mutex_init(&opp_table->lock); mutex_init(&opp_table->genpd_virt_dev_lock); @@ -1079,8 +1091,8 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) opp_dev = _add_opp_dev(dev, opp_table); if (!opp_dev) { - kfree(opp_table); - return NULL; + ret = -ENOMEM; + goto err; } _of_init_opp_table(opp_table, dev, index); @@ -1089,16 +1101,21 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) opp_table->clk = clk_get(dev, NULL); if (IS_ERR(opp_table->clk)) { ret = PTR_ERR(opp_table->clk); - if (ret != -EPROBE_DEFER) - dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__, - ret); + if (ret == -EPROBE_DEFER) + goto err; + + dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__, ret); } /* Find interconnect path(s) for the device */ ret = dev_pm_opp_of_find_icc_paths(dev, opp_table); - if (ret) + if (ret) { + if (ret == -EPROBE_DEFER) + goto err; + dev_warn(dev, "%s: Error finding interconnect paths: %d\n", __func__, ret); + } BLOCKING_INIT_NOTIFIER_HEAD(&opp_table->head); INIT_LIST_HEAD(&opp_table->opp_list); @@ -1107,6 +1124,10 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) /* Secure the device table modification */ list_add(&opp_table->node, &opp_tables); return opp_table; + +err: + kfree(opp_table); + return ERR_PTR(ret); } void _get_opp_table_kref(struct opp_table *opp_table) @@ -1129,7 
+1150,7 @@ static struct opp_table *_opp_get_opp_table(struct device *dev, int index) if (opp_table) { if (!_add_opp_dev_unlocked(dev, opp_table)) { dev_pm_opp_put_opp_table(opp_table); - opp_table = NULL; + opp_table = ERR_PTR(-ENOMEM); } goto unlock; } @@ -1581,8 +1602,8 @@ struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev, struct opp_table *opp_table; opp_table = dev_pm_opp_get_opp_table(dev); - if (!opp_table) - return ERR_PTR(-ENOMEM); + if (IS_ERR(opp_table)) + return opp_table; /* Make sure there are no concurrent readers while updating opp_table */ WARN_ON(!list_empty(&opp_table->opp_list)); @@ -1640,8 +1661,8 @@ struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name) struct opp_table *opp_table; opp_table = dev_pm_opp_get_opp_table(dev); - if (!opp_table) - return ERR_PTR(-ENOMEM); + if (IS_ERR(opp_table)) + return opp_table; /* Make sure there are no concurrent readers while updating opp_table */ WARN_ON(!list_empty(&opp_table->opp_list)); @@ -1733,8 +1754,8 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev, int ret, i; opp_table = dev_pm_opp_get_opp_table(dev); - if (!opp_table) - return ERR_PTR(-ENOMEM); + if (IS_ERR(opp_table)) + return opp_table; /* This should be called before OPPs are initialized */ if (WARN_ON(!list_empty(&opp_table->opp_list))) { @@ -1804,11 +1825,9 @@ void dev_pm_opp_put_regulators(struct opp_table *opp_table) /* Make sure there are no concurrent readers while updating opp_table */ WARN_ON(!list_empty(&opp_table->opp_list)); - if (opp_table->regulator_enabled) { + if (opp_table->enabled) { for (i = opp_table->regulator_count - 1; i >= 0; i--) regulator_disable(opp_table->regulators[i]); - - opp_table->regulator_enabled = false; } for (i = opp_table->regulator_count - 1; i >= 0; i--) @@ -1843,8 +1862,8 @@ struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name) int ret; opp_table = dev_pm_opp_get_opp_table(dev); - if (!opp_table) - return 
ERR_PTR(-ENOMEM); + if (IS_ERR(opp_table)) + return opp_table; /* This should be called before OPPs are initialized */ if (WARN_ON(!list_empty(&opp_table->opp_list))) { @@ -1911,8 +1930,8 @@ struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, return ERR_PTR(-EINVAL); opp_table = dev_pm_opp_get_opp_table(dev); - if (!opp_table) - return ERR_PTR(-ENOMEM); + if (IS_ERR(opp_table)) + return opp_table; /* This should be called before OPPs are initialized */ if (WARN_ON(!list_empty(&opp_table->opp_list))) { @@ -1949,6 +1968,9 @@ static void _opp_detach_genpd(struct opp_table *opp_table) { int index; + if (!opp_table->genpd_virt_devs) + return; + for (index = 0; index < opp_table->required_opp_count; index++) { if (!opp_table->genpd_virt_devs[index]) continue; @@ -1992,8 +2014,11 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **name = names; opp_table = dev_pm_opp_get_opp_table(dev); - if (!opp_table) - return ERR_PTR(-ENOMEM); + if (IS_ERR(opp_table)) + return opp_table; + + if (opp_table->genpd_virt_devs) + return opp_table; /* * If the genpd's OPP table isn't already initialized, parsing of the @@ -2020,12 +2045,6 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, goto err; } - if (opp_table->genpd_virt_devs[index]) { - dev_err(dev, "Genpd virtual device already set %s\n", - *name); - goto err; - } - virt_dev = dev_pm_domain_attach_by_name(dev, *name); if (IS_ERR(virt_dev)) { ret = PTR_ERR(virt_dev); @@ -2098,9 +2117,6 @@ int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, int dest_pstate = -EINVAL; int i; - if (!pstate) - return 0; - /* * Normally the src_table will have the "required_opps" property set to * point to one of the OPPs in the dst_table, but in some cases the @@ -2163,8 +2179,8 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) int ret; opp_table = dev_pm_opp_get_opp_table(dev); - if (!opp_table) - return -ENOMEM; + if (IS_ERR(opp_table)) + 
return PTR_ERR(opp_table); /* Fix regulator count for dynamic OPPs */ opp_table->regulator_count = 1; @@ -2405,7 +2421,14 @@ int dev_pm_opp_unregister_notifier(struct device *dev, } EXPORT_SYMBOL(dev_pm_opp_unregister_notifier); -void _dev_pm_opp_find_and_remove_table(struct device *dev) +/** + * dev_pm_opp_remove_table() - Free all OPPs associated with the device + * @dev: device pointer used to lookup OPP table. + * + * Free both OPPs created using static entries present in DT and the + * dynamically added entries. + */ +void dev_pm_opp_remove_table(struct device *dev) { struct opp_table *opp_table; @@ -2432,16 +2455,4 @@ void _dev_pm_opp_find_and_remove_table(struct device *dev) /* Drop reference taken by _find_opp_table() */ dev_pm_opp_put_opp_table(opp_table); } - -/** - * dev_pm_opp_remove_table() - Free all OPPs associated with the device - * @dev: device pointer used to lookup OPP table. - * - * Free both OPPs created using static entries present in DT and the - * dynamically added entries. 
- */ -void dev_pm_opp_remove_table(struct device *dev) -{ - _dev_pm_opp_find_and_remove_table(dev); -} EXPORT_SYMBOL_GPL(dev_pm_opp_remove_table); diff --git a/drivers/opp/cpu.c b/drivers/opp/cpu.c index b5055cc886ef..5004335cf0de 100644 --- a/drivers/opp/cpu.c +++ b/drivers/opp/cpu.c @@ -124,7 +124,7 @@ void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, continue; } - _dev_pm_opp_find_and_remove_table(cpu_dev); + dev_pm_opp_remove_table(cpu_dev); } } diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 0430290670ab..874b58756220 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -434,9 +434,9 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_find_icc_paths); static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table, struct device_node *np) { - unsigned int count = opp_table->supported_hw_count; - u32 version; - int ret; + unsigned int levels = opp_table->supported_hw_count; + int count, versions, ret, i, j; + u32 val; if (!opp_table->supported_hw) { /* @@ -451,21 +451,40 @@ static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table, return true; } - while (count--) { - ret = of_property_read_u32_index(np, "opp-supported-hw", count, - &version); - if (ret) { - dev_warn(dev, "%s: failed to read opp-supported-hw property at index %d: %d\n", - __func__, count, ret); - return false; - } - - /* Both of these are bitwise masks of the versions */ - if (!(version & opp_table->supported_hw[count])) - return false; + count = of_property_count_u32_elems(np, "opp-supported-hw"); + if (count <= 0 || count % levels) { + dev_err(dev, "%s: Invalid opp-supported-hw property (%d)\n", + __func__, count); + return false; } - return true; + versions = count / levels; + + /* All levels in at least one of the versions should match */ + for (i = 0; i < versions; i++) { + bool supported = true; + + for (j = 0; j < levels; j++) { + ret = of_property_read_u32_index(np, "opp-supported-hw", + i * levels + j, &val); + if (ret) { + dev_warn(dev, 
"%s: failed to read opp-supported-hw property at index %d: %d\n", + __func__, i * levels + j, ret); + return false; + } + + /* Check if the level is supported */ + if (!(val & opp_table->supported_hw[j])) { + supported = false; + break; + } + } + + if (supported) + return true; + } + + return false; } static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, @@ -616,7 +635,7 @@ free_microvolt: */ void dev_pm_opp_of_remove_table(struct device *dev) { - _dev_pm_opp_find_and_remove_table(dev); + dev_pm_opp_remove_table(dev); } EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table); @@ -823,7 +842,7 @@ free_opp: static int _of_add_opp_table_v2(struct device *dev, struct opp_table *opp_table) { struct device_node *np; - int ret, count = 0, pstate_count = 0; + int ret, count = 0; struct dev_pm_opp *opp; /* OPP table is already initialized for the device */ @@ -857,20 +876,14 @@ static int _of_add_opp_table_v2(struct device *dev, struct opp_table *opp_table) goto remove_static_opp; } - list_for_each_entry(opp, &opp_table->opp_list, node) - pstate_count += !!opp->pstate; - - /* Either all or none of the nodes shall have performance state set */ - if (pstate_count && pstate_count != count) { - dev_err(dev, "Not all nodes have performance state set (%d: %d)\n", - count, pstate_count); - ret = -ENOENT; - goto remove_static_opp; + list_for_each_entry(opp, &opp_table->opp_list, node) { + /* Any non-zero performance state would enable the feature */ + if (opp->pstate) { + opp_table->genpd_performance_state = true; + break; + } } - if (pstate_count) - opp_table->genpd_performance_state = true; - return 0; remove_static_opp: @@ -886,11 +899,25 @@ static int _of_add_opp_table_v1(struct device *dev, struct opp_table *opp_table) const __be32 *val; int nr, ret = 0; + mutex_lock(&opp_table->lock); + if (opp_table->parsed_static_opps) { + opp_table->parsed_static_opps++; + mutex_unlock(&opp_table->lock); + return 0; + } + + opp_table->parsed_static_opps = 1; + 
mutex_unlock(&opp_table->lock); + prop = of_find_property(dev->of_node, "operating-points", NULL); - if (!prop) - return -ENODEV; - if (!prop->value) - return -ENODATA; + if (!prop) { + ret = -ENODEV; + goto remove_static_opp; + } + if (!prop->value) { + ret = -ENODATA; + goto remove_static_opp; + } /* * Each OPP is a set of tuples consisting of frequency and @@ -899,13 +926,10 @@ static int _of_add_opp_table_v1(struct device *dev, struct opp_table *opp_table) nr = prop->length / sizeof(u32); if (nr % 2) { dev_err(dev, "%s: Invalid OPP table\n", __func__); - return -EINVAL; + ret = -EINVAL; + goto remove_static_opp; } - mutex_lock(&opp_table->lock); - opp_table->parsed_static_opps = 1; - mutex_unlock(&opp_table->lock); - val = prop->value; while (nr) { unsigned long freq = be32_to_cpup(val++) * 1000; @@ -915,12 +939,16 @@ static int _of_add_opp_table_v1(struct device *dev, struct opp_table *opp_table) if (ret) { dev_err(dev, "%s: Failed to add OPP %ld (%d)\n", __func__, freq, ret); - _opp_remove_all_static(opp_table); - return ret; + goto remove_static_opp; } nr -= 2; } + return 0; + +remove_static_opp: + _opp_remove_all_static(opp_table); + return ret; } @@ -947,8 +975,8 @@ int dev_pm_opp_of_add_table(struct device *dev) int ret; opp_table = dev_pm_opp_get_opp_table_indexed(dev, 0); - if (!opp_table) - return -ENOMEM; + if (IS_ERR(opp_table)) + return PTR_ERR(opp_table); /* * OPPs have two version of bindings now. 
Also try the old (v1) @@ -1002,8 +1030,8 @@ int dev_pm_opp_of_add_table_indexed(struct device *dev, int index) } opp_table = dev_pm_opp_get_opp_table_indexed(dev, index); - if (!opp_table) - return -ENOMEM; + if (IS_ERR(opp_table)) + return PTR_ERR(opp_table); ret = _of_add_opp_table_v2(dev, opp_table); if (ret) diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index c3fcd571e446..ebd930e0b3ca 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -147,11 +147,11 @@ enum opp_table_access { * @clk: Device's clock handle * @regulators: Supply regulators * @regulator_count: Number of power supply regulators. Its value can be -1 - * @regulator_enabled: Set to true if regulators were previously enabled. * (uninitialized), 0 (no opp-microvolt property) or > 0 (has opp-microvolt * property). * @paths: Interconnect path handles * @path_count: Number of interconnect paths + * @enabled: Set to true if the device's resources are enabled/configured. * @genpd_performance_state: Device's power domain support performance state. * @is_genpd: Marks if the OPP table belongs to a genpd. 
* @set_opp: Platform specific set_opp callback @@ -195,9 +195,9 @@ struct opp_table { struct clk *clk; struct regulator **regulators; int regulator_count; - bool regulator_enabled; struct icc_path **paths; unsigned int path_count; + bool enabled; bool genpd_performance_state; bool is_genpd; @@ -217,7 +217,6 @@ void _get_opp_table_kref(struct opp_table *opp_table); int _get_opp_count(struct opp_table *opp_table); struct opp_table *_find_opp_table(struct device *dev); struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table); -void _dev_pm_opp_find_and_remove_table(struct device *dev); struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table); void _opp_free(struct dev_pm_opp *opp); int _opp_compare_key(struct dev_pm_opp *opp1, struct dev_pm_opp *opp2); diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 25b4c9023bfa..4e992403fffe 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2507,7 +2507,10 @@ static void hv_pci_onchannelcallback(void *context) /** * hv_pci_protocol_negotiation() - Set up protocol - * @hdev: VMBus's tracking struct for this root PCI bus + * @hdev: VMBus's tracking struct for this root PCI bus. + * @version: Array of supported channel protocol versions in + * the order of probing - highest go first. + * @num_version: Number of elements in the version array. * * This driver is intended to support running on Windows 10 * (server) and later versions. It will not run on earlier diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index d5869a03f748..d9aa551f8423 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -944,6 +944,16 @@ static bool acpi_pci_bridge_d3(struct pci_dev *dev) if (!dev->is_hotplug_bridge) return false; + /* Assume D3 support if the bridge is power-manageable by ACPI. 
*/ + adev = ACPI_COMPANION(&dev->dev); + if (!adev && !pci_dev_is_added(dev)) { + adev = acpi_pci_find_companion(&dev->dev); + ACPI_COMPANION_SET(&dev->dev, adev); + } + + if (adev && acpi_device_power_manageable(adev)) + return true; + /* * Look for a special _DSD property for the root port and if it * is set we know the hierarchy behind it supports D3 just fine. diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c index 1762f335bac9..ecd477964d11 100644 --- a/drivers/platform/x86/hp-wmi.c +++ b/drivers/platform/x86/hp-wmi.c @@ -81,6 +81,7 @@ enum hp_wmi_commandtype { HPWMI_FEATURE2_QUERY = 0x0d, HPWMI_WIRELESS2_QUERY = 0x1b, HPWMI_POSTCODEERROR_QUERY = 0x2a, + HPWMI_THERMAL_POLICY_QUERY = 0x4c, }; enum hp_wmi_command { @@ -861,6 +862,26 @@ fail: return err; } +static int thermal_policy_setup(struct platform_device *device) +{ + int err, tp; + + tp = hp_wmi_read_int(HPWMI_THERMAL_POLICY_QUERY); + if (tp < 0) + return tp; + + /* + * call thermal policy write command to ensure that the firmware correctly + * sets the OEM variables for the DPTF + */ + err = hp_wmi_perform_query(HPWMI_THERMAL_POLICY_QUERY, HPWMI_WRITE, &tp, + sizeof(tp), 0); + if (err) + return err; + + return 0; +} + static int __init hp_wmi_bios_setup(struct platform_device *device) { /* clear detected rfkill devices */ @@ -872,6 +893,8 @@ static int __init hp_wmi_bios_setup(struct platform_device *device) if (hp_wmi_rfkill_setup(device)) hp_wmi_rfkill2_setup(device); + thermal_policy_setup(device); + return 0; } diff --git a/drivers/platform/x86/intel_pmc_core.c b/drivers/platform/x86/intel_pmc_core.c index 338ea5222555..3e5fe66333f1 100644 --- a/drivers/platform/x86/intel_pmc_core.c +++ b/drivers/platform/x86/intel_pmc_core.c @@ -118,6 +118,10 @@ static const struct pmc_bit_map spt_pfear_map[] = { }; static const struct pmc_bit_map *ext_spt_pfear_map[] = { + /* + * Check intel_pmc_core_ids[] users of spt_reg_map for + * a list of core SoCs using this. 
+ */ spt_pfear_map, NULL }; @@ -154,6 +158,7 @@ static const struct pmc_reg_map spt_reg_map = { .ltr_show_sts = spt_ltr_show_map, .msr_sts = msr_map, .slp_s0_offset = SPT_PMC_SLP_S0_RES_COUNTER_OFFSET, + .slp_s0_res_counter_step = SPT_PMC_SLP_S0_RES_COUNTER_STEP, .ltr_ignore_offset = SPT_PMC_LTR_IGNORE_OFFSET, .regmap_length = SPT_PMC_MMIO_REG_LEN, .ppfear0_offset = SPT_PMC_XRAM_PPFEAR0A, @@ -166,7 +171,6 @@ static const struct pmc_reg_map spt_reg_map = { /* Cannon Lake: PGD PFET Enable Ack Status Register(s) bitmap */ static const struct pmc_bit_map cnp_pfear_map[] = { - /* Reserved for Cannon Lake but valid for Comet Lake */ {"PMC", BIT(0)}, {"OPI-DMI", BIT(1)}, {"SPI/eSPI", BIT(2)}, @@ -192,10 +196,6 @@ static const struct pmc_bit_map cnp_pfear_map[] = { {"SDX", BIT(4)}, {"SPE", BIT(5)}, {"Fuse", BIT(6)}, - /* - * Reserved for Cannon Lake but valid for Ice Lake, Comet Lake, - * Tiger Lake, Elkhart Lake and Jasper Lake. - */ {"SBR8", BIT(7)}, {"CSME_FSC", BIT(0)}, @@ -239,10 +239,6 @@ static const struct pmc_bit_map cnp_pfear_map[] = { {"HDA_PGD4", BIT(2)}, {"HDA_PGD5", BIT(3)}, {"HDA_PGD6", BIT(4)}, - /* - * Reserved for Cannon Lake but valid for Ice Lake, Comet Lake, - * Tiger Lake, ELkhart Lake and Jasper Lake. - */ {"PSF6", BIT(5)}, {"PSF7", BIT(6)}, {"PSF8", BIT(7)}, @@ -250,12 +246,15 @@ static const struct pmc_bit_map cnp_pfear_map[] = { }; static const struct pmc_bit_map *ext_cnp_pfear_map[] = { + /* + * Check intel_pmc_core_ids[] users of cnp_reg_map for + * a list of core SoCs using this. + */ cnp_pfear_map, NULL }; static const struct pmc_bit_map icl_pfear_map[] = { - /* Ice Lake and Jasper Lake generation onwards only */ {"RES_65", BIT(0)}, {"RES_66", BIT(1)}, {"RES_67", BIT(2)}, @@ -268,13 +267,16 @@ static const struct pmc_bit_map icl_pfear_map[] = { }; static const struct pmc_bit_map *ext_icl_pfear_map[] = { + /* + * Check intel_pmc_core_ids[] users of icl_reg_map for + * a list of core SoCs using this. 
+ */ cnp_pfear_map, icl_pfear_map, NULL }; static const struct pmc_bit_map tgl_pfear_map[] = { - /* Tiger Lake and Elkhart Lake generation onwards only */ {"PSF9", BIT(0)}, {"RES_66", BIT(1)}, {"RES_67", BIT(2)}, @@ -286,6 +288,10 @@ static const struct pmc_bit_map tgl_pfear_map[] = { }; static const struct pmc_bit_map *ext_tgl_pfear_map[] = { + /* + * Check intel_pmc_core_ids[] users of tgl_reg_map for + * a list of core SoCs using this. + */ cnp_pfear_map, tgl_pfear_map, NULL @@ -369,7 +375,10 @@ static const struct pmc_bit_map cnp_ltr_show_map[] = { {"ISH", CNP_PMC_LTR_ISH}, {"UFSX2", CNP_PMC_LTR_UFSX2}, {"EMMC", CNP_PMC_LTR_EMMC}, - /* Reserved for Cannon Lake but valid for Ice Lake */ + /* + * Check intel_pmc_core_ids[] users of cnp_reg_map for + * a list of core SoCs using this. + */ {"WIGIG", ICL_PMC_LTR_WIGIG}, /* Below two cannot be used for LTR_IGNORE */ {"CURRENT_PLATFORM", CNP_PMC_LTR_CUR_PLT}, @@ -380,6 +389,7 @@ static const struct pmc_bit_map cnp_ltr_show_map[] = { static const struct pmc_reg_map cnp_reg_map = { .pfear_sts = ext_cnp_pfear_map, .slp_s0_offset = CNP_PMC_SLP_S0_RES_COUNTER_OFFSET, + .slp_s0_res_counter_step = SPT_PMC_SLP_S0_RES_COUNTER_STEP, .slps0_dbg_maps = cnp_slps0_dbg_maps, .ltr_show_sts = cnp_ltr_show_map, .msr_sts = msr_map, @@ -396,6 +406,7 @@ static const struct pmc_reg_map cnp_reg_map = { static const struct pmc_reg_map icl_reg_map = { .pfear_sts = ext_icl_pfear_map, .slp_s0_offset = CNP_PMC_SLP_S0_RES_COUNTER_OFFSET, + .slp_s0_res_counter_step = ICL_PMC_SLP_S0_RES_COUNTER_STEP, .slps0_dbg_maps = cnp_slps0_dbg_maps, .ltr_show_sts = cnp_ltr_show_map, .msr_sts = msr_map, @@ -409,7 +420,7 @@ static const struct pmc_reg_map icl_reg_map = { .ltr_ignore_max = ICL_NUM_IP_IGN_ALLOWED, }; -static const struct pmc_bit_map tgl_lpm0_map[] = { +static const struct pmc_bit_map tgl_clocksource_status_map[] = { {"USB2PLL_OFF_STS", BIT(18)}, {"PCIe/USB3.1_Gen2PLL_OFF_STS", BIT(19)}, {"PCIe_Gen3PLL_OFF_STS", BIT(20)}, @@ -425,35 +436,35 @@ 
static const struct pmc_bit_map tgl_lpm0_map[] = { {} }; -static const struct pmc_bit_map tgl_lpm1_map[] = { - {"SPI_PG_STS", BIT(2)}, - {"xHCI_PG_STS", BIT(3)}, - {"PCIe_Ctrller_A_PG_STS", BIT(4)}, - {"PCIe_Ctrller_B_PG_STS", BIT(5)}, - {"PCIe_Ctrller_C_PG_STS", BIT(6)}, - {"GBE_PG_STS", BIT(7)}, - {"SATA_PG_STS", BIT(8)}, - {"HDA0_PG_STS", BIT(9)}, - {"HDA1_PG_STS", BIT(10)}, - {"HDA2_PG_STS", BIT(11)}, - {"HDA3_PG_STS", BIT(12)}, - {"PCIe_Ctrller_D_PG_STS", BIT(13)}, - {"ISIO_PG_STS", BIT(14)}, - {"SMB_PG_STS", BIT(16)}, - {"ISH_PG_STS", BIT(17)}, - {"ITH_PG_STS", BIT(19)}, - {"SDX_PG_STS", BIT(20)}, - {"xDCI_PG_STS", BIT(25)}, - {"DCI_PG_STS", BIT(26)}, - {"CSME0_PG_STS", BIT(27)}, - {"CSME_KVM_PG_STS", BIT(28)}, - {"CSME1_PG_STS", BIT(29)}, - {"CSME_CLINK_PG_STS", BIT(30)}, - {"CSME2_PG_STS", BIT(31)}, +static const struct pmc_bit_map tgl_power_gating_status_map[] = { + {"CSME_PG_STS", BIT(0)}, + {"SATA_PG_STS", BIT(1)}, + {"xHCI_PG_STS", BIT(2)}, + {"UFSX2_PG_STS", BIT(3)}, + {"OTG_PG_STS", BIT(5)}, + {"SPA_PG_STS", BIT(6)}, + {"SPB_PG_STS", BIT(7)}, + {"SPC_PG_STS", BIT(8)}, + {"SPD_PG_STS", BIT(9)}, + {"SPE_PG_STS", BIT(10)}, + {"SPF_PG_STS", BIT(11)}, + {"LSX_PG_STS", BIT(13)}, + {"P2SB_PG_STS", BIT(14)}, + {"PSF_PG_STS", BIT(15)}, + {"SBR_PG_STS", BIT(16)}, + {"OPIDMI_PG_STS", BIT(17)}, + {"THC0_PG_STS", BIT(18)}, + {"THC1_PG_STS", BIT(19)}, + {"GBETSN_PG_STS", BIT(20)}, + {"GBE_PG_STS", BIT(21)}, + {"LPSS_PG_STS", BIT(22)}, + {"MMP_UFSX2_PG_STS", BIT(23)}, + {"MMP_UFSX2B_PG_STS", BIT(24)}, + {"FIA_PG_STS", BIT(25)}, {} }; -static const struct pmc_bit_map tgl_lpm2_map[] = { +static const struct pmc_bit_map tgl_d3_status_map[] = { {"ADSP_D3_STS", BIT(0)}, {"SATA_D3_STS", BIT(1)}, {"xHCI0_D3_STS", BIT(2)}, @@ -468,7 +479,7 @@ static const struct pmc_bit_map tgl_lpm2_map[] = { {} }; -static const struct pmc_bit_map tgl_lpm3_map[] = { +static const struct pmc_bit_map tgl_vnn_req_status_map[] = { {"GPIO_COM0_VNN_REQ_STS", BIT(1)}, {"GPIO_COM1_VNN_REQ_STS", 
BIT(2)}, {"GPIO_COM2_VNN_REQ_STS", BIT(3)}, @@ -493,7 +504,7 @@ static const struct pmc_bit_map tgl_lpm3_map[] = { {} }; -static const struct pmc_bit_map tgl_lpm4_map[] = { +static const struct pmc_bit_map tgl_vnn_misc_status_map[] = { {"CPU_C10_REQ_STS_0", BIT(0)}, {"PCIe_LPM_En_REQ_STS_3", BIT(3)}, {"ITH_REQ_STS_5", BIT(5)}, @@ -509,7 +520,7 @@ static const struct pmc_bit_map tgl_lpm4_map[] = { {} }; -static const struct pmc_bit_map tgl_lpm5_map[] = { +static const struct pmc_bit_map tgl_signal_status_map[] = { {"LSX_Wake0_En_STS", BIT(0)}, {"LSX_Wake0_Pol_STS", BIT(1)}, {"LSX_Wake1_En_STS", BIT(2)}, @@ -546,18 +557,19 @@ static const struct pmc_bit_map tgl_lpm5_map[] = { }; static const struct pmc_bit_map *tgl_lpm_maps[] = { - tgl_lpm0_map, - tgl_lpm1_map, - tgl_lpm2_map, - tgl_lpm3_map, - tgl_lpm4_map, - tgl_lpm5_map, + tgl_clocksource_status_map, + tgl_power_gating_status_map, + tgl_d3_status_map, + tgl_vnn_req_status_map, + tgl_vnn_misc_status_map, + tgl_signal_status_map, NULL }; static const struct pmc_reg_map tgl_reg_map = { .pfear_sts = ext_tgl_pfear_map, .slp_s0_offset = CNP_PMC_SLP_S0_RES_COUNTER_OFFSET, + .slp_s0_res_counter_step = TGL_PMC_SLP_S0_RES_COUNTER_STEP, .ltr_show_sts = cnp_ltr_show_map, .msr_sts = msr_map, .ltr_ignore_offset = CNP_PMC_LTR_IGNORE_OFFSET, @@ -586,9 +598,9 @@ static inline void pmc_core_reg_write(struct pmc_dev *pmcdev, int reg_offset, writel(val, pmcdev->regbase + reg_offset); } -static inline u64 pmc_core_adjust_slp_s0_step(u32 value) +static inline u64 pmc_core_adjust_slp_s0_step(struct pmc_dev *pmcdev, u32 value) { - return (u64)value * SPT_PMC_SLP_S0_RES_COUNTER_STEP; + return (u64)value * pmcdev->map->slp_s0_res_counter_step; } static int pmc_core_dev_state_get(void *data, u64 *val) @@ -598,7 +610,7 @@ static int pmc_core_dev_state_get(void *data, u64 *val) u32 value; value = pmc_core_reg_read(pmcdev, map->slp_s0_offset); - *val = pmc_core_adjust_slp_s0_step(value); + *val = pmc_core_adjust_slp_s0_step(pmcdev, value); 
return 0; } @@ -628,7 +640,7 @@ static void pmc_core_slps0_display(struct pmc_dev *pmcdev, struct device *dev, offset += 4; while (map->name) { if (dev) - dev_dbg(dev, "SLP_S0_DBG: %-32s\tState: %s\n", + dev_info(dev, "SLP_S0_DBG: %-32s\tState: %s\n", map->name, data & map->bit_mask ? "Yes" : "No"); if (s) @@ -671,7 +683,7 @@ static void pmc_core_lpm_display(struct pmc_dev *pmcdev, struct device *dev, for (idx = 0; idx < arr_size; idx++) { if (dev) - dev_dbg(dev, "\nLPM_%s_%d:\t0x%x\n", str, idx, + dev_info(dev, "\nLPM_%s_%d:\t0x%x\n", str, idx, lpm_regs[idx]); if (s) seq_printf(s, "\nLPM_%s_%d:\t0x%x\n", str, idx, @@ -679,7 +691,7 @@ static void pmc_core_lpm_display(struct pmc_dev *pmcdev, struct device *dev, for (index = 0; maps[idx][index].name && index < len; index++) { bit_mask = maps[idx][index].bit_mask; if (dev) - dev_dbg(dev, "%-30s %-30d\n", + dev_info(dev, "%-30s %-30d\n", maps[idx][index].name, lpm_regs[idx] & bit_mask ? 1 : 0); if (s) @@ -1147,6 +1159,7 @@ static const struct x86_cpu_id intel_pmc_core_ids[] = { X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_reg_map), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &tgl_reg_map), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &icl_reg_map), + X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &tgl_reg_map), {} }; diff --git a/drivers/platform/x86/intel_pmc_core.h b/drivers/platform/x86/intel_pmc_core.h index 5eae55d80226..f33cd2c34835 100644 --- a/drivers/platform/x86/intel_pmc_core.h +++ b/drivers/platform/x86/intel_pmc_core.h @@ -30,7 +30,7 @@ #define SPT_PMC_MPHY_CORE_STS_1 0x1142 #define SPT_PMC_MPHY_COM_STS_0 0x1155 #define SPT_PMC_MMIO_REG_LEN 0x1000 -#define SPT_PMC_SLP_S0_RES_COUNTER_STEP 0x64 +#define SPT_PMC_SLP_S0_RES_COUNTER_STEP 0x68 #define PMC_BASE_ADDR_MASK ~(SPT_PMC_MMIO_REG_LEN - 1) #define MTPMC_MASK 0xffff0000 #define PPFEAR_MAX_NUM_ENTRIES 12 @@ -185,8 +185,10 @@ enum ppfear_regs { #define ICL_PPFEAR_NUM_ENTRIES 9 #define ICL_NUM_IP_IGN_ALLOWED 20 #define ICL_PMC_LTR_WIGIG 0x1BFC +#define 
ICL_PMC_SLP_S0_RES_COUNTER_STEP 0x64 #define TGL_NUM_IP_IGN_ALLOWED 22 +#define TGL_PMC_SLP_S0_RES_COUNTER_STEP 0x7A /* * Tigerlake Power Management Controller register offsets @@ -245,6 +247,7 @@ struct pmc_reg_map { const struct pmc_bit_map *msr_sts; const struct pmc_bit_map **lpm_sts; const u32 slp_s0_offset; + const int slp_s0_res_counter_step; const u32 ltr_ignore_offset; const int regmap_length; const u32 ppfear0_offset; diff --git a/drivers/platform/x86/mlx-platform.c b/drivers/platform/x86/mlx-platform.c index 1506ec0a4777..986ad3dda1c1 100644 --- a/drivers/platform/x86/mlx-platform.c +++ b/drivers/platform/x86/mlx-platform.c @@ -328,15 +328,6 @@ static struct i2c_board_info mlxplat_mlxcpld_psu[] = { }, }; -static struct i2c_board_info mlxplat_mlxcpld_ng_psu[] = { - { - I2C_BOARD_INFO("24c32", 0x51), - }, - { - I2C_BOARD_INFO("24c32", 0x50), - }, -}; - static struct i2c_board_info mlxplat_mlxcpld_pwr[] = { { I2C_BOARD_INFO("dps460", 0x59), @@ -770,15 +761,13 @@ static struct mlxreg_core_data mlxplat_mlxcpld_default_ng_psu_items_data[] = { .label = "psu1", .reg = MLXPLAT_CPLD_LPC_REG_PSU_OFFSET, .mask = BIT(0), - .hpdev.brdinfo = &mlxplat_mlxcpld_ng_psu[0], - .hpdev.nr = MLXPLAT_CPLD_PSU_MSNXXXX_NR, + .hpdev.nr = MLXPLAT_CPLD_NR_NONE, }, { .label = "psu2", .reg = MLXPLAT_CPLD_LPC_REG_PSU_OFFSET, .mask = BIT(1), - .hpdev.brdinfo = &mlxplat_mlxcpld_ng_psu[1], - .hpdev.nr = MLXPLAT_CPLD_PSU_MSNXXXX_NR, + .hpdev.nr = MLXPLAT_CPLD_NR_NONE, }, }; @@ -1950,6 +1939,7 @@ static struct mlxreg_core_data mlxplat_mlxcpld_default_fan_data[] = { static struct mlxreg_core_platform_data mlxplat_default_fan_data = { .data = mlxplat_mlxcpld_default_fan_data, .counter = ARRAY_SIZE(mlxplat_mlxcpld_default_fan_data), + .capability = MLXPLAT_CPLD_LPC_REG_FAN_DRW_CAP_OFFSET, }; /* Watchdog type1: hardware implementation version1 diff --git a/drivers/pnp/isapnp/compat.c b/drivers/pnp/isapnp/compat.c index 6c845b628316..035e95092489 100644 --- a/drivers/pnp/isapnp/compat.c +++ 
b/drivers/pnp/isapnp/compat.c @@ -21,28 +21,6 @@ static void pnp_convert_id(char *buf, unsigned short vendor, (device >> 12) & 0x0f, (device >> 8) & 0x0f); } -struct pnp_card *pnp_find_card(unsigned short vendor, unsigned short device, - struct pnp_card *from) -{ - char id[8]; - char any[8]; - struct list_head *list; - - pnp_convert_id(id, vendor, device); - pnp_convert_id(any, ISAPNP_ANY_ID, ISAPNP_ANY_ID); - - list = from ? from->global_list.next : pnp_cards.next; - - while (list != &pnp_cards) { - struct pnp_card *card = global_to_pnp_card(list); - - if (compare_pnp_id(card->id, id) || (memcmp(id, any, 7) == 0)) - return card; - list = list->next; - } - return NULL; -} - struct pnp_dev *pnp_find_dev(struct pnp_card *card, unsigned short vendor, unsigned short function, struct pnp_dev *from) { @@ -86,5 +64,4 @@ struct pnp_dev *pnp_find_dev(struct pnp_card *card, unsigned short vendor, return NULL; } -EXPORT_SYMBOL(pnp_find_card); EXPORT_SYMBOL(pnp_find_dev); diff --git a/drivers/pnp/quirks.c b/drivers/pnp/quirks.c index de99f371d362..ac98b9919029 100644 --- a/drivers/pnp/quirks.c +++ b/drivers/pnp/quirks.c @@ -226,8 +226,6 @@ static void quirk_ad1815_mpu_resources(struct pnp_dev *dev) dev_info(&dev->dev, "made independent IRQ optional\n"); } -#include - static void quirk_system_pci_resources(struct pnp_dev *dev) { struct pci_dev *pdev = NULL; diff --git a/drivers/power/avs/qcom-cpr.c b/drivers/power/avs/qcom-cpr.c index bd7c3e48b386..b24cc77d1889 100644 --- a/drivers/power/avs/qcom-cpr.c +++ b/drivers/power/avs/qcom-cpr.c @@ -665,8 +665,6 @@ static int cpr_enable(struct cpr_drv *drv) static int cpr_disable(struct cpr_drv *drv) { - int ret; - mutex_lock(&drv->lock); if (cpr_is_allowed(drv)) { @@ -676,11 +674,7 @@ static int cpr_disable(struct cpr_drv *drv) mutex_unlock(&drv->lock); - ret = regulator_disable(drv->vdd_apc); - if (ret) - return ret; - - return 0; + return regulator_disable(drv->vdd_apc); } static int cpr_config(struct cpr_drv *drv) diff --git 
a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c index 4310901a074e..6e1a0043c411 100644 --- a/drivers/powercap/idle_inject.c +++ b/drivers/powercap/idle_inject.c @@ -43,6 +43,7 @@ #include #include #include +#include #include diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 8f5f5dc863a4..0c65fbd41035 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1739,23 +1739,65 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) payload_sz = sizeof(cmd_request->mpb); if (sg_count) { - if (sg_count > MAX_PAGE_BUFFER_COUNT) { + unsigned int hvpgoff = 0; + unsigned long offset_in_hvpg = sgl->offset & ~HV_HYP_PAGE_MASK; + unsigned int hvpg_count = HVPFN_UP(offset_in_hvpg + length); + u64 hvpfn; - payload_sz = (sg_count * sizeof(u64) + + if (hvpg_count > MAX_PAGE_BUFFER_COUNT) { + + payload_sz = (hvpg_count * sizeof(u64) + sizeof(struct vmbus_packet_mpb_array)); payload = kzalloc(payload_sz, GFP_ATOMIC); if (!payload) return SCSI_MLQUEUE_DEVICE_BUSY; } + /* + * sgl is a list of PAGEs, and payload->range.pfn_array + * expects the page number in the unit of HV_HYP_PAGE_SIZE (the + * page size that Hyper-V uses), so here we need to divide PAGEs + * into HV_HYP_PAGE in case that PAGE_SIZE > HV_HYP_PAGE_SIZE. + * Besides, payload->range.offset should be the offset in one + * HV_HYP_PAGE. + */ payload->range.len = length; - payload->range.offset = sgl[0].offset; + payload->range.offset = offset_in_hvpg; + hvpgoff = sgl->offset >> HV_HYP_PAGE_SHIFT; cur_sgl = sgl; - for (i = 0; i < sg_count; i++) { - payload->range.pfn_array[i] = - page_to_pfn(sg_page((cur_sgl))); - cur_sgl = sg_next(cur_sgl); + for (i = 0; i < hvpg_count; i++) { + /* + * 'i' is the index of hv pages in the payload and + * 'hvpgoff' is the offset (in hv pages) of the first + * hv page in the first page.
The relationship + * between the sum of 'i' and 'hvpgoff' and the offset + * (in hv pages) in a payload page ('hvpgoff_in_page') + * is as follows: + * + * |------------------ PAGE -------------------| + * | NR_HV_HYP_PAGES_IN_PAGE hvpgs in total | + * |hvpg|hvpg| ... |hvpg|... |hvpg| + * ^ ^ ^ ^ + * +-hvpgoff-+ +-hvpgoff_in_page-+ + * ^ | + * +--------------------- i ---------------------------+ + */ + unsigned int hvpgoff_in_page = + (i + hvpgoff) % NR_HV_HYP_PAGES_IN_PAGE; + + /* + * Two cases that we need to fetch a page: + * 1) i == 0, the first step or + * 2) hvpgoff_in_page == 0, when we reach the boundary + * of a page. + */ + if (hvpgoff_in_page == 0 || i == 0) { + hvpfn = page_to_hvpfn(sg_page(cur_sgl)); + cur_sgl = sg_next(cur_sgl); + } + + payload->range.pfn_array[i] = hvpfn + hvpgoff_in_page; } } diff --git a/drivers/soc/samsung/exynos-asv.c b/drivers/soc/samsung/exynos-asv.c index 30bb7b7cc769..8abf4dfaa5c5 100644 --- a/drivers/soc/samsung/exynos-asv.c +++ b/drivers/soc/samsung/exynos-asv.c @@ -93,7 +93,7 @@ static int exynos_asv_update_opps(struct exynos_asv *asv) continue; opp_table = dev_pm_opp_get_opp_table(cpu); - if (IS_ERR_OR_NULL(opp_table)) + if (IS_ERR(opp_table)) continue; if (!last_opp_table || opp_table != last_opp_table) { diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 64a9025a87be..a36b71286bcf 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -720,17 +720,18 @@ struct gntdev_copy_batch { s16 __user *status[GNTDEV_COPY_BATCH]; unsigned int nr_ops; unsigned int nr_pages; + bool writeable; }; static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt, - bool writeable, unsigned long *gfn) + unsigned long *gfn) { unsigned long addr = (unsigned long)virt; struct page *page; unsigned long xen_pfn; int ret; - ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page); + ret = pin_user_pages_fast(addr, 1, batch->writeable ?
FOLL_WRITE : 0, &page); if (ret < 0) return ret; @@ -744,11 +745,9 @@ static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt, static void gntdev_put_pages(struct gntdev_copy_batch *batch) { - unsigned int i; - - for (i = 0; i < batch->nr_pages; i++) - put_page(batch->pages[i]); + unpin_user_pages_dirty_lock(batch->pages, batch->nr_pages, batch->writeable); batch->nr_pages = 0; + batch->writeable = false; } static int gntdev_copy(struct gntdev_copy_batch *batch) @@ -837,8 +836,9 @@ static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch, virt = seg->source.virt + copied; off = (unsigned long)virt & ~XEN_PAGE_MASK; len = min(len, (size_t)XEN_PAGE_SIZE - off); + batch->writeable = false; - ret = gntdev_get_page(batch, virt, false, &gfn); + ret = gntdev_get_page(batch, virt, &gfn); if (ret < 0) return ret; @@ -856,8 +856,9 @@ static int gntdev_grant_copy_seg(struct gntdev_copy_batch *batch, virt = seg->dest.virt + copied; off = (unsigned long)virt & ~XEN_PAGE_MASK; len = min(len, (size_t)XEN_PAGE_SIZE - off); + batch->writeable = true; - ret = gntdev_get_page(batch, virt, true, &gfn); + ret = gntdev_get_page(batch, virt, &gfn); if (ret < 0) return ret; diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 72d725a0ab5c..7984645b5956 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -371,7 +371,7 @@ out: static int create_active(struct sock_mapping *map, evtchn_port_t *evtchn) { void *bytes; - int ret = -ENOMEM, irq = -1, i; + int ret, irq = -1, i; *evtchn = 0; init_waitqueue_head(&map->active.inflight_conn_req); diff --git a/include/acpi/acconfig.h b/include/acpi/acconfig.h index 5940a3c68a96..a225eff499c8 100644 --- a/include/acpi/acconfig.h +++ b/include/acpi/acconfig.h @@ -121,7 +121,7 @@ * *****************************************************************************/ -/* Method info (in WALK_STATE), containing local variables and argumetns */ +/* Method info (in WALK_STATE), 
containing local variables and arguments */ #define ACPI_METHOD_NUM_LOCALS 8 #define ACPI_METHOD_MAX_LOCAL 7 diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h index 436cd1411c3a..2fc624a61769 100644 --- a/include/acpi/acexcep.h +++ b/include/acpi/acexcep.h @@ -40,12 +40,12 @@ struct acpi_exception_info { char *name; -#ifdef ACPI_HELP_APP +#if defined (ACPI_HELP_APP) || defined (ACPI_ASL_COMPILER) char *description; #endif }; -#ifdef ACPI_HELP_APP +#if defined (ACPI_HELP_APP) || defined (ACPI_ASL_COMPILER) #define EXCEP_TXT(name,description) {name, description} #else #define EXCEP_TXT(name,description) {name} diff --git a/include/acpi/acpi_io.h b/include/acpi/acpi_io.h index 12d8bd333fe7..027faa8883aa 100644 --- a/include/acpi/acpi_io.h +++ b/include/acpi/acpi_io.h @@ -21,7 +21,7 @@ void __iomem __ref void __ref acpi_os_unmap_iomem(void __iomem *virt, acpi_size size); void __iomem *acpi_os_get_iomem(acpi_physical_address phys, unsigned int size); -int acpi_os_map_generic_address(struct acpi_generic_address *addr); +void __iomem *acpi_os_map_generic_address(struct acpi_generic_address *addr); void acpi_os_unmap_generic_address(struct acpi_generic_address *addr); #endif diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h index 0e9302285f14..a4c6ef809e27 100644 --- a/include/acpi/acpi_numa.h +++ b/include/acpi/acpi_numa.h @@ -26,6 +26,10 @@ extern int srat_disabled(void); static inline void disable_srat(void) { } +static inline int pxm_to_node(int pxm) +{ + return 0; +} #endif /* CONFIG_ACPI_NUMA */ #ifdef CONFIG_ACPI_HMAT diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 9dc816641286..be7de305a622 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -12,7 +12,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20200717 +#define ACPI_CA_VERSION 0x20200925 #include #include diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index d50e61384f1f..647cb11d0a0a 100644 --- 
a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -824,7 +824,7 @@ typedef u8 acpi_adr_space_type; * * Note: A Data Table region is a special type of operation region * that has its own AML opcode. However, internally, the AML - * interpreter simply creates an operation region with an an address + * interpreter simply creates an operation region with an address * space type of ACPI_ADR_SPACE_DATA_TABLE. */ #define ACPI_ADR_SPACE_DATA_TABLE (acpi_adr_space_type) 0x7E /* Internal to ACPICA only */ diff --git a/include/acpi/acuuid.h b/include/acpi/acuuid.h index 9e1367b19069..10e30a5030ee 100644 --- a/include/acpi/acuuid.h +++ b/include/acpi/acuuid.h @@ -27,6 +27,10 @@ #define UUID_PCI_HOST_BRIDGE "33db4d5b-1ff7-401c-9657-7441c03dd766" #define UUID_I2C_DEVICE "3cdff6f7-4267-4555-ad05-b30a3d8938de" #define UUID_POWER_BUTTON "dfbcf3c5-e7a5-44e6-9c1f-29c76f6e059c" +#define UUID_MEMORY_DEVICE "03b19910-f473-11dd-87af-0800200c9a66" +#define UUID_GENERIC_BUTTONS_DEVICE "fa6bd625-9ce8-470d-a2c7-b3ca36c4282e" +#define UUID_NVDIMM_ROOT_DEVICE "2f10e7a4-9e91-11e4-89d3-123b93f75cba" +#define UUID_CONTROL_METHOD_BATTERY "f18fc78b-0f15-4978-b793-53f833a1d35b" /* Interfaces */ @@ -56,5 +60,8 @@ #define UUID_BATTERY_THERMAL_LIMIT "4c2067e3-887d-475c-9720-4af1d3ed602e" #define UUID_THERMAL_EXTENSIONS "14d399cd-7a27-4b18-8fb4-7cb7b9f4e500" #define UUID_DEVICE_PROPERTIES "daffd814-6eba-4d8c-8a91-bc9bbf4aa301" +#define UUID_DEVICE_GRAPHS "ab02a46b-74c7-45a2-bd68-f7d344ef2153" +#define UUID_HIERARCHICAL_DATA_EXTENSION "dbb8e3e6-5886-4ba6-8795-1319f52a966b" +#define UUID_CORESIGHT_GRAPH "3ecbc8b6-1d0e-4fb3-8107-e627f805c6cd" #endif /* __ACUUID_H__ */ diff --git a/include/acpi/battery.h b/include/acpi/battery.h index 5d8f5d910c82..b8d56b702c7a 100644 --- a/include/acpi/battery.h +++ b/include/acpi/battery.h @@ -2,6 +2,8 @@ #ifndef __ACPI_BATTERY_H #define __ACPI_BATTERY_H +#include + #define ACPI_BATTERY_CLASS "battery" #define ACPI_BATTERY_NOTIFY_STATUS 0x80 diff --git 
a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h index 987e2af7c335..72f52a1342a0 100644 --- a/include/acpi/platform/aclinux.h +++ b/include/acpi/platform/aclinux.h @@ -118,6 +118,10 @@ #define USE_NATIVE_ALLOCATE_ZEROED +/* Use logical addresses for accessing GPE registers in system memory */ + +#define ACPI_GPE_USE_LOGICAL_ADDRESSES + /* * Overrides for in-kernel ACPICA */ @@ -190,7 +194,8 @@ #if defined(__ia64__) || (defined(__x86_64__) && !defined(__ILP32__)) ||\ defined(__aarch64__) || defined(__PPC64__) ||\ - defined(__s390x__) + defined(__s390x__) ||\ + (defined(__riscv) && (defined(__LP64__) || defined(_LP64))) #define ACPI_MACHINE_WIDTH 64 #define COMPILER_DEPENDENT_INT64 long #define COMPILER_DEPENDENT_UINT64 unsigned long diff --git a/include/linux/acpi.h b/include/linux/acpi.h index cfa8c0015863..143c6ffce2db 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -420,28 +420,27 @@ int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); /** - * acpi_map_pxm_to_online_node - Map proximity ID to online node + * pxm_to_online_node - Map proximity ID to online node * @pxm: ACPI proximity ID * - * This is similar to acpi_map_pxm_to_node(), but always returns an online + * This is similar to pxm_to_node(), but always returns an online * node. When the mapped node from a given proximity ID is offline, it * looks up the node distance table and returns the nearest online node. * * ACPI device drivers, which are called after the NUMA initialization has * completed in the kernel, can call this interface to obtain their device * NUMA topology from ACPI tables. Such drivers do not have to deal with - * offline nodes. A node may be offline when a device proximity ID is - * unique, SRAT memory entry does not exist, or NUMA is disabled, ex. - * "numa=off" on x86. + * offline nodes. A node may be offline when SRAT memory entry does not exist, + * or NUMA is disabled, ex. "numa=off" on x86. 
*/ -static inline int acpi_map_pxm_to_online_node(int pxm) +static inline int pxm_to_online_node(int pxm) { - int node = acpi_map_pxm_to_node(pxm); + int node = pxm_to_node(pxm); return numa_map_to_online_node(node); } #else -static inline int acpi_map_pxm_to_online_node(int pxm) +static inline int pxm_to_online_node(int pxm) { return 0; } @@ -546,6 +545,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); #define OSC_SB_PCLPI_SUPPORT 0x00000080 #define OSC_SB_OSLPI_SUPPORT 0x00000100 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT 0x00001000 +#define OSC_SB_GENERIC_INITIATOR_SUPPORT 0x00002000 extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; @@ -867,7 +867,7 @@ static inline bool acpi_driver_match_device(struct device *dev, static inline union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, - int rev, int func, + u64 rev, u64 func, union acpi_object *argv4) { return NULL; @@ -979,8 +979,6 @@ int acpi_subsys_runtime_suspend(struct device *dev); int acpi_subsys_runtime_resume(struct device *dev); int acpi_dev_pm_attach(struct device *dev, bool power_on); #else -static inline int acpi_dev_runtime_suspend(struct device *dev) { return 0; } -static inline int acpi_dev_runtime_resume(struct device *dev) { return 0; } static inline int acpi_subsys_runtime_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_runtime_resume(struct device *dev) { return 0; } static inline int acpi_dev_pm_attach(struct device *dev, bool power_on) @@ -1218,13 +1216,6 @@ static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode, return -ENXIO; } -static inline int acpi_dev_prop_get(const struct acpi_device *adev, - const char *propname, - void **valptr) -{ - return -ENXIO; -} - static inline int acpi_dev_prop_read_single(const struct acpi_device *adev, const char *propname, enum dev_prop_type proptype, diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 
69b1dabe39dc..0f6cd6b73a61 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -30,7 +30,11 @@ static inline unsigned long topology_get_freq_scale(int cpu) return per_cpu(freq_scale, cpu); } -bool arch_freq_counters_available(struct cpumask *cpus); +void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, + unsigned long max_freq); +bool topology_scale_freq_invariant(void); + +bool arch_freq_counters_available(const struct cpumask *cpus); DECLARE_PER_CPU(unsigned long, thermal_pressure); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 40044fb77d54..f5513a424072 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -217,6 +217,7 @@ void refresh_frequency_limits(struct cpufreq_policy *policy); void cpufreq_update_policy(unsigned int cpu); void cpufreq_update_limits(unsigned int cpu); bool have_governor_per_policy(void); +bool cpufreq_supports_freq_invariance(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); void cpufreq_enable_fast_switch(struct cpufreq_policy *policy); void cpufreq_disable_fast_switch(struct cpufreq_policy *policy); @@ -237,6 +238,10 @@ static inline unsigned int cpufreq_get_hw_max_freq(unsigned int cpu) { return 0; } +static inline bool cpufreq_supports_freq_invariance(void) +{ + return false; +} static inline void disable_cpufreq(void) { } #endif @@ -998,8 +1003,14 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) extern void arch_freq_prepare_all(void); extern unsigned int arch_freq_get_on_cpu(int cpu); -extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, - unsigned long max_freq); +#ifndef arch_set_freq_scale +static __always_inline +void arch_set_freq_scale(const struct cpumask *cpus, + unsigned long cur_freq, + unsigned long max_freq) +{ +} +#endif /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; diff --git 
a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 6175c77bf25e..ed0da0e58e8b 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -38,6 +38,7 @@ struct cpuidle_state_usage { u64 time_ns; unsigned long long above; /* Number of times it's been too deep */ unsigned long long below; /* Number of times it's been too shallow */ + unsigned long long rejected; /* Number of times idle entry was rejected */ #ifdef CONFIG_SUSPEND unsigned long long s2idle_usage; unsigned long long s2idle_time; /* in US */ diff --git a/include/linux/devfreq-event.h b/include/linux/devfreq-event.h index f14f17f8cb7f..4a50a5c71a5f 100644 --- a/include/linux/devfreq-event.h +++ b/include/linux/devfreq-event.h @@ -106,8 +106,11 @@ extern int devfreq_event_get_event(struct devfreq_event_dev *edev, struct devfreq_event_data *edata); extern int devfreq_event_reset_event(struct devfreq_event_dev *edev); extern struct devfreq_event_dev *devfreq_event_get_edev_by_phandle( - struct device *dev, int index); -extern int devfreq_event_get_edev_count(struct device *dev); + struct device *dev, + const char *phandle_name, + int index); +extern int devfreq_event_get_edev_count(struct device *dev, + const char *phandle_name); extern struct devfreq_event_dev *devfreq_event_add_edev(struct device *dev, struct devfreq_event_desc *desc); extern int devfreq_event_remove_edev(struct devfreq_event_dev *edev); @@ -152,12 +155,15 @@ static inline int devfreq_event_reset_event(struct devfreq_event_dev *edev) } static inline struct devfreq_event_dev *devfreq_event_get_edev_by_phandle( - struct device *dev, int index) + struct device *dev, + const char *phandle_name, + int index) { return ERR_PTR(-EINVAL); } -static inline int devfreq_event_get_edev_count(struct device *dev) +static inline int devfreq_event_get_edev_count(struct device *dev, + const char *phandle_name) { return -EINVAL; } diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index 12782fbb4c25..2f4a74efa6be 100644 --- 
a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -261,7 +261,9 @@ void devm_devfreq_unregister_notifier(struct device *dev, struct devfreq *devfreq, struct notifier_block *nb, unsigned int list); -struct devfreq *devfreq_get_devfreq_by_phandle(struct device *dev, int index); +struct devfreq *devfreq_get_devfreq_by_node(struct device_node *node); +struct devfreq *devfreq_get_devfreq_by_phandle(struct device *dev, + const char *phandle_name, int index); #if IS_ENABLED(CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND) /** @@ -414,8 +416,13 @@ static inline void devm_devfreq_unregister_notifier(struct device *dev, { } +static inline struct devfreq *devfreq_get_devfreq_by_node(struct device_node *node) +{ + return ERR_PTR(-ENODEV); +} + static inline struct devfreq *devfreq_get_devfreq_by_phandle(struct device *dev, - int index) + const char *phandle_name, int index) { return ERR_PTR(-ENODEV); } diff --git a/include/linux/frame.h b/include/linux/frame.h deleted file mode 100644 index 303cda600e56..000000000000 --- a/include/linux/frame.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_FRAME_H -#define _LINUX_FRAME_H - -#ifdef CONFIG_STACK_VALIDATION -/* - * This macro marks the given function's stack frame as "non-standard", which - * tells objtool to ignore the function when doing stack metadata validation. - * It should only be used in special cases where you're 100% sure it won't - * affect the reliability of frame pointers and kernel stack traces. - * - * For more information, see tools/objtool/Documentation/stack-validation.txt. - */ -#define STACK_FRAME_NON_STANDARD(func) \ - static void __used __section(.discard.func_stack_frame_non_standard) \ - *__func_stack_frame_non_standard_##func = func - -/* - * This macro indicates that the following intra-function call is valid. - * Any non-annotated intra-function call will cause objtool to issue a warning. 
- */ -#define ANNOTATE_INTRA_FUNCTION_CALL \ - 999: \ - .pushsection .discard.intra_function_calls; \ - .long 999b; \ - .popsection; - -#else /* !CONFIG_STACK_VALIDATION */ - -#define STACK_FRAME_NON_STANDARD(func) -#define ANNOTATE_INTRA_FUNCTION_CALL - -#endif /* CONFIG_STACK_VALIDATION */ - -#endif /* _LINUX_FRAME_H */ diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 38100e80360a..1ce131f29f3b 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -14,6 +14,7 @@ #include +#include #include #include #include @@ -23,12 +24,55 @@ #include #include #include +#include #define MAX_PAGE_BUFFER_COUNT 32 #define MAX_MULTIPAGE_BUFFER_COUNT 32 /* 128K */ #pragma pack(push, 1) +/* + * Types for GPADL; the type decides how the GPADL header is created. + * + * It doesn't make much difference between BUFFER and RING if PAGE_SIZE is the + * same as HV_HYP_PAGE_SIZE. + * + * If PAGE_SIZE is bigger than HV_HYP_PAGE_SIZE, the headers of ring buffers + * will be of PAGE_SIZE, however, only the first HV_HYP_PAGE will be put + * into gpadl, therefore the number for HV_HYP_PAGE and the indexes of each + * HV_HYP_PAGE will be different between different types of GPADL, for example + * if PAGE_SIZE is 64K: + * + * BUFFER: + * + * gva: |-- 64k --|-- 64k --| ... | + * gpa: | 4k | 4k | ... | 4k | 4k | 4k | ... | 4k | + * index: 0 1 2 15 16 17 18 .. 31 32 ... + * | | ... | | | ... | ... + * v V V V V V + * gpadl: | 4k | 4k | ... | 4k | 4k | 4k | ... | 4k | ... | + * index: 0 1 2 ... 15 16 17 18 .. 31 32 ... + * + * RING: + * + * | header | data | header | data | + * gva: |-- 64k --|-- 64k --| ... |-- 64k --|-- 64k --| ... | + * gpa: | 4k | .. | 4k | 4k | ... | 4k | ... | 4k | .. | 4k | .. | ... | + * index: 0 1 16 17 18 31 ... n n+1 n+16 ... 2n + * | / / / | / / + * | / / / | / / + * | / / ... / ... | / ... / + * | / / / | / / + * | / / / | / / + * V V V V V V v + * gpadl: | 4k | 4k | ... | ... | 4k | 4k | ... | + * index: 0 1 2 ... 16 ... n-15 n-14 n-13 ...
2n-30 + */ +enum hv_gpadl_type { + HV_GPADL_BUFFER, + HV_GPADL_RING +}; + /* Single-page buffer */ struct hv_page_buffer { u32 len; @@ -111,7 +155,7 @@ struct hv_ring_buffer { } feature_bits; /* Pad it to PAGE_SIZE so that data starts on page boundary */ - u8 reserved2[4028]; + u8 reserved2[PAGE_SIZE - 68]; /* * Ring data starts here + RingDataStartOffset @@ -120,6 +164,10 @@ struct hv_ring_buffer { u8 buffer[]; } __packed; +/* Calculate the proper size of a ringbuffer, it must be page-aligned */ +#define VMBUS_RING_SIZE(payload_sz) PAGE_ALIGN(sizeof(struct hv_ring_buffer) + \ + (payload_sz)) + struct hv_ring_buffer_info { struct hv_ring_buffer *ring_buffer; u32 ring_size; /* Include the shared header */ @@ -1630,4 +1678,22 @@ struct hyperv_pci_block_ops { extern struct hyperv_pci_block_ops hvpci_block_ops; +static inline unsigned long virt_to_hvpfn(void *addr) +{ + phys_addr_t paddr; + + if (is_vmalloc_addr(addr)) + paddr = page_to_phys(vmalloc_to_page(addr)) + + offset_in_page(addr); + else + paddr = __pa(addr); + + return paddr >> HV_HYP_PAGE_SHIFT; +} + +#define NR_HV_HYP_PAGES_IN_PAGE (PAGE_SIZE / HV_HYP_PAGE_SIZE) +#define offset_in_hvpage(ptr) ((unsigned long)(ptr) & ~HV_HYP_PAGE_MASK) +#define HVPFN_UP(x) (((x) + HV_HYP_PAGE_SIZE-1) >> HV_HYP_PAGE_SHIFT) +#define page_to_hvpfn(page) (page_to_pfn(page) * NR_HV_HYP_PAGES_IN_PAGE) + #endif /* _HYPERV_H */ diff --git a/include/linux/isapnp.h b/include/linux/isapnp.h index 11edb2109a68..dba18c95844b 100644 --- a/include/linux/isapnp.h +++ b/include/linux/isapnp.h @@ -75,9 +75,6 @@ static inline int isapnp_proc_done(void) { return 0; } #endif /* compat */ -struct pnp_card *pnp_find_card(unsigned short vendor, - unsigned short device, - struct pnp_card *from); struct pnp_dev *pnp_find_dev(struct pnp_card *card, unsigned short vendor, unsigned short function, @@ -92,9 +89,6 @@ static inline int isapnp_cfg_end(void) { return -ENODEV; } static inline unsigned char isapnp_read_byte(unsigned char idx) { return 0xff; } 
static inline void isapnp_write_byte(unsigned char idx, unsigned char val) { ; } -static inline struct pnp_card *pnp_find_card(unsigned short vendor, - unsigned short device, - struct pnp_card *from) { return NULL; } static inline struct pnp_dev *pnp_find_dev(struct pnp_card *card, unsigned short vendor, unsigned short function, diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 27e7fa36f707..3334ce056335 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -399,6 +399,7 @@ enum node_states { #endif N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ + N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ NR_NODE_STATES }; diff --git a/include/linux/objtool.h b/include/linux/objtool.h new file mode 100644 index 000000000000..ab82c793c897 --- /dev/null +++ b/include/linux/objtool.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_OBJTOOL_H +#define _LINUX_OBJTOOL_H + +#ifndef __ASSEMBLY__ + +#include <linux/types.h> + +/* + * This struct is used by asm and inline asm code to manually annotate the + * location of registers on the stack. + */ +struct unwind_hint { + u32 ip; + s16 sp_offset; + u8 sp_reg; + u8 type; + u8 end; +}; +#endif + +/* + * UNWIND_HINT_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP + * (the caller's SP right before it made the call). Used for all callable + * functions, i.e. all C code and all callable asm functions. + * + * UNWIND_HINT_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset + * points to a fully populated pt_regs from a syscall, interrupt, or exception. + * + * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that + * sp_reg+sp_offset points to the iret return frame.
+ */ +#define UNWIND_HINT_TYPE_CALL 0 +#define UNWIND_HINT_TYPE_REGS 1 +#define UNWIND_HINT_TYPE_REGS_PARTIAL 2 +#define UNWIND_HINT_TYPE_RET_OFFSET 3 + +#ifdef CONFIG_STACK_VALIDATION + +#ifndef __ASSEMBLY__ + +#define UNWIND_HINT(sp_reg, sp_offset, type, end) \ + "987: \n\t" \ + ".pushsection .discard.unwind_hints\n\t" \ + /* struct unwind_hint */ \ + ".long 987b - .\n\t" \ + ".short " __stringify(sp_offset) "\n\t" \ + ".byte " __stringify(sp_reg) "\n\t" \ + ".byte " __stringify(type) "\n\t" \ + ".byte " __stringify(end) "\n\t" \ + ".balign 4 \n\t" \ + ".popsection\n\t" + +/* + * This macro marks the given function's stack frame as "non-standard", which + * tells objtool to ignore the function when doing stack metadata validation. + * It should only be used in special cases where you're 100% sure it won't + * affect the reliability of frame pointers and kernel stack traces. + * + * For more information, see tools/objtool/Documentation/stack-validation.txt. + */ +#define STACK_FRAME_NON_STANDARD(func) \ + static void __used __section(.discard.func_stack_frame_non_standard) \ + *__func_stack_frame_non_standard_##func = func + +#else /* __ASSEMBLY__ */ + +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. + */ +#define ANNOTATE_INTRA_FUNCTION_CALL \ + 999: \ + .pushsection .discard.intra_function_calls; \ + .long 999b; \ + .popsection; + +/* + * In asm, there are two kinds of code: normal C-type callable functions and + * the rest. The normal callable functions can be called by other code, and + * don't do anything unusual with the stack. Such normal callable functions + * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * category. In this case, no special debugging annotations are needed because + * objtool can automatically generate the ORC data for the ORC unwinder to read + * at runtime. 
+ * + * Anything which doesn't fall into the above category, such as syscall and + * interrupt handlers, tends to not be called directly by other functions, and + * often does unusual non-C-function-type things with the stack pointer. Such + * code needs to be annotated such that objtool can understand it. The + * following CFI hint macros are for this type of code. + * + * These macros provide hints to objtool about the state of the stack at each + * instruction. Objtool starts from the hints and follows the code flow, + * making automatic CFI adjustments when it sees pushes and pops, filling out + * the debuginfo as necessary. It will also warn if it sees any + * inconsistencies. + */ +.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.Lunwind_hint_ip_\@: + .pushsection .discard.unwind_hints + /* struct unwind_hint */ + .long .Lunwind_hint_ip_\@ - . + .short \sp_offset + .byte \sp_reg + .byte \type + .byte \end + .balign 4 + .popsection +.endm + +#endif /* __ASSEMBLY__ */ + +#else /* !CONFIG_STACK_VALIDATION */ + +#ifndef __ASSEMBLY__ + +#define UNWIND_HINT(sp_reg, sp_offset, type, end) \ + "\n\t" +#define STACK_FRAME_NON_STANDARD(func) +#else +#define ANNOTATE_INTRA_FUNCTION_CALL +.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.endm +#endif + +#endif /* CONFIG_STACK_VALIDATION */ + +#endif /* _LINUX_OBJTOOL_H */ diff --git a/include/linux/platform_data/mlxreg.h b/include/linux/platform_data/mlxreg.h index 1af9c01563f9..101333fe2b8d 100644 --- a/include/linux/platform_data/mlxreg.h +++ b/include/linux/platform_data/mlxreg.h @@ -1,34 +1,6 @@ +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 */ /* - * Copyright (c) 2017 Mellanox Technologies. All rights reserved. - * Copyright (c) 2017 Vadim Pasternak - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * Copyright (C) 2017-2020 Mellanox Technologies Ltd. 
*/ #ifndef __LINUX_PLATFORM_DATA_MLXREG_H @@ -137,6 +109,7 @@ struct mlxreg_core_item { * @features: supported features of device; * @version: implementation version; * @identity: device identity name; + * @capability: device capability register; */ struct mlxreg_core_platform_data { struct mlxreg_core_data *data; @@ -145,6 +118,7 @@ struct mlxreg_core_platform_data { u32 features; u32 version; char identity[MLXREG_CORE_LABEL_MAX_SIZE]; + u32 capability; }; /** diff --git a/include/linux/pm.h b/include/linux/pm.h index a30a4b54df52..47aca6bac1d6 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -590,7 +590,7 @@ struct dev_pm_info { #endif #ifdef CONFIG_PM struct hrtimer suspend_timer; - unsigned long timer_expires; + u64 timer_expires; struct work_struct work; wait_queue_head_t wait_queue; struct wake_irq *wakeirq; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index ee11502a575b..66f3c5d64d81 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -64,8 +64,8 @@ #define GENPD_FLAG_RPM_ALWAYS_ON (1U << 5) enum gpd_status { - GPD_STATE_ACTIVE = 0, /* PM domain is active */ - GPD_STATE_POWER_OFF, /* PM domain is off */ + GENPD_STATE_ON = 0, /* PM domain is on */ + GENPD_STATE_OFF, /* PM domain is off */ }; struct dev_power_governor { diff --git a/include/linux/psci.h b/include/linux/psci.h index 14ad9b9ebcd6..2a1bfb890e58 100644 --- a/include/linux/psci.h +++ b/include/linux/psci.h @@ -18,7 +18,7 @@ bool psci_tos_resident_on(int cpu); int psci_cpu_suspend_enter(u32 state); bool psci_power_state_is_valid(u32 state); -int psci_set_osi_mode(void); +int psci_set_osi_mode(bool enable); bool psci_has_osi_support(void); struct psci_operations { diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 046bb94bd4d6..513913ff7486 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -34,6 +34,7 @@ int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); extern unsigned long 
swiotlb_nr_tbl(void); unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); +extern int swiotlb_late_init_with_default_size(size_t default_size); extern void __init swiotlb_update_mem_attributes(void); /* diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ed0b3578867c..03e284873644 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c19379fabd20..465a567678d9 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -93,7 +93,7 @@ static unsigned int io_tlb_index; * Max segment that we can provide which (if pages are contingous) will * not be bounced (unless SWIOTLB_FORCE is set). */ -unsigned int max_segment; +static unsigned int max_segment; /* * We need to save away the original address corresponding to a mapped entry @@ -172,9 +172,7 @@ void swiotlb_print_info(void) return; } - pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n", - (unsigned long long)io_tlb_start, - (unsigned long long)io_tlb_end, + pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end, bytes >> 20); } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c19c0dad1ebe..c5e5e5a11535 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1dee70815f3c..2fc7d509a34f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -946,17 +946,6 @@ static int software_resume(void) /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); - - /* - * name_to_dev_t is ineffective to verify parition if resume_file is in - * integer format. (e.g. 
major:minor) - */ - if (isdigit(resume_file[0]) && resume_wait) { - int partno; - while (!get_gendisk(swsusp_resume_device, &partno)) - msleep(10); - } - if (!swsusp_resume_device) { /* * Some device discovery might still be in progress; we need diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 71385bedcc3a..c73f2e295167 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -226,6 +226,7 @@ struct hib_bio_batch { atomic_t count; wait_queue_head_t wait; blk_status_t error; + struct blk_plug plug; }; static void hib_init_batch(struct hib_bio_batch *hb) @@ -233,6 +234,12 @@ static void hib_init_batch(struct hib_bio_batch *hb) atomic_set(&hb->count, 0); init_waitqueue_head(&hb->wait); hb->error = BLK_STS_OK; + blk_start_plug(&hb->plug); +} + +static void hib_finish_batch(struct hib_bio_batch *hb) +{ + blk_finish_plug(&hb->plug); } static void hib_end_io(struct bio *bio) @@ -294,6 +301,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, static blk_status_t hib_wait_io(struct hib_bio_batch *hb) { + /* + * We are relying on the behavior of blk_plug that a thread with + * a plug will flush the plug list before sleeping. 
+ */ wait_event(hb->wait, atomic_read(&hb->count) == 0); return blk_status_to_errno(hb->error); } @@ -558,6 +569,7 @@ static int save_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_io(&hb); + hib_finish_batch(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -851,6 +863,7 @@ out_finish: pr_info("Image saving done\n"); swsusp_show_speed(start, stop, nr_to_write, "Wrote"); out_clean: + hib_finish_batch(&hb); if (crc) { if (crc->thr) kthread_stop(crc->thr); @@ -1081,6 +1094,7 @@ static int load_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_io(&hb); + hib_finish_batch(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -1444,6 +1458,7 @@ out_finish: } swsusp_show_speed(start, stop, nr_to_read, "Read"); out_clean: + hib_finish_batch(&hb); for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); if (crc) { diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 19a5400de2db..d1d2e5437879 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -114,22 +114,8 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - struct cpufreq_policy *policy = sg_policy->policy; - int cpu; - - if (!sugov_update_next_freq(sg_policy, time, next_freq)) - return; - - next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (!next_freq) - return; - - policy->cur = next_freq; - - if (trace_cpu_frequency_enabled()) { - for_each_cpu(cpu, policy->cpus) - trace_cpu_frequency(next_freq, cpu); - } + if (sugov_update_next_freq(sg_policy, time, next_freq)) + cpufreq_driver_fast_switch(sg_policy->policy, next_freq); } static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h index d25534940bde..fdbffec4cfde 100644 --- 
a/tools/arch/x86/include/asm/orc_types.h +++ b/tools/arch/x86/include/asm/orc_types.h @@ -39,27 +39,6 @@ #define ORC_REG_SP_INDIRECT 9 #define ORC_REG_MAX 15 -/* - * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the - * caller's SP right before it made the call). Used for all callable - * functions, i.e. all C code and all callable asm functions. - * - * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points - * to a fully populated pt_regs from a syscall, interrupt, or exception. - * - * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset - * points to the iret return frame. - * - * The UNWIND_HINT macros are used only for the unwind_hint struct. They - * aren't used in struct orc_entry due to size and complexity constraints. - * Objtool converts them to real types when it converts the hints to orc - * entries. - */ -#define ORC_TYPE_CALL 0 -#define ORC_TYPE_REGS 1 -#define ORC_TYPE_REGS_IRET 2 -#define UNWIND_HINT_TYPE_RET_OFFSET 3 - #ifndef __ASSEMBLY__ /* * This struct is more or less a vastly simplified version of the DWARF Call @@ -78,19 +57,6 @@ struct orc_entry { unsigned end:1; } __packed; -/* - * This struct is used by asm and inline asm code to manually annotate the - * location of registers on the stack for the ORC unwinder. - * - * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. 
- */ -struct unwind_hint { - u32 ip; - s16 sp_offset; - u8 sp_reg; - u8 type; - u8 end; -}; #endif /* __ASSEMBLY__ */ #endif /* _ORC_TYPES_H */ diff --git a/tools/arch/x86/tools/gen-insn-attr-x86.awk b/tools/arch/x86/tools/gen-insn-attr-x86.awk index a42015b305f4..af38469afd14 100644 --- a/tools/arch/x86/tools/gen-insn-attr-x86.awk +++ b/tools/arch/x86/tools/gen-insn-attr-x86.awk @@ -362,6 +362,9 @@ function convert_operands(count,opnd, i,j,imm,mod) END { if (awkchecked != "") exit 1 + + print "#ifndef __BOOT_COMPRESSED\n" + # print escape opcode map's array print "/* Escape opcode map array */" print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \ @@ -388,6 +391,51 @@ END { for (j = 0; j < max_lprefix; j++) if (atable[i,j]) print " ["i"]["j"] = "atable[i,j]"," - print "};" + print "};\n" + + print "#else /* !__BOOT_COMPRESSED */\n" + + print "/* Escape opcode map array */" + print "static const insn_attr_t *inat_escape_tables[INAT_ESC_MAX + 1]" \ + "[INAT_LSTPFX_MAX + 1];" + print "" + + print "/* Group opcode map array */" + print "static const insn_attr_t *inat_group_tables[INAT_GRP_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1];" + print "" + + print "/* AVX opcode map array */" + print "static const insn_attr_t *inat_avx_tables[X86_VEX_M_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1];" + print "" + + print "static void inat_init_tables(void)" + print "{" + + # print escape opcode map's array + print "\t/* Print Escape opcode map array */" + for (i = 0; i < geid; i++) + for (j = 0; j < max_lprefix; j++) + if (etable[i,j]) + print "\tinat_escape_tables["i"]["j"] = "etable[i,j]";" + print "" + + # print group opcode map's array + print "\t/* Print Group opcode map array */" + for (i = 0; i < ggid; i++) + for (j = 0; j < max_lprefix; j++) + if (gtable[i,j]) + print "\tinat_group_tables["i"]["j"] = "gtable[i,j]";" + print "" + # print AVX opcode map's array + print "\t/* Print AVX opcode map array */" + for (i = 0; i < gaid; i++) + for (j = 0; j < max_lprefix; j++) + 
if (atable[i,j]) + print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + + print "}" + print "#endif" } diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h new file mode 100644 index 000000000000..ab82c793c897 --- /dev/null +++ b/tools/include/linux/objtool.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_OBJTOOL_H +#define _LINUX_OBJTOOL_H + +#ifndef __ASSEMBLY__ + +#include + +/* + * This struct is used by asm and inline asm code to manually annotate the + * location of registers on the stack. + */ +struct unwind_hint { + u32 ip; + s16 sp_offset; + u8 sp_reg; + u8 type; + u8 end; +}; +#endif + +/* + * UNWIND_HINT_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP + * (the caller's SP right before it made the call). Used for all callable + * functions, i.e. all C code and all callable asm functions. + * + * UNWIND_HINT_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset + * points to a fully populated pt_regs from a syscall, interrupt, or exception. + * + * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that + * sp_reg+sp_offset points to the iret return frame. + */ +#define UNWIND_HINT_TYPE_CALL 0 +#define UNWIND_HINT_TYPE_REGS 1 +#define UNWIND_HINT_TYPE_REGS_PARTIAL 2 +#define UNWIND_HINT_TYPE_RET_OFFSET 3 + +#ifdef CONFIG_STACK_VALIDATION + +#ifndef __ASSEMBLY__ + +#define UNWIND_HINT(sp_reg, sp_offset, type, end) \ + "987: \n\t" \ + ".pushsection .discard.unwind_hints\n\t" \ + /* struct unwind_hint */ \ + ".long 987b - .\n\t" \ + ".short " __stringify(sp_offset) "\n\t" \ + ".byte " __stringify(sp_reg) "\n\t" \ + ".byte " __stringify(type) "\n\t" \ + ".byte " __stringify(end) "\n\t" \ + ".balign 4 \n\t" \ + ".popsection\n\t" + +/* + * This macro marks the given function's stack frame as "non-standard", which + * tells objtool to ignore the function when doing stack metadata validation. 
+ * It should only be used in special cases where you're 100% sure it won't + * affect the reliability of frame pointers and kernel stack traces. + * + * For more information, see tools/objtool/Documentation/stack-validation.txt. + */ +#define STACK_FRAME_NON_STANDARD(func) \ + static void __used __section(.discard.func_stack_frame_non_standard) \ + *__func_stack_frame_non_standard_##func = func + +#else /* __ASSEMBLY__ */ + +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. + */ +#define ANNOTATE_INTRA_FUNCTION_CALL \ + 999: \ + .pushsection .discard.intra_function_calls; \ + .long 999b; \ + .popsection; + +/* + * In asm, there are two kinds of code: normal C-type callable functions and + * the rest. The normal callable functions can be called by other code, and + * don't do anything unusual with the stack. Such normal callable functions + * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * category. In this case, no special debugging annotations are needed because + * objtool can automatically generate the ORC data for the ORC unwinder to read + * at runtime. + * + * Anything which doesn't fall into the above category, such as syscall and + * interrupt handlers, tends to not be called directly by other functions, and + * often does unusual non-C-function-type things with the stack pointer. Such + * code needs to be annotated such that objtool can understand it. The + * following CFI hint macros are for this type of code. + * + * These macros provide hints to objtool about the state of the stack at each + * instruction. Objtool starts from the hints and follows the code flow, + * making automatic CFI adjustments when it sees pushes and pops, filling out + * the debuginfo as necessary. It will also warn if it sees any + * inconsistencies. 
+ */ +.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.Lunwind_hint_ip_\@: + .pushsection .discard.unwind_hints + /* struct unwind_hint */ + .long .Lunwind_hint_ip_\@ - . + .short \sp_offset + .byte \sp_reg + .byte \type + .byte \end + .balign 4 + .popsection +.endm + +#endif /* __ASSEMBLY__ */ + +#else /* !CONFIG_STACK_VALIDATION */ + +#ifndef __ASSEMBLY__ + +#define UNWIND_HINT(sp_reg, sp_offset, type, end) \ + "\n\t" +#define STACK_FRAME_NON_STANDARD(func) +#else +#define ANNOTATE_INTRA_FUNCTION_CALL +.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.endm +#endif + +#endif /* CONFIG_STACK_VALIDATION */ + +#endif /* _LINUX_OBJTOOL_H */ diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 7770edcda3a0..4ea9a833dde7 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -37,7 +37,7 @@ INCLUDES := -I$(srctree)/tools/include \ -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ -I$(srctree)/tools/arch/$(SRCARCH)/include \ -I$(srctree)/tools/objtool/arch/$(SRCARCH)/include -WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed +WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed -Wno-nested-externs CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBELF_FLAGS) LDFLAGS += $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) @@ -55,6 +55,10 @@ ifeq ($(SRCARCH),x86) SUBCMD_ORC := y endif +ifeq ($(SUBCMD_ORC),y) + CFLAGS += -DINSN_USE_ORC +endif + export SUBCMD_CHECK SUBCMD_ORC export srctree OUTPUT CFLAGS SRCARCH AWK include $(srctree)/tools/build/Makefile.include diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h index 2e2ce089b0e9..4a84c3081b8e 100644 --- a/tools/objtool/arch.h +++ b/tools/objtool/arch.h @@ -11,7 +11,9 @@ #include "objtool.h" #include "cfi.h" +#ifdef INSN_USE_ORC #include +#endif enum insn_type { INSN_JUMP_CONDITIONAL, @@ -86,4 +88,6 @@ unsigned long arch_dest_reloc_offset(int addend); const char *arch_nop_insn(int len); +int 
arch_decode_hint_reg(struct instruction *insn, u8 sp_reg); + #endif /* _ARCH_H */ diff --git a/tools/objtool/arch/x86/Build b/tools/objtool/arch/x86/Build index 7c5004008e97..9f7869b5c5e0 100644 --- a/tools/objtool/arch/x86/Build +++ b/tools/objtool/arch/x86/Build @@ -1,3 +1,4 @@ +objtool-y += special.o objtool-y += decode.o inat_tables_script = ../arch/x86/tools/gen-insn-attr-x86.awk diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 1967370440b3..cde9c36e40ae 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -15,6 +15,7 @@ #include "../../elf.h" #include "../../arch.h" #include "../../warn.h" +#include static unsigned char op_to_cfi_reg[][2] = { {CFI_AX, CFI_R8}, @@ -583,3 +584,39 @@ const char *arch_nop_insn(int len) return nops[len-1]; } + +int arch_decode_hint_reg(struct instruction *insn, u8 sp_reg) +{ + struct cfi_reg *cfa = &insn->cfi.cfa; + + switch (sp_reg) { + case ORC_REG_UNDEFINED: + cfa->base = CFI_UNDEFINED; + break; + case ORC_REG_SP: + cfa->base = CFI_SP; + break; + case ORC_REG_BP: + cfa->base = CFI_BP; + break; + case ORC_REG_SP_INDIRECT: + cfa->base = CFI_SP_INDIRECT; + break; + case ORC_REG_R10: + cfa->base = CFI_R10; + break; + case ORC_REG_R13: + cfa->base = CFI_R13; + break; + case ORC_REG_DI: + cfa->base = CFI_DI; + break; + case ORC_REG_DX: + cfa->base = CFI_DX; + break; + default: + return -1; + } + + return 0; +} diff --git a/tools/objtool/arch/x86/include/arch_special.h b/tools/objtool/arch/x86/include/arch_special.h new file mode 100644 index 000000000000..d818b2bffa02 --- /dev/null +++ b/tools/objtool/arch/x86/include/arch_special.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _X86_ARCH_SPECIAL_H +#define _X86_ARCH_SPECIAL_H + +#define EX_ENTRY_SIZE 12 +#define EX_ORIG_OFFSET 0 +#define EX_NEW_OFFSET 4 + +#define JUMP_ENTRY_SIZE 16 +#define JUMP_ORIG_OFFSET 0 +#define JUMP_NEW_OFFSET 4 + +#define ALT_ENTRY_SIZE 13 +#define 
ALT_ORIG_OFFSET 0 +#define ALT_NEW_OFFSET 4 +#define ALT_FEATURE_OFFSET 8 +#define ALT_ORIG_LEN_OFFSET 10 +#define ALT_NEW_LEN_OFFSET 11 + +#endif /* _X86_ARCH_SPECIAL_H */ diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c new file mode 100644 index 000000000000..fd4af88c0ea5 --- /dev/null +++ b/tools/objtool/arch/x86/special.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include + +#include "../../special.h" +#include "../../builtin.h" + +#define X86_FEATURE_POPCNT (4 * 32 + 23) +#define X86_FEATURE_SMAP (9 * 32 + 20) + +void arch_handle_alternative(unsigned short feature, struct special_alt *alt) +{ + switch (feature) { + case X86_FEATURE_SMAP: + /* + * If UACCESS validation is enabled; force that alternative; + * otherwise force it the other way. + * + * What we want to avoid is having both the original and the + * alternative code flow at the same time, in that case we can + * find paths that see the STAC but take the NOP instead of + * CLAC and the other way around. + */ + if (uaccess) + alt->skip_orig = true; + else + alt->skip_alt = true; + break; + case X86_FEATURE_POPCNT: + /* + * It has been requested that we don't validate the !POPCNT + * feature path which is a "very very small percentage of + * machines". + */ + alt->skip_orig = true; + break; + default: + break; + } +} + +bool arch_support_alt_relocation(struct special_alt *special_alt, + struct instruction *insn, + struct reloc *reloc) +{ + /* + * The x86 alternatives code adjusts the offsets only when it + * encounters a branch instruction at the very beginning of the + * replacement group. + */ + return insn->offset == special_alt->new_off && + (insn->type == INSN_CALL || is_static_jump(insn)); +} + +/* + * There are 3 basic jump table patterns: + * + * 1. jmpq *[rodata addr](,%reg,8) + * + * This is the most common case by far. It jumps to an address in a simple + * jump table which is stored in .rodata. + * + * 2. 
jmpq *[rodata addr](%rip) + * + * This is caused by a rare GCC quirk, currently only seen in three driver + * functions in the kernel, only with certain obscure non-distro configs. + * + * As part of an optimization, GCC makes a copy of an existing switch jump + * table, modifies it, and then hard-codes the jump (albeit with an indirect + * jump) to use a single entry in the table. The rest of the jump table and + * some of its jump targets remain as dead code. + * + * In such a case we can just crudely ignore all unreachable instruction + * warnings for the entire object file. Ideally we would just ignore them + * for the function, but that would require redesigning the code quite a + * bit. And honestly that's just not worth doing: unreachable instruction + * warnings are of questionable value anyway, and this is such a rare issue. + * + * 3. mov [rodata addr],%reg1 + * ... some instructions ... + * jmpq *(%reg1,%reg2,8) + * + * This is a fairly uncommon pattern which is new for GCC 6. As of this + * writing, there are 11 occurrences of it in the allmodconfig kernel. + * + * As of GCC 7 there are quite a few more of these and the 'in between' code + * is significant. Esp. with KASAN enabled some of the code between the mov + * and jmpq uses .rodata itself, which can confuse things. + * + * TODO: Once we have DWARF CFI and smarter instruction decoding logic, + * ensure the same register is used in the mov and jump instructions. + * + * NOTE: RETPOLINE made it harder still to decode dynamic jumps. 
+ */ +struct reloc *arch_find_switch_table(struct objtool_file *file, + struct instruction *insn) +{ + struct reloc *text_reloc, *rodata_reloc; + struct section *table_sec; + unsigned long table_offset; + + /* look for a relocation which references .rodata */ + text_reloc = find_reloc_by_dest_range(file->elf, insn->sec, + insn->offset, insn->len); + if (!text_reloc || text_reloc->sym->type != STT_SECTION || + !text_reloc->sym->sec->rodata) + return NULL; + + table_offset = text_reloc->addend; + table_sec = text_reloc->sym->sec; + + if (text_reloc->type == R_X86_64_PC32) + table_offset += 4; + + /* + * Make sure the .rodata address isn't associated with a + * symbol. GCC jump tables are anonymous data. + * + * Also support C jump tables which are in the same format as + * switch jump tables. For objtool to recognize them, they + * need to be placed in the C_JUMP_TABLE_SECTION section. They + * have symbols associated with them. + */ + if (find_symbol_containing(table_sec, table_offset) && + strcmp(table_sec->name, C_JUMP_TABLE_SECTION)) + return NULL; + + /* + * Each table entry has a rela associated with it. The rela + * should reference text in the same function as the original + * instruction. + */ + rodata_reloc = find_reloc_by_dest(file->elf, table_sec, table_offset); + if (!rodata_reloc) + return NULL; + + /* + * Use of RIP-relative switch jumps is quite rare, and + * indicates a rare GCC quirk/bug which can leave dead + * code behind. 
+ */ + if (text_reloc->type == R_X86_64_PC32) + file->ignore_unreachables = true; + + return rodata_reloc; +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 7a44174967b5..c6d199bfd0ae 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -41,6 +41,8 @@ const struct option check_options[] = { int cmd_check(int argc, const char **argv) { const char *objname, *s; + struct objtool_file *file; + int ret; argc = parse_options(argc, argv, check_options, check_usage, 0); @@ -53,5 +55,16 @@ int cmd_check(int argc, const char **argv) if (s && !s[9]) vmlinux = true; - return check(objname, false); + file = objtool_open_read(objname); + if (!file) + return 1; + + ret = check(file); + if (ret) + return ret; + + if (file->elf->changed) + return elf_write(file->elf); + + return 0; } diff --git a/tools/objtool/builtin-orc.c b/tools/objtool/builtin-orc.c index b1dfe2007962..7b31121fa60b 100644 --- a/tools/objtool/builtin-orc.c +++ b/tools/objtool/builtin-orc.c @@ -31,13 +31,38 @@ int cmd_orc(int argc, const char **argv) usage_with_options(orc_usage, check_options); if (!strncmp(argv[0], "gen", 3)) { + struct objtool_file *file; + int ret; + argc = parse_options(argc, argv, check_options, orc_usage, 0); if (argc != 1) usage_with_options(orc_usage, check_options); objname = argv[0]; - return check(objname, true); + file = objtool_open_read(objname); + if (!file) + return 1; + + ret = check(file); + if (ret) + return ret; + + if (list_empty(&file->insn_list)) + return 0; + + ret = create_orc(file); + if (ret) + return ret; + + ret = create_orc_sections(file); + if (ret) + return ret; + + if (!file->elf->changed) + return 0; + + return elf_write(file->elf); } if (!strcmp(argv[0], "dump")) { diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 326ac390168b..c6ab44543c92 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -14,21 +14,19 @@ #include "warn.h" #include "arch_elf.h" +#include #include 
#include #include #define FAKE_JUMP_OFFSET -1 -#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table" - struct alternative { struct list_head list; struct instruction *insn; bool skip_orig; }; -const char *objname; struct cfi_init_state initial_func_cfi; struct instruction *find_insn(struct objtool_file *file, @@ -111,12 +109,6 @@ static struct instruction *prev_insn_same_sym(struct objtool_file *file, for (insn = next_insn_same_sec(file, insn); insn; \ insn = next_insn_same_sec(file, insn)) -static bool is_static_jump(struct instruction *insn) -{ - return insn->type == INSN_JUMP_CONDITIONAL || - insn->type == INSN_JUMP_UNCONDITIONAL; -} - static bool is_sibling_call(struct instruction *insn) { /* An indirect jump is either a sibling call or a jump to a table. */ @@ -591,6 +583,8 @@ static const char *uaccess_safe_builtin[] = { "__asan_store4_noabort", "__asan_store8_noabort", "__asan_store16_noabort", + "__kasan_check_read", + "__kasan_check_write", /* KASAN in-line */ "__asan_report_load_n_noabort", "__asan_report_load1_noabort", @@ -879,6 +873,17 @@ static void remove_insn_ops(struct instruction *insn) } } +static struct symbol *find_call_destination(struct section *sec, unsigned long offset) +{ + struct symbol *call_dest; + + call_dest = find_func_by_offset(sec, offset); + if (!call_dest) + call_dest = find_symbol_by_offset(sec, offset); + + return call_dest; +} + /* * Find the destination instructions for all calls. 
*/ @@ -896,9 +901,7 @@ static int add_call_destinations(struct objtool_file *file) insn->offset, insn->len); if (!reloc) { dest_off = arch_jump_destination(insn); - insn->call_dest = find_func_by_offset(insn->sec, dest_off); - if (!insn->call_dest) - insn->call_dest = find_symbol_by_offset(insn->sec, dest_off); + insn->call_dest = find_call_destination(insn->sec, dest_off); if (insn->ignore) continue; @@ -916,8 +919,8 @@ static int add_call_destinations(struct objtool_file *file) } else if (reloc->sym->type == STT_SECTION) { dest_off = arch_dest_reloc_offset(reloc->addend); - insn->call_dest = find_func_by_offset(reloc->sym->sec, - dest_off); + insn->call_dest = find_call_destination(reloc->sym->sec, + dest_off); if (!insn->call_dest) { WARN_FUNC("can't find call dest symbol at %s+0x%lx", insn->sec, insn->offset, @@ -1029,6 +1032,8 @@ static int handle_group_alt(struct objtool_file *file, alt_group = alt_group_next_index++; insn = *new_insn; sec_for_each_insn_from(file, insn) { + struct reloc *alt_reloc; + if (insn->offset >= special_alt->new_off + special_alt->new_len) break; @@ -1045,14 +1050,11 @@ static int handle_group_alt(struct objtool_file *file, * .altinstr_replacement section, unless the arch's * alternatives code can adjust the relative offsets * accordingly. - * - * The x86 alternatives code adjusts the offsets only when it - * encounters a branch instruction at the very beginning of the - * replacement group. 
*/ - if ((insn->offset != special_alt->new_off || - (insn->type != INSN_CALL && !is_static_jump(insn))) && - find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len)) { + alt_reloc = find_reloc_by_dest_range(file->elf, insn->sec, + insn->offset, insn->len); + if (alt_reloc && + !arch_support_alt_relocation(special_alt, insn, alt_reloc)) { WARN_FUNC("unsupported relocation in alternatives section", insn->sec, insn->offset); @@ -1254,56 +1256,15 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, } /* - * find_jump_table() - Given a dynamic jump, find the switch jump table in - * .rodata associated with it. - * - * There are 3 basic patterns: - * - * 1. jmpq *[rodata addr](,%reg,8) - * - * This is the most common case by far. It jumps to an address in a simple - * jump table which is stored in .rodata. - * - * 2. jmpq *[rodata addr](%rip) - * - * This is caused by a rare GCC quirk, currently only seen in three driver - * functions in the kernel, only with certain obscure non-distro configs. - * - * As part of an optimization, GCC makes a copy of an existing switch jump - * table, modifies it, and then hard-codes the jump (albeit with an indirect - * jump) to use a single entry in the table. The rest of the jump table and - * some of its jump targets remain as dead code. - * - * In such a case we can just crudely ignore all unreachable instruction - * warnings for the entire object file. Ideally we would just ignore them - * for the function, but that would require redesigning the code quite a - * bit. And honestly that's just not worth doing: unreachable instruction - * warnings are of questionable value anyway, and this is such a rare issue. - * - * 3. mov [rodata addr],%reg1 - * ... some instructions ... - * jmpq *(%reg1,%reg2,8) - * - * This is a fairly uncommon pattern which is new for GCC 6. As of this - * writing, there are 11 occurrences of it in the allmodconfig kernel. 
- * - * As of GCC 7 there are quite a few more of these and the 'in between' code - * is significant. Esp. with KASAN enabled some of the code between the mov - * and jmpq uses .rodata itself, which can confuse things. - * - * TODO: Once we have DWARF CFI and smarter instruction decoding logic, - * ensure the same register is used in the mov and jump instructions. - * - * NOTE: RETPOLINE made it harder still to decode dynamic jumps. + * find_jump_table() - Given a dynamic jump, find the switch jump table + * associated with it. */ static struct reloc *find_jump_table(struct objtool_file *file, struct symbol *func, struct instruction *insn) { - struct reloc *text_reloc, *table_reloc; + struct reloc *table_reloc; struct instruction *dest_insn, *orig_insn = insn; - struct section *table_sec; - unsigned long table_offset; /* * Backward search using the @first_jump_src links, these help avoid @@ -1324,52 +1285,13 @@ static struct reloc *find_jump_table(struct objtool_file *file, insn->jump_dest->offset > orig_insn->offset)) break; - /* look for a relocation which references .rodata */ - text_reloc = find_reloc_by_dest_range(file->elf, insn->sec, - insn->offset, insn->len); - if (!text_reloc || text_reloc->sym->type != STT_SECTION || - !text_reloc->sym->sec->rodata) - continue; - - table_offset = text_reloc->addend; - table_sec = text_reloc->sym->sec; - - if (text_reloc->type == R_X86_64_PC32) - table_offset += 4; - - /* - * Make sure the .rodata address isn't associated with a - * symbol. GCC jump tables are anonymous data. - * - * Also support C jump tables which are in the same format as - * switch jump tables. For objtool to recognize them, they - * need to be placed in the C_JUMP_TABLE_SECTION section. They - * have symbols associated with them. - */ - if (find_symbol_containing(table_sec, table_offset) && - strcmp(table_sec->name, C_JUMP_TABLE_SECTION)) - continue; - - /* - * Each table entry has a reloc associated with it. 
The reloc - * should reference text in the same function as the original - * instruction. - */ - table_reloc = find_reloc_by_dest(file->elf, table_sec, table_offset); + table_reloc = arch_find_switch_table(file, insn); if (!table_reloc) continue; dest_insn = find_insn(file, table_reloc->sym->sec, table_reloc->addend); if (!dest_insn || !dest_insn->func || dest_insn->func->pfunc != func) continue; - /* - * Use of RIP-relative switch jumps is quite rare, and - * indicates a rare GCC quirk/bug which can leave dead code - * behind. - */ - if (text_reloc->type == R_X86_64_PC32) - file->ignore_unreachables = true; - return table_reloc; } @@ -1512,32 +1434,7 @@ static int read_unwind_hints(struct objtool_file *file) insn->hint = true; - switch (hint->sp_reg) { - case ORC_REG_UNDEFINED: - cfa->base = CFI_UNDEFINED; - break; - case ORC_REG_SP: - cfa->base = CFI_SP; - break; - case ORC_REG_BP: - cfa->base = CFI_BP; - break; - case ORC_REG_SP_INDIRECT: - cfa->base = CFI_SP_INDIRECT; - break; - case ORC_REG_R10: - cfa->base = CFI_R10; - break; - case ORC_REG_R13: - cfa->base = CFI_R13; - break; - case ORC_REG_DI: - cfa->base = CFI_DI; - break; - case ORC_REG_DX: - cfa->base = CFI_DX; - break; - default: + if (arch_decode_hint_reg(insn, hint->sp_reg)) { WARN_FUNC("unsupported unwind_hint sp base reg %d", insn->sec, insn->offset, hint->sp_reg); return -1; @@ -1951,7 +1848,8 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, return 0; } - if (cfi->type == ORC_TYPE_REGS || cfi->type == ORC_TYPE_REGS_IRET) + if (cfi->type == UNWIND_HINT_TYPE_REGS || + cfi->type == UNWIND_HINT_TYPE_REGS_PARTIAL) return update_cfi_state_regs(insn, cfi, op); switch (op->dest.type) { @@ -2199,7 +2097,7 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, /* drap: push %rbp */ cfi->stack_size = 0; - } else if (regs[op->src.reg].base == CFI_UNDEFINED) { + } else { /* drap: push %reg */ save_reg(cfi, op->src.reg, CFI_BP, -cfi->stack_size); @@ -2228,9 
+2126,7 @@ static int update_cfi_state(struct instruction *insn, struct cfi_state *cfi, /* save drap offset so we know when to restore it */ cfi->drap_offset = op->dest.offset; - } - - else if (regs[op->src.reg].base == CFI_UNDEFINED) { + } else { /* drap: mov reg, disp(%rbp) */ save_reg(cfi, op->src.reg, CFI_BP, op->dest.offset); @@ -2800,9 +2696,10 @@ static bool is_ubsan_insn(struct instruction *insn) "__ubsan_handle_builtin_unreachable")); } -static bool ignore_unreachable_insn(struct instruction *insn) +static bool ignore_unreachable_insn(struct objtool_file *file, struct instruction *insn) { int i; + struct instruction *prev_insn; if (insn->ignore || insn->type == INSN_NOP) return true; @@ -2819,6 +2716,9 @@ static bool ignore_unreachable_insn(struct instruction *insn) !strcmp(insn->sec->name, ".altinstr_aux")) return true; + if (insn->type == INSN_JUMP_UNCONDITIONAL && insn->offset == FAKE_JUMP_OFFSET) + return true; + if (!insn->func) return false; @@ -2827,8 +2727,11 @@ static bool ignore_unreachable_insn(struct instruction *insn) * __builtin_unreachable(). The BUG() macro has an unreachable() after * the UD2, which causes GCC's undefined trap logic to emit another UD2 * (or occasionally a JMP to UD2). + * + * It may also insert a UD2 after calling a __noreturn function. 
*/ - if (list_prev_entry(insn, list)->dead_end && + prev_insn = list_prev_entry(insn, list); + if ((prev_insn->dead_end || dead_end_function(file, prev_insn->call_dest)) && (insn->type == INSN_BUG || (insn->type == INSN_JUMP_UNCONDITIONAL && insn->jump_dest && insn->jump_dest->type == INSN_BUG))) @@ -2955,7 +2858,7 @@ static int validate_reachable_instructions(struct objtool_file *file) return 0; for_each_insn(file, insn) { - if (insn->visited || ignore_unreachable_insn(insn)) + if (insn->visited || ignore_unreachable_insn(file, insn)) continue; WARN_FUNC("unreachable instruction", insn->sec, insn->offset); @@ -2965,37 +2868,22 @@ static int validate_reachable_instructions(struct objtool_file *file) return 0; } -static struct objtool_file file; - -int check(const char *_objname, bool orc) +int check(struct objtool_file *file) { int ret, warnings = 0; - objname = _objname; - - file.elf = elf_open_read(objname, O_RDWR); - if (!file.elf) - return 1; - - INIT_LIST_HEAD(&file.insn_list); - hash_init(file.insn_hash); - INIT_LIST_HEAD(&file.static_call_list); - file.c_file = !vmlinux && find_section_by_name(file.elf, ".comment"); - file.ignore_unreachables = no_unreachable; - file.hints = false; - arch_initial_func_cfi_state(&initial_func_cfi); - ret = decode_sections(&file); + ret = decode_sections(file); if (ret < 0) goto out; warnings += ret; - if (list_empty(&file.insn_list)) + if (list_empty(&file->insn_list)) goto out; if (vmlinux && !validate_dup) { - ret = validate_vmlinux_functions(&file); + ret = validate_vmlinux_functions(file); if (ret < 0) goto out; @@ -3004,50 +2892,34 @@ int check(const char *_objname, bool orc) } if (retpoline) { - ret = validate_retpoline(&file); + ret = validate_retpoline(file); if (ret < 0) return ret; warnings += ret; } - ret = validate_functions(&file); + ret = validate_functions(file); if (ret < 0) goto out; warnings += ret; - ret = validate_unwind_hints(&file, NULL); + ret = validate_unwind_hints(file, NULL); if (ret < 0) goto out; 
warnings += ret; if (!warnings) { - ret = validate_reachable_instructions(&file); + ret = validate_reachable_instructions(file); if (ret < 0) goto out; warnings += ret; } - ret = create_static_call_sections(&file); + ret = create_static_call_sections(file); if (ret < 0) goto out; warnings += ret; - if (orc) { - ret = create_orc(&file); - if (ret < 0) - goto out; - - ret = create_orc_sections(&file); - if (ret < 0) - goto out; - } - - if (file.elf->changed) { - ret = elf_write(file.elf); - if (ret < 0) - goto out; - } - out: if (ret < 0) { /* diff --git a/tools/objtool/check.h b/tools/objtool/check.h index 36d38b9153ac..5ec00a4b891b 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -43,9 +43,17 @@ struct instruction { struct symbol *func; struct list_head stack_ops; struct cfi_state cfi; +#ifdef INSN_USE_ORC struct orc_entry orc; +#endif }; +static inline bool is_static_jump(struct instruction *insn) +{ + return insn->type == INSN_JUMP_CONDITIONAL || + insn->type == INSN_JUMP_UNCONDITIONAL; +} + struct instruction *find_insn(struct objtool_file *file, struct section *sec, unsigned long offset); @@ -58,5 +66,4 @@ struct instruction *find_insn(struct objtool_file *file, insn->sec == sec; \ insn = list_next_entry(insn, list)) - #endif /* _CHECK_H */ diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index 58fdda510653..9df0cd86d310 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -22,6 +22,8 @@ #include #include "builtin.h" +#include "objtool.h" +#include "warn.h" struct cmd_struct { const char *name; @@ -39,6 +41,34 @@ static struct cmd_struct objtool_cmds[] = { bool help; +const char *objname; +static struct objtool_file file; + +struct objtool_file *objtool_open_read(const char *_objname) +{ + if (objname) { + if (strcmp(objname, _objname)) { + WARN("won't handle more than one file at a time"); + return NULL; + } + return &file; + } + objname = _objname; + + file.elf = elf_open_read(objname, O_RDWR); + if (!file.elf) 
+ return NULL; + + INIT_LIST_HEAD(&file.insn_list); + hash_init(file.insn_hash); + INIT_LIST_HEAD(&file.static_call_list); + file.c_file = !vmlinux && find_section_by_name(file.elf, ".comment"); + file.ignore_unreachables = no_unreachable; + file.hints = false; + + return &file; +} + static void cmd_usage(void) { unsigned int i, longest = 0; diff --git a/tools/objtool/objtool.h b/tools/objtool/objtool.h index 9a7cd0b88bd8..4125d4578b23 100644 --- a/tools/objtool/objtool.h +++ b/tools/objtool/objtool.h @@ -12,6 +12,8 @@ #include "elf.h" +#define __weak __attribute__((weak)) + struct objtool_file { struct elf *elf; struct list_head insn_list; @@ -20,7 +22,9 @@ struct objtool_file { bool ignore_unreachables, c_file, hints, rodata; }; -int check(const char *objname, bool orc); +struct objtool_file *objtool_open_read(const char *_objname); + +int check(struct objtool_file *file); int orc_dump(const char *objname); int create_orc(struct objtool_file *file); int create_orc_sections(struct objtool_file *file); diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c index fca46e006fc2..5e6a95368d35 100644 --- a/tools/objtool/orc_dump.c +++ b/tools/objtool/orc_dump.c @@ -4,6 +4,7 @@ */ #include +#include #include #include "objtool.h" #include "warn.h" @@ -37,12 +38,12 @@ static const char *reg_name(unsigned int reg) static const char *orc_type_name(unsigned int type) { switch (type) { - case ORC_TYPE_CALL: + case UNWIND_HINT_TYPE_CALL: return "call"; - case ORC_TYPE_REGS: + case UNWIND_HINT_TYPE_REGS: return "regs"; - case ORC_TYPE_REGS_IRET: - return "iret"; + case UNWIND_HINT_TYPE_REGS_PARTIAL: + return "regs (partial)"; default: return "?"; } diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index e6b2363c2e03..235663b96adc 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -6,6 +6,9 @@ #include #include +#include +#include + #include "check.h" #include "warn.h" @@ -18,6 +21,9 @@ int create_orc(struct objtool_file *file) struct 
cfi_reg *cfa = &insn->cfi.cfa; struct cfi_reg *bp = &insn->cfi.regs[CFI_BP]; + if (!insn->sec->text) + continue; + orc->end = insn->cfi.end; if (cfa->base == CFI_UNDEFINED) { @@ -143,7 +149,7 @@ int create_orc_sections(struct objtool_file *file) struct orc_entry empty = { .sp_reg = ORC_REG_UNDEFINED, .bp_reg = ORC_REG_UNDEFINED, - .type = ORC_TYPE_CALL, + .type = UNWIND_HINT_TYPE_CALL, }; sec = find_section_by_name(file->elf, ".orc_unwind"); diff --git a/tools/objtool/special.c b/tools/objtool/special.c index e893f1e48e44..1a2420febd08 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -14,24 +14,7 @@ #include "builtin.h" #include "special.h" #include "warn.h" - -#define EX_ENTRY_SIZE 12 -#define EX_ORIG_OFFSET 0 -#define EX_NEW_OFFSET 4 - -#define JUMP_ENTRY_SIZE 16 -#define JUMP_ORIG_OFFSET 0 -#define JUMP_NEW_OFFSET 4 - -#define ALT_ENTRY_SIZE 13 -#define ALT_ORIG_OFFSET 0 -#define ALT_NEW_OFFSET 4 -#define ALT_FEATURE_OFFSET 8 -#define ALT_ORIG_LEN_OFFSET 10 -#define ALT_NEW_LEN_OFFSET 11 - -#define X86_FEATURE_POPCNT (4*32+23) -#define X86_FEATURE_SMAP (9*32+20) +#include "arch_special.h" struct special_entry { const char *sec; @@ -68,6 +51,10 @@ struct special_entry entries[] = { {}, }; +void __weak arch_handle_alternative(unsigned short feature, struct special_alt *alt) +{ +} + static int get_alt_entry(struct elf *elf, struct special_entry *entry, struct section *sec, int idx, struct special_alt *alt) @@ -92,30 +79,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, feature = *(unsigned short *)(sec->data->d_buf + offset + entry->feature); - - /* - * It has been requested that we don't validate the !POPCNT - * feature path which is a "very very small percentage of - * machines". - */ - if (feature == X86_FEATURE_POPCNT) - alt->skip_orig = true; - - /* - * If UACCESS validation is enabled; force that alternative; - * otherwise force it the other way. 
- * - * What we want to avoid is having both the original and the - * alternative code flow at the same time, in that case we can - * find paths that see the STAC but take the NOP instead of - * CLAC and the other way around. - */ - if (feature == X86_FEATURE_SMAP) { - if (uaccess) - alt->skip_orig = true; - else - alt->skip_alt = true; - } + arch_handle_alternative(feature, alt); } orig_reloc = find_reloc_by_dest(elf, sec, offset + entry->orig); diff --git a/tools/objtool/special.h b/tools/objtool/special.h index 35061530e46e..abddf38ef334 100644 --- a/tools/objtool/special.h +++ b/tools/objtool/special.h @@ -7,8 +7,11 @@ #define _SPECIAL_H #include +#include "check.h" #include "elf.h" +#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table" + struct special_alt { struct list_head list; @@ -28,4 +31,11 @@ struct special_alt { int special_get_alts(struct elf *elf, struct list_head *alts); +void arch_handle_alternative(unsigned short feature, struct special_alt *alt); + +bool arch_support_alt_relocation(struct special_alt *special_alt, + struct instruction *insn, + struct reloc *reloc); +struct reloc *arch_find_switch_table(struct objtool_file *file, + struct instruction *insn); #endif /* _SPECIAL_H */ diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh index aa099b21dffa..606a4b5e929f 100755 --- a/tools/objtool/sync-check.sh +++ b/tools/objtool/sync-check.sh @@ -1,14 +1,27 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 -FILES=' +if [ -z "$SRCARCH" ]; then + echo 'sync-check.sh: error: missing $SRCARCH environment variable' >&2 + exit 1 +fi + +FILES="include/linux/objtool.h" + +if [ "$SRCARCH" = "x86" ]; then +FILES="$FILES arch/x86/include/asm/inat_types.h arch/x86/include/asm/orc_types.h arch/x86/include/asm/emulate_prefix.h arch/x86/lib/x86-opcode-map.txt arch/x86/tools/gen-insn-attr-x86.awk include/linux/static_call_types.h -' +arch/x86/include/asm/inat.h -I '^#include [\"<]\(asm/\)*inat_types.h[\">]' +arch/x86/include/asm/insn.h -I '^#include 
[\"<]\(asm/\)*inat.h[\">]' +arch/x86/lib/inat.c -I '^#include [\"<]\(../include/\)*asm/insn.h[\">]' +arch/x86/lib/insn.c -I '^#include [\"<]\(../include/\)*asm/in\(at\|sn\).h[\">]' -I '^#include [\"<]\(../include/\)*asm/emulate_prefix.h[\">]' +" +fi check_2 () { file1=$1 @@ -41,11 +54,12 @@ fi cd ../.. -for i in $FILES; do - check $i -done +while read -r file_entry; do + if [ -z "$file_entry" ]; then + continue + fi -check arch/x86/include/asm/inat.h '-I "^#include [\"<]\(asm/\)*inat_types.h[\">]"' -check arch/x86/include/asm/insn.h '-I "^#include [\"<]\(asm/\)*inat.h[\">]"' -check arch/x86/lib/inat.c '-I "^#include [\"<]\(../include/\)*asm/insn.h[\">]"' -check arch/x86/lib/insn.c '-I "^#include [\"<]\(../include/\)*asm/in\(at\|sn\).h[\">]" -I "^#include [\"<]\(../include/\)*asm/emulate_prefix.h[\">]"' + check $file_entry +done < #include "objtool.h" -#define __weak __attribute__((weak)) - #define UNSUPPORTED(name) \ ({ \ fprintf(stderr, "error: objtool: " name " not implemented\n"); \ return ENOSYS; \ }) -const char __weak *objname; - -int __weak check(const char *_objname, bool orc) +int __weak check(struct objtool_file *file) { UNSUPPORTED("check subcommand"); } diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 190be4fa5c21..8137a6046a47 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -16,7 +16,7 @@ $(shell printf "" > $(OUTPUT).config-detected) detected = $(shell echo "$(1)=y" >> $(OUTPUT).config-detected) detected_var = $(shell echo "$(1)=$($(1))" >> $(OUTPUT).config-detected) -CFLAGS := $(EXTRA_CFLAGS) $(EXTRA_WARNINGS) +CFLAGS := $(EXTRA_CFLAGS) $(filter-out -Wnested-externs,$(EXTRA_WARNINGS)) include $(srctree)/tools/scripts/Makefile.arch diff --git a/tools/power/acpi/Makefile b/tools/power/acpi/Makefile index ebd3e1a1c28e..a249c50ebf55 100644 --- a/tools/power/acpi/Makefile +++ b/tools/power/acpi/Makefile @@ -7,6 +7,8 @@ include ../../scripts/Makefile.include +.NOTPARALLEL: + all: acpidbg acpidump ec 
clean: acpidbg_clean acpidump_clean ec_clean install: acpidbg_install acpidump_install ec_install diff --git a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c index dd38c2b2e1b4..11c5046dce16 100644 --- a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c +++ b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c @@ -110,7 +110,7 @@ u32 gbl_table_count = 0; * * RETURN: Status; Converted from errno. * - * DESCRIPTION: Get last errno and conver it to acpi_status. + * DESCRIPTION: Get last errno and convert it to acpi_status. * *****************************************************************************/ diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 9f4b190f1d74..cd089a505859 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -15,7 +15,7 @@ struct process_cmd_struct { int arg; }; -static const char *version_str = "v1.5"; +static const char *version_str = "v1.6"; static const int supported_api_ver = 1; static struct isst_if_platform_info isst_platform_info; static char *progname; @@ -545,20 +545,23 @@ static void set_cpu_present_cpu_mask(void) } } -int get_core_count(int pkg_id, int die_id) +int get_max_punit_core_id(int pkg_id, int die_id) { - int cnt = 0; + int max_id = 0; + int i; - if (pkg_id < MAX_PACKAGE_COUNT && die_id < MAX_DIE_PER_PACKAGE) { - int i; + for (i = 0; i < topo_max_cpus; ++i) + { + if (!CPU_ISSET_S(i, present_cpumask_size, present_cpumask)) + continue; - for (i = 0; i < sizeof(long long) * 8; ++i) { - if (core_mask[pkg_id][die_id] & (1ULL << i)) - cnt++; - } + if (cpu_map[i].pkg_id == pkg_id && + cpu_map[i].die_id == die_id && + cpu_map[i].punit_cpu_core > max_id) + max_id = cpu_map[i].punit_cpu_core; } - return cnt; + return max_id; } int get_cpu_count(int pkg_id, int die_id) diff --git 
a/tools/power/x86/intel-speed-select/isst-core.c b/tools/power/x86/intel-speed-select/isst-core.c index a7f4337c5777..1d7ecb54352e 100644 --- a/tools/power/x86/intel-speed-select/isst-core.c +++ b/tools/power/x86/intel-speed-select/isst-core.c @@ -396,7 +396,7 @@ int isst_get_pbf_info(int cpu, int level, struct isst_pbf_info *pbf_info) { struct isst_pkg_ctdp_level_info ctdp_level; struct isst_pkg_ctdp pkg_dev; - int i, ret, core_cnt, max; + int i, ret, max_punit_core, max_mask_index; unsigned int req, resp; ret = isst_get_ctdp_levels(cpu, &pkg_dev); @@ -421,10 +421,10 @@ int isst_get_pbf_info(int cpu, int level, struct isst_pbf_info *pbf_info) pbf_info->core_cpumask_size = alloc_cpu_set(&pbf_info->core_cpumask); - core_cnt = get_core_count(get_physical_package_id(cpu), get_physical_die_id(cpu)); - max = core_cnt > 32 ? 2 : 1; + max_punit_core = get_max_punit_core_id(get_physical_package_id(cpu), get_physical_die_id(cpu)); + max_mask_index = max_punit_core > 32 ? 2 : 1; - for (i = 0; i < max; ++i) { + for (i = 0; i < max_mask_index; ++i) { unsigned long long mask; int count; diff --git a/tools/power/x86/intel-speed-select/isst.h b/tools/power/x86/intel-speed-select/isst.h index 094ba4589a9c..29715e9c2e06 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -170,7 +170,7 @@ struct isst_pkg_ctdp { extern int get_topo_max_cpus(void); extern int get_cpu_count(int pkg_id, int die_id); -extern int get_core_count(int pkg_id, int die_id); +extern int get_max_punit_core_id(int pkg_id, int die_id); /* Common interfaces */ FILE *get_output_file(void);