From e6c591e7a430197777a6456a3cd2db38fffd7647 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Apr 2016 17:35:29 +0200 Subject: [PATCH 001/131] gcov: disable for COMPILE_TEST commit cc622420798c4bcf093785d872525087a7798db9 upstream. Enabling gcov is counterproductive to compile testing: it significantly increases the kernel image size, compile time, and it produces lots of false positive "may be used uninitialized" warnings as the result of missed optimizations. This is in line with how UBSAN_SANITIZE_ALL and PROFILE_ALL_BRANCHES work, both of which have similar problems. With an ARM allmodconfig kernel, I see the build time drop from 283 minutes CPU time to 225 minutes, and the vmlinux size drops from 43MB to 26MB. Signed-off-by: Arnd Bergmann Acked-by: Peter Oberparleiter Signed-off-by: Michal Marek Signed-off-by: Greg Kroah-Hartman --- kernel/gcov/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index c92e44855ddd..1276aabaab55 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -37,6 +37,7 @@ config ARCH_HAS_GCOV_PROFILE_ALL config GCOV_PROFILE_ALL bool "Profile entire Kernel" + depends on !COMPILE_TEST depends on GCOV_KERNEL depends on ARCH_HAS_GCOV_PROFILE_ALL default n From 20c28c04a6bc2ebd60fa20e5c3a6bf3bfa736d81 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 8 Jan 2018 16:09:21 -0600 Subject: [PATCH 002/131] x86/cpu/AMD: Make LFENCE a serializing instruction commit e4d0e84e490790798691aaa0f2e598637f1867ec upstream. To aid in speculation control, make LFENCE a serializing instruction since it has less overhead than MFENCE. This is done by setting bit 1 of MSR 0xc0011029 (DE_CFG). Some families that support LFENCE do not have this MSR. For these families, the LFENCE instruction is already serializing. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Reviewed-by: Borislav Petkov Cc: Peter Zijlstra Cc: Tim Chen Cc: Dave Hansen Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David Woodhouse Cc: Paul Turner Link: https://lkml.kernel.org/r/20180108220921.12580.71694.stgit@tlendack-t1.amdoffice.net Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/amd.c | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 37db36fddc88..f00dfff81370 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -330,6 +330,8 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c +#define MSR_F10H_DECFG 0xc0011029 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e2defc7593a4..7f2e9c6ff239 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -746,6 +746,16 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has_xmm2) { + /* + * A serializing LFENCE has less overhead than MFENCE, so + * use it for execution serialization. On families which + * don't have that MSR, LFENCE is already serializing. + * msr_set_bit() uses the safe accessors, too, even if the MSR + * is not present. + */ + msr_set_bit(MSR_F10H_DECFG, + MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } From 642ce1bb5ea64b1d036dff04a005d9bbe28ab5b4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 8 Jan 2018 16:09:32 -0600 Subject: [PATCH 003/131] x86/cpu/AMD: Use LFENCE_RDTSC in preference to MFENCE_RDTSC commit 9c6a73c75864ad9fa49e5fa6513e4c4071c0e29f upstream. With LFENCE now a serializing instruction, use LFENCE_RDTSC in preference to MFENCE_RDTSC. However, since the kernel could be running under a hypervisor that does not support writing that MSR, read the MSR back and verify that the bit has been set successfully. If the MSR can be read and the bit is set, then set the LFENCE_RDTSC feature, otherwise set the MFENCE_RDTSC feature. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Reviewed-by: Borislav Petkov Cc: Peter Zijlstra Cc: Tim Chen Cc: Dave Hansen Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David Woodhouse Cc: Paul Turner Link: https://lkml.kernel.org/r/20180108220932.12580.52458.stgit@tlendack-t1.amdoffice.net Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/amd.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f00dfff81370..b8911aecf035 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -332,6 +332,7 @@ #define MSR_FAM10H_NODE_ID 0xc001100c #define MSR_F10H_DECFG 0xc0011029 #define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7f2e9c6ff239..4bf9e77f3e05 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -746,6 +746,9 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has_xmm2) { + unsigned long long val; + int ret; + /* * A serializing LFENCE has less overhead than MFENCE, so * use it for execution serialization. On families which @@ -756,8 +759,19 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_F10H_DECFG, MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + /* + * Verify that the MSR write was successful (could be running + * under a hypervisor) and only then assume that LFENCE is + * serializing. + */ + ret = rdmsrl_safe(MSR_F10H_DECFG, &val); + if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + } else { + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } } /* From 416f66509fce36e128ea2d9e62739a21f1a1d04d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 17 Sep 2017 09:03:50 -0700 Subject: [PATCH 004/131] x86/mm/32: Move setup_clear_cpu_cap(X86_FEATURE_PCID) earlier commit b8b7abaed7a49b350f8ba659ddc264b04931d581 upstream. Otherwise we might have the PCID feature bit set during cpu_init(). This is just for robustness. I haven't seen any actual bugs here. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: cba4671af755 ("x86/mm: Disable PCID on 32-bit kernels") Link: http://lkml.kernel.org/r/b16dae9d6b0db5d9801ddbebbfd83384097c61f3.1505663533.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 8 -------- arch/x86/kernel/cpu/common.c | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index cd46f9039119..cb6b4f9d0b7a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -22,14 +22,6 @@ void __init check_bugs(void) { -#ifdef CONFIG_X86_32 - /* - * Regardless of whether PCID is enumerated, the SDM says - * that it can't be enabled in 32-bit mode. - */ - setup_clear_cpu_cap(X86_FEATURE_PCID); -#endif - identify_boot_cpu(); if (!IS_ENABLED(CONFIG_SMP)) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index dc4dfad66a70..0531c1707b40 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -838,6 +838,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V2); fpu__init_system(c); + +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif } void __init early_cpu_init(void) From cfc8c1d61e46fd3c60a34a5b1962eeeb03222a3d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 29 Sep 2017 17:15:36 +0300 Subject: [PATCH 005/131] x86/asm: Use register variable to get stack pointer value commit 196bd485ee4f03ce4c690bfcf38138abfcd0a4bc upstream. Currently we use current_stack_pointer() function to get the value of the stack pointer register. Since commit: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") ... we have a stack register variable declared. It can be used instead of current_stack_pointer() function which allows to optimize away some excessive "mov %rsp, %" instructions: -mov %rsp,%rdx -sub %rdx,%rax -cmp $0x3fff,%rax -ja ffffffff810722fd +sub %rsp,%rax +cmp $0x3fff,%rax +ja ffffffff810722fa Remove current_stack_pointer(), rename __asm_call_sp to current_stack_pointer and use it instead of the removed function. Signed-off-by: Andrey Ryabinin Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929141537.29167-1-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar [dwmw2: We want ASM_CALL_CONSTRAINT for retpoline] Signed-off-by: David Woodhouse Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm.h | 11 +++++++++++ arch/x86/include/asm/thread_info.h | 11 ----------- arch/x86/kernel/irq_32.c | 6 +++--- arch/x86/kernel/traps.c | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 189679aba703..b9c6c7a6f5a6 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -105,4 +105,15 @@ /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif +#ifndef __ASSEMBLY__ +/* + * This output constraint should be used for any inline asm which has a "call" + * instruction. Otherwise the asm may be inserted before the frame pointer + * gets set up by the containing function. If you forget to do this, objtool + * may print a "call without frame pointer save/setup" warning. + */ +register unsigned long current_stack_pointer asm(_ASM_SP); +#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) +#endif + #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c7b551028740..9b028204685d 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -166,17 +166,6 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); } -static inline unsigned long current_stack_pointer(void) -{ - unsigned long sp; -#ifdef CONFIG_X86_64 - asm("mov %%rsp,%0" : "=g" (sp)); -#else - asm("mov %%esp,%0" : "=g" (sp)); -#endif - return sp; -} - #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 38da8f29a9c8..647089c64d13 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -65,7 +65,7 @@ static void call_on_stack(void *func, void *stack) static inline void *current_stack(void) { - return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); + return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); } static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) @@ -89,7 +89,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) /* Save the next esp at the bottom of the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -142,7 +142,7 @@ void do_softirq_own_stack(void) /* Push the previous esp onto the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; call_on_stack(__do_softirq, isp); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 679302c312f8..22b81f35c500 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -166,7 +166,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * from double_fault. */ BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer()) >= THREAD_SIZE); + current_stack_pointer) >= THREAD_SIZE); preempt_enable_no_resched(); } From b76ac90af34dc08f057da15aa34584a0a3d381b5 Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Sun, 11 Dec 2016 02:09:18 +0100 Subject: [PATCH 006/131] x86/kbuild: enable modversions for symbols exported from asm commit 334bb773876403eae3457d81be0b8ea70f8e4ccc upstream. Commit 4efca4ed ("kbuild: modversions for EXPORT_SYMBOL() for asm") adds modversion support for symbols exported from asm files. Architectures must include C-style declarations for those symbols in asm/asm-prototypes.h in order for them to be versioned. Add these declarations for x86, and an architecture-independent file that can be used for common symbols. With f27c2f6 reverting 8ab2ae6 ("default exported asm symbols to zero") we produce a scary warning on x86, this commit fixes that. Signed-off-by: Adam Borowski Tested-by: Kalle Valo Acked-by: Nicholas Piggin Tested-by: Peter Wu Tested-by: Oliver Hartkopp Signed-off-by: Michal Marek Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm-prototypes.h | 16 ++++++++++++++++ include/asm-generic/asm-prototypes.h | 7 +++++++ 2 files changed, 23 insertions(+) create mode 100644 arch/x86/include/asm/asm-prototypes.h create mode 100644 include/asm-generic/asm-prototypes.h diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..44b8762fa0c7 --- /dev/null +++ b/arch/x86/include/asm/asm-prototypes.h @@ -0,0 +1,16 @@ +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#ifndef CONFIG_X86_CMPXCHG64 +extern void cmpxchg8b_emu(void); +#endif diff --git a/include/asm-generic/asm-prototypes.h b/include/asm-generic/asm-prototypes.h new file mode 100644 index 000000000000..df13637e4017 --- /dev/null +++ b/include/asm-generic/asm-prototypes.h @@ -0,0 +1,7 @@ +#include +extern void *__memset(void *, int, __kernel_size_t); +extern void *__memcpy(void *, const void *, __kernel_size_t); +extern void *__memmove(void *, const void *, __kernel_size_t); +extern void *memset(void *, int, __kernel_size_t); +extern void *memcpy(void *, const void *, __kernel_size_t); +extern void *memmove(void *, const void *, __kernel_size_t); From b8e7a489b51810992ba3266bfd9e043709e07267 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Apr 2016 12:23:25 -0700 Subject: [PATCH 007/131] x86/asm: Make asm/alternative.h safe from assembly commit f005f5d860e0231fe212cfda8c1a3148b99609f4 upstream. asm/alternative.h isn't directly useful from assembly, but it shouldn't break the build. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e5b693fcef99fe6e80341c9e97a002fb23871e91.1461698311.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/alternative.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index d1cf17173b1b..215ea9214215 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_ALTERNATIVE_H #define _ASM_X86_ALTERNATIVE_H +#ifndef __ASSEMBLY__ + #include #include #include @@ -271,4 +273,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_ALTERNATIVE_H */ From a88693d006986b8fc9ae5036852ac0156106de2a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 11 Jan 2016 10:54:54 -0500 Subject: [PATCH 008/131] EXPORT_SYMBOL() for asm commit 22823ab419d8ed884195cfa75483fd3a99bb1462 upstream. Add asm-usable variants of EXPORT_SYMBOL/EXPORT_SYMBOL_GPL. This commit just adds the default implementation; most of the architectures can simply add export.h to asm/Kbuild and start using from assembler. The rest needs to have their define everal macros and then explicitly include One area where the things might diverge from default is the alignment; normally it's 8 bytes on 64bit targets and 4 on 32bit ones, both for unsigned long and for struct kernel_symbol. Unfortunately, amd64 and m68k are unusual - m68k aligns to 2 bytes (for both) and amd64 aligns struct kernel_symbol to 16 bytes. For those we'll need asm/export.h to override the constants used by generic version - KSYM_ALIGN and KCRC_ALIGN for kernel_symbol and unsigned long resp. And no, __alignof__ would not do the trick - on amd64 __alignof__ of struct kernel_symbol is 8, not 16. More serious source of unpleasantness is treatment of function descriptors on architectures that have those. Things like ppc64, parisc, ia64, etc. need more than the address of the first insn to call an arbitrary function. As the result, their representation of pointers to functions is not the typical "address of the entry point" - it's an address of a small static structure containing all the required information (including the entry point, of course). Sadly, the asm-side conventions differ in what the function name refers to - entry point or the function descriptor. On ppc64 we do the latter; bar: .quad foo is what void (*bar)(void) = foo; turns into and the rare places where we need to explicitly work with the label of entry point are dealt with as DOTSYM(foo). For our purposes it's ideal - generic macros are usable. However, parisc would have foo and P%foo used for label of entry point and address of the function descriptor and bar: .long P%foo woudl be used instead. ia64 goes similar to parisc in that respect, except that there it's @fptr(foo) rather than P%foo. Such architectures need to define KSYM_FUNC that would turn a function name into whatever is needed to refer to function descriptor. What's more, on such architectures we need to know whether we are exporting a function or an object - in assembler we have to tell that explicitly, to decide whether we want EXPORT_SYMBOL(foo) produce e.g. __ksymtab_foo: .quad foo or __ksymtab_foo: .quad @fptr(foo) For that reason we introduce EXPORT_DATA_SYMBOL{,_GPL}(), to be used for exports of data objects. On normal architectures it's the same thing as EXPORT_SYMBOL{,_GPL}(), but on parisc-like ones they differ and the right one needs to be used. Most of the exports are functions, so we keep EXPORT_SYMBOL for those... Signed-off-by: Al Viro Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- include/asm-generic/export.h | 94 ++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 include/asm-generic/export.h diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h new file mode 100644 index 000000000000..43199a049da5 --- /dev/null +++ b/include/asm-generic/export.h @@ -0,0 +1,94 @@ +#ifndef __ASM_GENERIC_EXPORT_H +#define __ASM_GENERIC_EXPORT_H + +#ifndef KSYM_FUNC +#define KSYM_FUNC(x) x +#endif +#ifdef CONFIG_64BIT +#define __put .quad +#ifndef KSYM_ALIGN +#define KSYM_ALIGN 8 +#endif +#ifndef KCRC_ALIGN +#define KCRC_ALIGN 8 +#endif +#else +#define __put .long +#ifndef KSYM_ALIGN +#define KSYM_ALIGN 4 +#endif +#ifndef KCRC_ALIGN +#define KCRC_ALIGN 4 +#endif +#endif + +#ifdef CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX +#define KSYM(name) _##name +#else +#define KSYM(name) name +#endif + +/* + * note on .section use: @progbits vs %progbits nastiness doesn't matter, + * since we immediately emit into those sections anyway. + */ +.macro ___EXPORT_SYMBOL name,val,sec +#ifdef CONFIG_MODULES + .globl KSYM(__ksymtab_\name) + .section ___ksymtab\sec+\name,"a" + .balign KSYM_ALIGN +KSYM(__ksymtab_\name): + __put \val, KSYM(__kstrtab_\name) + .previous + .section __ksymtab_strings,"a" +KSYM(__kstrtab_\name): +#ifdef CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX + .asciz "_\name" +#else + .asciz "\name" +#endif + .previous +#ifdef CONFIG_MODVERSIONS + .section ___kcrctab\sec+\name,"a" + .balign KCRC_ALIGN +KSYM(__kcrctab_\name): + __put KSYM(__crc_\name) + .weak KSYM(__crc_\name) + .previous +#endif +#endif +.endm +#undef __put + +#if defined(__KSYM_DEPS__) + +#define __EXPORT_SYMBOL(sym, val, sec) === __KSYM_##sym === + +#elif defined(CONFIG_TRIM_UNUSED_KSYMS) + +#include +#include + +#define __EXPORT_SYMBOL(sym, val, sec) \ + __cond_export_sym(sym, val, sec, config_enabled(__KSYM_##sym)) +#define __cond_export_sym(sym, val, sec, conf) \ + ___cond_export_sym(sym, val, sec, conf) +#define ___cond_export_sym(sym, val, sec, enabled) \ + __cond_export_sym_##enabled(sym, val, sec) +#define __cond_export_sym_1(sym, val, sec) ___EXPORT_SYMBOL sym, val, sec +#define __cond_export_sym_0(sym, val, sec) /* nothing */ + +#else +#define __EXPORT_SYMBOL(sym, val, sec) ___EXPORT_SYMBOL sym, val, sec +#endif + +#define EXPORT_SYMBOL(name) \ + __EXPORT_SYMBOL(name, KSYM_FUNC(KSYM(name)),) +#define EXPORT_SYMBOL_GPL(name) \ + __EXPORT_SYMBOL(name, KSYM_FUNC(KSYM(name)), _gpl) +#define EXPORT_DATA_SYMBOL(name) \ + __EXPORT_SYMBOL(name, KSYM(name),) +#define EXPORT_DATA_SYMBOL_GPL(name) \ + __EXPORT_SYMBOL(name, KSYM(name),_gpl) + +#endif From 675901851fd2a00c7a70bcf327303efba3b81e2f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 Jun 2016 14:58:54 +0900 Subject: [PATCH 009/131] kconfig.h: use __is_defined() to check if MODULE is defined commit 4f920843d248946545415c1bf6120942048708ed upstream. The macro MODULE is not a config option, it is a per-file build option. So, config_enabled(MODULE) is not sensible. (There is another case in include/linux/export.h, where config_enabled() is used against a non-config option.) This commit renames some macros in include/linux/kconfig.h for the use for non-config macros and replaces config_enabled(MODULE) with __is_defined(MODULE). I am keeping config_enabled() because it is still referenced from some places, but I expect it would be deprecated in the future. Signed-off-by: Masahiro Yamada Signed-off-by: Michal Marek Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- include/linux/kconfig.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index b33c7797eb57..a94b5bf57f51 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -17,10 +17,11 @@ * the last step cherry picks the 2nd arg, we get a zero. */ #define __ARG_PLACEHOLDER_1 0, -#define config_enabled(cfg) _config_enabled(cfg) -#define _config_enabled(value) __config_enabled(__ARG_PLACEHOLDER_##value) -#define __config_enabled(arg1_or_junk) ___config_enabled(arg1_or_junk 1, 0) -#define ___config_enabled(__ignored, val, ...) val +#define config_enabled(cfg) ___is_defined(cfg) +#define __is_defined(x) ___is_defined(x) +#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val) +#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0) +#define __take_second_arg(__ignored, val, ...) val /* * IS_BUILTIN(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y', 0 @@ -42,7 +43,7 @@ * built-in code when CONFIG_FOO is set to 'm'. */ #define IS_REACHABLE(option) (config_enabled(option) || \ - (config_enabled(option##_MODULE) && config_enabled(MODULE))) + (config_enabled(option##_MODULE) && __is_defined(MODULE))) /* * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm', From 3c5e10905263dbe9fbc621d1889b85e9c867da25 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:25 +0000 Subject: [PATCH 010/131] x86/retpoline: Add initial retpoline support commit 76b043848fd22dbf7f8bf3a1452f8c70d557b860 upstream. Enable the use of -mindirect-branch=thunk-extern in newer GCC, and provide the corresponding thunks. Provide assembler macros for invoking the thunks in the same way that GCC does, from native and inline assembler. This adds X86_FEATURE_RETPOLINE and sets it by default on all CPUs. In some circumstances, IBRS microcode features may be used instead, and the retpoline can be disabled. On AMD CPUs if lfence is serialising, the retpoline can be dramatically simplified to a simple "lfence; jmp *\reg". A future patch, after it has been verified that lfence really is serialising in all circumstances, can enable this by setting the X86_FEATURE_RETPOLINE_AMD feature bit in addition to X86_FEATURE_RETPOLINE. Do not align the retpoline in the altinstr section, because there is no guarantee that it stays aligned when it's copied over the oldinstr during alternative patching. [ Andi Kleen: Rename the macros, add CONFIG_RETPOLINE option, export thunks] [ tglx: Put actual function CALL/JMP in front of the macros, convert to symbolic labels ] [ dwmw2: Convert back to numeric labels, merge objtool fixes ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-4-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse [ 4.4 backport: removed objtool annotation since there is no objtool ] Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 13 ++++ arch/x86/Makefile | 10 +++ arch/x86/include/asm/asm-prototypes.h | 25 ++++++ arch/x86/include/asm/cpufeature.h | 2 + arch/x86/include/asm/nospec-branch.h | 106 ++++++++++++++++++++++++++ arch/x86/kernel/cpu/common.c | 4 + arch/x86/lib/Makefile | 1 + arch/x86/lib/retpoline.S | 48 ++++++++++++ 8 files changed, 209 insertions(+) create mode 100644 arch/x86/include/asm/nospec-branch.h create mode 100644 arch/x86/lib/retpoline.S diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0ef2cdd11616..75d0053b495a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -379,6 +379,19 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH +config RETPOLINE + bool "Avoid speculative indirect branches in kernel" + default y + ---help--- + Compile kernel with the retpoline compiler options to guard against + kernel-to-user data leaks by avoiding speculative indirect + branches. Requires a compiler with -mindirect-branch=thunk-extern + support for full protection. The kernel may run slower. + + Without compiler support, at least indirect branches in assembler + code are eliminated. Since this includes the syscall entry path, + it is not entirely pointless. + if X86_32 config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4086abca0b32..34fdac520edb 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -189,6 +189,16 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) +# Avoid indirect branches in kernel to deal with Spectre +ifdef CONFIG_RETPOLINE + RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) + ifneq ($(RETPOLINE_CFLAGS),) + KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE + else + $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) + endif +endif + archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 44b8762fa0c7..b15aa4083dfd 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -10,7 +10,32 @@ #include #include #include +#include #ifndef CONFIG_X86_CMPXCHG64 extern void cmpxchg8b_emu(void); #endif + +#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_X86_32 +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); +#else +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void); +INDIRECT_THUNK(8) +INDIRECT_THUNK(9) +INDIRECT_THUNK(10) +INDIRECT_THUNK(11) +INDIRECT_THUNK(12) +INDIRECT_THUNK(13) +INDIRECT_THUNK(14) +INDIRECT_THUNK(15) +#endif +INDIRECT_THUNK(ax) +INDIRECT_THUNK(bx) +INDIRECT_THUNK(cx) +INDIRECT_THUNK(dx) +INDIRECT_THUNK(si) +INDIRECT_THUNK(di) +INDIRECT_THUNK(bp) +INDIRECT_THUNK(sp) +#endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 142028afd049..0fbc98568018 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -200,6 +200,8 @@ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +#define X86_FEATURE_RETPOLINE ( 7*32+29) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+30) /* AMD Retpoline mitigation for Spectre variant 2 */ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h new file mode 100644 index 000000000000..5763548fb30b --- /dev/null +++ b/arch/x86/include/asm/nospec-branch.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NOSPEC_BRANCH_H__ +#define __NOSPEC_BRANCH_H__ + +#include +#include +#include + +#ifdef __ASSEMBLY__ + +/* + * These are the bare retpoline primitives for indirect jmp and call. + * Do not use these directly; they only exist to make the ALTERNATIVE + * invocation below less ugly. + */ +.macro RETPOLINE_JMP reg:req + call .Ldo_rop_\@ +.Lspec_trap_\@: + pause + jmp .Lspec_trap_\@ +.Ldo_rop_\@: + mov \reg, (%_ASM_SP) + ret +.endm + +/* + * This is a wrapper around RETPOLINE_JMP so the called function in reg + * returns to the instruction after the macro. + */ +.macro RETPOLINE_CALL reg:req + jmp .Ldo_call_\@ +.Ldo_retpoline_jmp_\@: + RETPOLINE_JMP \reg +.Ldo_call_\@: + call .Ldo_retpoline_jmp_\@ +.endm + +/* + * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple + * indirect jmp/call which may be susceptible to the Spectre variant 2 + * attack. + */ +.macro JMP_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ALTERNATIVE_2 __stringify(jmp *\reg), \ + __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD +#else + jmp *\reg +#endif +.endm + +.macro CALL_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ALTERNATIVE_2 __stringify(call *\reg), \ + __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ + __stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD +#else + call *\reg +#endif +.endm + +#else /* __ASSEMBLY__ */ + +#if defined(CONFIG_X86_64) && defined(RETPOLINE) + +/* + * Since the inline asm uses the %V modifier which is only in newer GCC, + * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE. + */ +# define CALL_NOSPEC \ + ALTERNATIVE( \ + "call *%[thunk_target]\n", \ + "call __x86_indirect_thunk_%V[thunk_target]\n", \ + X86_FEATURE_RETPOLINE) +# define THUNK_TARGET(addr) [thunk_target] "r" (addr) + +#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) +/* + * For i386 we use the original ret-equivalent retpoline, because + * otherwise we'll run out of registers. We don't care about CET + * here, anyway. + */ +# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n", \ + " jmp 904f;\n" \ + " .align 16\n" \ + "901: call 903f;\n" \ + "902: pause;\n" \ + " jmp 902b;\n" \ + " .align 16\n" \ + "903: addl $4, %%esp;\n" \ + " pushl %[thunk_target];\n" \ + " ret;\n" \ + " .align 16\n" \ + "904: call 901b;\n", \ + X86_FEATURE_RETPOLINE) + +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#else /* No retpoline */ +# define CALL_NOSPEC "call *%[thunk_target]\n" +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0531c1707b40..5b3a6e888bc5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -837,6 +837,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); +#ifdef CONFIG_RETPOLINE + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); +#endif + fpu__init_system(c); #ifdef CONFIG_X86_32 diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f2587888d987..12a34d15b648 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -21,6 +21,7 @@ lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o +lib-$(CONFIG_RETPOLINE) += retpoline.o obj-y += msr.o msr-reg.o msr-reg-export.o diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S new file mode 100644 index 000000000000..019a03599bb0 --- /dev/null +++ b/arch/x86/lib/retpoline.S @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include +#include +#include +#include + +.macro THUNK reg + .section .text.__x86.indirect_thunk.\reg + +ENTRY(__x86_indirect_thunk_\reg) + CFI_STARTPROC + JMP_NOSPEC %\reg + CFI_ENDPROC +ENDPROC(__x86_indirect_thunk_\reg) +.endm + +/* + * Despite being an assembler file we can't just use .irp here + * because __KSYM_DEPS__ only uses the C preprocessor and would + * only see one instance of "__x86_indirect_thunk_\reg" rather + * than one per register with the correct names. So we do it + * the simple and nasty way... + */ +#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg) +#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) + +GENERATE_THUNK(_ASM_AX) +GENERATE_THUNK(_ASM_BX) +GENERATE_THUNK(_ASM_CX) +GENERATE_THUNK(_ASM_DX) +GENERATE_THUNK(_ASM_SI) +GENERATE_THUNK(_ASM_DI) +GENERATE_THUNK(_ASM_BP) +GENERATE_THUNK(_ASM_SP) +#ifdef CONFIG_64BIT +GENERATE_THUNK(r8) +GENERATE_THUNK(r9) +GENERATE_THUNK(r10) +GENERATE_THUNK(r11) +GENERATE_THUNK(r12) +GENERATE_THUNK(r13) +GENERATE_THUNK(r14) +GENERATE_THUNK(r15) +#endif From 9f789bc5711bcacb5df003594b992f0c1cc19df4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:26 +0000 Subject: [PATCH 011/131] x86/spectre: Add boot time option to select Spectre v2 mitigation commit da285121560e769cc31797bba6422eea71d473e0 upstream. Add a spectre_v2= option to select the mitigation used for the indirect branch speculation vulnerability. Currently, the only option available is retpoline, in its various forms. This will be expanded to cover the new IBRS/IBPB microcode features. The RETPOLINE_AMD feature relies on a serializing LFENCE for speculation control. For AMD hardware, only set RETPOLINE_AMD if LFENCE is a serializing instruction, which is indicated by the LFENCE_RDTSC feature. [ tglx: Folded back the LFENCE/AMD fixes and reworked it so IBRS integration becomes simple ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-5-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- Documentation/kernel-parameters.txt | 28 +++++ arch/x86/include/asm/nospec-branch.h | 10 ++ arch/x86/kernel/cpu/bugs.c | 158 ++++++++++++++++++++++++++- arch/x86/kernel/cpu/common.c | 4 - 4 files changed, 195 insertions(+), 5 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 39280b72f27a..22a4688dc0c8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2452,6 +2452,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nohugeiomap [KNL,x86] Disable kernel huge I/O mappings. + nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2 + (indirect branch prediction) vulnerability. System may + allow data leaks with this option, which is equivalent + to spectre_v2=off. + noxsave [BUGS=X86] Disables x86 extended register state save and restore using xsave. The kernel will fallback to enabling legacy floating-point and sse state. @@ -3594,6 +3599,29 @@ bytes respectively. Such letter suffixes can also be entirely omitted. sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/laptops/sonypi.txt + spectre_v2= [X86] Control mitigation of Spectre variant 2 + (indirect branch speculation) vulnerability. + + on - unconditionally enable + off - unconditionally disable + auto - kernel detects whether your CPU model is + vulnerable + + Selecting 'on' will, and 'auto' may, choose a + mitigation method at run time according to the + CPU, the available microcode, the setting of the + CONFIG_RETPOLINE configuration option, and the + compiler with which the kernel was built. + + Specific mitigations can also be selected manually: + + retpoline - replace indirect branches + retpoline,generic - google's original retpoline + retpoline,amd - AMD-specific minimal thunk + + Not specifying this option is equivalent to + spectre_v2=auto. + spia_io_base= [HW,MTD] spia_fio_base= spia_pedr= diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 5763548fb30b..fe48aeee79d1 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -102,5 +102,15 @@ # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif +/* The Spectre V2 mitigation variants */ +enum spectre_v2_mitigation { + SPECTRE_V2_NONE, + SPECTRE_V2_RETPOLINE_MINIMAL, + SPECTRE_V2_RETPOLINE_MINIMAL_AMD, + SPECTRE_V2_RETPOLINE_GENERIC, + SPECTRE_V2_RETPOLINE_AMD, + SPECTRE_V2_IBRS, +}; + #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index cb6b4f9d0b7a..49d25ddf0e9f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -10,6 +10,9 @@ #include #include #include + +#include +#include #include #include #include @@ -20,6 +23,8 @@ #include #include +static void __init spectre_v2_select_mitigation(void); + void __init check_bugs(void) { identify_boot_cpu(); @@ -29,6 +34,9 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); } + /* Select the proper spectre mitigation before patching alternatives */ + spectre_v2_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -61,6 +69,153 @@ void __init check_bugs(void) #endif } +/* The kernel command line selection */ +enum spectre_v2_mitigation_cmd { + SPECTRE_V2_CMD_NONE, + SPECTRE_V2_CMD_AUTO, + SPECTRE_V2_CMD_FORCE, + SPECTRE_V2_CMD_RETPOLINE, + SPECTRE_V2_CMD_RETPOLINE_GENERIC, + SPECTRE_V2_CMD_RETPOLINE_AMD, +}; + +static const char *spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", + [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", + [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", + [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", +}; + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt + +static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; + +static void __init spec2_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", reason); +} + +static void __init spec2_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", reason); +} + +static inline bool retp_compiler(void) +{ + return __is_defined(RETPOLINE); +} + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +{ + char arg[20]; + int ret; + + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, + sizeof(arg)); + if (ret > 0) { + if (match_option(arg, ret, "off")) { + goto disable; + } else if (match_option(arg, ret, "on")) { + spec2_print_if_secure("force enabled on command line."); + return SPECTRE_V2_CMD_FORCE; + } else if (match_option(arg, ret, "retpoline")) { + spec2_print_if_insecure("retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE; + } else if (match_option(arg, ret, "retpoline,amd")) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); + return SPECTRE_V2_CMD_AUTO; + } + spec2_print_if_insecure("AMD retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE_AMD; + } else if (match_option(arg, ret, "retpoline,generic")) { + spec2_print_if_insecure("generic retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE_GENERIC; + } else if (match_option(arg, ret, "auto")) { + return SPECTRE_V2_CMD_AUTO; + } + } + + if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + return SPECTRE_V2_CMD_AUTO; +disable: + spec2_print_if_insecure("disabled on command line."); + return SPECTRE_V2_CMD_NONE; +} + +static void __init spectre_v2_select_mitigation(void) +{ + enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); + enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + + /* + * If the CPU is not affected and the command line mode is NONE or AUTO + * then nothing to do. + */ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + return; + + switch (cmd) { + case SPECTRE_V2_CMD_NONE: + return; + + case SPECTRE_V2_CMD_FORCE: + /* FALLTRHU */ + case SPECTRE_V2_CMD_AUTO: + goto retpoline_auto; + + case SPECTRE_V2_CMD_RETPOLINE_AMD: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_amd; + break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_generic; + break; + case SPECTRE_V2_CMD_RETPOLINE: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_auto; + break; + } + pr_err("kernel not compiled with retpoline; no mitigation available!"); + return; + +retpoline_auto: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + retpoline_amd: + if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE not serializing. Switching to generic retpoline\n"); + goto retpoline_generic; + } + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD : + SPECTRE_V2_RETPOLINE_MINIMAL_AMD; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } else { + retpoline_generic: + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC : + SPECTRE_V2_RETPOLINE_MINIMAL; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } + + spectre_v2_enabled = mode; + pr_info("%s\n", spectre_v2_strings[mode]); +} + +#undef pr_fmt + #ifdef CONFIG_SYSFS ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) @@ -85,6 +240,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "Vulnerable\n"); + + return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]); } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5b3a6e888bc5..0531c1707b40 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -837,10 +837,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); -#ifdef CONFIG_RETPOLINE - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); -#endif - fpu__init_system(c); #ifdef CONFIG_X86_32 From 9fe55976f0c8acfd7408bf693b6d171587b62129 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:27 +0000 Subject: [PATCH 012/131] x86/retpoline/crypto: Convert crypto assembler indirect jumps commit 9697fa39efd3fc3692f2949d4045f393ec58450b upstream. Convert all indirect jumps in crypto assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-6-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/crypto/aesni-intel_asm.S | 5 +++-- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 3 ++- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 3 ++- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 6bd2c6c95373..3f93dedb5a4d 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -31,6 +31,7 @@ #include #include +#include /* * The following macros are used to move an (un)aligned 16 byte value to/from @@ -2714,7 +2715,7 @@ ENTRY(aesni_xts_crypt8) pxor INC, STATE4 movdqu IV, 0x30(OUTP) - call *%r11 + CALL_NOSPEC %r11 movdqu 0x00(OUTP), INC pxor INC, STATE1 @@ -2759,7 +2760,7 @@ ENTRY(aesni_xts_crypt8) _aesni_gf128mul_x_ble() movups IV, (IVP) - call *%r11 + CALL_NOSPEC %r11 movdqu 0x40(OUTP), INC pxor INC, STATE1 diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index ce71f9212409..5881756f78a2 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -16,6 +16,7 @@ */ #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1210,7 +1211,7 @@ camellia_xts_crypt_16way: vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; - call *%r9; + CALL_NOSPEC %r9; addq $(16 * 16), %rsp; diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 0e0b8863a34b..0d45b04b490a 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -11,6 +11,7 @@ */ #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1323,7 +1324,7 @@ camellia_xts_crypt_32way: vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call *%r9; + CALL_NOSPEC %r9; addq $(16 * 32), %rsp; diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 4fe27e074194..48767520cbe0 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -45,6 +45,7 @@ #include #include +#include ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction @@ -172,7 +173,7 @@ continue_block: movzxw (bufp, %rax, 2), len offset=crc_array-jump_table lea offset(bufp, len, 1), bufp - jmp *bufp + JMP_NOSPEC bufp ################################################################ ## 2a) PROCESS FULL BLOCKS: From 028083cb02db69237e73950576bc81ac579693dc Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:28 +0000 Subject: [PATCH 013/131] x86/retpoline/entry: Convert entry assembler indirect jumps commit 2641f08bb7fc63a636a2b18173221d7040a3512e upstream. Convert indirect jumps in core 32/64bit entry assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Don't use CALL_NOSPEC in entry_SYSCALL_64_fastpath because the return address after the 'call' instruction must be *precisely* at the .Lentry_SYSCALL_64_after_fastpath label for stub_ptregs_64 to work, and the use of alternatives will mess that up unless we play horrid games to prepend with NOPs and make the variants the same length. It's not worth it; in the case where we ALTERNATIVE out the retpoline, the first instruction at __x86.indirect_thunk.rax is going to be a bare jmp *%rax anyway. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Arjan van de Ven Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-7-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_32.S | 6 ++++-- arch/x86/entry/entry_64.S | 14 +++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index ae678ad128a9..adbbd4f538e9 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -44,6 +44,7 @@ #include #include #include +#include .section .entry.text, "ax" @@ -226,7 +227,8 @@ ENTRY(ret_from_kernel_thread) pushl $0x0202 # Reset kernel eflags popfl movl PT_EBP(%esp), %eax - call *PT_EBX(%esp) + movl PT_EBX(%esp), %edx + CALL_NOSPEC %edx movl $0, PT_EAX(%esp) /* @@ -938,7 +940,7 @@ error_code: movl %ecx, %es TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - call *%edi + CALL_NOSPEC %edi jmp ret_from_exception END(page_fault) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 952b23b5d4e9..81b1cd533965 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,6 +36,7 @@ #include #include #include +#include #include /* Avoid __ASSEMBLER__'ifying just for this. */ @@ -184,7 +185,13 @@ entry_SYSCALL_64_fastpath: #endif ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10, %rcx +#ifdef CONFIG_RETPOLINE + movq sys_call_table(, %rax, 8), %rax + call __x86_indirect_thunk_rax +#else call *sys_call_table(, %rax, 8) +#endif + movq %rax, RAX(%rsp) 1: /* @@ -276,7 +283,12 @@ tracesys_phase2: #endif ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10, %rcx /* fixup for C */ +#ifdef CONFIG_RETPOLINE + movq sys_call_table(, %rax, 8), %rax + call __x86_indirect_thunk_rax +#else call *sys_call_table(, %rax, 8) +#endif movq %rax, RAX(%rsp) 1: /* Use IRET because user could have changed pt_regs->foo */ @@ -491,7 +503,7 @@ ENTRY(ret_from_fork) * nb: we depend on RESTORE_EXTRA_REGS above */ movq %rbp, %rdi - call *%rbx + CALL_NOSPEC %rbx movl $0, RAX(%rsp) RESTORE_EXTRA_REGS jmp int_ret_from_sys_call From 7153a6d5ff050050555066f58ac3458c5efc699b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:29 +0000 Subject: [PATCH 014/131] x86/retpoline/ftrace: Convert ftrace assembler indirect jumps commit 9351803bd803cdbeb9b5a7850b7b6f464806e3db upstream. Convert all indirect jumps in ftrace assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-8-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_32.S | 5 +++-- arch/x86/kernel/mcount_64.S | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index adbbd4f538e9..d437f3871e53 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -863,7 +863,8 @@ trace: movl 0x4(%ebp), %edx subl $MCOUNT_INSN_SIZE, %eax - call *ftrace_trace_function + movl ftrace_trace_function, %ecx + CALL_NOSPEC %ecx popl %edx popl %ecx @@ -898,7 +899,7 @@ return_to_handler: movl %eax, %ecx popl %edx popl %eax - jmp *%ecx + JMP_NOSPEC %ecx #endif #ifdef CONFIG_TRACING diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 5d9afbcb6074..09284cfab86f 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -7,7 +7,7 @@ #include #include #include - +#include .code64 .section .entry.text, "ax" @@ -285,8 +285,9 @@ trace: * ip and parent ip are used and the list function is called when * function tracing is enabled. */ - call *ftrace_trace_function + movq ftrace_trace_function, %r8 + CALL_NOSPEC %r8 restore_mcount_regs jmp fgraph_trace @@ -329,5 +330,5 @@ GLOBAL(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax addq $24, %rsp - jmp *%rdi + JMP_NOSPEC %rdi #endif From d2beed45635e3c430bc6d84ff8e6c6e8cb2e10b4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:30 +0000 Subject: [PATCH 015/131] x86/retpoline/hyperv: Convert assembler indirect jumps commit e70e5892b28c18f517f29ab6e83bd57705104b31 upstream. Convert all indirect jumps in hyperv inline asm code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-9-git-send-email-dwmw@amazon.co.uk [ backport to 4.4, hopefully correct, not tested... - gregkh ] Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 8ce1f2e22912..d415a804fd26 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "hyperv_vmbus.h" /* The one and only */ @@ -103,9 +104,10 @@ static u64 do_hypercall(u64 control, void *input, void *output) return (u64)ULLONG_MAX; __asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); - __asm__ __volatile__("call *%3" : "=a" (hv_status) : + __asm__ __volatile__(CALL_NOSPEC : + "=a" (hv_status) : "c" (control), "d" (input_address), - "m" (hypercall_page)); + THUNK_TARGET(hypercall_page)); return hv_status; @@ -123,11 +125,12 @@ static u64 do_hypercall(u64 control, void *input, void *output) if (!hypercall_page) return (u64)ULLONG_MAX; - __asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), + __asm__ __volatile__ (CALL_NOSPEC : "=d"(hv_status_hi), "=a"(hv_status_lo) : "d" (control_hi), "a" (control_lo), "b" (input_address_hi), "c" (input_address_lo), "D"(output_address_hi), - "S"(output_address_lo), "m" (hypercall_page)); + "S"(output_address_lo), + THUNK_TARGET(hypercall_page)); return hv_status_lo | ((u64)hv_status_hi << 32); #endif /* !x86_64 */ From 6b222e7483af4fd8f632efbf3b91025c2359b10a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:31 +0000 Subject: [PATCH 016/131] x86/retpoline/xen: Convert Xen hypercall indirect jumps commit ea08816d5b185ab3d09e95e393f265af54560350 upstream. Convert indirect call in Xen hypercall to use non-speculative sequence, when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Reviewed-by: Juergen Gross Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-10-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/xen/hypercall.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 85133b2b8e99..0977e7607046 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -215,9 +216,9 @@ privcmd_call(unsigned call, __HYPERCALL_5ARG(a1, a2, a3, a4, a5); stac(); - asm volatile("call *%[call]" + asm volatile(CALL_NOSPEC : __HYPERCALL_5PARAM - : [call] "a" (&hypercall_page[call]) + : [thunk_target] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); clac(); From 7e5bb301bd2fdd62cbee7b26a8234cccb6731849 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:32 +0000 Subject: [PATCH 017/131] x86/retpoline/checksum32: Convert assembler indirect jumps commit 5096732f6f695001fa2d6f1335a2680b37912c69 upstream. Convert all indirect jumps in 32bit checksum assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-11-git-send-email-dwmw@amazon.co.uk Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/checksum_32.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index c1e623209853..90353a26ed95 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -28,7 +28,8 @@ #include #include #include - +#include + /* * computes a partial checksum, e.g. for TCP/UDP fragments */ @@ -155,7 +156,7 @@ ENTRY(csum_partial) negl %ebx lea 45f(%ebx,%ebx,2), %ebx testl %esi, %esi - jmp *%ebx + JMP_NOSPEC %ebx # Handle 2-byte-aligned regions 20: addw (%esi), %ax @@ -437,7 +438,7 @@ ENTRY(csum_partial_copy_generic) andl $-32,%edx lea 3f(%ebx,%ebx), %ebx testl %esi, %esi - jmp *%ebx + JMP_NOSPEC %ebx 1: addl $64,%esi addl $64,%edi SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) From f72655b837eb4320a1ffebbd0e0ebe92ce1e5314 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 11 Jan 2018 21:46:33 +0000 Subject: [PATCH 018/131] x86/retpoline/irq32: Convert assembler indirect jumps commit 7614e913db1f40fff819b36216484dc3808995d4 upstream. Convert all indirect jumps in 32bit irq inline asm code to use non speculative sequences. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-12-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/irq_32.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 647089c64d13..528b7aa1780d 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -20,6 +20,7 @@ #include #include +#include #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack); static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=b" (stack) : "0" (stack), - "D"(func) + [thunk_target] "D"(func) : "memory", "cc", "edx", "ecx", "eax"); } @@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) call_on_stack(print_stack_overflow, isp); asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=a" (arg1), "=b" (isp) : "0" (desc), "1" (isp), - "D" (desc->handle_irq) + [thunk_target] "D" (desc->handle_irq) : "memory", "cc", "ecx"); return 1; } From eebc3f8adee0a6f43a4789ef0bf5c5b35de8cfe4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 Jan 2018 11:11:27 +0000 Subject: [PATCH 019/131] x86/retpoline: Fill return stack buffer on vmexit commit 117cc7a908c83697b0b737d15ae1eb5943afe35b upstream. In accordance with the Intel and AMD documentation, we need to overwrite all entries in the RSB on exiting a guest, to prevent malicious branch target predictions from affecting the host kernel. This is needed both for retpoline and for IBRS. [ak: numbers again for the RSB stuffing labels] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Tested-by: Peter Zijlstra (Intel) Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515755487-8524-1-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 76 +++++++++++++++++++++++++++- arch/x86/kvm/svm.c | 4 ++ arch/x86/kvm/vmx.c | 4 ++ 3 files changed, 83 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index fe48aeee79d1..1afd04eb5fb7 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -7,6 +7,48 @@ #include #include +/* + * Fill the CPU return stack buffer. + * + * Each entry in the RSB, if used for a speculative 'ret', contains an + * infinite 'pause; jmp' loop to capture speculative execution. + * + * This is required in various cases for retpoline and IBRS-based + * mitigations for the Spectre variant 2 vulnerability. Sometimes to + * eliminate potentially bogus entries from the RSB, and sometimes + * purely to ensure that it doesn't get empty, which on some CPUs would + * allow predictions from other (unwanted!) sources to be used. + * + * We define a CPP macro such that it can be used from both .S files and + * inline assembly. It's possible to do a .macro and then include that + * from C via asm(".include ") but let's not go there. + */ + +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ +#define RSB_FILL_LOOPS 16 /* To avoid underflow */ + +/* + * Google experimented with loop-unrolling and this turned out to be + * the optimal version — two calls, each with their own speculation + * trap should their return address end up getting used, in a loop. + */ +#define __FILL_RETURN_BUFFER(reg, nr, sp) \ + mov $(nr/2), reg; \ +771: \ + call 772f; \ +773: /* speculation trap */ \ + pause; \ + jmp 773b; \ +772: \ + call 774f; \ +775: /* speculation trap */ \ + pause; \ + jmp 775b; \ +774: \ + dec reg; \ + jnz 771b; \ + add $(BITS_PER_LONG/8) * nr, sp; + #ifdef __ASSEMBLY__ /* @@ -59,6 +101,19 @@ #else call *\reg #endif +.endm + + /* + * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP + * monstrosity above, manually. + */ +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +#ifdef CONFIG_RETPOLINE + ALTERNATIVE "jmp .Lskip_rsb_\@", \ + __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ + \ftr +.Lskip_rsb_\@: +#endif .endm #else /* __ASSEMBLY__ */ @@ -97,7 +152,7 @@ X86_FEATURE_RETPOLINE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) -#else /* No retpoline */ +#else /* No retpoline for C / inline asm */ # define CALL_NOSPEC "call *%[thunk_target]\n" # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif @@ -112,5 +167,24 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +/* + * On VMEXIT we must ensure that no RSB predictions learned in the guest + * can be followed in the host, by overwriting the RSB completely. Both + * retpoline and IBRS mitigations for Spectre v2 need this; only on future + * CPUs with IBRS_ATT *might* it be avoided. + */ +static inline void vmexit_fill_RSB(void) +{ +#ifdef CONFIG_RETPOLINE + unsigned long loops = RSB_CLEAR_LOOPS / 2; + + asm volatile (ALTERNATIVE("jmp 910f", + __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), + X86_FEATURE_RETPOLINE) + "910:" + : "=&r" (loops), ASM_CALL_CONSTRAINT + : "r" (loops) : "memory" ); +#endif +} #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 900ffb6c28b5..2038e5bacce6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "trace.h" @@ -3904,6 +3905,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c26255f19603..75d60e40c389 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -8701,6 +8702,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (debugctlmsr) update_debugctlmsr(debugctlmsr); From 451725c3e785dfc3ede6c65184b96c213181995a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Jan 2018 22:13:29 +0100 Subject: [PATCH 020/131] x86/retpoline: Remove compile time warning commit b8b9ce4b5aec8de9e23cabb0a26b78641f9ab1d6 upstream. Remove the compile time warning when CONFIG_RETPOLINE=y and the compiler does not have retpoline support. Linus rationale for this is: It's wrong because it will just make people turn off RETPOLINE, and the asm updates - and return stack clearing - that are independent of the compiler are likely the most important parts because they are likely the ones easiest to target. And it's annoying because most people won't be able to do anything about it. The number of people building their own compiler? Very small. So if their distro hasn't got a compiler yet (and pretty much nobody does), the warning is just annoying crap. It is already properly reported as part of the sysfs interface. The compile-time warning only encourages bad things. Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") Requested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Cc: David Woodhouse Cc: Peter Zijlstra (Intel) Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/CA+55aFzWgquv4i6Mab6bASqYXg3ErV3XDFEYf=GEcCDQg5uAtw@mail.gmail.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 34fdac520edb..1f9caa041bf7 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -194,8 +194,6 @@ ifdef CONFIG_RETPOLINE RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) ifneq ($(RETPOLINE_CFLAGS),) KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE - else - $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) endif endif From f1fcb9d2926c378e39adf42ef3446e08b2d63895 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 7 Apr 2017 09:34:12 +0200 Subject: [PATCH 021/131] scsi: sg: disable SET_FORCE_LOW_DMA commit 745dfa0d8ec26b24f3304459ff6e9eacc5c8351b upstream. The ioctl SET_FORCE_LOW_DMA has never worked since the initial git check-in, and the respective setting is nowadays handled correctly. So disable it entirely. Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 30 +++++++++--------------------- include/scsi/sg.h | 1 - 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 38f77e127349..0f0ff75755e0 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -160,7 +160,6 @@ typedef struct sg_fd { /* holds the state of a file descriptor */ struct list_head rq_list; /* head of request list */ struct fasync_struct *async_qp; /* used by asynchronous notification */ Sg_request req_arr[SG_MAX_QUEUE]; /* used as singly-linked list */ - char low_dma; /* as in parent but possibly overridden to 1 */ char force_packid; /* 1 -> pack_id input to read(), 0 -> ignored */ char cmd_q; /* 1 -> allow command queuing, 0 -> don't */ unsigned char next_cmd_len; /* 0: automatic, >0: use on next write() */ @@ -932,24 +931,14 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) /* strange ..., for backward compatibility */ return sfp->timeout_user; case SG_SET_FORCE_LOW_DMA: - result = get_user(val, ip); - if (result) - return result; - if (val) { - sfp->low_dma = 1; - if ((0 == sfp->low_dma) && !sfp->res_in_use) { - val = (int) sfp->reserve.bufflen; - sg_remove_scat(sfp, &sfp->reserve); - sg_build_reserve(sfp, val); - } - } else { - if (atomic_read(&sdp->detaching)) - return -ENODEV; - sfp->low_dma = sdp->device->host->unchecked_isa_dma; - } + /* + * N.B. This ioctl never worked properly, but failed to + * return an error value. So returning '0' to keep compability + * with legacy applications. + */ return 0; case SG_GET_LOW_DMA: - return put_user((int) sfp->low_dma, ip); + return put_user((int) sdp->device->host->unchecked_isa_dma, ip); case SG_GET_SCSI_ID: if (!access_ok(VERIFY_WRITE, p, sizeof (sg_scsi_id_t))) return -EFAULT; @@ -1870,6 +1859,7 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size) int sg_tablesize = sfp->parentdp->sg_tablesize; int blk_size = buff_size, order; gfp_t gfp_mask = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN; + struct sg_device *sdp = sfp->parentdp; if (blk_size < 0) return -EFAULT; @@ -1895,7 +1885,7 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size) scatter_elem_sz_prev = num; } - if (sfp->low_dma) + if (sdp->device->host->unchecked_isa_dma) gfp_mask |= GFP_DMA; if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) @@ -2158,8 +2148,6 @@ sg_add_sfp(Sg_device * sdp) sfp->timeout = SG_DEFAULT_TIMEOUT; sfp->timeout_user = SG_DEFAULT_TIMEOUT_USER; sfp->force_packid = SG_DEF_FORCE_PACK_ID; - sfp->low_dma = (SG_DEF_FORCE_LOW_DMA == 0) ? - sdp->device->host->unchecked_isa_dma : 1; sfp->cmd_q = SG_DEF_COMMAND_Q; sfp->keep_orphan = SG_DEF_KEEP_ORPHAN; sfp->parentdp = sdp; @@ -2618,7 +2606,7 @@ static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp) jiffies_to_msecs(fp->timeout), fp->reserve.bufflen, (int) fp->reserve.k_use_sg, - (int) fp->low_dma); + (int) sdp->device->host->unchecked_isa_dma); seq_printf(s, " cmd_q=%d f_packid=%d k_orphan=%d closed=0\n", (int) fp->cmd_q, (int) fp->force_packid, (int) fp->keep_orphan); diff --git a/include/scsi/sg.h b/include/scsi/sg.h index 3afec7032448..20bc71c3e0b8 100644 --- a/include/scsi/sg.h +++ b/include/scsi/sg.h @@ -197,7 +197,6 @@ typedef struct sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ #define SG_DEFAULT_RETRIES 0 /* Defaults, commented if they differ from original sg driver */ -#define SG_DEF_FORCE_LOW_DMA 0 /* was 1 -> memory below 16MB on i386 */ #define SG_DEF_FORCE_PACK_ID 0 #define SG_DEF_KEEP_ORPHAN 0 #define SG_DEF_RESERVED_SIZE SG_SCATTER_SZ /* load time option */ From 58c82be944f58561e77eb9db5039a4b0eca96ac5 Mon Sep 17 00:00:00 2001 From: Li Jinyue Date: Thu, 14 Dec 2017 17:04:54 +0800 Subject: [PATCH 022/131] futex: Prevent overflow by strengthen input validation commit fbe0e839d1e22d88810f3ee3e2f1479be4c0aa4a upstream. UBSAN reports signed integer overflow in kernel/futex.c: UBSAN: Undefined behaviour in kernel/futex.c:2041:18 signed integer overflow: 0 - -2147483648 cannot be represented in type 'int' Add a sanity check to catch negative values of nr_wake and nr_requeue. Signed-off-by: Li Jinyue Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: dvhart@infradead.org Link: https://lkml.kernel.org/r/1513242294-31786-1-git-send-email-lijinyue@huawei.com Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/futex.c b/kernel/futex.c index fc68462801de..1fce19fc824c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1621,6 +1621,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; WAKE_Q(wake_q); + if (nr_wake < 0 || nr_requeue < 0) + return -EINVAL; + if (requeue_pi) { /* * Requeue PI only works on two distinct uaddrs. This From 80547bb6154d02279e446526a7d3daf7d5aa15db Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 10 Jan 2018 23:48:05 +0100 Subject: [PATCH 023/131] ALSA: pcm: Remove yet superfluous WARN_ON() commit 23b19b7b50fe1867da8d431eea9cd3e4b6328c2c upstream. muldiv32() contains a snd_BUG_ON() (which is morphed as WARN_ON() with debug option) for checking the case of 0 / 0. This would be helpful if this happens only as a logical error; however, since the hw refine is performed with any data set provided by user, the inconsistent values that can trigger such a condition might be passed easily. Actually, syzbot caught this by passing some zero'ed old hw_params ioctl. So, having snd_BUG_ON() there is simply superfluous and rather harmful to give unnecessary confusions. Let's get rid of it. Reported-by: syzbot+7e6ee55011deeebce15d@syzkaller.appspotmail.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/pcm_lib.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index 7b805766306e..4c145d6bccd4 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -578,7 +578,6 @@ static inline unsigned int muldiv32(unsigned int a, unsigned int b, { u_int64_t n = (u_int64_t) a * b; if (c == 0) { - snd_BUG_ON(!n); *r = 0; return UINT_MAX; } From a4d7639d5fb65070e43f679493341f1ec4212e6e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 10 Jan 2018 08:34:28 +0100 Subject: [PATCH 024/131] ALSA: hda - Apply headphone noise quirk for another Dell XPS 13 variant commit e4c9fd10eb21376f44723c40ad12395089251c28 upstream. There is another Dell XPS 13 variant (SSID 1028:082a) that requires the existing fixup for reducing the headphone noise. This patch adds the quirk entry for that. BugLink: http://lkml.kernel.org/r/CAHXyb9ZCZJzVisuBARa+UORcjRERV8yokez=DP1_5O5isTz0ZA@mail.gmail.com Reported-and-tested-by: Francisco G. Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 5875a08d555e..f14c1f288443 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5600,6 +5600,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x075b, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE), SND_PCI_QUIRK(0x1028, 0x075d, "Dell AIO", ALC298_FIXUP_SPK_VOLUME), SND_PCI_QUIRK(0x1028, 0x0798, "Dell Inspiron 17 7000 Gaming", ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER), + SND_PCI_QUIRK(0x1028, 0x082a, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), From 478a7fa82ff78be3b7aa94a9994fe0544b931f53 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 10 Jan 2018 10:53:18 +0100 Subject: [PATCH 025/131] ALSA: hda - Apply the existing quirk to iMac 14,1 commit 031f335cda879450095873003abb03ae8ed3b74a upstream. iMac 14,1 requires the same quirk as iMac 12,2, using GPIO 2 and 3 for headphone and speaker output amps. Add the codec SSID quirk entry (106b:0600) accordingly. BugLink: http://lkml.kernel.org/r/CAEw6Zyteav09VGHRfD5QwsfuWv5a43r0tFBNbfcHXoNrxVz7ew@mail.gmail.com Reported-by: Freaky Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_cirrus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_cirrus.c b/sound/pci/hda/patch_cirrus.c index 80bbadc83721..d6e079f4ec09 100644 --- a/sound/pci/hda/patch_cirrus.c +++ b/sound/pci/hda/patch_cirrus.c @@ -408,6 +408,7 @@ static const struct snd_pci_quirk cs420x_fixup_tbl[] = { /*SND_PCI_QUIRK(0x8086, 0x7270, "IMac 27 Inch", CS420X_IMAC27),*/ /* codec SSID */ + SND_PCI_QUIRK(0x106b, 0x0600, "iMac 14,1", CS420X_IMAC27_122), SND_PCI_QUIRK(0x106b, 0x1c00, "MacBookPro 8,1", CS420X_MBP81), SND_PCI_QUIRK(0x106b, 0x2000, "iMac 12,2", CS420X_IMAC27_122), SND_PCI_QUIRK(0x106b, 0x2800, "MacBookPro 10,1", CS420X_MBP101), From 48907f2535aa4b5951a0863b8bc7526402388c8d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Dec 2017 18:13:05 -0600 Subject: [PATCH 026/131] af_key: fix buffer overread in verify_address_len() commit 06b335cb51af018d5feeff5dd4fd53847ddb675a upstream. If a message sent to a PF_KEY socket ended with one of the extensions that takes a 'struct sadb_address' but there were not enough bytes remaining in the message for the ->sa_family member of the 'struct sockaddr' which is supposed to follow, then verify_address_len() read past the end of the message, into uninitialized memory. Fix it by returning -EINVAL in this case. This bug was found using syzkaller with KMSAN. Reproducer: #include #include #include int main() { int sock = socket(PF_KEY, SOCK_RAW, PF_KEY_V2); char buf[24] = { 0 }; struct sadb_msg *msg = (void *)buf; struct sadb_address *addr = (void *)(msg + 1); msg->sadb_msg_version = PF_KEY_V2; msg->sadb_msg_type = SADB_DELETE; msg->sadb_msg_len = 3; addr->sadb_address_len = 1; addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; write(sock, buf, 24); } Reported-by: Alexander Potapenko Signed-off-by: Eric Biggers Signed-off-by: Steffen Klassert Signed-off-by: Greg Kroah-Hartman --- net/key/af_key.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/key/af_key.c b/net/key/af_key.c index 94bf810ad242..ea81a58a4ff6 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -401,6 +401,11 @@ static int verify_address_len(const void *p) #endif int len; + if (sp->sadb_address_len < + DIV_ROUND_UP(sizeof(*sp) + offsetofend(typeof(*addr), sa_family), + sizeof(uint64_t))) + return -EINVAL; + switch (addr->sa_family) { case AF_INET: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t)); From b78f2d36e7378f43ba849e90e1f9dbfedd09eaf9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 29 Dec 2017 18:15:23 -0600 Subject: [PATCH 027/131] af_key: fix buffer overread in parse_exthdrs() commit 4e765b4972af7b07adcb1feb16e7a525ce1f6b28 upstream. If a message sent to a PF_KEY socket ended with an incomplete extension header (fewer than 4 bytes remaining), then parse_exthdrs() read past the end of the message, into uninitialized memory. Fix it by returning -EINVAL in this case. Reproducer: #include #include #include int main() { int sock = socket(PF_KEY, SOCK_RAW, PF_KEY_V2); char buf[17] = { 0 }; struct sadb_msg *msg = (void *)buf; msg->sadb_msg_version = PF_KEY_V2; msg->sadb_msg_type = SADB_DELETE; msg->sadb_msg_len = 2; write(sock, buf, 17); } Signed-off-by: Eric Biggers Signed-off-by: Steffen Klassert Signed-off-by: Greg Kroah-Hartman --- net/key/af_key.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/key/af_key.c b/net/key/af_key.c index ea81a58a4ff6..6482b001f19a 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -516,6 +516,9 @@ static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void * uint16_t ext_type; int ext_len; + if (len < sizeof(*ehdr)) + return -EINVAL; + ext_len = ehdr->sadb_ext_len; ext_len *= sizeof(uint64_t); ext_type = ehdr->sadb_ext_type; From ec7a002d7796a1831d171159366d71fbcc8bf049 Mon Sep 17 00:00:00 2001 From: Tomas Henzl Date: Mon, 20 Mar 2017 16:42:48 +0100 Subject: [PATCH 028/131] scsi: hpsa: fix volume offline state commit eb94588dabec82e012281608949a860f64752914 upstream. In a previous patch a hpsa_scsi_dev_t.volume_offline update line has been removed, so let us put it back.. Fixes: 85b29008d8 (hpsa: update check for logical volume status) Signed-off-by: Tomas Henzl Acked-by: Don Brace Signed-off-by: Martin K. Petersen Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/hpsa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 0c87f341fed4..910b795fc5eb 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -3638,6 +3638,7 @@ static int hpsa_update_device_info(struct ctlr_info *h, if (h->fw_support & MISC_FW_RAID_OFFLOAD_BASIC) hpsa_get_ioaccel_status(h, scsi3addr, this_device); volume_offline = hpsa_volume_offline(h, scsi3addr); + this_device->volume_offline = volume_offline; if (volume_offline == HPSA_LV_FAILED) { rc = HPSA_LV_FAILED; dev_err(&h->pdev->dev, From 8bd58b61d2faacdacd05287a3bbb32ad226b5428 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 10 May 2017 21:03:37 +0800 Subject: [PATCH 029/131] sched/deadline: Zero out positive runtime after throttling constrained tasks commit ae83b56a56f8d9643dedbee86b457fa1c5d42f59 upstream. When a contrained task is throttled by dl_check_constrained_dl(), it may carry the remaining positive runtime, as a result when dl_task_timer() fires and calls replenish_dl_entity(), it will not be replenished correctly due to the positive dl_se->runtime. This patch assigns its runtime to 0 if positive after throttling. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Fixes: df8eac8cafce ("sched/deadline: Throttle a constrained deadline task activated after the deadline) Link: http://lkml.kernel.org/r/1494421417-27550-1-git-send-email-xlpang@redhat.com Signed-off-by: Ingo Molnar Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- kernel/sched/deadline.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a996f7356216..6be2afd9bfd6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -732,6 +732,8 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; + if (dl_se->runtime > 0) + dl_se->runtime = 0; } } From fba063e6dfb413e06b9daa5d45b164761172f5ed Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Sat, 13 Jan 2018 17:27:30 -0600 Subject: [PATCH 030/131] x86/retpoline: Add LFENCE to the retpoline/RSB filling RSB macros commit 28d437d550e1e39f805d99f9f8ac399c778827b7 upstream. The PAUSE instruction is currently used in the retpoline and RSB filling macros as a speculation trap. The use of PAUSE was originally suggested because it showed a very, very small difference in the amount of cycles/time used to execute the retpoline as compared to LFENCE. On AMD, the PAUSE instruction is not a serializing instruction, so the pause/jmp loop will use excess power as it is speculated over waiting for return to mispredict to the correct target. The RSB filling macro is applicable to AMD, and, if software is unable to verify that LFENCE is serializing on AMD (possible when running under a hypervisor), the generic retpoline support will be used and, so, is also applicable to AMD. Keep the current usage of PAUSE for Intel, but add an LFENCE instruction to the speculation trap for AMD. The same sequence has been adopted by GCC for the GCC generated retpolines. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Acked-by: David Woodhouse Acked-by: Arjan van de Ven Cc: Rik van Riel Cc: Andi Kleen Cc: Paul Turner Cc: Peter Zijlstra Cc: Tim Chen Cc: Jiri Kosina Cc: Dave Hansen Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Kees Cook Link: https://lkml.kernel.org/r/20180113232730.31060.36287.stgit@tlendack-t1.amdoffice.net Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 1afd04eb5fb7..e28a9ff1246c 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -11,7 +11,7 @@ * Fill the CPU return stack buffer. * * Each entry in the RSB, if used for a speculative 'ret', contains an - * infinite 'pause; jmp' loop to capture speculative execution. + * infinite 'pause; lfence; jmp' loop to capture speculative execution. * * This is required in various cases for retpoline and IBRS-based * mitigations for the Spectre variant 2 vulnerability. Sometimes to @@ -38,11 +38,13 @@ call 772f; \ 773: /* speculation trap */ \ pause; \ + lfence; \ jmp 773b; \ 772: \ call 774f; \ 775: /* speculation trap */ \ pause; \ + lfence; \ jmp 775b; \ 774: \ dec reg; \ @@ -60,6 +62,7 @@ call .Ldo_rop_\@ .Lspec_trap_\@: pause + lfence jmp .Lspec_trap_\@ .Ldo_rop_\@: mov \reg, (%_ASM_SP) @@ -142,6 +145,7 @@ " .align 16\n" \ "901: call 903f;\n" \ "902: pause;\n" \ + " lfence;\n" \ " jmp 902b;\n" \ " .align 16\n" \ "903: addl $4, %%esp;\n" \ From 1782af2835fe6fbff6265a53fb85847fb9c58137 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 16 Jan 2018 12:52:28 -0800 Subject: [PATCH 031/131] module: Add retpoline tag to VERMAGIC commit 6cfb521ac0d5b97470883ff9b7facae264b7ab12 upstream. Add a marker for retpoline to the module VERMAGIC. This catches the case when a non RETPOLINE compiled module gets loaded into a retpoline kernel, making it insecure. It doesn't handle the case when retpoline has been runtime disabled. Even in this case the match of the retcompile status will be enforced. This implies that even with retpoline run time disabled all modules loaded need to be recompiled. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Acked-by: David Woodhouse Cc: rusty@rustcorp.com.au Cc: arjan.van.de.ven@intel.com Cc: jeyu@kernel.org Cc: torvalds@linux-foundation.org Link: https://lkml.kernel.org/r/20180116205228.4890-1-andi@firstfloor.org Signed-off-by: Greg Kroah-Hartman --- include/linux/vermagic.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index 6f8fbcf10dfb..a3d04934aa96 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -24,10 +24,16 @@ #ifndef MODULE_ARCH_VERMAGIC #define MODULE_ARCH_VERMAGIC "" #endif +#ifdef RETPOLINE +#define MODULE_VERMAGIC_RETPOLINE "retpoline " +#else +#define MODULE_VERMAGIC_RETPOLINE "" +#endif #define VERMAGIC_STRING \ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ - MODULE_ARCH_VERMAGIC + MODULE_ARCH_VERMAGIC \ + MODULE_VERMAGIC_RETPOLINE From aa041f13f8c6761ba985a4aab42c0dab49ec4202 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:21 -0800 Subject: [PATCH 032/131] pipe: avoid round_pipe_size() nr_pages overflow on 32-bit commit d3f14c485867cfb2e0c48aa88c41d0ef4bf5209c upstream. round_pipe_size() contains a right-bit-shift expression which may overflow, which would cause undefined results in a subsequent roundup_pow_of_two() call. static inline unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; } PAGE_SIZE is defined as (1UL << PAGE_SHIFT), so: - 4 bytes wide on 32-bit (0 to 0xffffffff) - 8 bytes wide on 64-bit (0 to 0xffffffffffffffff) That means that 32-bit round_pipe_size(), nr_pages may overflow to 0: size=0x00000000 nr_pages=0x0 size=0x00000001 nr_pages=0x1 size=0xfffff000 nr_pages=0xfffff size=0xfffff001 nr_pages=0x0 << ! size=0xffffffff nr_pages=0x0 << ! This is bad because roundup_pow_of_two(n) is undefined when n == 0! 64-bit is not a problem as the unsigned int size is 4 bytes wide (similar to 32-bit) and the larger, 8 byte wide unsigned long, is sufficient to handle the largest value of the bit shift expression: size=0xffffffff nr_pages=100000 Modify round_pipe_size() to return 0 if n == 0 and updates its callers to handle accordingly. Link: http://lkml.kernel.org/r/1507658689-11669-3-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Dong Jinguang Signed-off-by: Greg Kroah-Hartman --- fs/pipe.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index ab8dad3ccb6a..39eff9a67253 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1001,6 +1001,9 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) { struct pipe_buffer *bufs; + if (!nr_pages) + return -EINVAL; + /* * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't * expect a lot of shrink+grow operations, just free and allocate @@ -1045,13 +1048,19 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) /* * Currently we rely on the pipe array holding a power-of-2 number - * of pages. + * of pages. Returns 0 on error. */ static inline unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; + if (size < pipe_min_size) + size = pipe_min_size; + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages == 0) + return 0; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; } @@ -1062,13 +1071,18 @@ static inline unsigned int round_pipe_size(unsigned int size) int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { + unsigned int rounded_pipe_max_size; int ret; ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); if (ret < 0 || !write) return ret; - pipe_max_size = round_pipe_size(pipe_max_size); + rounded_pipe_max_size = round_pipe_size(pipe_max_size); + if (rounded_pipe_max_size == 0) + return -EINVAL; + + pipe_max_size = rounded_pipe_max_size; return ret; } From 7fd1335392891188b022f5fa518201710d5be3ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Jan 2018 12:20:18 +0100 Subject: [PATCH 033/131] x86/apic/vector: Fix off by one in error path commit 45d55e7bac4028af93f5fa324e69958a0b868e96 upstream. Keith reported the following warning: WARNING: CPU: 28 PID: 1420 at kernel/irq/matrix.c:222 irq_matrix_remove_managed+0x10f/0x120 x86_vector_free_irqs+0xa1/0x180 x86_vector_alloc_irqs+0x1e4/0x3a0 msi_domain_alloc+0x62/0x130 The reason for this is that if the vector allocation fails the error handling code tries to free the failed vector as well, which causes the above imbalance warning to trigger. Adjust the error path to handle this correctly. Fixes: b5dc8e6c21e7 ("x86/irq: Use hierarchical irqdomain to manage CPU interrupt vectors") Reported-by: Keith Busch Signed-off-by: Thomas Gleixner Tested-by: Keith Busch Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801161217300.1823@nanos Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/apic/vector.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 0988e204f1e3..a41e523536a2 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -359,14 +359,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data->chip_data = data; irq_data->hwirq = virq + i; err = assign_irq_vector_policy(virq + i, node, data, info); - if (err) + if (err) { + irq_data->chip_data = NULL; + free_apic_chip_data(data); goto error; + } } return 0; error: - x86_vector_free_irqs(domain, virq, i + 1); + x86_vector_free_irqs(domain, virq, i); return err; } From 47970b4ea09c7dc4bb610825a1e72915abb63aca Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 8 Jan 2018 17:20:18 -0800 Subject: [PATCH 034/131] Input: 88pm860x-ts - fix child-node lookup commit 906bf7daa0618d0ef39f4872ca42218c29a3631f upstream. Fix child node-lookup during probe, which ended up searching the whole device tree depth-first starting at parent rather than just matching on its children. To make things worse, the parent node was prematurely freed, while the child node was leaked. Fixes: 2e57d56747e6 ("mfd: 88pm860x: Device tree support") Signed-off-by: Johan Hovold Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/touchscreen/88pm860x-ts.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/input/touchscreen/88pm860x-ts.c b/drivers/input/touchscreen/88pm860x-ts.c index 251ff2aa0633..7a0dbce4dae9 100644 --- a/drivers/input/touchscreen/88pm860x-ts.c +++ b/drivers/input/touchscreen/88pm860x-ts.c @@ -126,7 +126,7 @@ static int pm860x_touch_dt_init(struct platform_device *pdev, int data, n, ret; if (!np) return -ENODEV; - np = of_find_node_by_name(np, "touch"); + np = of_get_child_by_name(np, "touch"); if (!np) { dev_err(&pdev->dev, "Can't find touch node\n"); return -EINVAL; @@ -144,13 +144,13 @@ static int pm860x_touch_dt_init(struct platform_device *pdev, if (data) { ret = pm860x_reg_write(i2c, PM8607_GPADC_MISC1, data); if (ret < 0) - return -EINVAL; + goto err_put_node; } /* set tsi prebias time */ if (!of_property_read_u32(np, "marvell,88pm860x-tsi-prebias", &data)) { ret = pm860x_reg_write(i2c, PM8607_TSI_PREBIAS, data); if (ret < 0) - return -EINVAL; + goto err_put_node; } /* set prebias & prechg time of pen detect */ data = 0; @@ -161,10 +161,18 @@ static int pm860x_touch_dt_init(struct platform_device *pdev, if (data) { ret = pm860x_reg_write(i2c, PM8607_PD_PREBIAS, data); if (ret < 0) - return -EINVAL; + goto err_put_node; } of_property_read_u32(np, "marvell,88pm860x-resistor-X", res_x); + + of_node_put(np); + return 0; + +err_put_node: + of_node_put(np); + + return -EINVAL; } #else #define pm860x_touch_dt_init(x, y, z) (-1) From 8e70d4862271b9c7a8300273a3bb895a5c9dbc97 Mon Sep 17 00:00:00 2001 From: "H. Nikolaus Schaller" Date: Mon, 9 May 2016 17:01:01 -0700 Subject: [PATCH 035/131] Input: twl6040-vibra - fix DT node memory management commit c52c545ead97fcc2f4f8ea38f1ae3c23211e09a8 upstream. commit e7ec014a47e4 ("Input: twl6040-vibra - update for device tree support") made the separate vibra DT node to a subnode of the twl6040. It now calls of_find_node_by_name() to locate the "vibra" subnode. This function has a side effect to call of_node_put on() for the twl6040 parent node passed in as a parameter. This causes trouble later on. Solution: we must call of_node_get() before of_find_node_by_name() Signed-off-by: H. Nikolaus Schaller Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/misc/twl6040-vibra.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/misc/twl6040-vibra.c b/drivers/input/misc/twl6040-vibra.c index ea63fad48de6..7221a00168bd 100644 --- a/drivers/input/misc/twl6040-vibra.c +++ b/drivers/input/misc/twl6040-vibra.c @@ -262,6 +262,7 @@ static int twl6040_vibra_probe(struct platform_device *pdev) int vddvibr_uV = 0; int error; + of_node_get(twl6040_core_dev->of_node); twl6040_core_node = of_find_node_by_name(twl6040_core_dev->of_node, "vibra"); if (!twl6040_core_node) { From a89e1ac9b0da065fa95ddd33030dcd38231f2b27 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 8 Jan 2018 17:17:48 -0800 Subject: [PATCH 036/131] Input: twl6040-vibra - fix child-node lookup commit dcaf12a8b0bbdbfcfa2be8dff2c4948d9844b4ad upstream. Fix child-node lookup during probe, which ended up searching the whole device tree depth-first starting at parent rather than just matching on its children. Later sanity checks on node properties (which would likely be missing) should prevent this from causing much trouble however, especially as the original premature free of the parent node has already been fixed separately (but that "fix" was apparently never backported to stable). Fixes: e7ec014a47e4 ("Input: twl6040-vibra - update for device tree support") Fixes: c52c545ead97 ("Input: twl6040-vibra - fix DT node memory management") Signed-off-by: Johan Hovold Acked-by: Peter Ujfalusi Tested-by: H. Nikolaus Schaller (on Pyra OMAP5 hardware) Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/misc/twl6040-vibra.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/input/misc/twl6040-vibra.c b/drivers/input/misc/twl6040-vibra.c index 7221a00168bd..1e968ae37f60 100644 --- a/drivers/input/misc/twl6040-vibra.c +++ b/drivers/input/misc/twl6040-vibra.c @@ -262,8 +262,7 @@ static int twl6040_vibra_probe(struct platform_device *pdev) int vddvibr_uV = 0; int error; - of_node_get(twl6040_core_dev->of_node); - twl6040_core_node = of_find_node_by_name(twl6040_core_dev->of_node, + twl6040_core_node = of_get_child_by_name(twl6040_core_dev->of_node, "vibra"); if (!twl6040_core_node) { dev_err(&pdev->dev, "parent of node is missing?\n"); From b6306f3fdcaa21b1544433de50004d13ce59042b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 8 Jan 2018 17:15:06 -0800 Subject: [PATCH 037/131] Input: twl4030-vibra - fix sibling-node lookup commit 5b189201993ab03001a398de731045bfea90c689 upstream. A helper purported to look up a child node based on its name was using the wrong of-helper and ended up prematurely freeing the parent of-node while searching the whole device tree depth-first starting at the parent node. Fixes: 64b9e4d803b1 ("input: twl4030-vibra: Support for DT booted kernel") Fixes: e661d0a04462 ("Input: twl4030-vibra - fix ERROR: Bad of_node_put() warning") Signed-off-by: Johan Hovold Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/misc/twl4030-vibra.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/input/misc/twl4030-vibra.c b/drivers/input/misc/twl4030-vibra.c index 10c4e3d462f1..7233db002588 100644 --- a/drivers/input/misc/twl4030-vibra.c +++ b/drivers/input/misc/twl4030-vibra.c @@ -178,12 +178,14 @@ static SIMPLE_DEV_PM_OPS(twl4030_vibra_pm_ops, twl4030_vibra_suspend, twl4030_vibra_resume); static bool twl4030_vibra_check_coexist(struct twl4030_vibra_data *pdata, - struct device_node *node) + struct device_node *parent) { + struct device_node *node; + if (pdata && pdata->coexist) return true; - node = of_find_node_by_name(node, "codec"); + node = of_get_child_by_name(parent, "codec"); if (node) { of_node_put(node); return true; From cf3625004e6c9093dcaa233c6f553b7eba269b50 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 18 Jan 2018 15:53:10 -0500 Subject: [PATCH 038/131] tracing: Fix converting enum's from the map in trace_event_eval_update() commit 1ebe1eaf2f02784921759992ae1fde1a9bec8fd0 upstream. Since enums do not get converted by the TRACE_EVENT macro into their values, the event format displaces the enum name and not the value. This breaks tools like perf and trace-cmd that need to interpret the raw binary data. To solve this, an enum map was created to convert these enums into their actual numbers on boot up. This is done by TRACE_EVENTS() adding a TRACE_DEFINE_ENUM() macro. Some enums were not being converted. This was caused by an optization that had a bug in it. All calls get checked against this enum map to see if it should be converted or not, and it compares the call's system to the system that the enum map was created under. If they match, then they call is processed. To cut down on the number of iterations needed to find the maps with a matching system, since calls and maps are grouped by system, when a match is made, the index into the map array is saved, so that the next call, if it belongs to the same system as the previous call, could start right at that array index and not have to scan all the previous arrays. The problem was, the saved index was used as the variable to know if this is a call in a new system or not. If the index was zero, it was assumed that the call is in a new system and would keep incrementing the saved index until it found a matching system. The issue arises when the first matching system was at index zero. The next map, if it belonged to the same system, would then think it was the first match and increment the index to one. If the next call belong to the same system, it would begin its search of the maps off by one, and miss the first enum that should be converted. This left a single enum not converted properly. Also add a comment to describe exactly what that index was for. It took me a bit too long to figure out what I was thinking when debugging this issue. Link: http://lkml.kernel.org/r/717BE572-2070-4C1E-9902-9F2E0FEDA4F8@oracle.com Fixes: 0c564a538aa93 ("tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values") Reported-by: Chuck Lever Teste-by: Chuck Lever Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_events.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 996f0fd34312..ba5392807912 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2300,6 +2300,7 @@ void trace_event_enum_update(struct trace_enum_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; + bool first = false; int last_i; int i; @@ -2307,15 +2308,28 @@ void trace_event_enum_update(struct trace_enum_map **map, int len) list_for_each_entry_safe(call, p, &ftrace_events, list) { /* events are usually grouped together with systems */ if (!last_system || call->class->system != last_system) { + first = true; last_i = 0; last_system = call->class->system; } + /* + * Since calls are grouped by systems, the likelyhood that the + * next call in the iteration belongs to the same system as the + * previous call is high. As an optimization, we skip seaching + * for a map[] that matches the call's system if the last call + * was from the same system. That's what last_i is for. If the + * call has the same system as the previous call, then last_i + * will be the index of the first map[] that has a matching + * system. + */ for (i = last_i; i < len; i++) { if (call->class->system == map[i]->system) { /* Save the first system if need be */ - if (!last_i) + if (first) { last_i = i; + first = false; + } update_event_printk(call, map[i]); } } From d5276c0137137dfa89402c52eceeb406f4950404 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 12 Jan 2018 11:12:05 +0100 Subject: [PATCH 039/131] phy: work around 'phys' references to usb-nop-xceiv devices commit b7563e2796f8b23c98afcfea7363194227fa089d upstream. Stefan Wahren reports a problem with a warning fix that was merged for v4.15: we had lots of device nodes with a 'phys' property pointing to a device node that is not compliant with the binding documented in Documentation/devicetree/bindings/phy/phy-bindings.txt This generally works because USB HCD drivers that support both the generic phy subsystem and the older usb-phy subsystem ignore most errors from phy_get() and related calls and then use the usb-phy driver instead. However, it turns out that making the usb-nop-xceiv device compatible with the generic-phy binding changes the phy_get() return code from -EINVAL to -EPROBE_DEFER, and the dwc2 usb controller driver for bcm2835 now returns -EPROBE_DEFER from its probe function rather than ignoring the failure, breaking all USB support on raspberry-pi when CONFIG_GENERIC_PHY is enabled. The same code is used in the dwc3 driver and the usb_add_hcd() function, so a reasonable assumption would be that many other platforms are affected as well. I have reviewed all the related patches and concluded that "usb-nop-xceiv" is the only USB phy that is affected by the change, and since it is by far the most commonly referenced phy, all the other USB phy drivers appear to be used in ways that are are either safe in DT (they don't use the 'phys' property), or in the driver (they already ignore -EPROBE_DEFER from generic-phy when usb-phy is available). To work around the problem, this adds a special case to _of_phy_get() so we ignore any PHY node that is compatible with "usb-nop-xceiv", as we know that this can never load no matter how much we defer. In the future, we might implement a generic-phy driver for "usb-nop-xceiv" and then remove this workaround. Since we generally want older kernels to also want to work with the fixed devicetree files, it would be good to backport the patch into stable kernels as well (3.13+ are possibly affected), even though they don't contain any of the patches that may have caused regressions. Fixes: 014d6da6cb25 ARM: dts: bcm283x: Fix DTC warnings about missing phy-cells Fixes: c5bbf358b790 arm: dts: nspire: Add missing #phy-cells to usb-nop-xceiv Fixes: 44e5dced2ef6 arm: dts: marvell: Add missing #phy-cells to usb-nop-xceiv Fixes: f568f6f554b8 ARM: dts: omap: Add missing #phy-cells to usb-nop-xceiv Fixes: d745d5f277bf ARM: dts: imx51-zii-rdu1: Add missing #phy-cells to usb-nop-xceiv Fixes: 915fbe59cbf2 ARM: dts: imx: Add missing #phy-cells to usb-nop-xceiv Link: https://marc.info/?l=linux-usb&m=151518314314753&w=2 Link: https://patchwork.kernel.org/patch/10158145/ Cc: Felipe Balbi Cc: Eric Anholt Tested-by: Stefan Wahren Acked-by: Rob Herring Tested-by: Hans Verkuil Acked-by: Kishon Vijay Abraham I Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/phy/phy-core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index e7e574dc667a..be1f0276ab23 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -365,6 +365,10 @@ static struct phy *_of_phy_get(struct device_node *np, int index) if (ret) return ERR_PTR(-ENODEV); + /* This phy type handled by the usb-phy subsystem for now */ + if (of_device_is_compatible(args.np, "usb-nop-xceiv")) + return ERR_PTR(-ENODEV); + mutex_lock(&phy_provider_mutex); phy_provider = of_phy_provider_lookup(args.np); if (IS_ERR(phy_provider) || !try_module_get(phy_provider->owner)) { From 2d5523bf47b4326ff1279613bfe69982bc4675c1 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Thu, 4 Jan 2018 17:53:12 +0100 Subject: [PATCH 040/131] ARM: dts: kirkwood: fix pin-muxing of MPP7 on OpenBlocks A7 commit 56aeb07c914a616ab84357d34f8414a69b140cdf upstream. MPP7 is currently muxed as "gpio", but this function doesn't exist for MPP7, only "gpo" is available. This causes the following error: kirkwood-pinctrl f1010000.pin-controller: unsupported function gpio on pin mpp7 pinctrl core: failed to register map default (6): invalid type given kirkwood-pinctrl f1010000.pin-controller: error claiming hogs: -22 kirkwood-pinctrl f1010000.pin-controller: could not claim hogs: -22 kirkwood-pinctrl f1010000.pin-controller: unable to register pinctrl driver kirkwood-pinctrl: probe of f1010000.pin-controller failed with error -22 So the pinctrl driver is not probed, all device drivers (including the UART driver) do a -EPROBE_DEFER, and therefore the system doesn't really boot (well, it boots, but with no UART, and no devices that require pin-muxing). Back when the Device Tree file for this board was introduced, the definition was already wrong. The pinctrl driver also always described as "gpo" this function for MPP7. However, between Linux 4.10 and 4.11, a hog pin failing to be muxed was turned from a simple warning to a hard error that caused the entire pinctrl driver probe to bail out. This is probably the result of commit 6118714275f0a ("pinctrl: core: Fix pinctrl_register_and_init() with pinctrl_enable()"). This commit fixes the Device Tree to use the proper "gpo" function for MPP7, which fixes the boot of OpenBlocks A7, which was broken since Linux 4.11. Fixes: f24b56cbcd9d ("ARM: kirkwood: add support for OpenBlocks A7 platform") Signed-off-by: Thomas Petazzoni Reviewed-by: Andrew Lunn Signed-off-by: Gregory CLEMENT Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/kirkwood-openblocks_a7.dts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/kirkwood-openblocks_a7.dts b/arch/arm/boot/dts/kirkwood-openblocks_a7.dts index d5e3bc518968..d57f48543f76 100644 --- a/arch/arm/boot/dts/kirkwood-openblocks_a7.dts +++ b/arch/arm/boot/dts/kirkwood-openblocks_a7.dts @@ -53,7 +53,8 @@ }; pinctrl: pin-controller@10000 { - pinctrl-0 = <&pmx_dip_switches &pmx_gpio_header>; + pinctrl-0 = <&pmx_dip_switches &pmx_gpio_header + &pmx_gpio_header_gpo>; pinctrl-names = "default"; pmx_uart0: pmx-uart0 { @@ -85,11 +86,16 @@ * ground. */ pmx_gpio_header: pmx-gpio-header { - marvell,pins = "mpp17", "mpp7", "mpp29", "mpp28", + marvell,pins = "mpp17", "mpp29", "mpp28", "mpp35", "mpp34", "mpp40"; marvell,function = "gpio"; }; + pmx_gpio_header_gpo: pxm-gpio-header-gpo { + marvell,pins = "mpp7"; + marvell,function = "gpo"; + }; + pmx_gpio_init: pmx-init { marvell,pins = "mpp38"; marvell,function = "gpio"; From 082dfe6141f3a8adab45c7def0d1b1da17545f7c Mon Sep 17 00:00:00 2001 From: Stephane Grosjean Date: Mon, 15 Jan 2018 16:31:19 +0100 Subject: [PATCH 041/131] can: peak: fix potential bug in packet fragmentation commit d8a243af1a68395e07ac85384a2740d4134c67f4 upstream. In some rare conditions when running one PEAK USB-FD interface over a non high-speed USB controller, one useless USB fragment might be sent. This patch fixes the way a USB command is fragmented when its length is greater than 64 bytes and when the underlying USB controller is not a high-speed one. Signed-off-by: Stephane Grosjean Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index ce44a033f63b..64cc86a82b2d 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -184,7 +184,7 @@ static int pcan_usb_fd_send_cmd(struct peak_usb_device *dev, void *cmd_tail) void *cmd_head = pcan_usb_fd_cmd_buffer(dev); int err = 0; u8 *packet_ptr; - int i, n = 1, packet_len; + int packet_len; ptrdiff_t cmd_len; /* usb device unregistered? */ @@ -201,17 +201,13 @@ static int pcan_usb_fd_send_cmd(struct peak_usb_device *dev, void *cmd_tail) } packet_ptr = cmd_head; + packet_len = cmd_len; /* firmware is not able to re-assemble 512 bytes buffer in full-speed */ - if ((dev->udev->speed != USB_SPEED_HIGH) && - (cmd_len > PCAN_UFD_LOSPD_PKT_SIZE)) { - packet_len = PCAN_UFD_LOSPD_PKT_SIZE; - n += cmd_len / packet_len; - } else { - packet_len = cmd_len; - } + if (unlikely(dev->udev->speed != USB_SPEED_HIGH)) + packet_len = min(packet_len, PCAN_UFD_LOSPD_PKT_SIZE); - for (i = 0; i < n; i++) { + do { err = usb_bulk_msg(dev->udev, usb_sndbulkpipe(dev->udev, PCAN_USBPRO_EP_CMDOUT), @@ -224,7 +220,12 @@ static int pcan_usb_fd_send_cmd(struct peak_usb_device *dev, void *cmd_tail) } packet_ptr += packet_len; - } + cmd_len -= packet_len; + + if (cmd_len < PCAN_UFD_LOSPD_PKT_SIZE) + packet_len = cmd_len; + + } while (packet_len > 0); return err; } From b7bd013a3fe033280bcc58903c51f087e4989229 Mon Sep 17 00:00:00 2001 From: Xinyu Lin Date: Sun, 17 Dec 2017 20:13:39 +0800 Subject: [PATCH 042/131] libata: apply MAX_SEC_1024 to all LITEON EP1 series devices commit db5ff909798ef0099004ad50a0ff5fde92426fd1 upstream. LITEON EP1 has the same timeout issues as CX1 series devices. Revert max_sectors to the value of 1024. Fixes: e0edc8c54646 ("libata: apply MAX_SEC_1024 to all CX1-JB*-HP devices") Signed-off-by: Xinyu Lin Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index b0b77b61c40c..69ec1c5d7152 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4143,6 +4143,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { * https://bugzilla.kernel.org/show_bug.cgi?id=121671 */ { "LITEON CX1-JB*-HP", NULL, ATA_HORKAGE_MAX_SEC_1024 }, + { "LITEON EP1-*", NULL, ATA_HORKAGE_MAX_SEC_1024 }, /* Devices we expect to fail diagnostics */ From 9c7755af771ac1b4ef1fc0e552749aa80d23af32 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Dec 2017 09:56:06 +0000 Subject: [PATCH 043/131] dm btree: fix serious bug in btree_split_beneath() commit bc68d0a43560e950850fc69b58f0f8254b28f6d6 upstream. When inserting a new key/value pair into a btree we walk down the spine of btree nodes performing the following 2 operations: i) space for a new entry ii) adjusting the first key entry if the new key is lower than any in the node. If the _root_ node is full, the function btree_split_beneath() allocates 2 new nodes, and redistibutes the root nodes entries between them. The root node is left with 2 entries corresponding to the 2 new nodes. btree_split_beneath() then adjusts the spine to point to one of the two new children. This means the first key is never adjusted if the new key was lower, ie. operation (ii) gets missed out. This can result in the new key being 'lost' for a period; until another low valued key is inserted that will uncover it. This is a serious bug, and quite hard to make trigger in normal use. A reproducing test case ("thin create devices-in-reverse-order") is available as part of the thin-provision-tools project: https://github.com/jthornber/thin-provisioning-tools/blob/master/functional-tests/device-mapper/dm-tests.scm#L593 Fix the issue by changing btree_split_beneath() so it no longer adjusts the spine. Instead it unlocks both the new nodes, and lets the main loop in btree_insert_raw() relock the appropriate one and make any neccessary adjustments. Reported-by: Monty Pavel Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/persistent-data/dm-btree.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index a1a68209bd36..880b7dee9c52 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -671,23 +671,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) pn->keys[1] = rn->keys[0]; memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64)); - /* - * rejig the spine. This is ugly, since it knows too - * much about the spine - */ - if (s->nodes[0] != new_parent) { - unlock_block(s->info, s->nodes[0]); - s->nodes[0] = new_parent; - } - if (key < le64_to_cpu(rn->keys[0])) { - unlock_block(s->info, right); - s->nodes[1] = left; - } else { - unlock_block(s->info, left); - s->nodes[1] = right; - } - s->count = 2; - + unlock_block(s->info, left); + unlock_block(s->info, right); return 0; } From 7e7b086ef2fec6b1bca061a3082fc1c2701999d5 Mon Sep 17 00:00:00 2001 From: Dennis Yang Date: Tue, 12 Dec 2017 18:21:40 +0800 Subject: [PATCH 044/131] dm thin metadata: THIN_MAX_CONCURRENT_LOCKS should be 6 commit 490ae017f54e55bde382d45ea24bddfb6d1a0aaf upstream. For btree removal, there is a corner case that a single thread could takes 6 locks which is more than THIN_MAX_CONCURRENT_LOCKS(5) and leads to deadlock. A btree removal might eventually call rebalance_children()->rebalance3() to rebalance entries of three neighbor child nodes when shadow_spine has already acquired two write locks. In rebalance3(), it tries to shadow and acquire the write locks of all three child nodes. However, shadowing a child node requires acquiring a read lock of the original child node and a write lock of the new block. Although the read lock will be released after block shadowing, shadowing the third child node in rebalance3() could still take the sixth lock. (2 write locks for shadow_spine + 2 write locks for the first two child nodes's shadow + 1 write lock for the last child node's shadow + 1 read lock for the last child node) Signed-off-by: Dennis Yang Acked-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm-thin-metadata.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 3b67afda430b..e339f4288e8f 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -81,10 +81,14 @@ #define SECTOR_TO_BLOCK_SHIFT 3 /* + * For btree insert: * 3 for btree insert + * 2 for btree lookup used within space map + * For btree remove: + * 2 for shadow spine + + * 4 for rebalance 3 child node */ -#define THIN_MAX_CONCURRENT_LOCKS 5 +#define THIN_MAX_CONCURRENT_LOCKS 6 /* This should be plenty */ #define SPACE_MAP_ROOT_SIZE 128 From 5ecd5c8388f060a02a6b97d9b99d9ea885903568 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 16 Jan 2018 10:23:47 +0000 Subject: [PATCH 045/131] arm64: KVM: Fix SMCCC handling of unimplemented SMC/HVC calls commit acfb3b883f6d6a4b5d27ad7fdded11f6a09ae6dd upstream. KVM doesn't follow the SMCCC when it comes to unimplemented calls, and inject an UNDEF instead of returning an error. Since firmware calls are now used for security mitigation, they are becoming more common, and the undef is counter productive. Instead, let's follow the SMCCC which states that -1 must be returned to the caller when getting an unknown function number. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kvm/handle_exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index ba93a09eb536..5295aef7c8f0 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -42,7 +42,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_psci_call(vcpu); if (ret < 0) { - kvm_inject_undefined(vcpu); + vcpu_set_reg(vcpu, 0, ~0UL); return 1; } @@ -51,7 +51,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) { - kvm_inject_undefined(vcpu); + vcpu_set_reg(vcpu, 0, ~0UL); return 1; } From 6b1c99e275c034e4650044a7bb1a0bc274e1eb45 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 26 Dec 2017 23:43:54 -0600 Subject: [PATCH 046/131] x86/cpu, x86/pti: Do not enable PTI on AMD processors commit 694d99d40972f12e59a3696effee8a376b79d7c8 upstream. AMD processors are not subject to the types of attacks that the kernel page table isolation feature protects against. The AMD microarchitecture does not allow memory references, including speculative references, that access higher privileged data when running in a lesser privileged mode when that access would result in a page fault. Disable page table isolation by default on AMD processors by not setting the X86_BUG_CPU_INSECURE feature, which controls whether X86_FEATURE_PTI is set. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Dave Hansen Cc: Andy Lutomirski Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20171227054354.20369.94587.stgit@tlendack-t1.amdoffice.net Cc: Nick Lowe Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0531c1707b40..f7f2ad3687ee 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -831,8 +831,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); - /* Assume for now that ALL x86 CPUs are insecure */ - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + if (c->x86_vendor != X86_VENDOR_AMD) + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); From ff535919c1366b0218113f95c447d92c3aac0c1f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 1 Nov 2016 12:46:19 +1100 Subject: [PATCH 047/131] kbuild: modversions for EXPORT_SYMBOL() for asm commit 4efca4ed05cbdfd13ec3e8cb623fb77d6e4ab187 upstream. Allow architectures to create asm/asm-prototypes.h file that provides C prototypes for exported asm functions, which enables proper CRC versions to be generated for them. Signed-off-by: Nicholas Piggin Signed-off-by: Michal Marek [jkosina@suse.cz: folded cc6acc11cad1 fixup in as well ] Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- scripts/Makefile.build | 87 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 7 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 01df30af4d4a..18209917e379 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -158,7 +158,8 @@ cmd_cc_i_c = $(CPP) $(c_flags) -o $@ $< $(obj)/%.i: $(src)/%.c FORCE $(call if_changed_dep,cc_i_c) -cmd_gensymtypes = \ +# These mirror gensymtypes_S and co below, keep them in synch. +cmd_gensymtypes_c = \ $(CPP) -D__GENKSYMS__ $(c_flags) $< | \ $(GENKSYMS) $(if $(1), -T $(2)) \ $(patsubst y,-s _,$(CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX)) \ @@ -168,7 +169,7 @@ cmd_gensymtypes = \ quiet_cmd_cc_symtypes_c = SYM $(quiet_modtag) $@ cmd_cc_symtypes_c = \ set -e; \ - $(call cmd_gensymtypes,true,$@) >/dev/null; \ + $(call cmd_gensymtypes_c,true,$@) >/dev/null; \ test -s $@ || rm -f $@ $(obj)/%.symtypes : $(src)/%.c FORCE @@ -197,9 +198,10 @@ else # the actual value of the checksum generated by genksyms cmd_cc_o_c = $(CC) $(c_flags) -c -o $(@D)/.tmp_$(@F) $< -cmd_modversions = \ + +cmd_modversions_c = \ if $(OBJDUMP) -h $(@D)/.tmp_$(@F) | grep -q __ksymtab; then \ - $(call cmd_gensymtypes,$(KBUILD_SYMTYPES),$(@:.o=.symtypes)) \ + $(call cmd_gensymtypes_c,$(KBUILD_SYMTYPES),$(@:.o=.symtypes)) \ > $(@D)/.tmp_$(@F:.o=.ver); \ \ $(LD) $(LDFLAGS) -r -o $@ $(@D)/.tmp_$(@F) \ @@ -244,7 +246,7 @@ endif define rule_cc_o_c $(call echo-cmd,checksrc) $(cmd_checksrc) \ $(call echo-cmd,cc_o_c) $(cmd_cc_o_c); \ - $(cmd_modversions) \ + $(cmd_modversions_c) \ $(call echo-cmd,record_mcount) \ $(cmd_record_mcount) \ scripts/basic/fixdep $(depfile) $@ '$(call make-cmd,cc_o_c)' > \ @@ -253,6 +255,15 @@ define rule_cc_o_c mv -f $(dot-target).tmp $(dot-target).cmd endef +define rule_as_o_S + $(call echo-cmd,as_o_S) $(cmd_as_o_S); \ + scripts/basic/fixdep $(depfile) $@ '$(call make-cmd,as_o_S)' > \ + $(dot-target).tmp; \ + $(cmd_modversions_S) \ + rm -f $(depfile); \ + mv -f $(dot-target).tmp $(dot-target).cmd +endef + # Built-in and composite module parts $(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE $(call cmd,force_checksrc) @@ -281,6 +292,38 @@ modkern_aflags := $(KBUILD_AFLAGS_KERNEL) $(AFLAGS_KERNEL) $(real-objs-m) : modkern_aflags := $(KBUILD_AFLAGS_MODULE) $(AFLAGS_MODULE) $(real-objs-m:.o=.s): modkern_aflags := $(KBUILD_AFLAGS_MODULE) $(AFLAGS_MODULE) +# .S file exports must have their C prototypes defined in asm/asm-prototypes.h +# or a file that it includes, in order to get versioned symbols. We build a +# dummy C file that includes asm-prototypes and the EXPORT_SYMBOL lines from +# the .S file (with trailing ';'), and run genksyms on that, to extract vers. +# +# This is convoluted. The .S file must first be preprocessed to run guards and +# expand names, then the resulting exports must be constructed into plain +# EXPORT_SYMBOL(symbol); to build our dummy C file, and that gets preprocessed +# to make the genksyms input. +# +# These mirror gensymtypes_c and co above, keep them in synch. +cmd_gensymtypes_S = \ + (echo "\#include " ; \ + echo "\#include " ; \ + $(CPP) $(a_flags) $< | \ + grep "\<___EXPORT_SYMBOL\>" | \ + sed 's/.*___EXPORT_SYMBOL[[:space:]]*\([a-zA-Z0-9_]*\)[[:space:]]*,.*/EXPORT_SYMBOL(\1);/' ) | \ + $(CPP) -D__GENKSYMS__ $(c_flags) -xc - | \ + $(GENKSYMS) $(if $(1), -T $(2)) \ + $(patsubst y,-s _,$(CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX)) \ + $(if $(KBUILD_PRESERVE),-p) \ + -r $(firstword $(wildcard $(2:.symtypes=.symref) /dev/null)) + +quiet_cmd_cc_symtypes_S = SYM $(quiet_modtag) $@ +cmd_cc_symtypes_S = \ + set -e; \ + $(call cmd_gensymtypes_S,true,$@) >/dev/null; \ + test -s $@ || rm -f $@ + +$(obj)/%.symtypes : $(src)/%.S FORCE + $(call cmd,cc_symtypes_S) + quiet_cmd_as_s_S = CPP $(quiet_modtag) $@ cmd_as_s_S = $(CPP) $(a_flags) -o $@ $< @@ -288,10 +331,40 @@ $(obj)/%.s: $(src)/%.S FORCE $(call if_changed_dep,as_s_S) quiet_cmd_as_o_S = AS $(quiet_modtag) $@ -cmd_as_o_S = $(CC) $(a_flags) -c -o $@ $< + +ifndef CONFIG_MODVERSIONS +cmd_as_o_S = $(CC) $(a_flags) -c -o $@ $< + +else + +ASM_PROTOTYPES := $(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/asm-prototypes.h) + +ifeq ($(ASM_PROTOTYPES),) +cmd_as_o_S = $(CC) $(a_flags) -c -o $@ $< + +else + +# versioning matches the C process described above, with difference that +# we parse asm-prototypes.h C header to get function definitions. + +cmd_as_o_S = $(CC) $(a_flags) -c -o $(@D)/.tmp_$(@F) $< + +cmd_modversions_S = \ + if $(OBJDUMP) -h $(@D)/.tmp_$(@F) | grep -q __ksymtab; then \ + $(call cmd_gensymtypes_S,$(KBUILD_SYMTYPES),$(@:.o=.symtypes)) \ + > $(@D)/.tmp_$(@F:.o=.ver); \ + \ + $(LD) $(LDFLAGS) -r -o $@ $(@D)/.tmp_$(@F) \ + -T $(@D)/.tmp_$(@F:.o=.ver); \ + rm -f $(@D)/.tmp_$(@F) $(@D)/.tmp_$(@F:.o=.ver); \ + else \ + mv -f $(@D)/.tmp_$(@F) $@; \ + fi; +endif +endif $(obj)/%.o: $(src)/%.S FORCE - $(call if_changed_dep,as_o_S) + $(call if_changed_rule,as_o_S) targets += $(real-objs-y) $(real-objs-m) $(lib-y) targets += $(extra-y) $(MAKECMDGOALS) $(always) From f59e7ce17ba327245c8feb312d447b09d3b98eba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 18 Jan 2018 16:28:26 +0100 Subject: [PATCH 048/131] x86/mce: Make machine check speculation protected commit 6f41c34d69eb005e7848716bbcafc979b35037d5 upstream. The machine check idtentry uses an indirect branch directly from the low level code. This evades the speculation protection. Replace it by a direct call into C code and issue the indirect call there so the compiler can apply the proper speculation protection. Signed-off-by: Thomas Gleixner Reviewed-by:Borislav Petkov Reviewed-by: David Woodhouse Niced-by: Peter Zijlstra Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801181626290.1847@nanos Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/traps.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 81b1cd533965..a03b22c615d9 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1031,7 +1031,7 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 #endif #ifdef CONFIG_X86_MCE -idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif /* diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c3496619740a..156959ca49ce 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -92,6 +92,7 @@ dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); #endif +dotraplinkage void do_mce(struct pt_regs *, long); static inline int get_si_code(unsigned long condition) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7e8a736d09db..364fbad72e60 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1672,6 +1672,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; +dotraplinkage void do_mce(struct pt_regs *regs, long error_code) +{ + machine_check_vector(regs, error_code); +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: From 799dc737680a8074a0c7c2d3426b85f4c439377f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 19 Jan 2018 01:14:21 +0900 Subject: [PATCH 049/131] retpoline: Introduce start/end markers of indirect thunk commit 736e80a4213e9bbce40a7c050337047128b472ac upstream. Introduce start/end markers of __x86_indirect_thunk_* functions. To make it easy, consolidate .text.__x86.indirect_thunk.* sections to one .text.__x86.indirect_thunk section and put it in the end of kernel text section and adds __indirect_thunk_start/end so that other subsystem (e.g. kprobes) can identify it. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/151629206178.10241.6828804696410044771.stgit@devbox Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 3 +++ arch/x86/kernel/vmlinux.lds.S | 7 +++++++ arch/x86/lib/retpoline.S | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e28a9ff1246c..4f7a5d3fed91 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -171,6 +171,9 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +extern char __indirect_thunk_start[]; +extern char __indirect_thunk_end[]; + /* * On VMEXIT we must ensure that no RSB predictions learned in the guest * can be followed in the host, by overwriting the RSB completely. Both diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf11f562..e065065a4dfb 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -104,6 +104,13 @@ SECTIONS IRQENTRY_TEXT *(.fixup) *(.gnu.warning) + +#ifdef CONFIG_RETPOLINE + __indirect_thunk_start = .; + *(.text.__x86.indirect_thunk) + __indirect_thunk_end = .; +#endif + /* End of text section */ _etext = .; } :text = 0x9090 diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 019a03599bb0..8a8093e9cf01 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -9,7 +9,7 @@ #include .macro THUNK reg - .section .text.__x86.indirect_thunk.\reg + .section .text.__x86.indirect_thunk ENTRY(__x86_indirect_thunk_\reg) CFI_STARTPROC From 9b8bd0d3586842716f5673e7a0922a9c9e1fcbfd Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 19 Jan 2018 01:14:51 +0900 Subject: [PATCH 050/131] kprobes/x86: Blacklist indirect thunk functions for kprobes commit c1804a236894ecc942da7dc6c5abe209e56cba93 upstream. Mark __x86_indirect_thunk_* functions as blacklist for kprobes because those functions can be called from anywhere in the kernel including blacklist functions of kprobes. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/151629209111.10241.5444852823378068683.stgit@devbox Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/retpoline.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 8a8093e9cf01..e611a124c442 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -25,7 +25,8 @@ ENDPROC(__x86_indirect_thunk_\reg) * than one per register with the correct names. So we do it * the simple and nasty way... */ -#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg) +#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) +#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) #define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) GENERATE_THUNK(_ASM_AX) From 6cb73eb8045157ea280f0a047777ea1f56547375 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 19 Jan 2018 01:15:20 +0900 Subject: [PATCH 051/131] kprobes/x86: Disable optimizing on the function jumps to indirect thunk commit c86a32c09f8ced67971a2310e3b0dda4d1749007 upstream. Since indirect jump instructions will be replaced by jump to __x86_indirect_thunk_*, those jmp instruction must be treated as an indirect jump. Since optprobe prohibits to optimize probes in the function which uses an indirect jump, it also needs to find out the function which jump to __x86_indirect_thunk_* and disable optimization. Add a check that the jump target address is between the __indirect_thunk_start/end when optimizing kprobe. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/151629212062.10241.6991266100233002273.stgit@devbox Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/kprobes/opt.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index c9d488f3e4cd..ea8e2b846101 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "common.h" @@ -191,7 +192,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src) } /* Check whether insn is indirect jump */ -static int insn_is_indirect_jump(struct insn *insn) +static int __insn_is_indirect_jump(struct insn *insn) { return ((insn->opcode.bytes[0] == 0xff && (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ @@ -225,6 +226,26 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) return (start <= target && target <= start + len); } +static int insn_is_indirect_jump(struct insn *insn) +{ + int ret = __insn_is_indirect_jump(insn); + +#ifdef CONFIG_RETPOLINE + /* + * Jump to x86_indirect_thunk_* is treated as an indirect jump. + * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with + * older gcc may use indirect jump. So we add this check instead of + * replace indirect-jump check. + */ + if (!ret) + ret = insn_jump_into_range(insn, + (unsigned long)__indirect_thunk_start, + (unsigned long)__indirect_thunk_end - + (unsigned long)__indirect_thunk_start); +#endif + return ret; +} + /* Decode whole function to ensure any instructions don't jump into target */ static int can_optimize(unsigned long paddr) { From 58f96ac5dba6b66d9fb4e0072e0e01df0360a43f Mon Sep 17 00:00:00 2001 From: "zhenwei.pi" Date: Thu, 18 Jan 2018 09:04:52 +0800 Subject: [PATCH 052/131] x86/pti: Document fix wrong index commit 98f0fceec7f84d80bc053e49e596088573086421 upstream. In section <2. Runtime Cost>, fix wrong index. Signed-off-by: zhenwei.pi Signed-off-by: Thomas Gleixner Cc: dave.hansen@linux.intel.com Link: https://lkml.kernel.org/r/1516237492-27739-1-git-send-email-zhenwei.pi@youruncloud.com Signed-off-by: Greg Kroah-Hartman --- Documentation/x86/pti.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/x86/pti.txt b/Documentation/x86/pti.txt index d11eff61fc9a..5cd58439ad2d 100644 --- a/Documentation/x86/pti.txt +++ b/Documentation/x86/pti.txt @@ -78,7 +78,7 @@ this protection comes at a cost: non-PTI SYSCALL entry code, so requires mapping fewer things into the userspace page tables. The downside is that stacks must be switched at entry time. - d. Global pages are disabled for all kernel structures not + c. Global pages are disabled for all kernel structures not mapped into both kernel and userspace page tables. This feature of the MMU allows different processes to share TLB entries mapping the kernel. Losing the feature means more From 11e619414b69b7f1e47baac72c5be589d86e5393 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 17 Jan 2018 14:53:28 -0800 Subject: [PATCH 053/131] x86/retpoline: Optimize inline assembler for vmexit_fill_RSB commit 3f7d875566d8e79c5e0b2c9a413e91b2c29e0854 upstream. The generated assembler for the C fill RSB inline asm operations has several issues: - The C code sets up the loop register, which is then immediately overwritten in __FILL_RETURN_BUFFER with the same value again. - The C code also passes in the iteration count in another register, which is not used at all. Remove these two unnecessary operations. Just rely on the single constant passed to the macro for the iterations. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: dave.hansen@intel.com Cc: gregkh@linuxfoundation.org Cc: torvalds@linux-foundation.org Cc: arjan@linux.intel.com Link: https://lkml.kernel.org/r/20180117225328.15414-1-andi@firstfloor.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 4f7a5d3fed91..492370b9b35b 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -183,15 +183,16 @@ extern char __indirect_thunk_end[]; static inline void vmexit_fill_RSB(void) { #ifdef CONFIG_RETPOLINE - unsigned long loops = RSB_CLEAR_LOOPS / 2; + unsigned long loops; asm volatile (ALTERNATIVE("jmp 910f", __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), X86_FEATURE_RETPOLINE) "910:" - : "=&r" (loops), ASM_CALL_CONSTRAINT - : "r" (loops) : "memory" ); + : "=r" (loops), ASM_CALL_CONSTRAINT + : : "memory" ); #endif } + #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ From 38bc402237f8cb68c5f3ab0292c6d5537af38847 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sun, 29 Oct 2017 16:27:21 +0100 Subject: [PATCH 054/131] MIPS: AR7: ensure the port type's FCR value is used commit 0a5191efe06b5103909206e4fbcff81d30283f8e upstream. Since commit aef9a7bd9b67 ("serial/uart/8250: Add tunable RX interrupt trigger I/F of FIFO buffers"), the port's default FCR value isn't used in serial8250_do_set_termios anymore, but copied over once in serial8250_config_port and then modified as needed. Unfortunately, serial8250_config_port will never be called if the port is shared between kernel and userspace, and the port's flag doesn't have UPF_BOOT_AUTOCONF, which would trigger a serial8250_config_port as well. This causes garbled output from userspace: [ 5.220000] random: procd urandom read with 49 bits of entropy available ers [kee Fix this by forcing it to be configured on boot, resulting in the expected output: [ 5.250000] random: procd urandom read with 50 bits of entropy available Press the [f] key and hit [enter] to enter failsafe mode Press the [1], [2], [3] or [4] key and hit [enter] to select the debug level Fixes: aef9a7bd9b67 ("serial/uart/8250: Add tunable RX interrupt trigger I/F of FIFO buffers") Signed-off-by: Jonas Gorski Cc: Greg Kroah-Hartman Cc: Yoshihiro YUNOMAE Cc: Florian Fainelli Cc: Nicolas Schichan Cc: linux-mips@linux-mips.org Cc: linux-serial@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17544/ Signed-off-by: Ralf Baechle Cc: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/ar7/platform.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/ar7/platform.c b/arch/mips/ar7/platform.c index 3446b6fb3acb..9da4e2292fc7 100644 --- a/arch/mips/ar7/platform.c +++ b/arch/mips/ar7/platform.c @@ -576,7 +576,7 @@ static int __init ar7_register_uarts(void) uart_port.type = PORT_AR7; uart_port.uartclk = clk_get_rate(bus_clk) / 2; uart_port.iotype = UPIO_MEM32; - uart_port.flags = UPF_FIXED_TYPE; + uart_port.flags = UPF_FIXED_TYPE | UPF_BOOT_AUTOCONF; uart_port.regshift = 2; uart_port.line = 0; From f0d0a93b0e81278e86c7d81c25a54ac4f4b739d2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 23 Jan 2018 19:50:18 +0100 Subject: [PATCH 055/131] Linux 4.4.113 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 07070a1e6292..39019c9d205c 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 112 +SUBLEVEL = 113 EXTRAVERSION = NAME = Blurry Fish Butt From 8bb0c6a1b3b2fe92707ee6f6901bfdfc392f8cd1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 9 Dec 2016 10:24:05 -0800 Subject: [PATCH 056/131] x86/asm/32: Make sync_core() handle missing CPUID on all 32-bit kernels commit 1c52d859cb2d417e7216d3e56bb7fea88444cec9 upstream. We support various non-Intel CPUs that don't have the CPUID instruction, so the M486 test was wrong. For now, fix it with a big hammer: handle missing CPUID on all 32-bit CPUs. Reported-by: One Thousand Gnomes Signed-off-by: Andy Lutomirski Cc: Juergen Gross Cc: Peter Zijlstra Cc: Brian Gerst Cc: Matthew Whitehead Cc: Borislav Petkov Cc: Henrique de Moraes Holschuh Cc: Andrew Cooper Cc: Boris Ostrovsky Cc: xen-devel Link: http://lkml.kernel.org/r/685bd083a7c036f7769510b6846315b17d6ba71f.1481307769.git.luto@kernel.org Signed-off-by: Thomas Gleixner Cc: "Zhang, Ning A" Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c124d6ab4bf9..86bccb4bd4dc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -574,7 +574,7 @@ static inline void sync_core(void) { int tmp; -#ifdef CONFIG_M486 +#ifdef CONFIG_X86_32 /* * Do a CPUID if available, otherwise do a jump. The jump * can conveniently enough be the jump around CPUID. From 28f467e0bdda754aa36390fd90b01823f0d3b18d Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Dec 2017 14:16:49 -0700 Subject: [PATCH 057/131] usbip: prevent vhci_hcd driver from leaking a socket pointer address commit 2f2d0088eb93db5c649d2a5e34a3800a8a935fc5 upstream. When a client has a USB device attached over IP, the vhci_hcd driver is locally leaking a socket pointer address via the /sys/devices/platform/vhci_hcd/status file (world-readable) and in debug output when "usbip --debug port" is run. Fix it to not leak. The socket pointer address is not used at the moment and it was made visible as a convenient way to find IP address from socket pointer address by looking up /proc/net/{tcp,tcp6}. As this opens a security hole, the fix replaces socket pointer address with sockfd. Reported-by: Secunia Research Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/usbip_common.h | 1 + drivers/usb/usbip/vhci_sysfs.c | 25 +++++++++++++++---------- tools/usb/usbip/libsrc/vhci_driver.c | 8 ++++---- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/usb/usbip/usbip_common.h b/drivers/usb/usbip/usbip_common.h index 86b08475c254..f875ccaa55f9 100644 --- a/drivers/usb/usbip/usbip_common.h +++ b/drivers/usb/usbip/usbip_common.h @@ -261,6 +261,7 @@ struct usbip_device { /* lock for status */ spinlock_t lock; + int sockfd; struct socket *tcp_socket; struct task_struct *tcp_rx; diff --git a/drivers/usb/usbip/vhci_sysfs.c b/drivers/usb/usbip/vhci_sysfs.c index 211f43f67ea2..84c21c4ccf46 100644 --- a/drivers/usb/usbip/vhci_sysfs.c +++ b/drivers/usb/usbip/vhci_sysfs.c @@ -39,16 +39,20 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr, /* * output example: - * prt sta spd dev socket local_busid - * 000 004 000 000 c5a7bb80 1-2.3 - * 001 004 000 000 d8cee980 2-3.4 + * port sta spd dev sockfd local_busid + * 0000 004 000 00000000 000003 1-2.3 + * 0001 004 000 00000000 000004 2-3.4 * - * IP address can be retrieved from a socket pointer address by looking - * up /proc/net/{tcp,tcp6}. Also, a userland program may remember a - * port number and its peer IP address. + * Output includes socket fd instead of socket pointer address to + * avoid leaking kernel memory address in: + * /sys/devices/platform/vhci_hcd.0/status and in debug output. + * The socket pointer address is not used at the moment and it was + * made visible as a convenient way to find IP address from socket + * pointer address by looking up /proc/net/{tcp,tcp6}. As this opens + * a security hole, the change is made to use sockfd instead. */ out += sprintf(out, - "prt sta spd bus dev socket local_busid\n"); + "prt sta spd bus dev sockfd local_busid\n"); for (i = 0; i < VHCI_NPORTS; i++) { struct vhci_device *vdev = port_to_vdev(i); @@ -60,11 +64,11 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr, out += sprintf(out, "%03u %08x ", vdev->speed, vdev->devid); out += sprintf(out, "%16p ", vdev->ud.tcp_socket); + out += sprintf(out, "%06u", vdev->ud.sockfd); out += sprintf(out, "%s", dev_name(&vdev->udev->dev)); - } else { - out += sprintf(out, "000 000 000 0000000000000000 0-0"); - } + } else + out += sprintf(out, "000 000 000 000000 0-0"); out += sprintf(out, "\n"); spin_unlock(&vdev->ud.lock); @@ -223,6 +227,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, vdev->devid = devid; vdev->speed = speed; + vdev->ud.sockfd = sockfd; vdev->ud.tcp_socket = socket; vdev->ud.status = VDEV_ST_NOTASSIGNED; diff --git a/tools/usb/usbip/libsrc/vhci_driver.c b/tools/usb/usbip/libsrc/vhci_driver.c index ad9204773533..1274f326242c 100644 --- a/tools/usb/usbip/libsrc/vhci_driver.c +++ b/tools/usb/usbip/libsrc/vhci_driver.c @@ -55,12 +55,12 @@ static int parse_status(const char *value) while (*c != '\0') { int port, status, speed, devid; - unsigned long socket; + int sockfd; char lbusid[SYSFS_BUS_ID_SIZE]; - ret = sscanf(c, "%d %d %d %x %lx %31s\n", + ret = sscanf(c, "%d %d %d %x %u %31s\n", &port, &status, &speed, - &devid, &socket, lbusid); + &devid, &sockfd, lbusid); if (ret < 5) { dbg("sscanf failed: %d", ret); @@ -69,7 +69,7 @@ static int parse_status(const char *value) dbg("port %d status %d speed %d devid %x", port, status, speed, devid); - dbg("socket %lx lbusid %s", socket, lbusid); + dbg("sockfd %u lbusid %s", sockfd, lbusid); /* if a device is connected, look at it */ From 4493f49816acc5531ed31ffc600073330393cc61 Mon Sep 17 00:00:00 2001 From: Jonathan Dieter Date: Mon, 27 Feb 2017 10:31:04 +0200 Subject: [PATCH 058/131] usbip: Fix implicit fallthrough warning commit cfd6ed4537a9e938fa76facecd4b9cd65b6d1563 upstream. GCC 7 now warns when switch statements fall through implicitly, and with -Werror enabled in configure.ac, that makes these tools unbuildable. We fix this by notifying the compiler that this particular case statement is meant to fall through. Reviewed-by: Peter Senna Tschudin Signed-off-by: Jonathan Dieter Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/src/usbip.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/usb/usbip/src/usbip.c b/tools/usb/usbip/src/usbip.c index d7599d943529..73d8eee8130b 100644 --- a/tools/usb/usbip/src/usbip.c +++ b/tools/usb/usbip/src/usbip.c @@ -176,6 +176,8 @@ int main(int argc, char *argv[]) break; case '?': printf("usbip: invalid option\n"); + /* Terminate after printing error */ + /* FALLTHRU */ default: usbip_usage(); goto out; From 579a9885cfe40645bc62207f318d172c4cad7942 Mon Sep 17 00:00:00 2001 From: Jonathan Dieter Date: Mon, 27 Feb 2017 10:31:03 +0200 Subject: [PATCH 059/131] usbip: Fix potential format overflow in userspace tools commit e5dfa3f902b9a642ae8c6997d57d7c41e384a90b upstream. The usbip userspace tools call sprintf()/snprintf() and don't check for the return value which can lead the paths to overflow, truncating the final file in the path. More urgently, GCC 7 now warns that these aren't checked with -Wformat-overflow, and with -Werror enabled in configure.ac, that makes these tools unbuildable. This patch fixes these problems by replacing sprintf() with snprintf() in one place and adding checks for the return value of snprintf(). Reviewed-by: Peter Senna Tschudin Signed-off-by: Jonathan Dieter Acked-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- tools/usb/usbip/libsrc/usbip_common.c | 9 +++++++- tools/usb/usbip/libsrc/usbip_host_driver.c | 27 ++++++++++++++++++---- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/tools/usb/usbip/libsrc/usbip_common.c b/tools/usb/usbip/libsrc/usbip_common.c index ac73710473de..8000445ff884 100644 --- a/tools/usb/usbip/libsrc/usbip_common.c +++ b/tools/usb/usbip/libsrc/usbip_common.c @@ -215,9 +215,16 @@ int read_usb_interface(struct usbip_usb_device *udev, int i, struct usbip_usb_interface *uinf) { char busid[SYSFS_BUS_ID_SIZE]; + int size; struct udev_device *sif; - sprintf(busid, "%s:%d.%d", udev->busid, udev->bConfigurationValue, i); + size = snprintf(busid, sizeof(busid), "%s:%d.%d", + udev->busid, udev->bConfigurationValue, i); + if (size < 0 || (unsigned int)size >= sizeof(busid)) { + err("busid length %i >= %lu or < 0", size, + (unsigned long)sizeof(busid)); + return -1; + } sif = udev_device_new_from_subsystem_sysname(udev_context, "usb", busid); if (!sif) { diff --git a/tools/usb/usbip/libsrc/usbip_host_driver.c b/tools/usb/usbip/libsrc/usbip_host_driver.c index bef08d5c44e8..071b9ce99420 100644 --- a/tools/usb/usbip/libsrc/usbip_host_driver.c +++ b/tools/usb/usbip/libsrc/usbip_host_driver.c @@ -39,13 +39,19 @@ struct udev *udev_context; static int32_t read_attr_usbip_status(struct usbip_usb_device *udev) { char status_attr_path[SYSFS_PATH_MAX]; + int size; int fd; int length; char status; int value = 0; - snprintf(status_attr_path, SYSFS_PATH_MAX, "%s/usbip_status", - udev->path); + size = snprintf(status_attr_path, SYSFS_PATH_MAX, "%s/usbip_status", + udev->path); + if (size < 0 || (unsigned int)size >= sizeof(status_attr_path)) { + err("usbip_status path length %i >= %lu or < 0", size, + (unsigned long)sizeof(status_attr_path)); + return -1; + } fd = open(status_attr_path, O_RDONLY); if (fd < 0) { @@ -225,6 +231,7 @@ int usbip_host_export_device(struct usbip_exported_device *edev, int sockfd) { char attr_name[] = "usbip_sockfd"; char sockfd_attr_path[SYSFS_PATH_MAX]; + int size; char sockfd_buff[30]; int ret; @@ -244,10 +251,20 @@ int usbip_host_export_device(struct usbip_exported_device *edev, int sockfd) } /* only the first interface is true */ - snprintf(sockfd_attr_path, sizeof(sockfd_attr_path), "%s/%s", - edev->udev.path, attr_name); + size = snprintf(sockfd_attr_path, sizeof(sockfd_attr_path), "%s/%s", + edev->udev.path, attr_name); + if (size < 0 || (unsigned int)size >= sizeof(sockfd_attr_path)) { + err("exported device path length %i >= %lu or < 0", size, + (unsigned long)sizeof(sockfd_attr_path)); + return -1; + } - snprintf(sockfd_buff, sizeof(sockfd_buff), "%d\n", sockfd); + size = snprintf(sockfd_buff, sizeof(sockfd_buff), "%d\n", sockfd); + if (size < 0 || (unsigned int)size >= sizeof(sockfd_buff)) { + err("socket length %i >= %lu or < 0", size, + (unsigned long)sizeof(sockfd_buff)); + return -1; + } ret = write_sysfs_attribute(sockfd_attr_path, sockfd_buff, strlen(sockfd_buff)); From 20e3fa5dd5818b425b18528571e133034833df27 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 24 Jan 2018 02:31:19 +0000 Subject: [PATCH 060/131] x86/microcode/intel: Fix BDW late-loading revision check The backport of commit b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") to 4.4-stable deleted a "return true" statement. This bug is not present upstream or other stable branches. Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/intel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index b428a8174be1..ee011bd7934d 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -1005,6 +1005,7 @@ static bool is_blacklisted(unsigned int cpu) c->microcode < 0x0b000021) { pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); + return true; } return false; From 0dfd5fbcae5d40e5e58bc7c34305498e965bf1ef Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Jun 2016 17:19:27 -0700 Subject: [PATCH 061/131] x86/cpu/intel: Introduce macros for Intel family numbers commit 970442c599b22ccd644ebfe94d1d303bf6f87c05 upstream. Problem: We have a boatload of open-coded family-6 model numbers. Half of them have these model numbers in hex and the other half in decimal. This makes grepping for them tons of fun, if you were to try. Solution: Consolidate all the magic numbers. Put all the definitions in one header. The names here are closely derived from the comments describing the models from arch/x86/events/intel/core.c. We could easily make them shorter by doing things like s/SANDYBRIDGE/SNB/, but they seemed fine even with the longer versions to me. Do not take any of these names too literally, like "DESKTOP" or "MOBILE". These are all colloquial names and not precise descriptions of everywhere a given model will show up. Signed-off-by: Dave Hansen Cc: Adrian Hunter Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Darren Hart Cc: Dave Hansen Cc: Denys Vlasenko Cc: Doug Thompson Cc: Eduardo Valentin Cc: H. Peter Anvin Cc: Jacob Pan Cc: Kan Liang Cc: Len Brown Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rajneesh Bhardwaj Cc: Souvik Kumar Chakravarty Cc: Srinivas Pandruvada Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tony Luck Cc: Ulf Hansson Cc: Viresh Kumar Cc: Vishwanath Somayaji Cc: Zhang Rui Cc: jacob.jun.pan@intel.com Cc: linux-acpi@vger.kernel.org Cc: linux-edac@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: platform-driver-x86@vger.kernel.org Link: http://lkml.kernel.org/r/20160603001927.F2A7D828@viggo.jf.intel.com Signed-off-by: Ingo Molnar Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/intel-family.h | 68 +++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 arch/x86/include/asm/intel-family.h diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h new file mode 100644 index 000000000000..6999f7d01a0d --- /dev/null +++ b/arch/x86/include/asm/intel-family.h @@ -0,0 +1,68 @@ +#ifndef _ASM_X86_INTEL_FAMILY_H +#define _ASM_X86_INTEL_FAMILY_H + +/* + * "Big Core" Processors (Branded as Core, Xeon, etc...) + * + * The "_X" parts are generally the EP and EX Xeons, or the + * "Extreme" ones, like Broadwell-E. + * + * Things ending in "2" are usually because we have no better + * name for them. There's no processor called "WESTMERE2". + */ + +#define INTEL_FAM6_CORE_YONAH 0x0E +#define INTEL_FAM6_CORE2_MEROM 0x0F +#define INTEL_FAM6_CORE2_MEROM_L 0x16 +#define INTEL_FAM6_CORE2_PENRYN 0x17 +#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D + +#define INTEL_FAM6_NEHALEM 0x1E +#define INTEL_FAM6_NEHALEM_EP 0x1A +#define INTEL_FAM6_NEHALEM_EX 0x2E +#define INTEL_FAM6_WESTMERE 0x25 +#define INTEL_FAM6_WESTMERE2 0x1F +#define INTEL_FAM6_WESTMERE_EP 0x2C +#define INTEL_FAM6_WESTMERE_EX 0x2F + +#define INTEL_FAM6_SANDYBRIDGE 0x2A +#define INTEL_FAM6_SANDYBRIDGE_X 0x2D +#define INTEL_FAM6_IVYBRIDGE 0x3A +#define INTEL_FAM6_IVYBRIDGE_X 0x3E + +#define INTEL_FAM6_HASWELL_CORE 0x3C +#define INTEL_FAM6_HASWELL_X 0x3F +#define INTEL_FAM6_HASWELL_ULT 0x45 +#define INTEL_FAM6_HASWELL_GT3E 0x46 + +#define INTEL_FAM6_BROADWELL_CORE 0x3D +#define INTEL_FAM6_BROADWELL_XEON_D 0x56 +#define INTEL_FAM6_BROADWELL_GT3E 0x47 +#define INTEL_FAM6_BROADWELL_X 0x4F + +#define INTEL_FAM6_SKYLAKE_MOBILE 0x4E +#define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E +#define INTEL_FAM6_SKYLAKE_X 0x55 +#define INTEL_FAM6_KABYLAKE_MOBILE 0x8E +#define INTEL_FAM6_KABYLAKE_DESKTOP 0x9E + +/* "Small Core" Processors (Atom) */ + +#define INTEL_FAM6_ATOM_PINEVIEW 0x1C +#define INTEL_FAM6_ATOM_LINCROFT 0x26 +#define INTEL_FAM6_ATOM_PENWELL 0x27 +#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35 +#define INTEL_FAM6_ATOM_CEDARVIEW 0x36 +#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ +#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ +#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ +#define INTEL_FAM6_ATOM_MERRIFIELD1 0x4A /* Tangier */ +#define INTEL_FAM6_ATOM_MERRIFIELD2 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_GOLDMONT 0x5C +#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ + +/* Xeon Phi */ + +#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ + +#endif /* _ASM_X86_INTEL_FAMILY_H */ From 18bb117d1b7690181346e6365c6237b6ceaac4c4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 Jan 2018 17:49:25 +0000 Subject: [PATCH 062/131] x86/retpoline: Fill RSB on context switch for affected CPUs commit c995efd5a740d9cbafbf58bde4973e8b50b4d761 upstream. On context switch from a shallow call stack to a deeper one, as the CPU does 'ret' up the deeper side it may encounter RSB entries (predictions for where the 'ret' goes to) which were populated in userspace. This is problematic if neither SMEP nor KPTI (the latter of which marks userspace pages as NX for the kernel) are active, as malicious code in userspace may then be executed speculatively. Overwrite the CPU's return prediction stack with calls which are predicted to return to an infinite loop, to "capture" speculation if this happens. This is required both for retpoline, and also in conjunction with IBRS for !SMEP && !KPTI. On Skylake+ the problem is slightly different, and an *underflow* of the RSB may cause errant branch predictions to occur. So there it's not so much overwrite, as *filling* the RSB to attempt to prevent it getting empty. This is only a partial solution for Skylake+ since there are many other conditions which may result in the RSB becoming empty. The full solution on Skylake+ is to use IBRS, which will prevent the problem even when the RSB becomes empty. With IBRS, the RSB-stuffing will not be required on context switch. [ tglx: Added missing vendor check and slighty massaged comments and changelog ] [js] backport to 4.4 -- __switch_to_asm does not exist there, we have to patch the switch_to macros for both x86_32 and x86_64. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515779365-9032-1-git-send-email-dwmw@amazon.co.uk Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/switch_to.h | 38 +++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/bugs.c | 36 +++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 0fbc98568018..641f0f2c2982 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -199,6 +199,7 @@ #define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ #define X86_FEATURE_RETPOLINE ( 7*32+29) /* Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+30) /* AMD Retpoline mitigation for Spectre variant 2 */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 751bf4b7bf11..025ecfaba9c9 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_SWITCH_TO_H #define _ASM_X86_SWITCH_TO_H +#include + struct task_struct; /* one of the stranger aspects of C forward declarations */ __visible struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); @@ -24,6 +26,23 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ +#define __retpoline_fill_return_buffer \ + ALTERNATIVE("jmp 910f", \ + __stringify(__FILL_RETURN_BUFFER(%%ebx, RSB_CLEAR_LOOPS, %%esp)),\ + X86_FEATURE_RSB_CTXSW) \ + "910:\n\t" +#else +#define __retpoline_fill_return_buffer +#endif + /* * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. @@ -46,6 +65,7 @@ do { \ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ "pushl %[next_ip]\n\t" /* restore EIP */ \ __switch_canary \ + __retpoline_fill_return_buffer \ "jmp __switch_to\n" /* regparm call */ \ "1:\t" \ "popl %%ebp\n\t" /* restore EBP */ \ @@ -100,6 +120,23 @@ do { \ #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ +#define __retpoline_fill_return_buffer \ + ALTERNATIVE("jmp 910f", \ + __stringify(__FILL_RETURN_BUFFER(%%r12, RSB_CLEAR_LOOPS, %%rsp)),\ + X86_FEATURE_RSB_CTXSW) \ + "910:\n\t" +#else +#define __retpoline_fill_return_buffer +#endif + /* * There is no need to save or restore flags, because flags are always * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL @@ -112,6 +149,7 @@ do { \ "call __switch_to\n\t" \ "movq "__percpu_arg([current_task])",%%rsi\n\t" \ __switch_canary \ + __retpoline_fill_return_buffer \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ "movq %%rax,%%rdi\n\t" \ "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 49d25ddf0e9f..8cacf62ec458 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -22,6 +22,7 @@ #include #include #include +#include static void __init spectre_v2_select_mitigation(void); @@ -154,6 +155,23 @@ disable: return SPECTRE_V2_CMD_NONE; } +/* Check for Skylake-like CPUs (for RSB handling) */ +static bool __init is_skylake_era(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6) { + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: + return true; + } + } + return false; +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -212,6 +230,24 @@ retpoline_auto: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); + + /* + * If neither SMEP or KPTI are available, there is a risk of + * hitting userspace addresses in the RSB after a context switch + * from a shallow call stack to a deeper one. To prevent this fill + * the entire RSB, even when using IBRS. + * + * Skylake era CPUs have a separate issue with *underflow* of the + * RSB, when they will predict 'ret' targets from the generic BTB. + * The proper mitigation for this is IBRS. If IBRS is not supported + * or deactivated in favour of retpolines the RSB fill on context + * switch is required. + */ + if ((!boot_cpu_has(X86_FEATURE_KAISER) && + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Filling RSB on context switch\n"); + } } #undef pr_fmt From 1d00e3d9b7ed8fbf6729b7c5a8aae9f2711d2655 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 29 May 2017 16:24:03 +0200 Subject: [PATCH 063/131] sched/deadline: Use the revised wakeup rule for suspending constrained dl tasks commit 3effcb4247e74a51f5d8b775a1ee4abf87cc089a upstream. We have been facing some problems with self-suspending constrained deadline tasks. The main reason is that the original CBS was not designed for such sort of tasks. One problem reported by Xunlei Pang takes place when a task suspends, and then is awakened before the deadline, but so close to the deadline that its remaining runtime can cause the task to have an absolute density higher than allowed. In such situation, the original CBS assumes that the task is facing an early activation, and so it replenishes the task and set another deadline, one deadline in the future. This rule works fine for implicit deadline tasks. Moreover, it allows the system to adapt the period of a task in which the external event source suffered from a clock drift. However, this opens the window for bandwidth leakage for constrained deadline tasks. For instance, a task with the following parameters: runtime = 5 ms deadline = 7 ms [density] = 5 / 7 = 0.71 period = 1000 ms If the task runs for 1 ms, and then suspends for another 1ms, it will be awakened with the following parameters: remaining runtime = 4 laxity = 5 presenting a absolute density of 4 / 5 = 0.80. In this case, the original CBS would assume the task had an early wakeup. Then, CBS will reset the runtime, and the absolute deadline will be postponed by one relative deadline, allowing the task to run. The problem is that, if the task runs this pattern forever, it will keep receiving bandwidth, being able to run 1ms every 2ms. Following this behavior, the task would be able to run 500 ms in 1 sec. Thus running more than the 5 ms / 1 sec the admission control allowed it to run. Trying to address the self-suspending case, Luca Abeni, Giuseppe Lipari, and Juri Lelli [1] revisited the CBS in order to deal with self-suspending tasks. In the new approach, rather than replenishing/postponing the absolute deadline, the revised wakeup rule adjusts the remaining runtime, reducing it to fit into the allowed density. A revised version of the idea is: At a given time t, the maximum absolute density of a task cannot be higher than its relative density, that is: runtime / (deadline - t) <= dl_runtime / dl_deadline Knowing the laxity of a task (deadline - t), it is possible to move it to the other side of the equality, thus enabling to define max remaining runtime a task can use within the absolute deadline, without over-running the allowed density: runtime = (dl_runtime / dl_deadline) * (deadline - t) For instance, in our previous example, the task could still run: runtime = ( 5 / 7 ) * 5 runtime = 3.57 ms Without causing damage for other deadline tasks. It is note worthy that the laxity cannot be negative because that would cause a negative runtime. Thus, this patch depends on the patch: df8eac8cafce ("sched/deadline: Throttle a constrained deadline task activated after the deadline") Which throttles a constrained deadline task activated after the deadline. Finally, it is also possible to use the revised wakeup rule for all other tasks, but that would require some more discussions about pros and cons. [The main difference from the original commit is that the BW_SHIFT define was not present yet. As BW_SHIFT was introduced in a new feature, I just used the value (20), likewise we used to use before the #define. Other changes were required because of comments. - bistrot] Reported-by: Xunlei Pang Signed-off-by: Daniel Bristot de Oliveira [peterz: replaced dl_is_constrained with dl_is_implicit] Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Romulo Silva de Oliveira Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/5c800ab3a74a168a84ee5f3f84d12a02e11383be.1495803804.git.bristot@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Greg Kroah-Hartman --- include/linux/sched.h | 1 + kernel/sched/core.c | 2 + kernel/sched/deadline.c | 98 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 89 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index e887c8d6f395..90bea398e5e0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1313,6 +1313,7 @@ struct sched_dl_entity { u64 dl_deadline; /* relative deadline of each instance */ u64 dl_period; /* separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_deadline */ + u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d6b3d869592..e6d1173a2046 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2109,6 +2109,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_period = 0; dl_se->flags = 0; dl_se->dl_bw = 0; + dl_se->dl_density = 0; dl_se->dl_throttled = 0; dl_se->dl_new = 1; @@ -3647,6 +3648,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); /* * Changing the parameters of a task is 'tricky' and we're not doing diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6be2afd9bfd6..e12b0a4df891 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -480,13 +480,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, } /* - * When a -deadline entity is queued back on the runqueue, its runtime and - * deadline might need updating. + * Revised wakeup rule [1]: For self-suspending tasks, rather then + * re-initializing task's runtime and deadline, the revised wakeup + * rule adjusts the task's runtime to avoid the task to overrun its + * density. * - * The policy here is that we update the deadline of the entity only if: - * - the current deadline is in the past, - * - using the remaining runtime with the current deadline would make - * the entity exceed its bandwidth. + * Reasoning: a task may overrun the density if: + * runtime / (deadline - t) > dl_runtime / dl_deadline + * + * Therefore, runtime can be adjusted to: + * runtime = (dl_runtime / dl_deadline) * (deadline - t) + * + * In such way that runtime will be equal to the maximum density + * the task can use without breaking any rule. + * + * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant + * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24. + */ +static void +update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 laxity = dl_se->deadline - rq_clock(rq); + + /* + * If the task has deadline < period, and the deadline is in the past, + * it should already be throttled before this check. + * + * See update_dl_entity() comments for further details. + */ + WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq))); + + dl_se->runtime = (dl_se->dl_density * laxity) >> 20; +} + +/* + * Regarding the deadline, a task with implicit deadline has a relative + * deadline == relative period. A task with constrained deadline has a + * relative deadline <= relative period. + * + * We support constrained deadline tasks. However, there are some restrictions + * applied only for tasks which do not have an implicit deadline. See + * update_dl_entity() to know more about such restrictions. + * + * The dl_is_implicit() returns true if the task has an implicit deadline. + */ +static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_deadline == dl_se->dl_period; +} + +/* + * When a deadline entity is placed in the runqueue, its runtime and deadline + * might need to be updated. This is done by a CBS wake up rule. There are two + * different rules: 1) the original CBS; and 2) the Revisited CBS. + * + * When the task is starting a new period, the Original CBS is used. In this + * case, the runtime is replenished and a new absolute deadline is set. + * + * When a task is queued before the begin of the next period, using the + * remaining runtime and deadline could make the entity to overflow, see + * dl_entity_overflow() to find more about runtime overflow. When such case + * is detected, the runtime and deadline need to be updated. + * + * If the task has an implicit deadline, i.e., deadline == period, the Original + * CBS is applied. the runtime is replenished and a new absolute deadline is + * set, as in the previous cases. + * + * However, the Original CBS does not work properly for tasks with + * deadline < period, which are said to have a constrained deadline. By + * applying the Original CBS, a constrained deadline task would be able to run + * runtime/deadline in a period. With deadline < period, the task would + * overrun the runtime/period allowed bandwidth, breaking the admission test. + * + * In order to prevent this misbehave, the Revisited CBS is used for + * constrained deadline tasks when a runtime overflow is detected. In the + * Revisited CBS, rather than replenishing & setting a new absolute deadline, + * the remaining runtime of the task is reduced to avoid runtime overflow. + * Please refer to the comments update_dl_revised_wakeup() function to find + * more about the Revised CBS rule. */ static void update_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) @@ -505,6 +576,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + + if (unlikely(!dl_is_implicit(dl_se) && + !dl_time_before(dl_se->deadline, rq_clock(rq)) && + !dl_se->dl_boosted)){ + update_dl_revised_wakeup(dl_se, rq); + return; + } + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -991,11 +1070,6 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) __dequeue_dl_entity(dl_se); } -static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) -{ - return dl_se->dl_deadline < dl_se->dl_period; -} - static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *pi_task = rt_mutex_get_top_task(p); @@ -1027,7 +1101,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * If that is the case, the task will be throttled and * the replenishment timer will be set to the next period. */ - if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) + if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) dl_check_constrained_dl(&p->dl); /* From e7514af60542830a37cd8f2fc0419fc5e320d70c Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 16 Jan 2018 19:30:14 +0100 Subject: [PATCH 064/131] can: af_can: can_rcv(): replace WARN_ONCE by pr_warn_once commit 8cb68751c115d176ec851ca56ecfbb411568c9e8 upstream. If an invalid CAN frame is received, from a driver or from a tun interface, a Kernel warning is generated. This patch replaces the WARN_ONCE by a simple pr_warn_once, so that a kernel, bootet with panic_on_warn, does not panic. A printk seems to be more appropriate here. Reported-by: syzbot+4386709c0c1284dca827@syzkaller.appspotmail.com Suggested-by: Dmitry Vyukov Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Oliver Hartkopp Signed-off-by: Greg Kroah-Hartman --- net/can/af_can.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/can/af_can.c b/net/can/af_can.c index 928f58064098..924ad0513af9 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -722,13 +722,12 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(!net_eq(dev_net(dev), &init_net))) goto drop; - if (WARN_ONCE(dev->type != ARPHRD_CAN || - skb->len != CAN_MTU || - cfd->len > CAN_MAX_DLEN, - "PF_CAN: dropped non conform CAN skbuf: " - "dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len)) + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU || + cfd->len > CAN_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN skbuf: dev type %d, len %d, datalen %d\n", + dev->type, skb->len, cfd->len); goto drop; + } can_receive(skb, dev); return NET_RX_SUCCESS; From 08f39b7bcccb532a6b96fdeaf91ed08e4af7dda4 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 16 Jan 2018 19:30:14 +0100 Subject: [PATCH 065/131] can: af_can: canfd_rcv(): replace WARN_ONCE by pr_warn_once commit d4689846881d160a4d12a514e991a740bcb5d65a upstream. If an invalid CANFD frame is received, from a driver or from a tun interface, a Kernel warning is generated. This patch replaces the WARN_ONCE by a simple pr_warn_once, so that a kernel, bootet with panic_on_warn, does not panic. A printk seems to be more appropriate here. Reported-by: syzbot+e3b775f40babeff6e68b@syzkaller.appspotmail.com Suggested-by: Dmitry Vyukov Acked-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde Signed-off-by: Oliver Hartkopp Signed-off-by: Greg Kroah-Hartman --- net/can/af_can.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/can/af_can.c b/net/can/af_can.c index 924ad0513af9..c866e761651a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -745,13 +745,12 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(!net_eq(dev_net(dev), &init_net))) goto drop; - if (WARN_ONCE(dev->type != ARPHRD_CAN || - skb->len != CANFD_MTU || - cfd->len > CANFD_MAX_DLEN, - "PF_CAN: dropped non conform CAN FD skbuf: " - "dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len)) + if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU || + cfd->len > CANFD_MAX_DLEN)) { + pr_warn_once("PF_CAN: dropped non conform CAN FD skbuf: dev type %d, len %d, datalen %d\n", + dev->type, skb->len, cfd->len); goto drop; + } can_receive(skb, dev); return NET_RX_SUCCESS; From 0cb5ae4db47abc3985016aac6326ef98c533fe4d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 24 Jan 2016 20:08:52 -0600 Subject: [PATCH 066/131] PM / sleep: declare __tracedata symbols as char[] rather than char commit f97238373b8662a6d580e204df2e7bcbfa43e27a upstream. Accessing more than one byte from a symbol declared simply 'char' is undefined behavior, as reported by UBSAN: UBSAN: Undefined behaviour in drivers/base/power/trace.c:178:18 load of address ffffffff8203fc78 with insufficient space for an object of type 'char' Avoid this by declaring the symbols as arrays. Signed-off-by: Eric Biggers Signed-off-by: Rafael J. Wysocki Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/base/power/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index a311cfa4c5bd..a6975795e7f3 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -166,14 +166,14 @@ void generate_pm_trace(const void *tracedata, unsigned int user) } EXPORT_SYMBOL(generate_pm_trace); -extern char __tracedata_start, __tracedata_end; +extern char __tracedata_start[], __tracedata_end[]; static int show_file_hash(unsigned int value) { int match; char *tracedata; match = 0; - for (tracedata = &__tracedata_start ; tracedata < &__tracedata_end ; + for (tracedata = __tracedata_start ; tracedata < __tracedata_end ; tracedata += 2 + sizeof(unsigned long)) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); From 08b1cf4964d526cf3c2ea9e36ffd752752543ef3 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 13 Aug 2016 01:37:04 +0200 Subject: [PATCH 067/131] time: Avoid undefined behaviour in ktime_add_safe() commit 979515c5645830465739254abc1b1648ada41518 upstream. I ran into this: ================================================================================ UBSAN: Undefined behaviour in kernel/time/hrtimer.c:310:16 signed integer overflow: 9223372036854775807 + 50000 cannot be represented in type 'long long int' CPU: 2 PID: 4798 Comm: trinity-c2 Not tainted 4.8.0-rc1+ #91 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.3-0-ge2fc41e-prebuilt.qemu-project.org 04/01/2014 0000000000000000 ffff88010ce6fb88 ffffffff82344740 0000000041b58ab3 ffffffff84f97a20 ffffffff82344694 ffff88010ce6fbb0 ffff88010ce6fb60 000000000000c350 ffff88010ce6f968 dffffc0000000000 ffffffff857bc320 Call Trace: [] dump_stack+0xac/0xfc [] ? _atomic_dec_and_lock+0xc4/0xc4 [] ubsan_epilogue+0xd/0x8a [] handle_overflow+0x202/0x23d [] ? val_to_string.constprop.6+0x11e/0x11e [] ? timerqueue_add+0x151/0x410 [] ? hrtimer_start_range_ns+0x3b8/0x1380 [] ? memset+0x31/0x40 [] __ubsan_handle_add_overflow+0xe/0x10 [] hrtimer_nanosleep+0x5d9/0x790 [] ? hrtimer_init_sleeper+0x80/0x80 [] ? __might_sleep+0x5b/0x260 [] common_nsleep+0x20/0x30 [] SyS_clock_nanosleep+0x197/0x210 [] ? SyS_clock_getres+0x150/0x150 [] ? __this_cpu_preempt_check+0x13/0x20 [] ? __context_tracking_exit.part.3+0x30/0x1b0 [] ? SyS_clock_getres+0x150/0x150 [] do_syscall_64+0x1b3/0x4b0 [] entry_SYSCALL64_slow_path+0x25/0x25 ================================================================================ Add a new ktime_add_unsafe() helper which doesn't check for overflow, but doesn't throw a UBSAN warning when it does overflow either. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Signed-off-by: Vegard Nossum Signed-off-by: John Stultz Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- include/linux/ktime.h | 7 +++++++ kernel/time/hrtimer.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 2b6a204bd8d4..3ffc69ebe967 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -63,6 +63,13 @@ static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs) #define ktime_add(lhs, rhs) \ ({ (ktime_t){ .tv64 = (lhs).tv64 + (rhs).tv64 }; }) +/* + * Same as ktime_add(), but avoids undefined behaviour on overflow; however, + * this means that you must check the result for overflow yourself. + */ +#define ktime_add_unsafe(lhs, rhs) \ + ({ (ktime_t){ .tv64 = (u64) (lhs).tv64 + (rhs).tv64 }; }) + /* * Add a ktime_t variable and a scalar nanosecond value. * res = kt + nsval: diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 17f7bcff1e02..1dc94768b5a3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -312,7 +312,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns); */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { - ktime_t res = ktime_add(lhs, rhs); + ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can From 839003061aa9362f23866614862b49b57bfdd6ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Oct 2016 11:41:56 +0200 Subject: [PATCH 068/131] timers: Plug locking race vs. timer migration commit b831275a3553c32091222ac619cfddd73a5553fb upstream. Linus noticed that lock_timer_base() lacks a READ_ONCE() for accessing the timer flags. As a consequence the compiler is allowed to reload the flags between the initial check for TIMER_MIGRATION and the following timer base computation and the spin lock of the base. While this has not been observed (yet), we need to make sure that it never happens. Fixes: 0eeda71bc30d ("timer: Replace timer base by a cpu index") Reported-by: Linus Torvalds Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos Cc: Andrew Morton Cc: Peter Zijlstra Signed-off-by: Mike Galbraith Signed-off-by: Greg Kroah-Hartman --- kernel/time/timer.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 125407144c01..3d7588a2e97c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -764,8 +764,15 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, __acquires(timer->base->lock) { for (;;) { - u32 tf = timer->flags; struct tvec_base *base; + u32 tf; + + /* + * We need to use READ_ONCE() here, otherwise the compiler + * might re-read @tf between the check for TIMER_MIGRATING + * and spin_lock(). + */ + tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); From 29b73cace1b1cc2daf2556b3285d4e79f19bcb3b Mon Sep 17 00:00:00 2001 From: Janakarajan Natarajan Date: Tue, 25 Apr 2017 16:44:03 -0500 Subject: [PATCH 069/131] Prevent timer value 0 for MWAITX commit 88d879d29f9cc0de2d930b584285638cdada6625 upstream. Newer hardware has uncovered a bug in the software implementation of using MWAITX for the delay function. A value of 0 for the timer is meant to indicate that a timeout will not be used to exit MWAITX. On newer hardware this can result in MWAITX never returning, resulting in NMI soft lockup messages being printed. On older hardware, some of the other conditions under which MWAITX can exit masked this issue. The AMD APM does not currently document this and will be updated. Please refer to http://marc.info/?l=kvm&m=148950623231140 for information regarding NMI soft lockup messages on an AMD Ryzen 1800X. This has been root-caused as a 0 passed to MWAITX causing it to wait indefinitely. This change has the added benefit of avoiding the unnecessary setup of MONITORX/MWAITX when the delay value is zero. Signed-off-by: Janakarajan Natarajan Link: http://lkml.kernel.org/r/1493156643-29366-1-git-send-email-Janakarajan.Natarajan@amd.com Signed-off-by: Thomas Gleixner Signed-off-by: Davidlohr Bueso Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/delay.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index e912b2f6d36e..45772560aceb 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -93,6 +93,13 @@ static void delay_mwaitx(unsigned long __loops) { u64 start, end, delay, loops = __loops; + /* + * Timer value of 0 causes MWAITX to wait indefinitely, unless there + * is a store on the memory monitored by MONITORX. + */ + if (loops == 0) + return; + start = rdtsc_ordered(); for (;;) { From f31a8450c09327339939fe195707749c1d67016d Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 28 Oct 2016 09:45:28 +0100 Subject: [PATCH 070/131] drivers: base: cacheinfo: fix x86 with CONFIG_OF enabled commit fac51482577d5e05bbb0efa8d602a3c2111098bf upstream. With CONFIG_OF enabled on x86, we get the following error on boot: " Failed to find cpu0 device node Unable to detect cache hierarchy from DT for CPU 0 " and the cacheinfo fails to get populated in the corresponding sysfs entries. This is because cache_setup_of_node looks for of_node for setting up the shared cpu_map without checking that it's already populated in the architecture specific callback. In order to indicate that the shared cpu_map is already populated, this patch introduces a boolean `cpu_map_populated` in struct cpu_cacheinfo that can be used by the generic code to skip cache_shared_cpu_map_setup. This patch also sets that boolean for x86. Cc: Greg Kroah-Hartman Signed-off-by: Sudeep Holla Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 ++ drivers/base/cacheinfo.c | 3 +++ include/linux/cacheinfo.h | 1 + 3 files changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index e38d338a6447..b4ca91cf55b0 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -934,6 +934,8 @@ static int __populate_cache_leaves(unsigned int cpu) ci_leaf_init(this_leaf++, &id4_regs); __cache_cpumap_setup(cpu, idx, &id4_regs); } + this_cpu_ci->cpu_map_populated = true; + return 0; } diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index e9fd32e91668..ecde8957835a 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -106,6 +106,9 @@ static int cache_shared_cpu_map_setup(unsigned int cpu) unsigned int index; int ret; + if (this_cpu_ci->cpu_map_populated) + return 0; + ret = cache_setup_of_node(cpu); if (ret) return ret; diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 2189935075b4..a951fd10aaaa 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -71,6 +71,7 @@ struct cpu_cacheinfo { struct cacheinfo *info_list; unsigned int num_levels; unsigned int num_leaves; + bool cpu_map_populated; }; /* From a91cdfa754905b702a5841b8d2b895599d8fad72 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 28 Oct 2016 09:45:29 +0100 Subject: [PATCH 071/131] drivers: base: cacheinfo: fix boot error message when acpi is enabled commit 55877ef45fbd7f975d078426866b7d1a2435dcc3 upstream. ARM64 enables both CONFIG_OF and CONFIG_ACPI and the firmware can pass both ACPI tables and the device tree. Based on the kernel parameter, one of the two will be chosen. If acpi is enabled, then device tree is not unflattened. Currently ARM64 platforms report: " Failed to find cpu0 device node Unable to detect cache hierarchy from DT for CPU 0 " which is incorrect when booting with ACPI. Also latest ACPI v6.1 has no support for cache properties/hierarchy. This patch adds check for unflattened device tree and also returns as "not supported" if ACPI is runtime enabled. It also removes the reference to DT from the error message as the cache hierarchy can be detected from the firmware(OF/DT/ACPI) Cc: Greg Kroah-Hartman Signed-off-by: Sudeep Holla Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Greg Kroah-Hartman --- drivers/base/cacheinfo.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index ecde8957835a..70e13cf06ed0 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -16,6 +16,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ +#include #include #include #include @@ -104,12 +105,16 @@ static int cache_shared_cpu_map_setup(unsigned int cpu) struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *this_leaf, *sib_leaf; unsigned int index; - int ret; + int ret = 0; if (this_cpu_ci->cpu_map_populated) return 0; - ret = cache_setup_of_node(cpu); + if (of_have_populated_dt()) + ret = cache_setup_of_node(cpu); + else if (!acpi_disabled) + /* No cache property/hierarchy support yet in ACPI */ + ret = -ENOTSUPP; if (ret) return ret; @@ -206,8 +211,7 @@ static int detect_cache_attributes(unsigned int cpu) */ ret = cache_shared_cpu_map_setup(cpu); if (ret) { - pr_warn("Unable to detect cache hierarchy from DT for CPU %d\n", - cpu); + pr_warn("Unable to detect cache hierarchy for CPU %d\n", cpu); goto free_ci; } return 0; From 12de6baf0637e2b0067bf5d71c3a7af1ec5b4ff1 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 27 Jan 2016 09:32:05 -0800 Subject: [PATCH 072/131] PCI: layerscape: Add "fsl,ls2085a-pcie" compatible ID commit dbae40b76abef2f8a7e7bf1701f77df9e73def48 upstream. The Layerscape PCI host driver must recognize ls2085a compatible when using firmware with ls2085a compatible property, otherwise the PCI bus won't be detected even though ls2085a compatible is included by the dts. Signed-off-by: Yang Shi Signed-off-by: Bjorn Helgaas Signed-off-by: Matthias Brugger Signed-off-by: Greg Kroah-Hartman --- drivers/pci/host/pci-layerscape.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/host/pci-layerscape.c b/drivers/pci/host/pci-layerscape.c index 3923bed93c7e..c40d8b2ce330 100644 --- a/drivers/pci/host/pci-layerscape.c +++ b/drivers/pci/host/pci-layerscape.c @@ -203,6 +203,7 @@ static const struct of_device_id ls_pcie_of_match[] = { { .compatible = "fsl,ls1021a-pcie", .data = &ls1021_drvdata }, { .compatible = "fsl,ls1043a-pcie", .data = &ls1043_drvdata }, { .compatible = "fsl,ls2080a-pcie", .data = &ls2080_drvdata }, + { .compatible = "fsl,ls2085a-pcie", .data = &ls2080_drvdata }, { }, }; MODULE_DEVICE_TABLE(of, ls_pcie_of_match); From 5ee8fef15d0affce9ff482dd99ff33a0578c2368 Mon Sep 17 00:00:00 2001 From: Minghuan Lian Date: Mon, 29 Feb 2016 17:24:15 -0600 Subject: [PATCH 073/131] PCI: layerscape: Fix MSG TLP drop setting commit 1195c103f6c98d9ff381cac3a8760d4f8a133627 upstream. Some kinds of Layerscape PCIe controllers will forward the received message TLPs to system application address space, which could corrupt system memory or lead to a system hang. Enable MSG_DROP to fix this issue. Signed-off-by: Minghuan Lian Signed-off-by: Bjorn Helgaas Signed-off-by: Matthias Brugger Signed-off-by: Greg Kroah-Hartman --- drivers/pci/host/pci-layerscape.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/pci/host/pci-layerscape.c b/drivers/pci/host/pci-layerscape.c index c40d8b2ce330..a21e229d95e0 100644 --- a/drivers/pci/host/pci-layerscape.c +++ b/drivers/pci/host/pci-layerscape.c @@ -77,6 +77,16 @@ static void ls_pcie_fix_class(struct ls_pcie *pcie) iowrite16(PCI_CLASS_BRIDGE_PCI, pcie->dbi + PCI_CLASS_DEVICE); } +/* Drop MSG TLP except for Vendor MSG */ +static void ls_pcie_drop_msg_tlp(struct ls_pcie *pcie) +{ + u32 val; + + val = ioread32(pcie->dbi + PCIE_STRFMR1); + val &= 0xDFFFFFFF; + iowrite32(val, pcie->dbi + PCIE_STRFMR1); +} + static int ls1021_pcie_link_up(struct pcie_port *pp) { u32 state; @@ -97,7 +107,7 @@ static int ls1021_pcie_link_up(struct pcie_port *pp) static void ls1021_pcie_host_init(struct pcie_port *pp) { struct ls_pcie *pcie = to_ls_pcie(pp); - u32 val, index[2]; + u32 index[2]; pcie->scfg = syscon_regmap_lookup_by_phandle(pp->dev->of_node, "fsl,pcie-scfg"); @@ -116,13 +126,7 @@ static void ls1021_pcie_host_init(struct pcie_port *pp) dw_pcie_setup_rc(pp); - /* - * LS1021A Workaround for internal TKT228622 - * to fix the INTx hang issue - */ - val = ioread32(pcie->dbi + PCIE_STRFMR1); - val &= 0xffff; - iowrite32(val, pcie->dbi + PCIE_STRFMR1); + ls_pcie_drop_msg_tlp(pcie); } static int ls_pcie_link_up(struct pcie_port *pp) @@ -147,6 +151,7 @@ static void ls_pcie_host_init(struct pcie_port *pp) iowrite32(1, pcie->dbi + PCIE_DBI_RO_WR_EN); ls_pcie_fix_class(pcie); ls_pcie_clear_multifunction(pcie); + ls_pcie_drop_msg_tlp(pcie); iowrite32(0, pcie->dbi + PCIE_DBI_RO_WR_EN); } From 0fd49331f9c1fee00ad9ef125faea9d14d7c6a59 Mon Sep 17 00:00:00 2001 From: yangbo lu Date: Wed, 25 Nov 2015 10:05:37 +0800 Subject: [PATCH 074/131] mmc: sdhci-of-esdhc: add/remove some quirks according to vendor version commit 1ef5e49e46b919052474d9b54a15debc79ff0133 upstream. A previous patch had removed esdhc_of_platform_init() by mistake. static void esdhc_of_platform_init(struct sdhci_host *host) { u32 vvn; vvn = in_be32(host->ioaddr + SDHCI_SLOT_INT_STATUS); vvn = (vvn & SDHCI_VENDOR_VER_MASK) >> SDHCI_VENDOR_VER_SHIFT; if (vvn == VENDOR_V_22) host->quirks2 |= SDHCI_QUIRK2_HOST_NO_CMD23; if (vvn > VENDOR_V_22) host->quirks &= ~SDHCI_QUIRK_NO_BUSY_IRQ; } This patch is used to fix it by add/remove some quirks according to verdor version in probe. Signed-off-by: Yangbo Lu Fixes: f4932cfd22f1 ("mmc: sdhci-of-esdhc: support both BE and LE host controller") Signed-off-by: Ulf Hansson Signed-off-by: Matthias Brugger Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci-of-esdhc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index 90e94a028a49..83b1226471c1 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -584,6 +584,8 @@ static int sdhci_esdhc_probe(struct platform_device *pdev) { struct sdhci_host *host; struct device_node *np; + struct sdhci_pltfm_host *pltfm_host; + struct sdhci_esdhc *esdhc; int ret; np = pdev->dev.of_node; @@ -600,6 +602,14 @@ static int sdhci_esdhc_probe(struct platform_device *pdev) sdhci_get_of_property(pdev); + pltfm_host = sdhci_priv(host); + esdhc = pltfm_host->priv; + if (esdhc->vendor_ver == VENDOR_V_22) + host->quirks2 |= SDHCI_QUIRK2_HOST_NO_CMD23; + + if (esdhc->vendor_ver > VENDOR_V_22) + host->quirks &= ~SDHCI_QUIRK_NO_BUSY_IRQ; + if (of_device_is_compatible(np, "fsl,p5040-esdhc") || of_device_is_compatible(np, "fsl,p5020-esdhc") || of_device_is_compatible(np, "fsl,p4080-esdhc") || From 13757fc50433b6b6f68c21bd88a094d56514cb5b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 11 Oct 2016 13:51:14 -0700 Subject: [PATCH 075/131] fs/select: add vmalloc fallback for select(2) commit 2d19309cf86883f634a4f8ec55a54bda87db19bf upstream. The select(2) syscall performs a kmalloc(size, GFP_KERNEL) where size grows with the number of fds passed. We had a customer report page allocation failures of order-4 for this allocation. This is a costly order, so it might easily fail, as the VM expects such allocation to have a lower-order fallback. Such trivial fallback is vmalloc(), as the memory doesn't have to be physically contiguous and the allocation is temporary for the duration of the syscall only. There were some concerns, whether this would have negative impact on the system by exposing vmalloc() to userspace. Although an excessive use of vmalloc can cause some system wide performance issues - TLB flushes etc. - a large order allocation is not for free either and an excessive reclaim/compaction can have a similar effect. Also note that the size is effectively limited by RLIMIT_NOFILE which defaults to 1024 on the systems I checked. That means the bitmaps will fit well within single page and thus the vmalloc() fallback could be only excercised for processes where root allows a higher limit. Note that the poll(2) syscall seems to use a linked list of order-0 pages, so it doesn't need this kind of fallback. [eric.dumazet@gmail.com: fix failure path logic] [akpm@linux-foundation.org: use proper type for size] Link: http://lkml.kernel.org/r/20160927084536.5923-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Alexander Viro Cc: Eric Dumazet Cc: David Laight Cc: Hillf Danton Cc: Nicholas Piggin Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/select.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/select.c b/fs/select.c index 015547330e88..f4dd55fc638c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -550,7 +551,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set_bits fds; void *bits; int ret, max_fds; - unsigned int size; + size_t size, alloc_size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; @@ -577,7 +578,14 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, if (size > sizeof(stack_fds) / 6) { /* Not enough space in on-stack array; must use kmalloc */ ret = -ENOMEM; - bits = kmalloc(6 * size, GFP_KERNEL); + if (size > (SIZE_MAX / 6)) + goto out_nofds; + + alloc_size = 6 * size; + bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN); + if (!bits && alloc_size > PAGE_SIZE) + bits = vmalloc(alloc_size); + if (!bits) goto out_nofds; } @@ -614,7 +622,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, out: if (bits != stack_fds) - kfree(bits); + kvfree(bits); out_nofds: return ret; } From 7175e56fa7076ea176ffa6dd41ddad288294ddb5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:49:51 -0700 Subject: [PATCH 076/131] mm/mmap.c: do not blow on PROT_NONE MAP_FIXED holes in the stack commit 561b5e0709e4a248c67d024d4d94b6e31e3edf2f upstream. Commit 1be7107fbe18 ("mm: larger stack guard gap, between vmas") has introduced a regression in some rust and Java environments which are trying to implement their own stack guard page. They are punching a new MAP_FIXED mapping inside the existing stack Vma. This will confuse expand_{downwards,upwards} into thinking that the stack expansion would in fact get us too close to an existing non-stack vma which is a correct behavior wrt safety. It is a real regression on the other hand. Let's work around the problem by considering PROT_NONE mapping as a part of the stack. This is a gros hack but overflowing to such a mapping would trap anyway an we only can hope that usespace knows what it is doing and handle it propely. Fixes: 1be7107fbe18 ("mm: larger stack guard gap, between vmas") Link: http://lkml.kernel.org/r/20170705182849.GA18027@dhcp22.suse.cz Signed-off-by: Michal Hocko Debugged-by: Vlastimil Babka Cc: Ben Hutchings Cc: Willy Tarreau Cc: Oleg Nesterov Cc: Rik van Riel Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/mmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index eaa460ddcaf9..cc84b97ca250 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2188,7 +2188,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) gap_addr = TASK_SIZE; next = vma->vm_next; - if (next && next->vm_start < gap_addr) { + if (next && next->vm_start < gap_addr && + (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ @@ -2273,7 +2274,8 @@ int expand_downwards(struct vm_area_struct *vma, if (gap_addr > address) return -ENOMEM; prev = vma->vm_prev; - if (prev && prev->vm_end > gap_addr) { + if (prev && prev->vm_end > gap_addr && + (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { if (!(prev->vm_flags & VM_GROWSDOWN)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ From 361376a53481a7e042a1dd320b6bdbde8dc1ba48 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 12 May 2017 15:46:26 -0700 Subject: [PATCH 077/131] hwpoison, memcg: forcibly uncharge LRU pages commit 18365225f0440d09708ad9daade2ec11275c3df9 upstream. Laurent Dufour has noticed that hwpoinsoned pages are kept charged. In his particular case he has hit a bad_page("page still charged to cgroup") when onlining a hwpoison page. While this looks like something that shouldn't happen in the first place because onlining hwpages and returning them to the page allocator makes only little sense it shows a real problem. hwpoison pages do not get freed usually so we do not uncharge them (at least not since commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API")). Each charge pins memcg (since e8ea14cc6ead ("mm: memcontrol: take a css reference for each charged page")) as well and so the mem_cgroup and the associated state will never go away. Fix this leak by forcibly uncharging a LRU hwpoisoned page in delete_from_lru_cache(). We also have to tweak uncharge_list because it cannot rely on zero ref count for these pages. [akpm@linux-foundation.org: coding-style fixes] Fixes: 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") Link: http://lkml.kernel.org/r/20170502185507.GB19165@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Laurent Dufour Tested-by: Laurent Dufour Reviewed-by: Balbir Singh Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 2 +- mm/memory-failure.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e25b93a4267d..55a9facb8e8d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5576,7 +5576,7 @@ static void uncharge_list(struct list_head *page_list) next = page->lru.next; VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page); if (!page->mem_cgroup) continue; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 091fe9b06663..92a647957f91 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -539,6 +539,13 @@ static int delete_from_lru_cache(struct page *p) */ ClearPageActive(p); ClearPageUnevictable(p); + + /* + * Poisoned page might never drop its ref count to 0 so we have + * to uncharge it manually from its memcg. + */ + mem_cgroup_uncharge(p); + /* * drop the page count elevated by isolate_lru_page() */ From 1ab0bb9834766841e0119d472c78077c459654ac Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Mon, 10 Jul 2017 15:49:44 -0700 Subject: [PATCH 078/131] cma: fix calculation of aligned offset commit e048cb32f69038aa1c8f11e5c1b331be4181659d upstream. The align_offset parameter is used by bitmap_find_next_zero_area_off() to represent the offset of map's base from the previous alignment boundary; the function ensures that the returned index, plus the align_offset, honors the specified align_mask. The logic introduced by commit b5be83e308f7 ("mm: cma: align to physical address, not CMA region position") has the cma driver calculate the offset to the *next* alignment boundary. In most cases, the base alignment is greater than that specified when making allocations, resulting in a zero offset whether we align up or down. In the example given with the commit, the base alignment (8MB) was half the requested alignment (16MB) so the math also happened to work since the offset is 8MB in both directions. However, when requesting allocations with an alignment greater than twice that of the base, the returned index would not be correctly aligned. Also, the align_order arguments of cma_bitmap_aligned_mask() and cma_bitmap_aligned_offset() should not be negative so the argument type was made unsigned. Fixes: b5be83e308f7 ("mm: cma: align to physical address, not CMA region position") Link: http://lkml.kernel.org/r/20170628170742.2895-1-opendmb@gmail.com Signed-off-by: Angus Clark Signed-off-by: Doug Berger Acked-by: Gregory Fong Cc: Doug Berger Cc: Angus Clark Cc: Laura Abbott Cc: Vlastimil Babka Cc: Greg Kroah-Hartman Cc: Lucas Stach Cc: Catalin Marinas Cc: Shiraz Hashim Cc: Jaewon Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Vlastimil Babka Signed-off-by: Greg Kroah-Hartman --- mm/cma.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index bd0e1412475e..43f4a122e969 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -54,7 +54,7 @@ unsigned long cma_get_size(const struct cma *cma) } static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, - int align_order) + unsigned int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -62,17 +62,14 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, } /* - * Find a PFN aligned to the specified order and return an offset represented in - * order_per_bits. + * Find the offset of the base PFN from the specified align_order. + * The value returned is represented in order_per_bits. */ static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, - int align_order) + unsigned int align_order) { - if (align_order <= cma->order_per_bit) - return 0; - - return (ALIGN(cma->base_pfn, (1UL << align_order)) - - cma->base_pfn) >> cma->order_per_bit; + return (cma->base_pfn & ((1UL << align_order) - 1)) + >> cma->order_per_bit; } static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, From 90ca50c56bfba6c670c4b32cc062aa4c2b850938 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 15 Nov 2017 17:38:30 -0800 Subject: [PATCH 079/131] mm, page_alloc: fix potential false positive in __zone_watermark_ok commit b050e3769c6b4013bb937e879fc43bf1847ee819 upstream. Since commit 97a16fc82a7c ("mm, page_alloc: only enforce watermarks for order-0 allocations"), __zone_watermark_ok() check for high-order allocations will shortcut per-migratetype free list checks for ALLOC_HARDER allocations, and return true as long as there's free page of any migratetype. The intention is that ALLOC_HARDER can allocate from MIGRATE_HIGHATOMIC free lists, while normal allocations can't. However, as a side effect, the watermark check will then also return true when there are pages only on the MIGRATE_ISOLATE list, or (prior to CMA conversion to ZONE_MOVABLE) on the MIGRATE_CMA list. Since the allocation cannot actually obtain isolated pages, and might not be able to obtain CMA pages, this can result in a false positive. The condition should be rare and perhaps the outcome is not a fatal one. Still, it's better if the watermark check is correct. There also shouldn't be a performance tradeoff here. Link: http://lkml.kernel.org/r/20171102125001.23708-1-vbabka@suse.cz Fixes: 97a16fc82a7c ("mm, page_alloc: only enforce watermarks for order-0 allocations") Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Rik van Riel Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3c70f03d91ec..a4c9cd80c7b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2468,9 +2468,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, if (!area->nr_free) continue; - if (alloc_harder) - return true; - for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { if (!list_empty(&area->free_list[mt])) return true; @@ -2482,6 +2479,9 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, return true; } #endif + if (alloc_harder && + !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + return true; } return false; } From bab93a6196224c862c679a1b1e97be8ceef926a5 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 14 Dec 2016 15:06:07 -0800 Subject: [PATCH 080/131] ipc: msg, make msgrcv work with LONG_MIN commit 999898355e08ae3b92dfd0a08db706e0c6703d30 upstream. When LONG_MIN is passed to msgrcv, one would expect to recieve any message. But convert_mode does *msgtyp = -*msgtyp and -LONG_MIN is undefined. In particular, with my gcc -LONG_MIN produces -LONG_MIN again. So handle this case properly by assigning LONG_MAX to *msgtyp if LONG_MIN was specified as msgtyp to msgrcv. This code: long msg[] = { 100, 200 }; int m = msgget(IPC_PRIVATE, IPC_CREAT | 0644); msgsnd(m, &msg, sizeof(msg), 0); msgrcv(m, &msg, sizeof(msg), LONG_MIN, 0); produces currently nothing: msgget(IPC_PRIVATE, IPC_CREAT|0644) = 65538 msgsnd(65538, {100, "\310\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"}, 16, 0) = 0 msgrcv(65538, ... Except a UBSAN warning: UBSAN: Undefined behaviour in ipc/msg.c:745:13 negation of -9223372036854775808 cannot be represented in type 'long int': With the patch, I see what I expect: msgget(IPC_PRIVATE, IPC_CREAT|0644) = 0 msgsnd(0, {100, "\310\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"}, 16, 0) = 0 msgrcv(0, {100, "\310\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"}, 16, -9223372036854775808, 0) = 16 Link: http://lkml.kernel.org/r/20161024082633.10148-1-jslaby@suse.cz Signed-off-by: Jiri Slaby Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- ipc/msg.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ipc/msg.c b/ipc/msg.c index c6521c205cb4..f993f441f852 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -742,7 +742,10 @@ static inline int convert_mode(long *msgtyp, int msgflg) if (*msgtyp == 0) return SEARCH_ANY; if (*msgtyp < 0) { - *msgtyp = -*msgtyp; + if (*msgtyp == LONG_MIN) /* -LONG_MIN is undefined */ + *msgtyp = LONG_MAX; + else + *msgtyp = -*msgtyp; return SEARCH_LESSEQUAL; } if (msgflg & MSG_EXCEPT) From 699a6cc7dd321113c99e06c6c997e8d15557439c Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 8 Jun 2016 14:59:52 +0800 Subject: [PATCH 081/131] x86/ioapic: Fix incorrect pointers in ioapic_setup_resources() commit 9d98bcec731756b8688b59ec998707924d716d7b upstream. On a 4-socket Brickland system, hot-removing one ioapic is fine. Hot-removing the 2nd one causes panic in mp_unregister_ioapic() while calling release_resource(). It is because the iomem_res pointer has already been released when removing the first ioapic. To explain the use of &res[num] here: res is assigned to ioapic_resources, and later in ioapic_insert_resources() we do: struct resource *r = ioapic_resources; for_each_ioapic(i) { insert_resource(&iomem_resource, r); r++; } Here 'r' is treated as an arry of 'struct resource', and the r++ ensures that each element of the array is inserted separately. Thus we should call release_resouce() on each element at &res[num]. Fix it by assigning the correct pointers to ioapics[i].iomem_res in ioapic_setup_resources(). Signed-off-by: Rui Wang Signed-off-by: Thomas Gleixner Cc: tony.luck@intel.com Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: linux-acpi@vger.kernel.org Cc: bhelgaas@google.com Link: http://lkml.kernel.org/r/1465369193-4816-3-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar Acked-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fc91c98bee01..fd945099fc95 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2592,8 +2592,8 @@ static struct resource * __init ioapic_setup_resources(void) res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY; snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; + ioapics[i].iomem_res = &res[num]; num++; - ioapics[i].iomem_res = res; } ioapic_resources = res; From 4d48ba75ec7877939e9f69ef66894f7f9ce4cf38 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 2 Jun 2016 01:57:50 +0200 Subject: [PATCH 082/131] ACPI / processor: Avoid reserving IO regions too early commit 86314751c7945fa0c67f459beeda2e7c610ca429 upstream. Roland Dreier reports that one of his systems cannot boot because of the changes made by commit ac212b6980d8 (ACPI / processor: Use common hotplug infrastructure). The problematic part of it is the request_region() call in acpi_processor_get_info() that used to run at module init time before the above commit and now it runs much earlier. Unfortunately, the region(s) reserved by it fall into a range the PCI subsystem attempts to reserve for AHCI IO BARs. As a result, the PCI reservation fails and AHCI doesn't work, while previously the PCI reservation would be made before acpi_processor_get_info() and it would succeed. That request_region() call, however, was overlooked by commit ac212b6980d8, as it is not necessary for the enumeration of the processors. It only is needed when the ACPI processor driver actually attempts to handle them which doesn't happen before loading the ACPI processor driver module. Therefore that call should have been moved from acpi_processor_get_info() into that module. Address the problem by moving the request_region() call in question out of acpi_processor_get_info() and use the observation that the region reserved by it is only needed if the FADT-based CPU throttling method is going to be used, which means that it should be sufficient to invoke it from acpi_processor_get_throttling_fadt(). Fixes: ac212b6980d8 (ACPI / processor: Use common hotplug infrastructure) Reported-by: Roland Dreier Tested-by: Roland Dreier Signed-off-by: Rafael J. Wysocki Acked-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/acpi_processor.c | 9 --------- drivers/acpi/processor_throttling.c | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 9f77943653fb..b63a173786d5 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -331,15 +331,6 @@ static int acpi_processor_get_info(struct acpi_device *device) pr->throttling.duty_width = acpi_gbl_FADT.duty_width; pr->pblk = object.processor.pblk_address; - - /* - * We don't care about error returns - we just try to mark - * these reserved so that nobody else is confused into thinking - * that this region might be unused.. - * - * (In particular, allocating the IO range for Cardbus) - */ - request_region(pr->throttling.address, 6, "ACPI CPU throttle"); } /* diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index f170d746336d..c72e64893d03 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -676,6 +676,15 @@ static int acpi_processor_get_throttling_fadt(struct acpi_processor *pr) if (!pr->flags.throttling) return -ENODEV; + /* + * We don't care about error returns - we just try to mark + * these reserved so that nobody else is confused into thinking + * that this region might be unused.. + * + * (In particular, allocating the IO range for Cardbus) + */ + request_region(pr->throttling.address, 6, "ACPI CPU throttle"); + pr->throttling.state = 0; duty_mask = pr->throttling.state_count - 1; From 1fe277d48f998bd928f1b6ddcbe72f2a34b6f5ad Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 30 Dec 2016 02:27:31 +0100 Subject: [PATCH 083/131] ACPI / scan: Prefer devices without _HID/_CID for _ADR matching commit c2a6bbaf0c5f90463a7011a295bbdb7e33c80b51 upstream. The way acpi_find_child_device() works currently is that, if there are two (or more) devices with the same _ADR value in the same namespace scope (which is not specifically allowed by the spec and the OS behavior in that case is not defined), the first one of them found to be present (with the help of _STA) will be returned. This covers the majority of cases, but is not sufficient if some of the devices in question have a _HID (or _CID) returning some valid ACPI/PNP device IDs (which is disallowed by the spec) and the ASL writers' expectation appears to be that the OS will match devices without a valid ACPI/PNP device ID against a given bus address first. To cover this special case as well, modify find_child_checks() to prefer devices without ACPI/PNP device IDs over devices that have them. Suggested-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki Tested-by: Hans de Goede Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/glue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index 73c9c7fa9001..f06317d6fc38 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -99,13 +99,13 @@ static int find_child_checks(struct acpi_device *adev, bool check_children) return -ENODEV; /* - * If the device has a _HID (or _CID) returning a valid ACPI/PNP - * device ID, it is better to make it look less attractive here, so that - * the other device with the same _ADR value (that may not have a valid - * device ID) can be matched going forward. [This means a second spec - * violation in a row, so whatever we do here is best effort anyway.] + * If the device has a _HID returning a valid ACPI/PNP device ID, it is + * better to make it look less attractive here, so that the other device + * with the same _ADR value (that may not have a valid device ID) can be + * matched going forward. [This means a second spec violation in a row, + * so whatever we do here is best effort anyway.] */ - return sta_present && list_empty(&adev->pnp.ids) ? + return sta_present && !adev->pnp.type.platform_id ? FIND_CHILD_MAX_SCORE : FIND_CHILD_MIN_SCORE; } From 4c19b00e5588828f0d3198b926efade766dcf2c8 Mon Sep 17 00:00:00 2001 From: Seunghun Han Date: Wed, 26 Apr 2017 16:18:08 +0800 Subject: [PATCH 084/131] ACPICA: Namespace: fix operand cache leak commit 3b2d69114fefa474fca542e51119036dceb4aa6f upstream. ACPICA commit a23325b2e583556eae88ed3f764e457786bf4df6 I found some ACPI operand cache leaks in ACPI early abort cases. Boot log of ACPI operand cache leak is as follows: >[ 0.174332] ACPI: Added _OSI(Module Device) >[ 0.175504] ACPI: Added _OSI(Processor Device) >[ 0.176010] ACPI: Added _OSI(3.0 _SCP Extensions) >[ 0.177032] ACPI: Added _OSI(Processor Aggregator Device) >[ 0.178284] ACPI: SCI (IRQ16705) allocation failed >[ 0.179352] ACPI Exception: AE_NOT_ACQUIRED, Unable to install System Control Interrupt handler (20160930/evevent-131) >[ 0.180008] ACPI: Unable to start the ACPI Interpreter >[ 0.181125] ACPI Error: Could not remove SCI handler (20160930/evmisc-281) >[ 0.184068] kmem_cache_destroy Acpi-Operand: Slab cache still has objects >[ 0.185358] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.10.0-rc3 #2 >[ 0.186820] Hardware name: innotek gmb_h virtual_box/virtual_box, BIOS virtual_box 12/01/2006 >[ 0.188000] Call Trace: >[ 0.188000] ? dump_stack+0x5c/0x7d >[ 0.188000] ? kmem_cache_destroy+0x224/0x230 >[ 0.188000] ? acpi_sleep_proc_init+0x22/0x22 >[ 0.188000] ? acpi_os_delete_cache+0xa/0xd >[ 0.188000] ? acpi_ut_delete_caches+0x3f/0x7b >[ 0.188000] ? acpi_terminate+0x5/0xf >[ 0.188000] ? acpi_init+0x288/0x32e >[ 0.188000] ? __class_create+0x4c/0x80 >[ 0.188000] ? video_setup+0x7a/0x7a >[ 0.188000] ? do_one_initcall+0x4e/0x1b0 >[ 0.188000] ? kernel_init_freeable+0x194/0x21a >[ 0.188000] ? rest_init+0x80/0x80 >[ 0.188000] ? kernel_init+0xa/0x100 >[ 0.188000] ? ret_from_fork+0x25/0x30 When early abort is occurred due to invalid ACPI information, Linux kernel terminates ACPI by calling acpi_terminate() function. The function calls acpi_ns_terminate() function to delete namespace data and ACPI operand cache (acpi_gbl_module_code_list). But the deletion code in acpi_ns_terminate() function is wrapped in ACPI_EXEC_APP definition, therefore the code is only executed when the definition exists. If the define doesn't exist, ACPI operand cache (acpi_gbl_module_code_list) is leaked, and stack dump is shown in kernel log. This causes a security threat because the old kernel (<= 4.9) shows memory locations of kernel functions in stack dump, therefore kernel ASLR can be neutralized. To fix ACPI operand leak for enhancing security, I made a patch which removes the ACPI_EXEC_APP define in acpi_ns_terminate() function for executing the deletion code unconditionally. Link: https://github.com/acpica/acpica/commit/a23325b2 Signed-off-by: Seunghun Han Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki Acked-by: Lee, Chun-Yi Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/acpica/nsutils.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/acpi/acpica/nsutils.c b/drivers/acpi/acpica/nsutils.c index de325ae04ce1..3b3c5b90bd20 100644 --- a/drivers/acpi/acpica/nsutils.c +++ b/drivers/acpi/acpica/nsutils.c @@ -593,25 +593,20 @@ struct acpi_namespace_node *acpi_ns_validate_handle(acpi_handle handle) void acpi_ns_terminate(void) { acpi_status status; + union acpi_operand_object *prev; + union acpi_operand_object *next; ACPI_FUNCTION_TRACE(ns_terminate); -#ifdef ACPI_EXEC_APP - { - union acpi_operand_object *prev; - union acpi_operand_object *next; + /* Delete any module-level code blocks */ - /* Delete any module-level code blocks */ - - next = acpi_gbl_module_code_list; - while (next) { - prev = next; - next = next->method.mutex; - prev->method.mutex = NULL; /* Clear the Mutex (cheated) field */ - acpi_ut_remove_reference(prev); - } + next = acpi_gbl_module_code_list; + while (next) { + prev = next; + next = next->method.mutex; + prev->method.mutex = NULL; /* Clear the Mutex (cheated) field */ + acpi_ut_remove_reference(prev); } -#endif /* * Free the entire namespace -- all nodes and all objects From 45cf54e13c70ce0ec4875220103916978ce3ed07 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 14 Jul 2016 17:51:26 +0200 Subject: [PATCH 085/131] netfilter: x_tables: speed up jump target validation commit f4dc77713f8016d2e8a3295e1c9c53a21f296def upstream. The dummy ruleset I used to test the original validation change was broken, most rules were unreachable and were not tested by mark_source_chains(). In some cases rulesets that used to load in a few seconds now require several minutes. sample ruleset that shows the behaviour: echo "*filter" for i in $(seq 0 100000);do printf ":chain_%06x - [0:0]\n" $i done for i in $(seq 0 100000);do printf -- "-A INPUT -j chain_%06x\n" $i printf -- "-A INPUT -j chain_%06x\n" $i printf -- "-A INPUT -j chain_%06x\n" $i done echo COMMIT [ pipe result into iptables-restore ] This ruleset will be about 74mbyte in size, with ~500k searches though all 500k[1] rule entries. iptables-restore will take forever (gave up after 10 minutes) Instead of always searching the entire blob for a match, fill an array with the start offsets of every single ipt_entry struct, then do a binary search to check if the jump target is present or not. After this change ruleset restore times get again close to what one gets when reverting 36472341017529e (~3 seconds on my workstation). [1] every user-defined rule gets an implicit RETURN, so we get 300k jumps + 100k userchains + 100k returns -> 500k rule entries Fixes: 36472341017529e ("netfilter: x_tables: validate targets of jumps") Reported-by: Jeff Wu Tested-by: Jeff Wu Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- include/linux/netfilter/x_tables.h | 4 +++ net/ipv4/netfilter/arp_tables.c | 47 +++++++++++++++------------- net/ipv4/netfilter/ip_tables.c | 45 ++++++++++++++------------- net/ipv6/netfilter/ip6_tables.c | 45 ++++++++++++++------------- net/netfilter/x_tables.c | 50 ++++++++++++++++++++++++++++++ 5 files changed, 127 insertions(+), 64 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 04078e8a4803..26a41ddd3dec 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -243,6 +243,10 @@ int xt_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); +unsigned int *xt_alloc_entry_offsets(unsigned int size); +bool xt_find_jump_offset(const unsigned int *offsets, + unsigned int target, unsigned int size); + int xt_check_match(struct xt_mtchk_param *, unsigned int size, u_int8_t proto, bool inv_proto); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u_int8_t proto, diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 6e3e0e8b1ce3..d6531c544a82 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -367,23 +367,12 @@ static inline bool unconditional(const struct arpt_entry *e) memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } -static bool find_jump_target(const struct xt_table_info *t, - const struct arpt_entry *target) -{ - struct arpt_entry *iter; - - xt_entry_foreach(iter, t->entries, t->size) { - if (iter == target) - return true; - } - return false; -} - /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, - unsigned int valid_hooks, void *entry0) + unsigned int valid_hooks, void *entry0, + unsigned int *offsets) { unsigned int hook; @@ -472,10 +461,11 @@ static int mark_source_chains(const struct xt_table_info *newinfo, /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); + if (!xt_find_jump_offset(offsets, newpos, + newinfo->number)) + return 0; e = (struct arpt_entry *) (entry0 + newpos); - if (!find_jump_target(newinfo, e)) - return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -642,6 +632,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, const struct arpt_replace *repl) { struct arpt_entry *iter; + unsigned int *offsets; unsigned int i; int ret = 0; @@ -655,6 +646,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, } duprintf("translate_table: size %u\n", newinfo->size); + offsets = xt_alloc_entry_offsets(newinfo->number); + if (!offsets) + return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ @@ -665,7 +659,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, repl->underflow, repl->valid_hooks); if (ret != 0) - break; + goto out_free; + if (i < repl->num_entries) + offsets[i] = (void *)iter - entry0; ++i; if (strcmp(arpt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) @@ -673,12 +669,13 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, } duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) - return ret; + goto out_free; + ret = -EINVAL; if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n", i, repl->num_entries); - return -EINVAL; + goto out_free; } /* Check hooks all assigned */ @@ -689,17 +686,20 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", i, repl->hook_entry[i]); - return -EINVAL; + goto out_free; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", i, repl->underflow[i]); - return -EINVAL; + goto out_free; } } - if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) - return -ELOOP; + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { + ret = -ELOOP; + goto out_free; + } + kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; @@ -719,6 +719,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, return ret; } + return ret; + out_free: + kvfree(offsets); return ret; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index a399c5419622..2ba158f2cb72 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -443,23 +443,12 @@ ipt_do_table(struct sk_buff *skb, #endif } -static bool find_jump_target(const struct xt_table_info *t, - const struct ipt_entry *target) -{ - struct ipt_entry *iter; - - xt_entry_foreach(iter, t->entries, t->size) { - if (iter == target) - return true; - } - return false; -} - /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, - unsigned int valid_hooks, void *entry0) + unsigned int valid_hooks, void *entry0, + unsigned int *offsets) { unsigned int hook; @@ -552,10 +541,11 @@ mark_source_chains(const struct xt_table_info *newinfo, /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); + if (!xt_find_jump_offset(offsets, newpos, + newinfo->number)) + return 0; e = (struct ipt_entry *) (entry0 + newpos); - if (!find_jump_target(newinfo, e)) - return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -811,6 +801,7 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ipt_replace *repl) { struct ipt_entry *iter; + unsigned int *offsets; unsigned int i; int ret = 0; @@ -824,6 +815,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, } duprintf("translate_table: size %u\n", newinfo->size); + offsets = xt_alloc_entry_offsets(newinfo->number); + if (!offsets) + return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -833,17 +827,20 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, repl->underflow, repl->valid_hooks); if (ret != 0) - return ret; + goto out_free; + if (i < repl->num_entries) + offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ipt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } + ret = -EINVAL; if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n", i, repl->num_entries); - return -EINVAL; + goto out_free; } /* Check hooks all assigned */ @@ -854,17 +851,20 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", i, repl->hook_entry[i]); - return -EINVAL; + goto out_free; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", i, repl->underflow[i]); - return -EINVAL; + goto out_free; } } - if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) - return -ELOOP; + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { + ret = -ELOOP; + goto out_free; + } + kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; @@ -884,6 +884,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } + return ret; + out_free: + kvfree(offsets); return ret; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 22f39e00bef3..c26bed92f523 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -455,23 +455,12 @@ ip6t_do_table(struct sk_buff *skb, #endif } -static bool find_jump_target(const struct xt_table_info *t, - const struct ip6t_entry *target) -{ - struct ip6t_entry *iter; - - xt_entry_foreach(iter, t->entries, t->size) { - if (iter == target) - return true; - } - return false; -} - /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, - unsigned int valid_hooks, void *entry0) + unsigned int valid_hooks, void *entry0, + unsigned int *offsets) { unsigned int hook; @@ -564,10 +553,11 @@ mark_source_chains(const struct xt_table_info *newinfo, /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); + if (!xt_find_jump_offset(offsets, newpos, + newinfo->number)) + return 0; e = (struct ip6t_entry *) (entry0 + newpos); - if (!find_jump_target(newinfo, e)) - return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -823,6 +813,7 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ip6t_replace *repl) { struct ip6t_entry *iter; + unsigned int *offsets; unsigned int i; int ret = 0; @@ -836,6 +827,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, } duprintf("translate_table: size %u\n", newinfo->size); + offsets = xt_alloc_entry_offsets(newinfo->number); + if (!offsets) + return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -845,17 +839,20 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, repl->underflow, repl->valid_hooks); if (ret != 0) - return ret; + goto out_free; + if (i < repl->num_entries) + offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ip6t_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } + ret = -EINVAL; if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n", i, repl->num_entries); - return -EINVAL; + goto out_free; } /* Check hooks all assigned */ @@ -866,17 +863,20 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", i, repl->hook_entry[i]); - return -EINVAL; + goto out_free; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", i, repl->underflow[i]); - return -EINVAL; + goto out_free; } } - if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) - return -ELOOP; + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { + ret = -ELOOP; + goto out_free; + } + kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; @@ -896,6 +896,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, return ret; } + return ret; + out_free: + kvfree(offsets); return ret; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2fc6ca9d1286..7b42b0ad3f9b 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -701,6 +701,56 @@ int xt_check_entry_offsets(const void *base, } EXPORT_SYMBOL(xt_check_entry_offsets); +/** + * xt_alloc_entry_offsets - allocate array to store rule head offsets + * + * @size: number of entries + * + * Return: NULL or kmalloc'd or vmalloc'd array + */ +unsigned int *xt_alloc_entry_offsets(unsigned int size) +{ + unsigned int *off; + + off = kcalloc(size, sizeof(unsigned int), GFP_KERNEL | __GFP_NOWARN); + + if (off) + return off; + + if (size < (SIZE_MAX / sizeof(unsigned int))) + off = vmalloc(size * sizeof(unsigned int)); + + return off; +} +EXPORT_SYMBOL(xt_alloc_entry_offsets); + +/** + * xt_find_jump_offset - check if target is a valid jump offset + * + * @offsets: array containing all valid rule start offsets of a rule blob + * @target: the jump target to search for + * @size: entries in @offset + */ +bool xt_find_jump_offset(const unsigned int *offsets, + unsigned int target, unsigned int size) +{ + int m, low = 0, hi = size; + + while (hi > low) { + m = (low + hi) / 2u; + + if (offsets[m] > target) + hi = m; + else if (offsets[m] < target) + low = m + 1; + else + return true; + } + + return false; +} +EXPORT_SYMBOL(xt_find_jump_offset); + int xt_check_target(struct xt_tgchk_param *par, unsigned int size, u_int8_t proto, bool inv_proto) { From a84338dad3c9501a5301db27ee665cda663219fc Mon Sep 17 00:00:00 2001 From: Hongxu Jia Date: Tue, 29 Nov 2016 21:56:26 -0500 Subject: [PATCH 086/131] netfilter: arp_tables: fix invoking 32bit "iptable -P INPUT ACCEPT" failed in 64bit kernel commit 17a49cd549d9dc8707dc9262210166455c612dde upstream. Since 09d9686047db ("netfilter: x_tables: do compat validation via translate_table"), it used compatr structure to assign newinfo structure. In translate_compat_table of ip_tables.c and ip6_tables.c, it used compatr->hook_entry to replace info->hook_entry and compatr->underflow to replace info->underflow, but not do the same replacement in arp_tables.c. It caused invoking 32-bit "arptbale -P INPUT ACCEPT" failed in 64bit kernel. -------------------------------------- root@qemux86-64:~# arptables -P INPUT ACCEPT root@qemux86-64:~# arptables -P INPUT ACCEPT ERROR: Policy for `INPUT' offset 448 != underflow 0 arptables: Incompatible with this kernel -------------------------------------- Fixes: 09d9686047db ("netfilter: x_tables: do compat validation via translate_table") Signed-off-by: Hongxu Jia Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/ipv4/netfilter/arp_tables.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d6531c544a82..c75211528991 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1339,8 +1339,8 @@ static int translate_compat_table(struct xt_table_info **pinfo, newinfo->number = compatr->num_entries; for (i = 0; i < NF_ARP_NUMHOOKS; i++) { - newinfo->hook_entry[i] = info->hook_entry[i]; - newinfo->underflow[i] = info->underflow[i]; + newinfo->hook_entry[i] = compatr->hook_entry[i]; + newinfo->underflow[i] = compatr->underflow[i]; } entry1 = newinfo->entries; pos = entry1; From da00455d38a73c9412dd3f99285b48753ff1b61a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 26 May 2016 19:08:10 +0200 Subject: [PATCH 087/131] netfilter: nf_dup_ipv6: set again FLOWI_FLAG_KNOWN_NH at flowi6_flags commit 83170f3beccccd7ceb4f9a0ac0c4dc736afde90c upstream. With the commit 48e8aa6e3137 ("ipv6: Set FLOWI_FLAG_KNOWN_NH at flowi6_flags") ip6_pol_route() callers were asked to to set the FLOWI_FLAG_KNOWN_NH properly and xt_TEE was updated accordingly, but with the later refactor in commit bbde9fc1824a ("netfilter: factor out packet duplication for IPv4/IPv6") the flowi6_flags update was lost. This commit re-add it just before the routing decision. Fixes: bbde9fc1824a ("netfilter: factor out packet duplication for IPv4/IPv6") Signed-off-by: Paolo Abeni Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/ipv6/netfilter/nf_dup_ipv6.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 6989c70ae29f..4a84b5ad9ecb 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -33,6 +33,7 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb, fl6.daddr = *gw; fl6.flowlabel = (__force __be32)(((iph->flow_lbl[0] & 0xF) << 16) | (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]); + fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); From d8f5cce0a8bce0decd125ef430ee6196692ff8da Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 8 Aug 2016 21:57:58 +0800 Subject: [PATCH 088/131] netfilter: nf_ct_expect: remove the redundant slash when policy name is empty commit b173a28f62cf929324a8a6adcc45adadce311d16 upstream. The 'name' filed in struct nf_conntrack_expect_policy{} is not a pointer, so check it is NULL or not will always return true. Even if the name is empty, slash will always be displayed like follows: # cat /proc/net/nf_conntrack_expect 297 l3proto = 2 proto=6 src=1.1.1.1 dst=2.2.2.2 sport=1 dport=1025 ftp/ ^ Fixes: 3a8fc53a45c4 ("netfilter: nf_ct_helper: allocate 16 bytes for the helper and policy names") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nf_conntrack_expect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 7f16d19d6198..a91f8bd51d05 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -560,7 +560,7 @@ static int exp_seq_show(struct seq_file *s, void *v) helper = rcu_dereference(nfct_help(expect->master)->helper); if (helper) { seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name); - if (helper->expect_policy[expect->class].name) + if (helper->expect_policy[expect->class].name[0]) seq_printf(s, "/%s", helper->expect_policy[expect->class].name); } From 2adda392d28baa25ca7803002fdafcaa95b7b1fb Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 8 Aug 2016 22:07:27 +0800 Subject: [PATCH 089/131] netfilter: nfnetlink_queue: reject verdict request from different portid commit 00a3101f561816e58de054a470484996f78eb5eb upstream. Like NFQNL_MSG_VERDICT_BATCH do, we should also reject the verdict request when the portid is not same with the initial portid(maybe from another process). Fixes: 97d32cf9440d ("netfilter: nfnetlink_queue: batch verdict support") Signed-off-by: Liping Zhang Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nfnetlink_queue.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f6837f9b6d6c..c14d2e8eaec3 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1053,10 +1053,8 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, struct net *net = sock_net(ctnl); struct nfnl_queue_net *q = nfnl_queue_pernet(net); - queue = instance_lookup(q, queue_num); - if (!queue) - queue = verdict_instance_lookup(q, queue_num, - NETLINK_CB(skb).portid); + queue = verdict_instance_lookup(q, queue_num, + NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); From 2cf26badef7bde1f9ecb9fbdea2079ce366f3013 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Aug 2016 15:33:29 +0200 Subject: [PATCH 090/131] netfilter: restart search if moved to other chain commit 95a8d19f28e6b29377a880c6264391a62e07fccc upstream. In case nf_conntrack_tuple_taken did not find a conflicting entry check that all entries in this hash slot were tested and restart in case an entry was moved to another chain. Reported-by: Eric Dumazet Fixes: ea781f197d6a ("netfilter: nf_conntrack: use SLAB_DESTROY_BY_RCU and get rid of call_rcu()") Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nf_conntrack_core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 86a3c6f0c871..5f747089024f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -719,6 +719,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, * least once for the stats anyway. */ rcu_read_lock_bh(); + begin: hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && @@ -730,6 +731,12 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, } NF_CT_STAT_INC(net, searched); } + + if (get_nulls_value(n) != hash) { + NF_CT_STAT_INC(net, search_restart); + goto begin; + } + rcu_read_unlock_bh(); return 0; From 80291566d8f324563f2d889ef5a470e223d44f5c Mon Sep 17 00:00:00 2001 From: Ulrich Weber Date: Mon, 24 Oct 2016 18:07:23 +0200 Subject: [PATCH 091/131] netfilter: nf_conntrack_sip: extend request line validation commit 444f901742d054a4cd5ff045871eac5131646cfb upstream. on SIP requests, so a fragmented TCP SIP packet from an allow header starting with INVITE,NOTIFY,OPTIONS,REFER,REGISTER,UPDATE,SUBSCRIBE Content-Length: 0 will not bet interpreted as an INVITE request. Also Request-URI must start with an alphabetic character. Confirm with RFC 3261 Request-Line = Method SP Request-URI SP SIP-Version CRLF Fixes: 30f33e6dee80 ("[NETFILTER]: nf_conntrack_sip: support method specific request/response handling") Signed-off-by: Ulrich Weber Acked-by: Marco Angaroni Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nf_conntrack_sip.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 885b4aba3695..1665c2159e4b 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1434,9 +1434,12 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff, handler = &sip_handlers[i]; if (handler->request == NULL) continue; - if (*datalen < handler->len || + if (*datalen < handler->len + 2 || strncasecmp(*dptr, handler->method, handler->len)) continue; + if ((*dptr)[handler->len] != ' ' || + !isalpha((*dptr)[handler->len+1])) + continue; if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, &matchoff, &matchlen) <= 0) { From f4ca7cba8ffa94c43913ed322c7d35ab3d6d1e38 Mon Sep 17 00:00:00 2001 From: Pau Espin Pedrol Date: Fri, 6 Jan 2017 20:33:27 +0100 Subject: [PATCH 092/131] netfilter: use fwmark_reflect in nf_send_reset commit cc31d43b4154ad5a7d8aa5543255a93b7e89edc2 upstream. Otherwise, RST packets generated by ipt_REJECT always have mark 0 when the routing is checked later in the same code path. Fixes: e110861f8609 ("net: add a sysctl to reflect the fwmark on replies") Cc: Lorenzo Colitti Signed-off-by: Pau Espin Pedrol Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/ipv4/netfilter/nf_reject_ipv4.c | 2 ++ net/ipv6/netfilter/nf_reject_ipv6.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index c747b2d9eb77..d4acf38b60fd 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -124,6 +124,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) /* ip_route_me_harder expects skb->dst to be set */ skb_dst_set_noref(nskb, skb_dst(oldskb)); + nskb->mark = IP4_REPLY_MARK(net, oldskb->mark); + skb_reserve(nskb, LL_MAX_HEADER); niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, ip4_dst_hoplimit(skb_dst(nskb))); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index e0f922b777e3..7117e5bef412 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -157,6 +157,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.daddr = oip6h->saddr; fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; + fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); dst = ip6_route_output(net, NULL, &fl6); if (dst == NULL || dst->error) { @@ -180,6 +181,8 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) skb_dst_set(nskb, dst); + nskb->mark = fl6.flowi6_mark; + skb_reserve(nskb, hh_len + dst->header_len); ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst)); From 936b21419e7c5be2f81e6dea02fc3d8852f3fb83 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 Apr 2016 10:39:34 +0200 Subject: [PATCH 093/131] netfilter: fix IS_ERR_VALUE usage commit 92b4423e3a0bc5d43ecde4bcad871f8b5ba04efd upstream. This is a forward-port of the original patch from Andrzej Hajda, he said: "IS_ERR_VALUE should be used only with unsigned long type. Otherwise it can work incorrectly. To achieve this function xt_percpu_counter_alloc is modified to return unsigned long, and its result is assigned to temporary variable to perform error checking, before assigning to .pcnt field. The patch follows conclusion from discussion on LKML [1][2]. [1]: http://permalink.gmane.org/gmane.linux.kernel/2120927 [2]: http://permalink.gmane.org/gmane.linux.kernel/2150581" Original patch from Andrzej is here: http://patchwork.ozlabs.org/patch/582970/ This patch has clashed with input validation fixes for x_tables. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman Acked-by: Michal Kubecek --- include/linux/netfilter/x_tables.h | 6 +++--- net/ipv4/netfilter/arp_tables.c | 6 ++++-- net/ipv4/netfilter/ip_tables.c | 6 ++++-- net/ipv6/netfilter/ip6_tables.c | 6 ++++-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 26a41ddd3dec..d6c53fce006b 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -381,16 +381,16 @@ static inline unsigned long ifname_compare_aligned(const char *_a, * allows us to return 0 for single core systems without forcing * callers to deal with SMP vs. NONSMP issues. */ -static inline u64 xt_percpu_counter_alloc(void) +static inline unsigned long xt_percpu_counter_alloc(void) { if (nr_cpu_ids > 1) { void __percpu *res = __alloc_percpu(sizeof(struct xt_counters), sizeof(struct xt_counters)); if (res == NULL) - return (u64) -ENOMEM; + return -ENOMEM; - return (u64) (__force unsigned long) res; + return (__force unsigned long) res; } return 0; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c75211528991..4cfcc22f7430 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -511,11 +511,13 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) { struct xt_entry_target *t; struct xt_target *target; + unsigned long pcnt; int ret; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2ba158f2cb72..a98173d1ea97 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -653,10 +653,12 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + unsigned long pcnt; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; j = 0; mtpar.net = net; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c26bed92f523..bb1b5453a7a1 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -666,10 +666,12 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + unsigned long pcnt; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; j = 0; mtpar.net = net; From a359a437fbc6bb08aa9cc8e25ef4ac3b77ca727b Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sun, 3 Dec 2017 12:12:45 -0800 Subject: [PATCH 094/131] netfilter: nfnetlink_cthelper: Add missing permission checks commit 4b380c42f7d00a395feede754f0bc2292eebe6e5 upstream. The capability check in nfnetlink_rcv() verifies that the caller has CAP_NET_ADMIN in the namespace that "owns" the netlink socket. However, nfnl_cthelper_list is shared by all net namespaces on the system. An unprivileged user can create user and net namespaces in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable() check: $ nfct helper list nfct v1.4.4: netlink error: Operation not permitted $ vpnns -- nfct helper list { .name = ftp, .queuenum = 0, .l3protonum = 2, .l4protonum = 6, .priv_data_len = 24, .status = enabled, }; Add capable() checks in nfnetlink_cthelper, as this is cleaner than trying to generalize the solution. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nfnetlink_cthelper.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 8d34a488efc0..ac143ae4f7b6 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -392,6 +393,9 @@ nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb, struct nfnl_cthelper *nlcth; int ret = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) return -EINVAL; @@ -595,6 +599,9 @@ nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb, struct nfnl_cthelper *nlcth; bool tuple_set = false; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, @@ -661,6 +668,9 @@ nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb, struct nfnl_cthelper *nlcth, *n; int j = 0, ret; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); From d01ceb4722cd8d64176272434fe332b596750d9c Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Tue, 5 Dec 2017 15:42:41 -0800 Subject: [PATCH 095/131] netfilter: xt_osf: Add missing permission checks commit 916a27901de01446bcf57ecca4783f6cff493309 upstream. The capability check in nfnetlink_rcv() verifies that the caller has CAP_NET_ADMIN in the namespace that "owns" the netlink socket. However, xt_osf_fingers is shared by all net namespaces on the system. An unprivileged user can create user and net namespaces in which he holds CAP_NET_ADMIN to bypass the netlink_net_capable() check: vpnns -- nfnl_osf -f /tmp/pf.os vpnns -- nfnl_osf -f /tmp/pf.os -d These non-root operations successfully modify the systemwide OS fingerprint list. Add new capable() checks so that they can't. Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso Acked-by: Michal Kubecek Signed-off-by: Greg Kroah-Hartman --- net/netfilter/xt_osf.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index df8801e02a32..7eae0d0af89a 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -69,6 +70,9 @@ static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb, struct xt_osf_finger *kf = NULL, *sf; int err = 0; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; @@ -112,6 +116,9 @@ static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb, struct xt_osf_finger *sf; int err = -ENOENT; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; From 8e8224de0e1848a5905ebda018969e2ce72258ca Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Jun 2017 14:34:15 +0200 Subject: [PATCH 096/131] ext2: Don't clear SGID when inheriting ACLs commit a992f2d38e4ce17b8c7d1f7f67b2de0eebdea069 upstream. When new directory 'DIR1' is created in a directory 'DIR0' with SGID bit set, DIR1 is expected to have SGID bit set (and owning group equal to the owning group of 'DIR0'). However when 'DIR0' also has some default ACLs that 'DIR1' inherits, setting these ACLs will result in SGID bit on 'DIR1' to get cleared if user is not member of the owning group. Fix the problem by creating __ext2_set_acl() function that does not call posix_acl_update_mode() and use it when inheriting ACLs. That prevents SGID bit clearing and the mode has been properly set by posix_acl_create() anyway. Fixes: 073931017b49d9458aa351605b43a7e34598caef CC: stable@vger.kernel.org CC: linux-ext4@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext2/acl.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index d6aeb84e90b6..d882d873c5a3 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -178,11 +178,8 @@ ext2_get_acl(struct inode *inode, int type) return acl; } -/* - * inode->i_mutex: down - */ -int -ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +static int +__ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int name_index; void *value = NULL; @@ -192,13 +189,6 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch(type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - } break; case ACL_TYPE_DEFAULT: @@ -224,6 +214,24 @@ ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; } +/* + * inode->i_mutex: down + */ +int +ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + + if (type == ACL_TYPE_ACCESS && acl) { + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) + return error; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + } + return __ext2_set_acl(inode, acl, type); +} + /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * @@ -241,12 +249,12 @@ ext2_init_acl(struct inode *inode, struct inode *dir) return error; if (default_acl) { - error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = __ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); } if (acl) { if (!error) - error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); } return error; From ac125d89f8e980f100f63634a81b84fed7e997cf Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 22 Jun 2017 16:47:34 -0400 Subject: [PATCH 097/131] reiserfs: fix race in prealloc discard commit 08db141b5313ac2f64b844fb5725b8d81744b417 upstream. The main loop in __discard_prealloc is protected by the reiserfs write lock which is dropped across schedules like the BKL it replaced. The problem is that it checks the value, calls a routine that schedules, and then adjusts the state. As a result, two threads that are calling reiserfs_prealloc_discard at the same time can race when one calls reiserfs_free_prealloc_block, the lock is dropped, and the other calls reiserfs_free_prealloc_block with the same block number. In the right circumstances, it can cause the prealloc count to go negative. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/bitmap.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index dc198bc64c61..73705d4bb069 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -513,9 +513,17 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th, "inode has negative prealloc blocks count."); #endif while (ei->i_prealloc_count > 0) { - reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); - ei->i_prealloc_block++; + b_blocknr_t block_to_free; + + /* + * reiserfs_free_prealloc_block can drop the write lock, + * which could allow another caller to free the same block. + * We can protect against it by modifying the prealloc + * state before calling it. + */ + block_to_free = ei->i_prealloc_block++; ei->i_prealloc_count--; + reiserfs_free_prealloc_block(th, inode, block_to_free); dirty = 1; } if (dirty) From 512e3b18531eb9526a1e695c19043f78e3a0abb1 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 22 Jun 2017 16:35:04 -0400 Subject: [PATCH 098/131] reiserfs: don't preallocate blocks for extended attributes commit 54930dfeb46e978b447af0fb8ab4e181c1bf9d7a upstream. Most extended attributes will fit in a single block. More importantly, we drop the reference to the inode while holding the transaction open so the preallocated blocks aren't released. As a result, the inode may be evicted before it's removed from the transaction's prealloc list which can cause memory corruption. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 73705d4bb069..edc8ef78b63f 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1136,7 +1136,7 @@ static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) hint->prealloc_size = 0; if (!hint->formatted_node && hint->preallocate) { - if (S_ISREG(hint->inode->i_mode) + if (S_ISREG(hint->inode->i_mode) && !IS_PRIVATE(hint->inode) && hint->inode->i_size >= REISERFS_SB(hint->th->t_super)->s_alloc_options. preallocmin * hint->inode->i_sb->s_blocksize) From e22c8378505e1354b4346c7ecccb446b7c505deb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Jun 2017 09:32:49 +0200 Subject: [PATCH 099/131] reiserfs: Don't clear SGID when inheriting ACLs commit 6883cd7f68245e43e91e5ee583b7550abf14523f upstream. When new directory 'DIR1' is created in a directory 'DIR0' with SGID bit set, DIR1 is expected to have SGID bit set (and owning group equal to the owning group of 'DIR0'). However when 'DIR0' also has some default ACLs that 'DIR1' inherits, setting these ACLs will result in SGID bit on 'DIR1' to get cleared if user is not member of the owning group. Fix the problem by moving posix_acl_update_mode() out of __reiserfs_set_acl() into reiserfs_set_acl(). That way the function will not be called when inheriting ACLs which is what we want as it prevents SGID bit clearing and the mode has been properly set by posix_acl_create() anyway. Fixes: 073931017b49d9458aa351605b43a7e34598caef CC: reiserfs-devel@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/reiserfs/xattr_acl.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 9b1824f35501..91b036902a17 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -37,7 +37,14 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) error = journal_begin(&th, inode->i_sb, jcreate_blocks); reiserfs_write_unlock(inode->i_sb); if (error == 0) { + if (type == ACL_TYPE_ACCESS && acl) { + error = posix_acl_update_mode(inode, &inode->i_mode, + &acl); + if (error) + goto unlock; + } error = __reiserfs_set_acl(&th, inode, type, acl); +unlock: reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th); reiserfs_write_unlock(inode->i_sb); @@ -245,11 +252,6 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - } break; case ACL_TYPE_DEFAULT: name = POSIX_ACL_XATTR_DEFAULT; From ad941c49a40237b77c543469e9e8daee0482a851 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 13 Jun 2017 13:35:51 +0200 Subject: [PATCH 100/131] fs/fcntl: f_setown, avoid undefined behaviour commit fc3dc67471461c0efcb1ed22fb7595121d65fad9 upstream. fcntl(0, F_SETOWN, 0x80000000) triggers: UBSAN: Undefined behaviour in fs/fcntl.c:118:7 negation of -2147483648 cannot be represented in type 'int': CPU: 1 PID: 18261 Comm: syz-executor Not tainted 4.8.1-0-syzkaller #1 ... Call Trace: ... [] ? f_setown+0x1d8/0x200 [] ? SyS_fcntl+0x999/0xf30 [] ? entry_SYSCALL_64_fastpath+0x23/0xc1 Fix that by checking the arg parameter properly (against INT_MAX) before "who = -who". And return immediatelly with -EINVAL in case it is wrong. Note that according to POSIX we can return EINVAL: http://pubs.opengroup.org/onlinepubs/9699919799/functions/fcntl.html [EINVAL] The cmd argument is F_SETOWN and the value of the argument is not valid as a process or process group identifier. [v2] returns an error, v1 used to fail silently [v3] implement proper check for the bad value INT_MIN Signed-off-by: Jiri Slaby Cc: Jeff Layton Cc: "J. Bruce Fields" Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Jeff Layton Signed-off-by: Greg Kroah-Hartman --- fs/fcntl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fcntl.c b/fs/fcntl.c index 62376451bbce..5df914943d96 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -113,6 +113,10 @@ void f_setown(struct file *filp, unsigned long arg, int force) int who = arg; type = PIDTYPE_PID; if (who < 0) { + /* avoid overflow below */ + if (who == INT_MIN) + return; + type = PIDTYPE_PGID; who = -who; } From 509ed20926aa67aa9933b13f96002dc86b71ca60 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 9 Oct 2017 13:33:19 +0200 Subject: [PATCH 101/131] scsi: libiscsi: fix shifting of DID_REQUEUE host byte commit eef9ffdf9cd39b2986367bc8395e2772bc1284ba upstream. The SCSI host byte should be shifted left by 16 in order to have scsi_decide_disposition() do the right thing (.i.e. requeue the command). Signed-off-by: Johannes Thumshirn Fixes: 661134ad3765 ("[SCSI] libiscsi, bnx2i: make bound ep check common") Cc: Lee Duncan Cc: Hannes Reinecke Cc: Bart Van Assche Cc: Chris Leech Acked-by: Lee Duncan Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/libiscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index c1ccf1ee99ea..efce04df2109 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1727,7 +1727,7 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc) if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) { reason = FAILURE_SESSION_IN_RECOVERY; - sc->result = DID_REQUEUE; + sc->result = DID_REQUEUE << 16; goto fault; } From 5f1dc06152e1683a9277a2836517d6574757dd1f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 24 Jan 2018 15:28:17 +0100 Subject: [PATCH 102/131] Revert "module: Add retpoline tag to VERMAGIC" commit 5132ede0fe8092b043dae09a7cc32b8ae7272baa upstream. This reverts commit 6cfb521ac0d5b97470883ff9b7facae264b7ab12. Turns out distros do not want to make retpoline as part of their "ABI", so this patch should not have been merged. Sorry Andi, this was my fault, I suggested it when your original patch was the "correct" way of doing this instead. Reported-by: Jiri Kosina Fixes: 6cfb521ac0d5 ("module: Add retpoline tag to VERMAGIC") Acked-by: Andi Kleen Cc: Thomas Gleixner Cc: David Woodhouse Cc: rusty@rustcorp.com.au Cc: arjan.van.de.ven@intel.com Cc: jeyu@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/vermagic.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index a3d04934aa96..6f8fbcf10dfb 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -24,16 +24,10 @@ #ifndef MODULE_ARCH_VERMAGIC #define MODULE_ARCH_VERMAGIC "" #endif -#ifdef RETPOLINE -#define MODULE_VERMAGIC_RETPOLINE "retpoline " -#else -#define MODULE_VERMAGIC_RETPOLINE "" -#endif #define VERMAGIC_STRING \ UTS_RELEASE " " \ MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT \ MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS \ - MODULE_ARCH_VERMAGIC \ - MODULE_VERMAGIC_RETPOLINE + MODULE_ARCH_VERMAGIC From 9af75003499c7a441e5a483220f5d2cf65861e73 Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Fri, 19 Jan 2018 09:43:39 -0800 Subject: [PATCH 103/131] Input: trackpoint - force 3 buttons if 0 button is reported commit f5d07b9e98022d50720e38aa936fc11c67868ece upstream. Lenovo introduced trackpoint compatible sticks with minimum PS/2 commands. They supposed to reply with 0x02, 0x03, or 0x04 in response to the "Read Extended ID" command, so we would know not to try certain extended commands. Unfortunately even some trackpoints reporting the original IBM version (0x01 firmware 0x0e) now respond with incorrect data to the "Get Extended Buttons" command: thinkpad_acpi: ThinkPad BIOS R0DET87W (1.87 ), EC unknown thinkpad_acpi: Lenovo ThinkPad E470, model 20H1004SGE psmouse serio2: trackpoint: IBM TrackPoint firmware: 0x0e, buttons: 0/0 Since there are no trackpoints without buttons, let's assume the trackpoint has 3 buttons when we get 0 response to the extended buttons query. Signed-off-by: Aaron Ma Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196253 Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/trackpoint.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/mouse/trackpoint.c b/drivers/input/mouse/trackpoint.c index 7e2dc5e56632..0b49f29bf0da 100644 --- a/drivers/input/mouse/trackpoint.c +++ b/drivers/input/mouse/trackpoint.c @@ -383,6 +383,9 @@ int trackpoint_detect(struct psmouse *psmouse, bool set_properties) if (trackpoint_read(&psmouse->ps2dev, TP_EXT_BTN, &button_info)) { psmouse_warn(psmouse, "failed to get extended button data, assuming 3 buttons\n"); button_info = 0x33; + } else if (!button_info) { + psmouse_warn(psmouse, "got 0 in extended button data, assuming 3 buttons\n"); + button_info = 0x33; } psmouse->private = kzalloc(sizeof(struct trackpoint_data), GFP_KERNEL); From 18e905a839ed54b135b00e9d9c9dee2ed6908a08 Mon Sep 17 00:00:00 2001 From: Andrew Goodbody Date: Tue, 2 Feb 2016 17:36:39 +0000 Subject: [PATCH 104/131] usb: usbip: Fix possible deadlocks reported by lockdep commit 21619792d1eca7e772ca190ba68588e57f29595b upstream. Change spin_lock calls to spin_lock_irqsave to prevent attmpted recursive lock taking in interrupt context. This patch fixes Bug 109351 https://bugzilla.kernel.org/show_bug.cgi?id=109351 Signed-off-by: Andrew Goodbody Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/usbip_event.c | 5 +- drivers/usb/usbip/vhci_hcd.c | 88 +++++++++++++++++++-------------- drivers/usb/usbip/vhci_rx.c | 30 ++++++----- drivers/usb/usbip/vhci_sysfs.c | 19 ++++--- drivers/usb/usbip/vhci_tx.c | 14 +++--- 5 files changed, 91 insertions(+), 65 deletions(-) diff --git a/drivers/usb/usbip/usbip_event.c b/drivers/usb/usbip/usbip_event.c index 64933b993d7a..2580a32bcdff 100644 --- a/drivers/usb/usbip/usbip_event.c +++ b/drivers/usb/usbip/usbip_event.c @@ -117,11 +117,12 @@ EXPORT_SYMBOL_GPL(usbip_event_add); int usbip_event_happened(struct usbip_device *ud) { int happened = 0; + unsigned long flags; - spin_lock(&ud->lock); + spin_lock_irqsave(&ud->lock, flags); if (ud->event != 0) happened = 1; - spin_unlock(&ud->lock); + spin_unlock_irqrestore(&ud->lock, flags); return happened; } diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index f9af04d7f02f..0aaa8e524afd 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -121,9 +121,11 @@ static void dump_port_status_diff(u32 prev_status, u32 new_status) void rh_port_connect(int rhport, enum usb_device_speed speed) { + unsigned long flags; + usbip_dbg_vhci_rh("rh_port_connect %d\n", rhport); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); the_controller->port_status[rhport] |= USB_PORT_STAT_CONNECTION | (1 << USB_PORT_FEAT_C_CONNECTION); @@ -139,22 +141,24 @@ void rh_port_connect(int rhport, enum usb_device_speed speed) break; } - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usb_hcd_poll_rh_status(vhci_to_hcd(the_controller)); } static void rh_port_disconnect(int rhport) { + unsigned long flags; + usbip_dbg_vhci_rh("rh_port_disconnect %d\n", rhport); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); the_controller->port_status[rhport] &= ~USB_PORT_STAT_CONNECTION; the_controller->port_status[rhport] |= (1 << USB_PORT_FEAT_C_CONNECTION); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usb_hcd_poll_rh_status(vhci_to_hcd(the_controller)); } @@ -182,13 +186,14 @@ static int vhci_hub_status(struct usb_hcd *hcd, char *buf) int retval; int rhport; int changed = 0; + unsigned long flags; retval = DIV_ROUND_UP(VHCI_NPORTS + 1, 8); memset(buf, 0, retval); vhci = hcd_to_vhci(hcd); - spin_lock(&vhci->lock); + spin_lock_irqsave(&vhci->lock, flags); if (!HCD_HW_ACCESSIBLE(hcd)) { usbip_dbg_vhci_rh("hw accessible flag not on?\n"); goto done; @@ -209,7 +214,7 @@ static int vhci_hub_status(struct usb_hcd *hcd, char *buf) usb_hcd_resume_root_hub(hcd); done: - spin_unlock(&vhci->lock); + spin_unlock_irqrestore(&vhci->lock, flags); return changed ? retval : 0; } @@ -236,6 +241,7 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, struct vhci_hcd *dum; int retval = 0; int rhport; + unsigned long flags; u32 prev_port_status[VHCI_NPORTS]; @@ -254,7 +260,7 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, dum = hcd_to_vhci(hcd); - spin_lock(&dum->lock); + spin_lock_irqsave(&dum->lock, flags); /* store old status and compare now and old later */ if (usbip_dbg_flag_vhci_rh) { @@ -408,7 +414,7 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, } usbip_dbg_vhci_rh(" bye\n"); - spin_unlock(&dum->lock); + spin_unlock_irqrestore(&dum->lock, flags); return retval; } @@ -431,6 +437,7 @@ static void vhci_tx_urb(struct urb *urb) { struct vhci_device *vdev = get_vdev(urb->dev); struct vhci_priv *priv; + unsigned long flags; if (!vdev) { pr_err("could not get virtual device"); @@ -443,7 +450,7 @@ static void vhci_tx_urb(struct urb *urb) return; } - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); priv->seqnum = atomic_inc_return(&the_controller->seqnum); if (priv->seqnum == 0xffff) @@ -457,7 +464,7 @@ static void vhci_tx_urb(struct urb *urb) list_add_tail(&priv->list, &vdev->priv_tx); wake_up(&vdev->waitq_tx); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); } static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, @@ -466,15 +473,16 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, struct device *dev = &urb->dev->dev; int ret = 0; struct vhci_device *vdev; + unsigned long flags; /* patch to usb_sg_init() is in 2.5.60 */ BUG_ON(!urb->transfer_buffer && urb->transfer_buffer_length); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); if (urb->status != -EINPROGRESS) { dev_err(dev, "URB already unlinked!, status %d\n", urb->status); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return urb->status; } @@ -486,7 +494,7 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, vdev->ud.status == VDEV_ST_ERROR) { dev_err(dev, "enqueue for inactive port %d\n", vdev->rhport); spin_unlock(&vdev->ud.lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return -ENODEV; } spin_unlock(&vdev->ud.lock); @@ -559,14 +567,14 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, out: vhci_tx_urb(urb); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return 0; no_need_xmit: usb_hcd_unlink_urb_from_ep(hcd, urb); no_need_unlink: - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); if (!ret) usb_hcd_giveback_urb(vhci_to_hcd(the_controller), urb, urb->status); @@ -623,14 +631,15 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) { struct vhci_priv *priv; struct vhci_device *vdev; + unsigned long flags; - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); priv = urb->hcpriv; if (!priv) { /* URB was never linked! or will be soon given back by * vhci_rx. */ - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return -EIDRM; } @@ -639,7 +648,7 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) ret = usb_hcd_check_unlink_urb(hcd, urb, status); if (ret) { - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return ret; } } @@ -664,10 +673,10 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) */ usb_hcd_unlink_urb_from_ep(hcd, urb); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usb_hcd_giveback_urb(vhci_to_hcd(the_controller), urb, urb->status); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); } else { /* tcp connection is alive */ @@ -679,7 +688,7 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) unlink = kzalloc(sizeof(struct vhci_unlink), GFP_ATOMIC); if (!unlink) { spin_unlock(&vdev->priv_lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usbip_event_add(&vdev->ud, VDEV_EVENT_ERROR_MALLOC); return -ENOMEM; } @@ -698,7 +707,7 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) spin_unlock(&vdev->priv_lock); } - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usbip_dbg_vhci_hc("leave\n"); return 0; @@ -707,8 +716,9 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) static void vhci_device_unlink_cleanup(struct vhci_device *vdev) { struct vhci_unlink *unlink, *tmp; + unsigned long flags; - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); spin_lock(&vdev->priv_lock); list_for_each_entry_safe(unlink, tmp, &vdev->unlink_tx, list) { @@ -742,19 +752,19 @@ static void vhci_device_unlink_cleanup(struct vhci_device *vdev) list_del(&unlink->list); spin_unlock(&vdev->priv_lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usb_hcd_giveback_urb(vhci_to_hcd(the_controller), urb, urb->status); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); spin_lock(&vdev->priv_lock); kfree(unlink); } spin_unlock(&vdev->priv_lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); } /* @@ -821,8 +831,9 @@ static void vhci_shutdown_connection(struct usbip_device *ud) static void vhci_device_reset(struct usbip_device *ud) { struct vhci_device *vdev = container_of(ud, struct vhci_device, ud); + unsigned long flags; - spin_lock(&ud->lock); + spin_lock_irqsave(&ud->lock, flags); vdev->speed = 0; vdev->devid = 0; @@ -836,14 +847,16 @@ static void vhci_device_reset(struct usbip_device *ud) } ud->status = VDEV_ST_NULL; - spin_unlock(&ud->lock); + spin_unlock_irqrestore(&ud->lock, flags); } static void vhci_device_unusable(struct usbip_device *ud) { - spin_lock(&ud->lock); + unsigned long flags; + + spin_lock_irqsave(&ud->lock, flags); ud->status = VDEV_ST_ERROR; - spin_unlock(&ud->lock); + spin_unlock_irqrestore(&ud->lock, flags); } static void vhci_device_init(struct vhci_device *vdev) @@ -933,12 +946,13 @@ static int vhci_get_frame_number(struct usb_hcd *hcd) static int vhci_bus_suspend(struct usb_hcd *hcd) { struct vhci_hcd *vhci = hcd_to_vhci(hcd); + unsigned long flags; dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); - spin_lock(&vhci->lock); + spin_lock_irqsave(&vhci->lock, flags); hcd->state = HC_STATE_SUSPENDED; - spin_unlock(&vhci->lock); + spin_unlock_irqrestore(&vhci->lock, flags); return 0; } @@ -947,15 +961,16 @@ static int vhci_bus_resume(struct usb_hcd *hcd) { struct vhci_hcd *vhci = hcd_to_vhci(hcd); int rc = 0; + unsigned long flags; dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); - spin_lock(&vhci->lock); + spin_lock_irqsave(&vhci->lock, flags); if (!HCD_HW_ACCESSIBLE(hcd)) rc = -ESHUTDOWN; else hcd->state = HC_STATE_RUNNING; - spin_unlock(&vhci->lock); + spin_unlock_irqrestore(&vhci->lock, flags); return rc; } @@ -1053,17 +1068,18 @@ static int vhci_hcd_suspend(struct platform_device *pdev, pm_message_t state) int rhport = 0; int connected = 0; int ret = 0; + unsigned long flags; hcd = platform_get_drvdata(pdev); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); for (rhport = 0; rhport < VHCI_NPORTS; rhport++) if (the_controller->port_status[rhport] & USB_PORT_STAT_CONNECTION) connected += 1; - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); if (connected > 0) { dev_info(&pdev->dev, diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c index bc4eb0855314..323aa7789989 100644 --- a/drivers/usb/usbip/vhci_rx.c +++ b/drivers/usb/usbip/vhci_rx.c @@ -71,10 +71,11 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev, { struct usbip_device *ud = &vdev->ud; struct urb *urb; + unsigned long flags; - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); urb = pickup_urb_and_free_priv(vdev, pdu->base.seqnum); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); if (!urb) { pr_err("cannot find a urb of seqnum %u max seqnum %d\n", @@ -103,9 +104,9 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev, usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); usb_hcd_unlink_urb_from_ep(vhci_to_hcd(the_controller), urb); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usb_hcd_giveback_urb(vhci_to_hcd(the_controller), urb, urb->status); @@ -116,8 +117,9 @@ static struct vhci_unlink *dequeue_pending_unlink(struct vhci_device *vdev, struct usbip_header *pdu) { struct vhci_unlink *unlink, *tmp; + unsigned long flags; - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); list_for_each_entry_safe(unlink, tmp, &vdev->unlink_rx, list) { pr_info("unlink->seqnum %lu\n", unlink->seqnum); @@ -126,12 +128,12 @@ static struct vhci_unlink *dequeue_pending_unlink(struct vhci_device *vdev, unlink->seqnum); list_del(&unlink->list); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return unlink; } } - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return NULL; } @@ -141,6 +143,7 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev, { struct vhci_unlink *unlink; struct urb *urb; + unsigned long flags; usbip_dump_header(pdu); @@ -151,9 +154,9 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev, return; } - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); urb = pickup_urb_and_free_priv(vdev, unlink->unlink_seqnum); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); if (!urb) { /* @@ -170,9 +173,9 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev, urb->status = pdu->u.ret_unlink.status; pr_info("urb->status %d\n", urb->status); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); usb_hcd_unlink_urb_from_ep(vhci_to_hcd(the_controller), urb); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usb_hcd_giveback_urb(vhci_to_hcd(the_controller), urb, urb->status); @@ -184,10 +187,11 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev, static int vhci_priv_tx_empty(struct vhci_device *vdev) { int empty = 0; + unsigned long flags; - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); empty = list_empty(&vdev->priv_rx); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return empty; } diff --git a/drivers/usb/usbip/vhci_sysfs.c b/drivers/usb/usbip/vhci_sysfs.c index 84c21c4ccf46..1c7f41a65565 100644 --- a/drivers/usb/usbip/vhci_sysfs.c +++ b/drivers/usb/usbip/vhci_sysfs.c @@ -32,10 +32,11 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr, { char *s = out; int i = 0; + unsigned long flags; BUG_ON(!the_controller || !out); - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); /* * output example: @@ -74,7 +75,7 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr, spin_unlock(&vdev->ud.lock); } - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return out - s; } @@ -84,11 +85,12 @@ static DEVICE_ATTR_RO(status); static int vhci_port_disconnect(__u32 rhport) { struct vhci_device *vdev; + unsigned long flags; usbip_dbg_vhci_sysfs("enter\n"); /* lock */ - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); vdev = port_to_vdev(rhport); @@ -98,14 +100,14 @@ static int vhci_port_disconnect(__u32 rhport) /* unlock */ spin_unlock(&vdev->ud.lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); return -EINVAL; } /* unlock */ spin_unlock(&vdev->ud.lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); usbip_event_add(&vdev->ud, VDEV_EVENT_DOWN); @@ -181,6 +183,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, int sockfd = 0; __u32 rhport = 0, devid = 0, speed = 0; int err; + unsigned long flags; /* * @rhport: port number of vhci_hcd @@ -206,14 +209,14 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, /* now need lock until setting vdev status as used */ /* begin a lock */ - spin_lock(&the_controller->lock); + spin_lock_irqsave(&the_controller->lock, flags); vdev = port_to_vdev(rhport); spin_lock(&vdev->ud.lock); if (vdev->ud.status != VDEV_ST_NULL) { /* end of the lock */ spin_unlock(&vdev->ud.lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); sockfd_put(socket); @@ -232,7 +235,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, vdev->ud.status = VDEV_ST_NOTASSIGNED; spin_unlock(&vdev->ud.lock); - spin_unlock(&the_controller->lock); + spin_unlock_irqrestore(&the_controller->lock, flags); /* end the lock */ vdev->ud.tcp_rx = kthread_get_run(vhci_rx_loop, &vdev->ud, "vhci_rx"); diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c index 3c5796c8633a..a9a663a578b6 100644 --- a/drivers/usb/usbip/vhci_tx.c +++ b/drivers/usb/usbip/vhci_tx.c @@ -47,16 +47,17 @@ static void setup_cmd_submit_pdu(struct usbip_header *pdup, struct urb *urb) static struct vhci_priv *dequeue_from_priv_tx(struct vhci_device *vdev) { struct vhci_priv *priv, *tmp; + unsigned long flags; - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); list_for_each_entry_safe(priv, tmp, &vdev->priv_tx, list) { list_move_tail(&priv->list, &vdev->priv_rx); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return priv; } - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return NULL; } @@ -137,16 +138,17 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev) static struct vhci_unlink *dequeue_from_unlink_tx(struct vhci_device *vdev) { struct vhci_unlink *unlink, *tmp; + unsigned long flags; - spin_lock(&vdev->priv_lock); + spin_lock_irqsave(&vdev->priv_lock, flags); list_for_each_entry_safe(unlink, tmp, &vdev->unlink_tx, list) { list_move_tail(&unlink->list, &vdev->unlink_rx); - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return unlink; } - spin_unlock(&vdev->priv_lock); + spin_unlock_irqrestore(&vdev->priv_lock, flags); return NULL; } From 80e733a9b37fb6b40351bf1924d5a90d89c375ae Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Dec 2017 14:16:47 -0700 Subject: [PATCH 105/131] usbip: fix stub_rx: get_pipe() to validate endpoint number commit 635f545a7e8be7596b9b2b6a43cab6bbd5a88e43 upstream. get_pipe() routine doesn't validate the input endpoint number and uses to reference ep_in and ep_out arrays. Invalid endpoint number can trigger BUG(). Range check the epnum and returning error instead of calling BUG(). Change caller stub_recv_cmd_submit() to handle the get_pipe() error return. Reported-by: Secunia Research Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/stub_rx.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c index 7de54a66044f..e617c90661b4 100644 --- a/drivers/usb/usbip/stub_rx.c +++ b/drivers/usb/usbip/stub_rx.c @@ -344,15 +344,15 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) struct usb_host_endpoint *ep; struct usb_endpoint_descriptor *epd = NULL; + if (epnum < 0 || epnum > 15) + goto err_ret; + if (dir == USBIP_DIR_IN) ep = udev->ep_in[epnum & 0x7f]; else ep = udev->ep_out[epnum & 0x7f]; - if (!ep) { - dev_err(&sdev->interface->dev, "no such endpoint?, %d\n", - epnum); - BUG(); - } + if (!ep) + goto err_ret; epd = &ep->desc; if (usb_endpoint_xfer_control(epd)) { @@ -383,9 +383,10 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) return usb_rcvisocpipe(udev, epnum); } +err_ret: /* NOT REACHED */ - dev_err(&sdev->interface->dev, "get pipe, epnum %d\n", epnum); - return 0; + dev_err(&sdev->udev->dev, "get pipe() invalid epnum %d\n", epnum); + return -1; } static void masking_bogus_flags(struct urb *urb) @@ -451,6 +452,9 @@ static void stub_recv_cmd_submit(struct stub_device *sdev, struct usb_device *udev = sdev->udev; int pipe = get_pipe(sdev, pdu->base.ep, pdu->base.direction); + if (pipe == -1) + return; + priv = stub_priv_alloc(sdev, pdu); if (!priv) return; From b6f826ba10dce86f74efd3c0953cb9982a3c51e2 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 7 Dec 2017 14:16:48 -0700 Subject: [PATCH 106/131] usbip: fix stub_rx: harden CMD_SUBMIT path to handle malicious input commit c6688ef9f29762e65bce325ef4acd6c675806366 upstream. Harden CMD_SUBMIT path to handle malicious input that could trigger large memory allocations. Add checks to validate transfer_buffer_length and number_of_packets to protect against bad input requesting for unbounded memory allocations. Validate early in get_pipe() and return failure. Reported-by: Secunia Research Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/stub_rx.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c index e617c90661b4..56cacb68040c 100644 --- a/drivers/usb/usbip/stub_rx.c +++ b/drivers/usb/usbip/stub_rx.c @@ -338,11 +338,13 @@ static struct stub_priv *stub_priv_alloc(struct stub_device *sdev, return priv; } -static int get_pipe(struct stub_device *sdev, int epnum, int dir) +static int get_pipe(struct stub_device *sdev, struct usbip_header *pdu) { struct usb_device *udev = sdev->udev; struct usb_host_endpoint *ep; struct usb_endpoint_descriptor *epd = NULL; + int epnum = pdu->base.ep; + int dir = pdu->base.direction; if (epnum < 0 || epnum > 15) goto err_ret; @@ -355,6 +357,7 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) goto err_ret; epd = &ep->desc; + if (usb_endpoint_xfer_control(epd)) { if (dir == USBIP_DIR_OUT) return usb_sndctrlpipe(udev, epnum); @@ -377,6 +380,27 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) } if (usb_endpoint_xfer_isoc(epd)) { + /* validate packet size and number of packets */ + unsigned int maxp, packets, bytes; + +#define USB_EP_MAXP_MULT_SHIFT 11 +#define USB_EP_MAXP_MULT_MASK (3 << USB_EP_MAXP_MULT_SHIFT) +#define USB_EP_MAXP_MULT(m) \ + (((m) & USB_EP_MAXP_MULT_MASK) >> USB_EP_MAXP_MULT_SHIFT) + + maxp = usb_endpoint_maxp(epd); + maxp *= (USB_EP_MAXP_MULT( + __le16_to_cpu(epd->wMaxPacketSize)) + 1); + bytes = pdu->u.cmd_submit.transfer_buffer_length; + packets = DIV_ROUND_UP(bytes, maxp); + + if (pdu->u.cmd_submit.number_of_packets < 0 || + pdu->u.cmd_submit.number_of_packets > packets) { + dev_err(&sdev->udev->dev, + "CMD_SUBMIT: isoc invalid num packets %d\n", + pdu->u.cmd_submit.number_of_packets); + return -1; + } if (dir == USBIP_DIR_OUT) return usb_sndisocpipe(udev, epnum); else @@ -385,7 +409,7 @@ static int get_pipe(struct stub_device *sdev, int epnum, int dir) err_ret: /* NOT REACHED */ - dev_err(&sdev->udev->dev, "get pipe() invalid epnum %d\n", epnum); + dev_err(&sdev->udev->dev, "CMD_SUBMIT: invalid epnum %d\n", epnum); return -1; } @@ -450,7 +474,7 @@ static void stub_recv_cmd_submit(struct stub_device *sdev, struct stub_priv *priv; struct usbip_device *ud = &sdev->ud; struct usb_device *udev = sdev->udev; - int pipe = get_pipe(sdev, pdu->base.ep, pdu->base.direction); + int pipe = get_pipe(sdev, pdu); if (pipe == -1) return; From 04e7c734e33c53ca2f2b8d027d9f7251f12bb206 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Fri, 15 Dec 2017 10:50:09 -0700 Subject: [PATCH 107/131] usbip: prevent leaking socket pointer address in messages commit 90120d15f4c397272aaf41077960a157fc4212bf upstream. usbip driver is leaking socket pointer address in messages. Remove the messages that aren't useful and print sockfd in the ones that are useful for debugging. Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/stub_dev.c | 3 +-- drivers/usb/usbip/usbip_common.c | 15 ++++----------- drivers/usb/usbip/vhci_hcd.c | 2 +- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c index a3ec49bdc1e6..ec38370ffcab 100644 --- a/drivers/usb/usbip/stub_dev.c +++ b/drivers/usb/usbip/stub_dev.c @@ -163,8 +163,7 @@ static void stub_shutdown_connection(struct usbip_device *ud) * step 1? */ if (ud->tcp_socket) { - dev_dbg(&sdev->udev->dev, "shutdown tcp_socket %p\n", - ud->tcp_socket); + dev_dbg(&sdev->udev->dev, "shutdown sockfd %d\n", ud->sockfd); kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR); } diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c index 9752b93f754e..1838f1b2c2fa 100644 --- a/drivers/usb/usbip/usbip_common.c +++ b/drivers/usb/usbip/usbip_common.c @@ -317,18 +317,14 @@ int usbip_recv(struct socket *sock, void *buf, int size) struct msghdr msg; struct kvec iov; int total = 0; - /* for blocks of if (usbip_dbg_flag_xmit) */ char *bp = buf; int osize = size; - usbip_dbg_xmit("enter\n"); - - if (!sock || !buf || !size) { - pr_err("invalid arg, sock %p buff %p size %d\n", sock, buf, - size); + if (!sock || !buf || !size) return -EINVAL; - } + + usbip_dbg_xmit("enter\n"); do { sock->sk->sk_allocation = GFP_NOIO; @@ -341,11 +337,8 @@ int usbip_recv(struct socket *sock, void *buf, int size) msg.msg_flags = MSG_NOSIGNAL; result = kernel_recvmsg(sock, &msg, &iov, 1, size, MSG_WAITALL); - if (result <= 0) { - pr_debug("receive sock %p buf %p size %u ret %d total %d\n", - sock, buf, size, result, total); + if (result <= 0) goto err; - } size -= result; buf += result; diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index 0aaa8e524afd..00d68945548e 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -778,7 +778,7 @@ static void vhci_shutdown_connection(struct usbip_device *ud) /* need this? see stub_dev.c */ if (ud->tcp_socket) { - pr_debug("shutdown tcp_socket %p\n", ud->tcp_socket); + pr_debug("shutdown sockfd %d\n", ud->sockfd); kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR); } From e1e457a49544718928282c56b69af082e1ebe405 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Sun, 20 Aug 2017 13:26:04 +0200 Subject: [PATCH 108/131] um: link vmlinux with -no-pie commit 883354afbc109c57f925ccc19840055193da0cc0 upstream. Debian's gcc defaults to pie. The global Makefile already defines the -fno-pie option. Link UML dynamic kernel image also with -no-pie to fix the build. Signed-off-by: Thomas Meyer Signed-off-by: Richard Weinberger Cc: Bernie Innocenti Signed-off-by: Greg Kroah-Hartman --- arch/um/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/Makefile b/arch/um/Makefile index e3abe6f3156d..9ccf462131c4 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -117,7 +117,7 @@ archheaders: archprepare: include/generated/user_constants.h LINK-$(CONFIG_LD_SCRIPT_STATIC) += -static -LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib +LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib $(call cc-option, -no-pie) CFLAGS_NO_HARDENING := $(call cc-option, -fno-PIC,) $(call cc-option, -fno-pic,) \ $(call cc-option, -fno-stack-protector,) \ From ed73df0b7f23c95b3243a0f4bfc40f962e61d349 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 26 Jan 2018 16:23:02 +0000 Subject: [PATCH 109/131] vsyscall: Fix permissions for emulate mode with KAISER/PTI The backport of KAISER to 4.4 turned vsyscall emulate mode into native mode. Add a vsyscall_pgprot variable to hold the correct page protections, like Borislav and Hugh did for 3.2 and 3.18. Cc: Borislav Petkov Cc: Hugh Dickins Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/vsyscall/vsyscall_64.c | 7 ++++--- arch/x86/include/asm/vsyscall.h | 1 + arch/x86/mm/kaiser.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 112178b401a1..2d359991a273 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -46,6 +46,7 @@ static enum { EMULATE, NATIVE, NONE } vsyscall_mode = #else EMULATE; #endif +unsigned long vsyscall_pgprot = __PAGE_KERNEL_VSYSCALL; static int __init vsyscall_setup(char *str) { @@ -336,11 +337,11 @@ void __init map_vsyscall(void) extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + if (vsyscall_mode != NATIVE) + vsyscall_pgprot = __PAGE_KERNEL_VVAR; if (vsyscall_mode != NONE) __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); + __pgprot(vsyscall_pgprot)); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 4865e10dbb55..9ee85066f407 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -13,6 +13,7 @@ extern void map_vsyscall(void); */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); extern bool vsyscall_enabled(void); +extern unsigned long vsyscall_pgprot; #else static inline void map_vsyscall(void) {} static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 8af98513d36c..2298434f7bdb 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -345,7 +345,7 @@ void __init kaiser_init(void) if (vsyscall_enabled()) kaiser_add_user_map_early((void *)VSYSCALL_ADDR, PAGE_SIZE, - __PAGE_KERNEL_VSYSCALL); + vsyscall_pgprot); for_each_possible_cpu(cpu) { void *percpu_vaddr = __per_cpu_user_mapped_start + From 506ff217e470bdaa78477406a051c0570403307d Mon Sep 17 00:00:00 2001 From: Greg KH Date: Wed, 8 Mar 2017 19:03:44 +0100 Subject: [PATCH 110/131] eventpoll.h: add missing epoll event masks commit 7e040726850a106587485c21bdacc0bfc8a0cbed upstream. [resend due to me forgetting to cc: linux-api the first time around I posted these back on Feb 23] From: Greg Kroah-Hartman For some reason these values are not in the uapi header file, so any libc has to define it themselves. To prevent them from needing to do this, just have the kernel provide the correct values. Reported-by: Elliott Hughes Signed-off-by: Greg Hackmann Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/eventpoll.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index bc81fb2e1f0e..6f04cb419115 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -26,6 +26,19 @@ #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 +/* Epoll event masks */ +#define EPOLLIN 0x00000001 +#define EPOLLPRI 0x00000002 +#define EPOLLOUT 0x00000004 +#define EPOLLERR 0x00000008 +#define EPOLLHUP 0x00000010 +#define EPOLLRDNORM 0x00000040 +#define EPOLLRDBAND 0x00000080 +#define EPOLLWRNORM 0x00000100 +#define EPOLLWRBAND 0x00000200 +#define EPOLLMSG 0x00000400 +#define EPOLLRDHUP 0x00002000 + /* * Request the handling of system wakeup events so as to prevent system suspends * from happening while those events are being processed. From 20c0a04284496b262b172707b7dd122cdd8fe9a1 Mon Sep 17 00:00:00 2001 From: Jia Zhang Date: Tue, 23 Jan 2018 11:41:32 +0100 Subject: [PATCH 111/131] x86/microcode/intel: Extend BDW late-loading further with LLC size check commit 7e702d17ed138cf4ae7c00e8c00681ed464587c7 upstream. Commit b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") reduced the impact of erratum BDF90 for Broadwell model 79. The impact can be reduced further by checking the size of the last level cache portion per core. Tony: "The erratum says the problem only occurs on the large-cache SKUs. So we only need to avoid the update if we are on a big cache SKU that is also running old microcode." For more details, see erratum BDF90 in document #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family Specification Update) from September 2017. Fixes: b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") Signed-off-by: Jia Zhang Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Acked-by: Tony Luck Link: https://lkml.kernel.org/r/1516321542-31161-1-git-send-email-zhang.jia@linux.alibaba.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/intel.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ee011bd7934d..2c76a1801393 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,6 +39,9 @@ #include #include +/* last level cache size per core */ +static int llc_size_per_core; + static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; static struct mc_saved_data { unsigned int mc_saved_count; @@ -996,12 +999,14 @@ static bool is_blacklisted(unsigned int cpu) /* * Late loading on model 79 with microcode revision less than 0x0b000021 - * may result in a system hang. This behavior is documented in item - * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family). + * and LLC size per core bigger than 2.5MB may result in a system hang. + * This behavior is documented in item BDF90, #334165 (Intel Xeon + * Processor E7-8800/4800 v4 Product Family). */ if (c->x86 == 6 && c->x86_model == 79 && c->x86_mask == 0x01 && + llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); @@ -1068,6 +1073,15 @@ static struct microcode_ops microcode_intel_ops = { .microcode_fini_cpu = microcode_fini_cpu, }; +static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) +{ + u64 llc_size = c->x86_cache_size * 1024; + + do_div(llc_size, c->x86_max_cores); + + return (int)llc_size; +} + struct microcode_ops * __init init_intel_microcode(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -1078,6 +1092,8 @@ struct microcode_ops * __init init_intel_microcode(void) return NULL; } + llc_size_per_core = calc_llc_size_per_core(c); + return µcode_intel_ops; } From 360496cab53af2f1dd77dbe353b7b665bcfdc1e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jan 2018 14:54:32 +0100 Subject: [PATCH 112/131] hrtimer: Reset hrtimer cpu base proper on CPU hotplug commit d5421ea43d30701e03cadc56a38854c36a8b4433 upstream. The hrtimer interrupt code contains a hang detection and mitigation mechanism, which prevents that a long delayed hrtimer interrupt causes a continous retriggering of interrupts which prevent the system from making progress. If a hang is detected then the timer hardware is programmed with a certain delay into the future and a flag is set in the hrtimer cpu base which prevents newly enqueued timers from reprogramming the timer hardware prior to the chosen delay. The subsequent hrtimer interrupt after the delay clears the flag and resumes normal operation. If such a hang happens in the last hrtimer interrupt before a CPU is unplugged then the hang_detected flag is set and stays that way when the CPU is plugged in again. At that point the timer hardware is not armed and it cannot be armed because the hang_detected flag is still active, so nothing clears that flag. As a consequence the CPU does not receive hrtimer interrupts and no timers expire on that CPU which results in RCU stalls and other malfunctions. Clear the flag along with some other less critical members of the hrtimer cpu base to ensure starting from a clean state when a CPU is plugged in. Thanks to Paul, Sebastian and Anna-Maria for their help to get down to the root cause of that hard to reproduce heisenbug. Once understood it's trivial and certainly justifies a brown paperbag. Fixes: 41d2e4949377 ("hrtimer: Tune hrtimer_interrupt hang logic") Reported-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Sewior Cc: Anna-Maria Gleixner Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801261447590.2067@nanos Signed-off-by: Greg Kroah-Hartman --- kernel/time/hrtimer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1dc94768b5a3..323282e63865 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -669,7 +669,9 @@ static void hrtimer_reprogram(struct hrtimer *timer, static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { base->expires_next.tv64 = KTIME_MAX; + base->hang_detected = 0; base->hres_active = 0; + base->next_timer = NULL; } /* @@ -1615,6 +1617,7 @@ static void init_hrtimers_cpu(int cpu) timerqueue_init_head(&cpu_base->clock_base[i].active); } + cpu_base->active_bases = 0; cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); } From e91e5b700fb6f887d4df4a18de0d8865df43d77d Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 26 Jan 2018 15:14:16 +0300 Subject: [PATCH 113/131] dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state [ Upstream commit dd5684ecae3bd8e44b644f50e2c12c7e57fdfef5 ] ccid2_hc_tx_rto_expire() timer callback always restarts the timer again and can run indefinitely (unless it is stopped outside), and after commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"), which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from dccp_destroy_sock() to sk_destruct(), this started to happen quite often. The timer prevents releasing the socket, as a result, sk_destruct() won't be called. Found with LTP/dccp_ipsec tests running on the bonding device, which later couldn't be unloaded after the tests were completed: unregister_netdevice: waiting for bond0 to become free. Usage count = 148 Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation") Signed-off-by: Alexey Kodanev Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/dccp/ccids/ccid2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 5e3a7302f774..7753681195c1 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -140,6 +140,9 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) ccid2_pr_debug("RTO_EXPIRE\n"); + if (sk->sk_state == DCCP_CLOSED) + goto out; + /* back-off timer */ hc->tx_rto <<= 1; if (hc->tx_rto > DCCP_RTO_MAX) From c5371a321a4a3c1f181f4482b3b3ceae06b72879 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 22 Jan 2018 20:06:42 +0000 Subject: [PATCH 114/131] ipv6: Fix getsockopt() for sockets with default IPV6_AUTOFLOWLABEL [ Upstream commit e9191ffb65d8e159680ce0ad2224e1acbde6985c ] Commit 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl setting") removed the initialisation of ipv6_pinfo::autoflowlabel and added a second flag to indicate whether this field or the net namespace default should be used. The getsockopt() handling for this case was not updated, so it currently returns 0 for all sockets for which IPV6_AUTOFLOWLABEL is not explicitly enabled. Fix it to return the effective value, whether that has been set at the socket or net namespace level. Fixes: 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl ...") Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/ipv6.h | 1 + net/ipv6/ip6_output.c | 2 +- net/ipv6/ipv6_sockglue.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7a8066b90289..84f0d0602433 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -281,6 +281,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b809958f7388..d82b814e6c27 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -148,7 +148,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } -static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) { if (!np->autoflowlabel_set) return ip6_default_np_autolabel(net); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 435e26210587..9011176c8387 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1313,7 +1313,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case IPV6_AUTOFLOWLABEL: - val = np->autoflowlabel; + val = ip6_autoflowlabel(sock_net(sk), np); break; default: From c867a05df5d1baefbb2040474356b5e02ca5fcaf Mon Sep 17 00:00:00 2001 From: Mike Maloney Date: Wed, 10 Jan 2018 12:45:10 -0500 Subject: [PATCH 115/131] ipv6: fix udpv6 sendmsg crash caused by too small MTU [ Upstream commit 749439bfac6e1a2932c582e2699f91d329658196 ] The logic in __ip6_append_data() assumes that the MTU is at least large enough for the headers. A device's MTU may be adjusted after being added while sendmsg() is processing data, resulting in __ip6_append_data() seeing any MTU. For an mtu smaller than the size of the fragmentation header, the math results in a negative 'maxfraglen', which causes problems when refragmenting any previous skb in the skb_write_queue, leaving it possibly malformed. Instead sendmsg returns EINVAL when the mtu is calculated to be less than IPV6_MIN_MTU. Found by syzkaller: kernel BUG at ./include/linux/skbuff.h:2064! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 14216 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d0b68580 task.stack: ffff8801ac6b8000 RIP: 0010:__skb_pull include/linux/skbuff.h:2064 [inline] RIP: 0010:__ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617 RSP: 0018:ffff8801ac6bf570 EFLAGS: 00010216 RAX: 0000000000010000 RBX: 0000000000000028 RCX: ffffc90003cce000 RDX: 00000000000001b8 RSI: ffffffff839df06f RDI: ffff8801d9478ca0 RBP: ffff8801ac6bf780 R08: ffff8801cc3f1dbc R09: 0000000000000000 R10: ffff8801ac6bf7a0 R11: 43cb4b7b1948a9e7 R12: ffff8801cc3f1dc8 R13: ffff8801cc3f1d40 R14: 0000000000001036 R15: dffffc0000000000 FS: 00007f43d740c700(0000) GS:ffff8801dc100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7834984000 CR3: 00000001d79b9000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ip6_finish_skb include/net/ipv6.h:911 [inline] udp_v6_push_pending_frames+0x255/0x390 net/ipv6/udp.c:1093 udpv6_sendmsg+0x280d/0x31a0 net/ipv6/udp.c:1363 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 SYSC_sendto+0x352/0x5a0 net/socket.c:1750 SyS_sendto+0x40/0x50 net/socket.c:1718 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4512e9 RSP: 002b:00007f43d740bc08 EFLAGS: 00000216 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00000000007180a8 RCX: 00000000004512e9 RDX: 000000000000002e RSI: 0000000020d08000 RDI: 0000000000000005 RBP: 0000000000000086 R08: 00000000209c1000 R09: 000000000000001c R10: 0000000000040800 R11: 0000000000000216 R12: 00000000004b9c69 R13: 00000000ffffffff R14: 0000000000000005 R15: 00000000202c2000 Code: 9e 01 fe e9 c5 e8 ff ff e8 7f 9e 01 fe e9 4a ea ff ff 48 89 f7 e8 52 9e 01 fe e9 aa eb ff ff e8 a8 b6 cf fd 0f 0b e8 a1 b6 cf fd <0f> 0b 49 8d 45 78 4d 8d 45 7c 48 89 85 78 fe ff ff 49 8d 85 ba RIP: __skb_pull include/linux/skbuff.h:2064 [inline] RSP: ffff8801ac6bf570 RIP: __ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617 RSP: ffff8801ac6bf570 Reported-by: syzbot Signed-off-by: Mike Maloney Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d82b814e6c27..dfd4a9a02dc3 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1246,14 +1246,16 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, v6_cork->tclass = tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(&rt->dst); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); else mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(rt->dst.path); + READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path); if (np->frag_size < mtu) { if (np->frag_size) mtu = np->frag_size; } + if (mtu < IPV6_MIN_MTU) + return -EINVAL; cork->base.fragsize = mtu; if (dst_allfrag(rt->dst.path)) cork->base.flags |= IPCORK_ALLFRAG; From f64568e420e6b9b523611ef564941b83c7527614 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jan 2018 22:31:18 -0800 Subject: [PATCH 116/131] ipv6: ip6_make_skb() needs to clear cork.base.dst [ Upstream commit 95ef498d977bf44ac094778fd448b98af158a3e6 ] In my last patch, I missed fact that cork.base.dst was not initialized in ip6_make_skb() : If ip6_setup_cork() returns an error, we might attempt a dst_release() on some random pointer. Fixes: 862c03ee1deb ("ipv6: fix possible mem leaks in ipv6_make_skb()") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_output.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index dfd4a9a02dc3..3ef81c387923 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1785,6 +1785,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork.base.flags = 0; cork.base.addr = 0; cork.base.opt = NULL; + cork.base.dst = NULL; v6_cork.opt = NULL; err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6); if (err) { From 1202fc020465b67297f84dfc7127b83805082c69 Mon Sep 17 00:00:00 2001 From: Yuiko Oshino Date: Mon, 15 Jan 2018 13:24:28 -0500 Subject: [PATCH 117/131] lan78xx: Fix failure in USB Full Speed [ Upstream commit a5b1379afbfabf91e3a689e82ac619a7157336b3 ] Fix initialize the uninitialized tx_qlen to an appropriate value when USB Full Speed is used. Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver") Signed-off-by: Yuiko Oshino Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/lan78xx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 41e9ebd7d0a6..ebdee8f01f65 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1859,6 +1859,7 @@ static int lan78xx_reset(struct lan78xx_net *dev) buf = DEFAULT_BURST_CAP_SIZE / FS_USB_PKT_SIZE; dev->rx_urb_size = DEFAULT_BURST_CAP_SIZE; dev->rx_qlen = 4; + dev->tx_qlen = 4; } ret = lan78xx_write_reg(dev, BURST_CAP, buf); From 6c489ab43ccacb9f26620ded90374c582c466344 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 19 Jan 2018 11:50:46 +0100 Subject: [PATCH 118/131] net: igmp: fix source address check for IGMPv3 reports [ Upstream commit ad23b750933ea7bf962678972a286c78a8fa36aa ] Commit "net: igmp: Use correct source address on IGMPv3 reports" introduced a check to validate the source address of locally generated IGMPv3 packets. Instead of checking the local interface address directly, it uses inet_ifa_match(fl4->saddr, ifa), which checks if the address is on the local subnet (or equal to the point-to-point address if used). This breaks for point-to-point interfaces, so check against ifa->ifa_local directly. Cc: Kevin Cernekee Fixes: a46182b00290 ("net: igmp: Use correct source address on IGMPv3 reports") Reported-by: Sebastian Gottschall Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller Tested-by: Florian Wolters Signed-off-by: Greg Kroah-Hartman --- net/ipv4/igmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index b60106d34346..8212ed80da48 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -338,7 +338,7 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev, return htonl(INADDR_ANY); for_ifa(in_dev) { - if (inet_ifa_match(fl4->saddr, ifa)) + if (fl4->saddr == ifa->ifa_local) return fl4->saddr; } endfor_ifa(in_dev); From bed42ef5ebbfc046e0440f7e0f941cc07f8ca1e8 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 10 Feb 2016 11:50:37 -0500 Subject: [PATCH 119/131] tcp: __tcp_hdrlen() helper commit d9b3fca27385eafe61c3ca6feab6cb1e7dc77482 upstream. tcp_hdrlen is wasteful if you already have a pointer to struct tcphdr. This splits the size calculation into a helper function that can be used if a struct tcphdr is already available. Signed-off-by: Craig Gallek Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/tcp.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 318c24612458..2260f92f1492 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -29,9 +29,14 @@ static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb) return (struct tcphdr *)skb_transport_header(skb); } +static inline unsigned int __tcp_hdrlen(const struct tcphdr *th) +{ + return th->doff * 4; +} + static inline unsigned int tcp_hdrlen(const struct sk_buff *skb) { - return tcp_hdr(skb)->doff * 4; + return __tcp_hdrlen(tcp_hdr(skb)); } static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb) From f50fc5f4f3e5f0d2b55cd09c01f3a09d0ddeb9dc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jan 2018 19:59:19 -0800 Subject: [PATCH 120/131] net: qdisc_pkt_len_init() should be more robust [ Upstream commit 7c68d1a6b4db9012790af7ac0f0fdc0d2083422a ] Without proper validation of DODGY packets, we might very well feed qdisc_pkt_len_init() with invalid GSO packets. tcp_hdrlen() might access out-of-bound data, so let's use skb_header_pointer() and proper checks. Whole story is described in commit d0c081b49137 ("flow_dissector: properly cap thoff field") We have the goal of validating DODGY packets earlier in the stack, so we might very well revert this fix in the future. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Jason Wang Reported-by: syzbot+9da69ebac7dddd804552@syzkaller.appspotmail.com Acked-by: Jason Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 3b67c1e5756f..cb58ba15d51e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2889,10 +2889,21 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) hdr_len = skb_transport_header(skb) - skb_mac_header(skb); /* + transport layer */ - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - hdr_len += tcp_hdrlen(skb); - else - hdr_len += sizeof(struct udphdr); + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { + const struct tcphdr *th; + struct tcphdr _tcphdr; + + th = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_tcphdr), &_tcphdr); + if (likely(th)) + hdr_len += __tcp_hdrlen(th); + } else { + struct udphdr _udphdr; + + if (skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_udphdr), &_udphdr)) + hdr_len += sizeof(struct udphdr); + } if (shinfo->gso_type & SKB_GSO_DODGY) gso_segs = DIV_ROUND_UP(skb->len - hdr_len, From a49558ebc2c4ce129a381d7cf39c4f01ea02c78b Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 22 Jan 2018 18:06:37 +0100 Subject: [PATCH 121/131] pppoe: take ->needed_headroom of lower device into account on xmit [ Upstream commit 02612bb05e51df8489db5e94d0cf8d1c81f87b0c ] In pppoe_sendmsg(), reserving dev->hard_header_len bytes of headroom was probably fine before the introduction of ->needed_headroom in commit f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom"). But now, virtual devices typically advertise the size of their overhead in dev->needed_headroom, so we must also take it into account in skb_reserve(). Allocation size of skb is also updated to take dev->needed_tailroom into account and replace the arbitrary 32 bytes with the real size of a PPPoE header. This issue was discovered by syzbot, who connected a pppoe socket to a gre device which had dev->header_ops->create == ipgre_header and dev->hard_header_len == 0. Therefore, PPPoE didn't reserve any headroom, and dev_hard_header() crashed when ipgre_header() tried to prepend its header to skb->data. skbuff: skb_under_panic: text:000000001d390b3a len:31 put:24 head:00000000d8ed776f data:000000008150e823 tail:0x7 end:0xc0 dev:gre0 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:104! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 3670 Comm: syzkaller801466 Not tainted 4.15.0-rc7-next-20180115+ #97 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_panic+0x162/0x1f0 net/core/skbuff.c:100 RSP: 0018:ffff8801d9bd7840 EFLAGS: 00010282 RAX: 0000000000000083 RBX: ffff8801d4f083c0 RCX: 0000000000000000 RDX: 0000000000000083 RSI: 1ffff1003b37ae92 RDI: ffffed003b37aefc RBP: ffff8801d9bd78a8 R08: 1ffff1003b37ae8a R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff86200de0 R13: ffffffff84a981ad R14: 0000000000000018 R15: ffff8801d2d34180 FS: 00000000019c4880(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000208bc000 CR3: 00000001d9111001 CR4: 00000000001606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_under_panic net/core/skbuff.c:114 [inline] skb_push+0xce/0xf0 net/core/skbuff.c:1714 ipgre_header+0x6d/0x4e0 net/ipv4/ip_gre.c:879 dev_hard_header include/linux/netdevice.h:2723 [inline] pppoe_sendmsg+0x58e/0x8b0 drivers/net/ppp/pppoe.c:890 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg+0xca/0x110 net/socket.c:640 sock_write_iter+0x31a/0x5d0 net/socket.c:909 call_write_iter include/linux/fs.h:1775 [inline] do_iter_readv_writev+0x525/0x7f0 fs/read_write.c:653 do_iter_write+0x154/0x540 fs/read_write.c:932 vfs_writev+0x18a/0x340 fs/read_write.c:977 do_writev+0xfc/0x2a0 fs/read_write.c:1012 SYSC_writev fs/read_write.c:1085 [inline] SyS_writev+0x27/0x30 fs/read_write.c:1082 entry_SYSCALL_64_fastpath+0x29/0xa0 Admittedly PPPoE shouldn't be allowed to run on non Ethernet-like interfaces, but reserving space for ->needed_headroom is a more fundamental issue that needs to be addressed first. Same problem exists for __pppoe_xmit(), which also needs to take dev->needed_headroom into account in skb_cow_head(). Fixes: f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom") Reported-by: syzbot+ed0838d0fa4c4f2b528e20286e6dc63effc7c14d@syzkaller.appspotmail.com Signed-off-by: Guillaume Nault Reviewed-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ppp/pppoe.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 4e0068e775f9..b7b859c3a0c7 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -860,6 +860,7 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, struct pppoe_hdr *ph; struct net_device *dev; char *start; + int hlen; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) { @@ -878,16 +879,16 @@ static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, if (total_len > (dev->mtu + dev->hard_header_len)) goto end; - - skb = sock_wmalloc(sk, total_len + dev->hard_header_len + 32, - 0, GFP_KERNEL); + hlen = LL_RESERVED_SPACE(dev); + skb = sock_wmalloc(sk, hlen + sizeof(*ph) + total_len + + dev->needed_tailroom, 0, GFP_KERNEL); if (!skb) { error = -ENOMEM; goto end; } /* Reserve space for headers. */ - skb_reserve(skb, dev->hard_header_len); + skb_reserve(skb, hlen); skb_reset_network_header(skb); skb->dev = dev; @@ -948,7 +949,7 @@ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb) /* Copy the data if there is no space for the header or if it's * read-only. */ - if (skb_cow_head(skb, sizeof(*ph) + dev->hard_header_len)) + if (skb_cow_head(skb, LL_RESERVED_SPACE(dev) + sizeof(*ph))) goto abort; __skb_push(skb, sizeof(*ph)); From d6611191f5a18d03314ab4a1b621d1892116af79 Mon Sep 17 00:00:00 2001 From: Francois Romieu Date: Fri, 26 Jan 2018 01:53:26 +0100 Subject: [PATCH 122/131] r8169: fix memory corruption on retrieval of hardware statistics. [ Upstream commit a78e93661c5fd30b9e1dee464b2f62f966883ef7 ] Hardware statistics retrieval hurts in tight invocation loops. Avoid extraneous write and enforce strict ordering of writes targeted to the tally counters dump area address registers. Signed-off-by: Francois Romieu Tested-by: Oliver Freyermuth Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/realtek/r8169.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index c5ea1018cb47..24155380e43c 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -2205,19 +2205,14 @@ static bool rtl8169_do_counters(struct net_device *dev, u32 counter_cmd) void __iomem *ioaddr = tp->mmio_addr; dma_addr_t paddr = tp->counters_phys_addr; u32 cmd; - bool ret; RTL_W32(CounterAddrHigh, (u64)paddr >> 32); + RTL_R32(CounterAddrHigh); cmd = (u64)paddr & DMA_BIT_MASK(32); RTL_W32(CounterAddrLow, cmd); RTL_W32(CounterAddrLow, cmd | counter_cmd); - ret = rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000); - - RTL_W32(CounterAddrLow, 0); - RTL_W32(CounterAddrHigh, 0); - - return ret; + return rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000); } static bool rtl8169_reset_counters(struct net_device *dev) From 23f521bc70b935156e13897db2873312b18b0437 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Jan 2018 17:02:00 +0800 Subject: [PATCH 123/131] sctp: do not allow the v4 socket to bind a v4mapped v6 address [ Upstream commit c5006b8aa74599ce19104b31d322d2ea9ff887cc ] The check in sctp_sockaddr_af is not robust enough to forbid binding a v4mapped v6 addr on a v4 socket. The worse thing is that v4 socket's bind_verify would not convert this v4mapped v6 addr to a v4 addr. syzbot even reported a crash as the v4 socket bound a v6 addr. This patch is to fix it by doing the common sa.sa_family check first, then AF_INET check for v4mapped v6 addrs. Fixes: 7dab83de50c7 ("sctp: Support ipv6only AF_INET6 sockets.") Reported-by: syzbot+7b7b518b1228d2743963@syzkaller.appspotmail.com Acked-by: Neil Horman Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/socket.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index a870d27ca778..acf487283e9f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -332,16 +332,14 @@ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, if (len < sizeof (struct sockaddr)) return NULL; + if (!opt->pf->af_supported(addr->sa.sa_family, opt)) + return NULL; + /* V4 mapped address are really of AF_INET family */ if (addr->sa.sa_family == AF_INET6 && - ipv6_addr_v4mapped(&addr->v6.sin6_addr)) { - if (!opt->pf->af_supported(AF_INET, opt)) - return NULL; - } else { - /* Does this PF support this AF? */ - if (!opt->pf->af_supported(addr->sa.sa_family, opt)) - return NULL; - } + ipv6_addr_v4mapped(&addr->v6.sin6_addr) && + !opt->pf->af_supported(AF_INET, opt)) + return NULL; /* If we get this far, af is valid. */ af = sctp_get_af_specific(addr->sa.sa_family); From 50194c3f48de2c2374731d4b6b7f3fbb89250094 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Jan 2018 17:01:36 +0800 Subject: [PATCH 124/131] sctp: return error if the asoc has been peeled off in sctp_wait_for_sndbuf [ Upstream commit a0ff660058b88d12625a783ce9e5c1371c87951f ] After commit cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep"), it may change to lock another sk if the asoc has been peeled off in sctp_wait_for_sndbuf. However, the asoc's new sk could be already closed elsewhere, as it's in the sendmsg context of the old sk that can't avoid the new sk's closing. If the sk's last one refcnt is held by this asoc, later on after putting this asoc, the new sk will be freed, while under it's own lock. This patch is to revert that commit, but fix the old issue by returning error under the old sk's lock. Fixes: cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep") Reported-by: syzbot+ac6ea7baa4432811eb50@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/socket.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index acf487283e9f..e9851198a850 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -83,7 +83,7 @@ static int sctp_writeable(struct sock *sk); static void sctp_wfree(struct sk_buff *skb); static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len, struct sock **orig_sk); + size_t msg_len); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); @@ -1952,7 +1952,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if (!sctp_wspace(asoc)) { /* sk can be changed by peel off when waiting for buf. */ - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len, &sk); + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) { if (err == -ESRCH) { /* asoc is already dead. */ @@ -6974,12 +6974,12 @@ void sctp_sock_rfree(struct sk_buff *skb) /* Helper function to wait for space in the sndbuf. */ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len, struct sock **orig_sk) + size_t msg_len) { struct sock *sk = asoc->base.sk; - int err = 0; long current_timeo = *timeo_p; DEFINE_WAIT(wait); + int err = 0; pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc, *timeo_p, msg_len); @@ -7008,17 +7008,13 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); - if (sk != asoc->base.sk) { - release_sock(sk); - sk = asoc->base.sk; - lock_sock(sk); - } + if (sk != asoc->base.sk) + goto do_error; *timeo_p = current_timeo; } out: - *orig_sk = sk; finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. */ From 0d9bcadb2226150776c5bf84ef6eb5df03e32b03 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 22 Jan 2018 16:06:37 -0500 Subject: [PATCH 125/131] vmxnet3: repair memory leak [ Upstream commit 848b159835ddef99cc4193083f7e786c3992f580 ] with the introduction of commit b0eb57cb97e7837ebb746404c2c58c6f536f23fa, it appears that rq->buf_info is improperly handled. While it is heap allocated when an rx queue is setup, and freed when torn down, an old line of code in vmxnet3_rq_destroy was not properly removed, leading to rq->buf_info[0] being set to NULL prior to its being freed, causing a memory leak, which eventually exhausts the system on repeated create/destroy operations (for example, when the mtu of a vmxnet3 interface is changed frequently. Fix is pretty straight forward, just move the NULL set to after the free. Tested by myself with successful results Applies to net, and should likely be queued for stable, please Signed-off-by: Neil Horman Reported-By: boyang@redhat.com CC: boyang@redhat.com CC: Shrikrishna Khare CC: "VMware, Inc." CC: David S. Miller Acked-by: Shrikrishna Khare Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/vmxnet3/vmxnet3_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 0cbf520cea77..82bf85ae5d08 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1563,7 +1563,6 @@ static void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq, rq->rx_ring[i].basePA); rq->rx_ring[i].base = NULL; } - rq->buf_info[i] = NULL; } if (rq->comp_ring.base) { @@ -1578,6 +1577,7 @@ static void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq, (rq->rx_ring[0].size + rq->rx_ring[1].size); dma_free_coherent(&adapter->pdev->dev, sz, rq->buf_info[0], rq->buf_info_pa); + rq->buf_info[0] = rq->buf_info[1] = NULL; } } From 29837a4a8764c1b73674eb78c99717cbc73aa9f3 Mon Sep 17 00:00:00 2001 From: Jim Westfall Date: Sun, 14 Jan 2018 04:18:50 -0800 Subject: [PATCH 126/131] net: Allow neigh contructor functions ability to modify the primary_key [ Upstream commit 096b9854c04df86f03b38a97d40b6506e5730919 ] Use n->primary_key instead of pkey to account for the possibility that a neigh constructor function may have modified the primary_key value. Signed-off-by: Jim Westfall Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/neighbour.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ae92131c4f89..253c86b78ff0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -496,7 +496,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) nht = neigh_hash_grow(tbl, nht->hash_shift + 1); - hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); + hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift); if (n->parms->dead) { rc = ERR_PTR(-EINVAL); @@ -508,7 +508,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, n1 != NULL; n1 = rcu_dereference_protected(n1->next, lockdep_is_held(&tbl->lock))) { - if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { + if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); rc = n1; From cab8451486a6fd9ba4ce798d41352f45abb44fae Mon Sep 17 00:00:00 2001 From: Jim Westfall Date: Sun, 14 Jan 2018 04:18:51 -0800 Subject: [PATCH 127/131] ipv4: Make neigh lookup keys for loopback/point-to-point devices be INADDR_ANY [ Upstream commit cd9ff4de0107c65d69d02253bb25d6db93c3dbc1 ] Map all lookup neigh keys to INADDR_ANY for loopback/point-to-point devices to avoid making an entry for every remote ip the device needs to talk to. This used the be the old behavior but became broken in a263b3093641f (ipv4: Make neigh lookups directly in output packet path) and later removed in 0bb4087cbec0 (ipv4: Fix neigh lookup keying over loopback/point-to-point devices) because it was broken. Signed-off-by: Jim Westfall Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/arp.h | 3 +++ net/ipv4/arp.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/net/arp.h b/include/net/arp.h index 5e0f891d476c..1b3f86981757 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -19,6 +19,9 @@ static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + key = INADDR_ANY; + return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 711b4dfa17c3..cb5eb649ad5f 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -223,11 +223,16 @@ static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) static int arp_constructor(struct neighbour *neigh) { - __be32 addr = *(__be32 *)neigh->primary_key; + __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; + u32 inaddr_any = INADDR_ANY; + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); + + addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { From d35cd5e279881ec36ff8cd82a2d9caebc0cce3fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Jan 2018 14:21:13 -0800 Subject: [PATCH 128/131] flow_dissector: properly cap thoff field [ Upstream commit d0c081b49137cd3200f2023c0875723be66e7ce5 ] syzbot reported yet another crash [1] that is caused by insufficient validation of DODGY packets. Two bugs are happening here to trigger the crash. 1) Flow dissection leaves with incorrect thoff field. 2) skb_probe_transport_header() sets transport header to this invalid thoff, even if pointing after skb valid data. 3) qdisc_pkt_len_init() reads out-of-bound data because it trusts tcp_hdrlen(skb) Possible fixes : - Full flow dissector validation before injecting bad DODGY packets in the stack. This approach was attempted here : https://patchwork.ozlabs.org/patch/ 861874/ - Have more robust functions in the core. This might be needed anyway for stable versions. This patch fixes the flow dissection issue. [1] CPU: 1 PID: 3144 Comm: syzkaller271204 Not tainted 4.15.0-rc4-mm1+ #49 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:355 [inline] kasan_report+0x23b/0x360 mm/kasan/report.c:413 __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:432 __tcp_hdrlen include/linux/tcp.h:35 [inline] tcp_hdrlen include/linux/tcp.h:40 [inline] qdisc_pkt_len_init net/core/dev.c:3160 [inline] __dev_queue_xmit+0x20d3/0x2200 net/core/dev.c:3465 dev_queue_xmit+0x17/0x20 net/core/dev.c:3554 packet_snd net/packet/af_packet.c:2943 [inline] packet_sendmsg+0x3ad5/0x60a0 net/packet/af_packet.c:2968 sock_sendmsg_nosec net/socket.c:628 [inline] sock_sendmsg+0xca/0x110 net/socket.c:638 sock_write_iter+0x31a/0x5d0 net/socket.c:907 call_write_iter include/linux/fs.h:1776 [inline] new_sync_write fs/read_write.c:469 [inline] __vfs_write+0x684/0x970 fs/read_write.c:482 vfs_write+0x189/0x510 fs/read_write.c:544 SYSC_write fs/read_write.c:589 [inline] SyS_write+0xef/0x220 fs/read_write.c:581 entry_SYSCALL_64_fastpath+0x1f/0x96 Fixes: 34fad54c2537 ("net: __skb_flow_dissect() must cap its return value") Fixes: a6e544b0a88b ("flow_dissector: Jump to exit code in __skb_flow_dissect") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Acked-by: Jason Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/flow_dissector.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ee9082792530..4d14908afaec 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -492,8 +492,8 @@ ip_proto_again: out_good: ret = true; - key_control->thoff = (u16)nhoff; out: + key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; @@ -501,7 +501,6 @@ out: out_bad: ret = false; - key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); goto out; } EXPORT_SYMBOL(__skb_flow_dissect); From edaafa805e0f9d09560a4892790b8e19cab8bf09 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 18 Jan 2018 16:14:26 -0500 Subject: [PATCH 129/131] net: tcp: close sock if net namespace is exiting [ Upstream commit 4ee806d51176ba7b8ff1efd81f271d7252e03a1d ] When a tcp socket is closed, if it detects that its net namespace is exiting, close immediately and do not wait for FIN sequence. For normal sockets, a reference is taken to their net namespace, so it will never exit while the socket is open. However, kernel sockets do not take a reference to their net namespace, so it may begin exiting while the kernel socket is still open. In this case if the kernel socket is a tcp socket, it will stay open trying to complete its close sequence. The sock's dst(s) hold a reference to their interface, which are all transferred to the namespace's loopback interface when the real interfaces are taken down. When the namespace tries to take down its loopback interface, it hangs waiting for all references to the loopback interface to release, which results in messages like: unregister_netdevice: waiting for lo to become free. Usage count = 1 These messages continue until the socket finally times out and closes. Since the net namespace cleanup holds the net_mutex while calling its registered pernet callbacks, any new net namespace initialization is blocked until the current net namespace finishes exiting. After this change, the tcp socket notices the exiting net namespace, and closes immediately, releasing its dst(s) and their reference to the loopback interface, which lets the net namespace continue exiting. Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811 Signed-off-by: Dan Streetman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/net_namespace.h | 10 ++++++++++ net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_timer.c | 15 +++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 2dcea635ecce..93328c61934a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -209,6 +209,11 @@ int net_eq(const struct net *net1, const struct net *net2) return net1 == net2; } +static inline int check_net(const struct net *net) +{ + return atomic_read(&net->count) != 0; +} + void net_drop_ns(void *); #else @@ -233,6 +238,11 @@ int net_eq(const struct net *net1, const struct net *net2) return 1; } +static inline int check_net(const struct net *net) +{ + return 1; +} + #define net_drop_ns NULL #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5597120c8ffd..37e8966a457b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2176,6 +2176,9 @@ adjudge_to_death: tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + } else if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_set_state(sk, TCP_CLOSE); } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 1ec12a4f327e..35f638cfc675 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -46,11 +46,19 @@ static void tcp_write_err(struct sock *sk) * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * + * Also close if our net namespace is exiting; in that case there is no + * hope of ever communicating again since all netns interfaces are already + * down (or about to be down), and we need to release our dst references, + * which have been moved to the netns loopback interface, so the namespace + * can finish exiting. This condition is only possible if we are a kernel + * socket, as those do not hold references to the namespace. + * * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. * 2. If we have strong memory pressure. + * 3. If our net namespace is exiting. */ static int tcp_out_of_resources(struct sock *sk, bool do_reset) { @@ -79,6 +87,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } + + if (!check_net(sock_net(sk))) { + /* Not possible to send reset; just close */ + tcp_done(sk); + return 1; + } + return 0; } From 3f84339bd344b2cf0afe64b78d3964bb6422d0f3 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 22 Jan 2018 20:11:06 +0000 Subject: [PATCH 130/131] nfsd: auth: Fix gid sorting when rootsquash enabled commit 1995266727fa8143897e89b55f5d3c79aa828420 upstream. Commit bdcf0a423ea1 ("kernel: make groups_sort calling a responsibility group_info allocators") appears to break nfsd rootsquash in a pretty major way. It adds a call to groups_sort() inside the loop that copies/squashes gids, which means the valid gids are sorted along with the following garbage. The net result is that the highest numbered valid gids are replaced with any lower-valued garbage gids, possibly including 0. We should sort only once, after filling in all the gids. Fixes: bdcf0a423ea1 ("kernel: make groups_sort calling a responsibility ...") Signed-off-by: Ben Hutchings Acked-by: J. Bruce Fields Signed-off-by: Linus Torvalds Cc: Wolfgang Walter Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/auth.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index a260060042ad..67eb154af881 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -60,9 +60,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else GROUP_AT(gi, i) = GROUP_AT(rqgi, i); - /* Each thread allocates its own gi, no race */ - groups_sort(gi); } + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } else { gi = get_group_info(rqgi); } From 49fe90b853dfb1087d0a734cd7f4af1aa00c8e53 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 31 Jan 2018 12:06:14 +0100 Subject: [PATCH 131/131] Linux 4.4.114 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 39019c9d205c..153440b1bbb0 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 113 +SUBLEVEL = 114 EXTRAVERSION = NAME = Blurry Fish Butt