From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Dec 2017 15:07:59 +0100 Subject: [PATCH] x86/mm: Use/Fix PCID to optimize user/kernel switches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CVE-2017-5754 We can use PCID to retain the TLBs across CR3 switches; including those now part of the user/kernel switch. This increases performance of kernel entry/exit at the cost of more expensive/complicated TLB flushing. Now that we have two address spaces, one for kernel and one for user space, we need two PCIDs per mm. We use the top PCID bit to indicate a user PCID (just like we use the PFN LSB for the PGD). Since we do TLB invalidation from kernel space, the existing code will only invalidate the kernel PCID, we augment that by marking the corresponding user PCID invalid, and upon switching back to userspace, use a flushing CR3 write for the switch. In order to access the user_pcid_flush_mask we use PER_CPU storage, which means the previously established SWAPGS vs CR3 ordering is now mandatory and required. Having to do this memory access does require additional registers, most sites have a functioning stack and we can spill one (RAX), sites without functional stack need to otherwise provide the second scratch register. Note: PCID is generally available on Intel Sandybridge and later CPUs. Note: Up until this point TLB flushing was broken in this series. Based-on-code-from: Dave Hansen Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar (backported from commit 6fd166aae78c0ab738d49bda653cbd9e3b1491cf) Signed-off-by: Andy Whitcroft Signed-off-by: Kleber Sacilotto de Souza (cherry picked from commit ac7471365d49c0a91d4b63453eb848cc19f17589) Signed-off-by: Fabian Grünbichler --- arch/x86/entry/calling.h | 72 ++++++++++++++++++----- arch/x86/include/asm/processor-flags.h | 5 ++ arch/x86/include/asm/tlbflush.h | 91 +++++++++++++++++++++++++---- arch/x86/include/uapi/asm/processor-flags.h | 7 ++- arch/x86/kernel/asm-offsets.c | 4 ++ arch/x86/mm/init.c | 2 +- arch/x86/mm/tlb.c | 1 + arch/x86/entry/entry_64.S | 9 +-- arch/x86/entry/entry_64_compat.S | 4 +- 9 files changed, 162 insertions(+), 33 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index bb56f5346ae8..ce5fb309926d 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -2,6 +2,9 @@ #include #include #include +#include +#include +#include /* @@ -190,17 +193,21 @@ For 32-bit we have the following conventions - kernel is built with #ifdef CONFIG_PAGE_TABLE_ISOLATION -/* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two halves: */ -#define PTI_SWITCH_MASK (1< #include #include +#include +#include static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { @@ -23,24 +25,54 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) /* There are 12 bits of space for ASIDS in CR3 */ #define CR3_HW_ASID_BITS 12 + /* * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for * user/kernel switches */ -#define PTI_CONSUMED_ASID_BITS 0 +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define PTI_CONSUMED_PCID_BITS 1 +#else +# define PTI_CONSUMED_PCID_BITS 0 +#endif + +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) -#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS) /* * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account * for them being zero-based. Another -1 is because ASID 0 is reserved for * use by non-PCID-aware users. */ -#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) + +/* + * 6 because 6 should be plenty and struct tlb_state will fit in two cache + * lines. + */ +#define TLB_NR_DYN_ASIDS 6 static inline u16 kern_pcid(u16 asid) { VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * Make sure that the dynamic ASID space does not confict with the + * bit we are using to switch between user and kernel ASIDs. + */ + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT)); + /* + * The ASID being passed in here should have respected the + * MAX_ASID_AVAILABLE and thus never have the switch bit set. + */ + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT)); +#endif + /* + * The dynamically-assigned ASIDs that get passed in are small + * (mm == NULL then we borrow a mm which may change during a - * task switch and therefore we must not be preempted while we write CR3 - * back: + * If current->mm == NULL then we borrow a mm which may change + * during a task switch and therefore we must not be preempted + * while we write CR3 back: */ preempt_disable(); native_write_cr3(__native_read_cr3()); @@ -290,7 +350,14 @@ static inline void __native_flush_tlb_global(void) */ static inline void __native_flush_tlb_single(unsigned long addr) { + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + invalidate_user_asid(loaded_mm_asid); } /* diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 39946d0a1d41..69077da3dbf1 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -77,7 +77,12 @@ #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ + +#define X86_CR3_PCID_BITS 12 +#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) + +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) /* * Intel CPU features in CR4 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 25b4832e9c28..87c3bafcef2c 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_XEN #include @@ -93,6 +94,9 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + /* TLB state for the entry code */ + OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); + /* Layout info for cpu_entry_area */ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index af75069fb116..caeb8a7bf0a4 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -855,7 +855,7 @@ void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 353f2f4e1d96..06f3854d0a4f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -106,6 +106,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) unsigned long new_mm_cr3; if (need_flush) { + invalidate_user_asid(new_asid); new_mm_cr3 = build_cr3(pgdir, new_asid); } else { new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 292ccc6ec48d..fb43f14ed299 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -22,7 +22,6 @@ #include #include #include -#include "calling.h" #include #include #include @@ -39,6 +38,8 @@ #include #include +#include "calling.h" + .code64 .section .entry.text, "ax" @@ -405,7 +406,7 @@ syscall_return_via_sysret: * We are on the trampoline stack. All regs except RDI are live. * We can do future final exit work right here. */ - SWITCH_TO_USER_CR3 scratch_reg=%rdi + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi popq %rdi popq %rsp @@ -743,7 +744,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) * We can do future final exit work right here. */ - SWITCH_TO_USER_CR3 scratch_reg=%rdi + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi /* Restore RDI. */ popq %rdi @@ -856,7 +857,7 @@ native_irq_return_ldt: */ orq PER_CPU_VAR(espfix_stack), %rax - SWITCH_TO_USER_CR3 scratch_reg=%rdi /* to user CR3 */ + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi SWAPGS /* to user GS */ popq %rdi /* Restore user RDI */ diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 43f856aeee67..973527e34887 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -274,9 +274,9 @@ sysret32_from_system_call: * switch until after after the last reference to the process * stack. * - * %r8 is zeroed before the sysret, thus safe to clobber. + * %r8/%r9 are zeroed before the sysret, thus safe to clobber. */ - SWITCH_TO_USER_CR3 scratch_reg=%r8 + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 xorq %r8, %r8 xorq %r9, %r9 -- 2.14.2