466 lines
16 KiB
Diff
466 lines
16 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: Andy Lutomirski <luto@kernel.org>
|
|
Date: Tue, 12 Dec 2017 07:56:45 -0800
|
|
Subject: [PATCH] x86/pti: Put the LDT in its own PGD if PTI is on
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
CVE-2017-5754
|
|
|
|
With PTI enabled, the LDT must be mapped in the usermode tables somewhere.
|
|
The LDT is per process, i.e. per mm.
|
|
|
|
An earlier approach mapped the LDT on context switch into a fixmap area,
|
|
but that's a big overhead and exhausted the fixmap space when NR_CPUS got
|
|
big.
|
|
|
|
Take advantage of the fact that there is an address space hole which
|
|
provides a completely unused pgd. Use this pgd to manage per-mm LDT
|
|
mappings.
|
|
|
|
This has a down side: the LDT isn't (currently) randomized, and an attack
|
|
that can write the LDT is instant root due to call gates (thanks, AMD, for
|
|
leaving call gates in AMD64 but designing them wrong so they're only useful
|
|
for exploits). This can be mitigated by making the LDT read-only or
|
|
randomizing the mapping, either of which is straightforward on top of this
|
|
patch.
|
|
|
|
This will significantly slow down LDT users, but that shouldn't matter for
|
|
important workloads -- the LDT is only used by DOSEMU(2), Wine, and very
|
|
old libc implementations.
|
|
|
|
[ tglx: Cleaned it up. ]
|
|
|
|
Signed-off-by: Andy Lutomirski <luto@kernel.org>
|
|
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
|
|
Cc: Borislav Petkov <bp@alien8.de>
|
|
Cc: Brian Gerst <brgerst@gmail.com>
|
|
Cc: Dave Hansen <dave.hansen@intel.com>
|
|
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
|
Cc: David Laight <David.Laight@aculab.com>
|
|
Cc: H. Peter Anvin <hpa@zytor.com>
|
|
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
|
|
Cc: Juergen Gross <jgross@suse.com>
|
|
Cc: Kees Cook <keescook@chromium.org>
|
|
Cc: Kirill A. Shutemov <kirill@shutemov.name>
|
|
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
|
Cc: Peter Zijlstra <peterz@infradead.org>
|
|
Signed-off-by: Ingo Molnar <mingo@kernel.org>
|
|
(cherry picked from commit f55f0501cbf65ec41cca5058513031b711730b1d)
|
|
Signed-off-by: Andy Whitcroft <apw@canonical.com>
|
|
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
|
|
(cherry picked from commit c250643846b45ea6782fb0cfcc15e8cd34744bc7)
|
|
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
|
|
---
|
|
Documentation/x86/x86_64/mm.txt | 3 +-
|
|
arch/x86/include/asm/mmu_context.h | 59 ++++++++++++--
|
|
arch/x86/include/asm/pgtable_64_types.h | 4 +
|
|
arch/x86/include/asm/processor.h | 23 ++++--
|
|
arch/x86/kernel/ldt.c | 139 +++++++++++++++++++++++++++++++-
|
|
arch/x86/mm/dump_pagetables.c | 9 +++
|
|
6 files changed, 220 insertions(+), 17 deletions(-)
|
|
|
|
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
|
|
index 496a1dbf139d..ad41b3813f0a 100644
|
|
--- a/Documentation/x86/x86_64/mm.txt
|
|
+++ b/Documentation/x86/x86_64/mm.txt
|
|
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
|
|
... unused hole ...
|
|
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
|
|
... unused hole ...
|
|
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
|
|
fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
|
|
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
|
|
... unused hole ...
|
|
@@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables:
|
|
hole caused by [56:63] sign extension
|
|
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
|
|
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
|
|
-ff90000000000000 - ff9fffffffffffff (=52 bits) hole
|
|
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
|
|
ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
|
|
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
|
|
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
|
|
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
|
|
index 89a01ad7e370..9e3546e1c0f4 100644
|
|
--- a/arch/x86/include/asm/mmu_context.h
|
|
+++ b/arch/x86/include/asm/mmu_context.h
|
|
@@ -49,10 +49,33 @@ struct ldt_struct {
|
|
* call gates. On native, we could merge the ldt_struct and LDT
|
|
* allocations, but it's not worth trying to optimize.
|
|
*/
|
|
- struct desc_struct *entries;
|
|
- unsigned int nr_entries;
|
|
+ struct desc_struct *entries;
|
|
+ unsigned int nr_entries;
|
|
+
|
|
+ /*
|
|
+ * If PTI is in use, then the entries array is not mapped while we're
|
|
+ * in user mode. The whole array will be aliased at the address
|
|
+ * given by ldt_slot_va(slot). We use two slots so that we can allocate
|
|
+ * and map, and enable a new LDT without invalidating the mapping
|
|
+ * of an older, still-in-use LDT.
|
|
+ *
|
|
+ * slot will be -1 if this LDT doesn't have an alias mapping.
|
|
+ */
|
|
+ int slot;
|
|
};
|
|
|
|
+/* This is a multiple of PAGE_SIZE. */
|
|
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
|
|
+
|
|
+static inline void *ldt_slot_va(int slot)
|
|
+{
|
|
+#ifdef CONFIG_X86_64
|
|
+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
|
|
+#else
|
|
+ BUG();
|
|
+#endif
|
|
+}
|
|
+
|
|
/*
|
|
* Used for LDT copy/destruction.
|
|
*/
|
|
@@ -63,6 +86,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
|
|
}
|
|
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
|
|
void destroy_context_ldt(struct mm_struct *mm);
|
|
+void ldt_arch_exit_mmap(struct mm_struct *mm);
|
|
#else /* CONFIG_MODIFY_LDT_SYSCALL */
|
|
static inline void init_new_context_ldt(struct mm_struct *mm) { }
|
|
static inline int ldt_dup_context(struct mm_struct *oldmm,
|
|
@@ -70,7 +94,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
|
|
{
|
|
return 0;
|
|
}
|
|
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
|
|
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
|
|
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
|
|
#endif
|
|
|
|
static inline void load_mm_ldt(struct mm_struct *mm)
|
|
@@ -95,10 +120,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
|
|
* that we can see.
|
|
*/
|
|
|
|
- if (unlikely(ldt))
|
|
- set_ldt(ldt->entries, ldt->nr_entries);
|
|
- else
|
|
+ if (unlikely(ldt)) {
|
|
+ if (static_cpu_has(X86_FEATURE_PTI)) {
|
|
+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
|
|
+ /*
|
|
+ * Whoops -- either the new LDT isn't mapped
|
|
+ * (if slot == -1) or is mapped into a bogus
|
|
+ * slot (if slot > 1).
|
|
+ */
|
|
+ clear_LDT();
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If page table isolation is enabled, ldt->entries
|
|
+ * will not be mapped in the userspace pagetables.
|
|
+ * Tell the CPU to access the LDT through the alias
|
|
+ * at ldt_slot_va(ldt->slot).
|
|
+ */
|
|
+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
|
|
+ } else {
|
|
+ set_ldt(ldt->entries, ldt->nr_entries);
|
|
+ }
|
|
+ } else {
|
|
clear_LDT();
|
|
+ }
|
|
#else
|
|
clear_LDT();
|
|
#endif
|
|
@@ -193,6 +239,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
|
|
static inline void arch_exit_mmap(struct mm_struct *mm)
|
|
{
|
|
paravirt_arch_exit_mmap(mm);
|
|
+ ldt_arch_exit_mmap(mm);
|
|
}
|
|
|
|
#ifdef CONFIG_X86_64
|
|
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
|
|
index 5932dead34ee..e8a809ee0bb6 100644
|
|
--- a/arch/x86/include/asm/pgtable_64_types.h
|
|
+++ b/arch/x86/include/asm/pgtable_64_types.h
|
|
@@ -81,10 +81,14 @@ typedef struct { pteval_t pte; } pte_t;
|
|
# define VMALLOC_SIZE_TB _AC(12800, UL)
|
|
# define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
|
|
# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
|
|
+# define LDT_PGD_ENTRY _AC(-112, UL)
|
|
+# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
|
|
#else
|
|
# define VMALLOC_SIZE_TB _AC(32, UL)
|
|
# define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
|
|
# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
|
|
+# define LDT_PGD_ENTRY _AC(-4, UL)
|
|
+# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
|
|
#endif
|
|
|
|
#ifdef CONFIG_RANDOMIZE_MEMORY
|
|
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
|
|
index 935d68609922..24503521c947 100644
|
|
--- a/arch/x86/include/asm/processor.h
|
|
+++ b/arch/x86/include/asm/processor.h
|
|
@@ -843,13 +843,22 @@ static inline void spin_lock_prefetch(const void *x)
|
|
|
|
#else
|
|
/*
|
|
- * User space process size. 47bits minus one guard page. The guard
|
|
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
|
|
- * the highest possible canonical userspace address, then that
|
|
- * syscall will enter the kernel with a non-canonical return
|
|
- * address, and SYSRET will explode dangerously. We avoid this
|
|
- * particular problem by preventing anything from being mapped
|
|
- * at the maximum canonical address.
|
|
+ * User space process size. This is the first address outside the user range.
|
|
+ * There are a few constraints that determine this:
|
|
+ *
|
|
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
|
|
+ * address, then that syscall will enter the kernel with a
|
|
+ * non-canonical return address, and SYSRET will explode dangerously.
|
|
+ * We avoid this particular problem by preventing anything executable
|
|
+ * from being mapped at the maximum canonical address.
|
|
+ *
|
|
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
|
|
+ * CPUs malfunction if they execute code from the highest canonical page.
|
|
+ * They'll speculate right off the end of the canonical space, and
|
|
+ * bad things happen. This is worked around in the same way as the
|
|
+ * Intel problem.
|
|
+ *
|
|
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
|
|
*/
|
|
#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
|
|
|
|
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
|
|
index 74a5aaf13f3c..eceaada581ff 100644
|
|
--- a/arch/x86/kernel/ldt.c
|
|
+++ b/arch/x86/kernel/ldt.c
|
|
@@ -23,6 +23,7 @@
|
|
#include <linux/uaccess.h>
|
|
|
|
#include <asm/ldt.h>
|
|
+#include <asm/tlb.h>
|
|
#include <asm/desc.h>
|
|
#include <asm/mmu_context.h>
|
|
#include <asm/syscalls.h>
|
|
@@ -50,13 +51,11 @@ static void refresh_ldt_segments(void)
|
|
static void flush_ldt(void *__mm)
|
|
{
|
|
struct mm_struct *mm = __mm;
|
|
- mm_context_t *pc;
|
|
|
|
if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
|
|
return;
|
|
|
|
- pc = &mm->context;
|
|
- set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
|
|
+ load_mm_ldt(mm);
|
|
|
|
refresh_ldt_segments();
|
|
}
|
|
@@ -93,10 +92,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
|
|
return NULL;
|
|
}
|
|
|
|
+ /* The new LDT isn't aliased for PTI yet. */
|
|
+ new_ldt->slot = -1;
|
|
+
|
|
new_ldt->nr_entries = num_entries;
|
|
return new_ldt;
|
|
}
|
|
|
|
+/*
|
|
+ * If PTI is enabled, this maps the LDT into the kernelmode and
|
|
+ * usermode tables for the given mm.
|
|
+ *
|
|
+ * There is no corresponding unmap function. Even if the LDT is freed, we
|
|
+ * leave the PTEs around until the slot is reused or the mm is destroyed.
|
|
+ * This is harmless: the LDT is always in ordinary memory, and no one will
|
|
+ * access the freed slot.
|
|
+ *
|
|
+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
|
|
+ * it useful, and the flush would slow down modify_ldt().
|
|
+ */
|
|
+static int
|
|
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
|
|
+{
|
|
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
|
+ bool is_vmalloc, had_top_level_entry;
|
|
+ unsigned long va;
|
|
+ spinlock_t *ptl;
|
|
+ pgd_t *pgd;
|
|
+ int i;
|
|
+
|
|
+ if (!static_cpu_has(X86_FEATURE_PTI))
|
|
+ return 0;
|
|
+
|
|
+ /*
|
|
+ * Any given ldt_struct should have map_ldt_struct() called at most
|
|
+ * once.
|
|
+ */
|
|
+ WARN_ON(ldt->slot != -1);
|
|
+
|
|
+ /*
|
|
+ * Did we already have the top level entry allocated? We can't
|
|
+ * use pgd_none() for this because it doesn't do anything on
|
|
+ * 4-level page table kernels.
|
|
+ */
|
|
+ pgd = pgd_offset(mm, LDT_BASE_ADDR);
|
|
+ had_top_level_entry = (pgd->pgd != 0);
|
|
+
|
|
+ is_vmalloc = is_vmalloc_addr(ldt->entries);
|
|
+
|
|
+ for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
|
|
+ unsigned long offset = i << PAGE_SHIFT;
|
|
+ const void *src = (char *)ldt->entries + offset;
|
|
+ unsigned long pfn;
|
|
+ pte_t pte, *ptep;
|
|
+
|
|
+ va = (unsigned long)ldt_slot_va(slot) + offset;
|
|
+ pfn = is_vmalloc ? vmalloc_to_pfn(src) :
|
|
+ page_to_pfn(virt_to_page(src));
|
|
+ /*
|
|
+ * Treat the PTI LDT range as a *userspace* range.
|
|
+ * get_locked_pte() will allocate all needed pagetables
|
|
+ * and account for them in this mm.
|
|
+ */
|
|
+ ptep = get_locked_pte(mm, va, &ptl);
|
|
+ if (!ptep)
|
|
+ return -ENOMEM;
|
|
+ pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
|
|
+ set_pte_at(mm, va, ptep, pte);
|
|
+ pte_unmap_unlock(ptep, ptl);
|
|
+ }
|
|
+
|
|
+ if (mm->context.ldt) {
|
|
+ /*
|
|
+ * We already had an LDT. The top-level entry should already
|
|
+ * have been allocated and synchronized with the usermode
|
|
+ * tables.
|
|
+ */
|
|
+ WARN_ON(!had_top_level_entry);
|
|
+ if (static_cpu_has(X86_FEATURE_PTI))
|
|
+ WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
|
|
+ } else {
|
|
+ /*
|
|
+ * This is the first time we're mapping an LDT for this process.
|
|
+ * Sync the pgd to the usermode tables.
|
|
+ */
|
|
+ WARN_ON(had_top_level_entry);
|
|
+ if (static_cpu_has(X86_FEATURE_PTI)) {
|
|
+ WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
|
|
+ set_pgd(kernel_to_user_pgdp(pgd), *pgd);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ va = (unsigned long)ldt_slot_va(slot);
|
|
+ flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
|
|
+
|
|
+ ldt->slot = slot;
|
|
+#endif
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void free_ldt_pgtables(struct mm_struct *mm)
|
|
+{
|
|
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
|
+ struct mmu_gather tlb;
|
|
+ unsigned long start = LDT_BASE_ADDR;
|
|
+ unsigned long end = start + (1UL << PGDIR_SHIFT);
|
|
+
|
|
+ if (!static_cpu_has(X86_FEATURE_PTI))
|
|
+ return;
|
|
+
|
|
+ tlb_gather_mmu(&tlb, mm, start, end);
|
|
+ free_pgd_range(&tlb, start, end, start, end);
|
|
+ tlb_finish_mmu(&tlb, start, end);
|
|
+#endif
|
|
+}
|
|
+
|
|
/* After calling this, the LDT is immutable. */
|
|
static void finalize_ldt_struct(struct ldt_struct *ldt)
|
|
{
|
|
@@ -155,6 +265,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
|
|
new_ldt->nr_entries * LDT_ENTRY_SIZE);
|
|
finalize_ldt_struct(new_ldt);
|
|
|
|
+ retval = map_ldt_struct(mm, new_ldt, 0);
|
|
+ if (retval) {
|
|
+ free_ldt_pgtables(mm);
|
|
+ free_ldt_struct(new_ldt);
|
|
+ goto out_unlock;
|
|
+ }
|
|
mm->context.ldt = new_ldt;
|
|
|
|
out_unlock:
|
|
@@ -173,6 +289,11 @@ void destroy_context_ldt(struct mm_struct *mm)
|
|
mm->context.ldt = NULL;
|
|
}
|
|
|
|
+void ldt_arch_exit_mmap(struct mm_struct *mm)
|
|
+{
|
|
+ free_ldt_pgtables(mm);
|
|
+}
|
|
+
|
|
static int read_ldt(void __user *ptr, unsigned long bytecount)
|
|
{
|
|
struct mm_struct *mm = current->mm;
|
|
@@ -286,6 +407,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
|
|
new_ldt->entries[ldt_info.entry_number] = ldt;
|
|
finalize_ldt_struct(new_ldt);
|
|
|
|
+ /*
|
|
+ * If we are using PTI, map the new LDT into the userspace pagetables.
|
|
+ * If there is already an LDT, use the other slot so that other CPUs
|
|
+ * will continue to use the old LDT until install_ldt() switches
|
|
+ * them over to the new LDT.
|
|
+ */
|
|
+ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
|
|
+ if (error) {
|
|
+ free_ldt_struct(old_ldt);
|
|
+ goto out_unlock;
|
|
+ }
|
|
+
|
|
install_ldt(mm, new_ldt);
|
|
free_ldt_struct(old_ldt);
|
|
error = 0;
|
|
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
|
|
index 3b7720404a9f..eed93dd4cb4a 100644
|
|
--- a/arch/x86/mm/dump_pagetables.c
|
|
+++ b/arch/x86/mm/dump_pagetables.c
|
|
@@ -52,11 +52,17 @@ enum address_markers_idx {
|
|
USER_SPACE_NR = 0,
|
|
KERNEL_SPACE_NR,
|
|
LOW_KERNEL_NR,
|
|
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
|
|
+ LDT_NR,
|
|
+#endif
|
|
VMALLOC_START_NR,
|
|
VMEMMAP_START_NR,
|
|
#ifdef CONFIG_KASAN
|
|
KASAN_SHADOW_START_NR,
|
|
KASAN_SHADOW_END_NR,
|
|
+#endif
|
|
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
|
|
+ LDT_NR,
|
|
#endif
|
|
CPU_ENTRY_AREA_NR,
|
|
#ifdef CONFIG_X86_ESPFIX64
|
|
@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
|
|
#ifdef CONFIG_KASAN
|
|
[KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
|
|
[KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
|
|
+#endif
|
|
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
|
+ [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
|
|
#endif
|
|
[CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
|
|
#ifdef CONFIG_X86_ESPFIX64
|
|
--
|
|
2.14.2
|
|
|