From 134c1aae4311b6fb9823722647ab9c4ab554a152 Mon Sep 17 00:00:00 2001
From: Kalesh Singh <kaleshsingh@google.com>
Date: Mon, 19 Dec 2022 21:07:49 -0800
Subject: [PATCH] ANDROID: Make SPF aware of fast mremaps

SPF attempts page faults without taking the mmap lock, but takes the
PTL. If there is a concurrent fast mremap (at PMD/PUD level), this
can lead to a UAF as fast mremap will only take the PTL locks at the
PMD/PUD level. SPF cannot take the PTL locks at the larger subtree
granularity since this introduces much contention in the page fault
paths.

To address the race:
  1) Fast mremaps wait until there are no users of the VMA.
  2) Speculative faults detect ongoing fast mremaps and fallback
    to conventional fault handling (taking mmap read lock).

Since this race condition is very rare the performance impact is
negligible.

Bug: 263177905
Change-Id: If9755aa4261337fe180e3093a3cefaae8ac9ff1a
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
---
 include/linux/mm.h |   3 ++
 mm/mmap.c          |  30 ++++++++++++--
 mm/mremap.c        | 100 ++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 116 insertions(+), 17 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dfefcfa1d6a4..5de4309bfa14 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1758,6 +1758,9 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 			void *buf, int len, int write);
 
 #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+extern wait_queue_head_t vma_users_wait;
+extern atomic_t vma_user_waiters;
+
 static inline void vm_write_begin(struct vm_area_struct *vma)
 {
         /*
diff --git a/mm/mmap.c b/mm/mmap.c
index fba57c628671..c3dfbfdb674a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -180,7 +180,17 @@ static void __free_vma(struct vm_area_struct *vma)
 #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
 void put_vma(struct vm_area_struct *vma)
 {
-	if (atomic_dec_and_test(&vma->vm_ref_count))
+	int ref_count = atomic_dec_return(&vma->vm_ref_count);
+
+	/*
+	 * Implicit smp_mb due to atomic_dec_return.
+	 *
+	 * If this is the last reference, wake up the mremap waiter
+	 * (if any).
+	 */
+	if (ref_count == 1 && unlikely(atomic_read(&vma_user_waiters) > 0))
+		wake_up(&vma_users_wait);
+	else if (ref_count <= 0)
 		__free_vma(vma);
 }
 #else
@@ -2421,8 +2431,22 @@ struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
 
 	read_lock(&mm->mm_rb_lock);
 	vma = __find_vma(mm, addr);
-	if (vma)
-		atomic_inc(&vma->vm_ref_count);
+
+	/*
+	 * If there is a concurrent fast mremap, bail out since the entire
+	 * PMD/PUD subtree may have been remapped.
+	 *
+	 * This is usually safe for conventional mremap since it takes the
+	 * PTE locks as does SPF. However fast mremap only takes the lock
+	 * at the PMD/PUD level which is ok as it is done with the mmap
+	 * write lock held. But since SPF, as the term implies forgoes,
+	 * taking the mmap read lock and also cannot take PTL lock at the
+	 * larger PMD/PUD granualrity, since it would introduce huge
+	 * contention in the page fault path; fall back to regular fault
+	 * handling.
+	 */
+	if (vma && !atomic_inc_unless_negative(&vma->vm_ref_count))
+		vma = NULL;
 	read_unlock(&mm->mm_rb_lock);
 
 	return vma;
diff --git a/mm/mremap.c b/mm/mremap.c
index 5a18cec23fa7..0763b83ef779 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -210,17 +210,74 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		drop_rmap_locks(vma);
 }
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+DECLARE_WAIT_QUEUE_HEAD(vma_users_wait);
+atomic_t vma_user_waiters = ATOMIC_INIT(0);
+
+static inline void wait_for_vma_users(struct vm_area_struct *vma)
+{
+	/*
+	 * If we have the only reference, swap the refcount to -1. This
+	 * will prevent other concurrent references by get_vma() for SPFs.
+	 */
+	if (likely(atomic_cmpxchg(&vma->vm_ref_count, 1, -1) == 1))
+		return;
+
+	/* Indicate we are waiting for other users of the VMA to finish. */
+	atomic_inc(&vma_user_waiters);
+
+	/* Failed atomic_cmpxchg; no implicit barrier, use an explicit one. */
+	smp_mb();
+
+	/*
+	 * Callers cannot handle failure, sleep uninterruptibly until there
+	 * are no other users of this VMA.
+	 *
+	 * We don't need to worry about references from concurrent waiters,
+	 * since this is only used in the context of fast mremaps, with
+	 * exclusive mmap write lock held.
+	 */
+	wait_event(vma_users_wait, atomic_cmpxchg(&vma->vm_ref_count, 1, -1) == 1);
+
+	atomic_dec(&vma_user_waiters);
+}
+
+
 /*
- * Speculative page fault handlers will not detect page table changes done
- * without ptl locking.
+ * Restore the VMA reference count to 1 after a fast mremap.
  */
-#if defined(CONFIG_HAVE_MOVE_PMD) && !defined(CONFIG_SPECULATIVE_PAGE_FAULT)
+static inline void restore_vma_ref_count(struct vm_area_struct *vma)
+{
+	/*
+	 * This should only be called after a corresponding,
+	 * wait_for_vma_users()
+	 */
+	VM_BUG_ON_VMA(atomic_cmpxchg(&vma->vm_ref_count, -1, 1) != -1,
+		      vma);
+}
+#else	/* !CONFIG_SPECULATIVE_PAGE_FAULT */
+static inline void wait_for_vma_users(struct vm_area_struct *vma)
+{
+}
+static inline void restore_vma_ref_count(struct vm_area_struct *vma)
+{
+}
+#endif	/* CONFIG_SPECULATIVE_PAGE_FAULT */
+
+#ifdef CONFIG_HAVE_MOVE_PMD
 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
 {
 	spinlock_t *old_ptl, *new_ptl;
 	struct mm_struct *mm = vma->vm_mm;
 	pmd_t pmd;
+	bool ret;
+
+	/*
+	 * Wait for concurrent users, since these can potentially be
+	 * speculative page faults.
+	 */
+	wait_for_vma_users(vma);
 
 	/*
 	 * The destination pmd shouldn't be established, free_pgtables()
@@ -245,8 +302,10 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	 * One alternative might be to just unmap the target pmd at
 	 * this point, and verify that it really is empty. We'll see.
 	 */
-	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
-		return false;
+	if (WARN_ON_ONCE(!pmd_none(*new_pmd))) {
+		ret = false;
+		goto out;
+	}
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
@@ -270,7 +329,11 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		spin_unlock(new_ptl);
 	spin_unlock(old_ptl);
 
-	return true;
+	ret = true;
+
+out:
+	restore_vma_ref_count(vma);
+	return ret;
 }
 #else
 static inline bool move_normal_pmd(struct vm_area_struct *vma,
@@ -281,24 +344,29 @@ static inline bool move_normal_pmd(struct vm_area_struct *vma,
 }
 #endif
 
-/*
- * Speculative page fault handlers will not detect page table changes done
- * without ptl locking.
- */
-#if defined(CONFIG_HAVE_MOVE_PUD) && !defined(CONFIG_SPECULATIVE_PAGE_FAULT)
+#ifdef CONFIG_HAVE_MOVE_PUD
 static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
 {
 	spinlock_t *old_ptl, *new_ptl;
 	struct mm_struct *mm = vma->vm_mm;
 	pud_t pud;
+	bool ret;
+
+	/*
+	 * Wait for concurrent users, since these can potentially be
+	 * speculative page faults.
+	 */
+	wait_for_vma_users(vma);
 
 	/*
 	 * The destination pud shouldn't be established, free_pgtables()
 	 * should have released it.
 	 */
-	if (WARN_ON_ONCE(!pud_none(*new_pud)))
-		return false;
+	if (WARN_ON_ONCE(!pud_none(*new_pud))) {
+		ret = false;
+		goto out;
+	}
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
@@ -322,7 +390,11 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 		spin_unlock(new_ptl);
 	spin_unlock(old_ptl);
 
-	return true;
+	ret = true;
+
+out:
+	restore_vma_ref_count(vma);
+	return ret;
 }
 #else
 static inline bool move_normal_pud(struct vm_area_struct *vma,