diff --git a/include/linux/mm.h b/include/linux/mm.h
index dfefcfa1d6a4..5de4309bfa14 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1758,6 +1758,9 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 			void *buf, int len, int write);
 
 #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+extern wait_queue_head_t vma_users_wait;
+extern atomic_t vma_user_waiters;
+
 static inline void vm_write_begin(struct vm_area_struct *vma)
 {
         /*
diff --git a/mm/mmap.c b/mm/mmap.c
index fba57c628671..c3dfbfdb674a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -180,7 +180,17 @@ static void __free_vma(struct vm_area_struct *vma)
 #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
 void put_vma(struct vm_area_struct *vma)
 {
-	if (atomic_dec_and_test(&vma->vm_ref_count))
+	int ref_count = atomic_dec_return(&vma->vm_ref_count);
+
+	/*
+	 * Implicit smp_mb due to atomic_dec_return.
+	 *
+	 * If exactly one reference remains, wake up the mremap waiter
+	 * (if any); if the count dropped to zero or below, free the VMA.
+	 */
+	if (ref_count == 1 && unlikely(atomic_read(&vma_user_waiters) > 0))
+		wake_up(&vma_users_wait);
+	else if (ref_count <= 0)
 		__free_vma(vma);
 }
 #else
@@ -2421,8 +2431,22 @@ struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
 
 	read_lock(&mm->mm_rb_lock);
 	vma = __find_vma(mm, addr);
-	if (vma)
-		atomic_inc(&vma->vm_ref_count);
+
+	/*
+	 * If there is a concurrent fast mremap, bail out since the entire
+	 * PMD/PUD subtree may have been remapped.
+	 *
+	 * This is usually safe for conventional mremap since it takes the
+	 * PTE locks as does SPF. However fast mremap only takes the lock
+	 * at the PMD/PUD level which is ok as it is done with the mmap
+	 * write lock held. But SPF, as the term implies, forgoes taking
+	 * the mmap read lock, and it cannot take the PTL at the larger
+	 * PMD/PUD granularity either, since that would introduce huge
+	 * contention in the page fault path; so fall back to regular
+	 * fault handling.
+	 */
+	if (vma && !atomic_inc_unless_negative(&vma->vm_ref_count))
+		vma = NULL;
 	read_unlock(&mm->mm_rb_lock);
 
 	return vma;
diff --git a/mm/mremap.c b/mm/mremap.c
index 5a18cec23fa7..0763b83ef779 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -210,17 +210,74 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		drop_rmap_locks(vma);
 }
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+DECLARE_WAIT_QUEUE_HEAD(vma_users_wait);
+atomic_t vma_user_waiters = ATOMIC_INIT(0);
+
+static inline void wait_for_vma_users(struct vm_area_struct *vma)
+{
+	/*
+	 * Fast path: if ours is the only reference, swap the refcount to -1
+	 * so that get_vma() refuses new references from speculative faults.
+	 */
+	if (likely(atomic_cmpxchg(&vma->vm_ref_count, 1, -1) == 1))
+		return;
+
+	/* Let put_vma() know a waiter exists so that it issues a wake-up. */
+	atomic_inc(&vma_user_waiters);
+
+	/* A failed atomic_cmpxchg() implies no barrier; use an explicit one. */
+	smp_mb();
+
+	/*
+	 * Callers cannot handle failure, so sleep uninterruptibly until
+	 * there are no other users of this VMA.
+	 *
+	 * We don't need to worry about references from concurrent waiters,
+	 * since this is only used in the context of fast mremaps, with the
+	 * exclusive mmap write lock held.
+	 */
+	wait_event(vma_users_wait, atomic_cmpxchg(&vma->vm_ref_count, 1, -1) == 1);
+
+	atomic_dec(&vma_user_waiters);
+}
+
+
 /*
- * Speculative page fault handlers will not detect page table changes done
- * without ptl locking.
+ * Restore the VMA reference count to 1 after a fast mremap.
+ *
+ * This should only be called after a corresponding wait_for_vma_users().
  */
-#if defined(CONFIG_HAVE_MOVE_PMD) && !defined(CONFIG_SPECULATIVE_PAGE_FAULT)
+static inline void restore_vma_ref_count(struct vm_area_struct *vma)
+{
+	int old = atomic_cmpxchg(&vma->vm_ref_count, -1, 1);
+
+	/* Checked separately: VM_BUG_ON_VMA() is not evaluated if !DEBUG_VM. */
+	VM_BUG_ON_VMA(old != -1, vma);
+}
+#else	/* !CONFIG_SPECULATIVE_PAGE_FAULT */
+static inline void wait_for_vma_users(struct vm_area_struct *vma)
+{
+}	/* intentionally empty without CONFIG_SPECULATIVE_PAGE_FAULT */
+static inline void restore_vma_ref_count(struct vm_area_struct *vma)
+{
+}	/* intentionally empty without CONFIG_SPECULATIVE_PAGE_FAULT */
+#endif	/* CONFIG_SPECULATIVE_PAGE_FAULT */
+
+#ifdef CONFIG_HAVE_MOVE_PMD
 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
 {
 	spinlock_t *old_ptl, *new_ptl;
 	struct mm_struct *mm = vma->vm_mm;
 	pmd_t pmd;
+	bool ret;
+
+	/*
+	 * Wait for concurrent users, since these can potentially be
+	 * speculative page faults.
+	 */
+	wait_for_vma_users(vma);
 
 	/*
 	 * The destination pmd shouldn't be established, free_pgtables()
@@ -245,8 +302,10 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	 * One alternative might be to just unmap the target pmd at
 	 * this point, and verify that it really is empty. We'll see.
 	 */
-	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
-		return false;
+	if (WARN_ON_ONCE(!pmd_none(*new_pmd))) {
+		ret = false;
+		goto out;
+	}
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
@@ -270,7 +329,11 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		spin_unlock(new_ptl);
 	spin_unlock(old_ptl);
 
-	return true;
+	ret = true;
+
+out:
+	restore_vma_ref_count(vma);
+	return ret;
 }
 #else
 static inline bool move_normal_pmd(struct vm_area_struct *vma,
@@ -281,24 +344,29 @@ static inline bool move_normal_pmd(struct vm_area_struct *vma,
 }
 #endif
 
-/*
- * Speculative page fault handlers will not detect page table changes done
- * without ptl locking.
- */
-#if defined(CONFIG_HAVE_MOVE_PUD) && !defined(CONFIG_SPECULATIVE_PAGE_FAULT)
+#ifdef CONFIG_HAVE_MOVE_PUD
 static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
 {
 	spinlock_t *old_ptl, *new_ptl;
 	struct mm_struct *mm = vma->vm_mm;
 	pud_t pud;
+	bool ret;
+
+	/*
+	 * Wait for concurrent users, since these can potentially be
+	 * speculative page faults.
+	 */
+	wait_for_vma_users(vma);
 
 	/*
 	 * The destination pud shouldn't be established, free_pgtables()
 	 * should have released it.
 	 */
-	if (WARN_ON_ONCE(!pud_none(*new_pud)))
-		return false;
+	if (WARN_ON_ONCE(!pud_none(*new_pud))) {
+		ret = false;
+		goto out;
+	}
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
@@ -322,7 +390,11 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 		spin_unlock(new_ptl);
 	spin_unlock(old_ptl);
 
-	return true;
+	ret = true;
+
+out:
+	restore_vma_ref_count(vma);
+	return ret;
 }
 #else
 static inline bool move_normal_pud(struct vm_area_struct *vma,