 dc6c9a35b6
			
		
	
	
	dc6c9a35b6
	
	
	
		
			
			Dave noticed that an unprivileged process can allocate a significant amount
of memory -- >500 MiB on x86_64 -- and stay unnoticed by the oom-killer and
the memory cgroup.  The trick is to allocate a lot of PMD page tables.  The
Linux kernel doesn't account PMD tables to the process, only PTE tables.
The test case below uses a few tricks to allocate a lot of PMD page tables
while keeping VmRSS and VmPTE low.  oom_score for the process will be 0.
	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/prctl.h>
	#define PUD_SIZE (1UL << 30)
	#define PMD_SIZE (1UL << 21)
	#define NR_PUD 130000
	/*
	 * Reproducer: map up to NR_PUD regions of PUD_SIZE bytes, each placed
	 * (as a hint) PUD_SIZE above the previous one.  For every region, touch
	 * the first byte, then unmap and re-map the first PMD_SIZE bytes.
	 *
	 * Prints the number of completed iterations converted to KiB, counting
	 * 4096 bytes per iteration, then sleeps in pause() so the process can
	 * be inspected.  Exits with status 1 if the re-map fails.
	 */
	int main(void)
	{
		char *addr = NULL;
		unsigned long i;

		/*
		 * prctl() is variadic: PR_SET_THP_DISABLE expects arg2 = 1 and
		 * arg3..arg5 = 0.  Leaving them out passes indeterminate values
		 * and the call may fail with EINVAL.  Failure is reported but not
		 * fatal, as the original ignored the result entirely.
		 */
		if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0))
			perror("prctl");
		for (i = 0; i < NR_PUD; i++) {
			addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
					MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
			if (addr == MAP_FAILED) {
				perror("mmap");
				break;
			}
			*addr = 'x';
			munmap(addr, PMD_SIZE);
			/*
			 * Test the value returned by this mmap() call; 'addr' still
			 * holds the earlier, successful mapping and can never be
			 * MAP_FAILED here.
			 */
			if (mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
					MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0)
					== MAP_FAILED) {
				perror("re-mmap");
				exit(1);
			}
		}
		printf("PID %d consumed %lu KiB in PMD page tables\n",
				getpid(), (i * 4096) >> 10);
		return pause();
	}
The patch addresses the issue by accounting PMD tables to the process the
same way we account PTE tables.
The main places where PMD tables are accounted are __pmd_alloc() and
free_pmd_range(). But there are a few corner cases:
 - HugeTLB can share PMD page tables. The patch handles this by accounting
   the table to all processes that share it.
 - x86 PAE pre-allocates a few PMD tables on fork.
 - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust the sanity
   check on exit(2).
Accounting only happens on configurations where the PMD page table level is
present (PMD is not folded).  As with nr_ptes, we use a per-mm counter.  The
counter value is used to calculate the baseline for the badness score by the
oom-killer.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: David Rientjes <rientjes@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
		
	
			
		
			
				
	
	
		
			240 lines
		
	
	
	
		
			6.5 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			240 lines
		
	
	
	
		
			6.5 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * mm/debug.c
 | |
|  *
 | |
|  * mm/ specific debug routines.
 | |
|  *
 | |
|  */
 | |
| 
 | |
| #include <linux/kernel.h>
 | |
| #include <linux/mm.h>
 | |
| #include <linux/ftrace_event.h>
 | |
| #include <linux/memcontrol.h>
 | |
| 
 | |
| static const struct trace_print_flags pageflag_names[] = {
 | |
| 	{1UL << PG_locked,		"locked"	},
 | |
| 	{1UL << PG_error,		"error"		},
 | |
| 	{1UL << PG_referenced,		"referenced"	},
 | |
| 	{1UL << PG_uptodate,		"uptodate"	},
 | |
| 	{1UL << PG_dirty,		"dirty"		},
 | |
| 	{1UL << PG_lru,			"lru"		},
 | |
| 	{1UL << PG_active,		"active"	},
 | |
| 	{1UL << PG_slab,		"slab"		},
 | |
| 	{1UL << PG_owner_priv_1,	"owner_priv_1"	},
 | |
| 	{1UL << PG_arch_1,		"arch_1"	},
 | |
| 	{1UL << PG_reserved,		"reserved"	},
 | |
| 	{1UL << PG_private,		"private"	},
 | |
| 	{1UL << PG_private_2,		"private_2"	},
 | |
| 	{1UL << PG_writeback,		"writeback"	},
 | |
| #ifdef CONFIG_PAGEFLAGS_EXTENDED
 | |
| 	{1UL << PG_head,		"head"		},
 | |
| 	{1UL << PG_tail,		"tail"		},
 | |
| #else
 | |
| 	{1UL << PG_compound,		"compound"	},
 | |
| #endif
 | |
| 	{1UL << PG_swapcache,		"swapcache"	},
 | |
| 	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
 | |
| 	{1UL << PG_reclaim,		"reclaim"	},
 | |
| 	{1UL << PG_swapbacked,		"swapbacked"	},
 | |
| 	{1UL << PG_unevictable,		"unevictable"	},
 | |
| #ifdef CONFIG_MMU
 | |
| 	{1UL << PG_mlocked,		"mlocked"	},
 | |
| #endif
 | |
| #ifdef CONFIG_ARCH_USES_PG_UNCACHED
 | |
| 	{1UL << PG_uncached,		"uncached"	},
 | |
| #endif
 | |
| #ifdef CONFIG_MEMORY_FAILURE
 | |
| 	{1UL << PG_hwpoison,		"hwpoison"	},
 | |
| #endif
 | |
| #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 | |
| 	{1UL << PG_compound_lock,	"compound_lock"	},
 | |
| #endif
 | |
| };
 | |
| 
 | |
| static void dump_flags(unsigned long flags,
 | |
| 			const struct trace_print_flags *names, int count)
 | |
| {
 | |
| 	const char *delim = "";
 | |
| 	unsigned long mask;
 | |
| 	int i;
 | |
| 
 | |
| 	pr_emerg("flags: %#lx(", flags);
 | |
| 
 | |
| 	/* remove zone id */
 | |
| 	flags &= (1UL << NR_PAGEFLAGS) - 1;
 | |
| 
 | |
| 	for (i = 0; i < count && flags; i++) {
 | |
| 
 | |
| 		mask = names[i].mask;
 | |
| 		if ((flags & mask) != mask)
 | |
| 			continue;
 | |
| 
 | |
| 		flags &= ~mask;
 | |
| 		pr_cont("%s%s", delim, names[i].name);
 | |
| 		delim = "|";
 | |
| 	}
 | |
| 
 | |
| 	/* check for left over flags */
 | |
| 	if (flags)
 | |
| 		pr_cont("%s%#lx", delim, flags);
 | |
| 
 | |
| 	pr_cont(")\n");
 | |
| }
 | |
| 
 | |
| void dump_page_badflags(struct page *page, const char *reason,
 | |
| 		unsigned long badflags)
 | |
| {
 | |
| 	pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
 | |
| 		  page, atomic_read(&page->_count), page_mapcount(page),
 | |
| 		  page->mapping, page->index);
 | |
| 	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
 | |
| 	dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
 | |
| 	if (reason)
 | |
| 		pr_alert("page dumped because: %s\n", reason);
 | |
| 	if (page->flags & badflags) {
 | |
| 		pr_alert("bad because of flags:\n");
 | |
| 		dump_flags(page->flags & badflags,
 | |
| 				pageflag_names, ARRAY_SIZE(pageflag_names));
 | |
| 	}
 | |
| #ifdef CONFIG_MEMCG
 | |
| 	if (page->mem_cgroup)
 | |
| 		pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
 | |
| #endif
 | |
| }
 | |
| 
 | |
/*
 * Dump @page together with an optional @reason, without marking any of
 * its flags as bad.
 */
void dump_page(struct page *page, const char *reason)
{
	const unsigned long no_badflags = 0;

	dump_page_badflags(page, reason, no_badflags);
}
EXPORT_SYMBOL(dump_page);
 | |
| 
 | |
| #ifdef CONFIG_DEBUG_VM
 | |
| 
 | |
| static const struct trace_print_flags vmaflags_names[] = {
 | |
| 	{VM_READ,			"read"		},
 | |
| 	{VM_WRITE,			"write"		},
 | |
| 	{VM_EXEC,			"exec"		},
 | |
| 	{VM_SHARED,			"shared"	},
 | |
| 	{VM_MAYREAD,			"mayread"	},
 | |
| 	{VM_MAYWRITE,			"maywrite"	},
 | |
| 	{VM_MAYEXEC,			"mayexec"	},
 | |
| 	{VM_MAYSHARE,			"mayshare"	},
 | |
| 	{VM_GROWSDOWN,			"growsdown"	},
 | |
| 	{VM_PFNMAP,			"pfnmap"	},
 | |
| 	{VM_DENYWRITE,			"denywrite"	},
 | |
| 	{VM_LOCKED,			"locked"	},
 | |
| 	{VM_IO,				"io"		},
 | |
| 	{VM_SEQ_READ,			"seqread"	},
 | |
| 	{VM_RAND_READ,			"randread"	},
 | |
| 	{VM_DONTCOPY,			"dontcopy"	},
 | |
| 	{VM_DONTEXPAND,			"dontexpand"	},
 | |
| 	{VM_ACCOUNT,			"account"	},
 | |
| 	{VM_NORESERVE,			"noreserve"	},
 | |
| 	{VM_HUGETLB,			"hugetlb"	},
 | |
| #if defined(CONFIG_X86)
 | |
| 	{VM_PAT,			"pat"		},
 | |
| #elif defined(CONFIG_PPC)
 | |
| 	{VM_SAO,			"sao"		},
 | |
| #elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
 | |
| 	{VM_GROWSUP,			"growsup"	},
 | |
| #elif !defined(CONFIG_MMU)
 | |
| 	{VM_MAPPED_COPY,		"mappedcopy"	},
 | |
| #else
 | |
| 	{VM_ARCH_1,			"arch_1"	},
 | |
| #endif
 | |
| 	{VM_DONTDUMP,			"dontdump"	},
 | |
| #ifdef CONFIG_MEM_SOFT_DIRTY
 | |
| 	{VM_SOFTDIRTY,			"softdirty"	},
 | |
| #endif
 | |
| 	{VM_MIXEDMAP,			"mixedmap"	},
 | |
| 	{VM_HUGEPAGE,			"hugepage"	},
 | |
| 	{VM_NOHUGEPAGE,			"nohugepage"	},
 | |
| 	{VM_MERGEABLE,			"mergeable"	},
 | |
| };
 | |
| 
 | |
| void dump_vma(const struct vm_area_struct *vma)
 | |
| {
 | |
| 	pr_emerg("vma %p start %p end %p\n"
 | |
| 		"next %p prev %p mm %p\n"
 | |
| 		"prot %lx anon_vma %p vm_ops %p\n"
 | |
| 		"pgoff %lx file %p private_data %p\n",
 | |
| 		vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
 | |
| 		vma->vm_prev, vma->vm_mm,
 | |
| 		(unsigned long)pgprot_val(vma->vm_page_prot),
 | |
| 		vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
 | |
| 		vma->vm_file, vma->vm_private_data);
 | |
| 	dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
 | |
| }
 | |
| EXPORT_SYMBOL(dump_vma);
 | |
| 
 | |
| void dump_mm(const struct mm_struct *mm)
 | |
| {
 | |
| 	pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n"
 | |
| #ifdef CONFIG_MMU
 | |
| 		"get_unmapped_area %p\n"
 | |
| #endif
 | |
| 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
 | |
| 		"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
 | |
| 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
 | |
| 		"pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
 | |
| 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
 | |
| 		"start_brk %lx brk %lx start_stack %lx\n"
 | |
| 		"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
 | |
| 		"binfmt %p flags %lx core_state %p\n"
 | |
| #ifdef CONFIG_AIO
 | |
| 		"ioctx_table %p\n"
 | |
| #endif
 | |
| #ifdef CONFIG_MEMCG
 | |
| 		"owner %p "
 | |
| #endif
 | |
| 		"exe_file %p\n"
 | |
| #ifdef CONFIG_MMU_NOTIFIER
 | |
| 		"mmu_notifier_mm %p\n"
 | |
| #endif
 | |
| #ifdef CONFIG_NUMA_BALANCING
 | |
| 		"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
 | |
| #endif
 | |
| #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
 | |
| 		"tlb_flush_pending %d\n"
 | |
| #endif
 | |
| 		"%s",	/* This is here to hold the comma */
 | |
| 
 | |
| 		mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
 | |
| #ifdef CONFIG_MMU
 | |
| 		mm->get_unmapped_area,
 | |
| #endif
 | |
| 		mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
 | |
| 		mm->pgd, atomic_read(&mm->mm_users),
 | |
| 		atomic_read(&mm->mm_count),
 | |
| 		atomic_long_read((atomic_long_t *)&mm->nr_ptes),
 | |
| 		mm_nr_pmds((struct mm_struct *)mm),
 | |
| 		mm->map_count,
 | |
| 		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
 | |
| 		mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
 | |
| 		mm->start_code, mm->end_code, mm->start_data, mm->end_data,
 | |
| 		mm->start_brk, mm->brk, mm->start_stack,
 | |
| 		mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
 | |
| 		mm->binfmt, mm->flags, mm->core_state,
 | |
| #ifdef CONFIG_AIO
 | |
| 		mm->ioctx_table,
 | |
| #endif
 | |
| #ifdef CONFIG_MEMCG
 | |
| 		mm->owner,
 | |
| #endif
 | |
| 		mm->exe_file,
 | |
| #ifdef CONFIG_MMU_NOTIFIER
 | |
| 		mm->mmu_notifier_mm,
 | |
| #endif
 | |
| #ifdef CONFIG_NUMA_BALANCING
 | |
| 		mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
 | |
| #endif
 | |
| #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
 | |
| 		mm->tlb_flush_pending,
 | |
| #endif
 | |
| 		""		/* This is here to not have a comma! */
 | |
| 		);
 | |
| 
 | |
| 		dump_flags(mm->def_flags, vmaflags_names,
 | |
| 				ARRAY_SIZE(vmaflags_names));
 | |
| }
 | |
| 
 | |
| #endif		/* CONFIG_DEBUG_VM */
 |