 ec8acf20af
			
		
	
	
	ec8acf20af
	
	
	
		
			
			swap_lock is heavily contended when I test swapping to 3 fast SSDs (it is even slightly slower than swapping to 2 such SSDs). The main contention comes from swap_info_get(). This patch tries to close the gap by adding a new per-partition lock. Global data like nr_swapfiles, total_swap_pages, least_priority and swap_list are still protected by swap_lock. nr_swap_pages is now an atomic, so it can be changed without holding swap_lock. In theory, it's possible that get_swap_page() finds no swap pages when there actually are free swap pages, but that does not sound like a big problem. Accessing partition-specific data (like scan_swap_map and so on) is protected only by swap_info_struct.lock. Changing swap_info_struct.flags requires holding both swap_lock and swap_info_struct.lock, because scan_swap_map() will check it. Reading the flags is OK with either of the locks held. If both swap_lock and swap_info_struct.lock must be held, we always take the former first to avoid deadlock. swap_entry_free() can change swap_list. To delete that code, we add a new highest_priority_index. Whenever get_swap_page() is called, we check it. If it's valid, we use it. It's a pity that get_swap_page() still holds swap_lock. But in practice, swap_lock isn't heavily contended in my test with this patch (or, put differently, there are other much heavier bottlenecks like TLB flush). And BTW, it looks like get_swap_page() doesn't really need the lock. We never free swap_info[] and we check the SWAP_WRITEOK flag. The only risk without the lock is that we could swap out to some low-priority swap, but we can quickly recover after several rounds of swap, so it doesn't sound like a big deal to me. But I'd prefer to fix this if it's a real problem. "swap: make each swap partition have one address_space" improved the swapout speed from 1.7G/s to 2G/s. This patch further improves the speed to 2.3G/s, so around 15% improvement. It's a multi-process test, so TLB flush isn't the biggest bottleneck before the patches. 
[arnd@arndb.de: fix it for nommu] [hughd@google.com: add missing unlock] [minchan@kernel.org: get rid of lockdep whinge on sys_swapon] Signed-off-by: Shaohua Li <shli@fusionio.com> Cc: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Seth Jennings <sjenning@linux.vnet.ibm.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Hugh Dickins <hughd@google.com> Signed-off-by: Minchan Kim <minchan@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
		
			
				
	
	
		
			426 lines
		
	
	
	
		
			11 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			426 lines
		
	
	
	
		
			11 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  *  linux/arch/sparc/mm/init.c
 | |
|  *
 | |
|  *  Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
 | |
|  *  Copyright (C) 1995 Eddie C. Dost (ecd@skynet.be)
 | |
|  *  Copyright (C) 1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 | |
|  *  Copyright (C) 2000 Anton Blanchard (anton@samba.org)
 | |
|  */
 | |
| 
 | |
| #include <linux/module.h>
 | |
| #include <linux/signal.h>
 | |
| #include <linux/sched.h>
 | |
| #include <linux/kernel.h>
 | |
| #include <linux/errno.h>
 | |
| #include <linux/string.h>
 | |
| #include <linux/types.h>
 | |
| #include <linux/ptrace.h>
 | |
| #include <linux/mman.h>
 | |
| #include <linux/mm.h>
 | |
| #include <linux/swap.h>
 | |
| #include <linux/initrd.h>
 | |
| #include <linux/init.h>
 | |
| #include <linux/highmem.h>
 | |
| #include <linux/bootmem.h>
 | |
| #include <linux/pagemap.h>
 | |
| #include <linux/poison.h>
 | |
| #include <linux/gfp.h>
 | |
| 
 | |
| #include <asm/sections.h>
 | |
| #include <asm/page.h>
 | |
| #include <asm/pgtable.h>
 | |
| #include <asm/vaddrs.h>
 | |
| #include <asm/pgalloc.h>	/* bug in asm-generic/tlb.h: check_pgt_cache */
 | |
| #include <asm/tlb.h>
 | |
| #include <asm/prom.h>
 | |
| #include <asm/leon.h>
 | |
| 
 | |
| unsigned long *sparc_valid_addr_bitmap;	/* allocated and zeroed in mem_init(); taint_real_pages() sets bit (addr >> 20) */
 | |
| EXPORT_SYMBOL(sparc_valid_addr_bitmap);
 | |
| 
 | |
| unsigned long phys_base;	/* physical address where bootmem_init() starts the kernel-image reservation; set outside this file */
 | |
| EXPORT_SYMBOL(phys_base);
 | |
| 
 | |
| unsigned long pfn_base;	/* first pfn handed to init_bootmem_node(); presumably phys_base >> PAGE_SHIFT - TODO confirm */
 | |
| EXPORT_SYMBOL(pfn_base);
 | |
| 
 | |
| struct sparc_phys_banks sp_banks[SPARC_PHYS_BANKS+1];	/* every walker stops at the first entry with num_bytes == 0 */
 | |
| 
 | |
| /* Initial ramdisk setup */
 | |
| extern unsigned int sparc_ramdisk_image;	/* non-zero when a ramdisk is present; see bootmem_init() */
 | |
| extern unsigned int sparc_ramdisk_size;	/* added to initrd_start to obtain initrd_end */
 | |
| 
 | |
| unsigned long highstart_pfn, highend_pfn;	/* both assigned in bootmem_init() */
 | |
| 
 | |
| void show_mem(unsigned int filter)
 | |
| {
 | |
| 	printk("Mem-info:\n");
 | |
| 	show_free_areas(filter);
 | |
| 	printk("Free swap:       %6ldkB\n",
 | |
| 	       get_nr_swap_pages() << (PAGE_SHIFT-10));
 | |
| 	printk("%ld pages of RAM\n", totalram_pages);
 | |
| 	printk("%ld free pages\n", nr_free_pages());
 | |
| }
 | |
| 
 | |
| 
 | |
| extern unsigned long cmdline_memory_size;
 | |
| unsigned long last_valid_pfn;
 | |
| 
 | |
| unsigned long calc_highpages(void)
 | |
| {
 | |
| 	int i;
 | |
| 	int nr = 0;
 | |
| 
 | |
| 	for (i = 0; sp_banks[i].num_bytes != 0; i++) {
 | |
| 		unsigned long start_pfn = sp_banks[i].base_addr >> PAGE_SHIFT;
 | |
| 		unsigned long end_pfn = (sp_banks[i].base_addr + sp_banks[i].num_bytes) >> PAGE_SHIFT;
 | |
| 
 | |
| 		if (end_pfn <= max_low_pfn)
 | |
| 			continue;
 | |
| 
 | |
| 		if (start_pfn < max_low_pfn)
 | |
| 			start_pfn = max_low_pfn;
 | |
| 
 | |
| 		nr += end_pfn - start_pfn;
 | |
| 	}
 | |
| 
 | |
| 	return nr;
 | |
| }
 | |
| 
 | |
| static unsigned long calc_max_low_pfn(void)
 | |
| {
 | |
| 	int i;
 | |
| 	unsigned long tmp = pfn_base + (SRMMU_MAXMEM >> PAGE_SHIFT);
 | |
| 	unsigned long curr_pfn, last_pfn;
 | |
| 
 | |
| 	last_pfn = (sp_banks[0].base_addr + sp_banks[0].num_bytes) >> PAGE_SHIFT;
 | |
| 	for (i = 1; sp_banks[i].num_bytes != 0; i++) {
 | |
| 		curr_pfn = sp_banks[i].base_addr >> PAGE_SHIFT;
 | |
| 
 | |
| 		if (curr_pfn >= tmp) {
 | |
| 			if (last_pfn < tmp)
 | |
| 				tmp = last_pfn;
 | |
| 			break;
 | |
| 		}
 | |
| 
 | |
| 		last_pfn = (sp_banks[i].base_addr + sp_banks[i].num_bytes) >> PAGE_SHIFT;
 | |
| 	}
 | |
| 
 | |
| 	return tmp;
 | |
| }
 | |
| 
 | |
| unsigned long __init bootmem_init(unsigned long *pages_avail)
 | |
| {
 | |
| 	unsigned long bootmap_size, start_pfn;
 | |
| 	unsigned long end_of_phys_memory = 0UL;
 | |
| 	unsigned long bootmap_pfn, bytes_avail, size;
 | |
| 	int i;
 | |
| 
 | |
| 	bytes_avail = 0UL;
 | |
| 	for (i = 0; sp_banks[i].num_bytes != 0; i++) {
 | |
| 		end_of_phys_memory = sp_banks[i].base_addr +
 | |
| 			sp_banks[i].num_bytes;
 | |
| 		bytes_avail += sp_banks[i].num_bytes;
 | |
| 		if (cmdline_memory_size) {
 | |
| 			if (bytes_avail > cmdline_memory_size) {
 | |
| 				unsigned long slack = bytes_avail - cmdline_memory_size;
 | |
| 
 | |
| 				bytes_avail -= slack;
 | |
| 				end_of_phys_memory -= slack;
 | |
| 
 | |
| 				sp_banks[i].num_bytes -= slack;
 | |
| 				if (sp_banks[i].num_bytes == 0) {
 | |
| 					sp_banks[i].base_addr = 0xdeadbeef;
 | |
| 				} else {
 | |
| 					sp_banks[i+1].num_bytes = 0;
 | |
| 					sp_banks[i+1].base_addr = 0xdeadbeef;
 | |
| 				}
 | |
| 				break;
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	/* Start with page aligned address of last symbol in kernel
 | |
| 	 * image.  
 | |
| 	 */
 | |
| 	start_pfn  = (unsigned long)__pa(PAGE_ALIGN((unsigned long) &_end));
 | |
| 
 | |
| 	/* Now shift down to get the real physical page frame number. */
 | |
| 	start_pfn >>= PAGE_SHIFT;
 | |
| 
 | |
| 	bootmap_pfn = start_pfn;
 | |
| 
 | |
| 	max_pfn = end_of_phys_memory >> PAGE_SHIFT;
 | |
| 
 | |
| 	max_low_pfn = max_pfn;
 | |
| 	highstart_pfn = highend_pfn = max_pfn;
 | |
| 
 | |
| 	if (max_low_pfn > pfn_base + (SRMMU_MAXMEM >> PAGE_SHIFT)) {
 | |
| 		highstart_pfn = pfn_base + (SRMMU_MAXMEM >> PAGE_SHIFT);
 | |
| 		max_low_pfn = calc_max_low_pfn();
 | |
| 		printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 | |
| 		    calc_highpages() >> (20 - PAGE_SHIFT));
 | |
| 	}
 | |
| 
 | |
| #ifdef CONFIG_BLK_DEV_INITRD
 | |
| 	/* Now have to check initial ramdisk, so that bootmap does not overwrite it */
 | |
| 	if (sparc_ramdisk_image) {
 | |
| 		if (sparc_ramdisk_image >= (unsigned long)&_end - 2 * PAGE_SIZE)
 | |
| 			sparc_ramdisk_image -= KERNBASE;
 | |
| 		initrd_start = sparc_ramdisk_image + phys_base;
 | |
| 		initrd_end = initrd_start + sparc_ramdisk_size;
 | |
| 		if (initrd_end > end_of_phys_memory) {
 | |
| 			printk(KERN_CRIT "initrd extends beyond end of memory "
 | |
| 		                 	 "(0x%016lx > 0x%016lx)\ndisabling initrd\n",
 | |
| 			       initrd_end, end_of_phys_memory);
 | |
| 			initrd_start = 0;
 | |
| 		}
 | |
| 		if (initrd_start) {
 | |
| 			if (initrd_start >= (start_pfn << PAGE_SHIFT) &&
 | |
| 			    initrd_start < (start_pfn << PAGE_SHIFT) + 2 * PAGE_SIZE)
 | |
| 				bootmap_pfn = PAGE_ALIGN (initrd_end) >> PAGE_SHIFT;
 | |
| 		}
 | |
| 	}
 | |
| #endif	
 | |
| 	/* Initialize the boot-time allocator. */
 | |
| 	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap_pfn, pfn_base,
 | |
| 					 max_low_pfn);
 | |
| 
 | |
| 	/* Now register the available physical memory with the
 | |
| 	 * allocator.
 | |
| 	 */
 | |
| 	*pages_avail = 0;
 | |
| 	for (i = 0; sp_banks[i].num_bytes != 0; i++) {
 | |
| 		unsigned long curr_pfn, last_pfn;
 | |
| 
 | |
| 		curr_pfn = sp_banks[i].base_addr >> PAGE_SHIFT;
 | |
| 		if (curr_pfn >= max_low_pfn)
 | |
| 			break;
 | |
| 
 | |
| 		last_pfn = (sp_banks[i].base_addr + sp_banks[i].num_bytes) >> PAGE_SHIFT;
 | |
| 		if (last_pfn > max_low_pfn)
 | |
| 			last_pfn = max_low_pfn;
 | |
| 
 | |
| 		/*
 | |
| 		 * .. finally, did all the rounding and playing
 | |
| 		 * around just make the area go away?
 | |
| 		 */
 | |
| 		if (last_pfn <= curr_pfn)
 | |
| 			continue;
 | |
| 
 | |
| 		size = (last_pfn - curr_pfn) << PAGE_SHIFT;
 | |
| 		*pages_avail += last_pfn - curr_pfn;
 | |
| 
 | |
| 		free_bootmem(sp_banks[i].base_addr, size);
 | |
| 	}
 | |
| 
 | |
| #ifdef CONFIG_BLK_DEV_INITRD
 | |
| 	if (initrd_start) {
 | |
| 		/* Reserve the initrd image area. */
 | |
| 		size = initrd_end - initrd_start;
 | |
| 		reserve_bootmem(initrd_start, size, BOOTMEM_DEFAULT);
 | |
| 		*pages_avail -= PAGE_ALIGN(size) >> PAGE_SHIFT;
 | |
| 
 | |
| 		initrd_start = (initrd_start - phys_base) + PAGE_OFFSET;
 | |
| 		initrd_end = (initrd_end - phys_base) + PAGE_OFFSET;		
 | |
| 	}
 | |
| #endif
 | |
| 	/* Reserve the kernel text/data/bss. */
 | |
| 	size = (start_pfn << PAGE_SHIFT) - phys_base;
 | |
| 	reserve_bootmem(phys_base, size, BOOTMEM_DEFAULT);
 | |
| 	*pages_avail -= PAGE_ALIGN(size) >> PAGE_SHIFT;
 | |
| 
 | |
| 	/* Reserve the bootmem map.   We do not account for it
 | |
| 	 * in pages_avail because we will release that memory
 | |
| 	 * in free_all_bootmem.
 | |
| 	 */
 | |
| 	size = bootmap_size;
 | |
| 	reserve_bootmem((bootmap_pfn << PAGE_SHIFT), size, BOOTMEM_DEFAULT);
 | |
| 	*pages_avail -= PAGE_ALIGN(size) >> PAGE_SHIFT;
 | |
| 
 | |
| 	return max_pfn;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * paging_init() sets up the page tables: We call the MMU specific
 | |
|  * init routine based upon the Sun model type on the Sparc.
 | |
|  *
 | |
|  */
 | |
| extern void srmmu_paging_init(void);
 | |
| extern void device_scan(void);
 | |
| 
 | |
| void __init paging_init(void)
 | |
| {
 | |
| 	srmmu_paging_init();
 | |
| 	prom_build_devicetree();
 | |
| 	of_fill_in_cpu_data();
 | |
| 	device_scan();
 | |
| }
 | |
| 
 | |
| static void __init taint_real_pages(void)
 | |
| {
 | |
| 	int i;
 | |
| 
 | |
| 	for (i = 0; sp_banks[i].num_bytes; i++) {
 | |
| 		unsigned long start, end;
 | |
| 
 | |
| 		start = sp_banks[i].base_addr;
 | |
| 		end = start + sp_banks[i].num_bytes;
 | |
| 
 | |
| 		while (start < end) {
 | |
| 			set_bit(start >> 20, sparc_valid_addr_bitmap);
 | |
| 			start += PAGE_SIZE;
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| static void map_high_region(unsigned long start_pfn, unsigned long end_pfn)
 | |
| {
 | |
| 	unsigned long tmp;
 | |
| 
 | |
| #ifdef CONFIG_DEBUG_HIGHMEM
 | |
| 	printk("mapping high region %08lx - %08lx\n", start_pfn, end_pfn);
 | |
| #endif
 | |
| 
 | |
| 	for (tmp = start_pfn; tmp < end_pfn; tmp++) {
 | |
| 		struct page *page = pfn_to_page(tmp);
 | |
| 
 | |
| 		ClearPageReserved(page);
 | |
| 		init_page_count(page);
 | |
| 		__free_page(page);
 | |
| 		totalhigh_pages++;
 | |
| 	}
 | |
| }
 | |
| 
 | |
| void __init mem_init(void)
 | |
| {
 | |
| 	int codepages = 0;
 | |
| 	int datapages = 0;
 | |
| 	int initpages = 0; 
 | |
| 	int reservedpages = 0;
 | |
| 	int i;
 | |
| 
 | |
| 	if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
 | |
| 		prom_printf("BUG: fixmap and pkmap areas overlap\n");
 | |
| 		prom_printf("pkbase: 0x%lx pkend: 0x%lx fixstart 0x%lx\n",
 | |
| 		       PKMAP_BASE,
 | |
| 		       (unsigned long)PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
 | |
| 		       FIXADDR_START);
 | |
| 		prom_printf("Please mail sparclinux@vger.kernel.org.\n");
 | |
| 		prom_halt();
 | |
| 	}
 | |
| 
 | |
| 
 | |
| 	/* Saves us work later. */
 | |
| 	memset((void *)&empty_zero_page, 0, PAGE_SIZE);
 | |
| 
 | |
| 	i = last_valid_pfn >> ((20 - PAGE_SHIFT) + 5);
 | |
| 	i += 1;
 | |
| 	sparc_valid_addr_bitmap = (unsigned long *)
 | |
| 		__alloc_bootmem(i << 2, SMP_CACHE_BYTES, 0UL);
 | |
| 
 | |
| 	if (sparc_valid_addr_bitmap == NULL) {
 | |
| 		prom_printf("mem_init: Cannot alloc valid_addr_bitmap.\n");
 | |
| 		prom_halt();
 | |
| 	}
 | |
| 	memset(sparc_valid_addr_bitmap, 0, i << 2);
 | |
| 
 | |
| 	taint_real_pages();
 | |
| 
 | |
| 	max_mapnr = last_valid_pfn - pfn_base;
 | |
| 	high_memory = __va(max_low_pfn << PAGE_SHIFT);
 | |
| 
 | |
| 	totalram_pages = free_all_bootmem();
 | |
| 
 | |
| 	for (i = 0; sp_banks[i].num_bytes != 0; i++) {
 | |
| 		unsigned long start_pfn = sp_banks[i].base_addr >> PAGE_SHIFT;
 | |
| 		unsigned long end_pfn = (sp_banks[i].base_addr + sp_banks[i].num_bytes) >> PAGE_SHIFT;
 | |
| 
 | |
| 		num_physpages += sp_banks[i].num_bytes >> PAGE_SHIFT;
 | |
| 
 | |
| 		if (end_pfn <= highstart_pfn)
 | |
| 			continue;
 | |
| 
 | |
| 		if (start_pfn < highstart_pfn)
 | |
| 			start_pfn = highstart_pfn;
 | |
| 
 | |
| 		map_high_region(start_pfn, end_pfn);
 | |
| 	}
 | |
| 	
 | |
| 	totalram_pages += totalhigh_pages;
 | |
| 
 | |
| 	codepages = (((unsigned long) &_etext) - ((unsigned long)&_start));
 | |
| 	codepages = PAGE_ALIGN(codepages) >> PAGE_SHIFT;
 | |
| 	datapages = (((unsigned long) &_edata) - ((unsigned long)&_etext));
 | |
| 	datapages = PAGE_ALIGN(datapages) >> PAGE_SHIFT;
 | |
| 	initpages = (((unsigned long) &__init_end) - ((unsigned long) &__init_begin));
 | |
| 	initpages = PAGE_ALIGN(initpages) >> PAGE_SHIFT;
 | |
| 
 | |
| 	/* Ignore memory holes for the purpose of counting reserved pages */
 | |
| 	for (i=0; i < max_low_pfn; i++)
 | |
| 		if (test_bit(i >> (20 - PAGE_SHIFT), sparc_valid_addr_bitmap)
 | |
| 		    && PageReserved(pfn_to_page(i)))
 | |
| 			reservedpages++;
 | |
| 
 | |
| 	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
 | |
| 	       nr_free_pages() << (PAGE_SHIFT-10),
 | |
| 	       num_physpages << (PAGE_SHIFT - 10),
 | |
| 	       codepages << (PAGE_SHIFT-10),
 | |
| 	       reservedpages << (PAGE_SHIFT - 10),
 | |
| 	       datapages << (PAGE_SHIFT-10), 
 | |
| 	       initpages << (PAGE_SHIFT-10),
 | |
| 	       totalhigh_pages << (PAGE_SHIFT-10));
 | |
| }
 | |
| 
 | |
| void free_initmem (void)
 | |
| {
 | |
| 	unsigned long addr;
 | |
| 	unsigned long freed;
 | |
| 
 | |
| 	addr = (unsigned long)(&__init_begin);
 | |
| 	freed = (unsigned long)(&__init_end) - addr;
 | |
| 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 | |
| 		struct page *p;
 | |
| 
 | |
| 		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 | |
| 		p = virt_to_page(addr);
 | |
| 
 | |
| 		ClearPageReserved(p);
 | |
| 		init_page_count(p);
 | |
| 		__free_page(p);
 | |
| 		totalram_pages++;
 | |
| 		num_physpages++;
 | |
| 	}
 | |
| 	printk(KERN_INFO "Freeing unused kernel memory: %ldk freed\n",
 | |
| 		freed >> 10);
 | |
| }
 | |
| 
 | |
| #ifdef CONFIG_BLK_DEV_INITRD
 | |
| void free_initrd_mem(unsigned long start, unsigned long end)
 | |
| {
 | |
| 	if (start < end)
 | |
| 		printk(KERN_INFO "Freeing initrd memory: %ldk freed\n",
 | |
| 			(end - start) >> 10);
 | |
| 	for (; start < end; start += PAGE_SIZE) {
 | |
| 		struct page *p;
 | |
| 
 | |
| 		memset((void *)start, POISON_FREE_INITMEM, PAGE_SIZE);
 | |
| 		p = virt_to_page(start);
 | |
| 
 | |
| 		ClearPageReserved(p);
 | |
| 		init_page_count(p);
 | |
| 		__free_page(p);
 | |
| 		totalram_pages++;
 | |
| 		num_physpages++;
 | |
| 	}
 | |
| }
 | |
| #endif
 | |
| 
 | |
/*
 * sparc_flush_page_to_ram() - flush a page through its kernel address.
 * @page: page to flush.
 *
 * Does nothing when page_address() yields 0 for the page; otherwise the
 * address is handed to __flush_page_to_ram().
 */
void sparc_flush_page_to_ram(struct page *page)
{
	unsigned long kaddr = (unsigned long)page_address(page);

	if (!kaddr)
		return;

	__flush_page_to_ram(kaddr);
}
 | |
| EXPORT_SYMBOL(sparc_flush_page_to_ram);
 |