powerpc: reorder per-cpu NUMA information's initialization
There is an issue currently where NUMA information is used on powerpc (and possibly ia64) before it has been read from the device-tree, which leads to large slab consumption with CONFIG_SLUB and memoryless nodes. NUMA powerpc non-boot CPU's cpu_to_node/cpu_to_mem is only accurate after start_secondary(), similar to ia64, which is invoked via smp_init(). Commit6ee0578b4d("workqueue: mark init_workqueues() as early_initcall()") made init_workqueues() be invoked via do_pre_smp_initcalls(), which is obviously before the secondary processors are online. Additionally, the following commits changed init_workqueues() to use cpu_to_node to determine the node to use for kthread_create_on_node:bce903809a("workqueue: add wq_numa_tbl_len and wq_numa_possible_cpumask[]")f3f90ad469("workqueue: determine NUMA node of workers accourding to the allowed cpumask") Therefore, when init_workqueues() runs, it sees all CPUs as being on Node 0. On LPARs or KVM guests where Node 0 is memoryless, this leads to a high number of slab deactivations (http://www.spinics.net/lists/linux-mm/msg67489.html). Fix this by initializing the powerpc-specific CPU<->node/local memory node mapping as early as possible, which on powerpc is do_init_bootmem(). Currently that function initializes the mapping for the boot CPU, but we extend it to setup the mapping for all possible CPUs. Then, in smp_prepare_cpus(), we can correspondingly set the per-cpu values for all possible CPUs. That ensures that before the early_initcalls run (and really as early as possible), the per-cpu NUMA mapping is accurate. While testing memoryless nodes on PowerKVM guests with a fix to the workqueue logic to use cpu_to_mem() instead of cpu_to_node(), with a guest topology of: available: 2 nodes (0-1) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 node 0 size: 0 MB node 0 free: 0 MB node 1 cpus: 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 node 1 size: 16336 MB node 1 free: 15329 MB node distances: node 0 1 0: 10 40 1: 40 10 the slab consumption decreases from Slab: 932416 kB SUnreclaim: 902336 kB to Slab: 395264 kB SUnreclaim: 359424 kB And we a corresponding increase in the slab efficiency from slab mem objs slabs used active active ------------------------------------------------------------ kmalloc-16384 337 MB 11.28% 100.00% task_struct 288 MB 9.93% 100.00% to slab mem objs slabs used active active ------------------------------------------------------------ kmalloc-16384 37 MB 100.00% 100.00% task_struct 31 MB 100.00% 100.00% Powerpc didn't support memoryless nodes until recently (64bb80d87f"powerpc/numa: Enable CONFIG_HAVE_MEMORYLESS_NODES" and8c27226119"powerpc/numa: Enable USE_PERCPU_NUMA_NODE_ID"). Those commits also helped improve memory consumption with these kind of environments. Signed-off-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
This commit is contained in:
		
					parent
					
						
							
								d658972284
							
						
					
				
			
			
				commit
				
					
						2fabf084b6
					
				
			
		
					 2 changed files with 15 additions and 9 deletions
				
			
		|  | @ -376,6 +376,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) | |||
| 					GFP_KERNEL, cpu_to_node(cpu)); | ||||
| 		zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), | ||||
| 					GFP_KERNEL, cpu_to_node(cpu)); | ||||
| 		/*
 | ||||
| 		 * numa_node_id() works after this. | ||||
| 		 */ | ||||
| 		set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]); | ||||
| 		set_cpu_numa_mem(cpu, local_memory_node(numa_cpu_lookup_table[cpu])); | ||||
| 	} | ||||
| 
 | ||||
| 	cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); | ||||
|  | @ -723,12 +728,6 @@ void start_secondary(void *unused) | |||
| 	} | ||||
| 	traverse_core_siblings(cpu, true); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * numa_node_id() works after this. | ||||
| 	 */ | ||||
| 	set_numa_node(numa_cpu_lookup_table[cpu]); | ||||
| 	set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); | ||||
| 
 | ||||
| 	smp_wmb(); | ||||
| 	notify_cpu_starting(cpu); | ||||
| 	set_cpu_online(cpu, true); | ||||
|  |  | |||
|  | @ -1049,7 +1049,7 @@ static void __init mark_reserved_regions_for_nid(int nid) | |||
| 
 | ||||
| void __init do_init_bootmem(void) | ||||
| { | ||||
| 	int nid; | ||||
| 	int nid, cpu; | ||||
| 
 | ||||
| 	min_low_pfn = 0; | ||||
| 	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; | ||||
|  | @ -1122,8 +1122,15 @@ void __init do_init_bootmem(void) | |||
| 
 | ||||
| 	reset_numa_cpu_lookup_table(); | ||||
| 	register_cpu_notifier(&ppc64_numa_nb); | ||||
| 	/*
 | ||||
| 	 * We need the numa_cpu_lookup_table to be accurate for all CPUs, | ||||
| 	 * even before we online them, so that we can use cpu_to_{node,mem} | ||||
| 	 * early in boot, cf. smp_prepare_cpus(). | ||||
| 	 */ | ||||
| 	for_each_possible_cpu(cpu) { | ||||
| 		cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, | ||||
| 			  (void *)(unsigned long)boot_cpuid); | ||||
| 				  (void *)(unsigned long)cpu); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| void __init paging_init(void) | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Nishanth Aravamudan
				Nishanth Aravamudan