| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /*
 | 
					
						
							|  |  |  |  * This file is subject to the terms and conditions of the GNU General Public | 
					
						
							|  |  |  |  * License.  See the file "COPYING" in the main directory of this archive | 
					
						
							|  |  |  |  * for more details. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * This file contains NUMA specific variables and functions which can | 
					
						
							|  |  |  |  * be split away from DISCONTIGMEM and are used on NUMA machines with | 
					
						
							|  |  |  |  * contiguous memory. | 
					
						
							|  |  |  |  *  | 
					
						
							|  |  |  |  *                         2002/08/07 Erich Focht <efocht@ess.nec.de> | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #include <linux/cpu.h>
 | 
					
						
							|  |  |  | #include <linux/kernel.h>
 | 
					
						
							|  |  |  | #include <linux/mm.h>
 | 
					
						
							|  |  |  | #include <linux/node.h>
 | 
					
						
							|  |  |  | #include <linux/init.h>
 | 
					
						
							|  |  |  | #include <linux/bootmem.h>
 | 
					
						
							| 
									
										
										
										
											2006-09-30 23:27:07 -07:00
										 |  |  | #include <linux/module.h>
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | #include <asm/mmzone.h>
 | 
					
						
							|  |  |  | #include <asm/numa.h>
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * The following structures are usually initialized by ACPI or | 
					
						
							|  |  |  |  * similar mechanisms and describe the NUMA characteristics of the machine. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | int num_node_memblks; | 
					
						
							|  |  |  | struct node_memblk_s node_memblk[NR_NODE_MEMBLKS]; | 
					
						
							| 
									
										
										
										
											2008-04-03 15:17:13 -05:00
										 |  |  | struct node_cpuid_s node_cpuid[NR_CPUS] = | 
					
						
							|  |  |  | 	{ [0 ... NR_CPUS-1] = { .phys_id = 0, .nid = NUMA_NO_NODE } }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /*
 | 
					
						
							|  |  |  |  * This is a matrix with "distances" between nodes, they should be | 
					
						
							|  |  |  |  * proportional to the memory access latency ratios. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | u8 numa_slit[MAX_NUMNODES * MAX_NUMNODES]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* Identify which cnode a physical address resides on */ | 
					
						
							|  |  |  | int | 
					
						
							|  |  |  | paddr_to_nid(unsigned long paddr) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	int	i; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	for (i = 0; i < num_node_memblks; i++) | 
					
						
							|  |  |  | 		if (paddr >= node_memblk[i].start_paddr && | 
					
						
							|  |  |  | 		    paddr < node_memblk[i].start_paddr + node_memblk[i].size) | 
					
						
							|  |  |  | 			break; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	return (i < num_node_memblks) ? node_memblk[i].nid : (num_node_memblks ? -1 : 0); | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2005-10-04 15:13:57 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | #if defined(CONFIG_SPARSEMEM) && defined(CONFIG_NUMA)
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * Because of holes evaluate on section limits. | 
					
						
							|  |  |  |  * If the section of memory exists, then return the node where the section | 
					
						
							|  |  |  |  * resides.  Otherwise return -1.  This is used by | 
					
						
							|  |  |  |  * SPARSEMEM to allocate the SPARSEMEM sectionmap on the NUMA node where | 
					
						
							|  |  |  |  * the section resides. | 
					
						
							|  |  |  |  */ | 
					
						
							| 
									
										
											  
											
												mm: clean up for early_pfn_to_nid()
What's happening is that the assertion in mm/page_alloc.c:move_freepages()
is triggering:
	BUG_ON(page_zone(start_page) != page_zone(end_page));
Once I knew this is what was happening, I added some annotations:
	if (unlikely(page_zone(start_page) != page_zone(end_page))) {
		printk(KERN_ERR "move_freepages: Bogus zones: "
		       "start_page[%p] end_page[%p] zone[%p]\n",
		       start_page, end_page, zone);
		printk(KERN_ERR "move_freepages: "
		       "start_zone[%p] end_zone[%p]\n",
		       page_zone(start_page), page_zone(end_page));
		printk(KERN_ERR "move_freepages: "
		       "start_pfn[0x%lx] end_pfn[0x%lx]\n",
		       page_to_pfn(start_page), page_to_pfn(end_page));
		printk(KERN_ERR "move_freepages: "
		       "start_nid[%d] end_nid[%d]\n",
		       page_to_nid(start_page), page_to_nid(end_page));
 ...
And here's what I got:
	move_freepages: Bogus zones: start_page[2207d0000] end_page[2207dffc0] zone[fffff8103effcb00]
	move_freepages: start_zone[fffff8103effcb00] end_zone[fffff8003fffeb00]
	move_freepages: start_pfn[0x81f600] end_pfn[0x81f7ff]
	move_freepages: start_nid[1] end_nid[0]
My memory layout on this box is:
[    0.000000] Zone PFN ranges:
[    0.000000]   Normal   0x00000000 -> 0x0081ff5d
[    0.000000] Movable zone start PFN for each node
[    0.000000] early_node_map[8] active PFN ranges
[    0.000000]     0: 0x00000000 -> 0x00020000
[    0.000000]     1: 0x00800000 -> 0x0081f7ff
[    0.000000]     1: 0x0081f800 -> 0x0081fe50
[    0.000000]     1: 0x0081fed1 -> 0x0081fed8
[    0.000000]     1: 0x0081feda -> 0x0081fedb
[    0.000000]     1: 0x0081fedd -> 0x0081fee5
[    0.000000]     1: 0x0081fee7 -> 0x0081ff51
[    0.000000]     1: 0x0081ff59 -> 0x0081ff5d
So it's a block move in that 0x81f600-->0x81f7ff region which triggers
the problem.
This patch:
Declarations of early_pfn_to_nid() are scattered over per-arch include
files, and it is complicated to know when each declaration is used.
 I think this makes the fix for memmap-init harder than it should be.
This patch moves all declarations to include/linux/mm.h
After this,
  if !CONFIG_NODES_POPULATES_NODE_MAP && !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
     -> Use static definition in include/linux/mm.h
  else if !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
     -> Use generic definition in mm/page_alloc.c
  else
     -> per-arch back end function will be called.
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reported-by: David Miller <davem@davemlloft.net>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: <stable@kernel.org>		[2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
											
										 
											2009-02-18 14:48:32 -08:00
										 |  |  | int __meminit __early_pfn_to_nid(unsigned long pfn) | 
					
						
							| 
									
										
										
										
											2005-10-04 15:13:57 -04:00
										 |  |  | { | 
					
						
							|  |  |  | 	int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	for (i = 0; i < num_node_memblks; i++) { | 
					
						
							|  |  |  | 		ssec = node_memblk[i].start_paddr >> PA_SECTION_SHIFT; | 
					
						
							|  |  |  | 		esec = (node_memblk[i].start_paddr + node_memblk[i].size + | 
					
						
							|  |  |  | 			((1L << PA_SECTION_SHIFT) - 1)) >> PA_SECTION_SHIFT; | 
					
						
							|  |  |  | 		if (section >= ssec && section < esec) | 
					
						
							|  |  |  | 			return node_memblk[i].nid; | 
					
						
							|  |  |  | 	} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-02-18 14:48:33 -08:00
										 |  |  | 	return -1; | 
					
						
							| 
									
										
										
										
											2005-10-04 15:13:57 -04:00
										 |  |  | } | 
					
						
							| 
									
										
										
										
											2006-09-30 23:27:07 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | #ifdef CONFIG_MEMORY_HOTPLUG
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  *  SRAT information is stored in node_memblk[], then we can use SRAT | 
					
						
							|  |  |  |  *  information at memory-hot-add if necessary. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | int memory_add_physaddr_to_nid(u64 addr) | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  | 	int nid = paddr_to_nid(addr); | 
					
						
							|  |  |  | 	if (nid < 0) | 
					
						
							|  |  |  | 		return 0; | 
					
						
							|  |  |  | 	return nid; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | 
					
						
							|  |  |  | #endif
 | 
					
						
							| 
									
										
										
										
											2005-10-04 15:13:57 -04:00
										 |  |  | #endif
 |