When booting a kexec/kdump kernel on a system that has specific memory hotplug regions the boot will fail with warnings like: swapper/0: page allocation failure: order:9, mode:0x84d0 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.10.0-65.el7.x86_64 #1 Hardware name: QCI QSSC-S4R/QSSC-S4R, BIOS QSSC-S4R.QCI.01.00.S013.032920111005 03/29/2011 0000000000000000 ffff8800341bd8c8 ffffffff815bcc67 ffff8800341bd950 ffffffff8113b1a0 ffff880036339b00 0000000000000009 00000000000084d0 ffff8800341bd950 ffffffff815b87ee 0000000000000000 0000000000000200 Call Trace: [<ffffffff815bcc67>] dump_stack+0x19/0x1b [<ffffffff8113b1a0>] warn_alloc_failed+0xf0/0x160 [<ffffffff815b87ee>] ? __alloc_pages_direct_compact+0xac/0x196 [<ffffffff8113f14f>] __alloc_pages_nodemask+0x7ff/0xa00 [<ffffffff815b417c>] vmemmap_alloc_block+0x62/0xba [<ffffffff815b41e9>] vmemmap_alloc_block_buf+0x15/0x3b [<ffffffff815b1ff6>] vmemmap_populate+0xb4/0x21b [<ffffffff815b461d>] sparse_mem_map_populate+0x27/0x35 [<ffffffff815b400f>] sparse_add_one_section+0x7a/0x185 [<ffffffff815a1e9f>] __add_pages+0xaf/0x240 [<ffffffff81047359>] arch_add_memory+0x59/0xd0 [<ffffffff815a21d9>] add_memory+0xb9/0x1b0 [<ffffffff81333b9c>] acpi_memory_device_add+0x18d/0x26d [<ffffffff81309a01>] acpi_bus_device_attach+0x7d/0xcd [<ffffffff8132379d>] acpi_ns_walk_namespace+0xc8/0x17f [<ffffffff81309984>] ? acpi_bus_type_and_status+0x90/0x90 [<ffffffff81309984>] ? acpi_bus_type_and_status+0x90/0x90 [<ffffffff81323c8c>] acpi_walk_namespace+0x95/0xc5 [<ffffffff8130a6d6>] acpi_bus_scan+0x8b/0x9d [<ffffffff81a2019a>] acpi_scan_init+0x63/0x160 [<ffffffff81a1ffb5>] acpi_init+0x25d/0x2a6 [<ffffffff81a1fd58>] ? acpi_sleep_proc_init+0x2a/0x2a [<ffffffff810020e2>] do_one_initcall+0xe2/0x190 [<ffffffff819e20c4>] kernel_init_freeable+0x17c/0x207 [<ffffffff819e18d0>] ? do_early_param+0x88/0x88 [<ffffffff8159fea0>] ? rest_init+0x80/0x80 [<ffffffff8159feae>] kernel_init+0xe/0x180 [<ffffffff815cca2c>] ret_from_fork+0x7c/0xb0 [<ffffffff8159fea0>] ? 
rest_init+0x80/0x80 Mem-Info: Node 0 DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 Node 0 DMA32 per-cpu: CPU 0: hi: 42, btch: 7 usd: 0 active_anon:0 inactive_anon:0 isolated_anon:0 active_file:0 inactive_file:0 isolated_file:0 unevictable:0 dirty:0 writeback:0 unstable:0 free:872 slab_reclaimable:13 slab_unreclaimable:1880 mapped:0 shmem:0 pagetables:0 bounce:0 free_cma:0 because the system has run out of memory at boot time. This occurs because of the following sequence in the boot: Main kernel boots and sets E820 map. The second kernel is booted with a map generated by the kdump service using memmap= and memmap=exactmap. These parameters are added to the kernel parameters of the kexec/kdump kernel. The kexec/kdump kernel has limited memory resources so as not to severely impact the main kernel. The system then panics and the kdump/kexec kernel boots (which is a completely new kernel boot). During this boot ACPI is initialized and the kernel (as can be seen above) traverses the ACPI namespace and finds an entry for a memory device to be hotadded, i.e.: [<ffffffff815a1e9f>] __add_pages+0xaf/0x240 [<ffffffff81047359>] arch_add_memory+0x59/0xd0 [<ffffffff815a21d9>] add_memory+0xb9/0x1b0 [<ffffffff81333b9c>] acpi_memory_device_add+0x18d/0x26d [<ffffffff81309a01>] acpi_bus_device_attach+0x7d/0xcd [<ffffffff8132379d>] acpi_ns_walk_namespace+0xc8/0x17f [<ffffffff81309984>] ? acpi_bus_type_and_status+0x90/0x90 [<ffffffff81309984>] ? acpi_bus_type_and_status+0x90/0x90 [<ffffffff81323c8c>] acpi_walk_namespace+0x95/0xc5 [<ffffffff8130a6d6>] acpi_bus_scan+0x8b/0x9d [<ffffffff81a2019a>] acpi_scan_init+0x63/0x160 [<ffffffff81a1ffb5>] acpi_init+0x25d/0x2a6 At this point the kernel adds page table information and the kexec/kdump kernel runs out of memory. This can also be reproduced by using the memmap=exactmap and mem=X parameters on the main kernel and booting. This patchset resolves the problem by adding a kernel parameter, acpi_no_memhotplug, to disable ACPI memory hotplug. 
Signed-off-by: Prarit Bhargava <prarit@redhat.com> Acked-by: Vivek Goyal <vgoyal@redhat.com> Acked-by: Toshi Kani <toshi.kani@hp.com> Acked-by: David Rientjes <rientjes@google.com> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
		
			
				
	
	
		
			378 lines
		
	
	
	
		
			10 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			378 lines
		
	
	
	
		
			10 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
/*
 | 
						|
 * Copyright (C) 2004, 2013 Intel Corporation
 | 
						|
 * Author: Naveen B S <naveen.b.s@intel.com>
 | 
						|
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 | 
						|
 *
 | 
						|
 * All rights reserved.
 | 
						|
 *
 | 
						|
 * This program is free software; you can redistribute it and/or modify
 | 
						|
 * it under the terms of the GNU General Public License as published by
 | 
						|
 * the Free Software Foundation; either version 2 of the License, or (at
 | 
						|
 * your option) any later version.
 | 
						|
 *
 | 
						|
 * This program is distributed in the hope that it will be useful, but
 | 
						|
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 | 
						|
 * NON INFRINGEMENT.  See the GNU General Public License for more
 | 
						|
 * details.
 | 
						|
 *
 | 
						|
 * You should have received a copy of the GNU General Public License
 | 
						|
 * along with this program; if not, write to the Free Software
 | 
						|
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 | 
						|
 *
 | 
						|
 *
 | 
						|
 * ACPI based HotPlug driver that supports Memory Hotplug
 | 
						|
 * This driver fields notifications from firmware for memory add
 | 
						|
 * and remove operations and alerts the VM of the affected memory
 | 
						|
 * ranges.
 | 
						|
 */
 | 
						|
 | 
						|
#include <linux/acpi.h>
 | 
						|
#include <linux/memory.h>
 | 
						|
#include <linux/memory_hotplug.h>
 | 
						|
 | 
						|
#include "internal.h"
 | 
						|
 | 
						|
#define ACPI_MEMORY_DEVICE_CLASS		"memory"
 | 
						|
#define ACPI_MEMORY_DEVICE_HID			"PNP0C80"
 | 
						|
#define ACPI_MEMORY_DEVICE_NAME			"Hotplug Mem Device"
 | 
						|
 | 
						|
#define _COMPONENT		ACPI_MEMORY_DEVICE_COMPONENT
 | 
						|
 | 
						|
#undef PREFIX
 | 
						|
#define 	PREFIX		"ACPI:memory_hp:"
 | 
						|
 | 
						|
ACPI_MODULE_NAME("acpi_memhotplug");
 | 
						|
 | 
						|
/* Memory Device States */
 | 
						|
#define MEMORY_INVALID_STATE	0
 | 
						|
#define MEMORY_POWER_ON_STATE	1
 | 
						|
#define MEMORY_POWER_OFF_STATE	2
 | 
						|
 | 
						|
static int acpi_memory_device_add(struct acpi_device *device,
 | 
						|
				  const struct acpi_device_id *not_used);
 | 
						|
static void acpi_memory_device_remove(struct acpi_device *device);
 | 
						|
 | 
						|
static const struct acpi_device_id memory_device_ids[] = {
 | 
						|
	{ACPI_MEMORY_DEVICE_HID, 0},
 | 
						|
	{"", 0},
 | 
						|
};
 | 
						|
 | 
						|
static struct acpi_scan_handler memory_device_handler = {
 | 
						|
	.ids = memory_device_ids,
 | 
						|
	.attach = acpi_memory_device_add,
 | 
						|
	.detach = acpi_memory_device_remove,
 | 
						|
	.hotplug = {
 | 
						|
		.enabled = true,
 | 
						|
	},
 | 
						|
};
 | 
						|
 | 
						|
struct acpi_memory_info {
 | 
						|
	struct list_head list;
 | 
						|
	u64 start_addr;		/* Memory Range start physical addr */
 | 
						|
	u64 length;		/* Memory Range length */
 | 
						|
	unsigned short caching;	/* memory cache attribute */
 | 
						|
	unsigned short write_protect;	/* memory read/write attribute */
 | 
						|
	unsigned int enabled:1;
 | 
						|
};
 | 
						|
 | 
						|
struct acpi_memory_device {
 | 
						|
	struct acpi_device * device;
 | 
						|
	unsigned int state;	/* State of the memory device */
 | 
						|
	struct list_head res_list;
 | 
						|
};
 | 
						|
 | 
						|
static acpi_status
 | 
						|
acpi_memory_get_resource(struct acpi_resource *resource, void *context)
 | 
						|
{
 | 
						|
	struct acpi_memory_device *mem_device = context;
 | 
						|
	struct acpi_resource_address64 address64;
 | 
						|
	struct acpi_memory_info *info, *new;
 | 
						|
	acpi_status status;
 | 
						|
 | 
						|
	status = acpi_resource_to_address64(resource, &address64);
 | 
						|
	if (ACPI_FAILURE(status) ||
 | 
						|
	    (address64.resource_type != ACPI_MEMORY_RANGE))
 | 
						|
		return AE_OK;
 | 
						|
 | 
						|
	list_for_each_entry(info, &mem_device->res_list, list) {
 | 
						|
		/* Can we combine the resource range information? */
 | 
						|
		if ((info->caching == address64.info.mem.caching) &&
 | 
						|
		    (info->write_protect == address64.info.mem.write_protect) &&
 | 
						|
		    (info->start_addr + info->length == address64.minimum)) {
 | 
						|
			info->length += address64.address_length;
 | 
						|
			return AE_OK;
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL);
 | 
						|
	if (!new)
 | 
						|
		return AE_ERROR;
 | 
						|
 | 
						|
	INIT_LIST_HEAD(&new->list);
 | 
						|
	new->caching = address64.info.mem.caching;
 | 
						|
	new->write_protect = address64.info.mem.write_protect;
 | 
						|
	new->start_addr = address64.minimum;
 | 
						|
	new->length = address64.address_length;
 | 
						|
	list_add_tail(&new->list, &mem_device->res_list);
 | 
						|
 | 
						|
	return AE_OK;
 | 
						|
}
 | 
						|
 | 
						|
static void
 | 
						|
acpi_memory_free_device_resources(struct acpi_memory_device *mem_device)
 | 
						|
{
 | 
						|
	struct acpi_memory_info *info, *n;
 | 
						|
 | 
						|
	list_for_each_entry_safe(info, n, &mem_device->res_list, list)
 | 
						|
		kfree(info);
 | 
						|
	INIT_LIST_HEAD(&mem_device->res_list);
 | 
						|
}
 | 
						|
 | 
						|
static int
 | 
						|
acpi_memory_get_device_resources(struct acpi_memory_device *mem_device)
 | 
						|
{
 | 
						|
	acpi_status status;
 | 
						|
 | 
						|
	if (!list_empty(&mem_device->res_list))
 | 
						|
		return 0;
 | 
						|
 | 
						|
	status = acpi_walk_resources(mem_device->device->handle, METHOD_NAME__CRS,
 | 
						|
				     acpi_memory_get_resource, mem_device);
 | 
						|
	if (ACPI_FAILURE(status)) {
 | 
						|
		acpi_memory_free_device_resources(mem_device);
 | 
						|
		return -EINVAL;
 | 
						|
	}
 | 
						|
 | 
						|
	return 0;
 | 
						|
}
 | 
						|
 | 
						|
static int acpi_memory_check_device(struct acpi_memory_device *mem_device)
 | 
						|
{
 | 
						|
	unsigned long long current_status;
 | 
						|
 | 
						|
	/* Get device present/absent information from the _STA */
 | 
						|
	if (ACPI_FAILURE(acpi_evaluate_integer(mem_device->device->handle,
 | 
						|
					       METHOD_NAME__STA, NULL,
 | 
						|
					       ¤t_status)))
 | 
						|
		return -ENODEV;
 | 
						|
	/*
 | 
						|
	 * Check for device status. Device should be
 | 
						|
	 * present/enabled/functioning.
 | 
						|
	 */
 | 
						|
	if (!((current_status & ACPI_STA_DEVICE_PRESENT)
 | 
						|
	      && (current_status & ACPI_STA_DEVICE_ENABLED)
 | 
						|
	      && (current_status & ACPI_STA_DEVICE_FUNCTIONING)))
 | 
						|
		return -ENODEV;
 | 
						|
 | 
						|
	return 0;
 | 
						|
}
 | 
						|
 | 
						|
static unsigned long acpi_meminfo_start_pfn(struct acpi_memory_info *info)
 | 
						|
{
 | 
						|
	return PFN_DOWN(info->start_addr);
 | 
						|
}
 | 
						|
 | 
						|
static unsigned long acpi_meminfo_end_pfn(struct acpi_memory_info *info)
 | 
						|
{
 | 
						|
	return PFN_UP(info->start_addr + info->length-1);
 | 
						|
}
 | 
						|
 | 
						|
static int acpi_bind_memblk(struct memory_block *mem, void *arg)
 | 
						|
{
 | 
						|
	return acpi_bind_one(&mem->dev, arg);
 | 
						|
}
 | 
						|
 | 
						|
static int acpi_bind_memory_blocks(struct acpi_memory_info *info,
 | 
						|
				   struct acpi_device *adev)
 | 
						|
{
 | 
						|
	return walk_memory_range(acpi_meminfo_start_pfn(info),
 | 
						|
				 acpi_meminfo_end_pfn(info), adev,
 | 
						|
				 acpi_bind_memblk);
 | 
						|
}
 | 
						|
 | 
						|
static int acpi_unbind_memblk(struct memory_block *mem, void *arg)
 | 
						|
{
 | 
						|
	acpi_unbind_one(&mem->dev);
 | 
						|
	return 0;
 | 
						|
}
 | 
						|
 | 
						|
static void acpi_unbind_memory_blocks(struct acpi_memory_info *info)
 | 
						|
{
 | 
						|
	walk_memory_range(acpi_meminfo_start_pfn(info),
 | 
						|
			  acpi_meminfo_end_pfn(info), NULL, acpi_unbind_memblk);
 | 
						|
}
 | 
						|
 | 
						|
static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 | 
						|
{
 | 
						|
	acpi_handle handle = mem_device->device->handle;
 | 
						|
	int result, num_enabled = 0;
 | 
						|
	struct acpi_memory_info *info;
 | 
						|
	int node;
 | 
						|
 | 
						|
	node = acpi_get_node(handle);
 | 
						|
	/*
 | 
						|
	 * Tell the VM there is more memory here...
 | 
						|
	 * Note: Assume that this function returns zero on success
 | 
						|
	 * We don't have memory-hot-add rollback function,now.
 | 
						|
	 * (i.e. memory-hot-remove function)
 | 
						|
	 */
 | 
						|
	list_for_each_entry(info, &mem_device->res_list, list) {
 | 
						|
		if (info->enabled) { /* just sanity check...*/
 | 
						|
			num_enabled++;
 | 
						|
			continue;
 | 
						|
		}
 | 
						|
		/*
 | 
						|
		 * If the memory block size is zero, please ignore it.
 | 
						|
		 * Don't try to do the following memory hotplug flowchart.
 | 
						|
		 */
 | 
						|
		if (!info->length)
 | 
						|
			continue;
 | 
						|
		if (node < 0)
 | 
						|
			node = memory_add_physaddr_to_nid(info->start_addr);
 | 
						|
 | 
						|
		result = add_memory(node, info->start_addr, info->length);
 | 
						|
 | 
						|
		/*
 | 
						|
		 * If the memory block has been used by the kernel, add_memory()
 | 
						|
		 * returns -EEXIST. If add_memory() returns the other error, it
 | 
						|
		 * means that this memory block is not used by the kernel.
 | 
						|
		 */
 | 
						|
		if (result && result != -EEXIST)
 | 
						|
			continue;
 | 
						|
 | 
						|
		result = acpi_bind_memory_blocks(info, mem_device->device);
 | 
						|
		if (result) {
 | 
						|
			acpi_unbind_memory_blocks(info);
 | 
						|
			return -ENODEV;
 | 
						|
		}
 | 
						|
 | 
						|
		info->enabled = 1;
 | 
						|
 | 
						|
		/*
 | 
						|
		 * Add num_enable even if add_memory() returns -EEXIST, so the
 | 
						|
		 * device is bound to this driver.
 | 
						|
		 */
 | 
						|
		num_enabled++;
 | 
						|
	}
 | 
						|
	if (!num_enabled) {
 | 
						|
		dev_err(&mem_device->device->dev, "add_memory failed\n");
 | 
						|
		mem_device->state = MEMORY_INVALID_STATE;
 | 
						|
		return -EINVAL;
 | 
						|
	}
 | 
						|
	/*
 | 
						|
	 * Sometimes the memory device will contain several memory blocks.
 | 
						|
	 * When one memory block is hot-added to the system memory, it will
 | 
						|
	 * be regarded as a success.
 | 
						|
	 * Otherwise if the last memory block can't be hot-added to the system
 | 
						|
	 * memory, it will be failure and the memory device can't be bound with
 | 
						|
	 * driver.
 | 
						|
	 */
 | 
						|
	return 0;
 | 
						|
}
 | 
						|
 | 
						|
static void acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 | 
						|
{
 | 
						|
	acpi_handle handle = mem_device->device->handle;
 | 
						|
	struct acpi_memory_info *info, *n;
 | 
						|
	int nid = acpi_get_node(handle);
 | 
						|
 | 
						|
	list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
 | 
						|
		if (!info->enabled)
 | 
						|
			continue;
 | 
						|
 | 
						|
		if (nid == NUMA_NO_NODE)
 | 
						|
			nid = memory_add_physaddr_to_nid(info->start_addr);
 | 
						|
 | 
						|
		acpi_unbind_memory_blocks(info);
 | 
						|
		remove_memory(nid, info->start_addr, info->length);
 | 
						|
		list_del(&info->list);
 | 
						|
		kfree(info);
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
static void acpi_memory_device_free(struct acpi_memory_device *mem_device)
 | 
						|
{
 | 
						|
	if (!mem_device)
 | 
						|
		return;
 | 
						|
 | 
						|
	acpi_memory_free_device_resources(mem_device);
 | 
						|
	mem_device->device->driver_data = NULL;
 | 
						|
	kfree(mem_device);
 | 
						|
}
 | 
						|
 | 
						|
static int acpi_memory_device_add(struct acpi_device *device,
 | 
						|
				  const struct acpi_device_id *not_used)
 | 
						|
{
 | 
						|
	struct acpi_memory_device *mem_device;
 | 
						|
	int result;
 | 
						|
 | 
						|
	if (!device)
 | 
						|
		return -EINVAL;
 | 
						|
 | 
						|
	mem_device = kzalloc(sizeof(struct acpi_memory_device), GFP_KERNEL);
 | 
						|
	if (!mem_device)
 | 
						|
		return -ENOMEM;
 | 
						|
 | 
						|
	INIT_LIST_HEAD(&mem_device->res_list);
 | 
						|
	mem_device->device = device;
 | 
						|
	sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME);
 | 
						|
	sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS);
 | 
						|
	device->driver_data = mem_device;
 | 
						|
 | 
						|
	/* Get the range from the _CRS */
 | 
						|
	result = acpi_memory_get_device_resources(mem_device);
 | 
						|
	if (result) {
 | 
						|
		device->driver_data = NULL;
 | 
						|
		kfree(mem_device);
 | 
						|
		return result;
 | 
						|
	}
 | 
						|
 | 
						|
	/* Set the device state */
 | 
						|
	mem_device->state = MEMORY_POWER_ON_STATE;
 | 
						|
 | 
						|
	result = acpi_memory_check_device(mem_device);
 | 
						|
	if (result) {
 | 
						|
		acpi_memory_device_free(mem_device);
 | 
						|
		return 0;
 | 
						|
	}
 | 
						|
 | 
						|
	result = acpi_memory_enable_device(mem_device);
 | 
						|
	if (result) {
 | 
						|
		dev_err(&device->dev, "acpi_memory_enable_device() error\n");
 | 
						|
		acpi_memory_device_free(mem_device);
 | 
						|
		return result;
 | 
						|
	}
 | 
						|
 | 
						|
	dev_dbg(&device->dev, "Memory device configured by ACPI\n");
 | 
						|
	return 1;
 | 
						|
}
 | 
						|
 | 
						|
/*
 * Detach callback of memory_device_handler.
 *
 * Removes the enabled memory ranges of the device and frees the per-device
 * object.  Does nothing when @device is NULL or carries no driver data.
 */
static void acpi_memory_device_remove(struct acpi_device *device)
{
	struct acpi_memory_device *mem_device;

	if (!device)
		return;

	mem_device = acpi_driver_data(device);
	if (!mem_device)
		return;

	acpi_memory_remove_memory(mem_device);
	acpi_memory_device_free(mem_device);
}
 | 
						|
 | 
						|
static bool __initdata acpi_no_memhotplug;
 | 
						|
 | 
						|
void __init acpi_memory_hotplug_init(void)
 | 
						|
{
 | 
						|
	if (acpi_no_memhotplug)
 | 
						|
		return;
 | 
						|
 | 
						|
	acpi_scan_add_handler_with_hotplug(&memory_device_handler, "memory");
 | 
						|
}
 | 
						|
 | 
						|
static int __init disable_acpi_memory_hotplug(char *str)
 | 
						|
{
 | 
						|
	acpi_no_memhotplug = true;
 | 
						|
	return 1;
 | 
						|
}
 | 
						|
__setup("acpi_no_memhotplug", disable_acpi_memory_hotplug);
 |