From adccea444c2df5660fff32fe75563075b7d237f7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 6 Jul 2012 07:09:42 -0400 Subject: [PATCH 001/135] cifs: always update the inode cache with the results from a FIND_* commit cd60042cc1392e79410dc8de9e9c1abb38a29e57 upstream. When we get back a FIND_FIRST/NEXT result, we have some info about the dentry that we use to instantiate a new inode. We were ignoring and discarding that info when we had an existing dentry in the cache. Fix this by updating the inode in place when we find an existing dentry and the uniqueid is the same. Reported-and-Tested-by: Andrew Bartlett Reported-by: Bill Robertson Reported-by: Dion Edwards Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/readdir.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 6751e745bbc6..c71032ba5b75 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -85,9 +85,12 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, dentry = d_lookup(parent, name); if (dentry) { - /* FIXME: check for inode number changes? */ - if (dentry->d_inode != NULL) + inode = dentry->d_inode; + /* update inode in place if i_ino didn't change */ + if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { + cifs_fattr_to_inode(inode, fattr); return dentry; + } d_drop(dentry); dput(dentry); } From dccecc646f06f06db8c32fc6615fee847852cec6 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:50 -0400 Subject: [PATCH 002/135] ntp: Fix STA_INS/DEL clearing bug commit 6b1859dba01c7d512b72d77e3fd7da8354235189 upstream. In commit 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d, I introduced a bug that kept the STA_INS or STA_DEL bit from being cleared from time_status via adjtimex() without forcing STA_PLL first. Usually once the STA_INS is set, it isn't cleared until the leap second is applied, so its unlikely this affected anyone. However during testing I noticed it took some effort to cancel a leap second once STA_INS was set. Signed-off-by: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/time/ntp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f1eb182b5fe0..61fc450643b2 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -375,7 +375,9 @@ int second_overflow(unsigned long secs) time_state = TIME_DEL; break; case TIME_INS: - if (secs % 86400 == 0) { + if (!(time_status & STA_INS)) + time_state = TIME_OK; + else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; time_tai++; @@ -384,7 +386,9 @@ int second_overflow(unsigned long secs) } break; case TIME_DEL: - if ((secs + 1) % 86400 == 0) { + if (!(time_status & STA_DEL)) + time_state = TIME_OK; + else if ((secs + 1) % 86400 == 0) { leap = 1; time_tai--; time_state = TIME_WAIT; From 6d40de834ce9bb2964a70b7f91a98406eceb0399 Mon Sep 17 00:00:00 2001 From: Aaditya Kumar Date: Tue, 17 Jul 2012 15:48:07 -0700 Subject: [PATCH 003/135] mm: fix lost kswapd wakeup in kswapd_stop() commit 1c7e7f6c0703d03af6bcd5ccc11fc15d23e5ecbe upstream. Offlining memory may block forever, waiting for kswapd() to wake up because kswapd() does not check the event kthread->should_stop before sleeping. The proper pattern, from Documentation/memory-barriers.txt, is: --- waker --- event_indicated = 1; wake_up_process(event_daemon); --- sleeper --- for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (event_indicated) break; schedule(); } set_current_state() may be wrapped by: prepare_to_wait(); In the kswapd() case, event_indicated is kthread->should_stop. === offlining memory (waker) === kswapd_stop() kthread_stop() kthread->should_stop = 1 wake_up_process() wait_for_completion() === kswapd_try_to_sleep (sleeper) === kswapd_try_to_sleep() prepare_to_wait() . . schedule() . . finish_wait() The schedule() needs to be protected by a test of kthread->should_stop, which is wrapped by kthread_should_stop(). Reproducer: Do heavy file I/O in background. Do a memory offline/online in a tight loop Signed-off-by: Aaditya Kumar Acked-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1b0ed361e5bf..130fa32441c0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2695,7 +2695,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * them before going back to sleep. */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); - schedule(); + + if (!kthread_should_stop()) + schedule(); + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); } else { if (remaining) From 689415c18f0ca810c48fe907ab4a349b40d66df0 Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 19 Jul 2012 09:11:14 +0200 Subject: [PATCH 004/135] MIPS: Properly align the .data..init_task section. commit 7b1c0d26a8e272787f0f9fcc5f3e8531df3b3409 upstream. Improper alignment can lead to unbootable systems and/or random crashes. [ralf@linux-mips.org: This is a lond standing bug since 6eb10bc9e2deab06630261cd05c4cb1e9a60e980 (kernel.org) rsp. c422a10917f75fd19fa7fe070aaaa23e384dae6f (lmo) [MIPS: Clean up linker script using new linker script macros.] so dates back to 2.6.32.] Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/3881/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/include/asm/thread_info.h | 4 ++-- arch/mips/kernel/vmlinux.lds.S | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index 97f8bf6639e7..adda036bba17 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -60,6 +60,8 @@ struct thread_info { register struct thread_info *__current_thread_info __asm__("$28"); #define current_thread_info() __current_thread_info +#endif /* !__ASSEMBLY__ */ + /* thread information allocation */ #if defined(CONFIG_PAGE_SIZE_4KB) && defined(CONFIG_32BIT) #define THREAD_SIZE_ORDER (1) @@ -97,8 +99,6 @@ register struct thread_info *__current_thread_info __asm__("$28"); #define free_thread_info(info) kfree(info) -#endif /* !__ASSEMBLY__ */ - #define PREEMPT_ACTIVE 0x10000000 /* diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index a81176f44c74..be281c69f6ca 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -1,5 +1,6 @@ #include #include +#include #include #undef mips @@ -73,7 +74,7 @@ SECTIONS .data : { /* Data */ . = . + DATAOFFSET; /* for CONFIG_MAPPED_KERNEL */ - INIT_TASK_DATA(PAGE_SIZE) + INIT_TASK_DATA(THREAD_SIZE) NOSAVE_DATA CACHELINE_ALIGNED_DATA(1 << CONFIG_MIPS_L1_CACHE_SHIFT) READ_MOSTLY_DATA(1 << CONFIG_MIPS_L1_CACHE_SHIFT) From cd050f56481c26e8f2a1d2fc89188d6c92537545 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sat, 14 Jul 2012 14:33:09 +0300 Subject: [PATCH 005/135] UBIFS: fix a bug in empty space fix-up commit c6727932cfdb13501108b16c38463c09d5ec7a74 upstream. UBIFS has a feature called "empty space fix-up" which is a quirk to work-around limitations of dumb flasher programs. Namely, of those flashers that are unable to skip NAND pages full of 0xFFs while flashing, resulting in empty space at the end of half-filled eraseblocks to be unusable for UBIFS. This feature is relatively new (introduced in v3.0). The fix-up routine (fixup_free_space()) is executed only once at the very first mount if the superblock has the 'space_fixup' flag set (can be done with -F option of mkfs.ubifs). It basically reads all the UBIFS data and metadata and writes it back to the same LEB. The routine assumes the image is pristine and does not have anything in the journal. There was a bug in 'fixup_free_space()' where it fixed up the log incorrectly. All but one LEB of the log of a pristine file-system are empty. And one contains just a commit start node. And 'fixup_free_space()' just unmapped this LEB, which resulted in wiping the commit start node. As a result, some users were unable to mount the file-system next time with the following symptom: UBIFS error (pid 1): replay_log_leb: first log node at LEB 3:0 is not CS node UBIFS error (pid 1): replay_log_leb: log error detected while replaying the log at LEB 3:0 The root-cause of this bug was that 'fixup_free_space()' wrongly assumed that the beginning of empty space in the log head (c->lhead_offs) was known on mount. However, it is not the case - it was always 0. UBIFS does not store in it the master node and finds out by scanning the log on every mount. The fix is simple - just pass commit start node size instead of 0 to 'fixup_leb()'. Signed-off-by: Artem Bityutskiy Reported-by: Iwo Mergler Tested-by: Iwo Mergler Reported-by: James Nute Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/sb.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index c606f010e8df..1250016bfb9f 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -715,8 +715,12 @@ static int fixup_free_space(struct ubifs_info *c) lnum = ubifs_next_log_lnum(c, lnum); } - /* Fixup the current log head */ - err = fixup_leb(c, c->lhead_lnum, c->lhead_offs); + /* + * Fixup the log head which contains the only a CS node at the + * beginning. + */ + err = fixup_leb(c, c->lhead_lnum, + ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size)); if (err) goto out; From fbb41f55c42a4f4e708c9e9af926dc6227a5b52d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 20 Jul 2012 14:25:03 +0100 Subject: [PATCH 006/135] dm raid1: fix crash with mirror recovery and discard commit 751f188dd5ab95b3f2b5f2f467c38aae5a2877eb upstream. This patch fixes a crash when a discard request is sent during mirror recovery. Firstly, some background. Generally, the following sequence happens during mirror synchronization: - function do_recovery is called - do_recovery calls dm_rh_recovery_prepare - dm_rh_recovery_prepare uses a semaphore to limit the number simultaneously recovered regions (by default the semaphore value is 1, so only one region at a time is recovered) - dm_rh_recovery_prepare calls __rh_recovery_prepare, __rh_recovery_prepare asks the log driver for the next region to recover. Then, it sets the region state to DM_RH_RECOVERING. If there are no pending I/Os on this region, the region is added to quiesced_regions list. If there are pending I/Os, the region is not added to any list. It is added to the quiesced_regions list later (by dm_rh_dec function) when all I/Os finish. - when the region is on quiesced_regions list, there are no I/Os in flight on this region. The region is popped from the list in dm_rh_recovery_start function. Then, a kcopyd job is started in the recover function. - when the kcopyd job finishes, recovery_complete is called. It calls dm_rh_recovery_end. dm_rh_recovery_end adds the region to recovered_regions or failed_recovered_regions list (depending on whether the copy operation was successful or not). The above mechanism assumes that if the region is in DM_RH_RECOVERING state, no new I/Os are started on this region. When I/O is started, dm_rh_inc_pending is called, which increases reg->pending count. When I/O is finished, dm_rh_dec is called. It decreases reg->pending count. If the count is zero and the region was in DM_RH_RECOVERING state, dm_rh_dec adds it to the quiesced_regions list. Consequently, if we call dm_rh_inc_pending/dm_rh_dec while the region is in DM_RH_RECOVERING state, it could be added to quiesced_regions list multiple times or it could be added to this list when kcopyd is copying data (it is assumed that the region is not on any list while kcopyd does its jobs). This results in memory corruption and crash. There already exist bypasses for REQ_FLUSH requests: REQ_FLUSH requests do not belong to any region, so they are always added to the sync list in do_writes. dm_rh_inc_pending does not increase count for REQ_FLUSH requests. In mirror_end_io, dm_rh_dec is never called for REQ_FLUSH requests. These bypasses avoid the crash possibility described above. These bypasses were improperly implemented for REQ_DISCARD when the mirror target gained discard support in commit 5fc2ffeabb9ee0fc0e71ff16b49f34f0ed3d05b4 (dm raid1: support discard). In do_writes, REQ_DISCARD requests is always added to the sync queue and immediately dispatched (even if the region is in DM_RH_RECOVERING). However, dm_rh_inc and dm_rh_dec is called for REQ_DISCARD resusts. So it violates the rule that no I/Os are started on DM_RH_RECOVERING regions, and causes the list corruption described above. This patch changes it so that REQ_DISCARD requests follow the same path as REQ_FLUSH. This avoids the crash. Reference: https://bugzilla.redhat.com/837607 Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm-raid1.c | 2 +- drivers/md/dm-region-hash.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9bfd057be686..42ef54f9260c 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1210,7 +1210,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, * We need to dec pending if this was a write. */ if (rw == WRITE) { - if (!(bio->bi_rw & REQ_FLUSH)) + if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) dm_rh_dec(ms->rh, map_context->ll); return error; } diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 7771ed212182..69732e03eb34 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -404,6 +404,9 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) return; } + if (bio->bi_rw & REQ_DISCARD) + return; + /* We must inform the log that the sync count has changed. */ log->type->set_region_sync(log, region, 0); @@ -524,7 +527,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) struct bio *bio; for (bio = bios->head; bio; bio = bio->bi_next) { - if (bio->bi_rw & REQ_FLUSH) + if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)) continue; rh_inc(rh, dm_rh_bio_to_region(rh, bio)); } From 9116bc4fb2f96d4a3190932bf17088174da04401 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Mon, 31 Oct 2011 17:09:46 -0700 Subject: [PATCH 007/135] mm/vmstat.c: cache align vm_stat commit a1cb2c60ddc98ff4e5246f410558805401ceee67 upstream. Stable note: Not tracked on Bugzilla. This patch is known to make a big difference to tmpfs performance on larger machines. This was found to adversely affect tmpfs I/O performance. Tests run on a 640 cpu UV system. With 120 threads doing parallel writes, each to different tmpfs mounts: No patch: ~300 MB/sec With vm_stat alignment: ~430 MB/sec Signed-off-by: Dimitri Sivanich Acked-by: Christoph Lameter Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman --- mm/vmstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c18b7694b2..6559013c5a16 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu) * * vm_stat contains the global counters */ -atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; +atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP From 1126e70953638f9516b6a0b96385799c708815e4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 17 Oct 2011 16:38:20 +0200 Subject: [PATCH 008/135] mm: memory hotplug: Check if pages are correctly reserved on a per-section basis commit 2bbcb8788311a40714b585fc11b51da6ffa2ab92 upstream. Stable note: Fixes https://bugzilla.novell.com/show_bug.cgi?id=721039 . Without the patch, memory hot-add can fail for kernel configurations that do not set CONFIG_SPARSEMEM_VMEMMAP. (Resending as I am not seeing it in -next so maybe it got lost) mm: memory hotplug: Check if pages are correctly reserved on a per-section basis It is expected that memory being brought online is PageReserved similar to what happens when the page allocator is being brought up. Memory is onlined in "memory blocks" which consist of one or more sections. Unfortunately, the code that verifies PageReserved is currently assuming that the memmap backing all these pages is virtually contiguous which is only the case when CONFIG_SPARSEMEM_VMEMMAP is set. As a result, memory hot-add is failing on those configurations with the message; kernel: section number XXX page number 256 not reserved, was it already online? This patch updates the PageReserved check to lookup struct page once per section to guarantee the correct struct page is being checked. [Check pages within sections properly: rientjes@google.com] [original patch by: nfont@linux.vnet.ibm.com] Signed-off-by: Mel Gorman Acked-by: KAMEZAWA Hiroyuki Tested-by: Nathan Fontenot Signed-off-by: Greg Kroah-Hartman --- drivers/base/memory.c | 58 +++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 45d7c8fc73bd..5fb6aaed2adc 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -223,6 +223,42 @@ int memory_isolate_notify(unsigned long val, void *v) return atomic_notifier_call_chain(&memory_isolate_chain, val, v); } +/* + * The probe routines leave the pages reserved, just as the bootmem code does. + * Make sure they're still that way. + */ +static bool pages_correctly_reserved(unsigned long start_pfn, + unsigned long nr_pages) +{ + int i, j; + struct page *page; + unsigned long pfn = start_pfn; + + /* + * memmap between sections is not contiguous except with + * SPARSEMEM_VMEMMAP. We lookup the page once per section + * and assume memmap is contiguous within each section + */ + for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) { + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return false; + page = pfn_to_page(pfn); + + for (j = 0; j < PAGES_PER_SECTION; j++) { + if (PageReserved(page + j)) + continue; + + printk(KERN_WARNING "section number %ld page number %d " + "not reserved, was it already online?\n", + pfn_to_section_nr(pfn), j); + + return false; + } + } + + return true; +} + /* * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is * OK to have direct references to sparsemem variables in here. @@ -230,7 +266,6 @@ int memory_isolate_notify(unsigned long val, void *v) static int memory_block_action(unsigned long phys_index, unsigned long action) { - int i; unsigned long start_pfn, start_paddr; unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; struct page *first_page; @@ -238,26 +273,13 @@ memory_block_action(unsigned long phys_index, unsigned long action) first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT); - /* - * The probe routines leave the pages reserved, just - * as the bootmem code does. Make sure they're still - * that way. - */ - if (action == MEM_ONLINE) { - for (i = 0; i < nr_pages; i++) { - if (PageReserved(first_page+i)) - continue; - - printk(KERN_WARNING "section number %ld page number %d " - "not reserved, was it already online?\n", - phys_index, i); - return -EBUSY; - } - } - switch (action) { case MEM_ONLINE: start_pfn = page_to_pfn(first_page); + + if (!pages_correctly_reserved(start_pfn, nr_pages)) + return -EBUSY; + ret = online_pages(start_pfn, nr_pages); break; case MEM_OFFLINE: From 71a07f4cf29615d30369760c022972d4875758b3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Jan 2012 15:07:14 -0800 Subject: [PATCH 009/135] mm: reduce the amount of work done when updating min_free_kbytes commit 938929f14cb595f43cd1a4e63e22d36cab1e4a1f upstream. Stable note: Fixes https://bugzilla.novell.com/show_bug.cgi?id=726210 . Large machines with 1TB or more of RAM take a long time to boot without this patch and may spew out soft lockup warnings. When min_free_kbytes is updated, some pageblocks are marked MIGRATE_RESERVE. Ordinarily, this work is unnoticable as it happens early in boot but on large machines with 1TB of memory, this has been reported to delay boot times, probably due to the NUMA distances involved. The bulk of the work is due to calling calling pageblock_is_reserved() an unnecessary amount of times and accessing far more struct page metadata than is necessary. This patch significantly reduces the amount of work done by setup_zone_migrate_reserve() improving boot times on 1TB machines. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 947a7e96f91d..70c049dc4ee1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3418,25 +3418,33 @@ static void setup_zone_migrate_reserve(struct zone *zone) if (page_to_nid(page) != zone_to_nid(zone)) continue; - /* Blocks with reserved pages will never free, skip them. */ - block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); - if (pageblock_is_reserved(pfn, block_end_pfn)) - continue; - block_migratetype = get_pageblock_migratetype(page); - /* If this block is reserved, account for it */ - if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { - reserve--; - continue; - } + /* Only test what is necessary when the reserves are not met */ + if (reserve > 0) { + /* + * Blocks with reserved pages will never free, skip + * them. + */ + block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); + if (pageblock_is_reserved(pfn, block_end_pfn)) + continue; - /* Suitable for reserving if this block is movable */ - if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { - set_pageblock_migratetype(page, MIGRATE_RESERVE); - move_freepages_block(zone, page, MIGRATE_RESERVE); - reserve--; - continue; + /* If this block is reserved, account for it */ + if (block_migratetype == MIGRATE_RESERVE) { + reserve--; + continue; + } + + /* Suitable for reserving if this block is movable */ + if (block_migratetype == MIGRATE_MOVABLE) { + set_pageblock_migratetype(page, + MIGRATE_RESERVE); + move_freepages_block(zone, page, + MIGRATE_RESERVE); + reserve--; + continue; + } } /* From 33c17eafdeefb08fbb6ded946abcf024f76c9615 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 14 Sep 2011 16:21:52 -0700 Subject: [PATCH 010/135] mm: vmscan: fix force-scanning small targets without swap commit a4d3e9e76337059406fcf3ead288c0df22a790e9 upstream. Stable note: Not tracked in Bugzilla. This patch augments an earlier commit that avoids scanning priority being artificially raised. The older fix was particularly important for small memcgs to avoid calling wait_iff_congested() unnecessarily. Without swap, anonymous pages are not scanned. As such, they should not count when considering force-scanning a small target if there is no swap. Otherwise, targets are not force-scanned even when their effective scan number is zero and the other conditions--kswapd/memcg--apply. This fixes 246e87a93934 ("memcg: fix get_scan_count() for small targets"). [akpm@linux-foundation.org: fix comment] Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Cc: Ying Han Cc: Balbir Singh Cc: KOSAKI Motohiro Cc: Daisuke Nishimura Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 130fa32441c0..347bb4478f39 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1747,23 +1747,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, u64 fraction[2], denominator; enum lru_list l; int noswap = 0; - int force_scan = 0; + bool force_scan = false; unsigned long nr_force_scan[2]; - - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); - - if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { - /* kswapd does zone balancing and need to scan this zone */ - if (scanning_global_lru(sc) && current_is_kswapd()) - force_scan = 1; - /* memcg may have small limit and need to avoid priority drop */ - if (!scanning_global_lru(sc)) - force_scan = 1; - } + /* kswapd does zone balancing and needs to scan this zone */ + if (scanning_global_lru(sc) && current_is_kswapd()) + force_scan = true; + /* memcg may have small limit and need to avoid priority drop */ + if (!scanning_global_lru(sc)) + force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || (nr_swap_pages <= 0)) { @@ -1776,6 +1768,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, goto out; } + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (scanning_global_lru(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, From 564ea9dd5ab042cb2fe8373f4d627073706e1d4f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 25 Aug 2011 15:59:12 -0700 Subject: [PATCH 011/135] vmscan: clear ZONE_CONGESTED for zone with good watermark commit 439423f6894aa0dec22187526827456f5004baed upstream. Stable note: Not tracked in Bugzilla. kswapd is responsible for clearing ZONE_CONGESTED after it balances a zone and this patch fixes a bug where that was failing to happen. Without this patch, processes can stall in wait_iff_congested unnecessarily. For users, this can look like an interactivity stall but some workloads would see it as sudden drop in throughput. ZONE_CONGESTED is only cleared in kswapd, but pages can be freed in any task. It's possible ZONE_CONGESTED isn't cleared in some cases: 1. the zone is already balanced just entering balance_pgdat() for order-0 because concurrent tasks free memory. In this case, later check will skip the zone as it's balanced so the flag isn't cleared. 2. high order balance fallbacks to order-0. quote from Mel: At the end of balance_pgdat(), kswapd uses the following logic; If reclaiming at high order { for each zone { if all_unreclaimable skip if watermark is not met order = 0 loop again /* watermark is met */ clear congested } } i.e. it clears ZONE_CONGESTED if it the zone is balanced. if not, it restarts balancing at order-0. However, if the higher zones are balanced for order-0, kswapd will miss clearing ZONE_CONGESTED as that only happens after a zone is shrunk. This can mean that wait_iff_congested() stalls unnecessarily. This patch makes kswapd clear ZONE_CONGESTED during its initial highmem->dma scan for zones that are already balanced. Signed-off-by: Shaohua Li Acked-by: Mel Gorman Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 347bb4478f39..6b0f8a60ca68 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2456,6 +2456,9 @@ loop_again: high_wmark_pages(zone), 0, 0)) { end_zone = i; break; + } else { + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); } } if (i < 0) From 5e5b3d2ed3aee6f8bbe38c0945876aacce11ff03 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:34 +1000 Subject: [PATCH 012/135] vmscan: add shrink_slab tracepoints commit 095760730c1047c69159ce88021a7fa3833502c8 upstream. Stable note: This patch makes later patches easier to apply but otherwise has little to justify it. It is a diagnostic patch that was part of a series addressing excessive slab shrinking after GFP_NOFS failures. There is detailed information on the series' motivation at https://lkml.org/lkml/2011/6/2/42 . It is impossible to understand what the shrinkers are actually doing without instrumenting the code, so add a some tracepoints to allow insight to be gained. Signed-off-by: Dave Chinner Signed-off-by: Al Viro Signed-off-by: Mel Gorman --- include/trace/events/vmscan.h | 77 +++++++++++++++++++++++++++++++++++ mm/vmscan.c | 8 +++- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index b2c33bd955fa..36851f7f13da 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -179,6 +179,83 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re TP_ARGS(nr_reclaimed) ); +TRACE_EVENT(mm_shrink_slab_start, + TP_PROTO(struct shrinker *shr, struct shrink_control *sc, + long nr_objects_to_shrink, unsigned long pgs_scanned, + unsigned long lru_pgs, unsigned long cache_items, + unsigned long long delta, unsigned long total_scan), + + TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs, + cache_items, delta, total_scan), + + TP_STRUCT__entry( + __field(struct shrinker *, shr) + __field(void *, shrink) + __field(long, nr_objects_to_shrink) + __field(gfp_t, gfp_flags) + __field(unsigned long, pgs_scanned) + __field(unsigned long, lru_pgs) + __field(unsigned long, cache_items) + __field(unsigned long long, delta) + __field(unsigned long, total_scan) + ), + + TP_fast_assign( + __entry->shr = shr; + __entry->shrink = shr->shrink; + __entry->nr_objects_to_shrink = nr_objects_to_shrink; + __entry->gfp_flags = sc->gfp_mask; + __entry->pgs_scanned = pgs_scanned; + __entry->lru_pgs = lru_pgs; + __entry->cache_items = cache_items; + __entry->delta = delta; + __entry->total_scan = total_scan; + ), + + TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", + __entry->shrink, + __entry->shr, + __entry->nr_objects_to_shrink, + show_gfp_flags(__entry->gfp_flags), + __entry->pgs_scanned, + __entry->lru_pgs, + __entry->cache_items, + __entry->delta, + __entry->total_scan) +); + +TRACE_EVENT(mm_shrink_slab_end, + TP_PROTO(struct shrinker *shr, int shrinker_retval, + long unused_scan_cnt, long new_scan_cnt), + + TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt), + + TP_STRUCT__entry( + __field(struct shrinker *, shr) + __field(void *, shrink) + __field(long, unused_scan) + __field(long, new_scan) + __field(int, retval) + __field(long, total_scan) + ), + + TP_fast_assign( + __entry->shr = shr; + __entry->shrink = shr->shrink; + __entry->unused_scan = unused_scan_cnt; + __entry->new_scan = new_scan_cnt; + __entry->retval = shrinker_retval; + __entry->total_scan = new_scan_cnt - unused_scan_cnt; + ), + + TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + __entry->shrink, + __entry->shr, + __entry->unused_scan, + __entry->new_scan, + __entry->total_scan, + __entry->retval) +); DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, diff --git a/mm/vmscan.c b/mm/vmscan.c index 6b0f8a60ca68..abc798167cf2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -250,6 +250,7 @@ unsigned long shrink_slab(struct shrink_control *shrink, unsigned long long delta; unsigned long total_scan; unsigned long max_pass; + int shrink_ret = 0; max_pass = do_shrinker_shrink(shrinker, shrink, 0); delta = (4 * nr_pages_scanned) / shrinker->seeks; @@ -274,9 +275,12 @@ unsigned long shrink_slab(struct shrink_control *shrink, total_scan = shrinker->nr; shrinker->nr = 0; + trace_mm_shrink_slab_start(shrinker, shrink, total_scan, + nr_pages_scanned, lru_pages, + max_pass, delta, total_scan); + while (total_scan >= SHRINK_BATCH) { long this_scan = SHRINK_BATCH; - int shrink_ret; int nr_before; nr_before = do_shrinker_shrink(shrinker, shrink, 0); @@ -293,6 +297,8 @@ unsigned long shrink_slab(struct shrink_control *shrink, } shrinker->nr += total_scan; + trace_mm_shrink_slab_end(shrinker, shrink_ret, total_scan, + shrinker->nr); } up_read(&shrinker_rwsem); out: From 6a5091a09f9278f8f821e3f33ac748633d143cea Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:35 +1000 Subject: [PATCH 013/135] vmscan: shrinker->nr updates race and go wrong commit acf92b485cccf028177f46918e045c0c4e80ee10 upstream. Stable note: Not tracked in Bugzilla. This patch reduces excessive reclaim of slab objects reducing the amount of information that has to be brought back in from disk. shrink_slab() allows shrinkers to be called in parallel so the struct shrinker can be updated concurrently. It does not provide any exclusio for such updates, so we can get the shrinker->nr value increasing or decreasing incorrectly. As a result, when a shrinker repeatedly returns a value of -1 (e.g. a VFS shrinker called w/ GFP_NOFS), the shrinker->nr goes haywire, sometimes updating with the scan count that wasn't used, sometimes losing it altogether. Worse is when a shrinker does work and that update is lost due to racy updates, which means the shrinker will do the work again! Fix this by making the total_scan calculations independent of shrinker->nr, and making the shrinker->nr updates atomic w.r.t. to other updates via cmpxchg loops. Signed-off-by: Dave Chinner Signed-off-by: Al Viro Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index abc798167cf2..bb7df1e2382d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -251,17 +251,29 @@ unsigned long shrink_slab(struct shrink_control *shrink, unsigned long total_scan; unsigned long max_pass; int shrink_ret = 0; + long nr; + long new_nr; + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + do { + nr = shrinker->nr; + } while (cmpxchg(&shrinker->nr, nr, 0) != nr); + + total_scan = nr; max_pass = do_shrinker_shrink(shrinker, shrink, 0); delta = (4 * nr_pages_scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); - shrinker->nr += delta; - if (shrinker->nr < 0) { + total_scan += delta; + if (total_scan < 0) { printk(KERN_ERR "shrink_slab: %pF negative objects to " "delete nr=%ld\n", - shrinker->shrink, shrinker->nr); - shrinker->nr = max_pass; + shrinker->shrink, total_scan); + total_scan = max_pass; } /* @@ -269,13 +281,10 @@ unsigned long shrink_slab(struct shrink_control *shrink, * never try to free more than twice the estimate number of * freeable entries. */ - if (shrinker->nr > max_pass * 2) - shrinker->nr = max_pass * 2; + if (total_scan > max_pass * 2) + total_scan = max_pass * 2; - total_scan = shrinker->nr; - shrinker->nr = 0; - - trace_mm_shrink_slab_start(shrinker, shrink, total_scan, + trace_mm_shrink_slab_start(shrinker, shrink, nr, nr_pages_scanned, lru_pages, max_pass, delta, total_scan); @@ -296,9 +305,19 @@ unsigned long shrink_slab(struct shrink_control *shrink, cond_resched(); } - shrinker->nr += total_scan; - trace_mm_shrink_slab_end(shrinker, shrink_ret, total_scan, - shrinker->nr); + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. + */ + do { + nr = shrinker->nr; + new_nr = total_scan + nr; + if (total_scan <= 0) + break; + } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); + + trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } up_read(&shrinker_rwsem); out: From 7554e3446a916363447a81a29f9300d3f2fbf503 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:36 +1000 Subject: [PATCH 014/135] vmscan: reduce wind up shrinker->nr when shrinker can't do work commit 3567b59aa80ac4417002bf58e35dce5c777d4164 upstream. Stable note: Not tracked in Bugzilla. This patch reduces excessive reclaim of slab objects reducing the amount of information that has to be brought back in from disk. The third and fourth paragram in the series describes the impact. When a shrinker returns -1 to shrink_slab() to indicate it cannot do any work given the current memory reclaim requirements, it adds the entire total_scan count to shrinker->nr. The idea ehind this is that whenteh shrinker is next called and can do work, it will do the work of the previously aborted shrinker call as well. However, if a filesystem is doing lots of allocation with GFP_NOFS set, then we get many, many more aborts from the shrinkers than we do successful calls. The result is that shrinker->nr winds up to it's maximum permissible value (twice the current cache size) and then when the next shrinker call that can do work is issued, it has enough scan count built up to free the entire cache twice over. This manifests itself in the cache going from full to empty in a matter of seconds, even when only a small part of the cache is needed to be emptied to free sufficient memory. Under metadata intensive workloads on ext4 and XFS, I'm seeing the VFS caches increase memory consumption up to 75% of memory (no page cache pressure) over a period of 30-60s, and then the shrinker empties them down to zero in the space of 2-3s. This cycle repeats over and over again, with the shrinker completely trashing the inode and dentry caches every minute or so the workload continues. This behaviour was made obvious by the shrink_slab tracepoints added earlier in the series, and made worse by the patch that corrected the concurrent accounting of shrinker->nr. To avoid this problem, stop repeated small increments of the total scan value from winding shrinker->nr up to a value that can cause the entire cache to be freed. We still need to allow it to wind up, so use the delta as the "large scan" threshold check - if the delta is more than a quarter of the entire cache size, then it is a large scan and allowed to cause lots of windup because we are clearly needing to free lots of memory. If it isn't a large scan then limit the total scan to half the size of the cache so that windup never increases to consume the whole cache. Reducing the total scan limit further does not allow enough wind-up to maintain the current levels of performance, whilst a higher threshold does not prevent the windup from freeing the entire cache under sustained workloads. Signed-off-by: Dave Chinner Signed-off-by: Al Viro Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index bb7df1e2382d..d375de3111ba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -276,6 +276,21 @@ unsigned long shrink_slab(struct shrink_control *shrink, total_scan = max_pass; } + /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * max_pass. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < max_pass / 4) + total_scan = min(total_scan, max_pass / 2); + /* * Avoid risking looping forever due to too large nr value: * never try to free more than twice the estimate number of From 4d4724067d512e7f17010112da8ec64917c192e7 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 31 Oct 2011 17:09:31 -0700 Subject: [PATCH 015/135] vmscan: limit direct reclaim for higher order allocations commit e0887c19b2daa140f20ca8104bdc5740f39dbb86 upstream. Stable note: Not tracked on Bugzilla. THP and compaction was found to aggressively reclaim pages and stall systems under different situations that was addressed piecemeal over time. Paragraph 3 of this changelog is the motivation for this patch. When suffering from memory fragmentation due to unfreeable pages, THP page faults will repeatedly try to compact memory. Due to the unfreeable pages, compaction fails. Needless to say, at that point page reclaim also fails to create free contiguous 2MB areas. However, that doesn't stop the current code from trying, over and over again, and freeing a minimum of 4MB (2UL << sc->order pages) at every single invocation. This resulted in my 12GB system having 2-3GB free memory, a corresponding amount of used swap and very sluggish response times. This can be avoided by having the direct reclaim code not reclaim from zones that already have plenty of free memory available for compaction. If compaction still fails due to unmovable memory, doing additional reclaim will only hurt the system, not help. [jweiner@redhat.com: change comment to explain the order check] Signed-off-by: Rik van Riel Acked-by: Johannes Weiner Acked-by: Mel Gorman Cc: Andrea Arcangeli Reviewed-by: Minchan Kim Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index d375de3111ba..e1ae88b0b44e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2059,6 +2059,22 @@ static void shrink_zones(int priority, struct zonelist *zonelist, continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ + if (COMPACTION_BUILD) { + /* + * If we already have plenty of memory + * free for compaction, don't free any + * more. Even though compaction is + * invoked for any non-zero order, + * only frequent costly order + * reclamation is disruptive enough to + * become a noticable problem, like + * transparent huge page allocations. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER && + (compaction_suitable(zone, sc->order) || + compaction_deferred(zone))) + continue; + } /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and From 4682e89d1455d66e2536d9efb2875d61a1f1f294 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 31 Oct 2011 17:09:33 -0700 Subject: [PATCH 016/135] vmscan: abort reclaim/compaction if compaction can proceed commit e0c23279c9f800c403f37511484d9014ac83adec upstream. Stable note: Not tracked on Bugzilla. THP and compaction was found to aggressively reclaim pages and stall systems under different situations that was addressed piecemeal over time. If compaction can proceed, shrink_zones() stops doing any work but its callers still call shrink_slab() which raises the priority and potentially sleeps. This is unnecessary and wasteful so this patch aborts direct reclaim/compaction entirely if compaction can proceed. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Minchan Kim Acked-by: Johannes Weiner Cc: Josh Boyer Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index e1ae88b0b44e..b146b427cda1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2037,14 +2037,19 @@ restart: * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. + * + * This function returns true if a zone is being reclaimed for a costly + * high-order allocation and compaction is either ready to begin or deferred. + * This indicates to the caller that it should retry the allocation or fail. */ -static void shrink_zones(int priority, struct zonelist *zonelist, +static bool shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + bool should_abort_reclaim = false; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2061,19 +2066,20 @@ static void shrink_zones(int priority, struct zonelist *zonelist, continue; /* Let kswapd poll it */ if (COMPACTION_BUILD) { /* - * If we already have plenty of memory - * free for compaction, don't free any - * more. Even though compaction is - * invoked for any non-zero order, - * only frequent costly order - * reclamation is disruptive enough to - * become a noticable problem, like - * transparent huge page allocations. + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticable problem, like transparent huge page + * allocations. */ if (sc->order > PAGE_ALLOC_COSTLY_ORDER && (compaction_suitable(zone, sc->order) || - compaction_deferred(zone))) + compaction_deferred(zone))) { + should_abort_reclaim = true; continue; + } } /* * This steals pages from memory cgroups over softlimit @@ -2092,6 +2098,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, shrink_zone(priority, zone, sc); } + + return should_abort_reclaim; } static bool zone_reclaimable(struct zone *zone) @@ -2156,7 +2164,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(sc->mem_cgroup); - shrink_zones(priority, zonelist, sc); + if (shrink_zones(priority, zonelist, sc)) + break; + /* * Don't shrink slabs when reclaiming memory from * over limit cgroups From f665a680f89357a6773fb97684690c76933888f6 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 31 Oct 2011 17:06:44 -0700 Subject: [PATCH 017/135] mm: compaction: trivial clean up in acct_isolated() commit b9e84ac1536d35aee03b2601f19694949f0bd506 upstream. Stable note: Not tracked in Bugzilla. This patch makes later patches easier to apply but has no other impact. acct_isolated of compaction uses page_lru_base_type which returns only base type of LRU list so it never returns LRU_ACTIVE_ANON or LRU_ACTIVE_FILE. In addtion, cc->nr_[anon|file] is used in only acct_isolated so it doesn't have fields in conpact_control. This patch removes fields from compact_control and makes clear function of acct_issolated which counts the number of anon|file pages isolated. Signed-off-by: Minchan Kim Acked-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Michal Hocko Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/compaction.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index adc5336ad844..d6ba0377e260 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -35,10 +35,6 @@ struct compact_control { unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ - /* Account for isolated anon and file pages */ - unsigned long nr_anon; - unsigned long nr_file; - unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; @@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone, static void acct_isolated(struct zone *zone, struct compact_control *cc) { struct page *page; - unsigned int count[NR_LRU_LISTS] = { 0, }; + unsigned int count[2] = { 0, }; - list_for_each_entry(page, &cc->migratepages, lru) { - int lru = page_lru_base_type(page); - count[lru]++; - } + list_for_each_entry(page, &cc->migratepages, lru) + count[!!page_is_file_cache(page)]++; - cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; - cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file); + __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); } /* Similar to reclaim, but different enough that they don't share logic */ From a15a3971cc49eefbde40b397a446c0fa9c5fed9c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 31 Oct 2011 17:06:47 -0700 Subject: [PATCH 018/135] mm: change isolate mode from #define to bitwise type commit 4356f21d09283dc6d39a6f7287a65ddab61e2808 upstream. Stable note: Not tracked in Bugzilla. This patch makes later patches easier to apply but has no other impact. Change ISOLATE_XXX macro with bitwise isolate_mode_t type. Normally, macro isn't recommended as it's type-unsafe and making debugging harder as symbol cannot be passed throught to the debugger. Quote from Johannes " Hmm, it would probably be cleaner to fully convert the isolation mode into independent flags. INACTIVE, ACTIVE, BOTH is currently a tri-state among flags, which is a bit ugly." This patch moves isolate mode from swap.h to mmzone.h by memcontrol.h Signed-off-by: Minchan Kim Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Rik van Riel Cc: Michal Hocko Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- .../postprocess/trace-vmscan-postprocess.pl | 8 ++-- include/linux/memcontrol.h | 3 +- include/linux/mmzone.h | 8 ++++ include/linux/swap.h | 7 +--- include/trace/events/vmscan.h | 8 ++-- mm/compaction.c | 3 +- mm/memcontrol.c | 3 +- mm/vmscan.c | 37 ++++++++++--------- 8 files changed, 43 insertions(+), 34 deletions(-) diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 12cecc83cd91..4a37c4759cd2 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -379,10 +379,10 @@ EVENT_PROCESS: # To closer match vmstat scanning statistics, only count isolate_both # and isolate_inactive as scanning. isolate_active is rotation - # isolate_inactive == 0 - # isolate_active == 1 - # isolate_both == 2 - if ($isolate_mode != 1) { + # isolate_inactive == 1 + # isolate_active == 2 + # isolate_both == 3 + if ($isolate_mode != 2) { $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned; } $perprocesspid{$process_pid}->{HIGH_NR_CONTIG_DIRTY} += $nr_contig_dirty; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 313a00eca40e..4a8da84487b7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,7 +35,8 @@ enum mem_cgroup_page_stat_item { extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, - int mode, struct zone *z, + isolate_mode_t mode, + struct zone *z, struct mem_cgroup *mem_cont, int active, int file); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index aa2d80b61d47..047ea6521cc7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -158,6 +158,14 @@ static inline int is_unevictable_lru(enum lru_list l) return (l == LRU_UNEVICTABLE); } +/* Isolate inactive pages */ +#define ISOLATE_INACTIVE ((__force isolate_mode_t)0x1) +/* Isolate active pages */ +#define ISOLATE_ACTIVE ((__force isolate_mode_t)0x2) + +/* LRU Isolation modes. */ +typedef unsigned __bitwise__ isolate_mode_t; + enum zone_watermarks { WMARK_MIN, WMARK_LOW, diff --git a/include/linux/swap.h b/include/linux/swap.h index a273468f8285..e73799d3b1c5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -243,11 +243,6 @@ static inline void lru_cache_add_file(struct page *page) __lru_cache_add(page, LRU_INACTIVE_FILE); } -/* LRU Isolation modes. */ -#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */ -#define ISOLATE_ACTIVE 1 /* Isolate active pages. */ -#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */ - /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); @@ -259,7 +254,7 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, unsigned int swappiness, struct zone *zone, unsigned long *nr_scanned); -extern int __isolate_lru_page(struct page *page, int mode, int file); +extern int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 36851f7f13da..edc4b3d25a2d 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -266,7 +266,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, unsigned long nr_lumpy_taken, unsigned long nr_lumpy_dirty, unsigned long nr_lumpy_failed, - int isolate_mode), + isolate_mode_t isolate_mode), TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode), @@ -278,7 +278,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template, __field(unsigned long, nr_lumpy_taken) __field(unsigned long, nr_lumpy_dirty) __field(unsigned long, nr_lumpy_failed) - __field(int, isolate_mode) + __field(isolate_mode_t, isolate_mode) ), TP_fast_assign( @@ -312,7 +312,7 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate, unsigned long nr_lumpy_taken, unsigned long nr_lumpy_dirty, unsigned long nr_lumpy_failed, - int isolate_mode), + isolate_mode_t isolate_mode), TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode) @@ -327,7 +327,7 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate, unsigned long nr_lumpy_taken, unsigned long nr_lumpy_dirty, unsigned long nr_lumpy_failed, - int isolate_mode), + isolate_mode_t isolate_mode), TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode) diff --git a/mm/compaction.c b/mm/compaction.c index d6ba0377e260..26521a12f229 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -371,7 +371,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } /* Try isolate the page */ - if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) + if (__isolate_lru_page(page, + ISOLATE_ACTIVE|ISOLATE_INACTIVE, 0) != 0) continue; VM_BUG_ON(PageTransCompound(page)); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ffb99b4e7527..57cdf5ad6924 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1251,7 +1251,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, - int mode, struct zone *z, + isolate_mode_t mode, + struct zone *z, struct mem_cgroup *mem_cont, int active, int file) { diff --git a/mm/vmscan.c b/mm/vmscan.c index b146b427cda1..9267ba1b6641 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1012,23 +1012,27 @@ keep_lumpy: * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, int mode, int file) +int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) { + bool all_lru_mode; int ret = -EINVAL; /* Only take pages on the LRU. */ if (!PageLRU(page)) return ret; + all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == + (ISOLATE_ACTIVE|ISOLATE_INACTIVE); + /* * When checking the active state, we need to be sure we are * dealing with comparible boolean values. Take the logical not * of each. */ - if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) + if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) return ret; - if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) + if (!all_lru_mode && !!page_is_file_cache(page) != file) return ret; /* @@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page, int mode, int file) */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, int mode, int file) + unsigned long *scanned, int order, isolate_mode_t mode, + int file) { unsigned long nr_taken = 0; unsigned long nr_lumpy_taken = 0; @@ -1201,8 +1206,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, static unsigned long isolate_pages_global(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, - int mode, struct zone *z, - int active, int file) + isolate_mode_t mode, + struct zone *z, int active, int file) { int lru = LRU_BASE; if (active) @@ -1448,6 +1453,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_taken; unsigned long nr_anon; unsigned long nr_file; + isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1458,15 +1464,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } set_reclaim_mode(priority, sc, false); + if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) + reclaim_mode |= ISOLATE_ACTIVE; + lru_add_drain(); spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(nr_to_scan, - &page_list, &nr_scanned, sc->order, - sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? - ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, 0, file); + nr_taken = isolate_pages_global(nr_to_scan, &page_list, + &nr_scanned, sc->order, reclaim_mode, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, @@ -1475,12 +1481,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); } else { - nr_taken = mem_cgroup_isolate_pages(nr_to_scan, - &page_list, &nr_scanned, sc->order, - sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? - ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, sc->mem_cgroup, - 0, file); + nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, + &nr_scanned, sc->order, reclaim_mode, zone, + sc->mem_cgroup, 0, file); /* * mem_cgroup_isolate_pages() keeps track of * scanned pages on its own. From 19faec0520b3b16dfd58cde30938a3c4d3dcdd5b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 31 Oct 2011 17:06:51 -0700 Subject: [PATCH 019/135] mm: compaction: make isolate_lru_page() filter-aware commit 39deaf8585152f1a35c1676d3d7dc6ae0fb65967 upstream. Stable note: Not tracked in Bugzilla. THP and compaction disrupt the LRU list leading to poor reclaim decisions which has a variable performance impact. In async mode, compaction doesn't migrate dirty or writeback pages. So, it's meaningless to pick the page and re-add it to lru list. Of course, when we isolate the page in compaction, the page might be dirty or writeback but when we try to migrate the page, the page would be not dirty, writeback. So it could be migrated. But it's very unlikely as isolate and migration cycle is much faster than writeout. So, this patch helps cpu overhead and prevent unnecessary LRU churning. Signed-off-by: Minchan Kim Acked-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Michal Hocko Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- include/linux/mmzone.h | 2 ++ mm/compaction.c | 7 +++++-- mm/vmscan.c | 3 +++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 047ea6521cc7..0ed9149a2725 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -162,6 +162,8 @@ static inline int is_unevictable_lru(enum lru_list l) #define ISOLATE_INACTIVE ((__force isolate_mode_t)0x1) /* Isolate active pages */ #define ISOLATE_ACTIVE ((__force isolate_mode_t)0x2) +/* Isolate clean file */ +#define ISOLATE_CLEAN ((__force isolate_mode_t)0x4) /* LRU Isolation modes. */ typedef unsigned __bitwise__ isolate_mode_t; diff --git a/mm/compaction.c b/mm/compaction.c index 26521a12f229..b30c40f860c7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -261,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; + isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; /* Do not scan outside zone boundaries */ low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); @@ -370,9 +371,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; } + if (!cc->sync) + mode |= ISOLATE_CLEAN; + /* Try isolate the page */ - if (__isolate_lru_page(page, - ISOLATE_ACTIVE|ISOLATE_INACTIVE, 0) != 0) + if (__isolate_lru_page(page, mode, 0) != 0) continue; VM_BUG_ON(PageTransCompound(page)); diff --git a/mm/vmscan.c b/mm/vmscan.c index 9267ba1b6641..68357354db1b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1045,6 +1045,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) ret = -EBUSY; + if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) + return ret; + if (likely(get_page_unless_zero(page))) { /* * Be careful not to clear PageLRU until after we're From 5e02dde6aee7c4492b3a62ad93e7f1120877a019 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 31 Oct 2011 17:06:55 -0700 Subject: [PATCH 020/135] mm: zone_reclaim: make isolate_lru_page() filter-aware commit f80c0673610e36ae29d63e3297175e22f70dde5f upstream. Stable note: Not tracked in Bugzilla. THP and compaction disrupt the LRU list leading to poor reclaim decisions which has a variable performance impact. In __zone_reclaim case, we don't want to shrink mapped page. Nonetheless, we have isolated mapped page and re-add it into LRU's head. It's unnecessary CPU overhead and makes LRU churning. Of course, when we isolate the page, the page might be mapped but when we try to migrate the page, the page would be not mapped. So it could be migrated. But race is rare and although it happens, it's no big deal. Signed-off-by: Minchan Kim Acked-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: KOSAKI Motohiro Reviewed-by: Michal Hocko Cc: Mel Gorman Cc: Rik van Riel Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- include/linux/mmzone.h | 2 ++ mm/vmscan.c | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0ed9149a2725..80acfbc9aa14 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -164,6 +164,8 @@ static inline int is_unevictable_lru(enum lru_list l) #define ISOLATE_ACTIVE ((__force isolate_mode_t)0x2) /* Isolate clean file */ #define ISOLATE_CLEAN ((__force isolate_mode_t)0x4) +/* Isolate unmapped file */ +#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x8) /* LRU Isolation modes. */ typedef unsigned __bitwise__ isolate_mode_t; diff --git a/mm/vmscan.c b/mm/vmscan.c index 68357354db1b..0c78bd3bd964 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1048,6 +1048,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) return ret; + if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) + return ret; + if (likely(get_page_unless_zero(page))) { /* * Be careful not to clear PageLRU until after we're @@ -1471,6 +1474,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, reclaim_mode |= ISOLATE_ACTIVE; lru_add_drain(); + + if (!sc->may_unmap) + reclaim_mode |= ISOLATE_UNMAPPED; + if (!sc->may_writepage) + reclaim_mode |= ISOLATE_CLEAN; + spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { @@ -1588,19 +1597,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct page *page; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); unsigned long nr_rotated = 0; + isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; lru_add_drain(); + + if (!sc->may_unmap) + reclaim_mode |= ISOLATE_UNMAPPED; + if (!sc->may_writepage) + reclaim_mode |= ISOLATE_CLEAN; + spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_pages, &l_hold, &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, + reclaim_mode, zone, 1, file); zone->pages_scanned += pgscanned; } else { nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, + reclaim_mode, zone, sc->mem_cgroup, 1, file); /* * mem_cgroup_isolate_pages() keeps track of From 331fae62e66ac4209f23df0df66999932e513fff Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 31 Oct 2011 17:06:57 -0700 Subject: [PATCH 021/135] mm: migration: clean up unmap_and_move() commit 0dabec93de633a87adfbbe1d800a4c56cd19d73b upstream. Stable note: Not tracked in Bugzilla. This patch makes later patches easier to apply but has no other impact. unmap_and_move() is one a big messy function. Clean it up. Signed-off-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Rik van Riel Cc: Michal Hocko Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/migrate.c | 75 ++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 14d0a6a632f6..33358f878111 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -621,38 +621,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, return rc; } -/* - * Obtain the lock on page, remove all ptes and migrate the page - * to the newly allocated page in newpage. - */ -static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, bool offlining, bool sync) +static int __unmap_and_move(struct page *page, struct page *newpage, + int force, bool offlining, bool sync) { - int rc = 0; - int *result = NULL; - struct page *newpage = get_new_page(page, private, &result); + int rc = -EAGAIN; int remap_swapcache = 1; int charge = 0; struct mem_cgroup *mem; struct anon_vma *anon_vma = NULL; - if (!newpage) - return -ENOMEM; - - if (page_count(page) == 1) { - /* page was freed from under us. So we are done. */ - goto move_newpage; - } - if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page(page))) - goto move_newpage; - - /* prepare cgroup just returns 0 or -ENOMEM */ - rc = -EAGAIN; - if (!trylock_page(page)) { if (!force || !sync) - goto move_newpage; + goto out; /* * It's not safe for direct compaction to call lock_page. @@ -668,7 +648,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * altogether. */ if (current->flags & PF_MEMALLOC) - goto move_newpage; + goto out; lock_page(page); } @@ -785,27 +765,52 @@ uncharge: mem_cgroup_end_migration(mem, page, newpage, rc == 0); unlock: unlock_page(page); +out: + return rc; +} -move_newpage: +/* + * Obtain the lock on page, remove all ptes and migrate the page + * to the newly allocated page in newpage. + */ +static int unmap_and_move(new_page_t get_new_page, unsigned long private, + struct page *page, int force, bool offlining, bool sync) +{ + int rc = 0; + int *result = NULL; + struct page *newpage = get_new_page(page, private, &result); + + if (!newpage) + return -ENOMEM; + + if (page_count(page) == 1) { + /* page was freed from under us. So we are done. */ + goto out; + } + + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) + goto out; + + rc = __unmap_and_move(page, newpage, force, offlining, sync); +out: if (rc != -EAGAIN) { - /* - * A page that has been migrated has all references - * removed and will be freed. A page that has not been - * migrated will have kepts its references and be - * restored. - */ - list_del(&page->lru); + /* + * A page that has been migrated has all references + * removed and will be freed. A page that has not been + * migrated will have kepts its references and be + * restored. + */ + list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); putback_lru_page(page); } - /* * Move the new page to the LRU. If migration was not successful * then this will free the page. */ putback_lru_page(newpage); - if (result) { if (rc) *result = rc; From ec46a9e8767b9fa37c5d18b18b24ea96a5d2695d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:22 -0800 Subject: [PATCH 022/135] mm: compaction: allow compaction to isolate dirty pages commit a77ebd333cd810d7b680d544be88c875131c2bd3 upstream. Stable note: Not tracked in Bugzilla. A fix aimed at preserving page aging information by reducing LRU list churning had the side-effect of reducing THP allocation success rates. This was part of a series to restore the success rates while preserving the reclaim fix. Short summary: There are severe stalls when a USB stick using VFAT is used with THP enabled that are reduced by this series. If you are experiencing this problem, please test and report back and considering I have seen complaints from openSUSE and Fedora users on this as well as a few private mails, I'm guessing it's a widespread issue. This is a new type of USB-related stall because it is due to synchronous compaction writing where as in the past the big problem was dirty pages reaching the end of the LRU and being written by reclaim. Am cc'ing Andrew this time and this series would replace mm-do-not-stall-in-synchronous-compaction-for-thp-allocations.patch. I'm also cc'ing Dave Jones as he might have merged that patch to Fedora for wider testing and ideally it would be reverted and replaced by this series. That said, the later patches could really do with some review. If this series is not the answer then a new direction needs to be discussed because as it is, the stalls are unacceptable as the results in this leader show. For testers that try backporting this to 3.1, it won't work because there is a non-obvious dependency on not writing back pages in direct reclaim so you need those patches too. Changelog since V5 o Rebase to 3.2-rc5 o Tidy up the changelogs a bit Changelog since V4 o Added reviewed-bys, credited Andrea properly for sync-light o Allow dirty pages without mappings to be considered for migration o Bound the number of pages freed for compaction o Isolate PageReclaim pages on their own LRU list This is against 3.2-rc5 and follows on from discussions on "mm: Do not stall in synchronous compaction for THP allocations" and "[RFC PATCH 0/5] Reduce compaction-related stalls". Initially, the proposed patch eliminated stalls due to compaction which sometimes resulted in user-visible interactivity problems on browsers by simply never using sync compaction. The downside was that THP success allocation rates were lower because dirty pages were not being migrated as reported by Andrea. His approach at fixing this was nacked on the grounds that it reverted fixes from Rik merged that reduced the amount of pages reclaimed as it severely impacted his workloads performance. This series attempts to reconcile the requirements of maximising THP usage, without stalling in a user-visible fashion due to compaction or cheating by reclaiming an excessive number of pages. Patch 1 partially reverts commit 39deaf85 to allow migration to isolate dirty pages. This is because migration can move some dirty pages without blocking. Patch 2 notes that the /proc/sys/vm/compact_memory handler is not using synchronous compaction when it should be. This is unrelated to the reported stalls but is worth fixing. Patch 3 checks if we isolated a compound page during lumpy scan and account for it properly. For the most part, this affects tracing so it's unrelated to the stalls but worth fixing. Patch 4 notes that it is possible to abort reclaim early for compaction and return 0 to the page allocator potentially entering the "may oom" path. This has not been observed in practice but the rest of the series potentially makes it easier to happen. Patch 5 adds a sync parameter to the migratepage callback and gives the callback responsibility for migrating the page without blocking if sync==false. For example, fallback_migrate_page will not call writepage if sync==false. This increases the number of pages that can be handled by asynchronous compaction thereby reducing stalls. Patch 6 restores filter-awareness to isolate_lru_page for migration. In practice, it means that pages under writeback and pages without a ->migratepage callback will not be isolated for migration. Patch 7 avoids calling direct reclaim if compaction is deferred but makes sure that compaction is only deferred if sync compaction was used. Patch 8 introduces a sync-light migration mechanism that sync compaction uses. The objective is to allow some stalls but to not call ->writepage which can lead to significant user-visible stalls. Patch 9 notes that while we want to abort reclaim ASAP to allow compation to go ahead that we leave a very small window of opportunity for compaction to run. This patch allows more pages to be freed by reclaim but bounds the number to a reasonable level based on the high watermark on each zone. Patch 10 allows slabs to be shrunk even after compaction_ready() is true for one zone. This is to avoid a problem whereby a single small zone can abort reclaim even though no pages have been reclaimed and no suitably large zone is in a usable state. Patch 11 fixes a problem with the rate of page scanning. As reclaim is rarely stalling on pages under writeback it means that scan rates are very high. This is particularly true for direct reclaim which is not calling writepage. The vmstat figures implied that much of this was busy work with PageReclaim pages marked for immediate reclaim. This patch is a prototype that moves these pages to their own LRU list. This has been tested and other than 2 USB keys getting trashed, nothing horrible fell out. That said, I am a bit unhappy with the rescue logic in patch 11 but did not find a better way around it. It does significantly reduce scan rates and System CPU time indicating it is the right direction to take. What is of critical importance is that stalls due to compaction are massively reduced even though sync compaction was still allowed. Testing from people complaining about stalls copying to USBs with THP enabled are particularly welcome. The following tests all involve THP usage and USB keys in some way. Each test follows this type of pattern 1. Read from some fast fast storage, be it raw device or file. Each time the copy finishes, start again until the test ends 2. Write a large file to a filesystem on a USB stick. Each time the copy finishes, start again until the test ends 3. When memory is low, start an alloc process that creates a mapping the size of physical memory to stress THP allocation. This is the "real" part of the test and the part that is meant to trigger stalls when THP is enabled. Copying continues in the background. 4. Record the CPU usage and time to execute of the alloc process 5. Record the number of THP allocs and fallbacks as well as the number of THP pages in use a the end of the test just before alloc exited 6. Run the test 5 times to get an idea of variability 7. Between each run, sync is run and caches dropped and the test waits until nr_dirty is a small number to avoid interference or caching between iterations that would skew the figures. The individual tests were then writebackCPDeviceBasevfat Disable THP, read from a raw device (sda), vfat on USB stick writebackCPDeviceBaseext4 Disable THP, read from a raw device (sda), ext4 on USB stick writebackCPDevicevfat THP enabled, read from a raw device (sda), vfat on USB stick writebackCPDeviceext4 THP enabled, read from a raw device (sda), ext4 on USB stick writebackCPFilevfat THP enabled, read from a file on fast storage and USB, both vfat writebackCPFileext4 THP enabled, read from a file on fast storage and USB, both ext4 The kernels tested were 3.1 3.1 vanilla 3.2-rc5 freemore Patches 1-10 immediate Patches 1-11 andrea The 8 patches Andrea posted as a basis of comparison The results are very long unfortunately. I'll start with the case where we are not using THP at all writebackCPDeviceBasevfat 3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1 System Time 1.28 ( 0.00%) 54.49 (-4143.46%) 48.63 (-3687.69%) 4.69 ( -265.11%) 51.88 (-3940.81%) +/- 0.06 ( 0.00%) 2.45 (-4305.55%) 4.75 (-8430.57%) 7.46 (-13282.76%) 4.76 (-8440.70%) User Time 0.09 ( 0.00%) 0.05 ( 40.91%) 0.06 ( 29.55%) 0.07 ( 15.91%) 0.06 ( 27.27%) +/- 0.02 ( 0.00%) 0.01 ( 45.39%) 0.02 ( 25.07%) 0.00 ( 77.06%) 0.01 ( 52.24%) Elapsed Time 110.27 ( 0.00%) 56.38 ( 48.87%) 49.95 ( 54.70%) 11.77 ( 89.33%) 53.43 ( 51.54%) +/- 7.33 ( 0.00%) 3.77 ( 48.61%) 4.94 ( 32.63%) 6.71 ( 8.50%) 4.76 ( 35.03%) THP Active 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) +/- 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Fault Alloc 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) +/- 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Fault Fallback 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) +/- 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) The THP figures are obviously all 0 because THP was enabled. The main thing to watch is the elapsed times and how they compare to times when THP is enabled later. It's also important to note that elapsed time is improved by this series as System CPu time is much reduced. writebackCPDevicevfat 3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1 System Time 1.22 ( 0.00%) 13.89 (-1040.72%) 46.40 (-3709.20%) 4.44 ( -264.37%) 47.37 (-3789.33%) +/- 0.06 ( 0.00%) 22.82 (-37635.56%) 3.84 (-6249.44%) 6.48 (-10618.92%) 6.60 (-10818.53%) User Time 0.06 ( 0.00%) 0.06 ( -6.90%) 0.05 ( 17.24%) 0.05 ( 13.79%) 0.04 ( 31.03%) +/- 0.01 ( 0.00%) 0.01 ( 33.33%) 0.01 ( 33.33%) 0.01 ( 39.14%) 0.01 ( 25.46%) Elapsed Time 10445.54 ( 0.00%) 2249.92 ( 78.46%) 70.06 ( 99.33%) 16.59 ( 99.84%) 472.43 ( 95.48%) +/- 643.98 ( 0.00%) 811.62 ( -26.03%) 10.02 ( 98.44%) 7.03 ( 98.91%) 59.99 ( 90.68%) THP Active 15.60 ( 0.00%) 35.20 ( 225.64%) 65.00 ( 416.67%) 70.80 ( 453.85%) 62.20 ( 398.72%) +/- 18.48 ( 0.00%) 51.29 ( 277.59%) 15.99 ( 86.52%) 37.91 ( 205.18%) 22.02 ( 119.18%) Fault Alloc 121.80 ( 0.00%) 76.60 ( 62.89%) 155.40 ( 127.59%) 181.20 ( 148.77%) 286.60 ( 235.30%) +/- 73.51 ( 0.00%) 61.11 ( 83.12%) 34.89 ( 47.46%) 31.88 ( 43.36%) 68.13 ( 92.68%) Fault Fallback 881.20 ( 0.00%) 926.60 ( -5.15%) 847.60 ( 3.81%) 822.00 ( 6.72%) 716.60 ( 18.68%) +/- 73.51 ( 0.00%) 61.26 ( 16.67%) 34.89 ( 52.54%) 31.65 ( 56.94%) 67.75 ( 7.84%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 3540.88 1945.37 716.04 64.97 1937.03 Total Elapsed Time (seconds) 52417.33 11425.90 501.02 230.95 2520.28 The first thing to note is the "Elapsed Time" for the vanilla kernels of 2249 seconds versus 56 with THP disabled which might explain the reports of USB stalls with THP enabled. Applying the patches brings performance in line with THP-disabled performance while isolating pages for immediate reclaim from the LRU cuts down System CPU time. The "Fault Alloc" success rate figures are also improved. The vanilla kernel only managed to allocate 76.6 pages on average over the course of 5 iterations where as applying the series allocated 181.20 on average albeit it is well within variance. It's worth noting that applies the series at least descreases the amount of variance which implies an improvement. Andrea's series had a higher success rate for THP allocations but at a severe cost to elapsed time which is still better than vanilla but still much worse than disabling THP altogether. One can bring my series close to Andrea's by removing this check /* * If compaction is deferred for high-order allocations, it is because * sync compaction recently failed. In this is the case and the caller * has requested the system not be heavily disrupted, fail the * allocation now instead of entering direct reclaim */ if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; I didn't include a patch that removed the above check because hurting overall performance to improve the THP figure is not what the average user wants. It's something to consider though if someone really wants to maximise THP usage no matter what it does to the workload initially. This is summary of vmstat figures from the same test. 3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1 Page Ins 3257266139 1111844061 17263623 10901575 161423219 Page Outs 81054922 30364312 3626530 3657687 8753730 Swap Ins 3294 2851 6560 4964 4592 Swap Outs 390073 528094 620197 790912 698285 Direct pages scanned 1077581700 3024951463 1764930052 115140570 5901188831 Kswapd pages scanned 34826043 7112868 2131265 1686942 1893966 Kswapd pages reclaimed 28950067 4911036 1246044 966475 1497726 Direct pages reclaimed 805148398 280167837 3623473 2215044 40809360 Kswapd efficiency 83% 69% 58% 57% 79% Kswapd velocity 664.399 622.521 4253.852 7304.360 751.490 Direct efficiency 74% 9% 0% 1% 0% Direct velocity 20557.737 264745.137 3522673.849 498551.938 2341481.435 Percentage direct scans 96% 99% 99% 98% 99% Page writes by reclaim 722646 529174 620319 791018 699198 Page writes file 332573 1080 122 106 913 Page writes anon 390073 528094 620197 790912 698285 Page reclaim immediate 0 2552514720 1635858848 111281140 5478375032 Page rescued immediate 0 0 0 87848 0 Slabs scanned 23552 23552 9216 8192 9216 Direct inode steals 231 0 0 0 0 Kswapd inode steals 0 0 0 0 0 Kswapd skipped wait 28076 786 0 61 6 THP fault alloc 609 383 753 906 1433 THP collapse alloc 12 6 0 0 6 THP splits 536 211 456 593 1136 THP fault fallback 4406 4633 4263 4110 3583 THP collapse fail 120 127 0 0 4 Compaction stalls 1810 728 623 779 3200 Compaction success 196 53 60 80 123 Compaction failures 1614 675 563 699 3077 Compaction pages moved 193158 53545 243185 333457 226688 Compaction move failure 9952 9396 16424 23676 45070 The main things to look at are 1. Page In/out figures are much reduced by the series. 2. Direct page scanning is incredibly high (264745.137 pages scanned per second on the vanilla kernel) but isolating PageReclaim pages on their own list reduces the number of pages scanned significantly. 3. The fact that "Page rescued immediate" is a positive number implies that we sometimes race removing pages from the LRU_IMMEDIATE list that need to be put back on a normal LRU but it happens only for 0.07% of the pages marked for immediate reclaim. writebackCPDeviceext4 3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1 System Time 1.51 ( 0.00%) 1.77 ( -17.66%) 1.46 ( 2.92%) 1.15 ( 23.77%) 1.89 ( -25.63%) +/- 0.27 ( 0.00%) 0.67 ( -148.52%) 0.33 ( -22.76%) 0.30 ( -11.15%) 0.19 ( 30.16%) User Time 0.03 ( 0.00%) 0.04 ( -37.50%) 0.05 ( -62.50%) 0.07 ( -112.50%) 0.04 ( -18.75%) +/- 0.01 ( 0.00%) 0.02 ( -146.64%) 0.02 ( -97.91%) 0.02 ( -75.59%) 0.02 ( -63.30%) Elapsed Time 124.93 ( 0.00%) 114.49 ( 8.36%) 96.77 ( 22.55%) 27.48 ( 78.00%) 205.70 ( -64.65%) +/- 20.20 ( 0.00%) 74.39 ( -268.34%) 59.88 ( -196.48%) 7.72 ( 61.79%) 25.03 ( -23.95%) THP Active 161.80 ( 0.00%) 83.60 ( 51.67%) 141.20 ( 87.27%) 84.60 ( 52.29%) 82.60 ( 51.05%) +/- 71.95 ( 0.00%) 43.80 ( 60.88%) 26.91 ( 37.40%) 59.02 ( 82.03%) 52.13 ( 72.45%) Fault Alloc 471.40 ( 0.00%) 228.60 ( 48.49%) 282.20 ( 59.86%) 225.20 ( 47.77%) 388.40 ( 82.39%) +/- 88.07 ( 0.00%) 87.42 ( 99.26%) 73.79 ( 83.78%) 109.62 ( 124.47%) 82.62 ( 93.81%) Fault Fallback 531.60 ( 0.00%) 774.60 ( -45.71%) 720.80 ( -35.59%) 777.80 ( -46.31%) 614.80 ( -15.65%) +/- 88.07 ( 0.00%) 87.26 ( 0.92%) 73.79 ( 16.22%) 109.62 ( -24.47%) 82.29 ( 6.56%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 50.22 33.76 30.65 24.14 128.45 Total Elapsed Time (seconds) 1113.73 1132.19 1029.45 759.49 1707.26 Similar test but the USB stick is using ext4 instead of vfat. As ext4 does not use writepage for migration, the large stalls due to compaction when THP is enabled are not observed. Still, isolating PageReclaim pages on their own list helped completion time largely by reducing the number of pages scanned by direct reclaim although time spend in congestion_wait could also be a factor. Again, Andrea's series had far higher success rates for THP allocation at the cost of elapsed time. I didn't look too closely but a quick look at the vmstat figures tells me kswapd reclaimed 8 times more pages than the patch series and direct reclaim reclaimed roughly three times as many pages. It follows that if memory is aggressively reclaimed, there will be more available for THP. writebackCPFilevfat 3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1 System Time 1.76 ( 0.00%) 29.10 (-1555.52%) 46.01 (-2517.18%) 4.79 ( -172.35%) 54.89 (-3022.53%) +/- 0.14 ( 0.00%) 25.61 (-18185.17%) 2.15 (-1434.83%) 6.60 (-4610.03%) 9.75 (-6863.76%) User Time 0.05 ( 0.00%) 0.07 ( -45.83%) 0.05 ( -4.17%) 0.06 ( -29.17%) 0.06 ( -16.67%) +/- 0.02 ( 0.00%) 0.02 ( 20.11%) 0.02 ( -3.14%) 0.01 ( 31.58%) 0.01 ( 47.41%) Elapsed Time 22520.79 ( 0.00%) 1082.85 ( 95.19%) 73.30 ( 99.67%) 32.43 ( 99.86%) 291.84 ( 98.70%) +/- 7277.23 ( 0.00%) 706.29 ( 90.29%) 19.05 ( 99.74%) 17.05 ( 99.77%) 125.55 ( 98.27%) THP Active 83.80 ( 0.00%) 12.80 ( 15.27%) 15.60 ( 18.62%) 13.00 ( 15.51%) 0.80 ( 0.95%) +/- 66.81 ( 0.00%) 20.19 ( 30.22%) 5.92 ( 8.86%) 15.06 ( 22.54%) 1.17 ( 1.75%) Fault Alloc 171.00 ( 0.00%) 67.80 ( 39.65%) 97.40 ( 56.96%) 125.60 ( 73.45%) 133.00 ( 77.78%) +/- 82.91 ( 0.00%) 30.69 ( 37.02%) 53.91 ( 65.02%) 55.05 ( 66.40%) 21.19 ( 25.56%) Fault Fallback 832.00 ( 0.00%) 935.20 ( -12.40%) 906.00 ( -8.89%) 877.40 ( -5.46%) 870.20 ( -4.59%) +/- 82.91 ( 0.00%) 30.69 ( 62.98%) 54.01 ( 34.86%) 55.05 ( 33.60%) 20.91 ( 74.78%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 7229.81 928.42 704.52 80.68 1330.76 Total Elapsed Time (seconds) 112849.04 5618.69 571.11 360.54 1664.28 In this case, the test is reading/writing only from filesystems but as it's vfat, it's slow due to calling writepage during compaction. Little to observe really - the time to complete the test goes way down with the series applied and THP allocation success rates go up in comparison to 3.2-rc5. The success rates are lower than 3.1.0 but the elapsed time for that kernel is abysmal so it is not really a sensible comparison. As before, Andrea's series allocates more THPs at the cost of overall performance. writebackCPFileext4 3.1.0-vanilla rc5-vanilla freemore-v6r1 isolate-v6r1 andrea-v2r1 System Time 1.51 ( 0.00%) 1.77 ( -17.66%) 1.46 ( 2.92%) 1.15 ( 23.77%) 1.89 ( -25.63%) +/- 0.27 ( 0.00%) 0.67 ( -148.52%) 0.33 ( -22.76%) 0.30 ( -11.15%) 0.19 ( 30.16%) User Time 0.03 ( 0.00%) 0.04 ( -37.50%) 0.05 ( -62.50%) 0.07 ( -112.50%) 0.04 ( -18.75%) +/- 0.01 ( 0.00%) 0.02 ( -146.64%) 0.02 ( -97.91%) 0.02 ( -75.59%) 0.02 ( -63.30%) Elapsed Time 124.93 ( 0.00%) 114.49 ( 8.36%) 96.77 ( 22.55%) 27.48 ( 78.00%) 205.70 ( -64.65%) +/- 20.20 ( 0.00%) 74.39 ( -268.34%) 59.88 ( -196.48%) 7.72 ( 61.79%) 25.03 ( -23.95%) THP Active 161.80 ( 0.00%) 83.60 ( 51.67%) 141.20 ( 87.27%) 84.60 ( 52.29%) 82.60 ( 51.05%) +/- 71.95 ( 0.00%) 43.80 ( 60.88%) 26.91 ( 37.40%) 59.02 ( 82.03%) 52.13 ( 72.45%) Fault Alloc 471.40 ( 0.00%) 228.60 ( 48.49%) 282.20 ( 59.86%) 225.20 ( 47.77%) 388.40 ( 82.39%) +/- 88.07 ( 0.00%) 87.42 ( 99.26%) 73.79 ( 83.78%) 109.62 ( 124.47%) 82.62 ( 93.81%) Fault Fallback 531.60 ( 0.00%) 774.60 ( -45.71%) 720.80 ( -35.59%) 777.80 ( -46.31%) 614.80 ( -15.65%) +/- 88.07 ( 0.00%) 87.26 ( 0.92%) 73.79 ( 16.22%) 109.62 ( -24.47%) 82.29 ( 6.56%) MMTests Statistics: duration User/Sys Time Running Test (seconds) 50.22 33.76 30.65 24.14 128.45 Total Elapsed Time (seconds) 1113.73 1132.19 1029.45 759.49 1707.26 Same type of story - elapsed times go down. In this case, allocation success rates are roughtly the same. As before, Andrea's has higher success rates but takes a lot longer. Overall the series does reduce latencies and while the tests are inherency racy as alloc competes with the cp processes, the variability was included. The THP allocation rates are not as high as they could be but that is because we would have to be more aggressive about reclaim and compaction impacting overall performance. This patch: Commit 39deaf85 ("mm: compaction: make isolate_lru_page() filter-aware") noted that compaction does not migrate dirty or writeback pages and that is was meaningless to pick the page and re-add it to the LRU list. What was missed during review is that asynchronous migration moves dirty pages if their ->migratepage callback is migrate_page() because these can be moved without blocking. This potentially impacted hugepage allocation success rates by a factor depending on how many dirty pages are in the system. This patch partially reverts 39deaf85 to allow migration to isolate dirty pages again. This increases how much compaction disrupts the LRU but that is addressed later in the series. Signed-off-by: Mel Gorman Reviewed-by: Andrea Arcangeli Reviewed-by: Rik van Riel Reviewed-by: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/compaction.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index b30c40f860c7..228f91b57182 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -371,9 +371,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; } - if (!cc->sync) - mode |= ISOLATE_CLEAN; - /* Try isolate the page */ if (__isolate_lru_page(page, mode, 0) != 0) continue; From 397d9c507ff1c9c5afc80c80ee245c2455d6a1db Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:34 -0800 Subject: [PATCH 023/135] mm: compaction: determine if dirty pages can be migrated without blocking within ->migratepage commit b969c4ab9f182a6e1b2a0848be349f99714947b0 upstream. Stable note: Not tracked in Bugzilla. A fix aimed at preserving page aging information by reducing LRU list churning had the side-effect of reducing THP allocation success rates. This was part of a series to restore the success rates while preserving the reclaim fix. Asynchronous compaction is used when allocating transparent hugepages to avoid blocking for long periods of time. Due to reports of stalling, there was a debate on disabling synchronous compaction but this severely impacted allocation success rates. Part of the reason was that many dirty pages are skipped in asynchronous compaction by the following check; if (PageDirty(page) && !sync && mapping->a_ops->migratepage != migrate_page) rc = -EBUSY; This skips over all mapping aops using buffer_migrate_page() even though it is possible to migrate some of these pages without blocking. This patch updates the ->migratepage callback with a "sync" parameter. It is the responsibility of the callback to fail gracefully if migration would block. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 4 +- fs/hugetlbfs/inode.c | 3 +- fs/nfs/internal.h | 2 +- fs/nfs/write.c | 4 +- include/linux/fs.h | 9 ++- include/linux/migrate.h | 2 +- mm/migrate.c | 129 ++++++++++++++++++++++++++++------------ 7 files changed, 106 insertions(+), 47 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1ac8db5dc0a3..522cb2a6cb10 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -801,7 +801,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, #ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, bool sync) { /* * we can't safely write a btree page from here, @@ -816,7 +816,7 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, sync); } #endif diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8b0c87530b04..6ca608bbc8b6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -568,7 +568,8 @@ static int hugetlbfs_set_page_dirty(struct page *page) } static int hugetlbfs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + bool sync) { int rc; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 2a55347a2daa..a74442ae6884 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -315,7 +315,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); #else #define nfs_migrate_page NULL #endif diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f2f80c005c02..22a48fdee3cf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1662,7 +1662,7 @@ out_error: #ifdef CONFIG_MIGRATION int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page) + struct page *page, bool sync) { /* * If PagePrivate is set, then the page is currently associated with @@ -1677,7 +1677,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, nfs_fscache_release_page(page, GFP_KERNEL); - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, sync); } #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 96b10354c741..09ddec9ee5dd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -607,9 +607,12 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, unsigned long *); - /* migrate the contents of a page to the specified target */ + /* + * migrate the contents of a page to the specified target. If sync + * is false, it must not block. + */ int (*migratepage) (struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); @@ -2478,7 +2481,7 @@ extern int generic_check_addressable(unsigned, u64); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); #else #define buffer_migrate_page NULL #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index e39aeecfe9a2..14e6d2a88475 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -11,7 +11,7 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, bool sync); diff --git a/mm/migrate.c b/mm/migrate.c index 33358f878111..d43689ad8a5b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -220,6 +220,55 @@ out: pte_unmap_unlock(ptep, ptl); } +#ifdef CONFIG_BLOCK +/* Returns true if all buffers are successfully locked */ +static bool buffer_migrate_lock_buffers(struct buffer_head *head, bool sync) +{ + struct buffer_head *bh = head; + + /* Simple case, sync compaction */ + if (sync) { + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return true; + } + + /* async case, we cannot block on lock_buffer so use trylock_buffer */ + do { + get_bh(bh); + if (!trylock_buffer(bh)) { + /* + * We failed to lock the buffer and cannot stall in + * async migration. Release the taken locks + */ + struct buffer_head *failed_bh = bh; + put_bh(failed_bh); + bh = head; + while (bh != failed_bh) { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + } + return false; + } + + bh = bh->b_this_page; + } while (bh != head); + return true; +} +#else +static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, + bool sync) +{ + return true; +} +#endif /* CONFIG_BLOCK */ + /* * Replace the page in the mapping. * @@ -229,7 +278,8 @@ out: * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ static int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + struct buffer_head *head, bool sync) { int expected_count; void **pslot; @@ -258,6 +308,19 @@ static int migrate_page_move_mapping(struct address_space *mapping, return -EAGAIN; } + /* + * In the async migration case of moving a page with buffers, lock the + * buffers using trylock before the mapping is moved. If the mapping + * was moved, we later failed to lock the buffers and could not move + * the mapping back due to an elevated page count, we would have to + * block waiting on other references to be dropped. + */ + if (!sync && head && !buffer_migrate_lock_buffers(head, sync)) { + page_unfreeze_refs(page, expected_count); + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + /* * Now we know that no one else is looking at the page. */ @@ -415,13 +478,13 @@ EXPORT_SYMBOL(fail_migrate_page); * Pages are locked upon entry and exit. */ int migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, bool sync) { int rc; BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, sync); if (rc) return rc; @@ -438,28 +501,28 @@ EXPORT_SYMBOL(migrate_page); * exist. */ int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, bool sync) { struct buffer_head *bh, *head; int rc; if (!page_has_buffers(page)) - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, sync); head = page_buffers(page); - rc = migrate_page_move_mapping(mapping, newpage, page); + rc = migrate_page_move_mapping(mapping, newpage, page, head, sync); if (rc) return rc; - bh = head; - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); + /* + * In the async case, migrate_page_move_mapping locked the buffers + * with an IRQ-safe spinlock held. In the sync case, the buffers + * need to be locked now + */ + if (sync) + BUG_ON(!buffer_migrate_lock_buffers(head, sync)); ClearPagePrivate(page); set_page_private(newpage, page_private(page)); @@ -536,10 +599,13 @@ static int writeout(struct address_space *mapping, struct page *page) * Default handling if a filesystem does not provide a migration function. */ static int fallback_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, bool sync) { - if (PageDirty(page)) + if (PageDirty(page)) { + if (!sync) + return -EBUSY; return writeout(mapping, page); + } /* * Buffers may be managed in a filesystem specific way. @@ -549,7 +615,7 @@ static int fallback_migrate_page(struct address_space *mapping, !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, sync); } /* @@ -585,29 +651,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, mapping = page_mapping(page); if (!mapping) - rc = migrate_page(mapping, newpage, page); - else { + rc = migrate_page(mapping, newpage, page, sync); + else if (mapping->a_ops->migratepage) /* - * Do not writeback pages if !sync and migratepage is - * not pointing to migrate_page() which is nonblocking - * (swapcache/tmpfs uses migratepage = migrate_page). + * Most pages have a mapping and most filesystems provide a + * migratepage callback. Anonymous pages are part of swap + * space which also has its own migratepage callback. This + * is the most common path for page migration. */ - if (PageDirty(page) && !sync && - mapping->a_ops->migratepage != migrate_page) - rc = -EBUSY; - else if (mapping->a_ops->migratepage) - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(mapping, - newpage, page); - else - rc = fallback_migrate_page(mapping, newpage, page); - } + rc = mapping->a_ops->migratepage(mapping, + newpage, page, sync); + else + rc = fallback_migrate_page(mapping, newpage, page, sync); if (rc) { newpage->mapping = NULL; From c17a36656685a2af6ea9e99fa243a7103b643d12 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:41 -0800 Subject: [PATCH 024/135] mm: page allocator: do not call direct reclaim for THP allocations while compaction is deferred commit 66199712e9eef5aede09dbcd9dfff87798a66917 upstream. Stable note: Not tracked in Buzilla. This was part of a series that reduced interactivity stalls experienced when THP was enabled. If compaction is deferred, direct reclaim is used to try to free enough pages for the allocation to succeed. For small high-orders, this has a reasonable chance of success. However, if the caller has specified __GFP_NO_KSWAPD to limit the disruption to the system, it makes more sense to fail the allocation rather than stall the caller in direct reclaim. This patch skips direct reclaim if compaction is deferred and the caller specifies __GFP_NO_KSWAPD. Async compaction only considers a subset of pages so it is possible for compaction to be deferred prematurely and not enter direct reclaim even in cases where it should. To compensate for this, this patch also defers compaction only if sync compaction failed. Signed-off-by: Mel Gorman Acked-by: Minchan Kim Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 70c049dc4ee1..0d490ba1af93 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1897,14 +1897,20 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { struct page *page; - if (!order || compaction_deferred(preferred_zone)) + if (!order) return NULL; + if (compaction_deferred(preferred_zone)) { + *deferred_compaction = true; + return NULL; + } + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration); @@ -1932,7 +1938,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * but not enough to satisfy watermarks. */ count_vm_event(COMPACTFAIL); - defer_compaction(preferred_zone); + + /* + * As async compaction considers a subset of pageblocks, only + * defer if the failure was a sync compaction failure. + */ + if (sync_migration) + defer_compaction(preferred_zone); cond_resched(); } @@ -1944,8 +1956,9 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress, - bool sync_migration) + int migratetype, bool sync_migration, + bool *deferred_compaction, + unsigned long *did_some_progress) { return NULL; } @@ -2095,6 +2108,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed = 0; unsigned long did_some_progress; bool sync_migration = false; + bool deferred_compaction = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2175,12 +2189,22 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; sync_migration = true; + /* + * If compaction is deferred for high-order allocations, it is because + * sync compaction recently failed. In this is the case and the caller + * has requested the system not be heavily disrupted, fail the + * allocation now instead of entering direct reclaim + */ + if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -2243,8 +2267,9 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress, - sync_migration); + migratetype, sync_migration, + &deferred_compaction, + &did_some_progress); if (page) goto got_pg; } From a7e32d7a2a801b7838b4159e9d73ea86f68ae002 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:38 -0800 Subject: [PATCH 025/135] mm: compaction: make isolate_lru_page() filter-aware again commit c82449352854ff09e43062246af86bdeb628f0c3 upstream. Stable note: Not tracked in Bugzilla. A fix aimed at preserving page aging information by reducing LRU list churning had the side-effect of reducing THP allocation success rates. This was part of a series to restore the success rates while preserving the reclaim fix. Commit 39deaf85 ("mm: compaction: make isolate_lru_page() filter-aware") noted that compaction does not migrate dirty or writeback pages and that is was meaningless to pick the page and re-add it to the LRU list. This had to be partially reverted because some dirty pages can be migrated by compaction without blocking. This patch updates "mm: compaction: make isolate_lru_page" by skipping over pages that migration has no possibility of migrating to minimise LRU disruption. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Reviewed-by: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mmzone.h | 2 ++ mm/compaction.c | 3 +++ mm/vmscan.c | 35 +++++++++++++++++++++++++++++++++-- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 80acfbc9aa14..b32f3f9182ca 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -166,6 +166,8 @@ static inline int is_unevictable_lru(enum lru_list l) #define ISOLATE_CLEAN ((__force isolate_mode_t)0x4) /* Isolate unmapped file */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x8) +/* Isolate for asynchronous migration */ +#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x10) /* LRU Isolation modes. */ typedef unsigned __bitwise__ isolate_mode_t; diff --git a/mm/compaction.c b/mm/compaction.c index 228f91b57182..da449202ff9d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -371,6 +371,9 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; } + if (!cc->sync) + mode |= ISOLATE_ASYNC_MIGRATE; + /* Try isolate the page */ if (__isolate_lru_page(page, mode, 0) != 0) continue; diff --git a/mm/vmscan.c b/mm/vmscan.c index 0c78bd3bd964..45c40d66b747 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1045,8 +1045,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) ret = -EBUSY; - if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) - return ret; + /* + * To minimise LRU disruption, the caller can indicate that it only + * wants to isolate pages it will be able to operate on without + * blocking - clean pages for the most part. + * + * ISOLATE_CLEAN means that only clean pages should be isolated. This + * is used by reclaim when it is cannot write to backing storage + * + * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages + * that it is possible to migrate without blocking + */ + if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + /* All the caller can do on PageWriteback is block */ + if (PageWriteback(page)) + return ret; + + if (PageDirty(page)) { + struct address_space *mapping; + + /* ISOLATE_CLEAN means only clean pages */ + if (mode & ISOLATE_CLEAN) + return ret; + + /* + * Only pages without mappings or that have a + * ->migratepage callback are possible to migrate + * without blocking + */ + mapping = page_mapping(page); + if (mapping && !mapping->a_ops->migratepage) + return ret; + } + } if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) return ret; From 5d62e5ca429b85ecadaa5042bdb1d8b88d4bfe80 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Mon, 31 Oct 2011 17:08:39 -0700 Subject: [PATCH 026/135] kswapd: avoid unnecessary rebalance after an unsuccessful balancing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d2ebd0f6b89567eb93ead4e2ca0cbe03021f344b upstream. Stable note: Fixes https://bugzilla.redhat.com/show_bug.cgi?id=712019. This patch reduces kswapd CPU usage. In commit 215ddd66 ("mm: vmscan: only read new_classzone_idx from pgdat when reclaiming successfully") , Mel Gorman said kswapd is better to sleep after a unsuccessful balancing if there is tighter reclaim request pending in the balancing. But in the following scenario, kswapd do something that is not matched our expectation. The patch fixes this issue. 1, Read pgdat request A (classzone_idx, order = 3) 2, balance_pgdat() 3, During pgdat, a new pgdat request B (classzone_idx, order = 5) is placed 4, balance_pgdat() returns but failed since returned order = 0 5, pgdat of request A assigned to balance_pgdat(), and do balancing again. While the expectation behavior of kswapd should try to sleep. Signed-off-by: Alex Shi Reviewed-by: Tim Chen Acked-by: Mel Gorman Tested-by: Pádraig Brady Cc: Rik van Riel Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 45c40d66b747..5cc0f92eda3e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2844,7 +2844,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) static int kswapd(void *p) { unsigned long order, new_order; + unsigned balanced_order; int classzone_idx, new_classzone_idx; + int balanced_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -2875,7 +2877,9 @@ static int kswapd(void *p) set_freezable(); order = new_order = 0; + balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; + balanced_classzone_idx = classzone_idx; for ( ; ; ) { int ret; @@ -2884,7 +2888,8 @@ static int kswapd(void *p) * new request of a similar or harder type will succeed soon * so consider going to sleep on the basis we reclaimed at */ - if (classzone_idx >= new_classzone_idx && order == new_order) { + if (balanced_classzone_idx >= new_classzone_idx && + balanced_order == new_order) { new_order = pgdat->kswapd_max_order; new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; @@ -2899,7 +2904,8 @@ static int kswapd(void *p) order = new_order; classzone_idx = new_classzone_idx; } else { - kswapd_try_to_sleep(pgdat, order, classzone_idx); + kswapd_try_to_sleep(pgdat, balanced_order, + balanced_classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; @@ -2916,7 +2922,9 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - order = balance_pgdat(pgdat, order, &classzone_idx); + balanced_classzone_idx = classzone_idx; + balanced_order = balance_pgdat(pgdat, order, + &balanced_classzone_idx); } } return 0; From 9203b3fa57cc16bf1ad1be7c64b01b5e45cc6151 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Mon, 31 Oct 2011 17:08:45 -0700 Subject: [PATCH 027/135] kswapd: assign new_order and new_classzone_idx after wakeup in sleeping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f0dfcde099453aa4c0dc42473828d15a6d492936 upstream. Stable note: Fixes https://bugzilla.redhat.com/show_bug.cgi?id=712019. This patch reduces kswapd CPU usage. There 2 places to read pgdat in kswapd. One is return from a successful balance, another is waked up from kswapd sleeping. The new_order and new_classzone_idx represent the balance input order and classzone_idx. But current new_order and new_classzone_idx are not assigned after kswapd_try_to_sleep(), that will cause a bug in the following scenario. 1: after a successful balance, kswapd goes to sleep, and new_order = 0; new_classzone_idx = __MAX_NR_ZONES - 1; 2: kswapd waked up with order = 3 and classzone_idx = ZONE_NORMAL 3: in the balance_pgdat() running, a new balance wakeup happened with order = 5, and classzone_idx = ZONE_NORMAL 4: the first wakeup(order = 3) finished successufly, return order = 3 but, the new_order is still 0, so, this balancing will be treated as a failed balance. And then the second tighter balancing will be missed. So, to avoid the above problem, the new_order and new_classzone_idx need to be assigned for later successful comparison. Signed-off-by: Alex Shi Acked-by: Mel Gorman Reviewed-by: Minchan Kim Tested-by: Pádraig Brady Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5cc0f92eda3e..870cbcfc328a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2908,6 +2908,8 @@ static int kswapd(void *p) balanced_classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; + new_order = order; + new_classzone_idx = classzone_idx; pgdat->kswapd_max_order = 0; pgdat->classzone_idx = pgdat->nr_zones - 1; } From f869774c37710ef2b773d167d184b9936988d07f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:43 -0800 Subject: [PATCH 028/135] mm: compaction: introduce sync-light migration for use by compaction commit a6bc32b899223a877f595ef9ddc1e89ead5072b8 upstream. Stable note: Not tracked in Buzilla. This was part of a series that reduced interactivity stalls experienced when THP was enabled. These stalls were particularly noticable when copying data to a USB stick but the experiences for users varied a lot. This patch adds a lightweight sync migrate operation MIGRATE_SYNC_LIGHT mode that avoids writing back pages to backing storage. Async compaction maps to MIGRATE_ASYNC while sync compaction maps to MIGRATE_SYNC_LIGHT. For other migrate_pages users such as memory hotplug, MIGRATE_SYNC is used. This avoids sync compaction stalling for an excessive length of time, particularly when copying files to a USB stick where there might be a large number of dirty pages backed by a filesystem that does not support ->writepages. [aarcange@redhat.com: This patch is heavily based on Andrea's work] [akpm@linux-foundation.org: fix fs/nfs/write.c build] [akpm@linux-foundation.org: fix fs/btrfs/disk-io.c build] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 5 +-- fs/hugetlbfs/inode.c | 2 +- fs/nfs/internal.h | 2 +- fs/nfs/write.c | 4 +-- include/linux/fs.h | 6 ++-- include/linux/migrate.h | 23 +++++++++--- mm/compaction.c | 2 +- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 78 +++++++++++++++++++++++------------------ 11 files changed, 76 insertions(+), 52 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 522cb2a6cb10..57106a99b52d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -801,7 +801,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, #ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, bool sync) + struct page *newpage, struct page *page, + enum migrate_mode mode) { /* * we can't safely write a btree page from here, @@ -816,7 +817,7 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page, sync); + return migrate_page(mapping, newpage, page, mode); } #endif diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6ca608bbc8b6..6327a069d83d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -569,7 +569,7 @@ static int hugetlbfs_set_page_dirty(struct page *page) static int hugetlbfs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, - bool sync) + enum migrate_mode mode) { int rc; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a74442ae6884..4f10d8188ab6 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -315,7 +315,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, enum migrate_mode); #else #define nfs_migrate_page NULL #endif diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 22a48fdee3cf..58bb9994b947 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1662,7 +1662,7 @@ out_error: #ifdef CONFIG_MIGRATION int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, bool sync) + struct page *page, enum migrate_mode mode) { /* * If PagePrivate is set, then the page is currently associated with @@ -1677,7 +1677,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, nfs_fscache_release_page(page, GFP_KERNEL); - return migrate_page(mapping, newpage, page, sync); + return migrate_page(mapping, newpage, page, mode); } #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 09ddec9ee5dd..212ea7ba3f1d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -523,6 +523,7 @@ enum positive_aop_returns { struct page; struct address_space; struct writeback_control; +enum migrate_mode; struct iov_iter { const struct iovec *iov; @@ -612,7 +613,7 @@ struct address_space_operations { * is false, it must not block. */ int (*migratepage) (struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, enum migrate_mode); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); @@ -2481,7 +2482,8 @@ extern int generic_check_addressable(unsigned, u64); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, + enum migrate_mode); #else #define buffer_migrate_page NULL #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 14e6d2a88475..eaf867412f7a 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -6,18 +6,31 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); +/* + * MIGRATE_ASYNC means never block + * MIGRATE_SYNC_LIGHT in the current implementation means to allow blocking + * on most operations but not ->writepage as the potential stall time + * is too significant + * MIGRATE_SYNC will block when migrating pages + */ +enum migrate_mode { + MIGRATE_ASYNC, + MIGRATE_SYNC_LIGHT, + MIGRATE_SYNC, +}; + #ifdef CONFIG_MIGRATION #define PAGE_MIGRATION 1 extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync); + enum migrate_mode mode); extern int migrate_huge_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync); + enum migrate_mode mode); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -36,10 +49,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync) { return -ENOSYS; } + enum migrate_mode mode) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - bool sync) { return -ENOSYS; } + enum migrate_mode mode) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index da449202ff9d..8ea7308601bc 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -577,7 +577,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, - cc->sync); + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 740c4f52059c..6496748df214 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1464,7 +1464,7 @@ int soft_offline_page(struct page *page, int flags) page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - 0, true); + 0, MIGRATE_SYNC); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c46887b5a11e..ae5a3f21010b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -747,7 +747,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } /* this function returns # of failed pages */ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, - true, true); + true, MIGRATE_SYNC); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3dac2d168e47..dd5f8747e6ff 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -926,7 +926,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, true); + false, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index d43689ad8a5b..480714b6f3fd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -222,12 +222,13 @@ out: #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ -static bool buffer_migrate_lock_buffers(struct buffer_head *head, bool sync) +static bool buffer_migrate_lock_buffers(struct buffer_head *head, + enum migrate_mode mode) { struct buffer_head *bh = head; /* Simple case, sync compaction */ - if (sync) { + if (mode != MIGRATE_ASYNC) { do { get_bh(bh); lock_buffer(bh); @@ -263,7 +264,7 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head, bool sync) } #else static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, - bool sync) + enum migrate_mode mode) { return true; } @@ -279,7 +280,7 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, */ static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, - struct buffer_head *head, bool sync) + struct buffer_head *head, enum migrate_mode mode) { int expected_count; void **pslot; @@ -315,7 +316,8 @@ static int migrate_page_move_mapping(struct address_space *mapping, * the mapping back due to an elevated page count, we would have to * block waiting on other references to be dropped. */ - if (!sync && head && !buffer_migrate_lock_buffers(head, sync)) { + if (mode == MIGRATE_ASYNC && head && + !buffer_migrate_lock_buffers(head, mode)) { page_unfreeze_refs(page, expected_count); spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; @@ -478,13 +480,14 @@ EXPORT_SYMBOL(fail_migrate_page); * Pages are locked upon entry and exit. */ int migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, bool sync) + struct page *newpage, struct page *page, + enum migrate_mode mode) { int rc; BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page, NULL, sync); + rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); if (rc) return rc; @@ -501,17 +504,17 @@ EXPORT_SYMBOL(migrate_page); * exist. */ int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, bool sync) + struct page *newpage, struct page *page, enum migrate_mode mode) { struct buffer_head *bh, *head; int rc; if (!page_has_buffers(page)) - return migrate_page(mapping, newpage, page, sync); + return migrate_page(mapping, newpage, page, mode); head = page_buffers(page); - rc = migrate_page_move_mapping(mapping, newpage, page, head, sync); + rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); if (rc) return rc; @@ -521,8 +524,8 @@ int buffer_migrate_page(struct address_space *mapping, * with an IRQ-safe spinlock held. In the sync case, the buffers * need to be locked now */ - if (sync) - BUG_ON(!buffer_migrate_lock_buffers(head, sync)); + if (mode != MIGRATE_ASYNC) + BUG_ON(!buffer_migrate_lock_buffers(head, mode)); ClearPagePrivate(page); set_page_private(newpage, page_private(page)); @@ -599,10 +602,11 @@ static int writeout(struct address_space *mapping, struct page *page) * Default handling if a filesystem does not provide a migration function. */ static int fallback_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, bool sync) + struct page *newpage, struct page *page, enum migrate_mode mode) { if (PageDirty(page)) { - if (!sync) + /* Only writeback pages in full synchronous migration */ + if (mode != MIGRATE_SYNC) return -EBUSY; return writeout(mapping, page); } @@ -615,7 +619,7 @@ static int fallback_migrate_page(struct address_space *mapping, !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page, sync); + return migrate_page(mapping, newpage, page, mode); } /* @@ -630,7 +634,7 @@ static int fallback_migrate_page(struct address_space *mapping, * == 0 - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int remap_swapcache, bool sync) + int remap_swapcache, enum migrate_mode mode) { struct address_space *mapping; int rc; @@ -651,7 +655,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, mapping = page_mapping(page); if (!mapping) - rc = migrate_page(mapping, newpage, page, sync); + rc = migrate_page(mapping, newpage, page, mode); else if (mapping->a_ops->migratepage) /* * Most pages have a mapping and most filesystems provide a @@ -660,9 +664,9 @@ static int move_to_new_page(struct page *newpage, struct page *page, * is the most common path for page migration. */ rc = mapping->a_ops->migratepage(mapping, - newpage, page, sync); + newpage, page, mode); else - rc = fallback_migrate_page(mapping, newpage, page, sync); + rc = fallback_migrate_page(mapping, newpage, page, mode); if (rc) { newpage->mapping = NULL; @@ -677,7 +681,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, } static int __unmap_and_move(struct page *page, struct page *newpage, - int force, bool offlining, bool sync) + int force, bool offlining, enum migrate_mode mode) { int rc = -EAGAIN; int remap_swapcache = 1; @@ -686,7 +690,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, struct anon_vma *anon_vma = NULL; if (!trylock_page(page)) { - if (!force || !sync) + if (!force || mode == MIGRATE_ASYNC) goto out; /* @@ -732,10 +736,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (PageWriteback(page)) { /* - * For !sync, there is no point retrying as the retry loop - * is expected to be too short for PageWriteback to be cleared + * Only in the case of a full syncronous migration is it + * necessary to wait for PageWriteback. In the async case, + * the retry loop is too short and in the sync-light case, + * the overhead of stalling is too much */ - if (!sync) { + if (mode != MIGRATE_SYNC) { rc = -EBUSY; goto uncharge; } @@ -806,7 +812,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, remap_swapcache, sync); + rc = move_to_new_page(newpage, page, remap_swapcache, mode); if (rc && remap_swapcache) remove_migration_ptes(page, page); @@ -829,7 +835,8 @@ out: * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, bool offlining, bool sync) + struct page *page, int force, bool offlining, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -847,7 +854,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (unlikely(split_huge_page(page))) goto out; - rc = __unmap_and_move(page, newpage, force, offlining, sync); + rc = __unmap_and_move(page, newpage, force, offlining, mode); out: if (rc != -EAGAIN) { /* @@ -895,7 +902,8 @@ out: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, bool offlining, bool sync) + int force, bool offlining, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -908,7 +916,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = -EAGAIN; if (!trylock_page(hpage)) { - if (!force || !sync) + if (!force || mode != MIGRATE_SYNC) goto out; lock_page(hpage); } @@ -919,7 +927,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, 1, sync); + rc = move_to_new_page(new_hpage, hpage, 1, mode); if (rc) remove_migration_ptes(hpage, hpage); @@ -962,7 +970,7 @@ out: */ int migrate_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - bool sync) + enum migrate_mode mode) { int retry = 1; int nr_failed = 0; @@ -983,7 +991,7 @@ int migrate_pages(struct list_head *from, rc = unmap_and_move(get_new_page, private, page, pass > 2, offlining, - sync); + mode); switch(rc) { case -ENOMEM: @@ -1013,7 +1021,7 @@ out: int migrate_huge_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - bool sync) + enum migrate_mode mode) { int retry = 1; int nr_failed = 0; @@ -1030,7 +1038,7 @@ int migrate_huge_pages(struct list_head *from, rc = unmap_and_move_huge_page(get_new_page, private, page, pass > 2, offlining, - sync); + mode); switch(rc) { case -ENOMEM: @@ -1159,7 +1167,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0, true); + (unsigned long)pm, 0, MIGRATE_SYNC); if (err) putback_lru_pages(&pagelist); } From d50462a3a29fc5f53ef4a5d74eb693b4d4cb1512 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:45 -0800 Subject: [PATCH 029/135] mm: vmscan: when reclaiming for compaction, ensure there are sufficient free pages available commit fe4b1b244bdb96136855f2c694071cb09d140766 upstream. Stable note: Not tracked on Bugzilla. THP and compaction was found to aggressively reclaim pages and stall systems under different situations that was addressed piecemeal over time. This patch addresses a problem where the fix regressed THP allocation success rates. In commit e0887c19 ("vmscan: limit direct reclaim for higher order allocations"), Rik noted that reclaim was too aggressive when THP was enabled. In his initial patch he used the number of free pages to decide if reclaim should abort for compaction. My feedback was that reclaim and compaction should be using the same logic when deciding if reclaim should be aborted. Unfortunately, this had the effect of reducing THP success rates when the workload included something like streaming reads that continually allocated pages. The window during which compaction could run and return a THP was too small. This patch combines Rik's two patches together. compaction_suitable() is still used to decide if reclaim should be aborted to allow compaction is used. However, it will also ensure that there is a reasonable buffer of free pages available. This improves upon the THP allocation success rates but bounds the number of pages that are freed for compaction. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 870cbcfc328a..eadab0943141 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2075,6 +2075,42 @@ restart: throttle_vm_writeout(sc->gfp_mask); } +/* Returns true if compaction should go ahead for a high-order request */ +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +{ + unsigned long balance_gap, watermark; + bool watermark_ok; + + /* Do not consider compaction for orders reclaim is meant to satisfy */ + if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) + return false; + + /* + * Compaction takes time to run and there are potentially other + * callers using the pages just freed. Continue reclaiming until + * there is a buffer of free pages available to give compaction + * a reasonable chance of completing and allocating the page + */ + balance_gap = min(low_wmark_pages(zone), + (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + + /* + * If compaction is deferred, reclaim up to a point where + * compaction will have a chance of success when re-enabled + */ + if (compaction_deferred(zone)) + return watermark_ok; + + /* If compaction is not ready to start, keep reclaiming */ + if (!compaction_suitable(zone, sc->order)) + return false; + + return watermark_ok; +} + /* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation @@ -2092,8 +2128,8 @@ restart: * scan then give up on it. * * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is either ready to begin or deferred. - * This indicates to the caller that it should retry the allocation or fail. + * high-order allocation and compaction is ready to begin. This indicates to + * the caller that it should retry the allocation or fail. */ static bool shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) @@ -2127,9 +2163,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * noticable problem, like transparent huge page * allocations. */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER && - (compaction_suitable(zone, sc->order) || - compaction_deferred(zone))) { + if (compaction_ready(zone, sc)) { should_abort_reclaim = true; continue; } From da0dc52b5236e73d7d7b5a58e8e067236fd1323e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:33 -0800 Subject: [PATCH 030/135] mm: vmscan: do not OOM if aborting reclaim to start compaction commit 7335084d446b83cbcb15da80497d03f0c1dc9e21 upstream. Stable note: Not tracked in Bugzilla. This patch makes later patches easier to apply but otherwise has little to justify it. The problem it fixes was never observed but the source of the theoretical problem did not exist for very long. During direct reclaim it is possible that reclaim will be aborted so that compaction can be attempted to satisfy a high-order allocation. If this decision is made before any pages are reclaimed, it is possible that 0 is returned to the page allocator potentially triggering an OOM. This has not been observed but it is a possibility so this patch addresses it. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index eadab0943141..441f97efdfb5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2240,6 +2240,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; unsigned long writeback_threshold; + bool should_abort_reclaim; get_mems_allowed(); delayacct_freepages_start(); @@ -2251,7 +2252,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(sc->mem_cgroup); - if (shrink_zones(priority, zonelist, sc)) + should_abort_reclaim = shrink_zones(priority, zonelist, sc); + if (should_abort_reclaim) break; /* @@ -2318,6 +2320,10 @@ out: if (oom_killer_disabled) return 0; + /* Aborting reclaim to try compaction? don't OOM, then */ + if (should_abort_reclaim) + return 1; + /* top priority shrink_zones still had more to do? don't OOM, then */ if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) return 1; From 9cad5d6a3ce8ffc5fee70c6514ccb7ce003b8792 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:49 -0800 Subject: [PATCH 031/135] mm: vmscan: check if reclaim should really abort even if compaction_ready() is true for one zone commit 0cee34fd72c582b4f8ad8ce00645b75fb4168199 upstream. Stable note: Not tracked on Bugzilla. THP and compaction was found to aggressively reclaim pages and stall systems under different situations that was addressed piecemeal over time. If compaction can proceed for a given zone, shrink_zones() does not reclaim any more pages from it. After commit [e0c2327: vmscan: abort reclaim/compaction if compaction can proceed], do_try_to_free_pages() tries to finish as soon as possible once one zone can compact. This was intended to prevent slabs being shrunk unnecessarily but there are side-effects. One is that a small zone that is ready for compaction will abort reclaim even if the chances of successfully allocating a THP from that zone is small. It also means that reclaim can return too early even though sc->nr_to_reclaim pages were not reclaimed. This partially reverts the commit until it is proven that slabs are really being shrunk unnecessarily but preserves the check to return 1 to avoid OOM if reclaim was aborted prematurely. [aarcange@redhat.com: This patch replaces a revert from Andrea] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 441f97efdfb5..f8c96c74cb1a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2129,7 +2129,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * * This function returns true if a zone is being reclaimed for a costly * high-order allocation and compaction is ready to begin. This indicates to - * the caller that it should retry the allocation or fail. + * the caller that it should consider retrying the allocation instead of + * further reclaim. */ static bool shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) @@ -2138,7 +2139,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - bool should_abort_reclaim = false; + bool aborted_reclaim = false; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2164,7 +2165,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, * allocations. */ if (compaction_ready(zone, sc)) { - should_abort_reclaim = true; + aborted_reclaim = true; continue; } } @@ -2186,7 +2187,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, shrink_zone(priority, zone, sc); } - return should_abort_reclaim; + return aborted_reclaim; } static bool zone_reclaimable(struct zone *zone) @@ -2240,7 +2241,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; unsigned long writeback_threshold; - bool should_abort_reclaim; + bool aborted_reclaim; get_mems_allowed(); delayacct_freepages_start(); @@ -2252,9 +2253,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(sc->mem_cgroup); - should_abort_reclaim = shrink_zones(priority, zonelist, sc); - if (should_abort_reclaim) - break; + aborted_reclaim = shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from @@ -2320,8 +2319,8 @@ out: if (oom_killer_disabled) return 0; - /* Aborting reclaim to try compaction? don't OOM, then */ - if (should_abort_reclaim) + /* Aborted reclaim to try compaction? don't OOM, then */ + if (aborted_reclaim) return 1; /* top priority shrink_zones still had more to do? don't OOM, then */ From 03722816ec065d8c7a9306fc2d601251bc0c623e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:06:59 -0800 Subject: [PATCH 032/135] vmscan: promote shared file mapped pages commit 34dbc67a644f11ab3475d822d72e25409911e760 upstream. Stable note: Not tracked in Bugzilla. There were reports of shared mapped pages being unfairly reclaimed in comparison to older kernels. This is being addressed over time. The specific workload being addressed here in described in paragraph four and while paragraph five says it did not help performance as such, it made a difference to major page faults. I'm aware of at least one bug for a large vendor that was due to increased major faults. Commit 645747462435 ("vmscan: detect mapped file pages used only once") greatly decreases lifetime of single-used mapped file pages. Unfortunately it also decreases life time of all shared mapped file pages. Because after commit bf3f3bc5e7347 ("mm: don't mark_page_accessed in fault path") page-fault handler does not mark page active or even referenced. Thus page_check_references() activates file page only if it was used twice while it stays in inactive list, meanwhile it activates anon pages after first access. Inactive list can be small enough, this way reclaimer can accidentally throw away any widely used page if it wasn't used twice in short period. After this patch page_check_references() also activate file mapped page at first inactive list scan if this page is already used multiple times via several ptes. I found this while trying to fix degragation in rhel6 (~2.6.32) from rhel5 (~2.6.18). There a complete mess with >100 web/mail/spam/ftp containers, they share all their files but there a lot of anonymous pages: ~500mb shared file mapped memory and 15-20Gb non-shared anonymous memory. In this situation major-pagefaults are very costly, because all containers share the same page. In my load kernel created a disproportionate pressure on the file memory, compared with the anonymous, they equaled only if I raise swappiness up to 150 =) These patches actually wasn't helped a lot in my problem, but I saw noticable (10-20 times) reduce in count and average time of major-pagefault in file-mapped areas. Actually both patches are fixes for commit v2.6.33-5448-g6457474, because it was aimed at one scenario (singly used pages), but it breaks the logic in other scenarios (shared and/or executable pages) Signed-off-by: Konstantin Khlebnikov Acked-by: Pekka Enberg Acked-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Cc: Wu Fengguang Cc: Johannes Weiner Cc: Nick Piggin Cc: Mel Gorman Cc: Shaohua Li Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index f8c96c74cb1a..811141e166e0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -723,7 +723,7 @@ static enum page_references page_check_references(struct page *page, */ SetPageReferenced(page); - if (referenced_page) + if (referenced_page || referenced_ptes > 1) return PAGEREF_ACTIVATE; return PAGEREF_KEEP; From 4391b5f49e28bdb78ddf67495abb2f767474216d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:03 -0800 Subject: [PATCH 033/135] vmscan: activate executable pages after first usage commit c909e99364c8b6ca07864d752950b6b4ecf6bef4 upstream. Stable note: Not tracked in Bugzilla. There were reports of shared mapped pages being unfairly reclaimed in comparison to older kernels. This is being addressed over time. Logic added in commit 8cab4754d24a0 ("vmscan: make mapped executable pages the first class citizen") was noticeably weakened in commit 645747462435d84 ("vmscan: detect mapped file pages used only once"). Currently these pages can become "first class citizens" only after second usage. After this patch page_check_references() will activate they after first usage, and executable code gets yet better chance to stay in memory. Signed-off-by: Konstantin Khlebnikov Cc: Pekka Enberg Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Wu Fengguang Cc: Johannes Weiner Cc: Nick Piggin Cc: Mel Gorman Cc: Shaohua Li Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 811141e166e0..fed30222e4ab 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -726,6 +726,12 @@ static enum page_references page_check_references(struct page *page, if (referenced_page || referenced_ptes > 1) return PAGEREF_ACTIVATE; + /* + * Activate file-backed executable pages after first usage. + */ + if (vm_flags & VM_EXEC) + return PAGEREF_ACTIVATE; + return PAGEREF_KEEP; } From 503e973ce4a57bc949ddbf35d4b4ecd1a90263f8 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2012 15:08:18 -0800 Subject: [PATCH 034/135] mm/vmscan.c: consider swap space when deciding whether to continue reclaim commit 86cfd3a45042ab242d47f3935a02811a402beab6 upstream. Stable note: Not tracked in Bugzilla. This patch reduces kswapd CPU usage on swapless systems with high anonymous memory usage. It's pointless to continue reclaiming when we have no swap space and lots of anon pages in the inactive list. Without this patch, it is possible when swap is disabled to continue trying to reclaim when there are only anonymous pages in the system even though that will not make any progress. Signed-off-by: Minchan Kim Cc: KOSAKI Motohiro Acked-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Johannes Weiner Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index fed30222e4ab..7a5698185a92 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2008,8 +2008,9 @@ static inline bool should_continue_reclaim(struct zone *zone, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (nr_swap_pages > 0) + inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; From d2b02236b85226c9f2cd28e32a9fdf197584e448 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 10 Jan 2012 15:08:33 -0800 Subject: [PATCH 035/135] mm: test PageSwapBacked in lumpy reclaim commit 043bcbe5ec51e0478ef2b44acef17193e01d7f70 upstream. Stable note: Not tracked in Bugzilla. There were reports of shared mapped pages being unfairly reclaimed in comparison to older kernels. This is being addressed over time. Even though the subject refers to lumpy reclaim, it impacts compaction as well. Lumpy reclaim does well to stop at a PageAnon when there's no swap, but better is to stop at any PageSwapBacked, which includes shmem/tmpfs too. Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 7a5698185a92..39bb4161bb06 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1199,7 +1199,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * anon page which don't already have a swap slot is * pointless. */ - if (nr_swap_pages <= 0 && PageAnon(cursor_page) && + if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && !PageSwapCache(cursor_page)) break; From 4d01a2e38a9c27d3de083ecc561b32b6a55fc7eb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 12 Jan 2012 17:18:06 -0800 Subject: [PATCH 036/135] mm: vmscan: convert global reclaim to per-memcg LRU lists commit b95a2f2d486d0d768a92879c023a03757b9c7e58 upstream - WARNING: this is a substitute patch. Stable note: Not tracked in Bugzilla. This is a partial backport of an upstream commit addressing a completely different issue that accidentally contained an important fix. The workload this patch helps was memcached when IO is started in the background. memcached should stay resident but without this patch it gets swapped. Sometimes this manifests as a drop in throughput but mostly it was observed through /proc/vmstat. Commit [246e87a9: memcg: fix get_scan_count() for small targets] was meant to fix a problem whereby small scan targets on memcg were ignored causing priority to raise too sharply. It forced scanning to take place if the target was small, memcg or kswapd. From the time it was introduced it caused excessive reclaim by kswapd with workloads being pushed to swap that previously would have stayed resident. This was accidentally fixed in commit [b95a2f2d: mm: vmscan: convert global reclaim to per-memcg LRU lists] by making it harder for kswapd to force scan small targets but that patchset is not suitable for backporting. This was later changed again by commit [90126375: mm/vmscan: push lruvec pointer into get_scan_count()] into a format that looks like it would be a straight-forward backport but there is a subtle difference due to the use of lruvecs. The impact of the accidental fix is to make it harder for kswapd to force scan small targets by taking zone->all_unreclaimable into account. This patch is the closest equivalent available based on what is backported. Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 39bb4161bb06..6697b7ab99c2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1850,7 +1850,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, unsigned long nr_force_scan[2]; /* kswapd does zone balancing and needs to scan this zone */ - if (scanning_global_lru(sc) && current_is_kswapd()) + if (scanning_global_lru(sc) && current_is_kswapd() && + zone->all_unreclaimable) force_scan = true; /* memcg may have small limit and need to avoid priority drop */ if (!scanning_global_lru(sc)) From 6b63ea81d831b2b6f2ce6d60cfcfa05b1858ad97 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 Nov 2011 13:38:39 -0700 Subject: [PATCH 037/135] cpusets: avoid looping when storing to mems_allowed if one node remains set commit 89e8a244b97e48f1f30e898b6f32acca477f2a13 upstream. Stable note: Not tracked in Bugzilla. [get|put]_mems_allowed() is extremely expensive and severely impacted page allocator performance. This is part of a series of patches that reduce page allocator overhead. {get,put}_mems_allowed() exist so that general kernel code may locklessly access a task's set of allowable nodes without having the chance that a concurrent write will cause the nodemask to be empty on configurations where MAX_NUMNODES > BITS_PER_LONG. This could incur a significant delay, however, especially in low memory conditions because the page allocator is blocking and reclaim requires get_mems_allowed() itself. It is not atypical to see writes to cpuset.mems take over 2 seconds to complete, for example. In low memory conditions, this is problematic because it's one of the most imporant times to change cpuset.mems in the first place! The only way a task's set of allowable nodes may change is through cpusets by writing to cpuset.mems and when attaching a task to a generic code is not reading the nodemask with get_mems_allowed() at the same time, and then clearing all the old nodes. This prevents the possibility that a reader will see an empty nodemask at the same time the writer is storing a new nodemask. If at least one node remains unchanged, though, it's possible to simply set all new nodes and then clear all the old nodes. Changing a task's nodemask is protected by cgroup_mutex so it's guaranteed that two threads are not changing the same task's nodemask at the same time, so the nodemask is guaranteed to be stored before another thread changes it and determines whether a node remains set or not. Signed-off-by: David Rientjes Cc: Miao Xie Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9c9b7545c810..a9958936d89c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -949,6 +949,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { + bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed); + repeat: /* * Allow tasks that have access to memory reserves because they have @@ -963,7 +965,6 @@ repeat: nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* * ensure checking ->mems_allowed_change_disable after setting all new * allowed nodes. @@ -980,9 +981,11 @@ repeat: /* * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. + * for the read-side. No wait is necessary, however, if at least one + * node remains unchanged. */ - while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + while (masks_disjoint && + ACCESS_ONCE(tsk->mems_allowed_change_disable)) { task_unlock(tsk); if (!task_curr(tsk)) yield(); From ba204b545c29176659273c5316afb147eec332a3 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 19 Dec 2011 17:11:52 -0800 Subject: [PATCH 038/135] cpusets: stall when updating mems_allowed for mempolicy or disjoint nodemask commit b246272ecc5ac68c743b15c9e41a2275f7ce70e2 upstream. Stable note: Not tracked in Bugzilla. [get|put]_mems_allowed() is extremely expensive and severely impacted page allocator performance. This is part of a series of patches that reduce page allocator overhead. Kernels where MAX_NUMNODES > BITS_PER_LONG may temporarily see an empty nodemask in a tsk's mempolicy if its previous nodemask is remapped onto a new set of allowed cpuset nodes where the two nodemasks, as a result of the remap, are now disjoint. c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") adds get_mems_allowed() to prevent the set of allowed nodes from changing for a thread. This causes any update to a set of allowed nodes to stall until put_mems_allowed() is called. This stall is unncessary, however, if at least one node remains unchanged in the update to the set of allowed nodes. This was addressed by 89e8a244b97e ("cpusets: avoid looping when storing to mems_allowed if one node remains set"), but it's still possible that an empty nodemask may be read from a mempolicy because the old nodemask may be remapped to the new nodemask during rebind. To prevent this, only avoid the stall if there is no mempolicy for the thread being changed. This is a temporary solution until all reads from mempolicy nodemasks can be guaranteed to not be empty without the get_mems_allowed() synchronization. Also moves the check for nodemask intersection inside task_lock() so that tsk->mems_allowed cannot change. This ensures that nothing can set this tsk's mems_allowed out from under us and also protects tsk->mempolicy. Reported-by: Miao Xie Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Cc: Paul Menage Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a9958936d89c..28d0bbd81ad1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } +#ifdef CONFIG_NUMA +static inline bool task_has_mempolicy(struct task_struct *task) +{ + return task->mempolicy; +} +#else +static inline bool task_has_mempolicy(struct task_struct *task) +{ + return false; +} +#endif + + /* bits in struct cpuset flags field */ typedef enum { CS_CPU_EXCLUSIVE, @@ -949,7 +962,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { - bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed); + bool need_loop; repeat: /* @@ -962,6 +975,14 @@ repeat: return; task_lock(tsk); + /* + * Determine if a loop is necessary if another thread is doing + * get_mems_allowed(). If at least one node remains unchanged and + * tsk does not have a mempolicy, then an empty nodemask will not be + * possible when mems_allowed is larger than a word. + */ + need_loop = task_has_mempolicy(tsk) || + !nodes_intersects(*newmems, tsk->mems_allowed); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); @@ -981,11 +1002,9 @@ repeat: /* * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. No wait is necessary, however, if at least one - * node remains unchanged. + * for the read-side. */ - while (masks_disjoint && - ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { task_unlock(tsk); if (!task_curr(tsk)) yield(); From 627c5c60b4ac673e9f4be758858073071684dce9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Mar 2012 16:34:11 -0700 Subject: [PATCH 039/135] cpuset: mm: reduce large amounts of memory barrier related damage v3 commit cc9a6c8776615f9c194ccf0b63a0aa5628235545 upstream. Stable note: Not tracked in Bugzilla. [get|put]_mems_allowed() is extremely expensive and severely impacted page allocator performance. This is part of a series of patches that reduce page allocator overhead. Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") wins a super prize for the largest number of memory barriers entered into fast paths for one commit. [get|put]_mems_allowed is incredibly heavy with pairs of full memory barriers inserted into a number of hot paths. This was detected while investigating at large page allocator slowdown introduced some time after 2.6.32. The largest portion of this overhead was shown by oprofile to be at an mfence introduced by this commit into the page allocator hot path. For extra style points, the commit introduced the use of yield() in an implementation of what looks like a spinning mutex. This patch replaces the full memory barriers on both read and write sides with a sequence counter with just read barriers on the fast path side. This is much cheaper on some architectures, including x86. The main bulk of the patch is the retry logic if the nodemask changes in a manner that can cause a false failure. While updating the nodemask, a check is made to see if a false failure is a risk. If it is, the sequence number gets bumped and parallel allocators will briefly stall while the nodemask update takes place. In a page fault test microbenchmark, oprofile samples from __alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The actual results were 3.3.0-rc3 3.3.0-rc3 rc3-vanilla nobarrier-v2r1 Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%) Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%) Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%) Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%) Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%) Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%) Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%) Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%) Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%) Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%) Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%) Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%) Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%) Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%) Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%) MMTests Statistics: duration Sys Time Running Test (seconds) 135.68 132.17 User+Sys Time Running Test (seconds) 164.2 160.13 Total Elapsed Time (seconds) 123.46 120.87 The overall improvement is small but the System CPU time is much improved and roughly in correlation to what oprofile reported (these performance figures are without profiling so skew is expected). The actual number of page faults is noticeably improved. For benchmarks like kernel builds, the overall benefit is marginal but the system CPU time is slightly reduced. To test the actual bug the commit fixed I opened two terminals. The first ran within a cpuset and continually ran a small program that faulted 100M of anonymous data. In a second window, the nodemask of the cpuset was continually randomised in a loop. Without the commit, the program would fail every so often (usually within 10 seconds) and obviously with the commit everything worked fine. With this patch applied, it also worked fine so the fix should be functionally equivalent. Signed-off-by: Mel Gorman Cc: Miao Xie Cc: David Rientjes Cc: Peter Zijlstra Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- include/linux/cpuset.h | 47 +++++++++++++++++---------------------- include/linux/init_task.h | 8 +++++++ include/linux/sched.h | 2 +- kernel/cpuset.c | 43 +++++++---------------------------- kernel/fork.c | 3 +++ mm/filemap.c | 11 +++++---- mm/hugetlb.c | 15 +++++++++---- mm/mempolicy.c | 28 +++++++++++++++++------ mm/page_alloc.c | 33 ++++++++++++++++++--------- mm/slab.c | 13 ++++++----- mm/slub.c | 36 +++++++++++++++++++----------- mm/vmscan.c | 2 -- 12 files changed, 133 insertions(+), 108 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index e9eaec522655..7a7e5fd2a277 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); /* - * reading current mems_allowed and mempolicy in the fastpath must protected - * by get_mems_allowed() + * get_mems_allowed is required when making decisions involving mems_allowed + * such as during page allocation. mems_allowed can be updated in parallel + * and depending on the new value an operation can fail potentially causing + * process failure. A retry loop with get_mems_allowed and put_mems_allowed + * prevents these artificial failures. */ -static inline void get_mems_allowed(void) +static inline unsigned int get_mems_allowed(void) { - current->mems_allowed_change_disable++; - - /* - * ensure that reading mems_allowed and mempolicy happens after the - * update of ->mems_allowed_change_disable. - * - * the write-side task finds ->mems_allowed_change_disable is not 0, - * and knows the read-side task is reading mems_allowed or mempolicy, - * so it will clear old bits lazily. - */ - smp_mb(); + return read_seqcount_begin(¤t->mems_allowed_seq); } -static inline void put_mems_allowed(void) +/* + * If this returns false, the operation that took place after get_mems_allowed + * may have failed. It is up to the caller to retry the operation if + * appropriate. + */ +static inline bool put_mems_allowed(unsigned int seq) { - /* - * ensure that reading mems_allowed and mempolicy before reducing - * mems_allowed_change_disable. - * - * the write-side task will know that the read-side task is still - * reading mems_allowed or mempolicy, don't clears old bits in the - * nodemask. - */ - smp_mb(); - --ACCESS_ONCE(current->mems_allowed_change_disable); + return !read_seqcount_retry(¤t->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) { task_lock(current); + write_seqcount_begin(¤t->mems_allowed_seq); current->mems_allowed = nodemask; + write_seqcount_end(¤t->mems_allowed_seq); task_unlock(current); } @@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } -static inline void get_mems_allowed(void) +static inline unsigned int get_mems_allowed(void) { + return 0; } -static inline void put_mems_allowed(void) +static inline bool put_mems_allowed(unsigned int seq) { + return true; } #endif /* !CONFIG_CPUSETS */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 580f70c02391..5e41a8e1cc77 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -30,6 +30,13 @@ extern struct fs_struct init_fs; #define INIT_THREADGROUP_FORK_LOCK(sig) #endif +#ifdef CONFIG_CPUSETS +#define INIT_CPUSET_SEQ \ + .mems_allowed_seq = SEQCNT_ZERO, +#else +#define INIT_CPUSET_SEQ +#endif + #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ @@ -193,6 +200,7 @@ extern struct cred init_cred; INIT_FTRACE_GRAPH \ INIT_TRACE_RECURSION \ INIT_TASK_RCU_PREEMPT(tsk) \ + INIT_CPUSET_SEQ \ } diff --git a/include/linux/sched.h b/include/linux/sched.h index 4ef452b93f65..443ec43bebee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1484,7 +1484,7 @@ struct task_struct { #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; /* Protected by alloc_lock */ - int mems_allowed_change_disable; + seqcount_t mems_allowed_seq; /* Seqence no to catch updates */ int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 28d0bbd81ad1..b2e84bd3ceb9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; -repeat: /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. @@ -983,45 +982,19 @@ repeat: */ need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); + + if (need_loop) + write_seqcount_begin(&tsk->mems_allowed_seq); + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* - * ensure checking ->mems_allowed_change_disable after setting all new - * allowed nodes. - * - * the read-side task can see an nodemask with new allowed nodes and - * old allowed nodes. and if it allocates page when cpuset clears newly - * disallowed ones continuous, it can see the new allowed bits. - * - * And if setting all new allowed nodes is after the checking, setting - * all new allowed nodes and clearing newly disallowed ones will be done - * continuous, and the read-side task may find no node to alloc page. - */ - smp_mb(); - - /* - * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. - */ - while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { - task_unlock(tsk); - if (!task_curr(tsk)) - yield(); - goto repeat; - } - - /* - * ensure checking ->mems_allowed_change_disable before clearing all new - * disallowed nodes. - * - * if clearing newly disallowed bits before the checking, the read-side - * task may find no node to alloc page. - */ - smp_mb(); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + + if (need_loop) + write_seqcount_end(&tsk->mems_allowed_seq); + task_unlock(tsk); } diff --git a/kernel/fork.c b/kernel/fork.c index 4712e3e5578e..3d42aa3dad38 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -985,6 +985,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) #ifdef CONFIG_CGROUPS init_rwsem(&sig->threadgroup_fork_lock); #endif +#ifdef CONFIG_CPUSETS + seqcount_init(&tsk->mems_allowed_seq); +#endif sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; diff --git a/mm/filemap.c b/mm/filemap.c index b7d860390f34..10481ebd96c9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -516,10 +516,13 @@ struct page *__page_cache_alloc(gfp_t gfp) struct page *page; if (cpuset_do_page_mem_spread()) { - get_mems_allowed(); - n = cpuset_mem_spread_node(); - page = alloc_pages_exact_node(n, gfp, 0); - put_mems_allowed(); + unsigned int cpuset_mems_cookie; + do { + cpuset_mems_cookie = get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + } while (!put_mems_allowed(cpuset_mems_cookie) && !page); + return page; } return alloc_pages(gfp, 0); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 05f8fd425f69..64f2b7aa0dd1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -454,14 +454,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - struct page *page = NULL; + struct page *page; struct mempolicy *mpol; nodemask_t *nodemask; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + unsigned int cpuset_mems_cookie; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); zonelist = huge_zonelist(vma, address, htlb_alloc_mask, &mpol, &nodemask); /* @@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, } } } + + mpol_cond_put(mpol); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; + err: mpol_cond_put(mpol); - put_mems_allowed(); - return page; + return NULL; } static void update_and_free_page(struct hstate *h, struct page *page) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dd5f8747e6ff..cff919fe7025 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1810,18 +1810,24 @@ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node) { - struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct mempolicy *pol; struct zonelist *zl; struct page *page; + unsigned int cpuset_mems_cookie; + +retry_cpuset: + pol = get_vma_policy(current, vma, addr); + cpuset_mems_cookie = get_mems_allowed(); - get_mems_allowed(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } zl = policy_zonelist(gfp, pol, node); @@ -1832,7 +1838,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; } /* @@ -1840,7 +1847,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, */ page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; } @@ -1867,11 +1875,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; struct page *page; + unsigned int cpuset_mems_cookie; if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + /* * No reference counting needed for current->mempolicy * nor system default_policy @@ -1882,7 +1893,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - put_mems_allowed(); + + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } EXPORT_SYMBOL(alloc_pages_current); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0d490ba1af93..9177aa35ae7e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2293,8 +2293,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; - struct page *page; + struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); + unsigned int cpuset_mems_cookie; gfp_mask &= gfp_allowed_mask; @@ -2313,15 +2314,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask ? : &cpuset_current_mems_allowed, &preferred_zone); - if (!preferred_zone) { - put_mems_allowed(); - return NULL; - } + if (!preferred_zone) + goto out; /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -2331,9 +2332,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); - put_mems_allowed(); trace_mm_page_alloc(page, order, gfp_mask, migratetype); + +out: + /* + * When updating a task's mems_allowed, it is possible to race with + * parallel threads in such a way that an allocation can fail while + * the mask is being updated. If a page allocation is about to fail, + * check if the cpuset changed during allocation and if so, retry. + */ + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2557,13 +2568,15 @@ void si_meminfo_node(struct sysinfo *val, int nid) bool skip_free_areas_node(unsigned int flags, int nid) { bool ret = false; + unsigned int cpuset_mems_cookie; if (!(flags & SHOW_MEM_FILTER_NODES)) goto out; - get_mems_allowed(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - put_mems_allowed(); + do { + cpuset_mems_cookie = get_mems_allowed(); + ret = !node_isset(nid, cpuset_current_mems_allowed); + } while (!put_mems_allowed(cpuset_mems_cookie)); out: return ret; } diff --git a/mm/slab.c b/mm/slab.c index d96e223de775..a67f8121ce5a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3218,12 +3218,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; nid_alloc = nid_here = numa_mem_id(); - get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); - put_mems_allowed(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3246,14 +3244,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; int nid; + unsigned int cpuset_mems_cookie; if (flags & __GFP_THISNODE) return NULL; - get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + retry: /* * Look through allowed nodes for objects available @@ -3306,7 +3307,9 @@ retry: } } } - put_mems_allowed(); + + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) + goto retry_cpuset; return obj; } diff --git a/mm/slub.c b/mm/slub.c index 10ab2335e2ea..ae6e80ed1e5c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1457,6 +1457,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); struct page *page; + unsigned int cpuset_mems_cookie; /* * The defrag ratio allows a configuration of the tradeoffs between @@ -1480,23 +1481,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; - get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - struct kmem_cache_node *n; + do { + cpuset_mems_cookie = get_mems_allowed(); + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + struct kmem_cache_node *n; - n = get_node(s, zone_to_nid(zone)); + n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > s->min_partial) { - page = get_partial_node(n); - if (page) { - put_mems_allowed(); - return page; + if (n && cpuset_zone_allowed_hardwall(zone, flags) && + n->nr_partial > s->min_partial) { + page = get_partial_node(n); + if (page) { + /* + * Return the object even if + * put_mems_allowed indicated that + * the cpuset mems_allowed was + * updated in parallel. It's a + * harmless race between the alloc + * and the cpuset update. + */ + put_mems_allowed(cpuset_mems_cookie); + return page; + } } } - } - put_mems_allowed(); + } while (!put_mems_allowed(cpuset_mems_cookie)); #endif return NULL; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 6697b7ab99c2..1378487041cf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2251,7 +2251,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, unsigned long writeback_threshold; bool aborted_reclaim; - get_mems_allowed(); delayacct_freepages_start(); if (scanning_global_lru(sc)) @@ -2314,7 +2313,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, out: delayacct_freepages_end(); - put_mems_allowed(); if (sc->nr_reclaimed) return sc->nr_reclaimed; From ad04b9e911d7bc1edaa599a85f44b5fc83f9e90e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 25 Apr 2012 16:01:46 -0700 Subject: [PATCH 040/135] mm/hugetlb: fix warning in alloc_huge_page/dequeue_huge_page_vma commit b1c12cbcd0a02527c180a862e8971e249d3b347d upstream. Stable note: Not tracked in Bugzilla. [get|put]_mems_allowed() is extremely expensive and severely impacted page allocator performance. This is part of a series of patches that reduce page allocator overhead. Fix a gcc warning (and bug?) introduced in cc9a6c877 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") Local variable "page" can be uninitialized if the nodemask from vma policy does not intersects with nodemask from cpuset. Even if it doesn't happens it is better to initialize this variable explicitly than to introduce a kernel oops in a weird corner case. mm/hugetlb.c: In function `alloc_huge_page': mm/hugetlb.c:1135:5: warning: `page' may be used uninitialized in this function Signed-off-by: Konstantin Khlebnikov Acked-by: Mel Gorman Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 64f2b7aa0dd1..ae60a53f0506 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -454,7 +454,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; struct zonelist *zonelist; From 909e0a4e5c3a9d3b60c1eecb34de75afa64ade95 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 8 Dec 2011 14:33:51 -0800 Subject: [PATCH 041/135] vmscan: fix initial shrinker size handling commit 635697c663f38106063d5659f0cf2e45afcd4bb5 upstream. Stable note: The commit [acf92b48: vmscan: shrinker->nr updates race and go wrong] aimed to reduce excessive reclaim of slab objects but had bug in how it treated shrinker functions that returned -1. A shrinker function can return -1, means that it cannot do anything without a risk of deadlock. For example prune_super() does this if it cannot grab a superblock refrence, even if nr_to_scan=0. Currently we interpret this -1 as a ULONG_MAX size shrinker and evaluate `total_scan' according to this. So the next time around this shrinker can cause really big pressure. Let's skip such shrinkers instead. Also make total_scan signed, otherwise the check (total_scan < 0) below never works. Signed-off-by: Konstantin Khlebnikov Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1378487041cf..5326f98f506e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -248,12 +248,16 @@ unsigned long shrink_slab(struct shrink_control *shrink, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; - unsigned long total_scan; - unsigned long max_pass; + long total_scan; + long max_pass; int shrink_ret = 0; long nr; long new_nr; + max_pass = do_shrinker_shrink(shrinker, shrink, 0); + if (max_pass <= 0) + continue; + /* * copy the current shrinker scan count into a local variable * and zero it so that other concurrent shrinker invocations @@ -264,7 +268,6 @@ unsigned long shrink_slab(struct shrink_control *shrink, } while (cmpxchg(&shrinker->nr, nr, 0) != nr); total_scan = nr; - max_pass = do_shrinker_shrink(shrinker, shrink, 0); delta = (4 * nr_pages_scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); From f351a1d7efda2edd52c23a150b07b8380c47b6c0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 1 Aug 2012 12:28:17 -0700 Subject: [PATCH 042/135] Linux 3.0.39 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5fdfaa85e6e4..3ec1722025f3 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 0 -SUBLEVEL = 38 +SUBLEVEL = 39 EXTRAVERSION = NAME = Sneaky Weasel From 4067ad7b53cd5ba2699286620a2c645bb56542fa Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Tue, 3 Jul 2012 23:13:39 +0100 Subject: [PATCH 043/135] mmc: sdhci-pci: CaFe has broken card detection commit 55fc05b7414274f17795cd0e8a3b1546f3649d5e upstream. At http://dev.laptop.org/ticket/11980 we have determined that the Marvell CaFe SDHCI controller reports bad card presence during resume. It reports that no card is present even when it is. This is a regression -- resume worked back around 2.6.37. Around 400ms after resuming, a "card inserted" interrupt is generated, at which point it starts reporting presence. Work around this hardware oddity by setting the SDHCI_QUIRK_BROKEN_CARD_DETECTION flag. Thanks to Chris Ball for helping with diagnosis. Signed-off-by: Daniel Drake Signed-off-by: Chris Ball Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci-pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c index 936bbca19c0a..d3b3115def49 100644 --- a/drivers/mmc/host/sdhci-pci.c +++ b/drivers/mmc/host/sdhci-pci.c @@ -140,6 +140,7 @@ static const struct sdhci_pci_fixes sdhci_ene_714 = { static const struct sdhci_pci_fixes sdhci_cafe = { .quirks = SDHCI_QUIRK_NO_SIMULT_VDD_AND_POWER | SDHCI_QUIRK_NO_BUSY_IRQ | + SDHCI_QUIRK_BROKEN_CARD_DETECTION | SDHCI_QUIRK_BROKEN_TIMEOUT_VAL, }; From a8ed5765b5a8bf44a86284d80afd24f37a23e369 Mon Sep 17 00:00:00 2001 From: roger blofeld Date: Thu, 21 Jun 2012 05:27:14 +0000 Subject: [PATCH 044/135] powerpc/ftrace: Fix assembly trampoline register usage commit fd5a42980e1cf327b7240adf5e7b51ea41c23437 upstream. Just like the module loader, ftrace needs to be updated to use r12 instead of r11 with newer gcc's. Signed-off-by: Roger Blofeld Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Gortmaker Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/ftrace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index bf99cfa6bbfe..63240081133e 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -245,9 +245,9 @@ __ftrace_make_nop(struct module *mod, /* * On PPC32 the trampoline looks like: - * 0x3d, 0x60, 0x00, 0x00 lis r11,sym@ha - * 0x39, 0x6b, 0x00, 0x00 addi r11,r11,sym@l - * 0x7d, 0x69, 0x03, 0xa6 mtctr r11 + * 0x3d, 0x80, 0x00, 0x00 lis r12,sym@ha + * 0x39, 0x8c, 0x00, 0x00 addi r12,r12,sym@l + * 0x7d, 0x89, 0x03, 0xa6 mtctr r12 * 0x4e, 0x80, 0x04, 0x20 bctr */ @@ -262,9 +262,9 @@ __ftrace_make_nop(struct module *mod, pr_devel(" %08x %08x ", jmp[0], jmp[1]); /* verify that this is what we expect it to be */ - if (((jmp[0] & 0xffff0000) != 0x3d600000) || - ((jmp[1] & 0xffff0000) != 0x396b0000) || - (jmp[2] != 0x7d6903a6) || + if (((jmp[0] & 0xffff0000) != 0x3d800000) || + ((jmp[1] & 0xffff0000) != 0x398c0000) || + (jmp[2] != 0x7d8903a6) || (jmp[3] != 0x4e800420)) { printk(KERN_ERR "Not a trampoline\n"); return -EINVAL; From 93487ce8d6edc7c550b1449770df5e44715f520f Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Wed, 11 Jul 2012 14:22:46 +1000 Subject: [PATCH 045/135] powerpc: Add "memory" attribute for mfmsr() commit b416c9a10baae6a177b4f9ee858b8d309542fbef upstream. Add "memory" attribute in inline assembly language as a compiler barrier to make sure 4.6.x GCC don't reorder mfmsr(). Signed-off-by: Tiejun Chen Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/reg.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index c5cae0dd176c..764e99c750d9 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1000,7 +1000,8 @@ /* Macros for setting and retrieving special purpose registers */ #ifndef __ASSEMBLY__ #define mfmsr() ({unsigned long rval; \ - asm volatile("mfmsr %0" : "=r" (rval)); rval;}) + asm volatile("mfmsr %0" : "=r" (rval) : \ + : "memory"); rval;}) #ifdef CONFIG_PPC_BOOK3S_64 #define __mtmsrd(v, l) asm volatile("mtmsrd %0," __stringify(l) \ : : "r" (v) : "memory") From c43386c06d5d73a9b3a8604226b1e32d85a4c384 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Fri, 9 Dec 2011 11:35:08 +0000 Subject: [PATCH 046/135] powerpc: Fix wrong divisor in usecs_to_cputime commit 9f5072d4f63f28d30d343573830ac6c85fc0deff upstream. Commit d57af9b (taskstats: use real microsecond granularity for CPU times) renamed msecs_to_cputime to usecs_to_cputime, but failed to update all numbers on the way. This causes nonsensical cpu idle/iowait values to be displayed in /proc/stat (the only user of usecs_to_cputime so far). This also renames __cputime_msec_factor to __cputime_usec_factor, adapting its value and using it directly in cputime_to_usecs instead of doing two multiplications. Signed-off-by: Andreas Schwab Acked-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt Cc: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/cputime.h | 6 +++--- arch/powerpc/kernel/time.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 1cf20bdfbeca..33a35801f7c9 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -126,11 +126,11 @@ static inline u64 cputime64_to_jiffies64(const cputime_t ct) /* * Convert cputime <-> microseconds */ -extern u64 __cputime_msec_factor; +extern u64 __cputime_usec_factor; static inline unsigned long cputime_to_usecs(const cputime_t ct) { - return mulhdu(ct, __cputime_msec_factor) * USEC_PER_MSEC; + return mulhdu(ct, __cputime_usec_factor); } static inline cputime_t usecs_to_cputime(const unsigned long us) @@ -143,7 +143,7 @@ static inline cputime_t usecs_to_cputime(const unsigned long us) sec = us / 1000000; if (ct) { ct *= tb_ticks_per_sec; - do_div(ct, 1000); + do_div(ct, 1000000); } if (sec) ct += (cputime_t) sec * tb_ticks_per_sec; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 2de304af07ab..1becd7b920d2 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -168,13 +168,13 @@ EXPORT_SYMBOL_GPL(ppc_tb_freq); #ifdef CONFIG_VIRT_CPU_ACCOUNTING /* * Factors for converting from cputime_t (timebase ticks) to - * jiffies, milliseconds, seconds, and clock_t (1/USER_HZ seconds). + * jiffies, microseconds, seconds, and clock_t (1/USER_HZ seconds). * These are all stored as 0.64 fixed-point binary fractions. */ u64 __cputime_jiffies_factor; EXPORT_SYMBOL(__cputime_jiffies_factor); -u64 __cputime_msec_factor; -EXPORT_SYMBOL(__cputime_msec_factor); +u64 __cputime_usec_factor; +EXPORT_SYMBOL(__cputime_usec_factor); u64 __cputime_sec_factor; EXPORT_SYMBOL(__cputime_sec_factor); u64 __cputime_clockt_factor; @@ -192,8 +192,8 @@ static void calc_cputime_factors(void) div128_by_32(HZ, 0, tb_ticks_per_sec, &res); __cputime_jiffies_factor = res.result_low; - div128_by_32(1000, 0, tb_ticks_per_sec, &res); - __cputime_msec_factor = res.result_low; + div128_by_32(1000000, 0, tb_ticks_per_sec, &res); + __cputime_usec_factor = res.result_low; div128_by_32(1, 0, tb_ticks_per_sec, &res); __cputime_sec_factor = res.result_low; div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res); From 2da74cd8a6bad64d02207396c76d0939f3c57aaa Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 21 Jun 2012 23:36:15 -0700 Subject: [PATCH 047/135] SCSI: libsas: continue revalidation commit 26f2f199ff150d8876b2641c41e60d1c92d2fb81 upstream. Continue running revalidation until no more broadcast devices are discovered. Fixes cases where re-discovery completes too early in a domain with multiple expanders with pending re-discovery events. Servicing BCNs can get backed up behind error recovery. Signed-off-by: Dan Williams Signed-off-by: James Bottomley Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/libsas/sas_expander.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index e68fac69504b..e8ebe6883267 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -1972,9 +1972,7 @@ int sas_ex_revalidate_domain(struct domain_device *port_dev) struct domain_device *dev = NULL; res = sas_find_bcast_dev(port_dev, &dev); - if (res) - goto out; - if (dev) { + while (res == 0 && dev) { struct expander_device *ex = &dev->ex_dev; int i = 0, phy_id; @@ -1986,8 +1984,10 @@ int sas_ex_revalidate_domain(struct domain_device *port_dev) res = sas_rediscover(dev, phy_id); i = phy_id + 1; } while (i < ex->num_phys); + + dev = NULL; + res = sas_find_bcast_dev(port_dev, &dev); } -out: return res; } From 3f67ec4b517d985ac6961a1a03f91339e62657c0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 21 Jun 2012 23:36:20 -0700 Subject: [PATCH 048/135] SCSI: libsas: fix sas_discover_devices return code handling commit b17caa174a7e1fd2e17b26e210d4ee91c4c28b37 upstream. commit 198439e4 [SCSI] libsas: do not set res = 0 in sas_ex_discover_dev() commit 19252de6 [SCSI] libsas: fix wide port hotplug issues The above commits seem to have confused the return value of sas_ex_discover_dev which is non-zero on failure and sas_ex_join_wide_port which just indicates short circuiting discovery on already established ports. The result is random discovery failures depending on configuration. Calls to sas_ex_join_wide_port are the source of the trouble as its return value is errantly assigned to 'res'. Convert it to bool and stop returning its result up the stack. Tested-by: Dan Melnic Reported-by: Dan Melnic Signed-off-by: Dan Williams Reviewed-by: Jack Wang Signed-off-by: James Bottomley Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/libsas/sas_expander.c | 39 +++++++++--------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index e8ebe6883267..d2f95761ba32 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -770,7 +770,7 @@ static struct domain_device *sas_ex_discover_end_dev( } /* See if this phy is part of a wide port */ -static int sas_ex_join_wide_port(struct domain_device *parent, int phy_id) +static bool sas_ex_join_wide_port(struct domain_device *parent, int phy_id) { struct ex_phy *phy = &parent->ex_dev.ex_phy[phy_id]; int i; @@ -786,11 +786,11 @@ static int sas_ex_join_wide_port(struct domain_device *parent, int phy_id) sas_port_add_phy(ephy->port, phy->phy); phy->port = ephy->port; phy->phy_state = PHY_DEVICE_DISCOVERED; - return 0; + return true; } } - return -ENODEV; + return false; } static struct domain_device *sas_ex_discover_expander( @@ -928,8 +928,7 @@ static int sas_ex_discover_dev(struct domain_device *dev, int phy_id) return res; } - res = sas_ex_join_wide_port(dev, phy_id); - if (!res) { + if (sas_ex_join_wide_port(dev, phy_id)) { SAS_DPRINTK("Attaching ex phy%d to wide port %016llx\n", phy_id, SAS_ADDR(ex_phy->attached_sas_addr)); return res; @@ -974,8 +973,7 @@ static int sas_ex_discover_dev(struct domain_device *dev, int phy_id) if (SAS_ADDR(ex->ex_phy[i].attached_sas_addr) == SAS_ADDR(child->sas_addr)) { ex->ex_phy[i].phy_state= PHY_DEVICE_DISCOVERED; - res = sas_ex_join_wide_port(dev, i); - if (!res) + if (sas_ex_join_wide_port(dev, i)) SAS_DPRINTK("Attaching ex phy%d to wide port %016llx\n", i, SAS_ADDR(ex->ex_phy[i].attached_sas_addr)); @@ -1838,32 +1836,20 @@ static int sas_discover_new(struct domain_device *dev, int phy_id) { struct ex_phy *ex_phy = &dev->ex_dev.ex_phy[phy_id]; struct domain_device *child; - bool found = false; - int res, i; + int res; SAS_DPRINTK("ex %016llx phy%d new device attached\n", SAS_ADDR(dev->sas_addr), phy_id); res = sas_ex_phy_discover(dev, phy_id); if (res) - goto out; - /* to support the wide port inserted */ - for (i = 0; i < dev->ex_dev.num_phys; i++) { - struct ex_phy *ex_phy_temp = &dev->ex_dev.ex_phy[i]; - if (i == phy_id) - continue; - if (SAS_ADDR(ex_phy_temp->attached_sas_addr) == - SAS_ADDR(ex_phy->attached_sas_addr)) { - found = true; - break; - } - } - if (found) { - sas_ex_join_wide_port(dev, phy_id); + return res; + + if (sas_ex_join_wide_port(dev, phy_id)) return 0; - } + res = sas_ex_discover_devices(dev, phy_id); - if (!res) - goto out; + if (res) + return res; list_for_each_entry(child, &dev->ex_dev.children, siblings) { if (SAS_ADDR(child->sas_addr) == SAS_ADDR(ex_phy->attached_sas_addr)) { @@ -1873,7 +1859,6 @@ static int sas_discover_new(struct domain_device *dev, int phy_id) break; } } -out: return res; } From bd9afacc545006fac0136c42783bdad0688e9165 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 21 Jun 2012 23:25:32 -0700 Subject: [PATCH 049/135] SCSI: fix eh wakeup (scsi_schedule_eh vs scsi_restart_operations) commit 57fc2e335fd3c2f898ee73570dc81426c28dc7b4 upstream. Rapid ata hotplug on a libsas controller results in cases where libsas is waiting indefinitely on eh to perform an ata probe. A race exists between scsi_schedule_eh() and scsi_restart_operations() in the case when scsi_restart_operations() issues i/o to other devices in the sas domain. When this happens the host state transitions from SHOST_RECOVERY (set by scsi_schedule_eh) back to SHOST_RUNNING and ->host_busy is non-zero so we put the eh thread to sleep even though ->host_eh_scheduled is active. Before putting the error handler to sleep we need to check if the host_state needs to return to SHOST_RECOVERY for another trip through eh. Since i/o that is released by scsi_restart_operations has been blocked for at least one eh cycle, this implementation allows those i/o's to run before another eh cycle starts to discourage hung task timeouts. Reported-by: Tom Jackson Tested-by: Tom Jackson Signed-off-by: Dan Williams Signed-off-by: James Bottomley Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi_error.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index a4b9cdbaaa0b..7f1afdefb350 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1665,6 +1665,20 @@ static void scsi_restart_operations(struct Scsi_Host *shost) * requests are started. */ scsi_run_host_queues(shost); + + /* + * if eh is active and host_eh_scheduled is pending we need to re-run + * recovery. we do this check after scsi_run_host_queues() to allow + * everything pent up since the last eh run a chance to make forward + * progress before we sync again. Either we'll immediately re-run + * recovery or scsi_device_unbusy() will wake us again when these + * pending commands complete. + */ + spin_lock_irqsave(shost->host_lock, flags); + if (shost->host_eh_scheduled) + if (scsi_host_set_state(shost, SHOST_RECOVERY)) + WARN_ON(scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY)); + spin_unlock_irqrestore(shost->host_lock, flags); } /** From 8fff2f802f32cbb0ed253117cc01586762c22ae9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 21 Jun 2012 23:47:28 -0700 Subject: [PATCH 050/135] SCSI: fix hot unplug vs async scan race commit 3b661a92e869ebe2358de8f4b3230ad84f7fce51 upstream. The following crash results from cases where the end_device has been removed before scsi_sysfs_add_sdev has had a chance to run. BUG: unable to handle kernel NULL pointer dereference at 0000000000000098 IP: [] sysfs_create_dir+0x32/0xb6 ... Call Trace: [] kobject_add_internal+0x120/0x1e3 [] ? trace_hardirqs_on+0xd/0xf [] kobject_add_varg+0x41/0x50 [] kobject_add+0x64/0x66 [] device_add+0x12d/0x63a [] ? _raw_spin_unlock_irqrestore+0x47/0x56 [] ? module_refcount+0x89/0xa0 [] scsi_sysfs_add_sdev+0x4e/0x28a [] do_scan_async+0x9c/0x145 ...teach scsi_sysfs_add_devices() to check for deleted devices() before trying to add them, and teach scsi_remove_target() how to remove targets that have not been added via device_add(). Reported-by: Dariusz Majchrzak Signed-off-by: Dan Williams Signed-off-by: James Bottomley Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi_scan.c | 3 +++ drivers/scsi/scsi_sysfs.c | 43 ++++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 6e7ea4a2b7a1..a48b59c0354a 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -1710,6 +1710,9 @@ static void scsi_sysfs_add_devices(struct Scsi_Host *shost) { struct scsi_device *sdev; shost_for_each_device(sdev, shost) { + /* target removed before the device could be added */ + if (sdev->sdev_state == SDEV_DEL) + continue; if (!scsi_host_scan_allowed(shost) || scsi_sysfs_add_sdev(sdev) != 0) __scsi_remove_device(sdev); diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index e0bd3f790fca..de21547e7c43 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -962,7 +962,6 @@ static void __scsi_remove_target(struct scsi_target *starget) struct scsi_device *sdev; spin_lock_irqsave(shost->host_lock, flags); - starget->reap_ref++; restart: list_for_each_entry(sdev, &shost->__devices, siblings) { if (sdev->channel != starget->channel || @@ -976,14 +975,6 @@ static void __scsi_remove_target(struct scsi_target *starget) goto restart; } spin_unlock_irqrestore(shost->host_lock, flags); - scsi_target_reap(starget); -} - -static int __remove_child (struct device * dev, void * data) -{ - if (scsi_is_target_device(dev)) - __scsi_remove_target(to_scsi_target(dev)); - return 0; } /** @@ -996,14 +987,34 @@ static int __remove_child (struct device * dev, void * data) */ void scsi_remove_target(struct device *dev) { - if (scsi_is_target_device(dev)) { - __scsi_remove_target(to_scsi_target(dev)); - return; - } + struct Scsi_Host *shost = dev_to_shost(dev->parent); + struct scsi_target *starget, *found; + unsigned long flags; - get_device(dev); - device_for_each_child(dev, NULL, __remove_child); - put_device(dev); + restart: + found = NULL; + spin_lock_irqsave(shost->host_lock, flags); + list_for_each_entry(starget, &shost->__targets, siblings) { + if (starget->state == STARGET_DEL) + continue; + if (starget->dev.parent == dev || &starget->dev == dev) { + found = starget; + found->reap_ref++; + break; + } + } + spin_unlock_irqrestore(shost->host_lock, flags); + + if (found) { + __scsi_remove_target(found); + scsi_target_reap(found); + /* in the case where @dev has multiple starget children, + * continue removing. + * + * FIXME: does such a case exist? + */ + goto restart; + } } EXPORT_SYMBOL(scsi_remove_target); From 8add44b313360a3940e6734b96c636c8c8edc5f8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 29 Jun 2012 15:34:26 +0000 Subject: [PATCH 051/135] SCSI: Avoid dangling pointer in scsi_requeue_command() commit 940f5d47e2f2e1fa00443921a0abf4822335b54d upstream. When we call scsi_unprep_request() the command associated with the request gets destroyed and therefore drops its reference on the device. If this was the only reference, the device may get released and we end up with a NULL pointer deref when we call blk_requeue_request. Reported-by: Mike Christie Signed-off-by: Bart Van Assche Reviewed-by: Mike Christie Reviewed-by: Tejun Heo [jejb: enhance commend and add commit log for stable] Signed-off-by: James Bottomley Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi_lib.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 99fc45bb72df..dd454c4f99aa 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -481,15 +481,26 @@ void scsi_requeue_run_queue(struct work_struct *work) */ static void scsi_requeue_command(struct request_queue *q, struct scsi_cmnd *cmd) { + struct scsi_device *sdev = cmd->device; struct request *req = cmd->request; unsigned long flags; + /* + * We need to hold a reference on the device to avoid the queue being + * killed after the unlock and before scsi_run_queue is invoked which + * may happen because scsi_unprep_request() puts the command which + * releases its reference on the device. + */ + get_device(&sdev->sdev_gendev); + spin_lock_irqsave(q->queue_lock, flags); scsi_unprep_request(req); blk_requeue_request(q, req); spin_unlock_irqrestore(q->queue_lock, flags); scsi_run_queue(q); + + put_device(&sdev->sdev_gendev); } void scsi_next_command(struct scsi_cmnd *cmd) From c3d6a03a5702fb0971109d235dd3c74a5bd08248 Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Fri, 18 May 2012 12:26:19 -0500 Subject: [PATCH 052/135] ARM: OMAP2+: OPP: Fix to ensure check of right oppdef after bad one commit b110547e586eb5825bc1d04aa9147bff83b57672 upstream. Commit 9fa2df6b90786301b175e264f5fa9846aba81a65 (ARM: OMAP2+: OPP: allow OPP enumeration to continue if device is not present) makes the logic: for (i = 0; i < opp_def_size; i++) { if (!oh || !oh->od) { continue; } opp_def++; } In short, the moment we hit a "Bad OPP", we end up looping the list comparing against the bad opp definition pointer for the rest of the iteration count. Instead, increment opp_def in the for loop itself and allow continue to be used in code without much thought so that we check the next set of OPP definition pointers :) Cc: Steve Sakoman Cc: Tony Lindgren Signed-off-by: Nishanth Menon Signed-off-by: Kevin Hilman Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-omap2/opp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/mach-omap2/opp.c b/arch/arm/mach-omap2/opp.c index ab8b35b780b5..062749483476 100644 --- a/arch/arm/mach-omap2/opp.c +++ b/arch/arm/mach-omap2/opp.c @@ -53,7 +53,7 @@ int __init omap_init_opp_table(struct omap_opp_def *opp_def, omap_table_init = 1; /* Lets now register with OPP library */ - for (i = 0; i < opp_def_size; i++) { + for (i = 0; i < opp_def_size; i++, opp_def++) { struct omap_hwmod *oh; struct device *dev; @@ -86,7 +86,6 @@ int __init omap_init_opp_table(struct omap_opp_def *opp_def, __func__, opp_def->freq, opp_def->hwmod_name, i, r); } - opp_def++; } return 0; From c6c3f3ff6c89fdd50ac44ed7787141d8a50b79de Mon Sep 17 00:00:00 2001 From: David Henningsson Date: Wed, 18 Jul 2012 07:38:46 +0200 Subject: [PATCH 053/135] ALSA: hda - Add support for Realtek ALC282 commit 4e01ec636e64707d202a1ca21a47bbc6d53085b7 upstream. This codec has a separate dmic path (separate dmic only ADC), and thus it looks mostly like ALC275. BugLink: https://bugs.launchpad.net/bugs/1025377 Tested-by: Ray Chen Signed-off-by: David Henningsson Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index baa7a49acbb2..8d288a7836ca 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -20133,6 +20133,7 @@ static const struct hda_codec_preset snd_hda_preset_realtek[] = { { .id = 0x10ec0275, .name = "ALC275", .patch = patch_alc269 }, { .id = 0x10ec0276, .name = "ALC276", .patch = patch_alc269 }, { .id = 0x10ec0280, .name = "ALC280", .patch = patch_alc269 }, + { .id = 0x10ec0282, .name = "ALC282", .patch = patch_alc269 }, { .id = 0x10ec0861, .rev = 0x100340, .name = "ALC660", .patch = patch_alc861 }, { .id = 0x10ec0660, .name = "ALC660-VD", .patch = patch_alc861vd }, From a0f7a5ac6e752612427077b2c04db1e0ae720a66 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 4 Jul 2012 09:18:01 +0200 Subject: [PATCH 054/135] usbdevfs: Correct amount of data copied to user in processcompl_compat commit 2102e06a5f2e414694921f23591f072a5ba7db9f upstream. iso data buffers may have holes in them if some packets were short, so for iso urbs we should always copy the entire buffer, just like the regular processcompl does. Signed-off-by: Hans de Goede Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index ca3c303eed81..4d1f996b9875 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1557,10 +1557,14 @@ static int processcompl_compat(struct async *as, void __user * __user *arg) void __user *addr = as->userurb; unsigned int i; - if (as->userbuffer && urb->actual_length) - if (copy_to_user(as->userbuffer, urb->transfer_buffer, - urb->actual_length)) + if (as->userbuffer && urb->actual_length) { + if (urb->number_of_packets > 0) /* Isochronous */ + i = urb->transfer_buffer_length; + else /* Non-Isoc */ + i = urb->actual_length; + if (copy_to_user(as->userbuffer, urb->transfer_buffer, i)) return -EFAULT; + } if (put_user(as->status, &userurb->status)) return -EFAULT; if (put_user(urb->actual_length, &userurb->actual_length)) From f58f16f2039cca9dc58a406593e5f46c6a35e0df Mon Sep 17 00:00:00 2001 From: Kevin Cernekee Date: Sun, 24 Jun 2012 21:11:22 -0700 Subject: [PATCH 055/135] usb: gadget: Fix g_ether interface link status commit 31bde1ceaa873bcaecd49e829bfabceacc4c512d upstream. A "usb0" interface that has never been connected to a host has an unknown operstate, and therefore the IFF_RUNNING flag is (incorrectly) asserted when queried by ifconfig, ifplugd, etc. This is a result of calling netif_carrier_off() too early in the probe function; it should be called after register_netdev(). Similar problems have been fixed in many other drivers, e.g.: e826eafa6 (bonding: Call netif_carrier_off after register_netdevice) 0d672e9f8 (drivers/net: Call netif_carrier_off at the end of the probe) 6a3c869a6 (cxgb4: fix reported state of interfaces without link) Fix is to move netif_carrier_off() to the end of the function. Signed-off-by: Kevin Cernekee Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/u_ether.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/usb/gadget/u_ether.c b/drivers/usb/gadget/u_ether.c index 2ac1d2147325..a52404a1aef5 100644 --- a/drivers/usb/gadget/u_ether.c +++ b/drivers/usb/gadget/u_ether.c @@ -803,12 +803,6 @@ int gether_setup(struct usb_gadget *g, u8 ethaddr[ETH_ALEN]) SET_ETHTOOL_OPS(net, &ops); - /* two kinds of host-initiated state changes: - * - iff DATA transfer is active, carrier is "on" - * - tx queueing enabled if open *and* carrier is "on" - */ - netif_carrier_off(net); - dev->gadget = g; SET_NETDEV_DEV(net, &g->dev); SET_NETDEV_DEVTYPE(net, &gadget_type); @@ -822,6 +816,12 @@ int gether_setup(struct usb_gadget *g, u8 ethaddr[ETH_ALEN]) INFO(dev, "HOST MAC %pM\n", dev->host_mac); the_dev = dev; + + /* two kinds of host-initiated state changes: + * - iff DATA transfer is active, carrier is "on" + * - tx queueing enabled if open *and* carrier is "on" + */ + netif_carrier_off(net); } return status; From dc525df9895f810ba777feec1540c7b822512c04 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 23 Jul 2012 15:17:17 -0400 Subject: [PATCH 056/135] locks: fix checking of fcntl_setlease argument commit 0ec4f431eb56d633da3a55da67d5c4b88886ccc7 upstream. The only checks of the long argument passed to fcntl(fd,F_SETLEASE,.) are done after converting the long to an int. Thus some illegal values may be let through and cause problems in later code. [ They actually *don't* cause problems in mainline, as of Dave Jones's commit 8d657eb3b438 "Remove easily user-triggerable BUG from generic_setlease", but we should fix this anyway. And this patch will be necessary to fix real bugs on earlier kernels. ] Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/locks.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index b286539d547a..35388d524c75 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -315,7 +315,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock, return 0; } -static int assign_type(struct file_lock *fl, int type) +static int assign_type(struct file_lock *fl, long type) { switch (type) { case F_RDLCK: @@ -452,7 +452,7 @@ static const struct lock_manager_operations lease_manager_ops = { /* * Initialize a lease, use the default lock manager operations */ -static int lease_init(struct file *filp, int type, struct file_lock *fl) +static int lease_init(struct file *filp, long type, struct file_lock *fl) { if (assign_type(fl, type) != 0) return -EINVAL; @@ -470,7 +470,7 @@ static int lease_init(struct file *filp, int type, struct file_lock *fl) } /* Allocate a file_lock initialised to this type of lease */ -static struct file_lock *lease_alloc(struct file *filp, int type) +static struct file_lock *lease_alloc(struct file *filp, long type) { struct file_lock *fl = locks_alloc_lock(); int error = -ENOMEM; From 31b1c0850709c77ece1690a8fbc043829717d14c Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 16 Jun 2012 15:30:45 +0200 Subject: [PATCH 057/135] ftrace: Disable function tracing during suspend/resume and hibernation, again commit 443772d408a25af62498793f6f805ce3c559309a upstream. If function tracing is enabled for some of the low-level suspend/resume functions, it leads to triple fault during resume from suspend, ultimately ending up in a reboot instead of a resume (or a total refusal to come out of suspended state, on some machines). This issue was explained in more detail in commit f42ac38c59e0a03d (ftrace: disable tracing for suspend to ram). However, the changes made by that commit got reverted by commit cbe2f5a6e84eebb (tracing: allow tracing of suspend/resume & hibernation code again). So, unfortunately since things are not yet robust enough to allow tracing of low-level suspend/resume functions, suspend/resume is still broken when ftrace is enabled. So fix this by disabling function tracing during suspend/resume & hibernation. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- kernel/power/hibernate.c | 6 ++++++ kernel/power/suspend.c | 3 +++ 2 files changed, 9 insertions(+) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8884c27682f5..32f1590644d2 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -344,6 +344,7 @@ int hibernation_snapshot(int platform_mode) goto Complete_devices; suspend_console(); + ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); if (error) @@ -369,6 +370,7 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); + ftrace_start(); resume_console(); Complete_devices: @@ -471,6 +473,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); + ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { @@ -478,6 +481,7 @@ int hibernation_restore(int platform_mode) dpm_resume_end(PMSG_RECOVER); } pm_restore_gfp_mask(); + ftrace_start(); resume_console(); pm_restore_console(); return error; @@ -504,6 +508,7 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); + ftrace_stop(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -547,6 +552,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); + ftrace_start(); resume_console(); Close: diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 449ccc9c0a4f..e40d20595b15 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "power.h" @@ -210,6 +211,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + ftrace_stop(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -226,6 +228,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + ftrace_start(); resume_console(); Close: if (suspend_ops->end) From 8d50f086b22f886265031643748e4089257c768b Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 5 Jun 2012 11:15:50 -0400 Subject: [PATCH 058/135] stable: update references to older 2.6 versions for 3.x commit 2584f5212d97b664be250ad5700a2d0fee31a10d upstream. Also add information on where the respective trees are. Signed-off-by: Paul Gortmaker Acked-by: Rob Landley Signed-off-by: Greg Kroah-Hartman --- Documentation/stable_kernel_rules.txt | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/Documentation/stable_kernel_rules.txt b/Documentation/stable_kernel_rules.txt index e1f856b1b456..22bf11b846cc 100644 --- a/Documentation/stable_kernel_rules.txt +++ b/Documentation/stable_kernel_rules.txt @@ -1,4 +1,4 @@ -Everything you ever wanted to know about Linux 2.6 -stable releases. +Everything you ever wanted to know about Linux -stable releases. Rules on what kind of patches are accepted, and which ones are not, into the "-stable" tree: @@ -41,10 +41,10 @@ Procedure for submitting patches to the -stable tree: cherry-picked than this can be specified in the following format in the sign-off area: - Cc: # .32.x: a1f84a3: sched: Check for idle - Cc: # .32.x: 1b9508f: sched: Rate-limit newidle - Cc: # .32.x: fd21073: sched: Fix affinity logic - Cc: # .32.x + Cc: # 3.3.x: a1f84a3: sched: Check for idle + Cc: # 3.3.x: 1b9508f: sched: Rate-limit newidle + Cc: # 3.3.x: fd21073: sched: Fix affinity logic + Cc: # 3.3.x Signed-off-by: Ingo Molnar The tag sequence has the meaning of: @@ -78,6 +78,15 @@ Review cycle: security kernel team, and not go through the normal review cycle. Contact the kernel security team for more details on this procedure. +Trees: + + - The queues of patches, for both completed versions and in progress + versions can be found at: + http://git.kernel.org/?p=linux/kernel/git/stable/stable-queue.git + - The finalized and tagged releases of all stable kernels can be found + in separate branches per version at: + http://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git + Review committee: From b1c7ba1bab7363fee6dc5d4ee5be4e916adcf691 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:26 -0700 Subject: [PATCH 059/135] workqueue: perform cpu down operations from low priority cpu_notifier() commit 6575820221f7a4dd6eadecf7bf83cdd154335eda upstream. Currently, all workqueue cpu hotplug operations run off CPU_PRI_WORKQUEUE which is higher than normal notifiers. This is to ensure that workqueue is up and running while bringing up a CPU before other notifiers try to use workqueue on the CPU. Per-cpu workqueues are supposed to remain working and bound to the CPU for normal CPU_DOWN_PREPARE notifiers. This holds mostly true even with workqueue offlining running with higher priority because workqueue CPU_DOWN_PREPARE only creates a bound trustee thread which runs the per-cpu workqueue without concurrency management without explicitly detaching the existing workers. However, if the trustee needs to create new workers, it creates unbound workers which may wander off to other CPUs while CPU_DOWN_PREPARE notifiers are in progress. Furthermore, if the CPU down is cancelled, the per-CPU workqueue may end up with workers which aren't bound to the CPU. While reliably reproducible with a convoluted artificial test-case involving scheduling and flushing CPU burning work items from CPU down notifiers, this isn't very likely to happen in the wild, and, even when it happens, the effects are likely to be hidden by the following successful CPU down. Fix it by using different priorities for up and down notifiers - high priority for up operations and low priority for down operations. Workqueue cpu hotplug operations will soon go through further cleanup. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" Signed-off-by: Greg Kroah-Hartman --- include/linux/cpu.h | 5 +++-- kernel/workqueue.c | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 5f09323ee880..42af2eae8805 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -66,8 +66,9 @@ enum { /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, - /* prepare workqueues for other notifiers */ - CPU_PRI_WORKQUEUE = 5, + /* bring up workqueues before normal notifiers and down after */ + CPU_PRI_WORKQUEUE_UP = 5, + CPU_PRI_WORKQUEUE_DOWN = -5, }; #ifdef CONFIG_SMP diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee1845b8d690..e88c924fc6b3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3561,6 +3561,41 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, return notifier_from_errno(0); } +/* + * Workqueues should be brought up before normal priority CPU notifiers. + * This will be registered high priority CPU notifier. + */ +static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + return workqueue_cpu_callback(nfb, action, hcpu); + } + return NOTIFY_OK; +} + +/* + * Workqueues should be brought down after normal priority CPU notifiers. + * This will be registered as low priority CPU notifier. + */ +static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + case CPU_DYING: + case CPU_POST_DEAD: + return workqueue_cpu_callback(nfb, action, hcpu); + } + return NOTIFY_OK; +} + #ifdef CONFIG_SMP struct work_for_cpu { @@ -3754,7 +3789,8 @@ static int __init init_workqueues(void) unsigned int cpu; int i; - cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); + cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { From 53895e01fe540ddd0c9f2615468a04cb48d9ed2f Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Fri, 20 Jul 2012 13:29:16 +0800 Subject: [PATCH 060/135] ACPI/AC: prevent OOPS on some boxes due to missing check power_supply_register() return value check commit f197ac13f6eeb351b31250b9ab7d0da17434ea36 upstream. In the ac.c, power_supply_register()'s return value is not checked. As a result, the driver's add() ops may return success even though the device failed to initialize. For example, some BIOS may describe two ACADs in the same DSDT. The second ACAD device will fail to register, but ACPI driver's add() ops returns sucessfully. The ACPI device will receive ACPI notification and cause OOPS. https://bugzilla.redhat.com/show_bug.cgi?id=772730 Signed-off-by: Lan Tianyu Signed-off-by: Len Brown Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/ac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index 58c3f74bd84c..958205046e06 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -292,7 +292,9 @@ static int acpi_ac_add(struct acpi_device *device) ac->charger.properties = ac_props; ac->charger.num_properties = ARRAY_SIZE(ac_props); ac->charger.get_property = get_ac_property; - power_supply_register(&ac->device->dev, &ac->charger); + result = power_supply_register(&ac->device->dev, &ac->charger); + if (result) + goto end; printk(KERN_INFO PREFIX "%s [%s] (%s)\n", acpi_device_name(device), acpi_device_bid(device), From 4ffd3692dd84a5979310c5e86bcf942e4b46c9ce Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Jul 2012 15:57:13 -0400 Subject: [PATCH 061/135] Btrfs: call the ordered free operation without any locks held commit e9fbcb42201c862fd6ab45c48ead4f47bb2dea9d upstream. Each ordered operation has a free callback, and this was called with the worker spinlock held. Josef made the free callback also call iput, which we can't do with the spinlock. This drops the spinlock for the free operation and grabs it again before moving through the rest of the list. We'll circle back around to this and find a cleaner way that doesn't bounce the lock around so much. Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/async-thread.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 7ec14097fef1..8006a28390fb 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -212,10 +212,17 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, work->ordered_func(work); - /* now take the lock again and call the freeing code */ + /* now take the lock again and drop our item from the list */ spin_lock(&workers->order_lock); list_del(&work->order_list); + spin_unlock(&workers->order_lock); + + /* + * we don't want to call the ordered free functions + * with the lock held though + */ work->ordered_free(work); + spin_lock(&workers->order_lock); } spin_unlock(&workers->order_lock); From 4826f249d0b09bb6d8969277b43df9ffc3bccfe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= Date: Tue, 17 Jul 2012 19:02:09 +0200 Subject: [PATCH 062/135] drm/radeon: Try harder to avoid HW cursor ending on a multiple of 128 columns. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f60ec4c7df043df81e62891ac45383d012afe0da upstream. This could previously fail if either of the enabled displays was using a horizontal resolution that is a multiple of 128, and only the leftmost column of the cursor was (supposed to be) visible at the right edge of that display. The solution is to move the cursor one pixel to the left in that case. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=33183 Signed-off-by: Michel Dänzer Reviewed-by: Alex Deucher Signed-off-by: Dave Airlie Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_cursor.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_cursor.c b/drivers/gpu/drm/radeon/radeon_cursor.c index 3fb222615c6f..72f749d56c12 100644 --- a/drivers/gpu/drm/radeon/radeon_cursor.c +++ b/drivers/gpu/drm/radeon/radeon_cursor.c @@ -257,8 +257,14 @@ int radeon_crtc_cursor_move(struct drm_crtc *crtc, if (!(cursor_end & 0x7f)) w--; } - if (w <= 0) + if (w <= 0) { w = 1; + cursor_end = x - xorigin + w; + if (!(cursor_end & 0x7f)) { + x--; + WARN_ON_ONCE(x < 0); + } + } } } From ea07d57bea33b7004d1773db115bf62bb7788c99 Mon Sep 17 00:00:00 2001 From: Jerome Glisse Date: Tue, 17 Jul 2012 17:17:16 -0400 Subject: [PATCH 063/135] drm/radeon: fix non revealent error message commit 8d1c702aa0b2c4b22b0742b72a1149d91690674b upstream. We want to print link status query failed only if it's an unexepected fail. If we query to see if we need link training it might be because there is nothing connected and thus link status query have the right to fail in that case. To avoid printing failure when it's expected, move the failure message to proper place. Signed-off-by: Jerome Glisse Signed-off-by: Dave Airlie Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/atombios_dp.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c index 3b77ad60ed51..efc2b214b340 100644 --- a/drivers/gpu/drm/radeon/atombios_dp.c +++ b/drivers/gpu/drm/radeon/atombios_dp.c @@ -22,6 +22,7 @@ * * Authors: Dave Airlie * Alex Deucher + * Jerome Glisse */ #include "drmP.h" #include "radeon_drm.h" @@ -624,7 +625,6 @@ static bool radeon_dp_get_link_status(struct radeon_connector *radeon_connector, ret = radeon_dp_aux_native_read(radeon_connector, DP_LANE0_1_STATUS, link_status, DP_LINK_STATUS_SIZE, 100); if (ret <= 0) { - DRM_ERROR("displayport link status failed\n"); return false; } @@ -797,8 +797,10 @@ static int radeon_dp_link_train_cr(struct radeon_dp_link_train_info *dp_info) else mdelay(dp_info->rd_interval * 4); - if (!radeon_dp_get_link_status(dp_info->radeon_connector, dp_info->link_status)) + if (!radeon_dp_get_link_status(dp_info->radeon_connector, dp_info->link_status)) { + DRM_ERROR("displayport link status failed\n"); break; + } if (dp_clock_recovery_ok(dp_info->link_status, dp_info->dp_lane_count)) { clock_recovery = true; @@ -860,8 +862,10 @@ static int radeon_dp_link_train_ce(struct radeon_dp_link_train_info *dp_info) else mdelay(dp_info->rd_interval * 4); - if (!radeon_dp_get_link_status(dp_info->radeon_connector, dp_info->link_status)) + if (!radeon_dp_get_link_status(dp_info->radeon_connector, dp_info->link_status)) { + DRM_ERROR("displayport link status failed\n"); break; + } if (dp_channel_eq_ok(dp_info->link_status, dp_info->dp_lane_count)) { channel_eq = true; From a0283f9072a2c289926a8bafb99a231a55eb7517 Mon Sep 17 00:00:00 2001 From: Jerome Glisse Date: Thu, 19 Jul 2012 17:15:56 -0400 Subject: [PATCH 064/135] drm/radeon: fix hotplug of DP to DVI|HDMI passive adapters (v2) commit 266dcba541a1ef7e5d82d9e67c67fde2910636e8 upstream. No need to retrain the link for passive adapters. v2: agd5f - no passive DP to VGA adapters, update comments - assign radeon_connector_atom_dig after we are sure we have a digital connector as analog connectors have different private data. - get new sink type before checking for retrain. No need to check if it's no longer a DP connection. Signed-off-by: Jerome Glisse Signed-off-by: Alex Deucher Signed-off-by: Dave Airlie Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_connectors.c | 27 ++++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_connectors.c b/drivers/gpu/drm/radeon/radeon_connectors.c index 1f6a0f55ad19..ea6f60fc188d 100644 --- a/drivers/gpu/drm/radeon/radeon_connectors.c +++ b/drivers/gpu/drm/radeon/radeon_connectors.c @@ -66,14 +66,27 @@ void radeon_connector_hotplug(struct drm_connector *connector) /* just deal with DP (not eDP) here. */ if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort) { - int saved_dpms = connector->dpms; + struct radeon_connector_atom_dig *dig_connector = + radeon_connector->con_priv; - /* Only turn off the display it it's physically disconnected */ - if (!radeon_hpd_sense(rdev, radeon_connector->hpd.hpd)) - drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); - else if (radeon_dp_needs_link_train(radeon_connector)) - drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); - connector->dpms = saved_dpms; + /* if existing sink type was not DP no need to retrain */ + if (dig_connector->dp_sink_type != CONNECTOR_OBJECT_ID_DISPLAYPORT) + return; + + /* first get sink type as it may be reset after (un)plug */ + dig_connector->dp_sink_type = radeon_dp_getsinktype(radeon_connector); + /* don't do anything if sink is not display port, i.e., + * passive dp->(dvi|hdmi) adaptor + */ + if (dig_connector->dp_sink_type == CONNECTOR_OBJECT_ID_DISPLAYPORT) { + int saved_dpms = connector->dpms; + /* Only turn off the display if it's physically disconnected */ + if (!radeon_hpd_sense(rdev, radeon_connector->hpd.hpd)) + drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); + else if (radeon_dp_needs_link_train(radeon_connector)) + drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); + connector->dpms = saved_dpms; + } } } From 073271315cd11ae149fde728a8c0d96fb2ad03eb Mon Sep 17 00:00:00 2001 From: Jerome Glisse Date: Thu, 19 Jul 2012 17:25:55 -0400 Subject: [PATCH 065/135] drm/radeon: on hotplug force link training to happen (v2) commit ca2ccde5e2f24a792caa4cca919fc5c6f65d1887 upstream. To have DP behave like VGA/DVI we need to retrain the link on hotplug. For this to happen we need to force link training to happen by setting connector dpms to off before asking it turning it on again. v2: agd5f - drop the dp_get_link_status() change in atombios_dp.c for now. We still need the dpms OFF change. Signed-off-by: Jerome Glisse Signed-off-by: Alex Deucher Signed-off-by: Dave Airlie Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/radeon/radeon_connectors.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_connectors.c b/drivers/gpu/drm/radeon/radeon_connectors.c index ea6f60fc188d..f1a1e8aa4dad 100644 --- a/drivers/gpu/drm/radeon/radeon_connectors.c +++ b/drivers/gpu/drm/radeon/radeon_connectors.c @@ -81,10 +81,16 @@ void radeon_connector_hotplug(struct drm_connector *connector) if (dig_connector->dp_sink_type == CONNECTOR_OBJECT_ID_DISPLAYPORT) { int saved_dpms = connector->dpms; /* Only turn off the display if it's physically disconnected */ - if (!radeon_hpd_sense(rdev, radeon_connector->hpd.hpd)) + if (!radeon_hpd_sense(rdev, radeon_connector->hpd.hpd)) { drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); - else if (radeon_dp_needs_link_train(radeon_connector)) + } else if (radeon_dp_needs_link_train(radeon_connector)) { + /* set it to OFF so that drm_helper_connector_dpms() + * won't return immediately since the current state + * is ON at this point. + */ + connector->dpms = DRM_MODE_DPMS_OFF; drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); + } connector->dpms = saved_dpms; } } From 9d0ed6ec043bb8cbb895d13ce780878e2e50df80 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 5 Jun 2012 16:52:06 -0400 Subject: [PATCH 066/135] nfsd4: our filesystems are normally case sensitive commit 2930d381d22b9c56f40dd4c63a8fa59719ca2c3c upstream. Actually, xfs and jfs can optionally be case insensitive; we'll handle that case in later patches. Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 6c740974bfe0..f91d58990b65 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2010,7 +2010,7 @@ out_acl: if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { if ((buflen -= 4) < 0) goto out_resource; - WRITE32(1); + WRITE32(0); } if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { if ((buflen -= 4) < 0) From eb65b85e1bce06b665b3568c19a249cd886fa6ff Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Jul 2012 13:58:51 -0400 Subject: [PATCH 067/135] nfs: skip commit in releasepage if we're freeing memory for fs-related reasons commit 5cf02d09b50b1ee1c2d536c9cf64af5a7d433f56 upstream. We've had some reports of a deadlock where rpciod ends up with a stack trace like this: PID: 2507 TASK: ffff88103691ab40 CPU: 14 COMMAND: "rpciod/14" #0 [ffff8810343bf2f0] schedule at ffffffff814dabd9 #1 [ffff8810343bf3b8] nfs_wait_bit_killable at ffffffffa038fc04 [nfs] #2 [ffff8810343bf3c8] __wait_on_bit at ffffffff814dbc2f #3 [ffff8810343bf418] out_of_line_wait_on_bit at ffffffff814dbcd8 #4 [ffff8810343bf488] nfs_commit_inode at ffffffffa039e0c1 [nfs] #5 [ffff8810343bf4f8] nfs_release_page at ffffffffa038bef6 [nfs] #6 [ffff8810343bf528] try_to_release_page at ffffffff8110c670 #7 [ffff8810343bf538] shrink_page_list.clone.0 at ffffffff81126271 #8 [ffff8810343bf668] shrink_inactive_list at ffffffff81126638 #9 [ffff8810343bf818] shrink_zone at ffffffff8112788f #10 [ffff8810343bf8c8] do_try_to_free_pages at ffffffff81127b1e #11 [ffff8810343bf958] try_to_free_pages at ffffffff8112812f #12 [ffff8810343bfa08] __alloc_pages_nodemask at ffffffff8111fdad #13 [ffff8810343bfb28] kmem_getpages at ffffffff81159942 #14 [ffff8810343bfb58] fallback_alloc at ffffffff8115a55a #15 [ffff8810343bfbd8] ____cache_alloc_node at ffffffff8115a2d9 #16 [ffff8810343bfc38] kmem_cache_alloc at ffffffff8115b09b #17 [ffff8810343bfc78] sk_prot_alloc at ffffffff81411808 #18 [ffff8810343bfcb8] sk_alloc at ffffffff8141197c #19 [ffff8810343bfce8] inet_create at ffffffff81483ba6 #20 [ffff8810343bfd38] __sock_create at ffffffff8140b4a7 #21 [ffff8810343bfd98] xs_create_sock at ffffffffa01f649b [sunrpc] #22 [ffff8810343bfdd8] xs_tcp_setup_socket at ffffffffa01f6965 [sunrpc] #23 [ffff8810343bfe38] worker_thread at ffffffff810887d0 #24 [ffff8810343bfee8] kthread at ffffffff8108dd96 #25 [ffff8810343bff48] kernel_thread at ffffffff8100c1ca rpciod is trying to allocate memory for a new socket to talk to the server. The VM ends up calling ->releasepage to get more memory, and it tries to do a blocking commit. That commit can't succeed however without a connected socket, so we deadlock. Fix this by setting PF_FSTRANS on the workqueue task prior to doing the socket allocation, and having nfs_release_page check for that flag when deciding whether to do a commit call. Also, set PF_FSTRANS unconditionally in rpc_async_schedule since that function can also do allocations sometimes. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/file.c | 7 +++++-- net/sunrpc/sched.c | 2 ++ net/sunrpc/xprtrdma/transport.c | 3 ++- net/sunrpc/xprtsock.c | 10 ++++++++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index dd2f13077bee..6c6e2c461222 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -493,8 +493,11 @@ static int nfs_release_page(struct page *page, gfp_t gfp) dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); - /* Only do I/O if gfp is a superset of GFP_KERNEL */ - if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL) { + /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not + * doing this memory reclaim for a fs-related allocation. + */ + if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && + !(current->flags & PF_FSTRANS)) { int how = FLUSH_SYNC; /* Don't let kswapd deadlock waiting for OOM RPC calls */ diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b6bb22571c57..c57f97f44e6e 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -713,7 +713,9 @@ void rpc_execute(struct rpc_task *task) static void rpc_async_schedule(struct work_struct *work) { + current->flags |= PF_FSTRANS; __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); + current->flags &= ~PF_FSTRANS; } /** diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 0867070bb5ca..d0b5210d9810 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -200,6 +200,7 @@ xprt_rdma_connect_worker(struct work_struct *work) int rc = 0; if (!xprt->shutdown) { + current->flags |= PF_FSTRANS; xprt_clear_connected(xprt); dprintk("RPC: %s: %sconnect\n", __func__, @@ -212,10 +213,10 @@ xprt_rdma_connect_worker(struct work_struct *work) out: xprt_wake_pending_tasks(xprt, rc); - out_clear: dprintk("RPC: %s: exit\n", __func__); xprt_clear_connecting(xprt); + current->flags &= ~PF_FSTRANS; } /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ea7507979b0b..554111f42b09 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1882,6 +1882,8 @@ static void xs_local_setup_socket(struct work_struct *work) if (xprt->shutdown) goto out; + current->flags |= PF_FSTRANS; + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); status = __sock_create(xprt->xprt_net, AF_LOCAL, SOCK_STREAM, 0, &sock, 1); @@ -1915,6 +1917,7 @@ static void xs_local_setup_socket(struct work_struct *work) out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + current->flags &= ~PF_FSTRANS; } static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) @@ -1957,6 +1960,8 @@ static void xs_udp_setup_socket(struct work_struct *work) if (xprt->shutdown) goto out; + current->flags |= PF_FSTRANS; + /* Start by resetting any existing state */ xs_reset_transport(transport); sock = xs_create_sock(xprt, transport, @@ -1975,6 +1980,7 @@ static void xs_udp_setup_socket(struct work_struct *work) out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + current->flags &= ~PF_FSTRANS; } /* @@ -2100,6 +2106,8 @@ static void xs_tcp_setup_socket(struct work_struct *work) if (xprt->shutdown) goto out; + current->flags |= PF_FSTRANS; + if (!sock) { clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); sock = xs_create_sock(xprt, transport, @@ -2149,6 +2157,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EINPROGRESS: case -EALREADY: xprt_clear_connecting(xprt); + current->flags &= ~PF_FSTRANS; return; case -EINVAL: /* Happens, for instance, if the user specified a link @@ -2161,6 +2170,7 @@ out_eagain: out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); + current->flags &= ~PF_FSTRANS; } /** From 6ff2c41b81bd0778aa44ffcfce0ea623fa660887 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 30 Jun 2012 19:14:57 -0400 Subject: [PATCH 068/135] ext4: pass a char * to ext4_count_free() instead of a buffer_head ptr commit f6fb99cadcd44660c68e13f6eab28333653621e6 upstream. Make it possible for ext4_count_free to operate on buffers and not just data in buffer_heads. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/balloc.c | 3 ++- fs/ext4/bitmap.c | 8 +++----- fs/ext4/ext4.h | 2 +- fs/ext4/ialloc.c | 3 ++- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 264f6949511e..ebe95f56514a 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -514,7 +514,8 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) if (bitmap_bh == NULL) continue; - x = ext4_count_free(bitmap_bh, sb->s_blocksize); + x = ext4_count_free(bitmap_bh->b_data, + EXT4_BLOCKS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_blks_count(sb, gdp), x); bitmap_count += x; diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index fa3af81ac565..012faaaec4ad 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -15,15 +15,13 @@ static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; -unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) +unsigned int ext4_count_free(char *bitmap, unsigned int numchars) { unsigned int i, sum = 0; - if (!map) - return 0; for (i = 0; i < numchars; i++) - sum += nibblemap[map->b_data[i] & 0xf] + - nibblemap[(map->b_data[i] >> 4) & 0xf]; + sum += nibblemap[bitmap[i] & 0xf] + + nibblemap[(bitmap[i] >> 4) & 0xf]; return sum; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1a34c1c84604..e0113aa0d3a2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1713,7 +1713,7 @@ struct mmpd_data { # define NORET_AND noreturn, /* bitmap.c */ -extern unsigned int ext4_count_free(struct buffer_head *, unsigned); +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); /* balloc.c */ extern unsigned int ext4_block_group(struct super_block *sb, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 412469b241a8..29272de30232 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1193,7 +1193,8 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) if (!bitmap_bh) continue; - x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); + x = ext4_count_free(bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; From b4cbf953e071121a4082e35c9820c3c3c10aeb55 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Sun, 22 Jul 2012 23:59:40 -0400 Subject: [PATCH 069/135] ext4: don't let i_reserved_meta_blocks go negative commit 97795d2a5b8d3c8dc4365d4bd3404191840453ba upstream. If we hit a condition where we have allocated metadata blocks that were not appropriately reserved, we risk underflow of ei->i_reserved_meta_blocks. In turn, this can throw sbi->s_dirtyclusters_counter significantly out of whack and undermine the nondelalloc fallback logic in ext4_nonda_switch(). Warn if this occurs and set i_allocated_meta_blocks to avoid this problem. This condition is reproduced by xfstests 270 against ext2 with delalloc enabled: Mar 28 08:58:02 localhost kernel: [ 171.526344] EXT4-fs (loop1): delayed block allocation failed for inode 14 at logical offset 64486 with max blocks 64 with error -28 Mar 28 08:58:02 localhost kernel: [ 171.526346] EXT4-fs (loop1): This should not happen!! Data will be lost 270 ultimately fails with an inconsistent filesystem and requires an fsck to repair. The cause of the error is an underflow in ext4_da_update_reserve_space() due to an unreserved meta block allocation. Signed-off-by: Brian Foster Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c1e6a7263893..18fee6daecd5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1134,6 +1134,15 @@ void ext4_da_update_reserve_space(struct inode *inode, used = ei->i_reserved_data_blocks; } + if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) { + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, allocated %d " + "with only %d reserved metadata blocks\n", __func__, + inode->i_ino, ei->i_allocated_meta_blocks, + ei->i_reserved_meta_blocks); + WARN_ON(1); + ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks; + } + /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; From 6577472957c45c35cedaa0e53748cb1c6c954e44 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 10 Jul 2012 10:04:40 +0000 Subject: [PATCH 070/135] bnx2: Fix bug in bnx2_free_tx_skbs(). [ Upstream commit c1f5163de417dab01fa9daaf09a74bbb19303f3c ] In rare cases, bnx2x_free_tx_skbs() can unmap the wrong DMA address when it gets to the last entry of the tx ring. We were not using the proper macro to skip the last entry when advancing the tx index. Reported-by: Zongyun Lai Reviewed-by: Jeffrey Huang Signed-off-by: Michael Chan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/bnx2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 74580bb175f1..c9b123c1608b 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -5310,7 +5310,7 @@ bnx2_free_tx_skbs(struct bnx2 *bp) int k, last; if (skb == NULL) { - j++; + j = NEXT_TX_BD(j); continue; } @@ -5322,8 +5322,8 @@ bnx2_free_tx_skbs(struct bnx2 *bp) tx_buf->skb = NULL; last = tx_buf->nr_frags; - j++; - for (k = 0; k < last; k++, j++) { + j = NEXT_TX_BD(j); + for (k = 0; k < last; k++, j = NEXT_TX_BD(j)) { tx_buf = &txr->tx_buf_ring[TX_RING_IDX(j)]; dma_unmap_page(&bp->pdev->dev, dma_unmap_addr(tx_buf, mapping), From 9b9f676623b92483d890d1a3a52611c09fc190f3 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 12 Jul 2012 03:39:11 +0000 Subject: [PATCH 071/135] sch_sfb: Fix missing NULL check [ Upstream commit 7ac2908e4b2edaec60e9090ddb4d9ceb76c05e7d ] Resolves-bug: https://bugzilla.kernel.org/show_bug.cgi?id=44461 Signed-off-by: Alan Cox Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_sfb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 47ee29fad350..e85b248773e7 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -556,6 +556,8 @@ static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) sch->qstats.backlog = q->qdisc->qstats.backlog; opts = nla_nest_start(skb, TCA_OPTIONS); + if (opts == NULL) + goto nla_put_failure; NLA_PUT(skb, TCA_SFB_PARMS, sizeof(opt), &opt); return nla_nest_end(skb, opts); From 2f890d2777247beb207be1c99835a0c5e09d340c Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 16 Jul 2012 09:13:51 +0000 Subject: [PATCH 072/135] sctp: Fix list corruption resulting from freeing an association on a list [ Upstream commit 2eebc1e188e9e45886ee00662519849339884d6d ] A few days ago Dave Jones reported this oops: [22766.294255] general protection fault: 0000 [#1] PREEMPT SMP [22766.295376] CPU 0 [22766.295384] Modules linked in: [22766.387137] ffffffffa169f292 6b6b6b6b6b6b6b6b ffff880147c03a90 ffff880147c03a74 [22766.387135] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 00000000000 [22766.387136] Process trinity-watchdo (pid: 10896, threadinfo ffff88013e7d2000, [22766.387137] Stack: [22766.387140] ffff880147c03a10 [22766.387140] ffffffffa169f2b6 [22766.387140] ffff88013ed95728 [22766.387143] 0000000000000002 [22766.387143] 0000000000000000 [22766.387143] ffff880003fad062 [22766.387144] ffff88013c120000 [22766.387144] [22766.387145] Call Trace: [22766.387145] [22766.387150] [] ? __sctp_lookup_association+0x62/0xd0 [sctp] [22766.387154] [] __sctp_lookup_association+0x86/0xd0 [sctp] [22766.387157] [] sctp_rcv+0x207/0xbb0 [sctp] [22766.387161] [] ? trace_hardirqs_off_caller+0x28/0xd0 [22766.387163] [] ? nf_hook_slow+0x133/0x210 [22766.387166] [] ? ip_local_deliver_finish+0x4c/0x4c0 [22766.387168] [] ip_local_deliver_finish+0x18d/0x4c0 [22766.387169] [] ? ip_local_deliver_finish+0x4c/0x4c0 [22766.387171] [] ip_local_deliver+0x47/0x80 [22766.387172] [] ip_rcv_finish+0x150/0x680 [22766.387174] [] ip_rcv+0x214/0x320 [22766.387176] [] __netif_receive_skb+0x7b7/0x910 [22766.387178] [] ? __netif_receive_skb+0x11c/0x910 [22766.387180] [] ? put_lock_stats.isra.25+0xe/0x40 [22766.387182] [] netif_receive_skb+0x23/0x1f0 [22766.387183] [] ? dev_gro_receive+0x139/0x440 [22766.387185] [] napi_skb_finish+0x70/0xa0 [22766.387187] [] napi_gro_receive+0xf5/0x130 [22766.387218] [] e1000_receive_skb+0x59/0x70 [e1000e] [22766.387242] [] e1000_clean_rx_irq+0x28b/0x460 [e1000e] [22766.387266] [] e1000e_poll+0x78/0x430 [e1000e] [22766.387268] [] net_rx_action+0x1aa/0x3d0 [22766.387270] [] ? account_system_vtime+0x10f/0x130 [22766.387273] [] __do_softirq+0xe0/0x420 [22766.387275] [] call_softirq+0x1c/0x30 [22766.387278] [] do_softirq+0xd5/0x110 [22766.387279] [] irq_exit+0xd5/0xe0 [22766.387281] [] do_IRQ+0x63/0xd0 [22766.387283] [] common_interrupt+0x6f/0x6f [22766.387283] [22766.387284] [22766.387285] [] ? retint_swapgs+0x13/0x1b [22766.387285] Code: c0 90 5d c3 66 0f 1f 44 00 00 4c 89 c8 5d c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89 5d e8 4c 89 65 f0 4c 89 6d f8 66 66 66 66 90 <0f> b7 87 98 00 00 00 48 89 fb 49 89 f5 66 c1 c0 08 66 39 46 02 [22766.387307] [22766.387307] RIP [22766.387311] [] sctp_assoc_is_match+0x19/0x90 [sctp] [22766.387311] RSP [22766.387142] ffffffffa16ab120 [22766.599537] ---[ end trace 3f6dae82e37b17f5 ]--- [22766.601221] Kernel panic - not syncing: Fatal exception in interrupt It appears from his analysis and some staring at the code that this is likely occuring because an association is getting freed while still on the sctp_assoc_hashtable. As a result, we get a gpf when traversing the hashtable while a freed node corrupts part of the list. Nominally I would think that an mibalanced refcount was responsible for this, but I can't seem to find any obvious imbalance. What I did note however was that the two places where we create an association using sctp_primitive_ASSOCIATE (__sctp_connect and sctp_sendmsg), have failure paths which free a newly created association after calling sctp_primitive_ASSOCIATE. sctp_primitive_ASSOCIATE brings us into the sctp_sf_do_prm_asoc path, which issues a SCTP_CMD_NEW_ASOC side effect, which in turn adds a new association to the aforementioned hash table. the sctp command interpreter that process side effects has not way to unwind previously processed commands, so freeing the association from the __sctp_connect or sctp_sendmsg error path would lead to a freed association remaining on this hash table. I've fixed this but modifying sctp_[un]hash_established to use hlist_del_init, which allows us to proerly use hlist_unhashed to check if the node is on a hashlist safely during a delete. That in turn alows us to safely call sctp_unhash_established in the __sctp_connect and sctp_sendmsg error paths before freeing them, regardles of what the associations state is on the hash list. I noted, while I was doing this, that the __sctp_unhash_endpoint was using hlist_unhsashed in a simmilar fashion, but never nullified any removed nodes pointers to make that function work properly, so I fixed that up in a simmilar fashion. I attempted to test this using a virtual guest running the SCTP_RR test from netperf in a loop while running the trinity fuzzer, both in a loop. I wasn't able to recreate the problem prior to this fix, nor was I able to trigger the failure after (neither of which I suppose is suprising). Given the trace above however, I think its likely that this is what we hit. Signed-off-by: Neil Horman Reported-by: davej@redhat.com CC: davej@redhat.com CC: "David S. Miller" CC: Vlad Yasevich CC: Sridhar Samudrala CC: linux-sctp@vger.kernel.org Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/input.c | 7 ++----- net/sctp/socket.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/net/sctp/input.c b/net/sctp/input.c index 741ed1648838..cd9eded3bb0e 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -737,15 +737,12 @@ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) epb = &ep->base; - if (hlist_unhashed(&epb->node)) - return; - epb->hashent = sctp_ep_hashfn(epb->bind_addr.port); head = &sctp_ep_hashtable[epb->hashent]; sctp_write_lock(&head->lock); - __hlist_del(&epb->node); + hlist_del_init(&epb->node); sctp_write_unlock(&head->lock); } @@ -826,7 +823,7 @@ static void __sctp_unhash_established(struct sctp_association *asoc) head = &sctp_assoc_hashtable[epb->hashent]; sctp_write_lock(&head->lock); - __hlist_del(&epb->node); + hlist_del_init(&epb->node); sctp_write_unlock(&head->lock); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 4434853a9fe7..b70a3ee6016c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1160,8 +1160,14 @@ out_free: SCTP_DEBUG_PRINTK("About to exit __sctp_connect() free asoc: %p" " kaddrs: %p err: %d\n", asoc, kaddrs, err); - if (asoc) + if (asoc) { + /* sctp_primitive_ASSOCIATE may have added this association + * To the hash table, try to unhash it, just in case, its a noop + * if it wasn't hashed so we're safe + */ + sctp_unhash_established(asoc); sctp_association_free(asoc); + } return err; } @@ -1871,8 +1877,10 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, goto out_unlock; out_free: - if (new_asoc) + if (new_asoc) { + sctp_unhash_established(asoc); sctp_association_free(asoc); + } out_unlock: sctp_release_sock(sk); From 22cb83b5a318697b09fe1d6e237703d8371ab1fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sjur=20Br=C3=A6ndeland?= Date: Sun, 15 Jul 2012 10:10:14 +0000 Subject: [PATCH 073/135] caif: Fix access to freed pernet memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 96f80d123eff05c3cd4701463786b87952a6c3ac ] unregister_netdevice_notifier() must be called before unregister_pernet_subsys() to avoid accessing already freed pernet memory. This fixes the following oops when doing rmmod: Call Trace: [] caif_device_notify+0x4d/0x5a0 [caif] [] unregister_netdevice_notifier+0xb9/0x100 [] caif_device_exit+0x1c/0x250 [caif] [] sys_delete_module+0x1a4/0x300 [] ? trace_hardirqs_on_caller+0x15d/0x1e0 [] ? trace_hardirqs_on_thunk+0x3a/0x3 [] system_call_fastpath+0x1a/0x1f RIP [] caif_get+0x51/0xb0 [caif] Signed-off-by: Sjur Brændeland Acked-by: "Eric W. Biederman" Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/caif/caif_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 5ba4366a220d..804e50f18a5b 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -424,9 +424,9 @@ static int __init caif_device_init(void) static void __exit caif_device_exit(void) { - unregister_pernet_subsys(&caif_net_ops); unregister_netdevice_notifier(&caif_device_notifier); dev_remove_pack(&caif_packet_type); + unregister_pernet_subsys(&caif_net_ops); } module_init(caif_device_init); From bca8ae51a3761a9fc4795641541a3478001975c1 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 17 Jul 2012 11:07:47 +0000 Subject: [PATCH 074/135] cipso: don't follow a NULL pointer when setsockopt() is called [ Upstream commit 89d7ae34cdda4195809a5a987f697a517a2a3177 ] As reported by Alan Cox, and verified by Lin Ming, when a user attempts to add a CIPSO option to a socket using the CIPSO_V4_TAG_LOCAL tag the kernel dies a terrible death when it attempts to follow a NULL pointer (the skb argument to cipso_v4_validate() is NULL when called via the setsockopt() syscall). This patch fixes this by first checking to ensure that the skb is non-NULL before using it to find the incoming network interface. In the unlikely case where the skb is NULL and the user attempts to add a CIPSO option with the _TAG_LOCAL tag we return an error as this is not something we want to allow. A simple reproducer, kindly supplied by Lin Ming, although you must have the CIPSO DOI #3 configure on the system first or you will be caught early in cipso_v4_validate(): #include #include #include #include #include struct local_tag { char type; char length; char info[4]; }; struct cipso { char type; char length; char doi[4]; struct local_tag local; }; int main(int argc, char **argv) { int sockfd; struct cipso cipso = { .type = IPOPT_CIPSO, .length = sizeof(struct cipso), .local = { .type = 128, .length = sizeof(struct local_tag), }, }; memset(cipso.doi, 0, 4); cipso.doi[3] = 3; sockfd = socket(AF_INET, SOCK_DGRAM, 0); #define SOL_IP 0 setsockopt(sockfd, SOL_IP, IP_OPTIONS, &cipso, sizeof(struct cipso)); return 0; } CC: Lin Ming Reported-by: Alan Cox Signed-off-by: Paul Moore Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/cipso_ipv4.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 2b3c23c287cd..062876b77308 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1725,8 +1725,10 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) case CIPSO_V4_TAG_LOCAL: /* This is a non-standard tag that we only allow for * local connections, so if the incoming interface is - * not the loopback device drop the packet. */ - if (!(skb->dev->flags & IFF_LOOPBACK)) { + * not the loopback device drop the packet. Further, + * there is no legitimate reason for setting this from + * userspace so reject it if skb is NULL. */ + if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) { err_offset = opt_iter; goto validate_return_locked; } From 4b53a23467b73b472f276d5530a103a736fe8ae1 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 24 Jul 2012 02:42:14 +0000 Subject: [PATCH 075/135] caif: fix NULL pointer check [ Upstream commit c66b9b7d365444b433307ebb18734757cb668a02 ] Reported-by: Resolves-bug: http://bugzilla.kernel.org/show_bug?44441 Signed-off-by: Alan Cox Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/caif/caif_serial.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c index 3df0c0f8b8bf..82b1802b1d3d 100644 --- a/drivers/net/caif/caif_serial.c +++ b/drivers/net/caif/caif_serial.c @@ -325,6 +325,9 @@ static int ldisc_open(struct tty_struct *tty) sprintf(name, "cf%s", tty->name); dev = alloc_netdev(sizeof(*ser), name, caifdev_setup); + if (!dev) + return -ENOMEM; + ser = netdev_priv(dev); ser->tty = tty_kref_get(tty); ser->dev = dev; From 8a22bda491f4fb197c31c2d50c18a816be494a75 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 24 Jul 2012 08:16:25 +0000 Subject: [PATCH 076/135] wanmain: comparing array with NULL [ Upstream commit 8b72ff6484fe303e01498b58621810a114f3cf09 ] gcc really should warn about these ! Signed-off-by: Alan Cox Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/wanrouter/wanmain.c | 47 ++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 788a12c1eb5d..2ab785064b7e 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -602,36 +602,31 @@ static int wanrouter_device_new_if(struct wan_device *wandev, * successfully, add it to the interface list. */ - if (dev->name == NULL) { - err = -EINVAL; - } else { +#ifdef WANDEBUG + printk(KERN_INFO "%s: registering interface %s...\n", + wanrouter_modname, dev->name); +#endif - #ifdef WANDEBUG - printk(KERN_INFO "%s: registering interface %s...\n", - wanrouter_modname, dev->name); - #endif + err = register_netdev(dev); + if (!err) { + struct net_device *slave = NULL; + unsigned long smp_flags=0; - err = register_netdev(dev); - if (!err) { - struct net_device *slave = NULL; - unsigned long smp_flags=0; + lock_adapter_irq(&wandev->lock, &smp_flags); - lock_adapter_irq(&wandev->lock, &smp_flags); - - if (wandev->dev == NULL) { - wandev->dev = dev; - } else { - for (slave=wandev->dev; - DEV_TO_SLAVE(slave); - slave = DEV_TO_SLAVE(slave)) - DEV_TO_SLAVE(slave) = dev; - } - ++wandev->ndev; - - unlock_adapter_irq(&wandev->lock, &smp_flags); - err = 0; /* done !!! */ - goto out; + if (wandev->dev == NULL) { + wandev->dev = dev; + } else { + for (slave=wandev->dev; + DEV_TO_SLAVE(slave); + slave = DEV_TO_SLAVE(slave)) + DEV_TO_SLAVE(slave) = dev; } + ++wandev->ndev; + + unlock_adapter_irq(&wandev->lock, &smp_flags); + err = 0; /* done !!! */ + goto out; } if (wandev->del_if) wandev->del_if(wandev, dev); From 8d7c99de6821880884e6f9ade1c6f269d40ebbae Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 26 Jul 2012 22:52:21 +0000 Subject: [PATCH 077/135] tcp: Add TCP_USER_TIMEOUT negative value check [ Upstream commit 42493570100b91ef663c4c6f0c0fdab238f9d3c2 ] TCP_USER_TIMEOUT is a TCP level socket option that takes an unsigned int. But patch "tcp: Add TCP_USER_TIMEOUT socket option"(dca43c75) didn't check the negative values. If a user assign -1 to it, the socket will set successfully and wait for 4294967295 miliseconds. This patch add a negative value check to avoid this issue. Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6db041d3284c..b6ec23c7ffc5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2394,7 +2394,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, /* Cap the max timeout in ms TCP will retry/retrans * before giving up and aborting (ETIMEDOUT) a connection. */ - icsk->icsk_user_timeout = msecs_to_jiffies(val); + if (val < 0) + err = -EINVAL; + else + icsk->icsk_user_timeout = msecs_to_jiffies(val); break; default: err = -ENOPROTOOPT; From 242e0e14c3995231230b2836bcd1f5dc6c08ff90 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 27 Jul 2012 01:46:51 +0000 Subject: [PATCH 078/135] USB: kaweth.c: use GFP_ATOMIC under spin_lock [ Upstream commit e4c7f259c5be99dcfc3d98f913590663b0305bf8 ] The problem is that we call this with a spin lock held. The call tree is: kaweth_start_xmit() holds kaweth->device_lock. -> kaweth_async_set_rx_mode() -> kaweth_control() -> kaweth_internal_control_msg() The kaweth_internal_control_msg() function is only called from kaweth_control() which used GFP_ATOMIC for its allocations. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/kaweth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c index ad0298f9b5f9..3362449a2d9b 100644 --- a/drivers/net/usb/kaweth.c +++ b/drivers/net/usb/kaweth.c @@ -1308,7 +1308,7 @@ static int kaweth_internal_control_msg(struct usb_device *usb_dev, int retv; int length = 0; /* shut up GCC */ - urb = usb_alloc_urb(0, GFP_NOIO); + urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) return -ENOMEM; From c94eb3f964dd7c08486d7c1227c7b5b69d09aabe Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 27 Jul 2012 02:58:22 +0000 Subject: [PATCH 079/135] net: fix rtnetlink IFF_PROMISC and IFF_ALLMULTI handling [ Upstream commit b1beb681cba5358f62e6187340660ade226a5fcc ] When device flags are set using rtnetlink, IFF_PROMISC and IFF_ALLMULTI flags are handled specially. Function dev_change_flags sets IFF_PROMISC and IFF_ALLMULTI bits in dev->gflags according to the passed value but do_setlink passes a result of rtnl_dev_combine_flags which takes those bits from dev->flags. This can be easily trigerred by doing: tcpdump -i eth0 & ip l s up eth0 ip sets IFF_UP flag in ifi_flags and ifi_change, which is combined with IFF_PROMISC by rtnl_dev_combine_flags, causing __dev_change_flags to set IFF_PROMISC in gflags. Reported-by: Max Matveev Signed-off-by: Jiri Benc Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/rtnetlink.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index abd936d8a716..861d53f5f13b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -647,6 +647,12 @@ static void set_operstate(struct net_device *dev, unsigned char transition) } } +static unsigned int rtnl_dev_get_flags(const struct net_device *dev) +{ + return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) | + (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); +} + static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, const struct ifinfomsg *ifm) { @@ -655,7 +661,7 @@ static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ if (ifm->ifi_change) flags = (flags & ifm->ifi_change) | - (dev->flags & ~ifm->ifi_change); + (rtnl_dev_get_flags(dev) & ~ifm->ifi_change); return flags; } From 41f079a0e1089bf0ed2b795cc427bf40b72efe0e Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 27 Jul 2012 10:38:50 +0000 Subject: [PATCH 080/135] tcp: perform DMA to userspace only if there is a task waiting for it [ Upstream commit 59ea33a68a9083ac98515e4861c00e71efdc49a1 ] Back in 2006, commit 1a2449a87b ("[I/OAT]: TCP recv offload to I/OAT") added support for receive offloading to IOAT dma engine if available. The code in tcp_rcv_established() tries to perform early DMA copy if applicable. It however does so without checking whether the userspace task is actually expecting the data in the buffer. This is not a problem under normal circumstances, but there is a corner case where this doesn't work -- and that's when MSG_TRUNC flag to recvmsg() is used. If the IOAT dma engine is not used, the code properly checks whether there is a valid ucopy.task and the socket is owned by userspace, but misses the check in the dmaengine case. This problem can be observed in real trivially -- for example 'tbench' is a good reproducer, as it makes a heavy use of MSG_TRUNC. On systems utilizing IOAT, you will soon find tbench waiting indefinitely in sk_wait_data(), as they have been already early-copied in tcp_rcv_established() using dma engine. This patch introduces the same check we are performing in the simple iovec copy case to the IOAT case as well. It fixes the indefinite recvmsg(MSG_TRUNC) hangs. Signed-off-by: Jiri Kosina Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6e33b79cb688..b76aa2d9624f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5340,7 +5340,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len) { #ifdef CONFIG_NET_DMA - if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { + if (tp->ucopy.task == current && + sock_owned_by_user(sk) && + tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { copied_early = 1; eaten = 1; } From 4e98953723643ac9db9191381f52973a8f113902 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 29 Jul 2012 19:45:14 +0000 Subject: [PATCH 081/135] net/tun: fix ioctl() based info leaks [ Upstream commits a117dacde0288f3ec60b6e5bcedae8fa37ee0dfc and 8bbb181308bc348e02bfdbebdedd4e4ec9d452ce ] The tun module leaks up to 36 bytes of memory by not fully initializing a structure located on the stack that gets copied to user memory by the TUNGETIFF and SIOCGIFHWADDR ioctl()s. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/tun.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index fb50e5a542fd..a631bf71fee5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1239,10 +1239,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, int vnet_hdr_sz; int ret; - if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) + if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; - + } else { + memset(&ifr, 0, sizeof(ifr)); + } if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on From aeaab8a0fee896fb31f2e5044f86d2a270d65cb6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 30 Jul 2012 16:06:42 +0100 Subject: [PATCH 082/135] USB: echi-dbgp: increase the controller wait time to come out of halt. commit f96a4216e85050c0a9d41a41ecb0ae9d8e39b509 upstream. The default 10 microsecond delay for the controller to come out of halt in dbgp_ehci_startup is too short, so increase it to 1 millisecond. This is based on emperical testing on various USB debug ports on modern machines such as a Lenovo X220i and an Ivybridge development platform that needed to wait ~450-950 microseconds. Signed-off-by: Colin Ian King Signed-off-by: Jason Wessel Signed-off-by: Greg Kroah-Hartman --- drivers/usb/early/ehci-dbgp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/early/ehci-dbgp.c b/drivers/usb/early/ehci-dbgp.c index 1fc8f1249806..347bb058e1ee 100644 --- a/drivers/usb/early/ehci-dbgp.c +++ b/drivers/usb/early/ehci-dbgp.c @@ -450,7 +450,7 @@ static int dbgp_ehci_startup(void) writel(FLAG_CF, &ehci_regs->configured_flag); /* Wait until the controller is no longer halted */ - loop = 10; + loop = 1000; do { status = readl(&ehci_regs->status); if (!(status & STS_HALT)) From f45cd6dfe00503a2ff49920bf8365a55ea69edbb Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 1 Aug 2012 10:16:53 +0200 Subject: [PATCH 083/135] ALSA: snd-usb: fix clock source validity index commit aff252a848ce21b431ba822de3dab9c4c94571cb upstream. uac_clock_source_is_valid() uses the control selector value to access the bmControls bitmap of the clock source unit. This is wrong, as control selector values start from 1, while the bitmap uses all available bits. In other words, "Clock Validity Control" is stored in D3..2, not D5..4 of the clock selector unit's bmControls. Signed-off-by: Daniel Mack Reported-by: Andreas Koch Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/clock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/usb/clock.c b/sound/usb/clock.c index 075195e8661a..f0ff776579ba 100644 --- a/sound/usb/clock.c +++ b/sound/usb/clock.c @@ -111,7 +111,8 @@ static bool uac_clock_source_is_valid(struct snd_usb_audio *chip, int source_id) return 0; /* If a clock source can't tell us whether it's valid, we assume it is */ - if (!uac2_control_is_readable(cs_desc->bmControls, UAC2_CS_CONTROL_CLOCK_VALID)) + if (!uac2_control_is_readable(cs_desc->bmControls, + UAC2_CS_CONTROL_CLOCK_VALID - 1)) return 1; err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), UAC2_CS_CUR, From de4bc9fc942526dbcb0246dbb2ca87ad9b3b02b0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 23 Jul 2012 11:35:55 +0200 Subject: [PATCH 084/135] ALSA: mpu401: Fix missing initialization of irq field commit bc733d495267a23ef8660220d696c6e549ce30b3 upstream. The irq field of struct snd_mpu401 is supposed to be initialized to -1. Since it's set to zero as of now, a probing error before the irq installation results in a kernel warning "Trying to free already-free IRQ 0". Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=44821 Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/drivers/mpu401/mpu401_uart.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/drivers/mpu401/mpu401_uart.c b/sound/drivers/mpu401/mpu401_uart.c index 2af09996a3d0..74f5a3da87e1 100644 --- a/sound/drivers/mpu401/mpu401_uart.c +++ b/sound/drivers/mpu401/mpu401_uart.c @@ -554,6 +554,7 @@ int snd_mpu401_uart_new(struct snd_card *card, int device, spin_lock_init(&mpu->output_lock); spin_lock_init(&mpu->timer_lock); mpu->hardware = hardware; + mpu->irq = -1; if (! (info_flags & MPU401_INFO_INTEGRATED)) { int res_size = hardware == MPU401_HW_PC98II ? 4 : 2; mpu->res = request_region(port, res_size, "MPU401 UART"); From 4ae4c20ceb593c1a8f392bb1100cc60a1f04ee4c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 30 Jul 2012 18:24:19 +0100 Subject: [PATCH 085/135] ASoC: wm8962: Allow VMID time to fully ramp commit 9d40e5582c9c4cfb6977ba2a0ca9c2ed82c56f21 upstream. Required for reliable power up from cold. Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/wm8962.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/codecs/wm8962.c b/sound/soc/codecs/wm8962.c index c850e3d84ed0..f16f587dfb0e 100644 --- a/sound/soc/codecs/wm8962.c +++ b/sound/soc/codecs/wm8962.c @@ -2890,6 +2890,9 @@ static int wm8962_set_bias_level(struct snd_soc_codec *codec, /* VMID 2*250k */ snd_soc_update_bits(codec, WM8962_PWR_MGMT_1, WM8962_VMID_SEL_MASK, 0x100); + + if (codec->dapm.bias_level == SND_SOC_BIAS_OFF) + msleep(100); break; case SND_SOC_BIAS_OFF: From 3b6ae1807d29f8cacd63fddf513110f9308b4c8e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 22 Jun 2012 17:21:17 +0100 Subject: [PATCH 086/135] ASoC: wm8994: Ensure there are enough BCLKs for four channels commit b8edf3e5522735c8ce78b81845f7a1a2d4a08626 upstream. Otherwise if someone tries to use all four channels on AIF1 with the device in master mode we won't be able to clock out all the data. Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/wm8994.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index 219491237875..1f7616d03fff 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -2127,7 +2127,7 @@ static int wm8994_hw_params(struct snd_pcm_substream *substream, return -EINVAL; } - bclk_rate = params_rate(params) * 2; + bclk_rate = params_rate(params) * 4; switch (params_format(params)) { case SNDRV_PCM_FORMAT_S16_LE: bclk_rate *= 16; From d3be3eeedbc5b39f93b27c6ece2879c1d417eed5 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Sat, 28 Jul 2012 00:20:34 +0200 Subject: [PATCH 087/135] m68k: Make sys_atomic_cmpxchg_32 work on classic m68k commit 9e2760d18b3cf179534bbc27692c84879c61b97c upstream. User space access must always go through uaccess accessors, since on classic m68k user space and kernel space are completely separate. Signed-off-by: Andreas Schwab Tested-by: Thorsten Glaser Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- arch/m68k/kernel/sys_m68k.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index 8623f8dc16f8..9a5932ec3689 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -479,9 +479,13 @@ sys_atomic_cmpxchg_32(unsigned long newval, int oldval, int d3, int d4, int d5, goto bad_access; } - mem_value = *mem; + /* + * No need to check for EFAULT; we know that the page is + * present and writable. + */ + __get_user(mem_value, mem); if (mem_value == oldval) - *mem = newval; + __put_user(newval, mem); pte_unmap_unlock(pte, ptl); up_read(&mm->mmap_sem); From e3d8d77f515ca7aa4896f1ed9b8e24a487225109 Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Thu, 19 Apr 2012 00:53:36 +0200 Subject: [PATCH 088/135] m68k: Correct the Atari ALLOWINT definition commit c663600584a596b5e66258cc10716fb781a5c2c9 upstream. Booting a 3.2, 3.3, or 3.4-rc4 kernel on an Atari using the `nfeth' ethernet device triggers a WARN_ONCE() in generic irq handling code on the first irq for that device: WARNING: at kernel/irq/handle.c:146 handle_irq_event_percpu+0x134/0x142() irq 3 handler nfeth_interrupt+0x0/0x194 enabled interrupts Modules linked in: Call Trace: [<000299b2>] warn_slowpath_common+0x48/0x6a [<000299c0>] warn_slowpath_common+0x56/0x6a [<00029a4c>] warn_slowpath_fmt+0x2a/0x32 [<0005b34c>] handle_irq_event_percpu+0x134/0x142 [<0005b34c>] handle_irq_event_percpu+0x134/0x142 [<0000a584>] nfeth_interrupt+0x0/0x194 [<001ba0a8>] schedule_preempt_disabled+0x0/0xc [<0005b37a>] handle_irq_event+0x20/0x2c [<0005add4>] generic_handle_irq+0x2c/0x3a [<00002ab6>] do_IRQ+0x20/0x32 [<0000289e>] auto_irqhandler_fixup+0x4/0x6 [<00003144>] cpu_idle+0x22/0x2e [<001b8a78>] printk+0x0/0x18 [<0024d112>] start_kernel+0x37a/0x386 [<0003021d>] __do_proc_dointvec+0xb1/0x366 [<0003021d>] __do_proc_dointvec+0xb1/0x366 [<0024c31e>] _sinittext+0x31e/0x9c0 After invoking the irq's handler the kernel sees !irqs_disabled() and concludes that the handler erroneously enabled interrupts. However, debugging shows that !irqs_disabled() is true even before the handler is invoked, which indicates a problem in the platform code rather than the specific driver. The warning does not occur in 3.1 or older kernels. It turns out that the ALLOWINT definition for Atari is incorrect. The Atari definition of ALLOWINT is ~0x400, the stated purpose of that is to avoid taking HSYNC interrupts. irqs_disabled() returns true if the 3-bit ipl & 4 is non-zero. The nfeth interrupt runs at ipl 3 (it's autovector 3), but 3 & 4 is zero so irqs_disabled() is false, and the warning above is generated. When interrupts are explicitly disabled, ipl is set to 7. When they are enabled, ipl is masked with ALLOWINT. On Atari this will result in ipl = 3, which blocks interrupts at ipl 3 and below. So how come nfeth interrupts at ipl 3 are received at all? That's because ipl is reset to 2 by Atari-specific code in default_idle(), again with the stated purpose of blocking HSYNC interrupts. This discrepancy means that ipl 3 can remain blocked for longer than intended. Both default_idle() and falcon_hblhandler() identify HSYNC with ipl 2, and the "Atari ST/.../F030 Hardware Register Listing" agrees, but ALLOWINT is defined as if HSYNC was ipl 3. [As an experiment I modified default_idle() to reset ipl to 3, and as expected that resulted in all nfeth interrupts being blocked.] The fix is simple: define ALLOWINT as ~0x500 instead. This makes arch_local_irq_enable() consistent with default_idle(), and prevents the !irqs_disabled() problems for ipl 3 interrupts. Tested on Atari running in an Aranym VM. Signed-off-by: Mikael Pettersson Tested-by: Michael Schmitz Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- arch/m68k/include/asm/entry_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/m68k/include/asm/entry_mm.h b/arch/m68k/include/asm/entry_mm.h index 73b8c8fbed9c..bdace4bb29d8 100644 --- a/arch/m68k/include/asm/entry_mm.h +++ b/arch/m68k/include/asm/entry_mm.h @@ -35,8 +35,8 @@ /* the following macro is used when enabling interrupts */ #if defined(MACH_ATARI_ONLY) - /* block out HSYNC on the atari */ -#define ALLOWINT (~0x400) + /* block out HSYNC = ipl 2 on the atari */ +#define ALLOWINT (~0x500) #define MAX_NOINT_IPL 3 #else /* portable version */ From bc16cc3950aed91b84c9a7071af2c72abee91660 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 20 Jul 2012 11:53:29 -0700 Subject: [PATCH 089/135] futex: Test for pi_mutex on fault in futex_wait_requeue_pi() commit b6070a8d9853eda010a549fa9a09eb8d7269b929 upstream. If fixup_pi_state_owner() faults, pi_mutex may be NULL. Test for pi_mutex != NULL before testing the owner against current and possibly unlocking it. Signed-off-by: Darren Hart Cc: Dave Jones Cc: Dan Carpenter Link: http://lkml.kernel.org/r/dc59890338fc413606f04e5c5b131530734dae3d.1342809673.git.dvhart@linux.intel.com Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/futex.c b/kernel/futex.c index 11e8924b6ee5..260e1d6e5f66 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2370,7 +2370,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * fault, unlock the rt_mutex and return the fault to userspace. */ if (ret == -EFAULT) { - if (rt_mutex_owner(pi_mutex) == current) + if (pi_mutex && rt_mutex_owner(pi_mutex) == current) rt_mutex_unlock(pi_mutex); } else if (ret == -EINTR) { /* From 7367fdb4987147438ca8cd69d6ae021ffc646c57 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 20 Jul 2012 11:53:30 -0700 Subject: [PATCH 090/135] futex: Fix bug in WARN_ON for NULL q.pi_state commit f27071cb7fe3e1d37a9dbe6c0dfc5395cd40fa43 upstream. The WARN_ON in futex_wait_requeue_pi() for a NULL q.pi_state was testing the address (&q.pi_state) of the pointer instead of the value (q.pi_state) of the pointer. Correct it accordingly. Signed-off-by: Darren Hart Cc: Dave Jones Link: http://lkml.kernel.org/r/1c85d97f6e5f79ec389a4ead3e367363c74bd09a.1342809673.git.dvhart@linux.intel.com Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/futex.c b/kernel/futex.c index 260e1d6e5f66..ca308210f446 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2343,7 +2343,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * signal. futex_unlock_pi() will not destroy the lock_ptr nor * the pi_state. */ - WARN_ON(!&q.pi_state); + WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); debug_rt_mutex_free_waiter(&rt_waiter); From b7a06be61b78c66860897ee2a0f9e23845f1f438 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 20 Jul 2012 11:53:31 -0700 Subject: [PATCH 091/135] futex: Forbid uaddr == uaddr2 in futex_wait_requeue_pi() commit 6f7b0a2a5c0fb03be7c25bd1745baa50582348ef upstream. If uaddr == uaddr2, then we have broken the rule of only requeueing from a non-pi futex to a pi futex with this call. If we attempt this, as the trinity test suite manages to do, we miss early wakeups as q.key is equal to key2 (because they are the same uaddr). We will then attempt to dereference the pi_mutex (which would exist had the futex_q been properly requeued to a pi futex) and trigger a NULL pointer dereference. Signed-off-by: Darren Hart Cc: Dave Jones Link: http://lkml.kernel.org/r/ad82bfe7f7d130247fbe2b5b4275654807774227.1342809673.git.dvhart@linux.intel.com Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index ca308210f446..24bc59c8867f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * @uaddr2: the pi futex we will take prior to returning to user-space * * The caller will wait on uaddr and will be requeued by futex_requeue() to - * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and - * complete the acquisition of the rt_mutex prior to returning to userspace. - * This ensures the rt_mutex maintains an owner when it has waiters; without - * one, the pi logic wouldn't know which task to boost/deboost, if there was a - * need to. + * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake + * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to + * userspace. This ensures the rt_mutex maintains an owner when it has waiters; + * without one, the pi logic would not know which task to boost/deboost, if + * there was a need to. * * We call schedule in futex_wait_queue_me() when we enqueue and return there * via the following: @@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (uaddr == uaddr2) + return -EINVAL; + if (!bitset) return -EINVAL; From b09b34258046c4555e535a279e29032303a932f8 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 9 Aug 2012 08:28:18 -0700 Subject: [PATCH 092/135] Linux 3.0.40 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3ec1722025f3..ec4fee552b51 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 0 -SUBLEVEL = 39 +SUBLEVEL = 40 EXTRAVERSION = NAME = Sneaky Weasel From 22268214261ad64783cbad1d9c1df227302e5ed1 Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Tue, 14 Aug 2012 21:26:54 -0300 Subject: [PATCH 093/135] x86: Simplify code by removing a !SMP #ifdefs from 'struct cpuinfo_x86' commit 141168c36cdee3ff23d9c7700b0edc47cb65479f and commit 3f806e50981825fa56a7f1938f24c0680816be45 upstream. Several fields in struct cpuinfo_x86 were not defined for the !SMP case, likely to save space. However, those fields still have some meaning for UP, and keeping them allows some #ifdef removal from other files. The additional size of the UP kernel from this change is not significant enough to worry about keeping up the distinction: text data bss dec hex filename 4737168 506459 972040 6215667 5ed7f3 vmlinux.o.before 4737444 506459 972040 6215943 5ed907 vmlinux.o.after for a difference of 276 bytes for an example UP config. If someone wants those 276 bytes back badly then it should be implemented in a cleaner way. Signed-off-by: Kevin Winchester Cc: Steffen Persvold Link: http://lkml.kernel.org/r/1324428742-12498-1-git-send-email-kjwinchester@gmail.com Signed-off-by: Ingo Molnar Signed-off-by: Borislav Petkov Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/processor.h | 2 -- arch/x86/kernel/amd_nb.c | 8 ++------ arch/x86/kernel/cpu/amd.c | 2 -- arch/x86/kernel/cpu/common.c | 5 ----- arch/x86/kernel/cpu/intel.c | 2 -- arch/x86/kernel/cpu/mcheck/mce.c | 2 -- arch/x86/kernel/cpu/mcheck/mce_amd.c | 5 +---- arch/x86/kernel/cpu/proc.c | 4 +--- drivers/edac/i7core_edac.c | 2 -- drivers/hwmon/coretemp.c | 7 +++---- 10 files changed, 7 insertions(+), 32 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5d9c61d0b270..e5f724834edb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -99,7 +99,6 @@ struct cpuinfo_x86 { u16 apicid; u16 initial_apicid; u16 x86_clflush_size; -#ifdef CONFIG_SMP /* number of cores as seen by the OS: */ u16 booted_cores; /* Physical processor id: */ @@ -110,7 +109,6 @@ struct cpuinfo_x86 { u8 compute_unit_id; /* Index into per_cpu list: */ u16 cpu_index; -#endif } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index bae1efe6d515..be16854591cc 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -154,16 +154,14 @@ int amd_get_subcaches(int cpu) { struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; unsigned int mask; - int cuid = 0; + int cuid; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return 0; pci_read_config_dword(link, 0x1d4, &mask); -#ifdef CONFIG_SMP cuid = cpu_data(cpu).compute_unit_id; -#endif return (mask >> (4 * cuid)) & 0xf; } @@ -172,7 +170,7 @@ int amd_set_subcaches(int cpu, int mask) static unsigned int reset, ban; struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); unsigned int reg; - int cuid = 0; + int cuid; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) return -EINVAL; @@ -190,9 +188,7 @@ int amd_set_subcaches(int cpu, int mask) pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); } -#ifdef CONFIG_SMP cuid = cpu_data(cpu).compute_unit_id; -#endif mask <<= 4 * cuid; mask |= (0xf ^ (1 << cuid)) << 26; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b13ed393dfce..8115040e55d0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -146,7 +146,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -190,7 +189,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) valid_k7: ; -#endif } static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 22a073d7fbff..0cb288368f68 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -675,9 +675,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_early_init) this_cpu->c_early_init(c); -#ifdef CONFIG_SMP c->cpu_index = 0; -#endif filter_cpuid_features(c, false); setup_smep(c); @@ -760,10 +758,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) c->apicid = c->initial_apicid; # endif #endif - -#ifdef CONFIG_X86_HT c->phys_proc_id = c->initial_apicid; -#endif } setup_smep(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ed6086eedf1d..e0dc0005456d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -179,7 +179,6 @@ static void __cpuinit trap_init_f00f_bug(void) static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -196,7 +195,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) WARN_ONCE(1, "WARNING: SMP operation may be unreliable" "with B stepping processors.\n"); } -#endif } static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ff1ae9b6464d..942bda2fd2f8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -122,9 +122,7 @@ void mce_setup(struct mce *m) m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); -#ifdef CONFIG_SMP m->socketid = cpu_data(m->extcpu).phys_proc_id; -#endif m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index dc4fb779a724..b97aa72702f1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -65,11 +65,9 @@ struct threshold_bank { }; static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); -#ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { 0, 0, 0, 0, 1 }; -#endif static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ @@ -227,10 +225,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!block) per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP + if (shared_bank[bank] && c->cpu_core_id) break; -#endif memset(&b, 0, sizeof(b)); b.cpu = cpu; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 62ac8cb6ba27..72c365a12406 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; - unsigned int cpu = 0; + unsigned int cpu; int i; -#ifdef CONFIG_SMP cpu = c->cpu_index; -#endif seq_printf(m, "processor\t: %u\n" "vendor_id\t: %s\n" "cpu family\t: %d\n" diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index f6cf448d69b4..240966b4c7e7 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -1842,11 +1842,9 @@ static int i7core_mce_check_error(void *priv, struct mce *mce) if (mce->bank != 8) return 0; -#ifdef CONFIG_SMP /* Only handle if it is the right mc controller */ if (cpu_data(mce->cpu).phys_proc_id != pvt->i7core_dev->socket) return 0; -#endif smp_rmb(); if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) { diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index 252defdd2f13..87fd034dabd9 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -47,16 +47,15 @@ #define MAX_ATTRS 5 /* Maximum no of per-core attrs */ #define MAX_CORE_DATA (NUM_REAL_CORES + BASE_SYSFS_ATTR_NO) -#ifdef CONFIG_SMP #define TO_PHYS_ID(cpu) cpu_data(cpu).phys_proc_id #define TO_CORE_ID(cpu) cpu_data(cpu).cpu_core_id +#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) + +#ifdef CONFIG_SMP #define for_each_sibling(i, cpu) for_each_cpu(i, cpu_sibling_mask(cpu)) #else -#define TO_PHYS_ID(cpu) (cpu) -#define TO_CORE_ID(cpu) (cpu) #define for_each_sibling(i, cpu) for (i = 0; false; ) #endif -#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) /* * Per-Core Temperature Data From 5bf75ed61c9715435661971acd4e181c26f2af20 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 26 Jul 2012 10:55:26 -0700 Subject: [PATCH 094/135] Redefine ATOMIC_INIT and ATOMIC64_INIT to drop the casts commit a119365586b0130dfea06457f584953e0ff6481d upstream. The following build error occured during a ia64 build with swap-over-NFS patches applied. net/core/sock.c:274:36: error: initializer element is not constant net/core/sock.c:274:36: error: (near initialization for 'memalloc_socks') net/core/sock.c:274:36: error: initializer element is not constant This is identical to a parisc build error. Fengguang Wu, Mel Gorman and James Bottomley did all the legwork to track the root cause of the problem. This fix and entire commit log is shamelessly copied from them with one extra detail to change a dubious runtime use of ATOMIC_INIT() to atomic_set() in drivers/char/mspec.c Dave Anglin says: > Here is the line in sock.i: > > struct static_key memalloc_socks = ((struct static_key) { .enabled = > ((atomic_t) { (0) }) }); The above line contains two compound literals. It also uses a designated initializer to initialize the field enabled. A compound literal is not a constant expression. The location of the above statement isn't fully clear, but if a compound literal occurs outside the body of a function, the initializer list must consist of constant expressions. Signed-off-by: Tony Luck Signed-off-by: Greg Kroah-Hartman --- arch/ia64/include/asm/atomic.h | 4 ++-- drivers/char/mspec.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 446881439675..6fcc9a081e4c 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -18,8 +18,8 @@ #include -#define ATOMIC_INIT(i) ((atomic_t) { (i) }) -#define ATOMIC64_INIT(i) ((atomic64_t) { (i) }) +#define ATOMIC_INIT(i) { (i) } +#define ATOMIC64_INIT(i) { (i) } #define atomic_read(v) (*(volatile int *)&(v)->counter) #define atomic64_read(v) (*(volatile long *)&(v)->counter) diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index 25d139c9dbed..579051ce8545 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -284,7 +284,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, vdata->flags = flags; vdata->type = type; spin_lock_init(&vdata->lock); - vdata->refcnt = ATOMIC_INIT(1); + atomic_set(&vdata->refcnt, 1); vma->vm_private_data = vdata; vma->vm_flags |= (VM_IO | VM_RESERVED | VM_PFNMAP | VM_DONTEXPAND); From d90c97ba987ab9f1c0a1e11697269814386243b9 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 20 Jul 2012 15:57:48 +0400 Subject: [PATCH 095/135] SUNRPC: return negative value in case rpcbind client creation error commit caea33da898e4e14f0ba58173e3b7689981d2c0b upstream. Without this patch kernel will panic on LockD start, because lockd_up() checks lockd_up_net() result for negative value. From my pow it's better to return negative value from rpcbind routines instead of replacing all such checks like in lockd_up(). Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- net/sunrpc/rpcb_clnt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index e45d2fbbe5a8..bf0a7f64f000 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -193,7 +193,7 @@ static int rpcb_create_local_unix(void) if (IS_ERR(clnt)) { dprintk("RPC: failed to create AF_LOCAL rpcbind " "client (errno %ld).\n", PTR_ERR(clnt)); - result = -PTR_ERR(clnt); + result = PTR_ERR(clnt); goto out; } @@ -242,7 +242,7 @@ static int rpcb_create_local_net(void) if (IS_ERR(clnt)) { dprintk("RPC: failed to create local rpcbind " "client (errno %ld).\n", PTR_ERR(clnt)); - result = -PTR_ERR(clnt); + result = PTR_ERR(clnt); goto out; } From 85e937dcf1cc31e7f649c200ac3c3def01b54766 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 30 Jul 2012 14:42:07 -0700 Subject: [PATCH 096/135] nilfs2: fix deadlock issue between chcp and thaw ioctls commit 572d8b3945a31bee7c40d21556803e4807fd9141 upstream. An fs-thaw ioctl causes deadlock with a chcp or mkcp -s command: chcp D ffff88013870f3d0 0 1325 1324 0x00000004 ... Call Trace: nilfs_transaction_begin+0x11c/0x1a0 [nilfs2] wake_up_bit+0x20/0x20 copy_from_user+0x18/0x30 [nilfs2] nilfs_ioctl_change_cpmode+0x7d/0xcf [nilfs2] nilfs_ioctl+0x252/0x61a [nilfs2] do_page_fault+0x311/0x34c get_unmapped_area+0x132/0x14e do_vfs_ioctl+0x44b/0x490 __set_task_blocked+0x5a/0x61 vm_mmap_pgoff+0x76/0x87 __set_current_blocked+0x30/0x4a sys_ioctl+0x4b/0x6f system_call_fastpath+0x16/0x1b thaw D ffff88013870d890 0 1352 1351 0x00000004 ... Call Trace: rwsem_down_failed_common+0xdb/0x10f call_rwsem_down_write_failed+0x13/0x20 down_write+0x25/0x27 thaw_super+0x13/0x9e do_vfs_ioctl+0x1f5/0x490 vm_mmap_pgoff+0x76/0x87 sys_ioctl+0x4b/0x6f filp_close+0x64/0x6c system_call_fastpath+0x16/0x1b where the thaw ioctl deadlocked at thaw_super() when called while chcp was waiting at nilfs_transaction_begin() called from nilfs_ioctl_change_cpmode(). This deadlock is 100% reproducible. This is because nilfs_ioctl_change_cpmode() first locks sb->s_umount in read mode and then waits for unfreezing in nilfs_transaction_begin(), whereas thaw_super() locks sb->s_umount in write mode. The locking of sb->s_umount here was intended to make snapshot mounts and the downgrade of snapshots to checkpoints exclusive. This fixes the deadlock issue by replacing the sb->s_umount usage in nilfs_ioctl_change_cpmode() with a dedicated mutex which protects snapshot mounts. Signed-off-by: Ryusuke Konishi Cc: Fernando Luis Vazquez Cao Tested-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/ioctl.c | 4 ++-- fs/nilfs2/super.c | 3 +++ fs/nilfs2/the_nilfs.c | 1 + fs/nilfs2/the_nilfs.h | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 3e654273cfc2..0d1c9bdbb797 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -182,7 +182,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, if (copy_from_user(&cpmode, argp, sizeof(cpmode))) goto out; - down_read(&inode->i_sb->s_umount); + mutex_lock(&nilfs->ns_snapshot_mount_mutex); nilfs_transaction_begin(inode->i_sb, &ti, 0); ret = nilfs_cpfile_change_cpmode( @@ -192,7 +192,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, else nilfs_transaction_commit(inode->i_sb); /* never fails */ - up_read(&inode->i_sb->s_umount); + mutex_unlock(&nilfs->ns_snapshot_mount_mutex); out: mnt_drop_write(filp->f_path.mnt); return ret; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8351c44a7320..97bfbddd9fc8 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -951,6 +951,8 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, struct nilfs_root *root; int ret; + mutex_lock(&nilfs->ns_snapshot_mount_mutex); + down_read(&nilfs->ns_segctor_sem); ret = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, cno); up_read(&nilfs->ns_segctor_sem); @@ -975,6 +977,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, ret = nilfs_get_root_dentry(s, root, root_dentry); nilfs_put_root(root); out: + mutex_unlock(&nilfs->ns_snapshot_mount_mutex); return ret; } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 35a89708b635..1c98f5394dee 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -76,6 +76,7 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) nilfs->ns_bdev = bdev; atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); + mutex_init(&nilfs->ns_snapshot_mount_mutex); INIT_LIST_HEAD(&nilfs->ns_dirty_files); INIT_LIST_HEAD(&nilfs->ns_gc_inodes); spin_lock_init(&nilfs->ns_inode_lock); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 9992b11312ff..de7435f0ef5c 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -47,6 +47,7 @@ enum { * @ns_flags: flags * @ns_bdev: block device * @ns_sem: semaphore for shared states + * @ns_snapshot_mount_mutex: mutex to protect snapshot mounts * @ns_sbh: buffer heads of on-disk super blocks * @ns_sbp: pointers to super block data * @ns_sbwtime: previous write time of super block @@ -99,6 +100,7 @@ struct the_nilfs { struct block_device *ns_bdev; struct rw_semaphore ns_sem; + struct mutex ns_snapshot_mount_mutex; /* * used for From 83c0c5e47281e4fb3eedf56fa8cf8b895aede0e0 Mon Sep 17 00:00:00 2001 From: Greg Pearson Date: Mon, 30 Jul 2012 14:39:05 -0700 Subject: [PATCH 097/135] pcdp: use early_ioremap/early_iounmap to access pcdp table commit 6c4088ac3a4d82779903433bcd5f048c58fb1aca upstream. efi_setup_pcdp_console() is called during boot to parse the HCDP/PCDP EFI system table and setup an early console for printk output. The routine uses ioremap/iounmap to setup access to the HCDP/PCDP table information. The call to ioremap is happening early in the boot process which leads to a panic on x86_64 systems: panic+0x01ca do_exit+0x043c oops_end+0x00a7 no_context+0x0119 __bad_area_nosemaphore+0x0138 bad_area_nosemaphore+0x000e do_page_fault+0x0321 page_fault+0x0020 reserve_memtype+0x02a1 __ioremap_caller+0x0123 ioremap_nocache+0x0012 efi_setup_pcdp_console+0x002b setup_arch+0x03a9 start_kernel+0x00d4 x86_64_start_reservations+0x012c x86_64_start_kernel+0x00fe This replaces the calls to ioremap/iounmap in efi_setup_pcdp_console() with calls to early_ioremap/early_iounmap which can be called during early boot. This patch was tested on an x86_64 prototype system which uses the HCDP/PCDP table for early console setup. Signed-off-by: Greg Pearson Acked-by: Khalid Aziz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/pcdp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c index 51e0e2d8fac6..a330492e06f9 100644 --- a/drivers/firmware/pcdp.c +++ b/drivers/firmware/pcdp.c @@ -95,7 +95,7 @@ efi_setup_pcdp_console(char *cmdline) if (efi.hcdp == EFI_INVALID_TABLE_ADDR) return -ENODEV; - pcdp = ioremap(efi.hcdp, 4096); + pcdp = early_ioremap(efi.hcdp, 4096); printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp); if (strstr(cmdline, "console=hcdp")) { @@ -131,6 +131,6 @@ efi_setup_pcdp_console(char *cmdline) } out: - iounmap(pcdp); + early_iounmap(pcdp, 4096); return rc; } From cad33da5ceedac56481dd3168e42580e9bec6343 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 30 Jul 2012 14:39:04 -0700 Subject: [PATCH 098/135] mm: fix wrong argument of migrate_huge_pages() in soft_offline_huge_page() commit dc32f63453f56d07a1073a697dcd843dd3098c09 upstream. Commit a6bc32b89922 ("mm: compaction: introduce sync-light migration for use by compaction") changed the declaration of migrate_pages() and migrate_huge_pages(). But it missed changing the argument of migrate_huge_pages() in soft_offline_huge_page(). In this case, we should call migrate_huge_pages() with MIGRATE_SYNC. Additionally, there is a mismatch between type the of argument and the function declaration for migrate_pages(). Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Mel Gorman Acked-by: David Rientjes Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memory-failure.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6496748df214..2f49dcf4f474 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1334,8 +1334,8 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. */ list_add(&hpage->lru, &pagelist); - ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, - true); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false, + MIGRATE_SYNC); if (ret) { struct page *page1, *page2; list_for_each_entry_safe(page1, page2, &pagelist, lru) @@ -1464,7 +1464,7 @@ int soft_offline_page(struct page *page, int flags) page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - 0, MIGRATE_SYNC); + false, MIGRATE_SYNC); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", From 0b41a531bee76feefaa22878b035d8c71edcb87e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 20 Jul 2012 18:24:55 +0100 Subject: [PATCH 099/135] ARM: 7478/1: errata: extend workaround for erratum #720789 commit 5a783cbc48367cfc7b65afc75430953dfe60098f upstream. Commit cdf357f1 ("ARM: 6299/1: errata: TLBIASIDIS and TLBIMVAIS operations can broadcast a faulty ASID") replaced by-ASID TLB flushing operations with all-ASID variants to workaround A9 erratum #720789. This patch extends the workaround to include the tlb_range operations, which were overlooked by the original patch. Tested-by: Steve Capper Signed-off-by: Will Deacon Signed-off-by: Russell King Signed-off-by: Greg Kroah-Hartman --- arch/arm/mm/tlb-v7.S | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm/mm/tlb-v7.S b/arch/arm/mm/tlb-v7.S index 53cd5b454673..875634aad5ef 100644 --- a/arch/arm/mm/tlb-v7.S +++ b/arch/arm/mm/tlb-v7.S @@ -38,11 +38,19 @@ ENTRY(v7wbi_flush_user_tlb_range) dsb mov r0, r0, lsr #PAGE_SHIFT @ align address mov r1, r1, lsr #PAGE_SHIFT +#ifdef CONFIG_ARM_ERRATA_720789 + mov r3, #0 +#else asid r3, r3 @ mask ASID +#endif orr r0, r3, r0, lsl #PAGE_SHIFT @ Create initial MVA mov r1, r1, lsl #PAGE_SHIFT 1: +#ifdef CONFIG_ARM_ERRATA_720789 + ALT_SMP(mcr p15, 0, r0, c8, c3, 3) @ TLB invalidate U MVA all ASID (shareable) +#else ALT_SMP(mcr p15, 0, r0, c8, c3, 1) @ TLB invalidate U MVA (shareable) +#endif ALT_UP(mcr p15, 0, r0, c8, c7, 1) @ TLB invalidate U MVA add r0, r0, #PAGE_SZ @@ -70,7 +78,11 @@ ENTRY(v7wbi_flush_kern_tlb_range) mov r0, r0, lsl #PAGE_SHIFT mov r1, r1, lsl #PAGE_SHIFT 1: +#ifdef CONFIG_ARM_ERRATA_720789 + ALT_SMP(mcr p15, 0, r0, c8, c3, 3) @ TLB invalidate U MVA all ASID (shareable) +#else ALT_SMP(mcr p15, 0, r0, c8, c3, 1) @ TLB invalidate U MVA (shareable) +#endif ALT_UP(mcr p15, 0, r0, c8, c7, 1) @ TLB invalidate U MVA add r0, r0, #PAGE_SZ cmp r0, r1 From b9d316de7dc18de5df147597e3c38365ee1bfd6c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 23 Jul 2012 14:18:13 +0100 Subject: [PATCH 100/135] ARM: 7479/1: mm: avoid NULL dereference when flushing gate_vma with VIVT caches commit b74253f78400f9a4b42da84bb1de7540b88ce7c4 upstream. The vivt_flush_cache_{range,page} functions check that the mm_struct of the VMA being flushed has been active on the current CPU before performing the cache maintenance. The gate_vma has a NULL mm_struct pointer and, as such, will cause a kernel fault if we try to flush it with the above operations. This happens during ELF core dumps, which include the gate_vma as it may be useful for debugging purposes. This patch adds checks to the VIVT cache flushing functions so that VMAs with a NULL mm_struct are flushed unconditionally (the vectors page may be dirty if we use it to store the current TLS pointer). Reported-by: Gilles Chanteperdrix Tested-by: Uros Bizjak Signed-off-by: Will Deacon Signed-off-by: Russell King Signed-off-by: Greg Kroah-Hartman --- arch/arm/include/asm/cacheflush.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 1252a2675ca9..42dec04f6170 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -215,7 +215,9 @@ static inline void vivt_flush_cache_mm(struct mm_struct *mm) static inline void vivt_flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) + struct mm_struct *mm = vma->vm_mm; + + if (!mm || cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) __cpuc_flush_user_range(start & PAGE_MASK, PAGE_ALIGN(end), vma->vm_flags); } @@ -223,7 +225,9 @@ vivt_flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned static inline void vivt_flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn) { - if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { + struct mm_struct *mm = vma->vm_mm; + + if (!mm || cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) { unsigned long addr = user_addr & PAGE_MASK; __cpuc_flush_user_range(addr, addr + PAGE_SIZE, vma->vm_flags); } From 8bda26e33846b53e2c70a2ccff13e3f5b69ab067 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 31 Jul 2012 16:45:52 -0700 Subject: [PATCH 101/135] mm: mmu_notifier: fix freed page still mapped in secondary MMU commit 3ad3d901bbcfb15a5e4690e55350db0899095a68 upstream. mmu_notifier_release() is called when the process is exiting. It will delete all the mmu notifiers. But at this time the page belonging to the process is still present in page tables and is present on the LRU list, so this race will happen: CPU 0 CPU 1 mmu_notifier_release: try_to_unmap: hlist_del_init_rcu(&mn->hlist); ptep_clear_flush_notify: mmu nofifler not found free page !!!!!! /* * At the point, the page has been * freed, but it is still mapped in * the secondary MMU. */ mn->ops->release(mn, mm); Then the box is not stable and sometimes we can get this bug: [ 738.075923] BUG: Bad page state in process migrate-perf pfn:03bec [ 738.075931] page:ffffea00000efb00 count:0 mapcount:0 mapping: (null) index:0x8076 [ 738.075936] page flags: 0x20000000000014(referenced|dirty) The same issue is present in mmu_notifier_unregister(). We can call ->release before deleting the notifier to ensure the page has been unmapped from the secondary MMU before it is freed. Signed-off-by: Xiao Guangrong Cc: Avi Kivity Cc: Marcelo Tosatti Cc: Paul Gortmaker Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/mmu_notifier.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8d032de4088e..71c78115c453 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -33,6 +33,24 @@ void __mmu_notifier_release(struct mm_struct *mm) { struct mmu_notifier *mn; + struct hlist_node *n; + + /* + * RCU here will block mmu_notifier_unregister until + * ->release returns. + */ + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) + /* + * if ->release runs before mmu_notifier_unregister it + * must be handled as it's the only way for the driver + * to flush all existing sptes and stop the driver + * from establishing any more sptes before all the + * pages in the mm are freed. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + rcu_read_unlock(); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm) * mmu_notifier_unregister to return. */ hlist_del_init_rcu(&mn->hlist); - /* - * RCU here will block mmu_notifier_unregister until - * ->release returns. - */ - rcu_read_lock(); - spin_unlock(&mm->mmu_notifier_mm->lock); - /* - * if ->release runs before mmu_notifier_unregister it - * must be handled as it's the only way for the driver - * to flush all existing sptes and stop the driver - * from establishing any more sptes before all the - * pages in the mm are freed. - */ - if (mn->ops->release) - mn->ops->release(mn, mm); - rcu_read_unlock(); - spin_lock(&mm->mmu_notifier_mm->lock); } spin_unlock(&mm->mmu_notifier_mm->lock); @@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) { BUG_ON(atomic_read(&mm->mm_count) <= 0); - spin_lock(&mm->mmu_notifier_mm->lock); if (!hlist_unhashed(&mn->hlist)) { - hlist_del_rcu(&mn->hlist); - /* * RCU here will force exit_mmap to wait ->release to finish * before freeing the pages. */ rcu_read_lock(); - spin_unlock(&mm->mmu_notifier_mm->lock); + /* * exit_mmap will block in mmu_notifier_release to * guarantee ->release is called before freeing the @@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) if (mn->ops->release) mn->ops->release(mn, mm); rcu_read_unlock(); - } else + + spin_lock(&mm->mmu_notifier_mm->lock); + hlist_del_rcu(&mn->hlist); spin_unlock(&mm->mmu_notifier_mm->lock); + } /* * Wait any running method to finish, of course including From 61e0a9e79d85258d254c5bfb7832c40ba2c46ed4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 1 Aug 2012 21:03:21 +0200 Subject: [PATCH 102/135] mac80211: cancel mesh path timer commit dd4c9260e7f23f2e951cbfb2726e468c6d30306c upstream. The mesh path timer needs to be canceled when leaving the mesh as otherwise it could fire after the interface has been removed already. Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/mac80211/mesh.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 29e9980c8e60..370aa94ead16 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -490,6 +490,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) del_timer_sync(&sdata->u.mesh.housekeeping_timer); del_timer_sync(&sdata->u.mesh.mesh_path_root_timer); + del_timer_sync(&sdata->u.mesh.mesh_path_timer); /* * If the timer fired while we waited for it, it will have * requeued the work. Now the work will be running again From 7b1cad628030b9bbdaaa4bb8ff73cabaab6e82c9 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 25 Jul 2012 16:28:19 +0100 Subject: [PATCH 103/135] x86, nops: Missing break resulting in incorrect selection on Intel commit d6250a3f12edb3a86db9598ffeca3de8b4a219e9 upstream. The Intel case falls through into the generic case which then changes the values. For cases like the P6 it doesn't do the right thing so this seems to be a screwup. Signed-off-by: Alan Cox Link: http://lkml.kernel.org/n/tip-lww2uirad4skzjlmrm0vru8o@git.kernel.org Signed-off-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a81f2d52f869..dfabea4de417 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -220,7 +220,7 @@ void __init arch_init_ideal_nops(void) ideal_nops = intel_nops; #endif } - + break; default: #ifdef CONFIG_X86_64 ideal_nops = k8_nops; From 6133313b3bcbd0473feac85c8a8d7ef704ff2279 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 31 Jul 2011 13:54:50 -0700 Subject: [PATCH 104/135] random: Add support for architectural random hooks commit 63d77173266c1791f1553e9e8ccea65dc87c4485 upstream. Add support for architecture-specific hooks into the kernel-directed random number generator interfaces. This patchset does not use the architecture random number generator interfaces for the userspace-directed interfaces (/dev/random and /dev/urandom), thus eliminating the need to distinguish between them based on a pool pointer. Changes in version 3: - Moved the hooks from extract_entropy() to get_random_bytes(). - Changes the hooks to inlines. Signed-off-by: H. Peter Anvin Cc: Fenghua Yu Cc: Matt Mackall Cc: Herbert Xu Cc: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 23 +++++++++++++++++++++-- include/linux/random.h | 13 +++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index c35a785005b0..154eeda26db5 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -932,7 +932,21 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, */ void get_random_bytes(void *buf, int nbytes) { - extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0); + char *p = buf; + + while (nbytes) { + unsigned long v; + int chunk = min(nbytes, (int)sizeof(unsigned long)); + + if (!arch_get_random_long(&v)) + break; + + memcpy(buf, &v, chunk); + p += chunk; + nbytes -= chunk; + } + + extract_entropy(&nonblocking_pool, p, nbytes, 0, 0); } EXPORT_SYMBOL(get_random_bytes); @@ -1318,9 +1332,14 @@ late_initcall(random_int_secret_init); DEFINE_PER_CPU(__u32 [MD5_DIGEST_WORDS], get_random_int_hash); unsigned int get_random_int(void) { - __u32 *hash = get_cpu_var(get_random_int_hash); + __u32 *hash; unsigned int ret; + if (arch_get_random_int(&ret)) + return ret; + + hash = get_cpu_var(get_random_int_hash); + hash[0] += current->pid + jiffies + get_cycles(); md5_transform(hash, random_int_secret); ret = hash[0]; diff --git a/include/linux/random.h b/include/linux/random.h index d13059f3ea32..8f74538c96db 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -91,6 +91,19 @@ static inline void prandom32_seed(struct rnd_state *state, u64 seed) state->s3 = __seed(i, 15); } +#ifdef CONFIG_ARCH_RANDOM +# include +#else +static inline int arch_get_random_long(unsigned long *v) +{ + return 0; +} +static inline int arch_get_random_int(unsigned int *v) +{ + return 0; +} +#endif + #endif /* __KERNEL___ */ #endif /* _LINUX_RANDOM_H */ From 21a465d5868b83e17a1c2434864fbb95b6414481 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Wed, 16 Nov 2011 10:50:56 -0800 Subject: [PATCH 105/135] fix typo/thinko in get_random_bytes() commit bd29e568a4cb6465f6e5ec7c1c1f3ae7d99cbec1 upstream. If there is an architecture-specific random number generator we use it to acquire randomness one "long" at a time. We should put these random words into consecutive words in the result buffer - not just overwrite the first word again and again. Signed-off-by: Tony Luck Acked-by: H. Peter Anvin Acked-by: Thomas Gleixner Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 154eeda26db5..8b47c2b57437 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -941,7 +941,7 @@ void get_random_bytes(void *buf, int nbytes) if (!arch_get_random_long(&v)) break; - memcpy(buf, &v, chunk); + memcpy(p, &v, chunk); p += chunk; nbytes -= chunk; } From be0052b8995dfa8d43a94badd54254e8cfc59471 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 22 Dec 2011 11:36:22 -0800 Subject: [PATCH 106/135] random: Use arch_get_random_int instead of cycle counter if avail commit cf833d0b9937874b50ef2867c4e8badfd64948ce upstream. We still don't use rdrand in /dev/random, which just seems stupid. We accept the *cycle*counter* as a random input, but we don't accept rdrand? That's just broken. Sure, people can do things in user space (write to /dev/random, use rdrand in addition to /dev/random themselves etc etc), but that *still* seems to be a particularly stupid reason for saying "we shouldn't bother to try to do better in /dev/random". And even if somebody really doesn't trust rdrand as a source of random bytes, it seems singularly stupid to trust the cycle counter *more*. So I'd suggest the attached patch. I'm not going to even bother arguing that we should add more bits to the entropy estimate, because that's not the point - I don't care if /dev/random fills up slowly or not, I think it's just stupid to not use the bits we can get from rdrand and mix them into the strong randomness pool. Link: http://lkml.kernel.org/r/CA%2B55aFwn59N1=m651QAyTy-1gO1noGbK18zwKDwvwqnravA84A@mail.gmail.com Acked-by: "David S. Miller" Acked-by: "Theodore Ts'o" Acked-by: Herbert Xu Cc: Matt Mackall Cc: Tony Luck Cc: Eric Dumazet Signed-off-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 8b47c2b57437..e35f6e8680a1 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -624,8 +624,8 @@ static struct timer_rand_state input_timer_state; static void add_timer_randomness(struct timer_rand_state *state, unsigned num) { struct { - cycles_t cycles; long jiffies; + unsigned cycles; unsigned num; } sample; long delta, delta2, delta3; @@ -637,7 +637,11 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) goto out; sample.jiffies = jiffies; - sample.cycles = get_cycles(); + + /* Use arch random value, fall back to cycles */ + if (!arch_get_random_int(&sample.cycles)) + sample.cycles = get_cycles(); + sample.num = num; mix_pool_bytes(&input_pool, &sample, sizeof(sample)); From d191959fa812fadb86b52f16aa09a49fe6db98dd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 22 Dec 2011 16:28:01 -0500 Subject: [PATCH 107/135] random: Use arch-specific RNG to initialize the entropy store commit 3e88bdff1c65145f7ba297ccec69c774afe4c785 upstream. If there is an architecture-specific random number generator (such as RDRAND for Intel architectures), use it to initialize /dev/random's entropy stores. Even in the worst case, if RDRAND is something like AES(NSA_KEY, counter++), it won't hurt, and it will definitely help against any other adversaries. Signed-off-by: "Theodore Ts'o" Link: http://lkml.kernel.org/r/1324589281-31931-1-git-send-email-tytso@mit.edu Signed-off-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/char/random.c b/drivers/char/random.c index e35f6e8680a1..fddfc4242ca2 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -965,6 +965,7 @@ EXPORT_SYMBOL(get_random_bytes); */ static void init_std_data(struct entropy_store *r) { + int i; ktime_t now; unsigned long flags; @@ -974,6 +975,11 @@ static void init_std_data(struct entropy_store *r) now = ktime_get_real(); mix_pool_bytes(r, &now, sizeof(now)); + for (i = r->poolinfo->poolwords; i; i--) { + if (!arch_get_random_long(&flags)) + break; + mix_pool_bytes(r, &flags, sizeof(flags)); + } mix_pool_bytes(r, utsname(), sizeof(*(utsname()))); } From a5914eb0c34cf8b2d777c3dca852b2595dec908c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 16 Jan 2012 11:23:29 -0800 Subject: [PATCH 108/135] random: Adjust the number of loops when initializing commit 2dac8e54f988ab58525505d7ef982493374433c3 upstream. When we are initializing using arch_get_random_long() we only need to loop enough times to touch all the bytes in the buffer; using poolwords for that does twice the number of operations necessary on a 64-bit machine, since in the random number generator code "word" means 32 bits. Signed-off-by: H. Peter Anvin Cc: "Theodore Ts'o" Link: http://lkml.kernel.org/r/1324589281-31931-1-git-send-email-tytso@mit.edu Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index fddfc4242ca2..1eeb22f59670 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -975,7 +975,7 @@ static void init_std_data(struct entropy_store *r) now = ktime_get_real(); mix_pool_bytes(r, &now, sizeof(now)); - for (i = r->poolinfo->poolwords; i; i--) { + for (i = r->poolinfo->POOLBYTES; i > 0; i -= sizeof flags) { if (!arch_get_random_long(&flags)) break; mix_pool_bytes(r, &flags, sizeof(flags)); From f5a1367c1bc150e70e8db9bb6f2892e8e31648c7 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 12 Apr 2012 12:49:12 -0700 Subject: [PATCH 109/135] drivers/char/random.c: fix boot id uniqueness race commit 44e4360fa3384850d65dd36fb4e6e5f2f112709b upstream. /proc/sys/kernel/random/boot_id can be read concurrently by userspace processes. If two (or more) user-space processes concurrently read boot_id when sysctl_bootid is not yet assigned, a race can occur making boot_id differ between the reads. Because the whole point of the boot id is to be unique across a kernel execution, fix this by protecting this operation with a spinlock. Given that this operation is not frequently used, hitting the spinlock on each call should not be an issue. Signed-off-by: Mathieu Desnoyers Cc: "Theodore Ts'o" Cc: Matt Mackall Signed-off-by: Eric Dumazet Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 1eeb22f59670..5d5141f337c6 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1260,10 +1260,15 @@ static int proc_do_uuid(ctl_table *table, int write, uuid = table->data; if (!uuid) { uuid = tmp_uuid; - uuid[8] = 0; - } - if (uuid[8] == 0) generate_random_uuid(uuid); + } else { + static DEFINE_SPINLOCK(bootid_spinlock); + + spin_lock(&bootid_spinlock); + if (!uuid[8]) + generate_random_uuid(uuid); + spin_unlock(&bootid_spinlock); + } sprintf(buf, "%pU", uuid); From aa88dea2270f685349ab7b92169600452fe73b62 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Jul 2012 07:52:16 -0400 Subject: [PATCH 110/135] random: make 'add_interrupt_randomness()' do something sane commit 775f4b297b780601e61787b766f306ed3e1d23eb upstream. We've been moving away from add_interrupt_randomness() for various reasons: it's too expensive to do on every interrupt, and flooding the CPU with interrupts could theoretically cause bogus floods of entropy from a somewhat externally controllable source. This solves both problems by limiting the actual randomness addition to just once a second or after 64 interrupts, whicever comes first. During that time, the interrupt cycle data is buffered up in a per-cpu pool. Also, we make sure the the nonblocking pool used by urandom is initialized before we start feeding the normal input pool. This assures that /dev/urandom is returning unpredictable data as soon as possible. (Based on an original patch by Linus, but significantly modified by tytso.) Tested-by: Eric Wustrow Reported-by: Eric Wustrow Reported-by: Nadia Heninger Reported-by: Zakir Durumeric Reported-by: J. Alex Halderman Signed-off-by: Linus Torvalds Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 103 +++++++++++++++++++++++++++++++------- drivers/mfd/ab3100-core.c | 2 - include/linux/random.h | 2 +- kernel/irq/handle.c | 7 ++- 4 files changed, 90 insertions(+), 24 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 5d5141f337c6..4e4683f2bbbf 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -127,19 +127,15 @@ * * void add_input_randomness(unsigned int type, unsigned int code, * unsigned int value); - * void add_interrupt_randomness(int irq); + * void add_interrupt_randomness(int irq, int irq_flags); * void add_disk_randomness(struct gendisk *disk); * * add_input_randomness() uses the input layer interrupt timing, as well as * the event type information from the hardware. * - * add_interrupt_randomness() uses the inter-interrupt timing as random - * inputs to the entropy pool. Note that not all interrupts are good - * sources of randomness! For example, the timer interrupts is not a - * good choice, because the periodicity of the interrupts is too - * regular, and hence predictable to an attacker. Network Interface - * Controller interrupts are a better measure, since the timing of the - * NIC interrupts are more unpredictable. + * add_interrupt_randomness() uses the interrupt timing as random + * inputs to the entropy pool. Using the cycle counters and the irq source + * as inputs, it feeds the randomness roughly once a second. * * add_disk_randomness() uses what amounts to the seek time of block * layer request events, on a per-disk_devt basis, as input to the @@ -248,6 +244,7 @@ #include #include #include +#include #ifdef CONFIG_GENERIC_HARDIRQS # include @@ -256,6 +253,7 @@ #include #include #include +#include #include /* @@ -421,7 +419,9 @@ struct entropy_store { spinlock_t lock; unsigned add_ptr; int entropy_count; + int entropy_total; int input_rotate; + unsigned int initialized:1; __u8 last_data[EXTRACT_SIZE]; }; @@ -454,6 +454,10 @@ static struct entropy_store nonblocking_pool = { .pool = nonblocking_pool_data }; +static __u32 const twist_table[8] = { + 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158, + 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 }; + /* * This function adds bytes into the entropy "pool". It does not * update the entropy estimate. The caller should call @@ -467,9 +471,6 @@ static struct entropy_store nonblocking_pool = { static void mix_pool_bytes_extract(struct entropy_store *r, const void *in, int nbytes, __u8 out[64]) { - static __u32 const twist_table[8] = { - 0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158, - 0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 }; unsigned long i, j, tap1, tap2, tap3, tap4, tap5; int input_rotate; int wordmask = r->poolinfo->poolwords - 1; @@ -528,6 +529,36 @@ static void mix_pool_bytes(struct entropy_store *r, const void *in, int bytes) mix_pool_bytes_extract(r, in, bytes, NULL); } +struct fast_pool { + __u32 pool[4]; + unsigned long last; + unsigned short count; + unsigned char rotate; + unsigned char last_timer_intr; +}; + +/* + * This is a fast mixing routine used by the interrupt randomness + * collector. It's hardcoded for an 128 bit pool and assumes that any + * locks that might be needed are taken by the caller. + */ +static void fast_mix(struct fast_pool *f, const void *in, int nbytes) +{ + const char *bytes = in; + __u32 w; + unsigned i = f->count; + unsigned input_rotate = f->rotate; + + while (nbytes--) { + w = rol32(*bytes++, input_rotate & 31) ^ f->pool[i & 3] ^ + f->pool[(i + 1) & 3]; + f->pool[i & 3] = (w >> 3) ^ twist_table[w & 7]; + input_rotate += (i++ & 3) ? 7 : 14; + } + f->count = i; + f->rotate = input_rotate; +} + /* * Credit (or debit) the entropy store with n bits of entropy */ @@ -551,6 +582,12 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits) entropy_count = r->poolinfo->POOLBITS; r->entropy_count = entropy_count; + if (!r->initialized && nbits > 0) { + r->entropy_total += nbits; + if (r->entropy_total > 128) + r->initialized = 1; + } + /* should we wake readers? */ if (r == &input_pool && entropy_count >= random_read_wakeup_thresh) { wake_up_interruptible(&random_read_wait); @@ -700,17 +737,48 @@ void add_input_randomness(unsigned int type, unsigned int code, } EXPORT_SYMBOL_GPL(add_input_randomness); -void add_interrupt_randomness(int irq) +static DEFINE_PER_CPU(struct fast_pool, irq_randomness); + +void add_interrupt_randomness(int irq, int irq_flags) { - struct timer_rand_state *state; + struct entropy_store *r; + struct fast_pool *fast_pool = &__get_cpu_var(irq_randomness); + struct pt_regs *regs = get_irq_regs(); + unsigned long now = jiffies; + __u32 input[4], cycles = get_cycles(); - state = get_timer_rand_state(irq); + input[0] = cycles ^ jiffies; + input[1] = irq; + if (regs) { + __u64 ip = instruction_pointer(regs); + input[2] = ip; + input[3] = ip >> 32; + } - if (state == NULL) + fast_mix(fast_pool, input, sizeof(input)); + + if ((fast_pool->count & 1023) && + !time_after(now, fast_pool->last + HZ)) return; - DEBUG_ENT("irq event %d\n", irq); - add_timer_randomness(state, 0x100 + irq); + fast_pool->last = now; + + r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool; + mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool)); + /* + * If we don't have a valid cycle counter, and we see + * back-to-back timer interrupts, then skip giving credit for + * any entropy. + */ + if (cycles == 0) { + if (irq_flags & __IRQF_TIMER) { + if (fast_pool->last_timer_intr) + return; + fast_pool->last_timer_intr = 1; + } else + fast_pool->last_timer_intr = 0; + } + credit_entropy_bits(r, 1); } #ifdef CONFIG_BLOCK @@ -971,6 +1039,7 @@ static void init_std_data(struct entropy_store *r) spin_lock_irqsave(&r->lock, flags); r->entropy_count = 0; + r->entropy_total = 0; spin_unlock_irqrestore(&r->lock, flags); now = ktime_get_real(); diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c index a20e1c41bed2..c0befd3ad98f 100644 --- a/drivers/mfd/ab3100-core.c +++ b/drivers/mfd/ab3100-core.c @@ -408,8 +408,6 @@ static irqreturn_t ab3100_irq_handler(int irq, void *data) u32 fatevent; int err; - add_interrupt_randomness(irq); - err = ab3100_get_register_page_interruptible(ab3100, AB3100_EVENTA1, event_regs, 3); if (err) diff --git a/include/linux/random.h b/include/linux/random.h index 8f74538c96db..6ef39d7f2db1 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -52,7 +52,7 @@ extern void rand_initialize_irq(int irq); extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); -extern void add_interrupt_randomness(int irq); +extern void add_interrupt_randomness(int irq, int irq_flags); extern void get_random_bytes(void *buf, int nbytes); void generate_random_uuid(unsigned char uuid_out[16]); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 470d08c82bbe..10e077289c8d 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -117,7 +117,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { irqreturn_t retval = IRQ_NONE; - unsigned int random = 0, irq = desc->irq_data.irq; + unsigned int flags = 0, irq = desc->irq_data.irq; do { irqreturn_t res; @@ -145,7 +145,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) /* Fall through to add to randomness */ case IRQ_HANDLED: - random |= action->flags; + flags |= action->flags; break; default: @@ -156,8 +156,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) action = action->next; } while (action); - if (random & IRQF_SAMPLE_RANDOM) - add_interrupt_randomness(irq); + add_interrupt_randomness(irq, flags); if (!noirqdebug) note_interrupt(irq, desc, retval); From ebb6006e3be88bb15328887dec4a8c3b7c0138b6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 4 Jul 2012 10:38:30 -0400 Subject: [PATCH 111/135] random: use lockless techniques in the interrupt path commit 902c098a3663de3fa18639efbb71b6080f0bcd3c upstream. The real-time Linux folks don't like add_interrupt_randomness() taking a spinlock since it is called in the low-level interrupt routine. This also allows us to reduce the overhead in the fast path, for the random driver, which is the interrupt collection path. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 78 +++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 4e4683f2bbbf..c30ab2082ffc 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -418,9 +418,9 @@ struct entropy_store { /* read-write data: */ spinlock_t lock; unsigned add_ptr; + unsigned input_rotate; int entropy_count; int entropy_total; - int input_rotate; unsigned int initialized:1; __u8 last_data[EXTRACT_SIZE]; }; @@ -468,26 +468,24 @@ static __u32 const twist_table[8] = { * it's cheap to do so and helps slightly in the expected case where * the entropy is concentrated in the low-order bits. */ -static void mix_pool_bytes_extract(struct entropy_store *r, const void *in, - int nbytes, __u8 out[64]) +static void __mix_pool_bytes(struct entropy_store *r, const void *in, + int nbytes, __u8 out[64]) { unsigned long i, j, tap1, tap2, tap3, tap4, tap5; int input_rotate; int wordmask = r->poolinfo->poolwords - 1; const char *bytes = in; __u32 w; - unsigned long flags; - /* Taps are constant, so we can load them without holding r->lock. */ tap1 = r->poolinfo->tap1; tap2 = r->poolinfo->tap2; tap3 = r->poolinfo->tap3; tap4 = r->poolinfo->tap4; tap5 = r->poolinfo->tap5; - spin_lock_irqsave(&r->lock, flags); - input_rotate = r->input_rotate; - i = r->add_ptr; + smp_rmb(); + input_rotate = ACCESS_ONCE(r->input_rotate); + i = ACCESS_ONCE(r->add_ptr); /* mix one byte at a time to simplify size handling and churn faster */ while (nbytes--) { @@ -514,19 +512,23 @@ static void mix_pool_bytes_extract(struct entropy_store *r, const void *in, input_rotate += i ? 7 : 14; } - r->input_rotate = input_rotate; - r->add_ptr = i; + ACCESS_ONCE(r->input_rotate) = input_rotate; + ACCESS_ONCE(r->add_ptr) = i; + smp_wmb(); if (out) for (j = 0; j < 16; j++) ((__u32 *)out)[j] = r->pool[(i - j) & wordmask]; - - spin_unlock_irqrestore(&r->lock, flags); } -static void mix_pool_bytes(struct entropy_store *r, const void *in, int bytes) +static void mix_pool_bytes(struct entropy_store *r, const void *in, + int nbytes, __u8 out[64]) { - mix_pool_bytes_extract(r, in, bytes, NULL); + unsigned long flags; + + spin_lock_irqsave(&r->lock, flags); + __mix_pool_bytes(r, in, nbytes, out); + spin_unlock_irqrestore(&r->lock, flags); } struct fast_pool { @@ -564,23 +566,22 @@ static void fast_mix(struct fast_pool *f, const void *in, int nbytes) */ static void credit_entropy_bits(struct entropy_store *r, int nbits) { - unsigned long flags; - int entropy_count; + int entropy_count, orig; if (!nbits) return; - spin_lock_irqsave(&r->lock, flags); - DEBUG_ENT("added %d entropy credits to %s\n", nbits, r->name); - entropy_count = r->entropy_count; +retry: + entropy_count = orig = ACCESS_ONCE(r->entropy_count); entropy_count += nbits; if (entropy_count < 0) { DEBUG_ENT("negative entropy/overflow\n"); entropy_count = 0; } else if (entropy_count > r->poolinfo->POOLBITS) entropy_count = r->poolinfo->POOLBITS; - r->entropy_count = entropy_count; + if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig) + goto retry; if (!r->initialized && nbits > 0) { r->entropy_total += nbits; @@ -593,7 +594,6 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits) wake_up_interruptible(&random_read_wait); kill_fasync(&fasync, SIGIO, POLL_IN); } - spin_unlock_irqrestore(&r->lock, flags); } /********************************************************************* @@ -680,7 +680,7 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) sample.cycles = get_cycles(); sample.num = num; - mix_pool_bytes(&input_pool, &sample, sizeof(sample)); + mix_pool_bytes(&input_pool, &sample, sizeof(sample), NULL); /* * Calculate number of bits of randomness we probably added. @@ -764,7 +764,7 @@ void add_interrupt_randomness(int irq, int irq_flags) fast_pool->last = now; r = nonblocking_pool.initialized ? &input_pool : &nonblocking_pool; - mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool)); + __mix_pool_bytes(r, &fast_pool->pool, sizeof(fast_pool->pool), NULL); /* * If we don't have a valid cycle counter, and we see * back-to-back timer interrupts, then skip giving credit for @@ -829,7 +829,7 @@ static void xfer_secondary_pool(struct entropy_store *r, size_t nbytes) bytes = extract_entropy(r->pull, tmp, bytes, random_read_wakeup_thresh / 8, rsvd); - mix_pool_bytes(r, tmp, bytes); + mix_pool_bytes(r, tmp, bytes, NULL); credit_entropy_bits(r, bytes*8); } } @@ -890,9 +890,11 @@ static void extract_buf(struct entropy_store *r, __u8 *out) int i; __u32 hash[5], workspace[SHA_WORKSPACE_WORDS]; __u8 extract[64]; + unsigned long flags; /* Generate a hash across the pool, 16 words (512 bits) at a time */ sha_init(hash); + spin_lock_irqsave(&r->lock, flags); for (i = 0; i < r->poolinfo->poolwords; i += 16) sha_transform(hash, (__u8 *)(r->pool + i), workspace); @@ -905,7 +907,8 @@ static void extract_buf(struct entropy_store *r, __u8 *out) * brute-forcing the feedback as hard as brute-forcing the * hash. */ - mix_pool_bytes_extract(r, hash, sizeof(hash), extract); + __mix_pool_bytes(r, hash, sizeof(hash), extract); + spin_unlock_irqrestore(&r->lock, flags); /* * To avoid duplicates, we atomically extract a portion of the @@ -928,11 +931,10 @@ static void extract_buf(struct entropy_store *r, __u8 *out) } static ssize_t extract_entropy(struct entropy_store *r, void *buf, - size_t nbytes, int min, int reserved) + size_t nbytes, int min, int reserved) { ssize_t ret = 0, i; __u8 tmp[EXTRACT_SIZE]; - unsigned long flags; xfer_secondary_pool(r, nbytes); nbytes = account(r, nbytes, min, reserved); @@ -941,6 +943,8 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, extract_buf(r, tmp); if (fips_enabled) { + unsigned long flags; + spin_lock_irqsave(&r->lock, flags); if (!memcmp(tmp, r->last_data, EXTRACT_SIZE)) panic("Hardware RNG duplicated output!\n"); @@ -1034,22 +1038,18 @@ EXPORT_SYMBOL(get_random_bytes); static void init_std_data(struct entropy_store *r) { int i; - ktime_t now; - unsigned long flags; + ktime_t now = ktime_get_real(); + unsigned long rv; - spin_lock_irqsave(&r->lock, flags); r->entropy_count = 0; r->entropy_total = 0; - spin_unlock_irqrestore(&r->lock, flags); - - now = ktime_get_real(); - mix_pool_bytes(r, &now, sizeof(now)); - for (i = r->poolinfo->POOLBYTES; i > 0; i -= sizeof flags) { - if (!arch_get_random_long(&flags)) + mix_pool_bytes(r, &now, sizeof(now), NULL); + for (i = r->poolinfo->POOLBYTES; i > 0; i -= sizeof(rv)) { + if (!arch_get_random_long(&rv)) break; - mix_pool_bytes(r, &flags, sizeof(flags)); + mix_pool_bytes(r, &rv, sizeof(rv), NULL); } - mix_pool_bytes(r, utsname(), sizeof(*(utsname()))); + mix_pool_bytes(r, utsname(), sizeof(*(utsname())), NULL); } static int rand_initialize(void) @@ -1186,7 +1186,7 @@ write_pool(struct entropy_store *r, const char __user *buffer, size_t count) count -= bytes; p += bytes; - mix_pool_bytes(r, buf, bytes); + mix_pool_bytes(r, buf, bytes, NULL); cond_resched(); } From 3e035335b0578f6a058aecaf817b462ec9773c24 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 4 Jul 2012 11:16:01 -0400 Subject: [PATCH 112/135] random: create add_device_randomness() interface commit a2080a67abe9e314f9e9c2cc3a4a176e8a8f8793 upstream. Add a new interface, add_device_randomness() for adding data to the random pool that is likely to differ between two devices (or possibly even per boot). This would be things like MAC addresses or serial numbers, or the read-out of the RTC. This does *not* add any actual entropy to the pool, but it initializes the pool to different values for devices that might otherwise be identical and have very little entropy available to them (particularly common in the embedded world). [ Modified by tytso to mix in a timestamp, since there may be some variability caused by the time needed to detect/configure the hardware in question. ] Signed-off-by: Linus Torvalds Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 28 ++++++++++++++++++++++++++++ include/linux/random.h | 1 + 2 files changed, 29 insertions(+) diff --git a/drivers/char/random.c b/drivers/char/random.c index c30ab2082ffc..70dd96469fe5 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -125,11 +125,20 @@ * The current exported interfaces for gathering environmental noise * from the devices are: * + * void add_device_randomness(const void *buf, unsigned int size); * void add_input_randomness(unsigned int type, unsigned int code, * unsigned int value); * void add_interrupt_randomness(int irq, int irq_flags); * void add_disk_randomness(struct gendisk *disk); * + * add_device_randomness() is for adding data to the random pool that + * is likely to differ between two devices (or possibly even per boot). + * This would be things like MAC addresses or serial numbers, or the + * read-out of the RTC. This does *not* add any actual entropy to the + * pool, but it initializes the pool to different values for devices + * that might otherwise be identical and have very little entropy + * available to them (particularly common in the embedded world). + * * add_input_randomness() uses the input layer interrupt timing, as well as * the event type information from the hardware. * @@ -646,6 +655,25 @@ static void set_timer_rand_state(unsigned int irq, } #endif +/* + * Add device- or boot-specific data to the input and nonblocking + * pools to help initialize them to unique values. + * + * None of this adds any entropy, it is meant to avoid the + * problem of the nonblocking pool having similar initial state + * across largely identical devices. + */ +void add_device_randomness(const void *buf, unsigned int size) +{ + unsigned long time = get_cycles() ^ jiffies; + + mix_pool_bytes(&input_pool, buf, size, NULL); + mix_pool_bytes(&input_pool, &time, sizeof(time), NULL); + mix_pool_bytes(&nonblocking_pool, buf, size, NULL); + mix_pool_bytes(&nonblocking_pool, &time, sizeof(time), NULL); +} +EXPORT_SYMBOL(add_device_randomness); + static struct timer_rand_state input_timer_state; /* diff --git a/include/linux/random.h b/include/linux/random.h index 6ef39d7f2db1..e14b4387354a 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -50,6 +50,7 @@ struct rnd_state { extern void rand_initialize_irq(int irq); +extern void add_device_randomness(const void *, unsigned int); extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); extern void add_interrupt_randomness(int irq, int irq_flags); From 52d1114f461ad4d33d8bc38987b5991161712e96 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 4 Jul 2012 11:22:20 -0400 Subject: [PATCH 113/135] usb: feed USB device information to the /dev/random driver commit b04b3156a20d395a7faa8eed98698d1e17a36000 upstream. Send the USB device's serial, product, and manufacturer strings to the /dev/random driver to help seed its pools. Cc: Linus Torvalds Acked-by: Greg KH Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 34bb059607f5..3c0aa028fb3c 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -1902,6 +1903,14 @@ int usb_new_device(struct usb_device *udev) /* Tell the world! */ announce_device(udev); + if (udev->serial) + add_device_randomness(udev->serial, strlen(udev->serial)); + if (udev->product) + add_device_randomness(udev->product, strlen(udev->product)); + if (udev->manufacturer) + add_device_randomness(udev->manufacturer, + strlen(udev->manufacturer)); + device_enable_async_suspend(&udev->dev); /* Register the device. The device driver is responsible * for configuring the device and invoking the add-device From 118f98c7de67622c27ef9c968958b6de8483357f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 4 Jul 2012 21:23:25 -0400 Subject: [PATCH 114/135] net: feed /dev/random with the MAC address when registering a device commit 7bf2357524408b97fec58344caf7397f8140c3fd upstream. Signed-off-by: "Theodore Ts'o" Cc: David Miller Cc: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 3 +++ net/core/rtnetlink.c | 1 + 2 files changed, 4 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index a71eafc392e7..8235b81a7db3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1163,6 +1163,7 @@ static int __dev_open(struct net_device *dev) net_dmaengine_get(); dev_set_rx_mode(dev); dev_activate(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); } return ret; @@ -4730,6 +4731,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) err = ops->ndo_set_mac_address(dev, sa); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + add_device_randomness(dev->dev_addr, dev->addr_len); return err; } EXPORT_SYMBOL(dev_set_mac_address); @@ -5507,6 +5509,7 @@ int register_netdevice(struct net_device *dev) dev_init_scheduler(dev); dev_hold(dev); list_netdevice(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 861d53f5f13b..ac49ad519ca2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1304,6 +1304,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, goto errout; send_addr_notify = 1; modified = 1; + add_device_randomness(dev->dev_addr, dev->addr_len); } if (tb[IFLA_MTU]) { From b2b6f1202d6f948bfe8d280dec83da1fef05a05c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 5 Jul 2012 10:21:01 -0400 Subject: [PATCH 115/135] random: use the arch-specific rng in xfer_secondary_pool commit e6d4947b12e8ad947add1032dd754803c6004824 upstream. If the CPU supports a hardware random number generator, use it in xfer_secondary_pool(), where it will significantly improve things and where we can afford it. Also, remove the use of the arch-specific rng in add_timer_randomness(), since the call is significantly slower than get_cycles(), and we're much better off using it in xfer_secondary_pool() anyway. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 70dd96469fe5..3ebc0337a290 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -254,6 +254,7 @@ #include #include #include +#include #ifdef CONFIG_GENERIC_HARDIRQS # include @@ -702,11 +703,7 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) goto out; sample.jiffies = jiffies; - - /* Use arch random value, fall back to cycles */ - if (!arch_get_random_int(&sample.cycles)) - sample.cycles = get_cycles(); - + sample.cycles = get_cycles(); sample.num = num; mix_pool_bytes(&input_pool, &sample, sizeof(sample), NULL); @@ -838,7 +835,11 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, */ static void xfer_secondary_pool(struct entropy_store *r, size_t nbytes) { - __u32 tmp[OUTPUT_POOL_WORDS]; + union { + __u32 tmp[OUTPUT_POOL_WORDS]; + long hwrand[4]; + } u; + int i; if (r->pull && r->entropy_count < nbytes * 8 && r->entropy_count < r->poolinfo->POOLBITS) { @@ -849,17 +850,23 @@ static void xfer_secondary_pool(struct entropy_store *r, size_t nbytes) /* pull at least as many as BYTES as wakeup BITS */ bytes = max_t(int, bytes, random_read_wakeup_thresh / 8); /* but never more than the buffer size */ - bytes = min_t(int, bytes, sizeof(tmp)); + bytes = min_t(int, bytes, sizeof(u.tmp)); DEBUG_ENT("going to reseed %s with %d bits " "(%d of %d requested)\n", r->name, bytes * 8, nbytes * 8, r->entropy_count); - bytes = extract_entropy(r->pull, tmp, bytes, + bytes = extract_entropy(r->pull, u.tmp, bytes, random_read_wakeup_thresh / 8, rsvd); - mix_pool_bytes(r, tmp, bytes, NULL); + mix_pool_bytes(r, u.tmp, bytes, NULL); credit_entropy_bits(r, bytes*8); } + kmemcheck_mark_initialized(&u.hwrand, sizeof(u.hwrand)); + for (i = 0; i < 4; i++) + if (arch_get_random_long(&u.hwrand[i])) + break; + if (i) + mix_pool_bytes(r, &u.hwrand, sizeof(u.hwrand), 0); } /* From efe6c422db90b8303bfaf7fc2131bb2824a06c39 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 5 Jul 2012 10:35:23 -0400 Subject: [PATCH 116/135] random: add new get_random_bytes_arch() function commit c2557a303ab6712bb6e09447df828c557c710ac9 upstream. Create a new function, get_random_bytes_arch() which will use the architecture-specific hardware random number generator if it is present. Change get_random_bytes() to not use the HW RNG, even if it is avaiable. The reason for this is that the hw random number generator is fast (if it is present), but it requires that we trust the hardware manufacturer to have not put in a back door. (For example, an increasing counter encrypted by an AES key known to the NSA.) It's unlikely that Intel (for example) was paid off by the US Government to do this, but it's impossible for them to prove otherwise --- especially since Bull Mountain is documented to use AES as a whitener. Hence, the output of an evil, trojan-horse version of RDRAND is statistically indistinguishable from an RDRAND implemented to the specifications claimed by Intel. Short of using a tunnelling electronic microscope to reverse engineer an Ivy Bridge chip and disassembling and analyzing the CPU microcode, there's no way for us to tell for sure. Since users of get_random_bytes() in the Linux kernel need to be able to support hardware systems where the HW RNG is not present, most time-sensitive users of this interface have already created their own cryptographic RNG interface which uses get_random_bytes() as a seed. So it's much better to use the HW RNG to improve the existing random number generator, by mixing in any entropy returned by the HW RNG into /dev/random's entropy pool, but to always _use_ /dev/random's entropy pool. This way we get almost of the benefits of the HW RNG without any potential liabilities. The only benefits we forgo is the speed/performance enhancements --- and generic kernel code can't depend on depend on get_random_bytes() having the speed of a HW RNG anyway. For those places that really want access to the arch-specific HW RNG, if it is available, we provide get_random_bytes_arch(). Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 27 +++++++++++++++++++++++---- include/linux/random.h | 1 + 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 3ebc0337a290..f4dc6d818894 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1038,10 +1038,27 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, /* * This function is the exported kernel interface. It returns some - * number of good random numbers, suitable for seeding TCP sequence - * numbers, etc. + * number of good random numbers, suitable for key generation, seeding + * TCP sequence numbers, etc. It does not use the hw random number + * generator, if available; use get_random_bytes_arch() for that. */ void get_random_bytes(void *buf, int nbytes) +{ + extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0); +} +EXPORT_SYMBOL(get_random_bytes); + +/* + * This function will use the architecture-specific hardware random + * number generator if it is available. The arch-specific hw RNG will + * almost certainly be faster than what we can do in software, but it + * is impossible to verify that it is implemented securely (as + * opposed, to, say, the AES encryption of a sequence number using a + * key known by the NSA). So it's useful if we need the speed, but + * only if we're willing to trust the hardware manufacturer not to + * have put in a back door. + */ +void get_random_bytes_arch(void *buf, int nbytes) { char *p = buf; @@ -1057,9 +1074,11 @@ void get_random_bytes(void *buf, int nbytes) nbytes -= chunk; } - extract_entropy(&nonblocking_pool, p, nbytes, 0, 0); + if (nbytes) + extract_entropy(&nonblocking_pool, p, nbytes, 0, 0); } -EXPORT_SYMBOL(get_random_bytes); +EXPORT_SYMBOL(get_random_bytes_arch); + /* * init_std_data - initialize pool with system data diff --git a/include/linux/random.h b/include/linux/random.h index e14b4387354a..29e217a7e6d0 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -56,6 +56,7 @@ extern void add_input_randomness(unsigned int type, unsigned int code, extern void add_interrupt_randomness(int irq, int irq_flags); extern void get_random_bytes(void *buf, int nbytes); +extern void get_random_bytes_arch(void *buf, int nbytes); void generate_random_uuid(unsigned char uuid_out[16]); #ifndef MODULE From 1edbd889fa5c0343d817ae6fe44ab95870a096b4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 4 Jul 2012 16:19:30 -0400 Subject: [PATCH 117/135] random: add tracepoints for easier debugging and verification commit 00ce1db1a634746040ace24c09a4e3a7949a3145 upstream. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 26 ++++++- include/trace/events/random.h | 134 ++++++++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+), 4 deletions(-) create mode 100644 include/trace/events/random.h diff --git a/drivers/char/random.c b/drivers/char/random.c index f4dc6d818894..e70701ec3b07 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -266,6 +266,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* * Configuration information */ @@ -478,8 +481,8 @@ static __u32 const twist_table[8] = { * it's cheap to do so and helps slightly in the expected case where * the entropy is concentrated in the low-order bits. */ -static void __mix_pool_bytes(struct entropy_store *r, const void *in, - int nbytes, __u8 out[64]) +static void _mix_pool_bytes(struct entropy_store *r, const void *in, + int nbytes, __u8 out[64]) { unsigned long i, j, tap1, tap2, tap3, tap4, tap5; int input_rotate; @@ -531,13 +534,21 @@ static void __mix_pool_bytes(struct entropy_store *r, const void *in, ((__u32 *)out)[j] = r->pool[(i - j) & wordmask]; } -static void mix_pool_bytes(struct entropy_store *r, const void *in, +static void __mix_pool_bytes(struct entropy_store *r, const void *in, int nbytes, __u8 out[64]) +{ + trace_mix_pool_bytes_nolock(r->name, nbytes, _RET_IP_); + _mix_pool_bytes(r, in, nbytes, out); +} + +static void mix_pool_bytes(struct entropy_store *r, const void *in, + int nbytes, __u8 out[64]) { unsigned long flags; + trace_mix_pool_bytes(r->name, nbytes, _RET_IP_); spin_lock_irqsave(&r->lock, flags); - __mix_pool_bytes(r, in, nbytes, out); + _mix_pool_bytes(r, in, nbytes, out); spin_unlock_irqrestore(&r->lock, flags); } @@ -585,6 +596,7 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits) retry: entropy_count = orig = ACCESS_ONCE(r->entropy_count); entropy_count += nbits; + if (entropy_count < 0) { DEBUG_ENT("negative entropy/overflow\n"); entropy_count = 0; @@ -599,6 +611,9 @@ retry: r->initialized = 1; } + trace_credit_entropy_bits(r->name, nbits, entropy_count, + r->entropy_total, _RET_IP_); + /* should we wake readers? */ if (r == &input_pool && entropy_count >= random_read_wakeup_thresh) { wake_up_interruptible(&random_read_wait); @@ -971,6 +986,7 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, ssize_t ret = 0, i; __u8 tmp[EXTRACT_SIZE]; + trace_extract_entropy(r->name, nbytes, r->entropy_count, _RET_IP_); xfer_secondary_pool(r, nbytes); nbytes = account(r, nbytes, min, reserved); @@ -1005,6 +1021,7 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, ssize_t ret = 0, i; __u8 tmp[EXTRACT_SIZE]; + trace_extract_entropy_user(r->name, nbytes, r->entropy_count, _RET_IP_); xfer_secondary_pool(r, nbytes); nbytes = account(r, nbytes, 0, 0); @@ -1062,6 +1079,7 @@ void get_random_bytes_arch(void *buf, int nbytes) { char *p = buf; + trace_get_random_bytes(nbytes, _RET_IP_); while (nbytes) { unsigned long v; int chunk = min(nbytes, (int)sizeof(unsigned long)); diff --git a/include/trace/events/random.h b/include/trace/events/random.h new file mode 100644 index 000000000000..422df19de732 --- /dev/null +++ b/include/trace/events/random.h @@ -0,0 +1,134 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM random + +#if !defined(_TRACE_RANDOM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RANDOM_H + +#include +#include + +DECLARE_EVENT_CLASS(random__mix_pool_bytes, + TP_PROTO(const char *pool_name, int bytes, unsigned long IP), + + TP_ARGS(pool_name, bytes, IP), + + TP_STRUCT__entry( + __field( const char *, pool_name ) + __field( int, bytes ) + __field(unsigned long, IP ) + ), + + TP_fast_assign( + __entry->pool_name = pool_name; + __entry->bytes = bytes; + __entry->IP = IP; + ), + + TP_printk("%s pool: bytes %d caller %pF", + __entry->pool_name, __entry->bytes, (void *)__entry->IP) +); + +DEFINE_EVENT(random__mix_pool_bytes, mix_pool_bytes, + TP_PROTO(const char *pool_name, int bytes, unsigned long IP), + + TP_ARGS(pool_name, bytes, IP) +); + +DEFINE_EVENT(random__mix_pool_bytes, mix_pool_bytes_nolock, + TP_PROTO(const char *pool_name, int bytes, unsigned long IP), + + TP_ARGS(pool_name, bytes, IP) +); + +TRACE_EVENT(credit_entropy_bits, + TP_PROTO(const char *pool_name, int bits, int entropy_count, + int entropy_total, unsigned long IP), + + TP_ARGS(pool_name, bits, entropy_count, entropy_total, IP), + + TP_STRUCT__entry( + __field( const char *, pool_name ) + __field( int, bits ) + __field( int, entropy_count ) + __field( int, entropy_total ) + __field(unsigned long, IP ) + ), + + TP_fast_assign( + __entry->pool_name = pool_name; + __entry->bits = bits; + __entry->entropy_count = entropy_count; + __entry->entropy_total = entropy_total; + __entry->IP = IP; + ), + + TP_printk("%s pool: bits %d entropy_count %d entropy_total %d " + "caller %pF", __entry->pool_name, __entry->bits, + __entry->entropy_count, __entry->entropy_total, + (void *)__entry->IP) +); + +TRACE_EVENT(get_random_bytes, + TP_PROTO(int nbytes, unsigned long IP), + + TP_ARGS(nbytes, IP), + + TP_STRUCT__entry( + __field( int, nbytes ) + __field(unsigned long, IP ) + ), + + TP_fast_assign( + __entry->nbytes = nbytes; + __entry->IP = IP; + ), + + TP_printk("nbytes %d caller %pF", __entry->nbytes, (void *)__entry->IP) +); + +DECLARE_EVENT_CLASS(random__extract_entropy, + TP_PROTO(const char *pool_name, int nbytes, int entropy_count, + unsigned long IP), + + TP_ARGS(pool_name, nbytes, entropy_count, IP), + + TP_STRUCT__entry( + __field( const char *, pool_name ) + __field( int, nbytes ) + __field( int, entropy_count ) + __field(unsigned long, IP ) + ), + + TP_fast_assign( + __entry->pool_name = pool_name; + __entry->nbytes = nbytes; + __entry->entropy_count = entropy_count; + __entry->IP = IP; + ), + + TP_printk("%s pool: nbytes %d entropy_count %d caller %pF", + __entry->pool_name, __entry->nbytes, __entry->entropy_count, + (void *)__entry->IP) +); + + +DEFINE_EVENT(random__extract_entropy, extract_entropy, + TP_PROTO(const char *pool_name, int nbytes, int entropy_count, + unsigned long IP), + + TP_ARGS(pool_name, nbytes, entropy_count, IP) +); + +DEFINE_EVENT(random__extract_entropy, extract_entropy_user, + TP_PROTO(const char *pool_name, int nbytes, int entropy_count, + unsigned long IP), + + TP_ARGS(pool_name, nbytes, entropy_count, IP) +); + + + +#endif /* _TRACE_RANDOM_H */ + +/* This part must be outside protection */ +#include From 0789520922496716eff4381e0ef724e66ead63e5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 4 Jul 2012 11:32:48 -0400 Subject: [PATCH 118/135] MAINTAINERS: Theodore Ts'o is taking over the random driver commit 330e0a01d54c2b8606c56816f99af6ebc58ec92c upstream. Matt Mackall stepped down as the /dev/random driver maintainer last year, so Theodore Ts'o is taking back the /dev/random driver. Cc: Matt Mackall Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index de85391c021a..c8c087443817 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5247,7 +5247,7 @@ F: Documentation/blockdev/ramdisk.txt F: drivers/block/brd.c RANDOM NUMBER DRIVER -M: Matt Mackall +M: Theodore Ts'o" S: Maintained F: drivers/char/random.c From 2fcadd93622a8e130d98dd214aa75a17fc9b0ffc Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 5 Jul 2012 20:19:17 +0000 Subject: [PATCH 119/135] rtc: wm831x: Feed the write counter into device_add_randomness() commit 9dccf55f4cb011a7552a8a2749a580662f5ed8ed upstream. The tamper evident features of the RTC include the "write counter" which is a pseudo-random number regenerated whenever we set the RTC. Since this value is unpredictable it should provide some useful seeding to the random number generator. Only do this on boot since the goal is to seed the pool rather than add useful entropy. Signed-off-by: Mark Brown Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- drivers/rtc/rtc-wm831x.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c index bdc909bd56da..f3c211099a14 100644 --- a/drivers/rtc/rtc-wm831x.c +++ b/drivers/rtc/rtc-wm831x.c @@ -24,7 +24,7 @@ #include #include #include - +#include /* * R16416 (0x4020) - RTC Write Counter @@ -96,6 +96,26 @@ struct wm831x_rtc { unsigned int alarm_enabled:1; }; +static void wm831x_rtc_add_randomness(struct wm831x *wm831x) +{ + int ret; + u16 reg; + + /* + * The write counter contains a pseudo-random number which is + * regenerated every time we set the RTC so it should be a + * useful per-system source of entropy. + */ + ret = wm831x_reg_read(wm831x, WM831X_RTC_WRITE_COUNTER); + if (ret >= 0) { + reg = ret; + add_device_randomness(®, sizeof(reg)); + } else { + dev_warn(wm831x->dev, "Failed to read RTC write counter: %d\n", + ret); + } +} + /* * Read current time and date in RTC */ @@ -449,6 +469,8 @@ static int wm831x_rtc_probe(struct platform_device *pdev) alm_irq, ret); } + wm831x_rtc_add_randomness(wm831x); + return 0; err: From f99ef862a7235bb06e963bd99e3c4287f5b8f6fd Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 5 Jul 2012 20:23:21 +0000 Subject: [PATCH 120/135] mfd: wm831x: Feed the device UUID into device_add_randomness() commit 27130f0cc3ab97560384da437e4621fc4e94f21c upstream. wm831x devices contain a unique ID value. Feed this into the newly added device_add_randomness() to add some per device seed data to the pool. Signed-off-by: Mark Brown Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- drivers/mfd/wm831x-otp.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/mfd/wm831x-otp.c b/drivers/mfd/wm831x-otp.c index f742745ff354..b90f3e06b6c9 100644 --- a/drivers/mfd/wm831x-otp.c +++ b/drivers/mfd/wm831x-otp.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -66,6 +67,7 @@ static DEVICE_ATTR(unique_id, 0444, wm831x_unique_id_show, NULL); int wm831x_otp_init(struct wm831x *wm831x) { + char uuid[WM831X_UNIQUE_ID_LEN]; int ret; ret = device_create_file(wm831x->dev, &dev_attr_unique_id); @@ -73,6 +75,12 @@ int wm831x_otp_init(struct wm831x *wm831x) dev_err(wm831x->dev, "Unique ID attribute not created: %d\n", ret); + ret = wm831x_unique_id_read(wm831x, uuid); + if (ret == 0) + add_device_randomness(uuid, sizeof(uuid)); + else + dev_err(wm831x->dev, "Failed to read UUID: %d\n", ret); + return ret; } From b6b847a93be87fc9974d8232984668a5a59754df Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 14 Jul 2012 20:27:52 -0400 Subject: [PATCH 121/135] random: remove rand_initialize_irq() commit c5857ccf293968348e5eb4ebedc68074de3dcda6 upstream. With the new interrupt sampling system, we are no longer using the timer_rand_state structure in the irq descriptor, so we can stop initializing it now. [ Merged in fixes from Sedat to find some last missing references to rand_initialize_irq() ] Signed-off-by: "Theodore Ts'o" Signed-off-by: Sedat Dilek Signed-off-by: Greg Kroah-Hartman --- arch/ia64/kernel/irq_ia64.c | 1 - drivers/char/random.c | 55 ------------------------------------- drivers/mfd/ab3100-core.c | 3 -- drivers/mfd/ab3550-core.c | 2 -- include/linux/irqdesc.h | 1 - include/linux/random.h | 2 -- kernel/irq/manage.c | 17 ------------ 7 files changed, 81 deletions(-) diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 782c3a357f24..3540c5e80426 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -23,7 +23,6 @@ #include #include #include -#include /* for rand_initialize_irq() */ #include #include #include diff --git a/drivers/char/random.c b/drivers/char/random.c index e70701ec3b07..5df09b014ea1 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -634,43 +634,6 @@ struct timer_rand_state { unsigned dont_count_entropy:1; }; -#ifndef CONFIG_GENERIC_HARDIRQS - -static struct timer_rand_state *irq_timer_state[NR_IRQS]; - -static struct timer_rand_state *get_timer_rand_state(unsigned int irq) -{ - return irq_timer_state[irq]; -} - -static void set_timer_rand_state(unsigned int irq, - struct timer_rand_state *state) -{ - irq_timer_state[irq] = state; -} - -#else - -static struct timer_rand_state *get_timer_rand_state(unsigned int irq) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - return desc->timer_rand_state; -} - -static void set_timer_rand_state(unsigned int irq, - struct timer_rand_state *state) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - desc->timer_rand_state = state; -} -#endif - /* * Add device- or boot-specific data to the input and nonblocking * pools to help initialize them to unique values. @@ -1133,24 +1096,6 @@ static int rand_initialize(void) } module_init(rand_initialize); -void rand_initialize_irq(int irq) -{ - struct timer_rand_state *state; - - state = get_timer_rand_state(irq); - - if (state) - return; - - /* - * If kzalloc returns null, we just won't use that entropy - * source. - */ - state = kzalloc(sizeof(struct timer_rand_state), GFP_KERNEL); - if (state) - set_timer_rand_state(irq, state); -} - #ifdef CONFIG_BLOCK void rand_initialize_disk(struct gendisk *disk) { diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c index c0befd3ad98f..ccd81b1540a2 100644 --- a/drivers/mfd/ab3100-core.c +++ b/drivers/mfd/ab3100-core.c @@ -936,9 +936,6 @@ static int __devinit ab3100_probe(struct i2c_client *client, err = request_threaded_irq(client->irq, NULL, ab3100_irq_handler, IRQF_ONESHOT, "ab3100-core", ab3100); - /* This real unpredictable IRQ is of course sampled for entropy */ - rand_initialize_irq(client->irq); - if (err) goto exit_no_irq; diff --git a/drivers/mfd/ab3550-core.c b/drivers/mfd/ab3550-core.c index 3d7dce671b93..d69dc4bf8bbd 100644 --- a/drivers/mfd/ab3550-core.c +++ b/drivers/mfd/ab3550-core.c @@ -1309,8 +1309,6 @@ static int __init ab3550_probe(struct i2c_client *client, err = request_threaded_irq(client->irq, NULL, ab3550_irq_handler, IRQF_ONESHOT, "ab3550-core", ab); - /* This real unpredictable IRQ is of course sampled for entropy */ - rand_initialize_irq(client->irq); if (err) goto exit_no_irq; diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 2d921b35212c..d0a3100b4061 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -38,7 +38,6 @@ struct timer_rand_state; */ struct irq_desc { struct irq_data irq_data; - struct timer_rand_state *timer_rand_state; unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; #ifdef CONFIG_IRQ_PREFLOW_FASTEOI diff --git a/include/linux/random.h b/include/linux/random.h index 29e217a7e6d0..ac621ce886ca 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -48,8 +48,6 @@ struct rnd_state { #ifdef __KERNEL__ -extern void rand_initialize_irq(int irq); - extern void add_device_randomness(const void *, unsigned int); extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index df8136fff8cc..fa4a70ee2720 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -886,22 +886,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; - /* - * Some drivers like serial.c use request_irq() heavily, - * so we have to be careful not to interfere with a - * running system. - */ - if (new->flags & IRQF_SAMPLE_RANDOM) { - /* - * This function might sleep, we want to call it first, - * outside of the atomic block. - * Yes, this might clear the entropy pool if the wrong - * driver is attempted to be loaded, without actually - * installing a new handler, but is this really a problem, - * only the sysadmin is able to do this. - */ - rand_initialize_irq(irq); - } /* * Check whether the interrupt nests into another interrupt @@ -1325,7 +1309,6 @@ EXPORT_SYMBOL(free_irq); * Flags: * * IRQF_SHARED Interrupt is shared - * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy * IRQF_TRIGGER_* Specify active edge(s) or level * */ From dbbdd2bb89716814a5336105d8b3e6fd64ff0886 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 23 Jul 2012 09:47:57 -0700 Subject: [PATCH 122/135] random: Add comment to random_initialize() commit cbc96b7594b5691d61eba2db8b2ea723645be9ca upstream. Many platforms have per-machine instance data (serial numbers, asset tags, etc.) squirreled away in areas that are accessed during early system bringup. Mixing this data into the random pools has a very high value in providing better random data, so we should allow (and even encourage) architecture code to call add_device_randomness() from the setup_arch() paths. However, this limits our options for internal structure of the random driver since random_initialize() is not called until long after setup_arch(). Add a big fat comment to rand_initialize() spelling out this requirement. Suggested-by: Theodore Ts'o Signed-off-by: Tony Luck Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/char/random.c b/drivers/char/random.c index 5df09b014ea1..c2e25afdbe5a 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1087,6 +1087,16 @@ static void init_std_data(struct entropy_store *r) mix_pool_bytes(r, utsname(), sizeof(*(utsname())), NULL); } +/* + * Note that setup_arch() may call add_device_randomness() + * long before we get here. This allows seeding of the pools + * with some platform dependent data very early in the boot + * process. But it limits our options here. We must use + * statically allocated structures that already have all + * initializations complete at compile time. We should also + * take care not to overwrite the precious per platform data + * we were given. + */ static int rand_initialize(void) { init_std_data(&input_pool); From 4f4cb6f72c4e95cac32316b2be4a0b344513910a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 20 Jul 2012 13:15:20 -0700 Subject: [PATCH 123/135] dmi: Feed DMI table to /dev/random driver commit d114a33387472555188f142ed8e98acdb8181c6d upstream. Send the entire DMI (SMBIOS) table to the /dev/random driver to help seed its pools. Signed-off-by: Tony Luck Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/dmi_scan.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index bcb1126e3d00..02a52d13a467 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -6,6 +6,7 @@ #include #include #include +#include #include /* @@ -111,6 +112,8 @@ static int __init dmi_walk_early(void (*decode)(const struct dmi_header *, dmi_table(buf, dmi_len, dmi_num, decode, NULL); + add_device_randomness(buf, dmi_len); + dmi_iounmap(buf, dmi_len); return 0; } From 9f6082404e1d17ea7a9745c141dd91dba2102c57 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 27 Jul 2012 22:26:08 -0400 Subject: [PATCH 124/135] random: mix in architectural randomness in extract_buf() commit d2e7c96af1e54b507ae2a6a7dd2baf588417a7e5 upstream. Mix in any architectural randomness in extract_buf() instead of xfer_secondary_buf(). This allows us to mix in more architectural randomness, and it also makes xfer_secondary_buf() faster, moving a tiny bit of additional CPU overhead to process which is extracting the randomness. [ Commit description modified by tytso to remove an extended advertisement for the RDRAND instruction. ] Signed-off-by: H. Peter Anvin Acked-by: Ingo Molnar Cc: DJ Johnston Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- drivers/char/random.c | 56 ++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index c2e25afdbe5a..fceac955bfb8 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -277,6 +277,8 @@ #define SEC_XFER_SIZE 512 #define EXTRACT_SIZE 10 +#define LONGS(x) (((x) + sizeof(unsigned long) - 1)/sizeof(unsigned long)) + /* * The minimum number of bits of entropy before we wake up a read on * /dev/random. Should be enough to do a significant reseed. @@ -813,11 +815,7 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, */ static void xfer_secondary_pool(struct entropy_store *r, size_t nbytes) { - union { - __u32 tmp[OUTPUT_POOL_WORDS]; - long hwrand[4]; - } u; - int i; + __u32 tmp[OUTPUT_POOL_WORDS]; if (r->pull && r->entropy_count < nbytes * 8 && r->entropy_count < r->poolinfo->POOLBITS) { @@ -828,23 +826,17 @@ static void xfer_secondary_pool(struct entropy_store *r, size_t nbytes) /* pull at least as many as BYTES as wakeup BITS */ bytes = max_t(int, bytes, random_read_wakeup_thresh / 8); /* but never more than the buffer size */ - bytes = min_t(int, bytes, sizeof(u.tmp)); + bytes = min_t(int, bytes, sizeof(tmp)); DEBUG_ENT("going to reseed %s with %d bits " "(%d of %d requested)\n", r->name, bytes * 8, nbytes * 8, r->entropy_count); - bytes = extract_entropy(r->pull, u.tmp, bytes, + bytes = extract_entropy(r->pull, tmp, bytes, random_read_wakeup_thresh / 8, rsvd); - mix_pool_bytes(r, u.tmp, bytes, NULL); + mix_pool_bytes(r, tmp, bytes, NULL); credit_entropy_bits(r, bytes*8); } - kmemcheck_mark_initialized(&u.hwrand, sizeof(u.hwrand)); - for (i = 0; i < 4; i++) - if (arch_get_random_long(&u.hwrand[i])) - break; - if (i) - mix_pool_bytes(r, &u.hwrand, sizeof(u.hwrand), 0); } /* @@ -901,15 +893,19 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min, static void extract_buf(struct entropy_store *r, __u8 *out) { int i; - __u32 hash[5], workspace[SHA_WORKSPACE_WORDS]; + union { + __u32 w[5]; + unsigned long l[LONGS(EXTRACT_SIZE)]; + } hash; + __u32 workspace[SHA_WORKSPACE_WORDS]; __u8 extract[64]; unsigned long flags; /* Generate a hash across the pool, 16 words (512 bits) at a time */ - sha_init(hash); + sha_init(hash.w); spin_lock_irqsave(&r->lock, flags); for (i = 0; i < r->poolinfo->poolwords; i += 16) - sha_transform(hash, (__u8 *)(r->pool + i), workspace); + sha_transform(hash.w, (__u8 *)(r->pool + i), workspace); /* * We mix the hash back into the pool to prevent backtracking @@ -920,14 +916,14 @@ static void extract_buf(struct entropy_store *r, __u8 *out) * brute-forcing the feedback as hard as brute-forcing the * hash. */ - __mix_pool_bytes(r, hash, sizeof(hash), extract); + __mix_pool_bytes(r, hash.w, sizeof(hash.w), extract); spin_unlock_irqrestore(&r->lock, flags); /* * To avoid duplicates, we atomically extract a portion of the * pool while mixing, and hash one final time. */ - sha_transform(hash, extract, workspace); + sha_transform(hash.w, extract, workspace); memset(extract, 0, sizeof(extract)); memset(workspace, 0, sizeof(workspace)); @@ -936,11 +932,23 @@ static void extract_buf(struct entropy_store *r, __u8 *out) * pattern, we fold it in half. Thus, we always feed back * twice as much data as we output. */ - hash[0] ^= hash[3]; - hash[1] ^= hash[4]; - hash[2] ^= rol32(hash[2], 16); - memcpy(out, hash, EXTRACT_SIZE); - memset(hash, 0, sizeof(hash)); + hash.w[0] ^= hash.w[3]; + hash.w[1] ^= hash.w[4]; + hash.w[2] ^= rol32(hash.w[2], 16); + + /* + * If we have a architectural hardware random number + * generator, mix that in, too. + */ + for (i = 0; i < LONGS(EXTRACT_SIZE); i++) { + unsigned long v; + if (!arch_get_random_long(&v)) + break; + hash.l[i] ^= v; + } + + memcpy(out, &hash, EXTRACT_SIZE); + memset(&hash, 0, sizeof(hash)); } static ssize_t extract_entropy(struct entropy_store *r, void *buf, From d426c78930e6e01373e3007df81161855f5b6ec1 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Sun, 6 May 2012 11:11:04 -0600 Subject: [PATCH 125/135] x86, microcode: microcode_core.c simple_strtoul cleanup commit e826abd523913f63eb03b59746ffb16153c53dc4 upstream. Change reload_for_cpu() in kernel/microcode_core.c to call kstrtoul() instead of calling obsoleted simple_strtoul(). Signed-off-by: Shuah Khan Reviewed-by: Borislav Petkov Link: http://lkml.kernel.org/r/1336324264.2897.9.camel@lorien2 Signed-off-by: H. Peter Anvin Cc: Henrique de Moraes Holschuh Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/microcode_core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index f9242800bc84..b9e52a538614 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -298,12 +298,11 @@ static ssize_t reload_store(struct sys_device *dev, { unsigned long val; int cpu = dev->id; - int ret = 0; - char *end; + ssize_t ret = 0; - val = simple_strtoul(buf, &end, 0); - if (end == buf) - return -EINVAL; + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; if (val == 1) { get_online_cpus(); From bb014c405d5e6d92fd9617bdfc29b29ad2bf6588 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 21 Jun 2012 14:07:16 +0200 Subject: [PATCH 126/135] x86, microcode: Sanitize per-cpu microcode reloading interface commit c9fc3f778a6a215ace14ee556067c73982b6d40f upstream. Microcode reloading in a per-core manner is a very bad idea for both major x86 vendors. And the thing is, we have such interface with which we can end up with different microcode versions applied on different cores of an otherwise homogeneous wrt (family,model,stepping) system. So turn off the possibility of doing that per core and allow it only system-wide. This is a minimal fix which we'd like to see in stable too thus the more-or-less arbitrary decision to allow system-wide reloading only on the BSP: $ echo 1 > /sys/devices/system/cpu/cpu0/microcode/reload ... and disable the interface on the other cores: $ echo 1 > /sys/devices/system/cpu/cpu23/microcode/reload -bash: echo: write error: Invalid argument Also, allowing the reload only from one CPU (the BSP in that case) doesn't allow the reload procedure to degenerate into an O(n^2) deal when triggering reloads from all /sys/devices/system/cpu/cpuX/microcode/reload sysfs nodes simultaneously. A more generic fix will follow. Signed-off-by: Borislav Petkov Cc: Henrique de Moraes Holschuh Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1340280437-7718-2-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/microcode_core.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index b9e52a538614..c4e246541c54 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -297,19 +297,31 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t size) { unsigned long val; - int cpu = dev->id; - ssize_t ret = 0; + int cpu; + ssize_t ret = 0, tmp_ret; + + /* allow reload only from the BSP */ + if (boot_cpu_data.cpu_index != dev->id) + return -EINVAL; ret = kstrtoul(buf, 0, &val); if (ret) return ret; - if (val == 1) { - get_online_cpus(); - if (cpu_online(cpu)) - ret = reload_for_cpu(cpu); - put_online_cpus(); + if (val != 1) + return size; + + get_online_cpus(); + for_each_online_cpu(cpu) { + tmp_ret = reload_for_cpu(cpu); + if (tmp_ret != 0) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + + /* save retval of the first encountered reload error */ + if (!ret) + ret = tmp_ret; } + put_online_cpus(); if (!ret) ret = size; From 4c9682c5269e3c63ef9009ad20ea4e150370b7e0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:46:20 -0700 Subject: [PATCH 127/135] mm: hugetlbfs: close race during teardown of hugetlbfs shared page tables commit d833352a4338dc31295ed832a30c9ccff5c7a183 upstream. If a process creates a large hugetlbfs mapping that is eligible for page table sharing and forks heavily with children some of whom fault and others which destroy the mapping then it is possible for page tables to get corrupted. Some teardowns of the mapping encounter a "bad pmd" and output a message to the kernel log. The final teardown will trigger a BUG_ON in mm/filemap.c. This was reproduced in 3.4 but is known to have existed for a long time and goes back at least as far as 2.6.37. It was probably was introduced in 2.6.20 by [39dde65c: shared page table for hugetlb page]. The messages look like this; [ ..........] Lots of bad pmd messages followed by this [ 127.164256] mm/memory.c:391: bad pmd ffff880412e04fe8(80000003de4000e7). [ 127.164257] mm/memory.c:391: bad pmd ffff880412e04ff0(80000003de6000e7). [ 127.164258] mm/memory.c:391: bad pmd ffff880412e04ff8(80000003de0000e7). [ 127.186778] ------------[ cut here ]------------ [ 127.186781] kernel BUG at mm/filemap.c:134! [ 127.186782] invalid opcode: 0000 [#1] SMP [ 127.186783] CPU 7 [ 127.186784] Modules linked in: af_packet cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf ext3 jbd dm_mod coretemp crc32c_intel usb_storage ghash_clmulni_intel aesni_intel i2c_i801 r8169 mii uas sr_mod cdrom sg iTCO_wdt iTCO_vendor_support shpchp serio_raw cryptd aes_x86_64 e1000e pci_hotplug dcdbas aes_generic container microcode ext4 mbcache jbd2 crc16 sd_mod crc_t10dif i915 drm_kms_helper drm i2c_algo_bit ehci_hcd ahci libahci usbcore rtc_cmos usb_common button i2c_core intel_agp video intel_gtt fan processor thermal thermal_sys hwmon ata_generic pata_atiixp libata scsi_mod [ 127.186801] [ 127.186802] Pid: 9017, comm: hugetlbfs-test Not tainted 3.4.0-autobuild #53 Dell Inc. OptiPlex 990/06D7TR [ 127.186804] RIP: 0010:[] [] __delete_from_page_cache+0x15e/0x160 [ 127.186809] RSP: 0000:ffff8804144b5c08 EFLAGS: 00010002 [ 127.186810] RAX: 0000000000000001 RBX: ffffea000a5c9000 RCX: 00000000ffffffc0 [ 127.186811] RDX: 0000000000000000 RSI: 0000000000000009 RDI: ffff88042dfdad00 [ 127.186812] RBP: ffff8804144b5c18 R08: 0000000000000009 R09: 0000000000000003 [ 127.186813] R10: 0000000000000000 R11: 000000000000002d R12: ffff880412ff83d8 [ 127.186814] R13: ffff880412ff83d8 R14: 0000000000000000 R15: ffff880412ff83d8 [ 127.186815] FS: 00007fe18ed2c700(0000) GS:ffff88042dce0000(0000) knlGS:0000000000000000 [ 127.186816] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 127.186817] CR2: 00007fe340000503 CR3: 0000000417a14000 CR4: 00000000000407e0 [ 127.186818] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 127.186819] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 127.186820] Process hugetlbfs-test (pid: 9017, threadinfo ffff8804144b4000, task ffff880417f803c0) [ 127.186821] Stack: [ 127.186822] ffffea000a5c9000 0000000000000000 ffff8804144b5c48 ffffffff810ed83b [ 127.186824] ffff8804144b5c48 000000000000138a 0000000000001387 ffff8804144b5c98 [ 127.186825] ffff8804144b5d48 ffffffff811bc925 ffff8804144b5cb8 0000000000000000 [ 127.186827] Call Trace: [ 127.186829] [] delete_from_page_cache+0x3b/0x80 [ 127.186832] [] truncate_hugepages+0x115/0x220 [ 127.186834] [] hugetlbfs_evict_inode+0x13/0x30 [ 127.186837] [] evict+0xa7/0x1b0 [ 127.186839] [] iput_final+0xd3/0x1f0 [ 127.186840] [] iput+0x39/0x50 [ 127.186842] [] d_kill+0xf8/0x130 [ 127.186843] [] dput+0xd2/0x1a0 [ 127.186845] [] __fput+0x170/0x230 [ 127.186848] [] ? rb_erase+0xce/0x150 [ 127.186849] [] fput+0x1d/0x30 [ 127.186851] [] remove_vma+0x37/0x80 [ 127.186853] [] do_munmap+0x2d2/0x360 [ 127.186855] [] sys_shmdt+0xc9/0x170 [ 127.186857] [] system_call_fastpath+0x16/0x1b [ 127.186858] Code: 0f 1f 44 00 00 48 8b 43 08 48 8b 00 48 8b 40 28 8b b0 40 03 00 00 85 f6 0f 88 df fe ff ff 48 89 df e8 e7 cb 05 00 e9 d2 fe ff ff <0f> 0b 55 83 e2 fd 48 89 e5 48 83 ec 30 48 89 5d d8 4c 89 65 e0 [ 127.186868] RIP [] __delete_from_page_cache+0x15e/0x160 [ 127.186870] RSP [ 127.186871] ---[ end trace 7cbac5d1db69f426 ]--- The bug is a race and not always easy to reproduce. To reproduce it I was doing the following on a single socket I7-based machine with 16G of RAM. $ hugeadm --pool-pages-max DEFAULT:13G $ echo $((18*1048576*1024)) > /proc/sys/kernel/shmmax $ echo $((18*1048576*1024)) > /proc/sys/kernel/shmall $ for i in `seq 1 9000`; do ./hugetlbfs-test; done On my particular machine, it usually triggers within 10 minutes but enabling debug options can change the timing such that it never hits. Once the bug is triggered, the machine is in trouble and needs to be rebooted. The machine will respond but processes accessing proc like "ps aux" will hang due to the BUG_ON. shutdown will also hang and needs a hard reset or a sysrq-b. The basic problem is a race between page table sharing and teardown. For the most part page table sharing depends on i_mmap_mutex. In some cases, it is also taking the mm->page_table_lock for the PTE updates but with shared page tables, it is the i_mmap_mutex that is more important. Unfortunately it appears to be also insufficient. Consider the following situation Process A Process B --------- --------- hugetlb_fault shmdt LockWrite(mmap_sem) do_munmap unmap_region unmap_vmas unmap_single_vma unmap_hugepage_range Lock(i_mmap_mutex) Lock(mm->page_table_lock) huge_pmd_unshare/unmap tables <--- (1) Unlock(mm->page_table_lock) Unlock(i_mmap_mutex) huge_pte_alloc ... Lock(i_mmap_mutex) ... vma_prio_walk, find svma, spte ... Lock(mm->page_table_lock) ... share spte ... Unlock(mm->page_table_lock) ... Unlock(i_mmap_mutex) ... hugetlb_no_page <--- (2) free_pgtables unlink_file_vma hugetlb_free_pgd_range remove_vma_list In this scenario, it is possible for Process A to share page tables with Process B that is trying to tear them down. The i_mmap_mutex on its own does not prevent Process A walking Process B's page tables. At (1) above, the page tables are not shared yet so it unmaps the PMDs. Process A sets up page table sharing and at (2) faults a new entry. Process B then trips up on it in free_pgtables. This patch fixes the problem by adding a new function __unmap_hugepage_range_final that is only called when the VMA is about to be destroyed. This function clears VM_MAYSHARE during unmap_hugepage_range() under the i_mmap_mutex. This makes the VMA ineligible for sharing and avoids the race. Superficially this looks like it would then be vunerable to truncate and madvise issues but hugetlbfs has its own truncate handlers so does not use unmap_mapping_range() and does not support madvise(DONTNEED). This should be treated as a -stable candidate if it is merged. Test program is as follows. The test case was mostly written by Michal Hocko with a few minor changes to reproduce this bug. ==== CUT HERE ==== static size_t huge_page_size = (2UL << 20); static size_t nr_huge_page_A = 512; static size_t nr_huge_page_B = 5632; unsigned int get_random(unsigned int max) { struct timeval tv; gettimeofday(&tv, NULL); srandom(tv.tv_usec); return random() % max; } static void play(void *addr, size_t size) { unsigned char *start = addr, *end = start + size, *a; start += get_random(size/2); /* we could itterate on huge pages but let's give it more time. */ for (a = start; a < end; a += 4096) *a = 0; } int main(int argc, char **argv) { key_t key = IPC_PRIVATE; size_t sizeA = nr_huge_page_A * huge_page_size; size_t sizeB = nr_huge_page_B * huge_page_size; int shmidA, shmidB; void *addrA = NULL, *addrB = NULL; int nr_children = 300, n = 0; if ((shmidA = shmget(key, sizeA, IPC_CREAT|SHM_HUGETLB|0660)) == -1) { perror("shmget:"); return 1; } if ((addrA = shmat(shmidA, addrA, SHM_R|SHM_W)) == (void *)-1UL) { perror("shmat"); return 1; } if ((shmidB = shmget(key, sizeB, IPC_CREAT|SHM_HUGETLB|0660)) == -1) { perror("shmget:"); return 1; } if ((addrB = shmat(shmidB, addrB, SHM_R|SHM_W)) == (void *)-1UL) { perror("shmat"); return 1; } fork_child: switch(fork()) { case 0: switch (n%3) { case 0: play(addrA, sizeA); break; case 1: play(addrB, sizeB); break; case 2: break; } break; case -1: perror("fork:"); break; default: if (++n < nr_children) goto fork_child; play(addrA, sizeA); break; } shmdt(addrA); shmdt(addrB); do { wait(NULL); } while (--n > 0); shmctl(shmidA, IPC_RMID, NULL); shmctl(shmidB, IPC_RMID, NULL); return 0; } [akpm@linux-foundation.org: name the declaration's args, fix CONFIG_HUGETLBFS=n build] Signed-off-by: Hugh Dickins Reviewed-by: Michal Hocko Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/hugetlb.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ae60a53f0506..037f077b9865 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2301,6 +2301,22 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, { mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); __unmap_hugepage_range(vma, start, end, ref_page); + /* + * Clear this flag so that x86's huge_pmd_share page_table_shareable + * test will fail on a vma being torn down, and not grab a page table + * on its way out. We're lucky that the flag has such an appropriate + * name, and can in fact be safely cleared here. We could clear it + * before the __unmap_hugepage_range above, but all that's necessary + * is to clear it before releasing the i_mmap_mutex below. + * + * This works because in the contexts this is called, the VMA is + * going to be destroyed. It is not vunerable to madvise(DONTNEED) + * because madvise is not supported on hugetlbfs. The same applies + * for direct IO. unmap_hugepage_range() is only being called just + * before free_pgtables() so clearing VM_MAYSHARE will not cause + * surprises later. + */ + vma->vm_flags &= ~VM_MAYSHARE; mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); } @@ -2853,9 +2869,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, } } spin_unlock(&mm->page_table_lock); - mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); - + /* + * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare + * may have cleared our pud entry and done put_page on the page table: + * once we release i_mmap_mutex, another task can do the final put_page + * and that page table be reused and filled with junk. + */ flush_tlb_range(vma, start, end); + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); } int hugetlb_reserve_pages(struct inode *inode, From c75f1f090ef88416c6e299b6cb07edcabe11575b Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 3 Aug 2012 20:54:48 +0200 Subject: [PATCH 128/135] ARM: mxs: Remove MMAP_MIN_ADDR setting from mxs_defconfig commit 3bed491c8d28329e34f8a31e3fe64d03f3a350f1 upstream. The CONFIG_DEFAULT_MMAP_MIN_ADDR was set to 65536 in mxs_defconfig, this caused severe breakage of userland applications since the upper limit for ARM is 32768. By default CONFIG_DEFAULT_MMAP_MIN_ADDR is set to 4096 and can also be changed via /proc/sys/vm/mmap_min_addr if needed. Quoting Russell King [1]: "4096 is also fine for ARM too. There's not much point in having defconfigs change it - that would just be pure noise in the config files." the CONFIG_DEFAULT_MMAP_MIN_ADDR can be removed from the defconfig altogether. This problem was introduced by commit cde7c41 (ARM: configs: add defconfig for mach-mxs). [1] http://marc.info/?l=linux-arm-kernel&m=134401593807820&w=2 Signed-off-by: Marek Vasut Cc: Russell King Cc: Wolfgang Denk Signed-off-by: Shawn Guo Signed-off-by: Greg Kroah-Hartman --- arch/arm/configs/mxs_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig index 2bf224310fb4..166d6aa97c4d 100644 --- a/arch/arm/configs/mxs_defconfig +++ b/arch/arm/configs/mxs_defconfig @@ -29,7 +29,6 @@ CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_PREEMPT_VOLUNTARY=y CONFIG_AEABI=y -CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 CONFIG_AUTO_ZRELADDR=y CONFIG_FPE_NWFPE=y CONFIG_NET=y From 9f75ebd871f7f0a613fdb4e1231fbd540916872c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 5 Aug 2012 14:58:37 +0000 Subject: [PATCH 129/135] ARM: pxa: remove irq_to_gpio from ezx-pcap driver commit 59ee93a528b94ef4e81a08db252b0326feff171f upstream. The irq_to_gpio function was removed from the pxa platform in linux-3.2, and this driver has been broken since. There is actually no in-tree user of this driver that adds this platform device, but the driver can and does get enabled on some platforms. Without this patch, building ezx_defconfig results in: drivers/mfd/ezx-pcap.c: In function 'pcap_isr_work': drivers/mfd/ezx-pcap.c:205:2: error: implicit declaration of function 'irq_to_gpio' [-Werror=implicit-function-declaration] Signed-off-by: Arnd Bergmann Acked-by: Haojian Zhuang Cc: Samuel Ortiz Cc: Daniel Ribeiro Signed-off-by: Greg Kroah-Hartman --- drivers/mfd/ezx-pcap.c | 2 +- include/linux/mfd/ezx-pcap.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c index 43a76c41cfcc..db662e2dcfa5 100644 --- a/drivers/mfd/ezx-pcap.c +++ b/drivers/mfd/ezx-pcap.c @@ -202,7 +202,7 @@ static void pcap_isr_work(struct work_struct *work) } local_irq_enable(); ezx_pcap_write(pcap, PCAP_REG_MSR, pcap->msr); - } while (gpio_get_value(irq_to_gpio(pcap->spi->irq))); + } while (gpio_get_value(pdata->gpio)); } static void pcap_irq_handler(unsigned int irq, struct irq_desc *desc) diff --git a/include/linux/mfd/ezx-pcap.h b/include/linux/mfd/ezx-pcap.h index 40c372165f3e..32a1b5cfeba1 100644 --- a/include/linux/mfd/ezx-pcap.h +++ b/include/linux/mfd/ezx-pcap.h @@ -16,6 +16,7 @@ struct pcap_subdev { struct pcap_platform_data { unsigned int irq_base; unsigned int config; + int gpio; void (*init) (void *); /* board specific init */ int num_subdevs; struct pcap_subdev *subdevs; From b27c59d2c23d5f326a74e02a799cc4d00246165a Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Thu, 2 Aug 2012 18:41:48 +0100 Subject: [PATCH 130/135] cfg80211: process pending events when unregistering net device commit 1f6fc43e621167492ed4b7f3b4269c584c3d6ccc upstream. libertas currently calls cfg80211_disconnected() when it is being brought down. This causes an event to be allocated, but since the wdev is already removed from the rdev by the time that the event processing work executes, the event is never processed or freed. http://article.gmane.org/gmane.linux.kernel.wireless.general/95666 Fix this leak, and other possible situations, by processing the event queue when a device is being unregistered. Thanks to Johannes Berg for the suggestion. Signed-off-by: Daniel Drake Reviewed-by: Johannes Berg Signed-off-by: John W. Linville Signed-off-by: Greg Kroah-Hartman --- net/wireless/core.c | 5 +++++ net/wireless/core.h | 1 + net/wireless/util.c | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/net/wireless/core.c b/net/wireless/core.c index 880dbe2e6f94..498c760a1d2a 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -959,6 +959,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb, */ synchronize_rcu(); INIT_LIST_HEAD(&wdev->list); + /* + * Ensure that all events have been processed and + * freed. + */ + cfg80211_process_wdev_events(wdev); break; case NETDEV_PRE_UP: if (!(wdev->wiphy->interface_modes & BIT(wdev->iftype))) diff --git a/net/wireless/core.h b/net/wireless/core.h index a570ff9214ec..83516455a1d1 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -426,6 +426,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, u32 *flags, struct vif_params *params); void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); +void cfg80211_process_wdev_events(struct wireless_dev *wdev); int cfg80211_can_change_interface(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, diff --git a/net/wireless/util.c b/net/wireless/util.c index bbcb58e61ed5..4d69149eef2c 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -719,7 +719,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) wdev->connect_keys = NULL; } -static void cfg80211_process_wdev_events(struct wireless_dev *wdev) +void cfg80211_process_wdev_events(struct wireless_dev *wdev) { struct cfg80211_event *ev; unsigned long flags; From 4f5a5866aa4dccf42b2fa1cdecf372025a3e86cc Mon Sep 17 00:00:00 2001 From: Liang Li Date: Thu, 2 Aug 2012 18:55:41 -0400 Subject: [PATCH 131/135] cfg80211: fix interface combinations check for ADHOC(IBSS) partial of commit 8e8b41f9d8c8e63fc92f899ace8da91a490ac573 upstream. As part of commit 463454b5dbd8 ("cfg80211: fix interface combinations check"), this extra check was introduced: if ((all_iftypes & used_iftypes) != used_iftypes) goto cont; However, most wireless NIC drivers did not advertise ADHOC in wiphy.iface_combinations[i].limits[] and hence we'll get -EBUSY when we bring up a ADHOC wlan with commands similar to: # iwconfig wlan0 mode ad-hoc && ifconfig wlan0 up In commit 8e8b41f9d8c8e ("cfg80211: enforce lack of interface combinations"), the change below fixes the issue: if (total == 1) return 0; But it also introduces other dependencies for stable. For example, a full cherry pick of 8e8b41f9d8c8e would introduce additional regressions unless we also start cherry picking driver specific fixes like the following: 9b4760e ath5k: add possible wiphy interface combinations 1ae2fc2 mac80211_hwsim: advertise interface combinations 20c8e8d ath9k: add possible wiphy interface combinations And the purpose of the 'if (total == 1)' is to cover the specific use case (IBSS, adhoc) that was mentioned above. So we just pick the specific part out from 8e8b41f9d8c8e here. Doing so gives stable kernels a way to fix the change introduced by 463454b5dbd8, without having to make cherry picks specific to various NIC drivers. Signed-off-by: Liang Li Signed-off-by: Paul Gortmaker Signed-off-by: Greg Kroah-Hartman --- net/wireless/util.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/util.c b/net/wireless/util.c index 4d69149eef2c..18e22bef386d 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -975,6 +975,9 @@ int cfg80211_can_change_interface(struct cfg80211_registered_device *rdev, } mutex_unlock(&rdev->devlist_mtx); + if (total == 1) + return 0; + for (i = 0; i < rdev->wiphy.n_iface_combinations; i++) { const struct ieee80211_iface_combination *c; struct ieee80211_iface_limit *limits; From 5a4cebe94b76a0182a8ebb97b665ff84e7427482 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Tue, 31 Jul 2012 02:02:43 +0000 Subject: [PATCH 132/135] e1000e: NIC goes up and immediately goes down commit b7ec70be01a87f2c85df3ae11046e74f9b67e323 upstream. Found that commit d478eb44 was a bad commit. If the link partner is transmitting codeword (even if NULL codeword), then the RXCW.C bit will be set so check for RXCW.CW is unnecessary. Ref: RH BZ 840642 Reported-by: Fabio Futigami Signed-off-by: Tushar Dave CC: Marcelo Ricardo Leitner Tested-by: Aaron Brown Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Greg Kroah-Hartman --- drivers/net/e1000e/82571.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/e1000e/82571.c b/drivers/net/e1000e/82571.c index 5278e8456b68..0d0ee552d95c 100644 --- a/drivers/net/e1000e/82571.c +++ b/drivers/net/e1000e/82571.c @@ -1602,10 +1602,8 @@ static s32 e1000_check_for_serdes_link_82571(struct e1000_hw *hw) * auto-negotiation in the TXCW register and disable * forced link in the Device Control register in an * attempt to auto-negotiate with our link partner. - * If the partner code word is null, stop forcing - * and restart auto negotiation. */ - if ((rxcw & E1000_RXCW_C) || !(rxcw & E1000_RXCW_CW)) { + if (rxcw & E1000_RXCW_C) { /* Enable autoneg, and unforce link up */ ew32(TXCW, mac->txcw); ew32(CTRL, (ctrl & ~E1000_CTRL_SLU)); From ab7029e676b83e54e639101e5ba96a07d7968cd5 Mon Sep 17 00:00:00 2001 From: Chris Bagwell Date: Tue, 12 Jun 2012 00:25:48 -0700 Subject: [PATCH 133/135] Input: wacom - Bamboo One 1024 pressure fix commit 6dc463511d4a690f01a9248df3b384db717e0b1c upstream. Bamboo One's with ID of 0x6a and 0x6b were added with correct indication of 1024 pressure levels but the Graphire packet routine was only looking at 9 bits. Increased to 10 bits. This bug caused these devices to roll over to zero pressure at half way mark. The other devices using this routine only support 256 or 512 range and look to fix unused bits at zero. Signed-off-by: Chris Bagwell Reported-by: Tushant Mirchandani Reviewed-by: Ping Cheng Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/tablet/wacom_wac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/tablet/wacom_wac.c b/drivers/input/tablet/wacom_wac.c index 08ba5ad9c9be..a28ebf08560c 100644 --- a/drivers/input/tablet/wacom_wac.c +++ b/drivers/input/tablet/wacom_wac.c @@ -242,7 +242,7 @@ static int wacom_graphire_irq(struct wacom_wac *wacom) input_report_abs(input, ABS_X, le16_to_cpup((__le16 *)&data[2])); input_report_abs(input, ABS_Y, le16_to_cpup((__le16 *)&data[4])); if (wacom->tool[0] != BTN_TOOL_MOUSE) { - input_report_abs(input, ABS_PRESSURE, data[6] | ((data[7] & 0x01) << 8)); + input_report_abs(input, ABS_PRESSURE, data[6] | ((data[7] & 0x03) << 8)); input_report_key(input, BTN_TOUCH, data[1] & 0x01); input_report_key(input, BTN_STYLUS, data[1] & 0x02); input_report_key(input, BTN_STYLUS2, data[1] & 0x04); From 931d5990ed8186a1dc627b34c7d3b28dd89011d7 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Fri, 3 Aug 2012 12:49:14 +0200 Subject: [PATCH 134/135] rt61pci: fix NULL pointer dereference in config_lna_gain commit deee0214def5d8a32b8112f11d9c2b1696e9c0cb upstream. We can not pass NULL libconf->conf->channel to rt61pci_config() as it is dereferenced unconditionally in rt61pci_config_lna_gain() subroutine. Resolves: https://bugzilla.kernel.org/show_bug.cgi?id=44361 Reported-and-tested-by: Signed-off-by: Stanislaw Gruszka Signed-off-by: John W. Linville Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/rt2x00/rt61pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/wireless/rt2x00/rt61pci.c b/drivers/net/wireless/rt2x00/rt61pci.c index 9d35ec16a3a5..9e5fd45cf492 100644 --- a/drivers/net/wireless/rt2x00/rt61pci.c +++ b/drivers/net/wireless/rt2x00/rt61pci.c @@ -2254,8 +2254,7 @@ static void rt61pci_txdone(struct rt2x00_dev *rt2x00dev) static void rt61pci_wakeup(struct rt2x00_dev *rt2x00dev) { - struct ieee80211_conf conf = { .flags = 0 }; - struct rt2x00lib_conf libconf = { .conf = &conf }; + struct rt2x00lib_conf libconf = { .conf = &rt2x00dev->hw->conf }; rt61pci_config(rt2x00dev, &libconf, IEEE80211_CONF_CHANGE_PS); } From a422ca75bd264cd26bafeb6305655245d2ea7c6b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 15 Aug 2012 12:05:01 -0700 Subject: [PATCH 135/135] Linux 3.0.41 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ec4fee552b51..2cbfd9732090 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 0 -SUBLEVEL = 40 +SUBLEVEL = 41 EXTRAVERSION = NAME = Sneaky Weasel