/*
 * Copyright (c) 2008 Intel Corporation
 * Author: Matthew Wilcox <willy@linux.intel.com>
 *
 * Distributed under the terms of the GNU GPL, version 2
 *
 * This file implements counting semaphores.
 * A counting semaphore may be acquired 'n' times before sleeping.
 * See mutex.c for single-acquisition sleeping locks which enforce
 * rules which allow code to be debugged more easily.
 */

/*
 * Some notes on the implementation:
 *
 * The spinlock controls access to the other members of the semaphore.
 * down_trylock() and up() can be called from interrupt context, so we
 * have to disable interrupts when taking the lock.  It turns out various
 * parts of the kernel expect to be able to use down() on a semaphore in
 * interrupt context when they know it will succeed, so we have to use
 * irqsave variants for down(), down_interruptible() and down_killable()
 * too.
 *
 * The ->count variable represents how many more tasks can acquire this
 * semaphore.  If it's zero, there may be tasks waiting on the wait_list.
 */

#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
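
/*
 * Illustrative sketch (example_sem and example_counting are hypothetical
 * names): the counting behaviour described above, as seen by a caller.
 * A semaphore initialised to 2 admits two holders before a third caller
 * of down() sleeps on ->wait_list.
 */
static struct semaphore example_sem;

static void __maybe_unused example_counting(void)
{
	sema_init(&example_sem, 2);	/* up to two concurrent holders */

	down(&example_sem);		/* count: 2 -> 1 */
	down(&example_sem);		/* count: 1 -> 0 */
	/* a third down(&example_sem) here would sleep until an up() */
	up(&example_sem);		/* no waiters: count 0 -> 1 */
	up(&example_sem);		/* count: 1 -> 2 */
}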

static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
static noinline void __up(struct semaphore *sem);

/**
 * down - acquire the semaphore
 * @sem: the semaphore to be acquired
 *
 * Acquires the semaphore.  If no more tasks are allowed to acquire the
 * semaphore, calling this function will put the task to sleep until the
 * semaphore is released.
 *
 * Use of this function is deprecated, please use down_interruptible() or
 * down_killable() instead.
 */
void down(struct semaphore *sem)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&sem->lock, flags);
	if (likely(sem->count > 0))
		sem->count--;
	else
		__down(sem);
	raw_spin_unlock_irqrestore(&sem->lock, flags);
}
EXPORT_SYMBOL(down);

/**
 * down_interruptible - acquire the semaphore unless interrupted
 * @sem: the semaphore to be acquired
 *
 * Attempts to acquire the semaphore.  If no more tasks are allowed to
 * acquire the semaphore, calling this function will put the task to sleep.
 * If the sleep is interrupted by a signal, this function will return -EINTR.
 * If the semaphore is successfully acquired, this function returns 0.
 */
int down_interruptible(struct semaphore *sem)
{
	unsigned long flags;
	int result = 0;

	raw_spin_lock_irqsave(&sem->lock, flags);
	if (likely(sem->count > 0))
		sem->count--;
	else
		result = __down_interruptible(sem);
	raw_spin_unlock_irqrestore(&sem->lock, flags);

	return result;
}
EXPORT_SYMBOL(down_interruptible);
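
/*
 * Illustrative sketch (example_interruptible is a hypothetical name):
 * the usual calling pattern, propagating the -EINTR result so that a
 * signal aborts the wait cleanly.
 */
static int __maybe_unused example_interruptible(struct semaphore *sem)
{
	int err = down_interruptible(sem);

	if (err)
		return err;	/* -EINTR: interrupted, semaphore not held */
	/* ... critical section ... */
	up(sem);
	return 0;
}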

/**
 * down_killable - acquire the semaphore unless killed
 * @sem: the semaphore to be acquired
 *
 * Attempts to acquire the semaphore.  If no more tasks are allowed to
 * acquire the semaphore, calling this function will put the task to sleep.
 * If the sleep is interrupted by a fatal signal, this function will return
 * -EINTR.  If the semaphore is successfully acquired, this function returns
 * 0.
 */
int down_killable(struct semaphore *sem)
{
	unsigned long flags;
	int result = 0;

	raw_spin_lock_irqsave(&sem->lock, flags);
	if (likely(sem->count > 0))
		sem->count--;
	else
		result = __down_killable(sem);
	raw_spin_unlock_irqrestore(&sem->lock, flags);

	return result;
}
EXPORT_SYMBOL(down_killable);

/**
 * down_trylock - try to acquire the semaphore, without waiting
 * @sem: the semaphore to be acquired
 *
 * Try to acquire the semaphore atomically.  Returns 0 if the semaphore has
 * been acquired successfully or 1 if it cannot be acquired.
 *
 * NOTE: This return value is inverted from both spin_trylock and
 * mutex_trylock!  Be careful about this when converting code.
 *
 * Unlike mutex_trylock, this function can be used from interrupt context,
 * and the semaphore can be released by any task or interrupt.
 */
int down_trylock(struct semaphore *sem)
{
	unsigned long flags;
	int count;

	raw_spin_lock_irqsave(&sem->lock, flags);
	count = sem->count - 1;
	if (likely(count >= 0))
		sem->count = count;
	raw_spin_unlock_irqrestore(&sem->lock, flags);

	return (count < 0);
}
EXPORT_SYMBOL(down_trylock);
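
/*
 * Illustrative sketch (example_trylock is a hypothetical name): note the
 * inverted return value called out above - nonzero means NOT acquired,
 * the opposite of mutex_trylock() and spin_trylock().
 */
static bool __maybe_unused example_trylock(struct semaphore *sem)
{
	if (down_trylock(sem))
		return false;	/* contended: we do not hold the semaphore */
	/* ... critical section ... */
	up(sem);
	return true;
}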

/**
 * down_timeout - acquire the semaphore within a specified time
 * @sem: the semaphore to be acquired
 * @timeout: how long to wait before failing
 *
 * Attempts to acquire the semaphore.  If no more tasks are allowed to
 * acquire the semaphore, calling this function will put the task to sleep.
 * If the semaphore is not released within the specified number of jiffies,
 * this function returns -ETIME.  It returns 0 if the semaphore was acquired.
 */
int down_timeout(struct semaphore *sem, long timeout)
{
	unsigned long flags;
	int result = 0;

	raw_spin_lock_irqsave(&sem->lock, flags);
	if (likely(sem->count > 0))
		sem->count--;
	else
		result = __down_timeout(sem, timeout);
	raw_spin_unlock_irqrestore(&sem->lock, flags);

	return result;
}
EXPORT_SYMBOL(down_timeout);
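
/*
 * Illustrative sketch (example_timeout is a hypothetical name): @timeout
 * is in jiffies, so callers typically convert with msecs_to_jiffies()
 * from <linux/jiffies.h> and must be prepared for -ETIME.
 */
static int __maybe_unused example_timeout(struct semaphore *sem)
{
	int err = down_timeout(sem, msecs_to_jiffies(100));

	if (err)
		return err;	/* -ETIME: not released within ~100ms */
	/* ... critical section ... */
	up(sem);
	return 0;
}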

/**
 * up - release the semaphore
 * @sem: the semaphore to release
 *
 * Release the semaphore.  Unlike mutexes, up() may be called from any
 * context and even by tasks which have never called down().
 */
void up(struct semaphore *sem)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&sem->lock, flags);
	if (likely(list_empty(&sem->wait_list)))
		sem->count++;
	else
		__up(sem);
	raw_spin_unlock_irqrestore(&sem->lock, flags);
}
EXPORT_SYMBOL(up);
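
/*
 * Illustrative sketch (hypothetical names): because up() works from any
 * context and needs no prior down(), a semaphore initialised to 0 can be
 * used as a one-shot completion signal between two tasks.
 */
static struct semaphore example_done;

static void __maybe_unused example_wait_side(void)
{
	sema_init(&example_done, 0);	/* starts unavailable */
	down(&example_done);		/* sleeps until the signal side runs */
}

static void __maybe_unused example_signal_side(void)
{
	up(&example_done);		/* this task never called down() */
}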

/* Functions for the contended case */

struct semaphore_waiter {
	struct list_head list;
	struct task_struct *task;
	bool up;
};

/*
 * Because this function is inlined, the 'state' parameter will be
 * constant, and thus optimised away by the compiler.  Likewise the
 * 'timeout' parameter for the cases without timeouts.
 */
static inline int __sched __down_common(struct semaphore *sem, long state,
								long timeout)
{
	struct task_struct *task = current;
	struct semaphore_waiter waiter;

	list_add_tail(&waiter.list, &sem->wait_list);
	waiter.task = task;
	waiter.up = false;

	for (;;) {
		if (signal_pending_state(state, task))
			goto interrupted;
		if (unlikely(timeout <= 0))
			goto timed_out;
		__set_task_state(task, state);
		raw_spin_unlock_irq(&sem->lock);
		timeout = schedule_timeout(timeout);
		raw_spin_lock_irq(&sem->lock);
		if (waiter.up)	/* __up() handed us the semaphore */
			return 0;
	}

 timed_out:
	list_del(&waiter.list);
	return -ETIME;

 interrupted:
	list_del(&waiter.list);
	return -EINTR;
}

static noinline void __sched __down(struct semaphore *sem)
{
	__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}

static noinline int __sched __down_interruptible(struct semaphore *sem)
{
	return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}

static noinline int __sched __down_killable(struct semaphore *sem)
{
	return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
}

static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
{
	return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}

static noinline void __sched __up(struct semaphore *sem)
{
	struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
						struct semaphore_waiter, list);
	/*
	 * Hand the semaphore straight to the first waiter: ->count is not
	 * incremented, so no other task can slip in before the woken
	 * waiter returns from __down_common().
	 */
	list_del(&waiter->list);
	waiter->up = true;
	wake_up_process(waiter->task);
}