Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into next

Pull scheduler updates from Ingo Molnar:
 "The main scheduling related changes in this cycle were:

   - various sched/numa updates, for better performance

   - tree wide cleanup of open coded nice levels

   - nohz fix related to rq->nr_running use

   - cpuidle changes and continued consolidation to improve the
     kernel/sched/idle.c high level idle scheduling logic.  As part of
     this effort I pulled cpuidle driver changes from Rafael as well.

   - standardized idle polling amongst architectures

   - continued work on preparing better power/energy aware scheduling

   - sched/rt updates

   - misc fixlets and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (49 commits)
  sched/numa: Decay ->wakee_flips instead of zeroing
  sched/numa: Update migrate_improves/degrades_locality()
  sched/numa: Allow task switch if load imbalance improves
  sched/rt: Fix 'struct sched_dl_entity' and dl_task_time() comments, to match the current upstream code
  sched: Consolidate open coded implementations of nice level frobbing into nice_to_rlimit() and rlimit_to_nice()
  sched: Initialize rq->age_stamp on processor start
  sched, nohz: Change rq->nr_running to always use wrappers
  sched: Fix the rq->next_balance logic in rebalance_domains() and idle_balance()
  sched: Use clamp() and clamp_val() to make sys_nice() more readable
  sched: Do not zero sg->cpumask and sg->sgp->power in build_sched_groups()
  sched/numa: Fix initialization of sched_domain_topology for NUMA
  sched: Call select_idle_sibling() when not affine_sd
  sched: Simplify return logic in sched_read_attr()
  sched: Simplify return logic in sched_copy_attr()
  sched: Fix exec_start/task_hot on migrated tasks
  arm64: Remove TIF_POLLING_NRFLAG
  metag: Remove TIF_POLLING_NRFLAG
  sched/idle: Make cpuidle_idle_call() void
  sched/idle: Reflow cpuidle_idle_call()
  sched/idle: Delay clearing the polling bit
  ...
This commit is contained in:
Linus Torvalds 2014-06-03 14:00:15 -07:00
commit c84a1e32ee
48 changed files with 761 additions and 664 deletions

View file

@ -521,6 +521,39 @@ static inline void init_hrtick(void)
}
#endif /* CONFIG_SCHED_HRTICK */
/*
 * fetch_or - atomically OR @val into *@ptr and return the previous value.
 *
 * Built on a cmpxchg() retry loop; kept as a macro so it works for
 * different integer types.
 *
 * @ptr and @val are each captured in a local exactly once, so arguments
 * with side effects (e.g. fetch_or(p++, mask)) are evaluated a single
 * time no matter how often the cmpxchg() has to be retried.
 */
#define fetch_or(ptr, val)						\
({	typeof(ptr) __ptr = (ptr);					\
	typeof(val) __mask = (val);					\
	typeof(*__ptr) __old, __val = *__ptr;				\
	for (;;) {							\
		/* Install __val | __mask only if *__ptr is still __val. */ \
		__old = cmpxchg(__ptr, __val, __val | __mask);		\
		if (__old == __val)					\
			break;						\
		/* Lost a race: retry against the value we just saw. */	\
		__val = __old;						\
	}								\
	__old;								\
})
#ifdef TIF_POLLING_NRFLAG
/*
 * Set TIF_NEED_RESCHED in @p's thread_info flags with one atomic
 * fetch_or() and inspect TIF_POLLING_NRFLAG in the flags word that was
 * there before. Doing both in a single step avoids races against polling
 * state changes and thereby avoids spurious IPIs.
 *
 * Returns true when the polling flag was clear in the old flags word.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	struct thread_info *info = task_thread_info(p);

	if (fetch_or(&info->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG)
		return false;

	return true;
}
#else
/*
 * No TIF_POLLING_NRFLAG on this configuration: there is no polling state
 * to test, so just mark @p as needing a reschedule and report true.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	set_tsk_need_resched(p);
	return true;
}
#endif
/*
* resched_task - mark a task 'to be rescheduled now'.
*
@ -537,17 +570,15 @@ void resched_task(struct task_struct *p)
if (test_tsk_need_resched(p))
return;
set_tsk_need_resched(p);
cpu = task_cpu(p);
if (cpu == smp_processor_id()) {
set_tsk_need_resched(p);
set_preempt_need_resched();
return;
}
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
if (!tsk_is_polling(p))
if (set_nr_and_not_polling(p))
smp_send_reschedule(cpu);
}
@ -3018,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);
int can_nice(const struct task_struct *p, const int nice)
{
/* convert nice value [19,-20] to rlimit style value [1,40] */
int nice_rlim = 20 - nice;
int nice_rlim = nice_to_rlimit(nice);
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
@ -3042,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)
* We don't have to worry. Conceptually one call occurs first
* and we have a single winner.
*/
if (increment < -40)
increment = -40;
if (increment > 40)
increment = 40;
increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
nice = task_nice(current) + increment;
if (nice < MIN_NICE)
nice = MIN_NICE;
if (nice > MAX_NICE)
nice = MAX_NICE;
nice = clamp_val(nice, MIN_NICE, MAX_NICE);
if (increment < 0 && !can_nice(current, nice))
return -EPERM;
@ -3642,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
*/
attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
out:
return ret;
return 0;
err_size:
put_user(sizeof(*attr), &uattr->size);
ret = -E2BIG;
goto out;
return -E2BIG;
}
/**
@ -3808,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
for (; addr < end; addr++) {
if (*addr)
goto err_size;
return -EFBIG;
}
attr->size = usize;
@ -3818,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
if (ret)
return -EFAULT;
out:
return ret;
err_size:
ret = -E2BIG;
goto out;
return 0;
}
/**
@ -5093,10 +5110,20 @@ static struct notifier_block migration_notifier = {
.priority = CPU_PRI_MIGRATION,
};
/*
 * Record the current sched_clock_cpu() reading of the CPU this runs on
 * in that CPU's runqueue (rq->age_stamp).
 */
static void __cpuinit set_cpu_rq_start_time(void)
{
	int this_cpu = smp_processor_id();

	cpu_rq(this_cpu)->age_stamp = sched_clock_cpu(this_cpu);
}
static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_STARTING:
set_cpu_rq_start_time();
return NOTIFY_OK;
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@ -5305,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_PKG_RESOURCES)) {
SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
return 0;
}
@ -5336,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING);
SD_PREFER_SIBLING |
SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@ -5610,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
static const struct cpumask *cpu_cpu_mask(int cpu)
{
return cpumask_of_node(cpu_to_node(cpu));
}
struct sd_data {
struct sched_domain **__percpu sd;
struct sched_group **__percpu sg;
struct sched_group_power **__percpu sgp;
};
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
@ -5633,21 +5651,6 @@ enum s_alloc {
sa_none,
};
struct sched_domain_topology_level;
typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
#define SDTL_OVERLAP 0x01
struct sched_domain_topology_level {
sched_domain_init_f init;
sched_domain_mask_f mask;
int flags;
int numa_level;
struct sd_data data;
};
/*
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
@ -5815,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
continue;
group = get_group(i, sdd, &sg);
cpumask_clear(sched_group_cpus(sg));
sg->sgp->power = 0;
cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
@ -5866,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
}
int __weak arch_sd_sibling_asym_packing(void)
{
return 0*SD_ASYM_PACKING;
}
/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(sd, type) sd->name = #type
#else
# define SD_INIT_NAME(sd, type) do { } while (0)
#endif
#define SD_INIT_FUNC(type) \
static noinline struct sched_domain * \
sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
{ \
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
*sd = SD_##type##_INIT; \
SD_INIT_NAME(sd, type); \
sd->private = &tl->data; \
return sd; \
}
SD_INIT_FUNC(CPU)
#ifdef CONFIG_SCHED_SMT
SD_INIT_FUNC(SIBLING)
#endif
#ifdef CONFIG_SCHED_MC
SD_INIT_FUNC(MC)
#endif
#ifdef CONFIG_SCHED_BOOK
SD_INIT_FUNC(BOOK)
#endif
static int default_relax_domain_level = -1;
int sched_domain_level_max;
@ -5991,99 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
*per_cpu_ptr(sdd->sgp, cpu) = NULL;
}
#ifdef CONFIG_SCHED_SMT
static const struct cpumask *cpu_smt_mask(int cpu)
{
return topology_thread_cpumask(cpu);
}
#endif
/*
* Topology list, bottom-up.
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ sd_init_SIBLING, cpu_smt_mask, },
#endif
#ifdef CONFIG_SCHED_MC
{ sd_init_MC, cpu_coregroup_mask, },
#endif
#ifdef CONFIG_SCHED_BOOK
{ sd_init_BOOK, cpu_book_mask, },
#endif
{ sd_init_CPU, cpu_cpu_mask, },
{ NULL, },
};
static struct sched_domain_topology_level *sched_domain_topology = default_topology;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->init; tl++)
#ifdef CONFIG_NUMA
/* NOTE(review): presumably the number of NUMA levels found; not used in the code visible here - confirm. */
static int sched_domains_numa_levels;
/* Indexed by NUMA level; sd_init() compares the entry against RECLAIM_DISTANCE. */
static int *sched_domains_numa_distance;
/* Indexed as [numa level][node]; sd_numa_mask() returns one of these cpumasks. */
static struct cpumask ***sched_domains_numa_masks;
/* Set by sd_init() from tl->numa_level so that sd_numa_mask() knows which level to use. */
static int sched_domains_curr_level;
#endif
static inline int sd_local_flags(int level)
{
if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
return 0;
return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
}
/*
 * SD_flags allowed in topology descriptions:
 *
 *   SD_SHARE_CPUPOWER      - describes SMT topologies
 *   SD_SHARE_PKG_RESOURCES - describes shared caches
 *   SD_NUMA                - describes NUMA topologies
 *   SD_SHARE_POWERDOMAIN   - describes shared power domain
 *
 * and the odd one out:
 *
 *   SD_ASYM_PACKING        - describes SMT quirks
 */
#define TOPOLOGY_SD_FLAGS						\
	(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES | SD_NUMA |	\
	 SD_SHARE_POWERDOMAIN | SD_ASYM_PACKING)
static struct sched_domain *
sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
sd_init(struct sched_domain_topology_level *tl, int cpu)
{
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
int level = tl->numa_level;
int sd_weight = cpumask_weight(
sched_domains_numa_masks[level][cpu_to_node(cpu)]);
int sd_weight, sd_flags = 0;
#ifdef CONFIG_NUMA
/*
* Ugly hack to pass state to sd_numa_mask()...
*/
sched_domains_curr_level = tl->numa_level;
#endif
sd_weight = cpumask_weight(tl->mask(cpu));
if (tl->sd_flags)
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
sd_flags &= ~TOPOLOGY_SD_FLAGS;
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
.busy_factor = 32,
.imbalance_pct = 125,
.cache_nice_tries = 2,
.busy_idx = 3,
.idle_idx = 2,
.cache_nice_tries = 0,
.busy_idx = 0,
.idle_idx = 0,
.newidle_idx = 0,
.wake_idx = 0,
.forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
| 0*SD_BALANCE_EXEC
| 0*SD_BALANCE_FORK
| 1*SD_BALANCE_EXEC
| 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
| 0*SD_WAKE_AFFINE
| 1*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUPOWER
| 0*SD_SHARE_PKG_RESOURCES
| 1*SD_SERIALIZE
| 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
| 1*SD_NUMA
| sd_local_flags(level)
| 0*SD_NUMA
| sd_flags
,
.last_balance = jiffies,
.balance_interval = sd_weight,
.smt_gain = 0,
.max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies,
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
#endif
};
SD_INIT_NAME(sd, NUMA);
sd->private = &tl->data;
/*
* Ugly hack to pass state to sd_numa_mask()...
* Convert topological properties into behaviour.
*/
sched_domains_curr_level = tl->numa_level;
if (sd->flags & SD_SHARE_CPUPOWER) {
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
#ifdef CONFIG_NUMA
} else if (sd->flags & SD_NUMA) {
sd->cache_nice_tries = 2;
sd->busy_idx = 3;
sd->idle_idx = 2;
sd->flags |= SD_SERIALIZE;
if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
sd->flags &= ~(SD_BALANCE_EXEC |
SD_BALANCE_FORK |
SD_WAKE_AFFINE);
}
#endif
} else {
sd->flags |= SD_PREFER_SIBLING;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
sd->idle_idx = 1;
}
sd->private = &tl->data;
return sd;
}
/*
 * Default topology levels, listed bottom-up. The list ends with an entry
 * whose mask is NULL.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{
		.mask		= cpu_smt_mask,
		.sd_flags	= cpu_smt_flags,
		SD_INIT_NAME(SMT)
	},
#endif
#ifdef CONFIG_SCHED_MC
	{
		.mask		= cpu_coregroup_mask,
		.sd_flags	= cpu_core_flags,
		SD_INIT_NAME(MC)
	},
#endif
	{
		.mask		= cpu_cpu_mask,
		SD_INIT_NAME(DIE)
	},
	{
		.mask		= NULL,
	},
};
/* Topology table currently in use; starts out pointing at default_topology. */
struct sched_domain_topology_level *sched_domain_topology = default_topology;
/*
 * Walk the active topology table with @tl, stopping at the first entry
 * whose ->mask is NULL (the table terminator).
 */
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
/*
 * set_sched_topology - make @tl the active topology table.
 *
 * Only the sched_domain_topology pointer is replaced; nothing is copied.
 * The table must end with an entry whose ->mask is NULL, since that is
 * what for_each_sd_topology() stops on.
 * NOTE(review): no locking here; presumably meant to be called before the
 * sched domains are built - confirm against callers.
 */
void set_sched_topology(struct sched_domain_topology_level *tl)
{
sched_domain_topology = tl;
}
#ifdef CONFIG_NUMA
static const struct cpumask *sd_numa_mask(int cpu)
{
return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@ -6227,7 +6250,10 @@ static void sched_init_numa(void)
}
}
tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
/* Compute default topology size */
for (i = 0; sched_domain_topology[i].mask; i++);
tl = kzalloc((i + level + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
@ -6235,18 +6261,19 @@ static void sched_init_numa(void)
/*
* Copy the default topology bits..
*/
for (i = 0; default_topology[i].init; i++)
tl[i] = default_topology[i];
for (i = 0; sched_domain_topology[i].mask; i++)
tl[i] = sched_domain_topology[i];
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 0; j < level; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.init = sd_numa_init,
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
.flags = SDTL_OVERLAP,
.numa_level = j,
SD_INIT_NAME(NUMA)
};
}
@ -6404,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
struct sched_domain *sd = tl->init(tl, cpu);
struct sched_domain *sd = sd_init(tl, cpu);
if (!sd)
return child;
@ -6974,6 +7001,7 @@ void __init sched_init(void)
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
set_cpu_rq_start_time();
#endif
init_sched_fair_class();