/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>
#include <linux/unistd.h>

#include <asm/tlb.h>

/*
 * Scheduler clock - returns current time in nanosec units.
 * This is the default implementation; architectures and
 * sub-architectures can override it.
 */
unsigned long long __attribute__((weak)) sched_clock(void)
{
	return (unsigned long long)jiffies * (1000000000 / HZ);
}

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters;
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

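/*
 * Worked example of the conversions above (a sketch assuming the usual
 * values MAX_RT_PRIO == 100 and MAX_PRIO == 140; other values simply
 * shift the ranges):
 *
 *	NICE_TO_PRIO(-20) == 100	NICE_TO_PRIO(0) == 120
 *	NICE_TO_PRIO(19)  == 139	PRIO_TO_NICE(120) == 0
 *	USER_PRIO(120)    == 20		MAX_USER_PRIO == 40
 */
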
/*
 * Some helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))

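/*
 * Illustrative numbers, assuming HZ == 1000 (one jiffy == 1000000 ns):
 *
 *	JIFFIES_TO_NS(1) == 1000000
 *	NS_TO_JIFFIES(4000000) == 4
 *
 * With HZ == 250 a jiffy is 4000000 ns, so the same 4 msec interval
 * maps to a single jiffy.
 */
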
#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 * Timeslices get refilled after they expire.
 */
#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE		(100 * HZ / 1000)

#ifdef CONFIG_SMP
/*
 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 * Since cpu_power is a 'constant', we can use a reciprocal divide.
 */
static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
{
	return reciprocal_divide(load, sg->reciprocal_cpu_power);
}

/*
 * Each time a sched group cpu_power is changed,
 * we must compute its reciprocal value
 */
static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
{
	sg->__cpu_power += val;
	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
}
#endif

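/*
 * Sketch of why the reciprocal helps (the real helpers live in
 * lib/reciprocal_div.c; this is only the rough idea, not a copy of that
 * code): dividing by a per-group constant d can be replaced by a
 * multiply and a shift,
 *
 *	reciprocal_value(d)     ~= 2^32 / d	(precomputed once)
 *	reciprocal_divide(x, r) ~= (u32)(((u64)x * r) >> 32)
 *
 * e.g. with __cpu_power == 1024 the cached reciprocal is about 4194304,
 * so "load / 1024" becomes "((u64)load * 4194304) >> 32", avoiding a
 * hardware divide in the load-balancing hot path.
 */
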
#define SCALE_PRIO(x, prio) \
	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

/*
 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 * to time slice values: [800ms ... 100ms ... 5ms]
 */
static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio == NICE_TO_PRIO(19))
		return 1;

	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	else
		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

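/*
 * Worked example (assuming HZ == 1000, MAX_PRIO == 140 and
 * MAX_USER_PRIO == 40, so MAX_USER_PRIO / 2 == 20):
 *
 *	nice  19 (prio 139):	returned directly as 1 jiffy
 *	nice   0 (prio 120):	DEF_TIMESLICE * (140 - 120) / 20 == 100 msecs
 *	nice -20 (prio 100):	DEF_TIMESLICE * 4 * (140 - 100) / 20 == 800 msecs
 */
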
static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
		return 1;
	return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

struct load_stat {
	struct load_weight load;
	u64 load_update_start, load_update_last;
	unsigned long delta_fair, delta_exec, delta_stat;
};

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	s64 fair_clock;
	u64 exec_clock;
	s64 wait_runtime;
	u64 sleeper_bonus;
	unsigned long wait_runtime_overruns, wait_runtime_underruns;

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;
	struct rb_node *rb_load_balance_curr;
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr;
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;
	int rt_load_balance_idx;
	struct list_head *rt_load_balance_head, *rt_load_balance_curr;
};

/*
 * The prio-array type of the old scheduler:
 */
struct prio_array {
	unsigned int nr_active;
	DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_PRIO];
};

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	spinlock_t lock;	/* runqueue lock */

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
	unsigned long raw_weighted_load;
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
	unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
	unsigned char in_nohz_recently;
#endif
	struct load_stat ls;	/* capture load from *all* tasks on this cpu */
	unsigned long nr_load_updates;
	u64 nr_switches;

	struct cfs_rq cfs;
#ifdef CONFIG_FAIR_GROUP_SCHED
	struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
#endif
	struct rt_rq rt;

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	unsigned long expired_timestamp;
	unsigned long long most_recent_timestamp;

	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	struct prio_array *active, *expired, arrays[2];
	int best_expired_prio;

	u64 clock, prev_clock_raw;
	s64 clock_max_delta;

	unsigned int clock_warps, clock_overflows;
	unsigned int clock_unstable_events;

	struct sched_class *load_balance_class;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct sched_domain *sd;

	/* For active balancing */
	int active_balance;
	int push_cpu;
	int cpu;		/* cpu of this runqueue */

	struct task_struct *migration_thread;
	struct list_head migration_queue;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;

	/* sys_sched_yield() stats */
	unsigned long yld_exp_empty;
	unsigned long yld_act_empty;
	unsigned long yld_both_empty;
	unsigned long yld_cnt;

	/* schedule() stats */
	unsigned long sched_switch;
	unsigned long sched_cnt;
	unsigned long sched_goidle;

	/* try_to_wake_up() stats */
	unsigned long ttwu_cnt;
	unsigned long ttwu_local;
#endif
	struct lock_class_key rq_lock_key;
};

static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static DEFINE_MUTEX(sched_hotcpu_mutex);

static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
{
	rq->curr->sched_class->check_preempt_curr(rq, p);
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}

/*
 * Per-runqueue clock, as finegrained as the platform can give us:
 */
static unsigned long long __rq_clock(struct rq *rq)
{
	u64 prev_raw = rq->prev_clock_raw;
	u64 now = sched_clock();
	s64 delta = now - prev_raw;
	u64 clock = rq->clock;

	/*
	 * Protect against sched_clock() occasionally going backwards:
	 */
	if (unlikely(delta < 0)) {
		clock++;
		rq->clock_warps++;
	} else {
		/*
		 * Catch too large forward jumps too:
		 */
		if (unlikely(delta > 2*TICK_NSEC)) {
			clock++;
			rq->clock_overflows++;
		} else {
			if (unlikely(delta > rq->clock_max_delta))
				rq->clock_max_delta = delta;
			clock += delta;
		}
	}

	rq->prev_clock_raw = now;
	rq->clock = clock;

	return clock;
}

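/*
 * Example of how the clamping above behaves (hypothetical numbers): if
 * the previous raw reading was 1000000 and sched_clock() now reports
 * 990000, delta is -10000, so rq->clock only advances by 1 ns and
 * clock_warps is incremented; if sched_clock() instead jumped ahead by
 * more than 2*TICK_NSEC, rq->clock again advances by just 1 ns and
 * clock_overflows is incremented.  Only deltas in between are credited
 * in full, which keeps rq->clock monotonic and roughly tick-bounded.
 */
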
static inline unsigned long long rq_clock(struct rq *rq)
{
	int this_cpu = smp_processor_id();

	if (this_cpu == cpu_of(rq))
		return __rq_clock(rq);

	return rq->clock;
}

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Change a task's ->cfs_rq if it moves across CPUs */
static inline void set_task_cfs_rq(struct task_struct *p)
{
	p->se.cfs_rq = &task_rq(p)->cfs;
}
#else
static inline void set_task_cfs_rq(struct task_struct *p)
{
}
#endif

#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = current;
#endif
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

	spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->oncpu;
#else
	return rq->curr == p;
#endif
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	spin_unlock_irq(&rq->lock);
#else
	spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->oncpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

repeat_lock_task:
	rq = task_rq(p);
	spin_lock(&rq->lock);
	if (unlikely(rq != task_rq(p))) {
		spin_unlock(&rq->lock);
		goto repeat_lock_task;
	}
	return rq;
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(rq->lock)
{
	struct rq *rq;

repeat_lock_task:
	local_irq_save(*flags);
	rq = task_rq(p);
	spin_lock(&rq->lock);
	if (unlikely(rq != task_rq(p))) {
		spin_unlock_irqrestore(&rq->lock, *flags);
		goto repeat_lock_task;
	}
	return rq;
}

static inline void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
	__releases(rq->lock)
{
	spin_unlock_irqrestore(&rq->lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static inline struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	spin_lock(&rq->lock);

	return rq;
}

/*
 * CPU frequency is/was unstable - start new by setting prev_clock_raw:
 */
void sched_clock_unstable_event(void)
{
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(current, &flags);
	rq->prev_clock_raw = sched_clock();
	rq->clock_unstable_events++;
	task_rq_unlock(rq, &flags);
}

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
#ifdef CONFIG_SMP

#ifndef tsk_is_polling
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif

static void resched_task(struct task_struct *p)
{
	int cpu;

	assert_spin_locked(&task_rq(p)->lock);

	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
		return;

	set_tsk_thread_flag(p, TIF_NEED_RESCHED);

	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}

static void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_task(cpu_curr(cpu));
	spin_unlock_irqrestore(&rq->lock, flags);
}
#else
static inline void resched_task(struct task_struct *p)
{
	assert_spin_locked(&task_rq(p)->lock);
	set_tsk_need_resched(p);
}
#endif

static u64 div64_likely32(u64 divident, unsigned long divisor)
{
#if BITS_PER_LONG == 32
	if (likely(divident <= 0xffffffffULL))
		return (u32)divident / divisor;
	do_div(divident, divisor);

	return divident;
#else
	return divident / divisor;
#endif
}

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

static inline unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	if (unlikely(!lw->inv_weight))
		lw->inv_weight = WMULT_CONST / lw->weight;

	tmp = (u64)delta_exec * weight;
	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST)) {
		tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
				>> (WMULT_SHIFT/2);
	} else {
		tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
	}

	return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
}

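/*
 * Worked example for calc_delta_mine() (hypothetical numbers): with
 * delta_exec == 1000000 (1 msec), weight == NICE_0_LOAD == 1024 and a
 * queue weight lw->weight == 2048, inv_weight is ~2^32/2048 == 2097152,
 * so tmp == 1024000000 and (tmp * 2097152) >> 32 == 500000 - i.e. the
 * delta is scaled by weight/lw->weight == 1/2 and then clamped to
 * sysctl_sched_runtime_limit.
 */
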
static inline unsigned long
calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
{
	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
}

static void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static void __update_curr_load(struct rq *rq, struct load_stat *ls)
{
	if (rq->curr != rq->idle && ls->load.weight) {
		ls->delta_exec += ls->delta_stat;
		ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
		ls->delta_stat = 0;
	}
}

/*
 * Update delta_exec, delta_fair fields for rq.
 *
 * delta_fair clock advances at a rate inversely proportional to
 * total load (rq->ls.load.weight) on the runqueue, while
 * delta_exec advances at the same rate as wall-clock (provided
 * cpu is not idle).
 *
 * delta_exec / delta_fair is a measure of the (smoothened) load on this
 * runqueue over any given interval. This (smoothened) load is used
 * during load balance.
 *
 * This function is called /before/ updating rq->ls.load
 * and when switching tasks.
 */
static void update_curr_load(struct rq *rq, u64 now)
{
	struct load_stat *ls = &rq->ls;
	u64 start;

	start = ls->load_update_start;
	ls->load_update_start = now;
	ls->delta_stat += now - start;
	/*
	 * Stagger updates to ls->delta_fair. Very frequent updates
	 * can be expensive.
	 */
	if (ls->delta_stat >= sysctl_sched_stat_granularity)
		__update_curr_load(rq, ls);
}

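/*
 * Numeric illustration of the comment above (assuming NICE_0_LOAD is
 * 1024): over a 10 msec busy interval with two nice-0 tasks queued
 * (rq->ls.load.weight == 2048), delta_exec grows by 10 msec while
 * delta_fair grows by 10 msec * 1024/2048 == 5 msec, so the ratio
 * delta_exec/delta_fair == 2 - i.e. the runqueue carried twice a single
 * nice-0 task's worth of load during that interval.
 */
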
/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs the contribution that
 * each task makes to its run queue's load is weighted according to its
 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
 * scaled version of the new time slice allocation that they receive on time
 * slice expiry etc.
 */

/*
 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
 * If static_prio_timeslice() is ever changed to break this assumption then
 * this code will need modification
 */
#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
#define load_weight(lp) \
	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
#define PRIO_TO_LOAD_WEIGHT(prio) \
	load_weight(static_prio_timeslice(prio))
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))

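/*
 * Quick sanity check on the scaling above (assuming SCHED_LOAD_SCALE is
 * 1024): PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(0)) expands to
 * load_weight(DEF_TIMESLICE) == DEF_TIMESLICE * 1024 / DEF_TIMESLICE
 * == 1024, so a nice-0 SCHED_NORMAL task contributes exactly one
 * SCHED_LOAD_SCALE unit of load, and longer timeslices scale up from
 * there.
 */
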
#define WEIGHT_IDLEPRIO		2
#define WMULT_IDLEPRIO		(1 << 31)

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage.
 */
static const int prio_to_weight[40] = {
/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
/* -10 */  9537,  7629,  6103,  4883,  3906,  3125,  2500,  2000,  1600,  1280,
/*   0 */ NICE_0_LOAD /* 1024 */,
/*   1 */          819,   655,   524,   419,   336,   268,   215,   172,   137,
/*  10 */   110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
};

static const u32 prio_to_wmult[40] = {
	48356,   60446,   75558,   94446,  118058,  147573,
	184467,  230589,  288233,  360285,  450347,
	562979,  703746,  879575, 1099582, 1374389,
	1717986, 2147483, 2684354, 3355443, 4194304,
	5244160, 6557201, 8196502, 10250518, 12782640,
	16025997, 19976592, 24970740, 31350126, 39045157,
	49367440, 61356675, 76695844, 95443717, 119304647,
	148102320, 186737708, 238609294, 286331153,
};

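/*
 * How to read the two tables (a sketch, assuming NICE_0_LOAD == 1024):
 * a task's share of the CPU is its weight divided by the sum of all
 * queued weights, e.g. a nice-0 task (weight 1024) running against a
 * nice-1 task (weight 819) gets 1024/(1024+819) ~= 56% versus ~44%,
 * which is the "10% effect" described above.  prio_to_wmult[] caches
 * roughly 2^32/weight for each level (e.g. 2^32/1024 == 4194304) so
 * that calc_delta_mine() can multiply and shift instead of dividing.
 */
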
static inline void
inc_load(struct rq *rq, const struct task_struct *p, u64 now)
{
	update_curr_load(rq, now);
	update_load_add(&rq->ls.load, p->se.load.weight);
}

static inline void
dec_load(struct rq *rq, const struct task_struct *p, u64 now)
{
	update_curr_load(rq, now);
	update_load_sub(&rq->ls.load, p->se.load.weight);
}

static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
{
	rq->nr_running++;
	inc_load(rq, p, now);
}

static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
{
	rq->nr_running--;
	dec_load(rq, p, now);
}

static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);

/*
 * runqueue iterator, to support SMP load-balancing between different
 * scheduling classes, without having to expose their internal data
 * structures to the load-balancing proper:
 */
struct rq_iterator {
	void *arg;
	struct task_struct *(*start)(void *);
	struct task_struct *(*next)(void *);
};

static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
		      unsigned long max_nr_move, unsigned long max_load_move,
		      struct sched_domain *sd, enum cpu_idle_type idle,
		      int *all_pinned, unsigned long *load_moved,
		      int this_best_prio, int best_prio, int best_prio_seen,
		      struct rq_iterator *iterator);

#include "sched_stats.h"
#include "sched_rt.c"
#include "sched_fair.c"
#include "sched_idletask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif

#define sched_class_highest (&rt_sched_class)

static void set_load_weight(struct task_struct *p)
{
	task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
	p->se.wait_runtime = 0;

	if (task_has_rt_policy(p)) {
		p->se.load.weight = prio_to_weight[0] * 2;
		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
		return;
	}

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (p->policy == SCHED_IDLE) {
		p->se.load.weight = WEIGHT_IDLEPRIO;
		p->se.load.inv_weight = WMULT_IDLEPRIO;
		return;
	}

	p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
}

static void
enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
{
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, wakeup, now);
	p->se.on_rq = 1;
}

static void
dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
{
	p->sched_class->dequeue_task(rq, p, sleep, now);
	p->se.on_rq = 0;
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

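/*
 * Example of the priority calculations above (assuming MAX_RT_PRIO is
 * 100): a SCHED_FIFO task with rt_priority 10 gets normal_prio
 * 99 - 10 == 89, while a SCHED_NORMAL task at nice 0 keeps its static
 * priority of 120.  effective_prio() only deviates from normal_prio()
 * while the task is temporarily boosted into the RT range, e.g. by
 * priority inheritance.
 */
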
/*
 * activate_task - move a task to the runqueue.
 */
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
	u64 now = rq_clock(rq);

	if (p->state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, wakeup, now);
	inc_nr_running(p, rq, now);
}

/*
 * activate_idle_task - move idle task to the _front_ of runqueue.
 */
static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
{
	u64 now = rq_clock(rq);

	if (p->state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, 0, now);
	inc_nr_running(p, rq, now);
}

/*
 * deactivate_task - remove a task from the runqueue.
 */
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
	u64 now = rq_clock(rq);

	if (p->state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, sleep, now);
	dec_nr_running(p, rq, now);
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
	return cpu_rq(cpu)->ls.load.weight;
}

static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_SMP
	task_thread_info(p)->cpu = cpu;
	set_task_cfs_rq(p);
#endif
}

#ifdef CONFIG_SMP

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
	int old_cpu = task_cpu(p);
	struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
	u64 clock_offset, fair_clock_offset;

	clock_offset = old_rq->clock - new_rq->clock;
	fair_clock_offset = old_rq->cfs.fair_clock -
						new_rq->cfs.fair_clock;
	if (p->se.wait_start)
		p->se.wait_start -= clock_offset;
	if (p->se.wait_start_fair)
		p->se.wait_start_fair -= fair_clock_offset;
	if (p->se.sleep_start)
		p->se.sleep_start -= clock_offset;
	if (p->se.block_start)
		p->se.block_start -= clock_offset;
	if (p->se.sleep_start_fair)
		p->se.sleep_start_fair -= fair_clock_offset;

	__set_task_cpu(p, new_cpu);
}

struct migration_req {
	struct list_head list;

	struct task_struct *task;
	int dest_cpu;

	struct completion done;
};

/*
 * The task's runqueue lock must be held.
 * Returns true if you have to wait for migration thread.
 */
static int
migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
{
	struct rq *rq = task_rq(p);

	/*
	 * If the task is not on a runqueue (and not running), then
	 * it is sufficient to simply update the task's cpu field.
	 */
	if (!p->se.on_rq && !task_running(rq, p)) {
		set_task_cpu(p, dest_cpu);
		return 0;
	}

	init_completion(&req->done);
	req->task = p;
	req->dest_cpu = dest_cpu;
	list_add(&req->list, &rq->migration_queue);

	return 1;
}

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
void wait_task_inactive(struct task_struct *p)
{
	unsigned long flags;
	int running, on_rq;
	struct rq *rq;

repeat:
	/*
	 * We do the initial early heuristics without holding
	 * any task-queue locks at all. We'll only try to get
	 * the runqueue lock when things look like they will
	 * work out!
	 */
	rq = task_rq(p);

	/*
	 * If the task is actively running on another CPU
	 * still, just relax and busy-wait without holding
	 * any locks.
	 *
	 * NOTE! Since we don't hold any locks, it's not
	 * even sure that "rq" stays as the right runqueue!
	 * But we don't care, since "task_running()" will
	 * return false if the runqueue has changed and p
	 * is actually now running somewhere else!
	 */
	while (task_running(rq, p))
		cpu_relax();

	/*
	 * Ok, time to look more closely! We need the rq
	 * lock now, to be *sure*. If we're wrong, we'll
	 * just go back and repeat.
	 */
	rq = task_rq_lock(p, &flags);
	running = task_running(rq, p);
	on_rq = p->se.on_rq;
	task_rq_unlock(rq, &flags);

	/*
	 * Was it really running after all now that we
	 * checked with the proper locks actually held?
	 *
	 * Oops. Go back and try again..
	 */
	if (unlikely(running)) {
		cpu_relax();
		goto repeat;
	}

	/*
	 * It's not enough that it's not actively running,
	 * it must be off the runqueue _entirely_, and not
	 * preempted!
	 *
	 * So if it was still runnable (but just not actively
	 * running right now), it's preempted, and we should
	 * yield - it could be a while.
	 */
	if (unlikely(on_rq)) {
		yield();
		goto repeat;
	}

	/*
	 * Ahh, all good. It wasn't running, and it wasn't
	 * runnable, which means that it will never become
	 * running in the future either. We're all done!
	 */
}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}

/*
 * Return a low guess at the load of a migration-source cpu weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
static inline unsigned long source_load(int cpu, int type)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);

	if (type == 0)
		return total;

	return min(rq->cpu_load[type-1], total);
}

/*
 * Return a high guess at the load of a migration-target cpu weighted
 * according to the scheduling class and "nice" value.
 */
static inline unsigned long target_load(int cpu, int type)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);

	if (type == 0)
		return total;

	return max(rq->cpu_load[type-1], total);
}

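/*
 * Example of the low/high guesses (hypothetical numbers): if a cpu's
 * decayed history cpu_load[idx-1] is 1500 but its instantaneous
 * weighted load is 1000, source_load() reports 1000 (the smaller value,
 * so we are reluctant to pull work from it) while target_load() reports
 * 1500 (the larger value, so we are reluctant to push more work onto it).
 */
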
/*
 * Return the average load per task on the cpu's run queue
 */
static inline unsigned long cpu_avg_load_per_task(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long total = weighted_cpuload(cpu);
	unsigned long n = rq->nr_running;

	return n ? total / n : SCHED_LOAD_SCALE;
}

/*
 * find_idlest_group finds and returns the least busy CPU group within the
 * domain.
 */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
	unsigned long min_load = ULONG_MAX, this_load = 0;
	int load_idx = sd->forkexec_idx;
	int imbalance = 100 + (sd->imbalance_pct-100)/2;

	do {
		unsigned long load, avg_load;
		int local_group;
		int i;

		/* Skip over this group if it has no CPUs allowed */
		if (!cpus_intersects(group->cpumask, p->cpus_allowed))
			goto nextgroup;

		local_group = cpu_isset(this_cpu, group->cpumask);

		/* Tally up the load of all CPUs in the group */
		avg_load = 0;

		for_each_cpu_mask(i, group->cpumask) {
			/* Bias balancing toward cpus of our domain */
			if (local_group)
				load = source_load(i, load_idx);
			else
				load = target_load(i, load_idx);

			avg_load += load;
		}

		/* Adjust by relative CPU power of the group */
		avg_load = sg_div_cpu_power(group,
				avg_load * SCHED_LOAD_SCALE);

		if (local_group) {
			this_load = avg_load;
			this = group;
		} else if (avg_load < min_load) {
			min_load = avg_load;
			idlest = group;
		}
nextgroup:
		group = group->next;
	} while (group != sd->groups);

	if (!idlest || 100*this_load < imbalance*min_load)
		return NULL;
	return idlest;
}

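/*
 * A concrete reading of the final check above (hypothetical numbers):
 * with the common imbalance_pct of 125, imbalance == 112, so a remote
 * group is only returned when 100*this_load >= 112*min_load, i.e. when
 * the idlest group is at least ~12% less loaded than the local one;
 * smaller differences are not worth migrating for and NULL is returned.
 */
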
/*
 * find_idlest_cpu - find the idlest cpu among the cpus in group.
 */
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
	cpumask_t tmp;
	unsigned long load, min_load = ULONG_MAX;
	int idlest = -1;
	int i;

	/* Traverse only the allowed CPUs */
	cpus_and(tmp, group->cpumask, p->cpus_allowed);

	for_each_cpu_mask(i, tmp) {
		load = weighted_cpuload(i);

		if (load < min_load || (load == min_load && i == this_cpu)) {
			min_load = load;
			idlest = i;
		}
	}

	return idlest;
}

/*
 * sched_balance_self: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
 *
 * Balance, ie. select the least loaded group.
 *
 * Returns the target CPU number, or the same CPU if no balancing is needed.
 *
 * preempt must be disabled.
 */
static int sched_balance_self(int cpu, int flag)
{
	struct task_struct *t = current;
	struct sched_domain *tmp, *sd = NULL;

	for_each_domain(cpu, tmp) {
		/*
		 * If power savings logic is enabled for a domain, stop there.
		 */
		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
			break;
		if (tmp->flags & flag)
			sd = tmp;
	}

	while (sd) {
		cpumask_t span;
		struct sched_group *group;
		int new_cpu, weight;

		if (!(sd->flags & flag)) {
			sd = sd->child;
			continue;
		}

		span = sd->span;
		group = find_idlest_group(sd, t, cpu);
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_cpu(group, t, cpu);
		if (new_cpu == -1 || new_cpu == cpu) {
			/* Now try balancing at a lower domain level of cpu */
			sd = sd->child;
			continue;
		}

		/* Now try balancing at a lower domain level of new_cpu */
		cpu = new_cpu;
		sd = NULL;
		weight = cpus_weight(span);
		for_each_domain(cpu, tmp) {
			if (weight <= cpus_weight(tmp->span))
				break;
			if (tmp->flags & flag)
				sd = tmp;
		}
		/* while loop will break here if sd == NULL */
	}

	return cpu;
}

#endif /* CONFIG_SMP */

/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available. The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 *
 * Returns the CPU we should wake onto.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
	cpumask_t tmp;
	struct sched_domain *sd;
	int i;

	/*
	 * If it is idle, then it is the best cpu to run this task.
	 *
	 * This cpu is also the best, if it has more than one task already.
	 * Siblings must also be busy (in most cases) as they didn't already
	 * pick up the extra load from this cpu and hence we need not check
	 * sibling runqueue info. This will avoid the checks and cache miss
	 * penalties associated with that.
	 */
	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
		return cpu;

	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_IDLE) {
			cpus_and(tmp, sd->span, p->cpus_allowed);
			for_each_cpu_mask(i, tmp) {
				if (idle_cpu(i))
					return i;
			}
		}
		else
			break;
	}
	return cpu;
}
#else
static inline int wake_idle(int cpu, struct task_struct *p)
{
	return cpu;
}
#endif

1382/***
1383 * try_to_wake_up - wake up a thread
1384 * @p: the to-be-woken-up thread
1385 * @state: the mask of task states that can be woken
1386 * @sync: do a synchronous wakeup?
1387 *
1388 * Put it on the run-queue if it's not already there. The "current"
1389 * thread is always on the run-queue (except when the actual
1390 * re-schedule is in progress), and as such you're allowed to do
1391 * the simpler "current->state = TASK_RUNNING" to mark yourself
1392 * runnable without the overhead of this.
1393 *
1394 * returns failure only if the task is already active.
1395 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001396static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001397{
1398 int cpu, this_cpu, success = 0;
1399 unsigned long flags;
1400 long old_state;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001401 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001402#ifdef CONFIG_SMP
Nick Piggin78979862005-06-25 14:57:13 -07001403 struct sched_domain *sd, *this_sd = NULL;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001404 unsigned long load, this_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001405 int new_cpu;
1406#endif
1407
1408 rq = task_rq_lock(p, &flags);
1409 old_state = p->state;
1410 if (!(old_state & state))
1411 goto out;
1412
Ingo Molnardd41f592007-07-09 18:51:59 +02001413 if (p->se.on_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001414 goto out_running;
1415
1416 cpu = task_cpu(p);
1417 this_cpu = smp_processor_id();
1418
1419#ifdef CONFIG_SMP
1420 if (unlikely(task_running(rq, p)))
1421 goto out_activate;
1422
Nick Piggin78979862005-06-25 14:57:13 -07001423 new_cpu = cpu;
1424
Linus Torvalds1da177e2005-04-16 15:20:36 -07001425 schedstat_inc(rq, ttwu_cnt);
1426 if (cpu == this_cpu) {
1427 schedstat_inc(rq, ttwu_local);
Nick Piggin78979862005-06-25 14:57:13 -07001428 goto out_set_cpu;
1429 }
1430
1431 for_each_domain(this_cpu, sd) {
1432 if (cpu_isset(cpu, sd->span)) {
1433 schedstat_inc(sd, ttwu_wake_remote);
1434 this_sd = sd;
1435 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001436 }
1437 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001438
Nick Piggin78979862005-06-25 14:57:13 -07001439 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001440 goto out_set_cpu;
1441
Linus Torvalds1da177e2005-04-16 15:20:36 -07001442 /*
Nick Piggin78979862005-06-25 14:57:13 -07001443 * Check for affine wakeup and passive balancing possibilities.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001444 */
Nick Piggin78979862005-06-25 14:57:13 -07001445 if (this_sd) {
1446 int idx = this_sd->wake_idx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001447 unsigned int imbalance;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001448
Nick Piggina3f21bc2005-06-25 14:57:15 -07001449 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1450
Nick Piggin78979862005-06-25 14:57:13 -07001451 load = source_load(cpu, idx);
1452 this_load = target_load(this_cpu, idx);
1453
Nick Piggin78979862005-06-25 14:57:13 -07001454 new_cpu = this_cpu; /* Wake to this CPU if we can */
1455
Nick Piggina3f21bc2005-06-25 14:57:15 -07001456 if (this_sd->flags & SD_WAKE_AFFINE) {
1457 unsigned long tl = this_load;
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08001458 unsigned long tl_per_task;
1459
1460 tl_per_task = cpu_avg_load_per_task(this_cpu);
Peter Williams2dd73a42006-06-27 02:54:34 -07001461
Linus Torvalds1da177e2005-04-16 15:20:36 -07001462 /*
Nick Piggina3f21bc2005-06-25 14:57:15 -07001463 * If sync wakeup then subtract the (maximum possible)
1464 * effect of the currently running task from the load
1465 * of the current CPU:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001466 */
Nick Piggina3f21bc2005-06-25 14:57:15 -07001467 if (sync)
Ingo Molnardd41f592007-07-09 18:51:59 +02001468 tl -= current->se.load.weight;
Nick Piggina3f21bc2005-06-25 14:57:15 -07001469
1470 if ((tl <= load &&
Peter Williams2dd73a42006-06-27 02:54:34 -07001471 tl + target_load(cpu, idx) <= tl_per_task) ||
Ingo Molnardd41f592007-07-09 18:51:59 +02001472 100*(tl + p->se.load.weight) <= imbalance*load) {
Nick Piggina3f21bc2005-06-25 14:57:15 -07001473 /*
1474 * This domain has SD_WAKE_AFFINE and
1475 * p is cache cold in this domain, and
1476 * there is no bad imbalance.
1477 */
1478 schedstat_inc(this_sd, ttwu_move_affine);
1479 goto out_set_cpu;
1480 }
1481 }
1482
1483 /*
1484 * Start passive balancing when half the imbalance_pct
1485 * limit is reached.
1486 */
1487 if (this_sd->flags & SD_WAKE_BALANCE) {
1488 if (imbalance*this_load <= 100*load) {
1489 schedstat_inc(this_sd, ttwu_move_balance);
1490 goto out_set_cpu;
1491 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001492 }
1493 }
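	/*
	 * Rough worked example (illustrative numbers only): with an
	 * imbalance_pct of 125, the imbalance computed above is
	 * 100 + (125 - 100) / 2 == 112.  The SD_WAKE_BALANCE test
	 * "imbalance*this_load <= 100*load" then keeps the wakeup on
	 * this_cpu only while 112*this_load <= 100*load, i.e. while
	 * this CPU carries roughly 11% less load than the CPU the
	 * task last ran on.
	 */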
1494
1495 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1496out_set_cpu:
1497 new_cpu = wake_idle(new_cpu, p);
1498 if (new_cpu != cpu) {
1499 set_task_cpu(p, new_cpu);
1500 task_rq_unlock(rq, &flags);
1501 /* might preempt at this point */
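		/*
		 * While the lock was dropped the task may have been woken
		 * by someone else; re-check its state and on_rq below
		 * against the (possibly new) runqueue.
		 */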
1502 rq = task_rq_lock(p, &flags);
1503 old_state = p->state;
1504 if (!(old_state & state))
1505 goto out;
Ingo Molnardd41f592007-07-09 18:51:59 +02001506 if (p->se.on_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001507 goto out_running;
1508
1509 this_cpu = smp_processor_id();
1510 cpu = task_cpu(p);
1511 }
1512
1513out_activate:
1514#endif /* CONFIG_SMP */
Ingo Molnardd41f592007-07-09 18:51:59 +02001515 activate_task(rq, p, 1);
Ingo Molnard79fc0f2005-09-10 00:26:12 -07001516 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001517 * Sync wakeups (i.e. those types of wakeups where the waker
1518 * has indicated that it will leave the CPU in short order)
1519 * don't trigger a preemption if the woken-up task will run on
1520 * this CPU. (In that case the 'I will reschedule' promise of
1521 * the waker guarantees that the freshly woken task is going
1522 * to be considered on this CPU.)
1523 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001524 if (!sync || cpu != this_cpu)
1525 check_preempt_curr(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001526 success = 1;
1527
1528out_running:
1529 p->state = TASK_RUNNING;
1530out:
1531 task_rq_unlock(rq, &flags);
1532
1533 return success;
1534}
1535
Ingo Molnar36c8b582006-07-03 00:25:41 -07001536int fastcall wake_up_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001537{
1538 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1539 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1540}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001541EXPORT_SYMBOL(wake_up_process);
1542
Ingo Molnar36c8b582006-07-03 00:25:41 -07001543int fastcall wake_up_state(struct task_struct *p, unsigned int state)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001544{
1545 return try_to_wake_up(p, state, 0);
1546}
1547
Linus Torvalds1da177e2005-04-16 15:20:36 -07001548/*
1549 * Perform scheduler-related setup for a newly forked process p.
1550 * p is forked by current.
Ingo Molnardd41f592007-07-09 18:51:59 +02001551 *
1552 * __sched_fork() is the basic setup used by init_idle() too:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001553 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001554static void __sched_fork(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001555{
Ingo Molnardd41f592007-07-09 18:51:59 +02001556 p->se.wait_start_fair = 0;
1557 p->se.wait_start = 0;
1558 p->se.exec_start = 0;
1559 p->se.sum_exec_runtime = 0;
1560 p->se.delta_exec = 0;
1561 p->se.delta_fair_run = 0;
1562 p->se.delta_fair_sleep = 0;
1563 p->se.wait_runtime = 0;
1564 p->se.sum_wait_runtime = 0;
1565 p->se.sum_sleep_runtime = 0;
1566 p->se.sleep_start = 0;
1567 p->se.sleep_start_fair = 0;
1568 p->se.block_start = 0;
1569 p->se.sleep_max = 0;
1570 p->se.block_max = 0;
1571 p->se.exec_max = 0;
1572 p->se.wait_max = 0;
1573 p->se.wait_runtime_overruns = 0;
1574 p->se.wait_runtime_underruns = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07001575
Ingo Molnardd41f592007-07-09 18:51:59 +02001576 INIT_LIST_HEAD(&p->run_list);
1577 p->se.on_rq = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07001578
Linus Torvalds1da177e2005-04-16 15:20:36 -07001579 /*
1580 * We mark the process as running here, but have not actually
1581 * inserted it onto the runqueue yet. This guarantees that
1582 * nobody will actually run it, and a signal or other external
1583 * event cannot wake it up and insert it on the runqueue either.
1584 */
1585 p->state = TASK_RUNNING;
Ingo Molnardd41f592007-07-09 18:51:59 +02001586}
1587
1588/*
1589 * fork()/clone()-time setup:
1590 */
1591void sched_fork(struct task_struct *p, int clone_flags)
1592{
1593 int cpu = get_cpu();
1594
1595 __sched_fork(p);
1596
1597#ifdef CONFIG_SMP
1598 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1599#endif
1600 __set_task_cpu(p, cpu);
Ingo Molnarb29739f2006-06-27 02:54:51 -07001601
1602 /*
1603 * Make sure we do not leak PI boosting priority to the child:
1604 */
1605 p->prio = current->normal_prio;
1606
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07001607#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
Ingo Molnardd41f592007-07-09 18:51:59 +02001608 if (likely(sched_info_on()))
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07001609 memset(&p->sched_info, 0, sizeof(p->sched_info));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001610#endif
Chen, Kenneth Wd6077cb2006-02-14 13:53:10 -08001611#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
Nick Piggin4866cde2005-06-25 14:57:23 -07001612 p->oncpu = 0;
1613#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001614#ifdef CONFIG_PREEMPT
Nick Piggin4866cde2005-06-25 14:57:23 -07001615 /* Want to start with kernel preemption disabled. */
Al Viroa1261f542005-11-13 16:06:55 -08001616 task_thread_info(p)->preempt_count = 1;
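	/*
	 * The count is dropped again the first time the child runs,
	 * when schedule_tail() releases the runqueue lock (or, for
	 * __ARCH_WANT_UNLOCKED_CTXSW, re-enables preemption explicitly).
	 */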
Linus Torvalds1da177e2005-04-16 15:20:36 -07001617#endif
Nick Piggin476d1392005-06-25 14:57:29 -07001618 put_cpu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001619}
1620
1621/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001622 * After fork, the child runs first by default. If set to 0, the
1623 * parent will (try to) run first.
1624 */
1625unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1626
1627/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001628 * wake_up_new_task - wake up a newly created task for the first time.
1629 *
1630 * This function will do some initial scheduler statistics housekeeping
1631 * that must be done for every newly created context, then puts the task
1632 * on the runqueue and wakes it.
1633 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001634void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001635{
1636 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02001637 struct rq *rq;
1638 int this_cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001639
1640 rq = task_rq_lock(p, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001641 BUG_ON(p->state != TASK_RUNNING);
Ingo Molnardd41f592007-07-09 18:51:59 +02001642 this_cpu = smp_processor_id(); /* parent's CPU */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001643
1644 p->prio = effective_prio(p);
1645
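	/*
	 * Use a plain activation when the child-runs-first treatment
	 * does not apply: the sysctl is off, parent and child share a VM
	 * (CLONE_VM), the child was placed on a different CPU, or the
	 * parent is not currently on a runqueue.
	 */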
Ingo Molnardd41f592007-07-09 18:51:59 +02001646 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1647 task_cpu(p) != this_cpu || !current->se.on_rq) {
1648 activate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001649 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001650 /*
Ingo Molnardd41f592007-07-09 18:51:59 +02001651 * Let the scheduling class do new task startup
1652 * management (if any):
Linus Torvalds1da177e2005-04-16 15:20:36 -07001653 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001654 p->sched_class->task_new(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001655 }
Ingo Molnardd41f592007-07-09 18:51:59 +02001656 check_preempt_curr(rq, p);
1657 task_rq_unlock(rq, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001658}
1659
Linus Torvalds1da177e2005-04-16 15:20:36 -07001660/**
Nick Piggin4866cde2005-06-25 14:57:23 -07001661 * prepare_task_switch - prepare to switch tasks
1662 * @rq: the runqueue preparing to switch
1663 * @next: the task we are going to switch to.
1664 *
1665 * This is called with the rq lock held and interrupts off. It must
1666 * be paired with a subsequent finish_task_switch after the context
1667 * switch.
1668 *
1669 * prepare_task_switch sets up locking and calls architecture specific
1670 * hooks.
1671 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001672static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -07001673{
1674 prepare_lock_switch(rq, next);
1675 prepare_arch_switch(next);
1676}
1677
1678/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001679 * finish_task_switch - clean up after a task-switch
Jeff Garzik344baba2005-09-07 01:15:17 -04001680 * @rq: runqueue associated with task-switch
Linus Torvalds1da177e2005-04-16 15:20:36 -07001681 * @prev: the thread we just switched away from.
1682 *
Nick Piggin4866cde2005-06-25 14:57:23 -07001683 * finish_task_switch must be called after the context switch, paired
1684 * with a prepare_task_switch call before the context switch.
1685 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1686 * and do any other architecture-specific cleanup actions.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001687 *
1688 * Note that we may have delayed dropping an mm in context_switch(). If
1689 * so, we finish that here outside of the runqueue lock. (Doing it
1690 * with the lock held can cause deadlocks; see schedule() for
1691 * details.)
1692 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001693static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001694 __releases(rq->lock)
1695{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001696 struct mm_struct *mm = rq->prev_mm;
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001697 long prev_state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698
1699 rq->prev_mm = NULL;
1700
1701 /*
1702 * A task struct has one reference for its use as "current".
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001703 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001704 * schedule one last time. The schedule call will never return, and
1705 * the scheduled task must drop that reference.
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001706 * The test for TASK_DEAD must occur while the runqueue locks are
Linus Torvalds1da177e2005-04-16 15:20:36 -07001707 * still held, otherwise prev could be scheduled on another cpu, die
1708 * there before we look at prev->state, and then the reference would
1709 * be dropped twice.
1710 * Manfred Spraul <manfred@colorfullife.com>
1711 */
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001712 prev_state = prev->state;
Nick Piggin4866cde2005-06-25 14:57:23 -07001713 finish_arch_switch(prev);
1714 finish_lock_switch(rq, prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001715 if (mm)
1716 mmdrop(mm);
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001717 if (unlikely(prev_state == TASK_DEAD)) {
bibo maoc6fd91f2006-03-26 01:38:20 -08001718 /*
1719 * Remove function-return probe instances associated with this
1720 * task and put them back on the free list.
1721 */
1722 kprobe_flush_task(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001723 put_task_struct(prev);
bibo maoc6fd91f2006-03-26 01:38:20 -08001724 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001725}
1726
1727/**
1728 * schedule_tail - first thing a freshly forked thread must call.
1729 * @prev: the thread we just switched away from.
1730 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001731asmlinkage void schedule_tail(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001732 __releases(rq->lock)
1733{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001734 struct rq *rq = this_rq();
1735
Nick Piggin4866cde2005-06-25 14:57:23 -07001736 finish_task_switch(rq, prev);
1737#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1738 /* In this case, finish_task_switch does not reenable preemption */
1739 preempt_enable();
1740#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001741 if (current->set_child_tid)
1742 put_user(current->pid, current->set_child_tid);
1743}
1744
1745/*
1746 * context_switch - switch to the new MM and the new
1747 * thread's register state.
1748 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001749static inline void
Ingo Molnar70b97a72006-07-03 00:25:42 -07001750context_switch(struct rq *rq, struct task_struct *prev,
Ingo Molnar36c8b582006-07-03 00:25:41 -07001751 struct task_struct *next)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001752{
Ingo Molnardd41f592007-07-09 18:51:59 +02001753 struct mm_struct *mm, *oldmm;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001754
Ingo Molnardd41f592007-07-09 18:51:59 +02001755 prepare_task_switch(rq, next);
1756 mm = next->mm;
1757 oldmm = prev->active_mm;
Zachary Amsden9226d122007-02-13 13:26:21 +01001758 /*
1759 * For paravirt, this is coupled with an exit in switch_to to
1760 * combine the page table reload and the switch backend into
1761 * one hypercall.
1762 */
1763 arch_enter_lazy_cpu_mode();
1764
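	/*
	 * Kernel threads have no mm of their own: they keep running on
	 * the previous task's active_mm in lazy-TLB mode instead of
	 * switching page tables.
	 */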
Ingo Molnardd41f592007-07-09 18:51:59 +02001765 if (unlikely(!mm)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001766 next->active_mm = oldmm;
1767 atomic_inc(&oldmm->mm_count);
1768 enter_lazy_tlb(oldmm, next);
1769 } else
1770 switch_mm(oldmm, mm, next);
1771
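	/*
	 * If the previous task was a kernel thread it only borrowed
	 * oldmm; stash it in rq->prev_mm so that finish_task_switch()
	 * can mmdrop() it once the runqueue lock has been released.
	 */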
Ingo Molnardd41f592007-07-09 18:51:59 +02001772 if (unlikely(!prev->mm)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001773 prev->active_mm = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001774 rq->prev_mm = oldmm;
1775 }
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07001776 /*
1777 * The runqueue lock will be released by the next
1778 * task (which is an invalid locking op, but in the case
1779 * of the scheduler it's an obvious special case), so we
1780 * do an early lockdep release here:
1781 */
1782#ifndef __ARCH_WANT_UNLOCKED_CTXSW
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07001783 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07001784#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001785
1786 /* Here we just switch the register state and the stack. */
1787 switch_to(prev, next, prev);
1788
Ingo Molnardd41f592007-07-09 18:51:59 +02001789 barrier();
1790 /*
1791 * this_rq must be evaluated again because prev may have moved
1792 * CPUs since it called schedule(), thus the 'rq' on its stack
1793 * frame will be invalid.
1794 */
1795 finish_task_switch(this_rq(), prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001796}
1797
1798/*
1799 * nr_running, nr_uninterruptible and nr_context_switches:
1800 *
1801 * externally visible scheduler statistics: current number of runnable
1802 * threads, current number of uninterruptible-sleeping threads, total
1803 * number of context switches performed since bootup.
1804 */
1805unsigned long nr_running(void)
1806{
1807 unsigned long i, sum = 0;
1808
1809 for_each_online_cpu(i)
1810 sum += cpu_rq(i)->nr_running;
1811
1812 return sum;
1813}
1814
1815unsigned long nr_uninterruptible(void)
1816{
1817 unsigned long i, sum = 0;
1818
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001819 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820 sum += cpu_rq(i)->nr_uninterruptible;
1821
1822 /*
1823 * Since we read the counters locklessly, the sum might be slightly
1824 * inaccurate. Do not allow it to go below zero though:
1825 */
1826 if (unlikely((long)sum < 0))
1827 sum = 0;
1828
1829 return sum;
1830}
1831
1832unsigned long long nr_context_switches(void)
1833{
Steven Rostedtcc94abf2006-06-27 02:54:31 -07001834 int i;
1835 unsigned long long sum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001836
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001837 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001838 sum += cpu_rq(i)->nr_switches;
1839
1840 return sum;
1841}
1842
1843unsigned long nr_iowait(void)
1844{
1845 unsigned long i, sum = 0;
1846
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001847 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001848 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1849
1850 return sum;
1851}
1852
Jack Steinerdb1b1fe2006-03-31 02:31:21 -08001853unsigned long nr_active(void)
1854{
1855 unsigned long i, running = 0, uninterruptible = 0;
1856
1857 for_each_online_cpu(i) {
1858 running += cpu_rq(i)->nr_running;
1859 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1860 }
1861
1862 if (unlikely((long)uninterruptible < 0))
1863 uninterruptible = 0;
1864
1865 return running + uninterruptible;
1866}
1867
Linus Torvalds1da177e2005-04-16 15:20:36 -07001868/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001869 * Update rq->cpu_load[] statistics. This function is usually called every
1870 * scheduler tick (TICK_NSEC).
Ingo Molnar48f24c42006-07-03 00:25:40 -07001871 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001872static void update_cpu_load(struct rq *this_rq)
Ingo Molnar48f24c42006-07-03 00:25:40 -07001873{
Ingo Molnardd41f592007-07-09 18:51:59 +02001874 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1875 unsigned long total_load = this_rq->ls.load.weight;
1876 unsigned long this_load = total_load;
1877 struct load_stat *ls = &this_rq->ls;
1878 u64 now = __rq_clock(this_rq);
1879 int i, scale;
1880
1881 this_rq->nr_load_updates++;
1882 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1883 goto do_avg;
1884
1885 /* Update delta_fair/delta_exec fields first */
1886 update_curr_load(this_rq, now);
1887
1888 fair_delta64 = ls->delta_fair + 1;
1889 ls->delta_fair = 0;
1890
1891 exec_delta64 = ls->delta_exec + 1;
1892 ls->delta_exec = 0;
1893
1894 sample_interval64 = now - ls->load_update_last;
1895 ls->load_update_last = now;
1896
1897 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1898 sample_interval64 = TICK_NSEC;
1899
1900 if (exec_delta64 > sample_interval64)
1901 exec_delta64 = sample_interval64;
1902
1903 idle_delta64 = sample_interval64 - exec_delta64;
1904
1905 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1906 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1907
1908 this_load = (unsigned long)tmp64;
1909
1910do_avg:
1911
1912 /* Update our load: */
1913 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1914 unsigned long old_load, new_load;
1915
1916 /* scale is effectively 1 << i now, and >> i divides by scale */
1917
1918 old_load = this_rq->cpu_load[i];
1919 new_load = this_load;
1920
1921 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1922 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07001923}
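/*
 * A small worked example of the decay above (illustrative numbers
 * only): with scale == 1 << i, each update computes
 *
 *	cpu_load[i] = (old_load * (2^i - 1) + this_load) / 2^i
 *
 * so for i == 2, old_load == 1000 and this_load == 200 the new value
 * is (1000*3 + 200) >> 2 == 800: larger i means a longer-term, more
 * slowly moving average, while cpu_load[0] tracks this_load directly.
 */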
1924
Ingo Molnardd41f592007-07-09 18:51:59 +02001925#ifdef CONFIG_SMP
1926
Ingo Molnar48f24c42006-07-03 00:25:40 -07001927/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001928 * double_rq_lock - safely lock two runqueues
1929 *
1930 * Note this does not disable interrupts like task_rq_lock;
1931 * you need to do so manually before calling.
1932 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001933static void double_rq_lock(struct rq *rq1, struct rq *rq2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001934 __acquires(rq1->lock)
1935 __acquires(rq2->lock)
1936{
Kirill Korotaev054b9102006-12-10 02:20:11 -08001937 BUG_ON(!irqs_disabled());
Linus Torvalds1da177e2005-04-16 15:20:36 -07001938 if (rq1 == rq2) {
1939 spin_lock(&rq1->lock);
1940 __acquire(rq2->lock); /* Fake it out ;) */
1941 } else {
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07001942 if (rq1 < rq2) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001943 spin_lock(&rq1->lock);
1944 spin_lock(&rq2->lock);
1945 } else {
1946 spin_lock(&rq2->lock);
1947 spin_lock(&rq1->lock);
1948 }
1949 }
1950}
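/*
 * Typical usage, as in load_balance() below (sketch only):
 *
 *	local_irq_save(flags);
 *	double_rq_lock(this_rq, busiest);
 *	... move tasks between the two runqueues ...
 *	double_rq_unlock(this_rq, busiest);
 *	local_irq_restore(flags);
 *
 * Both runqueues are always taken in address order, so two CPUs
 * locking the same pair cannot deadlock against each other.
 */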
1951
1952/*
1953 * double_rq_unlock - safely unlock two runqueues
1954 *
1955 * Note this does not restore interrupts like task_rq_unlock;
1956 * you need to do so manually after calling.
1957 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001958static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001959 __releases(rq1->lock)
1960 __releases(rq2->lock)
1961{
1962 spin_unlock(&rq1->lock);
1963 if (rq1 != rq2)
1964 spin_unlock(&rq2->lock);
1965 else
1966 __release(rq2->lock);
1967}
1968
1969/*
1970 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1971 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001972static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001973 __releases(this_rq->lock)
1974 __acquires(busiest->lock)
1975 __acquires(this_rq->lock)
1976{
Kirill Korotaev054b9102006-12-10 02:20:11 -08001977 if (unlikely(!irqs_disabled())) {
1978 /* printk() doesn't work well under rq->lock */
1979 spin_unlock(&this_rq->lock);
1980 BUG_ON(1);
1981 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001982 if (unlikely(!spin_trylock(&busiest->lock))) {
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07001983 if (busiest < this_rq) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001984 spin_unlock(&this_rq->lock);
1985 spin_lock(&busiest->lock);
1986 spin_lock(&this_rq->lock);
1987 } else
1988 spin_lock(&busiest->lock);
1989 }
1990}
1991
1992/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001993 * If dest_cpu is allowed for this process, migrate the task to it.
1994 * This is done by handing a migration request to the per-CPU
1995 * migration thread (when one is needed) and waiting for the
1996 * migration to complete.
1997 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001998static void sched_migrate_task(struct task_struct *p, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001999{
Ingo Molnar70b97a72006-07-03 00:25:42 -07002000 struct migration_req req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002001 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002002 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002003
2004 rq = task_rq_lock(p, &flags);
2005 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2006 || unlikely(cpu_is_offline(dest_cpu)))
2007 goto out;
2008
2009 /* force the process onto the specified CPU */
2010 if (migrate_task(p, dest_cpu, &req)) {
2011 /* Need to wait for migration thread (might exit: take ref). */
2012 struct task_struct *mt = rq->migration_thread;
Ingo Molnar36c8b582006-07-03 00:25:41 -07002013
Linus Torvalds1da177e2005-04-16 15:20:36 -07002014 get_task_struct(mt);
2015 task_rq_unlock(rq, &flags);
2016 wake_up_process(mt);
2017 put_task_struct(mt);
2018 wait_for_completion(&req.done);
Ingo Molnar36c8b582006-07-03 00:25:41 -07002019
Linus Torvalds1da177e2005-04-16 15:20:36 -07002020 return;
2021 }
2022out:
2023 task_rq_unlock(rq, &flags);
2024}
2025
2026/*
Nick Piggin476d1392005-06-25 14:57:29 -07002027 * sched_exec - execve() is a valuable balancing opportunity, because at
2028 * this point the task has the smallest effective memory and cache footprint.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002029 */
2030void sched_exec(void)
2031{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002032 int new_cpu, this_cpu = get_cpu();
Nick Piggin476d1392005-06-25 14:57:29 -07002033 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002034 put_cpu();
Nick Piggin476d1392005-06-25 14:57:29 -07002035 if (new_cpu != this_cpu)
2036 sched_migrate_task(current, new_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002037}
2038
2039/*
2040 * pull_task - move a task from a remote runqueue to the local runqueue.
2041 * Both runqueues must be locked.
2042 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002043static void pull_task(struct rq *src_rq, struct task_struct *p,
2044 struct rq *this_rq, int this_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002045{
Ingo Molnardd41f592007-07-09 18:51:59 +02002046 deactivate_task(src_rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002047 set_task_cpu(p, this_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02002048 activate_task(this_rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002049 /*
2050 * Note that idle threads have a prio of MAX_PRIO, so this test
2051 * is always true for them.
2052 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002053 check_preempt_curr(this_rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002054}
2055
2056/*
2057 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2058 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08002059static
Ingo Molnar70b97a72006-07-03 00:25:42 -07002060int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002061 struct sched_domain *sd, enum cpu_idle_type idle,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07002062 int *all_pinned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002063{
2064 /*
2065 * We do not migrate tasks that are:
2066 * 1) running (obviously), or
2067 * 2) not allowed to run on this CPU due to cpus_allowed, or
2068 * 3) cache-hot on their current CPU.
2069 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002070 if (!cpu_isset(this_cpu, p->cpus_allowed))
2071 return 0;
Nick Piggin81026792005-06-25 14:57:07 -07002072 *all_pinned = 0;
2073
2074 if (task_running(rq, p))
2075 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002076
2077 /*
Ingo Molnardd41f592007-07-09 18:51:59 +02002078 * Aggressive migration if too many balance attempts have failed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002079 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002080 if (sd->nr_balance_failed > sd->cache_nice_tries)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002081 return 1;
2082
Linus Torvalds1da177e2005-04-16 15:20:36 -07002083 return 1;
2084}
2085
Ingo Molnardd41f592007-07-09 18:51:59 +02002086static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2087 unsigned long max_nr_move, unsigned long max_load_move,
2088 struct sched_domain *sd, enum cpu_idle_type idle,
2089 int *all_pinned, unsigned long *load_moved,
2090 int this_best_prio, int best_prio, int best_prio_seen,
2091 struct rq_iterator *iterator)
2092{
2093 int pulled = 0, pinned = 0, skip_for_load;
2094 struct task_struct *p;
2095 long rem_load_move = max_load_move;
2096
2097 if (max_nr_move == 0 || max_load_move == 0)
2098 goto out;
2099
2100 pinned = 1;
2101
2102 /*
2103 * Start the load-balancing iterator:
2104 */
2105 p = iterator->start(iterator->arg);
2106next:
2107 if (!p)
2108 goto out;
2109 /*
2110 * To help distribute high priority tasks across CPUs we don't
2111 * skip a task if it will be the highest priority task (i.e. smallest
2112 * prio value) on its new queue, regardless of its load weight.
2113 */
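	/*
	 * The base rule below: skip a task whose weight is more than
	 * roughly twice the load we still want to move.
	 */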
2114 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2115 SCHED_LOAD_SCALE_FUZZ;
2116 if (skip_for_load && p->prio < this_best_prio)
2117 skip_for_load = !best_prio_seen && p->prio == best_prio;
2118 if (skip_for_load ||
2119 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2120
2121 best_prio_seen |= p->prio == best_prio;
2122 p = iterator->next(iterator->arg);
2123 goto next;
2124 }
2125
2126 pull_task(busiest, p, this_rq, this_cpu);
2127 pulled++;
2128 rem_load_move -= p->se.load.weight;
2129
2130 /*
2131 * We only want to steal up to the prescribed number of tasks
2132 * and the prescribed amount of weighted load.
2133 */
2134 if (pulled < max_nr_move && rem_load_move > 0) {
2135 if (p->prio < this_best_prio)
2136 this_best_prio = p->prio;
2137 p = iterator->next(iterator->arg);
2138 goto next;
2139 }
2140out:
2141 /*
2142 * Right now, this is the only place pull_task() is called,
2143 * so we can safely collect pull_task() stats here rather than
2144 * inside pull_task().
2145 */
2146 schedstat_add(sd, lb_gained[idle], pulled);
2147
2148 if (all_pinned)
2149 *all_pinned = pinned;
2150 *load_moved = max_load_move - rem_load_move;
2151 return pulled;
2152}
Ingo Molnar48f24c42006-07-03 00:25:40 -07002153
Linus Torvalds1da177e2005-04-16 15:20:36 -07002154/*
Peter Williams2dd73a42006-06-27 02:54:34 -07002155 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2156 * load from busiest to this_rq, as part of a balancing operation within
2157 * "domain". Returns the number of tasks moved.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002158 *
2159 * Called with both runqueues locked.
2160 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002161static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
Peter Williams2dd73a42006-06-27 02:54:34 -07002162 unsigned long max_nr_move, unsigned long max_load_move,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002163 struct sched_domain *sd, enum cpu_idle_type idle,
Peter Williams2dd73a42006-06-27 02:54:34 -07002164 int *all_pinned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002165{
Ingo Molnardd41f592007-07-09 18:51:59 +02002166 struct sched_class *class = sched_class_highest;
2167 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2168 long rem_load_move = max_load_move;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002169
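	/*
	 * The loop below walks the scheduling classes from the highest
	 * priority one downwards (typically the real-time class before
	 * the fair class), so higher classes get the first chance to
	 * supply tasks to move.
	 */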
Ingo Molnardd41f592007-07-09 18:51:59 +02002170 do {
2171 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2172 max_nr_move, (unsigned long)rem_load_move,
2173 sd, idle, all_pinned, &load_moved);
2174 total_nr_moved += nr_moved;
2175 max_nr_move -= nr_moved;
2176 rem_load_move -= load_moved;
2177 class = class->next;
2178 } while (class && max_nr_move && rem_load_move > 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002179
Ingo Molnardd41f592007-07-09 18:51:59 +02002180 return total_nr_moved;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002181}
2182
2183/*
2184 * find_busiest_group finds and returns the busiest CPU group within the
Ingo Molnar48f24c42006-07-03 00:25:40 -07002185 * domain. It calculates and returns the amount of weighted load which
2186 * should be moved to restore balance via the imbalance parameter.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002187 */
2188static struct sched_group *
2189find_busiest_group(struct sched_domain *sd, int this_cpu,
Ingo Molnardd41f592007-07-09 18:51:59 +02002190 unsigned long *imbalance, enum cpu_idle_type idle,
2191 int *sd_idle, cpumask_t *cpus, int *balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002192{
2193 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2194 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002195 unsigned long max_pull;
Peter Williams2dd73a42006-06-27 02:54:34 -07002196 unsigned long busiest_load_per_task, busiest_nr_running;
2197 unsigned long this_load_per_task, this_nr_running;
Nick Piggin78979862005-06-25 14:57:13 -07002198 int load_idx;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002199#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2200 int power_savings_balance = 1;
2201 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2202 unsigned long min_nr_running = ULONG_MAX;
2203 struct sched_group *group_min = NULL, *group_leader = NULL;
2204#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002205
2206 max_load = this_load = total_load = total_pwr = 0;
Peter Williams2dd73a42006-06-27 02:54:34 -07002207 busiest_load_per_task = busiest_nr_running = 0;
2208 this_load_per_task = this_nr_running = 0;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002209 if (idle == CPU_NOT_IDLE)
Nick Piggin78979862005-06-25 14:57:13 -07002210 load_idx = sd->busy_idx;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002211 else if (idle == CPU_NEWLY_IDLE)
Nick Piggin78979862005-06-25 14:57:13 -07002212 load_idx = sd->newidle_idx;
2213 else
2214 load_idx = sd->idle_idx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002215
2216 do {
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002217 unsigned long load, group_capacity;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002218 int local_group;
2219 int i;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002220 unsigned int balance_cpu = -1, first_idle_cpu = 0;
Peter Williams2dd73a42006-06-27 02:54:34 -07002221 unsigned long sum_nr_running, sum_weighted_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002222
2223 local_group = cpu_isset(this_cpu, group->cpumask);
2224
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002225 if (local_group)
2226 balance_cpu = first_cpu(group->cpumask);
2227
Linus Torvalds1da177e2005-04-16 15:20:36 -07002228 /* Tally up the load of all CPUs in the group */
Peter Williams2dd73a42006-06-27 02:54:34 -07002229 sum_weighted_load = sum_nr_running = avg_load = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002230
2231 for_each_cpu_mask(i, group->cpumask) {
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002232 struct rq *rq;
2233
2234 if (!cpu_isset(i, *cpus))
2235 continue;
2236
2237 rq = cpu_rq(i);
Peter Williams2dd73a42006-06-27 02:54:34 -07002238
Nick Piggin5969fe02005-09-10 00:26:19 -07002239 if (*sd_idle && !idle_cpu(i))
2240 *sd_idle = 0;
2241
Linus Torvalds1da177e2005-04-16 15:20:36 -07002242 /* Bias balancing toward cpus of our domain */
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002243 if (local_group) {
2244 if (idle_cpu(i) && !first_idle_cpu) {
2245 first_idle_cpu = 1;
2246 balance_cpu = i;
2247 }
2248
Nick Piggina2000572006-02-10 01:51:02 -08002249 load = target_load(i, load_idx);
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002250 } else
Nick Piggina2000572006-02-10 01:51:02 -08002251 load = source_load(i, load_idx);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002252
2253 avg_load += load;
Peter Williams2dd73a42006-06-27 02:54:34 -07002254 sum_nr_running += rq->nr_running;
Ingo Molnardd41f592007-07-09 18:51:59 +02002255 sum_weighted_load += weighted_cpuload(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002256 }
2257
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002258 /*
2259 * The first idle CPU, or the first CPU (the busiest) in this sched
2260 * group, is eligible for doing load balancing at this and higher
2261 * domains.
2262 */
2263 if (local_group && balance_cpu != this_cpu && balance) {
2264 *balance = 0;
2265 goto ret;
2266 }
2267
Linus Torvalds1da177e2005-04-16 15:20:36 -07002268 total_load += avg_load;
Eric Dumazet5517d862007-05-08 00:32:57 -07002269 total_pwr += group->__cpu_power;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002270
2271 /* Adjust by relative CPU power of the group */
Eric Dumazet5517d862007-05-08 00:32:57 -07002272 avg_load = sg_div_cpu_power(group,
2273 avg_load * SCHED_LOAD_SCALE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002274
Eric Dumazet5517d862007-05-08 00:32:57 -07002275 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002276
Linus Torvalds1da177e2005-04-16 15:20:36 -07002277 if (local_group) {
2278 this_load = avg_load;
2279 this = group;
Peter Williams2dd73a42006-06-27 02:54:34 -07002280 this_nr_running = sum_nr_running;
2281 this_load_per_task = sum_weighted_load;
2282 } else if (avg_load > max_load &&
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002283 sum_nr_running > group_capacity) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002284 max_load = avg_load;
2285 busiest = group;
Peter Williams2dd73a42006-06-27 02:54:34 -07002286 busiest_nr_running = sum_nr_running;
2287 busiest_load_per_task = sum_weighted_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002288 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002289
2290#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2291 /*
2292 * Busy processors will not participate in power savings
2293 * balance.
2294 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002295 if (idle == CPU_NOT_IDLE ||
2296 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2297 goto group_next;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002298
2299 /*
2300 * If the local group is idle or completely loaded
2301 * no need to do power savings balance at this domain
2302 */
2303 if (local_group && (this_nr_running >= group_capacity ||
2304 !this_nr_running))
2305 power_savings_balance = 0;
2306
Ingo Molnardd41f592007-07-09 18:51:59 +02002307 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002308 * If a group is already running at full capacity or idle,
2309 * don't include that group in power savings calculations
Ingo Molnardd41f592007-07-09 18:51:59 +02002310 */
2311 if (!power_savings_balance || sum_nr_running >= group_capacity
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002312 || !sum_nr_running)
Ingo Molnardd41f592007-07-09 18:51:59 +02002313 goto group_next;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002314
Ingo Molnardd41f592007-07-09 18:51:59 +02002315 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002316 * Calculate the group which has the least non-idle load.
Ingo Molnardd41f592007-07-09 18:51:59 +02002317 * This is the group from which we need to pick up the load
2318 * in order to save power.
2319 */
2320 if ((sum_nr_running < min_nr_running) ||
2321 (sum_nr_running == min_nr_running &&
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002322 first_cpu(group->cpumask) <
2323 first_cpu(group_min->cpumask))) {
Ingo Molnardd41f592007-07-09 18:51:59 +02002324 group_min = group;
2325 min_nr_running = sum_nr_running;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002326 min_load_per_task = sum_weighted_load /
2327 sum_nr_running;
Ingo Molnardd41f592007-07-09 18:51:59 +02002328 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002329
Ingo Molnardd41f592007-07-09 18:51:59 +02002330 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002331 * Calculate the group which is nearly at its
Ingo Molnardd41f592007-07-09 18:51:59 +02002332 * capacity but still has some space to pick up some load
2333 * from other groups and save more power.
2334 */
2335 if (sum_nr_running <= group_capacity - 1) {
2336 if (sum_nr_running > leader_nr_running ||
2337 (sum_nr_running == leader_nr_running &&
2338 first_cpu(group->cpumask) >
2339 first_cpu(group_leader->cpumask))) {
2340 group_leader = group;
2341 leader_nr_running = sum_nr_running;
2342 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07002343 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002344group_next:
2345#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002346 group = group->next;
2347 } while (group != sd->groups);
2348
Peter Williams2dd73a42006-06-27 02:54:34 -07002349 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002350 goto out_balanced;
2351
2352 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2353
2354 if (this_load >= avg_load ||
2355 100*max_load <= sd->imbalance_pct*this_load)
2356 goto out_balanced;
2357
Peter Williams2dd73a42006-06-27 02:54:34 -07002358 busiest_load_per_task /= busiest_nr_running;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002359 /*
2360 * We're trying to get all the cpus to the average_load, so we don't
2361 * want to push ourselves above the average load, nor do we wish to
2362 * reduce the max loaded cpu below the average load, as either of these
2363 * actions would just result in more rebalancing later, and ping-pong
2364 * tasks around. Thus we look for the minimum possible imbalance.
2365 * Negative imbalances (*we* are more loaded than anyone else) will
2366 * be counted as no imbalance for these purposes -- we can't fix that
2367 * by pulling tasks to us. Be careful of negative numbers as they'll
2368 * appear as very large values with unsigned longs.
2369 */
Peter Williams2dd73a42006-06-27 02:54:34 -07002370 if (max_load <= busiest_load_per_task)
2371 goto out_balanced;
2372
2373 /*
2374 * In the presence of smp nice balancing, certain scenarios can have
2375 * max load less than avg load (as we skip the groups at or below
2376 * their cpu_power while calculating max_load).
2377 */
2378 if (max_load < avg_load) {
2379 *imbalance = 0;
2380 goto small_imbalance;
2381 }
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002382
2383 /* Don't want to pull so many tasks that a group would go idle */
Peter Williams2dd73a42006-06-27 02:54:34 -07002384 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002385
Linus Torvalds1da177e2005-04-16 15:20:36 -07002386 /* How much load to actually move to equalise the imbalance */
Eric Dumazet5517d862007-05-08 00:32:57 -07002387 *imbalance = min(max_pull * busiest->__cpu_power,
2388 (avg_load - this_load) * this->__cpu_power)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002389 / SCHED_LOAD_SCALE;
2390
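	/*
	 * Rough worked example (illustrative numbers only, with both
	 * groups' __cpu_power == SCHED_LOAD_SCALE): avg_load == 1500,
	 * this_load == 1000, max_load == 2200 and busiest_load_per_task
	 * == 1100 give max_pull == min(700, 1100) == 700 and
	 * *imbalance == min(700, 500) == 500, i.e. we never try to
	 * pull more than would lift this group above the average.
	 */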
Peter Williams2dd73a42006-06-27 02:54:34 -07002391 /*
2392 * If *imbalance is less than the average load per runnable task
2393 * there is no guarantee that any tasks will be moved, so we
2394 * consider bumping its value to force at least one task to be
2395 * moved.
2396 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002397 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07002398 unsigned long tmp, pwr_now, pwr_move;
Peter Williams2dd73a42006-06-27 02:54:34 -07002399 unsigned int imbn;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002400
Peter Williams2dd73a42006-06-27 02:54:34 -07002401small_imbalance:
2402 pwr_move = pwr_now = 0;
2403 imbn = 2;
2404 if (this_nr_running) {
2405 this_load_per_task /= this_nr_running;
2406 if (busiest_load_per_task > this_load_per_task)
2407 imbn = 1;
2408 } else
2409 this_load_per_task = SCHED_LOAD_SCALE;
2410
Ingo Molnardd41f592007-07-09 18:51:59 +02002411 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2412 busiest_load_per_task * imbn) {
Peter Williams2dd73a42006-06-27 02:54:34 -07002413 *imbalance = busiest_load_per_task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002414 return busiest;
2415 }
2416
2417 /*
2418 * OK, we don't have enough imbalance to justify moving tasks;
2419 * however, we may be able to increase the total CPU power used by
2420 * moving them.
2421 */
2422
Eric Dumazet5517d862007-05-08 00:32:57 -07002423 pwr_now += busiest->__cpu_power *
2424 min(busiest_load_per_task, max_load);
2425 pwr_now += this->__cpu_power *
2426 min(this_load_per_task, this_load);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002427 pwr_now /= SCHED_LOAD_SCALE;
2428
2429 /* Amount of load we'd subtract */
Eric Dumazet5517d862007-05-08 00:32:57 -07002430 tmp = sg_div_cpu_power(busiest,
2431 busiest_load_per_task * SCHED_LOAD_SCALE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002432 if (max_load > tmp)
Eric Dumazet5517d862007-05-08 00:32:57 -07002433 pwr_move += busiest->__cpu_power *
Peter Williams2dd73a42006-06-27 02:54:34 -07002434 min(busiest_load_per_task, max_load - tmp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002435
2436 /* Amount of load we'd add */
Eric Dumazet5517d862007-05-08 00:32:57 -07002437 if (max_load * busiest->__cpu_power <
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08002438 busiest_load_per_task * SCHED_LOAD_SCALE)
Eric Dumazet5517d862007-05-08 00:32:57 -07002439 tmp = sg_div_cpu_power(this,
2440 max_load * busiest->__cpu_power);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002441 else
Eric Dumazet5517d862007-05-08 00:32:57 -07002442 tmp = sg_div_cpu_power(this,
2443 busiest_load_per_task * SCHED_LOAD_SCALE);
2444 pwr_move += this->__cpu_power *
2445 min(this_load_per_task, this_load + tmp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002446 pwr_move /= SCHED_LOAD_SCALE;
2447
2448 /* Move if we gain throughput */
2449 if (pwr_move <= pwr_now)
2450 goto out_balanced;
2451
Peter Williams2dd73a42006-06-27 02:54:34 -07002452 *imbalance = busiest_load_per_task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002453 }
2454
Linus Torvalds1da177e2005-04-16 15:20:36 -07002455 return busiest;
2456
2457out_balanced:
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002458#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002459 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002460 goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002461
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002462 if (this == group_leader && group_leader != group_min) {
2463 *imbalance = min_load_per_task;
2464 return group_min;
2465 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002466#endif
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002467ret:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002468 *imbalance = 0;
2469 return NULL;
2470}
2471
2472/*
2473 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2474 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002475static struct rq *
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002476find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002477 unsigned long imbalance, cpumask_t *cpus)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002478{
Ingo Molnar70b97a72006-07-03 00:25:42 -07002479 struct rq *busiest = NULL, *rq;
Peter Williams2dd73a42006-06-27 02:54:34 -07002480 unsigned long max_load = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002481 int i;
2482
2483 for_each_cpu_mask(i, group->cpumask) {
Ingo Molnardd41f592007-07-09 18:51:59 +02002484 unsigned long wl;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002485
2486 if (!cpu_isset(i, *cpus))
2487 continue;
2488
Ingo Molnar48f24c42006-07-03 00:25:40 -07002489 rq = cpu_rq(i);
Ingo Molnardd41f592007-07-09 18:51:59 +02002490 wl = weighted_cpuload(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491
Ingo Molnardd41f592007-07-09 18:51:59 +02002492 if (rq->nr_running == 1 && wl > imbalance)
Peter Williams2dd73a42006-06-27 02:54:34 -07002493 continue;
2494
Ingo Molnardd41f592007-07-09 18:51:59 +02002495 if (wl > max_load) {
2496 max_load = wl;
Ingo Molnar48f24c42006-07-03 00:25:40 -07002497 busiest = rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002498 }
2499 }
2500
2501 return busiest;
2502}
2503
2504/*
Nick Piggin77391d72005-06-25 14:57:30 -07002505 * Max backoff if we encounter pinned tasks. Pretty arbitrary value,
2506 * but it works so long as it is large enough.
2507 */
2508#define MAX_PINNED_INTERVAL 512
2509
Ingo Molnar48f24c42006-07-03 00:25:40 -07002510static inline unsigned long minus_1_or_zero(unsigned long n)
2511{
2512 return n > 0 ? n - 1 : 0;
2513}
2514
Nick Piggin77391d72005-06-25 14:57:30 -07002515/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002516 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2517 * tasks if there is an imbalance.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002518 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002519static int load_balance(int this_cpu, struct rq *this_rq,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002520 struct sched_domain *sd, enum cpu_idle_type idle,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002521 int *balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002522{
Ingo Molnar48f24c42006-07-03 00:25:40 -07002523 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002524 struct sched_group *group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002525 unsigned long imbalance;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002526 struct rq *busiest;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002527 cpumask_t cpus = CPU_MASK_ALL;
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002528 unsigned long flags;
Nick Piggin5969fe02005-09-10 00:26:19 -07002529
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002530 /*
2531 * When power savings policy is enabled for the parent domain, an idle
2532 * sibling can pick up load irrespective of busy siblings. In this case,
Ingo Molnardd41f592007-07-09 18:51:59 +02002533 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002534 * portraying it as CPU_NOT_IDLE.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002535 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002536 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002537 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002538 sd_idle = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002539
Linus Torvalds1da177e2005-04-16 15:20:36 -07002540 schedstat_inc(sd, lb_cnt[idle]);
2541
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002542redo:
2543 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002544 &cpus, balance);
2545
Chen, Kenneth W06066712006-12-10 02:20:35 -08002546 if (*balance == 0)
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002547 goto out_balanced;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002548
Linus Torvalds1da177e2005-04-16 15:20:36 -07002549 if (!group) {
2550 schedstat_inc(sd, lb_nobusyg[idle]);
2551 goto out_balanced;
2552 }
2553
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002554 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002555 if (!busiest) {
2556 schedstat_inc(sd, lb_nobusyq[idle]);
2557 goto out_balanced;
2558 }
2559
Nick Piggindb935db2005-06-25 14:57:11 -07002560 BUG_ON(busiest == this_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002561
2562 schedstat_add(sd, lb_imbalance[idle], imbalance);
2563
2564 nr_moved = 0;
2565 if (busiest->nr_running > 1) {
2566 /*
2567 * Attempt to move tasks. If find_busiest_group has found
2568 * an imbalance but busiest->nr_running <= 1, the group is
2569 * still unbalanced. nr_moved simply stays zero, so it is
2570 * correctly treated as an imbalance.
2571 */
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002572 local_irq_save(flags);
Nick Piggine17224b2005-09-10 00:26:18 -07002573 double_rq_lock(this_rq, busiest);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002574 nr_moved = move_tasks(this_rq, this_cpu, busiest,
Ingo Molnar48f24c42006-07-03 00:25:40 -07002575 minus_1_or_zero(busiest->nr_running),
2576 imbalance, sd, idle, &all_pinned);
Nick Piggine17224b2005-09-10 00:26:18 -07002577 double_rq_unlock(this_rq, busiest);
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002578 local_irq_restore(flags);
Nick Piggin81026792005-06-25 14:57:07 -07002579
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002580 /*
2581 * some other cpu did the load balance for us.
2582 */
2583 if (nr_moved && this_cpu != smp_processor_id())
2584 resched_cpu(this_cpu);
2585
Nick Piggin81026792005-06-25 14:57:07 -07002586 /* All tasks on this runqueue were pinned by CPU affinity */
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002587 if (unlikely(all_pinned)) {
2588 cpu_clear(cpu_of(busiest), cpus);
2589 if (!cpus_empty(cpus))
2590 goto redo;
Nick Piggin81026792005-06-25 14:57:07 -07002591 goto out_balanced;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002592 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002593 }
Nick Piggin81026792005-06-25 14:57:07 -07002594
Linus Torvalds1da177e2005-04-16 15:20:36 -07002595 if (!nr_moved) {
2596 schedstat_inc(sd, lb_failed[idle]);
2597 sd->nr_balance_failed++;
2598
2599 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002600
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002601 spin_lock_irqsave(&busiest->lock, flags);
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002602
2603 /* don't kick the migration_thread if the current
2604 * task on the busiest CPU can't be moved to this_cpu
2605 */
2606 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002607 spin_unlock_irqrestore(&busiest->lock, flags);
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002608 all_pinned = 1;
2609 goto out_one_pinned;
2610 }
2611
Linus Torvalds1da177e2005-04-16 15:20:36 -07002612 if (!busiest->active_balance) {
2613 busiest->active_balance = 1;
2614 busiest->push_cpu = this_cpu;
Nick Piggin81026792005-06-25 14:57:07 -07002615 active_balance = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002616 }
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002617 spin_unlock_irqrestore(&busiest->lock, flags);
Nick Piggin81026792005-06-25 14:57:07 -07002618 if (active_balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002619 wake_up_process(busiest->migration_thread);
2620
2621 /*
2622 * We've kicked active balancing, reset the failure
2623 * counter.
2624 */
Nick Piggin39507452005-06-25 14:57:09 -07002625 sd->nr_balance_failed = sd->cache_nice_tries+1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002626 }
Nick Piggin81026792005-06-25 14:57:07 -07002627 } else
Linus Torvalds1da177e2005-04-16 15:20:36 -07002628 sd->nr_balance_failed = 0;
2629
Nick Piggin81026792005-06-25 14:57:07 -07002630 if (likely(!active_balance)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002631 /* We were unbalanced, so reset the balancing interval */
2632 sd->balance_interval = sd->min_interval;
Nick Piggin81026792005-06-25 14:57:07 -07002633 } else {
2634 /*
2635 * If we've begun active balancing, start to back off. This
2636 * case may not be covered by the all_pinned logic if there
2637 * is only 1 task on the busy runqueue (because we don't call
2638 * move_tasks).
2639 */
2640 if (sd->balance_interval < sd->max_interval)
2641 sd->balance_interval *= 2;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002642 }
2643
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002644 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002645 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002646 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002647 return nr_moved;
2648
2649out_balanced:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002650 schedstat_inc(sd, lb_balanced[idle]);
2651
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002652 sd->nr_balance_failed = 0;
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002653
2654out_one_pinned:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002655 /* tune up the balancing interval */
Nick Piggin77391d72005-06-25 14:57:30 -07002656 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2657 (sd->balance_interval < sd->max_interval))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002658 sd->balance_interval *= 2;
2659
Ingo Molnar48f24c42006-07-03 00:25:40 -07002660 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002661 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002662 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002663 return 0;
2664}
2665
2666/*
2667 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2668 * tasks if there is an imbalance.
2669 *
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002670 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
Linus Torvalds1da177e2005-04-16 15:20:36 -07002671 * this_rq is locked.
2672 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07002673static int
Ingo Molnar70b97a72006-07-03 00:25:42 -07002674load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002675{
2676 struct sched_group *group;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002677 struct rq *busiest = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002678 unsigned long imbalance;
2679 int nr_moved = 0;
Nick Piggin5969fe02005-09-10 00:26:19 -07002680 int sd_idle = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002681 cpumask_t cpus = CPU_MASK_ALL;
Nick Piggin5969fe02005-09-10 00:26:19 -07002682
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002683 /*
2684 * When power savings policy is enabled for the parent domain, an idle
2685 * sibling can pick up load irrespective of busy siblings. In this case,
2686 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002687 * portraying it as CPU_NOT_IDLE.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002688 */
2689 if (sd->flags & SD_SHARE_CPUPOWER &&
2690 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002691 sd_idle = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002692
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002693 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002694redo:
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002695 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002696 &sd_idle, &cpus, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002697 if (!group) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002698 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002699 goto out_balanced;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002700 }
2701
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002702 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002703 &cpus);
Nick Piggindb935db2005-06-25 14:57:11 -07002704 if (!busiest) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002705 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002706 goto out_balanced;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002707 }
2708
Nick Piggindb935db2005-06-25 14:57:11 -07002709 BUG_ON(busiest == this_rq);
2710
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002711 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002712
2713 nr_moved = 0;
2714 if (busiest->nr_running > 1) {
2715 /* Attempt to move tasks */
2716 double_lock_balance(this_rq, busiest);
2717 nr_moved = move_tasks(this_rq, this_cpu, busiest,
Peter Williams2dd73a42006-06-27 02:54:34 -07002718 minus_1_or_zero(busiest->nr_running),
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002719 imbalance, sd, CPU_NEWLY_IDLE, NULL);
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002720 spin_unlock(&busiest->lock);
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002721
2722 if (!nr_moved) {
2723 cpu_clear(cpu_of(busiest), cpus);
2724 if (!cpus_empty(cpus))
2725 goto redo;
2726 }
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002727 }
2728
Nick Piggin5969fe02005-09-10 00:26:19 -07002729 if (!nr_moved) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002730 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002731 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2732 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002733 return -1;
2734 } else
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002735 sd->nr_balance_failed = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002736
Linus Torvalds1da177e2005-04-16 15:20:36 -07002737 return nr_moved;
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002738
2739out_balanced:
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002740 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
Ingo Molnar48f24c42006-07-03 00:25:40 -07002741 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002742 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002743 return -1;
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002744 sd->nr_balance_failed = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07002745
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002746 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002747}
2748
2749/*
2750 * idle_balance is called by schedule() if this_cpu is about to become
2751 * idle. Attempts to pull tasks from other CPUs.
2752 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002753static void idle_balance(int this_cpu, struct rq *this_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002754{
2755 struct sched_domain *sd;
Ingo Molnardd41f592007-07-09 18:51:59 +02002756 int pulled_task = -1;
2757 unsigned long next_balance = jiffies + HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002758
2759 for_each_domain(this_cpu, sd) {
Christoph Lameter92c4ca52007-06-23 17:16:33 -07002760 unsigned long interval;
2761
2762 if (!(sd->flags & SD_LOAD_BALANCE))
2763 continue;
2764
2765 if (sd->flags & SD_BALANCE_NEWIDLE)
Ingo Molnar48f24c42006-07-03 00:25:40 -07002766 /* If we've pulled tasks over, stop searching: */
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002767 pulled_task = load_balance_newidle(this_cpu,
Christoph Lameter92c4ca52007-06-23 17:16:33 -07002768 this_rq, sd);
2769
2770 interval = msecs_to_jiffies(sd->balance_interval);
2771 if (time_after(next_balance, sd->last_balance + interval))
2772 next_balance = sd->last_balance + interval;
2773 if (pulled_task)
2774 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002775 }
Ingo Molnardd41f592007-07-09 18:51:59 +02002776 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002777 /*
2778 * We are going idle. next_balance may be set based on
2779 * a busy processor. So reset next_balance.
2780 */
2781 this_rq->next_balance = next_balance;
Ingo Molnardd41f592007-07-09 18:51:59 +02002782 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002783}
2784
2785/*
2786 * active_load_balance is run by migration threads. It pushes running tasks
2787 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2788 * running on each physical CPU where possible, and avoids physical /
2789 * logical imbalances.
2790 *
2791 * Called with busiest_rq locked.
2792 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002793static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002794{
Nick Piggin39507452005-06-25 14:57:09 -07002795 int target_cpu = busiest_rq->push_cpu;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002796 struct sched_domain *sd;
2797 struct rq *target_rq;
Nick Piggin39507452005-06-25 14:57:09 -07002798
Ingo Molnar48f24c42006-07-03 00:25:40 -07002799 /* Is there any task to move? */
Nick Piggin39507452005-06-25 14:57:09 -07002800 if (busiest_rq->nr_running <= 1)
Nick Piggin39507452005-06-25 14:57:09 -07002801 return;
2802
2803 target_rq = cpu_rq(target_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002804
2805 /*
Nick Piggin39507452005-06-25 14:57:09 -07002806 * This condition is "impossible"; if it occurs
2807 * we need to fix it. Originally reported by
2808 * Bjorn Helgaas on a 128-cpu setup.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002809 */
Nick Piggin39507452005-06-25 14:57:09 -07002810 BUG_ON(busiest_rq == target_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002811
Nick Piggin39507452005-06-25 14:57:09 -07002812 /* move a task from busiest_rq to target_rq */
2813 double_lock_balance(busiest_rq, target_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002814
Nick Piggin39507452005-06-25 14:57:09 -07002815 /* Search for an sd spanning us and the target CPU. */
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002816 for_each_domain(target_cpu, sd) {
Nick Piggin39507452005-06-25 14:57:09 -07002817 if ((sd->flags & SD_LOAD_BALANCE) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07002818 cpu_isset(busiest_cpu, sd->span))
Nick Piggin39507452005-06-25 14:57:09 -07002819 break;
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002820 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002821
Ingo Molnar48f24c42006-07-03 00:25:40 -07002822 if (likely(sd)) {
2823 schedstat_inc(sd, alb_cnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002824
Ingo Molnar48f24c42006-07-03 00:25:40 -07002825 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002826 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
Ingo Molnar48f24c42006-07-03 00:25:40 -07002827 NULL))
2828 schedstat_inc(sd, alb_pushed);
2829 else
2830 schedstat_inc(sd, alb_failed);
2831 }
Nick Piggin39507452005-06-25 14:57:09 -07002832 spin_unlock(&target_rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002833}
2834
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002835#ifdef CONFIG_NO_HZ
2836static struct {
2837 atomic_t load_balancer;
2838 cpumask_t cpu_mask;
2839} nohz ____cacheline_aligned = {
2840 .load_balancer = ATOMIC_INIT(-1),
2841 .cpu_mask = CPU_MASK_NONE,
2842};
2843
Christoph Lameter7835b982006-12-10 02:20:22 -08002844/*
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002845 * This routine will try to nominate the ilb (idle load balancing)
2846 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2847 * load balancing on behalf of all those cpus. If all the cpus in the system
2848 * go into this tickless mode, then there will be no ilb owner (as there is
2849 * no need for one) and all the cpus will sleep till the next wakeup event
2850 * arrives...
Christoph Lameter7835b982006-12-10 02:20:22 -08002851 *
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002852 * For the ilb owner, the tick is not stopped, and this tick will be used
2853 * for idle load balancing. The ilb owner will still be part of
2854 * nohz.cpu_mask.
2855 *
2856 * While stopping the tick, this cpu will become the ilb owner if there
2857 * is no other owner, and it stays the owner until this cpu becomes busy
2858 * or until all cpus in the system stop their ticks, at which point
2859 * there is no need for an ilb owner.
2860 *
2861 * When the ilb owner becomes busy, it nominates another owner, during the
2862 * next busy scheduler_tick()
2863 */
2864int select_nohz_load_balancer(int stop_tick)
2865{
2866 int cpu = smp_processor_id();
2867
2868 if (stop_tick) {
2869 cpu_set(cpu, nohz.cpu_mask);
2870 cpu_rq(cpu)->in_nohz_recently = 1;
2871
2872 /*
2873 * If we are going offline and still the leader, give up!
2874 */
2875 if (cpu_is_offline(cpu) &&
2876 atomic_read(&nohz.load_balancer) == cpu) {
2877 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2878 BUG();
2879 return 0;
2880 }
2881
2882 /* time for ilb owner also to sleep */
2883 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2884 if (atomic_read(&nohz.load_balancer) == cpu)
2885 atomic_set(&nohz.load_balancer, -1);
2886 return 0;
2887 }
2888
2889 if (atomic_read(&nohz.load_balancer) == -1) {
2890 /* make me the ilb owner */
2891 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2892 return 1;
2893 } else if (atomic_read(&nohz.load_balancer) == cpu)
2894 return 1;
2895 } else {
2896 if (!cpu_isset(cpu, nohz.cpu_mask))
2897 return 0;
2898
2899 cpu_clear(cpu, nohz.cpu_mask);
2900
2901 if (atomic_read(&nohz.load_balancer) == cpu)
2902 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2903 BUG();
2904 }
2905 return 0;
2906}
2907#endif
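
/*
 * Illustrative sketch, not part of the original kernel/sched.c and kept out
 * of the build with "#if 0": the ilb-owner election above boils down to a
 * "claim ownership only if currently unowned" compare-and-exchange. The
 * userspace C11 program below shows the same idiom; try_become_owner() and
 * the constants are invented for the example.
 */
#if 0
#include <stdatomic.h>
#include <stdio.h>

static atomic_int owner = ATOMIC_VAR_INIT(-1);	/* -1 means "no owner", like nohz.load_balancer */

static int try_become_owner(int cpu)
{
	int expected = -1;

	/* Succeeds only if nobody owns the role yet; exactly one caller wins. */
	if (atomic_compare_exchange_strong(&owner, &expected, cpu))
		return 1;

	/* Already the owner from an earlier call? Keep the role. */
	return atomic_load(&owner) == cpu;
}

int main(void)
{
	printf("cpu 2 claims: %d\n", try_become_owner(2));	/* 1: claimed */
	printf("cpu 5 claims: %d\n", try_become_owner(5));	/* 0: cpu 2 already owns it */
	return 0;
}
#endif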
2908
2909static DEFINE_SPINLOCK(balancing);
2910
2911/*
Christoph Lameter7835b982006-12-10 02:20:22 -08002912 * It checks each scheduling domain to see if it is due to be balanced,
2913 * and initiates a balancing operation if so.
2914 *
2915 * Balancing parameters are set up in arch_init_sched_domains.
2916 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002917static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
Christoph Lameter7835b982006-12-10 02:20:22 -08002918{
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002919 int balance = 1;
2920 struct rq *rq = cpu_rq(cpu);
Christoph Lameter7835b982006-12-10 02:20:22 -08002921 unsigned long interval;
2922 struct sched_domain *sd;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002923 /* Earliest time when we have to do rebalance again */
Christoph Lameterc9819f42006-12-10 02:20:25 -08002924 unsigned long next_balance = jiffies + 60*HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002925
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002926 for_each_domain(cpu, sd) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002927 if (!(sd->flags & SD_LOAD_BALANCE))
2928 continue;
2929
2930 interval = sd->balance_interval;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002931 if (idle != CPU_IDLE)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002932 interval *= sd->busy_factor;
2933
2934 /* scale ms to jiffies */
2935 interval = msecs_to_jiffies(interval);
2936 if (unlikely(!interval))
2937 interval = 1;
Ingo Molnardd41f592007-07-09 18:51:59 +02002938 if (interval > HZ*NR_CPUS/10)
2939 interval = HZ*NR_CPUS/10;
2940
Linus Torvalds1da177e2005-04-16 15:20:36 -07002941
Christoph Lameter08c183f2006-12-10 02:20:29 -08002942 if (sd->flags & SD_SERIALIZE) {
2943 if (!spin_trylock(&balancing))
2944 goto out;
2945 }
2946
Christoph Lameterc9819f42006-12-10 02:20:25 -08002947 if (time_after_eq(jiffies, sd->last_balance + interval)) {
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002948 if (load_balance(cpu, rq, sd, idle, &balance)) {
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002949 /*
2950 * We've pulled tasks over so either we're no
Nick Piggin5969fe02005-09-10 00:26:19 -07002951 * longer idle, or one of our SMT siblings is
2952 * not idle.
2953 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002954 idle = CPU_NOT_IDLE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002955 }
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002956 sd->last_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002957 }
Christoph Lameter08c183f2006-12-10 02:20:29 -08002958 if (sd->flags & SD_SERIALIZE)
2959 spin_unlock(&balancing);
2960out:
Christoph Lameterc9819f42006-12-10 02:20:25 -08002961 if (time_after(next_balance, sd->last_balance + interval))
2962 next_balance = sd->last_balance + interval;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002963
2964 /*
2965 * Stop the load balance at this level. There is another
2966 * CPU in our sched group which is doing load balancing more
2967 * actively.
2968 */
2969 if (!balance)
2970 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002971 }
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002972 rq->next_balance = next_balance;
2973}
2974
2975/*
2976 * run_rebalance_domains is triggered when needed from the scheduler tick.
2977 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
2978 * rebalancing for all the cpus whose scheduler ticks are stopped.
2979 */
2980static void run_rebalance_domains(struct softirq_action *h)
2981{
Ingo Molnardd41f592007-07-09 18:51:59 +02002982 int this_cpu = smp_processor_id();
2983 struct rq *this_rq = cpu_rq(this_cpu);
2984 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2985 CPU_IDLE : CPU_NOT_IDLE;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002986
Ingo Molnardd41f592007-07-09 18:51:59 +02002987 rebalance_domains(this_cpu, idle);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002988
2989#ifdef CONFIG_NO_HZ
2990 /*
2991 * If this cpu is the owner for idle load balancing, then do the
2992 * balancing on behalf of the other idle cpus whose ticks are
2993 * stopped.
2994 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002995 if (this_rq->idle_at_tick &&
2996 atomic_read(&nohz.load_balancer) == this_cpu) {
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002997 cpumask_t cpus = nohz.cpu_mask;
2998 struct rq *rq;
2999 int balance_cpu;
3000
Ingo Molnardd41f592007-07-09 18:51:59 +02003001 cpu_clear(this_cpu, cpus);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003002 for_each_cpu_mask(balance_cpu, cpus) {
3003 /*
3004 * If this cpu gets work to do, stop the load balancing
3005 * work being done for other cpus. Next load
3006 * balancing owner will pick it up.
3007 */
3008 if (need_resched())
3009 break;
3010
Ingo Molnardd41f592007-07-09 18:51:59 +02003011 rebalance_domains(balance_cpu, CPU_IDLE);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003012
3013 rq = cpu_rq(balance_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003014 if (time_after(this_rq->next_balance, rq->next_balance))
3015 this_rq->next_balance = rq->next_balance;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003016 }
3017 }
3018#endif
3019}
3020
3021/*
3022 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3023 *
3024 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3025 * idle load balancing owner or decide to stop the periodic load balancing,
3026 * if the whole system is idle.
3027 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003028static inline void trigger_load_balance(struct rq *rq, int cpu)
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003029{
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003030#ifdef CONFIG_NO_HZ
3031 /*
3032 * If we were in the nohz mode recently and busy at the current
3033 * scheduler tick, then check if we need to nominate a new idle
3034 * load balancer.
3035 */
3036 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3037 rq->in_nohz_recently = 0;
3038
3039 if (atomic_read(&nohz.load_balancer) == cpu) {
3040 cpu_clear(cpu, nohz.cpu_mask);
3041 atomic_set(&nohz.load_balancer, -1);
3042 }
3043
3044 if (atomic_read(&nohz.load_balancer) == -1) {
3045 /*
3046 * simple selection for now: Nominate the
3047 * first cpu in the nohz list to be the next
3048 * ilb owner.
3049 *
3050 * TBD: Traverse the sched domains and nominate
3051 * the nearest cpu in the nohz.cpu_mask.
3052 */
3053 int ilb = first_cpu(nohz.cpu_mask);
3054
3055 if (ilb != NR_CPUS)
3056 resched_cpu(ilb);
3057 }
3058 }
3059
3060 /*
3061 * If this cpu is idle and doing idle load balancing for all the
3062 * cpus with ticks stopped, is it time for that to stop?
3063 */
3064 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3065 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3066 resched_cpu(cpu);
3067 return;
3068 }
3069
3070 /*
3071 * If this cpu is idle and the idle load balancing is done by
3072 * someone else, then no need raise the SCHED_SOFTIRQ
3073 */
3074 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3075 cpu_isset(cpu, nohz.cpu_mask))
3076 return;
3077#endif
3078 if (time_after_eq(jiffies, rq->next_balance))
3079 raise_softirq(SCHED_SOFTIRQ);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003080}
Ingo Molnardd41f592007-07-09 18:51:59 +02003081
3082#else /* CONFIG_SMP */
3083
Linus Torvalds1da177e2005-04-16 15:20:36 -07003084/*
3085 * on UP we do not need to balance between CPUs:
3086 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07003087static inline void idle_balance(int cpu, struct rq *rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003088{
3089}
Ingo Molnardd41f592007-07-09 18:51:59 +02003090
3091/* Avoid "defined but not used" warning on UP */
3092static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3093 unsigned long max_nr_move, unsigned long max_load_move,
3094 struct sched_domain *sd, enum cpu_idle_type idle,
3095 int *all_pinned, unsigned long *load_moved,
3096 int this_best_prio, int best_prio, int best_prio_seen,
3097 struct rq_iterator *iterator)
3098{
3099 *load_moved = 0;
3100
3101 return 0;
3102}
3103
Linus Torvalds1da177e2005-04-16 15:20:36 -07003104#endif
3105
Linus Torvalds1da177e2005-04-16 15:20:36 -07003106DEFINE_PER_CPU(struct kernel_stat, kstat);
3107
3108EXPORT_PER_CPU_SYMBOL(kstat);
3109
3110/*
Ingo Molnar41b86e92007-07-09 18:51:58 +02003111 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3112 * that have not yet been banked in case the task is currently running.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003113 */
Ingo Molnar41b86e92007-07-09 18:51:58 +02003114unsigned long long task_sched_runtime(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003115{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003116 unsigned long flags;
Ingo Molnar41b86e92007-07-09 18:51:58 +02003117 u64 ns, delta_exec;
3118 struct rq *rq;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003119
Ingo Molnar41b86e92007-07-09 18:51:58 +02003120 rq = task_rq_lock(p, &flags);
3121 ns = p->se.sum_exec_runtime;
3122 if (rq->curr == p) {
3123 delta_exec = rq_clock(rq) - p->se.exec_start;
3124 if ((s64)delta_exec > 0)
3125 ns += delta_exec;
3126 }
3127 task_rq_unlock(rq, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07003128
Linus Torvalds1da177e2005-04-16 15:20:36 -07003129 return ns;
3130}
3131
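/*
 * Illustrative userspace sketch, not part of the original kernel/sched.c and
 * kept out of the build with "#if 0": reading a thread's accumulated CPU time
 * with clock_gettime(). That this clock is served by task_sched_runtime()-style
 * accounting is stated here as background, not verified by the example itself;
 * the syscall usage is standard POSIX.
 */
#if 0
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;
	volatile unsigned long i, sink = 0;

	for (i = 0; i < 50000000UL; i++)	/* burn some CPU time */
		sink += i;

	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
		perror("clock_gettime");
		return 1;
	}
	printf("thread cpu time: %ld.%09ld s (sink=%lu)\n",
	       (long)ts.tv_sec, ts.tv_nsec, sink);
	return 0;
}
#endif
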
3132/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07003133 * Account user cpu time to a process.
3134 * @p: the process that the cpu time gets accounted to
3135 * @hardirq_offset: the offset to subtract from hardirq_count()
3136 * @cputime: the cpu time spent in user space since the last update
3137 */
3138void account_user_time(struct task_struct *p, cputime_t cputime)
3139{
3140 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3141 cputime64_t tmp;
3142
3143 p->utime = cputime_add(p->utime, cputime);
3144
3145 /* Add user time to cpustat. */
3146 tmp = cputime_to_cputime64(cputime);
3147 if (TASK_NICE(p) > 0)
3148 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3149 else
3150 cpustat->user = cputime64_add(cpustat->user, tmp);
3151}
3152
3153/*
3154 * Account system cpu time to a process.
3155 * @p: the process that the cpu time gets accounted to
3156 * @hardirq_offset: the offset to subtract from hardirq_count()
3157 * @cputime: the cpu time spent in kernel space since the last update
3158 */
3159void account_system_time(struct task_struct *p, int hardirq_offset,
3160 cputime_t cputime)
3161{
3162 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003163 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003164 cputime64_t tmp;
3165
3166 p->stime = cputime_add(p->stime, cputime);
3167
3168 /* Add system time to cpustat. */
3169 tmp = cputime_to_cputime64(cputime);
3170 if (hardirq_count() - hardirq_offset)
3171 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3172 else if (softirq_count())
3173 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3174 else if (p != rq->idle)
3175 cpustat->system = cputime64_add(cpustat->system, tmp);
3176 else if (atomic_read(&rq->nr_iowait) > 0)
3177 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3178 else
3179 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3180 /* Account for system time used */
3181 acct_update_integrals(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003182}
3183
3184/*
3185 * Account for involuntary wait time.
3186 * @p: the process from which the cpu time has been stolen
3187 * @steal: the cpu time spent in involuntary wait
3188 */
3189void account_steal_time(struct task_struct *p, cputime_t steal)
3190{
3191 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3192 cputime64_t tmp = cputime_to_cputime64(steal);
Ingo Molnar70b97a72006-07-03 00:25:42 -07003193 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003194
3195 if (p == rq->idle) {
3196 p->stime = cputime_add(p->stime, steal);
3197 if (atomic_read(&rq->nr_iowait) > 0)
3198 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3199 else
3200 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3201 } else
3202 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3203}
3204
Christoph Lameter7835b982006-12-10 02:20:22 -08003205/*
3206 * This function gets called by the timer code, with HZ frequency.
3207 * We call it with interrupts disabled.
3208 *
3209 * It also gets called by the fork code, when changing the parent's
3210 * timeslices.
3211 */
3212void scheduler_tick(void)
3213{
Christoph Lameter7835b982006-12-10 02:20:22 -08003214 int cpu = smp_processor_id();
3215 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003216 struct task_struct *curr = rq->curr;
Christoph Lameter7835b982006-12-10 02:20:22 -08003217
Ingo Molnardd41f592007-07-09 18:51:59 +02003218 spin_lock(&rq->lock);
3219 if (curr != rq->idle) /* FIXME: needed? */
3220 curr->sched_class->task_tick(rq, curr);
3221 update_cpu_load(rq);
3222 spin_unlock(&rq->lock);
3223
Christoph Lametere418e1c2006-12-10 02:20:23 -08003224#ifdef CONFIG_SMP
Ingo Molnardd41f592007-07-09 18:51:59 +02003225 rq->idle_at_tick = idle_cpu(cpu);
3226 trigger_load_balance(rq, cpu);
Christoph Lametere418e1c2006-12-10 02:20:23 -08003227#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003228}
3229
Linus Torvalds1da177e2005-04-16 15:20:36 -07003230#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3231
3232void fastcall add_preempt_count(int val)
3233{
3234 /*
3235 * Underflow?
3236 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003237 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3238 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003239 preempt_count() += val;
3240 /*
3241 * Spinlock count overflowing soon?
3242 */
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08003243 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3244 PREEMPT_MASK - 10);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003245}
3246EXPORT_SYMBOL(add_preempt_count);
3247
3248void fastcall sub_preempt_count(int val)
3249{
3250 /*
3251 * Underflow?
3252 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003253 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3254 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003255 /*
3256 * Is the spinlock portion underflowing?
3257 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003258 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3259 !(preempt_count() & PREEMPT_MASK)))
3260 return;
3261
Linus Torvalds1da177e2005-04-16 15:20:36 -07003262 preempt_count() -= val;
3263}
3264EXPORT_SYMBOL(sub_preempt_count);
3265
3266#endif
3267
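/*
 * Illustrative sketch, not part of the original kernel/sched.c and kept out
 * of the build with "#if 0": add_preempt_count()/sub_preempt_count() above
 * sit underneath preempt_disable()/preempt_enable(). A typical (hypothetical)
 * use is keeping the task pinned to its CPU while it touches per-CPU data:
 */
#if 0
#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(unsigned long, demo_counter);

static void demo_bump_this_cpu(void)
{
	preempt_disable();			/* bumps preempt_count() */
	__get_cpu_var(demo_counter)++;		/* safe: we cannot migrate here */
	preempt_enable();			/* drops preempt_count(), may reschedule */
}
#endif
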
3268/*
Ingo Molnardd41f592007-07-09 18:51:59 +02003269 * Print scheduling while atomic bug:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003270 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003271static noinline void __schedule_bug(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003272{
Ingo Molnardd41f592007-07-09 18:51:59 +02003273 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3274 prev->comm, preempt_count(), prev->pid);
3275 debug_show_held_locks(prev);
3276 if (irqs_disabled())
3277 print_irqtrace_events(prev);
3278 dump_stack();
3279}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003280
Ingo Molnardd41f592007-07-09 18:51:59 +02003281/*
3282 * Various schedule()-time debugging checks and statistics:
3283 */
3284static inline void schedule_debug(struct task_struct *prev)
3285{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003286 /*
3287 * Test if we are atomic. Since do_exit() needs to call into
3288 * schedule() atomically, we ignore that path for now.
3289 * Otherwise, whine if we are scheduling when we should not be.
3290 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003291 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3292 __schedule_bug(prev);
3293
Linus Torvalds1da177e2005-04-16 15:20:36 -07003294 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3295
Ingo Molnardd41f592007-07-09 18:51:59 +02003296 schedstat_inc(this_rq(), sched_cnt);
3297}
3298
3299/*
3300 * Pick up the highest-prio task:
3301 */
3302static inline struct task_struct *
3303pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3304{
3305 struct sched_class *class;
3306 struct task_struct *p;
3307
3308 /*
3309 * Optimization: we know that if all tasks are in
3310 * the fair class we can call that function directly:
3311 */
3312 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3313 p = fair_sched_class.pick_next_task(rq, now);
3314 if (likely(p))
3315 return p;
3316 }
3317
3318 class = sched_class_highest;
3319 for ( ; ; ) {
3320 p = class->pick_next_task(rq, now);
3321 if (p)
3322 return p;
3323 /*
3324 * Will never be NULL as the idle class always
3325 * returns a non-NULL p:
3326 */
3327 class = class->next;
3328 }
3329}
3330
3331/*
3332 * schedule() is the main scheduler function.
3333 */
3334asmlinkage void __sched schedule(void)
3335{
3336 struct task_struct *prev, *next;
3337 long *switch_count;
3338 struct rq *rq;
3339 u64 now;
3340 int cpu;
3341
Linus Torvalds1da177e2005-04-16 15:20:36 -07003342need_resched:
3343 preempt_disable();
Ingo Molnardd41f592007-07-09 18:51:59 +02003344 cpu = smp_processor_id();
3345 rq = cpu_rq(cpu);
3346 rcu_qsctr_inc(cpu);
3347 prev = rq->curr;
3348 switch_count = &prev->nivcsw;
3349
Linus Torvalds1da177e2005-04-16 15:20:36 -07003350 release_kernel_lock(prev);
3351need_resched_nonpreemptible:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003352
Ingo Molnardd41f592007-07-09 18:51:59 +02003353 schedule_debug(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003354
3355 spin_lock_irq(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003356 clear_tsk_need_resched(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003357
Ingo Molnardd41f592007-07-09 18:51:59 +02003358 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3359 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3360 unlikely(signal_pending(prev)))) {
3361 prev->state = TASK_RUNNING;
3362 } else {
3363 deactivate_task(rq, prev, 1);
3364 }
3365 switch_count = &prev->nvcsw;
3366 }
3367
3368 if (unlikely(!rq->nr_running))
3369 idle_balance(cpu, rq);
3370
3371 now = __rq_clock(rq);
3372 prev->sched_class->put_prev_task(rq, prev, now);
3373 next = pick_next_task(rq, prev, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003374
3375 sched_info_switch(prev, next);
Ingo Molnardd41f592007-07-09 18:51:59 +02003376
Linus Torvalds1da177e2005-04-16 15:20:36 -07003377 if (likely(prev != next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003378 rq->nr_switches++;
3379 rq->curr = next;
3380 ++*switch_count;
3381
Ingo Molnardd41f592007-07-09 18:51:59 +02003382 context_switch(rq, prev, next); /* unlocks the rq */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003383 } else
3384 spin_unlock_irq(&rq->lock);
3385
Ingo Molnardd41f592007-07-09 18:51:59 +02003386 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3387 cpu = smp_processor_id();
3388 rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003389 goto need_resched_nonpreemptible;
Ingo Molnardd41f592007-07-09 18:51:59 +02003390 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003391 preempt_enable_no_resched();
3392 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3393 goto need_resched;
3394}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003395EXPORT_SYMBOL(schedule);
3396
3397#ifdef CONFIG_PREEMPT
3398/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003399 * this is the entry point to schedule() from in-kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003400 * off of preempt_enable. Kernel preemptions off return from interrupt
3401 * occur there and call schedule directly.
3402 */
3403asmlinkage void __sched preempt_schedule(void)
3404{
3405 struct thread_info *ti = current_thread_info();
3406#ifdef CONFIG_PREEMPT_BKL
3407 struct task_struct *task = current;
3408 int saved_lock_depth;
3409#endif
3410 /*
3411 * If there is a non-zero preempt_count or interrupts are disabled,
3412 * we do not want to preempt the current task. Just return..
3413 */
Nick Pigginbeed33a2006-10-11 01:21:52 -07003414 if (likely(ti->preempt_count || irqs_disabled()))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003415 return;
3416
3417need_resched:
3418 add_preempt_count(PREEMPT_ACTIVE);
3419 /*
3420 * We keep the big kernel semaphore locked, but we
3421 * clear ->lock_depth so that schedule() doesn't
3422 * auto-release the semaphore:
3423 */
3424#ifdef CONFIG_PREEMPT_BKL
3425 saved_lock_depth = task->lock_depth;
3426 task->lock_depth = -1;
3427#endif
3428 schedule();
3429#ifdef CONFIG_PREEMPT_BKL
3430 task->lock_depth = saved_lock_depth;
3431#endif
3432 sub_preempt_count(PREEMPT_ACTIVE);
3433
3434 /* we could miss a preemption opportunity between schedule and now */
3435 barrier();
3436 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3437 goto need_resched;
3438}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003439EXPORT_SYMBOL(preempt_schedule);
3440
3441/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003442 * this is the entry point to schedule() from kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003443 * off of irq context.
3444 * Note that this is called and returns with irqs disabled. This
3445 * protects us against recursive calling from irq.
3446 */
3447asmlinkage void __sched preempt_schedule_irq(void)
3448{
3449 struct thread_info *ti = current_thread_info();
3450#ifdef CONFIG_PREEMPT_BKL
3451 struct task_struct *task = current;
3452 int saved_lock_depth;
3453#endif
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003454 /* Catch callers which need to be fixed */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003455 BUG_ON(ti->preempt_count || !irqs_disabled());
3456
3457need_resched:
3458 add_preempt_count(PREEMPT_ACTIVE);
3459 /*
3460 * We keep the big kernel semaphore locked, but we
3461 * clear ->lock_depth so that schedule() doesn't
3462 * auto-release the semaphore:
3463 */
3464#ifdef CONFIG_PREEMPT_BKL
3465 saved_lock_depth = task->lock_depth;
3466 task->lock_depth = -1;
3467#endif
3468 local_irq_enable();
3469 schedule();
3470 local_irq_disable();
3471#ifdef CONFIG_PREEMPT_BKL
3472 task->lock_depth = saved_lock_depth;
3473#endif
3474 sub_preempt_count(PREEMPT_ACTIVE);
3475
3476 /* we could miss a preemption opportunity between schedule and now */
3477 barrier();
3478 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3479 goto need_resched;
3480}
3481
3482#endif /* CONFIG_PREEMPT */
3483
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003484int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3485 void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003486{
Ingo Molnar48f24c42006-07-03 00:25:40 -07003487 return try_to_wake_up(curr->private, mode, sync);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003488}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003489EXPORT_SYMBOL(default_wake_function);
3490
3491/*
3492 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3493 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3494 * number) then we wake all the non-exclusive tasks and one exclusive task.
3495 *
3496 * There are circumstances in which we can try to wake a task which has already
3497 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3498 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3499 */
3500static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3501 int nr_exclusive, int sync, void *key)
3502{
3503 struct list_head *tmp, *next;
3504
3505 list_for_each_safe(tmp, next, &q->task_list) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07003506 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3507 unsigned flags = curr->flags;
3508
Linus Torvalds1da177e2005-04-16 15:20:36 -07003509 if (curr->func(curr, mode, sync, key) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07003510 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003511 break;
3512 }
3513}
3514
3515/**
3516 * __wake_up - wake up threads blocked on a waitqueue.
3517 * @q: the waitqueue
3518 * @mode: which threads
3519 * @nr_exclusive: how many wake-one or wake-many threads to wake up
Martin Waitz67be2dd2005-05-01 08:59:26 -07003520 * @key: is directly passed to the wakeup function
Linus Torvalds1da177e2005-04-16 15:20:36 -07003521 */
3522void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003523 int nr_exclusive, void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003524{
3525 unsigned long flags;
3526
3527 spin_lock_irqsave(&q->lock, flags);
3528 __wake_up_common(q, mode, nr_exclusive, 0, key);
3529 spin_unlock_irqrestore(&q->lock, flags);
3530}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003531EXPORT_SYMBOL(__wake_up);
3532
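/*
 * Illustrative sketch, not part of the original kernel/sched.c and kept out
 * of the build with "#if 0": how a driver typically pairs wake_up() with the
 * wait_event_*() helpers that end up in __wake_up()/default_wake_function().
 * All demo_* identifiers are invented for the example.
 */
#if 0
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_data_ready;

/* Consumer: sleeps until demo_data_ready becomes non-zero (or a signal). */
static int demo_wait_for_data(void)
{
	return wait_event_interruptible(demo_wq, demo_data_ready != 0);
}

/* Producer (e.g. an interrupt handler): publish the data, then wake sleepers. */
static void demo_publish_data(void)
{
	demo_data_ready = 1;
	wake_up(&demo_wq);	/* goes through __wake_up() above */
}
#endif
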
3533/*
3534 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3535 */
3536void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3537{
3538 __wake_up_common(q, mode, 1, 0, NULL);
3539}
3540
3541/**
Martin Waitz67be2dd2005-05-01 08:59:26 -07003542 * __wake_up_sync - wake up threads blocked on a waitqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003543 * @q: the waitqueue
3544 * @mode: which threads
3545 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3546 *
3547 * The sync wakeup differs in that the waker knows that it will schedule
3548 * away soon, so while the target thread will be woken up, it will not
3549 * be migrated to another CPU - ie. the two threads are 'synchronized'
3550 * with each other. This can prevent needless bouncing between CPUs.
3551 *
3552 * On UP it can prevent extra preemption.
3553 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003554void fastcall
3555__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003556{
3557 unsigned long flags;
3558 int sync = 1;
3559
3560 if (unlikely(!q))
3561 return;
3562
3563 if (unlikely(!nr_exclusive))
3564 sync = 0;
3565
3566 spin_lock_irqsave(&q->lock, flags);
3567 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3568 spin_unlock_irqrestore(&q->lock, flags);
3569}
3570EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3571
3572void fastcall complete(struct completion *x)
3573{
3574 unsigned long flags;
3575
3576 spin_lock_irqsave(&x->wait.lock, flags);
3577 x->done++;
3578 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3579 1, 0, NULL);
3580 spin_unlock_irqrestore(&x->wait.lock, flags);
3581}
3582EXPORT_SYMBOL(complete);
3583
3584void fastcall complete_all(struct completion *x)
3585{
3586 unsigned long flags;
3587
3588 spin_lock_irqsave(&x->wait.lock, flags);
3589 x->done += UINT_MAX/2;
3590 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3591 0, 0, NULL);
3592 spin_unlock_irqrestore(&x->wait.lock, flags);
3593}
3594EXPORT_SYMBOL(complete_all);
3595
3596void fastcall __sched wait_for_completion(struct completion *x)
3597{
3598 might_sleep();
Ingo Molnar48f24c42006-07-03 00:25:40 -07003599
Linus Torvalds1da177e2005-04-16 15:20:36 -07003600 spin_lock_irq(&x->wait.lock);
3601 if (!x->done) {
3602 DECLARE_WAITQUEUE(wait, current);
3603
3604 wait.flags |= WQ_FLAG_EXCLUSIVE;
3605 __add_wait_queue_tail(&x->wait, &wait);
3606 do {
3607 __set_current_state(TASK_UNINTERRUPTIBLE);
3608 spin_unlock_irq(&x->wait.lock);
3609 schedule();
3610 spin_lock_irq(&x->wait.lock);
3611 } while (!x->done);
3612 __remove_wait_queue(&x->wait, &wait);
3613 }
3614 x->done--;
3615 spin_unlock_irq(&x->wait.lock);
3616}
3617EXPORT_SYMBOL(wait_for_completion);
3618
3619unsigned long fastcall __sched
3620wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3621{
3622 might_sleep();
3623
3624 spin_lock_irq(&x->wait.lock);
3625 if (!x->done) {
3626 DECLARE_WAITQUEUE(wait, current);
3627
3628 wait.flags |= WQ_FLAG_EXCLUSIVE;
3629 __add_wait_queue_tail(&x->wait, &wait);
3630 do {
3631 __set_current_state(TASK_UNINTERRUPTIBLE);
3632 spin_unlock_irq(&x->wait.lock);
3633 timeout = schedule_timeout(timeout);
3634 spin_lock_irq(&x->wait.lock);
3635 if (!timeout) {
3636 __remove_wait_queue(&x->wait, &wait);
3637 goto out;
3638 }
3639 } while (!x->done);
3640 __remove_wait_queue(&x->wait, &wait);
3641 }
3642 x->done--;
3643out:
3644 spin_unlock_irq(&x->wait.lock);
3645 return timeout;
3646}
3647EXPORT_SYMBOL(wait_for_completion_timeout);
3648
3649int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3650{
3651 int ret = 0;
3652
3653 might_sleep();
3654
3655 spin_lock_irq(&x->wait.lock);
3656 if (!x->done) {
3657 DECLARE_WAITQUEUE(wait, current);
3658
3659 wait.flags |= WQ_FLAG_EXCLUSIVE;
3660 __add_wait_queue_tail(&x->wait, &wait);
3661 do {
3662 if (signal_pending(current)) {
3663 ret = -ERESTARTSYS;
3664 __remove_wait_queue(&x->wait, &wait);
3665 goto out;
3666 }
3667 __set_current_state(TASK_INTERRUPTIBLE);
3668 spin_unlock_irq(&x->wait.lock);
3669 schedule();
3670 spin_lock_irq(&x->wait.lock);
3671 } while (!x->done);
3672 __remove_wait_queue(&x->wait, &wait);
3673 }
3674 x->done--;
3675out:
3676 spin_unlock_irq(&x->wait.lock);
3677
3678 return ret;
3679}
3680EXPORT_SYMBOL(wait_for_completion_interruptible);
3681
3682unsigned long fastcall __sched
3683wait_for_completion_interruptible_timeout(struct completion *x,
3684 unsigned long timeout)
3685{
3686 might_sleep();
3687
3688 spin_lock_irq(&x->wait.lock);
3689 if (!x->done) {
3690 DECLARE_WAITQUEUE(wait, current);
3691
3692 wait.flags |= WQ_FLAG_EXCLUSIVE;
3693 __add_wait_queue_tail(&x->wait, &wait);
3694 do {
3695 if (signal_pending(current)) {
3696 timeout = -ERESTARTSYS;
3697 __remove_wait_queue(&x->wait, &wait);
3698 goto out;
3699 }
3700 __set_current_state(TASK_INTERRUPTIBLE);
3701 spin_unlock_irq(&x->wait.lock);
3702 timeout = schedule_timeout(timeout);
3703 spin_lock_irq(&x->wait.lock);
3704 if (!timeout) {
3705 __remove_wait_queue(&x->wait, &wait);
3706 goto out;
3707 }
3708 } while (!x->done);
3709 __remove_wait_queue(&x->wait, &wait);
3710 }
3711 x->done--;
3712out:
3713 spin_unlock_irq(&x->wait.lock);
3714 return timeout;
3715}
3716EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3717
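/*
 * Illustrative sketch, not part of the original kernel/sched.c and kept out
 * of the build with "#if 0": the usual pattern for the completion API
 * exported above - one context blocks in wait_for_completion() while another
 * signals it with complete(). The demo_* names are invented for the example.
 */
#if 0
#include <linux/completion.h>
#include <linux/workqueue.h>

static DECLARE_COMPLETION(demo_done);

static void demo_work_fn(struct work_struct *work)
{
	/* ... perform the deferred work ... */
	complete(&demo_done);			/* wakes exactly one waiter */
}
static DECLARE_WORK(demo_work, demo_work_fn);

static void demo_run_and_wait(void)
{
	schedule_work(&demo_work);
	wait_for_completion(&demo_done);	/* sleeps uninterruptibly until complete() */
}
#endif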
3718
3719#define SLEEP_ON_VAR \
3720 unsigned long flags; \
3721 wait_queue_t wait; \
3722 init_waitqueue_entry(&wait, current);
3723
3724#define SLEEP_ON_HEAD \
3725 spin_lock_irqsave(&q->lock,flags); \
3726 __add_wait_queue(q, &wait); \
3727 spin_unlock(&q->lock);
3728
3729#define SLEEP_ON_TAIL \
3730 spin_lock_irq(&q->lock); \
3731 __remove_wait_queue(q, &wait); \
3732 spin_unlock_irqrestore(&q->lock, flags);
3733
3734void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3735{
3736 SLEEP_ON_VAR
3737
3738 current->state = TASK_INTERRUPTIBLE;
3739
3740 SLEEP_ON_HEAD
3741 schedule();
3742 SLEEP_ON_TAIL
3743}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003744EXPORT_SYMBOL(interruptible_sleep_on);
3745
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003746long fastcall __sched
3747interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003748{
3749 SLEEP_ON_VAR
3750
3751 current->state = TASK_INTERRUPTIBLE;
3752
3753 SLEEP_ON_HEAD
3754 timeout = schedule_timeout(timeout);
3755 SLEEP_ON_TAIL
3756
3757 return timeout;
3758}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003759EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3760
3761void fastcall __sched sleep_on(wait_queue_head_t *q)
3762{
3763 SLEEP_ON_VAR
3764
3765 current->state = TASK_UNINTERRUPTIBLE;
3766
3767 SLEEP_ON_HEAD
3768 schedule();
3769 SLEEP_ON_TAIL
3770}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003771EXPORT_SYMBOL(sleep_on);
3772
3773long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3774{
3775 SLEEP_ON_VAR
3776
3777 current->state = TASK_UNINTERRUPTIBLE;
3778
3779 SLEEP_ON_HEAD
3780 timeout = schedule_timeout(timeout);
3781 SLEEP_ON_TAIL
3782
3783 return timeout;
3784}
3785
3786EXPORT_SYMBOL(sleep_on_timeout);
3787
Ingo Molnarb29739f2006-06-27 02:54:51 -07003788#ifdef CONFIG_RT_MUTEXES
3789
3790/*
3791 * rt_mutex_setprio - set the current priority of a task
3792 * @p: task
3793 * @prio: prio value (kernel-internal form)
3794 *
3795 * This function changes the 'effective' priority of a task. It does
3796 * not touch ->normal_prio like __setscheduler().
3797 *
3798 * Used by the rt_mutex code to implement priority inheritance logic.
3799 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003800void rt_mutex_setprio(struct task_struct *p, int prio)
Ingo Molnarb29739f2006-06-27 02:54:51 -07003801{
3802 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02003803 int oldprio, on_rq;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003804 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02003805 u64 now;
Ingo Molnarb29739f2006-06-27 02:54:51 -07003806
3807 BUG_ON(prio < 0 || prio > MAX_PRIO);
3808
3809 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02003810 now = rq_clock(rq);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003811
Andrew Mortond5f9f942007-05-08 20:27:06 -07003812 oldprio = p->prio;
Ingo Molnardd41f592007-07-09 18:51:59 +02003813 on_rq = p->se.on_rq;
3814 if (on_rq)
3815 dequeue_task(rq, p, 0, now);
3816
3817 if (rt_prio(prio))
3818 p->sched_class = &rt_sched_class;
3819 else
3820 p->sched_class = &fair_sched_class;
3821
Ingo Molnarb29739f2006-06-27 02:54:51 -07003822 p->prio = prio;
3823
Ingo Molnardd41f592007-07-09 18:51:59 +02003824 if (on_rq) {
3825 enqueue_task(rq, p, 0, now);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003826 /*
3827 * Reschedule if we are currently running on this runqueue and
Andrew Mortond5f9f942007-05-08 20:27:06 -07003828 * our priority decreased, or if we are not currently running on
3829 * this runqueue and our priority is higher than the current's
Ingo Molnarb29739f2006-06-27 02:54:51 -07003830 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07003831 if (task_running(rq, p)) {
3832 if (p->prio > oldprio)
3833 resched_task(rq->curr);
Ingo Molnardd41f592007-07-09 18:51:59 +02003834 } else {
3835 check_preempt_curr(rq, p);
3836 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07003837 }
3838 task_rq_unlock(rq, &flags);
3839}
3840
3841#endif
3842
Ingo Molnar36c8b582006-07-03 00:25:41 -07003843void set_user_nice(struct task_struct *p, long nice)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003844{
Ingo Molnardd41f592007-07-09 18:51:59 +02003845 int old_prio, delta, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003846 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003847 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02003848 u64 now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003849
3850 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3851 return;
3852 /*
3853 * We have to be careful, if called from sys_setpriority(),
3854 * the task might be in the middle of scheduling on another CPU.
3855 */
3856 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02003857 now = rq_clock(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003858 /*
3859 * The RT priorities are set via sched_setscheduler(), but we still
3860 * allow the 'normal' nice value to be set - but as expected
3861 * it won't have any effect on scheduling until the task is
Ingo Molnardd41f592007-07-09 18:51:59 +02003862 * SCHED_FIFO/SCHED_RR:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003863 */
Ingo Molnare05606d2007-07-09 18:51:59 +02003864 if (task_has_rt_policy(p)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003865 p->static_prio = NICE_TO_PRIO(nice);
3866 goto out_unlock;
3867 }
Ingo Molnardd41f592007-07-09 18:51:59 +02003868 on_rq = p->se.on_rq;
3869 if (on_rq) {
3870 dequeue_task(rq, p, 0, now);
3871 dec_load(rq, p, now);
Peter Williams2dd73a42006-06-27 02:54:34 -07003872 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003873
Linus Torvalds1da177e2005-04-16 15:20:36 -07003874 p->static_prio = NICE_TO_PRIO(nice);
Peter Williams2dd73a42006-06-27 02:54:34 -07003875 set_load_weight(p);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003876 old_prio = p->prio;
3877 p->prio = effective_prio(p);
3878 delta = p->prio - old_prio;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003879
Ingo Molnardd41f592007-07-09 18:51:59 +02003880 if (on_rq) {
3881 enqueue_task(rq, p, 0, now);
3882 inc_load(rq, p, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003883 /*
Andrew Mortond5f9f942007-05-08 20:27:06 -07003884 * If the task increased its priority or is running and
3885 * lowered its priority, then reschedule its CPU:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003886 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07003887 if (delta < 0 || (delta > 0 && task_running(rq, p)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003888 resched_task(rq->curr);
3889 }
3890out_unlock:
3891 task_rq_unlock(rq, &flags);
3892}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003893EXPORT_SYMBOL(set_user_nice);
3894
Matt Mackalle43379f2005-05-01 08:59:00 -07003895/*
3896 * can_nice - check if a task can reduce its nice value
3897 * @p: task
3898 * @nice: nice value
3899 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003900int can_nice(const struct task_struct *p, const int nice)
Matt Mackalle43379f2005-05-01 08:59:00 -07003901{
Matt Mackall024f4742005-08-18 11:24:19 -07003902 /* convert nice value [19,-20] to rlimit style value [1,40] */
3903 int nice_rlim = 20 - nice;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003904
Matt Mackalle43379f2005-05-01 08:59:00 -07003905 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3906 capable(CAP_SYS_NICE));
3907}
3908
Linus Torvalds1da177e2005-04-16 15:20:36 -07003909#ifdef __ARCH_WANT_SYS_NICE
3910
3911/*
3912 * sys_nice - change the priority of the current process.
3913 * @increment: priority increment
3914 *
3915 * sys_setpriority is a more generic, but much slower function that
3916 * does similar things.
3917 */
3918asmlinkage long sys_nice(int increment)
3919{
Ingo Molnar48f24c42006-07-03 00:25:40 -07003920 long nice, retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003921
3922 /*
3923 * Setpriority might change our priority at the same moment.
3924 * We don't have to worry. Conceptually one call occurs first
3925 * and we have a single winner.
3926 */
Matt Mackalle43379f2005-05-01 08:59:00 -07003927 if (increment < -40)
3928 increment = -40;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003929 if (increment > 40)
3930 increment = 40;
3931
3932 nice = PRIO_TO_NICE(current->static_prio) + increment;
3933 if (nice < -20)
3934 nice = -20;
3935 if (nice > 19)
3936 nice = 19;
3937
Matt Mackalle43379f2005-05-01 08:59:00 -07003938 if (increment < 0 && !can_nice(current, nice))
3939 return -EPERM;
3940
Linus Torvalds1da177e2005-04-16 15:20:36 -07003941 retval = security_task_setnice(current, nice);
3942 if (retval)
3943 return retval;
3944
3945 set_user_nice(current, nice);
3946 return 0;
3947}
3948
3949#endif
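
/*
 * Illustrative userspace sketch, not part of the original kernel/sched.c and
 * kept out of the build with "#if 0": lowering the caller's priority through
 * the nice(2) wrapper, which the kernel services via sys_nice() (or
 * sys_setpriority()) and ultimately set_user_nice() above.
 */
#if 0
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int new_nice;

	errno = 0;
	new_nice = nice(5);		/* ask to become 5 levels "nicer" */
	if (new_nice == -1 && errno != 0) {
		perror("nice");
		return 1;
	}
	printf("new nice value: %d\n", new_nice);
	return 0;
}
#endif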
3950
3951/**
3952 * task_prio - return the priority value of a given task.
3953 * @p: the task in question.
3954 *
3955 * This is the priority value as seen by users in /proc.
3956 * RT tasks are offset by -200. Normal tasks are centered
3957 * around 0, value goes from -16 to +15.
3958 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003959int task_prio(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003960{
3961 return p->prio - MAX_RT_PRIO;
3962}
3963
3964/**
3965 * task_nice - return the nice value of a given task.
3966 * @p: the task in question.
3967 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003968int task_nice(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003969{
3970 return TASK_NICE(p);
3971}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003972EXPORT_SYMBOL_GPL(task_nice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003973
3974/**
3975 * idle_cpu - is a given cpu idle currently?
3976 * @cpu: the processor in question.
3977 */
3978int idle_cpu(int cpu)
3979{
3980 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3981}
3982
Linus Torvalds1da177e2005-04-16 15:20:36 -07003983/**
3984 * idle_task - return the idle task for a given cpu.
3985 * @cpu: the processor in question.
3986 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003987struct task_struct *idle_task(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003988{
3989 return cpu_rq(cpu)->idle;
3990}
3991
3992/**
3993 * find_process_by_pid - find a process with a matching PID value.
3994 * @pid: the pid in question.
3995 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003996static inline struct task_struct *find_process_by_pid(pid_t pid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003997{
3998 return pid ? find_task_by_pid(pid) : current;
3999}
4000
4001/* Actually do priority change: must hold rq lock. */
Ingo Molnardd41f592007-07-09 18:51:59 +02004002static void
4003__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004004{
Ingo Molnardd41f592007-07-09 18:51:59 +02004005 BUG_ON(p->se.on_rq);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004006
Linus Torvalds1da177e2005-04-16 15:20:36 -07004007 p->policy = policy;
Ingo Molnardd41f592007-07-09 18:51:59 +02004008 switch (p->policy) {
4009 case SCHED_NORMAL:
4010 case SCHED_BATCH:
4011 case SCHED_IDLE:
4012 p->sched_class = &fair_sched_class;
4013 break;
4014 case SCHED_FIFO:
4015 case SCHED_RR:
4016 p->sched_class = &rt_sched_class;
4017 break;
4018 }
4019
Linus Torvalds1da177e2005-04-16 15:20:36 -07004020 p->rt_priority = prio;
Ingo Molnarb29739f2006-06-27 02:54:51 -07004021 p->normal_prio = normal_prio(p);
4022 /* we are holding p->pi_lock already */
4023 p->prio = rt_mutex_getprio(p);
Peter Williams2dd73a42006-06-27 02:54:34 -07004024 set_load_weight(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004025}
4026
4027/**
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004028 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004029 * @p: the task in question.
4030 * @policy: new policy.
4031 * @param: structure containing the new RT priority.
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004032 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004033 * NOTE that the task may be already dead.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004034 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004035int sched_setscheduler(struct task_struct *p, int policy,
4036 struct sched_param *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004037{
Ingo Molnardd41f592007-07-09 18:51:59 +02004038 int retval, oldprio, oldpolicy = -1, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004039 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004040 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004041
Steven Rostedt66e53932006-06-27 02:54:44 -07004042 /* may grab non-irq protected spin_locks */
4043 BUG_ON(in_interrupt());
Linus Torvalds1da177e2005-04-16 15:20:36 -07004044recheck:
4045 /* double check policy once rq lock held */
4046 if (policy < 0)
4047 policy = oldpolicy = p->policy;
4048 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
Ingo Molnardd41f592007-07-09 18:51:59 +02004049 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4050 policy != SCHED_IDLE)
Ingo Molnarb0a94992006-01-14 13:20:41 -08004051 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004052 /*
4053 * Valid priorities for SCHED_FIFO and SCHED_RR are
Ingo Molnardd41f592007-07-09 18:51:59 +02004054 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4055 * SCHED_BATCH and SCHED_IDLE is 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004056 */
4057 if (param->sched_priority < 0 ||
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004058 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
Steven Rostedtd46523e2005-07-25 16:28:39 -04004059 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004060 return -EINVAL;
Ingo Molnare05606d2007-07-09 18:51:59 +02004061 if (rt_policy(policy) != (param->sched_priority != 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004062 return -EINVAL;
4063
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004064 /*
4065 * Allow unprivileged RT tasks to decrease priority:
4066 */
4067 if (!capable(CAP_SYS_NICE)) {
Ingo Molnare05606d2007-07-09 18:51:59 +02004068 if (rt_policy(policy)) {
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004069 unsigned long rlim_rtprio;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004070
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004071 if (!lock_task_sighand(p, &flags))
4072 return -ESRCH;
4073 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4074 unlock_task_sighand(p, &flags);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004075
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004076 /* can't set/change the rt policy */
4077 if (policy != p->policy && !rlim_rtprio)
4078 return -EPERM;
4079
4080 /* can't increase priority */
4081 if (param->sched_priority > p->rt_priority &&
4082 param->sched_priority > rlim_rtprio)
4083 return -EPERM;
4084 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004085 /*
4086 * Like positive nice levels, don't allow tasks to
4087 * move out of SCHED_IDLE either:
4088 */
4089 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4090 return -EPERM;
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004091
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004092 /* can't change other user's priorities */
4093 if ((current->euid != p->euid) &&
4094 (current->euid != p->uid))
4095 return -EPERM;
4096 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004097
4098 retval = security_task_setscheduler(p, policy, param);
4099 if (retval)
4100 return retval;
4101 /*
Ingo Molnarb29739f2006-06-27 02:54:51 -07004102 * make sure no PI-waiters arrive (or leave) while we are
4103 * changing the priority of the task:
4104 */
4105 spin_lock_irqsave(&p->pi_lock, flags);
4106 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07004107 * To be able to change p->policy safely, the appropriate
4108 * runqueue lock must be held.
4109 */
Ingo Molnarb29739f2006-06-27 02:54:51 -07004110 rq = __task_rq_lock(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004111 /* recheck policy now with rq lock held */
4112 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4113 policy = oldpolicy = -1;
Ingo Molnarb29739f2006-06-27 02:54:51 -07004114 __task_rq_unlock(rq);
4115 spin_unlock_irqrestore(&p->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004116 goto recheck;
4117 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004118 on_rq = p->se.on_rq;
4119 if (on_rq)
4120 deactivate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004121 oldprio = p->prio;
Ingo Molnardd41f592007-07-09 18:51:59 +02004122 __setscheduler(rq, p, policy, param->sched_priority);
4123 if (on_rq) {
4124 activate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004125 /*
4126 * Reschedule if we are currently running on this runqueue and
Andrew Mortond5f9f942007-05-08 20:27:06 -07004127 * our priority decreased, or if we are not currently running on
4128 * this runqueue and our priority is higher than the current's
Linus Torvalds1da177e2005-04-16 15:20:36 -07004129 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07004130 if (task_running(rq, p)) {
4131 if (p->prio > oldprio)
4132 resched_task(rq->curr);
Ingo Molnardd41f592007-07-09 18:51:59 +02004133 } else {
4134 check_preempt_curr(rq, p);
4135 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004136 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07004137 __task_rq_unlock(rq);
4138 spin_unlock_irqrestore(&p->pi_lock, flags);
4139
Thomas Gleixner95e02ca2006-06-27 02:55:02 -07004140 rt_mutex_adjust_pi(p);
4141
Linus Torvalds1da177e2005-04-16 15:20:36 -07004142 return 0;
4143}
4144EXPORT_SYMBOL_GPL(sched_setscheduler);
4145
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004146static int
4147do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004148{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004149 struct sched_param lparam;
4150 struct task_struct *p;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004151 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004152
4153 if (!param || pid < 0)
4154 return -EINVAL;
4155 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4156 return -EFAULT;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004157
4158 rcu_read_lock();
4159 retval = -ESRCH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004160 p = find_process_by_pid(pid);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004161 if (p != NULL)
4162 retval = sched_setscheduler(p, policy, &lparam);
4163 rcu_read_unlock();
Ingo Molnar36c8b582006-07-03 00:25:41 -07004164
Linus Torvalds1da177e2005-04-16 15:20:36 -07004165 return retval;
4166}
4167
4168/**
4169 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4170 * @pid: the pid in question.
4171 * @policy: new policy.
4172 * @param: structure containing the new RT priority.
4173 */
4174asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4175 struct sched_param __user *param)
4176{
Jason Baronc21761f2006-01-18 17:43:03 -08004177 /* negative values for policy are not valid */
4178 if (policy < 0)
4179 return -EINVAL;
4180
Linus Torvalds1da177e2005-04-16 15:20:36 -07004181 return do_sched_setscheduler(pid, policy, param);
4182}
4183
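/*
 * Example (user space, not compiled here): a minimal sketch using the glibc
 * sched_setscheduler(2) wrapper for the syscall above. Switching to an RT
 * policy typically needs CAP_SYS_NICE or a sufficient RLIMIT_RTPRIO,
 * mirroring the permission checks earlier in sched_setscheduler().
 */
#if 0
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        /* pid 0 means "the calling thread". */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                perror("sched_setscheduler");
                return 1;
        }
        return 0;
}
#endif
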
4184/**
4185 * sys_sched_setparam - set/change the RT priority of a thread
4186 * @pid: the pid in question.
4187 * @param: structure containing the new RT priority.
4188 */
4189asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4190{
4191 return do_sched_setscheduler(pid, -1, param);
4192}
4193
4194/**
4195 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4196 * @pid: the pid in question.
4197 */
4198asmlinkage long sys_sched_getscheduler(pid_t pid)
4199{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004200 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004201 int retval = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004202
4203 if (pid < 0)
4204 goto out_nounlock;
4205
4206 retval = -ESRCH;
4207 read_lock(&tasklist_lock);
4208 p = find_process_by_pid(pid);
4209 if (p) {
4210 retval = security_task_getscheduler(p);
4211 if (!retval)
4212 retval = p->policy;
4213 }
4214 read_unlock(&tasklist_lock);
4215
4216out_nounlock:
4217 return retval;
4218}
4219
4220/**
4221 * sys_sched_getparam - get the RT priority of a thread
4222 * @pid: the pid in question.
4223 * @param: structure containing the RT priority.
4224 */
4225asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4226{
4227 struct sched_param lp;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004228 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004229 int retval = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004230
4231 if (!param || pid < 0)
4232 goto out_nounlock;
4233
4234 read_lock(&tasklist_lock);
4235 p = find_process_by_pid(pid);
4236 retval = -ESRCH;
4237 if (!p)
4238 goto out_unlock;
4239
4240 retval = security_task_getscheduler(p);
4241 if (retval)
4242 goto out_unlock;
4243
4244 lp.sched_priority = p->rt_priority;
4245 read_unlock(&tasklist_lock);
4246
4247 /*
4248 * This one might sleep, we cannot do it with a spinlock held ...
4249 */
4250 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4251
4252out_nounlock:
4253 return retval;
4254
4255out_unlock:
4256 read_unlock(&tasklist_lock);
4257 return retval;
4258}
4259
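/*
 * Example (user space, not compiled here): a sketch reading back both the
 * policy and the RT priority with the glibc wrappers for the two syscalls
 * above.
 */
#if 0
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp;
        int policy = sched_getscheduler(0);     /* 0 == calling thread */

        if (policy == -1 || sched_getparam(0, &sp) == -1) {
                perror("sched_getscheduler/sched_getparam");
                return 1;
        }
        printf("policy=%d rt_priority=%d\n", policy, sp.sched_priority);
        return 0;
}
#endif
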
4260long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4261{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004262 cpumask_t cpus_allowed;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004263 struct task_struct *p;
4264 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004265
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004266 mutex_lock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004267 read_lock(&tasklist_lock);
4268
4269 p = find_process_by_pid(pid);
4270 if (!p) {
4271 read_unlock(&tasklist_lock);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004272 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004273 return -ESRCH;
4274 }
4275
4276 /*
4277 * It is not safe to call set_cpus_allowed with the
4278 * tasklist_lock held. We will bump the task_struct's
4279 * usage count and then drop tasklist_lock.
4280 */
4281 get_task_struct(p);
4282 read_unlock(&tasklist_lock);
4283
4284 retval = -EPERM;
4285 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4286 !capable(CAP_SYS_NICE))
4287 goto out_unlock;
4288
David Quigleye7834f82006-06-23 02:03:59 -07004289 retval = security_task_setscheduler(p, 0, NULL);
4290 if (retval)
4291 goto out_unlock;
4292
Linus Torvalds1da177e2005-04-16 15:20:36 -07004293 cpus_allowed = cpuset_cpus_allowed(p);
4294 cpus_and(new_mask, new_mask, cpus_allowed);
4295 retval = set_cpus_allowed(p, new_mask);
4296
4297out_unlock:
4298 put_task_struct(p);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004299 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004300 return retval;
4301}
4302
4303static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4304 cpumask_t *new_mask)
4305{
4306 if (len < sizeof(cpumask_t)) {
4307 memset(new_mask, 0, sizeof(cpumask_t));
4308 } else if (len > sizeof(cpumask_t)) {
4309 len = sizeof(cpumask_t);
4310 }
4311 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4312}
4313
4314/**
4315 * sys_sched_setaffinity - set the cpu affinity of a process
4316 * @pid: pid of the process
4317 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4318 * @user_mask_ptr: user-space pointer to the new cpu mask
4319 */
4320asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4321 unsigned long __user *user_mask_ptr)
4322{
4323 cpumask_t new_mask;
4324 int retval;
4325
4326 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4327 if (retval)
4328 return retval;
4329
4330 return sched_setaffinity(pid, new_mask);
4331}
4332
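/*
 * Example (user space, not compiled here): a sketch pinning the calling
 * thread to CPU 0 with the glibc sched_setaffinity(2) wrapper and the
 * CPU_* macros from <sched.h> (they require _GNU_SOURCE).
 */
#if 0
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);               /* allow CPU 0 only */
        if (sched_setaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_setaffinity");
                return 1;
        }
        return 0;
}
#endif
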
4333/*
4334 * Represents all CPUs present in the system.
4335 * In systems capable of hotplug, this map could dynamically grow
4336 * as new CPUs are detected in the system via any platform-specific
4337 * method, such as ACPI, for example.
4338 */
4339
Andi Kleen4cef0c62006-01-11 22:44:57 +01004340cpumask_t cpu_present_map __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004341EXPORT_SYMBOL(cpu_present_map);
4342
4343#ifndef CONFIG_SMP
Andi Kleen4cef0c62006-01-11 22:44:57 +01004344cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
Greg Bankse16b38f2006-10-02 02:17:40 -07004345EXPORT_SYMBOL(cpu_online_map);
4346
Andi Kleen4cef0c62006-01-11 22:44:57 +01004347cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
Greg Bankse16b38f2006-10-02 02:17:40 -07004348EXPORT_SYMBOL(cpu_possible_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004349#endif
4350
4351long sched_getaffinity(pid_t pid, cpumask_t *mask)
4352{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004353 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004354 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004355
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004356 mutex_lock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004357 read_lock(&tasklist_lock);
4358
4359 retval = -ESRCH;
4360 p = find_process_by_pid(pid);
4361 if (!p)
4362 goto out_unlock;
4363
David Quigleye7834f82006-06-23 02:03:59 -07004364 retval = security_task_getscheduler(p);
4365 if (retval)
4366 goto out_unlock;
4367
Jack Steiner2f7016d2006-02-01 03:05:18 -08004368 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004369
4370out_unlock:
4371 read_unlock(&tasklist_lock);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004372 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004373 if (retval)
4374 return retval;
4375
4376 return 0;
4377}
4378
4379/**
4380 * sys_sched_getaffinity - get the cpu affinity of a process
4381 * @pid: pid of the process
4382 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4383 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4384 */
4385asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4386 unsigned long __user *user_mask_ptr)
4387{
4388 int ret;
4389 cpumask_t mask;
4390
4391 if (len < sizeof(cpumask_t))
4392 return -EINVAL;
4393
4394 ret = sched_getaffinity(pid, &mask);
4395 if (ret < 0)
4396 return ret;
4397
4398 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4399 return -EFAULT;
4400
4401 return sizeof(cpumask_t);
4402}
4403
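/*
 * Example (user space, not compiled here): a sketch printing the current
 * affinity mask. Note that the glibc wrapper returns 0 on success, while
 * the raw syscall above returns sizeof(cpumask_t).
 */
#if 0
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;
        int cpu;

        if (sched_getaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_getaffinity");
                return 1;
        }
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &set))
                        printf("cpu%d ", cpu);
        printf("\n");
        return 0;
}
#endif
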
4404/**
4405 * sys_sched_yield - yield the current processor to other threads.
4406 *
Ingo Molnardd41f592007-07-09 18:51:59 +02004407 * This function yields the current CPU to other tasks. If there are no
4408 * other threads running on this CPU then this function will return.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004409 */
4410asmlinkage long sys_sched_yield(void)
4411{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004412 struct rq *rq = this_rq_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004413
4414 schedstat_inc(rq, yld_cnt);
Ingo Molnardd41f592007-07-09 18:51:59 +02004415 if (unlikely(rq->nr_running == 1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004416 schedstat_inc(rq, yld_act_empty);
Ingo Molnardd41f592007-07-09 18:51:59 +02004417 else
4418 current->sched_class->yield_task(rq, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004419
4420 /*
4421 * Since we are going to call schedule() anyway, there's
4422 * no need to preempt or enable interrupts:
4423 */
4424 __release(rq->lock);
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07004425 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004426 _raw_spin_unlock(&rq->lock);
4427 preempt_enable_no_resched();
4428
4429 schedule();
4430
4431 return 0;
4432}
4433
Andrew Mortone7b38402006-06-30 01:56:00 -07004434static void __cond_resched(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004435{
Ingo Molnar8e0a43d2006-06-23 02:05:23 -07004436#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4437 __might_sleep(__FILE__, __LINE__);
4438#endif
Ingo Molnar5bbcfd92005-07-07 17:57:04 -07004439 /*
4440 * The BKS might be reacquired before we have dropped
4441 * PREEMPT_ACTIVE, which could trigger a second
4442 * cond_resched() call.
4443 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004444 do {
4445 add_preempt_count(PREEMPT_ACTIVE);
4446 schedule();
4447 sub_preempt_count(PREEMPT_ACTIVE);
4448 } while (need_resched());
4449}
4450
4451int __sched cond_resched(void)
4452{
Ingo Molnar94142322006-12-29 16:48:13 -08004453 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4454 system_state == SYSTEM_RUNNING) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004455 __cond_resched();
4456 return 1;
4457 }
4458 return 0;
4459}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004460EXPORT_SYMBOL(cond_resched);
4461
4462/*
4463 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4464 * call schedule, and on return reacquire the lock.
4465 *
4466 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4467 * operations here to prevent schedule() from being called twice (once via
4468 * spin_unlock(), once by hand).
4469 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004470int cond_resched_lock(spinlock_t *lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004471{
Jan Kara6df3cec2005-06-13 15:52:32 -07004472 int ret = 0;
4473
Linus Torvalds1da177e2005-04-16 15:20:36 -07004474 if (need_lockbreak(lock)) {
4475 spin_unlock(lock);
4476 cpu_relax();
Jan Kara6df3cec2005-06-13 15:52:32 -07004477 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004478 spin_lock(lock);
4479 }
Ingo Molnar94142322006-12-29 16:48:13 -08004480 if (need_resched() && system_state == SYSTEM_RUNNING) {
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07004481 spin_release(&lock->dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004482 _raw_spin_unlock(lock);
4483 preempt_enable_no_resched();
4484 __cond_resched();
Jan Kara6df3cec2005-06-13 15:52:32 -07004485 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004486 spin_lock(lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004487 }
Jan Kara6df3cec2005-06-13 15:52:32 -07004488 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004489}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004490EXPORT_SYMBOL(cond_resched_lock);
4491
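/*
 * A hypothetical in-kernel sketch of the pattern cond_resched_lock() is
 * meant for: draining a list under a spinlock while giving other tasks
 * (and lock waiters) a chance to run. The my_* names are made up for
 * illustration only.
 */
#if 0
static void my_drain_list(spinlock_t *lock, struct list_head *head)
{
        spin_lock(lock);
        while (!list_empty(head)) {
                struct list_head *entry = head->next;

                list_del_init(entry);
                my_process_entry(entry);        /* must not sleep: lock held */
                /*
                 * Drop the lock and reschedule if needed (or if someone is
                 * spinning on the lock), then retake it and continue.
                 */
                cond_resched_lock(lock);
        }
        spin_unlock(lock);
}
#endif
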
4492int __sched cond_resched_softirq(void)
4493{
4494 BUG_ON(!in_softirq());
4495
Ingo Molnar94142322006-12-29 16:48:13 -08004496 if (need_resched() && system_state == SYSTEM_RUNNING) {
Thomas Gleixner98d825672007-05-23 13:58:18 -07004497 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004498 __cond_resched();
4499 local_bh_disable();
4500 return 1;
4501 }
4502 return 0;
4503}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004504EXPORT_SYMBOL(cond_resched_softirq);
4505
Linus Torvalds1da177e2005-04-16 15:20:36 -07004506/**
4507 * yield - yield the current processor to other threads.
4508 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004509 * This is a shortcut for kernel-space yielding - it marks the
Linus Torvalds1da177e2005-04-16 15:20:36 -07004510 * thread runnable and calls sys_sched_yield().
4511 */
4512void __sched yield(void)
4513{
4514 set_current_state(TASK_RUNNING);
4515 sys_sched_yield();
4516}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004517EXPORT_SYMBOL(yield);
4518
4519/*
4520 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4521 * that process accounting knows that this is a task in IO wait state.
4522 *
4523 * But don't do that if it is a deliberate, throttling IO wait (this task
4524 * has set its backing_dev_info: the queue against which it should throttle)
4525 */
4526void __sched io_schedule(void)
4527{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004528 struct rq *rq = &__raw_get_cpu_var(runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004529
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004530 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004531 atomic_inc(&rq->nr_iowait);
4532 schedule();
4533 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004534 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004535}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004536EXPORT_SYMBOL(io_schedule);
4537
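/*
 * A hypothetical sketch of where io_schedule() fits: a driver-style wait
 * for an I/O completion flag, so that the sleep is accounted as iowait
 * (and, via delayacct above, as block I/O delay). struct my_request and
 * its fields are made up for illustration.
 */
#if 0
static void my_wait_for_io(struct my_request *req)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait(&req->waitq, &wait, TASK_UNINTERRUPTIBLE);
                if (req->done)
                        break;
                io_schedule();  /* like schedule(), but counted as iowait */
        }
        finish_wait(&req->waitq, &wait);
}
#endif
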
4538long __sched io_schedule_timeout(long timeout)
4539{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004540 struct rq *rq = &__raw_get_cpu_var(runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004541 long ret;
4542
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004543 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004544 atomic_inc(&rq->nr_iowait);
4545 ret = schedule_timeout(timeout);
4546 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004547 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004548 return ret;
4549}
4550
4551/**
4552 * sys_sched_get_priority_max - return maximum RT priority.
4553 * @policy: scheduling class.
4554 *
4555 * this syscall returns the maximum rt_priority that can be used
4556 * by a given scheduling class.
4557 */
4558asmlinkage long sys_sched_get_priority_max(int policy)
4559{
4560 int ret = -EINVAL;
4561
4562 switch (policy) {
4563 case SCHED_FIFO:
4564 case SCHED_RR:
4565 ret = MAX_USER_RT_PRIO-1;
4566 break;
4567 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08004568 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02004569 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004570 ret = 0;
4571 break;
4572 }
4573 return ret;
4574}
4575
4576/**
4577 * sys_sched_get_priority_min - return minimum RT priority.
4578 * @policy: scheduling class.
4579 *
4580 * this syscall returns the minimum rt_priority that can be used
4581 * by a given scheduling class.
4582 */
4583asmlinkage long sys_sched_get_priority_min(int policy)
4584{
4585 int ret = -EINVAL;
4586
4587 switch (policy) {
4588 case SCHED_FIFO:
4589 case SCHED_RR:
4590 ret = 1;
4591 break;
4592 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08004593 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02004594 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004595 ret = 0;
4596 }
4597 return ret;
4598}
4599
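/*
 * Example (user space, not compiled here): querying the valid priority
 * range for a policy before calling sched_setscheduler(2).
 */
#if 0
#include <sched.h>
#include <stdio.h>

int main(void)
{
        int lo = sched_get_priority_min(SCHED_FIFO);
        int hi = sched_get_priority_max(SCHED_FIFO);

        if (lo == -1 || hi == -1)
                return 1;
        printf("SCHED_FIFO priorities: %d..%d\n", lo, hi);
        return 0;
}
#endif
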
4600/**
4601 * sys_sched_rr_get_interval - return the default timeslice of a process.
4602 * @pid: pid of the process.
4603 * @interval: userspace pointer to the timeslice value.
4604 *
4605 * this syscall writes the default timeslice value of a given process
4606 * into the user-space timespec buffer. A value of '0' means infinity.
4607 */
4608asmlinkage
4609long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4610{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004611 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004612 int retval = -EINVAL;
4613 struct timespec t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004614
4615 if (pid < 0)
4616 goto out_nounlock;
4617
4618 retval = -ESRCH;
4619 read_lock(&tasklist_lock);
4620 p = find_process_by_pid(pid);
4621 if (!p)
4622 goto out_unlock;
4623
4624 retval = security_task_getscheduler(p);
4625 if (retval)
4626 goto out_unlock;
4627
Peter Williamsb78709c2006-06-26 16:58:00 +10004628 jiffies_to_timespec(p->policy == SCHED_FIFO ?
Ingo Molnardd41f592007-07-09 18:51:59 +02004629 0 : static_prio_timeslice(p->static_prio), &t);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004630 read_unlock(&tasklist_lock);
4631 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4632out_nounlock:
4633 return retval;
4634out_unlock:
4635 read_unlock(&tasklist_lock);
4636 return retval;
4637}
4638
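/*
 * Example (user space, not compiled here): reading the round-robin
 * timeslice of the calling thread. A zero result means an infinite
 * timeslice (e.g. SCHED_FIFO), as the comment above notes.
 */
#if 0
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) == -1) {
                perror("sched_rr_get_interval");
                return 1;
        }
        printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}
#endif
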
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004639static const char stat_nam[] = "RSDTtZX";
Ingo Molnar36c8b582006-07-03 00:25:41 -07004640
4641static void show_task(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004642{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004643 unsigned long free = 0;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004644 unsigned state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004645
Linus Torvalds1da177e2005-04-16 15:20:36 -07004646 state = p->state ? __ffs(p->state) + 1 : 0;
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004647 printk("%-13.13s %c", p->comm,
4648 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
Linus Torvalds1da177e2005-04-16 15:20:36 -07004649#if (BITS_PER_LONG == 32)
4650 if (state == TASK_RUNNING)
4651 printk(" running ");
4652 else
4653 printk(" %08lX ", thread_saved_pc(p));
4654#else
4655 if (state == TASK_RUNNING)
4656 printk(" running task ");
4657 else
4658 printk(" %016lx ", thread_saved_pc(p));
4659#endif
4660#ifdef CONFIG_DEBUG_STACK_USAGE
4661 {
Al Viro10ebffd2005-11-13 16:06:56 -08004662 unsigned long *n = end_of_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004663 while (!*n)
4664 n++;
Al Viro10ebffd2005-11-13 16:06:56 -08004665 free = (unsigned long)n - (unsigned long)end_of_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004666 }
4667#endif
Ingo Molnar35f6f752007-04-06 21:18:06 +02004668 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004669 if (!p->mm)
4670 printk(" (L-TLB)\n");
4671 else
4672 printk(" (NOTLB)\n");
4673
4674 if (state != TASK_RUNNING)
4675 show_stack(p, NULL);
4676}
4677
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004678void show_state_filter(unsigned long state_filter)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004679{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004680 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004681
4682#if (BITS_PER_LONG == 32)
4683 printk("\n"
Chris Caputo301827a2006-12-06 20:39:11 -08004684 " free sibling\n");
4685 printk(" task PC stack pid father child younger older\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004686#else
4687 printk("\n"
Chris Caputo301827a2006-12-06 20:39:11 -08004688 " free sibling\n");
4689 printk(" task PC stack pid father child younger older\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004690#endif
4691 read_lock(&tasklist_lock);
4692 do_each_thread(g, p) {
4693 /*
4694		 * reset the NMI-timeout, listing all tasks on a slow
4695		 * console might take a lot of time:
4696 */
4697 touch_nmi_watchdog();
Ingo Molnar39bc89f2007-04-25 20:50:03 -07004698 if (!state_filter || (p->state & state_filter))
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004699 show_task(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004700 } while_each_thread(g, p);
4701
Jeremy Fitzhardinge04c91672007-05-08 00:28:05 -07004702 touch_all_softlockup_watchdogs();
4703
Ingo Molnardd41f592007-07-09 18:51:59 +02004704#ifdef CONFIG_SCHED_DEBUG
4705 sysrq_sched_debug_show();
4706#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004707 read_unlock(&tasklist_lock);
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004708 /*
4709 * Only show locks if all tasks are dumped:
4710 */
4711 if (state_filter == -1)
4712 debug_show_all_locks();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004713}
4714
Ingo Molnar1df21052007-07-09 18:51:58 +02004715void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4716{
Ingo Molnardd41f592007-07-09 18:51:59 +02004717 idle->sched_class = &idle_sched_class;
Ingo Molnar1df21052007-07-09 18:51:58 +02004718}
4719
Ingo Molnarf340c0d2005-06-28 16:40:42 +02004720/**
4721 * init_idle - set up an idle thread for a given CPU
4722 * @idle: task in question
4723 * @cpu: cpu the idle task belongs to
4724 *
4725 * NOTE: this function does not set the idle thread's NEED_RESCHED
4726 * flag, to make booting more robust.
4727 */
Nick Piggin5c1e1762006-10-03 01:14:04 -07004728void __cpuinit init_idle(struct task_struct *idle, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004729{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004730 struct rq *rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004731 unsigned long flags;
4732
Ingo Molnardd41f592007-07-09 18:51:59 +02004733 __sched_fork(idle);
4734 idle->se.exec_start = sched_clock();
4735
Ingo Molnarb29739f2006-06-27 02:54:51 -07004736 idle->prio = idle->normal_prio = MAX_PRIO;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004737 idle->cpus_allowed = cpumask_of_cpu(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004738 __set_task_cpu(idle, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004739
4740 spin_lock_irqsave(&rq->lock, flags);
4741 rq->curr = rq->idle = idle;
Nick Piggin4866cde2005-06-25 14:57:23 -07004742#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4743 idle->oncpu = 1;
4744#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004745 spin_unlock_irqrestore(&rq->lock, flags);
4746
4747 /* Set the preempt count _outside_ the spinlocks! */
4748#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
Al Viroa1261f542005-11-13 16:06:55 -08004749 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004750#else
Al Viroa1261f542005-11-13 16:06:55 -08004751 task_thread_info(idle)->preempt_count = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004752#endif
Ingo Molnardd41f592007-07-09 18:51:59 +02004753 /*
4754 * The idle tasks have their own, simple scheduling class:
4755 */
4756 idle->sched_class = &idle_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004757}
4758
4759/*
4760 * In a system that switches off the HZ timer nohz_cpu_mask
4761 * indicates which cpus entered this state. This is used
4762 * in the rcu update to wait only for active cpus. For systems
4763 * which do not switch off the HZ timer nohz_cpu_mask should
4764 * always be CPU_MASK_NONE.
4765 */
4766cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4767
Ingo Molnardd41f592007-07-09 18:51:59 +02004768/*
4769 * Increase the granularity value when there are more CPUs,
4770 * because with more CPUs the 'effective latency' as visible
4771 * to users decreases. But the relationship is not linear,
4772 * so pick a second-best guess by going with the log2 of the
4773 * number of CPUs.
4774 *
4775 * This idea comes from the SD scheduler of Con Kolivas:
4776 */
4777static inline void sched_init_granularity(void)
4778{
4779 unsigned int factor = 1 + ilog2(num_online_cpus());
4780 const unsigned long gran_limit = 10000000;
4781
4782 sysctl_sched_granularity *= factor;
4783 if (sysctl_sched_granularity > gran_limit)
4784 sysctl_sched_granularity = gran_limit;
4785
4786 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4787 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4788}
4789
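/*
 * Worked example for the scaling above (base values kept symbolic on
 * purpose): with 16 online CPUs, factor = 1 + ilog2(16) = 5, so a base
 * granularity G becomes min(5 * G, 10 ms); the runtime limit is then 4x
 * and the wakeup granularity 1/2 of that scaled value.
 */
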
Linus Torvalds1da177e2005-04-16 15:20:36 -07004790#ifdef CONFIG_SMP
4791/*
4792 * This is how migration works:
4793 *
Ingo Molnar70b97a72006-07-03 00:25:42 -07004794 * 1) we queue a struct migration_req structure in the source CPU's
Linus Torvalds1da177e2005-04-16 15:20:36 -07004795 * runqueue and wake up that CPU's migration thread.
4796 * 2) we down() the locked semaphore => thread blocks.
4797 * 3) migration thread wakes up (implicitly it forces the migrated
4798 * thread off the CPU)
4799 * 4) it gets the migration request and checks whether the migrated
4800 * task is still in the wrong runqueue.
4801 * 5) if it's in the wrong runqueue then the migration thread removes
4802 * it and puts it into the right queue.
4803 * 6) migration thread up()s the semaphore.
4804 * 7) we wake up and the migration is done.
4805 */
4806
4807/*
4808 * Change a given task's CPU affinity. Migrate the thread to a
4809 * proper CPU and schedule it away if the CPU it's executing on
4810 * is removed from the allowed bitmask.
4811 *
4812 * NOTE: the caller must have a valid reference to the task, the
4813 * task must not exit() & deallocate itself prematurely. The
4814 * call is not atomic; no spinlocks may be held.
4815 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004816int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004817{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004818 struct migration_req req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004819 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004820 struct rq *rq;
Ingo Molnar48f24c42006-07-03 00:25:40 -07004821 int ret = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004822
4823 rq = task_rq_lock(p, &flags);
4824 if (!cpus_intersects(new_mask, cpu_online_map)) {
4825 ret = -EINVAL;
4826 goto out;
4827 }
4828
4829 p->cpus_allowed = new_mask;
4830 /* Can the task run on the task's current CPU? If so, we're done */
4831 if (cpu_isset(task_cpu(p), new_mask))
4832 goto out;
4833
4834 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4835 /* Need help from migration thread: drop lock and wait. */
4836 task_rq_unlock(rq, &flags);
4837 wake_up_process(rq->migration_thread);
4838 wait_for_completion(&req.done);
4839 tlb_migrate_finish(p->mm);
4840 return 0;
4841 }
4842out:
4843 task_rq_unlock(rq, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004844
Linus Torvalds1da177e2005-04-16 15:20:36 -07004845 return ret;
4846}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004847EXPORT_SYMBOL_GPL(set_cpus_allowed);
4848
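/*
 * A hypothetical in-kernel sketch: restricting an already-created kernel
 * thread to a single CPU via set_cpus_allowed(). The caller must hold a
 * valid reference on the task, as the comment above requires.
 */
#if 0
static int my_pin_thread(struct task_struct *tsk, int cpu)
{
        cpumask_t mask = cpumask_of_cpu(cpu);

        /* Returns 0 on success, -EINVAL if no allowed CPU is online. */
        return set_cpus_allowed(tsk, mask);
}
#endif
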
4849/*
4850 * Move (not current) task off this cpu, onto dest cpu. We're doing
4851 * this because either it can't run here any more (set_cpus_allowed()
4852 * away from this CPU, or CPU going down), or because we're
4853 * attempting to rebalance this task on exec (sched_exec).
4854 *
4855 * So we race with normal scheduler movements, but that's OK, as long
4856 * as the task is no longer on this CPU.
Kirill Korotaevefc30812006-06-27 02:54:32 -07004857 *
4858 * Returns non-zero if task was successfully migrated.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004859 */
Kirill Korotaevefc30812006-06-27 02:54:32 -07004860static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004861{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004862 struct rq *rq_dest, *rq_src;
Ingo Molnardd41f592007-07-09 18:51:59 +02004863 int ret = 0, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004864
4865 if (unlikely(cpu_is_offline(dest_cpu)))
Kirill Korotaevefc30812006-06-27 02:54:32 -07004866 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004867
4868 rq_src = cpu_rq(src_cpu);
4869 rq_dest = cpu_rq(dest_cpu);
4870
4871 double_rq_lock(rq_src, rq_dest);
4872 /* Already moved. */
4873 if (task_cpu(p) != src_cpu)
4874 goto out;
4875 /* Affinity changed (again). */
4876 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4877 goto out;
4878
Ingo Molnardd41f592007-07-09 18:51:59 +02004879 on_rq = p->se.on_rq;
4880 if (on_rq)
4881 deactivate_task(rq_src, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004882 set_task_cpu(p, dest_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004883 if (on_rq) {
4884 activate_task(rq_dest, p, 0);
4885 check_preempt_curr(rq_dest, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004886 }
Kirill Korotaevefc30812006-06-27 02:54:32 -07004887 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004888out:
4889 double_rq_unlock(rq_src, rq_dest);
Kirill Korotaevefc30812006-06-27 02:54:32 -07004890 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004891}
4892
4893/*
4894 * migration_thread - this is a highprio system thread that performs
4895 * thread migration by bumping thread off CPU then 'pushing' onto
4896 * another runqueue.
4897 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004898static int migration_thread(void *data)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004899{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004900 int cpu = (long)data;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004901 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004902
4903 rq = cpu_rq(cpu);
4904 BUG_ON(rq->migration_thread != current);
4905
4906 set_current_state(TASK_INTERRUPTIBLE);
4907 while (!kthread_should_stop()) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07004908 struct migration_req *req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004909 struct list_head *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004910
Christoph Lameter3e1d1d22005-06-24 23:13:50 -07004911 try_to_freeze();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004912
4913 spin_lock_irq(&rq->lock);
4914
4915 if (cpu_is_offline(cpu)) {
4916 spin_unlock_irq(&rq->lock);
4917 goto wait_to_die;
4918 }
4919
4920 if (rq->active_balance) {
4921 active_load_balance(rq, cpu);
4922 rq->active_balance = 0;
4923 }
4924
4925 head = &rq->migration_queue;
4926
4927 if (list_empty(head)) {
4928 spin_unlock_irq(&rq->lock);
4929 schedule();
4930 set_current_state(TASK_INTERRUPTIBLE);
4931 continue;
4932 }
Ingo Molnar70b97a72006-07-03 00:25:42 -07004933 req = list_entry(head->next, struct migration_req, list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004934 list_del_init(head->next);
4935
Nick Piggin674311d2005-06-25 14:57:27 -07004936 spin_unlock(&rq->lock);
4937 __migrate_task(req->task, cpu, req->dest_cpu);
4938 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004939
4940 complete(&req->done);
4941 }
4942 __set_current_state(TASK_RUNNING);
4943 return 0;
4944
4945wait_to_die:
4946 /* Wait for kthread_stop */
4947 set_current_state(TASK_INTERRUPTIBLE);
4948 while (!kthread_should_stop()) {
4949 schedule();
4950 set_current_state(TASK_INTERRUPTIBLE);
4951 }
4952 __set_current_state(TASK_RUNNING);
4953 return 0;
4954}
4955
4956#ifdef CONFIG_HOTPLUG_CPU
Kirill Korotaev054b9102006-12-10 02:20:11 -08004957/*
4958 * Figure out where a task on a dead CPU should go; use force if necessary.
4959 * NOTE: interrupts should be disabled by the caller
4960 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07004961static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004962{
Kirill Korotaevefc30812006-06-27 02:54:32 -07004963 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004964 cpumask_t mask;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004965 struct rq *rq;
4966 int dest_cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004967
Kirill Korotaevefc30812006-06-27 02:54:32 -07004968restart:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004969 /* On same node? */
4970 mask = node_to_cpumask(cpu_to_node(dead_cpu));
Ingo Molnar48f24c42006-07-03 00:25:40 -07004971 cpus_and(mask, mask, p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004972 dest_cpu = any_online_cpu(mask);
4973
4974 /* On any allowed CPU? */
4975 if (dest_cpu == NR_CPUS)
Ingo Molnar48f24c42006-07-03 00:25:40 -07004976 dest_cpu = any_online_cpu(p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004977
4978 /* No more Mr. Nice Guy. */
4979 if (dest_cpu == NR_CPUS) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07004980 rq = task_rq_lock(p, &flags);
4981 cpus_setall(p->cpus_allowed);
4982 dest_cpu = any_online_cpu(p->cpus_allowed);
Kirill Korotaevefc30812006-06-27 02:54:32 -07004983 task_rq_unlock(rq, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004984
4985 /*
4986 * Don't tell them about moving exiting tasks or
4987 * kernel threads (both mm NULL), since they never
4988 * leave kernel.
4989 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07004990 if (p->mm && printk_ratelimit())
Linus Torvalds1da177e2005-04-16 15:20:36 -07004991 printk(KERN_INFO "process %d (%s) no "
4992 "longer affine to cpu%d\n",
Ingo Molnar48f24c42006-07-03 00:25:40 -07004993 p->pid, p->comm, dead_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004994 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07004995 if (!__migrate_task(p, dead_cpu, dest_cpu))
Kirill Korotaevefc30812006-06-27 02:54:32 -07004996 goto restart;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004997}
4998
4999/*
5000 * While a dead CPU has no uninterruptible tasks queued at this point,
5001 * it might still have a nonzero ->nr_uninterruptible counter, because
5002 * for performance reasons the counter is not strictly tracking tasks to
5003 * their home CPUs. So we just add the counter to another CPU's counter,
5004 * to keep the global sum constant after CPU-down:
5005 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07005006static void migrate_nr_uninterruptible(struct rq *rq_src)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005007{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005008 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005009 unsigned long flags;
5010
5011 local_irq_save(flags);
5012 double_rq_lock(rq_src, rq_dest);
5013 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5014 rq_src->nr_uninterruptible = 0;
5015 double_rq_unlock(rq_src, rq_dest);
5016 local_irq_restore(flags);
5017}
5018
5019/* Run through task list and migrate tasks from the dead cpu. */
5020static void migrate_live_tasks(int src_cpu)
5021{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005022 struct task_struct *p, *t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005023
5024 write_lock_irq(&tasklist_lock);
5025
Ingo Molnar48f24c42006-07-03 00:25:40 -07005026 do_each_thread(t, p) {
5027 if (p == current)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005028 continue;
5029
Ingo Molnar48f24c42006-07-03 00:25:40 -07005030 if (task_cpu(p) == src_cpu)
5031 move_task_off_dead_cpu(src_cpu, p);
5032 } while_each_thread(t, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005033
5034 write_unlock_irq(&tasklist_lock);
5035}
5036
Ingo Molnardd41f592007-07-09 18:51:59 +02005037/*
5038 * Schedules idle task to be the next runnable task on current CPU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005039 * It does so by boosting its priority to the highest possible and adding it to
Ingo Molnar48f24c42006-07-03 00:25:40 -07005040 * the _front_ of the runqueue. Used by CPU offline code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005041 */
5042void sched_idle_next(void)
5043{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005044 int this_cpu = smp_processor_id();
Ingo Molnar70b97a72006-07-03 00:25:42 -07005045 struct rq *rq = cpu_rq(this_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005046 struct task_struct *p = rq->idle;
5047 unsigned long flags;
5048
5049 /* cpu has to be offline */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005050 BUG_ON(cpu_online(this_cpu));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005051
Ingo Molnar48f24c42006-07-03 00:25:40 -07005052 /*
5053	 * Strictly not necessary since the rest of the CPUs are stopped by now
5054	 * and interrupts are disabled on the current cpu.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005055 */
5056 spin_lock_irqsave(&rq->lock, flags);
5057
Ingo Molnardd41f592007-07-09 18:51:59 +02005058 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005059
5060 /* Add idle task to the _front_ of its priority queue: */
Ingo Molnardd41f592007-07-09 18:51:59 +02005061 activate_idle_task(p, rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005062
5063 spin_unlock_irqrestore(&rq->lock, flags);
5064}
5065
Ingo Molnar48f24c42006-07-03 00:25:40 -07005066/*
5067 * Ensures that the idle task is using init_mm right before its cpu goes
Linus Torvalds1da177e2005-04-16 15:20:36 -07005068 * offline.
5069 */
5070void idle_task_exit(void)
5071{
5072 struct mm_struct *mm = current->active_mm;
5073
5074 BUG_ON(cpu_online(smp_processor_id()));
5075
5076 if (mm != &init_mm)
5077 switch_mm(mm, &init_mm, current);
5078 mmdrop(mm);
5079}
5080
Kirill Korotaev054b9102006-12-10 02:20:11 -08005081/* called under rq->lock with disabled interrupts */
Ingo Molnar36c8b582006-07-03 00:25:41 -07005082static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005083{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005084 struct rq *rq = cpu_rq(dead_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005085
5086 /* Must be exiting, otherwise would be on tasklist. */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005087 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005088
5089 /* Cannot have done final schedule yet: would have vanished. */
Oleg Nesterovc394cc92006-09-29 02:01:11 -07005090 BUG_ON(p->state == TASK_DEAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005091
Ingo Molnar48f24c42006-07-03 00:25:40 -07005092 get_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005093
5094 /*
5095 * Drop lock around migration; if someone else moves it,
5096 * that's OK. No task can be added to this CPU, so iteration is
5097 * fine.
Kirill Korotaev054b9102006-12-10 02:20:11 -08005098 * NOTE: interrupts should be left disabled --dev@
Linus Torvalds1da177e2005-04-16 15:20:36 -07005099 */
Kirill Korotaev054b9102006-12-10 02:20:11 -08005100 spin_unlock(&rq->lock);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005101 move_task_off_dead_cpu(dead_cpu, p);
Kirill Korotaev054b9102006-12-10 02:20:11 -08005102 spin_lock(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005103
Ingo Molnar48f24c42006-07-03 00:25:40 -07005104 put_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005105}
5106
5107/* release_task() removes task from tasklist, so we won't find dead tasks. */
5108static void migrate_dead_tasks(unsigned int dead_cpu)
5109{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005110 struct rq *rq = cpu_rq(dead_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02005111 struct task_struct *next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005112
Ingo Molnardd41f592007-07-09 18:51:59 +02005113 for ( ; ; ) {
5114 if (!rq->nr_running)
5115 break;
5116 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5117 if (!next)
5118 break;
5119 migrate_dead(dead_cpu, next);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005120 }
5121}
5122#endif /* CONFIG_HOTPLUG_CPU */
5123
5124/*
5125 * migration_call - callback that gets triggered when a CPU is added.
5126 * Here we can start up the necessary migration thread for the new CPU.
5127 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005128static int __cpuinit
5129migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005130{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005131 struct task_struct *p;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005132 int cpu = (long)hcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005133 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07005134 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005135
5136 switch (action) {
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005137 case CPU_LOCK_ACQUIRE:
5138 mutex_lock(&sched_hotcpu_mutex);
5139 break;
5140
Linus Torvalds1da177e2005-04-16 15:20:36 -07005141 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005142 case CPU_UP_PREPARE_FROZEN:
Ingo Molnardd41f592007-07-09 18:51:59 +02005143 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005144 if (IS_ERR(p))
5145 return NOTIFY_BAD;
5146 p->flags |= PF_NOFREEZE;
5147 kthread_bind(p, cpu);
5148 /* Must be high prio: stop_machine expects to yield to it. */
5149 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02005150 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005151 task_rq_unlock(rq, &flags);
5152 cpu_rq(cpu)->migration_thread = p;
5153 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005154
Linus Torvalds1da177e2005-04-16 15:20:36 -07005155 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005156 case CPU_ONLINE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005157		/* Strictly unnecessary, as first user will wake it. */
5158 wake_up_process(cpu_rq(cpu)->migration_thread);
5159 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005160
Linus Torvalds1da177e2005-04-16 15:20:36 -07005161#ifdef CONFIG_HOTPLUG_CPU
5162 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005163 case CPU_UP_CANCELED_FROZEN:
Heiko Carstensfc75cdf2006-06-25 05:49:10 -07005164 if (!cpu_rq(cpu)->migration_thread)
5165 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005166 /* Unbind it from offline cpu so it can run. Fall thru. */
Heiko Carstensa4c4af72005-11-07 00:58:38 -08005167 kthread_bind(cpu_rq(cpu)->migration_thread,
5168 any_online_cpu(cpu_online_map));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005169 kthread_stop(cpu_rq(cpu)->migration_thread);
5170 cpu_rq(cpu)->migration_thread = NULL;
5171 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005172
Linus Torvalds1da177e2005-04-16 15:20:36 -07005173 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005174 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005175 migrate_live_tasks(cpu);
5176 rq = cpu_rq(cpu);
5177 kthread_stop(rq->migration_thread);
5178 rq->migration_thread = NULL;
5179 /* Idle task back to normal (off runqueue, low prio) */
5180 rq = task_rq_lock(rq->idle, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02005181 deactivate_task(rq, rq->idle, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005182 rq->idle->static_prio = MAX_PRIO;
Ingo Molnardd41f592007-07-09 18:51:59 +02005183 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5184 rq->idle->sched_class = &idle_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005185 migrate_dead_tasks(cpu);
5186 task_rq_unlock(rq, &flags);
5187 migrate_nr_uninterruptible(rq);
5188 BUG_ON(rq->nr_running != 0);
5189
5190 /* No need to migrate the tasks: it was best-effort if
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005191 * they didn't take sched_hotcpu_mutex. Just wake up
Linus Torvalds1da177e2005-04-16 15:20:36 -07005192 * the requestors. */
5193 spin_lock_irq(&rq->lock);
5194 while (!list_empty(&rq->migration_queue)) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07005195 struct migration_req *req;
5196
Linus Torvalds1da177e2005-04-16 15:20:36 -07005197 req = list_entry(rq->migration_queue.next,
Ingo Molnar70b97a72006-07-03 00:25:42 -07005198 struct migration_req, list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005199 list_del_init(&req->list);
5200 complete(&req->done);
5201 }
5202 spin_unlock_irq(&rq->lock);
5203 break;
5204#endif
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005205 case CPU_LOCK_RELEASE:
5206 mutex_unlock(&sched_hotcpu_mutex);
5207 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005208 }
5209 return NOTIFY_OK;
5210}
5211
5212/* Register at highest priority so that task migration (migrate_all_tasks)
5213 * happens before everything else.
5214 */
Chandra Seetharaman26c21432006-06-27 02:54:10 -07005215static struct notifier_block __cpuinitdata migration_notifier = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005216 .notifier_call = migration_call,
5217 .priority = 10
5218};
5219
5220int __init migration_init(void)
5221{
5222 void *cpu = (void *)(long)smp_processor_id();
Akinobu Mita07dccf32006-09-29 02:00:22 -07005223 int err;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005224
5225 /* Start one for the boot CPU: */
Akinobu Mita07dccf32006-09-29 02:00:22 -07005226 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5227 BUG_ON(err == NOTIFY_BAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005228 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5229 register_cpu_notifier(&migration_notifier);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005230
Linus Torvalds1da177e2005-04-16 15:20:36 -07005231 return 0;
5232}
5233#endif
5234
5235#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07005236
5237/* Number of possible processor ids */
5238int nr_cpu_ids __read_mostly = NR_CPUS;
5239EXPORT_SYMBOL(nr_cpu_ids);
5240
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005241#undef SCHED_DOMAIN_DEBUG
Linus Torvalds1da177e2005-04-16 15:20:36 -07005242#ifdef SCHED_DOMAIN_DEBUG
5243static void sched_domain_debug(struct sched_domain *sd, int cpu)
5244{
5245 int level = 0;
5246
Nick Piggin41c7ce92005-06-25 14:57:24 -07005247 if (!sd) {
5248 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5249 return;
5250 }
5251
Linus Torvalds1da177e2005-04-16 15:20:36 -07005252 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5253
5254 do {
5255 int i;
5256 char str[NR_CPUS];
5257 struct sched_group *group = sd->groups;
5258 cpumask_t groupmask;
5259
5260 cpumask_scnprintf(str, NR_CPUS, sd->span);
5261 cpus_clear(groupmask);
5262
5263 printk(KERN_DEBUG);
5264 for (i = 0; i < level + 1; i++)
5265 printk(" ");
5266 printk("domain %d: ", level);
5267
5268 if (!(sd->flags & SD_LOAD_BALANCE)) {
5269 printk("does not load-balance\n");
5270 if (sd->parent)
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005271 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5272 " has parent");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005273 break;
5274 }
5275
5276 printk("span %s\n", str);
5277
5278 if (!cpu_isset(cpu, sd->span))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005279 printk(KERN_ERR "ERROR: domain->span does not contain "
5280 "CPU%d\n", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005281 if (!cpu_isset(cpu, group->cpumask))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005282 printk(KERN_ERR "ERROR: domain->groups does not contain"
5283 " CPU%d\n", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005284
5285 printk(KERN_DEBUG);
5286 for (i = 0; i < level + 2; i++)
5287 printk(" ");
5288 printk("groups:");
5289 do {
5290 if (!group) {
5291 printk("\n");
5292 printk(KERN_ERR "ERROR: group is NULL\n");
5293 break;
5294 }
5295
Eric Dumazet5517d862007-05-08 00:32:57 -07005296 if (!group->__cpu_power) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005297 printk("\n");
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005298 printk(KERN_ERR "ERROR: domain->cpu_power not "
5299 "set\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005300 }
5301
5302 if (!cpus_weight(group->cpumask)) {
5303 printk("\n");
5304 printk(KERN_ERR "ERROR: empty group\n");
5305 }
5306
5307 if (cpus_intersects(groupmask, group->cpumask)) {
5308 printk("\n");
5309 printk(KERN_ERR "ERROR: repeated CPUs\n");
5310 }
5311
5312 cpus_or(groupmask, groupmask, group->cpumask);
5313
5314 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5315 printk(" %s", str);
5316
5317 group = group->next;
5318 } while (group != sd->groups);
5319 printk("\n");
5320
5321 if (!cpus_equal(sd->span, groupmask))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005322 printk(KERN_ERR "ERROR: groups don't span "
5323 "domain->span\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005324
5325 level++;
5326 sd = sd->parent;
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005327 if (!sd)
5328 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005329
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005330 if (!cpus_subset(groupmask, sd->span))
5331 printk(KERN_ERR "ERROR: parent span is not a superset "
5332 "of domain->span\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005333
5334 } while (sd);
5335}
5336#else
Ingo Molnar48f24c42006-07-03 00:25:40 -07005337# define sched_domain_debug(sd, cpu) do { } while (0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005338#endif
5339
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005340static int sd_degenerate(struct sched_domain *sd)
Suresh Siddha245af2c2005-06-25 14:57:25 -07005341{
5342 if (cpus_weight(sd->span) == 1)
5343 return 1;
5344
5345 /* Following flags need at least 2 groups */
5346 if (sd->flags & (SD_LOAD_BALANCE |
5347 SD_BALANCE_NEWIDLE |
5348 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005349 SD_BALANCE_EXEC |
5350 SD_SHARE_CPUPOWER |
5351 SD_SHARE_PKG_RESOURCES)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005352 if (sd->groups != sd->groups->next)
5353 return 0;
5354 }
5355
5356 /* Following flags don't use groups */
5357 if (sd->flags & (SD_WAKE_IDLE |
5358 SD_WAKE_AFFINE |
5359 SD_WAKE_BALANCE))
5360 return 0;
5361
5362 return 1;
5363}
5364
Ingo Molnar48f24c42006-07-03 00:25:40 -07005365static int
5366sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
Suresh Siddha245af2c2005-06-25 14:57:25 -07005367{
5368 unsigned long cflags = sd->flags, pflags = parent->flags;
5369
5370 if (sd_degenerate(parent))
5371 return 1;
5372
5373 if (!cpus_equal(sd->span, parent->span))
5374 return 0;
5375
5376 /* Does parent contain flags not in child? */
5377 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5378 if (cflags & SD_WAKE_AFFINE)
5379 pflags &= ~SD_WAKE_BALANCE;
5380 /* Flags needing groups don't count if only 1 group in parent */
5381 if (parent->groups == parent->groups->next) {
5382 pflags &= ~(SD_LOAD_BALANCE |
5383 SD_BALANCE_NEWIDLE |
5384 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005385 SD_BALANCE_EXEC |
5386 SD_SHARE_CPUPOWER |
5387 SD_SHARE_PKG_RESOURCES);
Suresh Siddha245af2c2005-06-25 14:57:25 -07005388 }
5389 if (~cflags & pflags)
5390 return 0;
5391
5392 return 1;
5393}
5394
Linus Torvalds1da177e2005-04-16 15:20:36 -07005395/*
5396 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5397 * hold the hotplug lock.
5398 */
John Hawkes9c1cfda2005-09-06 15:18:14 -07005399static void cpu_attach_domain(struct sched_domain *sd, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005400{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005401 struct rq *rq = cpu_rq(cpu);
Suresh Siddha245af2c2005-06-25 14:57:25 -07005402 struct sched_domain *tmp;
5403
5404 /* Remove the sched domains which do not contribute to scheduling. */
5405 for (tmp = sd; tmp; tmp = tmp->parent) {
5406 struct sched_domain *parent = tmp->parent;
5407 if (!parent)
5408 break;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005409 if (sd_parent_degenerate(tmp, parent)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005410 tmp->parent = parent->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005411 if (parent->parent)
5412 parent->parent->child = tmp;
5413 }
Suresh Siddha245af2c2005-06-25 14:57:25 -07005414 }
5415
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005416 if (sd && sd_degenerate(sd)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005417 sd = sd->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005418 if (sd)
5419 sd->child = NULL;
5420 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005421
5422 sched_domain_debug(sd, cpu);
5423
Nick Piggin674311d2005-06-25 14:57:27 -07005424 rcu_assign_pointer(rq->sd, sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005425}
5426
5427/* cpus with isolated domains */
Tim Chen67af63a2006-12-22 01:07:50 -08005428static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005429
5430/* Setup the mask of cpus configured for isolated domains */
5431static int __init isolated_cpu_setup(char *str)
5432{
5433 int ints[NR_CPUS], i;
5434
5435 str = get_options(str, ARRAY_SIZE(ints), ints);
5436 cpus_clear(cpu_isolated_map);
5437 for (i = 1; i <= ints[0]; i++)
5438 if (ints[i] < NR_CPUS)
5439 cpu_set(ints[i], cpu_isolated_map);
5440 return 1;
5441}
5442
5443__setup ("isolcpus=", isolated_cpu_setup);
5444
5445/*
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005446 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5447 * to a function which identifies what group (along with sched group) a CPU
5448 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5449 * (due to the fact that we keep track of groups covered with a cpumask_t).
Linus Torvalds1da177e2005-04-16 15:20:36 -07005450 *
5451 * init_sched_build_groups will build a circular linked list of the groups
5452 * covered by the given span, and will set each group's ->cpumask correctly,
5453 * and ->cpu_power to 0.
5454 */
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005455static void
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005456init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5457 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5458 struct sched_group **sg))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005459{
5460 struct sched_group *first = NULL, *last = NULL;
5461 cpumask_t covered = CPU_MASK_NONE;
5462 int i;
5463
5464 for_each_cpu_mask(i, span) {
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005465 struct sched_group *sg;
5466 int group = group_fn(i, cpu_map, &sg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005467 int j;
5468
5469 if (cpu_isset(i, covered))
5470 continue;
5471
5472 sg->cpumask = CPU_MASK_NONE;
Eric Dumazet5517d862007-05-08 00:32:57 -07005473 sg->__cpu_power = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005474
5475 for_each_cpu_mask(j, span) {
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005476 if (group_fn(j, cpu_map, NULL) != group)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005477 continue;
5478
5479 cpu_set(j, covered);
5480 cpu_set(j, sg->cpumask);
5481 }
5482 if (!first)
5483 first = sg;
5484 if (last)
5485 last->next = sg;
5486 last = sg;
5487 }
5488 last->next = first;
5489}
5490
John Hawkes9c1cfda2005-09-06 15:18:14 -07005491#define SD_NODES_PER_DOMAIN 16
Linus Torvalds1da177e2005-04-16 15:20:36 -07005492
John Hawkes9c1cfda2005-09-06 15:18:14 -07005493#ifdef CONFIG_NUMA
akpm@osdl.org198e2f12006-01-12 01:05:30 -08005494
John Hawkes9c1cfda2005-09-06 15:18:14 -07005495/**
5496 * find_next_best_node - find the next node to include in a sched_domain
5497 * @node: node whose sched_domain we're building
5498 * @used_nodes: nodes already in the sched_domain
5499 *
5500 * Find the next node to include in a given scheduling domain. Simply
5501 * finds the closest node not already in the @used_nodes map.
5502 *
5503 * Should use nodemask_t.
5504 */
5505static int find_next_best_node(int node, unsigned long *used_nodes)
5506{
5507 int i, n, val, min_val, best_node = 0;
5508
5509 min_val = INT_MAX;
5510
5511 for (i = 0; i < MAX_NUMNODES; i++) {
5512 /* Start at @node */
5513 n = (node + i) % MAX_NUMNODES;
5514
5515 if (!nr_cpus_node(n))
5516 continue;
5517
5518 /* Skip already used nodes */
5519 if (test_bit(n, used_nodes))
5520 continue;
5521
5522 /* Simple min distance search */
5523 val = node_distance(node, n);
5524
5525 if (val < min_val) {
5526 min_val = val;
5527 best_node = n;
5528 }
5529 }
5530
5531 set_bit(best_node, used_nodes);
5532 return best_node;
5533}
5534
5535/**
5536 * sched_domain_node_span - get a cpumask for a node's sched_domain
5537 * @node: node whose cpumask we're constructing
5538 * @size: number of nodes to include in this span
5539 *
5540 * Given a node, construct a good cpumask for its sched_domain to span. It
5541 * should be one that prevents unnecessary balancing, but also spreads tasks
5542 * out optimally.
5543 */
5544static cpumask_t sched_domain_node_span(int node)
5545{
John Hawkes9c1cfda2005-09-06 15:18:14 -07005546 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005547 cpumask_t span, nodemask;
5548 int i;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005549
5550 cpus_clear(span);
5551 bitmap_zero(used_nodes, MAX_NUMNODES);
5552
5553 nodemask = node_to_cpumask(node);
5554 cpus_or(span, span, nodemask);
5555 set_bit(node, used_nodes);
5556
5557 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5558 int next_node = find_next_best_node(node, used_nodes);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005559
John Hawkes9c1cfda2005-09-06 15:18:14 -07005560 nodemask = node_to_cpumask(next_node);
5561 cpus_or(span, span, nodemask);
5562 }
5563
5564 return span;
5565}
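/*
 * Sketch of the intended use, as done in build_sched_domains() below:
 *
 *	sd->span = sched_domain_node_span(cpu_to_node(i));
 *	cpus_and(sd->span, sd->span, *cpu_map);
 *
 * i.e. the NUMA-node domain of cpu i spans the cpus of (up to)
 * SD_NODES_PER_DOMAIN nearby nodes, clipped to the cpus being set up.
 */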
5566#endif
5567
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005568int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005569
John Hawkes9c1cfda2005-09-06 15:18:14 -07005570/*
Ingo Molnar48f24c42006-07-03 00:25:40 -07005571 * SMT sched-domains:
John Hawkes9c1cfda2005-09-06 15:18:14 -07005572 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005573#ifdef CONFIG_SCHED_SMT
5574static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005575static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005576
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005577static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5578 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005579{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005580 if (sg)
5581 *sg = &per_cpu(sched_group_cpus, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005582 return cpu;
5583}
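/*
 * All cpu_to_*_group() helpers in this file follow the same convention:
 * return the group id that @cpu belongs to at that topology level and,
 * if @sg is non-NULL, also hand back the per-cpu group object.  A small
 * usage sketch (cpu number is illustrative):
 *
 *	struct sched_group *sg;
 *	int grp = cpu_to_cpu_group(3, &cpu_online_map, &sg);
 *	... at the SMT level each cpu is its own group, so grp == 3 ...
 */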
5584#endif
5585
Ingo Molnar48f24c42006-07-03 00:25:40 -07005586/*
5587 * multi-core sched-domains:
5588 */
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005589#ifdef CONFIG_SCHED_MC
5590static DEFINE_PER_CPU(struct sched_domain, core_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005591static DEFINE_PER_CPU(struct sched_group, sched_group_core);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005592#endif
5593
5594#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005595static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5596 struct sched_group **sg)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005597{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005598 int group;
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005599 cpumask_t mask = cpu_sibling_map[cpu];
5600 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005601 group = first_cpu(mask);
5602 if (sg)
5603 *sg = &per_cpu(sched_group_core, group);
5604 return group;
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005605}
5606#elif defined(CONFIG_SCHED_MC)
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005607static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5608 struct sched_group **sg)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005609{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005610 if (sg)
5611 *sg = &per_cpu(sched_group_core, cpu);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005612 return cpu;
5613}
5614#endif
5615
Linus Torvalds1da177e2005-04-16 15:20:36 -07005616static DEFINE_PER_CPU(struct sched_domain, phys_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005617static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005618
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005619static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5620 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005621{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005622 int group;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005623#ifdef CONFIG_SCHED_MC
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005624 cpumask_t mask = cpu_coregroup_map(cpu);
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005625 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005626 group = first_cpu(mask);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005627#elif defined(CONFIG_SCHED_SMT)
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005628 cpumask_t mask = cpu_sibling_map[cpu];
5629 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005630 group = first_cpu(mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005631#else
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005632 group = cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005633#endif
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005634 if (sg)
5635 *sg = &per_cpu(sched_group_phys, group);
5636 return group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005637}
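/*
 * With CONFIG_SCHED_MC, all cpus that share a package (as reported by
 * cpu_coregroup_map()) collapse into one physical-level group,
 * identified by the lowest-numbered such cpu present in *cpu_map.
 * For example (illustrative numbering), if cpus 4-7 share a package,
 * cpu_to_phys_group(6, cpu_map, NULL) returns 4.  Without MC/SMT each
 * cpu is its own physical-level group.
 */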
5638
5639#ifdef CONFIG_NUMA
John Hawkes9c1cfda2005-09-06 15:18:14 -07005640/*
5641 * init_sched_build_groups() can't handle what we want to do with node
5642 * groups, so we roll our own here: each node gets its own, dynamically
5643 * allocated list of groups.
5644 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005645static DEFINE_PER_CPU(struct sched_domain, node_domains);
John Hawkesd1b55132005-09-06 15:18:14 -07005646static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
John Hawkes9c1cfda2005-09-06 15:18:14 -07005647
5648static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005649static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005650
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005651static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5652 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005653{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005654 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5655 int group;
5656
5657 cpus_and(nodemask, nodemask, *cpu_map);
5658 group = first_cpu(nodemask);
5659
5660 if (sg)
5661 *sg = &per_cpu(sched_group_allnodes, group);
5662 return group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005663}
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005664
Siddha, Suresh B08069032006-03-27 01:15:23 -08005665static void init_numa_sched_groups_power(struct sched_group *group_head)
5666{
5667 struct sched_group *sg = group_head;
5668 int j;
5669
5670 if (!sg)
5671 return;
5672next_sg:
5673 for_each_cpu_mask(j, sg->cpumask) {
5674 struct sched_domain *sd;
5675
5676 sd = &per_cpu(phys_domains, j);
5677 if (j != first_cpu(sd->groups->cpumask)) {
5678 /*
5679 * Only add "power" once for each
5680 * physical package.
5681 */
5682 continue;
5683 }
5684
Eric Dumazet5517d862007-05-08 00:32:57 -07005685 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
Siddha, Suresh B08069032006-03-27 01:15:23 -08005686 }
5687 sg = sg->next;
5688 if (sg != group_head)
5689 goto next_sg;
5690}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005691#endif
5692
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005693#ifdef CONFIG_NUMA
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005694/* Free memory allocated for various sched_group structures */
5695static void free_sched_groups(const cpumask_t *cpu_map)
5696{
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005697 int cpu, i;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005698
5699 for_each_cpu_mask(cpu, *cpu_map) {
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005700 struct sched_group **sched_group_nodes
5701 = sched_group_nodes_bycpu[cpu];
5702
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005703 if (!sched_group_nodes)
5704 continue;
5705
5706 for (i = 0; i < MAX_NUMNODES; i++) {
5707 cpumask_t nodemask = node_to_cpumask(i);
5708 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5709
5710 cpus_and(nodemask, nodemask, *cpu_map);
5711 if (cpus_empty(nodemask))
5712 continue;
5713
5714 if (sg == NULL)
5715 continue;
5716 sg = sg->next;
5717next_sg:
5718 oldsg = sg;
5719 sg = sg->next;
5720 kfree(oldsg);
5721 if (oldsg != sched_group_nodes[i])
5722 goto next_sg;
5723 }
5724 kfree(sched_group_nodes);
5725 sched_group_nodes_bycpu[cpu] = NULL;
5726 }
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005727}
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005728#else
5729static void free_sched_groups(const cpumask_t *cpu_map)
5730{
5731}
5732#endif
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005733
Linus Torvalds1da177e2005-04-16 15:20:36 -07005734/*
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005735 * Initialize sched groups cpu_power.
5736 *
5737 * cpu_power indicates the capacity of a sched group, which is used while
5738 * distributing the load between the sched groups in a sched domain.
5739 * Typically the cpu_power of all groups in a sched domain is the same,
5740 * unless the topology is asymmetric.  If it is, a group with more
5741 * cpu_power will pick up more load than a group with less
5742 * cpu_power.
5743 *
5744 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5745 * the maximum number of tasks a group can handle in the presence of other idle
5746 * or lightly loaded groups in the same sched domain.
5747 */
5748static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5749{
5750 struct sched_domain *child;
5751 struct sched_group *group;
5752
5753 WARN_ON(!sd || !sd->groups);
5754
5755 if (cpu != first_cpu(sd->groups->cpumask))
5756 return;
5757
5758 child = sd->child;
5759
Eric Dumazet5517d862007-05-08 00:32:57 -07005760 sd->groups->__cpu_power = 0;
5761
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005762 /*
5763 * For the performance policy, if the groups in the child domain share
5764 * resources (for example cores sharing parts of the cache hierarchy,
5765 * or SMT siblings), then set this domain's group cpu_power such that
5766 * each group can handle only one task while there are other idle
5767 * groups in the same sched domain.
5768 */
5769 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5770 (child->flags &
5771 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
Eric Dumazet5517d862007-05-08 00:32:57 -07005772 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005773 return;
5774 }
5775
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005776 /*
5777 * Add the cpu_power of each child group to this group's cpu_power.
5778 */
5779 group = child->groups;
5780 do {
Eric Dumazet5517d862007-05-08 00:32:57 -07005781 sg_inc_cpu_power(sd->groups, group->__cpu_power);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005782 group = group->next;
5783 } while (group != child->groups);
5784}
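/*
 * Worked example (illustrative): under the default performance policy a
 * domain whose child groups share resources (SMT siblings, or cores
 * sharing the package/cache) keeps its group at a single
 * SCHED_LOAD_SCALE, so the balancer prefers an idle package over
 * stacking a second task on a busy one.  Otherwise the group power is
 * the plain sum of the child groups, e.g. two child groups at
 * SCHED_LOAD_SCALE each give
 *
 *	sd->groups->__cpu_power == 2 * SCHED_LOAD_SCALE
 */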
5785
5786/*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005787 * Build sched domains for a given set of cpus and attach the sched domains
5788 * to the individual cpus
Linus Torvalds1da177e2005-04-16 15:20:36 -07005789 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005790static int build_sched_domains(const cpumask_t *cpu_map)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005791{
5792 int i;
John Hawkesd1b55132005-09-06 15:18:14 -07005793#ifdef CONFIG_NUMA
5794 struct sched_group **sched_group_nodes = NULL;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005795 int sd_allnodes = 0;
John Hawkesd1b55132005-09-06 15:18:14 -07005796
5797 /*
5798 * Allocate the per-node list of sched groups
5799 */
Ingo Molnardd41f592007-07-09 18:51:59 +02005800 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
Srivatsa Vaddagirid3a5aa92006-06-27 02:54:39 -07005801 GFP_KERNEL);
John Hawkesd1b55132005-09-06 15:18:14 -07005802 if (!sched_group_nodes) {
5803 printk(KERN_WARNING "Can not alloc sched group node list\n");
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005804 return -ENOMEM;
John Hawkesd1b55132005-09-06 15:18:14 -07005805 }
5806 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5807#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005808
5809 /*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005810 * Set up domains for cpus specified by the cpu_map.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005811 */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005812 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005813 struct sched_domain *sd = NULL, *p;
5814 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5815
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005816 cpus_and(nodemask, nodemask, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005817
5818#ifdef CONFIG_NUMA
Ingo Molnardd41f592007-07-09 18:51:59 +02005819 if (cpus_weight(*cpu_map) >
5820 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
John Hawkes9c1cfda2005-09-06 15:18:14 -07005821 sd = &per_cpu(allnodes_domains, i);
5822 *sd = SD_ALLNODES_INIT;
5823 sd->span = *cpu_map;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005824 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005825 p = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005826 sd_allnodes = 1;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005827 } else
5828 p = NULL;
5829
Linus Torvalds1da177e2005-04-16 15:20:36 -07005830 sd = &per_cpu(node_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005831 *sd = SD_NODE_INIT;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005832 sd->span = sched_domain_node_span(cpu_to_node(i));
5833 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005834 if (p)
5835 p->child = sd;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005836 cpus_and(sd->span, sd->span, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005837#endif
5838
5839 p = sd;
5840 sd = &per_cpu(phys_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005841 *sd = SD_CPU_INIT;
5842 sd->span = nodemask;
5843 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005844 if (p)
5845 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005846 cpu_to_phys_group(i, cpu_map, &sd->groups);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005847
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005848#ifdef CONFIG_SCHED_MC
5849 p = sd;
5850 sd = &per_cpu(core_domains, i);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005851 *sd = SD_MC_INIT;
5852 sd->span = cpu_coregroup_map(i);
5853 cpus_and(sd->span, sd->span, *cpu_map);
5854 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005855 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005856 cpu_to_core_group(i, cpu_map, &sd->groups);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005857#endif
5858
Linus Torvalds1da177e2005-04-16 15:20:36 -07005859#ifdef CONFIG_SCHED_SMT
5860 p = sd;
5861 sd = &per_cpu(cpu_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005862 *sd = SD_SIBLING_INIT;
5863 sd->span = cpu_sibling_map[i];
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005864 cpus_and(sd->span, sd->span, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005865 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005866 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005867 cpu_to_cpu_group(i, cpu_map, &sd->groups);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005868#endif
5869 }
5870
5871#ifdef CONFIG_SCHED_SMT
5872 /* Set up CPU (sibling) groups */
John Hawkes9c1cfda2005-09-06 15:18:14 -07005873 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005874 cpumask_t this_sibling_map = cpu_sibling_map[i];
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005875 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005876 if (i != first_cpu(this_sibling_map))
5877 continue;
5878
Ingo Molnardd41f592007-07-09 18:51:59 +02005879 init_sched_build_groups(this_sibling_map, cpu_map,
5880 &cpu_to_cpu_group);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005881 }
5882#endif
5883
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005884#ifdef CONFIG_SCHED_MC
5885 /* Set up multi-core groups */
5886 for_each_cpu_mask(i, *cpu_map) {
5887 cpumask_t this_core_map = cpu_coregroup_map(i);
5888 cpus_and(this_core_map, this_core_map, *cpu_map);
5889 if (i != first_cpu(this_core_map))
5890 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02005891 init_sched_build_groups(this_core_map, cpu_map,
5892 &cpu_to_core_group);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005893 }
5894#endif
5895
Linus Torvalds1da177e2005-04-16 15:20:36 -07005896 /* Set up physical groups */
5897 for (i = 0; i < MAX_NUMNODES; i++) {
5898 cpumask_t nodemask = node_to_cpumask(i);
5899
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005900 cpus_and(nodemask, nodemask, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005901 if (cpus_empty(nodemask))
5902 continue;
5903
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005904 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005905 }
5906
5907#ifdef CONFIG_NUMA
5908 /* Set up node groups */
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005909 if (sd_allnodes)
Ingo Molnardd41f592007-07-09 18:51:59 +02005910 init_sched_build_groups(*cpu_map, cpu_map,
5911 &cpu_to_allnodes_group);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005912
5913 for (i = 0; i < MAX_NUMNODES; i++) {
5914 /* Set up node groups */
5915 struct sched_group *sg, *prev;
5916 cpumask_t nodemask = node_to_cpumask(i);
5917 cpumask_t domainspan;
5918 cpumask_t covered = CPU_MASK_NONE;
5919 int j;
5920
5921 cpus_and(nodemask, nodemask, *cpu_map);
John Hawkesd1b55132005-09-06 15:18:14 -07005922 if (cpus_empty(nodemask)) {
5923 sched_group_nodes[i] = NULL;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005924 continue;
John Hawkesd1b55132005-09-06 15:18:14 -07005925 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07005926
5927 domainspan = sched_domain_node_span(i);
5928 cpus_and(domainspan, domainspan, *cpu_map);
5929
Srivatsa Vaddagiri15f0b672006-06-27 02:54:40 -07005930 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005931 if (!sg) {
5932 printk(KERN_WARNING "Can not alloc domain group for "
5933 "node %d\n", i);
5934 goto error;
5935 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07005936 sched_group_nodes[i] = sg;
5937 for_each_cpu_mask(j, nodemask) {
5938 struct sched_domain *sd;
5939 sd = &per_cpu(node_domains, j);
5940 sd->groups = sg;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005941 }
Eric Dumazet5517d862007-05-08 00:32:57 -07005942 sg->__cpu_power = 0;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005943 sg->cpumask = nodemask;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005944 sg->next = sg;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005945 cpus_or(covered, covered, nodemask);
5946 prev = sg;
5947
5948 for (j = 0; j < MAX_NUMNODES; j++) {
5949 cpumask_t tmp, notcovered;
5950 int n = (i + j) % MAX_NUMNODES;
5951
5952 cpus_complement(notcovered, covered);
5953 cpus_and(tmp, notcovered, *cpu_map);
5954 cpus_and(tmp, tmp, domainspan);
5955 if (cpus_empty(tmp))
5956 break;
5957
5958 nodemask = node_to_cpumask(n);
5959 cpus_and(tmp, tmp, nodemask);
5960 if (cpus_empty(tmp))
5961 continue;
5962
Srivatsa Vaddagiri15f0b672006-06-27 02:54:40 -07005963 sg = kmalloc_node(sizeof(struct sched_group),
5964 GFP_KERNEL, i);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005965 if (!sg) {
5966 printk(KERN_WARNING
5967 "Can not alloc domain group for node %d\n", j);
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005968 goto error;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005969 }
Eric Dumazet5517d862007-05-08 00:32:57 -07005970 sg->__cpu_power = 0;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005971 sg->cpumask = tmp;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005972 sg->next = prev->next;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005973 cpus_or(covered, covered, tmp);
5974 prev->next = sg;
5975 prev = sg;
5976 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07005977 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005978#endif
5979
5980 /* Calculate CPU power for physical packages and nodes */
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005981#ifdef CONFIG_SCHED_SMT
5982 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02005983 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5984
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005985 init_sched_groups_power(i, sd);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005986 }
5987#endif
5988#ifdef CONFIG_SCHED_MC
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005989 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02005990 struct sched_domain *sd = &per_cpu(core_domains, i);
5991
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005992 init_sched_groups_power(i, sd);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005993 }
5994#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005995
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005996 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02005997 struct sched_domain *sd = &per_cpu(phys_domains, i);
5998
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005999 init_sched_groups_power(i, sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006000 }
6001
John Hawkes9c1cfda2005-09-06 15:18:14 -07006002#ifdef CONFIG_NUMA
Siddha, Suresh B08069032006-03-27 01:15:23 -08006003 for (i = 0; i < MAX_NUMNODES; i++)
6004 init_numa_sched_groups_power(sched_group_nodes[i]);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006005
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08006006 if (sd_allnodes) {
6007 struct sched_group *sg;
Siddha, Suresh Bf712c0c72006-07-30 03:02:59 -07006008
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08006009 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
Siddha, Suresh Bf712c0c72006-07-30 03:02:59 -07006010 init_numa_sched_groups_power(sg);
6011 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07006012#endif
6013
Linus Torvalds1da177e2005-04-16 15:20:36 -07006014 /* Attach the domains */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006015 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006016 struct sched_domain *sd;
6017#ifdef CONFIG_SCHED_SMT
6018 sd = &per_cpu(cpu_domains, i);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08006019#elif defined(CONFIG_SCHED_MC)
6020 sd = &per_cpu(core_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006021#else
6022 sd = &per_cpu(phys_domains, i);
6023#endif
6024 cpu_attach_domain(sd, i);
6025 }
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006026
6027 return 0;
6028
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07006029#ifdef CONFIG_NUMA
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006030error:
6031 free_sched_groups(cpu_map);
6032 return -ENOMEM;
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07006033#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006034}
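/*
 * Resulting per-cpu hierarchy (arrows point at ->parent); levels that
 * are not configured simply drop out of the chain, and the lowest
 * configured level is the one attached to the cpu:
 *
 *	cpu_domains (SMT) -> core_domains (MC) -> phys_domains
 *	    -> node_domains -> allnodes_domains
 *	       (CONFIG_NUMA; allnodes only on sufficiently large maps)
 */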
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006035/*
6036 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6037 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006038static int arch_init_sched_domains(const cpumask_t *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006039{
6040 cpumask_t cpu_default_map;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006041 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006042
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006043 /*
6044 * Set up the mask of cpus without special-case scheduling requirements.
6045 * For now this just excludes isolated cpus, but could be used to
6046 * exclude other special cases in the future.
6047 */
6048 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6049
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006050 err = build_sched_domains(&cpu_default_map);
6051
6052 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006053}
6054
6055static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006056{
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006057 free_sched_groups(cpu_map);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006058}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006059
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006060/*
6061 * Detach sched domains from the group of cpus specified in cpu_map.
6062 * These cpus will now be attached to the NULL domain.
6063 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08006064static void detach_destroy_domains(const cpumask_t *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006065{
6066 int i;
6067
6068 for_each_cpu_mask(i, *cpu_map)
6069 cpu_attach_domain(NULL, i);
6070 synchronize_sched();
6071 arch_destroy_sched_domains(cpu_map);
6072}
6073
6074/*
6075 * Partition sched domains as specified by the cpumasks below.
6076 * This attaches all cpus from the cpumasks to the NULL domain,
6077 * waits for an RCU quiescent period, recalculates the sched
6078 * domain information and then attaches the cpus back to the
6079 * correct sched domains.
6080 * Call with the hotplug lock held.
6081 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006082int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006083{
6084 cpumask_t change_map;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006085 int err = 0;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006086
6087 cpus_and(*partition1, *partition1, cpu_online_map);
6088 cpus_and(*partition2, *partition2, cpu_online_map);
6089 cpus_or(change_map, *partition1, *partition2);
6090
6091 /* Detach sched domains from all of the affected cpus */
6092 detach_destroy_domains(&change_map);
6093 if (!cpus_empty(*partition1))
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006094 err = build_sched_domains(partition1);
6095 if (!err && !cpus_empty(*partition2))
6096 err = build_sched_domains(partition2);
6097
6098 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006099}
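/*
 * Typical caller: the cpuset code, which hands in two exclusive cpu
 * sets so that each partition gets its own, independently balanced
 * sched-domain hierarchy.
 */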
6100
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006101#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6102int arch_reinit_sched_domains(void)
6103{
6104 int err;
6105
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006106 mutex_lock(&sched_hotcpu_mutex);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006107 detach_destroy_domains(&cpu_online_map);
6108 err = arch_init_sched_domains(&cpu_online_map);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006109 mutex_unlock(&sched_hotcpu_mutex);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006110
6111 return err;
6112}
6113
6114static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6115{
6116 int ret;
6117
6118 if (buf[0] != '0' && buf[0] != '1')
6119 return -EINVAL;
6120
6121 if (smt)
6122 sched_smt_power_savings = (buf[0] == '1');
6123 else
6124 sched_mc_power_savings = (buf[0] == '1');
6125
6126 ret = arch_reinit_sched_domains();
6127
6128 return ret ? ret : count;
6129}
6130
6131int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6132{
6133 int err = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07006134
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006135#ifdef CONFIG_SCHED_SMT
6136 if (smt_capable())
6137 err = sysfs_create_file(&cls->kset.kobj,
6138 &attr_sched_smt_power_savings.attr);
6139#endif
6140#ifdef CONFIG_SCHED_MC
6141 if (!err && mc_capable())
6142 err = sysfs_create_file(&cls->kset.kobj,
6143 &attr_sched_mc_power_savings.attr);
6144#endif
6145 return err;
6146}
6147#endif
6148
6149#ifdef CONFIG_SCHED_MC
6150static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6151{
6152 return sprintf(page, "%u\n", sched_mc_power_savings);
6153}
Ingo Molnar48f24c42006-07-03 00:25:40 -07006154static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6155 const char *buf, size_t count)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006156{
6157 return sched_power_savings_store(buf, count, 0);
6158}
6159SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6160 sched_mc_power_savings_store);
6161#endif
6162
6163#ifdef CONFIG_SCHED_SMT
6164static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6165{
6166 return sprintf(page, "%u\n", sched_smt_power_savings);
6167}
Ingo Molnar48f24c42006-07-03 00:25:40 -07006168static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6169 const char *buf, size_t count)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006170{
6171 return sched_power_savings_store(buf, count, 1);
6172}
6173SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6174 sched_smt_power_savings_store);
6175#endif
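/*
 * The attributes above are registered by
 * sched_create_sysfs_power_savings_entries(); with the cpu sysdev
 * class this typically shows up as
 *
 *	/sys/devices/system/cpu/sched_mc_power_savings
 *	/sys/devices/system/cpu/sched_smt_power_savings
 *
 * Writing '0' or '1' (e.g. "echo 1 > .../sched_mc_power_savings")
 * updates the corresponding policy variable and rebuilds the sched
 * domains via arch_reinit_sched_domains().
 */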
6176
Linus Torvalds1da177e2005-04-16 15:20:36 -07006177/*
6178 * Force a reinitialization of the sched domains hierarchy. The domains
6179 * and groups cannot be updated in place without racing with the balancing
Nick Piggin41c7ce92005-06-25 14:57:24 -07006180 * code, so we temporarily attach all running cpus to the NULL domain
Linus Torvalds1da177e2005-04-16 15:20:36 -07006181 * which will prevent rebalancing while the sched domains are recalculated.
6182 */
6183static int update_sched_domains(struct notifier_block *nfb,
6184 unsigned long action, void *hcpu)
6185{
Linus Torvalds1da177e2005-04-16 15:20:36 -07006186 switch (action) {
6187 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006188 case CPU_UP_PREPARE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006189 case CPU_DOWN_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006190 case CPU_DOWN_PREPARE_FROZEN:
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006191 detach_destroy_domains(&cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006192 return NOTIFY_OK;
6193
6194 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006195 case CPU_UP_CANCELED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006196 case CPU_DOWN_FAILED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006197 case CPU_DOWN_FAILED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006198 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006199 case CPU_ONLINE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006200 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006201 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006202 /*
6203 * Fall through and re-initialise the domains.
6204 */
6205 break;
6206 default:
6207 return NOTIFY_DONE;
6208 }
6209
6210 /* The hotplug lock is already held by cpu_up/cpu_down */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006211 arch_init_sched_domains(&cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006212
6213 return NOTIFY_OK;
6214}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006215
6216void __init sched_init_smp(void)
6217{
Nick Piggin5c1e1762006-10-03 01:14:04 -07006218 cpumask_t non_isolated_cpus;
6219
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006220 mutex_lock(&sched_hotcpu_mutex);
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006221 arch_init_sched_domains(&cpu_online_map);
Nathan Lynche5e56732007-01-10 23:15:28 -08006222 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
Nick Piggin5c1e1762006-10-03 01:14:04 -07006223 if (cpus_empty(non_isolated_cpus))
6224 cpu_set(smp_processor_id(), non_isolated_cpus);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006225 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006226 /* XXX: Theoretical race here - CPU may be hotplugged now */
6227 hotcpu_notifier(update_sched_domains, 0);
Nick Piggin5c1e1762006-10-03 01:14:04 -07006228
6229 /* Move init over to a non-isolated CPU */
6230 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6231 BUG();
Ingo Molnardd41f592007-07-09 18:51:59 +02006232 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006233}
6234#else
6235void __init sched_init_smp(void)
6236{
Ingo Molnardd41f592007-07-09 18:51:59 +02006237 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006238}
6239#endif /* CONFIG_SMP */
6240
6241int in_sched_functions(unsigned long addr)
6242{
6243 /* Linker adds these: start and end of __sched functions */
6244 extern char __sched_text_start[], __sched_text_end[];
Ingo Molnar48f24c42006-07-03 00:25:40 -07006245
Linus Torvalds1da177e2005-04-16 15:20:36 -07006246 return in_lock_functions(addr) ||
6247 (addr >= (unsigned long)__sched_text_start
6248 && addr < (unsigned long)__sched_text_end);
6249}
6250
Ingo Molnardd41f592007-07-09 18:51:59 +02006251static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6252{
6253 cfs_rq->tasks_timeline = RB_ROOT;
6254 cfs_rq->fair_clock = 1;
6255#ifdef CONFIG_FAIR_GROUP_SCHED
6256 cfs_rq->rq = rq;
6257#endif
6258}
6259
Linus Torvalds1da177e2005-04-16 15:20:36 -07006260void __init sched_init(void)
6261{
Ingo Molnardd41f592007-07-09 18:51:59 +02006262 u64 now = sched_clock();
Christoph Lameter476f3532007-05-06 14:48:58 -07006263 int highest_cpu = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006264 int i, j;
6265
6266 /*
6267 * Link up the scheduling class hierarchy:
6268 */
6269 rt_sched_class.next = &fair_sched_class;
6270 fair_sched_class.next = &idle_sched_class;
6271 idle_sched_class.next = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006272
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08006273 for_each_possible_cpu(i) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006274 struct rt_prio_array *array;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006275 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006276
6277 rq = cpu_rq(i);
6278 spin_lock_init(&rq->lock);
Ingo Molnarfcb99372006-07-03 00:25:10 -07006279 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
Nick Piggin78979862005-06-25 14:57:13 -07006280 rq->nr_running = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006281 rq->clock = 1;
6282 init_cfs_rq(&rq->cfs, rq);
6283#ifdef CONFIG_FAIR_GROUP_SCHED
6284 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6285 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6286#endif
6287 rq->ls.load_update_last = now;
6288 rq->ls.load_update_start = now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006289
Ingo Molnardd41f592007-07-09 18:51:59 +02006290 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6291 rq->cpu_load[j] = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006292#ifdef CONFIG_SMP
Nick Piggin41c7ce92005-06-25 14:57:24 -07006293 rq->sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006294 rq->active_balance = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006295 rq->next_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006296 rq->push_cpu = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07006297 rq->cpu = i;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006298 rq->migration_thread = NULL;
6299 INIT_LIST_HEAD(&rq->migration_queue);
6300#endif
6301 atomic_set(&rq->nr_iowait, 0);
6302
Ingo Molnardd41f592007-07-09 18:51:59 +02006303 array = &rq->rt.active;
6304 for (j = 0; j < MAX_RT_PRIO; j++) {
6305 INIT_LIST_HEAD(array->queue + j);
6306 __clear_bit(j, array->bitmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006307 }
Christoph Lameter476f3532007-05-06 14:48:58 -07006308 highest_cpu = i;
Ingo Molnardd41f592007-07-09 18:51:59 +02006309 /* delimiter for bitsearch: */
6310 __set_bit(MAX_RT_PRIO, array->bitmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006311 }
6312
Peter Williams2dd73a42006-06-27 02:54:34 -07006313 set_load_weight(&init_task);
Heiko Carstensb50f60c2006-07-30 03:03:52 -07006314
Christoph Lameterc9819f42006-12-10 02:20:25 -08006315#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07006316 nr_cpu_ids = highest_cpu + 1;
Christoph Lameterc9819f42006-12-10 02:20:25 -08006317 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6318#endif
6319
Heiko Carstensb50f60c2006-07-30 03:03:52 -07006320#ifdef CONFIG_RT_MUTEXES
6321 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6322#endif
6323
Linus Torvalds1da177e2005-04-16 15:20:36 -07006324 /*
6325 * The boot idle thread does lazy MMU switching as well:
6326 */
6327 atomic_inc(&init_mm.mm_count);
6328 enter_lazy_tlb(&init_mm, current);
6329
6330 /*
6331 * Make us the idle thread. Technically, schedule() should not be
6332 * called from this thread; however, somewhere below it might be.
6333 * Because we are the idle thread, we just pick up running again
6334 * when this runqueue becomes "idle".
6335 */
6336 init_idle(current, smp_processor_id());
Ingo Molnardd41f592007-07-09 18:51:59 +02006337 /*
6338 * During early bootup we pretend to be a normal task:
6339 */
6340 current->sched_class = &fair_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006341}
6342
6343#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6344void __might_sleep(char *file, int line)
6345{
Ingo Molnar48f24c42006-07-03 00:25:40 -07006346#ifdef in_atomic
Linus Torvalds1da177e2005-04-16 15:20:36 -07006347 static unsigned long prev_jiffy; /* ratelimiting */
6348
6349 if ((in_atomic() || irqs_disabled()) &&
6350 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6351 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6352 return;
6353 prev_jiffy = jiffies;
Ingo Molnar91368d72006-03-23 03:00:54 -08006354 printk(KERN_ERR "BUG: sleeping function called from invalid"
Linus Torvalds1da177e2005-04-16 15:20:36 -07006355 " context at %s:%d\n", file, line);
6356 printk("in_atomic():%d, irqs_disabled():%d\n",
6357 in_atomic(), irqs_disabled());
Peter Zijlstraa4c410f2006-12-06 20:37:21 -08006358 debug_show_held_locks(current);
Ingo Molnar3117df02006-12-13 00:34:43 -08006359 if (irqs_disabled())
6360 print_irqtrace_events(current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006361 dump_stack();
6362 }
6363#endif
6364}
6365EXPORT_SYMBOL(__might_sleep);
6366#endif
6367
6368#ifdef CONFIG_MAGIC_SYSRQ
6369void normalize_rt_tasks(void)
6370{
Ingo Molnara0f98a12007-06-17 18:37:45 +02006371 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006372 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006373 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02006374 int on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006375
6376 read_lock_irq(&tasklist_lock);
Ingo Molnara0f98a12007-06-17 18:37:45 +02006377 do_each_thread(g, p) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006378 p->se.fair_key = 0;
6379 p->se.wait_runtime = 0;
6380 p->se.wait_start_fair = 0;
6381 p->se.wait_start = 0;
6382 p->se.exec_start = 0;
6383 p->se.sleep_start = 0;
6384 p->se.sleep_start_fair = 0;
6385 p->se.block_start = 0;
6386 task_rq(p)->cfs.fair_clock = 0;
6387 task_rq(p)->clock = 0;
6388
6389 if (!rt_task(p)) {
6390 /*
6391 * Renice negative nice level userspace
6392 * tasks back to 0:
6393 */
6394 if (TASK_NICE(p) < 0 && p->mm)
6395 set_user_nice(p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006396 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02006397 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006398
Ingo Molnarb29739f2006-06-27 02:54:51 -07006399 spin_lock_irqsave(&p->pi_lock, flags);
6400 rq = __task_rq_lock(p);
Ingo Molnardd41f592007-07-09 18:51:59 +02006401#ifdef CONFIG_SMP
6402 /*
6403 * Do not touch the migration thread:
6404 */
6405 if (p == rq->migration_thread)
6406 goto out_unlock;
6407#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006408
Ingo Molnardd41f592007-07-09 18:51:59 +02006409 on_rq = p->se.on_rq;
6410 if (on_rq)
6411 deactivate_task(task_rq(p), p, 0);
6412 __setscheduler(rq, p, SCHED_NORMAL, 0);
6413 if (on_rq) {
6414 activate_task(task_rq(p), p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006415 resched_task(rq->curr);
6416 }
Ingo Molnardd41f592007-07-09 18:51:59 +02006417#ifdef CONFIG_SMP
6418 out_unlock:
6419#endif
Ingo Molnarb29739f2006-06-27 02:54:51 -07006420 __task_rq_unlock(rq);
6421 spin_unlock_irqrestore(&p->pi_lock, flags);
Ingo Molnara0f98a12007-06-17 18:37:45 +02006422 } while_each_thread(g, p);
6423
Linus Torvalds1da177e2005-04-16 15:20:36 -07006424 read_unlock_irq(&tasklist_lock);
6425}
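/*
 * normalize_rt_tasks() is reached via the magic SysRq "nice all RT
 * tasks" key.  It demotes every realtime task to SCHED_NORMAL and, in
 * this tree, also clears the per-task fair-scheduling statistics so the
 * tasks re-enter the fair class with a clean slate.
 */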
6426
6427#endif /* CONFIG_MAGIC_SYSRQ */
Linus Torvalds1df5c102005-09-12 07:59:21 -07006428
6429#ifdef CONFIG_IA64
6430/*
6431 * These functions are only useful for the IA64 MCA handling.
6432 *
6433 * They can only be called when the whole system has been
6434 * stopped - every CPU needs to be quiescent, and no scheduling
6435 * activity can take place. Using them for anything else would
6436 * be a serious bug, and as a result, they aren't even visible
6437 * under any other configuration.
6438 */
6439
6440/**
6441 * curr_task - return the current task for a given cpu.
6442 * @cpu: the processor in question.
6443 *
6444 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6445 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07006446struct task_struct *curr_task(int cpu)
Linus Torvalds1df5c102005-09-12 07:59:21 -07006447{
6448 return cpu_curr(cpu);
6449}
6450
6451/**
6452 * set_curr_task - set the current task for a given cpu.
6453 * @cpu: the processor in question.
6454 * @p: the task pointer to set.
6455 *
6456 * Description: This function must only be used when non-maskable interrupts
6457 * are serviced on a separate stack. It allows the architecture to switch the
6458 * notion of the current task on a cpu in a non-blocking manner. This function
6459 * must be called with all CPUs synchronized and interrupts disabled; the
6460 * caller must save the original value of the current task (see
6461 * curr_task() above) and restore that value before reenabling interrupts and
6462 * re-starting the system.
6463 *
6464 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6465 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07006466void set_curr_task(int cpu, struct task_struct *p)
Linus Torvalds1df5c102005-09-12 07:59:21 -07006467{
6468 cpu_curr(cpu) = p;
6469}
6470
6471#endif