2005-04-16 18:20:36 -04:00
|
|
|
#ifndef _LINUX_SCHED_H
|
|
|
|
#define _LINUX_SCHED_H
|
|
|
|
|
2006-04-26 19:12:56 -04:00
|
|
|
#include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* cloning flags:
|
|
|
|
*/
|
|
|
|
#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
|
|
|
|
#define CLONE_VM 0x00000100 /* set if VM shared between processes */
|
|
|
|
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
|
|
|
|
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
|
|
|
|
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
|
|
|
|
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
|
|
|
|
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
|
|
|
|
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
|
|
|
|
#define CLONE_THREAD 0x00010000 /* Same thread group? */
|
|
|
|
#define CLONE_NEWNS 0x00020000 /* New namespace group? */
|
|
|
|
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
|
|
|
|
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
|
|
|
|
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
|
|
|
|
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
|
|
|
|
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
|
|
|
|
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
|
|
|
|
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
|
|
|
|
#define CLONE_STOPPED 0x02000000 /* Start in stopped state */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Scheduling policies
|
|
|
|
*/
|
|
|
|
#define SCHED_NORMAL 0
|
|
|
|
#define SCHED_FIFO 1
|
|
|
|
#define SCHED_RR 2
|
|
|
|
#define SCHED_BATCH 3
|
|
|
|
|
2006-04-25 09:54:40 -04:00
|
|
|
#ifdef __KERNEL__
|
2006-04-26 19:12:56 -04:00
|
|
|
|
|
|
|
/*
 * Scheduling parameters. The only member is the integer priority value.
 * NOTE(review): presumably paired with the SCHED_* policy values defined
 * earlier in this header when setting a task's scheduling class -- confirm
 * against the callers.
 */
struct sched_param {
|
|
|
|
int sched_priority;
|
|
|
|
};
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
#include <asm/param.h> /* for HZ */
|
|
|
|
|
|
|
|
#include <linux/capability.h>
|
|
|
|
#include <linux/threads.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/timex.h>
|
|
|
|
#include <linux/jiffies.h>
|
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/thread_info.h>
|
|
|
|
#include <linux/cpumask.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/nodemask.h>
|
|
|
|
|
|
|
|
#include <asm/system.h>
|
|
|
|
#include <asm/semaphore.h>
|
|
|
|
#include <asm/page.h>
|
|
|
|
#include <asm/ptrace.h>
|
|
|
|
#include <asm/mmu.h>
|
|
|
|
#include <asm/cputime.h>
|
|
|
|
|
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/sem.h>
|
|
|
|
#include <linux/signal.h>
|
|
|
|
#include <linux/securebits.h>
|
|
|
|
#include <linux/fs_struct.h>
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <linux/completion.h>
|
|
|
|
#include <linux/pid.h>
|
|
|
|
#include <linux/percpu.h>
|
|
|
|
#include <linux/topology.h>
|
|
|
|
#include <linux/seccomp.h>
|
2006-01-08 04:01:37 -05:00
|
|
|
#include <linux/rcupdate.h>
|
2006-03-27 04:16:22 -05:00
|
|
|
#include <linux/futex.h>
|
2006-06-27 05:54:53 -04:00
|
|
|
#include <linux/rtmutex.h>
|
2005-04-16 18:20:36 -04:00
|
|
|
|
2006-04-25 09:54:40 -04:00
|
|
|
#include <linux/time.h>
|
|
|
|
#include <linux/param.h>
|
|
|
|
#include <linux/resource.h>
|
|
|
|
#include <linux/timer.h>
|
|
|
|
#include <linux/hrtimer.h>
|
|
|
|
|
|
|
|
#include <asm/processor.h>
|
2005-09-06 18:16:49 -04:00
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
struct exec_domain;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* List of flags we want to share for kernel threads,
|
|
|
|
* if only because they are not used by them anyway.
|
|
|
|
*/
|
|
|
|
#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These are the constants used to fake the fixed-point load-average
|
|
|
|
* counting. Some notes:
|
|
|
|
* - 11 bit fractions expand to 22 bits by the multiplies: this gives
|
|
|
|
* a load-average precision of 10 bits integer + 11 bits fractional
|
|
|
|
* - if you want to count load-averages more often, you need more
|
|
|
|
* precision, or rounding will get you. With 2-second counting freq,
|
|
|
|
* the EXP_n values would be 1981, 2034 and 2043 if still using only
|
|
|
|
* 11 bit fractions.
|
|
|
|
*/
|
|
|
|
extern unsigned long avenrun[]; /* Load averages */
|
|
|
|
|
|
|
|
#define FSHIFT		11		/* nr of bits of precision */
#define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
#define LOAD_FREQ	(5*HZ)		/* 5 sec intervals */
#define EXP_1		1884		/* 1/exp(5sec/1min) as fixed-point */
#define EXP_5		2014		/* 1/exp(5sec/5min) */
#define EXP_15		2037		/* 1/exp(5sec/15min) */

/*
 * CALC_LOAD - fold one sample into a fixed-point running average.
 * @load: modifiable lvalue holding the average; updated in place
 * @exp:  fixed-point decay factor (one of the EXP_n values)
 * @n:    new sample, in the same fixed-point format as @load
 *
 * Computes load = (load * exp + n * (FIXED_1 - exp)) >> FSHIFT.
 *
 * The body is wrapped in do { } while (0) so the macro acts as a single
 * statement (safe under an unbraced if/else), and every argument is
 * parenthesized so expressions such as "a + b" may be passed for @n.
 * Callers must supply the trailing semicolon. @load is evaluated three
 * times and @exp twice, so neither should have side effects.
 */
#define CALC_LOAD(load,exp,n) \
do { \
	(load) *= (exp); \
	(load) += (n)*(FIXED_1-(exp)); \
	(load) >>= FSHIFT; \
} while (0)
|
|
|
|
|
|
|
|
extern unsigned long total_forks;
|
|
|
|
extern int nr_threads;
|
|
|
|
extern int last_pid;
|
|
|
|
DECLARE_PER_CPU(unsigned long, process_counts);
|
|
|
|
extern int nr_processes(void);
|
|
|
|
extern unsigned long nr_running(void);
|
|
|
|
extern unsigned long nr_uninterruptible(void);
|
2006-03-31 05:31:21 -05:00
|
|
|
extern unsigned long nr_active(void);
|
2005-04-16 18:20:36 -04:00
|
|
|
extern unsigned long nr_iowait(void);
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 05:54:34 -04:00
|
|
|
extern unsigned long weighted_cpuload(const int cpu);
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
|
2005-09-29 18:18:21 -04:00
|
|
|
/*
|
|
|
|
* Task state bitmask. NOTE! These bits are also
|
|
|
|
* encoded in fs/proc/array.c: get_task_state().
|
|
|
|
*
|
|
|
|
* We have two separate sets of flags: task->state
|
|
|
|
* is about runnability, while task->exit_state are
|
|
|
|
* about the task exiting. Confusing, but this way
|
|
|
|
* modifying one set can't modify the other one by
|
|
|
|
* mistake.
|
|
|
|
*/
|
2005-04-16 18:20:36 -04:00
|
|
|
#define TASK_RUNNING 0
|
|
|
|
#define TASK_INTERRUPTIBLE 1
|
|
|
|
#define TASK_UNINTERRUPTIBLE 2
|
2005-09-29 18:18:21 -04:00
|
|
|
#define TASK_STOPPED 4
|
|
|
|
#define TASK_TRACED 8
|
|
|
|
/* in tsk->exit_state */
|
|
|
|
#define EXIT_ZOMBIE 16
|
|
|
|
#define EXIT_DEAD 32
|
|
|
|
/* in tsk->state again */
|
|
|
|
#define TASK_NONINTERACTIVE 64
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
/*
 * Store state_value into tsk->state.
 *
 * __set_task_state() is a plain assignment wrapped in do { } while (0).
 * set_task_state() performs the store through set_mb() instead.
 * NOTE(review): set_mb() is not defined in this header; presumably it is
 * an assignment followed by a memory barrier -- confirm in asm/system.h.
 */
#define __set_task_state(tsk, state_value) \
|
|
|
|
do { (tsk)->state = (state_value); } while (0)
|
|
|
|
#define set_task_state(tsk, state_value) \
|
|
|
|
set_mb((tsk)->state, (state_value))
|
|
|
|
|
2005-09-13 04:25:14 -04:00
|
|
|
/*
|
|
|
|
* set_current_state() includes a barrier so that the write of current->state
|
|
|
|
* is correctly serialised wrt the caller's subsequent test of whether to
|
|
|
|
* actually sleep:
|
|
|
|
*
|
|
|
|
* set_current_state(TASK_UNINTERRUPTIBLE);
|
|
|
|
* if (do_i_need_to_sleep())
|
|
|
|
* schedule();
|
|
|
|
*
|
|
|
|
* If the caller does not need such serialisation then use __set_current_state()
|
|
|
|
*/
|
2005-04-16 18:20:36 -04:00
|
|
|
#define __set_current_state(state_value) \
|
|
|
|
do { current->state = (state_value); } while (0)
|
|
|
|
#define set_current_state(state_value) \
|
|
|
|
set_mb(current->state, (state_value))
|
|
|
|
|
|
|
|
/* Task command name length */
|
|
|
|
#define TASK_COMM_LEN 16
|
|
|
|
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This serializes "schedule()" and also protects
|
|
|
|
* the run-queue from deletions/modifications (but
|
|
|
|
* _adding_ to the beginning of the run-queue has
|
|
|
|
* a separate lock).
|
|
|
|
*/
|
|
|
|
extern rwlock_t tasklist_lock;
|
|
|
|
extern spinlock_t mmlist_lock;
|
|
|
|
|
|
|
|
typedef struct task_struct task_t;
|
|
|
|
|
|
|
|
extern void sched_init(void);
|
|
|
|
extern void sched_init_smp(void);
|
|
|
|
extern void init_idle(task_t *idle, int cpu);
|
|
|
|
|
|
|
|
extern cpumask_t nohz_cpu_mask;
|
|
|
|
|
|
|
|
extern void show_state(void);
|
|
|
|
extern void show_regs(struct pt_regs *);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TASK is a pointer to the task whose backtrace we want to see (or NULL for current
|
|
|
|
* task), SP is the stack pointer of the first frame that should be shown in the back
|
|
|
|
* trace (or NULL if the entire call-chain of the task should be shown).
|
|
|
|
*/
|
|
|
|
extern void show_stack(struct task_struct *task, unsigned long *sp);
|
|
|
|
|
|
|
|
void io_schedule(void);
|
|
|
|
long io_schedule_timeout(long timeout);
|
|
|
|
|
|
|
|
extern void cpu_init (void);
|
|
|
|
extern void trap_init(void);
|
|
|
|
extern void update_process_times(int user);
|
|
|
|
extern void scheduler_tick(void);
|
|
|
|
|
2005-09-06 18:16:27 -04:00
|
|
|
#ifdef CONFIG_DETECT_SOFTLOCKUP
|
2006-03-24 06:18:41 -05:00
|
|
|
extern void softlockup_tick(void);
|
2005-09-06 18:16:27 -04:00
|
|
|
extern void spawn_softlockup_task(void);
|
|
|
|
extern void touch_softlockup_watchdog(void);
|
|
|
|
#else
|
2006-03-24 06:18:41 -05:00
|
|
|
/*
 * CONFIG_DETECT_SOFTLOCKUP is not defined: the three softlockup hooks are
 * provided as empty inline functions, so calls to them compile and do
 * nothing.
 */
static inline void softlockup_tick(void)
|
2005-09-06 18:16:27 -04:00
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void spawn_softlockup_task(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void touch_softlockup_watchdog(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
/* Attach to any functions which should be ignored in wchan output. */
|
|
|
|
#define __sched __attribute__((__section__(".sched.text")))
|
|
|
|
/* Is this address in the __sched functions? */
|
|
|
|
extern int in_sched_functions(unsigned long addr);
|
|
|
|
|
|
|
|
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
|
|
|
|
extern signed long FASTCALL(schedule_timeout(signed long timeout));
|
2005-09-10 03:27:21 -04:00
|
|
|
extern signed long schedule_timeout_interruptible(signed long timeout);
|
|
|
|
extern signed long schedule_timeout_uninterruptible(signed long timeout);
|
2005-04-16 18:20:36 -04:00
|
|
|
asmlinkage void schedule(void);
|
|
|
|
|
|
|
|
struct namespace;
|
|
|
|
|
|
|
|
/* Maximum number of active map areas. This is a random (large) number */
|
|
|
|
#define DEFAULT_MAX_MAP_COUNT 65536
|
|
|
|
|
|
|
|
extern int sysctl_max_map_count;
|
|
|
|
|
|
|
|
#include <linux/aio.h>
|
|
|
|
|
|
|
|
extern unsigned long
|
|
|
|
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
|
|
|
|
unsigned long, unsigned long);
|
|
|
|
extern unsigned long
|
|
|
|
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff,
|
|
|
|
unsigned long flags);
|
2005-06-21 20:14:49 -04:00
|
|
|
extern void arch_unmap_area(struct mm_struct *, unsigned long);
|
|
|
|
extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
|
2005-04-16 18:20:36 -04:00
|
|
|
|
2005-10-29 21:16:41 -04:00
|
|
|
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
|
|
|
|
/*
|
|
|
|
* The mm counters are not protected by its page_table_lock,
|
|
|
|
* so must be incremented atomically.
|
|
|
|
*/
|
2006-01-06 03:11:20 -05:00
|
|
|
#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
|
|
|
|
#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
|
|
|
|
#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
|
|
|
|
#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
|
|
|
|
#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
|
|
|
|
typedef atomic_long_t mm_counter_t;
|
2005-10-29 21:16:41 -04:00
|
|
|
|
|
|
|
#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
|
|
|
|
/*
|
|
|
|
* The mm counters are protected by its page_table_lock,
|
|
|
|
* so can be incremented directly.
|
|
|
|
*/
|
2005-04-16 18:20:36 -04:00
|
|
|
/*
 * Non-atomic mm counter accessors: each operates directly on the
 * (mm)->_<member> field.
 *
 * @mm:     pointer to the structure holding the counter
 * @member: counter name without the leading underscore (e.g. file_rss)
 * @value:  value to store or add
 *
 * Every expansion is a single fully parenthesized expression, so the
 * macros compose correctly inside larger expressions as well as when
 * used as plain statements.
 */
#define set_mm_counter(mm, member, value) ((mm)->_##member = (value))
#define get_mm_counter(mm, member) ((mm)->_##member)
#define add_mm_counter(mm, member, value) ((mm)->_##member += (value))
#define inc_mm_counter(mm, member) ((mm)->_##member++)
#define dec_mm_counter(mm, member) ((mm)->_##member--)
|
2005-10-29 21:16:41 -04:00
|
|
|
typedef unsigned long mm_counter_t;
|
|
|
|
|
|
|
|
#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
|
2005-10-29 21:16:05 -04:00
|
|
|
|
2005-10-29 21:16:41 -04:00
|
|
|
#define get_mm_rss(mm) \
|
|
|
|
(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
|
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 21:16:18 -04:00
|
|
|
#define update_hiwater_rss(mm) do { \
|
|
|
|
unsigned long _rss = get_mm_rss(mm); \
|
|
|
|
if ((mm)->hiwater_rss < _rss) \
|
|
|
|
(mm)->hiwater_rss = _rss; \
|
|
|
|
} while (0)
|
|
|
|
#define update_hiwater_vm(mm) do { \
|
|
|
|
if ((mm)->hiwater_vm < (mm)->total_vm) \
|
|
|
|
(mm)->hiwater_vm = (mm)->total_vm; \
|
|
|
|
} while (0)
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
struct mm_struct {
|
|
|
|
struct vm_area_struct * mmap; /* list of VMAs */
|
|
|
|
struct rb_root mm_rb;
|
|
|
|
struct vm_area_struct * mmap_cache; /* last find_vma result */
|
|
|
|
unsigned long (*get_unmapped_area) (struct file *filp,
|
|
|
|
unsigned long addr, unsigned long len,
|
|
|
|
unsigned long pgoff, unsigned long flags);
|
2005-06-21 20:14:49 -04:00
|
|
|
void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
|
2006-02-28 19:59:19 -05:00
|
|
|
unsigned long mmap_base; /* base of mmap area */
|
|
|
|
unsigned long task_size; /* size of task vm space */
|
|
|
|
unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
|
2005-06-21 20:14:49 -04:00
|
|
|
unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
|
2005-04-16 18:20:36 -04:00
|
|
|
pgd_t * pgd;
|
|
|
|
atomic_t mm_users; /* How many users with user space? */
|
|
|
|
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
|
|
|
|
int map_count; /* number of VMAs */
|
|
|
|
struct rw_semaphore mmap_sem;
|
|
|
|
spinlock_t page_table_lock; /* Protects page tables and some counters */
|
|
|
|
|
|
|
|
struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
|
|
|
|
* together off init_mm.mmlist, and are protected
|
|
|
|
* by mmlist_lock
|
|
|
|
*/
|
|
|
|
|
2005-10-29 21:16:41 -04:00
|
|
|
/* Special counters, in some configurations protected by the
|
|
|
|
* page_table_lock, in other configurations by being atomic.
|
|
|
|
*/
|
2005-10-29 21:16:05 -04:00
|
|
|
mm_counter_t _file_rss;
|
2005-04-16 18:20:36 -04:00
|
|
|
mm_counter_t _anon_rss;
|
|
|
|
|
2005-10-29 21:16:19 -04:00
|
|
|
unsigned long hiwater_rss; /* High-watermark of RSS usage */
|
|
|
|
unsigned long hiwater_vm; /* High-water virtual memory usage */
|
|
|
|
|
|
|
|
unsigned long total_vm, locked_vm, shared_vm, exec_vm;
|
|
|
|
unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
|
|
|
|
unsigned long start_code, end_code, start_data, end_data;
|
|
|
|
unsigned long start_brk, brk, start_stack;
|
|
|
|
unsigned long arg_start, arg_end, env_start, env_end;
|
|
|
|
|
2005-09-06 18:16:49 -04:00
|
|
|
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
|
2005-04-16 18:20:36 -04:00
|
|
|
|
2005-06-23 03:09:43 -04:00
|
|
|
unsigned dumpable:2;
|
2005-04-16 18:20:36 -04:00
|
|
|
cpumask_t cpu_vm_mask;
|
|
|
|
|
|
|
|
/* Architecture-specific MM context */
|
|
|
|
mm_context_t context;
|
|
|
|
|
|
|
|
/* Token based thrashing protection. */
|
|
|
|
unsigned long swap_token_time;
|
|
|
|
char recent_pagein;
|
|
|
|
|
|
|
|
/* coredumping support */
|
|
|
|
int core_waiters;
|
|
|
|
struct completion *core_startup_done, core_done;
|
|
|
|
|
|
|
|
/* aio bits */
|
|
|
|
rwlock_t ioctx_list_lock;
|
|
|
|
struct kioctx *ioctx_list;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Signal-handler state: a table of _NSIG k_sigaction entries, an atomic
 * counter and a spinlock.
 * NOTE(review): "count" is presumably a reference count of the tasks
 * sharing this structure, and "siglock" presumably guards the action
 * table -- confirm against the users of this struct.
 */
struct sighand_struct {
|
|
|
|
atomic_t count;
|
|
|
|
struct k_sigaction action[_NSIG];
|
|
|
|
spinlock_t siglock;
|
|
|
|
};
|
|
|
|
|
2006-06-25 08:49:24 -04:00
|
|
|
/*
 * Accounting record embedded in signal_struct as "pacct" when
 * CONFIG_BSD_PROCESS_ACCT is defined (described there as per-process
 * accounting information).
 * NOTE(review): the per-field meanings below are inferred from the field
 * names and types only -- confirm against the accounting code.
 */
struct pacct_struct {
|
2006-06-25 08:49:25 -04:00
|
|
|
int ac_flag; /* presumably accounting flag bits -- TODO confirm */
|
|
|
|
long ac_exitcode; /* presumably the recorded exit code -- TODO confirm */
|
2006-06-25 08:49:24 -04:00
|
|
|
unsigned long ac_mem; /* presumably a memory-usage figure; units not visible here */
|
2006-06-25 08:49:26 -04:00
|
|
|
cputime_t ac_utime, ac_stime; /* presumably user / system CPU time -- TODO confirm */
|
|
|
|
unsigned long ac_minflt, ac_majflt; /* presumably minor / major fault counts -- TODO confirm */
|
2006-06-25 08:49:24 -04:00
|
|
|
};
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
/*
|
|
|
|
* NOTE! "signal_struct" does not have its own
|
|
|
|
* locking, because a shared signal_struct always
|
|
|
|
* implies a shared sighand_struct, so locking
|
|
|
|
* sighand_struct is always a proper superset of
|
|
|
|
* the locking of signal_struct.
|
|
|
|
*/
|
|
|
|
struct signal_struct {
|
|
|
|
atomic_t count;
|
|
|
|
atomic_t live;
|
|
|
|
|
|
|
|
wait_queue_head_t wait_chldexit; /* for wait4() */
|
|
|
|
|
|
|
|
/* current thread group signal load-balancing target: */
|
|
|
|
task_t *curr_target;
|
|
|
|
|
|
|
|
/* shared signal handling: */
|
|
|
|
struct sigpending shared_pending;
|
|
|
|
|
|
|
|
/* thread group exit support */
|
|
|
|
int group_exit_code;
|
|
|
|
/* overloaded:
|
|
|
|
* - notify group_exit_task when ->count is equal to notify_count
|
|
|
|
* - everyone except group_exit_task is stopped during signal delivery
|
|
|
|
* of fatal signals, group_exit_task processes the signal.
|
|
|
|
*/
|
|
|
|
struct task_struct *group_exit_task;
|
|
|
|
int notify_count;
|
|
|
|
|
|
|
|
/* thread group stop support, overloads group_exit_code too */
|
|
|
|
int group_stop_count;
|
|
|
|
unsigned int flags; /* see SIGNAL_* flags below */
|
|
|
|
|
|
|
|
/* POSIX.1b Interval Timers */
|
|
|
|
struct list_head posix_timers;
|
|
|
|
|
|
|
|
/* ITIMER_REAL timer for the process */
|
2006-01-09 23:52:34 -05:00
|
|
|
struct hrtimer real_timer;
|
2006-03-26 04:38:12 -05:00
|
|
|
struct task_struct *tsk;
|
2006-01-09 23:52:34 -05:00
|
|
|
ktime_t it_real_incr;
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
/* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
|
|
|
|
cputime_t it_prof_expires, it_virt_expires;
|
|
|
|
cputime_t it_prof_incr, it_virt_incr;
|
|
|
|
|
|
|
|
/* job control IDs */
|
|
|
|
pid_t pgrp;
|
|
|
|
pid_t tty_old_pgrp;
|
|
|
|
pid_t session;
|
|
|
|
/* boolean value for session group leader */
|
|
|
|
int leader;
|
|
|
|
|
|
|
|
struct tty_struct *tty; /* NULL if no tty */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Cumulative resource counters for dead threads in the group,
|
|
|
|
* and for reaped dead child processes forked by this group.
|
|
|
|
* Live threads maintain their own counters and add to these
|
|
|
|
* in __exit_signal, except for the group leader.
|
|
|
|
*/
|
|
|
|
cputime_t utime, stime, cutime, cstime;
|
|
|
|
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
|
|
|
|
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Cumulative ns of scheduled CPU time for dead threads in the
|
|
|
|
* group, not including a zombie group leader. (This only differs
|
|
|
|
* from jiffies_to_ns(utime + stime) if sched_clock uses something
|
|
|
|
* other than jiffies.)
|
|
|
|
*/
|
|
|
|
unsigned long long sched_time;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't bother to synchronize most readers of this at all,
|
|
|
|
* because there is no reader checking a limit that actually needs
|
|
|
|
* to get both rlim_cur and rlim_max atomically, and either one
|
|
|
|
* alone is a single word that can safely be read normally.
|
|
|
|
* getrlimit/setrlimit use task_lock(current->group_leader) to
|
|
|
|
* protect this instead of the siglock, because they really
|
|
|
|
* have no need to disable irqs.
|
|
|
|
*/
|
|
|
|
struct rlimit rlim[RLIM_NLIMITS];
|
|
|
|
|
|
|
|
struct list_head cpu_timers[3];
|
|
|
|
|
|
|
|
/* keep the process-shared keyrings here so that they do the right
|
|
|
|
* thing in threads created with CLONE_THREAD */
|
|
|
|
#ifdef CONFIG_KEYS
|
|
|
|
struct key *session_keyring; /* keyring inherited over fork */
|
|
|
|
struct key *process_keyring; /* keyring private to this process */
|
|
|
|
#endif
|
2006-06-25 08:49:24 -04:00
|
|
|
#ifdef CONFIG_BSD_PROCESS_ACCT
|
|
|
|
struct pacct_struct pacct; /* per-process accounting information */
|
|
|
|
#endif
|
2005-04-16 18:20:36 -04:00
|
|
|
};
|
|
|
|
|
2005-06-25 17:57:23 -04:00
|
|
|
/* Context switch must be unlocked if interrupts are to be enabled */
|
|
|
|
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
|
|
|
|
# define __ARCH_WANT_UNLOCKED_CTXSW
|
|
|
|
#endif
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
/*
|
|
|
|
* Bits in flags field of signal_struct.
|
|
|
|
*/
|
|
|
|
#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
|
|
|
|
#define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */
|
|
|
|
#define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */
|
|
|
|
#define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
|
2006-01-14 16:20:41 -05:00
|
|
|
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
|
|
|
|
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
|
|
|
|
* values are inverted: lower p->prio value means higher priority.
|
2005-04-16 18:20:36 -04:00
|
|
|
*
|
|
|
|
* The MAX_USER_RT_PRIO value allows the actual maximum
|
|
|
|
* RT priority to be separate from the value exported to
|
|
|
|
* user-space. This allows kernel threads to set their
|
|
|
|
* priority to a value higher than any user task. Note:
|
|
|
|
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define MAX_USER_RT_PRIO 100
|
|
|
|
#define MAX_RT_PRIO MAX_USER_RT_PRIO
|
|
|
|
|
|
|
|
#define MAX_PRIO (MAX_RT_PRIO + 40)
|
|
|
|
|
2006-06-27 05:54:51 -04:00
|
|
|
/*
 * Scheduling-class predicates.  Every one of them passes its condition
 * through unlikely() and is fully parenthesised so it can be dropped into
 * any expression.
 */

/* Non-zero when the priority value lies below MAX_RT_PRIO. */
#define rt_prio(prio)		(unlikely((prio) < MAX_RT_PRIO))

/* rt_prio() applied to the ->prio member of the task pointed to by p. */
#define rt_task(p)		(rt_prio((p)->prio))

/* Non-zero when the task's ->policy is SCHED_BATCH. */
#define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))

/*
 * Non-zero when the task's ->policy is neither SCHED_NORMAL nor
 * SCHED_BATCH.  Note that p appears twice in the expansion.
 */
#define has_rt_policy(p) \
	(unlikely(!((p)->policy == SCHED_NORMAL || (p)->policy == SCHED_BATCH)))
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Some day this will be a full-fledged user tracking system..
|
|
|
|
*/
|
|
|
|
struct user_struct {
|
|
|
|
atomic_t __count; /* reference count */
|
|
|
|
atomic_t processes; /* How many processes does this user have? */
|
|
|
|
atomic_t files; /* How many open files does this user have? */
|
|
|
|
atomic_t sigpending; /* How many pending signals does this user have? */
|
2006-06-01 16:10:59 -04:00
|
|
|
#ifdef CONFIG_INOTIFY_USER
|
[PATCH] inotify
inotify is intended to correct the deficiencies of dnotify, particularly
its inability to scale and its terrible user interface:
* dnotify requires the opening of one fd per each directory
that you intend to watch. This quickly results in too many
open files and pins removable media, preventing unmount.
* dnotify is directory-based. You only learn about changes to
directories. Sure, a change to a file in a directory affects
the directory, but you are then forced to keep a cache of
stat structures.
* dnotify's interface to user-space is awful. Signals?
inotify provides a more usable, simple, powerful solution to file change
notification:
* inotify's interface is a system call that returns a fd, not SIGIO.
You get a single fd, which is select()-able.
* inotify has an event that says "the filesystem that the item
you were watching is on was unmounted."
* inotify can watch directories or files.
Inotify is currently used by Beagle (a desktop search infrastructure),
Gamin (a FAM replacement), and other projects.
See Documentation/filesystems/inotify.txt.
Signed-off-by: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-07-12 17:06:03 -04:00
|
|
|
atomic_t inotify_watches; /* How many inotify watches does this user have? */
|
|
|
|
atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
|
|
|
|
#endif
|
2005-04-16 18:20:36 -04:00
|
|
|
/* protected by mq_lock */
|
|
|
|
unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
|
|
|
|
unsigned long locked_shm; /* How many pages of mlocked shm ? */
|
|
|
|
|
|
|
|
#ifdef CONFIG_KEYS
|
|
|
|
struct key *uid_keyring; /* UID specific keyring */
|
|
|
|
struct key *session_keyring; /* UID's default session keyring */
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Hash table maintenance information */
|
|
|
|
struct list_head uidhash_list;
|
|
|
|
uid_t uid;
|
|
|
|
};
|
|
|
|
|
|
|
|
extern struct user_struct *find_user(uid_t);
|
|
|
|
|
|
|
|
extern struct user_struct root_user;
|
|
|
|
#define INIT_USER (&root_user)
|
|
|
|
|
|
|
|
typedef struct prio_array prio_array_t;
|
|
|
|
struct backing_dev_info;
|
|
|
|
struct reclaim_state;
|
|
|
|
|
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
/*
 * Scheduler statistics record: three running totals followed by two
 * timestamps, all unsigned long.
 */
struct sched_info {
	/* cumulative counters */
	unsigned long cpu_time;		/* time spent on the cpu */
	unsigned long run_delay;	/* time spent waiting on a runqueue */
	unsigned long pcnt;		/* # of timeslices run on this cpu */

	/* timestamps */
	unsigned long last_arrival;	/* when we last ran on a cpu */
	unsigned long last_queued;	/* when we were last queued to run */
};
|
|
|
|
|
|
|
|
extern struct file_operations proc_schedstat_operations;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
 * Idle classification.  MAX_IDLE_TYPES is not a state of its own: it is the
 * count of the preceding enumerators and is used as an array bound for the
 * per-type load-balance counters in struct sched_domain.
 */
enum idle_type {
	SCHED_IDLE,		/* 0 */
	NOT_IDLE,		/* 1 */
	NEWLY_IDLE,		/* 2 */
	MAX_IDLE_TYPES		/* 3 - keep last */
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* sched-domains (multiprocessor balancing) declarations:
|
|
|
|
*/
|
|
|
|
#define SCHED_LOAD_SCALE 128UL /* increase resolution of load */
|
|
|
|
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it no longer needs to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 05:54:34 -04:00
|
|
|
#ifdef CONFIG_SMP
|
2005-04-16 18:20:36 -04:00
|
|
|
/*
 * Bit values for the flags member of struct sched_domain.  Each flag is a
 * distinct single bit, so they can be OR-ed together.
 */
#define SD_LOAD_BALANCE		0x001	/* Do load balancing on this domain. */
#define SD_BALANCE_NEWIDLE	0x002	/* Balance when about to become idle */
#define SD_BALANCE_EXEC		0x004	/* Balance on exec */
#define SD_BALANCE_FORK		0x008	/* Balance on fork, clone */
#define SD_WAKE_IDLE		0x010	/* Wake to idle CPU on task wakeup */
#define SD_WAKE_AFFINE		0x020	/* Wake task to waking CPU */
#define SD_WAKE_BALANCE		0x040	/* Perform balancing at task wakeup */
#define SD_SHARE_CPUPOWER	0x080	/* Domain members share cpu power */
#define SD_POWERSAVINGS_BALANCE	0x100	/* Balance for power savings */
|
|
|
|
|
|
|
|
/*
 * Evaluates to SD_POWERSAVINGS_BALANCE when either sched_mc_power_savings or
 * sched_smt_power_savings is non-zero, and to 0 otherwise.
 */
#define BALANCE_FOR_POWER						\
	((sched_mc_power_savings || sched_smt_power_savings) ?		\
	 SD_POWERSAVINGS_BALANCE : 0)
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
struct sched_group {
|
|
|
|
struct sched_group *next; /* Must be a circular list */
|
|
|
|
cpumask_t cpumask;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
|
|
|
|
* single CPU. This is read only (except for setup, hotplug CPU).
|
|
|
|
*/
|
|
|
|
unsigned long cpu_power;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct sched_domain {
|
|
|
|
/* These fields must be setup */
|
|
|
|
struct sched_domain *parent; /* top domain must be null terminated */
|
|
|
|
struct sched_group *groups; /* the balancing groups of the domain */
|
|
|
|
cpumask_t span; /* span of all CPUs in this domain */
|
|
|
|
unsigned long min_interval; /* Minimum balance interval ms */
|
|
|
|
unsigned long max_interval; /* Maximum balance interval ms */
|
|
|
|
unsigned int busy_factor; /* less balancing by factor if busy */
|
|
|
|
unsigned int imbalance_pct; /* No balance until over watermark */
|
|
|
|
unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
|
|
|
|
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
|
|
|
|
unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */
|
2005-06-25 17:57:13 -04:00
|
|
|
unsigned int busy_idx;
|
|
|
|
unsigned int idle_idx;
|
|
|
|
unsigned int newidle_idx;
|
|
|
|
unsigned int wake_idx;
|
2005-06-25 17:57:19 -04:00
|
|
|
unsigned int forkexec_idx;
|
2005-04-16 18:20:36 -04:00
|
|
|
int flags; /* See SD_* */
|
|
|
|
|
|
|
|
/* Runtime fields. */
|
|
|
|
unsigned long last_balance; /* init to jiffies. units in jiffies */
|
|
|
|
unsigned int balance_interval; /* initialise to 1. units in ms. */
|
|
|
|
unsigned int nr_balance_failed; /* initialise to 0 */
|
|
|
|
|
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
/* load_balance() stats */
|
|
|
|
unsigned long lb_cnt[MAX_IDLE_TYPES];
|
|
|
|
unsigned long lb_failed[MAX_IDLE_TYPES];
|
|
|
|
unsigned long lb_balanced[MAX_IDLE_TYPES];
|
|
|
|
unsigned long lb_imbalance[MAX_IDLE_TYPES];
|
|
|
|
unsigned long lb_gained[MAX_IDLE_TYPES];
|
|
|
|
unsigned long lb_hot_gained[MAX_IDLE_TYPES];
|
|
|
|
unsigned long lb_nobusyg[MAX_IDLE_TYPES];
|
|
|
|
unsigned long lb_nobusyq[MAX_IDLE_TYPES];
|
|
|
|
|
|
|
|
/* Active load balancing */
|
|
|
|
unsigned long alb_cnt;
|
|
|
|
unsigned long alb_failed;
|
|
|
|
unsigned long alb_pushed;
|
|
|
|
|
2005-06-25 17:57:20 -04:00
|
|
|
/* SD_BALANCE_EXEC stats */
|
|
|
|
unsigned long sbe_cnt;
|
|
|
|
unsigned long sbe_balanced;
|
2005-04-16 18:20:36 -04:00
|
|
|
unsigned long sbe_pushed;
|
|
|
|
|
2005-06-25 17:57:20 -04:00
|
|
|
/* SD_BALANCE_FORK stats */
|
|
|
|
unsigned long sbf_cnt;
|
|
|
|
unsigned long sbf_balanced;
|
|
|
|
unsigned long sbf_pushed;
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
/* try_to_wake_up() stats */
|
|
|
|
unsigned long ttwu_wake_remote;
|
|
|
|
unsigned long ttwu_move_affine;
|
|
|
|
unsigned long ttwu_move_balance;
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
2006-06-27 05:54:38 -04:00
|
|
|
extern int partition_sched_domains(cpumask_t *partition1,
|
2005-06-25 17:57:33 -04:00
|
|
|
cpumask_t *partition2);
|
[PATCH] scheduler cache-hot-autodetect
)
From: Ingo Molnar <mingo@elte.hu>
This is the latest version of the scheduler cache-hot-auto-tune patch.
The first problem was that detection time scaled with O(N^2), which is
unacceptable on larger SMP and NUMA systems. To solve this:
- I've added a 'domain distance' function, which is used to cache
measurement results. Each distance is only measured once. This means
that e.g. on NUMA distances of 0, 1 and 2 might be measured, on HT
distances 0 and 1, and on SMP distance 0 is measured. The code walks
the domain tree to determine the distance, so it automatically follows
whatever hierarchy an architecture sets up. This cuts down on the boot
time significantly and removes the O(N^2) limit. The only assumption
is that migration costs can be expressed as a function of domain
distance - this covers the overwhelming majority of existing systems,
and is a good guess even for more asymmetric systems.
[ People hacking systems that have asymmetries that break this
assumption (e.g. different CPU speeds) should experiment a bit with
the cpu_distance() function. Adding a ->migration_distance factor to
the domain structure would be one possible solution - but lets first
see the problem systems, if they exist at all. Lets not overdesign. ]
Another problem was that only a single cache-size was used for measuring
the cost of migration, and most architectures didn't set that variable
up. Furthermore, a single cache-size does not fit NUMA hierarchies with
L3 caches and does not fit HT setups, where different CPUs will often
have different 'effective cache sizes'. To solve this problem:
- Instead of relying on a single cache-size provided by the platform and
sticking to it, the code now auto-detects the 'effective migration
cost' between two measured CPUs, via iterating through a wide range of
cachesizes. The code searches for the maximum migration cost, which
occurs when the working set of the test-workload falls just below the
'effective cache size'. I.e. real-life optimized search is done for
the maximum migration cost, between two real CPUs.
This, amongst other things, has the positive effect that if e.g. two
CPUs share a L2/L3 cache, a different (and accurate) migration cost
will be found than between two CPUs on the same system that don't share
any caches.
(The reliable measurement of migration costs is tricky - see the source
for details.)
Furthermore i've added various boot-time options to override/tune
migration behavior.
Firstly, there's a blanket override for autodetection:
migration_cost=1000,2000,3000
will override the depth 0/1/2 values with 1msec/2msec/3msec values.
Secondly, there's a global factor that can be used to increase (or
decrease) the autodetected values:
migration_factor=120
will increase the autodetected values by 20%. This option is useful to
tune things in a workload-dependent way - e.g. if a workload is
cache-insensitive then CPU utilization can be maximized by specifying
migration_factor=0.
I've tested the autodetection code quite extensively on x86, on 3
P3/Xeon/2MB, and the autodetected values look pretty good:
Dual Celeron (128K L2 cache):
---------------------
migration cost matrix (max_cache_size: 131072, cpu: 467 MHz):
---------------------
[00] [01]
[00]: - 1.7(1)
[01]: 1.7(1) -
---------------------
cacheflush times [2]: 0.0 (0) 1.7 (1784008)
---------------------
Here the slow memory subsystem dominates system performance, and even
though caches are small, the migration cost is 1.7 msecs.
Dual HT P4 (512K L2 cache):
---------------------
migration cost matrix (max_cache_size: 524288, cpu: 2379 MHz):
---------------------
[00] [01] [02] [03]
[00]: - 0.4(1) 0.0(0) 0.4(1)
[01]: 0.4(1) - 0.4(1) 0.0(0)
[02]: 0.0(0) 0.4(1) - 0.4(1)
[03]: 0.4(1) 0.0(0) 0.4(1) -
---------------------
cacheflush times [2]: 0.0 (33900) 0.4 (448514)
---------------------
Here it can be seen that there is no migration cost between two HT
siblings (CPU#0/2 and CPU#1/3 are separate physical CPUs). A fast memory
system makes inter-physical-CPU migration pretty cheap: 0.4 msecs.
8-way P3/Xeon [2MB L2 cache]:
---------------------
migration cost matrix (max_cache_size: 2097152, cpu: 700 MHz):
---------------------
[00] [01] [02] [03] [04] [05] [06] [07]
[00]: - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[01]: 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[02]: 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[03]: 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[04]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1)
[05]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1)
[06]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1)
[07]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) -
---------------------
cacheflush times [2]: 0.0 (0) 19.2 (19281756)
---------------------
This one has huge caches and a relatively slow memory subsystem - so the
migration cost is 19 msecs.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: <wilder@us.ibm.com>
Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-12 04:05:30 -05:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Maximum cache size the migration-costs auto-tuning code will
|
|
|
|
* search from:
|
|
|
|
*/
|
|
|
|
extern unsigned int max_cache_size;
|
|
|
|
|
|
|
|
#endif /* CONFIG_SMP */
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
|
|
|
|
struct io_context; /* See blkdev.h */
|
|
|
|
void exit_io_context(void);
|
|
|
|
struct cpuset;
|
|
|
|
|
|
|
|
#define NGROUPS_SMALL 32
|
|
|
|
#define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t)))
|
|
|
|
struct group_info {
|
|
|
|
int ngroups;
|
|
|
|
atomic_t usage;
|
|
|
|
gid_t small_block[NGROUPS_SMALL];
|
|
|
|
int nblocks;
|
|
|
|
gid_t *blocks[0];
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * get_group_info() - take one more reference on a group_info.
 *
 * Locking: when the group_info belongs to a task other than current, the
 * caller must hold that task's lock (task_lock()).  No lock is needed for
 * current->group_info, which is what the vast majority of callers look at:
 * only the current task can change it, and changing it requires the task
 * lock as well.
 */
#define get_group_info(group_info) do {				\
	atomic_inc(&((group_info)->usage));			\
} while (0)
|
|
|
|
|
|
|
|
#define put_group_info(group_info) do { \
|
|
|
|
if (atomic_dec_and_test(&(group_info)->usage)) \
|
|
|
|
groups_free(group_info); \
|
|
|
|
} while (0)
|
|
|
|
|
[PATCH] Keys: Make request-key create an authorisation key
The attached patch makes the following changes:
(1) There's a new special key type called ".request_key_auth".
This is an authorisation key for when one process requests a key and
another process is started to construct it. This type of key cannot be
created by the user; nor can it be requested by kernel services.
Authorisation keys hold two references:
(a) Each refers to a key being constructed. When the key being
constructed is instantiated the authorisation key is revoked,
rendering it of no further use.
(b) The "authorising process". This is either:
(i) the process that called request_key(), or:
(ii) if the process that called request_key() itself had an
authorisation key in its session keyring, then the authorising
process referred to by that authorisation key will also be
referred to by the new authorisation key.
This means that the process that initiated a chain of key requests
will authorise the lot of them, and will, by default, wind up with
the keys obtained from them in its keyrings.
(2) request_key() creates an authorisation key which is then passed to
/sbin/request-key in as part of a new session keyring.
(3) When request_key() is searching for a key to hand back to the caller, if
it comes across an authorisation key in the session keyring of the
calling process, it will also search the keyrings of the process
specified therein and it will use the specified process's credentials
(fsuid, fsgid, groups) to do that rather than the calling process's
credentials.
This allows a process started by /sbin/request-key to find keys belonging
to the authorising process.
(4) A key can be read, even if the process executing KEYCTL_READ doesn't have
direct read or search permission if that key is contained within the
keyrings of a process specified by an authorisation key found within the
calling process's session keyring, and is searchable using the
credentials of the authorising process.
This allows a process started by /sbin/request-key to read keys belonging
to the authorising process.
(5) The magic KEY_SPEC_*_KEYRING key IDs when passed to KEYCTL_INSTANTIATE or
KEYCTL_NEGATE will specify a keyring of the authorising process, rather
than the process doing the instantiation.
(6) One of the process keyrings can be nominated as the default to which
request_key() should attach new keys if not otherwise specified. This is
done with KEYCTL_SET_REQKEY_KEYRING and one of the KEY_REQKEY_DEFL_*
constants. The current setting can also be read using this call.
(7) request_key() is partially interruptible. If it is waiting for another
process to finish constructing a key, it can be interrupted. This permits
a request-key cycle to be broken without recourse to rebooting.
Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-Off-By: Benoit Boissinot <benoit.boissinot@ens-lyon.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-24 01:00:56 -04:00
|
|
|
extern struct group_info *groups_alloc(int gidsetsize);
|
|
|
|
extern void groups_free(struct group_info *group_info);
|
|
|
|
extern int set_current_groups(struct group_info *group_info);
|
|
|
|
extern int groups_search(struct group_info *group_info, gid_t grp);
|
2005-04-16 18:20:36 -04:00
|
|
|
/*
 * GROUP_AT(gi, i) - element i of the group "array" held by gi.
 *
 * The index is split into a block number (i / NGROUPS_PER_BLOCK) and an
 * offset inside that block (i % NGROUPS_PER_BLOCK).  The expansion is an
 * lvalue, so it can be assigned to as well as read.  i appears twice in the
 * expansion, so it must not have side effects.
 */
#define GROUP_AT(gi, i)							\
	((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK])
|
|
|
|
|
2005-09-09 16:02:02 -04:00
|
|
|
/*
 * prefetch_stack() hook.  Without ARCH_HAS_PREFETCH_SWITCH_STACK it is an
 * empty inline that does nothing; with it, only a declaration is given
 * here and the definition is expected to come from elsewhere.
 */
#ifndef ARCH_HAS_PREFETCH_SWITCH_STACK
static inline void prefetch_stack(struct task_struct *t)
{
}
#else
extern void prefetch_stack(struct task_struct*);
#endif
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
struct audit_context; /* See audit.c */
|
|
|
|
struct mempolicy;
|
2006-04-11 07:52:07 -04:00
|
|
|
struct pipe_inode_info;
|
2005-04-16 18:20:36 -04:00
|
|
|
|
2006-03-31 05:31:23 -05:00
|
|
|
/* Sleep classification; the enumerators take the values 0..3 in order. */
enum sleep_type {
	SLEEP_NORMAL,		/* 0 */
	SLEEP_NONINTERACTIVE,	/* 1 */
	SLEEP_INTERACTIVE,	/* 2 */
	SLEEP_INTERRUPTED,	/* 3 */
};
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
struct task_struct {
|
|
|
|
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
|
|
|
|
struct thread_info *thread_info;
|
|
|
|
atomic_t usage;
|
|
|
|
unsigned long flags; /* per process flags, defined below */
|
|
|
|
unsigned long ptrace;
|
|
|
|
|
2005-05-05 19:16:12 -04:00
|
|
|
int lock_depth; /* BKL lock depth */
|
2005-04-16 18:20:36 -04:00
|
|
|
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 05:54:34 -04:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
|
2005-06-25 17:57:23 -04:00
|
|
|
int oncpu;
|
|
|
|
#endif
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the user's
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 05:54:34 -04:00
|
|
|
#endif
|
|
|
|
int load_weight; /* for niceness load balancing purposes */
|
2006-06-27 05:54:51 -04:00
|
|
|
int prio, static_prio, normal_prio;
|
2005-04-16 18:20:36 -04:00
|
|
|
struct list_head run_list;
|
|
|
|
prio_array_t *array;
|
|
|
|
|
2005-06-27 04:55:12 -04:00
|
|
|
unsigned short ioprio;
|
2006-03-23 14:00:26 -05:00
|
|
|
unsigned int btrace_seq;
|
2005-06-27 04:55:12 -04:00
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
unsigned long sleep_avg;
|
|
|
|
unsigned long long timestamp, last_ran;
|
|
|
|
unsigned long long sched_time; /* sched_clock time spent running */
|
2006-03-31 05:31:23 -05:00
|
|
|
enum sleep_type sleep_type;
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
unsigned long policy;
|
|
|
|
cpumask_t cpus_allowed;
|
|
|
|
unsigned int time_slice, first_time_slice;
|
|
|
|
|
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
struct sched_info sched_info;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
struct list_head tasks;
|
|
|
|
/*
|
|
|
|
* ptrace_list/ptrace_children forms the list of my children
|
|
|
|
* that were stolen by a ptracer.
|
|
|
|
*/
|
|
|
|
struct list_head ptrace_children;
|
|
|
|
struct list_head ptrace_list;
|
|
|
|
|
|
|
|
struct mm_struct *mm, *active_mm;
|
|
|
|
|
|
|
|
/* task state */
|
|
|
|
struct linux_binfmt *binfmt;
|
|
|
|
long exit_state;
|
|
|
|
int exit_code, exit_signal;
|
|
|
|
int pdeath_signal; /* The signal sent when the parent dies */
|
|
|
|
/* ??? */
|
|
|
|
unsigned long personality;
|
|
|
|
unsigned did_exec:1;
|
|
|
|
pid_t pid;
|
|
|
|
pid_t tgid;
|
|
|
|
/*
|
|
|
|
* pointers to (original) parent process, youngest child, younger sibling,
|
|
|
|
* older sibling, respectively. (p->father can be replaced with
|
|
|
|
* p->parent->pid)
|
|
|
|
*/
|
|
|
|
struct task_struct *real_parent; /* real parent process (when being debugged) */
|
|
|
|
struct task_struct *parent; /* parent process */
|
|
|
|
/*
|
|
|
|
* children/sibling forms the list of my children plus the
|
|
|
|
* tasks I'm ptracing.
|
|
|
|
*/
|
|
|
|
struct list_head children; /* list of my children */
|
|
|
|
struct list_head sibling; /* linkage in my parent's children list */
|
|
|
|
struct task_struct *group_leader; /* threadgroup leader */
|
|
|
|
|
|
|
|
/* PID/PID hash table linkage. */
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine with 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an independent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an independent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 05:31:42 -05:00
|
|
|
struct pid_link pids[PIDTYPE_MAX];
|
2006-03-28 19:11:25 -05:00
|
|
|
struct list_head thread_group;
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
struct completion *vfork_done; /* for vfork() */
|
|
|
|
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
|
|
|
|
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
|
|
|
|
|
|
|
|
unsigned long rt_priority;
|
|
|
|
cputime_t utime, stime;
|
|
|
|
unsigned long nvcsw, nivcsw; /* context switch counts */
|
|
|
|
struct timespec start_time;
|
|
|
|
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
|
|
|
|
unsigned long min_flt, maj_flt;
|
|
|
|
|
|
|
|
cputime_t it_prof_expires, it_virt_expires;
|
|
|
|
unsigned long long it_sched_expires;
|
|
|
|
struct list_head cpu_timers[3];
|
|
|
|
|
|
|
|
/* process credentials */
|
|
|
|
uid_t uid,euid,suid,fsuid;
|
|
|
|
gid_t gid,egid,sgid,fsgid;
|
|
|
|
struct group_info *group_info;
|
|
|
|
kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
|
|
|
|
unsigned keep_capabilities:1;
|
|
|
|
struct user_struct *user;
|
|
|
|
#ifdef CONFIG_KEYS
|
2006-01-08 04:02:47 -05:00
|
|
|
struct key *request_key_auth; /* assumed request_key authority */
|
2005-04-16 18:20:36 -04:00
|
|
|
struct key *thread_keyring; /* keyring private to this thread */
|
[PATCH] Keys: Make request-key create an authorisation key
The attached patch makes the following changes:
(1) There's a new special key type called ".request_key_auth".
This is an authorisation key for when one process requests a key and
another process is started to construct it. This type of key cannot be
created by the user; nor can it be requested by kernel services.
Authorisation keys hold two references:
(a) Each refers to a key being constructed. When the key being
constructed is instantiated the authorisation key is revoked,
rendering it of no further use.
(b) The "authorising process". This is either:
(i) the process that called request_key(), or:
(ii) if the process that called request_key() itself had an
authorisation key in its session keyring, then the authorising
process referred to by that authorisation key will also be
referred to by the new authorisation key.
This means that the process that initiated a chain of key requests
will authorise the lot of them, and will, by default, wind up with
the keys obtained from them in its keyrings.
(2) request_key() creates an authorisation key which is then passed to
/sbin/request-key as part of a new session keyring.
(3) When request_key() is searching for a key to hand back to the caller, if
it comes across an authorisation key in the session keyring of the
calling process, it will also search the keyrings of the process
specified therein and it will use the specified process's credentials
(fsuid, fsgid, groups) to do that rather than the calling process's
credentials.
This allows a process started by /sbin/request-key to find keys belonging
to the authorising process.
(4) A key can be read, even if the process executing KEYCTL_READ doesn't have
direct read or search permission if that key is contained within the
keyrings of a process specified by an authorisation key found within the
calling process's session keyring, and is searchable using the
credentials of the authorising process.
This allows a process started by /sbin/request-key to read keys belonging
to the authorising process.
(5) The magic KEY_SPEC_*_KEYRING key IDs when passed to KEYCTL_INSTANTIATE or
KEYCTL_NEGATE will specify a keyring of the authorising process, rather
than the process doing the instantiation.
(6) One of the process keyrings can be nominated as the default to which
request_key() should attach new keys if not otherwise specified. This is
done with KEYCTL_SET_REQKEY_KEYRING and one of the KEY_REQKEY_DEFL_*
constants. The current setting can also be read using this call.
(7) request_key() is partially interruptible. If it is waiting for another
process to finish constructing a key, it can be interrupted. This permits
a request-key cycle to be broken without recourse to rebooting.
Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-Off-By: Benoit Boissinot <benoit.boissinot@ens-lyon.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-24 01:00:56 -04:00
|
|
|
unsigned char jit_keyring; /* default keyring to attach requested keys to */
|
2005-04-16 18:20:36 -04:00
|
|
|
#endif
|
|
|
|
int oomkilladj; /* OOM kill score adjustment (bit shift). */
|
2005-05-05 19:16:12 -04:00
|
|
|
char comm[TASK_COMM_LEN]; /* executable name excluding path
|
|
|
|
- access with [gs]et_task_comm (which lock
|
|
|
|
it with task_lock())
|
|
|
|
- initialized normally by flush_old_exec */
|
2005-04-16 18:20:36 -04:00
|
|
|
/* file system info */
|
|
|
|
int link_count, total_link_count;
|
|
|
|
/* ipc stuff */
|
|
|
|
struct sysv_sem sysvsem;
|
|
|
|
/* CPU-specific state of this task */
|
|
|
|
struct thread_struct thread;
|
|
|
|
/* filesystem information */
|
|
|
|
struct fs_struct *fs;
|
|
|
|
/* open file information */
|
|
|
|
struct files_struct *files;
|
|
|
|
/* namespace */
|
|
|
|
struct namespace *namespace;
|
|
|
|
/* signal handlers */
|
|
|
|
struct signal_struct *signal;
|
|
|
|
struct sighand_struct *sighand;
|
|
|
|
|
|
|
|
sigset_t blocked, real_blocked;
|
2006-01-18 20:43:57 -05:00
|
|
|
sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */
|
2005-04-16 18:20:36 -04:00
|
|
|
struct sigpending pending;
|
|
|
|
|
|
|
|
unsigned long sas_ss_sp;
|
|
|
|
size_t sas_ss_size;
|
|
|
|
int (*notifier)(void *priv);
|
|
|
|
void *notifier_data;
|
|
|
|
sigset_t *notifier_mask;
|
|
|
|
|
|
|
|
void *security;
|
|
|
|
struct audit_context *audit_context;
|
|
|
|
seccomp_t seccomp;
|
|
|
|
|
|
|
|
/* Thread group tracking */
|
|
|
|
u32 parent_exec_id;
|
|
|
|
u32 self_exec_id;
|
|
|
|
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
|
|
|
|
spinlock_t alloc_lock;
|
|
|
|
|
2006-06-27 05:54:51 -04:00
|
|
|
/* Protection of the PI data structures: */
|
|
|
|
spinlock_t pi_lock;
|
|
|
|
|
2006-06-27 05:54:53 -04:00
|
|
|
#ifdef CONFIG_RT_MUTEXES
|
|
|
|
/* PI waiters blocked on a rt_mutex held by this task */
|
|
|
|
struct plist_head pi_waiters;
|
|
|
|
/* Deadlock detection and priority inheritance handling */
|
|
|
|
struct rt_mutex_waiter *pi_blocked_on;
|
|
|
|
# ifdef CONFIG_DEBUG_RT_MUTEXES
|
|
|
|
spinlock_t held_list_lock;
|
|
|
|
struct list_head held_list_head;
|
|
|
|
# endif
|
|
|
|
#endif
|
|
|
|
|
2006-01-09 18:59:20 -05:00
|
|
|
#ifdef CONFIG_DEBUG_MUTEXES
|
|
|
|
/* mutex deadlock detection */
|
|
|
|
struct mutex_waiter *blocked_on;
|
|
|
|
#endif
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
/* journalling filesystem info */
|
|
|
|
void *journal_info;
|
|
|
|
|
|
|
|
/* VM state */
|
|
|
|
struct reclaim_state *reclaim_state;
|
|
|
|
|
|
|
|
struct backing_dev_info *backing_dev_info;
|
|
|
|
|
|
|
|
struct io_context *io_context;
|
|
|
|
|
|
|
|
unsigned long ptrace_message;
|
|
|
|
siginfo_t *last_siginfo; /* For ptrace use. */
|
|
|
|
/*
|
|
|
|
* current io wait handle: wait queue entry to use for io waits
|
|
|
|
* If this thread is processing aio, this points at the waitqueue
|
|
|
|
* inside the currently handled kiocb. It may be NULL (i.e. default
|
|
|
|
* to a stack based synchronous wait) if its doing sync IO.
|
|
|
|
*/
|
|
|
|
wait_queue_t *io_wait;
|
|
|
|
/* i/o counters (bytes read/written, #syscalls) */
|
|
|
|
u64 rchar, wchar, syscr, syscw;
|
|
|
|
#if defined(CONFIG_BSD_PROCESS_ACCT)
|
|
|
|
u64 acct_rss_mem1; /* accumulated rss usage */
|
|
|
|
u64 acct_vm_mem1; /* accumulated virtual memory usage */
|
|
|
|
clock_t acct_stimexpd; /* clock_t-converted stime since last update */
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
struct mempolicy *mempolicy;
|
|
|
|
short il_next;
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_CPUSETS
|
|
|
|
struct cpuset *cpuset;
|
|
|
|
nodemask_t mems_allowed;
|
|
|
|
int cpuset_mems_generation;
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_slab' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 06:16:03 -05:00
|
|
|
int cpuset_mem_spread_rotor;
|
2005-04-16 18:20:36 -04:00
|
|
|
#endif
|
2006-03-27 04:16:22 -05:00
|
|
|
struct robust_list_head __user *robust_list;
|
2006-03-27 04:16:24 -05:00
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
struct compat_robust_list_head __user *compat_robust_list;
|
|
|
|
#endif
|
2006-03-27 04:16:22 -05:00
|
|
|
|
2005-06-27 04:55:12 -04:00
|
|
|
atomic_t fs_excl; /* holding fs exclusive resources */
|
2006-01-08 04:01:37 -05:00
|
|
|
struct rcu_head rcu;
|
2006-04-11 07:52:07 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* cache last used pipe for splice
|
|
|
|
*/
|
|
|
|
struct pipe_inode_info *splice_pipe;
|
2005-04-16 18:20:36 -04:00
|
|
|
};
|
|
|
|
|
|
|
|
static inline pid_t process_group(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
return tsk->signal->pgrp;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* pid_alive - check that a task structure is not stale
|
|
|
|
* @p: Task structure to be checked.
|
|
|
|
*
|
|
|
|
* Test if a process is not yet dead (at most zombie state)
|
|
|
|
* If pid_alive fails, then pointers within the task structure
|
|
|
|
* can be stale and must not be dereferenced.
|
|
|
|
*/
|
|
|
|
static inline int pid_alive(struct task_struct *p)
|
|
|
|
{
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an independent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition, giving struct pid an independent life makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 05:31:42 -05:00
|
|
|
return p->pids[PIDTYPE_PID].pid != NULL;
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
extern void free_task(struct task_struct *tsk);
|
|
|
|
#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
|
2006-01-08 04:01:37 -05:00
|
|
|
|
2006-03-31 05:31:34 -05:00
|
|
|
extern void __put_task_struct(struct task_struct *t);
|
2006-01-08 04:01:37 -05:00
|
|
|
|
|
|
|
static inline void put_task_struct(struct task_struct *t)
|
|
|
|
{
|
|
|
|
if (atomic_dec_and_test(&t->usage))
|
2006-03-31 05:31:37 -05:00
|
|
|
__put_task_struct(t);
|
2006-01-08 04:01:37 -05:00
|
|
|
}
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Per process flags
|
|
|
|
*/
|
|
|
|
#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */
|
|
|
|
/* Not implemented yet, only for 486*/
|
|
|
|
#define PF_STARTING 0x00000002 /* being created */
|
|
|
|
#define PF_EXITING 0x00000004 /* getting shut down */
|
|
|
|
#define PF_DEAD 0x00000008 /* Dead */
|
|
|
|
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
|
|
|
|
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
|
|
|
|
#define PF_DUMPCORE 0x00000200 /* dumped core */
|
|
|
|
#define PF_SIGNALED 0x00000400 /* killed by a signal */
|
|
|
|
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
|
|
|
|
#define PF_FLUSHER 0x00001000 /* responsible for disk writeback */
|
|
|
|
#define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
|
|
|
|
#define PF_FREEZE 0x00004000 /* this task is being frozen for suspend now */
|
|
|
|
#define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
|
|
|
|
#define PF_FROZEN 0x00010000 /* frozen for system suspend */
|
|
|
|
#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
|
|
|
|
#define PF_KSWAPD 0x00040000 /* I am kswapd */
|
|
|
|
#define PF_SWAPOFF 0x00080000 /* I am in swapoff */
|
|
|
|
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
|
2006-06-13 02:26:10 -04:00
|
|
|
#define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */
|
|
|
|
#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
|
|
|
|
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
|
|
|
|
#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
|
|
|
|
#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
|
2006-03-24 06:16:08 -05:00
|
|
|
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Only the _current_ task can read/write to tsk->flags, but other
|
|
|
|
* tasks can access tsk->flags in readonly mode for example
|
|
|
|
* with tsk_used_math (like during threaded core dumping).
|
|
|
|
* There is however an exception to this rule during ptrace
|
|
|
|
* or during fork: the ptracer task is allowed to write to the
|
|
|
|
* child->flags of its traced child (same goes for fork, the parent
|
|
|
|
* can write to the child->flags), because we're guaranteed the
|
|
|
|
* child is not running and in turn not changing child->flags
|
|
|
|
* at the same time the parent does it.
|
|
|
|
*/
|
|
|
|
#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
|
|
|
|
#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
|
|
|
|
#define clear_used_math() clear_stopped_child_used_math(current)
|
|
|
|
#define set_used_math() set_stopped_child_used_math(current)
|
|
|
|
#define conditional_stopped_child_used_math(condition, child) \
|
|
|
|
do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
|
|
|
|
#define conditional_used_math(condition) \
|
|
|
|
conditional_stopped_child_used_math(condition, current)
|
|
|
|
#define copy_to_stopped_child_used_math(child) \
|
|
|
|
do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
|
|
|
|
/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
|
|
|
|
#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
|
|
|
|
#define used_math() tsk_used_math(current)
|
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
|
|
|
|
#else
|
|
|
|
/*
 * !CONFIG_SMP variant: nothing is changed on @p.  Returns 0 when
 * cpu_isset(0, new_mask) is true and -EINVAL otherwise.
 */
static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask)
{
	return cpu_isset(0, new_mask) ? 0 : -EINVAL;
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
extern unsigned long long sched_clock(void);
|
|
|
|
extern unsigned long long current_sched_time(const task_t *current_task);
|
|
|
|
|
|
|
|
/* sched_exec is called by processes performing an exec */
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
extern void sched_exec(void);
|
|
|
|
#else
|
|
|
|
/*
 * !CONFIG_SMP: no-op.  Expands to do { } while (0) so that
 * "sched_exec();" is exactly one statement and stays valid as the
 * body of an if/else.
 */
#define sched_exec() do { } while (0)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
|
|
|
extern void idle_task_exit(void);
|
|
|
|
#else
|
|
|
|
static inline void idle_task_exit(void) {}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
extern void sched_idle_next(void);
|
2006-06-27 05:54:51 -04:00
|
|
|
|
|
|
|
#ifdef CONFIG_RT_MUTEXES
|
|
|
|
extern int rt_mutex_getprio(task_t *p);
|
|
|
|
extern void rt_mutex_setprio(task_t *p, int prio);
|
|
|
|
#else
|
|
|
|
static inline int rt_mutex_getprio(task_t *p)
|
|
|
|
{
|
|
|
|
return p->normal_prio;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
extern void set_user_nice(task_t *p, long nice);
|
|
|
|
extern int task_prio(const task_t *p);
|
|
|
|
extern int task_nice(const task_t *p);
|
2005-05-01 11:59:00 -04:00
|
|
|
extern int can_nice(const task_t *p, const int nice);
|
2005-04-16 18:20:36 -04:00
|
|
|
extern int task_curr(const task_t *p);
|
|
|
|
extern int idle_cpu(int cpu);
|
|
|
|
extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
|
|
|
|
extern task_t *idle_task(int cpu);
|
2005-09-11 03:19:06 -04:00
|
|
|
extern task_t *curr_task(int cpu);
|
|
|
|
extern void set_curr_task(int cpu, task_t *p);
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
void yield(void);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The default (Linux) execution domain.
|
|
|
|
*/
|
|
|
|
extern struct exec_domain default_exec_domain;
|
|
|
|
|
|
|
|
union thread_union {
|
|
|
|
struct thread_info thread_info;
|
|
|
|
unsigned long stack[THREAD_SIZE/sizeof(long)];
|
|
|
|
};
|
|
|
|
|
|
|
|
#ifndef __HAVE_ARCH_KSTACK_END
|
|
|
|
static inline int kstack_end(void *addr)
|
|
|
|
{
|
|
|
|
/* Reliable end of stack detection:
|
|
|
|
* Some APM bios versions misalign the stack
|
|
|
|
*/
|
|
|
|
return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
extern union thread_union init_thread_union;
|
|
|
|
extern struct task_struct init_task;
|
|
|
|
|
|
|
|
extern struct mm_struct init_mm;
|
|
|
|
|
|
|
|
#define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr)
|
|
|
|
extern struct task_struct *find_task_by_pid_type(int type, int pid);
|
|
|
|
extern void set_special_pids(pid_t session, pid_t pgrp);
|
|
|
|
extern void __set_special_pids(pid_t session, pid_t pgrp);
|
|
|
|
|
|
|
|
/* per-UID process charging. */
|
|
|
|
extern struct user_struct * alloc_uid(uid_t);
|
|
|
|
static inline struct user_struct *get_uid(struct user_struct *u)
|
|
|
|
{
|
|
|
|
atomic_inc(&u->__count);
|
|
|
|
return u;
|
|
|
|
}
|
|
|
|
extern void free_uid(struct user_struct *);
|
|
|
|
extern void switch_uid(struct user_struct *);
|
|
|
|
|
|
|
|
#include <asm/current.h>
|
|
|
|
|
|
|
|
extern void do_timer(struct pt_regs *);
|
|
|
|
|
|
|
|
extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
|
|
|
|
extern int FASTCALL(wake_up_process(struct task_struct * tsk));
|
|
|
|
extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
|
|
|
|
unsigned long clone_flags));
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
extern void kick_process(struct task_struct *tsk);
|
|
|
|
#else
|
|
|
|
static inline void kick_process(struct task_struct *tsk) { }
|
|
|
|
#endif
|
2005-06-25 17:57:29 -04:00
|
|
|
extern void FASTCALL(sched_fork(task_t * p, int clone_flags));
|
2005-04-16 18:20:36 -04:00
|
|
|
extern void FASTCALL(sched_exit(task_t * p));
|
|
|
|
|
|
|
|
extern int in_group_p(gid_t);
|
|
|
|
extern int in_egroup_p(gid_t);
|
|
|
|
|
|
|
|
extern void proc_caches_init(void);
|
|
|
|
extern void flush_signals(struct task_struct *);
|
|
|
|
extern void flush_signal_handlers(struct task_struct *, int force_default);
|
|
|
|
extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
|
|
|
|
|
|
|
|
/*
 * Locked wrapper around dequeue_signal(): holds tsk->sighand->siglock
 * (taken with spin_lock_irqsave()) across the call and returns
 * dequeue_signal()'s result unchanged.
 */
static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
{
	unsigned long irq_state;
	int result;

	spin_lock_irqsave(&tsk->sighand->siglock, irq_state);
	result = dequeue_signal(tsk, mask, info);
	spin_unlock_irqrestore(&tsk->sighand->siglock, irq_state);

	return result;
}
|
|
|
|
|
|
|
|
extern void block_all_signals(int (*notifier)(void *priv), void *priv,
|
|
|
|
sigset_t *mask);
|
|
|
|
extern void unblock_all_signals(void);
|
|
|
|
extern void release_task(struct task_struct * p);
|
|
|
|
extern int send_sig_info(int, struct siginfo *, struct task_struct *);
|
|
|
|
extern int send_group_sig_info(int, struct siginfo *, struct task_struct *);
|
|
|
|
extern int force_sigsegv(int, struct task_struct *);
|
|
|
|
extern int force_sig_info(int, struct siginfo *, struct task_struct *);
|
|
|
|
extern int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp);
|
|
|
|
extern int kill_pg_info(int, struct siginfo *, pid_t);
|
|
|
|
extern int kill_proc_info(int, struct siginfo *, pid_t);
|
2005-10-10 13:44:29 -04:00
|
|
|
extern int kill_proc_info_as_uid(int, struct siginfo *, pid_t, uid_t, uid_t);
|
2005-04-16 18:20:36 -04:00
|
|
|
extern void do_notify_parent(struct task_struct *, int);
|
|
|
|
extern void force_sig(int, struct task_struct *);
|
|
|
|
extern void force_sig_specific(int, struct task_struct *);
|
|
|
|
extern int send_sig(int, struct task_struct *, int);
|
|
|
|
extern void zap_other_threads(struct task_struct *p);
|
|
|
|
extern int kill_pg(pid_t, int, int);
|
|
|
|
extern int kill_proc(pid_t, int, int);
|
|
|
|
extern struct sigqueue *sigqueue_alloc(void);
|
|
|
|
extern void sigqueue_free(struct sigqueue *);
|
|
|
|
extern int send_sigqueue(int, struct sigqueue *, struct task_struct *);
|
|
|
|
extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *);
|
2006-02-09 14:41:50 -05:00
|
|
|
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
|
2005-04-16 18:20:36 -04:00
|
|
|
extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
|
|
|
|
|
|
|
|
/* These can be the second arg to send_sig_info/send_group_sig_info. */
|
|
|
|
#define SEND_SIG_NOINFO ((struct siginfo *) 0)
|
|
|
|
#define SEND_SIG_PRIV ((struct siginfo *) 1)
|
|
|
|
#define SEND_SIG_FORCED ((struct siginfo *) 2)
|
|
|
|
|
2005-10-30 18:03:45 -05:00
|
|
|
static inline int is_si_special(const struct siginfo *info)
|
|
|
|
{
|
|
|
|
return info <= SEND_SIG_FORCED;
|
|
|
|
}
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
/* True if we are on the alternate signal stack. */
|
|
|
|
|
|
|
|
static inline int on_sig_stack(unsigned long sp)
|
|
|
|
{
|
|
|
|
return (sp - current->sas_ss_sp < current->sas_ss_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int sas_ss_flags(unsigned long sp)
|
|
|
|
{
|
|
|
|
return (current->sas_ss_size == 0 ? SS_DISABLE
|
|
|
|
: on_sig_stack(sp) ? SS_ONSTACK : 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Routines for handling mm_structs
|
|
|
|
*/
|
|
|
|
extern struct mm_struct * mm_alloc(void);
|
|
|
|
|
|
|
|
/* mmdrop drops the mm and the page tables */
|
|
|
|
extern void FASTCALL(__mmdrop(struct mm_struct *));
|
|
|
|
static inline void mmdrop(struct mm_struct * mm)
|
|
|
|
{
|
|
|
|
if (atomic_dec_and_test(&mm->mm_count))
|
|
|
|
__mmdrop(mm);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* mmput gets rid of the mappings and all user-space */
|
|
|
|
extern void mmput(struct mm_struct *);
|
|
|
|
/* Grab a reference to a task's mm, if it is not already going away */
|
|
|
|
extern struct mm_struct *get_task_mm(struct task_struct *task);
|
|
|
|
/* Remove the current task's stale references to the old mm_struct */
|
|
|
|
extern void mm_release(struct task_struct *, struct mm_struct *);
|
|
|
|
|
|
|
|
extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
|
|
|
|
extern void flush_thread(void);
|
|
|
|
extern void exit_thread(void);
|
|
|
|
|
|
|
|
extern void exit_files(struct task_struct *);
|
2006-03-28 19:11:16 -05:00
|
|
|
extern void __cleanup_signal(struct signal_struct *);
|
2006-03-28 19:11:27 -05:00
|
|
|
extern void __cleanup_sighand(struct sighand_struct *);
|
2005-04-16 18:20:36 -04:00
|
|
|
extern void exit_itimers(struct signal_struct *);
|
|
|
|
|
|
|
|
extern NORET_TYPE void do_group_exit(int);
|
|
|
|
|
|
|
|
extern void daemonize(const char *, ...);
|
|
|
|
extern int allow_signal(int);
|
|
|
|
extern int disallow_signal(int);
|
|
|
|
extern task_t *child_reaper;
|
|
|
|
|
|
|
|
extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
|
|
|
|
extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
|
|
|
|
task_t *fork_idle(int);
|
|
|
|
|
|
|
|
extern void set_task_comm(struct task_struct *tsk, char *from);
|
|
|
|
extern void get_task_comm(char *to, struct task_struct *tsk);
|
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
extern void wait_task_inactive(task_t * p);
|
|
|
|
#else
|
|
|
|
#define wait_task_inactive(p) do { } while (0)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define remove_parent(p) list_del_init(&(p)->sibling)
|
2006-03-28 19:11:05 -05:00
|
|
|
#define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children)
|
2005-04-16 18:20:36 -04:00
|
|
|
|
2006-04-19 01:20:16 -04:00
|
|
|
#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
#define for_each_process(p) \
|
|
|
|
for (p = &init_task ; (p = next_task(p)) != &init_task ; )
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Careful: do_each_thread/while_each_thread is a double loop so
|
|
|
|
* 'break' will not work as expected - use goto instead.
|
|
|
|
*/
|
|
|
|
#define do_each_thread(g, t) \
|
|
|
|
for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
|
|
|
|
|
|
|
|
#define while_each_thread(g, t) \
|
|
|
|
while ((t = next_thread(t)) != g)
|
|
|
|
|
2006-04-10 19:16:49 -04:00
|
|
|
/* de_thread depends on thread_group_leader not being a pid based check */
|
|
|
|
/* True when p is its own group_leader; p is parenthesized so any pointer expression may be passed. */
#define thread_group_leader(p) ((p) == (p)->group_leader)
|
2005-04-16 18:20:36 -04:00
|
|
|
|
2006-04-11 01:54:07 -04:00
|
|
|
static inline task_t *next_thread(const task_t *p)
|
2006-03-28 19:11:25 -05:00
|
|
|
{
|
|
|
|
return list_entry(rcu_dereference(p->thread_group.next),
|
|
|
|
task_t, thread_group);
|
|
|
|
}
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
static inline int thread_group_empty(task_t *p)
|
|
|
|
{
|
2006-03-28 19:11:25 -05:00
|
|
|
return list_empty(&p->thread_group);
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
#define delay_group_leader(p) \
|
|
|
|
(thread_group_leader(p) && !thread_group_empty(p))
|
|
|
|
|
|
|
|
/*
|
2006-06-23 05:05:18 -04:00
|
|
|
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
|
2005-06-27 04:55:12 -04:00
|
|
|
* subscriptions and synchronises with wait4(). Also used in procfs. Also
|
[PATCH] cpusets: dual semaphore locking overhaul
Overhaul cpuset locking. Replace single semaphore with two semaphores.
The suggestion to use two locks was made by Roman Zippel.
Both locks are global. Code that wants to modify cpusets must first
acquire the exclusive manage_sem, which allows them read-only access to
cpusets, and holds off other would-be modifiers. Before making actual
changes, the second semaphore, callback_sem must be acquired as well. Code
that needs only to query cpusets must acquire callback_sem, which is also a
global exclusive lock.
The earlier problems with double tripping are avoided, because it is
allowed for holders of manage_sem to nest the second callback_sem lock, and
only callback_sem is needed by code called from within __alloc_pages(),
where the double tripping had been possible.
This is not quite the same as a normal read/write semaphore, because
obtaining read-only access with intent to change must hold off other such
attempts, while allowing read-only access w/o such intention. Changing
cpusets involves several related checks and changes, which must be done
while allowing read-only queries (to avoid the double trip), but while
ensuring nothing changes (holding off other would-be modifiers).
This overhaul of cpuset locking also makes careful use of task_lock() to
guard access to the task->cpuset pointer, closing a couple of race
conditions noticed while reading this code (thanks, Roman). I've never
seen these races fail in any use or test.
See further the comments in the code.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 18:02:30 -05:00
|
|
|
* pins the final release of task.io_context. Also protects ->cpuset.
|
2005-04-16 18:20:36 -04:00
|
|
|
*
|
|
|
|
* Nests both inside and outside of read_lock(&tasklist_lock).
|
|
|
|
* It must not be nested with write_lock_irq(&tasklist_lock),
|
|
|
|
* neither inside nor outside.
|
|
|
|
*/
|
|
|
|
static inline void task_lock(struct task_struct *p)
|
|
|
|
{
|
|
|
|
spin_lock(&p->alloc_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void task_unlock(struct task_struct *p)
|
|
|
|
{
|
|
|
|
spin_unlock(&p->alloc_lock);
|
|
|
|
}
|
|
|
|
|
2006-03-28 19:11:13 -05:00
|
|
|
extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
|
|
|
|
unsigned long *flags);
|
|
|
|
|
|
|
|
static inline void unlock_task_sighand(struct task_struct *tsk,
|
|
|
|
unsigned long *flags)
|
|
|
|
{
|
|
|
|
spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
|
|
|
|
}
|
|
|
|
|
2005-11-13 19:06:57 -05:00
|
|
|
#ifndef __HAVE_THREAD_FUNCTIONS
|
|
|
|
|
2005-11-13 19:06:55 -05:00
|
|
|
#define task_thread_info(task) (task)->thread_info
|
2006-01-12 04:05:34 -05:00
|
|
|
#define task_stack_page(task) ((void*)((task)->thread_info))
|
2005-11-13 19:06:55 -05:00
|
|
|
|
2005-11-13 19:06:56 -05:00
|
|
|
static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
|
|
|
|
{
|
|
|
|
*task_thread_info(p) = *task_thread_info(org);
|
|
|
|
task_thread_info(p)->task = p;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long *end_of_stack(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return (unsigned long *)(p->thread_info + 1);
|
|
|
|
}
|
|
|
|
|
2005-11-13 19:06:57 -05:00
|
|
|
#endif
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
/* set thread flags in other task's structures
|
|
|
|
* - see asm/thread_info.h for TIF_xxxx flags available
|
|
|
|
*/
|
|
|
|
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
|
|
|
|
{
|
2005-11-13 19:06:55 -05:00
|
|
|
set_ti_thread_flag(task_thread_info(tsk), flag);
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
|
|
|
|
{
|
2005-11-13 19:06:55 -05:00
|
|
|
clear_ti_thread_flag(task_thread_info(tsk), flag);
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
|
|
|
|
{
|
2005-11-13 19:06:55 -05:00
|
|
|
return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
|
|
|
|
{
|
2005-11-13 19:06:55 -05:00
|
|
|
return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
|
|
|
|
{
|
2005-11-13 19:06:55 -05:00
|
|
|
return test_ti_thread_flag(task_thread_info(tsk), flag);
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void set_tsk_need_resched(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void clear_tsk_need_resched(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int signal_pending(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int need_resched(void)
|
|
|
|
{
|
|
|
|
return unlikely(test_thread_flag(TIF_NEED_RESCHED));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* cond_resched() and cond_resched_lock(): latency reduction via
|
|
|
|
* explicit rescheduling in places that are safe. The return
|
|
|
|
* value indicates whether a reschedule was done in fact.
|
|
|
|
* cond_resched_lock() will drop the spinlock before scheduling,
|
|
|
|
* cond_resched_softirq() will enable bhs before scheduling.
|
|
|
|
*/
|
|
|
|
extern int cond_resched(void);
|
|
|
|
extern int cond_resched_lock(spinlock_t * lock);
|
|
|
|
extern int cond_resched_softirq(void);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Does a critical section need to be broken due to another
|
|
|
|
* task waiting?:
|
|
|
|
*/
|
|
|
|
#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
|
|
|
|
# define need_lockbreak(lock) ((lock)->break_lock)
|
|
|
|
#else
|
|
|
|
# define need_lockbreak(lock) 0
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Does a critical section need to be broken due to another
|
|
|
|
* task waiting or preemption being signalled:
|
|
|
|
*/
|
|
|
|
/*
 * Returns 1 if either need_lockbreak(lock) or need_resched() is
 * nonzero, 0 otherwise.
 */
static inline int lock_need_resched(spinlock_t *lock)
{
	return need_lockbreak(lock) || need_resched();
}
|
|
|
|
|
|
|
|
/* Reevaluate whether the task has signals pending delivery.
|
|
|
|
This is required every time the blocked sigset_t changes.
|
|
|
|
callers must hold sighand->siglock. */
|
|
|
|
|
|
|
|
extern FASTCALL(void recalc_sigpending_tsk(struct task_struct *t));
|
|
|
|
extern void recalc_sigpending(void);
|
|
|
|
|
|
|
|
extern void signal_wake_up(struct task_struct *t, int resume_stopped);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wrappers for p->thread_info->cpu access. No-op on UP.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
|
|
|
static inline unsigned int task_cpu(const struct task_struct *p)
|
|
|
|
{
|
2005-11-13 19:06:55 -05:00
|
|
|
return task_thread_info(p)->cpu;
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
|
|
|
|
{
|
2005-11-13 19:06:55 -05:00
|
|
|
task_thread_info(p)->cpu = cpu;
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
static inline unsigned int task_cpu(const struct task_struct *p)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
|
|
|
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
|
|
|
|
extern void arch_pick_mmap_layout(struct mm_struct *mm);
|
|
|
|
#else
|
|
|
|
static inline void arch_pick_mmap_layout(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
mm->mmap_base = TASK_UNMAPPED_BASE;
|
|
|
|
mm->get_unmapped_area = arch_get_unmapped_area;
|
|
|
|
mm->unmap_area = arch_unmap_area;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
|
|
|
|
extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
|
|
|
|
|
2006-06-27 05:54:42 -04:00
|
|
|
#include <linux/sysdev.h>
|
|
|
|
extern int sched_mc_power_savings, sched_smt_power_savings;
|
|
|
|
extern struct sysdev_attribute attr_sched_mc_power_savings, attr_sched_smt_power_savings;
|
|
|
|
extern int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls);
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
extern void normalize_rt_tasks(void);
|
|
|
|
|
|
|
|
#ifdef CONFIG_PM
|
2005-06-25 02:13:50 -04:00
|
|
|
/*
|
|
|
|
* Check if a process has been frozen
|
|
|
|
*/
|
|
|
|
static inline int frozen(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return p->flags & PF_FROZEN;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if there is a request to freeze a process
|
|
|
|
*/
|
|
|
|
static inline int freezing(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return p->flags & PF_FREEZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Request that a process be frozen
|
|
|
|
* FIXME: SMP problem. We may not modify other process' flags!
|
|
|
|
*/
|
|
|
|
static inline void freeze(struct task_struct *p)
|
|
|
|
{
|
|
|
|
p->flags |= PF_FREEZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wake up a frozen process
|
|
|
|
*/
|
|
|
|
static inline int thaw_process(struct task_struct *p)
|
|
|
|
{
|
|
|
|
if (frozen(p)) {
|
|
|
|
p->flags &= ~PF_FROZEN;
|
|
|
|
wake_up_process(p);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* freezing is complete, mark process as frozen
|
|
|
|
*/
|
|
|
|
static inline void frozen_process(struct task_struct *p)
|
|
|
|
{
|
|
|
|
p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN;
|
|
|
|
}
|
|
|
|
|
|
|
|
extern void refrigerator(void);
|
2005-04-16 18:20:36 -04:00
|
|
|
extern int freeze_processes(void);
|
|
|
|
extern void thaw_processes(void);
|
|
|
|
|
2005-06-25 02:13:50 -04:00
|
|
|
static inline int try_to_freeze(void)
|
2005-04-16 18:20:36 -04:00
|
|
|
{
|
2005-06-25 02:13:50 -04:00
|
|
|
if (freezing(current)) {
|
|
|
|
refrigerator();
|
2005-04-16 18:20:36 -04:00
|
|
|
return 1;
|
|
|
|
} else
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#else
|
2005-06-25 02:13:50 -04:00
|
|
|
static inline int frozen(struct task_struct *p) { return 0; }
|
|
|
|
static inline int freezing(struct task_struct *p) { return 0; }
|
|
|
|
static inline void freeze(struct task_struct *p) { BUG(); }
|
|
|
|
static inline int thaw_process(struct task_struct *p) { return 1; }
|
|
|
|
static inline void frozen_process(struct task_struct *p) { BUG(); }
|
|
|
|
|
|
|
|
static inline void refrigerator(void) {}
|
2005-04-16 18:20:36 -04:00
|
|
|
static inline int freeze_processes(void) { BUG(); return 0; }
|
|
|
|
static inline void thaw_processes(void) {}
|
|
|
|
|
2005-06-25 02:13:50 -04:00
|
|
|
static inline int try_to_freeze(void) { return 0; }
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
#endif /* CONFIG_PM */
|
|
|
|
#endif /* __KERNEL__ */
|
|
|
|
|
|
|
|
#endif
|