64c7c8f885
Make some changes to the NEED_RESCHED and POLLING_NRFLAG flags to reduce confusion, and make their semantics rigid. Improves efficiency of resched_task and some cpu_idle routines. * In resched_task: - TIF_NEED_RESCHED is only cleared with the task's runqueue lock held, and as we hold it during resched_task, there is no need for an atomic test and set there. The only other time this should be set is when the task's quantum expires, in the timer interrupt - this is protected against because the rq lock is irq-safe. - If TIF_NEED_RESCHED is set, then we don't need to do anything. It won't get unset until the task gets schedule()d off. - If we are running on the same CPU as the task we resched, then set TIF_NEED_RESCHED and no further action is required. - If we are running on another CPU, and TIF_POLLING_NRFLAG is *not* set after TIF_NEED_RESCHED has been set, then we need to send an IPI. Using these rules, we are able to remove the test and set operation in resched_task, and make clear the previously vague semantics of POLLING_NRFLAG. * In idle routines: - Enter cpu_idle with preempt disabled. When the need_resched() condition becomes true, explicitly call schedule(). This makes things a bit clearer (IMO), but I haven't updated all architectures yet. - Many do a test and clear of TIF_NEED_RESCHED for some reason. According to the resched_task rules, this isn't needed (and actually breaks the assumption that TIF_NEED_RESCHED is only cleared with the runqueue lock held). So remove that. Generally one less locked memory op when switching to the idle thread. - Many idle routines clear TIF_POLLING_NRFLAG, and only set it in the innermost polling idle loops. The above resched_task semantics allow it to be set until before the last time need_resched() is checked before going into a halt requiring interrupt wakeup.
Many idle routines simply never enter such a halt, and so POLLING_NRFLAG can be always left set, completely eliminating resched IPIs when rescheduling the idle task. POLLING_NRFLAG width can be increased, to reduce the chance of resched IPIs. Signed-off-by: Nick Piggin <npiggin@suse.de> Cc: Ingo Molnar <mingo@elte.hu> Cc: Con Kolivas <kernel@kolivas.org> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
526 lines
13 KiB
C
526 lines
13 KiB
C
/*
|
|
* linux/arch/alpha/kernel/process.c
|
|
*
|
|
* Copyright (C) 1995 Linus Torvalds
|
|
*/
|
|
|
|
/*
|
|
* This file handles the architecture-dependent parts of process handling.
|
|
*/
|
|
|
|
#include <linux/config.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/module.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/smp.h>
|
|
#include <linux/smp_lock.h>
|
|
#include <linux/stddef.h>
|
|
#include <linux/unistd.h>
|
|
#include <linux/ptrace.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/user.h>
|
|
#include <linux/a.out.h>
|
|
#include <linux/utsname.h>
|
|
#include <linux/time.h>
|
|
#include <linux/major.h>
|
|
#include <linux/stat.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/elfcore.h>
|
|
#include <linux/reboot.h>
|
|
#include <linux/tty.h>
|
|
#include <linux/console.h>
|
|
|
|
#include <asm/reg.h>
|
|
#include <asm/uaccess.h>
|
|
#include <asm/system.h>
|
|
#include <asm/io.h>
|
|
#include <asm/pgtable.h>
|
|
#include <asm/hwrpb.h>
|
|
#include <asm/fpu.h>
|
|
|
|
#include "proto.h"
|
|
#include "pci_impl.h"
|
|
|
|
void
|
|
cpu_idle(void)
|
|
{
|
|
set_thread_flag(TIF_POLLING_NRFLAG);
|
|
|
|
while (1) {
|
|
/* FIXME -- EV6 and LCA45 know how to power down
|
|
the CPU. */
|
|
|
|
while (!need_resched())
|
|
cpu_relax();
|
|
schedule();
|
|
}
|
|
}
|
|
|
|
|
|
/* Arguments handed to common_shutdown_1() on every CPU.  */
struct halt_info {
	/* One of the LINUX_REBOOT_CMD_* values.  */
	int mode;
	/* Restart command string; NULL when none was given.  */
	char *restart_cmd;
};
|
|
|
|
static void
|
|
common_shutdown_1(void *generic_ptr)
|
|
{
|
|
struct halt_info *how = (struct halt_info *)generic_ptr;
|
|
struct percpu_struct *cpup;
|
|
unsigned long *pflags, flags;
|
|
int cpuid = smp_processor_id();
|
|
|
|
/* No point in taking interrupts anymore. */
|
|
local_irq_disable();
|
|
|
|
cpup = (struct percpu_struct *)
|
|
((unsigned long)hwrpb + hwrpb->processor_offset
|
|
+ hwrpb->processor_size * cpuid);
|
|
pflags = &cpup->flags;
|
|
flags = *pflags;
|
|
|
|
/* Clear reason to "default"; clear "bootstrap in progress". */
|
|
flags &= ~0x00ff0001UL;
|
|
|
|
#ifdef CONFIG_SMP
|
|
/* Secondaries halt here. */
|
|
if (cpuid != boot_cpuid) {
|
|
flags |= 0x00040000UL; /* "remain halted" */
|
|
*pflags = flags;
|
|
clear_bit(cpuid, &cpu_present_mask);
|
|
halt();
|
|
}
|
|
#endif
|
|
|
|
if (how->mode == LINUX_REBOOT_CMD_RESTART) {
|
|
if (!how->restart_cmd) {
|
|
flags |= 0x00020000UL; /* "cold bootstrap" */
|
|
} else {
|
|
/* For SRM, we could probably set environment
|
|
variables to get this to work. We'd have to
|
|
delay this until after srm_paging_stop unless
|
|
we ever got srm_fixup working.
|
|
|
|
At the moment, SRM will use the last boot device,
|
|
but the file and flags will be the defaults, when
|
|
doing a "warm" bootstrap. */
|
|
flags |= 0x00030000UL; /* "warm bootstrap" */
|
|
}
|
|
} else {
|
|
flags |= 0x00040000UL; /* "remain halted" */
|
|
}
|
|
*pflags = flags;
|
|
|
|
#ifdef CONFIG_SMP
|
|
/* Wait for the secondaries to halt. */
|
|
cpu_clear(boot_cpuid, cpu_possible_map);
|
|
while (cpus_weight(cpu_possible_map))
|
|
barrier();
|
|
#endif
|
|
|
|
/* If booted from SRM, reset some of the original environment. */
|
|
if (alpha_using_srm) {
|
|
#ifdef CONFIG_DUMMY_CONSOLE
|
|
/* If we've gotten here after SysRq-b, leave interrupt
|
|
context before taking over the console. */
|
|
if (in_interrupt())
|
|
irq_exit();
|
|
/* This has the effect of resetting the VGA video origin. */
|
|
take_over_console(&dummy_con, 0, MAX_NR_CONSOLES-1, 1);
|
|
#endif
|
|
pci_restore_srm_config();
|
|
set_hae(srm_hae);
|
|
}
|
|
|
|
if (alpha_mv.kill_arch)
|
|
alpha_mv.kill_arch(how->mode);
|
|
|
|
if (! alpha_using_srm && how->mode != LINUX_REBOOT_CMD_RESTART) {
|
|
/* Unfortunately, since MILO doesn't currently understand
|
|
the hwrpb bits above, we can't reliably halt the
|
|
processor and keep it halted. So just loop. */
|
|
return;
|
|
}
|
|
|
|
if (alpha_using_srm)
|
|
srm_paging_stop();
|
|
|
|
halt();
|
|
}
|
|
|
|
static void
|
|
common_shutdown(int mode, char *restart_cmd)
|
|
{
|
|
struct halt_info args;
|
|
args.mode = mode;
|
|
args.restart_cmd = restart_cmd;
|
|
on_each_cpu(common_shutdown_1, &args, 1, 0);
|
|
}
|
|
|
|
void
|
|
machine_restart(char *restart_cmd)
|
|
{
|
|
common_shutdown(LINUX_REBOOT_CMD_RESTART, restart_cmd);
|
|
}
|
|
|
|
|
|
void
|
|
machine_halt(void)
|
|
{
|
|
common_shutdown(LINUX_REBOOT_CMD_HALT, NULL);
|
|
}
|
|
|
|
|
|
void
|
|
machine_power_off(void)
|
|
{
|
|
common_shutdown(LINUX_REBOOT_CMD_POWER_OFF, NULL);
|
|
}
|
|
|
|
|
|
/* Used by sysrq-p, among others. I don't believe r9-r15 are ever
|
|
saved in the context it's used. */
|
|
|
|
void
|
|
show_regs(struct pt_regs *regs)
|
|
{
|
|
dik_show_regs(regs, NULL);
|
|
}
|
|
|
|
/*
|
|
* Re-start a thread when doing execve()
|
|
*/
|
|
void
|
|
start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp)
|
|
{
|
|
set_fs(USER_DS);
|
|
regs->pc = pc;
|
|
regs->ps = 8;
|
|
wrusp(sp);
|
|
}
|
|
|
|
/*
 * Free current thread data structures etc..
 * There is nothing to do here, so the body is intentionally empty.
 */
void
exit_thread(void)
{
}
|
|
|
|
void
|
|
flush_thread(void)
|
|
{
|
|
/* Arrange for each exec'ed process to start off with a clean slate
|
|
with respect to the FPU. This is all exceptions disabled. */
|
|
current_thread_info()->ieee_state = 0;
|
|
wrfpcr(FPCR_DYN_NORMAL | ieee_swcr_to_fpcr(0));
|
|
|
|
/* Clean slate for TLS. */
|
|
current_thread_info()->pcb.unique = 0;
|
|
}
|
|
|
|
/* Hook for releasing a dead task; intentionally does nothing here.  */
void
release_thread(struct task_struct *dead_task)
{
}
|
|
|
|
/*
|
|
* "alpha_clone()".. By the time we get here, the
|
|
* non-volatile registers have also been saved on the
|
|
* stack. We do some ugly pointer stuff here.. (see
|
|
* also copy_thread)
|
|
*
|
|
* Notice that "fork()" is implemented in terms of clone,
|
|
* with parameters (SIGCHLD, 0).
|
|
*/
|
|
int
|
|
alpha_clone(unsigned long clone_flags, unsigned long usp,
|
|
int __user *parent_tid, int __user *child_tid,
|
|
unsigned long tls_value, struct pt_regs *regs)
|
|
{
|
|
if (!usp)
|
|
usp = rdusp();
|
|
|
|
return do_fork(clone_flags, usp, regs, 0, parent_tid, child_tid);
|
|
}
|
|
|
|
int
|
|
alpha_vfork(struct pt_regs *regs)
|
|
{
|
|
return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, rdusp(),
|
|
regs, 0, NULL, NULL);
|
|
}
|
|
|
|
/*
|
|
* Copy an alpha thread..
|
|
*
|
|
* Note the "stack_offset" stuff: when returning to kernel mode, we need
|
|
* to have some extra stack-space for the kernel stack that still exists
|
|
* after the "ret_from_fork". When returning to user mode, we only want
|
|
* the space needed by the syscall stack frame (ie "struct pt_regs").
|
|
* Use the passed "regs" pointer to determine how much space we need
|
|
* for a kernel fork().
|
|
*/
|
|
|
|
int
|
|
copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
|
|
unsigned long unused,
|
|
struct task_struct * p, struct pt_regs * regs)
|
|
{
|
|
extern void ret_from_fork(void);
|
|
|
|
struct thread_info *childti = p->thread_info;
|
|
struct pt_regs * childregs;
|
|
struct switch_stack * childstack, *stack;
|
|
unsigned long stack_offset, settls;
|
|
|
|
stack_offset = PAGE_SIZE - sizeof(struct pt_regs);
|
|
if (!(regs->ps & 8))
|
|
stack_offset = (PAGE_SIZE-1) & (unsigned long) regs;
|
|
childregs = (struct pt_regs *)
|
|
(stack_offset + PAGE_SIZE + (long) childti);
|
|
|
|
*childregs = *regs;
|
|
settls = regs->r20;
|
|
childregs->r0 = 0;
|
|
childregs->r19 = 0;
|
|
childregs->r20 = 1; /* OSF/1 has some strange fork() semantics. */
|
|
regs->r20 = 0;
|
|
stack = ((struct switch_stack *) regs) - 1;
|
|
childstack = ((struct switch_stack *) childregs) - 1;
|
|
*childstack = *stack;
|
|
childstack->r26 = (unsigned long) ret_from_fork;
|
|
childti->pcb.usp = usp;
|
|
childti->pcb.ksp = (unsigned long) childstack;
|
|
childti->pcb.flags = 1; /* set FEN, clear everything else */
|
|
|
|
/* Set a new TLS for the child thread? Peek back into the
|
|
syscall arguments that we saved on syscall entry. Oops,
|
|
except we'd have clobbered it with the parent/child set
|
|
of r20. Read the saved copy. */
|
|
/* Note: if CLONE_SETTLS is not set, then we must inherit the
|
|
value from the parent, which will have been set by the block
|
|
copy in dup_task_struct. This is non-intuitive, but is
|
|
required for proper operation in the case of a threaded
|
|
application calling fork. */
|
|
if (clone_flags & CLONE_SETTLS)
|
|
childti->pcb.unique = settls;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Fill in the user structure for an ECOFF core dump.
|
|
*/
|
|
void
|
|
dump_thread(struct pt_regs * pt, struct user * dump)
|
|
{
|
|
/* switch stack follows right below pt_regs: */
|
|
struct switch_stack * sw = ((struct switch_stack *) pt) - 1;
|
|
|
|
dump->magic = CMAGIC;
|
|
dump->start_code = current->mm->start_code;
|
|
dump->start_data = current->mm->start_data;
|
|
dump->start_stack = rdusp() & ~(PAGE_SIZE - 1);
|
|
dump->u_tsize = ((current->mm->end_code - dump->start_code)
|
|
>> PAGE_SHIFT);
|
|
dump->u_dsize = ((current->mm->brk + PAGE_SIZE-1 - dump->start_data)
|
|
>> PAGE_SHIFT);
|
|
dump->u_ssize = (current->mm->start_stack - dump->start_stack
|
|
+ PAGE_SIZE-1) >> PAGE_SHIFT;
|
|
|
|
/*
|
|
* We store the registers in an order/format that is
|
|
* compatible with DEC Unix/OSF/1 as this makes life easier
|
|
* for gdb.
|
|
*/
|
|
dump->regs[EF_V0] = pt->r0;
|
|
dump->regs[EF_T0] = pt->r1;
|
|
dump->regs[EF_T1] = pt->r2;
|
|
dump->regs[EF_T2] = pt->r3;
|
|
dump->regs[EF_T3] = pt->r4;
|
|
dump->regs[EF_T4] = pt->r5;
|
|
dump->regs[EF_T5] = pt->r6;
|
|
dump->regs[EF_T6] = pt->r7;
|
|
dump->regs[EF_T7] = pt->r8;
|
|
dump->regs[EF_S0] = sw->r9;
|
|
dump->regs[EF_S1] = sw->r10;
|
|
dump->regs[EF_S2] = sw->r11;
|
|
dump->regs[EF_S3] = sw->r12;
|
|
dump->regs[EF_S4] = sw->r13;
|
|
dump->regs[EF_S5] = sw->r14;
|
|
dump->regs[EF_S6] = sw->r15;
|
|
dump->regs[EF_A3] = pt->r19;
|
|
dump->regs[EF_A4] = pt->r20;
|
|
dump->regs[EF_A5] = pt->r21;
|
|
dump->regs[EF_T8] = pt->r22;
|
|
dump->regs[EF_T9] = pt->r23;
|
|
dump->regs[EF_T10] = pt->r24;
|
|
dump->regs[EF_T11] = pt->r25;
|
|
dump->regs[EF_RA] = pt->r26;
|
|
dump->regs[EF_T12] = pt->r27;
|
|
dump->regs[EF_AT] = pt->r28;
|
|
dump->regs[EF_SP] = rdusp();
|
|
dump->regs[EF_PS] = pt->ps;
|
|
dump->regs[EF_PC] = pt->pc;
|
|
dump->regs[EF_GP] = pt->gp;
|
|
dump->regs[EF_A0] = pt->r16;
|
|
dump->regs[EF_A1] = pt->r17;
|
|
dump->regs[EF_A2] = pt->r18;
|
|
memcpy((char *)dump->regs + EF_SIZE, sw->fp, 32 * 8);
|
|
}
|
|
|
|
/*
|
|
* Fill in the user structure for a ELF core dump.
|
|
*/
|
|
void
|
|
dump_elf_thread(elf_greg_t *dest, struct pt_regs *pt, struct thread_info *ti)
|
|
{
|
|
/* switch stack follows right below pt_regs: */
|
|
struct switch_stack * sw = ((struct switch_stack *) pt) - 1;
|
|
|
|
dest[ 0] = pt->r0;
|
|
dest[ 1] = pt->r1;
|
|
dest[ 2] = pt->r2;
|
|
dest[ 3] = pt->r3;
|
|
dest[ 4] = pt->r4;
|
|
dest[ 5] = pt->r5;
|
|
dest[ 6] = pt->r6;
|
|
dest[ 7] = pt->r7;
|
|
dest[ 8] = pt->r8;
|
|
dest[ 9] = sw->r9;
|
|
dest[10] = sw->r10;
|
|
dest[11] = sw->r11;
|
|
dest[12] = sw->r12;
|
|
dest[13] = sw->r13;
|
|
dest[14] = sw->r14;
|
|
dest[15] = sw->r15;
|
|
dest[16] = pt->r16;
|
|
dest[17] = pt->r17;
|
|
dest[18] = pt->r18;
|
|
dest[19] = pt->r19;
|
|
dest[20] = pt->r20;
|
|
dest[21] = pt->r21;
|
|
dest[22] = pt->r22;
|
|
dest[23] = pt->r23;
|
|
dest[24] = pt->r24;
|
|
dest[25] = pt->r25;
|
|
dest[26] = pt->r26;
|
|
dest[27] = pt->r27;
|
|
dest[28] = pt->r28;
|
|
dest[29] = pt->gp;
|
|
dest[30] = rdusp();
|
|
dest[31] = pt->pc;
|
|
|
|
/* Once upon a time this was the PS value. Which is stupid
|
|
since that is always 8 for usermode. Usurped for the more
|
|
useful value of the thread's UNIQUE field. */
|
|
dest[32] = ti->pcb.unique;
|
|
}
|
|
|
|
int
|
|
dump_elf_task(elf_greg_t *dest, struct task_struct *task)
|
|
{
|
|
struct thread_info *ti;
|
|
struct pt_regs *pt;
|
|
|
|
ti = task->thread_info;
|
|
pt = (struct pt_regs *)((unsigned long)ti + 2*PAGE_SIZE) - 1;
|
|
|
|
dump_elf_thread(dest, pt, ti);
|
|
|
|
return 1;
|
|
}
|
|
|
|
int
|
|
dump_elf_task_fp(elf_fpreg_t *dest, struct task_struct *task)
|
|
{
|
|
struct thread_info *ti;
|
|
struct pt_regs *pt;
|
|
struct switch_stack *sw;
|
|
|
|
ti = task->thread_info;
|
|
pt = (struct pt_regs *)((unsigned long)ti + 2*PAGE_SIZE) - 1;
|
|
sw = (struct switch_stack *)pt - 1;
|
|
|
|
memcpy(dest, sw->fp, 32 * 8);
|
|
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* sys_execve() executes a new program.
|
|
*/
|
|
asmlinkage int
|
|
do_sys_execve(char __user *ufilename, char __user * __user *argv,
|
|
char __user * __user *envp, struct pt_regs *regs)
|
|
{
|
|
int error;
|
|
char *filename;
|
|
|
|
filename = getname(ufilename);
|
|
error = PTR_ERR(filename);
|
|
if (IS_ERR(filename))
|
|
goto out;
|
|
error = do_execve(filename, argv, envp, regs);
|
|
putname(filename);
|
|
out:
|
|
return error;
|
|
}
|
|
|
|
/*
|
|
* Return saved PC of a blocked thread. This assumes the frame
|
|
* pointer is the 6th saved long on the kernel stack and that the
|
|
* saved return address is the first long in the frame. This all
|
|
* holds provided the thread blocked through a call to schedule() ($15
|
|
* is the frame pointer in schedule() and $15 is saved at offset 48 by
|
|
* entry.S:do_switch_stack).
|
|
*
|
|
* Under heavy swap load I've seen this lose in an ugly way. So do
|
|
* some extra sanity checking on the ranges we expect these pointers
|
|
* to be in so that we can fail gracefully. This is just for ps after
|
|
* all. -- r~
|
|
*/
|
|
|
|
unsigned long
|
|
thread_saved_pc(task_t *t)
|
|
{
|
|
unsigned long base = (unsigned long)t->thread_info;
|
|
unsigned long fp, sp = t->thread_info->pcb.ksp;
|
|
|
|
if (sp > base && sp+6*8 < base + 16*1024) {
|
|
fp = ((unsigned long*)sp)[6];
|
|
if (fp > sp && fp < base + 16*1024)
|
|
return *(unsigned long *)fp;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
unsigned long
|
|
get_wchan(struct task_struct *p)
|
|
{
|
|
unsigned long schedule_frame;
|
|
unsigned long pc;
|
|
if (!p || p == current || p->state == TASK_RUNNING)
|
|
return 0;
|
|
/*
|
|
* This one depends on the frame size of schedule(). Do a
|
|
* "disass schedule" in gdb to find the frame size. Also, the
|
|
* code assumes that sleep_on() follows immediately after
|
|
* interruptible_sleep_on() and that add_timer() follows
|
|
* immediately after interruptible_sleep(). Ugly, isn't it?
|
|
* Maybe adding a wchan field to task_struct would be better,
|
|
* after all...
|
|
*/
|
|
|
|
pc = thread_saved_pc(p);
|
|
if (in_sched_functions(pc)) {
|
|
schedule_frame = ((unsigned long *)p->thread_info->pcb.ksp)[6];
|
|
return ((unsigned long *)schedule_frame)[12];
|
|
}
|
|
return pc;
|
|
}
|