fd781fa25c
When we get a notification that cpu topology changed, we schedule a work struct which just calls arch_reinit_sched_domains. This function in turn calls get_online_cpus() which results in the lockdep warning below. After all it turned out that it's not legal to call get_online_cpus() from the context of a multi-threaded work queue. It could deadlock this way: process 0 (events/cpu-x): -> run_workqueue -> removes my work_struct from the work queue -> calls work_struct->fn -> get_online_cpus() -> locks on cpu_hotplug.lock since process 1 below is doing cpu hotplug process 1: -> cpu_down (for cpu-x) -> cpu_hotplug_begin (holds cpu_hotplug.lock now) -> cpu-x dead -> notifier_call_chain with CPU_DEAD -> cleanup_workqueue_thread -> flush_cpu_workqueue (succeeds) -> kthread_stop for events/cpu-x -> now kthread_stop waits for my work_struct to complete from within process 0. -> dead. A single-threaded workqueue wouldn't have such problems, however there is no such common queue available and it's not worth creating one for the very rare calls to arch_reinit_sched_domains. So we just create a kernel thread from our work struct which calls arch_reinit_sched_domains and are done with it. Thanks to Oleg Nesterov and Peter Zijlstra for helping me figure out that this isn't a false positive lockdep warning: ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.25-03562-g3dc5063-dirty #12 ------------------------------------------------------- events/3/14 is trying to acquire lock: (&cpu_hotplug.lock){--..}, at: [<0000000000076094>] get_online_cpus+0x50/0x78 but task is already holding lock: (topology_work){--..}, at: [<0000000000059cde>] run_workqueue+0x106/0x278 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (topology_work){--..}: [<000000000006fc74>] __lock_acquire+0x1010/0x111c [<000000000006fe40>] lock_acquire+0xc0/0xf8 [<0000000000059d48>] run_workqueue+0x170/0x278 [<0000000000059edc>] worker_thread+0x8c/0xf0 [<000000000005f5bc>] kthread+0x68/0xa0 [<000000000001a33e>] kernel_thread_starter+0x6/0xc [<000000000001a338>] kernel_thread_starter+0x0/0xc -> #1 (events){--..}: [<000000000006fc74>] __lock_acquire+0x1010/0x111c [<000000000006fe40>] lock_acquire+0xc0/0xf8 [<000000000005a23c>] cleanup_workqueue_thread+0x60/0xa8 [<00000000003b2ab8>] workqueue_cpu_callback+0xbc/0x170 [<00000000003bba80>] notifier_call_chain+0x5c/0xa4 [<00000000000655a2>] __raw_notifier_call_chain+0x26/0x38 [<00000000000655e2>] raw_notifier_call_chain+0x2e/0x40 [<0000000000075e00>] cpu_down+0x228/0x31c [<00000000003b1dd8>] store_online+0x64/0xb8 [<00000000001e7128>] sysdev_store+0x48/0x58 [<0000000000121cd2>] sysfs_write_file+0x126/0x1c0 [<00000000000c1944>] vfs_write+0xb0/0x15c [<00000000000c20e6>] sys_write+0x56/0x88 [<0000000000027a68>] sys32_write+0x34/0x4c [<0000000000023f70>] sysc_noemu+0x10/0x16 [<0000000077f3f186>] 0x77f3f186 -> #0 (&cpu_hotplug.lock){--..}: [<000000000006fa84>] __lock_acquire+0xe20/0x111c [<000000000006fe40>] lock_acquire+0xc0/0xf8 [<00000000003b701c>] mutex_lock_nested+0xd0/0x364 [<0000000000076094>] get_online_cpus+0x50/0x78 [<000000000003a03e>] arch_reinit_sched_domains+0x26/0x58 [<000000000002700e>] topology_work_fn+0x26/0x34 [<0000000000059d4e>] run_workqueue+0x176/0x278 [<0000000000059edc>] worker_thread+0x8c/0xf0 [<000000000005f5bc>] kthread+0x68/0xa0 [<000000000001a33e>] kernel_thread_starter+0x6/0xc [<000000000001a338>] kernel_thread_starter+0x0/0xc other info that might help us debug this: 2 locks held by events/3/14: #0: (events){--..}, at: [<0000000000059cde>] run_workqueue+0x106/0x278 #1: (topology_work){--..}, at: [<0000000000059cde>] run_workqueue+0x106/0x278 stack backtrace: CPU: 3 Not tainted 
2.6.25-03562-g3dc5063-dirty #12 Process events/3 (pid: 14, task: 000000002fb04038, ksp: 000000002fb0bd70) 0400000000000000 000000002fb0ba40 0000000000000002 0000000000000000 000000002fb0bae0 000000002fb0ba58 000000002fb0ba58 0000000000016488 0000000000000000 000000002fb0bd70 0000000000000000 0000000000000000 000000002fb0ba40 000000000000000c 000000002fb0ba40 000000002fb0bab0 00000000003c99e0 0000000000016488 000000002fb0ba40 000000002fb0ba90 Call Trace: ([<00000000000163fc>] show_trace+0x138/0x158) [<00000000000164e2>] show_stack+0xc6/0xf8 [<0000000000016624>] dump_stack+0xb0/0xc0 [<000000000006cd36>] print_circular_bug_tail+0xa2/0xb4 [<000000000006fa84>] __lock_acquire+0xe20/0x111c [<000000000006fe40>] lock_acquire+0xc0/0xf8 [<00000000003b701c>] mutex_lock_nested+0xd0/0x364 [<0000000000076094>] get_online_cpus+0x50/0x78 [<000000000003a03e>] arch_reinit_sched_domains+0x26/0x58 [<000000000002700e>] topology_work_fn+0x26/0x34 [<0000000000059d4e>] run_workqueue+0x176/0x278 [<0000000000059edc>] worker_thread+0x8c/0xf0 [<000000000005f5bc>] kthread+0x68/0xa0 [<000000000001a33e>] kernel_thread_starter+0x6/0xc [<000000000001a338>] kernel_thread_starter+0x0/0xc INFO: lockdep is turned off. Cc: Oleg Nesterov <oleg@tv-sign.ru> Cc: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com> Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
342 lines
6.7 KiB
C
342 lines
6.7 KiB
C
/*
|
|
* Copyright IBM Corp. 2007
|
|
* Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/init.h>
|
|
#include <linux/device.h>
|
|
#include <linux/bootmem.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/kthread.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/cpu.h>
|
|
#include <linux/smp.h>
|
|
#include <asm/delay.h>
|
|
#include <asm/s390_ext.h>
|
|
#include <asm/sysinfo.h>
|
|
|
|
/* Number of CPU bits scanned in the mask of a single struct tl_cpu. */
#define CPU_BITS 64
/* Number of entries in the mag[] array of struct tl_info. */
#define NR_MAG 6

/* Function codes handed to ptf(). */
#define PTF_HORIZONTAL	(0UL)
#define PTF_VERTICAL	(1UL)
#define PTF_CHECK	(2UL)
|
|
|
|
struct tl_cpu {
|
|
unsigned char reserved0[4];
|
|
unsigned char :6;
|
|
unsigned char pp:2;
|
|
unsigned char reserved1;
|
|
unsigned short origin;
|
|
unsigned long mask[CPU_BITS / BITS_PER_LONG];
|
|
};
|
|
|
|
/*
 * Container-type entry of the topology list.  Nothing in this file looks
 * inside it; only its size is used, by next_tle(), to step over the entry.
 */
struct tl_container {
	unsigned char reserved[8];
};
|
|
|
|
union tl_entry {
|
|
unsigned char nl;
|
|
struct tl_cpu cpu;
|
|
struct tl_container container;
|
|
};
|
|
|
|
struct tl_info {
|
|
unsigned char reserved0[2];
|
|
unsigned short length;
|
|
unsigned char mag[NR_MAG];
|
|
unsigned char reserved1;
|
|
unsigned char mnest;
|
|
unsigned char reserved2[4];
|
|
union tl_entry tle[0];
|
|
};
|
|
|
|
struct core_info {
|
|
struct core_info *next;
|
|
cpumask_t mask;
|
|
};
|
|
|
|
static void topology_work_fn(struct work_struct *work);
|
|
static struct tl_info *tl_info;
|
|
static struct core_info core_info;
|
|
static int machine_has_topology;
|
|
static int machine_has_topology_irq;
|
|
static struct timer_list topology_timer;
|
|
static void set_topology_timer(void);
|
|
static DECLARE_WORK(topology_work, topology_work_fn);
|
|
|
|
/* Core mask of every CPU; refreshed by update_cpu_core_map(). */
cpumask_t cpu_core_map[NR_CPUS];
|
|
|
|
cpumask_t cpu_coregroup_map(unsigned int cpu)
|
|
{
|
|
struct core_info *core = &core_info;
|
|
cpumask_t mask;
|
|
|
|
cpus_clear(mask);
|
|
if (!machine_has_topology)
|
|
return cpu_present_map;
|
|
mutex_lock(&smp_cpu_state_mutex);
|
|
while (core) {
|
|
if (cpu_isset(cpu, core->mask)) {
|
|
mask = core->mask;
|
|
break;
|
|
}
|
|
core = core->next;
|
|
}
|
|
mutex_unlock(&smp_cpu_state_mutex);
|
|
if (cpus_empty(mask))
|
|
mask = cpumask_of_cpu(cpu);
|
|
return mask;
|
|
}
|
|
|
|
static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
|
|
{
|
|
unsigned int cpu;
|
|
|
|
for (cpu = find_first_bit(&tl_cpu->mask[0], CPU_BITS);
|
|
cpu < CPU_BITS;
|
|
cpu = find_next_bit(&tl_cpu->mask[0], CPU_BITS, cpu + 1))
|
|
{
|
|
unsigned int rcpu, lcpu;
|
|
|
|
rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin;
|
|
for_each_present_cpu(lcpu) {
|
|
if (__cpu_logical_map[lcpu] == rcpu) {
|
|
cpu_set(lcpu, core->mask);
|
|
smp_cpu_polarization[lcpu] = tl_cpu->pp;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void clear_cores(void)
|
|
{
|
|
struct core_info *core = &core_info;
|
|
|
|
while (core) {
|
|
cpus_clear(core->mask);
|
|
core = core->next;
|
|
}
|
|
}
|
|
|
|
static union tl_entry *next_tle(union tl_entry *tle)
|
|
{
|
|
if (tle->nl)
|
|
return (union tl_entry *)((struct tl_container *)tle + 1);
|
|
else
|
|
return (union tl_entry *)((struct tl_cpu *)tle + 1);
|
|
}
|
|
|
|
static void tl_to_cores(struct tl_info *info)
|
|
{
|
|
union tl_entry *tle, *end;
|
|
struct core_info *core = &core_info;
|
|
|
|
mutex_lock(&smp_cpu_state_mutex);
|
|
clear_cores();
|
|
tle = info->tle;
|
|
end = (union tl_entry *)((unsigned long)info + info->length);
|
|
while (tle < end) {
|
|
switch (tle->nl) {
|
|
case 5:
|
|
case 4:
|
|
case 3:
|
|
case 2:
|
|
break;
|
|
case 1:
|
|
core = core->next;
|
|
break;
|
|
case 0:
|
|
add_cpus_to_core(&tle->cpu, core);
|
|
break;
|
|
default:
|
|
clear_cores();
|
|
machine_has_topology = 0;
|
|
return;
|
|
}
|
|
tle = next_tle(tle);
|
|
}
|
|
mutex_unlock(&smp_cpu_state_mutex);
|
|
}
|
|
|
|
static void topology_update_polarization_simple(void)
|
|
{
|
|
int cpu;
|
|
|
|
mutex_lock(&smp_cpu_state_mutex);
|
|
for_each_present_cpu(cpu)
|
|
smp_cpu_polarization[cpu] = POLARIZATION_HRZ;
|
|
mutex_unlock(&smp_cpu_state_mutex);
|
|
}
|
|
|
|
static int ptf(unsigned long fc)
|
|
{
|
|
int rc;
|
|
|
|
asm volatile(
|
|
" .insn rre,0xb9a20000,%1,%1\n"
|
|
" ipm %0\n"
|
|
" srl %0,28\n"
|
|
: "=d" (rc)
|
|
: "d" (fc) : "cc");
|
|
return rc;
|
|
}
|
|
|
|
int topology_set_cpu_management(int fc)
|
|
{
|
|
int cpu;
|
|
int rc;
|
|
|
|
if (!machine_has_topology)
|
|
return -EOPNOTSUPP;
|
|
if (fc)
|
|
rc = ptf(PTF_VERTICAL);
|
|
else
|
|
rc = ptf(PTF_HORIZONTAL);
|
|
if (rc)
|
|
return -EBUSY;
|
|
for_each_present_cpu(cpu)
|
|
smp_cpu_polarization[cpu] = POLARIZATION_UNKNWN;
|
|
return rc;
|
|
}
|
|
|
|
static void update_cpu_core_map(void)
|
|
{
|
|
int cpu;
|
|
|
|
for_each_present_cpu(cpu)
|
|
cpu_core_map[cpu] = cpu_coregroup_map(cpu);
|
|
}
|
|
|
|
void arch_update_cpu_topology(void)
|
|
{
|
|
struct tl_info *info = tl_info;
|
|
struct sys_device *sysdev;
|
|
int cpu;
|
|
|
|
if (!machine_has_topology) {
|
|
update_cpu_core_map();
|
|
topology_update_polarization_simple();
|
|
return;
|
|
}
|
|
stsi(info, 15, 1, 2);
|
|
tl_to_cores(info);
|
|
update_cpu_core_map();
|
|
for_each_online_cpu(cpu) {
|
|
sysdev = get_cpu_sysdev(cpu);
|
|
kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
|
|
}
|
|
}
|
|
|
|
/*
 * Thread function started by topology_work_fn(): rebuild the scheduler
 * domains once, then exit.  @data is unused.
 */
static int topology_kthread(void *data)
{
	arch_reinit_sched_domains();

	return 0;
}
|
|
|
|
static void topology_work_fn(struct work_struct *work)
|
|
{
|
|
/* We can't call arch_reinit_sched_domains() from a multi-threaded
|
|
* workqueue context since it may deadlock in case of cpu hotplug.
|
|
* So we have to create a kernel thread in order to call
|
|
* arch_reinit_sched_domains().
|
|
*/
|
|
kthread_run(topology_kthread, NULL, "topology_update");
|
|
}
|
|
|
|
void topology_schedule_update(void)
|
|
{
|
|
schedule_work(&topology_work);
|
|
}
|
|
|
|
static void topology_timer_fn(unsigned long ignored)
|
|
{
|
|
if (ptf(PTF_CHECK))
|
|
topology_schedule_update();
|
|
set_topology_timer();
|
|
}
|
|
|
|
static void set_topology_timer(void)
|
|
{
|
|
topology_timer.function = topology_timer_fn;
|
|
topology_timer.data = 0;
|
|
topology_timer.expires = jiffies + 60 * HZ;
|
|
add_timer(&topology_timer);
|
|
}
|
|
|
|
/*
 * External interrupt handler registered for code 0x2005 by
 * init_topology_update(); just schedules the topology update.
 * @code is unused.
 */
static void topology_interrupt(__u16 code)
{
	topology_schedule_update();
}
|
|
|
|
/*
 * Initcall that sets up topology change detection.
 *
 * Without topology support only the default polarization is set.  With it,
 * either the external interrupt 0x2005 is registered and enabled, or the
 * polling timer is armed.  cpu_core_map[] is refreshed on every path.
 *
 * Returns 0, or the error from register_external_interrupt().
 */
static int __init init_topology_update(void)
{
	int rc = 0;

	if (!machine_has_topology) {
		topology_update_polarization_simple();
	} else {
		init_timer_deferrable(&topology_timer);
		if (!machine_has_topology_irq) {
			set_topology_timer();
		} else {
			rc = register_external_interrupt(0x2005,
							 topology_interrupt);
			/* Enable the interrupt only once the handler is in place. */
			if (!rc)
				ctl_set_bit(0, 8);
		}
	}
	update_cpu_core_map();
	return rc;
}
__initcall(init_topology_update);
|
|
|
|
/*
 * Boot time setup of the topology code.
 *
 * Facility bits (1ULL << 52) and (1ULL << 61) must both be reported by
 * stfle(), otherwise topology support stays disabled.  Bit (1ULL << 51)
 * additionally selects interrupt driven updates.
 * NOTE(review): the meaning of the individual facility bits is not visible
 * here - confirm against the architecture documentation.
 *
 * On success the topology block has been read once, its magnitudes have
 * been printed and one core list node per possible core has been chained
 * behind the static list head.  On allocation failure both feature flags
 * are cleared again.
 */
void __init s390_init_cpu_topology(void)
{
	const unsigned long long required = (1ULL << 52) | (1ULL << 61);
	unsigned long long facility_bits;
	struct tl_info *info;
	struct core_info *tail;
	int nr_cores;
	int i;

	if (stfle(&facility_bits, 1) <= 0)
		return;
	if ((facility_bits & required) != required)
		return;
	machine_has_topology = 1;

	if (facility_bits & (1ULL << 51))
		machine_has_topology_irq = 1;

	tl_info = alloc_bootmem_pages(PAGE_SIZE);
	if (!tl_info)
		goto error;
	info = tl_info;
	stsi(info, 15, 1, 2);

	/* Multiply mag[] factors, starting at mag[NR_MAG - 2], bounded by mnest. */
	nr_cores = info->mag[NR_MAG - 2];
	for (i = 0; i < info->mnest - 2; i++)
		nr_cores *= info->mag[NR_MAG - 3 - i];

	printk(KERN_INFO "CPU topology:");
	for (i = 0; i < NR_MAG; i++)
		printk(" %d", info->mag[i]);
	printk(" / %d\n", info->mnest);

	/* Chain nr_cores freshly allocated nodes behind the static head. */
	tail = &core_info;
	for (i = 0; i < nr_cores; i++) {
		struct core_info *fresh;

		fresh = alloc_bootmem(sizeof(struct core_info));
		tail->next = fresh;
		if (!fresh)
			goto error;
		tail = fresh;
	}
	return;
error:
	machine_has_topology = 0;
	machine_has_topology_irq = 0;
}
|