4518e6a0c0
Embedding percpu first chunk allocator can now handle very sparse unit mapping. Use embedding allocator instead of lpage for 64bit NUMA. This removes extra TLB pressure and the need to do complex and fragile dancing when changing page attributes. For 32bit, using very sparse unit mapping isn't a good idea because the vmalloc space is very constrained. 32bit NUMA machines aren't exactly the focus of optimization and it isn't very clear whether lpage performs better than page. Use page first chunk allocator for 32bit NUMAs. As this leaves setup_pcpu_*() functions pretty much empty, fold them into setup_per_cpu_areas(). Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Ingo Molnar <mingo@elte.hu> Cc: Andi Kleen <andi@firstfloor.org>
273 lines
7.4 KiB
C
273 lines
7.4 KiB
C
#include <linux/kernel.h>
|
|
#include <linux/module.h>
|
|
#include <linux/init.h>
|
|
#include <linux/bootmem.h>
|
|
#include <linux/percpu.h>
|
|
#include <linux/kexec.h>
|
|
#include <linux/crash_dump.h>
|
|
#include <linux/smp.h>
|
|
#include <linux/topology.h>
|
|
#include <linux/pfn.h>
|
|
#include <asm/sections.h>
|
|
#include <asm/processor.h>
|
|
#include <asm/setup.h>
|
|
#include <asm/mpspec.h>
|
|
#include <asm/apicdef.h>
|
|
#include <asm/highmem.h>
|
|
#include <asm/proto.h>
|
|
#include <asm/cpumask.h>
|
|
#include <asm/cpu.h>
|
|
#include <asm/stackprotector.h>
|
|
|
|
/*
 * DBG() - debug-level printk for the per-cpu map code.
 *
 * Expands to printk(KERN_DEBUG ...) when CONFIG_DEBUG_PER_CPU_MAPS is
 * defined and to nothing otherwise.
 */
#if defined(CONFIG_DEBUG_PER_CPU_MAPS)
# define DBG(args...) printk(KERN_DEBUG args)
#else
# define DBG(args...)
#endif
|
|
|
|
/*
 * Per-cpu variable holding the owning CPU's number.  It is filled in for
 * every possible CPU by setup_per_cpu_areas().
 */
DEFINE_PER_CPU(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);
|
|
|
|
#ifdef CONFIG_X86_64
|
|
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
|
|
#else
|
|
#define BOOT_PERCPU_OFFSET 0
|
|
#endif
|
|
|
|
DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
|
|
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
|
|
|
|
/*
 * Table of per-cpu offsets with one slot per CPU (NR_CPUS entries).  Every
 * slot is statically initialised to BOOT_PERCPU_OFFSET.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS - 1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);
|
|
|
|
/*
 * Size of the reserved region of the first percpu chunk.
 *
 * On x86_64, symbols referenced from code should be reachable using 32bit
 * relocations.  Space for the static percpu variables of modules is
 * therefore reserved in the first chunk, which is located at the percpu
 * segment base, so that they are always served from there.
 *
 * On x86_32 anything can address anywhere, so nothing needs to be
 * reserved in the first chunk.
 */
#ifndef CONFIG_X86_64
#define PERCPU_FIRST_CHUNK_RESERVE 0
#else
#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
#endif
|
|
|
|
#ifdef CONFIG_X86_32
|
|
/**
|
|
* pcpu_need_numa - determine percpu allocation needs to consider NUMA
|
|
*
|
|
* If NUMA is not configured or there is only one NUMA node available,
|
|
* there is no reason to consider NUMA. This function determines
|
|
* whether percpu allocation should consider NUMA or not.
|
|
*
|
|
* RETURNS:
|
|
* true if NUMA should be considered; otherwise, false.
|
|
*/
|
|
static bool __init pcpu_need_numa(void)
|
|
{
|
|
#ifdef CONFIG_NEED_MULTIPLE_NODES
|
|
pg_data_t *last = NULL;
|
|
unsigned int cpu;
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
int node = early_cpu_to_node(cpu);
|
|
|
|
if (node_online(node) && NODE_DATA(node) &&
|
|
last && last != NODE_DATA(node))
|
|
return true;
|
|
|
|
last = NODE_DATA(node);
|
|
}
|
|
#endif
|
|
return false;
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
|
|
* @cpu: cpu to allocate for
|
|
* @size: size allocation in bytes
|
|
* @align: alignment
|
|
*
|
|
* Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
|
|
* does the right thing for NUMA regardless of the current
|
|
* configuration.
|
|
*
|
|
* RETURNS:
|
|
* Pointer to the allocated area on success, NULL on failure.
|
|
*/
|
|
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
|
|
unsigned long align)
|
|
{
|
|
const unsigned long goal = __pa(MAX_DMA_ADDRESS);
|
|
#ifdef CONFIG_NEED_MULTIPLE_NODES
|
|
int node = early_cpu_to_node(cpu);
|
|
void *ptr;
|
|
|
|
if (!node_online(node) || !NODE_DATA(node)) {
|
|
ptr = __alloc_bootmem_nopanic(size, align, goal);
|
|
pr_info("cpu %d has no node %d or node-local memory\n",
|
|
cpu, node);
|
|
pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
|
|
cpu, size, __pa(ptr));
|
|
} else {
|
|
ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
|
|
size, align, goal);
|
|
pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
|
|
"%016lx\n", cpu, size, node, __pa(ptr));
|
|
}
|
|
return ptr;
|
|
#else
|
|
return __alloc_bootmem_nopanic(size, align, goal);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Helpers for first chunk memory allocation
|
|
*/
|
|
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
|
|
{
|
|
return pcpu_alloc_bootmem(cpu, size, align);
|
|
}
|
|
|
|
/*
 * Free callback handed to the first chunk allocators: gives the @size
 * bytes starting at @ptr back through free_bootmem(), which takes the
 * address as translated by __pa().
 */
static void __init pcpu_fc_free(void *ptr, size_t size)
{
	const unsigned long phys = __pa(ptr);

	free_bootmem(phys, size);
}
|
|
|
|
/*
 * Distance callback for pcpu_embed_first_chunk().
 *
 * With CONFIG_NEED_MULTIPLE_NODES, two CPUs that early_cpu_to_node() maps
 * to the same node are LOCAL_DISTANCE apart and all others are
 * REMOTE_DISTANCE apart.  Without it every pair is LOCAL_DISTANCE.
 */
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	const bool same_node = early_cpu_to_node(from) == early_cpu_to_node(to);

	return same_node ? LOCAL_DISTANCE : REMOTE_DISTANCE;
#else
	return LOCAL_DISTANCE;
#endif
}
|
|
|
|
/*
 * PTE population callback passed to pcpu_page_first_chunk(); simply
 * forwards @addr to populate_extra_pte().
 */
static void __init pcpup_populate_pte(unsigned long addr)
{
	populate_extra_pte(addr);
}
|
|
|
|
/*
 * setup_percpu_segment - install the percpu segment descriptor of @cpu
 *
 * On CONFIG_X86_32 a descriptor is packed from per_cpu_offset(@cpu) and
 * 0xFFFFF (presumably base and limit - confirm against pack_descriptor())
 * and written to the GDT_ENTRY_PERCPU slot of that CPU's GDT.  On other
 * configurations the function body is empty.
 */
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
	struct desc_struct desc;

	pack_descriptor(&desc, per_cpu_offset(cpu), 0xFFFFF,
			0x2 | DESCTYPE_S, 0x8);
	desc.s = 1;
	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PERCPU,
			&desc, DESCTYPE_S);
#endif
}
|
|
|
|
/**
 * setup_per_cpu_areas - allocate and initialise the per-cpu areas
 *
 * Sets up the first percpu chunk, preferring the embedding allocator and
 * falling back to the page allocator, and panics if neither works.  It
 * then records each possible CPU's per-cpu offset, fills in the per-cpu
 * variables that were kept in early static arrays so far, switches the
 * boot CPU to its new GDT and finally sets up the node-to-cpumask map
 * and the cpu local masks.
 *
 * The statement order below is significant; do not reorder.
 */
void __init setup_per_cpu_areas(void)
{
	unsigned long base_delta;
	unsigned int cpu;
	int err = -EINVAL;

	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

	/*
	 * Allocate percpu area.  The embedding allocator is preferred;
	 * on NUMA configurations, however, it can produce a very sparse
	 * unit mapping, and the vmalloc area of 32bit is not spacious
	 * enough for that.  Use the page allocator in that case.
	 */
#ifdef CONFIG_X86_32
	if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif

	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
		const size_t dyn_size = PERCPU_MODULE_RESERVE +
			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;

		err = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					     dyn_size, atom_size,
					     pcpu_cpu_distance,
					     pcpu_fc_alloc, pcpu_fc_free);
		if (err < 0) {
			pr_warning("PERCPU: %s allocator failed (%d), "
				   "falling back to page size\n",
				   pcpu_fc_names[pcpu_chosen_fc], err);
		}
	}

	/* page allocator: either chosen outright or used as the fallback */
	if (err < 0) {
		err = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					    pcpu_fc_alloc, pcpu_fc_free,
					    pcpup_populate_pte);
	}
	if (err < 0)
		panic("cannot initialize percpu area (err=%d)", err);

	/* percpu areas are up and running from here on */
	base_delta = (unsigned long)pcpu_base_addr -
		     (unsigned long)__per_cpu_start;

	for_each_possible_cpu(cpu) {
		per_cpu_offset(cpu) = base_delta + pcpu_unit_offsets[cpu];
		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
		per_cpu(cpu_number, cpu) = cpu;
		setup_percpu_segment(cpu);
		setup_stack_canary_segment(cpu);

		/*
		 * Copy the data used by early init routines from the
		 * initial arrays into the per cpu data areas.  Those
		 * arrays then become expendable, and the *_early_ptr's
		 * are zeroed further down to indicate that the static
		 * arrays are gone.
		 */
#ifdef CONFIG_X86_LOCAL_APIC
		per_cpu(x86_cpu_to_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_apicid, cpu);
		per_cpu(x86_bios_cpu_apicid, cpu) =
			early_per_cpu_map(x86_bios_cpu_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
		per_cpu(irq_stack_ptr, cpu) =
			per_cpu(irq_stack_union.irq_stack, cpu) +
			IRQ_STACK_SIZE - 64;
#ifdef CONFIG_NUMA
		per_cpu(x86_cpu_to_node_map, cpu) =
			early_per_cpu_map(x86_cpu_to_node_map, cpu);
#endif
#endif

		/*
		 * Until now the boot CPU has been running on the
		 * .data.init area.  Reload any changed state for it.
		 */
		if (cpu == boot_cpu_id)
			switch_to_new_gdt(cpu);
	}

	/* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif

#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;

	/*
	 * Make sure the boot cpu's node_number is right when the boot cpu
	 * is on a node that doesn't have memory installed.
	 */
	per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
#endif

	/* Setup node to cpumask map */
	setup_node_to_cpumask_map();

	/* Setup cpu initialized, callin, callout masks */
	setup_cpu_local_masks();
}
|