bbfceef47f
This patch adds in the necessary support for sparsemem such that x86-64 kernels may use sparsemem as an alternative to discontigmem for NUMA kernels. Note that this does not preclude one from continuing to build NUMA kernels using discontigmem, but merely allows the option to build NUMA kernels with sparsemem. Interestingly, the use of sparsemem in lieu of discontigmem in NUMA kernels results in reduced text size for otherwise equivalent kernels as shown in the example builds below: text data bss dec hex filename 2371036 765884 1237108 4374028 42be0c vmlinux.discontig 2366549 776484 1302772 4445805 43d66d vmlinux.sparse Signed-off-by: Matt Tolentino <matthew.e.tolentino@intel.com> Signed-off-by: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
303 lines
7.9 KiB
C
303 lines
7.9 KiB
C
/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
|
|
#include <linux/kernel.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/string.h>
|
|
#include <linux/init.h>
|
|
#include <linux/bootmem.h>
|
|
#include <linux/mmzone.h>
|
|
#include <linux/ctype.h>
|
|
#include <linux/module.h>
|
|
#include <linux/nodemask.h>
|
|
|
|
#include <asm/e820.h>
|
|
#include <asm/proto.h>
|
|
#include <asm/dma.h>
|
|
#include <asm/numa.h>
|
|
#include <asm/acpi.h>
|
|
|
|
/*
 * Debug print hook.  Unless Dprintk was already defined before this point,
 * it expands to nothing, so its arguments are never evaluated.
 */
#if !defined(Dprintk)
#define Dprintk(x...)
#endif
|
|
|
|
struct pglist_data *node_data[MAX_NUMNODES];
|
|
bootmem_data_t plat_node_bdata[MAX_NUMNODES];
|
|
|
|
int memnode_shift;
|
|
u8 memnodemap[NODEMAPSIZE];
|
|
|
|
unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
|
|
cpumask_t node_to_cpumask[MAX_NUMNODES];
|
|
|
|
int numa_off __initdata;
|
|
|
|
/*
 * Find a shift such that (address >> shift) maps every node's address range
 * onto memnodemap[] slots that no other node uses.
 *
 * @nodes:    array of node address ranges; an entry with start == end is
 *            treated as empty and skipped
 * @numnodes: number of entries in @nodes
 *
 * Shifts 24..47 are tried in increasing order.  On success memnodemap[] is
 * left populated (0xff marks an unused slot) and the shift is returned.
 * If no shift works, memnodemap[] is zeroed and -1 is returned.
 */
int __init compute_hash_shift(struct node *nodes, int numnodes)
{
	int i;
	int shift = 24;
	u64 addr;

	/* When in doubt use brute force. */
	while (shift < 48) {
		memset(memnodemap, 0xff, sizeof(*memnodemap) * NODEMAPSIZE);
		for (i = 0; i < numnodes; i++) {
			if (nodes[i].start == nodes[i].end)
				continue;
			for (addr = nodes[i].start;
			     addr < nodes[i].end;
			     addr += (1UL << shift)) {
				u64 slot = addr >> shift;

				/*
				 * The table only has NODEMAPSIZE entries.  An
				 * address that hashes beyond it cannot be
				 * recorded at this shift, so move on to a
				 * coarser one instead of writing out of bounds.
				 */
				if (slot >= NODEMAPSIZE)
					goto next;

				if (memnodemap[slot] != 0xff &&
				    memnodemap[slot] != i) {
					printk(KERN_INFO
					       "node %d shift %d addr %Lx conflict %d\n",
					       i, shift, addr, memnodemap[slot]);
					goto next;
				}
				memnodemap[slot] = i;
			}
		}
		return shift;
next:
		shift++;
	}
	memset(memnodemap, 0, sizeof(*memnodemap) * NODEMAPSIZE);
	return -1;
}
|
|
|
|
#ifdef CONFIG_SPARSEMEM
|
|
int early_pfn_to_nid(unsigned long pfn)
|
|
{
|
|
return phys_to_nid(pfn << PAGE_SHIFT);
|
|
}
|
|
#endif
|
|
|
|
/* Initialize bootmem allocator for a node */
|
|
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
|
|
{
|
|
unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
|
|
unsigned long nodedata_phys;
|
|
const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
|
|
|
|
start = round_up(start, ZONE_ALIGN);
|
|
|
|
printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
|
|
|
|
start_pfn = start >> PAGE_SHIFT;
|
|
end_pfn = end >> PAGE_SHIFT;
|
|
|
|
memory_present(nodeid, start_pfn, end_pfn);
|
|
nodedata_phys = find_e820_area(start, end, pgdat_size);
|
|
if (nodedata_phys == -1L)
|
|
panic("Cannot find memory pgdat in node %d\n", nodeid);
|
|
|
|
Dprintk("nodedata_phys %lx\n", nodedata_phys);
|
|
|
|
node_data[nodeid] = phys_to_virt(nodedata_phys);
|
|
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
|
|
NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
|
|
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
|
|
NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
|
|
|
|
/* Find a place for the bootmem map */
|
|
bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
|
|
bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
|
|
bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
|
|
if (bootmap_start == -1L)
|
|
panic("Not enough continuous space for bootmap on node %d", nodeid);
|
|
Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
|
|
|
|
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
|
|
bootmap_start >> PAGE_SHIFT,
|
|
start_pfn, end_pfn);
|
|
|
|
e820_bootmem_free(NODE_DATA(nodeid), start, end);
|
|
|
|
reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
|
|
reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
|
|
node_set_online(nodeid);
|
|
}
|
|
|
|
/* Initialize final allocator for a zone */
|
|
void __init setup_node_zones(int nodeid)
|
|
{
|
|
unsigned long start_pfn, end_pfn;
|
|
unsigned long zones[MAX_NR_ZONES];
|
|
unsigned long dma_end_pfn;
|
|
|
|
memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
|
|
|
|
start_pfn = node_start_pfn(nodeid);
|
|
end_pfn = node_end_pfn(nodeid);
|
|
|
|
Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
|
|
|
|
/* All nodes > 0 have a zero length zone DMA */
|
|
dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
|
|
if (start_pfn < dma_end_pfn) {
|
|
zones[ZONE_DMA] = dma_end_pfn - start_pfn;
|
|
zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
|
|
} else {
|
|
zones[ZONE_NORMAL] = end_pfn - start_pfn;
|
|
}
|
|
|
|
free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
|
|
start_pfn, NULL);
|
|
}
|
|
|
|
/*
 * Give every CPU that still maps to NUMA_NO_NODE a node, handing out the
 * online nodes round-robin, then record CPU 0 in its node's cpumask.
 *
 * There are unfortunately some poorly designed mainboards around that only
 * connect memory to a single CPU.  This breaks the 1:1 cpu->node mapping.
 * To avoid this, fill in the mapping for all possible CPUs, as the number
 * of CPUs is not known yet.
 */
void __init numa_init_array(void)
{
	int rr, i;

	/*
	 * Start at the first online node and advance exactly once per
	 * assignment.  next_node() already returns the node *after* its
	 * argument, so no extra increment is needed; an extra one would skip
	 * nodes and could push the cursor past MAX_NUMNODES.
	 */
	rr = first_node(node_online_map);
	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] != NUMA_NO_NODE)
			continue;
		cpu_to_node[i] = rr;
		rr = next_node(rr, node_online_map);
		if (rr == MAX_NUMNODES)
			rr = first_node(node_online_map);
	}

	set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
}
|
|
|
|
#ifdef CONFIG_NUMA_EMU
|
|
int numa_fake __initdata = 0;
|
|
|
|
/* Numa emulation */
|
|
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
|
|
{
|
|
int i;
|
|
struct node nodes[MAX_NUMNODES];
|
|
unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
|
|
|
|
/* Kludge needed for the hash function */
|
|
if (hweight64(sz) > 1) {
|
|
unsigned long x = 1;
|
|
while ((x << 1) < sz)
|
|
x <<= 1;
|
|
if (x < sz/2)
|
|
printk("Numa emulation unbalanced. Complain to maintainer\n");
|
|
sz = x;
|
|
}
|
|
|
|
memset(&nodes,0,sizeof(nodes));
|
|
for (i = 0; i < numa_fake; i++) {
|
|
nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
|
|
if (i == numa_fake-1)
|
|
sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
|
|
nodes[i].end = nodes[i].start + sz;
|
|
if (i != numa_fake-1)
|
|
nodes[i].end--;
|
|
printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
|
|
i,
|
|
nodes[i].start, nodes[i].end,
|
|
(nodes[i].end - nodes[i].start) >> 20);
|
|
node_set_online(i);
|
|
}
|
|
memnode_shift = compute_hash_shift(nodes, numa_fake);
|
|
if (memnode_shift < 0) {
|
|
memnode_shift = 0;
|
|
printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
|
|
return -1;
|
|
}
|
|
for_each_online_node(i)
|
|
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
|
|
numa_init_array();
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
/*
 * Set up NUMA memory for [start_pfn, end_pfn).
 *
 * Each configured method is tried in turn and the function returns as soon
 * as one reports success (returns 0): emulation (only when numa_fake is
 * set), then the ACPI scan, then the K8 scan (the latter two only when
 * NUMA has not been turned off).  If none succeeds, a single dummy node 0
 * covering all memory is created and every CPU is mapped to it.
 */
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	const unsigned long mem_start = start_pfn << PAGE_SHIFT;
	const unsigned long mem_end = end_pfn << PAGE_SHIFT;
	const char *reason;
	int cpu;

#ifdef CONFIG_NUMA_EMU
	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
		return;
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(mem_start, mem_end))
		return;
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(mem_start, mem_end))
		return;
#endif

	if (numa_off)
		reason = "NUMA turned off";
	else
		reason = "No NUMA configuration found";
	printk(KERN_INFO "%s\n", reason);

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       mem_start,
	       mem_end);

	/* setup dummy node covering all memory */
	memnode_shift = 63;
	memnodemap[0] = 0;
	nodes_clear(node_online_map);
	node_set_online(0);
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		cpu_to_node[cpu] = 0;
	node_to_cpumask[0] = cpumask_of_cpu(0);
	setup_node_bootmem(0, mem_start, mem_end);
}
|
|
|
|
__init void numa_add_cpu(int cpu)
|
|
{
|
|
/* BP is initialized elsewhere */
|
|
if (cpu)
|
|
set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
|
|
}
|
|
|
|
/*
 * Call free_all_bootmem_node() for every online node and return the sum
 * of the page counts it reports.
 */
unsigned long __init numa_free_all_bootmem(void)
{
	unsigned long total = 0;
	int nid;

	for_each_online_node(nid)
		total += free_all_bootmem_node(NODE_DATA(nid));

	return total;
}
|
|
|
|
/* Run setup_node_zones() for each online node. */
void __init paging_init(void)
{
	int nid;

	for_each_online_node(nid)
		setup_node_zones(nid);
}
|
|
|
|
/* [numa=off] */
|
|
__init int numa_setup(char *opt)
|
|
{
|
|
if (!strncmp(opt,"off",3))
|
|
numa_off = 1;
|
|
#ifdef CONFIG_NUMA_EMU
|
|
if(!strncmp(opt, "fake=", 5)) {
|
|
numa_fake = simple_strtoul(opt+5,NULL,0); ;
|
|
if (numa_fake >= MAX_NUMNODES)
|
|
numa_fake = MAX_NUMNODES;
|
|
}
|
|
#endif
|
|
#ifdef CONFIG_ACPI_NUMA
|
|
if (!strncmp(opt,"noacpi",6))
|
|
acpi_numa = -1;
|
|
#endif
|
|
return 1;
|
|
}
|
|
|
|
/* Symbols made available through EXPORT_SYMBOL: CPU/node lookup tables. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);

/* Address hash parameters and the per-node data array. */
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);
|