2005-04-16 18:20:36 -04:00
|
|
|
/*
|
|
|
|
* linux/arch/i386/mm/pgtable.c
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/spinlock.h>
|
2006-09-26 02:32:25 -04:00
|
|
|
#include <linux/module.h>
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
#include <asm/system.h>
|
|
|
|
#include <asm/pgtable.h>
|
|
|
|
#include <asm/pgalloc.h>
|
|
|
|
#include <asm/fixmap.h>
|
|
|
|
#include <asm/e820.h>
|
|
|
|
#include <asm/tlb.h>
|
|
|
|
#include <asm/tlbflush.h>
|
|
|
|
|
|
|
|
void show_mem(void)
|
|
|
|
{
|
|
|
|
int total = 0, reserved = 0;
|
|
|
|
int shared = 0, cached = 0;
|
|
|
|
int highmem = 0;
|
|
|
|
struct page *page;
|
|
|
|
pg_data_t *pgdat;
|
|
|
|
unsigned long i;
|
2005-10-29 21:16:52 -04:00
|
|
|
unsigned long flags;
|
2005-04-16 18:20:36 -04:00
|
|
|
|
2005-06-25 17:59:24 -04:00
|
|
|
printk(KERN_INFO "Mem-info:\n");
|
2005-04-16 18:20:36 -04:00
|
|
|
show_free_areas();
|
2005-06-25 17:59:24 -04:00
|
|
|
printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
|
2006-03-27 04:15:59 -05:00
|
|
|
for_each_online_pgdat(pgdat) {
|
2005-10-29 21:16:52 -04:00
|
|
|
pgdat_resize_lock(pgdat, &flags);
|
2005-04-16 18:20:36 -04:00
|
|
|
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
|
[PATCH] remove non-DISCONTIG use of pgdat->node_mem_map
This patch effectively eliminates direct use of pgdat->node_mem_map outside
of the DISCONTIG code. On a flat memory system, these fields aren't
currently used, neither are they on a sparsemem system.
There was also a node_mem_map(nid) macro on many architectures. Its use
along with the use of ->node_mem_map itself was not consistent. It has
been removed in favor of two new, more explicit, arch-independent macros:
pgdat_page_nr(pgdat, pagenr)
nid_page_nr(nid, pagenr)
I called them "pgdat" and "nid" because we overload the term "node" to mean
"NUMA node", "DISCONTIG node" or "pg_data_t" in very confusing ways. I
believe the newer names are much clearer.
These macros can be overridden in the sparsemem case with a theoretically
slower operation using node_start_pfn and pfn_to_page(), instead. We could
make this the only behavior if people want, but I don't want to change too
much at once. One thing at a time.
This patch removes more code than it adds.
Compile tested on alpha, alpha discontig, arm, arm-discontig, i386, i386
generic, NUMAQ, Summit, ppc64, ppc64 discontig, and x86_64. Full list
here: http://sr71.net/patches/2.6.12/2.6.12-rc1-mhp2/configs/
Boot tested on NUMAQ, x86 SMP and ppc64 power4/5 LPARs.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin J. Bligh <mbligh@aracnet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 03:07:37 -04:00
|
|
|
page = pgdat_page_nr(pgdat, i);
|
2005-04-16 18:20:36 -04:00
|
|
|
total++;
|
|
|
|
if (PageHighMem(page))
|
|
|
|
highmem++;
|
|
|
|
if (PageReserved(page))
|
|
|
|
reserved++;
|
|
|
|
else if (PageSwapCache(page))
|
|
|
|
cached++;
|
|
|
|
else if (page_count(page))
|
|
|
|
shared += page_count(page) - 1;
|
|
|
|
}
|
2005-10-29 21:16:52 -04:00
|
|
|
pgdat_resize_unlock(pgdat, &flags);
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
2005-06-25 17:59:24 -04:00
|
|
|
printk(KERN_INFO "%d pages of RAM\n", total);
|
|
|
|
printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
|
|
|
|
printk(KERN_INFO "%d reserved pages\n", reserved);
|
|
|
|
printk(KERN_INFO "%d pages shared\n", shared);
|
|
|
|
printk(KERN_INFO "%d pages swap cached\n", cached);
|
2005-06-23 03:08:08 -04:00
|
|
|
|
2006-06-30 04:55:39 -04:00
|
|
|
printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
|
2006-06-30 04:55:40 -04:00
|
|
|
printk(KERN_INFO "%lu pages writeback\n",
|
|
|
|
global_page_state(NR_WRITEBACK));
|
2006-06-30 04:55:34 -04:00
|
|
|
printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
|
2006-09-26 02:31:51 -04:00
|
|
|
printk(KERN_INFO "%lu pages slab\n",
|
|
|
|
global_page_state(NR_SLAB_RECLAIMABLE) +
|
|
|
|
global_page_state(NR_SLAB_UNRECLAIMABLE));
|
2006-06-30 04:55:38 -04:00
|
|
|
printk(KERN_INFO "%lu pages pagetables\n",
|
|
|
|
global_page_state(NR_PAGETABLE));
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Associate a virtual page frame with a given physical page frame
|
|
|
|
* and protection flags for that frame.
|
|
|
|
*/
|
|
|
|
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
|
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
pte_t *pte;
|
|
|
|
|
|
|
|
pgd = swapper_pg_dir + pgd_index(vaddr);
|
|
|
|
if (pgd_none(*pgd)) {
|
|
|
|
BUG();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
pud = pud_offset(pgd, vaddr);
|
|
|
|
if (pud_none(*pud)) {
|
|
|
|
BUG();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
pmd = pmd_offset(pud, vaddr);
|
|
|
|
if (pmd_none(*pmd)) {
|
|
|
|
BUG();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
pte = pte_offset_kernel(pmd, vaddr);
|
2006-12-06 20:14:09 -05:00
|
|
|
if (pgprot_val(flags))
|
|
|
|
/* <pfn,flags> stored as-is, to permit clearing entries */
|
|
|
|
set_pte(pte, pfn_pte(pfn, flags));
|
|
|
|
else
|
|
|
|
pte_clear(&init_mm, vaddr, pte);
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* It's enough to flush this one mapping.
|
|
|
|
* (PGE mappings get flushed as well)
|
|
|
|
*/
|
|
|
|
__flush_tlb_one(vaddr);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Associate a large virtual page frame with a given physical page frame
|
|
|
|
* and protection flags for that frame. pfn is for the base of the page,
|
|
|
|
* vaddr is what the page gets mapped to - both must be properly aligned.
|
|
|
|
* The pmd must already be instantiated. Assumes PAE mode.
|
|
|
|
*/
|
|
|
|
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
|
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
pud_t *pud;
|
|
|
|
pmd_t *pmd;
|
|
|
|
|
|
|
|
if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
|
2005-06-25 17:59:24 -04:00
|
|
|
printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
|
2005-04-16 18:20:36 -04:00
|
|
|
return; /* BUG(); */
|
|
|
|
}
|
|
|
|
if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
|
2005-06-25 17:59:24 -04:00
|
|
|
printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
|
2005-04-16 18:20:36 -04:00
|
|
|
return; /* BUG(); */
|
|
|
|
}
|
|
|
|
pgd = swapper_pg_dir + pgd_index(vaddr);
|
|
|
|
if (pgd_none(*pgd)) {
|
2005-06-25 17:59:24 -04:00
|
|
|
printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
|
2005-04-16 18:20:36 -04:00
|
|
|
return; /* BUG(); */
|
|
|
|
}
|
|
|
|
pud = pud_offset(pgd, vaddr);
|
|
|
|
pmd = pmd_offset(pud, vaddr);
|
|
|
|
set_pmd(pmd, pfn_pmd(pfn, flags));
|
|
|
|
/*
|
|
|
|
* It's enough to flush this one mapping.
|
|
|
|
* (PGE mappings get flushed as well)
|
|
|
|
*/
|
|
|
|
__flush_tlb_one(vaddr);
|
|
|
|
}
|
|
|
|
|
2006-09-26 02:32:25 -04:00
|
|
|
/*
 * Incremented by __set_fixmap() for each mapping it installs;
 * reserve_top_address() BUGs if this is non-zero when it runs.
 */
static int fixmaps;

/* Top of the fixmap area; lowered by reserve_top_address(). */
unsigned long __FIXADDR_TOP = 0xfffff000UL;
EXPORT_SYMBOL(__FIXADDR_TOP);
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
/*
 * Point fixmap slot @idx at physical address @phys with protection
 * @flags, and record that a fixmap has been installed.
 *
 * An out-of-range @idx is a kernel bug.
 */
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
	const unsigned long fix_vaddr = __fix_to_virt(idx);
	const unsigned long fix_pfn = phys >> PAGE_SHIFT;

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}

	set_pte_pfn(fix_vaddr, fix_pfn, flags);
	fixmaps++;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* reserve_top_address - reserves a hole in the top of kernel address space
|
|
|
|
* @reserve - size of hole to reserve
|
|
|
|
*
|
|
|
|
* Can be used to relocate the fixmap area and poke a hole in the top
|
|
|
|
* of kernel address space to make room for a hypervisor.
|
|
|
|
*/
|
|
|
|
void reserve_top_address(unsigned long reserve)
|
|
|
|
{
|
|
|
|
BUG_ON(fixmaps > 0);
|
2007-02-13 07:26:21 -05:00
|
|
|
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
|
|
|
|
(int)-reserve);
|
2006-09-26 02:32:25 -04:00
|
|
|
__FIXADDR_TOP = -reserve - PAGE_SIZE;
|
|
|
|
__VMALLOC_RESERVE += reserve;
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
|
|
|
|
{
|
|
|
|
return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
|
|
|
|
{
|
|
|
|
struct page *pte;
|
|
|
|
|
|
|
|
#ifdef CONFIG_HIGHPTE
|
|
|
|
pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
|
|
|
|
#else
|
|
|
|
pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
|
|
|
|
#endif
|
|
|
|
return pte;
|
|
|
|
}
|
|
|
|
|
2006-12-06 23:33:20 -05:00
|
|
|
void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
|
2005-04-16 18:20:36 -04:00
|
|
|
{
|
|
|
|
memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * List of all pgd's.  Non-PAE needs it so that entries can be
 * invalidated in both cached and uncached pgd's.  PAE with a shared
 * kernel pmd does not need it; when the kernel pmd is not shared, the
 * PAE pgd constructor below adds its pgds to the same list.
 *
 * This is essentially codepath-based locking against pageattr.c; it
 * is the unique case in which a valid change of kernel pagetables
 * can't be lazily synchronized by vmalloc faults.  vmalloc faults
 * work because attached pagetables are never freed.
 *
 * The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 */

/* Taken around every pgd_list update in this file. */
DEFINE_SPINLOCK(pgd_lock);

/* Head of the pgd list; NULL when the list is empty. */
struct page *pgd_list;
|
|
|
|
|
|
|
|
/*
 * Push the page backing @pgd onto the front of pgd_list.
 *
 * The list is threaded through struct page: ->index holds the next
 * page, and the page's private field holds the address of whichever
 * pointer currently points at this page (either pgd_list itself or
 * the previous page's ->index).  The callers in this file hold
 * pgd_lock.
 */
static inline void pgd_list_add(pgd_t *pgd)
{
	struct page *new_head = virt_to_page(pgd);
	struct page *old_head = pgd_list;

	/* link forward to the previous head ... */
	new_head->index = (unsigned long)old_head;

	/* ... and make the previous head point back at our next field */
	if (old_head)
		set_page_private(old_head, (unsigned long)&new_head->index);

	pgd_list = new_head;
	set_page_private(new_head, (unsigned long)&pgd_list);
}
|
|
|
|
|
|
|
|
/*
 * Unlink the page backing @pgd from pgd_list.
 *
 * Uses the back-pointer kept in the page's private field to splice
 * the page out without walking the list.  The caller in this file
 * holds pgd_lock.
 */
static inline void pgd_list_del(pgd_t *pgd)
{
	struct page *victim = virt_to_page(pgd);
	struct page *succ = (struct page *)victim->index;
	struct page **back_link = (struct page **)page_private(victim);

	/* whoever pointed at us now points at our successor */
	*back_link = succ;

	/* and the successor inherits our back-pointer */
	if (succ)
		set_page_private(succ, (unsigned long)back_link);
}
|
|
|
|
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4 x 64-bit entries large, but Xen requires the PAE
pgd to be page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accommodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course they could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 13:27:13 -04:00
|
|
|
#if (PTRS_PER_PMD == 1)
|
|
|
|
/* Non-PAE pgd constructor */
|
2006-12-06 23:33:20 -05:00
|
|
|
void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
|
2005-04-16 18:20:36 -04:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 13:27:13 -04:00
|
|
|
/* !PAE, no pagetable sharing */
|
|
|
|
memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
|
|
|
|
|
|
|
|
spin_lock_irqsave(&pgd_lock, flags);
|
2005-04-16 18:20:36 -04:00
|
|
|
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 13:27:13 -04:00
|
|
|
/* must happen under lock */
|
2005-09-03 18:56:50 -04:00
|
|
|
clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
|
2005-04-16 18:20:36 -04:00
|
|
|
swapper_pg_dir + USER_PTRS_PER_PGD,
|
2005-09-03 18:56:50 -04:00
|
|
|
KERNEL_PGD_PTRS);
|
2007-02-13 07:26:21 -05:00
|
|
|
paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 13:27:13 -04:00
|
|
|
__pa(swapper_pg_dir) >> PAGE_SHIFT,
|
|
|
|
USER_PTRS_PER_PGD,
|
|
|
|
KERNEL_PGD_PTRS);
|
2005-04-16 18:20:36 -04:00
|
|
|
pgd_list_add(pgd);
|
|
|
|
spin_unlock_irqrestore(&pgd_lock, flags);
|
|
|
|
}
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 13:27:13 -04:00
|
|
|
#else /* PTRS_PER_PMD > 1 */
|
|
|
|
/* PAE pgd constructor */
|
|
|
|
void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
|
|
|
|
{
|
|
|
|
/* PAE, kernel PMD may be shared */
|
|
|
|
|
|
|
|
if (SHARED_KERNEL_PMD) {
|
|
|
|
clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
|
|
|
|
swapper_pg_dir + USER_PTRS_PER_PGD,
|
|
|
|
KERNEL_PGD_PTRS);
|
|
|
|
} else {
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
|
|
|
|
spin_lock_irqsave(&pgd_lock, flags);
|
|
|
|
pgd_list_add(pgd);
|
|
|
|
spin_unlock_irqrestore(&pgd_lock, flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif /* PTRS_PER_PMD */
|
2005-04-16 18:20:36 -04:00
|
|
|
|
2006-12-06 23:33:20 -05:00
|
|
|
void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
|
2005-04-16 18:20:36 -04:00
|
|
|
{
|
|
|
|
unsigned long flags; /* can be called from interrupt context */
|
|
|
|
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 13:27:13 -04:00
|
|
|
BUG_ON(SHARED_KERNEL_PMD);
|
|
|
|
|
2007-02-13 07:26:21 -05:00
|
|
|
paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
|
2005-04-16 18:20:36 -04:00
|
|
|
spin_lock_irqsave(&pgd_lock, flags);
|
|
|
|
pgd_list_del(pgd);
|
|
|
|
spin_unlock_irqrestore(&pgd_lock, flags);
|
|
|
|
}
|
|
|
|
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4 x 64-bit entries large, but Xen requires the PAE
pgd to be page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accommodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course they could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 13:27:13 -04:00
|
|
|
/*
 * Number of leading pgd slots for which pgd_alloc() installs, and
 * pgd_free() releases, a per-pgd pmd: only the USER_PTRS_PER_PGD user
 * slots when SHARED_KERNEL_PMD is true, otherwise all PTRS_PER_PGD slots.
 */
#define UNSHARED_PTRS_PER_PGD \
	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
|
|
|
|
|
|
|
|
/* If we allocate a pmd for part of the kernel address space, then
|
|
|
|
make sure its initialized with the appropriate kernel mappings.
|
|
|
|
Otherwise use a cached zeroed pmd. */
|
|
|
|
static pmd_t *pmd_cache_alloc(int idx)
|
|
|
|
{
|
|
|
|
pmd_t *pmd;
|
|
|
|
|
|
|
|
if (idx >= USER_PTRS_PER_PGD) {
|
|
|
|
pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
|
|
|
|
|
|
|
|
if (pmd)
|
|
|
|
memcpy(pmd,
|
|
|
|
(void *)pgd_page_vaddr(swapper_pg_dir[idx]),
|
|
|
|
sizeof(pmd_t) * PTRS_PER_PMD);
|
|
|
|
} else
|
|
|
|
pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
|
|
|
|
|
|
|
|
return pmd;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Release a pmd that was allocated for pgd slot @idx.
 *
 * Mirrors pmd_cache_alloc(): pmds for slots below USER_PTRS_PER_PGD go
 * back to pmd_cache, all others are whole pages and are handed to
 * free_page().
 */
static void pmd_cache_free(pmd_t *pmd, int idx)
{
	if (idx < USER_PTRS_PER_PGD) {
		kmem_cache_free(pmd_cache, pmd);
		return;
	}

	free_page((unsigned long)pmd);
}
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
pgd_t *pgd_alloc(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
|
|
|
|
|
|
|
|
if (PTRS_PER_PMD == 1 || !pgd)
|
|
|
|
return pgd;
|
|
|
|
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 13:27:13 -04:00
|
|
|
for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
|
|
|
|
pmd_t *pmd = pmd_cache_alloc(i);
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
if (!pmd)
|
|
|
|
goto out_oom;
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 13:27:13 -04:00
|
|
|
|
2007-02-13 07:26:21 -05:00
|
|
|
paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
|
2005-04-16 18:20:36 -04:00
|
|
|
set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
|
|
|
|
}
|
|
|
|
return pgd;
|
|
|
|
|
|
|
|
out_oom:
|
2007-02-13 07:26:21 -05:00
|
|
|
for (i--; i >= 0; i--) {
|
|
|
|
pgd_t pgdent = pgd[i];
|
|
|
|
void* pmd = (void *)__va(pgd_val(pgdent)-1);
|
|
|
|
paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 13:27:13 -04:00
|
|
|
pmd_cache_free(pmd, i);
|
2007-02-13 07:26:21 -05:00
|
|
|
}
|
2005-04-16 18:20:36 -04:00
|
|
|
kmem_cache_free(pgd_cache, pgd);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Free a page directory obtained from pgd_alloc().
 *
 * When PTRS_PER_PMD > 1, the pmd behind each of the first
 * UNSHARED_PTRS_PER_PGD slots is released first (paravirt_release_pd()
 * is told about the page frame before the pmd is freed); with
 * PTRS_PER_PMD == 1 no slots are walked.  The pgd then goes back to
 * pgd_cache.
 */
void pgd_free(pgd_t *pgd)
{
	/* in the PAE case user pgd entries are overwritten before usage */
	int nr_slots = (PTRS_PER_PMD > 1) ? UNSHARED_PTRS_PER_PGD : 0;
	int slot;

	for (slot = 0; slot < nr_slots; slot++) {
		pgd_t entry = pgd[slot];
		/* Entries hold "1 + physical address"; undo the +1. */
		void *pmd = (void *)__va(pgd_val(entry) - 1);

		paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
		pmd_cache_free(pmd, slot);
	}

	/* in the non-PAE case, free_pgtables() clears user pgd entries */
	kmem_cache_free(pgd_cache, pgd);
}
|