2005-04-16 18:20:36 -04:00
|
|
|
#ifndef _LINUX_MM_H
|
|
|
|
#define _LINUX_MM_H
|
|
|
|
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/errno.h>
|
2006-01-11 15:17:46 -05:00
|
|
|
#include <linux/capability.h>
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
|
|
|
|
#include <linux/gfp.h>
|
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/prio_tree.h>
|
|
|
|
#include <linux/fs.h>
|
2006-01-09 18:59:21 -05:00
|
|
|
#include <linux/mutex.h>
|
2006-07-03 03:24:33 -04:00
|
|
|
#include <linux/debug_locks.h>
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
struct mempolicy;
|
|
|
|
struct anon_vma;
|
|
|
|
|
|
|
|
#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */
|
|
|
|
extern unsigned long max_mapnr;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
extern unsigned long num_physpages;
|
|
|
|
extern void * high_memory;
|
|
|
|
extern unsigned long vmalloc_earlyreserve;
|
|
|
|
extern int page_cluster;
|
|
|
|
|
|
|
|
#ifdef CONFIG_SYSCTL
|
|
|
|
extern int sysctl_legacy_va_layout;
|
|
|
|
#else
|
|
|
|
#define sysctl_legacy_va_layout 0
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <asm/page.h>
|
|
|
|
#include <asm/pgtable.h>
|
|
|
|
#include <asm/processor.h>
|
|
|
|
|
|
|
|
/* Yield the struct page found n page frames after 'page', going through its pfn. */
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Linux kernel virtual memory manager primitives.
|
|
|
|
* The idea being to have a "virtual" mm in the same way
|
|
|
|
* we have a virtual fs - giving a cleaner interface to the
|
|
|
|
* mm details, and allowing different kinds of memory mappings
|
|
|
|
* (from shared memory to executable loading to arbitrary
|
|
|
|
* mmap() functions).
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This struct defines a memory VMM memory area. There is one of these
|
|
|
|
* per VM-area/task. A VM area is any part of the process virtual memory
|
|
|
|
* space that has a special rule for the page-fault handlers (ie a shared
|
|
|
|
* library, the executable area etc).
|
|
|
|
*/
|
|
|
|
struct vm_area_struct {
|
|
|
|
struct mm_struct * vm_mm; /* The address space we belong to. */
|
|
|
|
unsigned long vm_start; /* Our start address within vm_mm. */
|
|
|
|
unsigned long vm_end; /* The first byte after our end address
|
|
|
|
within vm_mm. */
|
|
|
|
|
|
|
|
/* linked list of VM areas per task, sorted by address */
|
|
|
|
struct vm_area_struct *vm_next;
|
|
|
|
|
|
|
|
pgprot_t vm_page_prot; /* Access permissions of this VMA. */
|
|
|
|
unsigned long vm_flags; /* Flags, listed below. */
|
|
|
|
|
|
|
|
struct rb_node vm_rb;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For areas with an address space and backing store,
|
|
|
|
* linkage into the address_space->i_mmap prio tree, or
|
|
|
|
* linkage to the list of like vmas hanging off its node, or
|
|
|
|
* linkage of vma in the address_space->i_mmap_nonlinear list.
|
|
|
|
*/
|
|
|
|
union {
|
|
|
|
struct {
|
|
|
|
struct list_head list;
|
|
|
|
void *parent; /* aligns with prio_tree_node parent */
|
|
|
|
struct vm_area_struct *head;
|
|
|
|
} vm_set;
|
|
|
|
|
|
|
|
struct raw_prio_tree_node prio_tree_node;
|
|
|
|
} shared;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
|
|
|
|
* list, after a COW of one of the file pages. A MAP_SHARED vma
|
|
|
|
* can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
|
|
|
|
* or brk vma (with NULL file) can only be in an anon_vma list.
|
|
|
|
*/
|
|
|
|
struct list_head anon_vma_node; /* Serialized by anon_vma->lock */
|
|
|
|
struct anon_vma *anon_vma; /* Serialized by page_table_lock */
|
|
|
|
|
|
|
|
/* Function pointers to deal with this struct. */
|
|
|
|
struct vm_operations_struct * vm_ops;
|
|
|
|
|
|
|
|
/* Information about our backing store: */
|
|
|
|
unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE
|
|
|
|
units, *not* PAGE_CACHE_SIZE */
|
|
|
|
struct file * vm_file; /* File we map to (can be NULL). */
|
|
|
|
void * vm_private_data; /* was vm_pte (shared mem) */
|
|
|
|
unsigned long vm_truncate_count;/* truncate_count or restart_addr */
|
|
|
|
|
|
|
|
#ifndef CONFIG_MMU
|
|
|
|
atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is
 * disabled, then there's a single shared list of VMAs maintained by the
 * system, and mm's subscribe to these individually.
 */
struct vm_list_struct {
	struct vm_list_struct *next;	/* following entry in this list */
	struct vm_area_struct *vma;	/* the VMA this entry refers to */
};
|
|
|
|
|
|
|
|
#ifndef CONFIG_MMU
|
|
|
|
extern struct rb_root nommu_vma_tree;
|
|
|
|
extern struct rw_semaphore nommu_vma_sem;
|
|
|
|
|
|
|
|
extern unsigned int kobjsize(const void *objp);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
 * vm_flags..
 *
 * Bit values stored in vm_area_struct.vm_flags.
 */
#define VM_READ		0x00000001	/* currently active flags */
#define VM_WRITE	0x00000002
#define VM_EXEC		0x00000004
#define VM_SHARED	0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD	0x00000010	/* limits for mprotect() etc */
#define VM_MAYWRITE	0x00000020
#define VM_MAYEXEC	0x00000040
#define VM_MAYSHARE	0x00000080

#define VM_GROWSDOWN	0x00000100	/* general info on the segment */
#define VM_GROWSUP	0x00000200
#define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */

#define VM_EXECUTABLE	0x00001000
#define VM_LOCKED	0x00002000
#define VM_IO		0x00004000	/* Memory mapped I/O or similar */

					/* Used by sys_madvise() */
#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */

#define VM_DONTCOPY	0x00020000	/* Do not copy this vma on fork */
#define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
#define VM_RESERVED	0x00080000	/* Count as reserved_vm like IO */
#define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
#define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
#define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
#define VM_MAPPED_COPY	0x01000000	/* T if mapped copy of data (nommu mmap) */
#define VM_INSERTPAGE	0x02000000	/* The vma has had "vm_insert_page()" done on it */
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
#ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif

/* Stack flags: growth direction bit, the stack defaults, and VM_ACCOUNT. */
#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK_FLAGS	(VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
#else
#define VM_STACK_FLAGS	(VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
#endif
|
|
|
|
|
|
|
|
/*
 * Read-ahead hint helpers.  'v' is a pointer to an object with a vm_flags
 * member.  The hint bits are VM_SEQ_READ and VM_RAND_READ.
 */
#define VM_READHINTMASK			(VM_SEQ_READ | VM_RAND_READ)
/*
 * Clear both hint bits.  The expansion is fully parenthesized so that the
 * assignment cannot be split by a neighbouring operator at the use site.
 */
#define VM_ClearReadHint(v)		((v)->vm_flags &= ~VM_READHINTMASK)
/* True when neither hint bit is set. */
#define VM_NormalReadHint(v)		(!((v)->vm_flags & VM_READHINTMASK))
/* Non-zero when VM_SEQ_READ is set. */
#define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
/* Non-zero when VM_RAND_READ is set. */
#define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* mapping from the currently active vm_flags protection bits (the
|
|
|
|
* low four bits) to a page protection mask..
|
|
|
|
*/
|
|
|
|
extern pgprot_t protection_map[16];
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These are the virtual MM functions - opening of an area, closing and
|
|
|
|
* unmapping it (needed to keep files on disk up-to-date etc), pointer
|
|
|
|
* to the functions called when a no-page or a wp-page exception occurs.
|
|
|
|
*/
|
|
|
|
struct vm_operations_struct {
|
|
|
|
void (*open)(struct vm_area_struct * area);
|
|
|
|
void (*close)(struct vm_area_struct * area);
|
|
|
|
struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
|
|
|
|
int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
|
2006-06-23 05:03:43 -04:00
|
|
|
|
|
|
|
/* notification that a previously read-only page is about to become
|
|
|
|
* writable, if an error is returned it will cause a SIGBUS */
|
|
|
|
int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);
|
2005-04-16 18:20:36 -04:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
|
|
|
|
struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr);
|
2006-06-25 08:46:48 -04:00
|
|
|
int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
|
|
|
|
const nodemask_t *to, unsigned long flags);
|
2005-04-16 18:20:36 -04:00
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
struct mmu_gather;
|
|
|
|
struct inode;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Each physical page in the system has a struct page associated with
|
|
|
|
* it to keep track of whatever it is we are using the page for at the
|
|
|
|
* moment. Note that we have no way to track which tasks are using
|
|
|
|
* a page.
|
|
|
|
*/
|
|
|
|
struct page {
|
2005-11-05 11:25:53 -05:00
|
|
|
unsigned long flags; /* Atomic flags, some possibly
|
2005-04-16 18:20:36 -04:00
|
|
|
* updated asynchronously */
|
|
|
|
atomic_t _count; /* Usage count, see below. */
|
|
|
|
atomic_t _mapcount; /* Count of ptes mapped in mms,
|
|
|
|
* to show when page is mapped
|
|
|
|
* & limit reverse map searches.
|
|
|
|
*/
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 21:16:40 -04:00
|
|
|
union {
|
2006-01-08 04:04:36 -05:00
|
|
|
struct {
|
|
|
|
unsigned long private; /* Mapping-private opaque data:
|
|
|
|
* usually used for buffer_heads
|
|
|
|
* if PagePrivate set; used for
|
2006-04-09 21:21:48 -04:00
|
|
|
* swp_entry_t if PageSwapCache;
|
2006-01-08 04:04:36 -05:00
|
|
|
* indicates order in the buddy
|
2006-04-09 21:21:48 -04:00
|
|
|
* system if PG_buddy is set.
|
2006-01-08 04:04:36 -05:00
|
|
|
*/
|
|
|
|
struct address_space *mapping; /* If low bit clear, points to
|
|
|
|
* inode address_space, or NULL.
|
|
|
|
* If page mapped as anonymous
|
|
|
|
* memory, low bit is set, and
|
|
|
|
* it points to anon_vma object:
|
|
|
|
* see PAGE_MAPPING_ANON below.
|
|
|
|
*/
|
|
|
|
};
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 21:16:40 -04:00
|
|
|
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
|
2006-01-08 04:04:36 -05:00
|
|
|
spinlock_t ptl;
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 21:16:40 -04:00
|
|
|
#endif
|
2006-01-08 04:04:36 -05:00
|
|
|
};
|
2005-04-16 18:20:36 -04:00
|
|
|
pgoff_t index; /* Our offset within mapping. */
|
|
|
|
struct list_head lru; /* Pageout list, eg. active_list
|
|
|
|
* protected by zone->lru_lock !
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* On machines where all RAM is mapped into kernel address space,
|
|
|
|
* we can simply calculate the virtual address. On machines with
|
|
|
|
* highmem some memory is mapped into kernel virtual memory
|
|
|
|
* dynamically, so we need a place to store that address.
|
|
|
|
* Note that this field could be 16 bits on x86 ... ;)
|
|
|
|
*
|
|
|
|
* Architectures with slow multiplication can define
|
|
|
|
* WANT_PAGE_VIRTUAL in asm/page.h
|
|
|
|
*/
|
|
|
|
#if defined(WANT_PAGE_VIRTUAL)
|
|
|
|
void *virtual; /* Kernel virtual address (NULL if
|
|
|
|
not kmapped, ie. highmem) */
|
|
|
|
#endif /* WANT_PAGE_VIRTUAL */
|
|
|
|
};
|
|
|
|
|
2006-01-08 04:04:36 -05:00
|
|
|
/* Accessors for the 'private' member of the object 'page' points to. */
#define page_private(page)		((page)->private)
#define set_page_private(page, v)	((page)->private = (v))
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 21:16:40 -04:00
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
/*
|
|
|
|
* FIXME: take this include out, include page-flags.h in
|
|
|
|
* files which need it (119 of them)
|
|
|
|
*/
|
|
|
|
#include <linux/page-flags.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Methods to modify the page usage count.
|
|
|
|
*
|
|
|
|
* What counts for a page usage:
|
|
|
|
* - cache mapping (page->mapping)
|
|
|
|
* - private data (page->private)
|
|
|
|
* - page mapped in a task's page tables, each mapping
|
|
|
|
* is counted separately
|
|
|
|
*
|
|
|
|
* Also, many kernel routines increase the page count before a critical
|
|
|
|
* routine so they can be sure the page doesn't go away from under them.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Drop a ref, return true if the logical refcount fell to zero (the page has
|
|
|
|
* no users)
|
|
|
|
*/
|
2006-03-22 03:08:03 -05:00
|
|
|
static inline int put_page_testzero(struct page *page)
|
|
|
|
{
|
2006-03-22 03:08:03 -05:00
|
|
|
BUG_ON(atomic_read(&page->_count) == 0);
|
|
|
|
return atomic_dec_and_test(&page->_count);
|
2006-03-22 03:08:03 -05:00
|
|
|
}
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
/*
|
2006-03-22 03:08:03 -05:00
|
|
|
* Try to grab a ref unless the page has a refcount of zero, return false if
|
|
|
|
* that is the case.
|
2005-04-16 18:20:36 -04:00
|
|
|
*/
|
2006-03-22 03:08:03 -05:00
|
|
|
static inline int get_page_unless_zero(struct page *page)
|
|
|
|
{
|
2006-03-22 03:08:03 -05:00
|
|
|
return atomic_inc_not_zero(&page->_count);
|
2006-03-22 03:08:03 -05:00
|
|
|
}
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
extern void FASTCALL(__page_cache_release(struct page *));
|
|
|
|
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 21:16:40 -04:00
|
|
|
static inline int page_count(struct page *page)
|
2005-04-16 18:20:36 -04:00
|
|
|
{
|
2006-03-22 03:08:43 -05:00
|
|
|
if (unlikely(PageCompound(page)))
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 21:16:40 -04:00
|
|
|
page = (struct page *)page_private(page);
|
2006-03-22 03:08:03 -05:00
|
|
|
return atomic_read(&page->_count);
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void get_page(struct page *page)
|
|
|
|
{
|
|
|
|
if (unlikely(PageCompound(page)))
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 21:16:40 -04:00
|
|
|
page = (struct page *)page_private(page);
|
2005-04-16 18:20:36 -04:00
|
|
|
atomic_inc(&page->_count);
|
|
|
|
}
|
|
|
|
|
2006-03-22 03:08:40 -05:00
|
|
|
/*
|
|
|
|
* Setup the page count before being freed into the page allocator for
|
|
|
|
* the first time (boot or memory hotplug)
|
|
|
|
*/
|
|
|
|
static inline void init_page_count(struct page *page)
|
|
|
|
{
|
|
|
|
atomic_set(&page->_count, 1);
|
|
|
|
}
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
void put_page(struct page *page);
|
2006-08-14 02:24:27 -04:00
|
|
|
void put_pages_list(struct list_head *pages);
|
2005-04-16 18:20:36 -04:00
|
|
|
|
2006-03-22 03:08:05 -05:00
|
|
|
void split_page(struct page *page, unsigned int order);
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
/*
|
|
|
|
* Multiple processes may "see" the same page. E.g. for untouched
|
|
|
|
* mappings of /dev/null, all processes see the same page full of
|
|
|
|
* zeroes, and text pages of executables and shared libraries have
|
|
|
|
* only one copy in memory, at most, normally.
|
|
|
|
*
|
|
|
|
* For the non-reserved pages, page_count(page) denotes a reference count.
|
2005-09-21 12:55:38 -04:00
|
|
|
* page_count() == 0 means the page is free. page->lru is then used for
|
|
|
|
* freelist management in the buddy allocator.
|
2005-04-16 18:20:36 -04:00
|
|
|
* page_count() == 1 means the page is used for exactly one purpose
|
|
|
|
* (e.g. a private data page of one process).
|
|
|
|
*
|
|
|
|
* A page may be used for kmalloc() or anyone else who does a
|
|
|
|
* __get_free_page(). In this case the page_count() is at least 1, and
|
|
|
|
* all other fields are unused but should be 0 or NULL. The
|
|
|
|
* management of this page is the responsibility of the one who uses
|
|
|
|
* it.
|
|
|
|
*
|
|
|
|
* The other pages (we may call them "process pages") are completely
|
|
|
|
* managed by the Linux memory manager: I/O, buffers, swapping etc.
|
|
|
|
* The following discussion applies only to them.
|
|
|
|
*
|
|
|
|
* A page may belong to an inode's memory mapping. In this case,
|
|
|
|
* page->mapping points to the inode's address_space, and page->index is the
|
|
|
|
* file offset of the page, in units of PAGE_CACHE_SIZE.
|
|
|
|
*
|
|
|
|
* A page contains an opaque `private' member, which belongs to the
|
|
|
|
* page's address_space. Usually, this is the address of a circular
|
|
|
|
* list of the page's disk buffers.
|
|
|
|
*
|
|
|
|
* For pages belonging to inodes, the page_count() is the number of
|
|
|
|
* attaches, plus 1 if `private' contains something, plus one for
|
|
|
|
* the page cache itself.
|
|
|
|
*
|
2005-09-21 12:55:38 -04:00
|
|
|
* Instead of keeping dirty/clean pages in per address-space lists, we
|
|
|
|
* now tag pages as dirty/under writeback in the radix tree.
|
2005-04-16 18:20:36 -04:00
|
|
|
*
|
|
|
|
* There is also a per-mapping radix tree mapping index to the page
|
|
|
|
* in memory if present. The tree is rooted at mapping->root.
|
|
|
|
*
|
|
|
|
* All process pages can do I/O:
|
|
|
|
* - inode pages may need to be read from disk,
|
|
|
|
* - inode pages which have been modified and are MAP_SHARED may need
|
|
|
|
* to be written to disk,
|
|
|
|
* - private pages which have been modified may need to be swapped out
|
|
|
|
* to swap space and (later) to be read back into memory.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The zone field is never updated after free_area_init_core()
|
|
|
|
* sets it, so none of the operations on it need to be atomic.
|
|
|
|
*/
|
2005-06-23 03:07:40 -04:00
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 03:07:54 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* page->flags layout:
|
|
|
|
*
|
|
|
|
* There are three possibilities for how page->flags get
|
|
|
|
* laid out. The first is for the normal case, without
|
|
|
|
* sparsemem. The second is for sparsemem when there is
|
|
|
|
* plenty of space for node and section. The last is when
|
|
|
|
* we have run out of space and have to fall back to an
|
|
|
|
* alternate (slower) way of determining the node.
|
|
|
|
*
|
|
|
|
* No sparsemem: | NODE | ZONE | ... | FLAGS |
|
|
|
|
* with space for node: | SECTION | NODE | ZONE | ... | FLAGS |
|
|
|
|
* no space for node: | SECTION | ZONE | ... | FLAGS |
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_SPARSEMEM
|
|
|
|
#define SECTIONS_WIDTH SECTIONS_SHIFT
|
|
|
|
#else
|
|
|
|
#define SECTIONS_WIDTH 0
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define ZONES_WIDTH ZONES_SHIFT
|
|
|
|
|
|
|
|
#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED
|
|
|
|
#define NODES_WIDTH NODES_SHIFT
|
|
|
|
#else
|
|
|
|
#define NODES_WIDTH 0
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
|
2005-11-05 11:25:53 -05:00
|
|
|
#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 03:07:54 -04:00
|
|
|
#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
|
|
|
|
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are going to use the flags for the page to node mapping if it's in
|
|
|
|
* there. This includes the case where there is no node, so it is implicit.
|
|
|
|
*/
|
|
|
|
#define FLAGS_HAS_NODE (NODES_WIDTH > 0 || NODES_SHIFT == 0)
|
|
|
|
|
|
|
|
#ifndef PFN_SECTION_SHIFT
|
|
|
|
#define PFN_SECTION_SHIFT 0
|
|
|
|
#endif
|
2005-06-23 03:07:40 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Define the bit shifts to access each section. For non-existent
|
|
|
|
* sections we define the shift as 0; that plus a 0 mask ensures
|
|
|
|
* the compiler will optimise away reference to them.
|
|
|
|
*/
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 03:07:54 -04:00
|
|
|
#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
|
|
|
|
#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
|
|
|
|
#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
|
2005-06-23 03:07:40 -04:00
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 03:07:54 -04:00
|
|
|
/* NODE:ZONE or SECTION:ZONE is used to look up the zone from a page. */
|
|
|
|
#if FLAGS_HAS_NODE
|
2005-06-23 03:07:40 -04:00
|
|
|
#define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT)
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 03:07:54 -04:00
|
|
|
#else
|
|
|
|
#define ZONETABLE_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT)
|
|
|
|
#endif
|
2005-06-23 03:07:40 -04:00
|
|
|
#define ZONETABLE_PGSHIFT ZONES_PGSHIFT
|
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 03:07:54 -04:00
|
|
|
#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
|
|
|
|
#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
|
2005-06-23 03:07:40 -04:00
|
|
|
#endif
|
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 03:07:54 -04:00
|
|
|
/*
 * Right-aligned bit masks for the fields packed into page->flags.
 * The accessors in this file shift page->flags down by the matching
 * *_PGSHIFT and then AND with one of these masks.  A width (or shift)
 * of 0 produces a mask of 0.
 */
#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
|
|
|
|
#define NODES_MASK ((1UL << NODES_WIDTH) - 1)
|
|
|
|
#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
|
2005-06-23 03:07:40 -04:00
|
|
|
#define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1)
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
static inline unsigned long page_zonenum(struct page *page)
|
|
|
|
{
|
2005-06-23 03:07:40 -04:00
|
|
|
return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
struct zone;
|
|
|
|
extern struct zone *zone_table[];
|
|
|
|
|
2006-06-23 05:03:01 -04:00
|
|
|
static inline int page_zone_id(struct page *page)
|
|
|
|
{
|
|
|
|
return (page->flags >> ZONETABLE_PGSHIFT) & ZONETABLE_MASK;
|
|
|
|
}
|
2005-04-16 18:20:36 -04:00
|
|
|
static inline struct zone *page_zone(struct page *page)
|
|
|
|
{
|
2006-06-23 05:03:01 -04:00
|
|
|
return zone_table[page_zone_id(page)];
|
2005-06-23 03:07:40 -04:00
|
|
|
}
|
|
|
|
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 03:07:54 -04:00
|
|
|
static inline unsigned long page_to_nid(struct page *page)
|
|
|
|
{
|
|
|
|
if (FLAGS_HAS_NODE)
|
|
|
|
return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
|
|
|
|
else
|
|
|
|
return page_zone(page)->zone_pgdat->node_id;
|
|
|
|
}
|
|
|
|
static inline unsigned long page_to_section(struct page *page)
|
|
|
|
{
|
|
|
|
return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
|
|
|
|
}
|
|
|
|
|
2005-06-23 03:07:40 -04:00
|
|
|
static inline void set_page_zone(struct page *page, unsigned long zone)
|
|
|
|
{
|
|
|
|
page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
|
|
|
|
page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
|
|
|
|
}
|
|
|
|
static inline void set_page_node(struct page *page, unsigned long node)
|
|
|
|
{
|
|
|
|
page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
|
|
|
|
page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 03:07:54 -04:00
|
|
|
static inline void set_page_section(struct page *page, unsigned long section)
|
|
|
|
{
|
|
|
|
page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
|
|
|
|
page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
|
|
|
|
}
|
2005-04-16 18:20:36 -04:00
|
|
|
|
2005-06-23 03:07:40 -04:00
|
|
|
/*
 * Record a page's zone, node and section in page->flags in one call.
 * The section number is derived from @pfn with pfn_to_section_nr();
 * each field is written by its own set_page_*() helper.
 */
static inline void set_page_links(struct page *page, unsigned long zone,
	unsigned long node, unsigned long pfn)
{
	unsigned long section;

	set_page_zone(page, zone);
	set_page_node(page, node);

	section = pfn_to_section_nr(pfn);
	set_page_section(page, section);
}
|
|
|
|
|
2006-06-30 04:55:32 -04:00
|
|
|
/*
|
|
|
|
* Some inline functions in vmstat.h depend on page_zone()
|
|
|
|
*/
|
|
|
|
#include <linux/vmstat.h>
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
#if !defined(CONFIG_DISCONTIGMEM)
/*
 * The array of struct pages.  It is only declared when the kernel is not
 * built for discontigmem; discontigmem code uses pgdat->lmem_map instead.
 */
extern struct page *mem_map;
#endif /* !CONFIG_DISCONTIGMEM */
|
|
|
|
|
2006-01-14 16:21:30 -05:00
|
|
|
static __always_inline void *lowmem_page_address(struct page *page)
|
2005-04-16 18:20:36 -04:00
|
|
|
{
|
|
|
|
return __va(page_to_pfn(page) << PAGE_SHIFT);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * HASHED_PAGE_VIRTUAL is selected for highmem kernels whose architecture
 * did not ask for WANT_PAGE_VIRTUAL.  In that configuration page_address()
 * and friends are out-of-line functions instead of macros.
 */
#if defined(CONFIG_HIGHMEM)
# if !defined(WANT_PAGE_VIRTUAL)
#  define HASHED_PAGE_VIRTUAL
# endif
#endif
|
|
|
|
|
|
|
|
#ifdef WANT_PAGE_VIRTUAL
/* The virtual address is kept directly in the page's ->virtual field. */
#define page_address(page)	((page)->virtual)
#define set_page_address(page, address)					\
	do { (page)->virtual = (address); } while(0)
/* Nothing to initialise in this configuration. */
#define page_address_init()	do { } while(0)
#endif /* WANT_PAGE_VIRTUAL */
|
|
|
|
|
|
|
|
#ifdef HASHED_PAGE_VIRTUAL
/*
 * In this configuration the page address accessors are real functions,
 * defined outside this header.
 */
extern void *page_address(struct page *page);
extern void set_page_address(struct page *page, void *virtual);
extern void page_address_init(void);
#endif /* HASHED_PAGE_VIRTUAL */
|
|
|
|
|
|
|
|
#if !(defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL))
/*
 * Neither variant above applies: page_address() is computed by
 * lowmem_page_address(), and setting or initialising it is a no-op.
 */
#define page_address(page)		lowmem_page_address(page)
#define set_page_address(page, address)	do { } while(0)
#define page_address_init()		do { } while(0)
#endif /* !HASHED_PAGE_VIRTUAL && !WANT_PAGE_VIRTUAL */
|
|
|
|
|
|
|
|
/*
 * For an anonymous page mapped into a user virtual memory area,
 * page->mapping holds a pointer to the page's anon_vma rather than to a
 * struct address_space, and the PAGE_MAPPING_ANON bit is set in it to
 * tell the two cases apart.
 *
 * Beware of the naming: "page_mapping" is about the inode address_space
 * that maps the page from disk, while "page_mapped" is about the page
 * being mapped into a user virtual address space.
 */
#define PAGE_MAPPING_ANON	1
|
|
|
|
|
|
|
|
/* Address space that page_mapping() reports for pages in the swap cache. */
extern struct address_space swapper_space;
|
|
|
|
static inline struct address_space *page_mapping(struct page *page)
|
|
|
|
{
|
|
|
|
struct address_space *mapping = page->mapping;
|
|
|
|
|
|
|
|
if (unlikely(PageSwapCache(page)))
|
|
|
|
mapping = &swapper_space;
|
|
|
|
else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON))
|
|
|
|
mapping = NULL;
|
|
|
|
return mapping;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int PageAnon(struct page *page)
|
|
|
|
{
|
|
|
|
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the pagecache index of the passed page. Regular pagecache pages
|
|
|
|
* use ->index whereas swapcache pages use ->private
|
|
|
|
*/
|
|
|
|
static inline pgoff_t page_index(struct page *page)
|
|
|
|
{
|
|
|
|
if (unlikely(PageSwapCache(page)))
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 21:16:40 -04:00
|
|
|
return page_private(page);
|
2005-04-16 18:20:36 -04:00
|
|
|
return page->index;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The atomic page->_mapcount, like _count, starts from -1:
|
|
|
|
* so that transitions both from it and to it can be tracked,
|
|
|
|
* using atomic_inc_and_test and atomic_add_negative(-1).
|
|
|
|
*/
|
|
|
|
static inline void reset_page_mapcount(struct page *page)
|
|
|
|
{
|
|
|
|
atomic_set(&(page)->_mapcount, -1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int page_mapcount(struct page *page)
|
|
|
|
{
|
|
|
|
return atomic_read(&(page)->_mapcount) + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return true if this page is mapped into pagetables.
|
|
|
|
*/
|
|
|
|
static inline int page_mapped(struct page *page)
|
|
|
|
{
|
|
|
|
return atomic_read(&(page)->_mapcount) >= 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Values the *_nopage functions return, in place of a page, to report
 * an error.
 */
#define NOPAGE_SIGBUS	(NULL)
#define NOPAGE_OOM	((struct page *)(-1))
|
|
|
|
|
|
|
|
/*
 * Fault kinds returned by handle_mm_fault().  Callers use them to decide
 * between delivering SIGBUS to the process and merely bumping its
 * major/minor fault counters.
 */
#define VM_FAULT_OOM		0x00
#define VM_FAULT_SIGBUS		0x01
#define VM_FAULT_MINOR		0x02
#define VM_FAULT_MAJOR		0x03
|
|
|
|
|
|
|
|
/*
 * Extra bit used as a special case for get_user_pages.  It has to occupy
 * a bit that none of the VM_FAULT_ codes above use.
 */
#define VM_FAULT_WRITE		0x10
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
/* Bits of @p that PAGE_MASK clears, i.e. the offset of @p within its page. */
#define offset_in_page(p)	(~PAGE_MASK & (unsigned long)(p))
|
|
|
|
|
|
|
|
extern void show_free_areas(void);
|
|
|
|
|
|
|
|
#ifdef CONFIG_SHMEM
/* Real implementations, provided outside this header. */
extern struct page *shmem_nopage(struct vm_area_struct *vma,
				 unsigned long address, int *type);
extern int shmem_set_policy(struct vm_area_struct *vma,
			    struct mempolicy *new);
extern struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
					  unsigned long addr);
extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
#else /* !CONFIG_SHMEM */
/* Without shmem, shmem_nopage is simply another name for filemap_nopage. */
#define shmem_nopage filemap_nopage

/* Stub: does nothing and always returns 0. */
static inline int shmem_lock(struct file *file, int lock,
			     struct user_struct *user)
{
	return 0;
}

/* Stub: ignores the policy and always returns 0. */
static inline int shmem_set_policy(struct vm_area_struct *vma,
				   struct mempolicy *new)
{
	return 0;
}

/* Stub: there is never a policy to report, so always returns NULL. */
static inline struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
						 unsigned long addr)
{
	return NULL;
}
#endif /* CONFIG_SHMEM */
|
|
|
|
struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags);
|
2006-01-06 03:11:42 -05:00
|
|
|
extern int shmem_mmap(struct file *file, struct vm_area_struct *vma);
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
int shmem_zero_setup(struct vm_area_struct *);
|
|
|
|
|
2006-01-06 03:11:42 -05:00
|
|
|
#if !defined(CONFIG_MMU)
/* Only declared for kernels built without CONFIG_MMU. */
extern unsigned long shmem_get_unmapped_area(struct file *file,
		unsigned long addr, unsigned long len,
		unsigned long pgoff, unsigned long flags);
#endif /* !CONFIG_MMU */
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
static inline int can_do_mlock(void)
|
|
|
|
{
|
|
|
|
if (capable(CAP_IPC_LOCK))
|
|
|
|
return 1;
|
|
|
|
if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
extern int user_shm_lock(size_t, struct user_struct *);
|
|
|
|
extern void user_shm_unlock(size_t, struct user_struct *);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parameter block passed down to zap_pte_range in exceptional cases.
|
|
|
|
*/
|
|
|
|
struct zap_details {
|
|
|
|
struct vm_area_struct *nonlinear_vma; /* Check page->index if set */
|
|
|
|
struct address_space *check_mapping; /* Check page->mapping if set */
|
|
|
|
pgoff_t first_index; /* Lowest page->index to unmap */
|
|
|
|
pgoff_t last_index; /* Highest page->index to unmap */
|
|
|
|
spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */
|
|
|
|
unsigned long truncate_count; /* Compare vm_truncate_count */
|
|
|
|
};
|
|
|
|
|
2005-11-28 17:34:23 -05:00
|
|
|
struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
|
2005-04-19 16:29:15 -04:00
|
|
|
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
|
2005-04-16 18:20:36 -04:00
|
|
|
unsigned long size, struct zap_details *);
|
2005-10-29 21:16:30 -04:00
|
|
|
unsigned long unmap_vmas(struct mmu_gather **tlb,
|
2005-04-16 18:20:36 -04:00
|
|
|
struct vm_area_struct *start_vma, unsigned long start_addr,
|
|
|
|
unsigned long end_addr, unsigned long *nr_accounted,
|
|
|
|
struct zap_details *);
|
2005-04-19 16:29:16 -04:00
|
|
|
void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
|
|
|
|
unsigned long end, unsigned long floor, unsigned long ceiling);
|
|
|
|
void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
|
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region use it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, going
by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-04-19 16:29:15 -04:00
|
|
|
unsigned long floor, unsigned long ceiling);
|
2005-04-16 18:20:36 -04:00
|
|
|
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
|
|
|
|
struct vm_area_struct *vma);
|
|
|
|
int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
|
|
|
|
unsigned long size, pgprot_t prot);
|
|
|
|
void unmap_mapping_range(struct address_space *mapping,
|
|
|
|
loff_t const holebegin, loff_t const holelen, int even_cows);
|
|
|
|
|
|
|
|
/*
 * Convenience wrapper: call unmap_mapping_range() on the given hole with
 * its even_cows argument fixed to 0.
 */
static inline void unmap_shared_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen)
{
	const int even_cows = 0;

	unmap_mapping_range(mapping, holebegin, holelen, even_cows);
}
|
|
|
|
|
|
|
|
extern int vmtruncate(struct inode * inode, loff_t offset);
|
[PATCH] madvise(MADV_REMOVE): remove pages from tmpfs shm backing store
Here is the patch to implement madvise(MADV_REMOVE) - which frees up a
given range of pages & its associated backing store. Current
implementation supports only shmfs/tmpfs and other filesystems return
-ENOSYS.
"Some app allocates large tmpfs files, then when some task quits and some
client disconnect, some memory can be released. However the only way to
release tmpfs-swap is to MADV_REMOVE". - Andrea Arcangeli
Databases want to use this feature to drop a section of their bufferpool
(shared memory segments) - without writing back to disk/swap space.
This feature is also useful for supporting hot-plug memory on UML.
Concerns raised by Andrew Morton:
- "We have no plan for holepunching! If we _do_ have such a plan (or
might in the future) then what would the API look like? I think
sys_holepunch(fd, start, len), so we should start out with that."
- Using madvise is very weird, because people will ask "why do I need to
mmap my file before I can stick a hole in it?"
- None of the other madvise operations call into the filesystem in this
manner. A broad question is: is this capability an MM operation or a
filesystem operation? truncate, for example, is a filesystem operation
which sometimes has MM side-effects. madvise is an mm operation and with
this patch, it gains FS side-effects, only they're really, really
significant ones."
Comments:
- Andrea suggested the fs operation too but then it's more efficient to
have it as a mm operation with fs side effects, because they don't
immediately know fd and physical offset of the range. It's possible to
fixup in userland and to use the fs operation but it's more expensive,
the vmas are already in the kernel and we can use them.
Short term plan & Future Direction:
- We seem to need this interface only for shmfs/tmpfs files in the short
term. We have to add hooks into the filesystem for correctness and
completeness. This is what this patch does.
- In the future, plan is to support both fs and mmap apis also. This
also involves (other) filesystem specific functions to be implemented.
- Current patch doesn't support VM_NONLINEAR - which can be addressed in
the future.
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andrea Arcangeli <andrea@suse.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-06 03:10:38 -05:00
|
|
|
/*
 * Introduced together with madvise(MADV_REMOVE), which frees a given range
 * of pages and the associated backing store; according to that changelog
 * only shmfs/tmpfs supported it at the time.
 */
extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
|
2005-04-16 18:20:36 -04:00
|
|
|
extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
|
|
|
|
extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
|
[PATCH] fix get_user_pages bug
Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte which leaves dirty when clearing write.
So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.
But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing
them all in a hurry, make an inline wrapper function (using the old
name) that masks off the new bit, and use the extended interface with
double underscores.
Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in rare case it's needed, another do_wp_page will follow.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-08-03 06:24:01 -04:00
|
|
|
|
2006-01-06 03:11:44 -05:00
|
|
|
#ifdef CONFIG_MMU
|
|
|
|
extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma,
|
|
|
|
unsigned long address, int write_access);
|
|
|
|
|
|
|
|
static inline int handle_mm_fault(struct mm_struct *mm,
|
|
|
|
struct vm_area_struct *vma, unsigned long address,
|
|
|
|
int write_access)
|
[PATCH] fix get_user_pages bug
Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte which leaves dirty when clearing write.
So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.
But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing
them all in a hurry, make an inline wrapper function (using the old
name) that masks off the new bit, and use the extended interface with
double underscores.
Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in rare case it's needed, another do_wp_page will follow.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-08-03 06:24:01 -04:00
|
|
|
{
|
2006-01-06 03:11:44 -05:00
|
|
|
return __handle_mm_fault(mm, vma, address, write_access) &
|
|
|
|
(~VM_FAULT_WRITE);
|
[PATCH] fix get_user_pages bug
Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte which leaves dirty when clearing write.
So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.
But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing
them all in a hurry, make an inline wrapper function (using the old
name) that masks off the new bit, and use the extended interface with
double underscores.
Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in rare case it's needed, another do_wp_page will follow.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-08-03 06:24:01 -04:00
|
|
|
}
|
2006-01-06 03:11:44 -05:00
|
|
|
#else
|
|
|
|
static inline int handle_mm_fault(struct mm_struct *mm,
|
|
|
|
struct vm_area_struct *vma, unsigned long address,
|
|
|
|
int write_access)
|
|
|
|
{
|
|
|
|
/* should never happen if there's no MMU */
|
|
|
|
BUG();
|
|
|
|
return VM_FAULT_SIGBUS;
|
|
|
|
}
|
|
|
|
#endif
|
[PATCH] fix get_user_pages bug
Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte which leaves dirty when clearing write.
So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.
But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case. To avoid changing
them all in a hurry, make an inline wrapper function (using the old
name) that masks off the new bit, and use the extended interface with
double underscores.
Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in rare case it's needed, another do_wp_page will follow.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-08-03 06:24:01 -04:00
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
extern int make_pages_present(unsigned long addr, unsigned long end);
|
|
|
|
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
|
|
|
|
void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
|
|
|
|
|
|
|
|
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
|
|
|
|
int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
|
2005-10-29 21:16:12 -04:00
|
|
|
void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
int __set_page_dirty_buffers(struct page *page);
|
|
|
|
int __set_page_dirty_nobuffers(struct page *page);
|
|
|
|
int redirty_page_for_writepage(struct writeback_control *wbc,
|
|
|
|
struct page *page);
|
|
|
|
int FASTCALL(set_page_dirty(struct page *page));
|
|
|
|
int set_page_dirty_lock(struct page *page);
|
|
|
|
int clear_page_dirty_for_io(struct page *page);
|
|
|
|
|
|
|
|
extern unsigned long do_mremap(unsigned long addr,
|
|
|
|
unsigned long old_len, unsigned long new_len,
|
|
|
|
unsigned long flags, unsigned long new_addr);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prototype to add a shrinker callback for ageable caches.
|
|
|
|
*
|
|
|
|
* These functions are passed a count `nr_to_scan' and a gfpmask. They should
|
|
|
|
* scan `nr_to_scan' objects, attempting to free them.
|
|
|
|
*
|
2005-05-05 19:16:14 -04:00
|
|
|
* The callback must return the number of objects which remain in the cache.
|
2005-04-16 18:20:36 -04:00
|
|
|
*
|
2005-05-05 19:16:14 -04:00
|
|
|
* The callback will be passed nr_to_scan == 0 when the VM is querying the
|
2005-04-16 18:20:36 -04:00
|
|
|
* cache size, so a fastpath for that case is appropriate.
|
|
|
|
*/
|
2005-10-21 03:18:50 -04:00
|
|
|
typedef int (*shrinker_t)(int nr_to_scan, gfp_t gfp_mask);
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Add an aging callback. The int is the number of 'seeks' it takes
|
|
|
|
* to recreate one of the objects that these functions age.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define DEFAULT_SEEKS 2
|
|
|
|
struct shrinker;
|
|
|
|
extern struct shrinker *set_shrinker(int, shrinker_t);
|
|
|
|
extern void remove_shrinker(struct shrinker *shrinker);
|
|
|
|
|
2005-11-29 17:03:14 -05:00
|
|
|
extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl));
|
|
|
|
|
2005-10-29 21:16:22 -04:00
|
|
|
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
|
|
|
|
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
|
|
|
|
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
|
|
|
|
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
/*
|
|
|
|
* The following ifdef needed to get the 4level-fixup.h header to work.
|
|
|
|
* Remove it when 4level-fixup.h has been removed.
|
|
|
|
*/
|
2005-10-29 21:16:22 -04:00
|
|
|
#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
|
2005-04-16 18:20:36 -04:00
|
|
|
/*
 * Return the pud entry for @address under @pgd.  If *@pgd is empty,
 * __pud_alloc() is called first; a non-zero result from it makes this
 * function return NULL.
 */
static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	if (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))
		return NULL;

	return pud_offset(pgd, address);
}
|
|
|
|
|
|
|
|
/*
 * Return the pmd entry for @address under @pud.  If *@pud is empty,
 * __pmd_alloc() is called first; a non-zero result from it makes this
 * function return NULL.
 */
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	if (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))
		return NULL;

	return pmd_offset(pud, address);
}
|
2005-10-29 21:16:22 -04:00
|
|
|
#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
|
|
|
|
|
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 21:16:40 -04:00
|
|
|
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
/*
 * Split page table lock: each pagetable page is guarded by its own
 * spinlock, which lives in that page's struct page as page->ptl.
 * When freeing, reset page->mapping so free_pages_check won't complain.
 */

/*
 * Address of the spinlock embedded in @page.  The expansion is wrapped in
 * parentheses so that it always behaves as a single operand; without them
 * a postfix operator written after the macro would bind to (page)->ptl
 * instead of to the resulting pointer.
 */
#define __pte_lockptr(page)	(&((page)->ptl))

/* Initialise the lock of a pagetable page. */
#define pte_lock_init(_page)	do {					\
	spin_lock_init(__pte_lockptr(_page));				\
} while (0)

/* Clear page->mapping; see the note about freeing above. */
#define pte_lock_deinit(page)	((page)->mapping = NULL)

/*
 * Lock guarding the pagetable page that @pmd points at.  @mm is evaluated
 * and discarded so that both configurations take the same arguments.
 */
#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
#else
/*
 * We use mm->page_table_lock to guard all pagetable pages of the mm.
 */
#define pte_lock_init(page)	do {} while (0)
#define pte_lock_deinit(page)	do {} while (0)
/* @pmd is evaluated and discarded; the lock is the mm-wide one. */
#define pte_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
|
|
|
|
|
2005-10-29 21:16:23 -04:00
|
|
|
/*
 * Map the pte for @address under @pmd and take the lock guarding it.
 * In order: look up the lock, map the pte, store the lock pointer through
 * @ptlp, acquire the lock.  The value of the expression is the mapped pte.
 */
#define pte_offset_map_lock(mm, pmd, address, ptlp)			\
({									\
	spinlock_t *__ptl = pte_lockptr(mm, pmd);			\
	pte_t *__pte = pte_offset_map(pmd, address);			\
									\
	*(ptlp) = __ptl;						\
	spin_lock(__ptl);						\
	__pte;								\
})
|
|
|
|
|
|
|
|
/* Release @ptl first, then unmap @pte. */
#define pte_unmap_unlock(pte, ptl)					\
	do {								\
		spin_unlock(ptl);					\
		pte_unmap(pte);						\
	} while (0)
|
|
|
|
|
2005-10-29 21:16:22 -04:00
|
|
|
/*
 * If *@pmd is not present, call __pte_alloc(); a non-zero result from it
 * makes the expression NULL.  Otherwise yield pte_offset_map().
 */
#define pte_alloc_map(mm, pmd, address)					\
	((unlikely(!pmd_present(*(pmd))) &&				\
	  __pte_alloc(mm, pmd, address)) ?				\
		NULL : pte_offset_map(pmd, address))
|
|
|
|
|
2005-10-29 21:16:23 -04:00
|
|
|
/*
 * Same as pte_alloc_map, except that on success the pte is obtained with
 * pte_offset_map_lock(), which also reports the lock through @ptlp.
 */
#define pte_alloc_map_lock(mm, pmd, address, ptlp)			\
	((unlikely(!pmd_present(*(pmd))) &&				\
	  __pte_alloc(mm, pmd, address)) ?				\
		NULL : pte_offset_map_lock(mm, pmd, address, ptlp))
|
|
|
|
|
2005-10-29 21:16:22 -04:00
|
|
|
/*
 * If *@pmd is not present, call __pte_alloc_kernel(); a non-zero result
 * from it makes the expression NULL.  Otherwise yield pte_offset_kernel().
 */
#define pte_alloc_kernel(pmd, address)					\
	((unlikely(!pmd_present(*(pmd))) &&				\
	  __pte_alloc_kernel(pmd, address)) ?				\
		NULL : pte_offset_kernel(pmd, address))
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
extern void free_area_init(unsigned long * zones_size);
|
|
|
|
extern void free_area_init_node(int nid, pg_data_t *pgdat,
|
|
|
|
unsigned long * zones_size, unsigned long zone_start_pfn,
|
|
|
|
unsigned long *zholes_size);
|
|
|
|
extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
|
2005-10-29 21:16:54 -04:00
|
|
|
extern void setup_per_zone_pages_min(void);
|
2005-04-16 18:20:36 -04:00
|
|
|
extern void mem_init(void);
|
|
|
|
extern void show_mem(void);
|
|
|
|
extern void si_meminfo(struct sysinfo * val);
|
|
|
|
extern void si_meminfo_node(struct sysinfo *val, int nid);
|
|
|
|
|
2005-06-21 20:14:47 -04:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
extern void setup_per_cpu_pageset(void);
|
|
|
|
#else
|
|
|
|
/* Empty stub used when CONFIG_NUMA is not defined. */
static inline void setup_per_cpu_pageset(void)
{
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
/* prio_tree.c */
|
|
|
|
void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
|
|
|
|
void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
|
|
|
|
void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *);
|
|
|
|
struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
|
|
|
|
struct prio_tree_iter *iter);
|
|
|
|
|
|
|
|
/*
 * Loop header: initialise @iter with
 * prio_tree_iter_init(iter, root, begin, end) and set @vma to NULL, then
 * on every pass assign @vma from vma_prio_tree_next(vma, iter).  The
 * loop stops as soon as vma_prio_tree_next() returns NULL.
 */
#define vma_prio_tree_foreach(vma, iter, root, begin, end)		\
	for (prio_tree_iter_init(iter, root, begin, end), vma = NULL;	\
	     (vma = vma_prio_tree_next(vma, iter)); )
|
|
|
|
|
|
|
|
static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
|
|
|
|
struct list_head *list)
|
|
|
|
{
|
|
|
|
vma->shared.vm_set.parent = NULL;
|
|
|
|
list_add_tail(&vma->shared.vm_set.list, list);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* mmap.c */
|
|
|
|
extern int __vm_enough_memory(long pages, int cap_sys_admin);
|
|
|
|
extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
|
|
|
|
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
|
|
|
|
extern struct vm_area_struct *vma_merge(struct mm_struct *,
|
|
|
|
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
|
|
|
|
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
|
|
|
|
struct mempolicy *);
|
|
|
|
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
|
|
|
|
extern int split_vma(struct mm_struct *,
|
|
|
|
struct vm_area_struct *, unsigned long addr, int new_below);
|
|
|
|
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
|
|
|
|
extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
|
|
|
|
struct rb_node **, struct rb_node *);
|
2005-10-29 21:15:57 -04:00
|
|
|
extern void unlink_file_vma(struct vm_area_struct *);
|
2005-04-16 18:20:36 -04:00
|
|
|
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
|
|
|
|
unsigned long addr, unsigned long len, pgoff_t pgoff);
|
|
|
|
extern void exit_mmap(struct mm_struct *);
|
2005-05-01 11:58:35 -04:00
|
|
|
extern int may_expand_vm(struct mm_struct *mm, unsigned long npages);
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
|
|
|
|
|
|
|
|
extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long prot,
|
|
|
|
unsigned long flag, unsigned long pgoff);
|
|
|
|
|
|
|
|
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long prot,
|
|
|
|
unsigned long flag, unsigned long offset)
|
|
|
|
{
|
|
|
|
unsigned long ret = -EINVAL;
|
|
|
|
if ((offset + PAGE_ALIGN(len)) < offset)
|
|
|
|
goto out;
|
|
|
|
if (!(offset & ~PAGE_MASK))
|
|
|
|
ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
extern int do_munmap(struct mm_struct *, unsigned long, size_t);
|
|
|
|
|
|
|
|
extern unsigned long do_brk(unsigned long, unsigned long);
|
|
|
|
|
|
|
|
/* filemap.c */
|
|
|
|
extern unsigned long page_unuse(struct page *);
|
|
|
|
extern void truncate_inode_pages(struct address_space *, loff_t);
|
2006-01-06 03:10:36 -05:00
|
|
|
extern void truncate_inode_pages_range(struct address_space *,
|
|
|
|
loff_t lstart, loff_t lend);
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
/* generic vm_area_ops exported for stackable file systems */
|
|
|
|
extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
|
|
|
|
extern int filemap_populate(struct vm_area_struct *, unsigned long,
|
|
|
|
unsigned long, pgprot_t, unsigned long, int);
|
|
|
|
|
|
|
|
/* mm/page-writeback.c */
|
|
|
|
int write_one_page(struct page *page, int wait);
|
|
|
|
|
|
|
|
/* readahead.c */
|
|
|
|
#define VM_MAX_READAHEAD 128 /* kbytes */
|
|
|
|
#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
|
|
|
|
#define VM_MAX_CACHE_HIT 256 /* max pages in a row in cache before
|
|
|
|
* turning readahead off */
|
|
|
|
|
|
|
|
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
|
2005-11-07 03:59:28 -05:00
|
|
|
pgoff_t offset, unsigned long nr_to_read);
|
2005-04-16 18:20:36 -04:00
|
|
|
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
|
2005-11-07 03:59:28 -05:00
|
|
|
pgoff_t offset, unsigned long nr_to_read);
|
|
|
|
unsigned long page_cache_readahead(struct address_space *mapping,
|
2005-04-16 18:20:36 -04:00
|
|
|
struct file_ra_state *ra,
|
|
|
|
struct file *filp,
|
2005-11-07 03:59:28 -05:00
|
|
|
pgoff_t offset,
|
2005-04-16 18:20:36 -04:00
|
|
|
unsigned long size);
|
|
|
|
void handle_ra_miss(struct address_space *mapping,
|
|
|
|
struct file_ra_state *ra, pgoff_t offset);
|
|
|
|
unsigned long max_sane_readahead(unsigned long nr);
|
|
|
|
|
|
|
|
/* Do stack extension */
|
2005-10-29 21:16:20 -04:00
|
|
|
extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
|
2005-11-18 16:16:42 -05:00
|
|
|
#ifdef CONFIG_IA64
|
2005-10-29 21:16:20 -04:00
|
|
|
extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
|
2005-11-18 16:16:42 -05:00
|
|
|
#endif
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
|
|
|
|
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
|
|
|
|
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
|
|
|
|
struct vm_area_struct **pprev);
|
|
|
|
|
|
|
|
/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
|
|
|
|
NULL if none. Assume start_addr < end_addr. */
|
|
|
|
static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
|
|
|
|
{
|
|
|
|
struct vm_area_struct * vma = find_vma(mm,start_addr);
|
|
|
|
|
|
|
|
if (vma && end_addr <= vma->vm_start)
|
|
|
|
vma = NULL;
|
|
|
|
return vma;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long vma_pages(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
|
|
|
|
}
|
|
|
|
|
2006-07-26 16:39:49 -04:00
|
|
|
pgprot_t vm_get_page_prot(unsigned long vm_flags);
|
2005-10-29 21:16:33 -04:00
|
|
|
struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
|
|
|
|
struct page *vmalloc_to_page(void *addr);
|
|
|
|
unsigned long vmalloc_to_pfn(void *addr);
|
|
|
|
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
|
|
|
|
unsigned long pfn, unsigned long size, pgprot_t);
|
2005-11-30 12:35:19 -05:00
|
|
|
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
|
2005-10-29 21:16:33 -04:00
|
|
|
|
2005-11-28 17:34:23 -05:00
|
|
|
struct page *follow_page(struct vm_area_struct *, unsigned long address,
|
2005-10-29 21:16:33 -04:00
|
|
|
unsigned int foll_flags);
|
|
|
|
#define FOLL_WRITE 0x01 /* check pte is writable */
|
|
|
|
#define FOLL_TOUCH 0x02 /* mark page accessed */
|
|
|
|
#define FOLL_GET 0x04 /* do get_page on page */
|
|
|
|
#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */
|
2005-04-16 18:20:36 -04:00
|
|
|
|
|
|
|
#ifdef CONFIG_PROC_FS
|
2005-10-29 21:15:56 -04:00
|
|
|
void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
|
2005-04-16 18:20:36 -04:00
|
|
|
#else
|
2005-10-29 21:15:56 -04:00
|
|
|
/* Empty stub used when CONFIG_PROC_FS is not defined; all arguments are ignored. */
static inline void vm_stat_account(struct mm_struct *mm,
		unsigned long flags, struct file *file, long pages)
{
	/* intentionally empty */
}
|
|
|
|
#endif /* CONFIG_PROC_FS */
|
|
|
|
|
|
|
|
#ifndef CONFIG_DEBUG_PAGEALLOC
|
|
|
|
static inline void
|
|
|
|
kernel_map_pages(struct page *page, int numpages, int enable)
|
|
|
|
{
|
2006-01-09 18:59:21 -05:00
|
|
|
if (!PageHighMem(page) && !enable)
|
2006-06-27 05:54:49 -04:00
|
|
|
debug_check_no_locks_freed(page_address(page),
|
|
|
|
numpages * PAGE_SIZE);
|
2005-04-16 18:20:36 -04:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk);
|
|
|
|
#ifdef __HAVE_ARCH_GATE_AREA
|
|
|
|
int in_gate_area_no_task(unsigned long addr);
|
|
|
|
int in_gate_area(struct task_struct *task, unsigned long addr);
|
|
|
|
#else
|
|
|
|
int in_gate_area_no_task(unsigned long addr);
|
|
|
|
/*
 * Without __HAVE_ARCH_GATE_AREA: @task is evaluated and discarded, and the
 * result is in_gate_area_no_task(addr).  @task is parenthesized so that any
 * expression argument (e.g. "p + 1") is cast to void as a whole.
 */
#define in_gate_area(task, addr) ({(void)(task); in_gate_area_no_task(addr);})
|
|
|
|
#endif /* __HAVE_ARCH_GATE_AREA */
|
|
|
|
|
2005-04-16 18:24:05 -04:00
|
|
|
/*
 * /proc/<pid>/oom_adj set to -17 protects from the oom-killer.
 * The value is parenthesized so the macro always expands as one operand.
 */
#define OOM_DISABLE (-17)
|
|
|
|
|
2006-01-08 04:00:39 -05:00
|
|
|
int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
|
|
|
|
void __user *, size_t *, loff_t *);
|
2006-03-22 03:08:19 -05:00
|
|
|
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
|
2006-01-08 04:00:39 -05:00
|
|
|
unsigned long lru_pages);
|
|
|
|
void drop_pagecache(void);
|
|
|
|
void drop_slab(void);
|
|
|
|
|
2006-02-20 21:28:07 -05:00
|
|
|
#ifndef CONFIG_MMU
|
|
|
|
#define randomize_va_space 0
|
|
|
|
#else
|
2006-02-16 17:41:58 -05:00
|
|
|
extern int randomize_va_space;
|
2006-02-20 21:28:07 -05:00
|
|
|
#endif
|
2006-02-16 17:41:58 -05:00
|
|
|
|
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and I completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revert MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 05:53:50 -04:00
|
|
|
const char *arch_vma_name(struct vm_area_struct *vma);
|
|
|
|
|
2005-04-16 18:20:36 -04:00
|
|
|
#endif /* __KERNEL__ */
|
|
|
|
#endif /* _LINUX_MM_H */
|