Merge "mm: Fix sleeping while atomic during speculative page fault"

qctecmdr 2020-06-25 19:09:41 -07:00 committed by Gerrit - the friendly Code Review server
commit 6e625d330f
31 changed files with 1182 additions and 210 deletions

View File

@ -189,6 +189,7 @@ config ARM64
select SWIOTLB
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
help
ARM 64-bit (AArch64) Linux support.

View File

@ -406,10 +406,9 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re
#define VM_FAULT_BADMAP 0x010000
#define VM_FAULT_BADACCESS 0x020000
static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
static int __do_page_fault(struct vm_area_struct *vma, unsigned long addr,
unsigned int mm_flags, unsigned long vm_flags)
{
struct vm_area_struct *vma = find_vma(mm, addr);
if (unlikely(!vma))
return VM_FAULT_BADMAP;
@ -456,6 +455,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
vm_fault_t fault, major = 0;
unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
struct vm_area_struct *vma = NULL;
if (kprobe_page_fault(regs, esr))
return 0;
@ -495,6 +495,14 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
/*
* let's try a speculative page fault without grabbing the
* mmap_sem.
*/
fault = handle_speculative_fault(mm, addr, mm_flags, &vma);
if (fault != VM_FAULT_RETRY)
goto done;
/*
* As per x86, we may deadlock here. However, since the kernel only
* validly references user space from well defined areas of the code,
@ -519,7 +527,10 @@ retry:
#endif
}
fault = __do_page_fault(mm, addr, mm_flags, vm_flags);
if (!vma || !can_reuse_spf_vma(vma, addr))
vma = find_vma(mm, addr);
fault = __do_page_fault(vma, addr, mm_flags, vm_flags);
major |= fault & VM_FAULT_MAJOR;
if (fault & VM_FAULT_RETRY) {
@ -542,11 +553,20 @@ retry:
if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
mm_flags |= FAULT_FLAG_TRIED;
/*
* Do not try to reuse this vma and fetch it
* again since we will release the mmap_sem.
*/
vma = NULL;
goto retry;
}
}
up_read(&mm->mmap_sem);
done:
/*
* Handle the "normal" (no error) case first.
*/
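Taken together, the fault.c hunks above implement a try-speculative-first pattern: attempt the fault without the mmap_sem, and fall back to the classic, semaphore-protected path only when the speculative attempt returns VM_FAULT_RETRY, reusing the speculatively fetched VMA when can_reuse_spf_vma() says it is still valid. The following is a condensed sketch of that flow, not the literal patched function: ESR decoding, permission checks, signal delivery, perf accounting and the down_read_trylock() dance are all omitted.

    /* Condensed sketch of the control flow added above (error handling elided). */
    static vm_fault_t do_page_fault_sketch(struct mm_struct *mm, unsigned long addr,
                                           unsigned int mm_flags, unsigned long vm_flags)
    {
        struct vm_area_struct *vma = NULL;
        vm_fault_t fault;

        /* Lockless attempt first; no mmap_sem is taken on this path. */
        fault = handle_speculative_fault(mm, addr, mm_flags, &vma);
        if (fault != VM_FAULT_RETRY)
            return fault;

        /* Fall back to the classic path under mmap_sem. */
        down_read(&mm->mmap_sem);
    retry:
        /* Reuse the VMA fetched by the speculative path while it is still valid. */
        if (!vma || !can_reuse_spf_vma(vma, addr))
            vma = find_vma(mm, addr);
        fault = __do_page_fault(vma, addr, mm_flags, vm_flags);
        if ((fault & VM_FAULT_RETRY) && (mm_flags & FAULT_FLAG_ALLOW_RETRY)) {
            mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
            mm_flags |= FAULT_FLAG_TRIED;
            /* The fault handler dropped mmap_sem: re-take it and refetch the VMA. */
            vma = NULL;
            down_read(&mm->mmap_sem);
            goto retry;
        }
        up_read(&mm->mmap_sem);
        return fault;
    }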

View File

@ -1277,8 +1277,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
goto out_mm;
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
vma->vm_flags &= ~VM_SOFTDIRTY;
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags,
vma->vm_flags & ~VM_SOFTDIRTY);
vma_set_page_prot(vma);
vm_write_end(vma);
}
downgrade_write(&mm->mmap_sem);
break;

View File

@ -675,8 +675,11 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
octx = vma->vm_userfaultfd_ctx.ctx;
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
vm_write_begin(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
WRITE_ONCE(vma->vm_flags,
vma->vm_flags & ~(VM_UFFD_WP | VM_UFFD_MISSING));
vm_write_end(vma);
return 0;
}
@ -919,8 +922,10 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
else
prev = vma;
}
vma->vm_flags = new_flags;
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
vm_write_end(vma);
}
up_write(&mm->mmap_sem);
mmput(mm);
@ -1487,8 +1492,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
vma->vm_flags = new_flags;
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
vm_write_end(vma);
skip:
prev = vma;
@ -1650,8 +1657,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
vma->vm_flags = new_flags;
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
vm_write_end(vma);
skip:
prev = vma;

View File

@ -8,7 +8,7 @@
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
return !!(vma->vm_flags & VM_HUGETLB);
return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
}
#else

View File

@ -124,14 +124,14 @@ static inline void __ClearPageMovable(struct page *page)
#ifdef CONFIG_NUMA_BALANCING
extern bool pmd_trans_migrating(pmd_t pmd);
extern int migrate_misplaced_page(struct page *page,
struct vm_area_struct *vma, int node);
struct vm_fault *vmf, int node);
#else
static inline bool pmd_trans_migrating(pmd_t pmd)
{
return false;
}
static inline int migrate_misplaced_page(struct page *page,
struct vm_area_struct *vma, int node)
struct vm_fault *vmf, int node)
{
return -EAGAIN; /* can't migrate now */
}

View File

@ -393,6 +393,8 @@ extern pgprot_t protection_map[16];
#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */
#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */
#define FAULT_FLAG_PREFAULT_OLD 0x400 /* Make faultaround ptes old */
/* Speculative fault, not holding mmap_sem */
#define FAULT_FLAG_SPECULATIVE 0x200
#define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
@ -421,6 +423,10 @@ struct vm_fault {
gfp_t gfp_mask; /* gfp mask to be used for allocations */
pgoff_t pgoff; /* Logical page offset based on vma */
unsigned long address; /* Faulting virtual address */
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
unsigned int sequence;
pmd_t orig_pmd; /* value of PMD at the time of fault */
#endif
pmd_t *pmd; /* Pointer to pmd entry matching
* the 'address' */
pud_t *pud; /* Pointer to pud entry matching
@ -529,6 +535,15 @@ struct vm_operations_struct {
unsigned long addr);
};
static inline void INIT_VMA(struct vm_area_struct *vma)
{
INIT_LIST_HEAD(&vma->anon_vma_chain);
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
seqcount_init(&vma->vm_sequence);
atomic_set(&vma->vm_ref_count, 1);
#endif
}
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
static const struct vm_operations_struct dummy_vm_ops = {};
@ -536,7 +551,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
INIT_VMA(vma);
}
static inline void vma_set_anonymous(struct vm_area_struct *vma)
@ -830,9 +845,9 @@ void free_compound_page(struct page *page);
* pte_mkwrite. But get_user_pages can cause write faults for mappings
* that do not have writing enabled, when used by access_process_vm.
*/
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
static inline pte_t maybe_mkwrite(pte_t pte, unsigned long vma_flags)
{
if (likely(vma->vm_flags & VM_WRITE))
if (likely(vma_flags & VM_WRITE))
pte = pte_mkwrite(pte);
return pte;
}
@ -1439,8 +1454,14 @@ struct zap_details {
pgoff_t last_index; /* Highest page->index to unmap */
};
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, unsigned long vma_flags);
static inline struct page *vm_normal_page(struct vm_area_struct *vma,
unsigned long addr, pte_t pte)
{
return _vm_normal_page(vma, addr, pte, vma->vm_flags);
}
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
@ -1467,6 +1488,47 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address,
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write);
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
static inline void vm_write_begin(struct vm_area_struct *vma)
{
write_seqcount_begin(&vma->vm_sequence);
}
static inline void vm_write_begin_nested(struct vm_area_struct *vma,
int subclass)
{
write_seqcount_begin_nested(&vma->vm_sequence, subclass);
}
static inline void vm_write_end(struct vm_area_struct *vma)
{
write_seqcount_end(&vma->vm_sequence);
}
static inline void vm_raw_write_begin(struct vm_area_struct *vma)
{
raw_write_seqcount_begin(&vma->vm_sequence);
}
static inline void vm_raw_write_end(struct vm_area_struct *vma)
{
raw_write_seqcount_end(&vma->vm_sequence);
}
#else
static inline void vm_write_begin(struct vm_area_struct *vma)
{
}
static inline void vm_write_begin_nested(struct vm_area_struct *vma,
int subclass)
{
}
static inline void vm_write_end(struct vm_area_struct *vma)
{
}
static inline void vm_raw_write_begin(struct vm_area_struct *vma)
{
}
static inline void vm_raw_write_end(struct vm_area_struct *vma)
{
}
#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
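The vm_write_begin()/vm_write_end() helpers added above are the writer side of a per-VMA seqcount protocol: every field update performed under the mmap_sem is bracketed by them and published with WRITE_ONCE(), so that a lockless reader can detect concurrent modification. Below is a minimal sketch of both sides, assuming CONFIG_SPECULATIVE_PAGE_FAULT=y; the actual speculative handler uses a raw sequence snapshot plus vma_has_changed() (see the mm/internal.h hunk further down) rather than these exact reader calls.

    /* Writer: update a VMA field so that lockless readers can notice it. */
    static void spf_writer_sketch(struct vm_area_struct *vma, unsigned long newflags)
    {
        vm_write_begin(vma);            /* sequence becomes odd: readers will retry */
        WRITE_ONCE(vma->vm_flags, newflags);
        vm_write_end(vma);              /* sequence becomes even again */
    }

    /* Reader: snapshot a field and only trust it if no writer ran in between. */
    static bool spf_reader_sketch(struct vm_area_struct *vma, unsigned long *flags)
    {
        unsigned int seq = read_seqcount_begin(&vma->vm_sequence);

        *flags = READ_ONCE(vma->vm_flags);
        return !read_seqcount_retry(&vma->vm_sequence, seq);
    }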
@ -1478,6 +1540,43 @@ int invalidate_inode_page(struct page *page);
#ifdef CONFIG_MMU
extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
extern int __handle_speculative_fault(struct mm_struct *mm,
unsigned long address,
unsigned int flags,
struct vm_area_struct **vma);
static inline int handle_speculative_fault(struct mm_struct *mm,
unsigned long address,
unsigned int flags,
struct vm_area_struct **vma)
{
/*
* Try speculative page fault for multithreaded user space tasks only.
*/
if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1) {
*vma = NULL;
return VM_FAULT_RETRY;
}
return __handle_speculative_fault(mm, address, flags, vma);
}
extern bool can_reuse_spf_vma(struct vm_area_struct *vma,
unsigned long address);
#else
static inline int handle_speculative_fault(struct mm_struct *mm,
unsigned long address,
unsigned int flags,
struct vm_area_struct **vma)
{
return VM_FAULT_RETRY;
}
static inline bool can_reuse_spf_vma(struct vm_area_struct *vma,
unsigned long address)
{
return false;
}
#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
@ -2276,16 +2375,29 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
struct vm_area_struct *expand);
struct vm_area_struct *expand, bool keep_locked);
static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
{
return __vma_adjust(vma, start, end, pgoff, insert, NULL);
return __vma_adjust(vma, start, end, pgoff, insert, NULL, false);
}
extern struct vm_area_struct *vma_merge(struct mm_struct *,
extern struct vm_area_struct *__vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *);
unsigned long vm_flags, struct anon_vma *anon, struct file *file,
pgoff_t pgoff, struct mempolicy *mpol, struct vm_userfaultfd_ctx uff,
const char __user *user, bool keep_locked);
static inline struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *anon, struct file *file,
pgoff_t off, struct mempolicy *pol, struct vm_userfaultfd_ctx uff,
const char __user *user)
{
return __vma_merge(mm, prev, addr, end, vm_flags, anon, file, off,
pol, uff, user, false);
}
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);

View File

@ -361,7 +361,10 @@ struct vm_area_struct {
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
seqcount_t vm_sequence;
atomic_t vm_ref_count; /* see vma_get(), vma_put() */
#endif
ANDROID_KABI_RESERVE(1);
ANDROID_KABI_RESERVE(2);
ANDROID_KABI_RESERVE(3);
@ -385,6 +388,9 @@ struct mm_struct {
struct vm_area_struct *mmap; /* list of VMAs */
struct rb_root mm_rb;
u64 vmacache_seqnum; /* per-thread vmacache */
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
rwlock_t mm_rb_lock;
#endif
#ifdef CONFIG_MMU
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
@ -701,6 +707,7 @@ enum vm_fault_reason {
VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800,
VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
VM_FAULT_PTNOTSAME = (__force vm_fault_t)0x004000,
VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
};

View File

@ -456,8 +456,8 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
pgoff_t pgoff;
if (unlikely(is_vm_hugetlb_page(vma)))
return linear_hugepage_index(vma, address);
pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;
pgoff += READ_ONCE(vma->vm_pgoff);
return pgoff;
}

View File

@ -179,8 +179,16 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, bool);
void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, int);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long, bool);
void __page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma,
unsigned long address, bool compound);
static inline void page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma,
unsigned long address, bool compound)
{
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
__page_add_new_anon_rmap(page, vma, address, compound);
}
void page_add_file_rmap(struct page *, bool);
void page_remove_rmap(struct page *, bool);

View File

@ -344,8 +344,14 @@ extern void deactivate_page(struct page *page);
extern void mark_page_lazyfree(struct page *page);
extern void swap_setup(void);
extern void lru_cache_add_active_or_unevictable(struct page *page,
struct vm_area_struct *vma);
extern void __lru_cache_add_active_or_unevictable(struct page *page,
unsigned long vma_flags);
static inline void lru_cache_add_active_or_unevictable(struct page *page,
struct vm_area_struct *vma)
{
return __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
}
/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);

View File

@ -113,6 +113,9 @@ enum vm_event_item { PGPGIN, PGPGOUT,
#ifdef CONFIG_SWAP
SWAP_RA,
SWAP_RA_HIT,
#endif
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
SPECULATIVE_PGFAULT,
#endif
NR_VM_EVENT_ITEMS
};
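The new event is presumably incremented once per fault that the speculative path handles to completion; the call site itself lives in the suppressed large diff (presumably mm/memory.c). A hedged sketch of what that accounting looks like, using the standard count_vm_event() helper:

    #include <linux/vmstat.h>

    /* Sketch only: bump the new counter on a successfully handled speculative fault. */
    static inline void spf_count_success(void)
    {
    #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
        count_vm_event(SPECULATIVE_PGFAULT);  /* exported as "speculative_pgfault" in /proc/vmstat */
    #endif
    }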

View File

@ -0,0 +1,88 @@
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM pagefault
#if !defined(_TRACE_PAGEFAULT_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGEFAULT_H
#include <linux/tracepoint.h>
#include <linux/mm.h>
DECLARE_EVENT_CLASS(spf,
TP_PROTO(unsigned long caller,
struct vm_area_struct *vma, unsigned long address),
TP_ARGS(caller, vma, address),
TP_STRUCT__entry(
__field(unsigned long, caller)
__field(unsigned long, vm_start)
__field(unsigned long, vm_end)
__field(unsigned long, address)
),
TP_fast_assign(
__entry->caller = caller;
__entry->vm_start = vma->vm_start;
__entry->vm_end = vma->vm_end;
__entry->address = address;
),
TP_printk("ip:%lx vma:%lx-%lx address:%lx",
__entry->caller, __entry->vm_start, __entry->vm_end,
__entry->address)
);
DEFINE_EVENT(spf, spf_pte_lock,
TP_PROTO(unsigned long caller,
struct vm_area_struct *vma, unsigned long address),
TP_ARGS(caller, vma, address)
);
DEFINE_EVENT(spf, spf_vma_changed,
TP_PROTO(unsigned long caller,
struct vm_area_struct *vma, unsigned long address),
TP_ARGS(caller, vma, address)
);
DEFINE_EVENT(spf, spf_vma_noanon,
TP_PROTO(unsigned long caller,
struct vm_area_struct *vma, unsigned long address),
TP_ARGS(caller, vma, address)
);
DEFINE_EVENT(spf, spf_vma_notsup,
TP_PROTO(unsigned long caller,
struct vm_area_struct *vma, unsigned long address),
TP_ARGS(caller, vma, address)
);
DEFINE_EVENT(spf, spf_vma_access,
TP_PROTO(unsigned long caller,
struct vm_area_struct *vma, unsigned long address),
TP_ARGS(caller, vma, address)
);
DEFINE_EVENT(spf, spf_pmd_changed,
TP_PROTO(unsigned long caller,
struct vm_area_struct *vma, unsigned long address),
TP_ARGS(caller, vma, address)
);
#endif /* _TRACE_PAGEFAULT_H */
/* This part must be outside protection */
#include <trace/define_trace.h>
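The header above only declares the trace events. Below is a hedged sketch of how one of them would be fired from a bail-out point in the speculative handler; the surrounding function is illustrative, the include path assumes the new header is installed as <trace/events/pagefault.h>, and vma_has_changed() comes from the mm/internal.h hunk further down.

    #define CREATE_TRACE_POINTS     /* done once, in the .c file that owns the events */
    #include <trace/events/pagefault.h>

    /* Illustrative bail-out path: record why the speculative fault is abandoned. */
    static vm_fault_t spf_bail_if_vma_changed(struct vm_fault *vmf)
    {
        if (vma_has_changed(vmf)) {
            trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
            return VM_FAULT_RETRY;
        }
        return 0;
    }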

View File

@ -362,7 +362,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
if (new) {
*new = *orig;
INIT_LIST_HEAD(&new->anon_vma_chain);
INIT_VMA(new);
}
return new;
}
@ -486,7 +486,7 @@ EXPORT_SYMBOL(free_task);
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
{
struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
struct vm_area_struct *mpnt, *tmp, *prev, **pprev, *last = NULL;
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
@ -605,8 +605,18 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
rb_parent = &tmp->vm_rb;
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
if (!(tmp->vm_flags & VM_WIPEONFORK)) {
if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) {
/*
* Mark this VMA as changing to prevent the
* speculative page fault handler from processing
* it until the TLB is flushed below.
*/
last = mpnt;
vm_write_begin(mpnt);
}
retval = copy_page_range(mm, oldmm, mpnt);
}
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
@ -619,6 +629,22 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) {
/*
* Since the TLB has been flushed, we can safely unmark the
* copied VMAs and allow the speculative page fault handler to
* process them again.
* Walk back the VMA list from the last marked VMA.
*/
for (; last; last = last->vm_prev) {
if (last->vm_flags & VM_DONTCOPY)
continue;
if (!(last->vm_flags & VM_WIPEONFORK))
vm_write_end(last);
}
}
up_write(&oldmm->mmap_sem);
dup_userfaultfd_complete(&uf);
fail_uprobe_end:
@ -1028,6 +1054,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
mm->vmacache_seqnum = 0;
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
rwlock_init(&mm->mm_rb_lock);
#endif
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);

View File

@ -1268,8 +1268,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t entry;
entry = mk_pte(pages[i], vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
entry = mk_pte(pages[i], vmf->vma_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
@ -2252,7 +2252,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_swp_mksoft_dirty(entry);
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
entry = maybe_mkwrite(entry, vma);
entry = maybe_mkwrite(entry, vma->vm_flags);
if (!write)
entry = pte_wrprotect(entry);
if (!young)

View File

@ -3950,6 +3950,8 @@ retry:
.vma = vma,
.address = haddr,
.flags = flags,
.vma_flags = vma->vm_flags,
.vma_page_prot = vma->vm_page_prot,
/*
* Hard to debug if it ends up being
* used by a callee that assumes

View File

@ -28,6 +28,9 @@
*/
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
.mm_rb_lock = __RW_LOCK_UNLOCKED(init_mm.mm_rb_lock),
#endif
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),

View File

@ -36,6 +36,26 @@ void page_writeback_init(void);
vm_fault_t do_swap_page(struct vm_fault *vmf);
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
extern struct vm_area_struct *get_vma(struct mm_struct *mm,
unsigned long addr);
extern void put_vma(struct vm_area_struct *vma);
static inline bool vma_has_changed(struct vm_fault *vmf)
{
int ret = RB_EMPTY_NODE(&vmf->vma->vm_rb);
unsigned int seq = READ_ONCE(vmf->vma->vm_sequence.sequence);
/*
* Matches both the wmb in write_seqlock_{begin,end}() and
* the wmb in vma_rb_erase().
*/
smp_rmb();
return ret || seq != vmf->sequence;
}
#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
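get_vma(), put_vma() and vma_has_changed() combine into a take-reference / snapshot / validate / drop-reference discipline. The sketch below shows that discipline in isolation; it is not the commit's __handle_speculative_fault(), which additionally walks the page tables, re-validates under the pte lock, and handles the many bail-out cases traced above.

    /* Sketch of the SPF lookup discipline (CONFIG_SPECULATIVE_PAGE_FAULT assumed). */
    static vm_fault_t spf_lookup_sketch(struct mm_struct *mm, unsigned long addr)
    {
        struct vm_fault vmf = { .address = addr };
        vm_fault_t ret = VM_FAULT_RETRY;

        vmf.vma = get_vma(mm, addr);            /* takes a vm_ref_count reference */
        if (!vmf.vma)
            return VM_FAULT_RETRY;

        /* Snapshot the VMA sequence; an odd value means a writer is active. */
        vmf.sequence = raw_read_seqcount(&vmf.vma->vm_sequence);
        if (vmf.sequence & 1)
            goto out_put;

        /* ... the speculative page-table walk would happen here ... */

        /* Bail out if the VMA was modified while we were not holding mmap_sem. */
        if (vma_has_changed(&vmf))
            goto out_put;

        ret = 0;                                /* pretend the fault was handled */
    out_put:
        put_vma(vmf.vma);                       /* drops the reference taken by get_vma() */
        return ret;
    }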

View File

@ -900,6 +900,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
.pgoff = linear_page_index(vma, address),
.vma_flags = vma->vm_flags,
.vma_page_prot = vma->vm_page_prot,
};
/* we only decide to swapin, if there is enough young ptes */
@ -1026,6 +1028,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (mm_find_pmd(mm, address) != pmd)
goto out;
vm_write_begin(vma);
anon_vma_lock_write(vma->anon_vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
@ -1062,6 +1065,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
vm_write_end(vma);
result = SCAN_FAIL;
goto out;
}
@ -1097,6 +1101,7 @@ static void collapse_huge_page(struct mm_struct *mm,
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
spin_unlock(pmd_ptl);
vm_write_end(vma);
*hpage = NULL;

View File

@ -171,7 +171,9 @@ success:
/*
* vm_flags is protected by the mmap_sem held in write mode.
*/
vma->vm_flags = new_flags;
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags, new_flags);
vm_write_end(vma);
out_convert_errno:
/*
@ -495,9 +497,11 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
.target_task = task,
};
vm_write_begin(vma);
tlb_start_vma(tlb, vma);
walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
tlb_end_vma(tlb, vma);
vm_write_end(vma);
}
static long madvise_cold(struct task_struct *task,
@ -531,9 +535,11 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
.target_task = task,
};
vm_write_begin(vma);
tlb_start_vma(tlb, vma);
walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
tlb_end_vma(tlb, vma);
vm_write_end(vma);
}
static inline bool can_do_pageout(struct vm_area_struct *vma)
@ -736,10 +742,12 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
vm_write_begin(vma);
tlb_start_vma(&tlb, vma);
walk_page_range(vma->vm_mm, range.start, range.end,
&madvise_free_walk_ops, &tlb);
tlb_end_vma(&tlb, vma);
vm_write_end(vma);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb, range.start, range.end);

File diff suppressed because it is too large.

View File

@ -380,8 +380,11 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
struct vm_area_struct *vma;
down_write(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next)
for (vma = mm->mmap; vma; vma = vma->vm_next) {
vm_write_begin(vma);
mpol_rebind_policy(vma->vm_policy, new);
vm_write_end(vma);
}
up_write(&mm->mmap_sem);
}
@ -596,9 +599,11 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
{
int nr_updated;
vm_write_begin(vma);
nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
vm_write_end(vma);
return nr_updated;
}
@ -711,6 +716,7 @@ static int vma_replace_policy(struct vm_area_struct *vma,
if (IS_ERR(new))
return PTR_ERR(new);
vm_write_begin(vma);
if (vma->vm_ops && vma->vm_ops->set_policy) {
err = vma->vm_ops->set_policy(vma, new);
if (err)
@ -718,11 +724,17 @@ static int vma_replace_policy(struct vm_area_struct *vma,
}
old = vma->vm_policy;
vma->vm_policy = new; /* protected by mmap_sem */
/*
* The speculative page fault handler accesses this field without
* holding the mmap_sem.
*/
WRITE_ONCE(vma->vm_policy, new);
vm_write_end(vma);
mpol_put(old);
return 0;
err_out:
vm_write_end(vma);
mpol_put(new);
return err;
}
@ -1703,23 +1715,28 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
struct mempolicy *pol = NULL;
struct mempolicy *pol;
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) {
pol = vma->vm_ops->get_policy(vma, addr);
} else if (vma->vm_policy) {
pol = vma->vm_policy;
if (!vma)
return NULL;
/*
* shmem_alloc_page() passes MPOL_F_SHARED policy with
* a pseudo vma whose vma->vm_ops=NULL. Take a reference
* count on these policies which will be dropped by
* mpol_cond_put() later
*/
if (mpol_needs_cond_ref(pol))
mpol_get(pol);
}
if (vma->vm_ops && vma->vm_ops->get_policy)
return vma->vm_ops->get_policy(vma, addr);
/*
* This could be called without holding the mmap_sem in the
* speculative page fault handler's path.
*/
pol = READ_ONCE(vma->vm_policy);
if (pol) {
/*
* shmem_alloc_page() passes MPOL_F_SHARED policy with
* a pseudo vma whose vma->vm_ops=NULL. Take a reference
* count on these policies which will be dropped by
* mpol_cond_put() later
*/
if (mpol_needs_cond_ref(pol))
mpol_get(pol);
}
return pol;

View File

@ -241,7 +241,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
*/
entry = pte_to_swp_entry(*pvmw.pte);
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
pte = maybe_mkwrite(pte, vma->vm_flags);
if (unlikely(is_zone_device_page(new))) {
if (is_device_private_page(new)) {
@ -1968,7 +1968,7 @@ bool pmd_trans_migrating(pmd_t pmd)
* node. Caller is expected to have an elevated reference count on
* the page that will be dropped by this function before returning.
*/
int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
int migrate_misplaced_page(struct page *page, struct vm_fault *vmf,
int node)
{
pg_data_t *pgdat = NODE_DATA(node);
@ -1981,7 +1981,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
* with execute permissions as they are probably shared libraries.
*/
if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
(vma->vm_flags & VM_EXEC))
(vmf->vma_flags & VM_EXEC))
goto out;
/*

View File

@ -445,7 +445,9 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags, vma->vm_flags & VM_LOCKED_CLEAR_MASK);
vm_write_end(vma);
while (start < end) {
struct page *page;
@ -569,10 +571,11 @@ success:
* It's okay if try_to_unmap_one unmaps a page just after we
* set VM_LOCKED, populate_vma_page_range will bring it back.
*/
if (lock)
vma->vm_flags = newflags;
else
if (lock) {
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags, newflags);
vm_write_end(vma);
} else
munlock_vma_pages_range(vma, start, end);
out:

mm/mmap.c (218 changed lines)
View File

@ -165,6 +165,27 @@ void unlink_file_vma(struct vm_area_struct *vma)
}
}
static void __free_vma(struct vm_area_struct *vma)
{
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
vm_area_free(vma);
}
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
void put_vma(struct vm_area_struct *vma)
{
if (atomic_dec_and_test(&vma->vm_ref_count))
__free_vma(vma);
}
#else
static inline void put_vma(struct vm_area_struct *vma)
{
__free_vma(vma);
}
#endif
/*
* Close a vm structure and free it, returning the next.
*/
@ -175,10 +196,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
vm_area_free(vma);
put_vma(vma);
return next;
}
@ -431,6 +449,13 @@ static void validate_mm(struct mm_struct *mm)
RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
struct vm_area_struct, vm_rb,
unsigned long, rb_subtree_gap, vma_compute_gap)
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
#define mm_rb_write_lock(mm) write_lock(&(mm)->mm_rb_lock)
#define mm_rb_write_unlock(mm) write_unlock(&(mm)->mm_rb_lock)
#else
#define mm_rb_write_lock(mm) do { } while (0)
#define mm_rb_write_unlock(mm) do { } while (0)
#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
/*
* Update augmented rbtree rb_subtree_gap values after vma->vm_start or
@ -447,26 +472,37 @@ static void vma_gap_update(struct vm_area_struct *vma)
}
static inline void vma_rb_insert(struct vm_area_struct *vma,
struct rb_root *root)
struct mm_struct *mm)
{
struct rb_root *root = &mm->mm_rb;
/* All rb_subtree_gap values must be consistent prior to insertion */
validate_mm_rb(root, NULL);
rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}
static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
{
struct rb_root *root = &mm->mm_rb;
/*
* Note rb_erase_augmented is a fairly large inline function,
* so make sure we instantiate it only once with our desired
* augmented rbtree callbacks.
*/
mm_rb_write_lock(mm);
rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
mm_rb_write_unlock(mm); /* wmb */
/*
* Ensure the removal is complete before clearing the node.
* Matched by vma_has_changed()/handle_speculative_fault().
*/
RB_CLEAR_NODE(&vma->vm_rb);
}
static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
struct rb_root *root,
struct mm_struct *mm,
struct vm_area_struct *ignore)
{
/*
@ -474,21 +510,21 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
* with the possible exception of the "next" vma being erased if
* next->vm_start was reduced.
*/
validate_mm_rb(root, ignore);
validate_mm_rb(&mm->mm_rb, ignore);
__vma_rb_erase(vma, root);
__vma_rb_erase(vma, mm);
}
static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
struct rb_root *root)
struct mm_struct *mm)
{
/*
* All rb_subtree_gap values must be consistent prior to erase,
* with the possible exception of the vma being erased.
*/
validate_mm_rb(root, vma);
validate_mm_rb(&mm->mm_rb, vma);
__vma_rb_erase(vma, root);
__vma_rb_erase(vma, mm);
}
/*
@ -603,10 +639,12 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
* immediately update the gap to the correct value. Finally we
* rebalance the rbtree after all augmented values have been set.
*/
mm_rb_write_lock(mm);
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
vma->rb_subtree_gap = 0;
vma_gap_update(vma);
vma_rb_insert(vma, &mm->mm_rb);
vma_rb_insert(vma, mm);
mm_rb_write_unlock(mm);
}
static void __vma_link_file(struct vm_area_struct *vma)
@ -682,7 +720,7 @@ static __always_inline void __vma_unlink_common(struct mm_struct *mm,
{
struct vm_area_struct *next;
vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
vma_rb_erase_ignore(vma, mm, ignore);
next = vma->vm_next;
if (has_prev)
prev->vm_next = next;
@ -716,7 +754,7 @@ static inline void __vma_unlink_prev(struct mm_struct *mm,
*/
int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
struct vm_area_struct *expand)
struct vm_area_struct *expand, bool keep_locked)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
@ -728,6 +766,30 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
long adjust_next = 0;
int remove_next = 0;
/*
* Why use the vm_raw_write*() functions here? To avoid lockdep's warning.
*
* Lockdep is complaining about a theoretical lock dependency, involving
* 3 locks:
* mapping->i_mmap_rwsem --> vma->vm_sequence --> fs_reclaim
*
* Here are the major paths leading to this dependency:
* 1. __vma_adjust() mmap_sem -> vm_sequence -> i_mmap_rwsem
* 2. move_vma() mmap_sem -> vm_sequence -> fs_reclaim
* 3. __alloc_pages_nodemask() fs_reclaim -> i_mmap_rwsem
* 4. unmap_mapping_range() i_mmap_rwsem -> vm_sequence
*
* So there is no way to solve this easily, especially because in
* unmap_mapping_range() the i_mmap_rwsem is grabbed while the impacted
* VMAs are not yet known.
* However, the way the vm_seq is used guarantees that we will
* never block on it since we just check for its value and never wait
* for it to move, see vma_has_changed() and handle_speculative_fault().
*/
vm_raw_write_begin(vma);
if (next)
vm_raw_write_begin(next);
if (next && !insert) {
struct vm_area_struct *exporter = NULL, *importer = NULL;
@ -808,8 +870,12 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
importer->anon_vma = exporter->anon_vma;
error = anon_vma_clone(importer, exporter);
if (error)
if (error) {
if (next && next != vma)
vm_raw_write_end(next);
vm_raw_write_end(vma);
return error;
}
}
}
again:
@ -855,17 +921,18 @@ again:
}
if (start != vma->vm_start) {
vma->vm_start = start;
WRITE_ONCE(vma->vm_start, start);
start_changed = true;
}
if (end != vma->vm_end) {
vma->vm_end = end;
WRITE_ONCE(vma->vm_end, end);
end_changed = true;
}
vma->vm_pgoff = pgoff;
WRITE_ONCE(vma->vm_pgoff, pgoff);
if (adjust_next) {
next->vm_start += adjust_next << PAGE_SHIFT;
next->vm_pgoff += adjust_next;
WRITE_ONCE(next->vm_start,
next->vm_start + (adjust_next << PAGE_SHIFT));
WRITE_ONCE(next->vm_pgoff, next->vm_pgoff + adjust_next);
}
if (root) {
@ -930,15 +997,13 @@ again:
}
if (remove_next) {
if (file) {
if (file)
uprobe_munmap(next, next->vm_start, next->vm_end);
fput(file);
}
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
mpol_put(vma_policy(next));
vm_area_free(next);
vm_raw_write_end(next);
put_vma(next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
@ -952,6 +1017,8 @@ again:
* "vma->vm_next" gap must be updated.
*/
next = vma->vm_next;
if (next)
vm_raw_write_begin(next);
} else {
/*
* For the scope of the comment "next" and
@ -998,6 +1065,11 @@ again:
if (insert && file)
uprobe_mmap(insert);
if (next && next != vma)
vm_raw_write_end(next);
if (!keep_locked)
vm_raw_write_end(vma);
validate_mm(mm);
return 0;
@ -1137,13 +1209,13 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
* parameter) may establish ptes with the wrong permissions of NNNN
* instead of the right permissions of XXXX.
*/
struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *__vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
const char __user *anon_name)
const char __user *anon_name, bool keep_locked)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@ -1193,10 +1265,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
/* cases 1, 6 */
err = __vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL,
prev);
prev, keep_locked);
} else /* cases 2, 5, 7 */
err = __vma_adjust(prev, prev->vm_start,
end, prev->vm_pgoff, NULL, prev);
end, prev->vm_pgoff, NULL, prev,
keep_locked);
if (err)
return NULL;
khugepaged_enter_vma_merge(prev, vm_flags);
@ -1214,10 +1287,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
addr, prev->vm_pgoff, NULL, next,
keep_locked);
else { /* cases 3, 8 */
err = __vma_adjust(area, addr, next->vm_end,
next->vm_pgoff - pglen, NULL, next);
next->vm_pgoff - pglen, NULL, next,
keep_locked);
/*
* In case 3 area is already equal to next and
* this is a noop, but in case 8 "area" has
@ -1837,12 +1912,14 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
out:
perf_event_mmap(vma);
vm_write_begin(vma);
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
WRITE_ONCE(vma->vm_flags,
vma->vm_flags & VM_LOCKED_CLEAR_MASK);
else
mm->locked_vm += (len >> PAGE_SHIFT);
}
@ -1857,9 +1934,10 @@ out:
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
vma->vm_flags |= VM_SOFTDIRTY;
WRITE_ONCE(vma->vm_flags, vma->vm_flags | VM_SOFTDIRTY);
vma_set_page_prot(vma);
vm_write_end(vma);
return addr;
@ -2241,15 +2319,11 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
static struct vm_area_struct *__find_vma(struct mm_struct *mm,
unsigned long addr)
{
struct rb_node *rb_node;
struct vm_area_struct *vma;
/* Check the cache first. */
vma = vmacache_find(mm, addr);
if (likely(vma))
return vma;
struct vm_area_struct *vma = NULL;
rb_node = mm->mm_rb.rb_node;
@ -2267,13 +2341,40 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
rb_node = rb_node->rb_right;
}
return vma;
}
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma;
/* Check the cache first. */
vma = vmacache_find(mm, addr);
if (likely(vma))
return vma;
vma = __find_vma(mm, addr);
if (vma)
vmacache_update(addr, vma);
return vma;
}
EXPORT_SYMBOL(find_vma);
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = NULL;
read_lock(&mm->mm_rb_lock);
vma = __find_vma(mm, addr);
if (vma)
atomic_inc(&vma->vm_ref_count);
read_unlock(&mm->mm_rb_lock);
return vma;
}
#endif
/*
* Same as find_vma, but also return a pointer to the previous VMA in *pprev.
*/
@ -2494,8 +2595,8 @@ int expand_downwards(struct vm_area_struct *vma,
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
WRITE_ONCE(vma->vm_start, address);
WRITE_ONCE(vma->vm_pgoff, vma->vm_pgoff - grow);
anon_vma_interval_tree_post_update_vma(vma);
vma_gap_update(vma);
spin_unlock(&mm->page_table_lock);
@ -2641,7 +2742,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
vma_rb_erase(vma, &mm->mm_rb);
vma_rb_erase(vma, mm);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
@ -3236,9 +3337,21 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
/* There are 3 cases to manage here in
*     AAAA            AAAA                AAAA              AAAA
* PPPP....        PPPP......NNNN      PPPP....NNNN    PP........NN
* PPPPPPPP(A)     PPPP..NNNNNNNN(B)   PPPPPPPPPPPP(1)       NULL
*                                     PPPPPPPPNNNN(2)
*                                     PPPPNNNNNNNN(3)
*
* new_vma == prev in case A,1,2
* new_vma == next in case B,3
*/
new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
vma_policy(vma), vma->vm_userfaultfd_ctx,
vma_get_anon_name(vma), true);
if (new_vma) {
/*
* Source vma may have been merged into new_vma
@ -3276,6 +3389,15 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
/*
* As the VMA is linked right now, it may be hit by the
* speculative page fault handler. But we don't want it
* to start mapping pages in this area until the caller has
* potentially moved the ptes from the moved VMA. To prevent
* that, we protect it right now, and let the caller unprotect
* it once the move is done.
*/
vm_raw_write_begin(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
*need_rmap_locks = false;
}

View File

@ -454,12 +454,14 @@ success:
* vm_flags and vm_page_prot are protected by the mmap_sem
* held in write mode.
*/
vma->vm_flags = newflags;
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags, newflags);
dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
vma_set_page_prot(vma);
change_protection(vma, start, end, vma->vm_page_prot,
dirty_accountable, 0);
vm_write_end(vma);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major

View File

@ -357,6 +357,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (!new_vma)
return -ENOMEM;
/* new_vma is returned protected by copy_vma, to prevent a speculative
* page fault from being handled in the destination area before we move
* the ptes. Now, we must also protect the source VMA since we don't want
* pages to be mapped behind our back while we are copying the PTEs.
*/
if (vma != new_vma)
vm_raw_write_begin(vma);
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
need_rmap_locks);
if (moved_len < old_len) {
@ -373,6 +381,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
*/
move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
true);
if (vma != new_vma)
vm_raw_write_end(vma);
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
@ -381,7 +391,10 @@ static unsigned long move_vma(struct vm_area_struct *vma,
mremap_userfaultfd_prep(new_vma, uf);
arch_remap(mm, old_addr, old_addr + old_len,
new_addr, new_addr + new_len);
if (vma != new_vma)
vm_raw_write_end(vma);
}
vm_raw_write_end(new_vma);
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
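In short, copy_vma() hands back the destination VMA already write-protected against the speculative handler, move_vma() additionally protects the source while the ptes are moved, and both are released once the move is complete. An illustrative restatement of that ordering follows (success path only, not code from the commit):

    /* Illustrative only: the copy_vma()/move_vma() protection handshake. */
    static void spf_mremap_protocol_sketch(struct vm_area_struct *vma,
                                           struct vm_area_struct *new_vma)
    {
        /* copy_vma() already did vm_raw_write_begin(new_vma), so the speculative
         * handler cannot populate the destination area yet. */
        if (vma != new_vma)
            vm_raw_write_begin(vma);    /* protect the source as well */

        /* ... move_page_tables(vma, old_addr, new_vma, new_addr, ...) ... */

        if (vma != new_vma)
            vm_raw_write_end(vma);      /* source ptes have been moved away */
        vm_raw_write_end(new_vma);      /* destination is now open to SPF */
    }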

View File

@ -1138,7 +1138,7 @@ void do_page_add_anon_rmap(struct page *page,
}
/**
* page_add_new_anon_rmap - add pte mapping to a new anonymous page
* __page_add_new_anon_rmap - add pte mapping to a new anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
@ -1148,12 +1148,11 @@ void do_page_add_anon_rmap(struct page *page,
* This means the inc-and-test can be bypassed.
* Page does not have to be locked.
*/
void page_add_new_anon_rmap(struct page *page,
void __page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, bool compound)
{
int nr = compound ? hpage_nr_pages(page) : 1;
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
__SetPageSwapBacked(page);
if (compound) {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);

View File

@ -452,12 +452,12 @@ void lru_cache_add(struct page *page)
* directly back onto it's zone's unevictable list, it does NOT use a
* per cpu pagevec.
*/
void lru_cache_add_active_or_unevictable(struct page *page,
struct vm_area_struct *vma)
void __lru_cache_add_active_or_unevictable(struct page *page,
unsigned long vma_flags)
{
VM_BUG_ON_PAGE(PageLRU(page), page);
if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
if (likely((vma_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
SetPageActive(page);
else if (!TestSetPageMlocked(page)) {
/*

View File

@ -534,7 +534,11 @@ static unsigned long swapin_nr_pages(unsigned long offset)
* This has been extended to use the NUMA policies from the mm triggering
* the readahead.
*
* Caller must hold read mmap_sem if vmf->vma is not NULL.
* Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL.
* This is needed to ensure the VMA will not be freed behind our back. In the case
* of the speculative page fault handler, this cannot happen, even if we don't
* hold the mmap_sem. Callees are assumed to take care of reading VMA's fields
* using READ_ONCE() to read consistent values.
*/
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
@ -631,9 +635,9 @@ static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
unsigned long *start,
unsigned long *end)
{
*start = max3(lpfn, PFN_DOWN(vma->vm_start),
*start = max3(lpfn, PFN_DOWN(READ_ONCE(vma->vm_start)),
PFN_DOWN(faddr & PMD_MASK));
*end = min3(rpfn, PFN_DOWN(vma->vm_end),
*end = min3(rpfn, PFN_DOWN(READ_ONCE(vma->vm_end)),
PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}

View File

@ -1300,7 +1300,10 @@ const char * const vmstat_text[] = {
"swap_ra",
"swap_ra_hit",
#endif
#endif /* CONFIG_VM_EVENTS_COUNTERS */
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
"speculative_pgfault"
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */