diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 94b5d0878b7a4..ddcd621a49ad1 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -75,7 +75,7 @@ Currently, these files are in /proc/sys/vm: - watermark_boost_factor - watermark_scale_factor - zone_reclaim_mode - +- want_old_faultaround_pte admin_reserve_kbytes ==================== @@ -995,3 +995,24 @@ of other processes running on other nodes will not be affected. Allowing regular swap effectively restricts allocations to the local node unless explicitly overridden by memory policies or cpuset configurations. + + +want_old_faultaround_pte +======================== + +When want_old_faultaround_pte is set to 0, faultaround code produces young +ptes. When it is set to 1, faultaround produces old ptes. + +During sparse file access faultaround gets more pages mapped and when all of +them are young (value 0), under memory pressure, this makes vmscan swap out +anon pages instead, or drop other page cache pages which would otherwise stay +resident. Setting want_old_faultaround_pte to 1 avoids this. + +Making the faultaround ptes old can result in performance regression on some +architectures. This is due to cycles spent in micro-faults which would take a +page walk to set the young bit in the pte. One such known test that shows a +regression on x86 is unixbench shell8. Set want_old_faultaround_pte to 1 on +architectures which do not show this regression or if the workload shows +overall performance benefit with old faultaround ptes. + +The default value is 1. 
diff --git a/include/linux/mm.h b/include/linux/mm.h index f7ba5016bd957..8abf98f7be063 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -392,6 +392,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +#define FAULT_FLAG_PREFAULT_OLD 0x400 /* Make faultaround ptes old */ #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ @@ -2882,5 +2883,7 @@ static inline int pages_identical(struct page *page1, struct page *page2) return !memcmp_pages(page1, page2); } +extern int want_old_faultaround_pte; + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 88986dded472e..45380ac22a513 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1671,6 +1671,15 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, + { + .procname = "want_old_faultaround_pte", + .data = &want_old_faultaround_pte, + .maxlen = sizeof(want_old_faultaround_pte), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, #ifdef CONFIG_HUGETLB_PAGE { .procname = "nr_hugepages", diff --git a/mm/filemap.c b/mm/filemap.c index a231316067742..b4f3c74e62b40 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -53,6 +53,8 @@ #include +int want_old_faultaround_pte = 1; + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. 
@@ -2685,6 +2687,14 @@ void filemap_map_pages(struct vm_fault *vmf, if (vmf->pte) vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; + + if (want_old_faultaround_pte) { + if (xas.xa_index == vmf->pgoff) + vmf->flags &= ~FAULT_FLAG_PREFAULT_OLD; + else + vmf->flags |= FAULT_FLAG_PREFAULT_OLD; + } + if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); diff --git a/mm/memory.c b/mm/memory.c index 772720a2fd9d8..b23f9a66be64d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3314,6 +3314,10 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + + if (vmf->flags & FAULT_FLAG_PREFAULT_OLD) + entry = pte_mkold(entry); + /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);