From ba915b85e55364a5e6a59a56eadc34f547e52123 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 4 Mar 2024 14:22:12 +0100 Subject: [PATCH 01/13] UPSTREAM: netfilter: nf_tables: mark set as dead when unbinding anonymous set with timeout commit 552705a3650bbf46a22b1adedc1b04181490fc36 upstream. While the rhashtable set gc runs asynchronously, a race allows it to collect elements from anonymous sets with timeouts while it is being released from the commit path. Mingi Cho originally reported this issue in a different path in 6.1.x with a pipapo set with low timeouts which is not possible upstream since 7395dfacfff6 ("netfilter: nf_tables: use timestamp to check for set element timeout"). Fix this by setting on the dead flag for anonymous sets to skip async gc in this case. According to 08e4c8c5919f ("netfilter: nf_tables: mark newset as dead on transaction abort"), Florian plans to accelerate abort path by releasing objects via workqueue, therefore, this sets on the dead flag for abort path too. Bug: 329205787 Cc: stable@vger.kernel.org Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Reported-by: Mingi Cho Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 406b0241d0eb598a0b330ab20ae325537d8d8163) Signed-off-by: Lee Jones Change-Id: I6170493c267e020c50a739150f8c421deb635b35 --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7a18fc90a6cd..9ea94dccc5bc 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4068,6 +4068,7 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) { list_del_rcu(&set->list); + set->dead = 1; if (event) nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_KERNEL); From a0aeb4678bb69d19b263ed2e677c67dfeea2a915 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 28 Mar 2024 13:27:36 +0100 Subject: [PATCH 02/13] UPSTREAM: netfilter: nf_tables: release batch on table validation from abort path commit a45e6889575c2067d3c0212b6bc1022891e65b91 upstream. Unlike early commit path stage which triggers a call to abort, an explicit release of the batch is required on abort, otherwise mutex is released and commit_list remains in place. Add WARN_ON_ONCE to ensure commit_list is empty from the abort path before releasing the mutex. After this patch, commit_list is always assumed to be empty before grabbing the mutex, therefore 03c1f1ef1584 ("netfilter: Cleanup nft_net->module_list from nf_tables_exit_net()") only needs to release the pending modules for registration. 
Bug: 332996726 Cc: stable@vger.kernel.org Fixes: c0391b6ab810 ("netfilter: nf_tables: missing validation from the abort path") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman (cherry picked from commit b0b36dcbe0f24383612e5e62bd48df5a8107f7fc) Signed-off-by: Lee Jones Change-Id: I38f9b05ac4eadd1d2b7b306cccaf0aeacb61b57a --- net/netfilter/nf_tables_api.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9ea94dccc5bc..0d487f5e3f81 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7531,10 +7531,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans, *next; struct nft_trans_elem *te; + int err = 0; if (action == NFNL_ABORT_VALIDATE && nf_tables_validate(net) < 0) - return -EAGAIN; + err = -EAGAIN; list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { @@ -7667,7 +7668,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) else nf_tables_module_autoload_cleanup(net); - return 0; + return err; } static void nf_tables_cleanup(struct net *net) @@ -7686,6 +7687,8 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, ret = __nf_tables_abort(net, action); nft_gc_seq_end(nft_net, gc_seq); + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + mutex_unlock(&nft_net->commit_mutex); return ret; @@ -8377,8 +8380,11 @@ static void __net_exit nf_tables_exit_net(struct net *net) gc_seq = nft_gc_seq_begin(nft_net); - if (!list_empty(&nft_net->commit_list)) - __nf_tables_abort(net, NFNL_ABORT_NONE); + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + + if (!list_empty(&nft_net->module_list)) + nf_tables_module_autoload_cleanup(net); + __nft_release_tables(net); nft_gc_seq_end(nft_net, gc_seq); From 70c1800271967d60e9207552384a85a223b03b33 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 28 Mar 2024 14:23:55 +0100 Subject: [PATCH 03/13] UPSTREAM: netfilter: nf_tables: release mutex after nft_gc_seq_end from abort path commit 0d459e2ffb541841714839e8228b845458ed3b27 upstream. The commit mutex should not be released during the critical section between nft_gc_seq_begin() and nft_gc_seq_end(), otherwise, async GC worker could collect expired objects and get the released commit lock within the same GC sequence. nf_tables_module_autoload() temporarily releases the mutex to load module dependencies, then it goes back to replay the transaction again. Move it at the end of the abort phase after nft_gc_seq_end() is called. 
Bug: 332996726 Cc: stable@vger.kernel.org Fixes: 720344340fb9 ("netfilter: nf_tables: GC transaction race with abort path") Reported-by: Kuan-Ting Chen Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 8038ee3c3e5b59bcd78467686db5270c68544e30) Signed-off-by: Lee Jones Change-Id: I637389421d8eca5ab59a41bd1a4b70432440034c --- net/netfilter/nf_tables_api.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0d487f5e3f81..77c471616552 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7663,11 +7663,6 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nf_tables_abort_release(trans); } - if (action == NFNL_ABORT_AUTOLOAD) - nf_tables_module_autoload(net); - else - nf_tables_module_autoload_cleanup(net); - return err; } @@ -7689,6 +7684,14 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + /* module autoload needs to happen after GC sequence update because it + * temporarily releases and grabs mutex again. + */ + if (action == NFNL_ABORT_AUTOLOAD) + nf_tables_module_autoload(net); + else + nf_tables_module_autoload_cleanup(net); + mutex_unlock(&nft_net->commit_mutex); return ret; From a563a5f035dfd59989ab358aeb67c2fe8223e764 Mon Sep 17 00:00:00 2001 From: guchongchong Date: Mon, 15 Apr 2024 11:40:30 +0800 Subject: [PATCH 04/13] ANDROID: GKI: add snd_compr_stop_error to Xiaomi_abi snd_compr_stop_error is necessary for Xiaomi_abi, otherwise it will not compile Bug: 334719622 Change-Id: I7f4ff6d40af487d6cf5795600060d0cc5ac4bc64 Signed-off-by: guchongchong --- android/abi_gki_aarch64_xiaomi | 1 + 1 file changed, 1 insertion(+) diff --git a/android/abi_gki_aarch64_xiaomi b/android/abi_gki_aarch64_xiaomi index 824b53703ce5..744dcb8fe69e 100644 --- a/android/abi_gki_aarch64_xiaomi +++ b/android/abi_gki_aarch64_xiaomi @@ -10,6 +10,7 @@ snd_soc_get_volsw_range snd_soc_info_volsw_range snd_soc_put_volsw_range + snd_compr_stop_error # required by cs35l45_dlkm.ko devm_snd_soc_register_component From c54460e994bad77ca2ae96a1c3bf34ac21461b71 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Fri, 5 Apr 2024 11:00:40 -0700 Subject: [PATCH 05/13] ANDROID: 16K: Introduce /sys/kernel/mm/pgsize_migration/enabled Migrating from 4kB to 16kB page-size in Android requires first making the platform page-agnostic, which involves increasing Android-ELFs' max-page-size (p_align) from 4kB to 16kB. Increasing the ELF max-page-size was found to cause compatibility issues in apps that use obfuscation or depend on the ELF segments being mapped based on 4kB-alignment. Working around these compatibility issues involves both kernel and userspace (dynamic linker) changes. Introduce a knob for userspace (dynamic linker) to determine whether the kernel supports the mitigations needed for page-size migration compatibility.
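For a sense of how the dynamic linker could consume this knob, here is a minimal, purely illustrative userspace probe (not part of this patch); only the sysfs path and the 0/1 semantics come from the patch, everything else is a hypothetical sketch:

/* Illustration only: probe the pgsize_migration knob added by this patch. */
#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/sys/kernel/mm/pgsize_migration/enabled", "r");
    int enabled = 0;

    if (!f) {
        /* Knob absent: kernel does not support the mitigations. */
        printf("pgsize_migration: unsupported\n");
        return 0;
    }
    if (fscanf(f, "%d", &enabled) != 1)
        enabled = 0;
    fclose(f);

    printf("pgsize_migration: %s\n", enabled ? "enabled" : "disabled");
    return 0;
}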
The knob also allows for userspace to turn on or off these mitigations by writing 1 or 0 to /sys/kernel/mm/pgsize_migration/enabled: echo 1 > /sys/kernel/mm/pgsize_migration/enabled # Enable echo 0 > /sys/kernel/mm/pgsize_migration/enabled # Disable Bug: 330117029 Bug: 327600007 Bug: 330767927 Bug: 328266487 Bug: 329803029 Change-Id: I9ac1d15d397b8226b27827ecffa30502da91e10e Signed-off-by: Kalesh Singh --- mm/Makefile | 2 +- mm/pgsize_migration.c | 105 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 mm/pgsize_migration.c diff --git a/mm/Makefile b/mm/Makefile index d996846697ef..da6ec4a8f912 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -42,7 +42,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - debug.o gup.o $(mmu-y) + debug.o gup.o pgsize_migration.o $(mmu-y) # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o diff --git a/mm/pgsize_migration.c b/mm/pgsize_migration.c new file mode 100644 index 000000000000..9015d5905a16 --- /dev/null +++ b/mm/pgsize_migration.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Page Size Migration + * + * This file contains the core logic of mitigations to ensure + * app compatibility during the transition from 4kB to 16kB + * page size in Android. + * + * Copyright (c) 2024, Google LLC. + * Author: Kalesh Singh + */ + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_64BIT +#if PAGE_SIZE == SZ_4K +DEFINE_STATIC_KEY_TRUE(pgsize_migration_enabled); + +#define is_pgsize_migration_enabled() (static_branch_likely(&pgsize_migration_enabled)) +#else /* PAGE_SIZE != SZ_4K */ +DEFINE_STATIC_KEY_FALSE(pgsize_migration_enabled); + +#define is_pgsize_migration_enabled() (static_branch_unlikely(&pgsize_migration_enabled)) +#endif /* PAGE_SIZE == SZ_4K */ + +static ssize_t show_pgsize_migration_enabled(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + if (is_pgsize_migration_enabled()) + return sprintf(buf, "%d\n", 1); + else + return sprintf(buf, "%d\n", 0); + } + +static ssize_t store_pgsize_migration_enabled(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + /* Migration is only applicable to 4kB kernels */ + if (PAGE_SIZE != SZ_4K) + return n; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + if (val == 1) + static_branch_enable(&pgsize_migration_enabled); + else if (val == 0) + static_branch_disable(&pgsize_migration_enabled); + + return n; +} + +static struct kobj_attribute pgsize_migration_enabled_attr = __ATTR( + enabled, + 0644, + show_pgsize_migration_enabled, + store_pgsize_migration_enabled +); + +static struct attribute *pgsize_migration_attrs[] = { + &pgsize_migration_enabled_attr.attr, + NULL +}; + +static struct attribute_group pgsize_migration_attr_group = { + .name = "pgsize_migration", + .attrs = pgsize_migration_attrs, +}; + +/** + * What: /sys/kernel/mm/pgsize_migration/enabled + * Date: April 2024 + * KernelVersion: v5.4+ (GKI kernels) + * Contact: Kalesh Singh + * Description: /sys/kernel/mm/pgsize_migration/enabled + * allows for userspace to turn on or off page size + * migration mitigations necessary for app compatibility + * during Android's transition from 4kB to 16kB page size.
+ * Such mitigations include preserving /proc//[s]maps + * output as if there was no segment extension by the + * dynamic loader; and preventing fault around in the padding + * sections of ELF LOAD segment mappings. + * Users: Bionic's dynamic linker + */ +static int __init init_pgsize_migration(void) +{ + if (sysfs_create_group(mm_kobj, &pgsize_migration_attr_group)) + pr_err("pgsize_migration: failed to create sysfs group\n"); + + return 0; +}; +late_initcall(init_pgsize_migration); +#endif /* CONFIG_64BIT */ From 05f9de39f76b46d37347638f6bfbad3986d93563 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 4 Apr 2024 22:21:32 -0700 Subject: [PATCH 06/13] ANDROID: 16K: Introduce ELF padding representation for VMAs The dynamic linker may extend ELF LOAD segment mappings to be contiguous in memory when loading a 16kB compatible ELF on a 4kB page-size system. This is done to reduce the use of unreclaimable VMA slab memory for the otherwise necessary "gap" VMAs. The extended portion of the mapping (VMA) can be viewed as "padding", meaning that the mapping in that range corresponds to an area of the file that does not contain contents of the respective segments (maybe zeros depending on how the ELF is built). For some compatibility mitigations, the region of a VMA corresponding to these padding sections needs to be known. In order to represent such regions without adding additional overhead or breaking ABI, some upper bits of vm_flags are used. Add the VMA padding pages representation and the necessary APIs to manipulate it. Bug: 330117029 Bug: 327600007 Bug: 330767927 Bug: 328266487 Bug: 329803029 Change-Id: Ieb9fa98e30ec9b0bec62256624f14e3ed6062a75 Signed-off-by: Kalesh Singh --- include/linux/pgsize_migration.h | 64 ++++++++++++++++++++++++++++++++ mm/pgsize_migration.c | 21 +++++++++++ 2 files changed, 85 insertions(+) create mode 100644 include/linux/pgsize_migration.h diff --git a/include/linux/pgsize_migration.h b/include/linux/pgsize_migration.h new file mode 100644 index 000000000000..60f719d44107 --- /dev/null +++ b/include/linux/pgsize_migration.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PAGE_SIZE_MIGRATION_H +#define _LINUX_PAGE_SIZE_MIGRATION_H + +/* + * Page Size Migration + * + * Copyright (c) 2024, Google LLC. + * Author: Kalesh Singh + * + * This file contains the APIs for mitigations to ensure + * app compatibility during the transition from 4kB to 16kB + * page size in Android. + */ + +#include +#include + +/* + * vm_flags representation of VMA padding pages. + * + * This allows the kernel to identify the portion of an ELF LOAD segment VMA + * that is padding. + * + * 4 high bits of vm_flags [63,60] are used to represent ELF segment padding + * up to 60kB, which is sufficient for ELFs of both 16kB and 64kB segment + * alignment (p_align). + * + * The representation is illustrated below.
+ * + * 63 62 61 60 + * _________ _________ _________ _________ + * | Bit 3 | Bit 2 | Bit 1 | Bit 0 | + * | of 4kB | of 4kB | of 4kB | of 4kB | + * | chunks | chunks | chunks | chunks | + * |_________|_________|_________|_________| + */ + +#define VM_PAD_WIDTH 4 +#define VM_PAD_SHIFT (BITS_PER_LONG - VM_PAD_WIDTH) +#define VM_TOTAL_PAD_PAGES ((1ULL << VM_PAD_WIDTH) - 1) + +#if PAGE_SIZE == SZ_4K && defined(CONFIG_64BIT) +extern void vma_set_pad_pages(struct vm_area_struct *vma, + unsigned long nr_pages); + +extern unsigned long vma_pad_pages(struct vm_area_struct *vma); +#else /* PAGE_SIZE != SZ_4K || !defined(CONFIG_64BIT) */ +static inline void vma_set_pad_pages(struct vm_area_struct *vma, + unsigned long nr_pages) +{ +} + +static inline unsigned long vma_pad_pages(struct vm_area_struct *vma) +{ + return 0; +} +#endif /* PAGE_SIZE == SZ_4K && defined(CONFIG_64BIT) */ + +static inline unsigned long vma_data_pages(struct vm_area_struct *vma) +{ + return vma_pages(vma) - vma_pad_pages(vma); +} +#endif /* _LINUX_PAGE_SIZE_MIGRATION_H */ diff --git a/mm/pgsize_migration.c b/mm/pgsize_migration.c index 9015d5905a16..013d256727b8 100644 --- a/mm/pgsize_migration.c +++ b/mm/pgsize_migration.c @@ -10,6 +10,8 @@ * Author: Kalesh Singh */ +#include + #include #include #include @@ -102,4 +104,23 @@ static int __init init_pgsize_migration(void) return 0; }; late_initcall(init_pgsize_migration); + +#if PAGE_SIZE == SZ_4K +void vma_set_pad_pages(struct vm_area_struct *vma, + unsigned long nr_pages) +{ + if (!is_pgsize_migration_enabled()) + return; + + vma->vm_flags |= (nr_pages << VM_PAD_SHIFT); +} + +unsigned long vma_pad_pages(struct vm_area_struct *vma) +{ + if (!is_pgsize_migration_enabled()) + return 0; + + return vma->vm_flags >> VM_PAD_SHIFT; +} +#endif /* PAGE_SIZE == SZ_4K */ #endif /* CONFIG_64BIT */ From 0f0e4aae7a59400120d6ddea66eb440fc9912d9a Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 4 Apr 2024 22:21:32 -0700 Subject: [PATCH 07/13] ANDROID: 16K: Use MADV_DONTNEED to save VMA padding pages. When performing LOAD segment extension, the dynamic linker knows what portion of the VMA is padding. In order for the kernel to implement mitigations that ensure app compatibility, the extent of the padding must be made available to the kernel. To achieve this, reuse MADV_DONTNEED on single VMAs to hint the padding range to the kernel. This information is then stored in vm_flag bits. This allows userspace (dynamic linker) to set the padding pages on the VMA without a need for new out-of-tree UAPI. 
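To make the madvise-based hinting concrete, a linker-side call might look like the sketch below. This is illustrative only (hint_padding, pad_start and pad_len are hypothetical names, not code from this patch); the constraints noted in the comment are the ones the kernel checks before recording the hint:

#include <stdio.h>
#include <sys/mman.h>

/*
 * Hypothetical loader helper (sketch): pad_start/pad_len describe the padding
 * tail of an extended, file-backed (*.so) LOAD segment mapping. The hint is
 * only recorded if the range ends exactly at the VMA end, starts after the
 * VMA start, and spans no more than VM_TOTAL_PAD_PAGES 4kB pages; otherwise
 * this behaves as an ordinary MADV_DONTNEED.
 */
int hint_padding(void *pad_start, size_t pad_len)
{
    if (madvise(pad_start, pad_len, MADV_DONTNEED) != 0) {
        perror("madvise");
        return -1;
    }
    return 0;
}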
Bug: 330117029 Bug: 327600007 Bug: 330767927 Bug: 328266487 Bug: 329803029 Change-Id: I3421de32ab38ad3cb0fbce73ecbd8f7314287cde Signed-off-by: Kalesh Singh --- include/linux/pgsize_migration.h | 8 +++++ mm/madvise.c | 3 ++ mm/pgsize_migration.c | 56 ++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/include/linux/pgsize_migration.h b/include/linux/pgsize_migration.h index 60f719d44107..fd1e74ea4283 100644 --- a/include/linux/pgsize_migration.h +++ b/include/linux/pgsize_migration.h @@ -45,6 +45,9 @@ extern void vma_set_pad_pages(struct vm_area_struct *vma, unsigned long nr_pages); extern unsigned long vma_pad_pages(struct vm_area_struct *vma); + +extern void madvise_vma_pad_pages(struct vm_area_struct *vma, + unsigned long start, unsigned long end); #else /* PAGE_SIZE != SZ_4K || !defined(CONFIG_64BIT) */ static inline void vma_set_pad_pages(struct vm_area_struct *vma, unsigned long nr_pages) @@ -55,6 +58,11 @@ static inline unsigned long vma_pad_pages(struct vm_area_struct *vma) { return 0; } + +static inline void madvise_vma_pad_pages(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ +} #endif /* PAGE_SIZE == SZ_4K && defined(CONFIG_64BIT) */ static inline unsigned long vma_data_pages(struct vm_area_struct *vma) diff --git a/mm/madvise.c b/mm/madvise.c index 776378379fab..ed1fffbe6f03 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -756,6 +757,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, static long madvise_dontneed_single_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { + madvise_vma_pad_pages(vma, start, end); + zap_page_range(vma, start, end - start); return 0; } diff --git a/mm/pgsize_migration.c b/mm/pgsize_migration.c index 013d256727b8..6b84172e37f7 100644 --- a/mm/pgsize_migration.c +++ b/mm/pgsize_migration.c @@ -122,5 +122,61 @@ unsigned long vma_pad_pages(struct vm_area_struct *vma) return vma->vm_flags >> VM_PAD_SHIFT; } + +static __always_inline bool str_has_suffix(const char *str, const char *suffix) +{ + size_t str_len = strlen(str); + size_t suffix_len = strlen(suffix); + + if (str_len < suffix_len) + return false; + + return !strncmp(str + str_len - suffix_len, suffix, suffix_len); +} + +/* + * Saves the number of padding pages for an ELF segment mapping + * in vm_flags. + * + * The number of padding pages is deduced from the madvise DONTNEED range [start, end) + * if the following conditions are met: + * 1) The range is enclosed by a single VMA + * 2) The range ends at the end address of the VMA + * 3) The range starts at an address greater than the start address of the VMA + * 4) The number of the pages in the range does not exceed VM_TOTAL_PAD_PAGES. + * 5) The VMA is a regular file backed VMA (filemap_fault) + * 6) The file backing the VMA is a shared library (*.so) + */ +void madvise_vma_pad_pages(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + unsigned long nr_pad_pages; + + if (!is_pgsize_migration_enabled()) + return; + + /* Only handle this for file backed VMAs */ + if (!vma->vm_file || !vma->vm_ops || vma->vm_ops->fault != filemap_fault) + return; + + + /* Limit this to only shared libraries (*.so) */ + if (!str_has_suffix(vma->vm_file->f_path.dentry->d_name.name, ".so")) + return; + + /* + * If the madvise range is it at the end of the file save the number of + * pages in vm_flags (only need 4 bits are needed for 16kB aligned ELFs). 
+ */ + if (start <= vma->vm_start || end != vma->vm_end) + return; + + nr_pad_pages = (end - start) >> PAGE_SHIFT; + + if (!nr_pad_pages || nr_pad_pages > VM_TOTAL_PAD_PAGES) + return; + + vma_set_pad_pages(vma, nr_pad_pages); +} #endif /* PAGE_SIZE == SZ_4K */ #endif /* CONFIG_64BIT */ From 6ad75e7a9d0150e87dc82cc3d6106f9f686103a9 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 4 Apr 2024 22:37:48 -0700 Subject: [PATCH 08/13] ANDROID: 16K: Exclude ELF padding for fault around range Userspace apps often analyze memory consumption by the use of mm rss_stat counters -- via the kmem/rss_stat trace event or from /proc//statm. rss_stat counters are only updated when the PTEs are updated. What this means is that pages can be present in the page cache from readahead but not visible to userspace (not attributed to the app) as there is no corresponding VMA (PTEs) for the respective page cache pages. A side effect of the loader now extending ELF LOAD segments to be contiguously mapped in the virtual address space is that the VMA is extended to cover the padding pages. When filesystems, such as f2fs and ext4, that implement vm_ops->map_pages() attempt to perform a do_fault_around(), the extent of the fault around is restricted by the area of the enclosing VMA. Since the loader extends LOAD segment VMAs to be contiguously mapped, the extent of the fault around is also increased. The result is that the PTEs corresponding to the padding pages are updated and reflected in the rss_stat counters. It is not common for userspace application developers to be aware of this nuance in the kernel's memory accounting. To avoid apparent regressions in memory usage to userspace, restrict the fault around range to only valid data pages (i.e. exclude the padding pages at the end of the VMA). Bug: 330117029 Bug: 327600007 Bug: 330767927 Bug: 328266487 Bug: 329803029 Change-Id: I2c7a39ec1b040be2b9fb47801f95042f5dbf869d Signed-off-by: Kalesh Singh --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 9b98d52c174a..d79db8aba779 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -3604,7 +3605,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) end_pgoff = start_pgoff - ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; - end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, + end_pgoff = min3(end_pgoff, vma_data_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); if (pmd_none(*vmf->pmd)) { From 1375f8328beedd25110e2fa2c28f602847ff6456 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 4 Apr 2024 23:02:30 -0700 Subject: [PATCH 09/13] ANDROID: 16K: Separate padding from ELF LOAD segment mappings It has been found that some in-field apps depend on the output of /proc/*/maps to determine the address ranges of other operations. With the extension of LOAD segment VMAs to be contiguous in memory, the apps may perform operations on an area that is not backed by the underlying file, which results in a SIGBUS. Other apps have crashed for as yet unidentified reasons. To avoid breaking in-field apps, maintain the output of /proc/*/[s]maps with PROT_NONE VMAs for the padding pages of LOAD segments instead of showing the segment extensions. NOTE: This does not allocate actual backing VMAs for the shown PROT_NONE mappings.
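Purely as an illustration (not part of this patch), the synthetic entries introduced here can be observed from userspace with a simple scan; only the "[page size compat]" name below comes from the patch, the rest is a sketch:

/* Illustration only: count "[page size compat]" entries in /proc/self/maps. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char line[512];
    int pad_vmas = 0;
    FILE *maps = fopen("/proc/self/maps", "r");

    if (!maps) {
        perror("fopen");
        return 1;
    }
    while (fgets(line, sizeof(line), maps)) {
        if (strstr(line, "[page size compat]"))
            pad_vmas++;
    }
    fclose(maps);

    printf("padding VMAs shown: %d\n", pad_vmas);
    return 0;
}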
This approach maintains 2 possible assumptions that userspace (apps) could be depending on: 1) That LOAD segment mappings are "contiguous" (not speparated by unrelated mappings) in memory. 2) That no virtual address space is available between mappings of consecutive LOAD segments for the same ELF. For example the output of /proc/*/[s]maps before and after this change is shown below. Segments maintain PROT_NONE gaps ("[page size compat]") for app compatiblity but these are not backed by actual slab VMA memory. Maps Before: 7fb03604d000-7fb036051000 r--p 00000000 fe:09 21935719 /system/lib64/libnetd_client.so 7fb036051000-7fb036055000 r-xp 00004000 fe:09 21935719 /system/lib64/libnetd_client.so 7fb036055000-7fb036059000 r--p 00008000 fe:09 21935719 /system/lib64/libnetd_client.so 7fb036059000-7fb03605a000 rw-p 0000c000 fe:09 21935719 /system/lib64/libnetd_client.so Maps After: 7fc707390000-7fc707393000 r--p 00000000 fe:09 21935719 /system/lib64/libnetd_client.so 7fc707393000-7fc707394000 ---p 00000000 00:00 0 [page size compat] 7fc707394000-7fc707398000 r-xp 00004000 fe:09 21935719 /system/lib64/libnetd_client.so 7fc707398000-7fc707399000 r--p 00008000 fe:09 21935719 /system/lib64/libnetd_client.so 7fc707399000-7fc70739c000 ---p 00000000 00:00 0 [page size compat] 7fc70739c000-7fc70739d000 rw-p 0000c000 fe:09 21935719 /system/lib64/libnetd_client.so Smaps Before: 7fb03604d000-7fb036051000 r--p 00000000 fe:09 21935719 /system/lib64/libnetd_client.so Size: 16 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 16 kB Pss: 0 kB Pss_Dirty: 0 kB Shared_Clean: 16 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 16 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: rd mr mw me 7fb036051000-7fb036055000 r-xp 00004000 fe:09 21935719 /system/lib64/libnetd_client.so Size: 16 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 16 kB Pss: 0 kB Pss_Dirty: 0 kB Shared_Clean: 16 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 16 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: rd ex mr mw me 7fb036055000-7fb036059000 r--p 00008000 fe:09 21935719 /system/lib64/libnetd_client.so Size: 16 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 4 kB Pss: 4 kB Pss_Dirty: 4 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 4 kB Referenced: 4 kB Anonymous: 4 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: rd mr mw me ac 7fb036059000-7fb03605a000 rw-p 0000c000 fe:09 21935719 /system/lib64/libnetd_client.so Size: 4 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 4 kB Pss: 4 kB Pss_Dirty: 4 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 4 kB Referenced: 4 kB Anonymous: 4 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: rd wr mr mw me ac Smaps After: 7fc707390000-7fc707393000 r--p 00000000 fe:09 21935719 /system/lib64/libnetd_client.so Size: 12 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 12 kB Pss: 0 kB Shared_Clean: 12 kB Shared_Dirty: 0 kB Private_Clean: 0 kB 
Private_Dirty: 0 kB Referenced: 12 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: rd mr mw me ?? 7fc707393000-7fc707394000 ---p 00000000 00:00 0 [page size compat] Size: 4 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 0 kB Pss: 0 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 0 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: mr mw me 7fc707394000-7fc707398000 r-xp 00004000 fe:09 21935719 /system/lib64/libnetd_client.so Size: 16 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 16 kB Pss: 0 kB Shared_Clean: 16 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 16 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: rd ex mr mw me 7fc707398000-7fc707399000 r--p 00008000 fe:09 21935719 /system/lib64/libnetd_client.so Size: 4 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 4 kB Pss: 4 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 4 kB Referenced: 4 kB Anonymous: 4 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: rd mr mw me ac ?? ?? 7fc707399000-7fc70739c000 ---p 00000000 00:00 0 [page size compat] Size: 12 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 0 kB Pss: 0 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 0 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: mr mw me ac 7fc70739c000-7fc70739d000 rw-p 0000c000 fe:09 21935719 /system/lib64/libnetd_client.so Size: 4 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 4 kB Pss: 4 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 4 kB Referenced: 4 kB Anonymous: 4 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: rd wr mr mw me ac Bug: 330117029 Bug: 327600007 Bug: 330767927 Bug: 328266487 Bug: 329803029 Change-Id: I12bf2c106fafc74a500d79155b81dde5db42661e Signed-off-by: Kalesh Singh --- fs/proc/task_mmu.c | 23 ++++++-- include/linux/pgsize_migration.h | 29 ++++++++++ mm/pgsize_migration.c | 92 ++++++++++++++++++++++++++++++++ 3 files changed, 141 insertions(+), 3 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a60199ff47e9..05ffaa1f55c9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -419,7 +420,13 @@ done: static int show_map(struct seq_file *m, void *v) { - show_map_vma(m, v); + struct vm_area_struct *pad_vma = get_pad_vma(v); + struct vm_area_struct *vma = get_data_vma(v); + + show_map_vma(m, vma); + + show_map_pad_vma(vma, pad_vma, m, show_map_vma); + m_cache_vma(m, v); return 0; } @@ -877,7 +884,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, seq_puts(m, " kB\n"); } -static 
int show_smap(struct seq_file *m, void *v) +static void show_smap_vma(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; struct mem_size_stats mss; @@ -907,8 +914,18 @@ static int show_smap(struct seq_file *m, void *v) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); show_smap_vma_flags(m, vma); - m_cache_vma(m, vma); +} +static int show_smap(struct seq_file *m, void *v) +{ + struct vm_area_struct *pad_vma = get_pad_vma(v); + struct vm_area_struct *vma = get_data_vma(v); + + show_smap_vma(m, vma); + + show_map_pad_vma(vma, pad_vma, m, (show_pad_vma_fn)show_smap_vma); + + m_cache_vma(m, v); return 0; } diff --git a/include/linux/pgsize_migration.h b/include/linux/pgsize_migration.h index fd1e74ea4283..7ab0f288bcf9 100644 --- a/include/linux/pgsize_migration.h +++ b/include/linux/pgsize_migration.h @@ -14,6 +14,7 @@ */ #include +#include #include /* @@ -39,6 +40,10 @@ #define VM_PAD_WIDTH 4 #define VM_PAD_SHIFT (BITS_PER_LONG - VM_PAD_WIDTH) #define VM_TOTAL_PAD_PAGES ((1ULL << VM_PAD_WIDTH) - 1) +#define VM_PAD_MASK (VM_TOTAL_PAD_PAGES << VM_PAD_SHIFT) +#define VMA_PAD_START(vma) (vma->vm_end - (vma_pad_pages(vma) << PAGE_SHIFT)) + +typedef void (*show_pad_vma_fn)(struct seq_file *m, struct vm_area_struct *vma); #if PAGE_SIZE == SZ_4K && defined(CONFIG_64BIT) extern void vma_set_pad_pages(struct vm_area_struct *vma, @@ -48,6 +53,14 @@ extern unsigned long vma_pad_pages(struct vm_area_struct *vma); extern void madvise_vma_pad_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end); + +extern struct vm_area_struct *get_pad_vma(struct vm_area_struct *vma); + +extern struct vm_area_struct *get_data_vma(struct vm_area_struct *vma); + +extern void show_map_pad_vma(struct vm_area_struct *vma, + struct vm_area_struct *pad, + struct seq_file *m, show_pad_vma_fn func); #else /* PAGE_SIZE != SZ_4K || !defined(CONFIG_64BIT) */ static inline void vma_set_pad_pages(struct vm_area_struct *vma, unsigned long nr_pages) @@ -63,6 +76,22 @@ static inline void madvise_vma_pad_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end) { } + +static inline struct vm_area_struct *get_pad_vma(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline struct vm_area_struct *get_data_vma(struct vm_area_struct *vma) +{ + return vma; +} + +static inline void show_map_pad_vma(struct vm_area_struct *vma, + struct vm_area_struct *pad, + struct seq_file *m, show_pad_vma_fn func) +{ +} #endif /* PAGE_SIZE == SZ_4K && defined(CONFIG_64BIT) */ static inline unsigned long vma_data_pages(struct vm_area_struct *vma) diff --git a/mm/pgsize_migration.c b/mm/pgsize_migration.c index 6b84172e37f7..c98870f41018 100644 --- a/mm/pgsize_migration.c +++ b/mm/pgsize_migration.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #ifdef CONFIG_64BIT @@ -178,5 +179,96 @@ void madvise_vma_pad_pages(struct vm_area_struct *vma, vma_set_pad_pages(vma, nr_pad_pages); } + +static const char *pad_vma_name(struct vm_area_struct *vma) +{ + return "[page size compat]"; +} + +static const struct vm_operations_struct pad_vma_ops = { + .name = pad_vma_name, +}; + +/* + * Returns a new VMA representing the padding in @vma, if no padding + * in @vma returns NULL. 
+ */ +struct vm_area_struct *get_pad_vma(struct vm_area_struct *vma) +{ + struct vm_area_struct *pad; + + if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK)) + return NULL; + + pad = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + + *pad = *vma; + + /* Remove file */ + pad->vm_file = NULL; + + /* Add vm_ops->name */ + pad->vm_ops = &pad_vma_ops; + + /* Adjust the start to begin at the start of the padding section */ + pad->vm_start = VMA_PAD_START(pad); + + /* Make the pad vma PROT_NONE */ + pad->vm_flags = pad->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); + + /* Remove padding bits */ + pad->vm_flags = pad->vm_flags & ~VM_PAD_MASK; + + return pad; +} + +/* + * Returns a new VMA exclusing the padding from @vma; if no padding in + * @vma returns @vma. + */ +struct vm_area_struct *get_data_vma(struct vm_area_struct *vma) +{ + struct vm_area_struct *data; + + if (!is_pgsize_migration_enabled() || !(vma->vm_flags & VM_PAD_MASK)) + return vma; + + data = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL); + + *data = *vma; + + /* Adjust the end to the start of the padding section */ + data->vm_end = VMA_PAD_START(data); + + return data; +} + +/* + * Calls the show_pad_vma_fn on the @pad VMA, and frees the copies of @vma + * and @pad. + */ +void show_map_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *pad, + struct seq_file *m, show_pad_vma_fn func) +{ + if (!pad) + return; + + /* + * This cannot happen. If @pad vma was allocated the corresponding + * @vma should have the VM_PAD_MASK bit(s) set. + */ + BUG_ON(!(vma->vm_flags & VM_PAD_MASK)); + + /* + * This cannot happen. @pad is a section of the original VMA. + * Therefore @vma cannot be null if @pad is not null. + */ + BUG_ON(!vma); + + func(m, pad); + + kfree(pad); + kfree(vma); +} #endif /* PAGE_SIZE == SZ_4K */ #endif /* CONFIG_64BIT */ From 95ac7272d7232db2229ed56404d485d0891925cb Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Fri, 19 Apr 2024 14:41:35 -0700 Subject: [PATCH 10/13] ANDROID: 16K: Only madvise padding from dynamic linker context Only preform padding advise from the execution context on bionic's dynamic linker. This ensures that madvise() doesn't have unwanted side effects. Also rearrange the order of fail checks in madvise_vma_pad_pages() in order of ascending cost. Bug: 330117029 Bug: 327600007 Bug: 330767927 Bug: 328266487 Bug: 329803029 Change-Id: I3e05b8780c6eda78007f86b613f8c11dd18ac28f Signed-off-by: Kalesh Singh --- mm/pgsize_migration.c | 75 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 10 deletions(-) diff --git a/mm/pgsize_migration.c b/mm/pgsize_migration.c index c98870f41018..605e405d7efe 100644 --- a/mm/pgsize_migration.c +++ b/mm/pgsize_migration.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -135,6 +136,56 @@ static __always_inline bool str_has_suffix(const char *str, const char *suffix) return !strncmp(str + str_len - suffix_len, suffix, suffix_len); } +/* + * The dynamic linker, or interpreter, operates within the process context + * of the binary that necessitated dynamic linking. + * + * Consequently, process context identifiers; like PID, comm, ...; cannot + * be used to differentiate whether the execution context belongs to the + * dynamic linker or not. + * + * linker_ctx() deduces whether execution is currently in the dynamic linker's + * context by correlating the current userspace instruction pointer with the + * VMAs of the current task. + * + * Returns true if in linker context, otherwise false. 
+ * + * Caller must hold mmap lock in read mode. + */ +static inline bool linker_ctx(void) +{ + struct pt_regs *regs = task_pt_regs(current); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct file *file; + + if (!regs) + return false; + + vma = find_vma(mm, instruction_pointer(regs)); + + /* Current execution context, the VMA must be present */ + BUG_ON(!vma); + + file = vma->vm_file; + if (!file) + return false; + + if ((vma->vm_flags & VM_EXEC)) { + char buf[64]; + const int bufsize = sizeof(buf); + char *path; + + memset(buf, 0, bufsize); + path = d_path(&file->f_path, buf, bufsize); + + if (!strcmp(path, "/system/bin/linker64")) + return true; + } + + return false; +} + /* * Saves the number of padding pages for an ELF segment mapping * in vm_flags. @@ -147,6 +198,7 @@ static __always_inline bool str_has_suffix(const char *str, const char *suffix) * 4) The number of the pages in the range does not exceed VM_TOTAL_PAD_PAGES. * 5) The VMA is a regular file backed VMA (filemap_fault) * 6) The file backing the VMA is a shared library (*.so) + * 7) The madvise was requested by bionic's dynamic linker. */ void madvise_vma_pad_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end) @@ -156,18 +208,9 @@ void madvise_vma_pad_pages(struct vm_area_struct *vma, if (!is_pgsize_migration_enabled()) return; - /* Only handle this for file backed VMAs */ - if (!vma->vm_file || !vma->vm_ops || vma->vm_ops->fault != filemap_fault) - return; - - - /* Limit this to only shared libraries (*.so) */ - if (!str_has_suffix(vma->vm_file->f_path.dentry->d_name.name, ".so")) - return; - /* * If the madvise range is it at the end of the file save the number of - * pages in vm_flags (only need 4 bits are needed for 16kB aligned ELFs). + * pages in vm_flags (only need 4 bits are needed for up to 64kB aligned ELFs). */ if (start <= vma->vm_start || end != vma->vm_end) return; @@ -177,6 +220,18 @@ void madvise_vma_pad_pages(struct vm_area_struct *vma, if (!nr_pad_pages || nr_pad_pages > VM_TOTAL_PAD_PAGES) return; + /* Only handle this for file backed VMAs */ + if (!vma->vm_file || !vma->vm_ops || vma->vm_ops->fault != filemap_fault) + return; + + /* Limit this to only shared libraries (*.so) */ + if (!str_has_suffix(vma->vm_file->f_path.dentry->d_name.name, ".so")) + return; + + /* Only bionic's dynamic linker needs to hint padding pages. */ + if (!linker_ctx()) + return; + vma_set_pad_pages(vma, nr_pad_pages); } From 7231bbf0e47554dbbfe4df196dcca3127232df76 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 25 Apr 2024 09:59:08 -0700 Subject: [PATCH 11/13] ANDROID: 16K: madvise_vma_pad_pages: Remove filemap_fault check Some file systems like F2FS use a custom filemap_fault ops. Remove this check, as checking vm_file is sufficient. Bug: 330117029 Bug: 327600007 Bug: 330767927 Bug: 328266487 Bug: 329803029 Change-Id: Id6a584d934f06650c0a95afd1823669fc77ba2c2 Signed-off-by: Kalesh Singh --- mm/pgsize_migration.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/pgsize_migration.c b/mm/pgsize_migration.c index 605e405d7efe..e427950a6d6d 100644 --- a/mm/pgsize_migration.c +++ b/mm/pgsize_migration.c @@ -196,7 +196,7 @@ static inline bool linker_ctx(void) * 2) The range ends at the end address of the VMA * 3) The range starts at an address greater than the start address of the VMA * 4) The number of the pages in the range does not exceed VM_TOTAL_PAD_PAGES. - * 5) The VMA is a regular file backed VMA (filemap_fault) + * 5) The VMA is a file backed VMA. 
* 6) The file backing the VMA is a shared library (*.so) * 7) The madvise was requested by bionic's dynamic linker. */ @@ -221,7 +221,7 @@ void madvise_vma_pad_pages(struct vm_area_struct *vma, return; /* Only handle this for file backed VMAs */ - if (!vma->vm_file || !vma->vm_ops || vma->vm_ops->fault != filemap_fault) + if (!vma->vm_file) return; /* Limit this to only shared libraries (*.so) */ From ecba20dd597e9a37e40d59cc3574e32a7a4d0471 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 22 Apr 2024 14:24:59 -0700 Subject: [PATCH 12/13] ANDROID: 16K: Handle pad VMA splits and merges In some cases a VMA with padding representation may be split, and therefore the padding flags must be updated accordingly. There are 3 cases to handle: Given: | DDDDPPPP | where: - D represents 1 page of data; - P represents 1 page of padding; - | represents the boundaries (start/end) of the VMA 1) Split exactly at the padding boundary | DDDDPPPP | --> | DDDD | PPPP | - Remove padding flags from the first VMA. - The second VMA is all padding 2) Split within the padding area | DDDDPPPP | --> | DDDDPP | PP | - Subtract the length of the second VMA from the first VMA's padding. - The second VMA is all padding, adjust its padding length (flags) 3) Split within the data area | DDDDPPPP | --> | DD | DDPPPP | - Remove padding flags from the first VMA. - The second VMA is has the same padding as from before the split. To simplify the semantics merging of padding VMAs is not allowed. If a split produces a VMA that is entirely padding, show_[s]maps() only outputs the padding VMA entry (as the data entry is of length 0). Bug: 330117029 Bug: 327600007 Bug: 330767927 Bug: 328266487 Bug: 329803029 Change-Id: Ie2628ced5512e2c7f8af25fabae1f38730c8bb1a Signed-off-by: Kalesh Singh --- fs/proc/task_mmu.c | 7 ++-- include/linux/pgsize_migration.h | 34 ++++++++++++++++ mm/mlock.c | 3 +- mm/mmap.c | 7 +++- mm/mprotect.c | 3 +- mm/pgsize_migration.c | 69 +++++++++++++++++++++++++++++++- 6 files changed, 115 insertions(+), 8 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 05ffaa1f55c9..4843a0473487 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -423,7 +423,8 @@ static int show_map(struct seq_file *m, void *v) struct vm_area_struct *pad_vma = get_pad_vma(v); struct vm_area_struct *vma = get_data_vma(v); - show_map_vma(m, vma); + if (vma_pages(vma)) + show_map_vma(m, vma); show_map_pad_vma(vma, pad_vma, m, show_map_vma); @@ -913,7 +914,6 @@ static void show_smap_vma(struct seq_file *m, void *v) if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); show_smap_vma_flags(m, vma); - } static int show_smap(struct seq_file *m, void *v) @@ -921,7 +921,8 @@ static int show_smap(struct seq_file *m, void *v) struct vm_area_struct *pad_vma = get_pad_vma(v); struct vm_area_struct *vma = get_data_vma(v); - show_smap_vma(m, vma); + if (vma_pages(vma)) + show_smap_vma(m, vma); show_map_pad_vma(vma, pad_vma, m, (show_pad_vma_fn)show_smap_vma); diff --git a/include/linux/pgsize_migration.h b/include/linux/pgsize_migration.h index 7ab0f288bcf9..5c47ec28ea7d 100644 --- a/include/linux/pgsize_migration.h +++ b/include/linux/pgsize_migration.h @@ -61,6 +61,9 @@ extern struct vm_area_struct *get_data_vma(struct vm_area_struct *vma); extern void show_map_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *pad, struct seq_file *m, show_pad_vma_fn func); + +extern void split_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *new, + unsigned long addr, int new_below); #else /* 
PAGE_SIZE != SZ_4K || !defined(CONFIG_64BIT) */ static inline void vma_set_pad_pages(struct vm_area_struct *vma, unsigned long nr_pages) @@ -92,10 +95,41 @@ static inline void show_map_pad_vma(struct vm_area_struct *vma, struct seq_file *m, show_pad_vma_fn func) { } + +static inline void split_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *new, + unsigned long addr, int new_below) +{ +} #endif /* PAGE_SIZE == SZ_4K && defined(CONFIG_64BIT) */ static inline unsigned long vma_data_pages(struct vm_area_struct *vma) { return vma_pages(vma) - vma_pad_pages(vma); } + +/* + * Sets the correct padding bits / flags for a VMA split. + */ +static inline unsigned long vma_pad_fixup_flags(struct vm_area_struct *vma, + unsigned long newflags) +{ + if (newflags & VM_PAD_MASK) + return (newflags & ~VM_PAD_MASK) | (vma->vm_flags & VM_PAD_MASK); + else + return newflags; +} + +/* + * Merging of padding VMAs is uncommon, as padding is only allowed + * from the linker context. + * + * To simplify the semantics, adjacent VMAs with padding are not + * allowed to merge. + */ +static inline bool is_mergable_pad_vma(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + /* Padding VMAs cannot be merged with other padding or real VMAs */ + return !((vma->vm_flags | vm_flags) & VM_PAD_MASK); +} #endif /* _LINUX_PAGE_SIZE_MIGRATION_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 646acba3045b..c76fb104507a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -571,7 +572,7 @@ success: */ if (lock) - vma->vm_flags = newflags; + vma->vm_flags = vma_pad_fixup_flags(vma, newflags); else munlock_vma_pages_range(vma, start, end); diff --git a/mm/mmap.c b/mm/mmap.c index 8f1716292c63..bcc12a4f0662 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -1030,6 +1031,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (vma_get_anon_name(vma) != anon_name) return 0; + if (!is_mergable_pad_vma(vma, vm_flags)) + return 0; return 1; } @@ -2731,8 +2734,10 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); /* Success. */ - if (!err) + if (!err) { + split_pad_vma(vma, new, addr, new_below); return 0; + } /* Clean everything up if vma_adjust failed. */ if (new->vm_ops && new->vm_ops->close) diff --git a/mm/mprotect.c b/mm/mprotect.c index 87a59d46b4de..99cf8a97b2e4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -454,7 +455,7 @@ success: * vm_flags and vm_page_prot are protected by the mmap_sem * held in write mode. 
*/ - vma->vm_flags = newflags; + vma->vm_flags = vma_pad_fixup_flags(vma, newflags); dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); vma_set_page_prot(vma); diff --git a/mm/pgsize_migration.c b/mm/pgsize_migration.c index e427950a6d6d..305036b3d45a 100644 --- a/mm/pgsize_migration.c +++ b/mm/pgsize_migration.c @@ -114,6 +114,7 @@ void vma_set_pad_pages(struct vm_area_struct *vma, if (!is_pgsize_migration_enabled()) return; + vma->vm_flags &= ~VM_PAD_MASK; vma->vm_flags |= (nr_pages << VM_PAD_SHIFT); } @@ -269,10 +270,10 @@ struct vm_area_struct *get_pad_vma(struct vm_area_struct *vma) pad->vm_start = VMA_PAD_START(pad); /* Make the pad vma PROT_NONE */ - pad->vm_flags = pad->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); + pad->vm_flags &= ~(VM_READ|VM_WRITE|VM_EXEC); /* Remove padding bits */ - pad->vm_flags = pad->vm_flags & ~VM_PAD_MASK; + pad->vm_flags &= ~VM_PAD_MASK; return pad; } @@ -325,5 +326,69 @@ void show_map_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *pad, kfree(pad); kfree(vma); } + +/* + * When splitting a padding VMA there are a couple of cases to handle. + * + * Given: + * + * | DDDDPPPP | + * + * where: + * - D represents 1 page of data; + * - P represents 1 page of padding; + * - | represents the boundaries (start/end) of the VMA + * + * + * 1) Split exactly at the padding boundary + * + * | DDDDPPPP | --> | DDDD | PPPP | + * + * - Remove padding flags from the first VMA. + * - The second VMA is all padding + * + * 2) Split within the padding area + * + * | DDDDPPPP | --> | DDDDPP | PP | + * + * - Subtract the length of the second VMA from the first VMA's padding. + * - The second VMA is all padding, adjust its padding length (flags) + * + * 3) Split within the data area + * + * | DDDDPPPP | --> | DD | DDPPPP | + * + * - Remove padding flags from the first VMA. + * - The second VMA is has the same padding as from before the split. + */ +void split_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *new, + unsigned long addr, int new_below) +{ + unsigned long nr_pad_pages = vma_pad_pages(vma); + unsigned long nr_vma2_pages; + struct vm_area_struct *first; + struct vm_area_struct *second; + + if (!nr_pad_pages) + return; + + if (new_below) { + first = new; + second = vma; + } else { + first = vma; + second = new; + } + + nr_vma2_pages = vma_pages(second); + + if (nr_vma2_pages >= nr_pad_pages) { /* Case 1 & 3 */ + first->vm_flags &= ~VM_PAD_MASK; + vma_set_pad_pages(second, nr_pad_pages); + } else { /* Case 2 */ + vma_set_pad_pages(first, nr_pad_pages - nr_vma2_pages); + vma_set_pad_pages(second, nr_vma2_pages); + } +} #endif /* PAGE_SIZE == SZ_4K */ #endif /* CONFIG_64BIT */ From f952d4f3c8ef60c93566792929fc1f598af30a8a Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 30 Apr 2024 13:42:47 -0700 Subject: [PATCH 13/13] ANDROID: 16K: Fix show maps CFI failure If the kernel is built CONFIG_CFI_CLANG=y, reading smaps may cause a panic. This is due to a failed CFI check; which is triggered becuase the signature of the function pointer for printing smaps padding VMAs does not match exactly with that for show_smap(). Fix this by casting the function pointer to the expected type based on whether printing maps or smaps padding. 
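To make the failure mode concrete, here is a small standalone userspace sketch (not kernel code; the struct and typedef names are stand-ins, not the kernel's) of why an indirect call through a mismatched function-pointer type trips Clang CFI, and why casting to the exact expected type at the call site avoids it:

/* Illustration only: Clang CFI (-fsanitize=cfi-icall, needs -flto) rejects
 * indirect calls whose call-site pointer type does not exactly match the
 * callee's type.
 */
struct seq_file;
struct vm_area_struct;

static void show_vma(struct seq_file *m, struct vm_area_struct *vma)
{
    (void)m;
    (void)vma;
}

typedef void (*show_fn_exact)(struct seq_file *m, struct vm_area_struct *vma);
typedef void (*show_fn_generic)(struct seq_file *m, void *v);

int main(void)
{
    struct vm_area_struct *vma = 0;
    show_fn_exact good = show_vma;

    good(0, vma);   /* exact type match: allowed under CFI */

    /*
     * show_fn_generic bad = (show_fn_generic)show_vma;
     * bad(0, vma);  -- signature mismatch: CFI aborts this indirect call at
     * runtime, which is the smaps panic described above.
     */
    return 0;
}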
Bug: 330117029 Bug: 327600007 Bug: 330767927 Bug: 328266487 Bug: 329803029 Change-Id: I65564a547dacbc4131f8557344c8c96e51f90cd5 Signed-off-by: Kalesh Singh --- fs/proc/task_mmu.c | 4 ++-- include/linux/pgsize_migration.h | 6 ++---- mm/pgsize_migration.c | 10 ++++++++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4843a0473487..1afd67def3c1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -426,7 +426,7 @@ static int show_map(struct seq_file *m, void *v) if (vma_pages(vma)) show_map_vma(m, vma); - show_map_pad_vma(vma, pad_vma, m, show_map_vma); + show_map_pad_vma(vma, pad_vma, m, show_map_vma, false); m_cache_vma(m, v); return 0; @@ -924,7 +924,7 @@ static int show_smap(struct seq_file *m, void *v) if (vma_pages(vma)) show_smap_vma(m, vma); - show_map_pad_vma(vma, pad_vma, m, (show_pad_vma_fn)show_smap_vma); + show_map_pad_vma(vma, pad_vma, m, show_smap_vma, true); m_cache_vma(m, v); return 0; diff --git a/include/linux/pgsize_migration.h b/include/linux/pgsize_migration.h index 5c47ec28ea7d..fbfb1b9b9196 100644 --- a/include/linux/pgsize_migration.h +++ b/include/linux/pgsize_migration.h @@ -43,8 +43,6 @@ #define VM_PAD_MASK (VM_TOTAL_PAD_PAGES << VM_PAD_SHIFT) #define VMA_PAD_START(vma) (vma->vm_end - (vma_pad_pages(vma) << PAGE_SHIFT)) -typedef void (*show_pad_vma_fn)(struct seq_file *m, struct vm_area_struct *vma); - #if PAGE_SIZE == SZ_4K && defined(CONFIG_64BIT) extern void vma_set_pad_pages(struct vm_area_struct *vma, unsigned long nr_pages); @@ -60,7 +58,7 @@ extern struct vm_area_struct *get_data_vma(struct vm_area_struct *vma); extern void show_map_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *pad, - struct seq_file *m, show_pad_vma_fn func); + struct seq_file *m, void *func, bool smaps); extern void split_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *new, unsigned long addr, int new_below); @@ -92,7 +90,7 @@ static inline struct vm_area_struct *get_data_vma(struct vm_area_struct *vma) static inline void show_map_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *pad, - struct seq_file *m, show_pad_vma_fn func) + struct seq_file *m, void *func, bool smaps) { } diff --git a/mm/pgsize_migration.c b/mm/pgsize_migration.c index 305036b3d45a..9efadd1412a3 100644 --- a/mm/pgsize_migration.c +++ b/mm/pgsize_migration.c @@ -21,6 +21,9 @@ #include #include +typedef void (*show_pad_maps_fn) (struct seq_file *m, struct vm_area_struct *vma); +typedef void (*show_pad_smaps_fn) (struct seq_file *m, void *v); + #ifdef CONFIG_64BIT #if PAGE_SIZE == SZ_4K DEFINE_STATIC_KEY_TRUE(pgsize_migration_enabled); @@ -304,7 +307,7 @@ struct vm_area_struct *get_data_vma(struct vm_area_struct *vma) * and @pad. */ void show_map_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *pad, - struct seq_file *m, show_pad_vma_fn func) + struct seq_file *m, void *func, bool smaps) { if (!pad) return; @@ -321,7 +324,10 @@ void show_map_pad_vma(struct vm_area_struct *vma, struct vm_area_struct *pad, */ BUG_ON(!vma); - func(m, pad); + if (smaps) + ((show_pad_smaps_fn)func)(m, pad); + else + ((show_pad_maps_fn)func)(m, pad); kfree(pad); kfree(vma);
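For reference, the vm_flags padding encoding that the series relies on (patch 06) can be modeled in a standalone program; the constants below mirror VM_PAD_WIDTH, VM_PAD_SHIFT and VM_TOTAL_PAD_PAGES from pgsize_migration.h, and this is a sketch of the representation only, not kernel code:

#include <stdio.h>

#define BITS_PER_LONG      64  /* CONFIG_64BIT */
#define VM_PAD_WIDTH       4
#define VM_PAD_SHIFT       (BITS_PER_LONG - VM_PAD_WIDTH)
#define VM_TOTAL_PAD_PAGES ((1ULL << VM_PAD_WIDTH) - 1)
#define VM_PAD_MASK        (VM_TOTAL_PAD_PAGES << VM_PAD_SHIFT)

int main(void)
{
    unsigned long long flags = 0x875ULL;    /* arbitrary "real" flag bits */
    unsigned long long nr_pad_pages = 3;    /* e.g. 12kB of padding */

    /* Encode: clear any previous value, then store the page count. */
    flags &= ~VM_PAD_MASK;
    flags |= nr_pad_pages << VM_PAD_SHIFT;

    /* Decode, as vma_pad_pages() does. */
    printf("pad pages: %llu\n", flags >> VM_PAD_SHIFT);
    return 0;
}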