diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index ddcd621a49ad..e2d76a21d47e 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -39,6 +39,7 @@ Currently, these files are in /proc/sys/vm: - extfrag_threshold - extra_free_kbytes - hugetlb_shm_group +- kswapd_threads - laptop_mode - legacy_va_layout - lowmem_reserve_ratio @@ -310,6 +311,25 @@ hugetlb_shm_group hugetlb_shm_group contains group id that is allowed to create SysV shared memory segment using hugetlb page. +kswapd_threads +============== +kswapd_threads allows you to control the number of kswapd threads per node +running on the system. This provides the ability to devote additional CPU +resources toward proactive page replacement with the goal of reducing +direct reclaims. When direct reclaims are prevented, the CPU consumed +by them is prevented as well. Depending on the workload, the result can +cause aggregate CPU usage on the system to go up, down or stay the same. + +More aggressive page replacement can reduce direct reclaims which cause +latency for tasks and decrease throughput when doing filesystem IO through +the pagecache. Direct reclaims are recorded using the allocstall counter +in /proc/vmstat. + +The default value is 1 and the range of acceptable values is 1-16. +Always start with lower values in the 2-6 range. Higher values should +be justified with testing. If direct reclaims occur in spite of high +values, the cost of direct reclaims (in latency) that occur can be +higher due to increased lock contention. 
laptop_mode =========== diff --git a/include/linux/mm.h b/include/linux/mm.h index 67e18454cb14..1b77773f1f39 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2208,6 +2208,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context, struct vmem_altmap *); extern void setup_per_zone_wmarks(void); +extern void update_kswapd_threads(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); @@ -2228,6 +2229,7 @@ extern void zone_pcp_update(struct zone *zone); extern void zone_pcp_reset(struct zone *zone); /* page_alloc.c */ +extern int kswapd_threads; extern int min_free_kbytes; extern int watermark_boost_factor; extern int watermark_scale_factor; @@ -2901,5 +2903,12 @@ static inline int pages_identical(struct page *page1, struct page *page2) extern int want_old_faultaround_pte; +#ifndef CONFIG_MULTIPLE_KSWAPD +static inline void update_kswapd_threads_node(int nid) {} +static inline int multi_kswapd_run(int nid) { return 0; } +static inline void multi_kswapd_stop(int nid) {} +static inline void multi_kswapd_cpu_online(pg_data_t *pgdat, + const struct cpumask *mask) {} +#endif /* CONFIG_MULTIPLE_KSWAPD */ #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 560df1ae3a3e..c42a4d5abb63 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -39,6 +39,8 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 +#define MAX_KSWAPD_THREADS 16 + enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, @@ -750,8 +752,13 @@ typedef struct pglist_data { int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; - struct task_struct *kswapd; /* Protected by - mem_hotplug_begin/end() */ + struct task_struct *kswapd; +#ifdef CONFIG_MULTIPLE_KSWAPD + /* + * Protected by mem_hotplug_begin/end() + */ + struct task_struct 
*mkswapd[MAX_KSWAPD_THREADS]; +#endif int kswapd_order; enum zone_type kswapd_classzone_idx; @@ -964,6 +971,9 @@ static inline int is_highmem(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; +int kswapd_threads_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *pos); int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int watermark_boost_factor_sysctl_handler(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0eab7139acb9..4b6865ff4629 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -143,6 +143,8 @@ static int six_hundred_forty_kb = 640 * 1024; static unsigned int __maybe_unused half_million = 500000; static unsigned int __maybe_unused one_hundred_million = 100000000; static unsigned int __maybe_unused one_million = 1000000; +static int __maybe_unused max_kswapd_threads = MAX_KSWAPD_THREADS; + #ifdef CONFIG_SCHED_WALT static int neg_three = -3; static int three = 3; @@ -1832,6 +1834,17 @@ static struct ctl_table vm_table[] = { .proc_handler = watermark_boost_factor_sysctl_handler, .extra1 = SYSCTL_ZERO, }, +#ifdef CONFIG_MULTIPLE_KSWAPD + { + .procname = "kswapd_threads", + .data = &kswapd_threads, + .maxlen = sizeof(kswapd_threads), + .mode = 0644, + .proc_handler = kswapd_threads_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = &max_kswapd_threads, + }, +#endif { .procname = "watermark_scale_factor", .data = &watermark_scale_factor, diff --git a/mm/Kconfig b/mm/Kconfig index 35ecdc808844..ddbbe0644d93 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -846,3 +846,16 @@ config PROCESS_RECLAIM (echo all > /proc/PID/reclaim) reclaims all pages. Any other value is ignored. + +config MULTIPLE_KSWAPD + bool "Spawn multiple kswapd threads" + depends on QGKI + default y + help + kswapd_threads allows you to control the number of kswapd threads + per node running on the system. 
The default value is 1 and the + range of acceptable values is 1-16. The number of threads can + be controlled by the command below: + (echo N > /proc/sys/vm/kswapd_threads) + + Values not in the range of 1..16 are ignored. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 021726afd3c5..1419b070215d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8051,6 +8051,22 @@ int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write, return 0; } +#ifdef CONFIG_MULTIPLE_KSWAPD +int kswapd_threads_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) + update_kswapd_threads(); + + return 0; +} +#endif int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 258f345c03c5..2b9a58ad27a1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -133,6 +133,13 @@ struct scan_control { struct reclaim_state reclaim_state; }; +/* + * Number of active kswapd threads + */ +#define DEF_KSWAPD_THREADS_PER_NODE 1 +int kswapd_threads = DEF_KSWAPD_THREADS_PER_NODE; +int kswapd_threads_current = DEF_KSWAPD_THREADS_PER_NODE; + #ifdef ARCH_HAS_PREFETCH #define prefetch_prev_lru_page(_page, _base, _field) \ do { \ @@ -4103,6 +4110,116 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) } #endif /* CONFIG_HIBERNATION */ +#ifdef CONFIG_MULTIPLE_KSWAPD +static void update_kswapd_threads_node(int nid) +{ + pg_data_t *pgdat; + int drop, increase; + int last_idx, start_idx, hid; + int nr_threads = kswapd_threads_current; + + pgdat = NODE_DATA(nid); + last_idx = nr_threads - 1; + if (kswapd_threads < nr_threads) { + drop = nr_threads - kswapd_threads; + for (hid = last_idx; hid > (last_idx - drop); hid--) { + if (pgdat->mkswapd[hid]) { + kthread_stop(pgdat->mkswapd[hid]); + pgdat->mkswapd[hid] = 
NULL; + } + } + } else { + increase = kswapd_threads - nr_threads; + start_idx = last_idx + 1; + for (hid = start_idx; hid < (start_idx + increase); hid++) { + pgdat->mkswapd[hid] = kthread_run(kswapd, pgdat, + "kswapd%d:%d", nid, hid); + if (IS_ERR(pgdat->mkswapd[hid])) { + pr_err("Failed to start kswapd%d on node %d\n", + hid, nid); + pgdat->mkswapd[hid] = NULL; + /* + * We are out of resources. Do not start any + * more threads. + */ + break; + } + } + } +} + +void update_kswapd_threads(void) +{ + int nid; + + if (kswapd_threads_current == kswapd_threads) + return; + + /* + * Hold the memory hotplug lock to avoid racing with memory + * hotplug initiated updates + */ + mem_hotplug_begin(); + for_each_node_state(nid, N_MEMORY) + update_kswapd_threads_node(nid); + + pr_info("kswapd_thread count changed, old:%d new:%d\n", + kswapd_threads_current, kswapd_threads); + kswapd_threads_current = kswapd_threads; + mem_hotplug_done(); +} + +static int multi_kswapd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int hid, nr_threads = kswapd_threads; + int ret = 0; + + pgdat->mkswapd[0] = pgdat->kswapd; + for (hid = 1; hid < nr_threads; ++hid) { + pgdat->mkswapd[hid] = kthread_run(kswapd, pgdat, "kswapd%d:%d", + nid, hid); + if (IS_ERR(pgdat->mkswapd[hid])) { + /* failure at boot is fatal */ + WARN_ON(system_state < SYSTEM_RUNNING); + pr_err("Failed to start kswapd%d on node %d\n", + hid, nid); + ret = PTR_ERR(pgdat->mkswapd[hid]); + pgdat->mkswapd[hid] = NULL; + } + } + kswapd_threads_current = nr_threads; + + return ret; +} + +static void multi_kswapd_stop(int nid) +{ + int hid = 0; + int nr_threads = kswapd_threads_current; + struct task_struct *kswapd; + + NODE_DATA(nid)->mkswapd[hid] = NULL; + for (hid = 1; hid < nr_threads; hid++) { + kswapd = NODE_DATA(nid)->mkswapd[hid]; + if (kswapd) { + kthread_stop(kswapd); + NODE_DATA(nid)->mkswapd[hid] = NULL; + } + } +} + +static void multi_kswapd_cpu_online(pg_data_t *pgdat, + const struct cpumask *mask) +{ + int hid; + 
int nr_threads = kswapd_threads_current; + + for (hid = 1; hid < nr_threads; hid++) + if (pgdat->mkswapd[hid]) set_cpus_allowed_ptr(pgdat->mkswapd[hid], mask); +} +#endif + /* It's optimal to keep kswapds on the same CPUs as their memory, but not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, @@ -4117,9 +4234,11 @@ static int kswapd_cpu_online(unsigned int cpu) mask = cpumask_of_node(pgdat->node_id); - if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) { /* One of our CPUs online: restore mask */ set_cpus_allowed_ptr(pgdat->kswapd, mask); + multi_kswapd_cpu_online(pgdat, mask); + } } return 0; } @@ -4136,14 +4255,17 @@ int kswapd_run(int nid) if (pgdat->kswapd) return 0; - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d:0", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state < SYSTEM_RUNNING); pr_err("Failed to start kswapd on node %d\n", nid); ret = PTR_ERR(pgdat->kswapd); pgdat->kswapd = NULL; + return ret; } + ret = multi_kswapd_run(nid); + return ret; } @@ -4159,6 +4281,8 @@ void kswapd_stop(int nid) kthread_stop(kswapd); NODE_DATA(nid)->kswapd = NULL; } + + multi_kswapd_stop(nid); } static int __init kswapd_init(void)