9c70abfc5e
https://source.android.com/docs/security/bulletin/2022-11-01

* tag 'ASB-2022-11-01_11-5.4' of https://android.googlesource.com/kernel/common:
  UPSTREAM: mm/mremap: hold the rmap lock in write mode when moving page table entries.
  FROMLIST: binder: fix UAF of alloc->vma in race with munmap()
  UPSTREAM: mm: Fix TLB flush for not-first PFNMAP mappings in unmap_region()
  UPSTREAM: mm: Force TLB flush for PFNMAP mappings before unlink_file_vma()
  UPSTREAM: af_key: Do not call xfrm_probe_algs in parallel
  UPSTREAM: wifi: cfg80211: fix u8 overflow in cfg80211_update_notlisted_nontrans()
  UPSTREAM: wifi: cfg80211/mac80211: reject bad MBSSID elements
  UPSTREAM: wifi: cfg80211: ensure length byte is present before access
  UPSTREAM: wifi: cfg80211: fix BSS refcounting bugs
  UPSTREAM: wifi: cfg80211: avoid nontransmitted BSS list corruption
  UPSTREAM: wifi: mac80211_hwsim: avoid mac80211 warning on bad rate
  UPSTREAM: wifi: cfg80211: update hidden BSSes to avoid WARN_ON
  UPSTREAM: mac80211: mlme: find auth challenge directly
  UPSTREAM: wifi: mac80211: don't parse mbssid in assoc response
  UPSTREAM: wifi: mac80211: fix MBSSID parsing use-after-free
  ANDROID: Drop explicit 'CONFIG_INIT_STACK_ALL_ZERO=y' from gki_defconfig
  UPSTREAM: hardening: Remove Clang's enable flag for -ftrivial-auto-var-init=zero
  UPSTREAM: hardening: Avoid harmless Clang option under CONFIG_INIT_STACK_ALL_ZERO
  UPSTREAM: hardening: Clarify Kconfig text for auto-var-init
  ANDROID: GKI: Update FCNT KMI symbol list
  ANDROID: Fix kenelci build-break for !CONFIG_PERF_EVENTS
  BACKPORT: HID: steam: Prevent NULL pointer dereference in steam_{recv,send}_report
  ANDROID: ABI: Update allowed list for QCOM
  UPSTREAM: wifi: mac80211_hwsim: use 32-bit skb cookie
  UPSTREAM: wifi: mac80211_hwsim: add back erroneously removed cast
  UPSTREAM: wifi: mac80211_hwsim: fix race condition in pending packet
  ANDROID: incfs: Add check for ATTR_KILL_SUID and ATTR_MODE in incfs_setattr
  Linux 5.4.210
  x86/speculation: Add LFENCE to RSB fill sequence
  x86/speculation: Add RSB VM Exit protections
  macintosh/adb: fix oob read in do_adb_query() function
  media: v4l2-mem2mem: Apply DST_QUEUE_OFF_BASE on MMAP buffers across ioctls
  selftests: KVM: Handle compiler optimizations in ucall
  KVM: Don't null dereference ops->destroy
  selftests/bpf: Fix "dubious pointer arithmetic" test
  selftests/bpf: Fix test_align verifier log patterns
  bpf: Test_verifier, #70 error message updates for 32-bit right shift
  selftests/bpf: Extend verifier and bpf_sock tests for dst_port loads
  bpf: Verifer, adjust_scalar_min_max_vals to always call update_reg_bounds()
  ACPI: APEI: Better fix to avoid spamming the console with old error logs
  ACPI: video: Shortening quirk list by identifying Clevo by board_name only
  ACPI: video: Force backlight native for some TongFang devices
  thermal: Fix NULL pointer dereferences in of_thermal_ functions
  ANDROID: GKI: db845c: Update symbols list and ABI
  Linux 5.4.209
  scsi: core: Fix race between handling STS_RESOURCE and completion
  mt7601u: add USB device ID for some versions of XiaoDu WiFi Dongle.
  ARM: crypto: comment out gcc warning that breaks clang builds
  sctp: leave the err path free in sctp_stream_init to sctp_stream_free
  sfc: disable softirqs for ptp TX
  perf symbol: Correct address for bss symbols
  virtio-net: fix the race between refill work and close
  netfilter: nf_queue: do not allow packet truncation below transport header offset
  sctp: fix sleep in atomic context bug in timer handlers
  i40e: Fix interface init with MSI interrupts (no MSI-X)
  tcp: Fix a data-race around sysctl_tcp_comp_sack_nr.
  tcp: Fix a data-race around sysctl_tcp_comp_sack_delay_ns.
  Documentation: fix sctp_wmem in ip-sysctl.rst
  tcp: Fix a data-race around sysctl_tcp_invalid_ratelimit.
  tcp: Fix a data-race around sysctl_tcp_autocorking.
  tcp: Fix a data-race around sysctl_tcp_min_rtt_wlen.
  tcp: Fix a data-race around sysctl_tcp_min_tso_segs.
  net: sungem_phy: Add of_node_put() for reference returned by of_get_parent()
  igmp: Fix data-races around sysctl_igmp_qrv.
  ipv6/addrconf: fix a null-ptr-deref bug for ip6_ptr
  net: ping6: Fix memleak in ipv6_renew_options().
  tcp: Fix a data-race around sysctl_tcp_challenge_ack_limit.
  tcp: Fix a data-race around sysctl_tcp_limit_output_bytes.
  scsi: ufs: host: Hold reference returned by of_parse_phandle()
  ice: do not setup vlan for loopback VSI
  ice: check (DD | EOF) bits on Rx descriptor rather than (EOP | RS)
  tcp: Fix a data-race around sysctl_tcp_nometrics_save.
  tcp: Fix a data-race around sysctl_tcp_frto.
  tcp: Fix a data-race around sysctl_tcp_adv_win_scale.
  tcp: Fix a data-race around sysctl_tcp_app_win.
  tcp: Fix data-races around sysctl_tcp_dsack.
  s390/archrandom: prevent CPACF trng invocations in interrupt context
  ntfs: fix use-after-free in ntfs_ucsncmp()
  Bluetooth: L2CAP: Fix use-after-free caused by l2cap_chan_put
  ANDROID: restore some removed refcount functions
  ANDROID: add tty_schedule_flip() back to the kernel
  Linux 5.4.208
  x86: drop bogus "cc" clobber from __try_cmpxchg_user_asm()
  net: usb: ax88179_178a needs FLAG_SEND_ZLP
  tty: use new tty_insert_flip_string_and_push_buffer() in pty_write()
  tty: extract tty_flip_buffer_commit() from tty_flip_buffer_push()
  tty: drop tty_schedule_flip()
  tty: the rest, stop using tty_schedule_flip()
  tty: drivers/tty/, stop using tty_schedule_flip()
  Bluetooth: Fix bt_skb_sendmmsg not allocating partial chunks
  Bluetooth: SCO: Fix sco_send_frame returning skb->len
  Bluetooth: Fix passing NULL to PTR_ERR
  Bluetooth: RFCOMM: Replace use of memcpy_from_msg with bt_skb_sendmmsg
  Bluetooth: SCO: Replace use of memcpy_from_msg with bt_skb_sendmsg
  Bluetooth: Add bt_skb_sendmmsg helper
  Bluetooth: Add bt_skb_sendmsg helper
  ALSA: memalloc: Align buffer allocations in page size
  bitfield.h: Fix "type of reg too small for mask" test
  x86/mce: Deduplicate exception handling
  mmap locking API: initial implementation as rwsem wrappers
  x86/uaccess: Implement macros for CMPXCHG on user addresses
  x86: get rid of small constant size cases in raw_copy_{to,from}_user()
  locking/refcount: Consolidate implementations of refcount_t
  locking/refcount: Consolidate REFCOUNT_{MAX,SATURATED} definitions
  locking/refcount: Move saturation warnings out of line
  locking/refcount: Improve performance of generic REFCOUNT_FULL code
  locking/refcount: Move the bulk of the REFCOUNT_FULL implementation into the <linux/refcount.h> header
  locking/refcount: Remove unused refcount_*_checked() variants
  locking/refcount: Ensure integer operands are treated as signed
  locking/refcount: Define constants for saturation and max refcount values
  ima: remove the IMA_TEMPLATE Kconfig option
  dlm: fix pending remove if msg allocation fails
  bpf: Make sure mac_header was set before using it
  mm/mempolicy: fix uninit-value in mpol_rebind_policy()
  spi: bcm2835: bcm2835_spi_handle_err(): fix NULL pointer deref for non DMA transfers
  tcp: Fix data-races around sysctl_tcp_max_reordering.
  tcp: Fix a data-race around sysctl_tcp_rfc1337.
  tcp: Fix a data-race around sysctl_tcp_stdurg.
  tcp: Fix a data-race around sysctl_tcp_retrans_collapse.
  tcp: Fix data-races around sysctl_tcp_slow_start_after_idle.
  tcp: Fix a data-race around sysctl_tcp_thin_linear_timeouts.
  tcp: Fix data-races around sysctl_tcp_recovery.
  tcp: Fix a data-race around sysctl_tcp_early_retrans.
  tcp: Fix data-races around sysctl knobs related to SYN option.
  udp: Fix a data-race around sysctl_udp_l3mdev_accept.
  ipv4: Fix a data-race around sysctl_fib_multipath_use_neigh.
  be2net: Fix buffer overflow in be_get_module_eeprom
  gpio: pca953x: only use single read/write for No AI mode
  ixgbe: Add locking to prevent panic when setting sriov_numvfs to zero
  i40e: Fix erroneous adapter reinitialization during recovery process
  iavf: Fix handling of dummy receive descriptors
  tcp: Fix data-races around sysctl_tcp_fastopen.
  tcp: Fix data-races around sysctl_max_syn_backlog.
  tcp: Fix a data-race around sysctl_tcp_tw_reuse.
  tcp: Fix a data-race around sysctl_tcp_notsent_lowat.
  tcp: Fix data-races around some timeout sysctl knobs.
  tcp: Fix data-races around sysctl_tcp_reordering.
  tcp: Fix data-races around sysctl_tcp_syncookies.
  igmp: Fix a data-race around sysctl_igmp_max_memberships.
  igmp: Fix data-races around sysctl_igmp_llm_reports.
  net/tls: Fix race in TLS device down flow
  net: stmmac: fix dma queue left shift overflow issue
  i2c: cadence: Change large transfer count reset logic to be unconditional
  tcp: Fix a data-race around sysctl_tcp_probe_interval.
  tcp: Fix a data-race around sysctl_tcp_probe_threshold.
  tcp: Fix a data-race around sysctl_tcp_mtu_probe_floor.
  tcp: Fix data-races around sysctl_tcp_min_snd_mss.
  tcp: Fix data-races around sysctl_tcp_base_mss.
  tcp: Fix data-races around sysctl_tcp_mtu_probing.
  tcp/dccp: Fix a data-race around sysctl_tcp_fwmark_accept.
  ip: Fix a data-race around sysctl_fwmark_reflect.
  ip: Fix data-races around sysctl_ip_nonlocal_bind.
  ip: Fix data-races around sysctl_ip_fwd_use_pmtu.
  ip: Fix data-races around sysctl_ip_no_pmtu_disc.
  igc: Reinstate IGC_REMOVED logic and implement it properly
  perf/core: Fix data race between perf_event_set_output() and perf_mmap_close()
  pinctrl: ralink: Check for null return of devm_kcalloc
  power/reset: arm-versatile: Fix refcount leak in versatile_reboot_probe
  xfrm: xfrm_policy: fix a possible double xfrm_pols_put() in xfrm_bundle_lookup()
  serial: mvebu-uart: correctly report configured baudrate value
  PCI: hv: Fix interrupt mapping for multi-MSI
  PCI: hv: Reuse existing IRTE allocation in compose_msi_msg()
  PCI: hv: Fix hv_arch_irq_unmask() for multi-MSI
  PCI: hv: Fix multi-MSI to allow more than one MSI vector
  xen/gntdev: Ignore failure to unmap INVALID_GRANT_HANDLE
  lockdown: Fix kexec lockdown bypass with ima policy
  mlxsw: spectrum_router: Fix IPv4 nexthop gateway indication
  riscv: add as-options for modules with assembly compontents
  pinctrl: stm32: fix optional IRQ support to gpios
  Revert "cgroup: Use separate src/dst nodes when preloading css_sets for migration"
  Linux 5.4.207
  can: m_can: m_can_tx_handler(): fix use after free of skb
  serial: pl011: UPSTAT_AUTORTS requires .throttle/unthrottle
  serial: stm32: Clear prev values before setting RTS delays
  serial: 8250: fix return error code in serial8250_request_std_resource()
  tty: serial: samsung_tty: set dma burst_size to 1
  usb: dwc3: gadget: Fix event pending check
  usb: typec: add missing uevent when partner support PD
  USB: serial: ftdi_sio: add Belimo device ids
  signal handling: don't use BUG_ON() for debugging
  ARM: dts: stm32: use the correct clock source for CEC on stm32mp151
  soc: ixp4xx/npe: Fix unused match warning
  x86: Clear .brk area at early boot
  irqchip: or1k-pic: Undefine mask_ack for level triggered hardware
  ASoC: madera: Fix event generation for rate controls
  ASoC: madera: Fix event generation for OUT1 demux
  ASoC: cs47l15: Fix event generation for low power mux control
  ASoC: wm5110: Fix DRE control
  ASoC: ops: Fix off by one in range control validation
  net: sfp: fix memory leak in sfp_probe()
  nvme: fix regression when disconnect a recovering ctrl
  NFC: nxp-nci: don't print header length mismatch on i2c error
  net: tipc: fix possible refcount leak in tipc_sk_create()
  platform/x86: hp-wmi: Ignore Sanitization Mode event
  cpufreq: pmac32-cpufreq: Fix refcount leak bug
  netfilter: br_netfilter: do not skip all hooks with 0 priority
  virtio_mmio: Restore guest page size on resume
  virtio_mmio: Add missing PM calls to freeze/restore
  mm: sysctl: fix missing numa_stat when !CONFIG_HUGETLB_PAGE
  sfc: fix kernel panic when creating VF
  seg6: bpf: fix skb checksum in bpf_push_seg6_encap()
  seg6: fix skb checksum in SRv6 End.B6 and End.B6.Encaps behaviors
  seg6: fix skb checksum evaluation in SRH encapsulation/insertion
  sfc: fix use after free when disabling sriov
  net: ftgmac100: Hold reference returned by of_get_child_by_name()
  ipv4: Fix data-races around sysctl_ip_dynaddr.
  raw: Fix a data-race around sysctl_raw_l3mdev_accept.
  icmp: Fix a data-race around sysctl_icmp_ratemask.
  icmp: Fix a data-race around sysctl_icmp_ratelimit.
  drm/i915/gt: Serialize TLB invalidates with GT resets
  ARM: dts: sunxi: Fix SPI NOR campatible on Orange Pi Zero
  ARM: dts: at91: sama5d2: Fix typo in i2s1 node
  ipv4: Fix a data-race around sysctl_fib_sync_mem.
  icmp: Fix data-races around sysctl.
  cipso: Fix data-races around sysctl.
  net: Fix data-races around sysctl_mem.
  inetpeer: Fix data-races around sysctl.
  net: stmmac: dwc-qos: Disable split header for Tegra194
  ASoC: sgtl5000: Fix noise on shutdown/remove
  ima: Fix a potential integer overflow in ima_appraise_measurement
  drm/i915: fix a possible refcount leak in intel_dp_add_mst_connector()
  ARM: 9210/1: Mark the FDT_FIXED sections as shareable
  ARM: 9209/1: Spectre-BHB: avoid pr_info() every time a CPU comes out of idle
  ARM: dts: imx6qdl-ts7970: Fix ngpio typo and count
  ext4: fix race condition between ext4_write and ext4_convert_inline_data
  sched/rt: Disable RT_RUNTIME_SHARE by default
  Revert "evm: Fix memleak in init_desc"
  nilfs2: fix incorrect masking of permission flags for symlinks
  drm/panfrost: Fix shrinker list corruption by madvise IOCTL
  cgroup: Use separate src/dst nodes when preloading css_sets for migration
  wifi: mac80211: fix queue selection for mesh/OCB interfaces
  ARM: 9214/1: alignment: advance IT state after emulating Thumb instruction
  ARM: 9213/1: Print message about disabled Spectre workarounds only once
  ip: fix dflt addr selection for connected nexthop
  net: sock: tracing: Fix sock_exceed_buf_limit not to dereference stale pointer
  tracing/histograms: Fix memory leak problem
  xen/netback: avoid entering xenvif_rx_next_skb() with an empty rx queue
  ALSA: hda/realtek - Enable the headset-mic on a Xiaomi's laptop
  ALSA: hda/realtek - Fix headset mic problem for a HP machine with alc221
  ALSA: hda/realtek - Fix headset mic problem for a HP machine with alc671
  ALSA: hda/conexant: Apply quirk for another HP ProDesk 600 G3 model
  ALSA: hda - Add fixup for Dell Latitidue E5430
  Linux 5.4.206
  Revert "mtd: rawnand: gpmi: Fix setting busy timeout setting"
  Linux 5.4.205
  dmaengine: ti: Add missing put_device in ti_dra7_xbar_route_allocate
  dmaengine: ti: Fix refcount leak in ti_dra7_xbar_route_allocate
  dmaengine: at_xdma: handle errors of at_xdmac_alloc_desc() correctly
  dmaengine: pl330: Fix lockdep warning about non-static key
  ida: don't use BUG_ON() for debugging
  dt-bindings: dma: allwinner,sun50i-a64-dma: Fix min/max typo
  misc: rtsx_usb: set return value in rsp_buf alloc err path
  misc: rtsx_usb: use separate command and response buffers
  misc: rtsx_usb: fix use of dma mapped buffer for usb bulk transfer
  dmaengine: imx-sdma: Allow imx8m for imx7 FW revs
  i2c: cadence: Unregister the clk notifier in error path
  selftests: forwarding: fix error message in learning_test
  selftests: forwarding: fix learning_test when h1 supports IFF_UNICAST_FLT
  selftests: forwarding: fix flood_unicast_test when h2 supports IFF_UNICAST_FLT
  ibmvnic: Properly dispose of all skbs during a failover.
  ARM: at91: pm: use proper compatibles for sam9x60's rtc and rtt
  ARM: at91: pm: use proper compatible for sama5d2's rtc
  pinctrl: sunxi: sunxi_pconf_set: use correct offset
  pinctrl: sunxi: a83t: Fix NAND function name for some pins
  ARM: meson: Fix refcount leak in meson_smp_prepare_cpus
  xfs: remove incorrect ASSERT in xfs_rename
  can: kvaser_usb: kvaser_usb_leaf: fix bittiming limits
  can: kvaser_usb: kvaser_usb_leaf: fix CAN clock frequency regression
  can: kvaser_usb: replace run-time checks with struct kvaser_usb_driver_info
  powerpc/powernv: delay rng platform device creation until later in boot
  video: of_display_timing.h: include errno.h
  fbcon: Prevent that screen size is smaller than font size
  fbcon: Disallow setting font bigger than screen size
  fbmem: Check virtual screen sizes in fb_set_var()
  fbdev: fbmem: Fix logo center image dx issue
  iommu/vt-d: Fix PCI bus rescan device hot add
  net: rose: fix UAF bug caused by rose_t0timer_expiry
  usbnet: fix memory leak in error case
  can: gs_usb: gs_usb_open/close(): fix memory leak
  can: grcan: grcan_probe(): remove extra of_node_get()
  can: bcm: use call_rcu() instead of costly synchronize_rcu()
  mm/slub: add missing TID updates on slab deactivation
  esp: limit skb_page_frag_refill use to a single page
  Linux 5.4.204
  clocksource/drivers/ixp4xx: remove EXPORT_SYMBOL_GPL from ixp4xx_timer_setup()
  net: usb: qmi_wwan: add Telit 0x1070 composition
  net: usb: qmi_wwan: add Telit 0x1060 composition
  xen/arm: Fix race in RB-tree based P2M accounting
  xen/blkfront: force data bouncing when backend is untrusted
  xen/netfront: force data bouncing when backend is untrusted
  xen/netfront: fix leaking data in shared pages
  xen/blkfront: fix leaking data in shared pages
  selftests/rseq: Change type of rseq_offset to ptrdiff_t
  selftests/rseq: x86-32: use %gs segment selector for accessing rseq thread area
  selftests/rseq: x86-64: use %fs segment selector for accessing rseq thread area
  selftests/rseq: Fix: work-around asm goto compiler bugs
  selftests/rseq: Remove arm/mips asm goto compiler work-around
  selftests/rseq: Fix warnings about #if checks of undefined tokens
  selftests/rseq: Fix ppc32 offsets by using long rather than off_t
  selftests/rseq: Fix ppc32 missing instruction selection "u" and "x" for load/store
  selftests/rseq: Fix ppc32: wrong rseq_cs 32-bit field pointer on big endian
  selftests/rseq: Uplift rseq selftests for compatibility with glibc-2.35
  selftests/rseq: Introduce thread pointer getters
  selftests/rseq: Introduce rseq_get_abi() helper
  selftests/rseq: Remove volatile from __rseq_abi
  selftests/rseq: Remove useless assignment to cpu variable
  selftests/rseq: introduce own copy of rseq uapi header
  selftests/rseq: remove ARRAY_SIZE define from individual tests
  rseq/selftests,x86_64: Add rseq_offset_deref_addv()
  ipv6/sit: fix ipip6_tunnel_get_prl return value
  sit: use min
  net: dsa: bcm_sf2: force pause link settings
  hwmon: (ibmaem) don't call platform_device_del() if platform_device_add() fails
  xen/gntdev: Avoid blocking in unmap_grant_pages()
  net: tun: avoid disabling NAPI twice
  NFC: nxp-nci: Don't issue a zero length i2c_master_read()
  nfc: nfcmrvl: Fix irq_of_parse_and_map() return value
  net: bonding: fix use-after-free after 802.3ad slave unbind
  net: bonding: fix possible NULL deref in rlb code
  net/sched: act_api: Notify user space if any actions were flushed before error
  netfilter: nft_dynset: restore set element counter when failing to update
  s390: remove unneeded 'select BUILD_BIN2C'
  PM / devfreq: exynos-ppmu: Fix refcount leak in of_get_devfreq_events
  caif_virtio: fix race between virtio_device_ready() and ndo_open()
  net: ipv6: unexport __init-annotated seg6_hmac_net_init()
  usbnet: fix memory allocation in helpers
  linux/dim: Fix divide by 0 in RDMA DIM
  RDMA/qedr: Fix reporting QP timeout attribute
  net: tun: stop NAPI when detaching queues
  net: tun: unlink NAPI from device on destruction
  selftests/net: pass ipv6_args to udpgso_bench's IPv6 TCP test
  virtio-net: fix race between ndo_open() and virtio_device_ready()
  net: usb: ax88179_178a: Fix packet receiving
  net: rose: fix UAF bugs caused by timer handler
  SUNRPC: Fix READ_PLUS crasher
  s390/archrandom: simplify back to earlier design and initialize earlier
  dm raid: fix KASAN warning in raid5_add_disks
  dm raid: fix accesses beyond end of raid member array
  powerpc/bpf: Fix use of user_pt_regs in uapi
  powerpc/prom_init: Fix kernel config grep
  nvdimm: Fix badblocks clear off-by-one error
  ipv6: take care of disable_policy when restoring routes
  Linux 5.4.203
  crypto: arm/ghash-ce - define fpu before fpu registers are referenced
  crypto: arm - use Kconfig based compiler checks for crypto opcodes
  ARM: 9029/1: Make iwmmxt.S support Clang's integrated assembler
  ARM: OMAP2+: drop unnecessary adrl
  ARM: 8929/1: use APSR_nzcv instead of r15 as mrc operand
  ARM: 8933/1: replace Sun/Solaris style flag on section directive
  crypto: arm/sha512-neon - avoid ADRL pseudo instruction
  crypto: arm/sha256-neon - avoid ADRL pseudo instruction
  ARM: 8971/1: replace the sole use of a symbol with its definition
  ARM: 8990/1: use VFP assembler mnemonics in register load/store macros
  ARM: 8989/1: use .fpu assembler directives instead of assembler arguments
  net: mscc: ocelot: allow unregistered IP multicast flooding
  kexec_file: drop weak attribute from arch_kexec_apply_relocations[_add]
  powerpc/ftrace: Remove ftrace init tramp once kernel init is complete
  drm: remove drm_fb_helper_modinit
  Linux 5.4.202
  powerpc/pseries: wire up rng during setup_arch()
  kbuild: link vmlinux only once for CONFIG_TRIM_UNUSED_KSYMS (2nd attempt)
  random: update comment from copy_to_user() -> copy_to_iter()
  modpost: fix section mismatch check for exported init/exit sections
  ARM: cns3xxx: Fix refcount leak in cns3xxx_init
  ARM: Fix refcount leak in axxia_boot_secondary
  soc: bcm: brcmstb: pm: pm-arm: Fix refcount leak in brcmstb_pm_probe
  ARM: exynos: Fix refcount leak in exynos_map_pmu
  ARM: dts: imx6qdl: correct PU regulator ramp delay
  powerpc/powernv: wire up rng during setup_arch
  powerpc/rtas: Allow ibm,platform-dump RTAS call with null buffer address
  powerpc: Enable execve syscall exit tracepoint
  parisc: Enable ARCH_HAS_STRICT_MODULE_RWX
  xtensa: Fix refcount leak bug in time.c
  xtensa: xtfpga: Fix refcount leak bug in setup
  iio: adc: axp288: Override TS pin bias current for some models
  iio: adc: stm32: fix maximum clock rate for stm32mp15x
  iio: trigger: sysfs: fix use-after-free on remove
  iio: gyro: mpu3050: Fix the error handling in mpu3050_power_up()
  iio: accel: mma8452: ignore the return value of reset operation
  iio:accel:mxc4005: rearrange iio trigger get and register
  iio:accel:bma180: rearrange iio trigger get and register
  iio:chemical:ccs811: rearrange iio trigger get and register
  usb: chipidea: udc: check request status before setting device address
  xhci: turn off port power in shutdown
  iio: adc: vf610: fix conversion mode sysfs node name
  s390/cpumf: Handle events cycles and instructions identical
  gpio: winbond: Fix error code in winbond_gpio_get()
  Revert "net/tls: fix tls_sk_proto_close executed repeatedly"
  virtio_net: fix xdp_rxq_info bug after suspend/resume
  igb: Make DMA faster when CPU is active on the PCIe link
  regmap-irq: Fix a bug in regmap_irq_enable() for type_in_mask chips
  ice: ethtool: advertise 1000M speeds properly
  afs: Fix dynamic root getattr
  MIPS: Remove repetitive increase irq_err_count
  x86/xen: Remove undefined behavior in setup_features()
  udmabuf: add back sanity check
  net/tls: fix tls_sk_proto_close executed repeatedly
  erspan: do not assume transport header is always set
  drm/msm/mdp4: Fix refcount leak in mdp4_modeset_init_intf
  net/sched: sch_netem: Fix arithmetic in netem_dump() for 32-bit platforms
  bonding: ARP monitor spams NETDEV_NOTIFY_PEERS notifiers
  phy: aquantia: Fix AN when higher speeds than 1G are not advertised
  bpf: Fix request_sock leak in sk lookup helpers
  USB: serial: option: add Quectel RM500K module support
  USB: serial: option: add Quectel EM05-G modem
  USB: serial: option: add Telit LE910Cx 0x1250 composition
  random: quiet urandom warning ratelimit suppression message
  dm mirror log: clear log bits up to BITS_PER_LONG boundary
  dm era: commit metadata in postsuspend after worker stops
  ata: libata: add qc->flags in ata_qc_complete_template tracepoint
  mtd: rawnand: gpmi: Fix setting busy timeout setting
  mmc: sdhci-pci-o2micro: Fix card detect by dealing with debouncing
  net: openvswitch: fix parsing of nw_proto for IPv6 fragments
  ALSA: hda/realtek: Add quirk for Clevo PD70PNT
  ALSA: hda/realtek - ALC897 headset MIC no sound
  ALSA: hda/conexant: Fix missing beep setup
  ALSA: hda/via: Fix missing beep setup
  random: schedule mix_interrupt_randomness() less often
  vt: drop old FONT ioctls
  Linux 5.4.201
  Revert "hwmon: Make chip parameter for with_info API mandatory"
  arm64: mm: Don't invalidate FROM_DEVICE buffers at start of DMA transfer
  tcp: drop the hash_32() part from the index calculation
  tcp: increase source port perturb table to 2^16
  tcp: dynamically allocate the perturb table used by source ports
  tcp: add small random increments to the source port
  tcp: use different parts of the port_offset for index and offset
  tcp: add some entropy in __inet_hash_connect()
  usb: gadget: u_ether: fix regression in setting fixed MAC address
  dm: remove special-casing of bio-based immutable singleton target on NVMe
  s390/mm: use non-quiescing sske for KVM switch to keyed guest
  UPSTREAM: ext4: verify dir block before splitting it
  UPSTREAM: ext4: fix use-after-free in ext4_rename_dir_prepare
  BACKPORT: ext4: Only advertise encrypted_casefold when encryption and unicode are enabled
  BACKPORT: ext4: fix no-key deletion for encrypt+casefold
  BACKPORT: ext4: optimize match for casefolded encrypted dirs
  BACKPORT: ext4: handle casefolding with encryption
  Revert "ANDROID: ext4: Handle casefolding with encryption"
  Revert "ANDROID: ext4: Optimize match for casefolded encrypted dirs"
  ANDROID: cpu/hotplug: avoid breaking Android ABI by fusing cpuhp steps
  ANDROID: change function signatures for some random functions.
Revert "mailbox: forward the hrtimer if not queued and under a lock" Revert "drm: fix EDID struct for old ARM OABI format" Revert "ALSA: jack: Access input_dev under mutex" Linux 5.4.200 powerpc/mm: Switch obsolete dssall to .long riscv: Less inefficient gcc tishift helpers (and export their symbols) RISC-V: fix barrier() use in <vdso/processor.h> arm64: kprobes: Use BRK instead of single-step when executing instructions out-of-line net: openvswitch: fix leak of nested actions net: openvswitch: fix misuse of the cached connection on tuple changes net/sched: act_police: more accurate MTU policing virtio-pci: Remove wrong address verification in vp_del_vqs() ALSA: hda/realtek: fix right sounds and mute/micmute LEDs for HP machine ALSA: hda/realtek: fix mute/micmute LEDs for HP 440 G8 ext4: add reserved GDT blocks check ext4: make variable "count" signed ext4: fix bug_on ext4_mb_use_inode_pa dm mirror log: round up region bitmap size to BITS_PER_LONG serial: 8250: Store to lsr_save_flags after lsr read usb: gadget: lpc32xx_udc: Fix refcount leak in lpc32xx_udc_probe usb: dwc2: Fix memory leak in dwc2_hcd_init USB: serial: io_ti: add Agilent E5805A support USB: serial: option: add support for Cinterion MV31 with new baseline comedi: vmk80xx: fix expression for tx buffer size i2c: designware: Use standard optional ref clock implementation irqchip/gic-v3: Fix refcount leak in gic_populate_ppi_partitions irqchip/gic-v3: Fix error handling in gic_populate_ppi_partitions irqchip/gic/realview: Fix refcount leak in realview_gic_of_init faddr2line: Fix overlapping text section failures, the sequel certs/blacklist_hashes.c: fix const confusion in certs blacklist arm64: ftrace: fix branch range checks net: bgmac: Fix an erroneous kfree() in bgmac_remove() mlxsw: spectrum_cnt: Reorder counter pools misc: atmel-ssc: Fix IRQ check in ssc_probe tty: goldfish: Fix free_irq() on remove i40e: Fix call trace in setup_tx_descriptors i40e: Fix calculating the number of queue pairs i40e: Fix adding ADQ filter to TC0 clocksource: hyper-v: unexport __init-annotated hv_init_clocksource() pNFS: Don't keep retrying if the server replied NFS4ERR_LAYOUTUNAVAILABLE random: credit cpu and bootloader seeds by default net: ethernet: mtk_eth_soc: fix misuse of mem alloc interface netdev[napi]_alloc_frag ipv6: Fix signed integer overflow in l2tp_ip6_sendmsg nfc: nfcmrvl: Fix memory leak in nfcmrvl_play_deferred virtio-mmio: fix missing put_device() when vm_cmdline_parent registration failed ALSA: hda/realtek - Add HW8326 support scsi: pmcraid: Fix missing resource cleanup in error case scsi: ipr: Fix missing/incorrect resource cleanup in error case scsi: lpfc: Allow reduced polling rate for nvme_admin_async_event cmd completion scsi: lpfc: Fix port stuck in bypassed state after LIP in PT2PT topology scsi: vmw_pvscsi: Expand vcpuHint to 16 bits ASoC: wm_adsp: Fix event generation for wm_adsp_fw_put() ASoC: es8328: Fix event generation for deemphasis control ASoC: wm8962: Fix suspend while playing music ata: libata-core: fix NULL pointer deref in ata_host_alloc_pinfo() ASoC: cs42l56: Correct typo in minimum level for SX volume controls ASoC: cs42l52: Correct TLV for Bypass Volume ASoC: cs53l30: Correct number of volume levels on SX controls ASoC: cs35l36: Update digital volume TLV ASoC: cs42l52: Fix TLV scales for mixer controls dma-debug: make things less spammy under memory pressure ASoC: nau8822: Add operation for internal PLL off and on powerpc/kasan: Silence KASAN warnings in __get_wchan() random: account for arch 
randomness in bits random: mark bootloader randomness code as __init random: avoid checking crng_ready() twice in random_init() crypto: drbg - make reseeding from get_random_bytes() synchronous crypto: drbg - always try to free Jitter RNG instance crypto: drbg - move dynamic ->reseed_threshold adjustments to __drbg_seed() crypto: drbg - track whether DRBG was seeded with !rng_is_initialized() crypto: drbg - prepare for more fine-grained tracking of seeding state crypto: drbg - always seeded with SP800-90B compliant noise source Revert "random: use static branch for crng_ready()" random: check for signals after page of pool writes random: wire up fops->splice_{read,write}_iter() random: convert to using fops->write_iter() random: convert to using fops->read_iter() random: unify batched entropy implementations random: move randomize_page() into mm where it belongs random: move initialization functions out of hot pages random: make consistent use of buf and len random: use proper return types on get_random_{int,long}_wait() random: remove extern from functions in header random: use static branch for crng_ready() random: credit architectural init the exact amount random: handle latent entropy and command line from random_init() random: use proper jiffies comparison macro random: remove ratelimiting for in-kernel unseeded randomness random: move initialization out of reseeding hot path random: avoid initializing twice in credit race random: use symbolic constants for crng_init states siphash: use one source of truth for siphash permutations random: help compiler out with fast_mix() by using simpler arguments random: do not use input pool from hard IRQs random: order timer entropy functions below interrupt functions random: do not pretend to handle premature next security model random: use first 128 bits of input as fast init random: do not use batches when !crng_ready() random: insist on random_get_entropy() existing in order to simplify xtensa: use fallback for random_get_entropy() instead of zero sparc: use fallback for random_get_entropy() instead of zero um: use fallback for random_get_entropy() instead of zero x86/tsc: Use fallback for random_get_entropy() instead of zero nios2: use fallback for random_get_entropy() instead of zero arm: use fallback for random_get_entropy() instead of zero mips: use fallback for random_get_entropy() instead of just c0 random m68k: use fallback for random_get_entropy() instead of zero timekeeping: Add raw clock fallback for random_get_entropy() powerpc: define get_cycles macro for arch-override alpha: define get_cycles macro for arch-override parisc: define get_cycles macro for arch-override s390: define get_cycles macro for arch-override ia64: define get_cycles macro for arch-override init: call time_init() before rand_initialize() random: fix sysctl documentation nits random: document crng_fast_key_erasure() destination possibility random: make random_get_entropy() return an unsigned long random: allow partial reads if later user copies fail random: check for signals every PAGE_SIZE chunk of /dev/[u]random random: check for signal_pending() outside of need_resched() check random: do not allow user to keep crng key around on stack random: do not split fast init input in add_hwgenerator_randomness() random: mix build-time latent entropy into pool at init random: re-add removed comment about get_random_{u32,u64} reseeding random: treat bootloader trust toggle the same way as cpu trust toggle random: skip fast_init if hwrng provides large chunk of entropy 
  random: check for signal and try earlier when generating entropy
  random: reseed more often immediately after booting
  random: make consistent usage of crng_ready()
  random: use SipHash as interrupt entropy accumulator
  random: replace custom notifier chain with standard one
  random: don't let 644 read-only sysctls be written to
  random: give sysctl_random_min_urandom_seed a more sensible value
  random: do crng pre-init loading in worker rather than irq
  random: unify cycles_t and jiffies usage and types
  random: cleanup UUID handling
  random: only wake up writers after zap if threshold was passed
  random: round-robin registers as ulong, not u32
  random: clear fast pool, crng, and batches in cpuhp bring up
  random: pull add_hwgenerator_randomness() declaration into random.h
  random: check for crng_init == 0 in add_device_randomness()
  random: unify early init crng load accounting
  random: do not take pool spinlock at boot
  random: defer fast pool mixing to worker
  random: rewrite header introductory comment
  random: group sysctl functions
  random: group userspace read/write functions
  random: group entropy collection functions
  random: group entropy extraction functions
  random: group crng functions
  random: group initialization wait functions
  random: remove whitespace and reorder includes
  random: remove useless header comment
  random: introduce drain_entropy() helper to declutter crng_reseed()
  random: deobfuscate irq u32/u64 contributions
  random: add proper SPDX header
  random: remove unused tracepoints
  random: remove ifdef'd out interrupt bench
  random: tie batched entropy generation to base_crng generation
  random: fix locking for crng_init in crng_reseed()
  random: zero buffer after reading entropy from userspace
  random: remove outdated INT_MAX >> 6 check in urandom_read()
  random: make more consistent use of integer types
  random: use hash function for crng_slow_load()
  random: use simpler fast key erasure flow on per-cpu keys
  random: absorb fast pool into input pool after fast load
  random: do not xor RDRAND when writing into /dev/random
  random: ensure early RDSEED goes through mixer on init
  random: inline leaves of rand_initialize()
  random: get rid of secondary crngs
  random: use RDSEED instead of RDRAND in entropy extraction
  random: fix locking in crng_fast_load()
  random: remove batched entropy locking
  random: remove use_input_pool parameter from crng_reseed()
  random: make credit_entropy_bits() always safe
  random: always wake up entropy writers after extraction
  random: use linear min-entropy accumulation crediting
  random: simplify entropy debiting
  random: use computational hash for entropy extraction
  random: only call crng_finalize_init() for primary_crng
  random: access primary_pool directly rather than through pointer
  random: continually use hwgenerator randomness
  random: simplify arithmetic function flow in account()
  random: selectively clang-format where it makes sense
  random: access input_pool_data directly rather than through pointer
  random: cleanup fractional entropy shift constants
  random: prepend remaining pool constants with POOL_
  random: de-duplicate INPUT_POOL constants
  random: remove unused OUTPUT_POOL constants
  random: rather than entropy_store abstraction, use global
  random: remove unused extract_entropy() reserved argument
  random: remove incomplete last_data logic
  random: cleanup integer types
  random: cleanup poolinfo abstraction
  random: fix typo in comments
  random: don't reset crng_init_cnt on urandom_read()
  random: avoid superfluous call to RDRAND in CRNG extraction
  random: early initialization of ChaCha constants
  random: initialize ChaCha20 constants with correct endianness
  random: use IS_ENABLED(CONFIG_NUMA) instead of ifdefs
  random: harmonize "crng init done" messages
  random: mix bootloader randomness into pool
  random: do not re-init if crng_reseed completes before primary init
  random: do not sign extend bytes for rotation when mixing
  random: use BLAKE2s instead of SHA1 in extraction
  random: remove unused irq_flags argument from add_interrupt_randomness()
  random: document add_hwgenerator_randomness() with other input functions
  crypto: blake2s - adjust include guard naming
  crypto: blake2s - include <linux/bug.h> instead of <asm/bug.h>
  MAINTAINERS: co-maintain random.c
  random: remove dead code left over from blocking pool
  random: avoid arch_get_random_seed_long() when collecting IRQ randomness
  random: add arch_get_random_*long_early()
  powerpc: Use bool in archrandom.h
  linux/random.h: Mark CONFIG_ARCH_RANDOM functions __must_check
  linux/random.h: Use false with bool
  linux/random.h: Remove arch_has_random, arch_has_random_seed
  s390: Remove arch_has_random, arch_has_random_seed
  powerpc: Remove arch_has_random, arch_has_random_seed
  x86: Remove arch_has_random, arch_has_random_seed
  random: avoid warnings for !CONFIG_NUMA builds
  random: split primary/secondary crng init paths
  random: remove some dead code of poolinfo
  random: fix typo in add_timer_randomness()
  random: Add and use pr_fmt()
  random: convert to ENTROPY_BITS for better code readability
  random: remove unnecessary unlikely()
  random: remove kernel.random.read_wakeup_threshold
  random: delete code to pull data into pools
  random: remove the blocking pool
  random: make /dev/random be almost like /dev/urandom
  random: ignore GRND_RANDOM in getentropy(2)
  random: add GRND_INSECURE to return best-effort non-cryptographic bytes
  random: Add a urandom_read_nowait() for random APIs that don't warn
  random: Don't wake crng_init_wait when crng_init == 1
  random: don't forget compat_ioctl on urandom
  compat_ioctl: remove /dev/random commands
  lib/crypto: sha1: re-roll loops to reduce code size
  lib/crypto: blake2s: move hmac construction into wireguard
  crypto: blake2s - generic C library implementation and selftest
  nfc: st21nfca: fix incorrect sizing calculations in EVT_TRANSACTION
  bpf: Fix incorrect memory charge cost calculation in stack_map_alloc()
  9p: missing chunk of "fs/9p: Don't update file type when updating file attributes"
  Revert "ext4: fix use-after-free in ext4_rename_dir_prepare"
  Revert "ext4: verify dir block before splitting it"
  Linux 5.4.199
  x86/speculation/mmio: Print SMT warning
  KVM: x86/speculation: Disable Fill buffer clear within guests
  x86/speculation/mmio: Reuse SRBDS mitigation for SBDS
  x86/speculation/srbds: Update SRBDS mitigation selection
  x86/speculation/mmio: Add sysfs reporting for Processor MMIO Stale Data
  x86/speculation/mmio: Enable CPU Fill buffer clearing on idle
  x86/bugs: Group MDS, TAA & Processor MMIO Stale Data mitigations
  x86/speculation/mmio: Add mitigation for Processor MMIO Stale Data
  x86/speculation: Add a common function for MD_CLEAR mitigation update
  x86/speculation/mmio: Enumerate Processor MMIO Stale Data bug
  Documentation: Add documentation for Processor MMIO Stale Data
  x86/cpu: Add another Alder Lake CPU to the Intel family
  x86/cpu: Add Lakefield, Alder Lake and Rocket Lake models to the to Intel CPU family
  x86/cpu: Add Jasper Lake to Intel family
  cpu/speculation: Add prototype for cpu_show_srbds()
  Linux 5.4.198
  tcp: fix tcp_mtup_probe_success vs wrong snd_cwnd
  mtd: cfi_cmdset_0002: Use chip_ready() for write on S29GL064N
  md/raid0: Ignore RAID0 layout if the second zone has only one device
  powerpc/32: Fix overread/overwrite of thread_struct via ptrace
  Input: bcm5974 - set missing URB_NO_TRANSFER_DMA_MAP urb flag
  ixgbe: fix unexpected VLAN Rx in promisc mode on VF
  ixgbe: fix bcast packets Rx on VF after promisc removal
  nfc: st21nfca: fix memory leaks in EVT_TRANSACTION handling
  nfc: st21nfca: fix incorrect validating logic in EVT_TRANSACTION
  mmc: block: Fix CQE recovery reset success
  ata: libata-transport: fix {dma|pio|xfer}_mode sysfs files
  cifs: return errors during session setup during reconnects
  ALSA: hda/conexant - Fix loopback issue with CX20632
  scripts/gdb: change kernel config dumping method
  vringh: Fix loop descriptors check in the indirect cases
  nodemask: Fix return values to be unsigned
  cifs: version operations for smb20 unneeded when legacy support disabled
  s390/gmap: voluntarily schedule during key setting
  nbd: fix io hung while disconnecting device
  nbd: fix race between nbd_alloc_config() and module removal
  nbd: call genl_unregister_family() first in nbd_cleanup()
  x86/cpu: Elide KCSAN for cpu_has() and friends
  modpost: fix undefined behavior of is_arm_mapping_symbol()
  drm/radeon: fix a possible null pointer dereference
  ceph: allow ceph.dir.rctime xattr to be updatable
  Revert "net: af_key: add check for pfkey_broadcast in function pfkey_process"
  scsi: myrb: Fix up null pointer access on myrb_cleanup()
  md: protect md_unregister_thread from reentrancy
  watchdog: wdat_wdt: Stop watchdog when rebooting the system
  kernfs: Separate kernfs_pr_cont_buf and rename_lock.
  serial: msm_serial: disable interrupts in __msm_console_write()
  staging: rtl8712: fix uninit-value in r871xu_drv_init()
  staging: rtl8712: fix uninit-value in usb_read8() and friends
  clocksource/drivers/sp804: Avoid error on multiple instances
  extcon: Modify extcon device to be created after driver data is set
  misc: rtsx: set NULL intfdata when probe fails
  usb: dwc2: gadget: don't reset gadget's driver->bus
  USB: hcd-pci: Fully suspend across freeze/thaw cycle
  drivers: usb: host: Fix deadlock in oxu_bus_suspend()
  drivers: tty: serial: Fix deadlock in sa1100_set_termios()
  USB: host: isp116x: check return value after calling platform_get_resource()
  drivers: staging: rtl8192e: Fix deadlock in rtllib_beacons_stop()
  drivers: staging: rtl8192u: Fix deadlock in ieee80211_beacons_stop()
  tty: Fix a possible resource leak in icom_probe
  tty: synclink_gt: Fix null-pointer-dereference in slgt_clean()
  lkdtm/usercopy: Expand size of "out of frame" object
  iio: st_sensors: Add a local lock for protecting odr
  iio: dummy: iio_simple_dummy: check the return value of kstrdup()
  drm: imx: fix compiler warning with gcc-12
  net: altera: Fix refcount leak in altera_tse_mdio_create
  ip_gre: test csum_start instead of transport header
  net/mlx5: fs, fail conflicting actions
  net/mlx5: Rearm the FW tracer after each tracer event
  net: ipv6: unexport __init-annotated seg6_hmac_init()
  net: xfrm: unexport __init-annotated xfrm4_protocol_init()
  net: mdio: unexport __init-annotated mdio_bus_init()
  SUNRPC: Fix the calculation of xdr->end in xdr_get_next_encode_buffer()
  net/mlx4_en: Fix wrong return value on ioctl EEPROM query failure
  net: dsa: lantiq_gswip: Fix refcount leak in gswip_gphy_fw_list
  bpf, arm64: Clear prog->jited_len along prog->jited
  af_unix: Fix a data-race in unix_dgram_peer_wake_me().
  xen: unexport __init-annotated xen_xlate_map_ballooned_pages()
  netfilter: nf_tables: memleak flow rule from commit path
  ata: pata_octeon_cf: Fix refcount leak in octeon_cf_probe
  netfilter: nat: really support inet nat without l3 address
  xprtrdma: treat all calls not a bcall when bc_serv is NULL
  video: fbdev: pxa3xx-gcu: release the resources correctly in pxa3xx_gcu_probe/remove()
  NFSv4: Don't hold the layoutget locks across multiple RPC calls
  dmaengine: zynqmp_dma: In struct zynqmp_dma_chan fix desc_size data type
  m68knommu: fix undefined reference to `_init_sp'
  m68knommu: set ZERO_PAGE() to the allocated zeroed page
  i2c: cadence: Increase timeout per message if necessary
  f2fs: remove WARN_ON in f2fs_is_valid_blkaddr
  tracing: Avoid adding tracer option before update_tracer_options
  tracing: Fix sleeping function called from invalid context on RT kernel
  mips: cpc: Fix refcount leak in mips_cpc_default_phys_base
  perf c2c: Fix sorting in percent_rmt_hitm_cmp()
  tipc: check attribute length for bearer name
  afs: Fix infinite loop found by xfstest generic/676
  tcp: tcp_rtx_synack() can be called from process context
  net: sched: add barrier to fix packet stuck problem for lockless qdisc
  net/mlx5e: Update netdev features after changing XDP state
  net/mlx5: Don't use already freed action pointer
  nfp: only report pause frame configuration for physical device
  ubi: ubi_create_volume: Fix use-after-free when volume creation failed
  jffs2: fix memory leak in jffs2_do_fill_super
  modpost: fix removing numeric suffixes
  net: dsa: mv88e6xxx: Fix refcount leak in mv88e6xxx_mdios_register
  net: ethernet: mtk_eth_soc: out of bounds read in mtk_hwlro_get_fdir_entry()
  net: sched: fixed barrier to prevent skbuff sticking in qdisc backlog
  s390/crypto: fix scatterwalk_unmap() callers in AES-GCM
  clocksource/drivers/oxnas-rps: Fix irq_of_parse_and_map() return value
  ASoC: fsl_sai: Fix FSL_SAI_xDR/xFR definition
  watchdog: ts4800_wdt: Fix refcount leak in ts4800_wdt_probe
  driver core: fix deadlock in __device_attach
  driver: base: fix UAF when driver_attach failed
  bus: ti-sysc: Fix warnings for unbind for serial
  firmware: dmi-sysfs: Fix memory leak in dmi_sysfs_register_handle
  serial: stm32-usart: Correct CSIZE, bits, and parity
  serial: st-asc: Sanitize CSIZE and correct PARENB for CS7
  serial: sifive: Sanitize CSIZE and c_iflag
  serial: sh-sci: Don't allow CS5-6
  serial: txx9: Don't allow CS5-6
  serial: rda-uart: Don't allow CS5-6
  serial: digicolor-usart: Don't allow CS5-6
  serial: 8250_fintek: Check SER_RS485_RTS_* only with RS485
  serial: meson: acquire port->lock in startup()
  rtc: mt6397: check return value after calling platform_get_resource()
  clocksource/drivers/riscv: Events are stopped during CPU suspend
  soc: rockchip: Fix refcount leak in rockchip_grf_init
  coresight: cpu-debug: Replace mutex with mutex_trylock on panic notifier
  serial: sifive: Report actual baud base rather than fixed 115200
  phy: qcom-qmp: fix pipe-clock imbalance on power-on failure
  rpmsg: qcom_smd: Fix returning 0 if irq_of_parse_and_map() fails
  iio: adc: sc27xx: Fine tune the scale calibration values
  iio: adc: sc27xx: fix read big scale voltage not right
  iio: adc: stmpe-adc: Fix wait_for_completion_timeout return value check
  firmware: stratix10-svc: fix a missing check on list iterator
  usb: dwc3: pci: Fix pm_runtime_get_sync() error checking
  rpmsg: qcom_smd: Fix irq_of_parse_and_map() return value
  pwm: lp3943: Fix duty calculation in case period was clamped
  staging: fieldbus: Fix the error handling path in anybuss_host_common_probe()
  usb: musb: Fix missing of_node_put() in omap2430_probe
  USB: storage: karma: fix rio_karma_init return
  usb: usbip: add missing device lock on tweak configuration cmd
  usb: usbip: fix a refcount leak in stub_probe()
  tty: serial: fsl_lpuart: fix potential bug when using both of_alias_get_id and ida_simple_get
  tty: serial: owl: Fix missing clk_disable_unprepare() in owl_uart_probe
  tty: goldfish: Use tty_port_destroy() to destroy port
  iio: adc: ad7124: Remove shift from scan_type
  staging: greybus: codecs: fix type confusion of list iterator variable
  pcmcia: db1xxx_ss: restrict to MIPS_DB1XXX boards
  md: bcache: check the return value of kzalloc() in detached_dev_do_request()
  block: fix bio_clone_blkg_association() to associate with proper blkcg_gq
  bfq: Make sure bfqg for which we are queueing requests is online
  bfq: Get rid of __bio_blkcg() usage
  bfq: Remove pointless bfq_init_rq() calls
  bfq: Drop pointless unlock-lock pair
  bfq: Avoid merging queues with different parents
  MIPS: IP27: Remove incorrect `cpu_has_fpu' override
  RDMA/rxe: Generate a completion for unsupported/invalid opcode
  Kconfig: add config option for asm goto w/ outputs
  phy: qcom-qmp: fix reset-controller leak on probe errors
  blk-iolatency: Fix inflight count imbalances and IO hangs on offline
  dt-bindings: gpio: altera: correct interrupt-cells
  docs/conf.py: Cope with removal of language=None in Sphinx 5.0.0
  ARM: pxa: maybe fix gpio lookup tables
  phy: qcom-qmp: fix struct clk leak on probe errors
  arm64: dts: qcom: ipq8074: fix the sleep clock frequency
  gma500: fix an incorrect NULL check on list iterator
  tilcdc: tilcdc_external: fix an incorrect NULL check on list iterator
  serial: pch: don't overwrite xmit->buf[0] by x_char
  carl9170: tx: fix an incorrect use of list iterator
  ASoC: rt5514: Fix event generation for "DSP Voice Wake Up" control
  rtl818x: Prevent using not initialized queues
  hugetlb: fix huge_pmd_unshare address update
  nodemask.h: fix compilation error with GCC12
  iommu/msm: Fix an incorrect NULL check on list iterator
  um: Fix out-of-bounds read in LDT setup
  um: chan_user: Fix winch_tramp() return value
  mac80211: upgrade passive scan to active scan on DFS channels after beacon rx
  irqchip: irq-xtensa-mx: fix initial IRQ affinity
  irqchip/armada-370-xp: Do not touch Performance Counter Overflow on A375, A38x, A39x
  RDMA/hfi1: Fix potential integer multiplication overflow errors
  Kconfig: Add option for asm goto w/ tied outputs to workaround clang-13 bug
  media: coda: Add more H264 levels for CODA960
  media: coda: Fix reported H264 profile
  mtd: cfi_cmdset_0002: Move and rename chip_check/chip_ready/chip_good_for_write
  md: fix an incorrect NULL check in md_reload_sb
  md: fix an incorrect NULL check in does_sb_need_changing
  drm/bridge: analogix_dp: Grab runtime PM reference for DP-AUX
  drm/nouveau/clk: Fix an incorrect NULL check on list iterator
  drm/etnaviv: check for reaped mapping in etnaviv_iommu_unmap_gem
  drm/amdgpu/cs: make commands with 0 chunks illegal behaviour.
  scsi: ufs: qcom: Add a readl() to make sure ref_clk gets enabled
  scsi: dc395x: Fix a missing check on list iterator
  ocfs2: dlmfs: fix error handling of user_dlm_destroy_lock
  dlm: fix missing lkb refcount handling
  dlm: fix plock invalid read
  mm, compaction: fast_find_migrateblock() should return pfn in the target zone
  PCI: qcom: Fix unbalanced PHY init on probe errors
  PCI: qcom: Fix runtime PM imbalance on probe errors
  PCI/PM: Fix bridge_d3_blacklist[] Elo i2 overwrite of Gigabyte X299
  tracing: Fix potential double free in create_var_ref()
  ACPI: property: Release subnode properties with data nodes
  ext4: avoid cycles in directory h-tree
  ext4: verify dir block before splitting it
  ext4: fix bug_on in ext4_writepages
  ext4: fix warning in ext4_handle_inode_extension
  ext4: fix use-after-free in ext4_rename_dir_prepare
  netfilter: nf_tables: disallow non-stateful expression in sets earlier
  bfq: Track whether bfq_group is still online
  bfq: Update cgroup information before merging bio
  bfq: Split shared queues on move between cgroups
  efi: Do not import certificates from UEFI Secure Boot for T2 Macs
  fs-writeback: writeback_sb_inodes:Recalculate 'wrote' according skipped pages
  iwlwifi: mvm: fix assert 1F04 upon reconfig
  wifi: mac80211: fix use-after-free in chanctx code
  f2fs: fix fallocate to use file_modified to update permissions consistently
  f2fs: don't need inode lock for system hidden quota
  f2fs: fix deadloop in foreground GC
  f2fs: fix to clear dirty inode in f2fs_evict_inode()
  f2fs: fix to do sanity check on block address in f2fs_do_zero_range()
  f2fs: fix to avoid f2fs_bug_on() in dec_valid_node_count()
  perf jevents: Fix event syntax error caused by ExtSel
  perf c2c: Use stdio interface if slang is not supported
  iommu/amd: Increase timeout waiting for GA log enablement
  dmaengine: stm32-mdma: remove GISR1 register
  video: fbdev: clcdfb: Fix refcount leak in clcdfb_of_vram_setup
  NFSv4/pNFS: Do not fail I/O when we fail to allocate the pNFS layout
  NFS: Don't report errors from nfs_pageio_complete() more than once
  NFS: Do not report flush errors in nfs_write_end()
  NFS: Do not report EINTR/ERESTARTSYS as mapping errors
  i2c: at91: Initialize dma_buf in at91_twi_xfer()
  i2c: at91: use dma safe buffers
  iommu/mediatek: Add list_del in mtk_iommu_remove
  f2fs: fix dereference of stale list iterator after loop body
  Input: stmfts - do not leave device disabled in stmfts_input_open
  RDMA/hfi1: Prevent use of lock before it is initialized
  mailbox: forward the hrtimer if not queued and under a lock
  mfd: davinci_voicecodec: Fix possible null-ptr-deref davinci_vc_probe()
  powerpc/fsl_rio: Fix refcount leak in fsl_rio_setup
  macintosh: via-pmu and via-cuda need RTC_LIB
  powerpc/perf: Fix the threshold compare group constraint for power9
  powerpc/64: Only WARN if __pa()/__va() called with bad addresses
  Input: sparcspkr - fix refcount leak in bbc_beep_probe
  crypto: cryptd - Protect per-CPU resource by disabling BH.
  tty: fix deadlock caused by calling printk() under tty_port->lock
  PCI: imx6: Fix PERST# start-up sequence
  ipc/mqueue: use get_tree_nodev() in mqueue_get_tree()
  proc: fix dentry/inode overinstantiating under /proc/${pid}/net
  powerpc/4xx/cpm: Fix return value of __setup() handler
  powerpc/idle: Fix return value of __setup() handler
  powerpc/8xx: export 'cpm_setbrg' for modules
  dax: fix cache flush on PMD-mapped pages
  drivers/base/node.c: fix compaction sysfs file leak
  pinctrl: mvebu: Fix irq_of_parse_and_map() return value
  nvdimm: Allow overwrite in the presence of disabled dimms
  firmware: arm_scmi: Fix list protocols enumeration in the base protocol
  scsi: fcoe: Fix Wstringop-overflow warnings in fcoe_wwn_from_mac()
  mfd: ipaq-micro: Fix error check return value of platform_get_irq()
  powerpc/fadump: fix PT_LOAD segment for boot memory area
  arm: mediatek: select arch timer for mt7629
  crypto: marvell/cesa - ECB does not IV
  misc: ocxl: fix possible double free in ocxl_file_register_afu
  ARM: dts: bcm2835-rpi-b: Fix GPIO line names
  ARM: dts: bcm2837-rpi-3-b-plus: Fix GPIO line name of power LED
  ARM: dts: bcm2837-rpi-cm3-io3: Fix GPIO line names for SMPS I2C
  ARM: dts: bcm2835-rpi-zero-w: Fix GPIO line name for Wifi/BT
  can: xilinx_can: mark bit timing constants as const
  KVM: nVMX: Leave most VM-Exit info fields unmodified on failed VM-Entry
  PCI: rockchip: Fix find_first_zero_bit() limit
  PCI: cadence: Fix find_first_zero_bit() limit
  soc: qcom: smsm: Fix missing of_node_put() in smsm_parse_ipc
  soc: qcom: smp2p: Fix missing of_node_put() in smp2p_parse_ipc
  ARM: dts: suniv: F1C100: fix watchdog compatible
  arm64: dts: rockchip: Move drive-impedance-ohm to emmc phy on rk3399
  net/smc: postpone sk_refcnt increment in connect()
  rxrpc: Fix decision on when to generate an IDLE ACK
  rxrpc: Don't let ack.previousPacket regress
  rxrpc: Fix overlapping ACK accounting
  rxrpc: Don't try to resend the request if we're receiving the reply
  rxrpc: Fix listen() setting the bar too high for the prealloc rings
  NFC: hci: fix sleep in atomic context bugs in nfc_hci_hcp_message_tx
  ASoC: wm2000: fix missing clk_disable_unprepare() on error in wm2000_anc_transition()
  thermal/drivers/broadcom: Fix potential NULL dereference in sr_thermal_probe
  drm: msm: fix possible memory leak in mdp5_crtc_cursor_set()
  drm/msm/a6xx: Fix refcount leak in a6xx_gpu_init
  ext4: reject the 'commit' option on ext2 filesystems
  media: ov7670: remove ov7670_power_off from ov7670_remove
  sctp: read sk->sk_bound_dev_if once in sctp_rcv()
  m68k: math-emu: Fix dependencies of math emulation support
  Bluetooth: fix dangling sco_conn and use-after-free in sco_sock_timeout
  media: vsp1: Fix offset calculation for plane cropping
  media: pvrusb2: fix array-index-out-of-bounds in pvr2_i2c_core_init
  media: exynos4-is: Change clk_disable to clk_disable_unprepare
  media: st-delta: Fix PM disable depth imbalance in delta_probe
  media: aspeed: Fix an error handling path in aspeed_video_probe()
  scripts/faddr2line: Fix overlapping text section failures
  regulator: pfuze100: Fix refcount leak in pfuze_parse_regulators_dt
  ASoC: mxs-saif: Fix refcount leak in mxs_saif_probe
  ASoC: fsl: Fix refcount leak in imx_sgtl5000_probe
  perf/amd/ibs: Use interrupt regs ip for stack unwinding
  Revert "cpufreq: Fix possible race in cpufreq online error path"
  iomap: iomap_write_failed fix
  media: uvcvideo: Fix missing check to determine if element is found in list
  drm/msm: return an error pointer in msm_gem_prime_get_sg_table()
  drm/msm/mdp5: Return error code in mdp5_mixer_release when deadlock is detected
  drm/msm/mdp5: Return error code in mdp5_pipe_release when deadlock is detected
  regulator: core: Fix enable_count imbalance with EXCLUSIVE_GET
  x86/mm: Cleanup the control_va_addr_alignment() __setup handler
  irqchip/aspeed-i2c-ic: Fix irq_of_parse_and_map() return value
  irqchip/exiu: Fix acknowledgment of edge triggered interrupts
  x86: Fix return value of __setup handlers
  virtio_blk: fix the discard_granularity and discard_alignment queue limits
  drm/rockchip: vop: fix possible null-ptr-deref in vop_bind()
  drm/msm/hdmi: fix error check return value of irq_of_parse_and_map()
  drm/msm/hdmi: check return value after calling platform_get_resource_byname()
  drm/msm/dsi: fix error checks and return values for DSI xmit functions
  drm/msm/disp/dpu1: set vbif hw config to NULL to avoid use after memory free during pm runtime resume
  perf tools: Add missing headers needed by util/data.h
  ASoC: rk3328: fix disabling mclk on pclk probe failure
  x86/speculation: Add missing prototype for unpriv_ebpf_notify()
  x86/pm: Fix false positive kmemleak report in msr_build_context()
  scsi: ufs: core: Exclude UECxx from SFR dump list
  of: overlay: do not break notify on NOTIFY_{OK|STOP}
  fsnotify: fix wrong lockdep annotations
  inotify: show inotify mask flags in proc fdinfo
  ath9k_htc: fix potential out of bounds access with invalid rxstatus->rs_keyix
  cpufreq: Fix possible race in cpufreq online error path
  spi: img-spfi: Fix pm_runtime_get_sync() error checking
  sched/fair: Fix cfs_rq_clock_pelt() for throttled cfs_rq
  drm/bridge: Fix error handling in analogix_dp_probe
  HID: elan: Fix potential double free in elan_input_configured
  HID: hid-led: fix maximum brightness for Dream Cheeky
  drbd: fix duplicate array initializer
  efi: Add missing prototype for efi_capsule_setup_info
  NFC: NULL out the dev->rfkill to prevent UAF
  spi: spi-ti-qspi: Fix return value handling of wait_for_completion_timeout
  drm: mali-dp: potential dereference of null pointer
  drm/komeda: Fix an undefined behavior bug in komeda_plane_add()
  nl80211: show SSID for P2P_GO interfaces
  bpf: Fix excessive memory allocation in stack_map_alloc()
  drm/vc4: txp: Force alpha to be 0xff if it's disabled
  drm/vc4: txp: Don't set TXP_VSTART_AT_EOF
  drm/mediatek: Fix mtk_cec_mask()
  x86/delay: Fix the wrong asm constraint in delay_loop()
  ASoC: mediatek: Fix missing of_node_put in mt2701_wm8960_machine_probe
  ASoC: mediatek: Fix error handling in mt8173_max98090_dev_probe
  drm/bridge: adv7511: clean up CEC adapter when probe fails
  drm/edid: fix invalid EDID extension block filtering
  ath9k: fix ar9003_get_eepmisc
  drm: fix EDID struct for old ARM OABI format
  RDMA/hfi1: Prevent panic when SDMA is disabled
  powerpc/iommu: Add missing of_node_put in iommu_init_early_dart
  macintosh/via-pmu: Fix build failure when CONFIG_INPUT is disabled
  powerpc/powernv: fix missing of_node_put in uv_init()
  powerpc/xics: fix refcount leak in icp_opal_init()
  tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate
  PCI: Avoid pci_dev_lock() AB/BA deadlock with sriov_numvfs_store()
  ARM: hisi: Add missing of_node_put after of_find_compatible_node
  ARM: dts: exynos: add atmel,24c128 fallback to Samsung EEPROM
  ARM: versatile: Add missing of_node_put in dcscb_init
  fat: add ratelimit to fat*_ent_bread()
  powerpc/fadump: Fix fadump to work with a different endian capture kernel
  ARM: OMAP1: clock: Fix UART rate reporting algorithm
  fs: jfs: fix possible NULL pointer dereference in dbFree()
  PM / devfreq: rk3399_dmc: Disable edev on remove()
  ARM: dts: ox820: align interrupt controller node name with dtschema
  IB/rdmavt: add missing locks in rvt_ruc_loopback
  selftests/bpf: fix btf_dump/btf_dump due to recent clang change
  eth: tg3: silence the GCC 12 array-bounds warning
  rxrpc: Return an error to sendmsg if call failed
  hwmon: Make chip parameter for with_info API mandatory
  ASoC: max98357a: remove dependency on GPIOLIB
  media: exynos4-is: Fix compile warning
  net: phy: micrel: Allow probing without .driver_data
  nbd: Fix hung on disconnect request if socket is closed before
  ASoC: rt5645: Fix errorenous cleanup order
  nvme-pci: fix a NULL pointer dereference in nvme_alloc_admin_tags
  openrisc: start CPU timer early in boot
  media: cec-adap.c: fix is_configuring state
  media: coda: limit frame interval enumeration to supported encoder frame sizes
  rtlwifi: Use pr_warn instead of WARN_ONCE
  ipmi: Fix pr_fmt to avoid compilation issues
  ipmi:ssif: Check for NULL msg when handling events and messages
  ACPI: PM: Block ASUS B1400CEAE from suspend to idle by default
  dma-debug: change allocation mode from GFP_NOWAIT to GFP_ATIOMIC
  spi: stm32-qspi: Fix wait_cmd timeout in APM mode
  s390/preempt: disable __preempt_count_add() optimization for PROFILE_ALL_BRANCHES
  ASoC: tscs454: Add endianness flag in snd_soc_component_driver
  HID: bigben: fix slab-out-of-bounds Write in bigben_probe
  drm/amdgpu/ucode: Remove firmware load type check in amdgpu_ucode_free_bo
  mlxsw: spectrum_dcb: Do not warn about priority changes
  ASoC: dapm: Don't fold register value changes into notifications
  net/mlx5: fs, delete the FTE when there are no rules attached to it
  ipv6: Don't send rs packets to the interface of ARPHRD_TUNNEL
  drm: msm: fix error check return value of irq_of_parse_and_map()
  arm64: compat: Do not treat syscall number as ESR_ELx for a bad syscall
  drm/amd/pm: fix the compile warning
  drm/plane: Move range check for format_count earlier
  scsi: megaraid: Fix error check return value of register_chrdev()
  mmc: jz4740: Apply DMA engine limits to maximum segment size
  md/bitmap: don't set sb values if can't pass sanity check
  media: cx25821: Fix the warning when removing the module
  media: pci: cx23885: Fix the error handling in cx23885_initdev()
  media: venus: hfi: avoid null dereference in deinit
  ath9k: fix QCA9561 PA bias level
  drm/amd/pm: fix double free in si_parse_power_table()
  tools/power turbostat: fix ICX DRAM power numbers
  spi: spi-rspi: Remove setting {src,dst}_{addr,addr_width} based on DMA direction
  ALSA: jack: Access input_dev under mutex
  drm/komeda: return early if drm_universal_plane_init() fails.
ACPICA: Avoid cache flush inside virtual machines fbcon: Consistently protect deferred_takeover with console_lock() ipv6: fix locking issues with loops over idev->addr_list ipw2x00: Fix potential NULL dereference in libipw_xmit() b43: Fix assigning negative value to unsigned variable b43legacy: Fix assigning negative value to unsigned variable mwifiex: add mutex lock for call in mwifiex_dfs_chan_sw_work_queue drm/virtio: fix NULL pointer dereference in virtio_gpu_conn_get_modes btrfs: repair super block num_devices automatically btrfs: add "0x" prefix for unsupported optional features ptrace: Reimplement PTRACE_KILL by always sending SIGKILL ptrace/xtensa: Replace PT_SINGLESTEP with TIF_SINGLESTEP ptrace/um: Replace PT_DTRACE with TIF_SINGLESTEP perf/x86/intel: Fix event constraints for ICL usb: core: hcd: Add support for deferring roothub registration USB: new quirk for Dell Gen 2 devices USB: serial: option: add Quectel BG95 modem ALSA: hda/realtek - Fix microphone noise on ASUS TUF B550M-PLUS binfmt_flat: do not stop relocating GOT entries prematurely on riscv Conflicts: Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml Documentation/devicetree/bindings~HEAD drivers/char/Kconfig drivers/mmc/core/block.c kernel/sysctl.c Change-Id: If11e1865055bfb94b3268960268c88c3dfc032c3
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints about which node(s) memory
 * should be allocated on.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case NUMA_NO_NODE here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernels, lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
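
/*
 * Illustrative sketch (not part of this file): these policies are set up
 * from userspace through the set_mempolicy(2)/mbind(2) syscalls defined
 * below. Assuming a two-node machine and libnuma's <numaif.h> syscall
 * wrappers, interleaving over nodes 0 and 1 could be requested like so:
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *	if (set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask)))
 *		perror("set_mempolicy");
 */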

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
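
/*
 * Worked example (added for clarity, values are hypothetical): with
 * *orig = {1,3} and *rel = {4,5,6}, nodes_fold() folds the original bits
 * modulo nodes_weight(*rel) == 3, giving tmp = {0,1}; nodes_onto() then
 * maps those bit positions onto the set bits of *rel, so *ret = {4,5}.
 */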

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy. mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags. But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy. May also be called holding the mmap_semaphore for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (pol == NULL)
		return 0;
	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */
	else {
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
		else
			nodes_and(nsc->mask2, *nodes, nsc->mask1);

		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
						cpuset_current_mems_allowed;
	}

	if (nodes)
		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	else
		ret = mpol_ops[pol->mode].create(pol, NULL);
	return ret;
}

/*
 * This function just creates a new policy, does some sanity checks and
 * simple initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
		mode = MPOL_PREFERRED;
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->v.nodes = tmp;
}
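
/*
 * Worked example (added for clarity, values are hypothetical): suppose the
 * user asked for nodes {0,1} and the cpuset's allowed set changes to {2,3}.
 * With MPOL_F_STATIC_NODES the policy keeps only the intersection, which is
 * empty here, so the code above falls back to the full new mask {2,3}.
 * With MPOL_F_RELATIVE_NODES the user's mask is reinterpreted relative to
 * the new set, again yielding {2,3}. Without either flag the old nodes are
 * remapped positionally from the old allowed set onto the new one.
 */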

static void mpol_rebind_preferred(struct mempolicy *pol,
						const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes)) {
			pol->v.preferred_node = node;
			pol->flags &= ~MPOL_F_LOCAL;
		} else
			pol->flags |= MPOL_F_LOCAL;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (!(pol->flags & MPOL_F_LOCAL)) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_sem. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol || pol->mode == MPOL_LOCAL)
		return;
	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm. Takes mm->mmap_sem during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		vm_write_begin(vma);
		mpol_rebind_policy(vma->vm_policy, new);
		vm_write_end(vma);
	}
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	struct vm_area_struct *prev;
};

/*
 * Check if the page's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the complement of qp->nmask instead.
 */
static inline bool queue_pages_required(struct page *page,
					struct queue_pages *qp)
{
	int nid = page_to_nid(page);
	unsigned long flags = qp->flags;

	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}

/*
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - the PMD is a migration entry, or only MPOL_MF_STRICT was specified
 *        and an existing page was already on a node that does not follow
 *        the policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	int ret = 0;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		ret = -EIO;
		goto unlock;
	}
	page = pmd_page(*pmd);
	if (is_huge_zero_page(page)) {
		spin_unlock(ptl);
		__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
		ret = 2;
		goto out;
	}
	if (!queue_pages_required(page, qp))
		goto unlock;

	flags = qp->flags;
	/* go to thp migration */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma) ||
		    migrate_page_add(page, qp->pagelist, flags)) {
			ret = 1;
			goto unlock;
		}
	} else
		ret = -EIO;
unlock:
	spin_unlock(ptl);
out:
	return ret;
}

/*
 * Scan through pages, checking if they satisfy the required conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	int ret;
	bool has_unmovable = false;
	pte_t *pte, *mapped_pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
		if (ret != 2)
			return ret;
	}
	/* THP was split, fall through to pte walk */

	if (pmd_trans_unstable(pmd))
		return 0;

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		if (!queue_pages_required(page, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/* MPOL_MF_STRICT must be specified if we get here */
			if (!vma_migratable(vma)) {
				has_unmovable = true;
				break;
			}

			/*
			 * Do not abort immediately since there may be
			 * pages that are temporarily off the LRU in the
			 * range. We still need to migrate the other LRU
			 * pages.
			 */
			if (migrate_page_add(page, qp->pagelist, flags))
				has_unmovable = true;
		} else
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();

	if (has_unmovable)
		return 1;

	return addr != end ? -EIO : 0;
}

static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	if (!queue_pages_required(page, qp))
		goto unlock;
	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
		isolate_huge_page(page, qp->pagelist);
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated;

	vm_write_begin(vma);
	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
	vm_write_end(vma);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long endvma = vma->vm_end;
	unsigned long flags = qp->flags;

	/*
	 * Need to check MPOL_MF_STRICT to return -EIO if possible,
	 * regardless of vma_migratable().
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	if (endvma > end)
		endvma = end;
	if (vma->vm_start > start)
		start = vma->vm_start;

	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
		if (!vma->vm_next && vma->vm_end < end)
			return -EFAULT;
		if (qp->prev && qp->prev->vm_end < vma->vm_start)
			return -EFAULT;
	}

	qp->prev = vma;

	if (flags & MPOL_MF_LAZY) {
		/* Similar to task_numa_work, skip inaccessible VMAs */
		if (!is_vm_hugetlb_page(vma) &&
			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
			!(vma->vm_flags & VM_MIXEDMAP))
			change_prot_numa(vma, start, endvma);
		return 1;
	}

	/* queue pages from current vma */
	if (flags & MPOL_MF_VALID)
		return 0;
	return 1;
}

static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry		= queue_pages_hugetlb,
	.pmd_entry		= queue_pages_pte_range,
	.test_walk		= queue_pages_test_walk,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags), they are isolated and queued to the pagelist which
 * is passed via @private.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is an unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 *         the memory range specified by nodemask and maxnode points
 *         outside your accessible address space (-EFAULT)
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.prev = NULL,
	};

	return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_sem held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
						struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	vm_write_begin(vma);
	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	/*
	 * The speculative page fault handler accesses this field without
	 * holding the mmap_sem.
	 */
	WRITE_ONCE(vma->vm_policy, new);
	vm_write_end(vma);
	mpol_put(old);

	return 0;
err_out:
	vm_write_end(vma);
	mpol_put(new);
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EFAULT;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol, vma->vm_userfaultfd_ctx,
				 vma_get_anon_name(vma));
		if (prev) {
			vma = prev;
			goto replace;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
 replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}
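
/*
 * Worked example (added for clarity, addresses are hypothetical): applying
 * a new policy to [0x2000, 0x4000) within a VMA spanning [0x1000, 0x5000)
 * first tries vma_merge(); failing that, split_vma() cuts the VMA at
 * 0x2000 and again at 0x4000, and only the middle piece has its policy
 * replaced by vma_replace_policy().
 */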

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}
	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE)
		current->il_prev = MAX_NUMNODES-1;
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		if (!(p->flags & MPOL_F_LOCAL))
			node_set(p->v.preferred_node, *nodes);
		/* else return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	int locked = 1;
	err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	if (locked)
		up_read(&mm->mmap_sem);
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL. We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, lookup_node()
			 * will drop the mmap_sem, so after calling
			 * lookup_node() only "pol" remains valid, "vma"
			 * is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = next_node_in(current->il_prev, pol->v.nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

out:
	mpol_cond_put(pol);
	if (vma)
		up_read(&mm->mmap_sem);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}
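
/*
 * Illustrative sketch (not part of this file): from userspace, the node
 * backing some mapped address ptr can be queried through the
 * get_mempolicy(2) wrapper in libnuma's <numaif.h>:
 *
 *	int node = -1;
 *	if (get_mempolicy(&node, NULL, 0, ptr, MPOL_F_NODE | MPOL_F_ADDR))
 *		perror("get_mempolicy");
 *	else
 *		printf("page at %p is on node %d\n", ptr, node);
 */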

#ifdef CONFIG_MIGRATION
/*
 * page migration, thp tail pages can be passed.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	struct page *head = compound_head(page);
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
		if (!isolate_lru_page(head)) {
			list_add_tail(&head->lru, pagelist);
			mod_node_page_state(page_pgdat(head),
				NR_ISOLATED_ANON + page_is_file_cache(head),
				hpage_nr_pages(head));
		} else if (flags & MPOL_MF_STRICT) {
			/*
			 * A non-movable page may reach here. And, there may
			 * be pages temporarily off the LRU or non-LRU movable
			 * pages. Treat them as unmovable pages since they
			 * can't be isolated, so they can't be moved at the
			 * moment. It should return -EIO for this case too.
			 */
			return -EIO;
		}
	}

	return 0;
}

/* page allocation callback for NUMA node migration */
struct page *alloc_new_node_page(struct page *page, unsigned long node)
{
	if (PageHuge(page))
		return alloc_huge_page_node(page_hstate(compound_head(page)),
					node);
	else if (PageTransHuge(page)) {
		struct page *thp;

		thp = alloc_pages_node(node,
			(GFP_TRANSHUGE | __GFP_THISNODE),
			HPAGE_PMD_ORDER);
		if (!thp)
			return NULL;
		prep_transhuge_page(thp);
		return thp;
	} else
		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
						    __GFP_THISNODE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration. Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
	 */
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
					MIGRATE_SYNC, MR_SYSCALL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err;
	nodemask_t tmp;

	err = migrate_prep();
	if (err)
		return err;

	down_read(&mm->mmap_sem);

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same. If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory off of that same node.
	 *
	 * A single scan of tmp is sufficient. As we go, we remember the
	 * most recent <s, d> pair that moved (s != d). If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved. If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship. In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;
}
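
/*
 * Worked example (added for clarity, node numbers are hypothetical): for
 * from = {0,1} and to = {1,2}, the first scan maps 0->1 and 1->2; since
 * dest 2 is not in the remaining source set, 1->2 is migrated first. Node 1
 * is then cleared from tmp, and the next pass migrates 0->1. Draining
 * node 1 before filling it is exactly the overload avoidance described in
 * the comment above.
 */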

/*
 * Allocate a new page for page migration based on vma policy.
 * Start by assuming the page is mapped by the same vma as contains @start.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
	struct vm_area_struct *vma;
	unsigned long uninitialized_var(address);

	vma = find_vma(current->mm, start);
	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	if (PageHuge(page)) {
		return alloc_huge_page_vma(page_hstate(compound_head(page)),
				vma, address);
	} else if (PageTransHuge(page)) {
		struct page *thp;

		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
					 HPAGE_PMD_ORDER);
		if (!thp)
			return NULL;
		prep_transhuge_page(thp);
		return thp;
	}
	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
			vma, address);
}
#else

static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	return -EIO;
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

static struct page *new_page(struct page *page, unsigned long start)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	int ret;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (flags & MPOL_MF_LAZY)
		new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			goto mpol_out;
	}
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			down_write(&mm->mmap_sem);
			task_lock(current);
			err = mpol_set_nodemask(new, nmask, scratch);
			task_unlock(current);
			if (err)
				up_write(&mm->mmap_sem);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	ret = queue_pages_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	if (ret < 0) {
		err = ret;
		goto up_out;
	}

	err = mbind_range(mm, start, end, new);

	if (!err) {
		int nr_failed = 0;

		if (!list_empty(&pagelist)) {
			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
			nr_failed = migrate_pages(&pagelist, new_page, NULL,
				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
			if (nr_failed)
				putback_movable_pages(&pagelist);
		}

		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
			err = -EIO;
	} else {
up_out:
		if (!list_empty(&pagelist))
			putback_movable_pages(&pagelist);
	}

	up_write(&mm->mmap_sem);
mpol_out:
	mpol_put(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long t;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/*
	 * When the user specified more nodes than supported, just check
	 * if the non supported part is all zero.
	 *
	 * If maxnode has more longs than MAX_NUMNODES, check the bits in
	 * that area first. And then go through to check the rest of the
	 * bits, which are equal to or bigger than MAX_NUMNODES. Otherwise,
	 * just check bits [MAX_NUMNODES, maxnode).
	 */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
		unsigned long valid_mask = endmask;

		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		if (get_user(t, nmask + nlongs - 1))
			return -EFAULT;
		if (t & valid_mask)
			return -EINVAL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
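
/*
 * Worked example (added for clarity, numbers are hypothetical): on a
 * 64-bit kernel with MAX_NUMNODES == 64, a caller passing maxnode == 1025
 * describes a 1024-bit user mask. Only the first long (64 bits) can be
 * used; the loop above verifies that longs 1..15 are all zero and fails
 * with -EINVAL if the user set any node the kernel cannot represent.
 */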

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
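
/*
 * Worked example (added for clarity, numbers are hypothetical): with
 * nr_node_ids == 4 and maxnode == 1025, copy is 128 bytes but the kernel
 * only has 8 meaningful bytes (one long); the tail of the user buffer is
 * cleared and just the first long is copied out.
 */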
|
|
|
|
static long kernel_mbind(unsigned long start, unsigned long len,
|
|
unsigned long mode, const unsigned long __user *nmask,
|
|
unsigned long maxnode, unsigned int flags)
|
|
{
|
|
nodemask_t nodes;
|
|
int err;
|
|
unsigned short mode_flags;
|
|
|
|
start = untagged_addr(start);
|
|
mode_flags = mode & MPOL_MODE_FLAGS;
|
|
mode &= ~MPOL_MODE_FLAGS;
|
|
if (mode >= MPOL_MAX)
|
|
return -EINVAL;
|
|
if ((mode_flags & MPOL_F_STATIC_NODES) &&
|
|
(mode_flags & MPOL_F_RELATIVE_NODES))
|
|
return -EINVAL;
|
|
err = get_nodes(&nodes, nmask, maxnode);
|
|
if (err)
|
|
return err;
|
|
return do_mbind(start, len, mode, mode_flags, &nodes, flags);
|
|
}
|
|
|
|
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
|
|
unsigned long, mode, const unsigned long __user *, nmask,
|
|
unsigned long, maxnode, unsigned int, flags)
|
|
{
|
|
return kernel_mbind(start, len, mode, nmask, maxnode, flags);
|
|
}
|
|
|
|
/* Set the process memory policy */
|
|
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
|
|
unsigned long maxnode)
|
|
{
|
|
int err;
|
|
nodemask_t nodes;
|
|
unsigned short flags;
|
|
|
|
flags = mode & MPOL_MODE_FLAGS;
|
|
mode &= ~MPOL_MODE_FLAGS;
|
|
if ((unsigned int)mode >= MPOL_MAX)
|
|
return -EINVAL;
|
|
if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
|
|
return -EINVAL;
|
|
err = get_nodes(&nodes, nmask, maxnode);
|
|
if (err)
|
|
return err;
|
|
return do_set_mempolicy(mode, flags, &nodes);
|
|
}
|
|
|
|
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
|
|
unsigned long, maxnode)
|
|
{
|
|
return kernel_set_mempolicy(mode, nmask, maxnode);
|
|
}
|
|
|
|
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
|
|
const unsigned long __user *old_nodes,
|
|
const unsigned long __user *new_nodes)
|
|
{
|
|
struct mm_struct *mm = NULL;
|
|
struct task_struct *task;
|
|
nodemask_t task_nodes;
|
|
int err;
|
|
nodemask_t *old;
|
|
nodemask_t *new;
|
|
NODEMASK_SCRATCH(scratch);
|
|
|
|
if (!scratch)
|
|
return -ENOMEM;
|
|
|
|
old = &scratch->mask1;
|
|
new = &scratch->mask2;
|
|
|
|
err = get_nodes(old, old_nodes, maxnode);
|
|
if (err)
|
|
goto out;
|
|
|
|
err = get_nodes(new, new_nodes, maxnode);
|
|
if (err)
|
|
goto out;
|
|
|
|
/* Find the mm_struct */
|
|
rcu_read_lock();
|
|
task = pid ? find_task_by_vpid(pid) : current;
|
|
if (!task) {
|
|
rcu_read_unlock();
|
|
err = -ESRCH;
|
|
goto out;
|
|
}
|
|
get_task_struct(task);
|
|
|
|
err = -EINVAL;
|
|
|
|
/*
|
|
* Check if this process has the right to modify the specified process.
|
|
* Use the regular "ptrace_may_access()" checks.
|
|
*/
|
|
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
|
|
rcu_read_unlock();
|
|
err = -EPERM;
|
|
goto out_put;
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
task_nodes = cpuset_mems_allowed(task);
|
|
/* Is the user allowed to access the target nodes? */
|
|
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
|
|
err = -EPERM;
|
|
goto out_put;
|
|
}
|
|
|
|
task_nodes = cpuset_mems_allowed(current);
|
|
nodes_and(*new, *new, task_nodes);
|
|
if (nodes_empty(*new))
|
|
goto out_put;
|
|
|
|
err = security_task_movememory(task);
|
|
if (err)
|
|
goto out_put;
|
|
|
|
mm = get_task_mm(task);
|
|
put_task_struct(task);
|
|
|
|
if (!mm) {
|
|
err = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
err = do_migrate_pages(mm, old, new,
|
|
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
|
|
|
|
mmput(mm);
|
|
out:
|
|
NODEMASK_SCRATCH_FREE(scratch);
|
|
|
|
return err;
|
|
|
|
out_put:
|
|
put_task_struct(task);
|
|
goto out;
|
|
|
|
}
|
|
|
|
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
|
|
const unsigned long __user *, old_nodes,
|
|
const unsigned long __user *, new_nodes)
|
|
{
|
|
return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
|
|
}
|
|
|
|
|
|
/* Retrieve NUMA policy */
|
|
static int kernel_get_mempolicy(int __user *policy,
|
|
unsigned long __user *nmask,
|
|
unsigned long maxnode,
|
|
unsigned long addr,
|
|
unsigned long flags)
|
|
{
|
|
int err;
|
|
int uninitialized_var(pval);
|
|
nodemask_t nodes;
|
|
|
|
addr = untagged_addr(addr);
|
|
|
|
if (nmask != NULL && maxnode < nr_node_ids)
|
|
return -EINVAL;
|
|
|
|
err = do_get_mempolicy(&pval, &nodes, addr, flags);
|
|
|
|
if (err)
|
|
return err;
|
|
|
|
if (policy && put_user(pval, policy))
|
|
return -EFAULT;
|
|
|
|
if (nmask)
|
|
err = copy_nodes_to_user(nmask, maxnode, &nodes);
|
|
|
|
return err;
|
|
}
|
|
|
|
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
|
|
unsigned long __user *, nmask, unsigned long, maxnode,
|
|
unsigned long, addr, unsigned long, flags)
|
|
{
|
|
return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
|
|
}
|
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
|
|
compat_ulong_t __user *, nmask,
|
|
compat_ulong_t, maxnode,
|
|
compat_ulong_t, addr, compat_ulong_t, flags)
|
|
{
|
|
long err;
|
|
unsigned long __user *nm = NULL;
|
|
unsigned long nr_bits, alloc_size;
|
|
DECLARE_BITMAP(bm, MAX_NUMNODES);
|
|
|
|
nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
|
|
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
|
|
|
|
if (nmask)
|
|
nm = compat_alloc_user_space(alloc_size);
|
|
|
|
err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
|
|
|
|
if (!err && nmask) {
|
|
unsigned long copy_size;
|
|
copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
|
|
err = copy_from_user(bm, nm, copy_size);
|
|
/* ensure entire bitmap is zeroed */
|
|
err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
|
|
err |= compat_put_bitmap(nmask, bm, nr_bits);
|
|
}
|
|
|
|
return err;
|
|
}
|
|
|
|
COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
|
|
compat_ulong_t, maxnode)
|
|
{
|
|
unsigned long __user *nm = NULL;
|
|
unsigned long nr_bits, alloc_size;
|
|
DECLARE_BITMAP(bm, MAX_NUMNODES);
|
|
|
|
nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
|
|
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
|
|
|
|
if (nmask) {
|
|
if (compat_get_bitmap(bm, nmask, nr_bits))
|
|
return -EFAULT;
|
|
nm = compat_alloc_user_space(alloc_size);
|
|
if (copy_to_user(nm, bm, alloc_size))
|
|
return -EFAULT;
|
|
}
|
|
|
|
return kernel_set_mempolicy(mode, nm, nr_bits+1);
|
|
}
|
|
|
|
COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
|
|
compat_ulong_t, mode, compat_ulong_t __user *, nmask,
|
|
compat_ulong_t, maxnode, compat_ulong_t, flags)
|
|
{
|
|
unsigned long __user *nm = NULL;
|
|
unsigned long nr_bits, alloc_size;
|
|
nodemask_t bm;
|
|
|
|
nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
|
|
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
|
|
|
|
if (nmask) {
|
|
if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
|
|
return -EFAULT;
|
|
nm = compat_alloc_user_space(alloc_size);
|
|
if (copy_to_user(nm, nodes_addr(bm), alloc_size))
|
|
return -EFAULT;
|
|
}
|
|
|
|
return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
|
|
}
|
|
|
|
COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
|
|
compat_ulong_t, maxnode,
|
|
const compat_ulong_t __user *, old_nodes,
|
|
const compat_ulong_t __user *, new_nodes)
|
|
{
|
|
unsigned long __user *old = NULL;
|
|
unsigned long __user *new = NULL;
|
|
nodemask_t tmp_mask;
|
|
unsigned long nr_bits;
|
|
unsigned long size;
|
|
|
|
nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
|
|
size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
|
|
if (old_nodes) {
|
|
if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
|
|
return -EFAULT;
|
|
old = compat_alloc_user_space(new_nodes ? size * 2 : size);
|
|
if (new_nodes)
|
|
new = old + size / sizeof(unsigned long);
|
|
if (copy_to_user(old, nodes_addr(tmp_mask), size))
|
|
return -EFAULT;
|
|
}
|
|
if (new_nodes) {
|
|
if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
|
|
return -EFAULT;
|
|
if (new == NULL)
|
|
new = compat_alloc_user_space(size);
|
|
if (copy_to_user(new, nodes_addr(tmp_mask), size))
|
|
return -EFAULT;
|
|
}
|
|
return kernel_migrate_pages(pid, nr_bits + 1, old, new);
|
|
}
|
|
|
|
#endif /* CONFIG_COMPAT */
|
|
|
|
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
|
|
unsigned long addr)
|
|
{
|
|
struct mempolicy *pol;
|
|
|
|
if (!vma)
|
|
return NULL;
|
|
|
|
if (vma->vm_ops && vma->vm_ops->get_policy)
|
|
return vma->vm_ops->get_policy(vma, addr);
|
|
|
|
/*
|
|
* This could be called without holding the mmap_sem in the
|
|
* speculative page fault handler's path.
|
|
*/
|
|
pol = READ_ONCE(vma->vm_policy);
|
|
if (pol) {
|
|
/*
|
|
* shmem_alloc_page() passes MPOL_F_SHARED policy with
|
|
* a pseudo vma whose vma->vm_ops=NULL. Take a reference
|
|
* count on these policies which will be dropped by
|
|
* mpol_cond_put() later
|
|
*/
|
|
if (mpol_needs_cond_ref(pol))
|
|
mpol_get(pol);
|
|
}
|
|
|
|
return pol;
|
|
}
|
|
|
|
/*
|
|
* get_vma_policy(@vma, @addr)
|
|
* @vma: virtual memory area whose policy is sought
|
|
* @addr: address in @vma for shared policy lookup
|
|
*
|
|
* Returns effective policy for a VMA at specified address.
|
|
* Falls back to current->mempolicy or system default policy, as necessary.
|
|
* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
|
|
* count--added by the get_policy() vm_op, as appropriate--to protect against
|
|
* freeing by another task. It is the caller's responsibility to free the
|
|
* extra reference for shared policies.
|
|
*/
|
|
static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
|
|
unsigned long addr)
|
|
{
|
|
struct mempolicy *pol = __get_vma_policy(vma, addr);
|
|
|
|
if (!pol)
|
|
pol = get_task_policy(current);
|
|
|
|
return pol;
|
|
}
|
|
|
|
bool vma_policy_mof(struct vm_area_struct *vma)
|
|
{
|
|
struct mempolicy *pol;
|
|
|
|
if (vma->vm_ops && vma->vm_ops->get_policy) {
|
|
bool ret = false;
|
|
|
|
pol = vma->vm_ops->get_policy(vma, vma->vm_start);
|
|
if (pol && (pol->flags & MPOL_F_MOF))
|
|
ret = true;
|
|
mpol_cond_put(pol);
|
|
|
|
return ret;
|
|
}
|
|
|
|
pol = vma->vm_policy;
|
|
if (!pol)
|
|
pol = get_task_policy(current);
|
|
|
|
return pol->flags & MPOL_F_MOF;
|
|
}
|
|
|
|
static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
|
|
{
|
|
enum zone_type dynamic_policy_zone = policy_zone;
|
|
|
|
BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
|
|
|
|
/*
|
|
* if policy->v.nodes has movable memory only,
|
|
* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
|
|
*
|
|
* policy->v.nodes is intersect with node_states[N_MEMORY].
|
|
* so if the following test faile, it implies
|
|
* policy->v.nodes has movable memory only.
|
|
*/
|
|
if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
|
|
dynamic_policy_zone = ZONE_MOVABLE;
|
|
|
|
return zone >= dynamic_policy_zone;
|
|
}
|
|
|
|
/*
|
|
* Return a nodemask representing a mempolicy for filtering nodes for
|
|
* page allocation
|
|
*/
|
|
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
|
|
{
|
|
/* Lower zones don't get a nodemask applied for MPOL_BIND */
|
|
if (unlikely(policy->mode == MPOL_BIND) &&
|
|
apply_policy_zone(policy, gfp_zone(gfp)) &&
|
|
cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
|
|
return &policy->v.nodes;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/* Return the node id preferred by the given mempolicy, or the given id */
|
|
static int policy_node(gfp_t gfp, struct mempolicy *policy,
|
|
int nd)
|
|
{
|
|
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
|
|
nd = policy->v.preferred_node;
|
|
else {
|
|
/*
|
|
* __GFP_THISNODE shouldn't even be used with the bind policy
|
|
* because we might easily break the expectation to stay on the
|
|
* requested node and not break the policy.
|
|
*/
|
|
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
|
|
}
|
|
|
|
return nd;
|
|
}
|
|
|
|
/* Do dynamic interleaving for a process */
|
|
static unsigned interleave_nodes(struct mempolicy *policy)
|
|
{
|
|
unsigned next;
|
|
struct task_struct *me = current;
|
|
|
|
next = next_node_in(me->il_prev, policy->v.nodes);
|
|
if (next < MAX_NUMNODES)
|
|
me->il_prev = next;
|
|
return next;
|
|
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned int mempolicy_slab_node(void)
{
	struct mempolicy *policy;
	int node = numa_mem_id();

	if (in_interrupt())
		return node;

	policy = current->mempolicy;
	if (!policy || policy->flags & MPOL_F_LOCAL)
		return node;

	switch (policy->mode) {
	case MPOL_PREFERRED:
		/*
		 * handled MPOL_F_LOCAL above
		 */
		return policy->v.preferred_node;

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND: {
		struct zoneref *z;

		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
		z = first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->v.nodes);
		return z->zone ? zone_to_nid(z->zone) : node;
	}

	default:
		BUG();
	}
}

/*
 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
 * number of present nodes.
 */
static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target;
	int i;
	int nid;

	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)n % nnodes;
	nid = first_node(pol->v.nodes);
	for (i = 0; i < target; i++)
		nid = next_node(nid, pol->v.nodes);
	return nid;
}
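
/*
 * Worked example (editorial addition, not part of the original source):
 * with pol->v.nodes == {0,2,5} (nnodes == 3) and n == 7, target is
 * 7 % 3 == 1, so the walk starts at node 0 and takes one step, returning
 * node 2.  The same n always maps to the same node, which is what makes
 * this interleaving "static".
 */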

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
		 */
		BUG_ON(shift < PAGE_SHIFT);
		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, off);
	} else
		return interleave_nodes(pol);
}

#ifdef CONFIG_HUGETLBFS
/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a nid suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
 * @nodemask for filtering the zonelist.
 *
 * Must be protected by read_mems_allowed_begin()
 */
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
				struct mempolicy **mpol, nodemask_t **nodemask)
{
	int nid;

	*mpol = get_vma_policy(vma, addr);
	*nodemask = NULL;	/* assume !MPOL_BIND */

	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
		nid = interleave_nid(*mpol, vma, addr,
					huge_page_shift(hstate_vma(vma)));
	} else {
		nid = policy_node(gfp_flags, *mpol, numa_node_id());
		if ((*mpol)->mode == MPOL_BIND)
			*nodemask = &(*mpol)->v.nodes;
	}
	return nid;
}
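
/*
 * Illustrative caller sketch (editorial addition, not part of the original
 * source): this is roughly how the hugetlb allocation path consumes
 * huge_node(), pairing the returned mempolicy reference with
 * mpol_cond_put() once the allocation is done:
 *
 *	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
 *	page = __alloc_pages_nodemask(gfp_mask, order, nid, nodemask);
 *	mpol_cond_put(mpol);	// drops the ref only for shared policies
 */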

/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy.  Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining its own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 *
 * N.B., it is the caller's responsibility to free a returned nodemask.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
	struct mempolicy *mempolicy;
	int nid;

	if (!(mask && current->mempolicy))
		return false;

	task_lock(current);
	mempolicy = current->mempolicy;
	switch (mempolicy->mode) {
	case MPOL_PREFERRED:
		if (mempolicy->flags & MPOL_F_LOCAL)
			nid = numa_node_id();
		else
			nid = mempolicy->v.preferred_node;
		init_nodemask_of_node(mask, nid);
		break;

	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*mask = mempolicy->v.nodes;
		break;

	default:
		BUG();
	}
	task_unlock(current);

	return true;
}
#endif

/*
 * mempolicy_nodemask_intersects
 *
 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
 * policy.  Otherwise, check for intersection between mask and the policy
 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
 * policy, always return true since it may allocate elsewhere on fallback.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
bool mempolicy_nodemask_intersects(struct task_struct *tsk,
					const nodemask_t *mask)
{
	struct mempolicy *mempolicy;
	bool ret = true;

	if (!mask)
		return ret;
	task_lock(tsk);
	mempolicy = tsk->mempolicy;
	if (!mempolicy)
		goto out;

	switch (mempolicy->mode) {
	case MPOL_PREFERRED:
		/*
		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
		 * allocate from; they may fall back to other nodes when OOM.
		 * Thus, it's possible for tsk to have allocated memory from
		 * nodes in mask.
		 */
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		ret = nodes_intersects(mempolicy->v.nodes, *mask);
		break;
	default:
		BUG();
	}
out:
	task_unlock(tsk);
	return ret;
}

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct page *page;

	page = __alloc_pages(gfp, order, nid);
	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
	if (!static_branch_likely(&vm_numa_stat_key))
		return page;
	if (page && page_to_nid(page) == nid) {
		preempt_disable();
		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
		preempt_enable();
	}
	return page;
}

/**
 * alloc_pages_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocations,
 *	%GFP_HIGHMEM highmem/user allocations,
 *	%GFP_FS      allocation should not call back into a file system,
 *	%GFP_ATOMIC  don't sleep.
 *
 * @order: Order of the GFP allocation.
 * @vma: Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation. Must be inside the VMA.
 * @node: Which node to prefer for allocation (modulo policy).
 * @hugepage: for hugepages try only the preferred node if possible
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away.  Should be used for
 * all allocations for pages that will be mapped into user space.  Returns
 * NULL when no page can be allocated.
 */
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
		unsigned long addr, int node, bool hugepage)
{
	struct mempolicy *pol;
	struct page *page;
	int preferred_nid;
	nodemask_t *nmask;

	pol = get_vma_policy(vma, addr);

	if (pol->mode == MPOL_INTERLEAVE) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
		mpol_cond_put(pol);
		page = alloc_page_interleave(gfp, order, nid);
		goto out;
	}

	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
		int hpage_node = node;

		/*
		 * For hugepage allocation and non-interleave policy which
		 * allows the current node (or other explicitly preferred
		 * node) we only try to allocate from the current/preferred
		 * node and don't fall back to other nodes, as the cost of
		 * remote accesses would likely offset THP benefits.
		 *
		 * If the policy is interleave, or does not allow the current
		 * node in its nodemask, we allocate the standard way.
		 */
		if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
			hpage_node = pol->v.preferred_node;

		nmask = policy_nodemask(gfp, pol);
		if (!nmask || node_isset(hpage_node, *nmask)) {
			mpol_cond_put(pol);
			page = __alloc_pages_node(hpage_node,
						gfp | __GFP_THISNODE, order);

			/*
			 * If hugepage allocations are configured to always
			 * synchronous compact or the vma has been madvised
			 * to prefer hugepage backing, retry allowing remote
			 * memory as well.
			 */
			if (!page && (gfp & __GFP_DIRECT_RECLAIM))
				page = __alloc_pages_nodemask(gfp | __GFP_NORETRY,
							order, hpage_node,
							nmask);

			goto out;
		}
	}

	nmask = policy_nodemask(gfp, pol);
	preferred_nid = policy_node(gfp, pol, node);
	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
	mpol_cond_put(pol);
out:
	return page;
}
EXPORT_SYMBOL(alloc_pages_vma);
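
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * source): a typical fault-path caller allocates a movable user page under
 * the VMA's policy, preferring the faulting CPU's node:
 *
 *	page = alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
 *			       numa_node_id(), false);
 */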

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER    user allocation,
 *	%GFP_KERNEL  kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS      don't call back into a file system,
 *	%GFP_ATOMIC  don't sleep.
 * @order: Power of two of allocation size in pages. 0 is a single page.
 *
 * Allocate a page from the kernel page pool.  When not in interrupt
 * context, apply the current process' NUMA policy.
 * Returns NULL when no page can be allocated.
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = &default_policy;
	struct page *page;

	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (pol->mode == MPOL_INTERLEAVE)
		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
	else
		page = __alloc_pages_nodemask(gfp, order,
				policy_node(gfp, pol, numa_node_id()),
				policy_nodemask(gfp, pol));

	return page;
}
EXPORT_SYMBOL(alloc_pages_current);
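
/*
 * Editorial note (not part of the original source): on CONFIG_NUMA builds
 * the generic alloc_pages() helper in <linux/gfp.h> resolves to
 * alloc_pages_current(), so this is the path most kernel-internal page
 * allocations take on NUMA systems.
 */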

int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
	struct mempolicy *pol = mpol_dup(vma_policy(src));

	if (IS_ERR(pol))
		return PTR_ERR(pol);
	dst->vm_policy = pol;
	return 0;
}

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */

/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);

	/* task's mempolicy is protected by alloc_lock */
	if (old == current->mempolicy) {
		task_lock(current);
		*new = *old;
		task_unlock(current);
	} else
		*new = *old;

	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		mpol_rebind_policy(new, &mems);
	}
	atomic_set(&new->refcnt, 1);
	return new;
}

/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return false;
	if (a->mode != b->mode)
		return false;
	if (a->flags != b->flags)
		return false;
	if (mpol_store_user_nodemask(a))
		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
			return false;

	switch (a->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		return !!nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		/* a's ->flags is the same as b's */
		if (a->flags & MPOL_F_LOCAL)
			return true;
		return a->v.preferred_node == b->v.preferred_node;
	default:
		BUG();
		return false;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * lookup first element intersecting start-end.  Caller holds sp->lock for
 * reading or for writing
 */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
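
/*
 * Worked example (editorial addition, not part of the original source):
 * with stored ranges [0,4), [4,8) and [10,12), sp_lookup(sp, 3, 11) first
 * finds some intersecting node during the tree descent (say [4,8)), then
 * walks rb_prev() until the previous node no longer overlaps, returning
 * the lowest intersecting range, [0,4).
 */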

/*
 * Insert a new shared policy into the list.  Caller holds sp->lock for
 * writing.
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->mode : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	read_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	read_unlock(&sp->lock);
	return pol;
}

static void sp_free(struct sp_node *n)
{
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}

/**
 * mpol_misplaced - check whether current page node is valid in policy
 *
 * @page: page to be checked
 * @vma: vm area where page mapped
 * @addr: virtual address where page mapped
 *
 * Look up the current policy node id for vma,addr and compare it to the
 * page's node id.
 *
 * Returns:
 *	-1	- not misplaced, page is in the right node
 *	node	- node id where the page should be
 *
 * Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 */
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol;
	struct zoneref *z;
	int curnid = page_to_nid(page);
	unsigned long pgoff;
	int thiscpu = raw_smp_processor_id();
	int thisnid = cpu_to_node(thiscpu);
	int polnid = NUMA_NO_NODE;
	int ret = -1;

	pol = get_vma_policy(vma, addr);
	if (!(pol->flags & MPOL_F_MOF))
		goto out;

	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		pgoff = vma->vm_pgoff;
		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
		polnid = offset_il_node(pol, pgoff);
		break;

	case MPOL_PREFERRED:
		if (pol->flags & MPOL_F_LOCAL)
			polnid = numa_node_id();
		else
			polnid = pol->v.preferred_node;
		break;

	case MPOL_BIND:
		/*
		 * allows binding to multiple nodes.
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->v.nodes))
			goto out;
		z = first_zones_zonelist(
				node_zonelist(numa_node_id(), GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->v.nodes);
		polnid = zone_to_nid(z->zone);
		break;

	default:
		BUG();
	}

	/* Migrate the page towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		polnid = thisnid;

		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
			goto out;
	}

	if (curnid != polnid)
		ret = polnid;
out:
	mpol_cond_put(pol);

	return ret;
}

/*
 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
 * dropped after task->mempolicy is set to NULL so that any allocation done as
 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
 * policy.
 */
void mpol_put_task_policy(struct task_struct *task)
{
	struct mempolicy *pol;

	task_lock(task);
	pol = task->mempolicy;
	task->mempolicy = NULL;
	task_unlock(task);
	mpol_put(pol);
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	sp_free(n);
}

static void sp_node_init(struct sp_node *node, unsigned long start,
			unsigned long end, struct mempolicy *pol)
{
	node->start = start;
	node->end = end;
	node->policy = pol;
}

static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
				struct mempolicy *pol)
{
	struct sp_node *n;
	struct mempolicy *newpol;

	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n)
		return NULL;

	newpol = mpol_dup(pol);
	if (IS_ERR(newpol)) {
		kmem_cache_free(sn_cache, n);
		return NULL;
	}
	newpol->flags |= MPOL_F_SHARED;
	sp_node_init(n, start, end, newpol);

	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;
	struct mempolicy *mpol_new = NULL;
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!n_new)
					goto alloc_new;

				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol: struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called during get_inode(), so we can use GFP_KERNEL.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
	int ret;

	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
	rwlock_init(&sp->lock);

	if (mpol) {
		struct vm_area_struct pvma;
		struct mempolicy *new;
		NODEMASK_SCRATCH(scratch);

		if (!scratch)
			goto put_mpol;
		/* contextualize the tmpfs mount point mempolicy */
		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
		if (IS_ERR(new))
			goto free_scratch; /* no valid nodemask intersection */

		task_lock(current);
		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
		task_unlock(current);
		if (ret)
			goto put_new;

		/* Create pseudo-vma that contains just the policy */
		vma_init(&pvma, NULL);
		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */

put_new:
		mpol_put(new);			/* drop initial ref */
free_scratch:
		NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
	}
}

int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->mode : -1,
		 npol ? npol->flags : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		sp_free(new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	write_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		sp_delete(p, n);
	}
	write_unlock(&p->lock);
}

#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;

static void __init check_numabalancing_enable(void)
{
	bool numabalancing_default = false;

	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
		numabalancing_default = true;

	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
	if (numabalancing_override)
		set_numabalancing_state(numabalancing_override == 1);

	if (num_online_nodes() > 1 && !numabalancing_override) {
		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
			numabalancing_default ? "Enabling" : "Disabling");
		set_numabalancing_state(numabalancing_default);
	}
}

static int __init setup_numabalancing(char *str)
{
	int ret = 0;
	if (!str)
		goto out;

	if (!strcmp(str, "enable")) {
		numabalancing_override = 1;
		ret = 1;
	} else if (!strcmp(str, "disable")) {
		numabalancing_override = -1;
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("Unable to parse numa_balancing=\n");

	return ret;
}
__setup("numa_balancing=", setup_numabalancing);
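
/*
 * Illustrative example (editorial addition, not part of the original
 * source): booting with "numa_balancing=disable" on the kernel command
 * line sets numabalancing_override to -1, so check_numabalancing_enable()
 * turns automatic NUMA balancing off regardless of the Kconfig default.
 */
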
#else
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */

/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	nodemask_t interleave_nodes;
	unsigned long largest = 0;
	int nid, prefer = 0;

	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL);

	for_each_node(nid) {
		preferred_node_policy[nid] = (struct mempolicy) {
			.refcnt = ATOMIC_INIT(1),
			.mode = MPOL_PREFERRED,
			.flags = MPOL_F_MOF | MPOL_F_MORON,
			.v = { .preferred_node = nid, },
		};
	}

	/*
	 * Set interleaving policy for system init. Interleaving is only
	 * enabled across suitably sized nodes (default is >= 16MB), or
	 * fall back to the largest node if they're all smaller.
	 */
	nodes_clear(interleave_nodes);
	for_each_node_state(nid, N_MEMORY) {
		unsigned long total_pages = node_present_pages(nid);

		/* Preserve the largest node */
		if (largest < total_pages) {
			largest = total_pages;
			prefer = nid;
		}

		/* Interleave this node? */
		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
			node_set(nid, interleave_nodes);
	}

	/* All too small, use the largest */
	if (unlikely(nodes_empty(interleave_nodes)))
		node_set(prefer, interleave_nodes);

	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		pr_err("%s: interleaving failed\n", __func__);

	check_numabalancing_enable();
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */

/*
 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
 */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_LOCAL]      = "local",
};

#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str: string containing mempolicy to parse
 * @mpol: pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * On success, returns 0, else 1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode_flags;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int err = 1, mode;

	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	mode = match_string(policy_modes, MPOL_MAX, str);
	if (mode < 0)
		goto out;

	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only, although later
		 * we use first_node(nodes) to grab a single node, so here
		 * nodelist (or nodes) cannot be empty.
		 */
		if (nodelist) {
			char *rest = nodelist;
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
			if (nodes_empty(nodes))
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist; mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		mode = MPOL_PREFERRED;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on an empty nodelist
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode != MPOL_PREFERRED)
		new->v.nodes = nodes;
	else if (nodelist)
		new->v.preferred_node = first_node(nodes);
	else
		new->flags |= MPOL_F_LOCAL;

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
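
/*
 * Illustrative examples (editorial addition, not part of the original
 * source) of strings this parser accepts from a tmpfs "mpol=" mount option:
 *
 *	"interleave:0-3"	MPOL_INTERLEAVE over nodes 0-3
 *	"bind=static:1,3"	MPOL_BIND with MPOL_F_STATIC_NODES
 *	"prefer:2"		MPOL_PREFERRED, preferred_node == 2
 *	"local"			MPOL_PREFERRED with MPOL_F_LOCAL
 *
 * e.g.:	mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt
 */
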
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer: to contain formatted mempolicy string
 * @maxlen: length of @buffer
 * @pol: pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
 * longest flag, "relative", and to display at least a few node ids.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	nodemask_t nodes = NODE_MASK_NONE;
	unsigned short mode = MPOL_DEFAULT;
	unsigned short flags = 0;

	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
		mode = pol->mode;
		flags = pol->flags;
	}

	switch (mode) {
	case MPOL_DEFAULT:
		break;
	case MPOL_PREFERRED:
		if (flags & MPOL_F_LOCAL)
			mode = MPOL_LOCAL;
		else
			node_set(pol->v.preferred_node, nodes);
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		nodes = pol->v.nodes;
		break;
	default:
		WARN_ON_ONCE(1);
		snprintf(p, maxlen, "unknown");
		return;
	}

	p += snprintf(p, maxlen, "%s", policy_modes[mode]);

	if (flags & MPOL_MODE_FLAGS) {
		p += snprintf(p, buffer + maxlen - p, "=");

		/*
		 * Currently, the only defined flags are mutually exclusive
		 */
		if (flags & MPOL_F_STATIC_NODES)
			p += snprintf(p, buffer + maxlen - p, "static");
		else if (flags & MPOL_F_RELATIVE_NODES)
			p += snprintf(p, buffer + maxlen - p, "relative");
	}

	if (!nodes_empty(nodes))
		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
			       nodemask_pr_args(&nodes));
}
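
/*
 * Illustrative examples (editorial addition, not part of the original
 * source) of strings this produces, mirroring mpol_parse_str() input above:
 *
 *	"default"			MPOL_DEFAULT
 *	"prefer:2"			MPOL_PREFERRED on node 2
 *	"interleave=static:0-3"		MPOL_INTERLEAVE with MPOL_F_STATIC_NODES
 *	"local"				MPOL_PREFERRED with MPOL_F_LOCAL
 */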