android_kernel_xiaomi_sm8350/mm/mempolicy.c
kamasali Satyanarayan 7a335f8708 Merge android11-5.4.249+ (d57e792) into msm-5.4
* remotes/origin/tmp-d57e792:
  UPSTREAM: media: usb: siano: Fix warning due to null work_func_t function pointer
  UPSTREAM: Bluetooth: L2CAP: Fix use-after-free in l2cap_sock_ready_cb
  ANDROID: ABI: Update allowed list for QCOM
  UPSTREAM: net: tap_open(): set sk_uid from current_fsuid()
  UPSTREAM: net: tun_chr_open(): set sk_uid from current_fsuid()
  UPSTREAM: net/sched: cls_route: No longer copy tcf_result on update to avoid use-after-free
  UPSTREAM: net/sched: cls_fw: No longer copy tcf_result on update to avoid use-after-free
  UPSTREAM: net/sched: cls_u32: No longer copy tcf_result on update to avoid use-after-free
  UPSTREAM: net/sched: cls_fw: Fix improper refcount update leads to use-after-free
  UPSTREAM: media: dvb-core: Fix kernel WARNING for blocking operation in wait_event*()
  ANDROID: ABI: Update allowed list for QCOM
  UPSTREAM: usb: gadget: udc: renesas_usb3: Fix use after free bug in renesas_usb3_remove due to race condition
  UPSTREAM: x86/mm: Avoid using set_pgd() outside of real PGD pages
  UPSTREAM: net/sched: flower: fix possible OOB write in fl_set_geneve_opt()
  UPSTREAM: ipvlan:Fix out-of-bounds caused by unclear skb->cb
  Linux 5.4.249
  xfs: verify buffer contents when we skip log replay
  mm: make wait_on_page_writeback() wait for multiple pending writebacks
  mm: fix VM_BUG_ON(PageTail) and BUG_ON(PageWriteback)
  i2c: imx-lpi2c: fix type char overflow issue when calculating the clock cycle
  x86/apic: Fix kernel panic when booting with intremap=off and x2apic_phys
  drm/radeon: fix race condition UAF in radeon_gem_set_domain_ioctl
  drm/exynos: fix race condition UAF in exynos_g2d_exec_ioctl
  drm/exynos: vidi: fix a wrong error return
  ARM: dts: Fix erroneous ADS touchscreen polarities
  ASoC: nau8824: Add quirk to active-high jack-detect
  s390/cio: unregister device when the only path is gone
  usb: gadget: udc: fix NULL dereference in remove()
  nfcsim.c: Fix error checking for debugfs_create_dir
  media: cec: core: don't set last_initiator if tx in progress
  arm64: Add missing Set/Way CMO encodings
  HID: wacom: Add error check to wacom_parse_and_register()
  scsi: target: iscsi: Prevent login threads from racing between each other
  sch_netem: acquire qdisc lock in netem_change()
  Revert "net: phy: dp83867: perform soft reset and retain established link"
  netfilter: nfnetlink_osf: fix module autoload
  netfilter: nf_tables: disallow element updates of bound anonymous sets
  be2net: Extend xmit workaround to BE3 chip
  net: dsa: mt7530: fix trapping frames on non-MT7621 SoC MT7530 switch
  ipvs: align inner_mac_header for encapsulation
  mmc: usdhi60rol0: fix deferred probing
  mmc: sh_mmcif: fix deferred probing
  mmc: sdhci-acpi: fix deferred probing
  mmc: omap_hsmmc: fix deferred probing
  mmc: omap: fix deferred probing
  mmc: mvsdio: fix deferred probing
  mmc: mvsdio: convert to devm_platform_ioremap_resource
  mmc: mtk-sd: fix deferred probing
  net: qca_spi: Avoid high load if QCA7000 is not available
  xfrm: Linearize the skb after offloading if needed.
  ieee802154: hwsim: Fix possible memory leaks
  rcu: Upgrade rcu_swap_protected() to rcu_replace_pointer()
  x86/mm: Avoid using set_pgd() outside of real PGD pages
  cifs: Fix potential deadlock when updating vol in cifs_reconnect()
  cifs: Merge is_path_valid() into get_normalized_path()
  cifs: Introduce helpers for finding TCP connection
  cifs: Get rid of kstrdup_const()'d paths
  cifs: Clean up DFS referral cache
  nilfs2: prevent general protection fault in nilfs_clear_dirty_page()
  writeback: fix dereferencing NULL mapping->host on writeback_page_template
  ip_tunnels: allow VXLAN/GENEVE to inherit TOS/TTL from VLAN
  mmc: meson-gx: remove redundant mmc_request_done() call from irq context
  cgroup: Do not corrupt task iteration when rebinding subsystem
  PCI: hv: Fix a race condition bug in hv_pci_query_relations()
  Drivers: hv: vmbus: Fix vmbus_wait_for_unload() to scan present CPUs
  nilfs2: fix buffer corruption due to concurrent device reads
  media: dvb-core: Fix use-after-free due to race at dvb_register_device()
  media: dvbdev: fix error logic at dvb_register_device()
  media: dvbdev: Fix memleak in dvb_register_device
  tick/common: Align tick period during sched_timer setup
  x86/purgatory: remove PGO flags
  tracing: Add tracing_reset_all_online_cpus_unlocked() function
  epoll: ep_autoremove_wake_function should use list_del_init_careful
  list: add "list_del_init_careful()" to go with "list_empty_careful()"
  mm: rewrite wait_on_page_bit_common() logic
  nilfs2: reject devices with insufficient block count
  UPSTREAM: net/sched: cls_u32: Fix reference counter leak leading to overflow
  UPSTREAM: memstick: r592: Fix UAF bug in r592_remove due to race condition
  BACKPORT: btrfs: unset reloc control if transaction commit fails in prepare_to_relocate()
  Revert "neighbour: Replace zero-length array with flexible-array member"
  Revert "neighbour: fix unaligned access to pneigh_entry"
  Revert "tcp: deny tcp_disconnect() when threads are waiting"
  Linux 5.4.248
  mmc: block: ensure error propagation for non-blk
  drm/nouveau/kms: Fix NULL pointer dereference in nouveau_connector_detect_depth
  neighbour: delete neigh_lookup_nodev as not used
  net: Remove unused inline function dst_hold_and_use()
  neighbour: Remove unused inline function neigh_key_eq16()
  afs: Fix vlserver probe RTT handling
  selftests/ptp: Fix timestamp printf format for PTP_SYS_OFFSET
  net: tipc: resize nlattr array to correct size
  net: lapbether: only support ethernet devices
  net/sched: cls_api: Fix lockup on flushing explicitly created chain
  drm/nouveau: add nv_encoder pointer check for NULL
  drm/nouveau/kms: Don't change EDID when it hasn't actually changed
  drm/nouveau/dp: check for NULL nv_connector->native_mode
  igb: fix nvm.ops.read() error handling
  sctp: fix an error code in sctp_sf_eat_auth()
  ipvlan: fix bound dev checking for IPv6 l3s mode
  IB/isert: Fix incorrect release of isert connection
  IB/isert: Fix possible list corruption in CMA handler
  IB/isert: Fix dead lock in ib_isert
  IB/uverbs: Fix to consider event queue closing also upon non-blocking mode
  iavf: remove mask from iavf_irq_enable_queues()
  RDMA/rxe: Fix the use-before-initialization error of resp_pkts
  RDMA/rxe: Removed unused name from rxe_task struct
  RDMA/rxe: Remove the unused variable obj
  net/sched: cls_u32: Fix reference counter leak leading to overflow
  ping6: Fix send to link-local addresses with VRF.
  netfilter: nfnetlink: skip error delivery on batch in case of ENOMEM
  spi: fsl-dspi: avoid SCK glitches with continuous transfers
  spi: spi-fsl-dspi: Remove unused chip->void_write_data
  usb: dwc3: gadget: Reset num TRBs before giving back the request
  serial: lantiq: add missing interrupt ack
  USB: serial: option: add Quectel EM061KGL series
  Remove DECnet support from kernel
  ALSA: hda/realtek: Add a quirk for Compaq N14JP6
  net: usb: qmi_wwan: add support for Compal RXM-G1
  RDMA/uverbs: Restrict usage of privileged QKEYs
  nouveau: fix client work fence deletion race
  powerpc/purgatory: remove PGO flags
  kexec: support purgatories with .text.hot sections
  nilfs2: fix possible out-of-bounds segment allocation in resize ioctl
  nilfs2: fix incomplete buffer cleanup in nilfs_btnode_abort_change_key()
  nios2: dts: Fix tse_mac "max-frame-size" property
  ocfs2: check new file size on fallocate call
  ocfs2: fix use-after-free when unmounting read-only filesystem
  drm:amd:amdgpu: Fix missing buffer object unlock in failure path
  xen/blkfront: Only check REQ_FUA for writes
  mips: Move initrd_start check after initrd address sanitisation.
  MIPS: Alchemy: fix dbdma2
  parisc: Flush gatt writes and adjust gatt mask in parisc_agp_mask_memory()
  parisc: Improve cache flushing for PCXL in arch_sync_dma_for_cpu()
  btrfs: handle memory allocation failure in btrfs_csum_one_bio
  power: supply: Fix logic checking if system is running from battery
  irqchip/meson-gpio: Mark OF related data as maybe unused
  regulator: Fix error checking for debugfs_create_dir
  platform/x86: asus-wmi: Ignore WMI events with codes 0x7B, 0xC0
  power: supply: Ratelimit no data debug output
  ARM: dts: vexpress: add missing cache properties
  power: supply: bq27xxx: Use mod_delayed_work() instead of cancel() + schedule()
  power: supply: sc27xx: Fix external_power_changed race
  power: supply: ab8500: Fix external_power_changed race
  s390/dasd: Use correct lock while counting channel queue length
  dasd: refactor dasd_ioctl_information
  KEYS: asymmetric: Copy sig and digest in public_key_verify_signature()
  test_firmware: fix a memory leak with reqs buffer
  ANDROID: HID: Only utilise UHID provided exports if UHID is enabled
  Revert "firmware: arm_sdei: Fix sleep from invalid context BUG"
  UPSTREAM: bluetooth: Perform careful capability checks in hci_sock_ioctl()
  Revert "PM: domains: Fix up terminology with parent/child"
  Revert "PM: domains: Restore comment indentation for generic_pm_domain.child_links"
  Revert "scripts/gdb: bail early if there are no generic PD"
  Revert "uapi/linux/const.h: prefer ISO-friendly __typeof__"
  Revert "netfilter: nf_tables: don't write table validation state without mutex"
  Linux 5.4.247
  Revert "staging: rtl8192e: Replace macro RTL_PCI_DEVICE with PCI_DEVICE"
  mtd: spinand: macronix: Add support for MX35LFxGE4AD
  btrfs: unset reloc control if transaction commit fails in prepare_to_relocate()
  btrfs: check return value of btrfs_commit_transaction in relocation
  rbd: get snapshot context after exclusive lock is ensured to be held
  drm/atomic: Don't pollute crtc_state->mode_blob with error pointers
  cifs: handle empty list of targets in cifs_reconnect()
  cifs: get rid of unused parameter in reconn_setup_dfs_targets()
  ext4: only check dquot_initialize_needed() when debugging
  eeprom: at24: also select REGMAP
  i2c: sprd: Delete i2c adapter in .remove's error path
  bonding (gcc13): synchronize bond_{a,t}lb_xmit() types
  usb: usbfs: Use consistent mmap functions
  usb: usbfs: Enforce page requirements for mmap
  pinctrl: meson-axg: add missing GPIOA_18 gpio group
  rbd: move RBD_OBJ_FLAG_COPYUP_ENABLED flag setting
  Bluetooth: Fix use-after-free in hci_remove_ltk/hci_remove_irk
  ceph: fix use-after-free bug for inodes when flushing capsnaps
  can: j1939: avoid possible use-after-free when j1939_can_rx_register fails
  can: j1939: change j1939_netdev_lock type to mutex
  can: j1939: j1939_sk_send_loop_abort(): improved error queue handling in J1939 Socket
  drm/amdgpu: fix xclk freq on CHIP_STONEY
  ALSA: hda/realtek: Add Lenovo P3 Tower platform
  ALSA: hda/realtek: Add a quirk for HP Slim Desktop S01
  Input: psmouse - fix OOB access in Elantech protocol
  Input: xpad - delete a Razer DeathAdder mouse VID/PID entry
  batman-adv: Broken sync while rescheduling delayed work
  bnxt_en: Query default VLAN before VNIC setup on a VF
  lib: cpu_rmap: Fix potential use-after-free in irq_cpu_rmap_release()
  net: sched: fix possible refcount leak in tc_chain_tmplt_add()
  net: sched: move rtm_tca_policy declaration to include file
  rfs: annotate lockless accesses to RFS sock flow table
  rfs: annotate lockless accesses to sk->sk_rxhash
  netfilter: ipset: Add schedule point in call_ad().
  netfilter: conntrack: fix NULL pointer dereference in nf_confirm_cthelper
  Bluetooth: L2CAP: Add missing checks for invalid DCID
  Bluetooth: Fix l2cap_disconnect_req deadlock
  net: dsa: lan9303: allow vid != 0 in port_fdb_{add|del} methods
  neighbour: fix unaligned access to pneigh_entry
  neighbour: Replace zero-length array with flexible-array member
  spi: qup: Request DMA before enabling clocks
  i40e: fix build warnings in i40e_alloc.h
  i40iw: fix build warning in i40iw_manage_apbvt()
  block/blk-iocost (gcc13): keep large values in a new enum
  blk-iocost: avoid 64-bit division in ioc_timer_fn
  Linux 5.4.246
  drm/edid: fix objtool warning in drm_cvt_modes()
  wifi: rtlwifi: 8192de: correct checking of IQK reload
  drm/edid: Fix uninitialized variable in drm_cvt_modes()
  RDMA/bnxt_re: Remove the qp from list only if the qp destroy succeeds
  RDMA/bnxt_re: Remove set but not used variable 'dev_attr'
  scsi: dpt_i2o: Do not process completions with invalid addresses
  scsi: dpt_i2o: Remove broken pass-through ioctl (I2OUSERCMD)
  regmap: Account for register length when chunking
  test_firmware: fix the memory leak of the allocated firmware buffer
  fbcon: Fix null-ptr-deref in soft_cursor
  ext4: add lockdep annotations for i_data_sem for ea_inode's
  ext4: disallow ea_inodes with extended attributes
  ext4: set lockdep subclass for the ea_inode in ext4_xattr_inode_cache_find()
  ext4: add EA_INODE checking to ext4_iget()
  tracing/probe: trace_probe_primary_from_call(): checked list_first_entry
  selinux: don't use make's grouped targets feature yet
  tty: serial: fsl_lpuart: use UARTCTRL_TXINV to send break instead of UARTCTRL_SBK
  mmc: vub300: fix invalid response handling
  wifi: rtlwifi: remove always-true condition pointed out by GCC 12
  lib/dynamic_debug.c: use address-of operator on section symbols
  treewide: Remove uninitialized_var() usage
  kernel/extable.c: use address-of operator on section symbols
  eth: sun: cassini: remove dead code
  gcc-12: disable '-Wdangling-pointer' warning for now
  ACPI: thermal: drop an always true check
  x86/boot: Wrap literal addresses in absolute_pointer()
  flow_dissector: work around stack frame size warning
  ata: libata-scsi: Use correct device no in ata_find_dev()
  scsi: stex: Fix gcc 13 warnings
  misc: fastrpc: reject new invocations during device removal
  misc: fastrpc: return -EPIPE to invocations on device removal
  usb: gadget: f_fs: Add unbind event before functionfs_unbind
  net: usb: qmi_wwan: Set DTR quirk for BroadMobi BM818
  iio: dac: build ad5758 driver when AD5758 is selected
  iio: dac: mcp4725: Fix i2c_master_send() return value handling
  iio: light: vcnl4035: fixed chip ID check
  HID: wacom: avoid integer overflow in wacom_intuos_inout()
  HID: google: add jewel USB id
  iio: adc: mxs-lradc: fix the order of two cleanup operations
  mailbox: mailbox-test: fix a locking issue in mbox_test_message_write()
  atm: hide unused procfs functions
  ALSA: oss: avoid missing-prototype warnings
  netfilter: conntrack: define variables exp_nat_nla_policy and any_addr with CONFIG_NF_NAT
  wifi: b43: fix incorrect __packed annotation
  scsi: core: Decrease scsi_device's iorequest_cnt if dispatch failed
  arm64/mm: mark private VM_FAULT_X defines as vm_fault_t
  ARM: dts: stm32: add pin map for CAN controller on stm32f7
  wifi: rtl8xxxu: fix authentication timeout due to incorrect RCR value
  media: dvb-core: Fix use-after-free due to race condition at dvb_ca_en50221
  media: dvb-core: Fix kernel WARNING for blocking operation in wait_event*()
  media: dvb-core: Fix use-after-free due on race condition at dvb_net
  media: mn88443x: fix !CONFIG_OF error by drop of_match_ptr from ID table
  media: ttusb-dec: fix memory leak in ttusb_dec_exit_dvb()
  media: dvb_ca_en50221: fix a size write bug
  media: netup_unidvb: fix irq init by register it at the end of probe
  media: dvb-usb: dw2102: fix uninit-value in su3000_read_mac_address
  media: dvb-usb: digitv: fix null-ptr-deref in digitv_i2c_xfer()
  media: dvb-usb-v2: rtl28xxu: fix null-ptr-deref in rtl28xxu_i2c_xfer
  media: dvb-usb-v2: ce6230: fix null-ptr-deref in ce6230_i2c_master_xfer()
  media: dvb-usb-v2: ec168: fix null-ptr-deref in ec168_i2c_xfer()
  media: dvb-usb: az6027: fix three null-ptr-deref in az6027_i2c_xfer()
  media: dvb_demux: fix a bug for the continuity counter
  ASoC: ssm2602: Add workaround for playback distortions
  xfrm: Check if_id in inbound policy/secpath match
  ASoC: dwc: limit the number of overrun messages
  nbd: Fix debugfs_create_dir error checking
  fbdev: stifb: Fix info entry in sti_struct on error path
  fbdev: modedb: Add 1920x1080 at 60 Hz video mode
  media: rcar-vin: Select correct interrupt mode for V4L2_FIELD_ALTERNATE
  ARM: 9295/1: unwind:fix unwind abort for uleb128 case
  mailbox: mailbox-test: Fix potential double-free in mbox_test_message_write()
  watchdog: menz069_wdt: fix watchdog initialisation
  mtd: rawnand: marvell: don't set the NAND frequency select
  mtd: rawnand: marvell: ensure timing values are written
  net: dsa: mv88e6xxx: Increase wait after reset deactivation
  net/sched: flower: fix possible OOB write in fl_set_geneve_opt()
  udp6: Fix race condition in udp6_sendmsg & connect
  net/netlink: fix NETLINK_LIST_MEMBERSHIPS length report
  ocfs2/dlm: move BITS_TO_BYTES() to bitops.h for wider use
  net: sched: fix NULL pointer dereference in mq_attach
  net/sched: Prohibit regrafting ingress or clsact Qdiscs
  net/sched: Reserve TC_H_INGRESS (TC_H_CLSACT) for ingress (clsact) Qdiscs
  net/sched: sch_clsact: Only create under TC_H_CLSACT
  net/sched: sch_ingress: Only create under TC_H_INGRESS
  tcp: Return user_mss for TCP_MAXSEG in CLOSE/LISTEN state if user_mss set
  tcp: deny tcp_disconnect() when threads are waiting
  af_packet: do not use READ_ONCE() in packet_bind()
  mtd: rawnand: ingenic: fix empty stub helper definitions
  amd-xgbe: fix the false linkup in xgbe_phy_status
  af_packet: Fix data-races of pkt_sk(sk)->num.
  netrom: fix info-leak in nr_write_internal()
  net/mlx5: fw_tracer, Fix event handling
  dmaengine: pl330: rename _start to prevent build error
  iommu/amd: Don't block updates to GATag if guest mode is on
  iommu/rockchip: Fix unwind goto issue
  RDMA/bnxt_re: Fix return value of bnxt_re_process_raw_qp_pkt_rx
  RDMA/bnxt_re: Refactor queue pair creation code
  RDMA/bnxt_re: Enable SRIOV VF support on Broadcom's 57500 adapter series
  RDMA/efa: Fix unsupported page sizes in device
  Linux 5.4.245
  netfilter: ctnetlink: Support offloaded conntrack entry deletion
  ipv{4,6}/raw: fix output xfrm lookup wrt protocol
  binder: fix UAF caused by faulty buffer cleanup
  bluetooth: Add cmd validity checks at the start of hci_sock_ioctl()
  io_uring: have io_kill_timeout() honor the request references
  io_uring: don't drop completion lock before timer is fully initialized
  io_uring: always grab lock in io_cancel_async_work()
  cdc_ncm: Fix the build warning
  net/mlx5: Devcom, serialize devcom registration
  net/mlx5: devcom only supports 2 ports
  fs: fix undefined behavior in bit shift for SB_NOUSER
  power: supply: bq24190: Call power_supply_changed() after updating input current
  power: supply: core: Refactor power_supply_set_input_current_limit_from_supplier()
  power: supply: bq27xxx: After charger plug in/out wait 0.5s for things to stabilize
  net: cdc_ncm: Deal with too low values of dwNtbOutMaxSize
  cdc_ncm: Implement the 32-bit version of NCM Transfer Block
  Linux 5.4.244
  3c589_cs: Fix an error handling path in tc589_probe()
  net/mlx5: Devcom, fix error flow in mlx5_devcom_register_device
  net/mlx5: Fix error message when failing to allocate device memory
  forcedeth: Fix an error handling path in nv_probe()
  ASoC: Intel: Skylake: Fix declaration of enum skl_ch_cfg
  x86/show_trace_log_lvl: Ensure stack pointer is aligned, again
  xen/pvcalls-back: fix double frees with pvcalls_new_active_socket()
  coresight: Fix signedness bug in tmc_etr_buf_insert_barrier_packet()
  power: supply: sbs-charger: Fix INHIBITED bit for Status reg
  power: supply: bq27xxx: Fix poll_interval handling and races on remove
  power: supply: bq27xxx: Fix I2C IRQ race on remove
  power: supply: bq27xxx: Fix bq27xxx_battery_update() race condition
  power: supply: leds: Fix blink to LED on transition
  ipv6: Fix out-of-bounds access in ipv6_find_tlv()
  bpf: Fix mask generation for 32-bit narrow loads of 64-bit fields
  selftests: fib_tests: mute cleanup error message
  net: fix skb leak in __skb_tstamp_tx()
  media: radio-shark: Add endpoint checks
  USB: sisusbvga: Add endpoint checks
  USB: core: Add routines for endpoint checks in old drivers
  udplite: Fix NULL pointer dereference in __sk_mem_raise_allocated().
  net: fix stack overflow when LRO is disabled for virtual interfaces
  fbdev: udlfb: Fix endpoint check
  debugobjects: Don't wake up kswapd from fill_pool()
  x86/topology: Fix erroneous smp_num_siblings on Intel Hybrid platforms
  parisc: Fix flush_dcache_page() for usage from irq context
  selftests/memfd: Fix unknown type name build failure
  x86/mm: Avoid incomplete Global INVLPG flushes
  btrfs: use nofs when cleaning up aborted transactions
  gpio: mockup: Fix mode of debugfs files
  parisc: Allow to reboot machine after system halt
  parisc: Handle kgdb breakpoints only in kernel context
  m68k: Move signal frame following exception on 68020/030
  ALSA: hda/realtek: Enable headset onLenovo M70/M90
  ALSA: hda/ca0132: add quirk for EVGA X299 DARK
  mt76: mt7615: Fix build with older compilers
  spi: fsl-cpm: Use 16 bit mode for large transfers with even size
  spi: fsl-spi: Re-organise transfer bits_per_word adaptation
  watchdog: sp5100_tco: Immediately trigger upon starting.
  s390/qdio: fix do_sqbs() inline assembly constraint
  s390/qdio: get rid of register asm
  vc_screen: reload load of struct vc_data pointer in vcs_write() to avoid UAF
  vc_screen: rewrite vcs_size to accept vc, not inode
  usb: gadget: u_ether: Fix host MAC address case
  usb: gadget: u_ether: Convert prints to device prints
  lib/string_helpers: Introduce string_upper() and string_lower() helpers
  HID: wacom: add three styli to wacom_intuos_get_tool_type
  HID: wacom: Add new Intuos Pro Small (PTH-460) device IDs
  HID: wacom: Force pen out of prox if no events have been received in a while
  netfilter: nf_tables: hold mutex on netns pre_exit path
  netfilter: nf_tables: validate NFTA_SET_ELEM_OBJREF based on NFT_SET_OBJECT flag
  netfilter: nf_tables: stricter validation of element data
  netfilter: nf_tables: allow up to 64 bytes in the set element data area
  netfilter: nf_tables: add nft_setelem_parse_key()
  netfilter: nf_tables: validate registers coming from userspace.
  netfilter: nftables: statify nft_parse_register()
  netfilter: nftables: add nft_parse_register_store() and use it
  netfilter: nftables: add nft_parse_register_load() and use it
  nilfs2: fix use-after-free bug of nilfs_root in nilfs_evict_inode()
  powerpc/64s/radix: Fix soft dirty tracking
  tpm/tpm_tis: Disable interrupts for more Lenovo devices
  ceph: force updating the msg pointer in non-split case
  serial: Add support for Advantech PCI-1611U card
  statfs: enforce statfs[64] structure initialization
  KVM: x86: do not report a vCPU as preempted outside instruction boundaries
  can: kvaser_pciefd: Disable interrupts in probe error path
  can: kvaser_pciefd: Do not send EFLUSH command on TFD interrupt
  can: kvaser_pciefd: Clear listen-only bit if not explicitly requested
  can: kvaser_pciefd: Empty SRB buffer in probe
  can: kvaser_pciefd: Call request_irq() before enabling interrupts
  can: kvaser_pciefd: Set CAN_STATE_STOPPED in kvaser_pciefd_stop()
  can: j1939: recvmsg(): allow MSG_CMSG_COMPAT flag
  ALSA: hda/realtek: Add quirk for 2nd ASUS GU603
  ALSA: hda/realtek: Add a quirk for HP EliteDesk 805
  ALSA: hda: Add NVIDIA codec IDs a3 through a7 to patch table
  ALSA: hda: Fix Oops by 9.1 surround channel names
  usb: typec: altmodes/displayport: fix pin_assignment_show
  usb: dwc3: debugfs: Resume dwc3 before accessing registers
  USB: UHCI: adjust zhaoxin UHCI controllers OverCurrent bit value
  usb-storage: fix deadlock when a scsi command timeouts more than once
  USB: usbtmc: Fix direction for 0-length ioctl control messages
  vlan: fix a potential uninit-value in vlan_dev_hard_start_xmit()
  igb: fix bit_shift to be in [1..8] range
  cassini: Fix a memory leak in the error handling path of cas_init_one()
  wifi: iwlwifi: mvm: don't trust firmware n_channels
  net: bcmgenet: Restore phy_stop() depending upon suspend/close
  net: bcmgenet: Remove phy_stop() from bcmgenet_netif_stop()
  net: nsh: Use correct mac_offset to unwind gso skb in nsh_gso_segment()
  drm/exynos: fix g2d_open/close helper function definitions
  media: netup_unidvb: fix use-after-free at del_timer()
  net: hns3: fix reset delay time to avoid configuration timeout
  net: hns3: fix sending pfc frames after reset issue
  erspan: get the proto with the md version for collect_md
  ip_gre, ip6_gre: Fix race condition on o_seqno in collect_md mode
  ip6_gre: Make o_seqno start from 0 in native mode
  ip6_gre: Fix skb_under_panic in __gre6_xmit()
  serial: arc_uart: fix of_iomap leak in `arc_serial_probe`
  vsock: avoid to close connected socket after the timeout
  ALSA: firewire-digi00x: prevent potential use after free
  net: fec: Better handle pm_runtime_get() failing in .remove()
  af_key: Reject optional tunnel/BEET mode templates in outbound policies
  cpupower: Make TSC read per CPU for Mperf monitor
  ASoC: fsl_micfil: register platform component before registering cpu dai
  btrfs: fix space cache inconsistency after error loading it from disk
  btrfs: replace calls to btrfs_find_free_ino with btrfs_find_free_objectid
  mfd: dln2: Fix memory leak in dln2_probe()
  phy: st: miphy28lp: use _poll_timeout functions for waits
  Input: xpad - add constants for GIP interface numbers
  iommu/arm-smmu-v3: Acknowledge pri/event queue overflow if any
  clk: tegra20: fix gcc-7 constant overflow warning
  RDMA/core: Fix multiple -Warray-bounds warnings
  recordmcount: Fix memory leaks in the uwrite function
  sched: Fix KCSAN noinstr violation
  mcb-pci: Reallocate memory region to avoid memory overlapping
  serial: 8250: Reinit port->pm on port specific driver unbind
  usb: typec: tcpm: fix multiple times discover svids error
  HID: wacom: generic: Set battery quirk only when we see battery data
  spi: spi-imx: fix MX51_ECSPI_* macros when cs > 3
  HID: logitech-hidpp: Reconcile USB and Unifying serials
  HID: logitech-hidpp: Don't use the USB serial for USB devices
  staging: rtl8192e: Replace macro RTL_PCI_DEVICE with PCI_DEVICE
  Bluetooth: L2CAP: fix "bad unlock balance" in l2cap_disconnect_rsp
  wifi: iwlwifi: dvm: Fix memcpy: detected field-spanning write backtrace
  wifi: iwlwifi: pcie: Fix integer overflow in iwl_write_to_user_buf
  wifi: iwlwifi: pcie: fix possible NULL pointer dereference
  samples/bpf: Fix fout leak in hbm's run_bpf_prog
  f2fs: fix to drop all dirty pages during umount() if cp_error is set
  ext4: Fix best extent lstart adjustment logic in ext4_mb_new_inode_pa()
  ext4: set goal start correctly in ext4_mb_normalize_request
  gfs2: Fix inode height consistency check
  scsi: message: mptlan: Fix use after free bug in mptlan_remove() due to race condition
  lib: cpu_rmap: Avoid use after free on rmap->obj array entries
  scsi: target: iscsit: Free cmds before session free
  net: Catch invalid index in XPS mapping
  net: pasemi: Fix return type of pasemi_mac_start_tx()
  scsi: lpfc: Prevent lpfc_debugfs_lockstat_write() buffer overflow
  ext2: Check block size validity during mount
  wifi: brcmfmac: cfg80211: Pass the PMK in binary instead of hex
  ACPICA: ACPICA: check null return of ACPI_ALLOCATE_ZEROED in acpi_db_display_objects
  ACPICA: Avoid undefined behavior: applying zero offset to null pointer
  drm/tegra: Avoid potential 32-bit integer overflow
  ACPI: EC: Fix oops when removing custom query handlers
  firmware: arm_sdei: Fix sleep from invalid context BUG
  memstick: r592: Fix UAF bug in r592_remove due to race condition
  regmap: cache: Return error in cache sync operations for REGCACHE_NONE
  drm/amd/display: Use DC_LOG_DC in the trasform pixel function
  fs: hfsplus: remove WARN_ON() from hfsplus_cat_{read,write}_inode()
  af_unix: Fix data races around sk->sk_shutdown.
  af_unix: Fix a data race of sk->sk_receive_queue->qlen.
  net: datagram: fix data-races in datagram_poll()
  ipvlan:Fix out-of-bounds caused by unclear skb->cb
  net: add vlan_get_protocol_and_depth() helper
  net: tap: check vlan with eth_type_vlan() method
  net: annotate sk->sk_err write from do_recvmmsg()
  netlink: annotate accesses to nlk->cb_running
  netfilter: conntrack: fix possible bug_on with enable_hooks=1
  net: Fix load-tearing on sk->sk_stamp in sock_recv_cmsgs().
  linux/dim: Do nothing if no time delta between samples
  ARM: 9296/1: HP Jornada 7XX: fix kernel-doc warnings
  drm/mipi-dsi: Set the fwnode for mipi_dsi_device
  driver core: add a helper to setup both the of_node and fwnode of a device
  Linux 5.4.243
  drm/amd/display: Fix hang when skipping modeset
  mm/page_alloc: fix potential deadlock on zonelist_update_seq seqlock
  drm/exynos: move to use request_irq by IRQF_NO_AUTOEN flag
  drm/msm/adreno: Fix null ptr access in adreno_gpu_cleanup()
  firmware: raspberrypi: fix possible memory leak in rpi_firmware_probe()
  drm/msm: Fix double pm_runtime_disable() call
  PM: domains: Restore comment indentation for generic_pm_domain.child_links
  printk: declare printk_deferred_{enter,safe}() in include/linux/printk.h
  PCI: pciehp: Fix AB-BA deadlock between reset_lock and device_lock
  PCI: pciehp: Use down_read/write_nested(reset_lock) to fix lockdep errors
  drbd: correctly submit flush bio on barrier
  serial: 8250: Fix serial8250_tx_empty() race with DMA Tx
  tty: Prevent writing chars during tcsetattr TCSADRAIN/FLUSH
  ext4: fix invalid free tracking in ext4_xattr_move_to_block()
  ext4: remove a BUG_ON in ext4_mb_release_group_pa()
  ext4: bail out of ext4_xattr_ibody_get() fails for any reason
  ext4: add bounds checking in get_max_inline_xattr_value_size()
  ext4: fix deadlock when converting an inline directory in nojournal mode
  ext4: improve error recovery code paths in __ext4_remount()
  ext4: fix data races when using cached status extents
  ext4: avoid a potential slab-out-of-bounds in ext4_group_desc_csum
  ext4: fix WARNING in mb_find_extent
  HID: wacom: insert timestamp to packed Bluetooth (BT) events
  HID: wacom: Set a default resolution for older tablets
  drm/amdgpu: disable sdma ecc irq only when sdma RAS is enabled in suspend
  drm/amdgpu/gfx: disable gfx9 cp_ecc_error_irq only when enabling legacy gfx ras
  drm/amdgpu: fix an amdgpu_irq_put() issue in gmc_v9_0_hw_fini()
  drm/panel: otm8009a: Set backlight parent to panel device
  f2fs: fix potential corruption when moving a directory
  ARM: dts: s5pv210: correct MIPI CSIS clock name
  ARM: dts: exynos: fix WM8960 clock name in Itop Elite
  remoteproc: st: Call of_node_put() on iteration error
  remoteproc: stm32: Call of_node_put() on iteration error
  sh: nmi_debug: fix return value of __setup handler
  sh: init: use OF_EARLY_FLATTREE for early init
  sh: math-emu: fix macro redefined warning
  inotify: Avoid reporting event with invalid wd
  platform/x86: touchscreen_dmi: Add info for the Dexp Ursus KX210i
  cifs: fix pcchunk length type in smb2_copychunk_range
  btrfs: print-tree: parent bytenr must be aligned to sector size
  btrfs: don't free qgroup space unless specified
  btrfs: fix btrfs_prev_leaf() to not return the same key twice
  perf symbols: Fix return incorrect build_id size in elf_read_build_id()
  perf map: Delete two variable initialisations before null pointer checks in sort__sym_from_cmp()
  perf vendor events power9: Remove UTF-8 characters from JSON files
  virtio_net: suppress cpu stall when free_unused_bufs
  virtio_net: split free_unused_bufs()
  net: dsa: mt7530: fix corrupt frames using trgmii on 40 MHz XTAL MT7621
  ALSA: caiaq: input: Add error handling for unsupported input methods in `snd_usb_caiaq_input_init`
  drm/amdgpu: add a missing lock for AMDGPU_SCHED
  af_packet: Don't send zero-byte data in packet_sendmsg_spkt().
  ionic: remove noise from ethtool rxnfc error msg
  rxrpc: Fix hard call timeout units
  net/sched: act_mirred: Add carrier check
  writeback: fix call of incorrect macro
  net: dsa: mv88e6xxx: add mv88e6321 rsvd2cpu
  sit: update dev->needed_headroom in ipip6_tunnel_bind_dev()
  net/sched: cls_api: remove block_cb from driver_list before freeing
  net/ncsi: clear Tx enable mode when handling a Config required AEN
  relayfs: fix out-of-bounds access in relay_file_read
  kernel/relay.c: fix read_pos error when multiple readers
  crypto: safexcel - Cleanup ring IRQ workqueues on load failure
  crypto: inside-secure - irq balance
  dm verity: fix error handling for check_at_most_once on FEC
  dm verity: skip redundant verity_handle_err() on I/O errors
  mailbox: zynqmp: Fix counts of child nodes
  mailbox: zynq: Switch to flexible array to simplify code
  tick/nohz: Fix cpu_is_hotpluggable() by checking with nohz subsystem
  nohz: Add TICK_DEP_BIT_RCU
  netfilter: nf_tables: deactivate anonymous set from preparation phase
  debugobject: Ensure pool refill (again)
  perf intel-pt: Fix CYC timestamps after standalone CBR
  perf auxtrace: Fix address filter entire kernel size
  dm ioctl: fix nested locking in table_clear() to remove deadlock concern
  dm flakey: fix a crash with invalid table line
  dm integrity: call kmem_cache_destroy() in dm_integrity_init() error path
  dm clone: call kmem_cache_destroy() in dm_clone_init() error path
  s390/dasd: fix hanging blockdevice after request requeue
  btrfs: scrub: reject unsupported scrub flags
  scripts/gdb: fix lx-timerlist for Python3
  clk: rockchip: rk3399: allow clk_cifout to force clk_cifout_src to reparent
  wifi: rtl8xxxu: RTL8192EU always needs full init
  mailbox: zynqmp: Fix typo in IPI documentation
  mailbox: zynqmp: Fix IPI isr handling
  md/raid10: fix null-ptr-deref in raid10_sync_request
  nilfs2: fix infinite loop in nilfs_mdt_get_block()
  nilfs2: do not write dirty data after degenerating to read-only
  parisc: Fix argument pointer in real64_call_asm()
  afs: Fix updating of i_size with dv jump from server
  dmaengine: at_xdmac: do not enable all cyclic channels
  dmaengine: dw-edma: Fix to enable to issue dma request on DMA processing
  dmaengine: dw-edma: Fix to change for continuous transfer
  phy: tegra: xusb: Add missing tegra_xusb_port_unregister for usb2_port and ulpi_port
  pwm: mtk-disp: Disable shadow registers before setting backlight values
  pwm: mtk-disp: Adjust the clocks to avoid them mismatch
  pwm: mtk-disp: Don't check the return code of pwmchip_remove()
  dmaengine: mv_xor_v2: Fix an error code.
  leds: TI_LMU_COMMON: select REGMAP instead of depending on it
  ext4: fix use-after-free read in ext4_find_extent for bigalloc + inline
  openrisc: Properly store r31 to pt_regs on unhandled exceptions
  clocksource/drivers/davinci: Fix memory leak in davinci_timer_register when init fails
  clocksource: davinci: axe a pointless __GFP_NOFAIL
  clocksource/drivers/davinci: Avoid trailing '\n' hidden in pr_fmt()
  RDMA/mlx5: Use correct device num_ports when modify DC
  SUNRPC: remove the maximum number of retries in call_bind_status
  Input: raspberrypi-ts - fix refcount leak in rpi_ts_probe
  input: raspberrypi-ts: Release firmware handle when not needed
  firmware: raspberrypi: Introduce devm_rpi_firmware_get()
  firmware: raspberrypi: Keep count of all consumers
  NFSv4.1: Always send a RECLAIM_COMPLETE after establishing lease
  IB/hfi1: Fix SDMA mmu_rb_node not being evicted in LRU order
  RDMA/siw: Remove namespace check from siw_netdev_event()
  clk: add missing of_node_put() in "assigned-clocks" property parsing
  power: supply: generic-adc-battery: fix unit scaling
  rtc: meson-vrtc: Use ktime_get_real_ts64() to get the current time
  RDMA/mlx4: Prevent shift wrapping in set_user_sq_size()
  rtc: omap: include header for omap_rtc_power_off_program prototype
  RDMA/rdmavt: Delete unnecessary NULL check
  RDMA/siw: Fix potential page_array out of range access
  perf/core: Fix hardlockup failure caused by perf throttle
  powerpc/rtas: use memmove for potentially overlapping buffer copy
  macintosh: via-pmu-led: requires ATA to be set
  powerpc/sysdev/tsi108: fix resource printk format warnings
  powerpc/wii: fix resource printk format warnings
  powerpc/mpc512x: fix resource printk format warning
  macintosh/windfarm_smu_sat: Add missing of_node_put()
  spmi: Add a check for remove callback when removing a SPMI driver
  staging: rtl8192e: Fix W_DISABLE# does not work after stop/start
  serial: 8250: Add missing wakeup event reporting
  tty: serial: fsl_lpuart: adjust buffer length to the intended size
  firmware: stratix10-svc: Fix an NULL vs IS_ERR() bug in probe
  usb: mtu3: fix kernel panic at qmu transfer done irq handler
  usb: chipidea: fix missing goto in `ci_hdrc_probe`
  sh: sq: Fix incorrect element size for allocating bitmap buffer
  uapi/linux/const.h: prefer ISO-friendly __typeof__
  spi: cadence-quadspi: fix suspend-resume implementations
  mtd: spi-nor: cadence-quadspi: Handle probe deferral while requesting DMA channel
  mtd: spi-nor: cadence-quadspi: Don't initialize rx_dma_complete on failure
  mtd: spi-nor: cadence-quadspi: Provide a way to disable DAC mode
  mtd: spi-nor: cadence-quadspi: Make driver independent of flash geometry
  scripts/gdb: bail early if there are no generic PD
  PM: domains: Fix up terminology with parent/child
  scripts/gdb: bail early if there are no clocks
  ia64: salinfo: placate defined-but-not-used warning
  ia64: mm/contig: fix section mismatch warning/error
  of: Fix modalias string generation
  vmci_host: fix a race condition in vmci_host_poll() causing GPF
  spi: fsl-spi: Fix CPM/QE mode Litte Endian
  spi: qup: Don't skip cleanup in remove's error path
  linux/vt_buffer.h: allow either builtin or modular for macros
  ASoC: es8316: Handle optional IRQ assignment
  ASoC: es8316: Use IRQF_NO_AUTOEN when requesting the IRQ
  genirq: Add IRQF_NO_AUTOEN for request_irq/nmi()
  PCI: imx6: Install the fault handler only on compatible match
  usb: gadget: udc: renesas_usb3: Fix use after free bug in renesas_usb3_remove due to race condition
  iio: light: max44009: add missing OF device matching
  fpga: bridge: fix kernel-doc parameter description
  usb: host: xhci-rcar: remove leftover quirk handling
  pstore: Revert pmsg_lock back to a normal mutex
  tcp/udp: Fix memleaks of sk and zerocopy skbs with TX timestamp.
  net: amd: Fix link leak when verifying config failed
  netlink: Use copy_to_user() for optval in netlink_getsockopt().
  Revert "Bluetooth: btsdio: fix use after free bug in btsdio_remove due to unfinished work"
  ipv4: Fix potential uninit variable access bug in __ip_make_skb()
  netfilter: nf_tables: don't write table validation state without mutex
  bpf: Don't EFAULT for getsockopt with optval=NULL
  ixgbe: Enable setting RSS table to default values
  ixgbe: Allow flow hash to be set via ethtool
  wifi: iwlwifi: mvm: check firmware response size
  wifi: iwlwifi: make the loop for card preparation effective
  md/raid10: fix memleak of md thread
  md: update the optimal I/O size on reshape
  md/raid10: fix memleak for 'conf->bio_split'
  md/raid10: fix leak of 'r10bio->remaining' for recovery
  bpf, sockmap: Revert buggy deadlock fix in the sockhash and sockmap
  nvme-fcloop: fix "inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage"
  nvme: fix async event trace event
  nvme: handle the persistent internal error AER
  bpf, sockmap: fix deadlocks in the sockhash and sockmap
  scsi: lpfc: Fix ioremap issues in lpfc_sli4_pci_mem_setup()
  crypto: drbg - Only fail when jent is unavailable in FIPS mode
  crypto: drbg - make drbg_prepare_hrng() handle jent instantiation errors
  bpftool: Fix bug for long instructions in program CFG dumps
  wifi: rtlwifi: fix incorrect error codes in rtl_debugfs_set_write_reg()
  wifi: rtlwifi: fix incorrect error codes in rtl_debugfs_set_write_rfreg()
  rtlwifi: Replace RT_TRACE with rtl_dbg
  rtlwifi: Start changing RT_TRACE into rtl_dbg
  f2fs: handle dqget error in f2fs_transfer_project_quota()
  scsi: megaraid: Fix mega_cmd_done() CMDID_INT_CMDS
  scsi: target: iscsit: Fix TAS handling during conn cleanup
  net/packet: convert po->auxdata to an atomic flag
  net/packet: convert po->origdev to an atomic flag
  net/packet: annotate accesses to po->xmit
  vlan: partially enable SIOCSHWTSTAMP in container
  scm: fix MSG_CTRUNC setting condition for SO_PASSSEC
  wifi: rtw88: mac: Return the original error from rtw_mac_power_switch()
  wifi: rtw88: mac: Return the original error from rtw_pwr_seq_parser()
  tools: bpftool: Remove invalid \' json escape
  wifi: ath6kl: reduce WARN to dev_dbg() in callback
  wifi: ath5k: fix an off by one check in ath5k_eeprom_read_freq_list()
  wifi: ath9k: hif_usb: fix memory leak of remain_skbs
  wifi: ath6kl: minor fix for allocation size
  tick/common: Align tick period with the HZ tick.
  tick: Get rid of tick_period
  tick/sched: Optimize tick_do_update_jiffies64() further
  tick/sched: Reduce seqcount held scope in tick_do_update_jiffies64()
  tick/sched: Use tick_next_period for lockless quick check
  timekeeping: Split jiffies seqlock
  debugobject: Prevent init race with static objects
  arm64: kgdb: Set PSTATE.SS to 1 to re-enable single-step
  x86/ioapic: Don't return 0 from arch_dynirq_lower_bound()
  regulator: stm32-pwr: fix of_iomap leak
  media: rc: gpio-ir-recv: Fix support for wake-up
  media: rcar_fdp1: Fix refcount leak in probe and remove function
  media: rcar_fdp1: Fix the correct variable assignments
  media: rcar_fdp1: Make use of the helper function devm_platform_ioremap_resource()
  media: rcar_fdp1: fix pm_runtime_get_sync() usage count
  media: rcar_fdp1: simplify error check logic at fdp_open()
  media: saa7134: fix use after free bug in saa7134_finidev due to race condition
  media: dm1105: Fix use after free bug in dm1105_remove due to race condition
  x86/apic: Fix atomic update of offset in reserve_eilvt_offset()
  regulator: core: Avoid lockdep reports when resolving supplies
  regulator: core: Consistently set mutex_owner when using ww_mutex_lock_slow()
  drm/lima/lima_drv: Add missing unwind goto in lima_pdev_probe()
  mmc: sdhci-of-esdhc: fix quirk to ignore command inhibit for data
  drm/msm/adreno: drop bogus pm_runtime_set_active()
  drm/msm/adreno: Defer enabling runpm until hw_init()
  drm/msm: fix unbalanced pm_runtime_enable in adreno_gpu_{init, cleanup}
  firmware: qcom_scm: Clear download bit during reboot
  media: av7110: prevent underflow in write_ts_to_decoder()
  media: uapi: add MEDIA_BUS_FMT_METADATA_FIXED media bus format.
  media: bdisp: Add missing check for create_workqueue
  ARM: dts: qcom: ipq8064: Fix the PCI I/O port range
  ARM: dts: qcom: ipq8064: reduce pci IO size to 64K
  ARM: dts: qcom: ipq4019: Fix the PCI I/O port range
  EDAC/skx: Fix overflows on the DRAM row address mapping arrays
  arm64: dts: renesas: r8a774c0: Remove bogus voltages from OPP table
  arm64: dts: renesas: r8a77990: Remove bogus voltages from OPP table
  drm/probe-helper: Cancel previous job before starting new one
  drm/vgem: add missing mutex_destroy
  drm/rockchip: Drop unbalanced obj unref
  erofs: fix potential overflow calculating xattr_isize
  erofs: stop parsing non-compact HEAD index if clusterofs is invalid
  tpm, tpm_tis: Do not skip reset of original interrupt vector
  selinux: ensure av_permissions.h is built when needed
  selinux: fix Makefile dependencies of flask.h
  ubifs: Free memory for tmpfile name
  ubi: Fix return value overwrite issue in try_write_vid_and_data()
  ubifs: Fix memleak when insert_old_idx() failed
  Revert "ubifs: dirty_cow_znode: Fix memleak in error handling path"
  i2c: omap: Fix standard mode false ACK readings
  KVM: nVMX: Emulate NOPs in L2, and PAUSE if it's not intercepted
  reiserfs: Add security prefix to xattr name in reiserfs_security_write()
  ring-buffer: Sync IRQ works before buffer destruction
  pwm: meson: Fix g12a ao clk81 name
  pwm: meson: Fix axg ao mux parents
  kheaders: Use array declaration instead of char
  ipmi: fix SSIF not responding under certain cond.
  ipmi:ssif: Add send_retries increment
  MIPS: fw: Allow firmware to pass a empty env
  xhci: fix debugfs register accesses while suspended
  debugfs: regset32: Add Runtime PM support
  staging: iio: resolver: ads1210: fix config mode
  perf sched: Cast PTHREAD_STACK_MIN to int as it may turn into sysconf(__SC_THREAD_STACK_MIN_VALUE)
  USB: dwc3: fix runtime pm imbalance on unbind
  USB: dwc3: fix runtime pm imbalance on probe errors
  asm-generic/io.h: suppress endianness warnings for readq() and writeq()
  ASoC: Intel: bytcr_rt5640: Add quirk for the Acer Iconia One 7 B1-750
  iio: adc: palmas_gpadc: fix NULL dereference on rmmod
  USB: serial: option: add UNISOC vendor and TOZED LT70C product
  bluetooth: Perform careful capability checks in hci_sock_ioctl()
  drm/fb-helper: set x/yres_virtual in drm_fb_helper_check_var
  wifi: brcmfmac: slab-out-of-bounds read in brcmf_get_assoc_ies()
  counter: 104-quad-8: Fix race condition between FLAG and CNTR reads

Conflicts:
	drivers/firmware/qcom_scm.c
	drivers/md/dm-verity-target.c
	drivers/usb/dwc3/core.c
	drivers/usb/dwc3/debugfs.c
	drivers/usb/gadget/function/f_fs.c

Change-Id: I0d6315cadf7c3458e54bee2de89bd92b968060f7
Signed-off-by: kamasali Satyanarayan <quic_kamasali@quicinc.com>
2023-09-28 16:45:28 +05:30

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case NUMA_NO_NODE here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non-default
 *		process policy.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

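/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how userspace drives the policies described above through the
 * set_mempolicy(2) and mbind(2) wrappers from libnuma's <numaif.h>.
 * Assumes a machine with at least two memory nodes; error handling is
 * elided; link with -lnuma.
 */
#if 0	/* example only, never compiled as part of this file */
#include <numaif.h>
#include <sys/mman.h>
#include <stddef.h>

static void mempolicy_example(void)
{
	unsigned long interleave_nodes = 0x3;	/* nodes 0 and 1 */
	unsigned long bind_node = 0x2;		/* node 1 only */
	size_t len = 1 << 20;
	void *buf;

	/* Process policy: interleave future allocations over nodes 0 and 1. */
	set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes,
		      sizeof(interleave_nodes) * 8);

	/* VMA policy: bind one mapping to node 1; beats the process policy. */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	mbind(buf, len, MPOL_BIND, &bind_node, sizeof(bind_node) * 8, 0);

	/* Restore the default (local) process policy. */
	set_mempolicy(MPOL_DEFAULT, NULL, 0);
}
#endif
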
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;

	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

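/*
 * Worked example (editorial note, not from the original source): with a
 * user-relative mask *orig = {0,2} and an allowed set *rel = {4,5,6}
 * (weight 3), nodes_fold() wraps the relative bits modulo 3, giving
 * tmp = {0,2}, and nodes_onto() then maps those bit positions onto the
 * members of *rel, so *ret = {4,6}: the 0th and 2nd allowed nodes.
 */
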
static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.  But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (pol == NULL)
		return 0;
	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */
	else {
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
		else
			nodes_and(nsc->mask2, *nodes, nsc->mask1);

		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
						cpuset_current_mems_allowed;
	}

	if (nodes)
		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	else
		ret = mpol_ops[pol->mode].create(pol, NULL);
	return ret;
}

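/*
 * Editorial sketch of the expected call sequence, modelled on callers such
 * as do_set_mempolicy() later in this file (error paths abbreviated):
 */
#if 0	/* example only, never compiled as part of this file */
static long set_policy_sketch(unsigned short mode, unsigned short flags,
			      nodemask_t *nodes)
{
	NODEMASK_SCRATCH(scratch);
	struct mempolicy *new;
	long ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);	/* validate and allocate */
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	task_lock(current);			/* protects mems_allowed */
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}
	mpol_put(current->mempolicy);
	current->mempolicy = new;
	task_unlock(current);
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}
#endif
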
/*
 * This function just creates a new policy, does some check and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
		mode = MPOL_PREFERRED;
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}

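/*
 * Editorial note: MPOL_LOCAL is not stored as its own mode.  mpol_new()
 * rewrites it to MPOL_PREFERRED with an empty nodemask, and
 * mpol_set_nodemask() then passes nodes == NULL to mpol_new_preferred(),
 * which sets MPOL_F_LOCAL.  "Local allocation" is therefore represented
 * as MPOL_PREFERRED | MPOL_F_LOCAL throughout this file.
 */
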
/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->v.nodes = tmp;
}

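/*
 * Worked example (editorial note): a task runs MPOL_BIND over {1,2} and its
 * cpuset's mems_allowed changes from {1,2} to {3,4}.  With no mode flags the
 * default branch remaps node-for-node via nodes_remap(), yielding {3,4}.
 * With MPOL_F_STATIC_NODES the policy keeps the user's literal mask, the
 * intersection {1,2} & {3,4} is empty, and the fallback above resets the
 * effective mask to the full new set {3,4}.
 */
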
static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes)) {
			pol->v.preferred_node = node;
			pol->flags &= ~MPOL_F_LOCAL;
		} else
			pol->flags |= MPOL_F_LOCAL;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (!(pol->flags & MPOL_F_LOCAL)) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_sem. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol || pol->mode == MPOL_LOCAL)
		return;
	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		vm_write_begin(vma);
		mpol_rebind_policy(vma->vm_policy, new);
		vm_write_end(vma);
	}
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

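/*
 * Editorial note: MPOL_DEFAULT deliberately has no .create hook.  mpol_new()
 * returns NULL for that mode, so mpol_set_nodemask() bails out early and
 * never dispatches through this table for it.
 */
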
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	struct vm_area_struct *prev;
};

/*
* Check if the page's nid is in qp->nmask.
*
* If MPOL_MF_INVERT is set in qp->flags, check if the nid is
* in the invert of qp->nmask.
*/
static inline bool queue_pages_required(struct page *page,
struct queue_pages *qp)
{
int nid = page_to_nid(page);
unsigned long flags = qp->flags;
return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}
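/*
 * Illustrative example (not from the original source): with
 * qp->nmask = {1,2}, a page on node 0 fails this check by default but
 * passes it when MPOL_MF_INVERT is set in qp->flags.
 */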
/*
* queue_pages_pmd() has four possible return values:
* 0 - pages are placed on the right node or queued successfully.
* 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* 2 - THP was split.
* -EIO - a migration entry was encountered, or only MPOL_MF_STRICT was
* specified and an existing page was already on a node that does
* not follow the policy.
*/
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
int ret = 0;
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags;
if (unlikely(is_pmd_migration_entry(*pmd))) {
ret = -EIO;
goto unlock;
}
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
ret = 2;
goto out;
}
if (!queue_pages_required(page, qp))
goto unlock;
flags = qp->flags;
/* go to thp migration */
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
if (!vma_migratable(walk->vma) ||
migrate_page_add(page, qp->pagelist, flags)) {
ret = 1;
goto unlock;
}
} else
ret = -EIO;
unlock:
spin_unlock(ptl);
out:
return ret;
}
/*
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
*
* queue_pages_pte_range() has three possible return values:
* 0 - pages are placed on the right node or queued successfully.
* 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* -EIO - only MPOL_MF_STRICT was specified and an existing page was already
* on a node that does not follow the policy.
*/
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
int ret;
bool has_unmovable = false;
pte_t *pte, *mapped_pte;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
if (ret != 2)
return ret;
}
/* THP was split, fall through to pte walk */
if (pmd_trans_unstable(pmd))
return 0;
mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
if (!page)
continue;
/*
* vm_normal_page() filters out zero pages, but there might
* still be PageReserved pages to skip, perhaps in a VDSO.
*/
if (PageReserved(page))
continue;
if (!queue_pages_required(page, qp))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
/* MPOL_MF_STRICT must be specified if we get here */
if (!vma_migratable(vma)) {
has_unmovable = true;
break;
}
/*
* Do not abort immediately since there may be pages that are
* temporarily off the LRU in the range. We still need to
* migrate the other LRU pages.
*/
if (migrate_page_add(page, qp->pagelist, flags))
has_unmovable = true;
} else
break;
}
pte_unmap_unlock(mapped_pte, ptl);
cond_resched();
if (has_unmovable)
return 1;
return addr != end ? -EIO : 0;
}
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
struct page *page;
spinlock_t *ptl;
pte_t entry;
ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
entry = huge_ptep_get(pte);
if (!pte_present(entry))
goto unlock;
page = pte_page(entry);
if (!queue_pages_required(page, qp))
goto unlock;
/* With MPOL_MF_MOVE, we migrate only unshared hugepages. */
if (flags & (MPOL_MF_MOVE_ALL) ||
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
!hugetlb_pmd_shared(pte)))
isolate_huge_page(page, qp->pagelist);
unlock:
spin_unlock(ptl);
#else
BUG();
#endif
return 0;
}
#ifdef CONFIG_NUMA_BALANCING
/*
* This is used to mark a range of virtual addresses to be inaccessible.
* These are later cleared by a NUMA hinting fault. Depending on these
* faults, pages may be migrated for better NUMA placement.
*
* This is assuming that NUMA faults are handled using PROT_NONE. If
* an architecture makes a different choice, it will need further
* changes to the core.
*/
unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
int nr_updated;
nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
static int queue_pages_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct queue_pages *qp = walk->private;
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
/*
* We need to check MPOL_MF_STRICT so that -EIO can be returned
* regardless of vma_migratable()
*/
if (!vma_migratable(vma) &&
!(flags & MPOL_MF_STRICT))
return 1;
if (endvma > end)
endvma = end;
if (vma->vm_start > start)
start = vma->vm_start;
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end)
return -EFAULT;
if (qp->prev && qp->prev->vm_end < vma->vm_start)
return -EFAULT;
}
qp->prev = vma;
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
if (!is_vm_hugetlb_page(vma) &&
(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
!(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
}
/* queue pages from current vma */
if (flags & MPOL_MF_VALID)
return 0;
return 1;
}
static const struct mm_walk_ops queue_pages_walk_ops = {
.hugetlb_entry = queue_pages_hugetlb,
.pmd_entry = queue_pages_pte_range,
.test_walk = queue_pages_test_walk,
};
/*
* Walk through page tables and collect pages to be migrated.
*
* If pages found in a given range are on a set of nodes (determined by
* @nodes and @flags), they are isolated and queued to the pagelist that
* is passed via @private.
*
* queue_pages_range() has three possible return values:
* 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* 0 - pages queued successfully, or no misplaced page was found.
* errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or
* the memory range specified by nodemask and maxnode points
* outside the accessible address space (-EFAULT)
*/
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
nodemask_t *nodes, unsigned long flags,
struct list_head *pagelist)
{
struct queue_pages qp = {
.pagelist = pagelist,
.flags = flags,
.nmask = nodes,
.prev = NULL,
};
return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
}
/*
* Apply policy to a single VMA
* This must be called with the mmap_sem held for writing.
*/
static int vma_replace_policy(struct vm_area_struct *vma,
struct mempolicy *pol)
{
int err;
struct mempolicy *old;
struct mempolicy *new;
pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
vma->vm_ops, vma->vm_file,
vma->vm_ops ? vma->vm_ops->set_policy : NULL);
new = mpol_dup(pol);
if (IS_ERR(new))
return PTR_ERR(new);
vm_write_begin(vma);
if (vma->vm_ops && vma->vm_ops->set_policy) {
err = vma->vm_ops->set_policy(vma, new);
if (err)
goto err_out;
}
old = vma->vm_policy;
/*
* The speculative page fault handler accesses this field without
* holding the mmap_sem.
*/
WRITE_ONCE(vma->vm_policy, new);
vm_write_end(vma);
mpol_put(old);
return 0;
err_out:
vm_write_end(vma);
mpol_put(new);
return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
pgoff_t pgoff;
unsigned long vmstart;
unsigned long vmend;
vma = find_vma(mm, start);
if (!vma || vma->vm_start > start)
return -EFAULT;
prev = vma->vm_prev;
if (start > vma->vm_start)
prev = vma;
for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
if (mpol_equal(vma_policy(vma), new_pol))
continue;
pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
new_pol, vma->vm_userfaultfd_ctx,
vma_get_anon_name(vma));
if (prev) {
vma = prev;
goto replace;
}
if (vma->vm_start != vmstart) {
err = split_vma(vma->vm_mm, vma, vmstart, 1);
if (err)
goto out;
}
if (vma->vm_end != vmend) {
err = split_vma(vma->vm_mm, vma, vmend, 0);
if (err)
goto out;
}
replace:
err = vma_replace_policy(vma, new_pol);
if (err)
goto out;
}
out:
return err;
}
/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *new, *old;
NODEMASK_SCRATCH(scratch);
int ret;
if (!scratch)
return -ENOMEM;
new = mpol_new(mode, flags, nodes);
if (IS_ERR(new)) {
ret = PTR_ERR(new);
goto out;
}
task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
task_unlock(current);
mpol_put(new);
goto out;
}
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
current->il_prev = MAX_NUMNODES-1;
task_unlock(current);
mpol_put(old);
ret = 0;
out:
NODEMASK_SCRATCH_FREE(scratch);
return ret;
}
/*
* Return nodemask for policy for get_mempolicy() query
*
* Called with task's alloc_lock held
*/
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
nodes_clear(*nodes);
if (p == &default_policy)
return;
switch (p->mode) {
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
*nodes = p->v.nodes;
break;
case MPOL_PREFERRED:
if (!(p->flags & MPOL_F_LOCAL))
node_set(p->v.preferred_node, *nodes);
/* else return empty node mask for local allocation */
break;
default:
BUG();
}
}
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
struct page *p;
int err;
int locked = 1;
err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
if (err >= 0) {
err = page_to_nid(p);
put_page(p);
}
if (locked)
up_read(&mm->mmap_sem);
return err;
}
/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
unsigned long addr, unsigned long flags)
{
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
if (flags & MPOL_F_MEMS_ALLOWED) {
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
*policy = 0; /* just so it's initialized */
task_lock(current);
*nmask = cpuset_current_mems_allowed;
task_unlock(current);
return 0;
}
if (flags & MPOL_F_ADDR) {
/*
* Do NOT fall back to task policy if the
* vma/shared policy at addr is NULL. We
* want to return MPOL_DEFAULT in this case.
*/
down_read(&mm->mmap_sem);
vma = find_vma_intersection(mm, addr, addr+1);
if (!vma) {
up_read(&mm->mmap_sem);
return -EFAULT;
}
if (vma->vm_ops && vma->vm_ops->get_policy)
pol = vma->vm_ops->get_policy(vma, addr);
else
pol = vma->vm_policy;
} else if (addr)
return -EINVAL;
if (!pol)
pol = &default_policy; /* indicates default behavior */
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
/*
* Take a refcount on the mpol, lookup_node()
* will drop the mmap_sem, so after calling
* lookup_node() only "pol" remains valid, "vma"
* is stale.
*/
pol_refcount = pol;
vma = NULL;
mpol_get(pol);
err = lookup_node(mm, addr);
if (err < 0)
goto out;
*policy = err;
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
*policy = next_node_in(current->il_prev, pol->v.nodes);
} else {
err = -EINVAL;
goto out;
}
} else {
*policy = pol == &default_policy ? MPOL_DEFAULT :
pol->mode;
/*
* Internal mempolicy flags must be masked off before exposing
* the policy to userspace.
*/
*policy |= (pol->flags & MPOL_MODE_FLAGS);
}
err = 0;
if (nmask) {
if (mpol_store_user_nodemask(pol)) {
*nmask = pol->w.user_nodemask;
} else {
task_lock(current);
get_policy_nodemask(pol, nmask);
task_unlock(current);
}
}
out:
mpol_cond_put(pol);
if (vma)
up_read(&mm->mmap_sem);
if (pol_refcount)
mpol_put(pol_refcount);
return err;
}
#ifdef CONFIG_MIGRATION
/*
* page migration, thp tail pages can be passed.
*/
static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
struct page *head = compound_head(page);
/*
* Avoid migrating a page that is shared with others.
*/
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
if (!isolate_lru_page(head)) {
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON + page_is_file_cache(head),
hpage_nr_pages(head));
} else if (flags & MPOL_MF_STRICT) {
/*
* A non-movable page may reach here. There may also be pages
* temporarily off the LRU, or non-LRU movable pages. Treat
* them as unmovable pages since they can't be isolated, so
* they can't be moved at the moment. We should return -EIO
* for this case too.
*/
return -EIO;
}
}
return 0;
}
/* page allocation callback for NUMA node migration */
struct page *alloc_new_node_page(struct page *page, unsigned long node)
{
if (PageHuge(page))
return alloc_huge_page_node(page_hstate(compound_head(page)),
node);
else if (PageTransHuge(page)) {
struct page *thp;
thp = alloc_pages_node(node,
(GFP_TRANSHUGE | __GFP_THISNODE),
HPAGE_PMD_ORDER);
if (!thp)
return NULL;
prep_transhuge_page(thp);
return thp;
} else
return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
__GFP_THISNODE, 0);
}
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
*/
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
int flags)
{
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
nodes_clear(nmask);
node_set(source, nmask);
/*
* This does not "check" the range but isolates all pages that
* need migration. Between passing in the full user address
* space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
*/
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
}
return err;
}
/*
* Move pages between the two nodesets so as to preserve the physical
* layout as much as possible.
*
* Returns the number of pages that could not be moved.
*/
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
int busy = 0;
int err;
nodemask_t tmp;
err = migrate_prep();
if (err)
return err;
down_read(&mm->mmap_sem);
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
* bit in 'tmp', and return that <source, dest> pair for migration.
* The pair of nodemasks 'to' and 'from' define the map.
*
* If no pair of bits is found that way, fallback to picking some
* pair of 'source' and 'dest' bits that are not the same. If the
* 'source' and 'dest' bits are the same, this represents a node
* that will be migrating to itself, so no pages need move.
*
* If no bits are left in 'tmp', or if all remaining bits left
* in 'tmp' correspond to the same bit in 'to', return false
* (nothing left to migrate).
*
* This lets us pick a pair of nodes to migrate between, such that
* if possible the dest node is not already occupied by some other
* source node, minimizing the risk of overloading the memory on a
* node that would happen if we migrated incoming memory to a node
* before migrating outgoing memory sourced from that same node.
*
* A single scan of tmp is sufficient. As we go, we remember the
* most recent <s, d> pair that moved (s != d). If we find a pair
* that not only moved, but what's better, moved to an empty slot
* (d is not set in tmp), then we break out immediately with that pair.
* Otherwise, when we finish scanning tmp, we at least have the
* most recent <s, d> pair that moved. If we get all the way through
* the scan of tmp without finding any node that moved, much less
* moved to an empty node, then there is nothing left worth migrating.
*/
tmp = *from;
while (!nodes_empty(tmp)) {
int s, d;
int source = NUMA_NO_NODE;
int dest = 0;
for_each_node_mask(s, tmp) {
/*
* do_migrate_pages() tries to maintain the relative
* node relationship of the pages established between
* threads and memory areas.
*
* However if the number of source nodes is not equal to
* the number of destination nodes we can not preserve
* this node relative relationship. In that case, skip
* copying memory from a node that is in the destination
* mask.
*
* Example: [2,3,4] -> [3,4,5] moves everything.
* [0-7] -> [3,4,5] moves only 0,1,2,6,7.
*/
if ((nodes_weight(*from) != nodes_weight(*to)) &&
(node_isset(s, *to)))
continue;
d = node_remap(s, *from, *to);
if (s == d)
continue;
source = s; /* Node moved. Memorize */
dest = d;
/* dest not in remaining from nodes? */
if (!node_isset(dest, tmp))
break;
}
if (source == NUMA_NO_NODE)
break;
node_clear(source, tmp);
err = migrate_to_node(mm, source, dest, flags);
if (err > 0)
busy += err;
if (err < 0)
break;
}
up_read(&mm->mmap_sem);
if (err < 0)
return err;
return busy;
}
/*
* Allocate a new page for page migration based on vma policy.
* Start by assuming the page is mapped by the same vma that contains @start;
* if not, search forward from there. N.B., this assumes that the
* list of pages handed to migrate_pages()--which is how we get here--
* is in virtual address order.
*/
static struct page *new_page(struct page *page, unsigned long start)
{
struct vm_area_struct *vma;
unsigned long address;
vma = find_vma(current->mm, start);
while (vma) {
address = page_address_in_vma(page, vma);
if (address != -EFAULT)
break;
vma = vma->vm_next;
}
if (PageHuge(page)) {
return alloc_huge_page_vma(page_hstate(compound_head(page)),
vma, address);
} else if (PageTransHuge(page)) {
struct page *thp;
thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
HPAGE_PMD_ORDER);
if (!thp)
return NULL;
prep_transhuge_page(thp);
return thp;
}
/*
* if !vma, alloc_page_vma() will use task or system default policy
*/
return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
vma, address);
}
#else
static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
return -EIO;
}
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
return -ENOSYS;
}
static struct page *new_page(struct page *page, unsigned long start)
{
return NULL;
}
#endif
static long do_mbind(unsigned long start, unsigned long len,
unsigned short mode, unsigned short mode_flags,
nodemask_t *nmask, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
int err;
int ret;
LIST_HEAD(pagelist);
if (flags & ~(unsigned long)MPOL_MF_VALID)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
if (start & ~PAGE_MASK)
return -EINVAL;
if (mode == MPOL_DEFAULT)
flags &= ~MPOL_MF_STRICT;
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
end = start + len;
if (end < start)
return -EINVAL;
if (end == start)
return 0;
new = mpol_new(mode, mode_flags, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
if (flags & MPOL_MF_LAZY)
new->flags |= MPOL_F_MOF;
/*
* If we are using the default policy then operations
* on discontiguous address spaces are okay after all.
*/
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
start, start + len, mode, mode_flags,
nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
err = migrate_prep();
if (err)
goto mpol_out;
}
{
NODEMASK_SCRATCH(scratch);
if (scratch) {
down_write(&mm->mmap_sem);
task_lock(current);
err = mpol_set_nodemask(new, nmask, scratch);
task_unlock(current);
if (err)
up_write(&mm->mmap_sem);
} else
err = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
}
if (err)
goto mpol_out;
ret = queue_pages_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
if (ret < 0) {
err = ret;
goto up_out;
}
err = mbind_range(mm, start, end, new);
if (!err) {
int nr_failed = 0;
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_page, NULL,
start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
if (nr_failed)
putback_movable_pages(&pagelist);
}
if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
err = -EIO;
} else {
up_out:
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
}
up_write(&mm->mmap_sem);
mpol_out:
mpol_put(new);
return err;
}
/*
* User space interface with variable sized bitmaps for nodelists.
*/
/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned long k;
unsigned long t;
unsigned long nlongs;
unsigned long endmask;
--maxnode;
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
return 0;
if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
return -EINVAL;
nlongs = BITS_TO_LONGS(maxnode);
if ((maxnode % BITS_PER_LONG) == 0)
endmask = ~0UL;
else
endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
/*
* When the user specified more nodes than supported, just check
* that the unsupported part is all zero.
*
* If maxnode spans more longs than MAX_NUMNODES, check the bits
* in that area first, and then go on to check the remaining bits
* at or above MAX_NUMNODES.
* Otherwise, just check bits [MAX_NUMNODES, maxnode).
*/
if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
if (get_user(t, nmask + k))
return -EFAULT;
if (k == nlongs - 1) {
if (t & endmask)
return -EINVAL;
} else if (t)
return -EINVAL;
}
nlongs = BITS_TO_LONGS(MAX_NUMNODES);
endmask = ~0UL;
}
if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
unsigned long valid_mask = endmask;
valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
if (get_user(t, nmask + nlongs - 1))
return -EFAULT;
if (t & valid_mask)
return -EINVAL;
}
if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
return -EFAULT;
nodes_addr(*nodes)[nlongs-1] &= endmask;
return 0;
}
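/*
 * Worked example (illustrative, 64-bit, assuming MAX_NUMNODES >= 64):
 * a call with maxnode = 65 considers bits 0-63, so nlongs = 1 and
 * endmask = ~0UL; one long is copied from user space and masked with
 * endmask.
 */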
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
return -EINVAL;
if (clear_user((char __user *)mask + nbytes, copy - nbytes))
return -EFAULT;
copy = nbytes;
}
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
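/*
 * Worked example (illustrative, 64-bit, nr_node_ids = 64): a caller
 * passing maxnode = 1025 requests copy = 128 bytes while the kernel
 * mask occupies nbytes = 8, so the trailing 120 user bytes are cleared
 * and only the first 8 bytes are copied out.
 */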
static long kernel_mbind(unsigned long start, unsigned long len,
unsigned long mode, const unsigned long __user *nmask,
unsigned long maxnode, unsigned int flags)
{
nodemask_t nodes;
int err;
unsigned short mode_flags;
start = untagged_addr(start);
mode_flags = mode & MPOL_MODE_FLAGS;
mode &= ~MPOL_MODE_FLAGS;
if (mode >= MPOL_MAX)
return -EINVAL;
if ((mode_flags & MPOL_F_STATIC_NODES) &&
(mode_flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
unsigned long, mode, const unsigned long __user *, nmask,
unsigned long, maxnode, unsigned int, flags)
{
return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
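/*
 * Illustrative userspace usage (not part of this file; via libnuma's
 * <numaif.h> wrapper or syscall(2)), assuming node 1 exists:
 *
 *   unsigned long mask = 1UL << 1;
 *   mbind(addr, len, MPOL_BIND, &mask, sizeof(mask) * 8 + 1,
 *         MPOL_MF_MOVE | MPOL_MF_STRICT);
 *
 * binds [addr, addr + len) to node 1 and migrates existing pages there.
 */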
/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
unsigned long maxnode)
{
int err;
nodemask_t nodes;
unsigned short flags;
flags = mode & MPOL_MODE_FLAGS;
mode &= ~MPOL_MODE_FLAGS;
if ((unsigned int)mode >= MPOL_MAX)
return -EINVAL;
if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
return do_set_mempolicy(mode, flags, &nodes);
}
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
unsigned long, maxnode)
{
return kernel_set_mempolicy(mode, nmask, maxnode);
}
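/*
 * Illustrative userspace usage (not part of this file), assuming nodes
 * 0 and 2 exist:
 *
 *   unsigned long mask = (1UL << 0) | (1UL << 2);
 *   set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8 + 1);
 *
 * interleaves this task's future allocations across nodes 0 and 2.
 */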
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
const unsigned long __user *old_nodes,
const unsigned long __user *new_nodes)
{
struct mm_struct *mm = NULL;
struct task_struct *task;
nodemask_t task_nodes;
int err;
nodemask_t *old;
nodemask_t *new;
NODEMASK_SCRATCH(scratch);
if (!scratch)
return -ENOMEM;
old = &scratch->mask1;
new = &scratch->mask2;
err = get_nodes(old, old_nodes, maxnode);
if (err)
goto out;
err = get_nodes(new, new_nodes, maxnode);
if (err)
goto out;
/* Find the mm_struct */
rcu_read_lock();
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
rcu_read_unlock();
err = -ESRCH;
goto out;
}
get_task_struct(task);
err = -EINVAL;
/*
* Check if this process has the right to modify the specified process.
* Use the regular "ptrace_may_access()" checks.
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
err = -EPERM;
goto out_put;
}
rcu_read_unlock();
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out_put;
}
task_nodes = cpuset_mems_allowed(current);
nodes_and(*new, *new, task_nodes);
if (nodes_empty(*new))
goto out_put;
err = security_task_movememory(task);
if (err)
goto out_put;
mm = get_task_mm(task);
put_task_struct(task);
if (!mm) {
err = -EINVAL;
goto out;
}
err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
mmput(mm);
out:
NODEMASK_SCRATCH_FREE(scratch);
return err;
out_put:
put_task_struct(task);
goto out;
}
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
const unsigned long __user *, old_nodes,
const unsigned long __user *, new_nodes)
{
return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
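/*
 * Illustrative userspace usage (not part of this file): move the pages
 * of pid 1234 from node 0 to node 1, assuming both nodes exist:
 *
 *   unsigned long from = 1UL << 0, to = 1UL << 1;
 *   migrate_pages(1234, sizeof(from) * 8 + 1, &from, &to);
 */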
/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
unsigned long __user *nmask,
unsigned long maxnode,
unsigned long addr,
unsigned long flags)
{
int err;
int pval;
nodemask_t nodes;
addr = untagged_addr(addr);
if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
err = do_get_mempolicy(&pval, &nodes, addr, flags);
if (err)
return err;
if (policy && put_user(pval, policy))
return -EFAULT;
if (nmask)
err = copy_nodes_to_user(nmask, maxnode, &nodes);
return err;
}
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
unsigned long __user *, nmask, unsigned long, maxnode,
unsigned long, addr, unsigned long, flags)
{
return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
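/*
 * Illustrative userspace usage (not part of this file): query the node
 * currently backing the page at addr:
 *
 *   int node;
 *   get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 */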
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
compat_ulong_t __user *, nmask,
compat_ulong_t, maxnode,
compat_ulong_t, addr, compat_ulong_t, flags)
{
long err;
unsigned long __user *nm = NULL;
unsigned long nr_bits, alloc_size;
DECLARE_BITMAP(bm, MAX_NUMNODES);
nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask)
nm = compat_alloc_user_space(alloc_size);
err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
if (!err && nmask) {
unsigned long copy_size;
copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
err = copy_from_user(bm, nm, copy_size);
/* ensure entire bitmap is zeroed */
err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
err |= compat_put_bitmap(nmask, bm, nr_bits);
}
return err;
}
COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
compat_ulong_t, maxnode)
{
unsigned long __user *nm = NULL;
unsigned long nr_bits, alloc_size;
DECLARE_BITMAP(bm, MAX_NUMNODES);
nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask) {
if (compat_get_bitmap(bm, nmask, nr_bits))
return -EFAULT;
nm = compat_alloc_user_space(alloc_size);
if (copy_to_user(nm, bm, alloc_size))
return -EFAULT;
}
return kernel_set_mempolicy(mode, nm, nr_bits+1);
}
COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
compat_ulong_t, mode, compat_ulong_t __user *, nmask,
compat_ulong_t, maxnode, compat_ulong_t, flags)
{
unsigned long __user *nm = NULL;
unsigned long nr_bits, alloc_size;
nodemask_t bm;
nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask) {
if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
return -EFAULT;
nm = compat_alloc_user_space(alloc_size);
if (copy_to_user(nm, nodes_addr(bm), alloc_size))
return -EFAULT;
}
return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
}
COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
compat_ulong_t, maxnode,
const compat_ulong_t __user *, old_nodes,
const compat_ulong_t __user *, new_nodes)
{
unsigned long __user *old = NULL;
unsigned long __user *new = NULL;
nodemask_t tmp_mask;
unsigned long nr_bits;
unsigned long size;
nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (old_nodes) {
if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
return -EFAULT;
old = compat_alloc_user_space(new_nodes ? size * 2 : size);
if (new_nodes)
new = old + size / sizeof(unsigned long);
if (copy_to_user(old, nodes_addr(tmp_mask), size))
return -EFAULT;
}
if (new_nodes) {
if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
return -EFAULT;
if (new == NULL)
new = compat_alloc_user_space(size);
if (copy_to_user(new, nodes_addr(tmp_mask), size))
return -EFAULT;
}
return kernel_migrate_pages(pid, nr_bits + 1, old, new);
}
#endif /* CONFIG_COMPAT */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
struct mempolicy *pol;
if (!vma)
return NULL;
if (vma->vm_ops && vma->vm_ops->get_policy)
return vma->vm_ops->get_policy(vma, addr);
/*
* This could be called without holding the mmap_sem in the
* speculative page fault handler's path.
*/
pol = READ_ONCE(vma->vm_policy);
if (pol) {
/*
* shmem_alloc_page() passes MPOL_F_SHARED policy with
* a pseudo vma whose vma->vm_ops=NULL. Take a reference
* count on these policies which will be dropped by
* mpol_cond_put() later
*/
if (mpol_needs_cond_ref(pol))
mpol_get(pol);
}
return pol;
}
/*
* get_vma_policy(@vma, @addr)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup
*
* Returns effective policy for a VMA at specified address.
* Falls back to current->mempolicy or system default policy, as necessary.
* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
* count--added by the get_policy() vm_op, as appropriate--to protect against
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
*/
static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
struct mempolicy *pol = __get_vma_policy(vma, addr);
if (!pol)
pol = get_task_policy(current);
return pol;
}
bool vma_policy_mof(struct vm_area_struct *vma)
{
struct mempolicy *pol;
if (vma->vm_ops && vma->vm_ops->get_policy) {
bool ret = false;
pol = vma->vm_ops->get_policy(vma, vma->vm_start);
if (pol && (pol->flags & MPOL_F_MOF))
ret = true;
mpol_cond_put(pol);
return ret;
}
pol = vma->vm_policy;
if (!pol)
pol = get_task_policy(current);
return pol->flags & MPOL_F_MOF;
}
static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
/*
* If policy->v.nodes has movable memory only,
* we apply the policy only when gfp_zone(gfp) == ZONE_MOVABLE.
*
* policy->v.nodes is intersected with node_states[N_MEMORY],
* so if the following test fails, it implies
* policy->v.nodes has movable memory only.
*/
if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
dynamic_policy_zone = ZONE_MOVABLE;
return zone >= dynamic_policy_zone;
}
/*
* Return a nodemask representing a mempolicy for filtering nodes for
* page allocation
*/
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(policy->mode == MPOL_BIND) &&
apply_policy_zone(policy, gfp_zone(gfp)) &&
cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
return &policy->v.nodes;
return NULL;
}
/* Return the node id preferred by the given mempolicy, or the given id */
static int policy_node(gfp_t gfp, struct mempolicy *policy,
int nd)
{
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
nd = policy->v.preferred_node;
else {
/*
* __GFP_THISNODE shouldn't even be used with the bind policy
* because we might easily break the expectation to stay on the
* requested node and not break the policy.
*/
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
}
return nd;
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
unsigned next;
struct task_struct *me = current;
next = next_node_in(me->il_prev, policy->v.nodes);
if (next < MAX_NUMNODES)
me->il_prev = next;
return next;
}
/*
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
*/
unsigned int mempolicy_slab_node(void)
{
struct mempolicy *policy;
int node = numa_mem_id();
if (in_interrupt())
return node;
policy = current->mempolicy;
if (!policy || policy->flags & MPOL_F_LOCAL)
return node;
switch (policy->mode) {
case MPOL_PREFERRED:
/*
* handled MPOL_F_LOCAL above
*/
return policy->v.preferred_node;
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
case MPOL_BIND: {
struct zoneref *z;
/*
* Follow bind policy behavior and start allocation at the
* first node.
*/
struct zonelist *zonelist;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes);
return z->zone ? zone_to_nid(z->zone) : node;
}
default:
BUG();
}
}
/*
* Do static interleaving for a VMA with known offset @n. Returns the n'th
* node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
* number of present nodes.
*/
static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target;
int i;
int nid;
if (!nnodes)
return numa_node_id();
target = (unsigned int)n % nnodes;
nid = first_node(pol->v.nodes);
for (i = 0; i < target; i++)
nid = next_node(nid, pol->v.nodes);
return nid;
}
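/*
 * Worked example (illustrative): with pol->v.nodes = {0,2,5} and n = 4,
 * nnodes = 3 and target = 4 % 3 = 1, so the walk starts at node 0 and
 * advances once, returning node 2.
 */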
/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long addr, int shift)
{
if (vma) {
unsigned long off;
/*
* for small pages, there is no difference between
* shift and PAGE_SHIFT, so the bit-shift is safe.
* for huge pages, since vm_pgoff is in units of small
* pages, we need to shift off the always 0 bits to get
* a useful offset.
*/
BUG_ON(shift < PAGE_SHIFT);
off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
off += (addr - vma->vm_start) >> shift;
return offset_il_node(pol, off);
} else
return interleave_nodes(pol);
}
#ifdef CONFIG_HUGETLBFS
/*
* huge_node(@vma, @addr, @gfp_flags, @mpol)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup and interleave policy
* @gfp_flags: for requested zone
* @mpol: pointer to mempolicy pointer for reference counted mempolicy
* @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
*
* Returns a nid suitable for a huge page allocation and a pointer
* to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'BIND', returns a pointer to the mempolicy's
* @nodemask for filtering the zonelist.
*
* Must be protected by read_mems_allowed_begin()
*/
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
{
int nid;
*mpol = get_vma_policy(vma, addr);
*nodemask = NULL; /* assume !MPOL_BIND */
if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
nid = interleave_nid(*mpol, vma, addr,
huge_page_shift(hstate_vma(vma)));
} else {
nid = policy_node(gfp_flags, *mpol, numa_node_id());
if ((*mpol)->mode == MPOL_BIND)
*nodemask = &(*mpol)->v.nodes;
}
return nid;
}
/*
* init_nodemask_of_mempolicy
*
* If the current task's mempolicy is "default" [NULL], return 'false'
* to indicate default policy. Otherwise, extract the policy nodemask
* for 'bind' or 'interleave' policy into the argument nodemask, or
* initialize the argument nodemask to contain the single node for
* 'preferred' or 'local' policy and return 'true' to indicate presence
* of non-default mempolicy.
*
* We don't bother with reference counting the mempolicy [mpol_get/put]
* because the current task is examining its own mempolicy and a task's
* mempolicy is only ever changed by the task itself.
*
* N.B., it is the caller's responsibility to free a returned nodemask.
*/
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
struct mempolicy *mempolicy;
int nid;
if (!(mask && current->mempolicy))
return false;
task_lock(current);
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
if (mempolicy->flags & MPOL_F_LOCAL)
nid = numa_node_id();
else
nid = mempolicy->v.preferred_node;
init_nodemask_of_node(mask, nid);
break;
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
*mask = mempolicy->v.nodes;
break;
default:
BUG();
}
task_unlock(current);
return true;
}
#endif
/*
* mempolicy_nodemask_intersects
*
* If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
* policy. Otherwise, check for intersection between mask and the policy
* nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
* policy, always return true since it may allocate elsewhere on fallback.
*
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
*/
bool mempolicy_nodemask_intersects(struct task_struct *tsk,
const nodemask_t *mask)
{
struct mempolicy *mempolicy;
bool ret = true;
if (!mask)
return ret;
task_lock(tsk);
mempolicy = tsk->mempolicy;
if (!mempolicy)
goto out;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
/*
* MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
* allocate from; they may fall back to other nodes when oom.
* Thus, it's possible for tsk to have allocated memory from
* nodes in mask.
*/
break;
case MPOL_BIND:
case MPOL_INTERLEAVE:
ret = nodes_intersects(mempolicy->v.nodes, *mask);
break;
default:
BUG();
}
out:
task_unlock(tsk);
return ret;
}
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
unsigned nid)
{
struct page *page;
page = __alloc_pages(gfp, order, nid);
/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
if (!static_branch_likely(&vm_numa_stat_key))
return page;
if (page && page_to_nid(page) == nid) {
preempt_disable();
__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
preempt_enable();
}
return page;
}
/**
* alloc_pages_vma - Allocate a page for a VMA.
*
* @gfp:
* %GFP_USER user allocation.
* %GFP_KERNEL kernel allocations,
* %GFP_HIGHMEM highmem/user allocations,
* %GFP_FS allocation should not call back into a file system.
* %GFP_ATOMIC don't sleep.
*
* @order: Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
* @node: Which node to prefer for allocation (modulo policy).
* @hugepage: for hugepages try only the preferred node if possible
*
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
* When VMA is not NULL caller must hold down_read on the mmap_sem of the
* mm_struct of the VMA to prevent it from going away. Should be used for
* all allocations for pages that will be mapped into user space. Returns
* NULL when no page can be allocated.
*/
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node, bool hugepage)
{
struct mempolicy *pol;
struct page *page;
int preferred_nid;
nodemask_t *nmask;
pol = get_vma_policy(vma, addr);
if (pol->mode == MPOL_INTERLEAVE) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
goto out;
}
if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
int hpage_node = node;
/*
* For hugepage allocation and non-interleave policy which
* allows the current node (or other explicitly preferred
* node) we only try to allocate from the current/preferred
* node and don't fall back to other nodes, as the cost of
* remote accesses would likely offset THP benefits.
*
* If the policy is interleave, or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
hpage_node = pol->v.preferred_node;
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
mpol_cond_put(pol);
page = __alloc_pages_node(hpage_node,
gfp | __GFP_THISNODE, order);
/*
* If hugepage allocations are configured to always use
* synchronous compaction, or the vma has been madvised
* to prefer hugepage backing, retry allowing remote
* memory as well.
*/
if (!page && (gfp & __GFP_DIRECT_RECLAIM))
page = __alloc_pages_nodemask(gfp | __GFP_NORETRY,
order, hpage_node,
nmask);
goto out;
}
}
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
mpol_cond_put(pol);
out:
return page;
}
EXPORT_SYMBOL(alloc_pages_vma);
/**
* alloc_pages_current - Allocate pages.
*
* @gfp:
* %GFP_USER user allocation,
* %GFP_KERNEL kernel allocation,
* %GFP_HIGHMEM highmem allocation,
* %GFP_FS don't call back into a file system.
* %GFP_ATOMIC don't sleep.
* @order: Power of two of allocation size in pages. 0 is a single page.
*
* Allocate a page from the kernel page pool. When not in
* interrupt context, apply the current process's NUMA policy.
* Returns NULL when no page can be allocated.
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = &default_policy;
struct page *page;
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
pol = get_task_policy(current);
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else
page = __alloc_pages_nodemask(gfp, order,
policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
return page;
}
EXPORT_SYMBOL(alloc_pages_current);
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
struct mempolicy *pol = mpol_dup(vma_policy(src));
if (IS_ERR(pol))
return PTR_ERR(pol);
dst->vm_policy = pol;
return 0;
}
/*
* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
* rebinds the mempolicy it is copying by calling mpol_rebind_policy()
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset-relative after their cpuset moves. See
* further kernel/cpuset.c update_nodemask().
*
* current's mempolicy may be rebound by another task (the task that changes
* the cpuset's mems), so we needn't do the rebind work for the current task.
*/
/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!new)
return ERR_PTR(-ENOMEM);
/* task's mempolicy is protected by alloc_lock */
if (old == current->mempolicy) {
task_lock(current);
*new = *old;
task_unlock(current);
} else
*new = *old;
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
mpol_rebind_policy(new, &mems);
}
atomic_set(&new->refcnt, 1);
return new;
}
/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
if (!a || !b)
return false;
if (a->mode != b->mode)
return false;
if (a->flags != b->flags)
return false;
if (mpol_store_user_nodemask(a))
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
return false;
switch (a->mode) {
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
/* a's ->flags is the same as b's */
if (a->flags & MPOL_F_LOCAL)
return true;
return a->v.preferred_node == b->v.preferred_node;
default:
BUG();
return false;
}
}
/*
* Shared memory backing store policy support.
*
* Remember policies even when nobody has shared memory mapped.
* The policies are kept in Red-Black tree linked from the inode.
* They are protected by the sp->lock rwlock, which should be held
* for any accesses to the tree.
*/
/*
* lookup first element intersecting start-end. Caller holds sp->lock for
* reading or for writing
*/
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
struct rb_node *n = sp->root.rb_node;
while (n) {
struct sp_node *p = rb_entry(n, struct sp_node, nd);
if (start >= p->end)
n = n->rb_right;
else if (end <= p->start)
n = n->rb_left;
else
break;
}
if (!n)
return NULL;
for (;;) {
struct sp_node *w = NULL;
struct rb_node *prev = rb_prev(n);
if (!prev)
break;
w = rb_entry(prev, struct sp_node, nd);
if (w->end <= start)
break;
n = prev;
}
return rb_entry(n, struct sp_node, nd);
}
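/*
 * Illustrative note (not from the original source): the rb_prev() walk
 * above backs n up to the leftmost node still intersecting
 * [start, end). E.g. with non-overlapping ranges [0,5), [5,8) and
 * [8,12) in the tree, sp_lookup(sp, 6, 10) returns the [5,8) node.
 */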
/*
* Insert a new shared policy into the list. Caller holds sp->lock for
* writing.
*/
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
struct rb_node **p = &sp->root.rb_node;
struct rb_node *parent = NULL;
struct sp_node *nd;
while (*p) {
parent = *p;
nd = rb_entry(parent, struct sp_node, nd);
if (new->start < nd->start)
p = &(*p)->rb_left;
else if (new->end > nd->end)
p = &(*p)->rb_right;
else
BUG();
}
rb_link_node(&new->nd, parent, p);
rb_insert_color(&new->nd, &sp->root);
pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
new->policy ? new->policy->mode : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
struct mempolicy *pol = NULL;
struct sp_node *sn;
if (!sp->root.rb_node)
return NULL;
read_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
read_unlock(&sp->lock);
return pol;
}
static void sp_free(struct sp_node *n)
{
mpol_put(n->policy);
kmem_cache_free(sn_cache, n);
}
/**
* mpol_misplaced - check whether current page node is valid in policy
*
* @page: page to be checked
* @vma: vm area where page mapped
* @addr: virtual address where page mapped
*
* Look up the current policy node id for vma,addr and "compare" it to
* the page's node id.
*
* Returns:
* -1 - not misplaced, page is on the right node
* node - node id where the page should be
*
* Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*/
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol;
struct zoneref *z;
int curnid = page_to_nid(page);
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
int polnid = NUMA_NO_NODE;
int ret = -1;
pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
goto out;
switch (pol->mode) {
case MPOL_INTERLEAVE:
pgoff = vma->vm_pgoff;
pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
polnid = offset_il_node(pol, pgoff);
break;
case MPOL_PREFERRED:
if (pol->flags & MPOL_F_LOCAL)
polnid = numa_node_id();
else
polnid = pol->v.preferred_node;
break;
case MPOL_BIND:
/*
* BIND allows binding to multiple nodes.
* Use the page's current node if it is in the policy nodemask,
* else select the nearest allowed node, if any.
* If there are no allowed nodes, use the current node [!misplaced].
*/
if (node_isset(curnid, pol->v.nodes))
goto out;
z = first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->v.nodes);
polnid = zone_to_nid(z->zone);
break;
default:
BUG();
}
/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
polnid = thisnid;
if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
goto out;
}
if (curnid != polnid)
ret = polnid;
out:
mpol_cond_put(pol);
return ret;
}
/*
* Drop the (possibly final) reference to task->mempolicy. It needs to be
* dropped after task->mempolicy is set to NULL so that any allocation done as
* part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
* policy.
*/
void mpol_put_task_policy(struct task_struct *task)
{
struct mempolicy *pol;
task_lock(task);
pol = task->mempolicy;
task->mempolicy = NULL;
task_unlock(task);
mpol_put(pol);
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
sp_free(n);
}
static void sp_node_init(struct sp_node *node, unsigned long start,
unsigned long end, struct mempolicy *pol)
{
node->start = start;
node->end = end;
node->policy = pol;
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
{
struct sp_node *n;
struct mempolicy *newpol;
n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n)
return NULL;
newpol = mpol_dup(pol);
if (IS_ERR(newpol)) {
kmem_cache_free(sn_cache, n);
return NULL;
}
newpol->flags |= MPOL_F_SHARED;
sp_node_init(n, start, end, newpol);
return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
struct sp_node *n;
struct sp_node *n_new = NULL;
struct mempolicy *mpol_new = NULL;
int ret = 0;
restart:
write_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
struct rb_node *next = rb_next(&n->nd);
if (n->start >= start) {
if (n->end <= end)
sp_delete(sp, n);
else
n->start = end;
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
if (!n_new)
goto alloc_new;
*mpol_new = *n->policy;
atomic_set(&mpol_new->refcnt, 1);
sp_node_init(n_new, end, n->end, mpol_new);
n->end = start;
sp_insert(sp, n_new);
n_new = NULL;
mpol_new = NULL;
break;
} else
n->end = start;
}
if (!next)
break;
n = rb_entry(next, struct sp_node, nd);
}
if (new)
sp_insert(sp, new);
write_unlock(&sp->lock);
ret = 0;
err_out:
if (mpol_new)
mpol_put(mpol_new);
if (n_new)
kmem_cache_free(sn_cache, n_new);
return ret;
alloc_new:
write_unlock(&sp->lock);
ret = -ENOMEM;
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n_new)
goto err_out;
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
atomic_set(&mpol_new->refcnt, 1);
goto restart;
}
/**
* mpol_shared_policy_init - initialize shared policy for inode
* @sp: pointer to inode shared policy
* @mpol: struct mempolicy to install
*
* Install non-NULL @mpol in inode's shared policy rb-tree.
* On entry, the current task has a reference on a non-NULL @mpol.
* This must be released on exit.
* This is called during get_inode() calls, so we can use GFP_KERNEL.
*/
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
rwlock_init(&sp->lock);
if (mpol) {
struct vm_area_struct pvma;
struct mempolicy *new;
NODEMASK_SCRATCH(scratch);
if (!scratch)
goto put_mpol;
/* contextualize the tmpfs mount point mempolicy */
new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
if (IS_ERR(new))
goto free_scratch; /* no valid nodemask intersection */
task_lock(current);
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
task_unlock(current);
if (ret)
goto put_new;
/* Create pseudo-vma that contains just the policy */
vma_init(&pvma, NULL);
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
put_new:
mpol_put(new); /* drop initial ref */
free_scratch:
NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
mpol_put(mpol); /* drop our incoming ref on sb mpol */
}
}
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma, struct mempolicy *npol)
{
int err;
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);
pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
vma->vm_pgoff,
sz, npol ? npol->mode : -1,
npol ? npol->flags : -1,
npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
if (!new)
return -ENOMEM;
}
err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
if (err && new)
sp_free(new);
return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
struct sp_node *n;
struct rb_node *next;
if (!p->root.rb_node)
return;
write_lock(&p->lock);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
sp_delete(p, n);
}
write_unlock(&p->lock);
}
#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;
static void __init check_numabalancing_enable(void)
{
bool numabalancing_default = false;
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
numabalancing_default = true;
/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
if (numabalancing_override)
set_numabalancing_state(numabalancing_override == 1);
if (num_online_nodes() > 1 && !numabalancing_override) {
pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
}
static int __init setup_numabalancing(char *str)
{
int ret = 0;
if (!str)
goto out;
if (!strcmp(str, "enable")) {
numabalancing_override = 1;
ret = 1;
} else if (!strcmp(str, "disable")) {
numabalancing_override = -1;
ret = 1;
}
out:
if (!ret)
pr_warn("Unable to parse numa_balancing=\n");
return ret;
}
__setup("numa_balancing=", setup_numabalancing);
#else
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
nodemask_t interleave_nodes;
unsigned long largest = 0;
int nid, prefer = 0;
policy_cache = kmem_cache_create("numa_policy",
sizeof(struct mempolicy),
0, SLAB_PANIC, NULL);
sn_cache = kmem_cache_create("shared_policy_node",
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
for_each_node(nid) {
preferred_node_policy[nid] = (struct mempolicy) {
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
.v = { .preferred_node = nid, },
};
}
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), falling
* back to the largest node if they're all smaller.
*/
nodes_clear(interleave_nodes);
for_each_node_state(nid, N_MEMORY) {
unsigned long total_pages = node_present_pages(nid);
/* Preserve the largest node */
if (largest < total_pages) {
largest = total_pages;
prefer = nid;
}
/* Interleave this node? */
if ((total_pages << PAGE_SHIFT) >= (16 << 20))
node_set(nid, interleave_nodes);
}
/* All too small, use the largest */
if (unlikely(nodes_empty(interleave_nodes)))
node_set(prefer, interleave_nodes);
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
pr_err("%s: interleaving failed\n", __func__);
check_numabalancing_enable();
}
/* Reset policy of current process to default */
void numa_default_policy(void)
{
do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
/*
* Parse and format mempolicy from/to strings
*/
/*
* "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
*/
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
[MPOL_LOCAL] = "local",
};
#ifdef CONFIG_TMPFS
/**
* mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
* @str: string containing mempolicy to parse
* @mpol: pointer to struct mempolicy pointer, returned on success.
*
* Format of input:
* <mode>[=<flags>][:<nodelist>]
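* e.g. "bind:0-3", "interleave=relative:0,2" or "local".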
*
* On success, returns 0, else 1
*/
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
struct mempolicy *new = NULL;
unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
int err = 1, mode;
if (flags)
*flags++ = '\0'; /* terminate mode string */
if (nodelist) {
/* NUL-terminate mode or flags string */
*nodelist++ = '\0';
if (nodelist_parse(nodelist, nodes))
goto out;
if (!nodes_subset(nodes, node_states[N_MEMORY]))
goto out;
} else
nodes_clear(nodes);
mode = match_string(policy_modes, MPOL_MAX, str);
if (mode < 0)
goto out;
switch (mode) {
case MPOL_PREFERRED:
/*
* Insist on a nodelist of one node only, although later
* we use first_node(nodes) to grab a single node, so here
* nodelist (or nodes) cannot be empty.
*/
if (nodelist) {
char *rest = nodelist;
while (isdigit(*rest))
rest++;
if (*rest)
goto out;
if (nodes_empty(nodes))
goto out;
}
break;
case MPOL_INTERLEAVE:
/*
* Default to online nodes with memory if no nodelist
*/
if (!nodelist)
nodes = node_states[N_MEMORY];
break;
case MPOL_LOCAL:
/*
* Don't allow a nodelist; mpol_new() checks flags
*/
if (nodelist)
goto out;
mode = MPOL_PREFERRED;
break;
case MPOL_DEFAULT:
/*
* Insist on an empty nodelist
*/
if (!nodelist)
err = 0;
goto out;
case MPOL_BIND:
/*
* Insist on a nodelist
*/
if (!nodelist)
goto out;
}
mode_flags = 0;
if (flags) {
/*
* Currently, we only support two mutually exclusive
* mode flags.
*/
if (!strcmp(flags, "static"))
mode_flags |= MPOL_F_STATIC_NODES;
else if (!strcmp(flags, "relative"))
mode_flags |= MPOL_F_RELATIVE_NODES;
else
goto out;
}
new = mpol_new(mode, mode_flags, &nodes);
if (IS_ERR(new))
goto out;
/*
* Save nodes for mpol_to_str() to show the tmpfs mount options
* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
*/
if (mode != MPOL_PREFERRED)
new->v.nodes = nodes;
else if (nodelist)
new->v.preferred_node = first_node(nodes);
else
new->flags |= MPOL_F_LOCAL;
/*
* Save nodes for contextualization: this will be used to "clone"
* the mempolicy in a specific context [cpuset] at a later time.
*/
new->w.user_nodemask = nodes;
err = 0;
out:
/* Restore string for error message */
if (nodelist)
*--nodelist = ':';
if (flags)
*--flags = '=';
if (!err)
*mpol = new;
return err;
}
#endif /* CONFIG_TMPFS */
/**
* mpol_to_str - format a mempolicy structure for printing
* @buffer: to contain formatted mempolicy string
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
*
* Convert @pol into a string. If @buffer is too short, truncate the string.
* Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
* longest flag, "relative", and to display at least a few node ids.
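* For example, a bind policy over nodes 0-3 formats as "bind:0-3", and
* an interleave policy created with MPOL_F_STATIC_NODES over the same
* nodes as "interleave=static:0-3".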
*/
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
nodemask_t nodes = NODE_MASK_NONE;
unsigned short mode = MPOL_DEFAULT;
unsigned short flags = 0;
if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
mode = pol->mode;
flags = pol->flags;
}
switch (mode) {
case MPOL_DEFAULT:
break;
case MPOL_PREFERRED:
if (flags & MPOL_F_LOCAL)
mode = MPOL_LOCAL;
else
node_set(pol->v.preferred_node, nodes);
break;
case MPOL_BIND:
case MPOL_INTERLEAVE:
nodes = pol->v.nodes;
break;
default:
WARN_ON_ONCE(1);
snprintf(p, maxlen, "unknown");
return;
}
p += snprintf(p, maxlen, "%s", policy_modes[mode]);
if (flags & MPOL_MODE_FLAGS) {
p += snprintf(p, buffer + maxlen - p, "=");
/*
* Currently, the only defined flags are mutually exclusive
*/
if (flags & MPOL_F_STATIC_NODES)
p += snprintf(p, buffer + maxlen - p, "static");
else if (flags & MPOL_F_RELATIVE_NODES)
p += snprintf(p, buffer + maxlen - p, "relative");
}
if (!nodes_empty(nodes))
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}