msm-4.14/fs/userfaultfd.c
Runmin Wang 68f515d91b Merge remote-tracking branch 'remotes/origin/tmp-8a5776a' into msm-next
* remotes/origin/tmp-8a5776a:
  Linux 4.14-rc4
  ARC: [plat-hsdk]: Add reset controller node to manage ethernet reset
  arm64: Ensure fpsimd support is ready before userspace is active
  arm64: Ensure the instruction emulation is ready for userspace
  powerpc/powernv: Increase memory block size to 1GB on radix
  dm raid: fix incorrect status output at the end of a "recover" process
  KVM: add X86_LOCAL_APIC dependency
  ovl: fix regression caused by exclusive upper/work dir protection
  ovl: fix missing unlock_rename() in ovl_do_copy_up()
  ovl: fix dentry leak in ovl_indexdir_cleanup()
  ovl: fix dput() of ERR_PTR in ovl_cleanup_index()
  ovl: fix error value printed in ovl_lookup_index()
  ovl: fix may_write_real() for overlayfs directories
  x86/kvm: Move kvm_fastop_exception to .fixup section
  i2c: i2c-stm32f7: make structure stm32f7_setup static const
  i2c: ensure termination of *_device_id tables
  i2c: i801: Add support for Intel Cedar Fork
  i2c: stm32f7: fix setup structure
  net: 8021q: skip packets if the vlan is down
  Update James Hogan's email address
  drm/i915/glk: Fix DMC/DC state idleness calculation
  drm/i915/cnl: Reprogram DMC firmware after S3/S4 resume
  i40iw: Fix port number for query QP
  i40iw: Add missing memory barriers
  RDMA/qedr: Parse vlan priority as sl
  RDMA/qedr: Parse VLAN ID correctly and ignore the value of zero
  IB/mlx5: Fix label order in error path handling
  arm64: Use larger stacks when KASAN is selected
  ACPI/IORT: Fix PCI ACS enablement
  kvm/x86: Avoid async PF preempting the kernel incorrectly
  clk: samsung: exynos4: Enable VPLL and EPLL clocks for suspend/resume cycle
  dm crypt: reject sector_size feature if device length is not aligned to it
  Btrfs: fix overlap of fs_info::flags values
  bsg-lib: fix use-after-free under memory-pressure
  btrfs: avoid overflow when sector_t is 32 bit
  ARM: dts: stm32: use right pinctrl compatible for stm32f469
  powerpc/mm: Call flush_tlb_kernel_range with interrupts enabled
  powerpc/xive: Clear XIVE internal structures when a CPU is removed
  powerpc/xive: Fix IPI reset
  nvme-pci: Use PCI bus address for data/queues in CMB
  ARM: dts: stm32: Fix STMPE1600 binding on stm32429i-eval board
  watchdog/core: Put softlockup_threads_initialized under ifdef guard
  watchdog/core: Rename some softlockup_* functions
  powerpc/watchdog: Make use of watchdog_nmi_probe()
  watchdog/core, powerpc: Lock cpus across reconfiguration
  watchdog/core, powerpc: Replace watchdog_nmi_reconfigure()
  mmc: sdhci-xenon: Fix clock resource by adding an optional bus clock
  mmc: meson-gx: include tx phase in the tuning process
  mmc: meson-gx: fix rx phase reset
  mmc: meson-gx: make sure the clock is rounded down
  mmc: Delete bounce buffer handling
  lsm: fix smack_inode_removexattr and xattr_getsecurity memleak
  xfs: handle racy AIO in xfs_reflink_end_cow
  xfs: always swap the cow forks when swapping extents
  ARC: [plat-hsdk]: Temporary fix to set CPU frequency to 1GHz
  ARC: fix allnoconfig build warning
  ARCv2: boot log: identify HS48 cores (dual issue)
  ARC: boot log: decontaminate ARCv2 ISA_CONFIG register
  arc: remove redundant UTS_MACHINE define in arch/arc/Makefile
  ARC: [plat-eznps] Update platform maintainer as Noam left
  ARC: [plat-hsdk] use actual clk driver to manage cpu clk
  ARC: [*defconfig] Reenable soft lock-up detector
  ARC: [plat-axs10x] sdio: Temporary fix of sdio ciu frequency
  ARC: [plat-hsdk] sdio: Temporary fix of sdio ciu frequency
  ARC: [plat-axs103] Add temporary quirk to reset ethernet IP
  ARM: defconfig: update Gemini defconfig
  ARM: defconfig: FRAMEBUFFER_CONSOLE can no longer be =m
  include/linux/fs.h: fix comment about struct address_space
  checkpatch: fix ignoring cover-letter logic
  m32r: fix build failure
  lib/ratelimit.c: use deferred printk() version
  kernel/params.c: improve STANDARD_PARAM_DEF readability
  kernel/params.c: fix an overflow in param_attr_show
  kernel/params.c: fix the maximum length in param_get_string
  mm/memory_hotplug: define find_{smallest|biggest}_section_pfn as unsigned long
  mm/memory_hotplug: change pfn_to_section_nr/section_nr_to_pfn macro to inline function
  kernel/kcmp.c: drop branch leftover typo
  memremap: add scheduling point to devm_memremap_pages
  mm, page_alloc: add scheduling point to memmap_init_zone
  mm, memory_hotplug: add scheduling point to __add_pages
  lib/idr.c: fix comment for idr_replace()
  mm: memcontrol: use vmalloc fallback for large kmem memcg arrays
  kernel/sysctl.c: remove duplicate UINT_MAX check on do_proc_douintvec_conv()
  include/linux/bitfield.h: remove 32bit from FIELD_GET comment block
  lib/lz4: make arrays static const, reduces object code size
  exec: binfmt_misc: kill the onstack iname[BINPRM_BUF_SIZE] array
  exec: binfmt_misc: fix race between load_misc_binary() and kill_node()
  exec: binfmt_misc: remove the confusing e->interp_file != NULL checks
  exec: binfmt_misc: shift filp_close(interp_file) from kill_node() to bm_evict_inode()
  exec: binfmt_misc: don't nullify Node->dentry in kill_node()
  exec: load_script: kill the onstack interp[BINPRM_BUF_SIZE] array
  userfaultfd: non-cooperative: fix fork use after free
  mm/device-public-memory: fix edge case in _vm_normal_page()
  mm: fix data corruption caused by lazyfree page
  mm: avoid marking swap cached page as lazyfree
  mm: have filemap_check_and_advance_wb_err clear AS_EIO/AS_ENOSPC
  m32r: define CPU_BIG_ENDIAN
  zram: fix null dereference of handle
  mm: fix RODATA_TEST failure "rodata_test: test data was not read only"
  rapidio: remove global irq spinlocks from the subsystem
  mm: meminit: mark init_reserved_page as __meminit
  z3fold: fix stale list handling
  mm,compaction: serialize waitqueue_active() checks (for real)
  android: binder: drop lru lock in isolate callback
  mm/memcg: avoid page count check for zone device
  mm, memcg: remove hotplug locking from try_charge
  mm, oom_reaper: skip mm structs with mmu notifiers
  z3fold: fix potential race in z3fold_reclaim_page
  sh: sh7269: remove nonexistent GPIO_PH[0-7] to fix pinctrl registration
  sh: sh7264: remove nonexistent GPIO_PH[0-7] to fix pinctrl registration
  sh: sh7757: remove nonexistent GPIO_PT[JLNQ]7_RESV to fix pinctrl registration
  sh: sh7722: remove nonexistent GPIO_PTQ7 to fix pinctrl registration
  mm, hugetlb, soft_offline: save compound page order before page migration
  ksm: fix unlocked iteration over vmas in cmp_and_merge_page()
  include/linux/mm.h: fix typo in VM_MPX definition
  scripts/spelling.txt: add more spelling mistakes to spelling.txt
  kernel/params.c: align add_sysfs_param documentation with code
  alpha: fix build failures
  bpf: fix bpf_tail_call() x64 JIT
  net: stmmac: dwmac-rk: Add RK3128 GMAC support
  blk-mq-debugfs: fix device sched directory for default scheduler
  null_blk: change configfs dependency to select
  blk-throttle: fix possible io stall when upgrade to max
  rndis_host: support Novatel Verizon USB730L
  drm/i915: Fix DDI PHY init if it was already on
  ide: fix IRQ assignment for PCI bus order probing
  ide: pci: free PCI BARs on initialization failure
  ide: free hwif->portdev on hwif_init() failure
  MAINTAINERS: update list for NBD
  net: rtnetlink: fix info leak in RTM_GETSTATS call
  KVM: PPC: Book3S: Fix server always zero from kvmppc_xive_get_xive()
  rcu: Remove extraneous READ_ONCE()s from rcu_irq_{enter,exit}()
  ftrace: Fix kmemleak in unregister_ftrace_graph
  powerpc/4xx: Fix compile error with 64K pages on 40x, 44x
  powerpc: Fix action argument for cpufeatures-based TLB flush
  scsi: ibmvscsis: Fix write_pending failure path
  scsi: libiscsi: Remove iscsi_destroy_session
  scsi: libiscsi: Fix use-after-free race during iscsi_session_teardown
  scsi: sd: Do not override max_sectors_kb sysfs setting
  scsi: sd: Implement blacklist option for WRITE SAME w/ UNMAP
  socket, bpf: fix possible use after free
  nbd: fix -ERESTARTSYS handling
  drm/sun4i: hdmi: Disable clks in bind function error path and unbind function
  ahci: don't ignore result code of ahci_reset_controller()
  mlxsw: spectrum_router: Track RIF of IPIP next hops
  mlxsw: spectrum_router: Move VRF refcounting
  ALSA: usx2y: Suppress kernel warning at page allocation failures
  ceph: fix __choose_mds() for LSSNAP request
  ceph: properly queue cap snap for newly created snap realm
  arm64: fix misleading data abort decoding
  Revert "ALSA: echoaudio: purge contradictions between dimension matrix members and total number of members"
  Revert "HID: multitouch: Support ALPS PTP stick with pid 0x120A"
  HID: hidraw: fix power sequence when closing device
  HID: wacom: Always increment hdev refcount within wacom_get_hdev_data
  mmc: core: add driver strength selection when selecting hs400es
  net: hns3: Fix an error handling path in 'hclge_rss_init_hw()'
  net: mvpp2: Fix clock resource by adding an optional bus clock
  r8152: add Linksys USB3GIGV1 id
  l2tp: fix l2tp_eth module loading
  ip_gre: erspan device should keep dst
  ip_gre: set tunnel hlen properly in erspan_tunnel_init
  ip_gre: check packet length and mtu correctly in erspan_xmit
  ip_gre: get key from session_id correctly in erspan_rcv
  Linux 4.14-rc3
  hwmon: (xgene) Fix up error handling path mixup in 'xgene_hwmon_probe()'
  nvme: fix visibility of "uuid" ns attribute
  tipc: use only positive error codes in messages
  ppp: fix __percpu annotation
  udp: perform source validation for mcast early demux
  IPv4: early demux can return an error code
  ip6_tunnel: update mtu properly for ARPHRD_ETHER tunnel device in tx path
  ip6_gre: ip6gre_tap device should keep dst
  ip_gre: ipgre_tap device should keep dst
  netlink: do not proceed if dump's start() errs
  clk: Export clk_bulk_prepare()
  fix infoleak in waitid(2)
  x86/asm: Use register variable to get stack pointer value
  x86/mm: Disable branch profiling in mem_encrypt.c
  arm64: fault: Route pte translation faults via do_translation_fault
  arm64: mm: Use READ_ONCE when dereferencing pointer to pte table
  RDMA/iwpm: Properly mark end of NL messages
  kvm/x86: Handle async PF in RCU read-side critical sections
  KVM: nVMX: Fix nested #PF intends to break L1's vmlaunch/vmresume
  sched/sysctl: Check user input value of sysctl_sched_time_avg
  x86/asm: Fix inline asm call constraints for GCC 4.4
  sched/debug: Add explicit TASK_PARKED printing
  sched/debug: Ignore TASK_IDLE for SysRq-W
  sched/debug: Add explicit TASK_IDLE printing
  sched/tracing: Use common task-state helpers
  locking/rwsem-xadd: Fix missed wakeup due to reordering of load
  sched/tracing: Fix trace_sched_switch task-state printing
  sched/debug: Remove unused variable
  sched/debug: Convert TASK_state to hex
  sched/debug: Implement consistent task-state printing
  um/time: Fixup namespace collision
  perf/aux: Only update ->aux_wakeup in non-overwrite mode
  cxl: Fix memory page not handled
  powerpc: Fix workaround for spurious MCE on POWER9
  PM / s2idle: Invoke the ->wake() platform callback earlier
  Revert "Bluetooth: Add option for disabling legacy ioctl interfaces"
  net: Set sk_prot_creator when cloning sockets to the right proto
  net: dsa: mv88e6xxx: lock mutex when freeing IRQs
  packet: only test po->has_vnet_hdr once in packet_snd
  packet: in packet_do_bind, test fanout with bind_lock held
  net: stmmac: dwmac4: Re-enable MAC Rx before powering down
  net: stmmac: dwc-qos: Add suspend / resume support
  net: dsa: Fix network device registration order
  net: dsa: mv88e6xxx: Allow dsa and cpu ports in multiple vlans
  inetpeer: fix RCU lookup() again
  net: mvpp2: do not select the internal source clock
  net: mvpp2: fix port list indexing
  net: mvpp2: fix parsing fragmentation detection
  dm crypt: fix memory leak in crypt_ctr_cipher_old()
  perf test: Fix vmlinux failure on s390x part 2
  perf test: Fix vmlinux failure on s390x
  KVM: VMX: use cmpxchg64
  tun: bail out from tun_get_user() if the skb is empty
  percpu: fix iteration to prevent skipping over block
  timer: Prepare to change timer callback argument type
  xen/mmu: Call xen_cleanhighmap() with 4MB aligned for page tables mapping
  xen-pciback: relax BAR sizing write value check
  watchdog/hardlockup/perf: Fix spelling mistake: "permanetely" -> "permanently"
  irq/generic-chip: Don't replace domain's name
  usb: dwc3: of-simple: Add compatible for Spreadtrum SC9860 platform
  usb: gadget: udc: atmel: set vbus irqflags explicitly
  usb: gadget: ffs: handle I/O completion in-order
  usb: renesas_usbhs: fix usbhsf_fifo_clear() for RX direction
  usb: renesas_usbhs: fix the BCLR setting condition for non-DCP pipe
  usb: gadget: udc: renesas_usb3: Fix return value of usb3_write_pipe()
  usb: gadget: udc: renesas_usb3: fix Pn_RAMMAP.Pn_MPKT value
  usb: gadget: udc: renesas_usb3: fix for no-data control transfer
  USB: dummy-hcd: Fix erroneous synchronization change
  USB: dummy-hcd: fix infinite-loop resubmission bug
  USB: dummy-hcd: fix connection failures (wrong speed)
  seccomp: fix the usage of get/put_seccomp_filter() in seccomp_get_filter()
  objtool: Support unoptimized frame pointer setup
  objtool: Skip unreachable warnings for GCC 4.4 and older
  net/mlx5: Fix wrong indentation in enable SRIOV code
  net/mlx5: Fix static checker warning on steering tracepoints code
  net/mlx5e: Fix calculated checksum offloads counters
  net/mlx5e: Don't add/remove 802.1ad rules when changing 802.1Q VLAN filter
  net/mlx5e: Print netdev features correctly in error message
  net/mlx5e: Check encap entry state when offloading tunneled flows
  net/mlx5e: Disallow TC offloading of unsupported match/action combinations
  net/mlx5e: Fix erroneous freeing of encap header buffer
  net/mlx5: Check device capability for maximum flow counters
  net/mlx5: Fix FPGA capability location
  net/mlx5e: IPoIB, Fix access to invalid memory address
  md/raid5: cap worker count
  dm-raid: fix a race condition in request handling
  md: fix a race condition for flush request handling
  md: separate request handling
  scsi: ILLEGAL REQUEST + ASC==27 => target failure
  scsi: aacraid: Add a small delay after IOP reset
  cpufreq: docs: Drop intel-pstate.txt from index.txt
  percpu: fix starting offset for chunk statistics traversal
  ACPI / APEI: clear error status before acknowledging the error
  bcache: use llist_for_each_entry_safe() in __closure_wake_up()
  mtd: nand: atmel: fix buffer overflow in atmel_pmecc_user
  IB/hfi1: Unsuccessful PCIe caps tuning should not fail driver load
  IB/hfi1: On error, fix use after free during user context setup
  Revert "IB/ipoib: Update broadcast object if PKey value was changed in index 0"
  IB/hfi1: Return correct value in general interrupt handler
  IB/hfi1: Check eeprom config partition validity
  IB/hfi1: Only reset QSFP after link up and turn off AOC TX
  IB/hfi1: Turn off AOC TX after offline substates
  iommu: Fix comment for iommu_ops.map_sg
  iommu/amd: pr_err() strings should end with newlines
  iommu/mediatek: Limit the physical address in 32bit for v7s
  iommu/io-pgtable-arm-v7s: Need dma-sync while there is no QUIRK_NO_DMA
  mtd: Fix partition alignment check on multi-erasesize devices
  KVM: VMX: simplify and fix vmx_vcpu_pi_load
  KVM: VMX: avoid double list add with VT-d posted interrupts
  KVM: VMX: extract __pi_post_block
  arm64: Make sure SPsel is always set
  quota: Fix quota corruption with generic/232 test
  platform/x86: fujitsu-laptop: Don't oops when FUJ02E3 is not present
  sctp: Fix a big endian bug in sctp_diag_dump()
  vfs: Return -ENXIO for negative SEEK_HOLE / SEEK_DATA offsets
  atlantic: fix iommu errors
  aquantia: Fix transient invalid link down/up indications
  aquantia: Fix Tx queue hangups
  aquantia: Setup max_mtu in ndev to enable jumbo frames
  xfs: revert "xfs: factor rmap btree size into the indlen calculations"
  xfs: Capture state of the right inode in xfs_iflush_done
  xfs: perag initialization should only touch m_ag_max_usable for AG 0
  xfs: update i_size after unwritten conversion in dio completion
  iomap_dio_rw: Allocate AIO completion queue before submitting dio
  xfs: validate bdev support for DAX inode flag
  l2tp: fix race condition in l2tp_tunnel_delete
  vti: fix use after free in vti_tunnel_xmit/vti6_tnl_xmit
  drm/i915/bios: ignore HDMI on port A
  drm/i915: remove redundant variable hw_check
  drm/i915: always update ELD connector type after get modes
  percpu: make this_cpu_generic_read() atomic w.r.t. interrupts
  arm64: dts: rockchip: add the grf clk for dw-mipi-dsi on rk3399
  btrfs: log csums for all modified extents
  Btrfs: fix unexpected result when dio reading corrupted blocks
  btrfs: Report error on removing qgroup if del_qgroup_item fails
  Btrfs: skip checksum when reading compressed data if some IO have failed
  Btrfs: fix kernel oops while reading compressed data
  Btrfs: use btrfs_op instead of bio_op in __btrfs_map_block
  Btrfs: do not backup tree roots when fsync
  btrfs: remove BTRFS_FS_QUOTA_DISABLING flag
  btrfs: propagate error to btrfs_cmp_data_prepare caller
  btrfs: prevent to set invalid default subvolid
  Btrfs: send: fix error number for unknown inode types
  btrfs: fix NULL pointer dereference from free_reloc_roots()
  btrfs: finish ordered extent cleaning if no progress is found
  btrfs: clear ordered flag on cleaning up ordered extents
  Btrfs: fix incorrect {node,sector}size endianness from BTRFS_IOC_FS_INFO
  Btrfs: do not reset bio->bi_ops while writing bio
  Btrfs: use the new helper wbc_to_write_flags
  powerpc: Handle MCE on POWER9 with only DSISR bit 30 set
  drm/tegra: trace: Fix path to include
  x86/fpu: Use using_compacted_format() instead of open coded X86_FEATURE_XSAVES
  x86/fpu: Use validate_xstate_header() to validate the xstate_header in copy_user_to_xstate()
  x86/fpu: Eliminate the 'xfeatures' local variable in copy_user_to_xstate()
  x86/fpu: Copy the full header in copy_user_to_xstate()
  x86/fpu: Use validate_xstate_header() to validate the xstate_header in copy_kernel_to_xstate()
  x86/fpu: Eliminate the 'xfeatures' local variable in copy_kernel_to_xstate()
  x86/fpu: Copy the full state_header in copy_kernel_to_xstate()
  x86/fpu: Use validate_xstate_header() to validate the xstate_header in __fpu__restore_sig()
  x86/fpu: Use validate_xstate_header() to validate the xstate_header in xstateregs_set()
  x86/fpu: Introduce validate_xstate_header()
  x86/fpu: Rename fpu__activate_fpstate_read/write() to fpu__prepare_[read|write]()
  x86/fpu: Rename fpu__activate_curr() to fpu__initialize()
  x86/fpu: Simplify and speed up fpu__copy()
  x86/fpu: Fix stale comments about lazy FPU logic
  x86/fpu: Rename fpu::fpstate_active to fpu::initialized
  x86/fpu: Remove fpu__current_fpstate_write_begin/end()
  x86/fpu: Fix fpu__activate_fpstate_read() and update comments
  netlink: fix nla_put_{u8,u16,u32} for KASAN
  rocker: fix rocker_tlv_put_* functions for KASAN
  scsi: scsi_transport_fc: Also check for NOTPRESENT in fc_remote_port_add()
  xfs: remove redundant re-initialization of total_nr_pages
  xfs: Output warning message when discard option was enabled even though the device does not support discard
  xfs: report zeroed or not correctly in xfs_zero_range()
  xfs: kill meaningless variable 'zero'
  fs/xfs: Use %pS printk format for direct addresses
  xfs: evict CoW fork extents when performing finsert/fcollapse
  xfs: don't unconditionally clear the reflink flag on zero-block files
  fix a typo in put_compat_shm_info()
  PCI: Fix race condition with driver_override
  net: qcom/emac: specify the correct size when mapping a DMA buffer
  cpufreq: dt: Fix sysfs duplicate filename creation for platform-device
  scsi: scsi_transport_fc: set scsi_target_id upon rescan
  PM / OPP: Call notifier without holding opp_table->lock
  security/keys: rewrite all of big_key crypto
  security/keys: properly zero out sensitive key material in big_key
  l2tp: fix race between l2tp_session_delete() and l2tp_tunnel_closeall()
  l2tp: ensure sessions are freed after their PPPOL2TP socket
  smp/hotplug: Hotplug state fail injection
  smp/hotplug: Differentiate the AP completion between up and down
  smp/hotplug: Differentiate the AP-work lockdep class between up and down
  smp/hotplug: Callback vs state-machine consistency
  smp/hotplug: Rewrite AP state machine core
  smp/hotplug: Allow external multi-instance rollback
  smp/hotplug: Add state diagram
  MAINTAINERS: Add entry for MediaTek PMIC LED driver
  scsi: scsi_transport_iscsi: fix the issue that iscsi_if_rx doesn't parse nlmsg properly
  irqdomain: Add __rcu annotations to radix tree accessors
  irqchip/mips-gic: Use effective affinity to unmask
  irqchip/mips-gic: Fix shifts to extract register fields
  nvme-fcloop: fix port deletes and callbacks
  nvmet-fc: sync header templates with comments
  nvmet-fc: ensure target queue id within range.
  nvmet-fc: on port remove call put outside lock
  nvme-rdma: don't fully stop the controller in error recovery
  nvme-rdma: give up reconnect if state change fails
  nvme-core: Use nvme_wq to queue async events and fw activation
  nvme: fix sqhd reference when admin queue connect fails
  watchdog/hardlockup/perf: Cure UP damage
  gfs2: Fix debugfs glocks dump
  selftests: timers: set-timer-lat: Fix hang when testing unsupported alarms
  selftests: timers: set-timer-lat: fix hang when std out/err are redirected
  selftests/memfd: correct run_tests.sh permission
  selftests/seccomp: Support glibc 2.26 siginfo_t.h
  selftests: futex: Makefile: fix for loops in targets to run silently
  selftests: Makefile: fix for loops in targets to run silently
  selftests: mqueue: Use full path to run tests from Makefile
  selftests: futex: copy sub-dir test scripts for make O=dir run
  PCI: Add dummy pci_acs_enabled() for CONFIG_PCI=n build
  IB/mlx5: Fix NULL dereference on mlx5_ib_update_xlt failure
  IB/mlx5: Simplify mlx5_ib_cont_pages
  IB/ipoib: Fix inconsistency with free_netdev and free_rdma_netdev
  IB/ipoib: Fix sysfs Pkey create<->remove possible deadlock
  IB: Correct MR length field to be 64-bit
  IB/core: Fix qp_sec use after free access
  IB/core: Fix typo in the name of the tag-matching cap struct
  perf tools: Fix syscalltbl build failure
  perf report: Fix debug messages with --call-graph option
  dm ioctl: fix alignment of event number in the device list
  block: fix a crash caused by wrong API
  fs: Fix page cache inconsistency when mixing buffered and AIO DIO
  nvmet: implement valid sqhd values in completions
  nvme-fabrics: Allow 0 as KATO value
  nvme: allow timed-out ios to retry
  nvme: stop aer posting if controller state not live
  nvme-pci: Print invalid SGL only once
  nvme-pci: initialize queue memory before interrupts
  nvmet-fc: fix failing max io queue connections
  nvme-fc: use transport-specific sgl format
  nvme: add transport SGL definitions
  nvme.h: remove FC transport-specific error values
  qla2xxx: remove use of FC-specific error codes
  lpfc: remove use of FC-specific error codes
  nvmet-fcloop: remove use of FC-specific error codes
  nvmet-fc: remove use of FC-specific error codes
  nvme-fc: remove use of FC-specific error codes
  loop: remove union of use_aio and ref in struct loop_cmd
  blktrace: Fix potential deadlock between delete & sysfs ops
  nbd: ignore non-nbd ioctl's
  bsg-lib: don't free job in bsg_prepare_job
  brd: fix overflow in __brd_direct_access
  genirq: Check __free_irq() return value for NULL
  futex: Fix pi_state->owner serialization
  KEYS: use kmemdup() in request_key_auth_new()
  KEYS: restrict /proc/keys by credentials at open time
  KEYS: reset parent each time before searching key_user_tree
  KEYS: prevent KEYCTL_READ on negative key
  KEYS: prevent creating a different user's keyrings
  KEYS: fix writing past end of user-supplied buffer in keyring_read()
  KEYS: fix key refcount leak in keyctl_read_key()
  KEYS: fix key refcount leak in keyctl_assume_authority()
  KEYS: don't revoke uninstantiated key in request_key_auth_new()
  KEYS: fix cred refcount leak in request_key_auth_new()
  perf evsel: Fix attr.exclude_kernel setting for default cycles:p
  tools include: Sync kernel ABI headers with tooling headers
  perf tools: Get all of tools/{arch,include}/ in the MANIFEST
  arch: change default endian for microblaze
  microblaze: Cocci spatch "vma_pages"
  microblaze: Add missing kvm_para.h to Kbuild
  perf/x86/intel/uncore: Correct num_boxes for IIO and IRP
  USB: cdc-wdm: ignore -EPIPE from GetEncapsulatedResponse
  USB: devio: Don't corrupt user memory
  USB: devio: Prevent integer overflow in proc_do_submiturb()
  perf/x86/intel/rapl: Add missing CPU IDs
  perf/x86/msr: Add missing CPU IDs
  perf/x86/intel/cstate: Add missing CPU IDs
  x86: Don't cast away the __user in __get_user_asm_u64()
  x86/sysfs: Fix off-by-one error in loop termination
  x86/mm: Fix fault error path using unsafe vma pointer
  x86/numachip: Add const and __initconst to numachip2_clockevent
  x86/fpu: Reinitialize FPU registers if restoring FPU state fails
  x86/fpu: Don't let userspace set bogus xcomp_bv
  qxl: fix framebuffer unpinning
  Linux 4.14-rc2
  staging: iio: ad7192: Fix - use the dedicated reset function avoiding dma from stack.
  iio: core: Return error for failed read_reg
  iio: ad7793: Fix the serial interface reset
  iio: ad_sigma_delta: Implement a dedicated reset function
  IIO: BME280: Updates to Humidity readings need ctrl_reg write!
  iio: adc: mcp320x: Fix readout of negative voltages
  iio: adc: mcp320x: Fix oops on module unload
  iio: adc: stm32: fix bad error check on max_channels
  iio: trigger: stm32-timer: fix a corner case to write preset
  iio: trigger: stm32-timer: preset shouldn't be buffered
  iio: adc: twl4030: Return an error if we can not enable the vusb3v1 regulator in 'twl4030_madc_probe()'
  iio: adc: twl4030: Disable the vusb3v1 regulator in the error handling path of 'twl4030_madc_probe()'
  iio: adc: twl4030: Fix an error handling path in 'twl4030_madc_probe()'
  x86/fpu: Turn WARN_ON() in context switch into WARN_ON_FPU()
  x86/fpu: Fix boolreturn.cocci warnings
  x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake CPUs
  x86/fpu: Remove struct fpu::fpregs_active
  x86/fpu: Decouple fpregs_activate()/fpregs_deactivate() from fpu->fpregs_active
  x86/fpu: Change fpu->fpregs_active users to fpu->fpstate_active
  x86/fpu: Split the state handling in fpu__drop()
  x86/fpu: Make the fpu state change in fpu__clear() scheduler-atomic
  x86/fpu: Simplify fpu->fpregs_active use
  x86/fpu: Flip the parameter order in copy_*_to_xstate()
  x86/fpu: Remove 'kbuf' parameter from the copy_user_to_xstate() API
  x86/fpu: Remove 'ubuf' parameter from the copy_kernel_to_xstate() API
  x86/fpu: Split copy_user_to_xstate() into copy_kernel_to_xstate() & copy_user_to_xstate()
  x86/fpu: Simplify __copy_xstate_to_kernel() return values
  x86/fpu: Change 'size_total' parameter to unsigned and standardize the size checks in copy_xstate_to_*()
  x86/fpu: Clarify parameter names in the copy_xstate_to_*() methods
  x86/fpu: Remove the 'start_pos' parameter from the __copy_xstate_to_*() functions
  x86/fpu: Clean up the parameter definitions of copy_xstate_to_*()
  x86/fpu: Clean up parameter order in the copy_xstate_to_*() APIs
  x86/fpu: Remove 'kbuf' parameter from the copy_xstate_to_user() APIs
  x86/fpu: Remove 'ubuf' parameter from the copy_xstate_to_kernel() APIs
  x86/fpu: Split copy_xstate_to_user() into copy_xstate_to_kernel() & copy_xstate_to_user()
  x86/fpu: Rename copyin_to_xsaves()/copyout_from_xsaves() to copy_user_to_xstate()/copy_xstate_to_user()
  tpm: ibmvtpm: simplify crq initialization and document crq format
  tpm: replace msleep() with usleep_range() in TPM 1.2/2.0 generic drivers
  Documentation: tpm: add powered-while-suspended binding documentation
  tpm: tpm_crb: constify acpi_device_id.
  tpm: vtpm: constify vio_device_id
  security: fix description of values returned by cap_inode_need_killpriv
  net: qualcomm: rmnet: Fix rcu splat in rmnet_is_real_dev_registered
  cnic: Fix an error handling path in 'cnic_alloc_bnx2x_resc()'
  tracing: Remove RCU work arounds from stack tracer
  extable: Enable RCU if it is not watching in kernel_text_address()
  extable: Consolidate *kernel_text_address() functions
  rcu: Allow for page faults in NMI handlers
  as3645a: Unregister indicator LED on device unbind
  as3645a: Use integer numbers for parsing LEDs
  dt: bindings: as3645a: Use LED number to refer to LEDs
  as3645a: Use ams,input-max-microamp as documented in DT bindings
  x86/asm: Fix inline asm call constraints for Clang
  objtool: Handle another GCC stack pointer adjustment bug
  inet: fix improper empty comparison
  net: use inet6_rcv_saddr to compare sockets
  net: set tb->fast_sk_family
  net: orphan frags on stand-alone ptype in dev_queue_xmit_nit
  MAINTAINERS: update git tree locations for ieee802154 subsystem
  SMB3: Don't ignore O_SYNC/O_DSYNC and O_DIRECT flags
  SMB3: handle new statx fields
  arch: remove unused *_segments() macros/functions
  parisc: Unbreak bootloader due to gcc-7 optimizations
  parisc: Reintroduce option to gzip-compress the kernel
  apparmor: fix apparmorfs DAC access permissions
  apparmor: fix build failure on sparc caused by undeclared signals
  apparmor: fix incorrect type assignment when freeing proxies
  apparmor: ensure unconfined profiles have dfas initialized
  apparmor: fix race condition in null profile creation
  apparmor: move new_null_profile to after profile lookup fns()
  apparmor: add base infrastructure for socket mediation
  apparmor: add more debug asserts to apparmorfs
  apparmor: make policy_unpack able to audit different info messages
  apparmor: add support for absolute root view based labels
  apparmor: cleanup conditional check for label in label_print
  apparmor: add mount mediation
  apparmor: add the ability to mediate signals
  apparmor: Redundant condition: prev_ns. in [label.c:1498]
  apparmor: Fix an error code in aafs_create()
  apparmor: Fix logical error in verify_header()
  apparmor: Fix shadowed local variable in unpack_trans_table()
  bnxt_re: Don't issue cmd to delete GID for QP1 GID entry before the QP is destroyed
  bnxt_re: Fix memory leak in FRMR path
  bnxt_re: Remove RTNL lock dependency in bnxt_re_query_port
  bnxt_re: Fix race between the netdev register and unregister events
  bnxt_re: Free up devices in module_exit path
  bnxt_re: Fix compare and swap atomic operands
  bnxt_re: Stop issuing further cmds to FW once a cmd times out
  bnxt_re: Fix update of qplib_qp.mtu when modified
  parisc: Add HWPOISON page fault handler code
  parisc: Move init_per_cpu() into init section
  parisc: Check if initrd was loaded into broken RAM
  parisc: Add PDCE_CHECK instruction to HPMC handler
  parisc: Add wrapper for pdc_instr() firmware function
  parisc: Move start_parisc() into init section
  parisc: Stop unwinding at start of stack
  parisc: Fix too large frame size warnings
  i40iw: Add support for port reuse on active side connections
  i40iw: Add missing VLAN priority
  i40iw: Call i40iw_cm_disconn on modify QP to disconnect
  i40iw: Prevent multiple netdev event notifier registrations
  i40iw: Fail open if there are no available MSI-X vectors
  RDMA/vmw_pvrdma: Fix reporting correct opcodes for completion
  IB/bnxt_re: Fix frame stack compilation warning
  IB/mlx5: fix debugfs cleanup
  IB/ocrdma: fix incorrect fall-through on switch statement
  IB/ipoib: Suppress the retry related completion errors
  Input: elan_i2c - extend Flash-Write delay
  iw_cxgb4: remove the stid on listen create failure
  iw_cxgb4: drop listen destroy replies if no ep found
  iw_cxgb4: put ep reference in pass_accept_req()
  USB: g_mass_storage: Fix deadlock when driver is unbound
  USB: gadgetfs: Fix crash caused by inadequate synchronization
  USB: gadgetfs: fix copy_to_user while holding spinlock
  USB: uas: fix bug in handling of alternate settings
  IB/core: Fix for core panic
  cgroup: Reinit cgroup_taskset structure before cgroup_migrate_execute() returns
  ALSA: usb-audio: Check out-of-bounds access by corrupted buffer descriptor
  drivers/perf: arm_pmu_acpi: Release memory obtained by kasprintf
  iommu/of: Remove PCI host bridge node check
  ALSA: pcm: Fix structure definition for X32 ABI
  mmc: sdhci-pci: Fix voltage switch for some Intel host controllers
  staging: rtl8723bs: avoid null pointer dereference on pmlmepriv
  staging: rtl8723bs: add missing range check on id
  mmc: tmio: remove broken and noisy debug macro
  KVM: PPC: Book3S HV: Check for updated HDSISR on P9 HDSI exception
  KVM: nVMX: fix HOST_CR3/HOST_CR4 cache
  Drivers: hv: fcopy: restore correct transfer length
  vmbus: don't acquire the mutex in vmbus_hvsock_device_unregister()
  intel_th: pci: Add Lewisburg PCH support
  intel_th: pci: Add Cedar Fork PCH support
  stm class: Fix a use-after-free
  usb-storage: unusual_devs entry to fix write-access regression for Seagate external drives
  usb-storage: fix bogus hardware error messages for ATA pass-thru devices
  drm/sun4i: cec: Enable back CEC-pin framework
  net: prevent dst uses after free
  net: phy: Fix truncation of large IRQ numbers in phy_attached_print()
  dt-bindings: clk: stm32h7: fix clock-cell size
  Input: uinput - avoid crash when sending FF request to device going away
  Input: uinput - avoid FF flush when destroying device
  net/smc: no close wait in case of process shut down
  net/smc: introduce a delay
  net/smc: terminate link group if out-of-sync is received
  net/smc: longer delay for client link group removal
  net/smc: adapt send request completion notification
  net/smc: adjust net_device refcount
  net/smc: take RCU read lock for routing cache lookup
  net/smc: add receive timeout check
  net/smc: add missing dev_put
  net: stmmac: Cocci spatch "of_table"
  lan78xx: Use default values loaded from EEPROM/OTP after reset
  lan78xx: Allow EEPROM write for less than MAX_EEPROM_SIZE
  lan78xx: Fix for eeprom read/write when device auto suspend
  net: phy: Keep reporting transceiver type
  net: ethtool: Add back transceiver type
  net: qcom/emac: add software control for pause frame mode
  hv_netvsc: fix send buffer failure on MTU change
  net_sched: remove cls_flower idr on failure
  net_sched/hfsc: fix curve activation in hfsc_change_class()
  net_sched: always reset qdisc backlog in qdisc_reset()
  x86/xen: clean up clang build warning
  USB: core: harden cdc_parse_cdc_header
  ath10k: mark PM functions as __maybe_unused
  MIPS: PCI: fix pcibios_map_irq section mismatch
  MIPS: Fix input modify in __write_64bit_c0_split()
  MIPS: MSP71xx: Include asm/setup.h
  selftests: lib.mk: copy test scripts and test files for make O=dir run
  selftests: sync: kselftest and kselftest-clean fail for make O=dir case
  selftests: sync: use TEST_CUSTOM_PROGS instead of TEST_PROGS
  selftests: lib.mk: add TEST_CUSTOM_PROGS to allow custom test run/install
  selftests: watchdog: fix to use TEST_GEN_PROGS and remove clean
  selftests: lib.mk: fix test executable status check to use full path
  selftests: Makefile: clear LDFLAGS for make O=dir use-case
  selftests: lib.mk: kselftest and kselftest-clean fail for make O=dir case
  Makefile: kselftest and kselftest-clean fail for make O=dir case
  reset: Restrict RESET_HSDK to ARC_SOC_HSDK or COMPILE_TEST
  Revert "genirq: Restrict effective affinity to interrupts actually using it"
  powerpc/pseries: Fix parent_dn reference leak in add_dt_node()
  powerpc/pseries: Fix "OF: ERROR: Bad of_node_put() on /cpus" during DLPAR
  powerpc/eeh: Create PHB PEs after EEH is initialized
  ipc/shm: Fix order of parameters when calling copy_compat_shmid_to_user
  iov_iter: fix page_copy_sane for compound pages
  SMB: Validate negotiate (to protect against downgrade) even if signing off
  cifs: release auth_key.response for reconnect.
  cifs: release cifs root_cred after exit_cifs
  CIFS: make arrays static const, reduces object code size
  net: hns3: Fix for pri to tc mapping in TM
  net: hns3: Fix for setting rss_size incorrectly
  net: hns3: Fix typo error for feild in hclge_tm
  net: hns3: Fix for rx priv buf allocation when DCB is not supported
  net: hns3: Fix for rx_priv_buf_alloc not setting rx shared buffer
  net: hns3: Fix for not setting rx private buffer size to zero
  net: hns3: Fix for DEFAULT_DV when dev doesn't support DCB
  net: hns3: Fix initialization when cmd is not supported
  net: hns3: Cleanup for ROCE capability flag in ae_dev
  isdn/i4l: fetch the ppp_write buffer in one shot
  net: fec: return IRQ_HANDLED if fec_ptp_check_pps_event handled it
  net: fec: remove unused interrupt FEC_ENET_TS_TIMER
  net: fec: only check queue 0 if RXF_0/TXF_0 interrupt is set
  net: change skb->mac_header when Generic XDP calls adjust_head
  net: compat: assert the size of cmsg copied in is as expected
  drm/amdkfd: Print event limit messages only once per process
  drm/amdkfd: Fix kernel-queue wrapping bugs
  drm/amdkfd: Fix incorrect destroy_mqd parameter
  [SMB3] Update session and share information displayed for debugging SMB2/SMB3
  bpf: one perf event close won't free bpf program attached by another perf event
  packet: hold bind lock when rebinding to fanout hook
  ALSA: usb-audio: Add sample rate quirk for Plantronics C310/C520-M
  PCI: endpoint: Use correct "end of test" interrupt
  scripts/dtc: dtx_diff - 2nd update of include dts paths to match build
  kbuild: rpm-pkg: fix version number handling
  kbuild: deb-pkg: remove firmware package support
  kbuild: rpm-pkg: delete firmware_install to fix build error
  qtnfmac: cancel scans on wireless interface changes
  qtnfmac: lock access to h/w in tx path
  usb: gadget: dummy: fix nonsensical comparisons
  usb: gadget: udc: fix snps_udc_plat.c build errors
  usb: gadget: function: printer: avoid spinlock recursion
  usb: gadget: core: fix ->udc_set_speed() logic
  s390/topology: enable / disable topology dynamically
  s390/topology: alternative topology for topology-less machines
  powerpc/kprobes: Update optprobes to use emulate_update_regs()
  ALSA: hda - program ICT bits to support HBR audio
  crypto: af_alg - update correct dst SGL entry
  crypto: caam - fix LS1021A support on ARMv7 multiplatform kernel
  crypto: inside-secure - fix gcc-4.9 warnings
  crypto: talitos - Don't provide setkey for non hmac hashing algs.
  crypto: talitos - fix hashing
  crypto: talitos - fix sha224
  crypto: x86/twofish - Fix RBP usage
  crypto: sha512-avx2 - Fix RBP usage
  crypto: x86/sha256-ssse3 - Fix RBP usage
  crypto: x86/sha256-avx2 - Fix RBP usage
  crypto: x86/sha256-avx - Fix RBP usage
  crypto: x86/sha1-ssse3 - Fix RBP usage
  crypto: x86/sha1-avx2 - Fix RBP usage
  crypto: x86/des3_ede - Fix RBP usage
  crypto: x86/cast6 - Fix RBP usage
  crypto: x86/cast5 - Fix RBP usage
  crypto: x86/camellia - Fix RBP usage
  crypto: x86/blowfish - Fix RBP usage
  crypto: drbg - fix freeing of resources
  MIPS: Fix perf event init
  ARM: dts: da850-evm: add serial and ethernet aliases
  cifs: show 'soft' in the mount options for hard mounts
  SMB3: Warn user if trying to sign connection that authenticated as guest
  SMB3: Fix endian warning
  brcmfmac: setup passive scan if requested by user-space
  brcmfmac: add length check in brcmf_cfg80211_escan_handler()
  powerpc/powernv: Clear LPCR[PECE1] via stop-api only for deep state offline
  powerpc/sstep: mullw should calculate a 64 bit signed result
  powerpc/sstep: Fix issues with mcrf
  powerpc/sstep: Fix issues with set_cr0()
  powerpc/tm: Flush TM only if CPU has TM feature
  powerpc/sysrq: Fix oops whem ppmu is not registered
  powerpc/configs: Update for CONFIG_SND changes
  drm/exynos/hdmi: Fix unsafe list iteration
  Fix SMB3.1.1 guest authentication to Samba
  ipv6: fix net.ipv6.conf.all interface DAD handlers
  net: ipv6: fix regression of no RTM_DELADDR sent after DAD failure
  bpf: fix ri->map_owner pointer on bpf_prog_realloc
  net: emac: Fix napi poll list corruption
  tcp: fastopen: fix on syn-data transmit failure
  net: hns3: Fixes the premature exit of loop when matching clients
  net: hns3: Fixes the default VLAN-id of PF
  net: hns3: Fixes the ether address copy with appropriate API
  net: hns3: Fixes the initialization of MAC address in hardware
  net: hns3: Fixes ring-to-vector map-and-unmap command
  net: hns3: Fixes the command used to unmap ring from vector
  net: hns3: Fixes initialization of phy address from firmware
  cpufreq: ti-cpufreq: Support additional am43xx platforms
  bpf: do not disable/enable BH in bpf_map_free_id()
  tracing: Fix trace_pipe behavior for instance traces
  rhashtable: Documentation tweak
  ACPI: properties: Return _DSD hierarchical extension (data) sub-nodes correctly
  ARM: cpuidle: Avoid memleak if init fail
  cpufreq: dt-platdev: Add some missing platforms to the blacklist
  PM: core: Fix device_pm_check_callbacks()
  PM: docs: Drop an excess character from devices.rst
  net: phy: Kconfig: Fix PHY infrastructure menu in menuconfig
  ACPI / bus: Make ACPI_HANDLE() work for non-GPL code again
  selftests/net: msg_zerocopy enable build with older kernel headers
  selftests: actually run the various net selftests
  selftest: add a reuseaddr test
  selftests: silence test output by default
  ALSA: asihpi: fix a potential double-fetch bug when copying puhm
  MIPS: PCI: Move map_irq() hooks out of initdata
  ceph: avoid panic in create_session_open_msg() if utsname() returns NULL
  irqchip.mips-gic: Fix shared interrupt mask writes
  irqchip/gic-v4: Fix building with ancient gcc
  irqchip/gic-v3: Iterate over possible CPUs by for_each_possible_cpu()
  libceph: don't allow bidirectional swap of pg-upmap-items
  ARM: dts: am43xx-epos-evm: Remove extra CPSW EMAC entry
  ARM: dts: am33xx: Add spi alias to match SOC schematics
  ARM: OMAP2+: hsmmc: fix logic to call either omap_hsmmc_init or omap_hsmmc_late_init but not both
  ARM: dts: dra7: Set a default parent to mcasp3_ahclkx_mux
  ARM: OMAP2+: dra7xx: Set OPT_CLKS_IN_RESET flag for gpio1
  ARM: dts: nokia n900: drop unneeded/undocumented parts of the dts
  MAINTAINERS: Remove Yuval Mintz from maintainers list
  arm64: dts: rockchip: Correct MIPI DPHY PLL clock on rk3399
  dt-bindings: fix vendor prefix for Abracon
  of: provide inline helper for of_find_device_by_node
  tracing: Ignore mmiotrace from kernel commandline
  tracing: Erase irqsoff trace with empty write
  USB: fix out-of-bounds in usb_set_configuration
  arm64: dt marvell: Fix AP806 system controller size
  MAINTAINERS: add Macchiatobin maintainers entry
  iommu/qcom: Depend on HAS_DMA to fix compile error
  xen, arm64: drop dummy lookup_address()
  KVM: VMX: remove WARN_ON_ONCE in kvm_vcpu_trigger_posted_interrupt
  KVM: VMX: do not change SN bit in vmx_update_pi_irte()
  KVM: x86: Fix the NULL pointer parameter in check_cr_write()
  drm: exynos: include linux/irq.h
  drm/exynos: Fix suspend/resume support
  drm/exynos: Fix locking in the suspend/resume paths
  iommu/vt-d: Fix harmless section mismatch warning
  iommu: Add missing dependencies
  driver core: remove DRIVER_ATTR
  fpga: altera-cvp: remove DRIVER_ATTR() usage
  Revert "KVM: Don't accept obviously wrong gsi values via KVM_IRQFD"
  s390/mm: fix write access check in gup_huge_pmd()
  s390/mm: make pmdp_invalidate() do invalidation only
  s390/cio: recover from bad paths
  s390/scm_blk: consistently use blk_status_t as error type
  net: systemport: Fix 64-bit statistics dependency
  8139too: revisit napi_complete_done() usage
  fcntl: Don't set si_code to SI_SIGIO when sig == SIGPOLL
  ata_piix: Add Fujitsu-Siemens Lifebook S6120 to short cable IDs
  Documentation: core-api: minor workqueue.rst cleanups
  libnvdimm, namespace: fix btt claim class crash
  tcp: remove two unused functions
  tools/testing/nvdimm: disable labels for nfit_test.1
  bpf: devmap: pass on return value of bpf_map_precharge_memlock
  bnxt_en: check for ingress qdisc in flower offload
  ACPI / watchdog: properly initialize resources
  Documentation: networking: fix ASCII art in switchdev.txt
  net/sched: cls_matchall: fix crash when used with classful qdisc
  ip6_tunnel: do not allow loading ip6_tunnel if ipv6 is disabled in cmdline
  net: phy: Fix mask value write on gmii2rgmii converter speed register
  drm/i915: Remove unused 'in_vbl' from i915_get_crtc_scanoutpos()
  drm/i915/cnp: set min brightness from VBT
  Revert "drm/i915/bxt: Disable device ready before shutdown command"
  drm/i915/bxt: set min brightness from VBT
  drm/i915: Fix an error handling in 'intel_framebuffer_init()'
  drm/i915/gvt: Fix incorrect PCI BARs reporting
  ip6_gre: skb_push ipv6hdr before packing the header in ip6gre_header
  nl80211: fix null-ptr dereference on invalid mesh configuration
  udpv6: Fix the checksum computation when HW checksum does not apply
  selftests/ftrace: multiple_kprobes: Also check for support
  selftests/bpf: Make bpf_util work on uniprocessor systems
  selftests/intel_pstate: No need to compile test progs in the run script
  selftests: intel_pstate: build only on x86
  selftests: breakpoints: re-order TEST_GEN_PROGS targets
  tools: fix testing/selftests/sigaltstack for s390x
  selftests: net: More graceful finding of `ip'.
  serial: sccnxp: Fix error handling in sccnxp_probe()
  tty: serial: lpuart: avoid report NULL interrupt
  serial: bcm63xx: fix timing issue.
  mxser: fix timeout calculation for low rates
  serial: sh-sci: document R8A77970 bindings
  netfilter: ipset: ipset list may return wrong member count for set with timeout
  netfilter: nat: Do not use ARRAY_SIZE() on spinlocks to fix zero div
  driver core: platform: Don't read past the end of "driver_override" buffer
  Revert "xhci: Limit USB2 port wake support for AMD Promontory hosts"
  xhci: set missing SuperSpeedPlus Link Protocol bit in roothub descriptor
  xhci: Fix sleeping with spin_lock_irq() held in ASmedia 1042A workaround
  usb: host: xhci-plat: allow sysdev to inherit from ACPI
  xhci: fix wrong endpoint ESIT value shown in tracing
  usb: pci-quirks.c: Corrected timeout values used in handshake
  xhci: fix finding correct bus_state structure for USB 3.1 hosts
  usb: xhci: Free the right ring in xhci_add_endpoint()
  base: arch_topology: fix section mismatch build warnings
  driver core: suppress sending MODALIAS in UNBIND uevents
  nvmem: add missing of_node_put() in of_nvmem_cell_get()
  nvmem: core: return EFBIG on out-of-range write
  auxdisplay: charlcd: properly restore atomic counter on error path
  binder: fix memory corruption in binder_transaction binder
  binder: fix an ret value override
  android: binder: fix type mismatch warning
  ALSA: compress: Remove unused variable
  xen: don't compile pv-specific parts if XEN_PV isn't configured
  mtd: nand: remove unused blockmask variable
  PM / QoS: Use the correct variable to check the QoS request type
  ACPI / PMIC: Add code reviewers to MAINTAINERS
  driver core: Fix link to device power management documentation
  ARC: reset: remove the misleading v1 suffix all over
  usb: dwc3: ep0: fix DMA starvation by assigning req->trb on ep0
  staging: vchiq_2835_arm: Fix NULL ptr dereference in free_pagelist
  staging: speakup: fix speakup-r empty line lockup
  staging: pi433: Move limit check to switch default to kill warning
  staging: r8822be: fix null pointer dereferences with a null driver_adapter
  staging: mt29f_spinand: Enable the read ECC before program the page
  staging: unisys/visorbus: add __init/__exit annotations
  isofs: fix build regression
  quota: add missing lock into __dquot_transfer()
  arm64: ensure the kernel is compiled for LP64
  arm64: relax assembly code alignment from 16 byte to 4 byte
  arm64: efi: Don't include EFI fpsimd save/restore code in non-EFI kernels
  mtd: nand: lpc32xx_mlc: Fix an error handling path in lpc32xx_nand_probe()
  usb: Increase quirk delay for USB devices
  uwb: properly check kthread_run return value
  uwb: ensure that endpoint is interrupt
  ARC: reset: add missing DT binding documentation for HSDKv1 reset driver
  ARC: reset: Only build on archs that have IOMEM
  ARM: at91: Replace uses of virt_to_phys with __pa_symbol
  ARM: dts: at91: sama5d27_som1_ek: fix USB host vbus
  ARM: dts: at91: sama5d27_som1_ek: fix typos
  ARM: dts: at91: sama5d27_som1_ek: update pinmux/pinconf for LEDs and USB
  mtd: spi-nor: fix DMA unsafe buffer issue in spi_nor_read_sfdp()
  mtd: spi-nor: Check consistency of the memory size extracted from the SFDP
  clocksource/integrator: Fix section mismatch warning
  Update version of cifs module
  cifs: hide unused functions
  SMB3: Add support for multidialect negotiate (SMB2.1 and later)
  arm64/syscalls: Move address limit check in loop
  arm/syscalls: Optimize address limit check
  Revert "arm/syscalls: Check address limit on user-mode return"
  syscalls: Use CHECK_DATA_CORRUPTION for addr_limit_user_check
  x86/mm/32: Load a sane CR3 before cpu_init() on secondary CPUs
  x86/mm/32: Move setup_clear_cpu_cap(X86_FEATURE_PCID) earlier
  x86/mm/64: Stop using CR3.PCID == 0 in ASID-aware code
  x86/mm: Factor out CR3-building code
  CIFS/SMB3: Update documentation to reflect SMB3 and various changes
  dma-coherent: fix rmem_dma_device_init regression
  clk: rockchip: add sclk_timer5 as critical clock on rk3128
  clk: rockchip: fix up rk3128 pvtm and mipi_24m gate regs error
  clk: rockchip: add pclk_pmu as critical clock on rk3128
  Revert "arm64: dts: rockchip: Add basic cpu frequencies for RK3368"
  genirq: Fix cpumask check in __irq_startup_managed()
  scsi: aacraid: error: testing array offset 'bus' after use
  scsi: lpfc: Don't return internal MBXERR_ERROR code from probe function
  fs/proc: Report eip/esp in /prod/PID/stat for coredumping
  xen: x86: mark xen_find_pt_base as __init
  scsi: aacraid: Fix 2T+ drives on SmartIOC-2000
  scsi: sg: fixup infoleak when using SG_GET_REQUEST_TABLE
  scsi: sg: factor out sg_fill_request_table()
  scsi: sd: Remove unnecessary condition in sd_read_block_limits()
  drm/radeon: disable hard reset in hibernate for APUs
  objtool: Fix object file corruption
  objtool: Do not retrieve data from empty sections
  objtool: Fix memory leak in elf_create_rela_section()
  x86/cpu/AMD: Fix erratum 1076 (CPB bit)
  nl80211: check for the required netlink attributes presence
  scsi: acornscsi: fix build error
  scsi: scsi_transport_fc: fix NULL pointer dereference in fc_bsg_job_timeout
  drm/amdgpu: revert tile table update for oland
  watchdog/hardlockup: Clean up hotplug locking mess
  watchdog/hardlockup/perf: Simplify deferred event destroy
  watchdog/hardlockup/perf: Use new perf CPU enable mechanism
  watchdog/hardlockup/perf: Implement CPU enable replacement
  watchdog/hardlockup/perf: Implement init time detection of perf
  watchdog/hardlockup/perf: Implement init time perf validation
  watchdog/core: Get rid of the racy update loop
  watchdog/core, powerpc: Make watchdog_nmi_reconfigure() two stage
  watchdog/sysctl: Clean up sysctl variable name space
  watchdog/sysctl: Get rid of the #ifdeffery
  watchdog/core: Clean up header mess
  watchdog/core: Further simplify sysctl handling
  watchdog/core: Get rid of the thread teardown/setup dance
  watchdog/core: Create new thread handling infrastructure
  smpboot/threads, watchdog/core: Avoid runtime allocation
  watchdog/core: Split out cpumask write function
  watchdog/core: Clean up the #ifdef maze
  watchdog/core: Clean up stub functions
  watchdog/core: Remove the park_in_progress obfuscation
  watchdog/hardlockup/perf: Prevent CPU hotplug deadlock
  watchdog/hardlockup/perf: Remove broken self disable on failure
  watchdog/core: Mark hardlockup_detector_disable() __init
  watchdog/core: Rename watchdog_proc_mutex
  watchdog/core: Rework CPU hotplug locking
  watchdog/core: Remove broken suspend/resume interfaces
  parisc, watchdog/core: Use lockup_detector_stop()
  watchdog/core: Provide interface to stop from poweroff()
  perf/x86/intel, watchdog/core: Sanitize PMU HT bug workaround
  watchdog/hardlockup: Provide interface to stop/restart perf events
  HID: wacom: generic: Clear ABS_MISC when tool leaves proximity
  HID: wacom: generic: Send MSC_SERIAL and ABS_MISC when leaving prox
  HID: i2c-hid: allocate hid buffers for real worst case
  s390/dasd: fix race during dasd initialization
  s390/perf: fix bug when creating per-thread event
  etnaviv: fix gem object list corruption
  etnaviv: fix submit error path
  cifs: check rsp for NULL before dereferencing in SMB2_open
  qxl: fix primary surface handling
  drm/amdkfd: check for null dev to avoid a null pointer dereference
  mmc: cavium: Fix use-after-free in of_platform_device_destroy
  mmc: host: fix typo after MMC_DEBUG move
  mmc: block: Fix incorrectly initialized requests
  HID: rmi: Make sure the HID device is opened on resume
  iwlwifi: mvm: fix reorder buffer for 9000 devices
  iwlwifi: mvm: set status before calling iwl_mvm_send_cmd_status()
  iwlwifi: mvm: initialize status in iwl_mvm_add_int_sta_common()
  iwlwifi: mvm: handle FIF_ALLMULTI when setting multicast addresses
  iwlwifi: mvm: use IWL_HCMD_NOCOPY for MCAST_FILTER_CMD
  iwlwifi: mvm: wake the correct mac80211 queue
  iwlwifi: mvm: change state when queueing agg start work
  iwlwifi: mvm: send all non-bufferable frames on the probe queue
  iwlwifi: mvm: Flush non STA TX queues
  iwlwifi: mvm: fix wowlan resume failed to load INIT ucode
  ata: avoid gcc-7 warning in ata_timing_quantize
  HID: multitouch: Support ALPS PTP stick with pid 0x120A
  HID: multitouch: support buttons and trackpoint on Lenovo X1 Tab Gen2
  HID: wacom: Correct coordinate system of touchring and pen twist
  HID: wacom: Properly report negative values from Intuos Pro 2 Bluetooth
  HID: multitouch: Fix system-control buttons not working
  HID: add multi-input quirk for IDC6680 touchscreen
  HID: wacom: leds: Don't try to control the EKR's read-only LEDs
  HID: wacom: bits shifted too much for 9th and 10th buttons
  md/raid5: preserve STRIPE_ON_UNPLUG_LIST in break_stripe_batch_list
  ARM64: dts: meson-gxbb: nanopi-k2: enable sdr104 mode
  ARM64: dts: meson-gxbb: nanopi-k2: enable sdcard UHS modes
  ARM64: dts: meson-gxbb: p20x: enable sdcard UHS modes
  ARM64: dts: meson-gxl: libretech-cc: enable high speed modes
  ARM64: dts: meson-gxl: libretech-cc: add card regulator settle times
  ARM64: dts: meson-gxbb: nanopi-k2: add card regulator settle times
  ARM64: dts: meson: add mmc clk gate pins
  ARM64: dts: meson: remove cap-sd-highspeed from emmc nodes
  ARM64: dts: meson-gx: Use correct mmc clock source 0
  md/raid5: fix a race condition in stripe batch
  iio: magnetometer: st_magn: fix drdy line configuration for LIS3MDL
  iio: adc: ti-ads1015: fix comparator polarity setting
  drm/amdkfd: pass queue's mqd when destroying mqd
  drm/amdkfd: remove memset before memcpy
  powerpc/e6500: Update machine check for L1D cache err
  samples: Unrename SECCOMP_RET_KILL
  selftests/seccomp: Test thread vs process killing
  seccomp: Implement SECCOMP_RET_KILL_PROCESS action
  seccomp: Introduce SECCOMP_RET_KILL_PROCESS
  seccomp: Rename SECCOMP_RET_KILL to SECCOMP_RET_KILL_THREAD
  seccomp: Action to log before allowing
  seccomp: Filter flag to log all actions except SECCOMP_RET_ALLOW
  seccomp: Selftest for detection of filter flag support
  seccomp: Sysctl to configure actions that are allowed to be logged
  seccomp: Operation for checking if an action is available
  seccomp: Sysctl to display available actions
  seccomp: Provide matching filter for introspection
  selftests/seccomp: Refactor RET_ERRNO tests
  selftests/seccomp: Add simple seccomp overhead benchmark
  selftests/seccomp: Add tests for basic ptrace actions
  uapi linux/kfd_ioctl.h: only use __u32 and __u64
  tile: array underflow in setup_maxnodemem()
  tile: defconfig: Cleanup from old Kconfig options

  Conflicts:
	include/scsi/scsi_device.h

Change-Id: Ia72943c891d02c72b704c2408185eceab9df59ae
Signed-off-by: Runmin Wang <runminw@codeaurora.org>
2017-10-11 17:36:44 -07:00

1962 lines
50 KiB
C

/*
* fs/userfaultfd.c
*
* Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
* Copyright (C) 2008-2009 Red Hat, Inc.
* Copyright (C) 2015 Red Hat, Inc.
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
* Some part derived from fs/eventfd.c (anon inode setup) and
* mm/ksm.c (mm hashing).
*/
#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
/*
 * Slab cache for struct userfaultfd_ctx objects. userfaultfd_ctx_put()
 * hands a context back to this cache once its last reference is dropped.
 */
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
/*
 * State machine value stored in userfaultfd_ctx.state.
 */
enum userfaultfd_state {
/* NOTE(review): presumably the state before the UFFDIO_API handshake - confirm */
UFFD_STATE_WAIT_API,
/* NOTE(review): presumably entered once the API handshake succeeded - confirm */
UFFD_STATE_RUNNING,
};
/*
 * Per-userfaultfd context. It is reference counted through ->refcount
 * (see userfaultfd_ctx_get() and userfaultfd_ctx_put()).
 *
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 */
struct userfaultfd_ctx {
/* waitqueue head for the pending (i.e. not read) userfaults */
wait_queue_head_t fault_pending_wqh;
/* waitqueue head for the userfaults */
wait_queue_head_t fault_wqh;
/* waitqueue head for the pseudo fd to wakeup poll/read */
wait_queue_head_t fd_wqh;
/* waitqueue head for events */
wait_queue_head_t event_wqh;
/* a refile sequence protected by fault_pending_wqh lock */
struct seqcount refile_seq;
/* pseudo fd refcounting; the context is freed when this drops to zero */
atomic_t refcount;
/* userfaultfd syscall flags */
unsigned int flags;
/* features requested from the userspace */
unsigned int features;
/* state machine */
enum userfaultfd_state state;
/* released */
bool released;
/*
 * mm with one or more vmas attached to this userfaultfd_ctx; the final
 * userfaultfd_ctx_put() calls mmdrop() on it
 */
struct mm_struct *mm;
};
/*
 * List node pairing an original userfaultfd context with a new one.
 * NOTE(review): presumably used while a fork duplicates a context for the
 * child - confirm against the fork event code, which is not visible here.
 */
struct userfaultfd_fork_ctx {
/* the pre-existing context */
struct userfaultfd_ctx *orig;
/* the context paired with @orig */
struct userfaultfd_ctx *new;
/* linkage into a list of such pairs */
struct list_head list;
};
/*
 * List node recording a userfaultfd context together with an address range.
 * NOTE(review): presumably describes a range being unmapped so an event can
 * be delivered later - confirm against the unmap event code.
 */
struct userfaultfd_unmap_ctx {
/* context the range belongs to */
struct userfaultfd_ctx *ctx;
/* first address of the range */
unsigned long start;
/* end address of the range (NOTE(review): inclusive vs exclusive - confirm) */
unsigned long end;
/* linkage into a list of such records */
struct list_head list;
};
/*
 * One waiter queued on a userfaultfd waitqueue. userfaultfd_wake_function()
 * recovers this structure from the embedded wait queue entry with
 * container_of().
 */
struct userfaultfd_wait_queue {
/* message for this waiter; wakeups are matched on msg.arg.pagefault.address */
struct uffd_msg msg;
/* embedded wait queue entry; unlinked by the wake function after a successful wakeup */
wait_queue_entry_t wq;
/* NOTE(review): presumably the context this waiter is queued on - confirm */
struct userfaultfd_ctx *ctx;
/* set to true by userfaultfd_wake_function() right before the task is woken */
bool waken;
};
struct userfaultfd_wake_range {
unsigned long start;
unsigned long len;
};
/*
 * Wait queue callback: wake the waiter behind @wq when its fault address is
 * covered by the struct userfaultfd_wake_range passed through @key.
 *
 * Returns 0 when the waiter is outside the range (or could not be woken),
 * otherwise the non-zero result of wake_up_state(). @wake_flags is unused.
 */
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
	struct userfaultfd_wake_range *range = key;
	struct userfaultfd_wait_queue *uwq =
		container_of(wq, struct userfaultfd_wait_queue, wq);
	unsigned long start = range->start;
	unsigned long len = range->len;
	int woken;

	/* len == 0 means wake all, so only a non-empty range filters waiters */
	if (len) {
		unsigned long fault_addr = uwq->msg.arg.pagefault.address;

		if (fault_addr < start || fault_addr >= start + len)
			return 0;
	}

	WRITE_ONCE(uwq->waken, true);
	/*
	 * The Program-Order guarantees provided by the scheduler
	 * ensure uwq->waken is visible before the task is woken.
	 */
	woken = wake_up_state(wq->private, mode);
	if (!woken)
		return 0;

	/*
	 * Wake only once, autoremove behavior.
	 *
	 * Once the effect of list_del_init is visible to the other CPUs
	 * the waitqueue may disappear from under us (see the
	 * !list_empty_careful() in handle_userfault()), so nothing may
	 * touch it after this point.
	 *
	 * try_to_wake_up() has an implicit smp_mb(), and wq->private was
	 * read before calling the extern function "wake_up_state" (which
	 * in turn calls try_to_wake_up).
	 */
	list_del_init(&wq->entry);
	return woken;
}
/**
 * userfaultfd_ctx_get - Take an additional reference on a userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 *
 * The reference count must not already be zero; if it is, this BUG()s.
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
	int took_ref = atomic_inc_not_zero(&ctx->refcount);

	/* a zero refcount means the final put already ran for this context */
	if (!took_ref)
		BUG();
}
/**
 * userfaultfd_ctx_put - Drop a reference on a userfaultfd context and free
 * the context when it was the last one.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The userfaultfd context reference must have been previously acquired either
 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
	/* other references remain: nothing to tear down */
	if (!atomic_dec_and_test(&ctx->refcount))
		return;

	/* on the final put no waitqueue may be locked or still have waiters */
	VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
	VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));

	VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
	VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));

	VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
	VM_BUG_ON(waitqueue_active(&ctx->event_wqh));

	VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
	VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));

	/* release the mm reference held by the context, then the context itself */
	mmdrop(ctx->mm);
	kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}
/*
 * Zero a userfault message, padding included, before it is filled in.
 * @msg: message to clear.
 */
static inline void msg_init(struct uffd_msg *msg)
{
	/* the message layout is fixed at 32 bytes */
	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);

	/*
	 * A field-by-field initialisation would leave the padding
	 * untouched and leak kernel data to userland, so memset the
	 * whole object.
	 */
	memset(msg, 0, sizeof(*msg));
}
static inline struct uffd_msg userfault_msg(unsigned long address,
unsigned int flags,
unsigned long reason,
unsigned int features)
{
struct uffd_msg msg;
msg_init(&msg);
msg.event = UFFD_EVENT_PAGEFAULT;
msg.arg.pagefault.address = address;
if (flags & FAULT_FLAG_WRITE)
/*
* If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
* uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
* was not set in a UFFD_EVENT_PAGEFAULT, it means it
* was a read fault, otherwise if set it means it's
* a write fault.
*/
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
if (reason & VM_UFFD_WP)
/*
* If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
* uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
* not set in a UFFD_EVENT_PAGEFAULT, it means it was
* a missing fault, otherwise if set it means it's a
* write protect fault.
*/
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
if (features & UFFD_FEATURE_THREAD_ID)
msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
return msg;
}
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Same functionality as userfaultfd_must_wait below with modifications for
 * hugepmd ranges.
 *
 * @ctx:     userfaultfd context; its mm is the one whose page tables are
 *           inspected, and its mmap_sem must be held.
 * @vma:     vma covering @address, used to obtain the huge page size.
 * @address: faulting address.
 * @flags:   fault flags (unused here).
 * @reason:  VM_UFFD_* reason for the fault.
 *
 * Returns true if the fault still has to wait: there is no huge pte for
 * @address, the huge pte is none, or this is a VM_UFFD_WP fault and the
 * huge pte is not writable. Returns false otherwise.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      unsigned long flags,
					      unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pte_t *ptep, pte;
	bool ret = true;

	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
	if (!ptep)
		goto out;

	ret = false;
	/*
	 * Huge ptes must be read through huge_ptep_get(): on some
	 * architectures the entry is not stored in pte format, so a
	 * plain *ptep dereference would be misinterpreted. Reading it
	 * once also makes both checks below see the same snapshot.
	 */
	pte = huge_ptep_get(ptep);

	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	if (huge_pte_none(pte))
		ret = true;
	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
		ret = true;
out:
	return ret;
}
#else
/*
 * Stub for kernels built without CONFIG_HUGETLB_PAGE; it always reports
 * that no wait is needed.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      unsigned long flags,
					      unsigned long reason)
{
	return false;	/* should never get here */
}
#endif /* CONFIG_HUGETLB_PAGE */
/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
 * userfault that has already been resolved, if userfaultfd_read and
 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
 * threads.
 *
 * Returns true when the fault still has to wait (a page table level
 * for @address is not present, or the pte is none) and false when the
 * pmd is a transparent huge pmd or the pte is populated. The mmap_sem
 * of ctx->mm must be held. @flags and @reason are not consulted here.
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
					 unsigned long address,
					 unsigned long flags,
					 unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd, pmdval;
	pte_t *ptep;
	bool unpopulated;

	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

	/* walk down level by level; any missing level means "keep waiting" */
	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		return true;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return true;

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return true;

	pmd = pmd_offset(pud, address);
	/*
	 * READ_ONCE must function as a barrier with narrower scope
	 * and it must be equivalent to:
	 *	_pmd = *pmd; barrier();
	 *
	 * This is to deal with the instability (as in
	 * pmd_trans_unstable) of the pmd.
	 */
	pmdval = READ_ONCE(*pmd);
	if (!pmd_present(pmdval))
		return true;

	/* a present transparent huge pmd already maps the address */
	if (pmd_trans_huge(pmdval))
		return false;

	/*
	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
	 * and use the standard pte_offset_map() instead of parsing pmdval.
	 */
	ptep = pte_offset_map(pmd, address);
	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	unpopulated = pte_none(*ptep);
	pte_unmap(ptep);

	return unpopulated;
}
/*
* Block the faulting task on the userfaultfd attached to vmf->vma until the
* fault is resolved, the userfaultfd is released or a signal arrives.
*
* The locking rules involved in returning VM_FAULT_RETRY depending on
* FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
* FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
* recommendation in __lock_page_or_retry is not an understatement.
*
* If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
* before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
* not set.
*
* If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
* set, VM_FAULT_RETRY can still be returned if and only if there are
* fatal_signal_pending()s, and the mmap_sem must be released before
* returning it.
*
* Return values as set by this function:
* - VM_FAULT_SIGBUS: the task is exiting or dumping core, the vma has no
*   userfaultfd context, UFFD_FEATURE_SIGBUS is enabled on the context,
*   or the caller did not pass FAULT_FLAG_ALLOW_RETRY.
* - VM_FAULT_NOPAGE: the userfaultfd was already released, or a non-fatal
*   signal is pending for a FAULT_FLAG_USER|FAULT_FLAG_KILLABLE fault (the
*   mmap_sem is re-taken for reading in that case).
* - VM_FAULT_RETRY otherwise, with VM_FAULT_MAJOR or-ed in when the task
*   went through schedule().
*/
int handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
struct mm_struct *mm = vmf->vma->vm_mm;
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue uwq;
int ret;
bool must_wait, return_to_userland;
long blocking_state;
ret = VM_FAULT_SIGBUS;
/*
* We don't do userfault handling for the final child pid update.
*
* We also don't do userfault handling during
* coredumping. hugetlbfs has the special
* follow_hugetlb_page() to skip missing pages in the
* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
* the no_page_table() helper in follow_page_mask(), but the
* shmem_vm_ops->fault method is invoked even during
* coredumping without mmap_sem and it ends up here.
*/
if (current->flags & (PF_EXITING|PF_DUMPCORE))
goto out;
/*
* Coredumping runs without mmap_sem so we can only check that
* the mmap_sem is held, if PF_DUMPCORE was not set.
*/
WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
goto out;
BUG_ON(ctx->mm != mm);
/* exactly one of VM_UFFD_MISSING and VM_UFFD_WP must be set in reason */
VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
/* with UFFD_FEATURE_SIGBUS the fault is not queued: ret is still SIGBUS */
if (ctx->features & UFFD_FEATURE_SIGBUS)
goto out;
/*
* If it's already released don't get it. This avoids to loop
* in __get_user_pages if userfaultfd_release waits on the
* caller of handle_userfault to release the mmap_sem.
*/
if (unlikely(ACCESS_ONCE(ctx->released))) {
/*
* Don't return VM_FAULT_SIGBUS in this case, so a non
* cooperative manager can close the uffd after the
* last UFFDIO_COPY, without risking to trigger an
* involuntary SIGBUS if the process was starting the
* userfaultfd while the userfaultfd was still armed
* (but after the last UFFDIO_COPY). If the uffd
* wasn't already closed when the userfault reached
* this point, that would normally be solved by
* userfaultfd_must_wait returning 'false'.
*
* If we were to return VM_FAULT_SIGBUS here, the non
* cooperative manager would be instead forced to
* always call UFFDIO_UNREGISTER before it can safely
* close the uffd.
*/
ret = VM_FAULT_NOPAGE;
goto out;
}
/*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
* even if FAULT_FLAG_TRIED is set without leading to gup()
* -EBUSY failures, if the userfaultfd is to be extended for
* VM_UFFD_WP tracking and we intend to arm the userfault
* without first stopping userland access to the memory. For
* VM_UFFD_MISSING userfaults this is enough for now.
*/
if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
/*
* Validate the invariant that nowait must allow retry
* to be sure not to return SIGBUS erroneously on
* nowait invocations.
*/
BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) {
printk(KERN_WARNING
"FAULT_FLAG_ALLOW_RETRY missing %x\n",
vmf->flags);
dump_stack();
}
#endif
goto out;
}
/*
* Handle nowait, not much to do other than tell it to retry
* and wait.
*/
ret = VM_FAULT_RETRY;
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
/* take the reference before dropping the mmap_sem */
userfaultfd_ctx_get(ctx);
/* build the on-stack wait queue entry carrying the message for userland */
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
ctx->features);
uwq.ctx = ctx;
uwq.waken = false;
/*
* Only faults flagged with both FAULT_FLAG_USER and
* FAULT_FLAG_KILLABLE sleep interruptibly; every other fault
* sleeps in TASK_KILLABLE.
*/
return_to_userland =
(vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
TASK_KILLABLE;
spin_lock(&ctx->fault_pending_wqh.lock);
/*
* After the __add_wait_queue the uwq is visible to userland
* through poll/read().
*/
__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
/*
* The smp_mb() after __set_current_state prevents the reads
* following the spin_unlock to happen before the list_add in
* __add_wait_queue.
*/
set_current_state(blocking_state);
spin_unlock(&ctx->fault_pending_wqh.lock);
/* re-check the page tables now that the fault is queued */
if (!is_vm_hugetlb_page(vmf->vma))
must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
reason);
else
must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
vmf->address,
vmf->flags, reason);
up_read(&mm->mmap_sem);
if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
(return_to_userland ? !signal_pending(current) :
!fatal_signal_pending(current)))) {
wake_up_poll(&ctx->fd_wqh, POLLIN);
schedule();
ret |= VM_FAULT_MAJOR;
/*
* False wakeups can originate even from rwsem before
* up_read() however userfaults will wait either for a
* targeted wakeup on the specific uwq waitqueue from
* wake_userfault() or for signals or for uffd
* release.
*/
while (!READ_ONCE(uwq.waken)) {
/*
* This needs the full smp_store_mb()
* guarantee as the state write must be
* visible to other CPUs before reading
* uwq.waken from other CPUs.
*/
set_current_state(blocking_state);
if (READ_ONCE(uwq.waken) ||
READ_ONCE(ctx->released) ||
(return_to_userland ? signal_pending(current) :
fatal_signal_pending(current)))
break;
schedule();
}
}
__set_current_state(TASK_RUNNING);
if (return_to_userland) {
if (signal_pending(current) &&
!fatal_signal_pending(current)) {
/*
* If we got a SIGSTOP or SIGCONT and this is
* a normal userland page fault, just let
* userland return so the signal will be
* handled and gdb debugging works. The page
* fault code immediately after we return from
* this function is going to release the
* mmap_sem and it's not depending on it
* (unlike gup would if we were not to return
* VM_FAULT_RETRY).
*
* If a fatal signal is pending we still take
* the streamlined VM_FAULT_RETRY failure path
* and there's no need to retake the mmap_sem
* in such case.
*/
down_read(&mm->mmap_sem);
ret = VM_FAULT_NOPAGE;
}
}
/*
* Here we race with the list_del; list_add in
* userfaultfd_ctx_read(), however because we don't ever run
* list_del_init() to refile across the two lists, the prev
* and next pointers will never point to self. list_add also
* would never let any of the two pointers to point to
* self. So list_empty_careful won't risk to see both pointers
* pointing to self at any time during the list refile. The
* only case where list_del_init() is called is the full
* removal in the wake function and there we don't re-list_add
* and it's fine not to block on the spinlock. The uwq on this
* kernel stack can be released after the list_del_init.
*/
if (!list_empty_careful(&uwq.wq.entry)) {
spin_lock(&ctx->fault_pending_wqh.lock);
/*
* No need of list_del_init(), the uwq on the stack
* will be freed shortly anyway.
*/
list_del(&uwq.wq.entry);
spin_unlock(&ctx->fault_pending_wqh.lock);
}
/*
* ctx may go away after this if the userfault pseudo fd is
* already released.
*/
userfaultfd_ctx_put(ctx);
out:
return ret;
}
static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
struct userfaultfd_wait_queue *ewq)
{
if (WARN_ON_ONCE(current->flags & PF_EXITING))
goto out;
ewq->ctx = ctx;
init_waitqueue_entry(&ewq->wq, current);
spin_lock(&ctx->event_wqh.lock);
/*
* After the __add_wait_queue the uwq is visible to userland
* through poll/read().
*/
__add_wait_queue(&ctx->event_wqh, &ewq->wq);
for (;;) {
set_current_state(TASK_KILLABLE);
if (ewq->msg.event == 0)
break;
if (ACCESS_ONCE(ctx->released) ||
fatal_signal_pending(current)) {
/*
* &ewq->wq may be queued in fork_event, but
* __remove_wait_queue ignores the head
* parameter. It would be a problem if it
* didn't.
*/
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
if (ewq->msg.event == UFFD_EVENT_FORK) {
struct userfaultfd_ctx *new;
new = (struct userfaultfd_ctx *)
(unsigned long)
ewq->msg.arg.reserved.reserved1;
userfaultfd_ctx_put(new);
}
break;
}
spin_unlock(&ctx->event_wqh.lock);
wake_up_poll(&ctx->fd_wqh, POLLIN);
schedule();
spin_lock(&ctx->event_wqh.lock);
}
__set_current_state(TASK_RUNNING);
spin_unlock(&ctx->event_wqh.lock);
/*
* ctx may go away after this if the userfault pseudo fd is
* already released.
*/
out:
userfaultfd_ctx_put(ctx);
}
/*
* Mark the event carried by @ewq as consumed and wake its waiter.
*
* Resetting msg.event to 0 is what the loop in
* userfaultfd_event_wait_completion() tests for. The waiter is woken
* before its entry is unlinked from event_wqh. The callers in this file
* invoke this with ctx->event_wqh.lock held, which the _locked/__ helper
* variants used here rely on.
*/
static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
struct userfaultfd_wait_queue *ewq)
{
ewq->msg.event = 0;
wake_up_locked(&ctx->event_wqh);
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
}
/*
 * Fork-time hook for a vma that may have a userfaultfd context attached.
 *
 * If the vma has no context, or the context did not enable
 * UFFD_FEATURE_EVENT_FORK, the context pointer and the VM_UFFD_* flags are
 * cleared on @vma and 0 is returned.
 *
 * Otherwise @vma is pointed at a new context for its mm. The new context is
 * allocated the first time a given original context is met and recorded,
 * together with the original, in a userfaultfd_fork_ctx appended to @fcs;
 * later vmas with the same original context reuse it.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
{
	struct userfaultfd_ctx *ctx = NULL, *octx;
	struct userfaultfd_fork_ctx *fctx;

	octx = vma->vm_userfaultfd_ctx.ctx;
	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
		return 0;
	}

	/* Reuse the context already created for this original, if any. */
	list_for_each_entry(fctx, fcs, list)
		if (fctx->orig == octx) {
			ctx = fctx->new;
			break;
		}

	if (!ctx) {
		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
		if (!fctx)
			return -ENOMEM;

		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
		if (!ctx) {
			kfree(fctx);
			return -ENOMEM;
		}

		atomic_set(&ctx->refcount, 1);
		ctx->flags = octx->flags;
		ctx->state = UFFD_STATE_RUNNING;
		ctx->features = octx->features;
		ctx->released = false;
		ctx->mm = vma->vm_mm;
		/*
		 * Pin the mm_struct through the dedicated helper rather
		 * than open-coding the mm_count increment.
		 */
		mmgrab(ctx->mm);

		/* the fork_ctx entry keeps its own reference on the original */
		userfaultfd_ctx_get(octx);
		fctx->orig = octx;
		fctx->new = ctx;
		list_add_tail(&fctx->list, fcs);
	}

	vma->vm_userfaultfd_ctx.ctx = ctx;
	return 0;
}
static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
{
struct userfaultfd_ctx *ctx = fctx->orig;
struct userfaultfd_wait_queue ewq;
msg_init(&ewq.msg);
ewq.msg.event = UFFD_EVENT_FORK;
ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
userfaultfd_event_wait_completion(ctx, &ewq);
}
/*
 * Walk the list built by dup_userfaultfd(): emit the fork event for every
 * entry, then unlink and free the entry.
 */
void dup_userfaultfd_complete(struct list_head *fcs)
{
	struct userfaultfd_fork_ctx *entry, *next;

	list_for_each_entry_safe(entry, next, fcs, list) {
		dup_fctx(entry);

		list_del(&entry->list);
		kfree(entry);
	}
}
/*
 * If @vma has a userfaultfd context with UFFD_FEATURE_EVENT_REMAP enabled,
 * store it in @vm_ctx and take a reference on it. @vm_ctx is left untouched
 * otherwise.
 */
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
			     struct vm_userfaultfd_ctx *vm_ctx)
{
	struct userfaultfd_ctx *uffd = vma->vm_userfaultfd_ctx.ctx;

	if (!uffd)
		return;
	if (!(uffd->features & UFFD_FEATURE_EVENT_REMAP))
		return;

	vm_ctx->ctx = uffd;
	userfaultfd_ctx_get(uffd);
}
void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
unsigned long from, unsigned long to,
unsigned long len)
{
struct userfaultfd_ctx *ctx = vm_ctx->ctx;
struct userfaultfd_wait_queue ewq;
if (!ctx)
return;
if (to & ~PAGE_MASK) {
userfaultfd_ctx_put(ctx);
return;
}
msg_init(&ewq.msg);
ewq.msg.event = UFFD_EVENT_REMAP;
ewq.msg.arg.remap.from = from;
ewq.msg.arg.remap.to = to;
ewq.msg.arg.remap.len = len;
userfaultfd_event_wait_completion(ctx, &ewq);
}
bool userfaultfd_remove(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue ewq;
ctx = vma->vm_userfaultfd_ctx.ctx;
if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
return true;
userfaultfd_ctx_get(ctx);
up_read(&mm->mmap_sem);
msg_init(&ewq.msg);
ewq.msg.event = UFFD_EVENT_REMOVE;
ewq.msg.arg.remove.start = start;
ewq.msg.arg.remove.end = end;
userfaultfd_event_wait_completion(ctx, &ewq);
return false;
}
static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
unsigned long start, unsigned long end)
{
struct userfaultfd_unmap_ctx *unmap_ctx;
list_for_each_entry(unmap_ctx, unmaps, list)
if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
unmap_ctx->end == end)
return true;
return false;
}
/*
 * Walk the vmas starting at @vma that begin before @end and, for every
 * userfaultfd context with UFFD_FEATURE_EVENT_UNMAP enabled that is not
 * already recorded for [start, end), append a new entry to @unmaps and take
 * a reference on the context.
 *
 * Returns 0 on success or -ENOMEM if an entry cannot be allocated; entries
 * queued before the failure are left on @unmaps.
 */
int userfaultfd_unmap_prep(struct vm_area_struct *vma,
			   unsigned long start, unsigned long end,
			   struct list_head *unmaps)
{
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
		struct userfaultfd_ctx *uffd = vma->vm_userfaultfd_ctx.ctx;
		struct userfaultfd_unmap_ctx *entry;

		if (!uffd)
			continue;
		if (!(uffd->features & UFFD_FEATURE_EVENT_UNMAP))
			continue;
		if (has_unmap_ctx(uffd, unmaps, start, end))
			continue;

		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
		if (!entry)
			return -ENOMEM;

		userfaultfd_ctx_get(uffd);
		entry->ctx = uffd;
		entry->start = start;
		entry->end = end;
		list_add_tail(&entry->list, unmaps);
	}

	return 0;
}
void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
{
struct userfaultfd_unmap_ctx *ctx, *n;
struct userfaultfd_wait_queue ewq;
list_for_each_entry_safe(ctx, n, uf, list) {
msg_init(&ewq.msg);
ewq.msg.event = UFFD_EVENT_UNMAP;
ewq.msg.arg.remove.start = ctx->start;
ewq.msg.arg.remove.end = ctx->end;
userfaultfd_event_wait_completion(ctx->ctx, &ewq);
list_del(&ctx->list);
kfree(ctx);
}
}
/*
* Tear down the userfaultfd context stored in file->private_data
* (presumably wired as the ->release method of userfaultfd_fops; the table
* is not visible here).
*
* Sets ctx->released, then, if the mm still has users, detaches the context
* from every vma of ctx->mm that points to it while holding the mmap_sem
* for writing. Finally wakes every waiter on the fault, event and fd
* waitqueues and drops one reference on the context. Always returns 0.
*/
static int userfaultfd_release(struct inode *inode, struct file *file)
{
struct userfaultfd_ctx *ctx = file->private_data;
struct mm_struct *mm = ctx->mm;
struct vm_area_struct *vma, *prev;
/* len == 0 means wake all */
struct userfaultfd_wake_range range = { .len = 0, };
unsigned long new_flags;
ACCESS_ONCE(ctx->released) = true;
/* no mm users left: skip the vma walk and only flush the waiters */
if (!mmget_not_zero(mm))
goto wakeup;
/*
* Flush page faults out of all CPUs. NOTE: all page faults
* must be retried without returning VM_FAULT_SIGBUS if
* userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
* changes while handle_userfault released the mmap_sem. So
* it's critical that released is set to true (above), before
* taking the mmap_sem for writing.
*/
down_write(&mm->mmap_sem);
prev = NULL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
/* a vma has a context if and only if it has a VM_UFFD_* flag */
BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
!!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
if (vma->vm_userfaultfd_ctx.ctx != ctx) {
prev = vma;
continue;
}
new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
/* try to merge with the neighbours now that the uffd state is gone */
prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
NULL_VM_UFFD_CTX,
vma_get_anon_name(vma));
if (prev)
vma = prev;
else
prev = vma;
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
up_write(&mm->mmap_sem);
mmput(mm);
wakeup:
/*
* After no new page faults can wait on this fault_*wqh, flush
* the last page faults that may have been already waiting on
* the fault_*wqh.
*/
spin_lock(&ctx->fault_pending_wqh.lock);
__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
spin_unlock(&ctx->fault_pending_wqh.lock);
/* Flush pending events that may still wait on event_wqh */
wake_up_all(&ctx->event_wqh);
wake_up_poll(&ctx->fd_wqh, POLLHUP);
userfaultfd_ctx_put(ctx);
return 0;
}
/*
 * Return the oldest entry queued on @wqh, or NULL if the queue is empty.
 *
 * The lock of @wqh must be held by the caller. This is asserted through
 * lockdep rather than spin_is_locked(): the latter always reports "not
 * locked" on uniprocessor builds without spinlock debugging, which would
 * trip the old VM_BUG_ON() spuriously, and it cannot tell whether the
 * current context is the one holding the lock.
 */
static inline struct userfaultfd_wait_queue *find_userfault_in(
		wait_queue_head_t *wqh)
{
	wait_queue_entry_t *wq;

	lockdep_assert_held(&wqh->lock);

	if (!waitqueue_active(wqh))
		return NULL;

	/* walk in reverse to provide FIFO behavior to read userfaults */
	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
	return container_of(wq, struct userfaultfd_wait_queue, wq);
}
/*
* Look up the next entry to read on ctx->fault_pending_wqh; the result is
* whatever find_userfault_in() returns for that queue.
*/
static inline struct userfaultfd_wait_queue *find_userfault(
struct userfaultfd_ctx *ctx)
{
return find_userfault_in(&ctx->fault_pending_wqh);
}
/*
* Look up the next entry to read on ctx->event_wqh; the result is whatever
* find_userfault_in() returns for that queue.
*/
static inline struct userfaultfd_wait_queue *find_userfault_evt(
struct userfaultfd_ctx *ctx)
{
return find_userfault_in(&ctx->event_wqh);
}
/*
 * poll() support for the userfaultfd file.
 *
 * Registers on ctx->fd_wqh, then reports POLLERR unless the context is in
 * UFFD_STATE_RUNNING and the file was opened with O_NONBLOCK. In that case
 * POLLIN is reported when a page fault or an event is queued, 0 otherwise.
 * An unknown state triggers a one-time warning and POLLERR.
 */
static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
{
	struct userfaultfd_ctx *ctx = file->private_data;

	poll_wait(file, &ctx->fd_wqh, wait);

	switch (ctx->state) {
	case UFFD_STATE_RUNNING:
		/*
		 * poll() never guarantees that read won't block.
		 * userfaults can be waken before they're read().
		 */
		if (unlikely(!(file->f_flags & O_NONBLOCK)))
			return POLLERR;
		/*
		 * lockless access to see if there are pending faults
		 * __pollwait last action is the add_wait_queue but
		 * the spin_unlock would allow the waitqueue_active to
		 * pass above the actual list_add inside
		 * add_wait_queue critical section. So use a full
		 * memory barrier to serialize the list_add write of
		 * add_wait_queue() with the waitqueue_active read
		 * below.
		 */
		smp_mb();
		if (waitqueue_active(&ctx->fault_pending_wqh) ||
		    waitqueue_active(&ctx->event_wqh))
			return POLLIN;
		return 0;
	case UFFD_STATE_WAIT_API:
		return POLLERR;
	default:
		WARN_ON_ONCE(1);
		return POLLERR;
	}
}
static const struct file_operations userfaultfd_fops;
/*
 * Create a new userfaultfd file for the context @new and report its file
 * descriptor to userland through msg->arg.fork.ufd.
 *
 * The descriptor and the file inherit the UFFD_SHARED_FCNTL_FLAGS subset of
 * new->flags. The reserved1 field, which carried the context pointer, is
 * zeroed before the descriptor is stored. @ctx is not used here.
 *
 * Returns 0 on success, or the negative error from get_unused_fd_flags()
 * or anon_inode_getfile(); @msg is left untouched on failure.
 */
static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
				  struct userfaultfd_ctx *new,
				  struct uffd_msg *msg)
{
	unsigned int shared_flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
	struct file *uffd_file;
	int new_fd;

	new_fd = get_unused_fd_flags(shared_flags);
	if (new_fd < 0)
		return new_fd;

	uffd_file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
				       O_RDWR | shared_flags);
	if (IS_ERR(uffd_file)) {
		/* give the reserved descriptor back */
		put_unused_fd(new_fd);
		return PTR_ERR(uffd_file);
	}

	fd_install(new_fd, uffd_file);

	msg->arg.reserved.reserved1 = 0;
	msg->arg.fork.ufd = new_fd;

	return 0;
}
/*
* Fetch one message for read(): a pending page fault if there is one,
* otherwise a queued event.
*
* A page fault that has been read is refiled from fault_pending_wqh to
* fault_wqh. Events other than UFFD_EVENT_FORK are completed right away;
* a fork event is parked on a local list while the new file descriptor is
* created with the locks dropped.
*
* Returns 0 with *msg filled in, -ERESTARTSYS if a signal is pending,
* -EAGAIN if nothing is queued and @no_wait is set, or the error returned
* by resolve_userfault_fork().
*/
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
struct uffd_msg *msg)
{
ssize_t ret;
DECLARE_WAITQUEUE(wait, current);
struct userfaultfd_wait_queue *uwq;
/*
* Handling fork event requires sleeping operations, so
* we drop the event_wqh lock, then do these ops, then
* lock it back and wake up the waiter. While the lock is
* dropped the ewq may go away so we keep track of it
* carefully.
*/
LIST_HEAD(fork_event);
struct userfaultfd_ctx *fork_nctx = NULL;
/* always take the fd_wqh lock before the fault_pending_wqh lock */
spin_lock(&ctx->fd_wqh.lock);
__add_wait_queue(&ctx->fd_wqh, &wait);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
spin_lock(&ctx->fault_pending_wqh.lock);
uwq = find_userfault(ctx);
if (uwq) {
/*
* Use a seqcount to repeat the lockless check
* in wake_userfault() to avoid missing
* wakeups because during the refile both
* waitqueue could become empty if this is the
* only userfault.
*/
write_seqcount_begin(&ctx->refile_seq);
/*
* The fault_pending_wqh.lock prevents the uwq
* to disappear from under us.
*
* Refile this userfault from
* fault_pending_wqh to fault_wqh, it's not
* pending anymore after we read it.
*
* Use list_del() by hand (as
* userfaultfd_wake_function also uses
* list_del_init() by hand) to be sure nobody
* changes __remove_wait_queue() to use
* list_del_init() in turn breaking the
* !list_empty_careful() check in
* handle_userfault(). The uwq->wq.head list
* must never be empty at any time during the
* refile, or the waitqueue could disappear
* from under us. The "wait_queue_head_t"
* parameter of __remove_wait_queue() is unused
* anyway.
*/
list_del(&uwq->wq.entry);
__add_wait_queue(&ctx->fault_wqh, &uwq->wq);
write_seqcount_end(&ctx->refile_seq);
/* careful to always initialize msg if ret == 0 */
*msg = uwq->msg;
spin_unlock(&ctx->fault_pending_wqh.lock);
ret = 0;
break;
}
spin_unlock(&ctx->fault_pending_wqh.lock);
/* no pending page fault: look for a queued event instead */
spin_lock(&ctx->event_wqh.lock);
uwq = find_userfault_evt(ctx);
if (uwq) {
*msg = uwq->msg;
if (uwq->msg.event == UFFD_EVENT_FORK) {
fork_nctx = (struct userfaultfd_ctx *)
(unsigned long)
uwq->msg.arg.reserved.reserved1;
list_move(&uwq->wq.entry, &fork_event);
/*
* fork_nctx can be freed as soon as
* we drop the lock, unless we take a
* reference on it.
*/
userfaultfd_ctx_get(fork_nctx);
spin_unlock(&ctx->event_wqh.lock);
ret = 0;
break;
}
userfaultfd_event_complete(ctx, uwq);
spin_unlock(&ctx->event_wqh.lock);
ret = 0;
break;
}
spin_unlock(&ctx->event_wqh.lock);
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
}
if (no_wait) {
ret = -EAGAIN;
break;
}
/* nothing queued: sleep until woken through fd_wqh, then rescan */
spin_unlock(&ctx->fd_wqh.lock);
schedule();
spin_lock(&ctx->fd_wqh.lock);
}
__remove_wait_queue(&ctx->fd_wqh, &wait);
__set_current_state(TASK_RUNNING);
spin_unlock(&ctx->fd_wqh.lock);
if (!ret && msg->event == UFFD_EVENT_FORK) {
/* may sleep: runs with every spinlock dropped */
ret = resolve_userfault_fork(ctx, fork_nctx, msg);
spin_lock(&ctx->event_wqh.lock);
if (!list_empty(&fork_event)) {
/*
* The fork thread didn't abort, so we can
* drop the temporary refcount.
*/
userfaultfd_ctx_put(fork_nctx);
uwq = list_first_entry(&fork_event,
typeof(*uwq),
wq.entry);
/*
* If fork_event list wasn't empty and in turn
* the event wasn't already released by fork
* (the event is allocated on fork kernel
* stack), put the event back to its place in
* the event_wq. fork_event head will be freed
* as soon as we return so the event cannot
* stay queued there no matter the current
* "ret" value.
*/
list_del(&uwq->wq.entry);
__add_wait_queue(&ctx->event_wqh, &uwq->wq);
/*
* Leave the event in the waitqueue and report
* error to userland if we failed to resolve
* the userfault fork.
*/
if (likely(!ret))
userfaultfd_event_complete(ctx, uwq);
} else {
/*
* Here the fork thread aborted and the
* refcount from the fork thread on fork_nctx
* has already been released. We still hold
* the reference we took before releasing the
* lock above. If resolve_userfault_fork
* failed we've to drop it because the
* fork_nctx has to be freed in such case. If
* it succeeded we'll hold it because the new
* uffd references it.
*/
if (ret)
userfaultfd_ctx_put(fork_nctx);
}
spin_unlock(&ctx->event_wqh.lock);
}
return ret;
}
/*
 * read() support: copy as many struct uffd_msg as fit in @count bytes to
 * @buf.
 *
 * Only the wait for the very first message honours the blocking mode of the
 * file; later messages are fetched without waiting. Returns the number of
 * bytes copied if at least one message was delivered. Otherwise returns
 * -EINVAL (context still in UFFD_STATE_WAIT_API, or @count smaller than one
 * message), the error from userfaultfd_ctx_read(), or -EFAULT if the copy
 * to userland fails. @ppos is not used.
 */
static ssize_t userfaultfd_read(struct file *file, char __user *buf,
				size_t count, loff_t *ppos)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	int no_wait = file->f_flags & O_NONBLOCK;
	struct uffd_msg msg;
	ssize_t copied = 0;

	if (ctx->state == UFFD_STATE_WAIT_API)
		return -EINVAL;

	while (count >= sizeof(msg)) {
		ssize_t err = userfaultfd_ctx_read(ctx, no_wait, &msg);

		if (err < 0)
			return copied ? copied : err;
		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
			return copied ? copied : -EFAULT;

		copied += sizeof(msg);
		buf += sizeof(msg);
		count -= sizeof(msg);
		/*
		 * Allow to read more than one fault at time but only
		 * block if waiting for the very first one.
		 */
		no_wait = O_NONBLOCK;
	}

	/* no room left for another full message */
	return copied ? copied : -EINVAL;
}
static void __wake_userfault(struct userfaultfd_ctx *ctx,
struct userfaultfd_wake_range *range)
{
spin_lock(&ctx->fault_pending_wqh.lock);
/* wake all in the range and autoremove */
if (waitqueue_active(&ctx->fault_pending_wqh))
__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
range);
if (waitqueue_active(&ctx->fault_wqh))
__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range);
spin_unlock(&ctx->fault_pending_wqh.lock);
}
/*
 * Wake the userfaults in @range, taking the waitqueue spinlock only if a
 * lockless check finds at least one waiter queued.
 */
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
					   struct userfaultfd_wake_range *range)
{
	bool has_waiters;
	unsigned seq;

	/*
	 * To be sure waitqueue_active() is not reordered by the CPU
	 * before the pagetable update, use an explicit SMP memory
	 * barrier here. PT lock release or up_read(mmap_sem) still
	 * have release semantics that can allow the
	 * waitqueue_active() to be reordered before the pte update.
	 */
	smp_mb();

	/*
	 * Use waitqueue_active because it's very frequent to
	 * change the address space atomically even if there are no
	 * userfaults yet. So we take the spinlock only when we're
	 * sure we've userfaults to wake.
	 *
	 * The check is repeated whenever a refile between the two
	 * queues raced with it (refile_seq changed).
	 */
	for (;;) {
		seq = read_seqcount_begin(&ctx->refile_seq);
		has_waiters = waitqueue_active(&ctx->fault_pending_wqh) ||
			      waitqueue_active(&ctx->fault_wqh);
		cond_resched();
		if (!read_seqcount_retry(&ctx->refile_seq, seq))
			break;
	}

	if (!has_waiters)
		return;

	__wake_userfault(ctx, range);
}
/*
 * Sanity-check a userland supplied range.
 *
 * The range is accepted when @start and @len are both page aligned, @len is
 * not zero, @start is at or above mmap_min_addr and the whole range fits
 * below mm->task_size. Returns 0 if valid, -EINVAL otherwise.
 */
static __always_inline int validate_range(struct mm_struct *mm,
					  __u64 start, __u64 len)
{
	const __u64 limit = mm->task_size;

	/* both the start and the length must be page aligned */
	if ((start | len) & ~PAGE_MASK)
		return -EINVAL;
	if (len == 0)
		return -EINVAL;
	if (start < mmap_min_addr || start >= limit)
		return -EINVAL;
	/* written this way so that start + len cannot overflow */
	if (len > limit - start)
		return -EINVAL;

	return 0;
}
static inline bool vma_can_userfault(struct vm_area_struct *vma)
{
return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
vma_is_shmem(vma);
}
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
struct mm_struct *mm = ctx->mm;
struct vm_area_struct *vma, *prev, *cur;
int ret;
struct uffdio_register uffdio_register;
struct uffdio_register __user *user_uffdio_register;
unsigned long vm_flags, new_flags;
bool found;
bool basic_ioctls;
unsigned long start, end, vma_end;
user_uffdio_register = (struct uffdio_register __user *) arg;
ret = -EFAULT;
if (copy_from_user(&uffdio_register, user_uffdio_register,
sizeof(uffdio_register)-sizeof(__u64)))
goto out;
ret = -EINVAL;
if (!uffdio_register.mode)
goto out;
if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
UFFDIO_REGISTER_MODE_WP))
goto out;
vm_flags = 0;
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
vm_flags |= VM_UFFD_MISSING;
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
vm_flags |= VM_UFFD_WP;
/*
* FIXME: remove the below error constraint by
* implementing the wprotect tracking mode.
*/
ret = -EINVAL;
goto out;
}
ret = validate_range(mm, uffdio_register.range.start,
uffdio_register.range.len);
if (ret)
goto out;
start = uffdio_register.range.start;
end = start + uffdio_register.range.len;
ret = -ENOMEM;
if (!mmget_not_zero(mm))
goto out;
down_write(&mm->mmap_sem);
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
/* check that there's at least one vma in the range */
ret = -EINVAL;
if (vma->vm_start >= end)
goto out_unlock;
/*
* If the first vma contains huge pages, make sure start address
* is aligned to huge page size.
*/
if (is_vm_hugetlb_page(vma)) {
unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
if (start & (vma_hpagesize - 1))
goto out_unlock;
}
/*
* Search for not compatible vmas.
*/
found = false;
basic_ioctls = false;
for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
cond_resched();
BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
!!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
/* check not compatible vmas */
ret = -EINVAL;
if (!vma_can_userfault(cur))
goto out_unlock;
/*
* If this vma contains ending address, and huge pages
* check alignment.
*/
if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
end > cur->vm_start) {
unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
ret = -EINVAL;
if (end & (vma_hpagesize - 1))
goto out_unlock;
}
/*
* Check that this vma isn't already owned by a
* different userfaultfd. We can't allow more than one
* userfaultfd to own a single vma simultaneously or we
* wouldn't know which one to deliver the userfaults to.
*/
ret = -EBUSY;
if (cur->vm_userfaultfd_ctx.ctx &&
cur->vm_userfaultfd_ctx.ctx != ctx)
goto out_unlock;
/*
* Note vmas containing huge pages
*/
if (is_vm_hugetlb_page(cur))
basic_ioctls = true;
found = true;
}
BUG_ON(!found);
if (vma->vm_start < start)
prev = vma;
ret = 0;
do {
cond_resched();
BUG_ON(!vma_can_userfault(vma));
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
/*
* Nothing to do: this vma is already registered into this
* userfaultfd and with the right tracking mode too.
*/
if (vma->vm_userfaultfd_ctx.ctx == ctx &&
(vma->vm_flags & vm_flags) == vm_flags)
goto skip;
if (vma->vm_start > start)
start = vma->vm_start;
vma_end = min(end, vma->vm_end);
new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
((struct vm_userfaultfd_ctx){ ctx }),
vma_get_anon_name(vma));
if (prev) {
vma = prev;
goto next;
}
if (vma->vm_start < start) {
ret = split_vma(mm, vma, start, 1);
if (ret)
break;
}
if (vma->vm_end > end) {
ret = split_vma(mm, vma, end, 0);
if (ret)
break;
}
next:
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx.ctx = ctx;
skip:
prev = vma;
start = vma->vm_end;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
out_unlock:
up_write(&mm->mmap_sem);
mmput(mm);
if (!ret) {
/*
* Now that we scanned all vmas we can already tell
* userland which ioctls methods are guaranteed to
* succeed on this range.
*/
if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
UFFD_API_RANGE_IOCTLS,
&user_uffdio_register->ioctls))
ret = -EFAULT;
}
out:
return ret;
}
/*
* UFFDIO_UNREGISTER: detach the userfaultfd context from every vma in the
* range described by the struct uffdio_range at user address @arg.
*
* A first pass checks that all the vmas in the range are of a type that
* can be registered; a second pass wakes the pending userfaults of each
* registered vma, merges or splits the vmas as needed and clears the
* VM_UFFD_* flags and the context pointer.
*
* Returns 0 on success, -EFAULT if the argument cannot be read, -EINVAL for
* a bad range, an incompatible vma or a misaligned hugetlb start, -ENOMEM
* if the mm has no users left or no vma is found, or the error returned by
* split_vma().
*
* NOTE(review): the second pass does not compare
* vma->vm_userfaultfd_ctx.ctx against @ctx, so vmas registered into a
* different userfaultfd are unregistered too — confirm this is intended.
*/
static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
struct mm_struct *mm = ctx->mm;
struct vm_area_struct *vma, *prev, *cur;
int ret;
struct uffdio_range uffdio_unregister;
unsigned long new_flags;
bool found;
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
ret = -EFAULT;
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
goto out;
ret = validate_range(mm, uffdio_unregister.start,
uffdio_unregister.len);
if (ret)
goto out;
start = uffdio_unregister.start;
end = start + uffdio_unregister.len;
ret = -ENOMEM;
if (!mmget_not_zero(mm))
goto out;
down_write(&mm->mmap_sem);
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
/* check that there's at least one vma in the range */
ret = -EINVAL;
if (vma->vm_start >= end)
goto out_unlock;
/*
* If the first vma contains huge pages, make sure start address
* is aligned to huge page size.
*/
if (is_vm_hugetlb_page(vma)) {
unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
if (start & (vma_hpagesize - 1))
goto out_unlock;
}
/*
* Search for not compatible vmas.
*/
found = false;
ret = -EINVAL;
for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
cond_resched();
BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
!!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
/*
* Check not compatible vmas, not strictly required
* here as not compatible vmas cannot have an
* userfaultfd_ctx registered on them, but this
* provides for more strict behavior to notice
* unregistration errors.
*/
if (!vma_can_userfault(cur))
goto out_unlock;
found = true;
}
BUG_ON(!found);
if (vma->vm_start < start)
prev = vma;
ret = 0;
do {
cond_resched();
BUG_ON(!vma_can_userfault(vma));
/*
* Nothing to do: this vma is not registered into any
* userfaultfd.
*/
if (!vma->vm_userfaultfd_ctx.ctx)
goto skip;
if (vma->vm_start > start)
start = vma->vm_start;
vma_end = min(end, vma->vm_end);
if (userfaultfd_missing(vma)) {
/*
* Wake any concurrent pending userfault while
* we unregister, so they will not hang
* permanently and it avoids userland to call
* UFFDIO_WAKE explicitly.
*/
struct userfaultfd_wake_range range;
range.start = start;
range.len = vma_end - start;
wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
}
new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
NULL_VM_UFFD_CTX,
vma_get_anon_name(vma));
if (prev) {
vma = prev;
goto next;
}
/* no merge possible: carve [start, end) out of the vma */
if (vma->vm_start < start) {
ret = split_vma(mm, vma, start, 1);
if (ret)
break;
}
if (vma->vm_end > end) {
ret = split_vma(mm, vma, end, 0);
if (ret)
break;
}
next:
/*
* In the vma_merge() successful mprotect-like case 8:
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
skip:
prev = vma;
start = vma->vm_end;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
out_unlock:
up_write(&mm->mmap_sem);
mmput(mm);
out:
return ret;
}
/*
 * UFFDIO_WAKE: wake the userfaults blocked in the range described by the
 * struct uffdio_range at user address @arg.
 *
 * userfaultfd_wake may be used in combination with the
 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
 *
 * Returns 0 on success, -EFAULT if the argument cannot be read, or the
 * error from validate_range().
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	const void __user *buf = (void __user *)arg;
	struct userfaultfd_wake_range range;
	struct uffdio_range uffdio_wake;
	int err;

	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
		return -EFAULT;

	err = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
	if (err)
		return err;

	range.start = uffdio_wake.start;
	range.len = uffdio_wake.len;

	/*
	 * len == 0 means wake all and we don't want to wake all here,
	 * so check it again to be sure.
	 */
	VM_BUG_ON(!range.len);

	wake_userfault(ctx, &range);

	return 0;
}
static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
__s64 ret;
struct uffdio_copy uffdio_copy;
struct uffdio_copy __user *user_uffdio_copy;
struct userfaultfd_wake_range range;
user_uffdio_copy = (struct uffdio_copy __user *) arg;
ret = -EFAULT;
if (copy_from_user(&uffdio_copy, user_uffdio_copy,
/* don't copy "copy" last field */
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
if (ret)
goto out;
/*
* double check for wraparound just in case. copy_from_user()
* will later check uffdio_copy.src + uffdio_copy.len to fit
* in the userland range.
*/
ret = -EINVAL;
if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
goto out;
if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
uffdio_copy.len);
mmput(ctx->mm);
} else {
return -ESRCH;
}
if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
return -EFAULT;
if (ret < 0)
goto out;
BUG_ON(!ret);
/* len == 0 would wake all */
range.len = ret;
if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
range.start = uffdio_copy.dst;
wake_userfault(ctx, &range);
}
ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
out:
return ret;
}
static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
__s64 ret;
struct uffdio_zeropage uffdio_zeropage;
struct uffdio_zeropage __user *user_uffdio_zeropage;
struct userfaultfd_wake_range range;
user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
ret = -EFAULT;
if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
/* don't copy "zeropage" last field */
sizeof(uffdio_zeropage)-sizeof(__s64)))
goto out;
ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
if (ret)
goto out;
ret = -EINVAL;
if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
mmput(ctx->mm);
} else {
return -ESRCH;
}
if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
return -EFAULT;
if (ret < 0)
goto out;
/* len == 0 would wake all */
BUG_ON(!ret);
range.len = ret;
if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
range.start = uffdio_zeropage.range.start;
wake_userfault(ctx, &range);
}
ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
out:
return ret;
}
/*
 * Translate the feature bits requested by userland into the value
 * stored in the userfaultfd context. For the current set of features
 * the bits coincide, so this is a plain narrowing conversion.
 */
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	unsigned int ctx_features = (unsigned int)user_features;

	return ctx_features;
}
/*
* userland asks for a certain API version and we return which bits
* and ioctl commands are implemented in this kernel for such API
* version or -EINVAL if unknown.
*/
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
struct uffdio_api uffdio_api;
void __user *buf = (void __user *)arg;
int ret;
__u64 features;
ret = -EINVAL;
if (ctx->state != UFFD_STATE_WAIT_API)
goto out;
ret = -EFAULT;
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
features = uffdio_api.features;
if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
memset(&uffdio_api, 0, sizeof(uffdio_api));
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
goto out;
ret = -EINVAL;
goto out;
}
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
goto out;
ctx->state = UFFD_STATE_RUNNING;
/* only enable the requested features for this uffd context */
ctx->features = uffd_ctx_features(features);
ret = 0;
out:
return ret;
}
/*
 * ioctl entry point of a userfaultfd file: dispatch "cmd" to the
 * matching UFFDIO_* handler.
 *
 * While the context is still in UFFD_STATE_WAIT_API, only UFFDIO_API
 * is accepted. Every other command, as well as any unknown command,
 * yields -EINVAL. Otherwise the handler's return value is passed on.
 */
static long userfaultfd_ioctl(struct file *file, unsigned cmd,
			      unsigned long arg)
{
	struct userfaultfd_ctx *ctx = file->private_data;

	if (ctx->state == UFFD_STATE_WAIT_API && cmd != UFFDIO_API)
		return -EINVAL;

	switch (cmd) {
	case UFFDIO_API:
		return userfaultfd_api(ctx, arg);
	case UFFDIO_REGISTER:
		return userfaultfd_register(ctx, arg);
	case UFFDIO_UNREGISTER:
		return userfaultfd_unregister(ctx, arg);
	case UFFDIO_WAKE:
		return userfaultfd_wake(ctx, arg);
	case UFFDIO_COPY:
		return userfaultfd_copy(ctx, arg);
	case UFFDIO_ZEROPAGE:
		return userfaultfd_zeropage(ctx, arg);
	default:
		break;
	}

	return -EINVAL;
}
#ifdef CONFIG_PROC_FS
/*
 * fdinfo hook of a userfaultfd file. Prints to "m":
 *   pending: number of entries queued on ctx->fault_pending_wqh
 *   total:   the above plus the entries queued on ctx->fault_wqh
 *   API:     UFFD_API, the features enabled on this context and the
 *            mask of supported ioctls, all in hex
 */
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	wait_queue_entry_t *wq;
	unsigned long pending = 0, total = 0;

	/*
	 * Only the entries are counted, their contents are never
	 * looked at. fault_pending_wqh.lock is held across both walks.
	 */
	spin_lock(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
		pending++;
		total++;
	}
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
		total++;
	}
	spin_unlock(&ctx->fault_pending_wqh.lock);

	/*
	 * If more protocols are added, they will all be shown
	 * separated by a space. Like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif
/* File operations installed on every userfaultfd file. */
static const struct file_operations userfaultfd_fops = {
	.llseek		= noop_llseek,
	.read		= userfaultfd_read,
	.poll		= userfaultfd_poll,
	/* one handler serves both the native and the compat ioctl path */
	.unlocked_ioctl	= userfaultfd_ioctl,
	.compat_ioctl	= userfaultfd_ioctl,
	.release	= userfaultfd_release,
#ifdef CONFIG_PROC_FS
	/* the fdinfo hook is only built when procfs is configured */
	.show_fdinfo	= userfaultfd_show_fdinfo,
#endif
};
/*
 * Constructor passed to kmem_cache_create() for the userfaultfd
 * context cache: initialize the wait queue heads and the refile
 * sequence counter embedded in the object at "mem".
 */
static void init_once_userfaultfd_ctx(void *mem)
{
	struct userfaultfd_ctx *uffd = mem;

	seqcount_init(&uffd->refile_seq);

	init_waitqueue_head(&uffd->fd_wqh);
	init_waitqueue_head(&uffd->event_wqh);
	init_waitqueue_head(&uffd->fault_wqh);
	init_waitqueue_head(&uffd->fault_pending_wqh);
}
/**
 * userfaultfd_file_create - Creates a userfaultfd file pointer.
 * @flags: Flags for the userfaultfd file.
 *
 * Builds the userfaultfd context and the anonymous-inode file wrapping
 * it, but does not install the file into the fd table. Keeping the
 * file pointer creation separate from the file descriptor installation
 * lets the caller finish any extra setup first, which avoids races
 * with userspace closing a newly installed file descriptor.
 *
 * The new context starts in UFFD_STATE_WAIT_API with no features
 * enabled, and takes a reference on current->mm through mmgrab() so
 * that the mm struct cannot be freed.
 *
 * Returns a userfaultfd file pointer, or a proper error pointer:
 * -EINVAL for flags outside UFFD_SHARED_FCNTL_FLAGS, -ENOMEM when the
 * context cannot be allocated, or the anon_inode_getfile() error.
 */
static struct file *userfaultfd_file_create(int flags)
{
	struct userfaultfd_ctx *ctx;
	struct file *file;

	BUG_ON(!current->mm);

	/* Check the UFFD_* constants for consistency. */
	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
		return ERR_PTR(-EINVAL);

	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	atomic_set(&ctx->refcount, 1);
	ctx->flags = flags;
	ctx->features = 0;
	ctx->state = UFFD_STATE_WAIT_API;
	ctx->released = false;
	ctx->mm = current->mm;
	/* prevent the mm struct from being freed */
	mmgrab(ctx->mm);

	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
				  O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
	if (IS_ERR(file)) {
		/* undo the mm reference and the allocation done above */
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}

	return file;
}
/*
 * userfaultfd() system call: reserve a file descriptor, create the
 * userfaultfd file and install it on that descriptor.
 *
 * Returns the new file descriptor, or a negative error from either
 * get_unused_fd_flags() or userfaultfd_file_create().
 */
SYSCALL_DEFINE1(userfaultfd, int, flags)
{
	struct file *file;
	int fd;

	fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
	if (fd < 0)
		return fd;

	file = userfaultfd_file_create(flags);
	if (IS_ERR(file)) {
		/* hand the reserved descriptor back before failing */
		put_unused_fd(fd);
		return PTR_ERR(file);
	}

	fd_install(fd, file);
	return fd;
}
/*
 * Init-time setup: create the slab cache from which userfaultfd
 * contexts are allocated, with init_once_userfaultfd_ctx() as the
 * object constructor. The result is not checked here; presumably
 * SLAB_PANIC makes a creation failure fatal.
 */
static int __init userfaultfd_init(void)
{
	const size_t ctx_size = sizeof(struct userfaultfd_ctx);

	userfaultfd_ctx_cachep =
		kmem_cache_create("userfaultfd_ctx_cache", ctx_size, 0,
				  SLAB_HWCACHE_ALIGN | SLAB_PANIC,
				  init_once_userfaultfd_ctx);
	return 0;
}
__initcall(userfaultfd_init);