mirror of
https://github.com/rd-stuffs/msm-4.14.git
synced 2025-02-20 11:45:48 +08:00
* remotes/origin/tmp-8a5776a: Linux 4.14-rc4 ARC: [plat-hsdk]: Add reset controller node to manage ethernet reset arm64: Ensure fpsimd support is ready before userspace is active arm64: Ensure the instruction emulation is ready for userspace powerpc/powernv: Increase memory block size to 1GB on radix dm raid: fix incorrect status output at the end of a "recover" process KVM: add X86_LOCAL_APIC dependency ovl: fix regression caused by exclusive upper/work dir protection ovl: fix missing unlock_rename() in ovl_do_copy_up() ovl: fix dentry leak in ovl_indexdir_cleanup() ovl: fix dput() of ERR_PTR in ovl_cleanup_index() ovl: fix error value printed in ovl_lookup_index() ovl: fix may_write_real() for overlayfs directories x86/kvm: Move kvm_fastop_exception to .fixup section i2c: i2c-stm32f7: make structure stm32f7_setup static const i2c: ensure termination of *_device_id tables i2c: i801: Add support for Intel Cedar Fork i2c: stm32f7: fix setup structure net: 8021q: skip packets if the vlan is down Update James Hogan's email address drm/i915/glk: Fix DMC/DC state idleness calculation drm/i915/cnl: Reprogram DMC firmware after S3/S4 resume i40iw: Fix port number for query QP i40iw: Add missing memory barriers RDMA/qedr: Parse vlan priority as sl RDMA/qedr: Parse VLAN ID correctly and ignore the value of zero IB/mlx5: Fix label order in error path handling arm64: Use larger stacks when KASAN is selected ACPI/IORT: Fix PCI ACS enablement kvm/x86: Avoid async PF preempting the kernel incorrectly clk: samsung: exynos4: Enable VPLL and EPLL clocks for suspend/resume cycle dm crypt: reject sector_size feature if device length is not aligned to it Btrfs: fix overlap of fs_info::flags values bsg-lib: fix use-after-free under memory-pressure btrfs: avoid overflow when sector_t is 32 bit ARM: dts: stm32: use right pinctrl compatible for stm32f469 powerpc/mm: Call flush_tlb_kernel_range with interrupts enabled powerpc/xive: Clear XIVE internal structures when a CPU is removed 
powerpc/xive: Fix IPI reset nvme-pci: Use PCI bus address for data/queues in CMB ARM: dts: stm32: Fix STMPE1600 binding on stm32429i-eval board watchdog/core: Put softlockup_threads_initialized under ifdef guard watchdog/core: Rename some softlockup_* functions powerpc/watchdog: Make use of watchdog_nmi_probe() watchdog/core, powerpc: Lock cpus across reconfiguration watchdog/core, powerpc: Replace watchdog_nmi_reconfigure() mmc: sdhci-xenon: Fix clock resource by adding an optional bus clock mmc: meson-gx: include tx phase in the tuning process mmc: meson-gx: fix rx phase reset mmc: meson-gx: make sure the clock is rounded down mmc: Delete bounce buffer handling lsm: fix smack_inode_removexattr and xattr_getsecurity memleak xfs: handle racy AIO in xfs_reflink_end_cow xfs: always swap the cow forks when swapping extents ARC: [plat-hsdk]: Temporary fix to set CPU frequency to 1GHz ARC: fix allnoconfig build warning ARCv2: boot log: identify HS48 cores (dual issue) ARC: boot log: decontaminate ARCv2 ISA_CONFIG register arc: remove redundant UTS_MACHINE define in arch/arc/Makefile ARC: [plat-eznps] Update platform maintainer as Noam left ARC: [plat-hsdk] use actual clk driver to manage cpu clk ARC: [*defconfig] Reenable soft lock-up detector ARC: [plat-axs10x] sdio: Temporary fix of sdio ciu frequency ARC: [plat-hsdk] sdio: Temporary fix of sdio ciu frequency ARC: [plat-axs103] Add temporary quirk to reset ethernet IP ARM: defconfig: update Gemini defconfig ARM: defconfig: FRAMEBUFFER_CONSOLE can no longer be =m include/linux/fs.h: fix comment about struct address_space checkpatch: fix ignoring cover-letter logic m32r: fix build failure lib/ratelimit.c: use deferred printk() version kernel/params.c: improve STANDARD_PARAM_DEF readability kernel/params.c: fix an overflow in param_attr_show kernel/params.c: fix the maximum length in param_get_string mm/memory_hotplug: define find_{smallest|biggest}_section_pfn as unsigned long mm/memory_hotplug: change 
pfn_to_section_nr/section_nr_to_pfn macro to inline function kernel/kcmp.c: drop branch leftover typo memremap: add scheduling point to devm_memremap_pages mm, page_alloc: add scheduling point to memmap_init_zone mm, memory_hotplug: add scheduling point to __add_pages lib/idr.c: fix comment for idr_replace() mm: memcontrol: use vmalloc fallback for large kmem memcg arrays kernel/sysctl.c: remove duplicate UINT_MAX check on do_proc_douintvec_conv() include/linux/bitfield.h: remove 32bit from FIELD_GET comment block lib/lz4: make arrays static const, reduces object code size exec: binfmt_misc: kill the onstack iname[BINPRM_BUF_SIZE] array exec: binfmt_misc: fix race between load_misc_binary() and kill_node() exec: binfmt_misc: remove the confusing e->interp_file != NULL checks exec: binfmt_misc: shift filp_close(interp_file) from kill_node() to bm_evict_inode() exec: binfmt_misc: don't nullify Node->dentry in kill_node() exec: load_script: kill the onstack interp[BINPRM_BUF_SIZE] array userfaultfd: non-cooperative: fix fork use after free mm/device-public-memory: fix edge case in _vm_normal_page() mm: fix data corruption caused by lazyfree page mm: avoid marking swap cached page as lazyfree mm: have filemap_check_and_advance_wb_err clear AS_EIO/AS_ENOSPC m32r: define CPU_BIG_ENDIAN zram: fix null dereference of handle mm: fix RODATA_TEST failure "rodata_test: test data was not read only" rapidio: remove global irq spinlocks from the subsystem mm: meminit: mark init_reserved_page as __meminit z3fold: fix stale list handling mm,compaction: serialize waitqueue_active() checks (for real) android: binder: drop lru lock in isolate callback mm/memcg: avoid page count check for zone device mm, memcg: remove hotplug locking from try_charge mm, oom_reaper: skip mm structs with mmu notifiers z3fold: fix potential race in z3fold_reclaim_page sh: sh7269: remove nonexistent GPIO_PH[0-7] to fix pinctrl registration sh: sh7264: remove nonexistent GPIO_PH[0-7] to fix pinctrl 
registration sh: sh7757: remove nonexistent GPIO_PT[JLNQ]7_RESV to fix pinctrl registration sh: sh7722: remove nonexistent GPIO_PTQ7 to fix pinctrl registration mm, hugetlb, soft_offline: save compound page order before page migration ksm: fix unlocked iteration over vmas in cmp_and_merge_page() include/linux/mm.h: fix typo in VM_MPX definition scripts/spelling.txt: add more spelling mistakes to spelling.txt kernel/params.c: align add_sysfs_param documentation with code alpha: fix build failures bpf: fix bpf_tail_call() x64 JIT net: stmmac: dwmac-rk: Add RK3128 GMAC support blk-mq-debugfs: fix device sched directory for default scheduler null_blk: change configfs dependency to select blk-throttle: fix possible io stall when upgrade to max rndis_host: support Novatel Verizon USB730L drm/i915: Fix DDI PHY init if it was already on ide: fix IRQ assignment for PCI bus order probing ide: pci: free PCI BARs on initialization failure ide: free hwif->portdev on hwif_init() failure MAINTAINERS: update list for NBD net: rtnetlink: fix info leak in RTM_GETSTATS call KVM: PPC: Book3S: Fix server always zero from kvmppc_xive_get_xive() rcu: Remove extraneous READ_ONCE()s from rcu_irq_{enter,exit}() ftrace: Fix kmemleak in unregister_ftrace_graph powerpc/4xx: Fix compile error with 64K pages on 40x, 44x powerpc: Fix action argument for cpufeatures-based TLB flush scsi: ibmvscsis: Fix write_pending failure path scsi: libiscsi: Remove iscsi_destroy_session scsi: libiscsi: Fix use-after-free race during iscsi_session_teardown scsi: sd: Do not override max_sectors_kb sysfs setting scsi: sd: Implement blacklist option for WRITE SAME w/ UNMAP socket, bpf: fix possible use after free nbd: fix -ERESTARTSYS handling drm/sun4i: hdmi: Disable clks in bind function error path and unbind function ahci: don't ignore result code of ahci_reset_controller() mlxsw: spectrum_router: Track RIF of IPIP next hops mlxsw: spectrum_router: Move VRF refcounting ALSA: usx2y: Suppress kernel warning at 
page allocation failures ceph: fix __choose_mds() for LSSNAP request ceph: properly queue cap snap for newly created snap realm arm64: fix misleading data abort decoding Revert "ALSA: echoaudio: purge contradictions between dimension matrix members and total number of members" Revert "HID: multitouch: Support ALPS PTP stick with pid 0x120A" HID: hidraw: fix power sequence when closing device HID: wacom: Always increment hdev refcount within wacom_get_hdev_data mmc: core: add driver strength selection when selecting hs400es net: hns3: Fix an error handling path in 'hclge_rss_init_hw()' net: mvpp2: Fix clock resource by adding an optional bus clock r8152: add Linksys USB3GIGV1 id l2tp: fix l2tp_eth module loading ip_gre: erspan device should keep dst ip_gre: set tunnel hlen properly in erspan_tunnel_init ip_gre: check packet length and mtu correctly in erspan_xmit ip_gre: get key from session_id correctly in erspan_rcv Linux 4.14-rc3 hwmon: (xgene) Fix up error handling path mixup in 'xgene_hwmon_probe()' nvme: fix visibility of "uuid" ns attribute tipc: use only positive error codes in messages ppp: fix __percpu annotation udp: perform source validation for mcast early demux IPv4: early demux can return an error code ip6_tunnel: update mtu properly for ARPHRD_ETHER tunnel device in tx path ip6_gre: ip6gre_tap device should keep dst ip_gre: ipgre_tap device should keep dst netlink: do not proceed if dump's start() errs clk: Export clk_bulk_prepare() fix infoleak in waitid(2) x86/asm: Use register variable to get stack pointer value x86/mm: Disable branch profiling in mem_encrypt.c arm64: fault: Route pte translation faults via do_translation_fault arm64: mm: Use READ_ONCE when dereferencing pointer to pte table RDMA/iwpm: Properly mark end of NL messages kvm/x86: Handle async PF in RCU read-side critical sections KVM: nVMX: Fix nested #PF intends to break L1's vmlauch/vmresume sched/sysctl: Check user input value of sysctl_sched_time_avg x86/asm: Fix inline asm call 
constraints for GCC 4.4 sched/debug: Add explicit TASK_PARKED printing sched/debug: Ignore TASK_IDLE for SysRq-W sched/debug: Add explicit TASK_IDLE printing sched/tracing: Use common task-state helpers locking/rwsem-xadd: Fix missed wakeup due to reordering of load sched/tracing: Fix trace_sched_switch task-state printing sched/debug: Remove unused variable sched/debug: Convert TASK_state to hex sched/debug: Implement consistent task-state printing um/time: Fixup namespace collision perf/aux: Only update ->aux_wakeup in non-overwrite mode cxl: Fix memory page not handled powerpc: Fix workaround for spurious MCE on POWER9 PM / s2idle: Invoke the ->wake() platform callback earlier Revert "Bluetooth: Add option for disabling legacy ioctl interfaces" net: Set sk_prot_creator when cloning sockets to the right proto net: dsa: mv88e6xxx: lock mutex when freeing IRQs packet: only test po->has_vnet_hdr once in packet_snd packet: in packet_do_bind, test fanout with bind_lock held net: stmmac: dwmac4: Re-enable MAC Rx before powering down net: stmmac: dwc-qos: Add suspend / resume support net: dsa: Fix network device registration order net: dsa: mv88e6xxx: Allow dsa and cpu ports in multiple vlans inetpeer: fix RCU lookup() again net: mvpp2: do not select the internal source clock net: mvpp2: fix port list indexing net: mvpp2: fix parsing fragmentation detection dm crypt: fix memory leak in crypt_ctr_cipher_old() perf test: Fix vmlinux failure on s390x part 2 perf test: Fix vmlinux failure on s390x KVM: VMX: use cmpxchg64 tun: bail out from tun_get_user() if the skb is empty percpu: fix iteration to prevent skipping over block timer: Prepare to change timer callback argument type xen/mmu: Call xen_cleanhighmap() with 4MB aligned for page tables mapping xen-pciback: relax BAR sizing write value check watchdog/hardlockup/perf: Fix spelling mistake: "permanetely" -> "permanently" irq/generic-chip: Don't replace domain's name usb: dwc3: of-simple: Add compatible for Spreadtrum 
SC9860 platform usb: gadget: udc: atmel: set vbus irqflags explicitly usb: gadget: ffs: handle I/O completion in-order usb: renesas_usbhs: fix usbhsf_fifo_clear() for RX direction usb: renesas_usbhs: fix the BCLR setting condition for non-DCP pipe usb: gadget: udc: renesas_usb3: Fix return value of usb3_write_pipe() usb: gadget: udc: renesas_usb3: fix Pn_RAMMAP.Pn_MPKT value usb: gadget: udc: renesas_usb3: fix for no-data control transfer USB: dummy-hcd: Fix erroneous synchronization change USB: dummy-hcd: fix infinite-loop resubmission bug USB: dummy-hcd: fix connection failures (wrong speed) seccomp: fix the usage of get/put_seccomp_filter() in seccomp_get_filter() objtool: Support unoptimized frame pointer setup objtool: Skip unreachable warnings for GCC 4.4 and older net/mlx5: Fix wrong indentation in enable SRIOV code net/mlx5: Fix static checker warning on steering tracepoints code net/mlx5e: Fix calculated checksum offloads counters net/mlx5e: Don't add/remove 802.1ad rules when changing 802.1Q VLAN filter net/mlx5e: Print netdev features correctly in error message net/mlx5e: Check encap entry state when offloading tunneled flows net/mlx5e: Disallow TC offloading of unsupported match/action combinations net/mlx5e: Fix erroneous freeing of encap header buffer net/mlx5: Check device capability for maximum flow counters net/mlx5: Fix FPGA capability location net/mlx5e: IPoIB, Fix access to invalid memory address md/raid5: cap worker count dm-raid: fix a race condition in request handling md: fix a race condition for flush request handling md: separate request handling scsi: ILLEGAL REQUEST + ASC==27 => target failure scsi: aacraid: Add a small delay after IOP reset cpufreq: docs: Drop intel-pstate.txt from index.txt percpu: fix starting offset for chunk statistics traversal ACPI / APEI: clear error status before acknowledging the error bcache: use llist_for_each_entry_safe() in __closure_wake_up() mtd: nand: atmel: fix buffer overflow in atmel_pmecc_user 
IB/hfi1: Unsuccessful PCIe caps tuning should not fail driver load IB/hfi1: On error, fix use after free during user context setup Revert "IB/ipoib: Update broadcast object if PKey value was changed in index 0" IB/hfi1: Return correct value in general interrupt handler IB/hfi1: Check eeprom config partition validity IB/hfi1: Only reset QSFP after link up and turn off AOC TX IB/hfi1: Turn off AOC TX after offline substates iommu: Fix comment for iommu_ops.map_sg iommu/amd: pr_err() strings should end with newlines iommu/mediatek: Limit the physical address in 32bit for v7s iommu/io-pgtable-arm-v7s: Need dma-sync while there is no QUIRK_NO_DMA mtd: Fix partition alignment check on multi-erasesize devices KVM: VMX: simplify and fix vmx_vcpu_pi_load KVM: VMX: avoid double list add with VT-d posted interrupts KVM: VMX: extract __pi_post_block arm64: Make sure SPsel is always set quota: Fix quota corruption with generic/232 test platform/x86: fujitsu-laptop: Don't oops when FUJ02E3 is not presnt sctp: Fix a big endian bug in sctp_diag_dump() vfs: Return -ENXIO for negative SEEK_HOLE / SEEK_DATA offsets atlantic: fix iommu errors aquantia: Fix transient invalid link down/up indications aquantia: Fix Tx queue hangups aquantia: Setup max_mtu in ndev to enable jumbo frames xfs: revert "xfs: factor rmap btree size into the indlen calculations" xfs: Capture state of the right inode in xfs_iflush_done xfs: perag initialization should only touch m_ag_max_usable for AG 0 xfs: update i_size after unwritten conversion in dio completion iomap_dio_rw: Allocate AIO completion queue before submitting dio xfs: validate bdev support for DAX inode flag l2tp: fix race condition in l2tp_tunnel_delete vti: fix use after free in vti_tunnel_xmit/vti6_tnl_xmit drm/i915/bios: ignore HDMI on port A drm/i915: remove redundant variable hw_check drm/i915: always update ELD connector type after get modes percpu: make this_cpu_generic_read() atomic w.r.t. 
interrupts arm64: dts: rockchip: add the grf clk for dw-mipi-dsi on rk3399 btrfs: log csums for all modified extents Btrfs: fix unexpected result when dio reading corrupted blocks btrfs: Report error on removing qgroup if del_qgroup_item fails Btrfs: skip checksum when reading compressed data if some IO have failed Btrfs: fix kernel oops while reading compressed data Btrfs: use btrfs_op instead of bio_op in __btrfs_map_block Btrfs: do not backup tree roots when fsync btrfs: remove BTRFS_FS_QUOTA_DISABLING flag btrfs: propagate error to btrfs_cmp_data_prepare caller btrfs: prevent to set invalid default subvolid Btrfs: send: fix error number for unknown inode types btrfs: fix NULL pointer dereference from free_reloc_roots() btrfs: finish ordered extent cleaning if no progress is found btrfs: clear ordered flag on cleaning up ordered extents Btrfs: fix incorrect {node,sector}size endianness from BTRFS_IOC_FS_INFO Btrfs: do not reset bio->bi_ops while writing bio Btrfs: use the new helper wbc_to_write_flags powerpc: Handle MCE on POWER9 with only DSISR bit 30 set drm/tegra: trace: Fix path to include x86/fpu: Use using_compacted_format() instead of open coded X86_FEATURE_XSAVES x86/fpu: Use validate_xstate_header() to validate the xstate_header in copy_user_to_xstate() x86/fpu: Eliminate the 'xfeatures' local variable in copy_user_to_xstate() x86/fpu: Copy the full header in copy_user_to_xstate() x86/fpu: Use validate_xstate_header() to validate the xstate_header in copy_kernel_to_xstate() x86/fpu: Eliminate the 'xfeatures' local variable in copy_kernel_to_xstate() x86/fpu: Copy the full state_header in copy_kernel_to_xstate() x86/fpu: Use validate_xstate_header() to validate the xstate_header in __fpu__restore_sig() x86/fpu: Use validate_xstate_header() to validate the xstate_header in xstateregs_set() x86/fpu: Introduce validate_xstate_header() x86/fpu: Rename fpu__activate_fpstate_read/write() to fpu__prepare_[read|write]() x86/fpu: Rename fpu__activate_curr() to 
fpu__initialize() x86/fpu: Simplify and speed up fpu__copy() x86/fpu: Fix stale comments about lazy FPU logic x86/fpu: Rename fpu::fpstate_active to fpu::initialized x86/fpu: Remove fpu__current_fpstate_write_begin/end() x86/fpu: Fix fpu__activate_fpstate_read() and update comments netlink: fix nla_put_{u8,u16,u32} for KASAN rocker: fix rocker_tlv_put_* functions for KASAN scsi: scsi_transport_fc: Also check for NOTPRESENT in fc_remote_port_add() xfs: remove redundant re-initialization of total_nr_pages xfs: Output warning message when discard option was enabled even though the device does not support discard xfs: report zeroed or not correctly in xfs_zero_range() xfs: kill meaningless variable 'zero' fs/xfs: Use %pS printk format for direct addresses xfs: evict CoW fork extents when performing finsert/fcollapse xfs: don't unconditionally clear the reflink flag on zero-block files fix a typo in put_compat_shm_info() PCI: Fix race condition with driver_override net: qcom/emac: specify the correct size when mapping a DMA buffer cpufreq: dt: Fix sysfs duplicate filename creation for platform-device scsi: scsi_transport_fc: set scsi_target_id upon rescan PM / OPP: Call notifier without holding opp_table->lock security/keys: rewrite all of big_key crypto security/keys: properly zero out sensitive key material in big_key l2tp: fix race between l2tp_session_delete() and l2tp_tunnel_closeall() l2tp: ensure sessions are freed after their PPPOL2TP socket smp/hotplug: Hotplug state fail injection smp/hotplug: Differentiate the AP completion between up and down smp/hotplug: Differentiate the AP-work lockdep class between up and down smp/hotplug: Callback vs state-machine consistency smp/hotplug: Rewrite AP state machine core smp/hotplug: Allow external multi-instance rollback smp/hotplug: Add state diagram MAINTAINERS: Add entry for MediaTek PMIC LED driver scsi: scsi_transport_iscsi: fix the issue that iscsi_if_rx doesn't parse nlmsg properly irqdomain: Add __rcu annotations 
to radix tree accessors irqchip/mips-gic: Use effective affinity to unmask irqchip/mips-gic: Fix shifts to extract register fields nvme-fcloop: fix port deletes and callbacks nvmet-fc: sync header templates with comments nvmet-fc: ensure target queue id within range. nvmet-fc: on port remove call put outside lock nvme-rdma: don't fully stop the controller in error recovery nvme-rdma: give up reconnect if state change fails nvme-core: Use nvme_wq to queue async events and fw activation nvme: fix sqhd reference when admin queue connect fails watchdog/hardlockup/perf: Cure UP damage gfs2: Fix debugfs glocks dump selftests: timers: set-timer-lat: Fix hang when testing unsupported alarms selftests: timers: set-timer-lat: fix hang when std out/err are redirected selftests/memfd: correct run_tests.sh permission selftests/seccomp: Support glibc 2.26 siginfo_t.h selftests: futex: Makefile: fix for loops in targets to run silently selftests: Makefile: fix for loops in targets to run silently selftests: mqueue: Use full path to run tests from Makefile selftests: futex: copy sub-dir test scripts for make O=dir run PCI: Add dummy pci_acs_enabled() for CONFIG_PCI=n build IB/mlx5: Fix NULL deference on mlx5_ib_update_xlt failure IB/mlx5: Simplify mlx5_ib_cont_pages IB/ipoib: Fix inconsistency with free_netdev and free_rdma_netdev IB/ipoib: Fix sysfs Pkey create<->remove possible deadlock IB: Correct MR length field to be 64-bit IB/core: Fix qp_sec use after free access IB/core: Fix typo in the name of the tag-matching cap struct perf tools: Fix syscalltbl build failure perf report: Fix debug messages with --call-graph option dm ioctl: fix alignment of event number in the device list block: fix a crash caused by wrong API fs: Fix page cache inconsistency when mixing buffered and AIO DIO nvmet: implement valid sqhd values in completions nvme-fabrics: Allow 0 as KATO value nvme: allow timed-out ios to retry nvme: stop aer posting if controller state not live nvme-pci: Print invalid 
SGL only once nvme-pci: initialize queue memory before interrupts nvmet-fc: fix failing max io queue connections nvme-fc: use transport-specific sgl format nvme: add transport SGL definitions nvme.h: remove FC transport-specific error values qla2xxx: remove use of FC-specific error codes lpfc: remove use of FC-specific error codes nvmet-fcloop: remove use of FC-specific error codes nvmet-fc: remove use of FC-specific error codes nvme-fc: remove use of FC-specific error codes loop: remove union of use_aio and ref in struct loop_cmd blktrace: Fix potential deadlock between delete & sysfs ops nbd: ignore non-nbd ioctl's bsg-lib: don't free job in bsg_prepare_job brd: fix overflow in __brd_direct_access genirq: Check __free_irq() return value for NULL futex: Fix pi_state->owner serialization KEYS: use kmemdup() in request_key_auth_new() KEYS: restrict /proc/keys by credentials at open time KEYS: reset parent each time before searching key_user_tree KEYS: prevent KEYCTL_READ on negative key KEYS: prevent creating a different user's keyrings KEYS: fix writing past end of user-supplied buffer in keyring_read() KEYS: fix key refcount leak in keyctl_read_key() KEYS: fix key refcount leak in keyctl_assume_authority() KEYS: don't revoke uninstantiated key in request_key_auth_new() KEYS: fix cred refcount leak in request_key_auth_new() perf evsel: Fix attr.exclude_kernel setting for default cycles:p tools include: Sync kernel ABI headers with tooling headers perf tools: Get all of tools/{arch,include}/ in the MANIFEST arch: change default endian for microblaze microblaze: Cocci spatch "vma_pages" microblaze: Add missing kvm_para.h to Kbuild perf/x86/intel/uncore: Correct num_boxes for IIO and IRP USB: cdc-wdm: ignore -EPIPE from GetEncapsulatedResponse USB: devio: Don't corrupt user memory USB: devio: Prevent integer overflow in proc_do_submiturb() perf/x86/intel/rapl: Add missing CPU IDs perf/x86/msr: Add missing CPU IDs perf/x86/intel/cstate: Add missing CPU IDs x86: Don't 
cast away the __user in __get_user_asm_u64() x86/sysfs: Fix off-by-one error in loop termination x86/mm: Fix fault error path using unsafe vma pointer x86/numachip: Add const and __initconst to numachip2_clockevent x86/fpu: Reinitialize FPU registers if restoring FPU state fails x86/fpu: Don't let userspace set bogus xcomp_bv qxl: fix framebuffer unpinning Linux 4.14-rc2 staging: iio: ad7192: Fix - use the dedicated reset function avoiding dma from stack. iio: core: Return error for failed read_reg iio: ad7793: Fix the serial interface reset iio: ad_sigma_delta: Implement a dedicated reset function IIO: BME280: Updates to Humidity readings need ctrl_reg write! iio: adc: mcp320x: Fix readout of negative voltages iio: adc: mcp320x: Fix oops on module unload iio: adc: stm32: fix bad error check on max_channels iio: trigger: stm32-timer: fix a corner case to write preset iio: trigger: stm32-timer: preset shouldn't be buffered iio: adc: twl4030: Return an error if we can not enable the vusb3v1 regulator in 'twl4030_madc_probe()' iio: adc: twl4030: Disable the vusb3v1 rugulator in the error handling path of 'twl4030_madc_probe()' iio: adc: twl4030: Fix an error handling path in 'twl4030_madc_probe()' x86/fpu: Turn WARN_ON() in context switch into WARN_ON_FPU() x86/fpu: Fix boolreturn.cocci warnings x86/fpu: Add FPU state copying quirk to handle XRSTOR failure on Intel Skylake CPUs x86/fpu: Remove struct fpu::fpregs_active x86/fpu: Decouple fpregs_activate()/fpregs_deactivate() from fpu->fpregs_active x86/fpu: Change fpu->fpregs_active users to fpu->fpstate_active x86/fpu: Split the state handling in fpu__drop() x86/fpu: Make the fpu state change in fpu__clear() scheduler-atomic x86/fpu: Simplify fpu->fpregs_active use x86/fpu: Flip the parameter order in copy_*_to_xstate() x86/fpu: Remove 'kbuf' parameter from the copy_user_to_xstate() API x86/fpu: Remove 'ubuf' parameter from the copy_kernel_to_xstate() API x86/fpu: Split copy_user_to_xstate() into 
copy_kernel_to_xstate() & copy_user_to_xstate() x86/fpu: Simplify __copy_xstate_to_kernel() return values x86/fpu: Change 'size_total' parameter to unsigned and standardize the size checks in copy_xstate_to_*() x86/fpu: Clarify parameter names in the copy_xstate_to_*() methods x86/fpu: Remove the 'start_pos' parameter from the __copy_xstate_to_*() functions x86/fpu: Clean up the parameter definitions of copy_xstate_to_*() x86/fpu: Clean up parameter order in the copy_xstate_to_*() APIs x86/fpu: Remove 'kbuf' parameter from the copy_xstate_to_user() APIs x86/fpu: Remove 'ubuf' parameter from the copy_xstate_to_kernel() APIs x86/fpu: Split copy_xstate_to_user() into copy_xstate_to_kernel() & copy_xstate_to_user() x86/fpu: Rename copyin_to_xsaves()/copyout_from_xsaves() to copy_user_to_xstate()/copy_xstate_to_user() tpm: ibmvtpm: simplify crq initialization and document crq format tpm: replace msleep() with usleep_range() in TPM 1.2/2.0 generic drivers Documentation: tpm: add powered-while-suspended binding documentation tpm: tpm_crb: constify acpi_device_id. 
tpm: vtpm: constify vio_device_id security: fix description of values returned by cap_inode_need_killpriv net: qualcomm: rmnet: Fix rcu splat in rmnet_is_real_dev_registered cnic: Fix an error handling path in 'cnic_alloc_bnx2x_resc()' tracing: Remove RCU work arounds from stack tracer extable: Enable RCU if it is not watching in kernel_text_address() extable: Consolidate *kernel_text_address() functions rcu: Allow for page faults in NMI handlers as3645a: Unregister indicator LED on device unbind as3645a: Use integer numbers for parsing LEDs dt: bindings: as3645a: Use LED number to refer to LEDs as3645a: Use ams,input-max-microamp as documented in DT bindings x86/asm: Fix inline asm call constraints for Clang objtool: Handle another GCC stack pointer adjustment bug inet: fix improper empty comparison net: use inet6_rcv_saddr to compare sockets net: set tb->fast_sk_family net: orphan frags on stand-alone ptype in dev_queue_xmit_nit MAINTAINERS: update git tree locations for ieee802154 subsystem SMB3: Don't ignore O_SYNC/O_DSYNC and O_DIRECT flags SMB3: handle new statx fields arch: remove unused *_segments() macros/functions parisc: Unbreak bootloader due to gcc-7 optimizations parisc: Reintroduce option to gzip-compress the kernel apparmor: fix apparmorfs DAC access permissions apparmor: fix build failure on sparc caused by undeclared signals apparmor: fix incorrect type assignment when freeing proxies apparmor: ensure unconfined profiles have dfas initialized apparmor: fix race condition in null profile creation apparmor: move new_null_profile to after profile lookup fns() apparmor: add base infastructure for socket mediation apparmor: add more debug asserts to apparmorfs apparmor: make policy_unpack able to audit different info messages apparmor: add support for absolute root view based labels apparmor: cleanup conditional check for label in label_print apparmor: add mount mediation apparmor: add the ability to mediate signals apparmor: Redundant condition: 
prev_ns. in [label.c:1498] apparmor: Fix an error code in aafs_create() apparmor: Fix logical error in verify_header() apparmor: Fix shadowed local variable in unpack_trans_table() bnxt_re: Don't issue cmd to delete GID for QP1 GID entry before the QP is destroyed bnxt_re: Fix memory leak in FRMR path bnxt_re: Remove RTNL lock dependency in bnxt_re_query_port bnxt_re: Fix race between the netdev register and unregister events bnxt_re: Free up devices in module_exit path bnxt_re: Fix compare and swap atomic operands bnxt_re: Stop issuing further cmds to FW once a cmd times out bnxt_re: Fix update of qplib_qp.mtu when modified parisc: Add HWPOISON page fault handler code parisc: Move init_per_cpu() into init section parisc: Check if initrd was loaded into broken RAM parisc: Add PDCE_CHECK instruction to HPMC handler parisc: Add wrapper for pdc_instr() firmware function parisc: Move start_parisc() into init section parisc: Stop unwinding at start of stack parisc: Fix too large frame size warnings i40iw: Add support for port reuse on active side connections i40iw: Add missing VLAN priority i40iw: Call i40iw_cm_disconn on modify QP to disconnect i40iw: Prevent multiple netdev event notifier registrations i40iw: Fail open if there are no available MSI-X vectors RDMA/vmw_pvrdma: Fix reporting correct opcodes for completion IB/bnxt_re: Fix frame stack compilation warning IB/mlx5: fix debugfs cleanup IB/ocrdma: fix incorrect fall-through on switch statement IB/ipoib: Suppress the retry related completion errors Input: elan_i2c - extend Flash-Write delay iw_cxgb4: remove the stid on listen create failure iw_cxgb4: drop listen destroy replies if no ep found iw_cxgb4: put ep reference in pass_accept_req() USB: g_mass_storage: Fix deadlock when driver is unbound USB: gadgetfs: Fix crash caused by inadequate synchronization USB: gadgetfs: fix copy_to_user while holding spinlock USB: uas: fix bug in handling of alternate settings IB/core: Fix for core panic cgroup: Reinit 
cgroup_taskset structure before cgroup_migrate_execute() returns ALSA: usb-audio: Check out-of-bounds access by corrupted buffer descriptor drivers/perf: arm_pmu_acpi: Release memory obtained by kasprintf iommu/of: Remove PCI host bridge node check ALSA: pcm: Fix structure definition for X32 ABI mmc: sdhci-pci: Fix voltage switch for some Intel host controllers staging: rtl8723bs: avoid null pointer dereference on pmlmepriv staging: rtl8723bs: add missing range check on id mmc: tmio: remove broken and noisy debug macro KVM: PPC: Book3S HV: Check for updated HDSISR on P9 HDSI exception KVM: nVMX: fix HOST_CR3/HOST_CR4 cache Drivers: hv: fcopy: restore correct transfer length vmbus: don't acquire the mutex in vmbus_hvsock_device_unregister() intel_th: pci: Add Lewisburg PCH support intel_th: pci: Add Cedar Fork PCH support stm class: Fix a use-after-free usb-storage: unusual_devs entry to fix write-access regression for Seagate external drives usb-storage: fix bogus hardware error messages for ATA pass-thru devices drm/sun4i: cec: Enable back CEC-pin framework net: prevent dst uses after free net: phy: Fix truncation of large IRQ numbers in phy_attached_print() dt-bindings: clk: stm32h7: fix clock-cell size Input: uinput - avoid crash when sending FF request to device going away Input: uinput - avoid FF flush when destroying device net/smc: no close wait in case of process shut down net/smc: introduce a delay net/smc: terminate link group if out-of-sync is received net/smc: longer delay for client link group removal net/smc: adapt send request completion notification net/smc: adjust net_device refcount net/smc: take RCU read lock for routing cache lookup net/smc: add receive timeout check net/smc: add missing dev_put net: stmmac: Cocci spatch "of_table" lan78xx: Use default values loaded from EEPROM/OTP after reset lan78xx: Allow EEPROM write for less than MAX_EEPROM_SIZE lan78xx: Fix for eeprom read/write when device auto suspend net: phy: Keep reporting transceiver 
type net: ethtool: Add back transceiver type net: qcom/emac: add software control for pause frame mode hv_netvsc: fix send buffer failure on MTU change net_sched: remove cls_flower idr on failure net_sched/hfsc: fix curve activation in hfsc_change_class() net_sched: always reset qdisc backlog in qdisc_reset() x86/xen: clean up clang build warning USB: core: harden cdc_parse_cdc_header ath10k: mark PM functions as __maybe_unused MIPS: PCI: fix pcibios_map_irq section mismatch MIPS: Fix input modify in __write_64bit_c0_split() MIPS: MSP71xx: Include asm/setup.h selftests: lib.mk: copy test scripts and test files for make O=dir run selftests: sync: kselftest and kselftest-clean fail for make O=dir case selftests: sync: use TEST_CUSTOM_PROGS instead of TEST_PROGS selftests: lib.mk: add TEST_CUSTOM_PROGS to allow custom test run/install selftests: watchdog: fix to use TEST_GEN_PROGS and remove clean selftests: lib.mk: fix test executable status check to use full path selftests: Makefile: clear LDFLAGS for make O=dir use-case selftests: lib.mk: kselftest and kselftest-clean fail for make O=dir case Makefile: kselftest and kselftest-clean fail for make O=dir case reset: Restrict RESET_HSDK to ARC_SOC_HSDK or COMPILE_TEST Revert "genirq: Restrict effective affinity to interrupts actually using it" powerpc/pseries: Fix parent_dn reference leak in add_dt_node() powerpc/pseries: Fix "OF: ERROR: Bad of_node_put() on /cpus" during DLPAR powerpc/eeh: Create PHB PEs after EEH is initialized ipc/shm: Fix order of parameters when calling copy_compat_shmid_to_user iov_iter: fix page_copy_sane for compound pages SMB: Validate negotiate (to protect against downgrade) even if signing off cifs: release auth_key.response for reconnect. 
cifs: release cifs root_cred after exit_cifs CIFS: make arrays static const, reduces object code size net: hns3: Fix for pri to tc mapping in TM net: hns3: Fix for setting rss_size incorrectly net: hns3: Fix typo error for feild in hclge_tm net: hns3: Fix for rx priv buf allocation when DCB is not supported net: hns3: Fix for rx_priv_buf_alloc not setting rx shared buffer net: hns3: Fix for not setting rx private buffer size to zero net: hns3: Fix for DEFAULT_DV when dev doesn't support DCB net: hns3: Fix initialization when cmd is not supported net: hns3: Cleanup for ROCE capability flag in ae_dev isdn/i4l: fetch the ppp_write buffer in one shot net: fec: return IRQ_HANDLED if fec_ptp_check_pps_event handled it net: fec: remove unused interrupt FEC_ENET_TS_TIMER net: fec: only check queue 0 if RXF_0/TXF_0 interrupt is set net: change skb->mac_header when Generic XDP calls adjust_head net: compat: assert the size of cmsg copied in is as expected drm/amdkfd: Print event limit messages only once per process drm/amdkfd: Fix kernel-queue wrapping bugs drm/amdkfd: Fix incorrect destroy_mqd parameter [SMB3] Update session and share information displayed for debugging SMB2/SMB3 bpf: one perf event close won't free bpf program attached by another perf event packet: hold bind lock when rebinding to fanout hook ALSA: usb-audio: Add sample rate quirk for Plantronics C310/C520-M PCI: endpoint: Use correct "end of test" interrupt scripts/dtc: dtx_diff - 2nd update of include dts paths to match build kbuild: rpm-pkg: fix version number handling kbuild: deb-pkg: remove firmware package support kbuild: rpm-pkg: delete firmware_install to fix build error qtnfmac: cancel scans on wireless interface changes qtnfmac: lock access to h/w in tx path usb: gadget: dummy: fix nonsensical comparisons usb: gadget: udc: fix snps_udc_plat.c build errors usb: gadget: function: printer: avoid spinlock recursion usb: gadget: core: fix ->udc_set_speed() logic s390/topology: enable / disable 
topology dynamically s390/topology: alternative topology for topology-less machines powerpc/kprobes: Update optprobes to use emulate_update_regs() ALSA: hda - program ICT bits to support HBR audio crypto: af_alg - update correct dst SGL entry crypto: caam - fix LS1021A support on ARMv7 multiplatform kernel crypto: inside-secure - fix gcc-4.9 warnings crypto: talitos - Don't provide setkey for non hmac hashing algs. crypto: talitos - fix hashing crypto: talitos - fix sha224 crypto: x86/twofish - Fix RBP usage crypto: sha512-avx2 - Fix RBP usage crypto: x86/sha256-ssse3 - Fix RBP usage crypto: x86/sha256-avx2 - Fix RBP usage crypto: x86/sha256-avx - Fix RBP usage crypto: x86/sha1-ssse3 - Fix RBP usage crypto: x86/sha1-avx2 - Fix RBP usage crypto: x86/des3_ede - Fix RBP usage crypto: x86/cast6 - Fix RBP usage crypto: x86/cast5 - Fix RBP usage crypto: x86/camellia - Fix RBP usage crypto: x86/blowfish - Fix RBP usage crypto: drbg - fix freeing of resources MIPS: Fix perf event init ARM: dts: da850-evm: add serial and ethernet aliases cifs: show 'soft' in the mount options for hard mounts SMB3: Warn user if trying to sign connection that authenticated as guest SMB3: Fix endian warning brcmfmac: setup passive scan if requested by user-space brcmfmac: add length check in brcmf_cfg80211_escan_handler() powerpc/powernv: Clear LPCR[PECE1] via stop-api only for deep state offline powerpc/sstep: mullw should calculate a 64 bit signed result powerpc/sstep: Fix issues with mcrf powerpc/sstep: Fix issues with set_cr0() powerpc/tm: Flush TM only if CPU has TM feature powerpc/sysrq: Fix oops whem ppmu is not registered powerpc/configs: Update for CONFIG_SND changes drm/exynos/hdmi: Fix unsafe list iteration Fix SMB3.1.1 guest authentication to Samba ipv6: fix net.ipv6.conf.all interface DAD handlers net: ipv6: fix regression of no RTM_DELADDR sent after DAD failure bpf: fix ri->map_owner pointer on bpf_prog_realloc net: emac: Fix napi poll list corruption tcp: fastopen: fix on 
syn-data transmit failure net: hns3: Fixes the premature exit of loop when matching clients net: hns3: Fixes the default VLAN-id of PF net: hns3: Fixes the ether address copy with appropriate API net: hns3: Fixes the initialization of MAC address in hardware net: hns3: Fixes ring-to-vector map-and-unmap command net: hns3: Fixes the command used to unmap ring from vector net: hns3: Fixes initialization of phy address from firmware cpufreq: ti-cpufreq: Support additional am43xx platforms bpf: do not disable/enable BH in bpf_map_free_id() tracing: Fix trace_pipe behavior for instance traces rhashtable: Documentation tweak ACPI: properties: Return _DSD hierarchical extension (data) sub-nodes correctly ARM: cpuidle: Avoid memleak if init fail cpufreq: dt-platdev: Add some missing platforms to the blacklist PM: core: Fix device_pm_check_callbacks() PM: docs: Drop an excess character from devices.rst net: phy: Kconfig: Fix PHY infrastructure menu in menuconfig ACPI / bus: Make ACPI_HANDLE() work for non-GPL code again selftests/net: msg_zerocopy enable build with older kernel headers selftests: actually run the various net selftests selftest: add a reuseaddr test selftests: silence test output by default ALSA: asihpi: fix a potential double-fetch bug when copying puhm MIPS: PCI: Move map_irq() hooks out of initdata ceph: avoid panic in create_session_open_msg() if utsname() returns NULL irqchip.mips-gic: Fix shared interrupt mask writes irqchip/gic-v4: Fix building with ancient gcc irqchip/gic-v3: Iterate over possible CPUs by for_each_possible_cpu() libceph: don't allow bidirectional swap of pg-upmap-items ARM: dts: am43xx-epos-evm: Remove extra CPSW EMAC entry ARM: dts: am33xx: Add spi alias to match SOC schematics ARM: OMAP2+: hsmmc: fix logic to call either omap_hsmmc_init or omap_hsmmc_late_init but not both ARM: dts: dra7: Set a default parent to mcasp3_ahclkx_mux ARM: OMAP2+: dra7xx: Set OPT_CLKS_IN_RESET flag for gpio1 ARM: dts: nokia n900: drop 
unneeded/undocumented parts of the dts MAINTAINERS: Remove Yuval Mintz from maintainers list arm64: dts: rockchip: Correct MIPI DPHY PLL clock on rk3399 dt-bindings: fix vendor prefix for Abracon of: provide inline helper for of_find_device_by_node tracing: Ignore mmiotrace from kernel commandline tracing: Erase irqsoff trace with empty write USB: fix out-of-bounds in usb_set_configuration arm64: dt marvell: Fix AP806 system controller size MAINTAINERS: add Macchiatobin maintainers entry iommu/qcom: Depend on HAS_DMA to fix compile error xen, arm64: drop dummy lookup_address() KVM: VMX: remove WARN_ON_ONCE in kvm_vcpu_trigger_posted_interrupt KVM: VMX: do not change SN bit in vmx_update_pi_irte() KVM: x86: Fix the NULL pointer parameter in check_cr_write() drm: exynos: include linux/irq.h drm/exynos: Fix suspend/resume support drm/exynos: Fix locking in the suspend/resume paths iommu/vt-d: Fix harmless section mismatch warning iommu: Add missing dependencies driver core: remove DRIVER_ATTR fpga: altera-cvp: remove DRIVER_ATTR() usage Revert "KVM: Don't accept obviously wrong gsi values via KVM_IRQFD" s390/mm: fix write access check in gup_huge_pmd() s390/mm: make pmdp_invalidate() do invalidation only s390/cio: recover from bad paths s390/scm_blk: consistently use blk_status_t as error type net: systemport: Fix 64-bit statistics dependency 8139too: revisit napi_complete_done() usage fcntl: Don't set si_code to SI_SIGIO when sig == SIGPOLL ata_piix: Add Fujitsu-Siemens Lifebook S6120 to short cable IDs Documentation: core-api: minor workqueue.rst cleanups libnvdimm, namespace: fix btt claim class crash tcp: remove two unused functions tools/testing/nvdimm: disable labels for nfit_test.1 bpf: devmap: pass on return value of bpf_map_precharge_memlock bnxt_en: check for ingress qdisc in flower offload ACPI / watchdog: properly initialize resources Documentation: networking: fix ASCII art in switchdev.txt net/sched: cls_matchall: fix crash when used with classful qdisc 
ip6_tunnel: do not allow loading ip6_tunnel if ipv6 is disabled in cmdline net: phy: Fix mask value write on gmii2rgmii converter speed register drm/i915: Remove unused 'in_vbl' from i915_get_crtc_scanoutpos() drm/i915/cnp: set min brightness from VBT Revert "drm/i915/bxt: Disable device ready before shutdown command" drm/i915/bxt: set min brightness from VBT drm/i915: Fix an error handling in 'intel_framebuffer_init()' drm/i915/gvt: Fix incorrect PCI BARs reporting ip6_gre: skb_push ipv6hdr before packing the header in ip6gre_header nl80211: fix null-ptr dereference on invalid mesh configuration udpv6: Fix the checksum computation when HW checksum does not apply selftests/ftrace: multiple_kprobes: Also check for support selftests/bpf: Make bpf_util work on uniprocessor systems selftests/intel_pstate: No need to compile test progs in the run script selftests: intel_pstate: build only on x86 selftests: breakpoints: re-order TEST_GEN_PROGS targets tools: fix testing/selftests/sigaltstack for s390x selftests: net: More graceful finding of `ip'. serial: sccnxp: Fix error handling in sccnxp_probe() tty: serial: lpuart: avoid report NULL interrupt serial: bcm63xx: fix timing issue. 
mxser: fix timeout calculation for low rates serial: sh-sci: document R8A77970 bindings netfilter: ipset: ipset list may return wrong member count for set with timeout netfilter: nat: Do not use ARRAY_SIZE() on spinlocks to fix zero div driver core: platform: Don't read past the end of "driver_override" buffer Revert "xhci: Limit USB2 port wake support for AMD Promontory hosts" xhci: set missing SuperSpeedPlus Link Protocol bit in roothub descriptor xhci: Fix sleeping with spin_lock_irq() held in ASmedia 1042A workaround usb: host: xhci-plat: allow sysdev to inherit from ACPI xhci: fix wrong endpoint ESIT value shown in tracing usb: pci-quirks.c: Corrected timeout values used in handshake xhci: fix finding correct bus_state structure for USB 3.1 hosts usb: xhci: Free the right ring in xhci_add_endpoint() base: arch_topology: fix section mismatch build warnings driver core: suppress sending MODALIAS in UNBIND uevents nvmem: add missing of_node_put() in of_nvmem_cell_get() nvmem: core: return EFBIG on out-of-range write auxdisplay: charlcd: properly restore atomic counter on error path binder: fix memory corruption in binder_transaction binder binder: fix an ret value override android: binder: fix type mismatch warning ALSA: compress: Remove unused variable xen: don't compile pv-specific parts if XEN_PV isn't configured mtd: nand: remove unused blockmask variable PM / QoS: Use the correct variable to check the QoS request type ACPI / PMIC: Add code reviewers to MAINTAINERS driver core: Fix link to device power management documentation ARC: reset: remove the misleading v1 suffix all over usb: dwc3: ep0: fix DMA starvation by assigning req->trb on ep0 staging: vchiq_2835_arm: Fix NULL ptr dereference in free_pagelist staging: speakup: fix speakup-r empty line lockup staging: pi433: Move limit check to switch default to kill warning staging: r8822be: fix null pointer dereferences with a null driver_adapter staging: mt29f_spinand: Enable the read ECC before program the 
page staging: unisys/visorbus: add __init/__exit annotations isofs: fix build regression quota: add missing lock into __dquot_transfer() arm64: ensure the kernel is compiled for LP64 arm64: relax assembly code alignment from 16 byte to 4 byte arm64: efi: Don't include EFI fpsimd save/restore code in non-EFI kernels mtd: nand: lpc32xx_mlc: Fix an error handling path in lpc32xx_nand_probe() usb: Increase quirk delay for USB devices uwb: properly check kthread_run return value uwb: ensure that endpoint is interrupt ARC: reset: add missing DT binding documentation for HSDKv1 reset driver ARC: reset: Only build on archs that have IOMEM ARM: at91: Replace uses of virt_to_phys with __pa_symbol ARM: dts: at91: sama5d27_som1_ek: fix USB host vbus ARM: dts: at91: sama5d27_som1_ek: fix typos ARM: dts: at91: sama5d27_som1_ek: update pinmux/pinconf for LEDs and USB mtd: spi-nor: fix DMA unsafe buffer issue in spi_nor_read_sfdp() mtd: spi-nor: Check consistency of the memory size extracted from the SFDP clocksource/integrator: Fix section mismatch warning Update version of cifs module cifs: hide unused functions SMB3: Add support for multidialect negotiate (SMB2.1 and later) arm64/syscalls: Move address limit check in loop arm/syscalls: Optimize address limit check Revert "arm/syscalls: Check address limit on user-mode return" syscalls: Use CHECK_DATA_CORRUPTION for addr_limit_user_check x86/mm/32: Load a sane CR3 before cpu_init() on secondary CPUs x86/mm/32: Move setup_clear_cpu_cap(X86_FEATURE_PCID) earlier x86/mm/64: Stop using CR3.PCID == 0 in ASID-aware code x86/mm: Factor out CR3-building code CIFS/SMB3: Update documentation to reflect SMB3 and various changes dma-coherent: fix rmem_dma_device_init regression clk: rockchip: add sclk_timer5 as critical clock on rk3128 clk: rockchip: fix up rk3128 pvtm and mipi_24m gate regs error clk: rockchip: add pclk_pmu as critical clock on rk3128 Revert "arm64: dts: rockchip: Add basic cpu frequencies for RK3368" genirq: Fix cpumask 
check in __irq_startup_managed() scsi: aacraid: error: testing array offset 'bus' after use scsi: lpfc: Don't return internal MBXERR_ERROR code from probe function fs/proc: Report eip/esp in /prod/PID/stat for coredumping xen: x86: mark xen_find_pt_base as __init scsi: aacraid: Fix 2T+ drives on SmartIOC-2000 scsi: sg: fixup infoleak when using SG_GET_REQUEST_TABLE scsi: sg: factor out sg_fill_request_table() scsi: sd: Remove unnecessary condition in sd_read_block_limits() drm/radeon: disable hard reset in hibernate for APUs objtool: Fix object file corruption objtool: Do not retrieve data from empty sections objtool: Fix memory leak in elf_create_rela_section() x86/cpu/AMD: Fix erratum 1076 (CPB bit) nl80211: check for the required netlink attributes presence scsi: acornscsi: fix build error scsi: scsi_transport_fc: fix NULL pointer dereference in fc_bsg_job_timeout drm/amdgpu: revert tile table update for oland watchdog/hardlockup: Clean up hotplug locking mess watchdog/hardlockup/perf: Simplify deferred event destroy watchdog/hardlockup/perf: Use new perf CPU enable mechanism watchdog/hardlockup/perf: Implement CPU enable replacement watchdog/hardlockup/perf: Implement init time detection of perf watchdog/hardlockup/perf: Implement init time perf validation watchdog/core: Get rid of the racy update loop watchdog/core, powerpc: Make watchdog_nmi_reconfigure() two stage watchdog/sysctl: Clean up sysctl variable name space watchdog/sysctl: Get rid of the #ifdeffery watchdog/core: Clean up header mess watchdog/core: Further simplify sysctl handling watchdog/core: Get rid of the thread teardown/setup dance watchdog/core: Create new thread handling infrastructure smpboot/threads, watchdog/core: Avoid runtime allocation watchdog/core: Split out cpumask write function watchdog/core: Clean up the #ifdef maze watchdog/core: Clean up stub functions watchdog/core: Remove the park_in_progress obfuscation watchdog/hardlockup/perf: Prevent CPU hotplug deadlock 
watchdog/hardlockup/perf: Remove broken self disable on failure watchdog/core: Mark hardlockup_detector_disable() __init watchdog/core: Rename watchdog_proc_mutex watchdog/core: Rework CPU hotplug locking watchdog/core: Remove broken suspend/resume interfaces parisc, watchdog/core: Use lockup_detector_stop() watchdog/core: Provide interface to stop from poweroff() perf/x86/intel, watchdog/core: Sanitize PMU HT bug workaround watchdog/hardlockup: Provide interface to stop/restart perf events HID: wacom: generic: Clear ABS_MISC when tool leaves proximity HID: wacom: generic: Send MSC_SERIAL and ABS_MISC when leaving prox HID: i2c-hid: allocate hid buffers for real worst case s390/dasd: fix race during dasd initialization s390/perf: fix bug when creating per-thread event etnaviv: fix gem object list corruption etnaviv: fix submit error path cifs: check rsp for NULL before dereferencing in SMB2_open qxl: fix primary surface handling drm/amdkfd: check for null dev to avoid a null pointer dereference mmc: cavium: Fix use-after-free in of_platform_device_destroy mmc: host: fix typo after MMC_DEBUG move mmc: block: Fix incorrectly initialized requests HID: rmi: Make sure the HID device is opened on resume iwlwifi: mvm: fix reorder buffer for 9000 devices iwlwifi: mvm: set status before calling iwl_mvm_send_cmd_status() iwlwifi: mvm: initialize status in iwl_mvm_add_int_sta_common() iwlwifi: mvm: handle FIF_ALLMULTI when setting multicast addresses iwlwifi: mvm: use IWL_HCMD_NOCOPY for MCAST_FILTER_CMD iwlwifi: mvm: wake the correct mac80211 queue iwlwifi: mvm: change state when queueing agg start work iwlwifi: mvm: send all non-bufferable frames on the probe queue iwlwifi: mvm: Flush non STA TX queues iwlwifi: mvm: fix wowlan resume failed to load INIT ucode ata: avoid gcc-7 warning in ata_timing_quantize HID: multitouch: Support ALPS PTP stick with pid 0x120A HID: multitouch: support buttons and trackpoint on Lenovo X1 Tab Gen2 HID: wacom: Correct coordinate system of 
touchring and pen twist HID: wacom: Properly report negative values from Intuos Pro 2 Bluetooth HID: multitouch: Fix system-control buttons not working HID: add multi-input quirk for IDC6680 touchscreen HID: wacom: leds: Don't try to control the EKR's read-only LEDs HID: wacom: bits shifted too much for 9th and 10th buttons md/raid5: preserve STRIPE_ON_UNPLUG_LIST in break_stripe_batch_list ARM64: dts: meson-gxbb: nanopi-k2: enable sdr104 mode ARM64: dts: meson-gxbb: nanopi-k2: enable sdcard UHS modes ARM64: dts: meson-gxbb: p20x: enable sdcard UHS modes ARM64: dts: meson-gxl: libretech-cc: enable high speed modes ARM64: dts: meson-gxl: libretech-cc: add card regulator settle times ARM64: dts: meson-gxbb: nanopi-k2: add card regulator settle times ARM64: dts: meson: add mmc clk gate pins ARM64: dts: meson: remove cap-sd-highspeed from emmc nodes ARM64: dts: meson-gx: Use correct mmc clock source 0 md/raid5: fix a race condition in stripe batch iio: magnetometer: st_magn: fix drdy line configuration for LIS3MDL iio: adc: ti-ads1015: fix comparator polarity setting drm/amdkfd: pass queue's mqd when destroying mqd drm/amdkfd: remove memset before memcpy powerpc/e6500: Update machine check for L1D cache err samples: Unrename SECCOMP_RET_KILL selftests/seccomp: Test thread vs process killing seccomp: Implement SECCOMP_RET_KILL_PROCESS action seccomp: Introduce SECCOMP_RET_KILL_PROCESS seccomp: Rename SECCOMP_RET_KILL to SECCOMP_RET_KILL_THREAD seccomp: Action to log before allowing seccomp: Filter flag to log all actions except SECCOMP_RET_ALLOW seccomp: Selftest for detection of filter flag support seccomp: Sysctl to configure actions that are allowed to be logged seccomp: Operation for checking if an action is available seccomp: Sysctl to display available actions seccomp: Provide matching filter for introspection selftests/seccomp: Refactor RET_ERRNO tests selftests/seccomp: Add simple seccomp overhead benchmark selftests/seccomp: Add tests for basic ptrace actions 
uapi linux/kfd_ioctl.h: only use __u32 and __u64 tile: array underflow in setup_maxnodemem() tile: defconfig: Cleanup from old Kconfig options Conflicts: include/scsi/scsi_device.h Change-Id: Ia72943c891d02c72b704c2408185eceab9df59ae Signed-off-by: Runmin Wang <runminw@codeaurora.org>
1962 lines
50 KiB
C
1962 lines
50 KiB
C
/*
|
|
* fs/userfaultfd.c
|
|
*
|
|
* Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
|
|
* Copyright (C) 2008-2009 Red Hat, Inc.
|
|
* Copyright (C) 2015 Red Hat, Inc.
|
|
*
|
|
* This work is licensed under the terms of the GNU GPL, version 2. See
|
|
* the COPYING file in the top-level directory.
|
|
*
|
|
* Some part derived from fs/eventfd.c (anon inode setup) and
|
|
* mm/ksm.c (mm hashing).
|
|
*/
|
|
|
|
#include <linux/list.h>
|
|
#include <linux/hashtable.h>
|
|
#include <linux/sched/signal.h>
|
|
#include <linux/sched/mm.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/poll.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/seq_file.h>
|
|
#include <linux/file.h>
|
|
#include <linux/bug.h>
|
|
#include <linux/anon_inodes.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/userfaultfd_k.h>
|
|
#include <linux/mempolicy.h>
|
|
#include <linux/ioctl.h>
|
|
#include <linux/security.h>
|
|
#include <linux/hugetlb.h>
|
|
|
|
/* Slab cache for struct userfaultfd_ctx; userfaultfd_ctx_put() frees contexts back into it. */
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
|
|
|
|
/*
 * Value stored in userfaultfd_ctx.state ("state machine").
 *
 * NOTE(review): presumably WAIT_API is the state before the UFFDIO_API
 * handshake and RUNNING the state after it; the transitions live outside
 * this part of the file -- confirm.
 */
enum userfaultfd_state {
	UFFD_STATE_WAIT_API,
	UFFD_STATE_RUNNING,
};
|
|
|
|
/*
|
|
* Start with fault_pending_wqh and fault_wqh so they're more likely
|
|
* to be in the same cacheline.
|
|
*/
|
|
struct userfaultfd_ctx {
|
|
/* waitqueue head for the pending (i.e. not read) userfaults */
|
|
wait_queue_head_t fault_pending_wqh;
|
|
/* waitqueue head for the userfaults */
|
|
wait_queue_head_t fault_wqh;
|
|
/* waitqueue head for the pseudo fd to wakeup poll/read */
|
|
wait_queue_head_t fd_wqh;
|
|
/* waitqueue head for events */
|
|
wait_queue_head_t event_wqh;
|
|
/* a refile sequence protected by fault_pending_wqh lock */
|
|
struct seqcount refile_seq;
|
|
/* pseudo fd refcounting */
|
|
atomic_t refcount;
|
|
/* userfaultfd syscall flags */
|
|
unsigned int flags;
|
|
/* features requested from the userspace */
|
|
unsigned int features;
|
|
/* state machine */
|
|
enum userfaultfd_state state;
|
|
/* released */
|
|
bool released;
|
|
/* mm with one ore more vmas attached to this userfaultfd_ctx */
|
|
struct mm_struct *mm;
|
|
};
|
|
|
|
struct userfaultfd_fork_ctx {
|
|
struct userfaultfd_ctx *orig;
|
|
struct userfaultfd_ctx *new;
|
|
struct list_head list;
|
|
};
|
|
|
|
struct userfaultfd_unmap_ctx {
|
|
struct userfaultfd_ctx *ctx;
|
|
unsigned long start;
|
|
unsigned long end;
|
|
struct list_head list;
|
|
};
|
|
|
|
struct userfaultfd_wait_queue {
|
|
struct uffd_msg msg;
|
|
wait_queue_entry_t wq;
|
|
struct userfaultfd_ctx *ctx;
|
|
bool waken;
|
|
};
|
|
|
|
/*
 * Wake-up key handed to userfaultfd_wake_function(): waiters whose fault
 * address lies in [start, start + len) are woken.  A @len of zero means
 * "wake everybody".
 */
struct userfaultfd_wake_range {
	unsigned long start;	/* first address of the range */
	unsigned long len;	/* length of the range, 0 == wake all */
};
|
|
|
|
/*
 * Waitqueue wake callback for userfault waiters.
 *
 * @wq:         waitqueue entry embedded in a struct userfaultfd_wait_queue
 * @mode:       task state mask forwarded to wake_up_state()
 * @wake_flags: unused here
 * @key:        struct userfaultfd_wake_range selecting which waiters to wake
 *
 * Returns 0 when the waiter's fault address is outside the requested range,
 * otherwise the return value of wake_up_state().
 */
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
	struct userfaultfd_wake_range *range = key;
	struct userfaultfd_wait_queue *uwq =
		container_of(wq, struct userfaultfd_wait_queue, wq);
	unsigned long first = range->start;
	unsigned long span = range->len;
	int woken;

	/* A zero length is a wildcard: every waiter matches. */
	if (span) {
		unsigned long fault_addr = uwq->msg.arg.pagefault.address;

		/* Skip waiters whose fault is outside [first, first + span). */
		if (first > fault_addr || first + span <= fault_addr)
			return 0;
	}

	WRITE_ONCE(uwq->waken, true);
	/*
	 * The Program-Order guarantees provided by the scheduler make
	 * uwq->waken visible before the task is woken.
	 */
	woken = wake_up_state(wq->private, mode);
	if (woken) {
		/*
		 * Autoremove behavior: a waiter is woken only once.
		 *
		 * Once the effect of list_del_init() is visible to the other
		 * CPUs the waitqueue entry may vanish from under us, see the
		 * !list_empty_careful() check in handle_userfault().
		 *
		 * try_to_wake_up() implies an smp_mb(), and wq->private has
		 * already been read before the call into the extern function
		 * wake_up_state() (which in turn calls try_to_wake_up()).
		 */
		list_del_init(&wq->entry);
	}

	return woken;
}
|
|
|
|
/**
|
|
* userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
|
|
* context.
|
|
* @ctx: [in] Pointer to the userfaultfd context.
|
|
*/
|
|
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
|
|
{
|
|
if (!atomic_inc_not_zero(&ctx->refcount))
|
|
BUG();
|
|
}
|
|
|
|
/**
|
|
* userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
|
|
* context.
|
|
* @ctx: [in] Pointer to userfaultfd context.
|
|
*
|
|
* The userfaultfd context reference must have been previously acquired either
|
|
* with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
|
|
*/
|
|
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
|
|
{
|
|
if (atomic_dec_and_test(&ctx->refcount)) {
|
|
VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
|
|
VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
|
|
VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
|
|
VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
|
|
VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
|
|
VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
|
|
VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
|
|
VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
|
|
mmdrop(ctx->mm);
|
|
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
|
|
}
|
|
}
|
|
|
|
static inline void msg_init(struct uffd_msg *msg)
|
|
{
|
|
BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
|
|
/*
|
|
* Must use memset to zero out the paddings or kernel data is
|
|
* leaked to userland.
|
|
*/
|
|
memset(msg, 0, sizeof(struct uffd_msg));
|
|
}
|
|
|
|
static inline struct uffd_msg userfault_msg(unsigned long address,
|
|
unsigned int flags,
|
|
unsigned long reason,
|
|
unsigned int features)
|
|
{
|
|
struct uffd_msg msg;
|
|
msg_init(&msg);
|
|
msg.event = UFFD_EVENT_PAGEFAULT;
|
|
msg.arg.pagefault.address = address;
|
|
if (flags & FAULT_FLAG_WRITE)
|
|
/*
|
|
* If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
|
|
* uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
|
|
* was not set in a UFFD_EVENT_PAGEFAULT, it means it
|
|
* was a read fault, otherwise if set it means it's
|
|
* a write fault.
|
|
*/
|
|
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
|
|
if (reason & VM_UFFD_WP)
|
|
/*
|
|
* If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
|
|
* uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
|
|
* not set in a UFFD_EVENT_PAGEFAULT, it means it was
|
|
* a missing fault, otherwise if set it means it's a
|
|
* write protect fault.
|
|
*/
|
|
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
|
|
if (features & UFFD_FEATURE_THREAD_ID)
|
|
msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
|
|
return msg;
|
|
}
|
|
|
|
#ifdef CONFIG_HUGETLB_PAGE
|
|
/*
|
|
* Same functionality as userfaultfd_must_wait below with modifications for
|
|
* hugepmd ranges.
|
|
*/
|
|
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
|
|
struct vm_area_struct *vma,
|
|
unsigned long address,
|
|
unsigned long flags,
|
|
unsigned long reason)
|
|
{
|
|
struct mm_struct *mm = ctx->mm;
|
|
pte_t *pte;
|
|
bool ret = true;
|
|
|
|
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
|
|
|
|
pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
|
|
if (!pte)
|
|
goto out;
|
|
|
|
ret = false;
|
|
|
|
/*
|
|
* Lockless access: we're in a wait_event so it's ok if it
|
|
* changes under us.
|
|
*/
|
|
if (huge_pte_none(*pte))
|
|
ret = true;
|
|
if (!huge_pte_write(*pte) && (reason & VM_UFFD_WP))
|
|
ret = true;
|
|
out:
|
|
return ret;
|
|
}
|
|
#else
|
|
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
|
|
struct vm_area_struct *vma,
|
|
unsigned long address,
|
|
unsigned long flags,
|
|
unsigned long reason)
|
|
{
|
|
return false; /* should never get here */
|
|
}
|
|
#endif /* CONFIG_HUGETLB_PAGE */
|
|
|
|
/*
|
|
* Verify the pagetables are still not ok after having reigstered into
|
|
* the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
|
|
* userfault that has already been resolved, if userfaultfd_read and
|
|
* UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
|
|
* threads.
|
|
*/
|
|
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
|
|
unsigned long address,
|
|
unsigned long flags,
|
|
unsigned long reason)
|
|
{
|
|
struct mm_struct *mm = ctx->mm;
|
|
pgd_t *pgd;
|
|
p4d_t *p4d;
|
|
pud_t *pud;
|
|
pmd_t *pmd, _pmd;
|
|
pte_t *pte;
|
|
bool ret = true;
|
|
|
|
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
|
|
|
|
pgd = pgd_offset(mm, address);
|
|
if (!pgd_present(*pgd))
|
|
goto out;
|
|
p4d = p4d_offset(pgd, address);
|
|
if (!p4d_present(*p4d))
|
|
goto out;
|
|
pud = pud_offset(p4d, address);
|
|
if (!pud_present(*pud))
|
|
goto out;
|
|
pmd = pmd_offset(pud, address);
|
|
/*
|
|
* READ_ONCE must function as a barrier with narrower scope
|
|
* and it must be equivalent to:
|
|
* _pmd = *pmd; barrier();
|
|
*
|
|
* This is to deal with the instability (as in
|
|
* pmd_trans_unstable) of the pmd.
|
|
*/
|
|
_pmd = READ_ONCE(*pmd);
|
|
if (!pmd_present(_pmd))
|
|
goto out;
|
|
|
|
ret = false;
|
|
if (pmd_trans_huge(_pmd))
|
|
goto out;
|
|
|
|
/*
|
|
* the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
|
|
* and use the standard pte_offset_map() instead of parsing _pmd.
|
|
*/
|
|
pte = pte_offset_map(pmd, address);
|
|
/*
|
|
* Lockless access: we're in a wait_event so it's ok if it
|
|
* changes under us.
|
|
*/
|
|
if (pte_none(*pte))
|
|
ret = true;
|
|
pte_unmap(pte);
|
|
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* The locking rules involved in returning VM_FAULT_RETRY depending on
|
|
* FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
|
|
* FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
|
|
* recommendation in __lock_page_or_retry is not an understatement.
|
|
*
|
|
* If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
|
|
* before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
|
|
* not set.
|
|
*
|
|
* If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
|
|
* set, VM_FAULT_RETRY can still be returned if and only if there are
|
|
* fatal_signal_pending()s, and the mmap_sem must be released before
|
|
* returning it.
|
|
*/
|
|
int handle_userfault(struct vm_fault *vmf, unsigned long reason)
|
|
{
|
|
struct mm_struct *mm = vmf->vma->vm_mm;
|
|
struct userfaultfd_ctx *ctx;
|
|
struct userfaultfd_wait_queue uwq;
|
|
int ret;
|
|
bool must_wait, return_to_userland;
|
|
long blocking_state;
|
|
|
|
ret = VM_FAULT_SIGBUS;
|
|
|
|
/*
|
|
* We don't do userfault handling for the final child pid update.
|
|
*
|
|
* We also don't do userfault handling during
|
|
* coredumping. hugetlbfs has the special
|
|
* follow_hugetlb_page() to skip missing pages in the
|
|
* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
|
|
* the no_page_table() helper in follow_page_mask(), but the
|
|
* shmem_vm_ops->fault method is invoked even during
|
|
* coredumping without mmap_sem and it ends up here.
|
|
*/
|
|
if (current->flags & (PF_EXITING|PF_DUMPCORE))
|
|
goto out;
|
|
|
|
/*
|
|
* Coredumping runs without mmap_sem so we can only check that
|
|
* the mmap_sem is held, if PF_DUMPCORE was not set.
|
|
*/
|
|
WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
|
|
|
|
ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
|
|
if (!ctx)
|
|
goto out;
|
|
|
|
BUG_ON(ctx->mm != mm);
|
|
|
|
VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
|
|
VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
|
|
|
|
if (ctx->features & UFFD_FEATURE_SIGBUS)
|
|
goto out;
|
|
|
|
/*
|
|
* If it's already released don't get it. This avoids to loop
|
|
* in __get_user_pages if userfaultfd_release waits on the
|
|
* caller of handle_userfault to release the mmap_sem.
|
|
*/
|
|
if (unlikely(ACCESS_ONCE(ctx->released))) {
|
|
/*
|
|
* Don't return VM_FAULT_SIGBUS in this case, so a non
|
|
* cooperative manager can close the uffd after the
|
|
* last UFFDIO_COPY, without risking to trigger an
|
|
* involuntary SIGBUS if the process was starting the
|
|
* userfaultfd while the userfaultfd was still armed
|
|
* (but after the last UFFDIO_COPY). If the uffd
|
|
* wasn't already closed when the userfault reached
|
|
* this point, that would normally be solved by
|
|
* userfaultfd_must_wait returning 'false'.
|
|
*
|
|
* If we were to return VM_FAULT_SIGBUS here, the non
|
|
* cooperative manager would be instead forced to
|
|
* always call UFFDIO_UNREGISTER before it can safely
|
|
* close the uffd.
|
|
*/
|
|
ret = VM_FAULT_NOPAGE;
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* Check that we can return VM_FAULT_RETRY.
|
|
*
|
|
* NOTE: it should become possible to return VM_FAULT_RETRY
|
|
* even if FAULT_FLAG_TRIED is set without leading to gup()
|
|
* -EBUSY failures, if the userfaultfd is to be extended for
|
|
* VM_UFFD_WP tracking and we intend to arm the userfault
|
|
* without first stopping userland access to the memory. For
|
|
* VM_UFFD_MISSING userfaults this is enough for now.
|
|
*/
|
|
if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
|
|
/*
|
|
* Validate the invariant that nowait must allow retry
|
|
* to be sure not to return SIGBUS erroneously on
|
|
* nowait invocations.
|
|
*/
|
|
BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
|
|
#ifdef CONFIG_DEBUG_VM
|
|
if (printk_ratelimit()) {
|
|
printk(KERN_WARNING
|
|
"FAULT_FLAG_ALLOW_RETRY missing %x\n",
|
|
vmf->flags);
|
|
dump_stack();
|
|
}
|
|
#endif
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* Handle nowait, not much to do other than tell it to retry
|
|
* and wait.
|
|
*/
|
|
ret = VM_FAULT_RETRY;
|
|
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
|
|
goto out;
|
|
|
|
/* take the reference before dropping the mmap_sem */
|
|
userfaultfd_ctx_get(ctx);
|
|
|
|
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
|
|
uwq.wq.private = current;
|
|
uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
|
|
ctx->features);
|
|
uwq.ctx = ctx;
|
|
uwq.waken = false;
|
|
|
|
return_to_userland =
|
|
(vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
|
|
(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
|
|
blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
|
|
TASK_KILLABLE;
|
|
|
|
spin_lock(&ctx->fault_pending_wqh.lock);
|
|
/*
|
|
* After the __add_wait_queue the uwq is visible to userland
|
|
* through poll/read().
|
|
*/
|
|
__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
|
|
/*
|
|
* The smp_mb() after __set_current_state prevents the reads
|
|
* following the spin_unlock to happen before the list_add in
|
|
* __add_wait_queue.
|
|
*/
|
|
set_current_state(blocking_state);
|
|
spin_unlock(&ctx->fault_pending_wqh.lock);
|
|
|
|
if (!is_vm_hugetlb_page(vmf->vma))
|
|
must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
|
|
reason);
|
|
else
|
|
must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
|
|
vmf->address,
|
|
vmf->flags, reason);
|
|
up_read(&mm->mmap_sem);
|
|
|
|
if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
|
|
(return_to_userland ? !signal_pending(current) :
|
|
!fatal_signal_pending(current)))) {
|
|
wake_up_poll(&ctx->fd_wqh, POLLIN);
|
|
schedule();
|
|
ret |= VM_FAULT_MAJOR;
|
|
|
|
/*
|
|
* False wakeups can orginate even from rwsem before
|
|
* up_read() however userfaults will wait either for a
|
|
* targeted wakeup on the specific uwq waitqueue from
|
|
* wake_userfault() or for signals or for uffd
|
|
* release.
|
|
*/
|
|
while (!READ_ONCE(uwq.waken)) {
|
|
/*
|
|
* This needs the full smp_store_mb()
|
|
* guarantee as the state write must be
|
|
* visible to other CPUs before reading
|
|
* uwq.waken from other CPUs.
|
|
*/
|
|
set_current_state(blocking_state);
|
|
if (READ_ONCE(uwq.waken) ||
|
|
READ_ONCE(ctx->released) ||
|
|
(return_to_userland ? signal_pending(current) :
|
|
fatal_signal_pending(current)))
|
|
break;
|
|
schedule();
|
|
}
|
|
}
|
|
|
|
__set_current_state(TASK_RUNNING);
|
|
|
|
if (return_to_userland) {
|
|
if (signal_pending(current) &&
|
|
!fatal_signal_pending(current)) {
|
|
/*
|
|
* If we got a SIGSTOP or SIGCONT and this is
|
|
* a normal userland page fault, just let
|
|
* userland return so the signal will be
|
|
* handled and gdb debugging works. The page
|
|
* fault code immediately after we return from
|
|
* this function is going to release the
|
|
* mmap_sem and it's not depending on it
|
|
* (unlike gup would if we were not to return
|
|
* VM_FAULT_RETRY).
|
|
*
|
|
* If a fatal signal is pending we still take
|
|
* the streamlined VM_FAULT_RETRY failure path
|
|
* and there's no need to retake the mmap_sem
|
|
* in such case.
|
|
*/
|
|
down_read(&mm->mmap_sem);
|
|
ret = VM_FAULT_NOPAGE;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Here we race with the list_del; list_add in
|
|
* userfaultfd_ctx_read(), however because we don't ever run
|
|
* list_del_init() to refile across the two lists, the prev
|
|
* and next pointers will never point to self. list_add also
|
|
* would never let any of the two pointers to point to
|
|
* self. So list_empty_careful won't risk to see both pointers
|
|
* pointing to self at any time during the list refile. The
|
|
* only case where list_del_init() is called is the full
|
|
* removal in the wake function and there we don't re-list_add
|
|
* and it's fine not to block on the spinlock. The uwq on this
|
|
* kernel stack can be released after the list_del_init.
|
|
*/
|
|
if (!list_empty_careful(&uwq.wq.entry)) {
|
|
spin_lock(&ctx->fault_pending_wqh.lock);
|
|
/*
|
|
* No need of list_del_init(), the uwq on the stack
|
|
* will be freed shortly anyway.
|
|
*/
|
|
list_del(&uwq.wq.entry);
|
|
spin_unlock(&ctx->fault_pending_wqh.lock);
|
|
}
|
|
|
|
/*
|
|
* ctx may go away after this if the userfault pseudo fd is
|
|
* already released.
|
|
*/
|
|
userfaultfd_ctx_put(ctx);
|
|
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
|
|
struct userfaultfd_wait_queue *ewq)
|
|
{
|
|
if (WARN_ON_ONCE(current->flags & PF_EXITING))
|
|
goto out;
|
|
|
|
ewq->ctx = ctx;
|
|
init_waitqueue_entry(&ewq->wq, current);
|
|
|
|
spin_lock(&ctx->event_wqh.lock);
|
|
/*
|
|
* After the __add_wait_queue the uwq is visible to userland
|
|
* through poll/read().
|
|
*/
|
|
__add_wait_queue(&ctx->event_wqh, &ewq->wq);
|
|
for (;;) {
|
|
set_current_state(TASK_KILLABLE);
|
|
if (ewq->msg.event == 0)
|
|
break;
|
|
if (ACCESS_ONCE(ctx->released) ||
|
|
fatal_signal_pending(current)) {
|
|
/*
|
|
* &ewq->wq may be queued in fork_event, but
|
|
* __remove_wait_queue ignores the head
|
|
* parameter. It would be a problem if it
|
|
* didn't.
|
|
*/
|
|
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
|
|
if (ewq->msg.event == UFFD_EVENT_FORK) {
|
|
struct userfaultfd_ctx *new;
|
|
|
|
new = (struct userfaultfd_ctx *)
|
|
(unsigned long)
|
|
ewq->msg.arg.reserved.reserved1;
|
|
|
|
userfaultfd_ctx_put(new);
|
|
}
|
|
break;
|
|
}
|
|
|
|
spin_unlock(&ctx->event_wqh.lock);
|
|
|
|
wake_up_poll(&ctx->fd_wqh, POLLIN);
|
|
schedule();
|
|
|
|
spin_lock(&ctx->event_wqh.lock);
|
|
}
|
|
__set_current_state(TASK_RUNNING);
|
|
spin_unlock(&ctx->event_wqh.lock);
|
|
|
|
/*
|
|
* ctx may go away after this if the userfault pseudo fd is
|
|
* already released.
|
|
*/
|
|
out:
|
|
userfaultfd_ctx_put(ctx);
|
|
}
|
|
|
|
static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
|
|
struct userfaultfd_wait_queue *ewq)
|
|
{
|
|
ewq->msg.event = 0;
|
|
wake_up_locked(&ctx->event_wqh);
|
|
__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
|
|
}
|
|
|
|
int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
|
|
{
|
|
struct userfaultfd_ctx *ctx = NULL, *octx;
|
|
struct userfaultfd_fork_ctx *fctx;
|
|
|
|
octx = vma->vm_userfaultfd_ctx.ctx;
|
|
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
|
|
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
|
|
vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
|
|
return 0;
|
|
}
|
|
|
|
list_for_each_entry(fctx, fcs, list)
|
|
if (fctx->orig == octx) {
|
|
ctx = fctx->new;
|
|
break;
|
|
}
|
|
|
|
if (!ctx) {
|
|
fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
|
|
if (!fctx)
|
|
return -ENOMEM;
|
|
|
|
ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
|
|
if (!ctx) {
|
|
kfree(fctx);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
atomic_set(&ctx->refcount, 1);
|
|
ctx->flags = octx->flags;
|
|
ctx->state = UFFD_STATE_RUNNING;
|
|
ctx->features = octx->features;
|
|
ctx->released = false;
|
|
ctx->mm = vma->vm_mm;
|
|
atomic_inc(&ctx->mm->mm_count);
|
|
|
|
userfaultfd_ctx_get(octx);
|
|
fctx->orig = octx;
|
|
fctx->new = ctx;
|
|
list_add_tail(&fctx->list, fcs);
|
|
}
|
|
|
|
vma->vm_userfaultfd_ctx.ctx = ctx;
|
|
return 0;
|
|
}
|
|
|
|
static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
|
|
{
|
|
struct userfaultfd_ctx *ctx = fctx->orig;
|
|
struct userfaultfd_wait_queue ewq;
|
|
|
|
msg_init(&ewq.msg);
|
|
|
|
ewq.msg.event = UFFD_EVENT_FORK;
|
|
ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
|
|
|
|
userfaultfd_event_wait_completion(ctx, &ewq);
|
|
}
|
|
|
|
void dup_userfaultfd_complete(struct list_head *fcs)
|
|
{
|
|
struct userfaultfd_fork_ctx *fctx, *n;
|
|
|
|
list_for_each_entry_safe(fctx, n, fcs, list) {
|
|
dup_fctx(fctx);
|
|
list_del(&fctx->list);
|
|
kfree(fctx);
|
|
}
|
|
}
|
|
|
|
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
|
|
struct vm_userfaultfd_ctx *vm_ctx)
|
|
{
|
|
struct userfaultfd_ctx *ctx;
|
|
|
|
ctx = vma->vm_userfaultfd_ctx.ctx;
|
|
if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
|
|
vm_ctx->ctx = ctx;
|
|
userfaultfd_ctx_get(ctx);
|
|
}
|
|
}
|
|
|
|
void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
|
|
unsigned long from, unsigned long to,
|
|
unsigned long len)
|
|
{
|
|
struct userfaultfd_ctx *ctx = vm_ctx->ctx;
|
|
struct userfaultfd_wait_queue ewq;
|
|
|
|
if (!ctx)
|
|
return;
|
|
|
|
if (to & ~PAGE_MASK) {
|
|
userfaultfd_ctx_put(ctx);
|
|
return;
|
|
}
|
|
|
|
msg_init(&ewq.msg);
|
|
|
|
ewq.msg.event = UFFD_EVENT_REMAP;
|
|
ewq.msg.arg.remap.from = from;
|
|
ewq.msg.arg.remap.to = to;
|
|
ewq.msg.arg.remap.len = len;
|
|
|
|
userfaultfd_event_wait_completion(ctx, &ewq);
|
|
}
|
|
|
|
bool userfaultfd_remove(struct vm_area_struct *vma,
|
|
unsigned long start, unsigned long end)
|
|
{
|
|
struct mm_struct *mm = vma->vm_mm;
|
|
struct userfaultfd_ctx *ctx;
|
|
struct userfaultfd_wait_queue ewq;
|
|
|
|
ctx = vma->vm_userfaultfd_ctx.ctx;
|
|
if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
|
|
return true;
|
|
|
|
userfaultfd_ctx_get(ctx);
|
|
up_read(&mm->mmap_sem);
|
|
|
|
msg_init(&ewq.msg);
|
|
|
|
ewq.msg.event = UFFD_EVENT_REMOVE;
|
|
ewq.msg.arg.remove.start = start;
|
|
ewq.msg.arg.remove.end = end;
|
|
|
|
userfaultfd_event_wait_completion(ctx, &ewq);
|
|
|
|
return false;
|
|
}
|
|
|
|
static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
|
|
unsigned long start, unsigned long end)
|
|
{
|
|
struct userfaultfd_unmap_ctx *unmap_ctx;
|
|
|
|
list_for_each_entry(unmap_ctx, unmaps, list)
|
|
if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
|
|
unmap_ctx->end == end)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
int userfaultfd_unmap_prep(struct vm_area_struct *vma,
|
|
unsigned long start, unsigned long end,
|
|
struct list_head *unmaps)
|
|
{
|
|
for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
|
|
struct userfaultfd_unmap_ctx *unmap_ctx;
|
|
struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
|
|
|
|
if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
|
|
has_unmap_ctx(ctx, unmaps, start, end))
|
|
continue;
|
|
|
|
unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
|
|
if (!unmap_ctx)
|
|
return -ENOMEM;
|
|
|
|
userfaultfd_ctx_get(ctx);
|
|
unmap_ctx->ctx = ctx;
|
|
unmap_ctx->start = start;
|
|
unmap_ctx->end = end;
|
|
list_add_tail(&unmap_ctx->list, unmaps);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
|
|
{
|
|
struct userfaultfd_unmap_ctx *ctx, *n;
|
|
struct userfaultfd_wait_queue ewq;
|
|
|
|
list_for_each_entry_safe(ctx, n, uf, list) {
|
|
msg_init(&ewq.msg);
|
|
|
|
ewq.msg.event = UFFD_EVENT_UNMAP;
|
|
ewq.msg.arg.remove.start = ctx->start;
|
|
ewq.msg.arg.remove.end = ctx->end;
|
|
|
|
userfaultfd_event_wait_completion(ctx->ctx, &ewq);
|
|
|
|
list_del(&ctx->list);
|
|
kfree(ctx);
|
|
}
|
|
}
|
|
|
|
static int userfaultfd_release(struct inode *inode, struct file *file)
|
|
{
|
|
struct userfaultfd_ctx *ctx = file->private_data;
|
|
struct mm_struct *mm = ctx->mm;
|
|
struct vm_area_struct *vma, *prev;
|
|
/* len == 0 means wake all */
|
|
struct userfaultfd_wake_range range = { .len = 0, };
|
|
unsigned long new_flags;
|
|
|
|
ACCESS_ONCE(ctx->released) = true;
|
|
|
|
if (!mmget_not_zero(mm))
|
|
goto wakeup;
|
|
|
|
/*
|
|
* Flush page faults out of all CPUs. NOTE: all page faults
|
|
* must be retried without returning VM_FAULT_SIGBUS if
|
|
* userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
|
|
* changes while handle_userfault released the mmap_sem. So
|
|
* it's critical that released is set to true (above), before
|
|
* taking the mmap_sem for writing.
|
|
*/
|
|
down_write(&mm->mmap_sem);
|
|
prev = NULL;
|
|
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
|
cond_resched();
|
|
BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
|
|
!!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
|
|
if (vma->vm_userfaultfd_ctx.ctx != ctx) {
|
|
prev = vma;
|
|
continue;
|
|
}
|
|
new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
|
|
prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
|
|
new_flags, vma->anon_vma,
|
|
vma->vm_file, vma->vm_pgoff,
|
|
vma_policy(vma),
|
|
NULL_VM_UFFD_CTX,
|
|
vma_get_anon_name(vma));
|
|
if (prev)
|
|
vma = prev;
|
|
else
|
|
prev = vma;
|
|
vma->vm_flags = new_flags;
|
|
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
|
|
}
|
|
up_write(&mm->mmap_sem);
|
|
mmput(mm);
|
|
wakeup:
|
|
/*
|
|
* After no new page faults can wait on this fault_*wqh, flush
|
|
* the last page faults that may have been already waiting on
|
|
* the fault_*wqh.
|
|
*/
|
|
spin_lock(&ctx->fault_pending_wqh.lock);
|
|
__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
|
|
__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
|
|
spin_unlock(&ctx->fault_pending_wqh.lock);
|
|
|
|
/* Flush pending events that may still wait on event_wqh */
|
|
wake_up_all(&ctx->event_wqh);
|
|
|
|
wake_up_poll(&ctx->fd_wqh, POLLHUP);
|
|
userfaultfd_ctx_put(ctx);
|
|
return 0;
|
|
}
|
|
|
|
/* fault_pending_wqh.lock must be hold by the caller */
|
|
static inline struct userfaultfd_wait_queue *find_userfault_in(
|
|
wait_queue_head_t *wqh)
|
|
{
|
|
wait_queue_entry_t *wq;
|
|
struct userfaultfd_wait_queue *uwq;
|
|
|
|
VM_BUG_ON(!spin_is_locked(&wqh->lock));
|
|
|
|
uwq = NULL;
|
|
if (!waitqueue_active(wqh))
|
|
goto out;
|
|
/* walk in reverse to provide FIFO behavior to read userfaults */
|
|
wq = list_last_entry(&wqh->head, typeof(*wq), entry);
|
|
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
|
|
out:
|
|
return uwq;
|
|
}
|
|
|
|
static inline struct userfaultfd_wait_queue *find_userfault(
|
|
struct userfaultfd_ctx *ctx)
|
|
{
|
|
return find_userfault_in(&ctx->fault_pending_wqh);
|
|
}
|
|
|
|
static inline struct userfaultfd_wait_queue *find_userfault_evt(
|
|
struct userfaultfd_ctx *ctx)
|
|
{
|
|
return find_userfault_in(&ctx->event_wqh);
|
|
}
|
|
|
|
/*
 * poll() handler.  Returns POLLIN when a page fault or an event is queued,
 * 0 when nothing is queued, and POLLERR when the context is still waiting
 * for the API handshake, when the file is not O_NONBLOCK, or when the state
 * is unknown.
 */
static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
{
	struct userfaultfd_ctx *ctx = file->private_data;

	poll_wait(file, &ctx->fd_wqh, wait);

	switch (ctx->state) {
	case UFFD_STATE_WAIT_API:
		return POLLERR;
	case UFFD_STATE_RUNNING:
		/*
		 * poll() never guarantees that read won't block.
		 * userfaults can be waken before they're read().
		 */
		if (unlikely(!(file->f_flags & O_NONBLOCK)))
			return POLLERR;
		/*
		 * The queues are inspected without taking their locks.
		 * The last thing __pollwait does is add_wait_queue, and
		 * its spin_unlock alone would let the waitqueue_active
		 * reads below be hoisted above the list_add done inside
		 * that critical section.  The full barrier orders the
		 * list_add write against those reads.
		 */
		smp_mb();
		if (waitqueue_active(&ctx->fault_pending_wqh) ||
		    waitqueue_active(&ctx->event_wqh))
			return POLLIN;
		return 0;
	default:
		WARN_ON_ONCE(1);
		return POLLERR;
	}
}
|
|
|
|
/*
 * Forward declaration: resolve_userfault_fork() below hands &userfaultfd_fops
 * to anon_inode_getfile(); the initialized definition is expected further
 * down in this file.
 */
static const struct file_operations userfaultfd_fops;
|
|
|
|
static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
|
|
struct userfaultfd_ctx *new,
|
|
struct uffd_msg *msg)
|
|
{
|
|
int fd;
|
|
struct file *file;
|
|
unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
|
|
|
|
fd = get_unused_fd_flags(flags);
|
|
if (fd < 0)
|
|
return fd;
|
|
|
|
file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
|
|
O_RDWR | flags);
|
|
if (IS_ERR(file)) {
|
|
put_unused_fd(fd);
|
|
return PTR_ERR(file);
|
|
}
|
|
|
|
fd_install(fd, file);
|
|
msg->arg.reserved.reserved1 = 0;
|
|
msg->arg.fork.ufd = fd;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
|
|
struct uffd_msg *msg)
|
|
{
|
|
ssize_t ret;
|
|
DECLARE_WAITQUEUE(wait, current);
|
|
struct userfaultfd_wait_queue *uwq;
|
|
/*
|
|
* Handling fork event requires sleeping operations, so
|
|
* we drop the event_wqh lock, then do these ops, then
|
|
* lock it back and wake up the waiter. While the lock is
|
|
* dropped the ewq may go away so we keep track of it
|
|
* carefully.
|
|
*/
|
|
LIST_HEAD(fork_event);
|
|
struct userfaultfd_ctx *fork_nctx = NULL;
|
|
|
|
/* always take the fd_wqh lock before the fault_pending_wqh lock */
|
|
spin_lock(&ctx->fd_wqh.lock);
|
|
__add_wait_queue(&ctx->fd_wqh, &wait);
|
|
for (;;) {
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
spin_lock(&ctx->fault_pending_wqh.lock);
|
|
uwq = find_userfault(ctx);
|
|
if (uwq) {
|
|
/*
|
|
* Use a seqcount to repeat the lockless check
|
|
* in wake_userfault() to avoid missing
|
|
* wakeups because during the refile both
|
|
* waitqueue could become empty if this is the
|
|
* only userfault.
|
|
*/
|
|
write_seqcount_begin(&ctx->refile_seq);
|
|
|
|
/*
|
|
* The fault_pending_wqh.lock prevents the uwq
|
|
* to disappear from under us.
|
|
*
|
|
* Refile this userfault from
|
|
* fault_pending_wqh to fault_wqh, it's not
|
|
* pending anymore after we read it.
|
|
*
|
|
* Use list_del() by hand (as
|
|
* userfaultfd_wake_function also uses
|
|
* list_del_init() by hand) to be sure nobody
|
|
* changes __remove_wait_queue() to use
|
|
* list_del_init() in turn breaking the
|
|
* !list_empty_careful() check in
|
|
* handle_userfault(). The uwq->wq.head list
|
|
* must never be empty at any time during the
|
|
* refile, or the waitqueue could disappear
|
|
* from under us. The "wait_queue_head_t"
|
|
* parameter of __remove_wait_queue() is unused
|
|
* anyway.
|
|
*/
|
|
list_del(&uwq->wq.entry);
|
|
__add_wait_queue(&ctx->fault_wqh, &uwq->wq);
|
|
|
|
write_seqcount_end(&ctx->refile_seq);
|
|
|
|
/* careful to always initialize msg if ret == 0 */
|
|
*msg = uwq->msg;
|
|
spin_unlock(&ctx->fault_pending_wqh.lock);
|
|
ret = 0;
|
|
break;
|
|
}
|
|
spin_unlock(&ctx->fault_pending_wqh.lock);
|
|
|
|
spin_lock(&ctx->event_wqh.lock);
|
|
uwq = find_userfault_evt(ctx);
|
|
if (uwq) {
|
|
*msg = uwq->msg;
|
|
|
|
if (uwq->msg.event == UFFD_EVENT_FORK) {
|
|
fork_nctx = (struct userfaultfd_ctx *)
|
|
(unsigned long)
|
|
uwq->msg.arg.reserved.reserved1;
|
|
list_move(&uwq->wq.entry, &fork_event);
|
|
/*
|
|
* fork_nctx can be freed as soon as
|
|
* we drop the lock, unless we take a
|
|
* reference on it.
|
|
*/
|
|
userfaultfd_ctx_get(fork_nctx);
|
|
spin_unlock(&ctx->event_wqh.lock);
|
|
ret = 0;
|
|
break;
|
|
}
|
|
|
|
userfaultfd_event_complete(ctx, uwq);
|
|
spin_unlock(&ctx->event_wqh.lock);
|
|
ret = 0;
|
|
break;
|
|
}
|
|
spin_unlock(&ctx->event_wqh.lock);
|
|
|
|
if (signal_pending(current)) {
|
|
ret = -ERESTARTSYS;
|
|
break;
|
|
}
|
|
if (no_wait) {
|
|
ret = -EAGAIN;
|
|
break;
|
|
}
|
|
spin_unlock(&ctx->fd_wqh.lock);
|
|
schedule();
|
|
spin_lock(&ctx->fd_wqh.lock);
|
|
}
|
|
__remove_wait_queue(&ctx->fd_wqh, &wait);
|
|
__set_current_state(TASK_RUNNING);
|
|
spin_unlock(&ctx->fd_wqh.lock);
|
|
|
|
if (!ret && msg->event == UFFD_EVENT_FORK) {
|
|
ret = resolve_userfault_fork(ctx, fork_nctx, msg);
|
|
spin_lock(&ctx->event_wqh.lock);
|
|
if (!list_empty(&fork_event)) {
|
|
/*
|
|
* The fork thread didn't abort, so we can
|
|
* drop the temporary refcount.
|
|
*/
|
|
userfaultfd_ctx_put(fork_nctx);
|
|
|
|
uwq = list_first_entry(&fork_event,
|
|
typeof(*uwq),
|
|
wq.entry);
|
|
/*
|
|
* If fork_event list wasn't empty and in turn
|
|
* the event wasn't already released by fork
|
|
* (the event is allocated on fork kernel
|
|
* stack), put the event back to its place in
|
|
* the event_wq. fork_event head will be freed
|
|
* as soon as we return so the event cannot
|
|
* stay queued there no matter the current
|
|
* "ret" value.
|
|
*/
|
|
list_del(&uwq->wq.entry);
|
|
__add_wait_queue(&ctx->event_wqh, &uwq->wq);
|
|
|
|
/*
|
|
* Leave the event in the waitqueue and report
|
|
* error to userland if we failed to resolve
|
|
* the userfault fork.
|
|
*/
|
|
if (likely(!ret))
|
|
userfaultfd_event_complete(ctx, uwq);
|
|
} else {
|
|
/*
|
|
* Here the fork thread aborted and the
|
|
* refcount from the fork thread on fork_nctx
|
|
* has already been released. We still hold
|
|
* the reference we took before releasing the
|
|
* lock above. If resolve_userfault_fork
|
|
* failed we've to drop it because the
|
|
* fork_nctx has to be freed in such case. If
|
|
* it succeeded we'll hold it because the new
|
|
* uffd references it.
|
|
*/
|
|
if (ret)
|
|
userfaultfd_ctx_put(fork_nctx);
|
|
}
|
|
spin_unlock(&ctx->event_wqh.lock);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
 * read() handler: copy as many whole struct uffd_msg records as fit in
 * @count bytes of @buf.
 *
 * Returns the number of bytes copied if at least one message was delivered.
 * Otherwise returns -EINVAL (context still in UFFD_STATE_WAIT_API, or @count
 * smaller than one message), -EFAULT (copy to userland failed) or the error
 * from userfaultfd_ctx_read().  @ppos is not used.
 */
static ssize_t userfaultfd_read(struct file *file, char __user *buf,
				size_t count, loff_t *ppos)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	int no_wait = file->f_flags & O_NONBLOCK;
	ssize_t copied = 0;
	struct uffd_msg msg;

	if (ctx->state == UFFD_STATE_WAIT_API)
		return -EINVAL;

	while (count >= sizeof(msg)) {
		ssize_t err = userfaultfd_ctx_read(ctx, no_wait, &msg);

		if (err < 0)
			return copied ? copied : err;
		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
			return copied ? copied : -EFAULT;

		copied += sizeof(msg);
		buf += sizeof(msg);
		count -= sizeof(msg);
		/*
		 * More than one fault may be read per call, but only the
		 * very first one is allowed to block.
		 */
		no_wait = O_NONBLOCK;
	}

	/* the buffer has no room left for a whole message */
	return copied ? copied : -EINVAL;
}
|
|
|
|
static void __wake_userfault(struct userfaultfd_ctx *ctx,
|
|
struct userfaultfd_wake_range *range)
|
|
{
|
|
spin_lock(&ctx->fault_pending_wqh.lock);
|
|
/* wake all in the range and autoremove */
|
|
if (waitqueue_active(&ctx->fault_pending_wqh))
|
|
__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
|
|
range);
|
|
if (waitqueue_active(&ctx->fault_wqh))
|
|
__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range);
|
|
spin_unlock(&ctx->fault_pending_wqh.lock);
|
|
}
|
|
|
|
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
|
|
struct userfaultfd_wake_range *range)
|
|
{
|
|
unsigned seq;
|
|
bool need_wakeup;
|
|
|
|
/*
|
|
* To be sure waitqueue_active() is not reordered by the CPU
|
|
* before the pagetable update, use an explicit SMP memory
|
|
* barrier here. PT lock release or up_read(mmap_sem) still
|
|
* have release semantics that can allow the
|
|
* waitqueue_active() to be reordered before the pte update.
|
|
*/
|
|
smp_mb();
|
|
|
|
/*
|
|
* Use waitqueue_active because it's very frequent to
|
|
* change the address space atomically even if there are no
|
|
* userfaults yet. So we take the spinlock only when we're
|
|
* sure we've userfaults to wake.
|
|
*/
|
|
do {
|
|
seq = read_seqcount_begin(&ctx->refile_seq);
|
|
need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
|
|
waitqueue_active(&ctx->fault_wqh);
|
|
cond_resched();
|
|
} while (read_seqcount_retry(&ctx->refile_seq, seq));
|
|
if (need_wakeup)
|
|
__wake_userfault(ctx, range);
|
|
}
|
|
|
|
/*
 * Sanity-check a user supplied [start, start + len) range against @mm.
 *
 * Returns 0 if the range is page-aligned, non-empty and lies entirely
 * within [mmap_min_addr, mm->task_size); -EINVAL otherwise.
 */
static __always_inline int validate_range(struct mm_struct *mm,
					  __u64 start, __u64 len)
{
	const __u64 limit = mm->task_size;

	/* both the start and the length must be page multiples */
	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	/* an empty range is not accepted */
	if (len == 0)
		return -EINVAL;
	/* the first byte must be a mappable user address */
	if (start < mmap_min_addr || start >= limit)
		return -EINVAL;
	/* compared this way round so that start + len cannot wrap */
	if (len > limit - start)
		return -EINVAL;

	return 0;
}
|
|
|
|
static inline bool vma_can_userfault(struct vm_area_struct *vma)
|
|
{
|
|
return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
|
|
vma_is_shmem(vma);
|
|
}
|
|
|
|
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
|
|
unsigned long arg)
|
|
{
|
|
struct mm_struct *mm = ctx->mm;
|
|
struct vm_area_struct *vma, *prev, *cur;
|
|
int ret;
|
|
struct uffdio_register uffdio_register;
|
|
struct uffdio_register __user *user_uffdio_register;
|
|
unsigned long vm_flags, new_flags;
|
|
bool found;
|
|
bool basic_ioctls;
|
|
unsigned long start, end, vma_end;
|
|
|
|
user_uffdio_register = (struct uffdio_register __user *) arg;
|
|
|
|
ret = -EFAULT;
|
|
if (copy_from_user(&uffdio_register, user_uffdio_register,
|
|
sizeof(uffdio_register)-sizeof(__u64)))
|
|
goto out;
|
|
|
|
ret = -EINVAL;
|
|
if (!uffdio_register.mode)
|
|
goto out;
|
|
if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
|
|
UFFDIO_REGISTER_MODE_WP))
|
|
goto out;
|
|
vm_flags = 0;
|
|
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
|
|
vm_flags |= VM_UFFD_MISSING;
|
|
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
|
|
vm_flags |= VM_UFFD_WP;
|
|
/*
|
|
* FIXME: remove the below error constraint by
|
|
* implementing the wprotect tracking mode.
|
|
*/
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
ret = validate_range(mm, uffdio_register.range.start,
|
|
uffdio_register.range.len);
|
|
if (ret)
|
|
goto out;
|
|
|
|
start = uffdio_register.range.start;
|
|
end = start + uffdio_register.range.len;
|
|
|
|
ret = -ENOMEM;
|
|
if (!mmget_not_zero(mm))
|
|
goto out;
|
|
|
|
down_write(&mm->mmap_sem);
|
|
vma = find_vma_prev(mm, start, &prev);
|
|
if (!vma)
|
|
goto out_unlock;
|
|
|
|
/* check that there's at least one vma in the range */
|
|
ret = -EINVAL;
|
|
if (vma->vm_start >= end)
|
|
goto out_unlock;
|
|
|
|
/*
|
|
* If the first vma contains huge pages, make sure start address
|
|
* is aligned to huge page size.
|
|
*/
|
|
if (is_vm_hugetlb_page(vma)) {
|
|
unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
|
|
|
|
if (start & (vma_hpagesize - 1))
|
|
goto out_unlock;
|
|
}
|
|
|
|
/*
|
|
* Search for not compatible vmas.
|
|
*/
|
|
found = false;
|
|
basic_ioctls = false;
|
|
for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
|
|
cond_resched();
|
|
|
|
BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
|
|
!!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
|
|
|
|
/* check not compatible vmas */
|
|
ret = -EINVAL;
|
|
if (!vma_can_userfault(cur))
|
|
goto out_unlock;
|
|
/*
|
|
* If this vma contains ending address, and huge pages
|
|
* check alignment.
|
|
*/
|
|
if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
|
|
end > cur->vm_start) {
|
|
unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
|
|
|
|
ret = -EINVAL;
|
|
|
|
if (end & (vma_hpagesize - 1))
|
|
goto out_unlock;
|
|
}
|
|
|
|
/*
|
|
* Check that this vma isn't already owned by a
|
|
* different userfaultfd. We can't allow more than one
|
|
* userfaultfd to own a single vma simultaneously or we
|
|
* wouldn't know which one to deliver the userfaults to.
|
|
*/
|
|
ret = -EBUSY;
|
|
if (cur->vm_userfaultfd_ctx.ctx &&
|
|
cur->vm_userfaultfd_ctx.ctx != ctx)
|
|
goto out_unlock;
|
|
|
|
/*
|
|
* Note vmas containing huge pages
|
|
*/
|
|
if (is_vm_hugetlb_page(cur))
|
|
basic_ioctls = true;
|
|
|
|
found = true;
|
|
}
|
|
BUG_ON(!found);
|
|
|
|
if (vma->vm_start < start)
|
|
prev = vma;
|
|
|
|
ret = 0;
|
|
do {
|
|
cond_resched();
|
|
|
|
BUG_ON(!vma_can_userfault(vma));
|
|
BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
|
|
vma->vm_userfaultfd_ctx.ctx != ctx);
|
|
|
|
/*
|
|
* Nothing to do: this vma is already registered into this
|
|
* userfaultfd and with the right tracking mode too.
|
|
*/
|
|
if (vma->vm_userfaultfd_ctx.ctx == ctx &&
|
|
(vma->vm_flags & vm_flags) == vm_flags)
|
|
goto skip;
|
|
|
|
if (vma->vm_start > start)
|
|
start = vma->vm_start;
|
|
vma_end = min(end, vma->vm_end);
|
|
|
|
new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
|
|
prev = vma_merge(mm, prev, start, vma_end, new_flags,
|
|
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
|
|
vma_policy(vma),
|
|
((struct vm_userfaultfd_ctx){ ctx }),
|
|
vma_get_anon_name(vma));
|
|
if (prev) {
|
|
vma = prev;
|
|
goto next;
|
|
}
|
|
if (vma->vm_start < start) {
|
|
ret = split_vma(mm, vma, start, 1);
|
|
if (ret)
|
|
break;
|
|
}
|
|
if (vma->vm_end > end) {
|
|
ret = split_vma(mm, vma, end, 0);
|
|
if (ret)
|
|
break;
|
|
}
|
|
next:
|
|
/*
|
|
* In the vma_merge() successful mprotect-like case 8:
|
|
* the next vma was merged into the current one and
|
|
* the current one has not been updated yet.
|
|
*/
|
|
vma->vm_flags = new_flags;
|
|
vma->vm_userfaultfd_ctx.ctx = ctx;
|
|
|
|
skip:
|
|
prev = vma;
|
|
start = vma->vm_end;
|
|
vma = vma->vm_next;
|
|
} while (vma && vma->vm_start < end);
|
|
out_unlock:
|
|
up_write(&mm->mmap_sem);
|
|
mmput(mm);
|
|
if (!ret) {
|
|
/*
|
|
* Now that we scanned all vmas we can already tell
|
|
* userland which ioctls methods are guaranteed to
|
|
* succeed on this range.
|
|
*/
|
|
if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
|
|
UFFD_API_RANGE_IOCTLS,
|
|
&user_uffdio_register->ioctls))
|
|
ret = -EFAULT;
|
|
}
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
|
|
unsigned long arg)
|
|
{
|
|
struct mm_struct *mm = ctx->mm;
|
|
struct vm_area_struct *vma, *prev, *cur;
|
|
int ret;
|
|
struct uffdio_range uffdio_unregister;
|
|
unsigned long new_flags;
|
|
bool found;
|
|
unsigned long start, end, vma_end;
|
|
const void __user *buf = (void __user *)arg;
|
|
|
|
ret = -EFAULT;
|
|
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
|
|
goto out;
|
|
|
|
ret = validate_range(mm, uffdio_unregister.start,
|
|
uffdio_unregister.len);
|
|
if (ret)
|
|
goto out;
|
|
|
|
start = uffdio_unregister.start;
|
|
end = start + uffdio_unregister.len;
|
|
|
|
ret = -ENOMEM;
|
|
if (!mmget_not_zero(mm))
|
|
goto out;
|
|
|
|
down_write(&mm->mmap_sem);
|
|
vma = find_vma_prev(mm, start, &prev);
|
|
if (!vma)
|
|
goto out_unlock;
|
|
|
|
/* check that there's at least one vma in the range */
|
|
ret = -EINVAL;
|
|
if (vma->vm_start >= end)
|
|
goto out_unlock;
|
|
|
|
/*
|
|
* If the first vma contains huge pages, make sure start address
|
|
* is aligned to huge page size.
|
|
*/
|
|
if (is_vm_hugetlb_page(vma)) {
|
|
unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
|
|
|
|
if (start & (vma_hpagesize - 1))
|
|
goto out_unlock;
|
|
}
|
|
|
|
/*
|
|
* Search for not compatible vmas.
|
|
*/
|
|
found = false;
|
|
ret = -EINVAL;
|
|
for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
|
|
cond_resched();
|
|
|
|
BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
|
|
!!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
|
|
|
|
/*
|
|
* Check not compatible vmas, not strictly required
|
|
* here as not compatible vmas cannot have an
|
|
* userfaultfd_ctx registered on them, but this
|
|
* provides for more strict behavior to notice
|
|
* unregistration errors.
|
|
*/
|
|
if (!vma_can_userfault(cur))
|
|
goto out_unlock;
|
|
|
|
found = true;
|
|
}
|
|
BUG_ON(!found);
|
|
|
|
if (vma->vm_start < start)
|
|
prev = vma;
|
|
|
|
ret = 0;
|
|
do {
|
|
cond_resched();
|
|
|
|
BUG_ON(!vma_can_userfault(vma));
|
|
|
|
/*
|
|
* Nothing to do: this vma is already registered into this
|
|
* userfaultfd and with the right tracking mode too.
|
|
*/
|
|
if (!vma->vm_userfaultfd_ctx.ctx)
|
|
goto skip;
|
|
|
|
if (vma->vm_start > start)
|
|
start = vma->vm_start;
|
|
vma_end = min(end, vma->vm_end);
|
|
|
|
if (userfaultfd_missing(vma)) {
|
|
/*
|
|
* Wake any concurrent pending userfault while
|
|
* we unregister, so they will not hang
|
|
* permanently and it avoids userland to call
|
|
* UFFDIO_WAKE explicitly.
|
|
*/
|
|
struct userfaultfd_wake_range range;
|
|
range.start = start;
|
|
range.len = vma_end - start;
|
|
wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
|
|
}
|
|
|
|
new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
|
|
prev = vma_merge(mm, prev, start, vma_end, new_flags,
|
|
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
|
|
vma_policy(vma),
|
|
NULL_VM_UFFD_CTX,
|
|
vma_get_anon_name(vma));
|
|
if (prev) {
|
|
vma = prev;
|
|
goto next;
|
|
}
|
|
if (vma->vm_start < start) {
|
|
ret = split_vma(mm, vma, start, 1);
|
|
if (ret)
|
|
break;
|
|
}
|
|
if (vma->vm_end > end) {
|
|
ret = split_vma(mm, vma, end, 0);
|
|
if (ret)
|
|
break;
|
|
}
|
|
next:
|
|
/*
|
|
* In the vma_merge() successful mprotect-like case 8:
|
|
* the next vma was merged into the current one and
|
|
* the current one has not been updated yet.
|
|
*/
|
|
vma->vm_flags = new_flags;
|
|
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
|
|
|
|
skip:
|
|
prev = vma;
|
|
start = vma->vm_end;
|
|
vma = vma->vm_next;
|
|
} while (vma && vma->vm_start < end);
|
|
out_unlock:
|
|
up_write(&mm->mmap_sem);
|
|
mmput(mm);
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* userfaultfd_wake may be used in combination with the
|
|
* UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
|
|
*/
|
|
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
|
|
unsigned long arg)
|
|
{
|
|
int ret;
|
|
struct uffdio_range uffdio_wake;
|
|
struct userfaultfd_wake_range range;
|
|
const void __user *buf = (void __user *)arg;
|
|
|
|
ret = -EFAULT;
|
|
if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
|
|
goto out;
|
|
|
|
ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
|
|
if (ret)
|
|
goto out;
|
|
|
|
range.start = uffdio_wake.start;
|
|
range.len = uffdio_wake.len;
|
|
|
|
/*
|
|
* len == 0 means wake all and we don't want to wake all here,
|
|
* so check it again to be sure.
|
|
*/
|
|
VM_BUG_ON(!range.len);
|
|
|
|
wake_userfault(ctx, &range);
|
|
ret = 0;
|
|
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
|
|
unsigned long arg)
|
|
{
|
|
__s64 ret;
|
|
struct uffdio_copy uffdio_copy;
|
|
struct uffdio_copy __user *user_uffdio_copy;
|
|
struct userfaultfd_wake_range range;
|
|
|
|
user_uffdio_copy = (struct uffdio_copy __user *) arg;
|
|
|
|
ret = -EFAULT;
|
|
if (copy_from_user(&uffdio_copy, user_uffdio_copy,
|
|
/* don't copy "copy" last field */
|
|
sizeof(uffdio_copy)-sizeof(__s64)))
|
|
goto out;
|
|
|
|
ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
|
|
if (ret)
|
|
goto out;
|
|
/*
|
|
* double check for wraparound just in case. copy_from_user()
|
|
* will later check uffdio_copy.src + uffdio_copy.len to fit
|
|
* in the userland range.
|
|
*/
|
|
ret = -EINVAL;
|
|
if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
|
|
goto out;
|
|
if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
|
|
goto out;
|
|
if (mmget_not_zero(ctx->mm)) {
|
|
ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
|
|
uffdio_copy.len);
|
|
mmput(ctx->mm);
|
|
} else {
|
|
return -ESRCH;
|
|
}
|
|
if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
|
|
return -EFAULT;
|
|
if (ret < 0)
|
|
goto out;
|
|
BUG_ON(!ret);
|
|
/* len == 0 would wake all */
|
|
range.len = ret;
|
|
if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
|
|
range.start = uffdio_copy.dst;
|
|
wake_userfault(ctx, &range);
|
|
}
|
|
ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
|
|
unsigned long arg)
|
|
{
|
|
__s64 ret;
|
|
struct uffdio_zeropage uffdio_zeropage;
|
|
struct uffdio_zeropage __user *user_uffdio_zeropage;
|
|
struct userfaultfd_wake_range range;
|
|
|
|
user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
|
|
|
|
ret = -EFAULT;
|
|
if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
|
|
/* don't copy "zeropage" last field */
|
|
sizeof(uffdio_zeropage)-sizeof(__s64)))
|
|
goto out;
|
|
|
|
ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
|
|
uffdio_zeropage.range.len);
|
|
if (ret)
|
|
goto out;
|
|
ret = -EINVAL;
|
|
if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
|
|
goto out;
|
|
|
|
if (mmget_not_zero(ctx->mm)) {
|
|
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
|
|
uffdio_zeropage.range.len);
|
|
mmput(ctx->mm);
|
|
} else {
|
|
return -ESRCH;
|
|
}
|
|
if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
|
|
return -EFAULT;
|
|
if (ret < 0)
|
|
goto out;
|
|
/* len == 0 would wake all */
|
|
BUG_ON(!ret);
|
|
range.len = ret;
|
|
if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
|
|
range.start = uffdio_zeropage.range.start;
|
|
wake_userfault(ctx, &range);
|
|
}
|
|
ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Translate the feature bits passed by userland into the value stored
 * in the userfaultfd context.
 */
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
	 * For the current set of features the bits just coincide, so a
	 * plain narrowing conversion is all that is needed.
	 */
	unsigned int ctx_features = (unsigned int)user_features;

	return ctx_features;
}
|
|
|
|
/*
|
|
* userland asks for a certain API version and we return which bits
|
|
* and ioctl commands are implemented in this kernel for such API
|
|
* version or -EINVAL if unknown.
|
|
*/
|
|
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
|
|
unsigned long arg)
|
|
{
|
|
struct uffdio_api uffdio_api;
|
|
void __user *buf = (void __user *)arg;
|
|
int ret;
|
|
__u64 features;
|
|
|
|
ret = -EINVAL;
|
|
if (ctx->state != UFFD_STATE_WAIT_API)
|
|
goto out;
|
|
ret = -EFAULT;
|
|
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
|
|
goto out;
|
|
features = uffdio_api.features;
|
|
if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
|
|
memset(&uffdio_api, 0, sizeof(uffdio_api));
|
|
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
|
|
goto out;
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
/* report all available features and ioctls to userland */
|
|
uffdio_api.features = UFFD_API_FEATURES;
|
|
uffdio_api.ioctls = UFFD_API_IOCTLS;
|
|
ret = -EFAULT;
|
|
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
|
|
goto out;
|
|
ctx->state = UFFD_STATE_RUNNING;
|
|
/* only enable the requested features for this uffd context */
|
|
ctx->features = uffd_ctx_features(features);
|
|
ret = 0;
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
static long userfaultfd_ioctl(struct file *file, unsigned cmd,
|
|
unsigned long arg)
|
|
{
|
|
int ret = -EINVAL;
|
|
struct userfaultfd_ctx *ctx = file->private_data;
|
|
|
|
if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
|
|
return -EINVAL;
|
|
|
|
switch(cmd) {
|
|
case UFFDIO_API:
|
|
ret = userfaultfd_api(ctx, arg);
|
|
break;
|
|
case UFFDIO_REGISTER:
|
|
ret = userfaultfd_register(ctx, arg);
|
|
break;
|
|
case UFFDIO_UNREGISTER:
|
|
ret = userfaultfd_unregister(ctx, arg);
|
|
break;
|
|
case UFFDIO_WAKE:
|
|
ret = userfaultfd_wake(ctx, arg);
|
|
break;
|
|
case UFFDIO_COPY:
|
|
ret = userfaultfd_copy(ctx, arg);
|
|
break;
|
|
case UFFDIO_ZEROPAGE:
|
|
ret = userfaultfd_zeropage(ctx, arg);
|
|
break;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
#ifdef CONFIG_PROC_FS
/*
 * show_fdinfo hook of the userfaultfd file.
 *
 * Prints to @m the number of waiters queued on fault_pending_wqh
 * ("pending"), the number queued on fault_pending_wqh plus fault_wqh
 * ("total"), and an "API" line with UFFD_API, the features enabled on
 * this context and the supported ioctl bits.
 */
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	wait_queue_entry_t *wq;
	unsigned long pending = 0, total = 0;

	/*
	 * Both queues are walked under fault_pending_wqh.lock.  Only the
	 * entries are counted, so the enclosing userfaultfd_wait_queue
	 * does not need to be looked up.
	 */
	spin_lock(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
		pending++;
		total++;
	}
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
		total++;
	}
	spin_unlock(&ctx->fault_pending_wqh.lock);

	/*
	 * If more protocols are added, they will all be shown
	 * separated by a space. Like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif
|
|
|
|
static const struct file_operations userfaultfd_fops = {
|
|
#ifdef CONFIG_PROC_FS
|
|
.show_fdinfo = userfaultfd_show_fdinfo,
|
|
#endif
|
|
.release = userfaultfd_release,
|
|
.poll = userfaultfd_poll,
|
|
.read = userfaultfd_read,
|
|
.unlocked_ioctl = userfaultfd_ioctl,
|
|
.compat_ioctl = userfaultfd_ioctl,
|
|
.llseek = noop_llseek,
|
|
};
|
|
|
|
static void init_once_userfaultfd_ctx(void *mem)
|
|
{
|
|
struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
|
|
|
|
init_waitqueue_head(&ctx->fault_pending_wqh);
|
|
init_waitqueue_head(&ctx->fault_wqh);
|
|
init_waitqueue_head(&ctx->event_wqh);
|
|
init_waitqueue_head(&ctx->fd_wqh);
|
|
seqcount_init(&ctx->refile_seq);
|
|
}
|
|
|
|
/**
|
|
* userfaultfd_file_create - Creates a userfaultfd file pointer.
|
|
* @flags: Flags for the userfaultfd file.
|
|
*
|
|
* This function creates a userfaultfd file pointer, w/out installing
|
|
* it into the fd table. This is useful when the userfaultfd file is
|
|
* used during the initialization of data structures that require
|
|
* extra setup after the userfaultfd creation. So the userfaultfd
|
|
* creation is split into the file pointer creation phase, and the
|
|
* file descriptor installation phase. In this way races with
|
|
* userspace closing the newly installed file descriptor can be
|
|
* avoided. Returns a userfaultfd file pointer, or a proper error
|
|
* pointer.
|
|
*/
|
|
static struct file *userfaultfd_file_create(int flags)
|
|
{
|
|
struct file *file;
|
|
struct userfaultfd_ctx *ctx;
|
|
|
|
BUG_ON(!current->mm);
|
|
|
|
/* Check the UFFD_* constants for consistency. */
|
|
BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
|
|
BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
|
|
|
|
file = ERR_PTR(-EINVAL);
|
|
if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
|
|
goto out;
|
|
|
|
file = ERR_PTR(-ENOMEM);
|
|
ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
|
|
if (!ctx)
|
|
goto out;
|
|
|
|
atomic_set(&ctx->refcount, 1);
|
|
ctx->flags = flags;
|
|
ctx->features = 0;
|
|
ctx->state = UFFD_STATE_WAIT_API;
|
|
ctx->released = false;
|
|
ctx->mm = current->mm;
|
|
/* prevent the mm struct to be freed */
|
|
mmgrab(ctx->mm);
|
|
|
|
file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
|
|
O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
|
|
if (IS_ERR(file)) {
|
|
mmdrop(ctx->mm);
|
|
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
|
|
}
|
|
out:
|
|
return file;
|
|
}
|
|
|
|
/*
 * userfaultfd() system call: reserve a file descriptor, create the
 * userfaultfd file and install it.  Returns the new fd, or a negative
 * errno from get_unused_fd_flags()/userfaultfd_file_create().
 */
SYSCALL_DEFINE1(userfaultfd, int, flags)
{
	struct file *file;
	int fd;

	fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
	if (fd < 0)
		return fd;

	file = userfaultfd_file_create(flags);
	if (IS_ERR(file)) {
		int err = PTR_ERR(file);

		/* Give the reserved descriptor back before failing. */
		put_unused_fd(fd);
		return err;
	}

	fd_install(fd, file);
	return fd;
}
|
|
|
|
/*
 * Init-time setup: create the slab cache that userfaultfd contexts are
 * allocated from, with init_once_userfaultfd_ctx() as its constructor.
 * The returned cache pointer is not checked here; the cache is created
 * with SLAB_PANIC.  Always returns 0.
 */
static int __init userfaultfd_init(void)
{
	userfaultfd_ctx_cachep =
		kmem_cache_create("userfaultfd_ctx_cache",
				  sizeof(struct userfaultfd_ctx),
				  0,
				  SLAB_HWCACHE_ALIGN | SLAB_PANIC,
				  init_once_userfaultfd_ctx);

	return 0;
}
__initcall(userfaultfd_init);
|