msm-4.14/mm/util.c
Greg Kroah-Hartman d6d7b18f40 This is the 4.14.185 stable release
-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAl7tx/EACgkQONu9yGCS
 aT4/ABAApnoLXN+Drzomi5DYYQoVyf+4sTgIHSks4Q7S0TggPYLH/UcYAkqxgmjd
 Lj3VKcCSZqoR6gCeTLK34DSxlKezvxKI/u5LVjdRVdyWc4W2y3InqYBGikPeuGfW
 Ud4E2pGe/NgoZ0jf6dxkIxQx3DqtrsY0742MTCG3BNEYo4B4HMcN0LEUEFtwSjqj
 e1LpE8sCX2MhfvAzCvajBhNlIv4Sdgr47+52yhxjS8h04D6rq92jGsdjUjw5Oqtt
 wPIf2v1zxFq3QkPf3pYW5e/FsfzzDk3pIs1bQ4TnoJkAGfI52eBTjekdRSU0TOiG
 U7RBXsVFCEcB/ec5WkiNzUhK4+SpEuO3SDa6u7OVlQ2wGTT/c1k1prJL0AdXwUJn
 NYqOe1WZpXORajLYbVNAQwvbTN3Chho5FI9w+WoU/WvmzxelqXAUCSTsqEhNV8oS
 ZBWu6anawL8C9d0aYsq1DeV2CMEPmQPWp9HeiNtZgl0ZKkZlqBcxW2dccq9h8PH4
 xqRImk+owbKsT0NV3dV150Q41nc2rTQd2jyGeoA+iek17/XDxWT9ICArA+YfBhx1
 uf+dNPyl8g8ATvTZaS7su/Q7T2rsgBc64DO5R+jtTp4QRgl/N2b4QyewfjusrRXA
 x3Ckq0oA9ZOcfLs9IAoTD5L1mmwSaYUa6fhiZHhop2daUPUbAgE=
 =BJUs
 -----END PGP SIGNATURE-----

Merge 4.14.185 into android-4.14-stable

Changes in 4.14.185
	ipv6: fix IPV6_ADDRFORM operation logic
	vxlan: Avoid infinite loop when suppressing NS messages with invalid options
	make 'user_access_begin()' do 'access_ok()'
	Fix 'acccess_ok()' on alpha and SH
	arch/openrisc: Fix issues with access_ok()
	x86: uaccess: Inhibit speculation past access_ok() in user_access_begin()
	lib: Reduce user_access_begin() boundaries in strncpy_from_user() and strnlen_user()
	serial: imx: Fix handling of TC irq in combination with DMA
	crypto: talitos - fix ECB and CBC algs ivsize
	ARM: 8977/1: ptrace: Fix mask for thumb breakpoint hook
	sched/fair: Don't NUMA balance for kthreads
	Input: synaptics - add a second working PNP_ID for Lenovo T470s
	drivers/net/ibmvnic: Update VNIC protocol version reporting
	powerpc/xive: Clear the page tables for the ESB IO mapping
	ath9k_htc: Silence undersized packet warnings
	perf probe: Accept the instance number of kretprobe event
	mm: add kvfree_sensitive() for freeing sensitive data objects
	x86_64: Fix jiffies ODR violation
	x86/PCI: Mark Intel C620 MROMs as having non-compliant BARs
	x86/speculation: Prevent rogue cross-process SSBD shutdown
	x86/reboot/quirks: Add MacBook6,1 reboot quirk
	efi/efivars: Add missing kobject_put() in sysfs entry creation error path
	ALSA: es1688: Add the missed snd_card_free()
	ALSA: hda/realtek - add a pintbl quirk for several Lenovo machines
	ALSA: usb-audio: Fix inconsistent card PM state after resume
	ACPI: sysfs: Fix reference count leak in acpi_sysfs_add_hotplug_profile()
	ACPI: CPPC: Fix reference count leak in acpi_cppc_processor_probe()
	ACPI: GED: add support for _Exx / _Lxx handler methods
	ACPI: PM: Avoid using power resources if there are none for D0
	cgroup, blkcg: Prepare some symbols for module and !CONFIG_CGROUP usages
	nilfs2: fix null pointer dereference at nilfs_segctor_do_construct()
	spi: bcm2835aux: Fix controller unregister order
	spi: bcm-qspi: when tx/rx buffer is NULL set to 0
	crypto: cavium/nitrox - Fix 'nitrox_get_first_device()' when ndevlist is fully iterated
	ALSA: pcm: disallow linking stream to itself
	kvm: x86: Fix L1TF mitigation for shadow MMU
	KVM: x86/mmu: Consolidate "is MMIO SPTE" code
	KVM: x86: only do L1TF workaround on affected processors
	x86/speculation: Change misspelled STIPB to STIBP
	x86/speculation: Add support for STIBP always-on preferred mode
	x86/speculation: Avoid force-disabling IBPB based on STIBP and enhanced IBRS.
	x86/speculation: PR_SPEC_FORCE_DISABLE enforcement for indirect branches.
	spi: dw: fix possible race condition
	spi: dw: Fix controller unregister order
	spi: No need to assign dummy value in spi_unregister_controller()
	spi: Fix controller unregister order
	spi: pxa2xx: Fix controller unregister order
	spi: bcm2835: Fix controller unregister order
	crypto: virtio: Fix use-after-free in virtio_crypto_skcipher_finalize_req()
	crypto: virtio: Fix src/dst scatterlist calculation in __virtio_crypto_skcipher_do_req()
	crypto: virtio: Fix dest length calculation in __virtio_crypto_skcipher_do_req()
	selftests/net: in rxtimestamp getopt_long needs terminating null entry
	ovl: initialize error in ovl_copy_xattr
	proc: Use new_inode not new_inode_pseudo
	video: fbdev: w100fb: Fix a potential double free.
	KVM: nSVM: fix condition for filtering async PF
	KVM: nSVM: leave ASID aside in copy_vmcb_control_area
	KVM: nVMX: Consult only the "basic" exit reason when routing nested exit
	KVM: MIPS: Define KVM_ENTRYHI_ASID to cpu_asid_mask(&boot_cpu_data)
	KVM: MIPS: Fix VPN2_MASK definition for variable cpu_vmbits
	KVM: arm64: Make vcpu_cp1x() work on Big Endian hosts
	ath9k: Fix use-after-free Read in ath9k_wmi_ctrl_rx
	ath9k: Fix use-after-free Write in ath9k_htc_rx_msg
	ath9x: Fix stack-out-of-bounds Write in ath9k_hif_usb_rx_cb
	ath9k: Fix general protection fault in ath9k_hif_usb_rx_cb
	Smack: slab-out-of-bounds in vsscanf
	mm/slub: fix a memory leak in sysfs_slab_add()
	fat: don't allow to mount if the FAT length == 0
	perf: Add cond_resched() to task_function_call()
	agp/intel: Reinforce the barrier after GTT updates
	mmc: sdhci-msm: Clear tuning done flag while hs400 tuning
	mmc: sdio: Fix potential NULL pointer error in mmc_sdio_init_card()
	can: kvaser_usb: kvaser_usb_leaf: Fix some info-leaks to USB devices
	xen/pvcalls-back: test for errors when calling backend_connect()
	ACPI: GED: use correct trigger type field in _Exx / _Lxx handling
	drm: bridge: adv7511: Extend list of audio sample rates
	crypto: ccp -- don't "select" CONFIG_DMADEVICES
	media: si2157: Better check for running tuner in init
	objtool: Ignore empty alternatives
	spi: pxa2xx: Apply CS clk quirk to BXT
	net: ena: fix error returning in ena_com_get_hash_function()
	spi: dw: Zero DMA Tx and Rx configurations on stack
	ixgbe: Fix XDP redirect on archs with PAGE_SIZE above 4K
	MIPS: Loongson: Build ATI Radeon GPU driver as module
	Bluetooth: Add SCO fallback for invalid LMP parameters error
	kgdb: Prevent infinite recursive entries to the debugger
	spi: dw: Enable interrupts in accordance with DMA xfer mode
	clocksource: dw_apb_timer: Make CPU-affiliation being optional
	clocksource: dw_apb_timer_of: Fix missing clockevent timers
	btrfs: do not ignore error from btrfs_next_leaf() when inserting checksums
	ARM: 8978/1: mm: make act_mm() respect THREAD_SIZE
	spi: dw: Fix Rx-only DMA transfers
	x86/kvm/hyper-v: Explicitly align hcall param for kvm_hyperv_exit
	net: vmxnet3: fix possible buffer overflow caused by bad DMA value in vmxnet3_get_rss()
	staging: android: ion: use vmap instead of vm_map_ram
	brcmfmac: fix wrong location to get firmware feature
	tools api fs: Make xxx__mountpoint() more scalable
	e1000: Distribute switch variables for initialization
	dt-bindings: display: mediatek: control dpi pins mode to avoid leakage
	audit: fix a net reference leak in audit_send_reply()
	media: dvb: return -EREMOTEIO on i2c transfer failure.
	media: platform: fcp: Set appropriate DMA parameters
	MIPS: Make sparse_init() using top-down allocation
	audit: fix a net reference leak in audit_list_rules_send()
	netfilter: nft_nat: return EOPNOTSUPP if type or flags are not supported
	net: bcmgenet: set Rx mode before starting netif
	lib/mpi: Fix 64-bit MIPS build with Clang
	exit: Move preemption fixup up, move blocking operations down
	net: lpc-enet: fix error return code in lpc_mii_init()
	media: cec: silence shift wrapping warning in __cec_s_log_addrs()
	net: allwinner: Fix use correct return type for ndo_start_xmit()
	powerpc/spufs: fix copy_to_user while atomic
	Crypto/chcr: fix for ccm(aes) failed test
	MIPS: Truncate link address into 32bit for 32bit kernel
	mips: cm: Fix an invalid error code of INTVN_*_ERR
	kgdb: Fix spurious true from in_dbg_master()
	nvme: refine the Qemu Identify CNS quirk
	wcn36xx: Fix error handling path in 'wcn36xx_probe()'
	net: qed*: Reduce RX and TX default ring count when running inside kdump kernel
	md: don't flush workqueue unconditionally in md_open
	rtlwifi: Fix a double free in _rtl_usb_tx_urb_setup()
	mwifiex: Fix memory corruption in dump_station
	x86/boot: Correct relocation destination on old linkers
	mips: MAAR: Use more precise address mask
	mips: Add udelay lpj numbers adjustment
	x86/mm: Stop printing BRK addresses
	m68k: mac: Don't call via_flush_cache() on Mac IIfx
	macvlan: Skip loopback packets in RX handler
	PCI: Don't disable decoding when mmio_always_on is set
	MIPS: Fix IRQ tracing when call handle_fpe() and handle_msa_fpe()
	mmc: sdhci-msm: Set SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12 quirk
	staging: greybus: sdio: Respect the cmd->busy_timeout from the mmc core
	mmc: via-sdmmc: Respect the cmd->busy_timeout from the mmc core
	ixgbe: fix signed-integer-overflow warning
	mmc: sdhci-esdhc-imx: fix the mask for tuning start point
	spi: dw: Return any value retrieved from the dma_transfer callback
	cpuidle: Fix three reference count leaks
	platform/x86: hp-wmi: Convert simple_strtoul() to kstrtou32()
	string.h: fix incompatibility between FORTIFY_SOURCE and KASAN
	btrfs: send: emit file capabilities after chown
	mm: thp: make the THP mapcount atomic against __split_huge_pmd_locked()
	ima: Fix ima digest hash table key calculation
	ima: Directly assign the ima_default_policy pointer to ima_rules
	evm: Fix possible memory leak in evm_calc_hmac_or_hash()
	ext4: fix EXT_MAX_EXTENT/INDEX to check for zeroed eh_max
	ext4: fix error pointer dereference
	ext4: fix race between ext4_sync_parent() and rename()
	PCI: Disable MSI for Freescale Layerscape PCIe RC mode
	PCI: Avoid FLR for AMD Matisse HD Audio & USB 3.0
	PCI: Avoid FLR for AMD Starship USB 3.0
	PCI: Add ACS quirk for iProc PAXB
	PCI: Add ACS quirk for Ampere root ports
	PCI: Make ACS quirk implementations more uniform
	vga_switcheroo: Deduplicate power state tracking
	vga_switcheroo: Use device link for HDA controller
	PCI: Generalize multi-function power dependency device links
	PCI: Add ACS quirk for Intel Root Complex Integrated Endpoints
	PCI: Unify ACS quirk desired vs provided checking
	btrfs: fix error handling when submitting direct I/O bio
	btrfs: fix wrong file range cleanup after an error filling dealloc range
	blk-mq: move _blk_mq_update_nr_hw_queues synchronize_rcu call
	PCI: Program MPS for RCiEP devices
	e1000e: Disable TSO for buffer overrun workaround
	e1000e: Relax condition to trigger reset for ME workaround
	carl9170: remove P2P_GO support
	media: go7007: fix a miss of snd_card_free
	b43legacy: Fix case where channel status is corrupted
	b43: Fix connection problem with WPA3
	b43_legacy: Fix connection problem with WPA3
	media: ov5640: fix use of destroyed mutex
	igb: Report speed and duplex as unknown when device is runtime suspended
	power: vexpress: add suppress_bind_attrs to true
	pinctrl: samsung: Save/restore eint_mask over suspend for EINT_TYPE GPIOs
	sparc32: fix register window handling in genregs32_[gs]et()
	sparc64: fix misuses of access_process_vm() in genregs32_[sg]et()
	dm crypt: avoid truncating the logical block size
	kernel/cpu_pm: Fix uninitted local in cpu_pm
	ARM: tegra: Correct PL310 Auxiliary Control Register initialization
	drivers/macintosh: Fix memleak in windfarm_pm112 driver
	powerpc/64s: Don't let DT CPU features set FSCR_DSCR
	powerpc/64s: Save FSCR to init_task.thread.fscr after feature init
	kbuild: force to build vmlinux if CONFIG_MODVERSION=y
	sunrpc: svcauth_gss_register_pseudoflavor must reject duplicate registrations.
	sunrpc: clean up properly in gss_mech_unregister()
	mtd: rawnand: brcmnand: fix hamming oob layout
	mtd: rawnand: pasemi: Fix the probe error path
	w1: omap-hdq: cleanup to add missing newline for some dev_dbg
	perf probe: Do not show the skipped events
	perf probe: Fix to check blacklist address correctly
	perf symbols: Fix debuginfo search for Ubuntu
	Linux 4.14.185

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: Ifd3a6f3d9643a42802ed8f061a548a5c5ffcb109
2020-06-22 11:22:58 +02:00

764 lines
19 KiB
C

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <asm/sections.h>
#include <linux/uaccess.h>
#include "internal.h"
static inline int is_kernel_rodata(unsigned long addr)
{
return addr >= (unsigned long)__start_rodata &&
addr < (unsigned long)__end_rodata;
}
/**
* kfree_const - conditionally free memory
* @x: pointer to the memory
*
* Function calls kfree only if @x is not in .rodata section.
*/
void kfree_const(const void *x)
{
if (!is_kernel_rodata((unsigned long)x))
kfree(x);
}
EXPORT_SYMBOL(kfree_const);
/**
* kstrdup - allocate space for and copy an existing string
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*/
char *kstrdup(const char *s, gfp_t gfp)
{
size_t len;
char *buf;
if (!s)
return NULL;
len = strlen(s) + 1;
buf = kmalloc_track_caller(len, gfp);
if (buf)
memcpy(buf, s, len);
return buf;
}
EXPORT_SYMBOL(kstrdup);
/**
* kstrdup_const - conditionally duplicate an existing const string
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
* Function returns source string if it is in .rodata section otherwise it
* fallbacks to kstrdup.
* Strings allocated by kstrdup_const should be freed by kfree_const.
*/
const char *kstrdup_const(const char *s, gfp_t gfp)
{
if (is_kernel_rodata((unsigned long)s))
return s;
return kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);
/**
* kstrndup - allocate space for and copy an existing string
* @s: the string to duplicate
* @max: read at most @max chars from @s
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
* Note: Use kmemdup_nul() instead if the size is known exactly.
*/
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
size_t len;
char *buf;
if (!s)
return NULL;
len = strnlen(s, max);
buf = kmalloc_track_caller(len+1, gfp);
if (buf) {
memcpy(buf, s, len);
buf[len] = '\0';
}
return buf;
}
EXPORT_SYMBOL(kstrndup);
/**
* kmemdup - duplicate region of memory
*
* @src: memory region to duplicate
* @len: memory region length
* @gfp: GFP mask to use
*/
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
void *p;
p = kmalloc_track_caller(len, gfp);
if (p)
memcpy(p, src, len);
return p;
}
EXPORT_SYMBOL(kmemdup);
/**
* kmemdup_nul - Create a NUL-terminated string from unterminated data
* @s: The data to stringify
* @len: The size of the data
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*/
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
char *buf;
if (!s)
return NULL;
buf = kmalloc_track_caller(len + 1, gfp);
if (buf) {
memcpy(buf, s, len);
buf[len] = '\0';
}
return buf;
}
EXPORT_SYMBOL(kmemdup_nul);
/**
* memdup_user - duplicate memory region from user space
*
* @src: source address in user space
* @len: number of bytes to copy
*
* Returns an ERR_PTR() on failure.
*/
void *memdup_user(const void __user *src, size_t len)
{
void *p;
/*
* Always use GFP_KERNEL, since copy_from_user() can sleep and
* cause pagefault, which makes it pointless to use GFP_NOFS
* or GFP_ATOMIC.
*/
p = kmalloc_track_caller(len, GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
if (copy_from_user(p, src, len)) {
kfree(p);
return ERR_PTR(-EFAULT);
}
return p;
}
EXPORT_SYMBOL(memdup_user);
/*
* strndup_user - duplicate an existing string from user space
* @s: The string to duplicate
* @n: Maximum number of bytes to copy, including the trailing NUL.
*/
char *strndup_user(const char __user *s, long n)
{
char *p;
long length;
length = strnlen_user(s, n);
if (!length)
return ERR_PTR(-EFAULT);
if (length > n)
return ERR_PTR(-EINVAL);
p = memdup_user(s, length);
if (IS_ERR(p))
return p;
p[length - 1] = '\0';
return p;
}
EXPORT_SYMBOL(strndup_user);
/**
* memdup_user_nul - duplicate memory region from user space and NUL-terminate
*
* @src: source address in user space
* @len: number of bytes to copy
*
* Returns an ERR_PTR() on failure.
*/
void *memdup_user_nul(const void __user *src, size_t len)
{
char *p;
/*
* Always use GFP_KERNEL, since copy_from_user() can sleep and
* cause pagefault, which makes it pointless to use GFP_NOFS
* or GFP_ATOMIC.
*/
p = kmalloc_track_caller(len + 1, GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
if (copy_from_user(p, src, len)) {
kfree(p);
return ERR_PTR(-EFAULT);
}
p[len] = '\0';
return p;
}
EXPORT_SYMBOL(memdup_user_nul);
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent)
{
struct vm_area_struct *next;
vma->vm_prev = prev;
if (prev) {
next = prev->vm_next;
prev->vm_next = vma;
} else {
mm->mmap = vma;
if (rb_parent)
next = rb_entry(rb_parent,
struct vm_area_struct, vm_rb);
else
next = NULL;
}
vma->vm_next = next;
if (next)
next->vm_prev = vma;
}
/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
struct task_struct * __maybe_unused t = current;
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
}
#endif
/*
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
* back to the regular GUP.
* If the architecture not support this function, simply return with no
* page pinned
*/
int __weak __get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
return 0;
}
EXPORT_SYMBOL_GPL(__get_user_pages_fast);
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @write: whether pages will be written to
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*
* get_user_pages_fast provides equivalent functionality to get_user_pages,
* operating on current and current->mm, with force=0 and vma=NULL. However
* unlike get_user_pages, it must be called without mmap_sem held.
*
* get_user_pages_fast may take mmap_sem and page table locks, so no
* assumptions can be made about lack of locking. get_user_pages_fast is to be
* implemented in a way that is advantageous (vs get_user_pages()) when the
* user memory area is already faulted in and present in ptes. However if the
* pages have to be faulted in, it may turn out to be slightly slower so
* callers need to carefully consider what to use. On many architectures,
* get_user_pages_fast simply falls back to get_user_pages.
*/
int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
return get_user_pages_unlocked(start, nr_pages, pages,
write ? FOLL_WRITE : 0);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long pgoff)
{
unsigned long ret;
struct mm_struct *mm = current->mm;
unsigned long populate;
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
if (!ret) {
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
&populate, &uf);
up_write(&mm->mmap_sem);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(ret, populate);
}
return ret;
}
unsigned long vm_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long offset)
{
if (unlikely(offset + PAGE_ALIGN(len) < offset))
return -EINVAL;
if (unlikely(offset_in_page(offset)))
return -EINVAL;
return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
EXPORT_SYMBOL(vm_mmap);
/**
* kvmalloc_node - attempt to allocate physically contiguous memory, but upon
* failure, fall back to non-contiguous (vmalloc) allocation.
* @size: size of the request.
* @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
* @node: numa node to allocate from
*
* Uses kmalloc to get the memory but if the allocation fails then falls back
* to the vmalloc allocator. Use kvfree for freeing the memory.
*
* Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
* __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
* preferable to the vmalloc fallback, due to visible performance drawbacks.
*
* Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
* fall back to vmalloc.
*/
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
gfp_t kmalloc_flags = flags;
void *ret;
/*
* vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
* so the given set of flags has to be compatible.
*/
if ((flags & GFP_KERNEL) != GFP_KERNEL)
return kmalloc_node(size, flags, node);
/*
* We want to attempt a large physically contiguous block first because
* it is less likely to fragment multiple larger blocks and therefore
* contribute to a long term fragmentation less than vmalloc fallback.
* However make sure that larger requests are not too disruptive - no
* OOM killer and no allocation failure warnings as we have a fallback.
*/
if (size > PAGE_SIZE) {
kmalloc_flags |= __GFP_NOWARN;
if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
kmalloc_flags |= __GFP_NORETRY;
}
ret = kmalloc_node(size, kmalloc_flags, node);
/*
* It doesn't really make sense to fallback to vmalloc for sub page
* requests
*/
if (ret || size <= PAGE_SIZE)
return ret;
return __vmalloc_node_flags_caller(size, node, flags,
__builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);
void kvfree(const void *addr)
{
if (is_vmalloc_addr(addr))
vfree(addr);
else
kfree(addr);
}
EXPORT_SYMBOL(kvfree);
/**
* kvfree_sensitive - Free a data object containing sensitive information.
* @addr: address of the data object to be freed.
* @len: length of the data object.
*
* Use the special memzero_explicit() function to clear the content of a
* kvmalloc'ed object containing sensitive data to make sure that the
* compiler won't optimize out the data clearing.
*/
void kvfree_sensitive(const void *addr, size_t len)
{
if (likely(!ZERO_OR_NULL_PTR(addr))) {
memzero_explicit((void *)addr, len);
kvfree(addr);
}
}
EXPORT_SYMBOL(kvfree_sensitive);
static inline void *__page_rmapping(struct page *page)
{
unsigned long mapping;
mapping = (unsigned long)page->mapping;
mapping &= ~PAGE_MAPPING_FLAGS;
return (void *)mapping;
}
/* Neutral page->mapping pointer to address_space or anon_vma or other */
void *page_rmapping(struct page *page)
{
page = compound_head(page);
return __page_rmapping(page);
}
/*
* Return true if this page is mapped into pagetables.
* For compound page it returns true if any subpage of compound page is mapped.
*/
bool page_mapped(struct page *page)
{
int i;
if (likely(!PageCompound(page)))
return atomic_read(&page->_mapcount) >= 0;
page = compound_head(page);
if (atomic_read(compound_mapcount_ptr(page)) >= 0)
return true;
if (PageHuge(page))
return false;
for (i = 0; i < (1 << compound_order(page)); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
return false;
}
EXPORT_SYMBOL(page_mapped);
struct anon_vma *page_anon_vma(struct page *page)
{
unsigned long mapping;
page = compound_head(page);
mapping = (unsigned long)page->mapping;
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
return NULL;
return __page_rmapping(page);
}
struct address_space *page_mapping(struct page *page)
{
struct address_space *mapping;
page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
return NULL;
if (unlikely(PageSwapCache(page))) {
swp_entry_t entry;
entry.val = page_private(page);
return swap_address_space(entry);
}
mapping = page->mapping;
if ((unsigned long)mapping & PAGE_MAPPING_ANON)
return NULL;
return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
}
EXPORT_SYMBOL(page_mapping);
/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
{
int ret;
ret = atomic_read(&page->_mapcount) + 1;
/*
* For file THP page->_mapcount contains total number of mapping
* of the page: no need to look into compound_mapcount.
*/
if (!PageAnon(page) && !PageHuge(page))
return ret;
page = compound_head(page);
ret += atomic_read(compound_mapcount_ptr(page)) + 1;
if (PageDoubleMap(page))
ret--;
return ret;
}
EXPORT_SYMBOL_GPL(__page_mapcount);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
int sysctl_overcommit_ratio __read_mostly = 50;
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int overcommit_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
sysctl_overcommit_kbytes = 0;
return ret;
}
int overcommit_kbytes_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write)
sysctl_overcommit_ratio = 0;
return ret;
}
/*
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
*/
unsigned long vm_commit_limit(void)
{
unsigned long allowed;
if (sysctl_overcommit_kbytes)
allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
else
allowed = ((totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100);
allowed += total_swap_pages;
return allowed;
}
/*
* Make sure vm_committed_as in one cacheline and not cacheline shared with
* other variables. It can be updated by several CPUs frequently.
*/
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
/*
* The global memory commitment made in the system can be a metric
* that can be used to drive ballooning decisions when Linux is hosted
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
* balancing memory across competing virtual machines that are hosted.
* Several metrics drive this policy engine including the guest reported
* memory commitment.
*/
unsigned long vm_memory_committed(void)
{
return percpu_counter_read_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);
/*
* Check that a process has enough memory to allocate a new virtual
* mapping. 0 means there is enough memory for the allocation to
* succeed and -ENOMEM implies there is not.
*
* We currently support three overcommit policies, which are set via the
* vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
*
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
* Additional code 2002 Jul 20 by Robert Love.
*
* cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
*
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic.
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
long free, allowed, reserve;
VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
-(s64)vm_committed_as_batch * num_online_cpus(),
"memory commitment underflow");
vm_acct_memory(pages);
/*
* Sometimes we want to use more memory than we have
*/
if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
free = global_zone_page_state(NR_FREE_PAGES);
free += global_node_page_state(NR_FILE_PAGES);
/*
* shmem pages shouldn't be counted as free in this
* case, they can't be purged, only swapped out, and
* that won't affect the overall amount of available
* memory in the system.
*/
free -= global_node_page_state(NR_SHMEM);
free += get_nr_swap_pages();
/*
* Any slabs which are created with the
* SLAB_RECLAIM_ACCOUNT flag claim to have contents
* which are reclaimable, under pressure. The dentry
* cache and most inode caches should fall into this
*/
free += global_node_page_state(NR_SLAB_RECLAIMABLE);
/*
* Part of the kernel memory, which can be released
* under memory pressure.
*/
free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
/*
* Leave reserved pages. The pages are not for anonymous pages.
*/
if (free <= totalreserve_pages)
goto error;
else
free -= totalreserve_pages;
/*
* Reserve some for root
*/
if (!cap_sys_admin)
free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages)
return 0;
goto error;
}
allowed = vm_commit_limit();
/*
* Reserve some for root
*/
if (!cap_sys_admin)
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
/*
* Don't let a single process grow so big a user can't recover
*/
if (mm) {
reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed -= min_t(long, mm->total_vm / 32, reserve);
}
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
vm_unacct_memory(pages);
return -ENOMEM;
}
/**
* get_cmdline() - copy the cmdline value to a buffer.
* @task: the task whose cmdline value to copy.
* @buffer: the buffer to copy to.
* @buflen: the length of the buffer. Larger cmdline values are truncated
* to this length.
* Returns the size of the cmdline field copied. Note that the copy does
* not guarantee an ending NULL byte.
*/
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
int res = 0;
unsigned int len;
struct mm_struct *mm = get_task_mm(task);
unsigned long arg_start, arg_end, env_start, env_end;
if (!mm)
goto out;
if (!mm->arg_end)
goto out_mm; /* Shh! No looking before we're done */
down_read(&mm->mmap_sem);
arg_start = mm->arg_start;
arg_end = mm->arg_end;
env_start = mm->env_start;
env_end = mm->env_end;
up_read(&mm->mmap_sem);
len = arg_end - arg_start;
if (len > buflen)
len = buflen;
res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
/*
* If the nul at the end of args has been overwritten, then
* assume application is using setproctitle(3).
*/
if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
len = strnlen(buffer, res);
if (len < res) {
res = len;
} else {
len = env_end - env_start;
if (len > buflen - res)
len = buflen - res;
res += access_process_vm(task, env_start,
buffer+res, len,
FOLL_FORCE);
res = strnlen(buffer, res);
}
}
out_mm:
mmput(mm);
out:
return res;
}