Merge remote-tracking branch 'remotes/origin/tmp-9189141' into msm-4.14

* remotes/origin/tmp-9189141:
  Linux 4.14.13
  KVM: s390: prevent buffer overrun on memory hotplug during migration
  KVM: s390: fix cmma migration for multiple memory slots
  mtd: nand: pxa3xx: Fix READOOB implementation
  parisc: qemu idle sleep support
  parisc: Fix alignment of pa_tlb_lock in assembly on 32-bit SMP kernel
  apparmor: fix regression in mount mediation when feature set is pinned
  x86/microcode/AMD: Add support for fam17h microcode loading
  Input: elantech - add new icbody type 15
  powerpc/mm: Fix SEGV on mapped region to return SEGV_ACCERR
  ARC: uaccess: dont use "l" gcc inline asm constraint modifier
  iommu/arm-smmu-v3: Cope with duplicated Stream IDs
  iommu/arm-smmu-v3: Don't free page table ops twice
  kernel/signal.c: remove the no longer needed SIGNAL_UNKILLABLE check in complete_signal()
  kernel/signal.c: protect the SIGNAL_UNKILLABLE tasks from !sig_kernel_only() signals
  kernel/signal.c: protect the traced SIGNAL_UNKILLABLE tasks from SIGKILL
  x86 / CPU: Always show current CPU frequency in /proc/cpuinfo
  x86 / CPU: Avoid unnecessary IPIs in arch_freq_get_on_cpu()
  fscache: Fix the default for fscache_maybe_release_page()
  sunxi-rsb: Include OF based modalias in device uevent
  drm/i915: Apply Display WA #1183 on skl, kbl, and cfl
  drm/i915: Disable DC states around GMBUS on GLK
  crypto: chelsio - select CRYPTO_GF128MUL
  crypto: pcrypt - fix freeing pcrypt instances
  crypto: chacha20poly1305 - validate the digest size
  crypto: n2 - cure use after free
  efi/capsule-loader: Reinstate virtual capsule mapping
  btrfs: fix refcount_t usage when deleting btrfs_delayed_nodes
  userfaultfd: clear the vma->vm_userfaultfd_ctx if UFFD_EVENT_FORK fails
  mm/sparse.c: wrong allocation for mem_section
  mm/mprotect: add a cond_resched() inside change_pmd_range()
  kernel/acct.c: fix the acct->needcheck check in check_free_space()
  x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN
  x86/alternatives: Add missing '\n' at end of ALTERNATIVE inline asm
  x86/tlb: Drop the _GPL from the cpu_tlbstate export
  x86/events/intel/ds: Use the proper cache flush method for mapping ds buffers
  x86/kaslr: Fix the vaddr_end mess
  x86/mm: Map cpu_entry_area at the same place on 4/5 level
  x86/mm: Set MODULES_END to 0xffffffffff000000
  ANDROID: netfilter: xt_qtaguid: Fix 4.14 compilation
  ANDROID: Squashfs: optimize reading uncompressed data
  ANDROID: Squashfs: implement .readpages()
  ANDROID: Squashfs: replace buffer_head with BIO
  ANDROID: Squashfs: refactor page_actor
  ANDROID: usb: f_fs: Prevent gadget unbind if it is already unbound
  Linux 4.14.12
  rtc: m41t80: remove unneeded checks from m41t80_sqw_set_rate
  rtc: m41t80: avoid i2c read in m41t80_sqw_is_prepared
  rtc: m41t80: avoid i2c read in m41t80_sqw_recalc_rate
  rtc: m41t80: fix m41t80_sqw_round_rate return value
  rtc: m41t80: m41t80_sqw_set_rate should return 0 on success
  Revert "xfrm: Fix stack-out-of-bounds read in xfrm_state_find."
  x86/process: Define cpu_tss_rw in same section as declaration
  x86/pti: Switch to kernel CR3 at early in entry_SYSCALL_compat()
  x86/dumpstack: Print registers for first stack frame
  x86/dumpstack: Fix partial register dumps
  x86/pti: Make sure the user/kernel PTEs match
  x86/cpu, x86/pti: Do not enable PTI on AMD processors
  capabilities: fix buffer overread on very short xattr
  exec: Weaken dumpability for secureexec
  Linux 4.14.11
  tty: fix tty_ldisc_receive_buf() documentation
  n_tty: fix EXTPROC vs ICANON interaction with TIOCINQ (aka FIONREAD)
  x86/ldt: Make LDT pgtable free conditional
  x86/ldt: Plug memory leak in error path
  x86/espfix/64: Fix espfix double-fault handling on 5-level systems
  x86-32: Fix kexec with stack canary (CONFIG_CC_STACKPROTECTOR)
  x86/mm: Remove preempt_disable/enable() from __native_flush_tlb()
  x86/smpboot: Remove stale TLB flush invocations
  nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick()
  staging: android: ion: Fix dma direction for dma_sync_sg_for_cpu/device
  drivers: base: cacheinfo: fix cache type for non-architected system cache
  phy: tegra: fix device-tree node lookups
  binder: fix proc->files use-after-free
  timers: Reinitialize per cpu bases on hotplug
  timers: Invoke timer_start_debug() where it makes sense
  timers: Use deferrable base independent of base::nohz_active
  usb: xhci: Add XHCI_TRUST_TX_LENGTH for Renesas uPD720201
  USB: Fix off by one in type-specific length check of BOS SSP capability
  usb: add RESET_RESUME for ELSA MicroLink 56K
  usb: Add device quirk for Logitech HD Pro Webcam C925e
  USB: serial: option: adding support for YUGA CLM920-NC5
  USB: serial: option: add support for Telit ME910 PID 0x1101
  USB: serial: qcserial: add Sierra Wireless EM7565
  USB: serial: ftdi_sio: add id for Airbus DS P8GR
  USB: chipidea: msm: fix ulpi-node lookup
  usbip: vhci: stop printing kernel pointer addresses in messages
  usbip: stub: stop printing kernel pointer addresses in messages
  usbip: prevent leaking socket pointer address in messages
  usbip: fix usbip bind writing random string after command in match_busid
  sparc64: repair calling incorrect hweight function from stubs
  skbuff: in skb_copy_ubufs unclone before releasing zerocopy
  skbuff: skb_copy_ubufs must release uarg even without user frags
  skbuff: orphan frags before zerocopy clone
  Revert "mlx5: move affinity hints assignments to generic code"
  ipv6: set all.accept_dad to 0 by default
  ipv4: fib: Fix metrics match when deleting a route
  phylink: ensure AN is enabled
  phylink: ensure the PHY interface mode is appropriately set
  bnxt_en: Fix sources of spurious netpoll warnings
  net: sched: fix static key imbalance in case of ingress/clsact_init error
  vxlan: restore dev->mtu setting based on lower device
  net/mlx5: FPGA, return -EINVAL if size is zero
  tcp: refresh tcp_mstamp from timers callbacks
  ipv6: Honor specified parameters in fibmatch lookup
  net: phy: marvell: Limit 88m1101 autoneg errata to 88E1145 as well.
  tcp: fix potential underestimation on rcv_rtt
  mlxsw: spectrum: Disable MAC learning for ovs port
  tipc: fix hanging poll() for stream sockets
  sctp: make sure stream nums can match optlen in sctp_setsockopt_reset_streams
  s390/qeth: fix error handling in checksum cmd callback
  net: dsa: bcm_sf2: Clear IDDQ_GLOBAL_PWR bit for PHY
  sfc: pass valid pointers from efx_enqueue_unwind
  openvswitch: Fix pop_vlan action for double tagged frames
  net/mlx5: Fix error flow in CREATE_QP command
  net/mlx5e: Prevent possible races in VXLAN control flow
  net/mlx5e: Add refcount to VXLAN structure
  net/mlx5e: Fix features check of IPv6 traffic
  net/mlx5e: Fix possible deadlock of VXLAN lock
  net/mlx5: Fix rate limit packet pacing naming and struct
  tcp: invalidate rate samples during SACK reneging
  sock: free skb in skb_complete_tx_timestamp on error
  net: phy: micrel: ksz9031: reconfigure autoneg after phy autoneg workaround
  net: Fix double free and memory corruption in get_net_ns_by_id()
  net: bridge: fix early call to br_stp_change_bridge_id and plug newlink leaks
  ipv4: Fix use-after-free when flushing FIB tables
  ip6_gre: fix device features for ioctl setup
  adding missing rcu_read_unlock in ipxip6_rcv
  sctp: Replace use of sockets_allocated with specified macro.
  net: mvmdio: disable/unprepare clocks in EPROBE_DEFER case
  net: ipv4: fix for a race condition in raw_sendmsg
  s390/qeth: update takeover IPs after configuration change
  s390/qeth: lock IP table while applying takeover changes
  s390/qeth: don't apply takeover changes to RXIP
  s390/qeth: apply takeover changes when mode is toggled
  tcp_bbr: reset long-term bandwidth sampling on loss recovery undo
  tcp_bbr: reset full pipe detection on loss recovery undo
  tg3: Fix rx hang on MTU change with 5717/5719
  tcp md5sig: Use skb's saddr when replying to an incoming segment
  tcp_bbr: record "full bw reached" decision in new full_bw_reached bit
  RDS: Check cmsg_len before dereferencing CMSG_DATA
  ptr_ring: add barriers
  net: reevalulate autoflowlabel setting after sysctl setting
  net: qmi_wwan: add Sierra EM7565 1199:9091
  netlink: Add netns check on taps
  net: igmp: Use correct source address on IGMPv3 reports
  net: fec: unmap the xmit buffer that are not transferred by DMA
  ipv6: mcast: better catch silly mtu values
  ipv4: igmp: guard against silly MTU values
  kbuild: add '-fno-stack-check' to kernel build options
  block: don't let passthrough IO go into .make_request_fn()
  block: fix blk_rq_append_bio
  cpufreq: schedutil: Use idle_calls counter of the remote CPU
  ALSA: hda - Fix missing COEF init for ALC225/295/299
  ALSA: hda - fix headset mic detection issue on a Dell machine
  ALSA: hda - change the location for one mic on a Lenovo machine
  ALSA: hda - Add MIC_NO_PRESENCE fixup for 2 HP machines
  ALSA: hda: Drop useless WARN_ON()
  IB/core: Verify that QP is security enabled in create and destroy
  IB/uverbs: Fix command checking as part of ib_uverbs_ex_modify_qp()
  IB/mlx5: Serialize access to the VMA list
  IB/hfi: Only read capability registers if the capability exists
  gpio: fix "gpio-line-names" property retrieval
  ASoC: tlv320aic31xx: Fix GPIO1 register definition
  ASoC: twl4030: fix child-node lookup
  ASoC: fsl_ssi: AC'97 ops need regmap, clock and cleaning up on failure
  ASoC: da7218: fix fix child-node lookup
  ASoC: wm_adsp: Fix validation of firmware and coeff lengths
  ASoC: codecs: msm8916-wcd: Fix supported formats
  iw_cxgb4: Only validate the MSN for successful completions
  ring-buffer: Do no reuse reader page if still in use
  ring-buffer: Mask out the info bits when returning buffer page length
  x86/ldt: Make the LDT mapping RO
  x86/mm/dump_pagetables: Allow dumping current pagetables
  x86/mm/dump_pagetables: Check user space page table for WX pages
  x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
  x86/mm/pti: Add Kconfig
  x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
  x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
  x86/mm: Use INVPCID for __native_flush_tlb_single()
  x86/mm: Optimize RESTORE_CR3
  x86/mm: Use/Fix PCID to optimize user/kernel switches
  x86/mm: Abstract switching CR3
  x86/mm: Allow flushing for future ASID switches
  x86/pti: Map the vsyscall page if needed
  x86/pti: Put the LDT in its own PGD if PTI is on
  x86/mm/64: Make a full PGD-entry size hole in the memory map
  x86/events/intel/ds: Map debug buffers in cpu_entry_area
  x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
  x86/mm/pti: Map ESPFIX into user space
  x86/mm/pti: Share entry text PMD
  x86/entry: Align entry text section to PMD boundary
  x86/mm/pti: Share cpu_entry_area with user space page tables
  x86/mm/pti: Force entry through trampoline when PTI active
  x86/mm/pti: Add functions to clone kernel PMDs
  x86/mm/pti: Populate user PGD
  x86/mm/pti: Allocate a separate user PGD
  x86/mm/pti: Allow NX poison to be set in p4d/pgd
  x86/mm/pti: Add mapping helper functions
  x86/pti: Add the pti= cmdline option and documentation
  x86/mm/pti: Add infrastructure for page table isolation
  x86/mm/pti: Prepare the x86/entry assembly code for entry/exit CR3 switching
  x86/mm/pti: Disable global pages if PAGE_TABLE_ISOLATION=y
  x86/cpufeatures: Add X86_BUG_CPU_INSECURE
  tracing: Fix crash when it fails to alloc ring buffer
  tracing: Fix possible double free on failure of allocating trace buffer
  tracing: Remove extra zeroing out of the ring buffer page

  Conflicts:
	drivers/staging/android/ion/ion.c
	kernel/time/timer.c

Change-Id: Ia5b16c96ab44e640e2f10ab535c4c672b670cbdc
Signed-off-by: Runmin Wang <runminw@codeaurora.org>
commit 5682ea9f33
Runmin Wang <runminw@codeaurora.org>, 2018-01-11 17:49:12 -08:00
234 files changed, 3981 insertions(+), 1439 deletions(-)


@ -2694,6 +2694,8 @@
steal time is computed, but won't influence scheduler
behaviour
nopti [X86-64] Disable kernel page table isolation
nolapic [X86-32,APIC] Do not enable or use the local APIC.
nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
@ -3262,6 +3264,12 @@
pt. [PARIDE]
See Documentation/blockdev/paride.txt.
pti= [X86_64]
Control user/kernel address space isolation:
on - enable
off - disable
auto - default setting
pty.legacy_count=
[KNL] Number of legacy pty's. Overwrites compiled-in
default number.
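
As a usage illustration of the pti= and nopti options documented above (not part of the patch; the kernel image and root device below are made up), page table isolation can be disabled from the boot loader's kernel command line:

	linux /boot/vmlinuz-4.14.13 root=/dev/sda1 ro pti=off
	linux /boot/vmlinuz-4.14.13 root=/dev/sda1 ro nopti

Both spellings disable PTI; pti=auto (the default) lets the kernel decide based on whether the CPU is flagged with X86_BUG_CPU_MELTDOWN.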


@ -12,7 +12,9 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
vaddr_end for KASLR
fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
@ -29,20 +31,22 @@ Virtual memory map with 5 level page tables:
hole caused by [56:63] sign extension
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
ff90000000000000 - ff91ffffffffffff (=49 bits) hole
ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
... unused hole ...
ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
... unused hole ...
fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
vaddr_end for KASLR
fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
... unused hole ...
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space
ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
@ -66,9 +70,10 @@ memory window (this size is arbitrary, it can be raised later if needed).
The mappings are not part of any other kernel PGD and are only available
during EFI runtime calls.
The module mapping space size changes based on the CONFIG requirements for the
following fixmap section.
Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
physical memory, vmalloc/ioremap space and virtual memory map are randomized.
Their order is preserved but their base will be offset early at boot time.
Be very careful vs. KASLR when changing anything here. The KASLR address
range must not overlap with anything except the KASAN shadow area, which is
correct as KASAN disables KASLR.
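
A small user-space sketch (illustration only, not part of the patch) that recomputes two of the region sizes above from their documented start and end addresses:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* cpu_entry_area mapping: fffffe0000000000 - fffffe7fffffffff */
	uint64_t cea = 0xfffffe7fffffffffULL - 0xfffffe0000000000ULL + 1;
	/* module mapping space:   ffffffffa0000000 - fffffffffeffffff */
	uint64_t mod = 0xfffffffffeffffffULL - 0xffffffffa0000000ULL + 1;

	printf("cpu_entry_area: 2^%d bytes\n", 63 - __builtin_clzll(cea));	/* 39 bits */
	printf("modules:        %llu MB\n", (unsigned long long)(mod >> 20));	/* 1520 MB */
	return 0;
}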


@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
VERSION = 4
PATCHLEVEL = 14
SUBLEVEL = 10
SUBLEVEL = 13
EXTRAVERSION =
NAME = Petit Gorille
@ -817,6 +817,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
# disable invalid "can't wrap" optimizations for signed / pointers
KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
# Make sure -fstack-check isn't enabled (like gentoo apparently did)
KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,)
# conserve stack if available
KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)


@ -668,6 +668,7 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count)
return 0;
__asm__ __volatile__(
" mov lp_count, %5 \n"
" lp 3f \n"
"1: ldb.ab %3, [%2, 1] \n"
" breq.d %3, 0, 3f \n"
@ -684,8 +685,8 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count)
" .word 1b, 4b \n"
" .previous \n"
: "+r"(res), "+r"(dst), "+r"(src), "=r"(val)
: "g"(-EFAULT), "l"(count)
: "memory");
: "g"(-EFAULT), "r"(count)
: "lp_count", "lp_start", "lp_end", "memory");
return res;
}


@ -12,6 +12,7 @@
for the semaphore. */
#define __PA_LDCW_ALIGNMENT 16
#define __PA_LDCW_ALIGN_ORDER 4
#define __ldcw_align(a) ({ \
unsigned long __ret = (unsigned long) &(a)->lock[0]; \
__ret = (__ret + __PA_LDCW_ALIGNMENT - 1) \
@ -29,6 +30,7 @@
ldcd). */
#define __PA_LDCW_ALIGNMENT 4
#define __PA_LDCW_ALIGN_ORDER 2
#define __ldcw_align(a) (&(a)->slock)
#define __LDCW "ldcw,co"


@ -35,6 +35,7 @@
#include <asm/pgtable.h>
#include <asm/signal.h>
#include <asm/unistd.h>
#include <asm/ldcw.h>
#include <asm/thread_info.h>
#include <linux/linkage.h>
@ -46,6 +47,14 @@
#endif
.import pa_tlb_lock,data
.macro load_pa_tlb_lock reg
#if __PA_LDCW_ALIGNMENT > 4
load32 PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg
depi 0,31,__PA_LDCW_ALIGN_ORDER, \reg
#else
load32 PA(pa_tlb_lock), \reg
#endif
.endm
/* space_to_prot macro creates a prot id from a space id */
@ -457,7 +466,7 @@
.macro tlb_lock spc,ptp,pte,tmp,tmp1,fault
#ifdef CONFIG_SMP
cmpib,COND(=),n 0,\spc,2f
load32 PA(pa_tlb_lock),\tmp
load_pa_tlb_lock \tmp
1: LDCW 0(\tmp),\tmp1
cmpib,COND(=) 0,\tmp1,1b
nop
@ -480,7 +489,7 @@
/* Release pa_tlb_lock lock. */
.macro tlb_unlock1 spc,tmp
#ifdef CONFIG_SMP
load32 PA(pa_tlb_lock),\tmp
load_pa_tlb_lock \tmp
tlb_unlock0 \spc,\tmp
#endif
.endm


@ -36,6 +36,7 @@
#include <asm/assembly.h>
#include <asm/pgtable.h>
#include <asm/cache.h>
#include <asm/ldcw.h>
#include <linux/linkage.h>
.text
@ -333,8 +334,12 @@ ENDPROC_CFI(flush_data_cache_local)
.macro tlb_lock la,flags,tmp
#ifdef CONFIG_SMP
ldil L%pa_tlb_lock,%r1
ldo R%pa_tlb_lock(%r1),\la
#if __PA_LDCW_ALIGNMENT > 4
load32 pa_tlb_lock + __PA_LDCW_ALIGNMENT-1, \la
depi 0,31,__PA_LDCW_ALIGN_ORDER, \la
#else
load32 pa_tlb_lock, \la
#endif
rsm PSW_SM_I,\flags
1: LDCW 0(\la),\tmp
cmpib,<>,n 0,\tmp,3f


@ -39,6 +39,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/ptrace.h>
@ -183,6 +184,44 @@ int dump_task_fpu (struct task_struct *tsk, elf_fpregset_t *r)
return 1;
}
/*
* Idle thread support
*
* Detect when running on QEMU with SeaBIOS PDC Firmware and let
* QEMU idle the host too.
*/
int running_on_qemu __read_mostly;
void __cpuidle arch_cpu_idle_dead(void)
{
/* nop on real hardware, qemu will offline CPU. */
asm volatile("or %%r31,%%r31,%%r31\n":::);
}
void __cpuidle arch_cpu_idle(void)
{
local_irq_enable();
/* nop on real hardware, qemu will idle sleep. */
asm volatile("or %%r10,%%r10,%%r10\n":::);
}
static int __init parisc_idle_init(void)
{
const char *marker;
/* check QEMU/SeaBIOS marker in PAGE0 */
marker = (char *) &PAGE0->pad0;
running_on_qemu = (memcmp(marker, "SeaBIOS", 8) == 0);
if (!running_on_qemu)
cpu_idle_poll_ctrl(1);
return 0;
}
arch_initcall(parisc_idle_init);
/*
* Copy architecture-specific thread state
*/


@ -145,6 +145,11 @@ static noinline int bad_area(struct pt_regs *regs, unsigned long address)
return __bad_area(regs, address, SEGV_MAPERR);
}
static noinline int bad_access(struct pt_regs *regs, unsigned long address)
{
return __bad_area(regs, address, SEGV_ACCERR);
}
static int do_sigbus(struct pt_regs *regs, unsigned long address,
unsigned int fault)
{
@ -490,7 +495,7 @@ retry:
good_area:
if (unlikely(access_error(is_write, is_exec, vma)))
return bad_area(regs, address);
return bad_access(regs, address);
/*
* If for any reason at all we couldn't handle the fault,


@ -794,11 +794,12 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
if (kvm->arch.use_cmma) {
/*
* Get the last slot. They should be sorted by base_gfn, so the
* last slot is also the one at the end of the address space.
* We have verified above that at least one slot is present.
* Get the first slot. They are reverse sorted by base_gfn, so
* the first slot is also the one at the end of the address
* space. We have verified above that at least one slot is
* present.
*/
ms = slots->memslots + slots->used_slots - 1;
ms = slots->memslots;
/* round up so we only use full longs */
ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG);
/* allocate enough bytes to store all the bits */


@ -1009,7 +1009,7 @@ static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
cbrlo[entries] = gfn << PAGE_SHIFT;
}
if (orc) {
if (orc && gfn < ms->bitmap_size) {
/* increment only if we are really flipping the bit to 1 */
if (!test_and_set_bit(gfn, ms->pgste_bitmap))
atomic64_inc(&ms->dirty_pages);


@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32)
.previous
ENTRY(__arch_hweight64)
sethi %hi(__sw_hweight16), %g1
jmpl %g1 + %lo(__sw_hweight16), %g0
sethi %hi(__sw_hweight64), %g1
jmpl %g1 + %lo(__sw_hweight64), %g0
nop
ENDPROC(__arch_hweight64)
EXPORT_SYMBOL(__arch_hweight64)


@ -23,6 +23,9 @@
*/
#undef CONFIG_AMD_MEM_ENCRYPT
/* No PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_PAGE_TABLE_ISOLATION
#include "misc.h"
/* These actually do the work of building the kernel identity maps. */


@ -1,6 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/jump_label.h>
#include <asm/unwind_hints.h>
#include <asm/cpufeatures.h>
#include <asm/page_types.h>
#include <asm/percpu.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
/*
@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
* halves:
*/
#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
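
A minimal user-space sketch of the bit arithmetic behind these two masks (illustration only, not part of the patch; PAGE_SHIFT == 12 is assumed, and X86_CR3_PTI_SWITCH_BIT == 11 matches the processor-flags.h hunk further down):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT			12	/* assumption: 4k pages */
#define X86_CR3_PTI_SWITCH_BIT		11
#define PTI_SWITCH_PGTABLES_MASK	(1 << PAGE_SHIFT)
#define PTI_SWITCH_MASK	(PTI_SWITCH_PGTABLES_MASK | (1 << X86_CR3_PTI_SWITCH_BIT))

int main(void)
{
	/* Made-up physical address of an 8k-aligned, order-1 PGD pair. */
	uint64_t kernel_cr3 = 0x1234000ULL;
	/* SWITCH_TO_USER_CR3: select the upper 4k half and the user PCID bit. */
	uint64_t user_cr3 = kernel_cr3 | PTI_SWITCH_MASK;
	/* ADJUST_KERNEL_CR3: clear both bits again. */
	uint64_t back = user_cr3 & ~(uint64_t)PTI_SWITCH_MASK;

	printf("kernel CR3 %#llx -> user CR3 %#llx -> kernel CR3 %#llx\n",
	       (unsigned long long)kernel_cr3,
	       (unsigned long long)user_cr3,
	       (unsigned long long)back);
	return 0;
}
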
.macro SET_NOFLUSH_BIT reg:req
bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
.endm
.macro ADJUST_KERNEL_CR3 reg:req
ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
andq $(~PTI_SWITCH_MASK), \reg
.endm
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
mov %cr3, \scratch_reg
ADJUST_KERNEL_CR3 \scratch_reg
mov \scratch_reg, %cr3
.Lend_\@:
.endm
#define THIS_CPU_user_pcid_flush_mask \
PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
mov %cr3, \scratch_reg
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
/*
* Test if the ASID needs a flush.
*/
movq \scratch_reg, \scratch_reg2
andq $(0x7FF), \scratch_reg /* mask ASID */
bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
jnc .Lnoflush_\@
/* Flush needed, clear the bit */
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
movq \scratch_reg2, \scratch_reg
jmp .Lwrcr3_\@
.Lnoflush_\@:
movq \scratch_reg2, \scratch_reg
SET_NOFLUSH_BIT \scratch_reg
.Lwrcr3_\@:
/* Flip the PGD and ASID to the user version */
orq $(PTI_SWITCH_MASK), \scratch_reg
mov \scratch_reg, %cr3
.Lend_\@:
.endm
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
pushq %rax
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
popq %rax
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
movq %cr3, \scratch_reg
movq \scratch_reg, \save_reg
/*
* Is the "switch mask" all zero? That means that both of
* these are zero:
*
* 1. The user/kernel PCID bit, and
* 2. The user/kernel "bit" that points CR3 to the
* bottom half of the 8k PGD
*
* That indicates a kernel CR3 value, not a user CR3.
*/
testq $(PTI_SWITCH_MASK), \scratch_reg
jz .Ldone_\@
ADJUST_KERNEL_CR3 \scratch_reg
movq \scratch_reg, %cr3
.Ldone_\@:
.endm
.macro RESTORE_CR3 scratch_reg:req save_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
/*
* KERNEL pages can always resume with NOFLUSH as we do
* explicit flushes.
*/
bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
jnc .Lnoflush_\@
/*
* Check if there's a pending flush for the user ASID we're
* about to set.
*/
movq \save_reg, \scratch_reg
andq $(0x7FF), \scratch_reg
bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
jnc .Lnoflush_\@
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
jmp .Lwrcr3_\@
.Lnoflush_\@:
SET_NOFLUSH_BIT \save_reg
.Lwrcr3_\@:
/*
* The CR3 write could be avoided when not changing its value,
* but would require a CR3 read *and* a scratch register.
*/
movq \save_reg, %cr3
.Lend_\@:
.endm
#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
.endm
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
.endm
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
.endm
.macro RESTORE_CR3 scratch_reg:req save_reg:req
.endm
#endif
#endif /* CONFIG_X86_64 */
/*


@ -23,7 +23,6 @@
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
@ -40,6 +39,8 @@
#include <asm/frame.h>
#include <linux/err.h>
#include "calling.h"
.code64
.section .entry.text, "ax"
@ -164,6 +165,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
/* Stash the user RSP. */
movq %rsp, RSP_SCRATCH
/* Note: using %rsp as a scratch reg. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
/* Load the top of the task stack into RSP */
movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
@ -203,6 +207,10 @@ ENTRY(entry_SYSCALL_64)
*/
swapgs
/*
* This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
* is not required to switch CR3.
*/
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@ -399,6 +407,7 @@ syscall_return_via_sysret:
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
popq %rsp
@ -736,6 +745,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
* We can do future final exit work right here.
*/
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
/* Restore RDI. */
popq %rdi
SWAPGS
@ -818,7 +829,9 @@ native_irq_return_ldt:
*/
pushq %rdi /* Stash user RDI */
SWAPGS
SWAPGS /* to kernel GS */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* user RAX */
movq (1*8)(%rsp), %rax /* user RIP */
@ -834,7 +847,6 @@ native_irq_return_ldt:
/* Now RAX == RSP. */
andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
popq %rdi /* Restore user RDI */
/*
* espfix_stack[31:16] == 0. The page tables are set up such that
@ -845,7 +857,11 @@ native_irq_return_ldt:
* still points to an RO alias of the ESPFIX stack.
*/
orq PER_CPU_VAR(espfix_stack), %rax
SWAPGS
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
SWAPGS /* to user GS */
popq %rdi /* Restore user RDI */
movq %rax, %rsp
UNWIND_HINT_IRET_REGS offset=8
@ -945,6 +961,8 @@ ENTRY(switch_to_thread_stack)
UNWIND_HINT_FUNC
pushq %rdi
/* Need to switch before accessing the thread stack. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
@ -1244,7 +1262,11 @@ ENTRY(paranoid_entry)
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
1: ret
1:
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
ret
END(paranoid_entry)
/*
@ -1266,6 +1288,7 @@ ENTRY(paranoid_exit)
testl %ebx, %ebx /* swapgs needed? */
jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
SWAPGS_UNSAFE_STACK
jmp .Lparanoid_exit_restore
.Lparanoid_exit_no_swapgs:
@ -1293,6 +1316,8 @@ ENTRY(error_entry)
* from user mode due to an IRET fault.
*/
SWAPGS
/* We have user CR3. Change to kernel CR3. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
.Lerror_entry_from_usermode_after_swapgs:
/* Put us onto the real thread stack. */
@ -1339,6 +1364,7 @@ ENTRY(error_entry)
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
jmp .Lerror_entry_done
.Lbstep_iret:
@ -1348,10 +1374,11 @@ ENTRY(error_entry)
.Lerror_bad_iret:
/*
* We came from an IRET to user mode, so we have user gsbase.
* Switch to kernel gsbase:
* We came from an IRET to user mode, so we have user
* gsbase and CR3. Switch to kernel gsbase and CR3:
*/
SWAPGS
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
/*
* Pretend that the exception came from user mode: set up pt_regs
@ -1383,6 +1410,10 @@ END(error_exit)
/*
* Runs on exception stack. Xen PV does not go through this path at all,
* so we can use real assembly here.
*
* Registers:
* %r14: Used to save/restore the CR3 of the interrupted context
* when PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
ENTRY(nmi)
UNWIND_HINT_IRET_REGS
@ -1446,6 +1477,7 @@ ENTRY(nmi)
swapgs
cld
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
@ -1698,6 +1730,8 @@ end_repeat_nmi:
movq $-1, %rsi
call do_nmi
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:


@ -49,6 +49,10 @@
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS
/* We are about to clobber %rsp anyway, clobbering here is OK */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@ -186,8 +190,13 @@ ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
swapgs
/* Stash user ESP and switch to the kernel stack. */
/* Stash user ESP */
movl %esp, %r8d
/* Use %rsp as scratch reg. User ESP is stashed in r8 */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
/* Switch to the kernel stack */
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
@ -256,10 +265,22 @@ sysret32_from_system_call:
* when the system call started, which is already known to user
* code. We zero R8-R10 to avoid info leaks.
*/
movq RSP-ORIG_RAX(%rsp), %rsp
/*
* The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
* on the process stack which is not mapped to userspace and
* not readable after we SWITCH_TO_USER_CR3. Delay the CR3
* switch until after after the last reference to the process
* stack.
*
* %r8/%r9 are zeroed before the sysret, thus safe to clobber.
*/
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
xorq %r8, %r8
xorq %r9, %r9
xorq %r10, %r10
movq RSP-ORIG_RAX(%rsp), %rsp
swapgs
sysretl
END(entry_SYSCALL_compat)


@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
* vsyscalls but leave the page not present. If so, we skip calling
* this.
*/
static void __init set_vsyscall_pgtable_user_bits(void)
void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset_k(VSYSCALL_ADDR);
pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
p4d = p4d_offset(pgd, VSYSCALL_ADDR);
#if CONFIG_PGTABLE_LEVELS >= 5
@ -373,7 +373,7 @@ void __init map_vsyscall(void)
vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR);
set_vsyscall_pgtable_user_bits();
set_vsyscall_pgtable_user_bits(swapper_pg_dir);
}
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=


@ -3,16 +3,19 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <asm/cpu_entry_area.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
#include "../perf_event.h"
/* Waste a full page so it can be mapped into the cpu_entry_area */
DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_FIXUP_SIZE PAGE_SIZE
/*
@ -279,17 +282,67 @@ void fini_debug_store_on_cpu(int cpu)
static DEFINE_PER_CPU(void *, insn_buffer);
static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
{
unsigned long start = (unsigned long)cea;
phys_addr_t pa;
size_t msz = 0;
pa = virt_to_phys(addr);
preempt_disable();
for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
cea_set_pte(cea, pa, prot);
/*
* This is a cross-CPU update of the cpu_entry_area, we must shoot down
* all TLB entries for it.
*/
flush_tlb_kernel_range(start, start + size);
preempt_enable();
}
static void ds_clear_cea(void *cea, size_t size)
{
unsigned long start = (unsigned long)cea;
size_t msz = 0;
preempt_disable();
for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
cea_set_pte(cea, 0, PAGE_NONE);
flush_tlb_kernel_range(start, start + size);
preempt_enable();
}
static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
{
unsigned int order = get_order(size);
int node = cpu_to_node(cpu);
struct page *page;
page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
return page ? page_address(page) : NULL;
}
static void dsfree_pages(const void *buffer, size_t size)
{
if (buffer)
free_pages((unsigned long)buffer, get_order(size));
}
static int alloc_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
int node = cpu_to_node(cpu);
int max;
void *buffer, *ibuffer;
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
size_t bsiz = x86_pmu.pebs_buffer_size;
int max, node = cpu_to_node(cpu);
void *buffer, *ibuffer, *cea;
if (!x86_pmu.pebs)
return 0;
buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
if (unlikely(!buffer))
return -ENOMEM;
@ -300,25 +353,27 @@ static int alloc_pebs_buffer(int cpu)
if (x86_pmu.intel_cap.pebs_format < 2) {
ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
if (!ibuffer) {
kfree(buffer);
dsfree_pages(buffer, bsiz);
return -ENOMEM;
}
per_cpu(insn_buffer, cpu) = ibuffer;
}
max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
ds->pebs_buffer_base = (u64)(unsigned long)buffer;
hwev->ds_pebs_vaddr = buffer;
/* Update the cpu entry area mapping */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
ds->pebs_buffer_base = (unsigned long) cea;
ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
ds->pebs_index = ds->pebs_buffer_base;
ds->pebs_absolute_maximum = ds->pebs_buffer_base +
max * x86_pmu.pebs_record_size;
max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
return 0;
}
static void release_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
void *cea;
if (!ds || !x86_pmu.pebs)
return;
@ -326,73 +381,70 @@ static void release_pebs_buffer(int cpu)
kfree(per_cpu(insn_buffer, cpu));
per_cpu(insn_buffer, cpu) = NULL;
kfree((void *)(unsigned long)ds->pebs_buffer_base);
/* Clear the fixmap */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
ds->pebs_buffer_base = 0;
dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
hwev->ds_pebs_vaddr = NULL;
}
static int alloc_bts_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
int node = cpu_to_node(cpu);
int max, thresh;
void *buffer;
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
void *buffer, *cea;
int max;
if (!x86_pmu.bts)
return 0;
buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
if (unlikely(!buffer)) {
WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
}
max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
thresh = max / 16;
ds->bts_buffer_base = (u64)(unsigned long)buffer;
hwev->ds_bts_vaddr = buffer;
/* Update the fixmap */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
ds->bts_buffer_base = (unsigned long) cea;
ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
ds->bts_index = ds->bts_buffer_base;
ds->bts_absolute_maximum = ds->bts_buffer_base +
max * BTS_RECORD_SIZE;
ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
thresh * BTS_RECORD_SIZE;
max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
ds->bts_absolute_maximum = ds->bts_buffer_base + max;
ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
return 0;
}
static void release_bts_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
void *cea;
if (!ds || !x86_pmu.bts)
return;
kfree((void *)(unsigned long)ds->bts_buffer_base);
/* Clear the fixmap */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
ds_clear_cea(cea, BTS_BUFFER_SIZE);
ds->bts_buffer_base = 0;
dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
hwev->ds_bts_vaddr = NULL;
}
static int alloc_ds_buffer(int cpu)
{
int node = cpu_to_node(cpu);
struct debug_store *ds;
ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
if (unlikely(!ds))
return -ENOMEM;
struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
memset(ds, 0, sizeof(*ds));
per_cpu(cpu_hw_events, cpu).ds = ds;
return 0;
}
static void release_ds_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
if (!ds)
return;
per_cpu(cpu_hw_events, cpu).ds = NULL;
kfree(ds);
}
void release_ds_buffers(void)


@ -14,6 +14,8 @@
#include <linux/perf_event.h>
#include <asm/intel_ds.h>
/* To enable MSR tracing please use the generic trace points. */
/*
@ -77,8 +79,6 @@ struct amd_nb {
struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS 8
#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
/*
@ -95,23 +95,6 @@ struct amd_nb {
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
/*
* A debug store configuration.
*
* We only support architectures that use 64bit fields.
*/
struct debug_store {
u64 bts_buffer_base;
u64 bts_index;
u64 bts_absolute_maximum;
u64 bts_interrupt_threshold;
u64 pebs_buffer_base;
u64 pebs_index;
u64 pebs_absolute_maximum;
u64 pebs_interrupt_threshold;
u64 pebs_event_reset[MAX_PEBS_EVENTS];
};
#define PEBS_REGS \
(PERF_REG_X86_AX | \
PERF_REG_X86_BX | \
@ -216,6 +199,8 @@ struct cpu_hw_events {
* Intel DebugStore bits
*/
struct debug_store *ds;
void *ds_pebs_vaddr;
void *ds_bts_vaddr;
u64 pebs_enabled;
int n_pebs;
int n_large_pebs;


@ -140,7 +140,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
".popsection"
".popsection\n"
#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
OLDINSTR_2(oldinstr, 1, 2) \
@ -151,7 +151,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
".popsection"
".popsection\n"
/*
* Alternative instructions for different CPU types or capabilities.


@ -5,6 +5,7 @@
#include <linux/percpu-defs.h>
#include <asm/processor.h>
#include <asm/intel_ds.h>
/*
* cpu_entry_area is a percpu region that contains things needed by the CPU
@ -40,6 +41,18 @@ struct cpu_entry_area {
*/
char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
#endif
#ifdef CONFIG_CPU_SUP_INTEL
/*
* Per CPU debug store for Intel performance monitoring. Wastes a
* full page at the moment.
*/
struct debug_store cpu_debug_store;
/*
* The actual PEBS/BTS buffers must be mapped to user space
* Reserve enough fixmap PTEs.
*/
struct debug_store_buffers cpu_debug_buffers;
#endif
};
#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))


@ -197,11 +197,12 @@
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
@ -340,5 +341,6 @@
#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
#endif /* _ASM_X86_CPUFEATURES_H */


@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
desc->type = (info->read_exec_only ^ 1) << 1;
desc->type |= info->contents << 2;
/* Set the ACCESS bit so it can be mapped RO */
desc->type |= 1;
desc->s = 1;
desc->dpl = 0x3;


@ -44,6 +44,12 @@
# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
#endif
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define DISABLE_PTI 0
#else
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
/*
* Make sure to add features to the correct mask
*/
@ -54,7 +60,7 @@
#define DISABLED_MASK4 (DISABLE_PCID)
#define DISABLED_MASK5 0
#define DISABLED_MASK6 0
#define DISABLED_MASK7 0
#define DISABLED_MASK7 (DISABLE_PTI)
#define DISABLED_MASK8 0
#define DISABLED_MASK9 (DISABLE_MPX)
#define DISABLED_MASK10 0
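
A quick check of the bit placement (illustration only, not part of the patch), using the X86_FEATURE_PTI definition from the cpufeatures.h hunk above:

#include <stdio.h>

#define X86_FEATURE_PTI	(7*32 + 11)			/* word 7, bit 11 */
#define DISABLE_PTI	(1 << (X86_FEATURE_PTI & 31))	/* CONFIG_PAGE_TABLE_ISOLATION=n */

int main(void)
{
	/* With PTI compiled out, DISABLED_MASK7 masks out exactly this bit. */
	printf("X86_FEATURE_PTI = %d (word %d, bit %d), DISABLE_PTI = %#x\n",
	       X86_FEATURE_PTI, X86_FEATURE_PTI / 32, X86_FEATURE_PTI & 31,
	       (unsigned int)DISABLE_PTI);
	return 0;
}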


@ -0,0 +1,36 @@
#ifndef _ASM_INTEL_DS_H
#define _ASM_INTEL_DS_H
#include <linux/percpu-defs.h>
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS 8
/*
* A debug store configuration.
*
* We only support architectures that use 64bit fields.
*/
struct debug_store {
u64 bts_buffer_base;
u64 bts_index;
u64 bts_absolute_maximum;
u64 bts_interrupt_threshold;
u64 pebs_buffer_base;
u64 pebs_index;
u64 pebs_absolute_maximum;
u64 pebs_interrupt_threshold;
u64 pebs_event_reset[MAX_PEBS_EVENTS];
} __aligned(PAGE_SIZE);
DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
struct debug_store_buffers {
char bts_buffer[BTS_BUFFER_SIZE];
char pebs_buffer[PEBS_BUFFER_SIZE];
};
#endif
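
A user-space sketch (illustration only, not part of the patch) of why the cpu_entry_area comment above calls this a wasted page: the PAGE_SIZE alignment pads 128 bytes of payload out to a full 4k:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096
#define MAX_PEBS_EVENTS	8

struct debug_store {
	uint64_t bts_buffer_base;
	uint64_t bts_index;
	uint64_t bts_absolute_maximum;
	uint64_t bts_interrupt_threshold;
	uint64_t pebs_buffer_base;
	uint64_t pebs_index;
	uint64_t pebs_absolute_maximum;
	uint64_t pebs_interrupt_threshold;
	uint64_t pebs_event_reset[MAX_PEBS_EVENTS];
} __attribute__((aligned(PAGE_SIZE)));

int main(void)
{
	printf("payload: %zu bytes, sizeof: %zu bytes\n",
	       16 * sizeof(uint64_t), sizeof(struct debug_store));
	return 0;
}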


@ -50,10 +50,33 @@ struct ldt_struct {
* call gates. On native, we could merge the ldt_struct and LDT
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries;
unsigned int nr_entries;
struct desc_struct *entries;
unsigned int nr_entries;
/*
* If PTI is in use, then the entries array is not mapped while we're
* in user mode. The whole array will be aliased at the addressed
* given by ldt_slot_va(slot). We use two slots so that we can allocate
* and map, and enable a new LDT without invalidating the mapping
* of an older, still-in-use LDT.
*
* slot will be -1 if this LDT doesn't have an alias mapping.
*/
int slot;
};
/* This is a multiple of PAGE_SIZE. */
#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
static inline void *ldt_slot_va(int slot)
{
#ifdef CONFIG_X86_64
return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
#else
BUG();
#endif
}
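
A user-space sketch of the alias address computation (illustration only, not part of the patch; LDT_ENTRIES == 8192 and LDT_ENTRY_SIZE == 8 are assumed here, while PGDIR_SHIFT == 39 and LDT_PGD_ENTRY == -3 correspond to the 4-level values in the pgtable_64_types.h hunk further down):

#include <stdio.h>
#include <stdint.h>

#define PGDIR_SHIFT	39			/* assumption: 4-level paging */
#define LDT_PGD_ENTRY	(-3LL)
#define LDT_BASE_ADDR	((uint64_t)LDT_PGD_ENTRY << PGDIR_SHIFT)
#define LDT_ENTRIES	8192			/* assumption */
#define LDT_ENTRY_SIZE	8			/* assumption */
#define LDT_SLOT_STRIDE	(LDT_ENTRIES * LDT_ENTRY_SIZE)

static uint64_t ldt_slot_addr(int slot)
{
	return LDT_BASE_ADDR + (uint64_t)LDT_SLOT_STRIDE * slot;
}

int main(void)
{
	/* The two alternating alias slots described in the comment above;
	 * slot 0 lands at the start of the "LDT remap for PTI" region from
	 * the mm.txt hunk earlier in this diff. */
	printf("slot 0: %#llx\n", (unsigned long long)ldt_slot_addr(0));
	printf("slot 1: %#llx\n", (unsigned long long)ldt_slot_addr(1));
	return 0;
}
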
/*
* Used for LDT copy/destruction.
*/
@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
}
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
void ldt_arch_exit_mmap(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline void init_new_context_ldt(struct mm_struct *mm) { }
static inline int ldt_dup_context(struct mm_struct *oldmm,
@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
{
return 0;
}
static inline void destroy_context_ldt(struct mm_struct *mm) {}
static inline void destroy_context_ldt(struct mm_struct *mm) { }
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif
static inline void load_mm_ldt(struct mm_struct *mm)
@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
* that we can see.
*/
if (unlikely(ldt))
set_ldt(ldt->entries, ldt->nr_entries);
else
if (unlikely(ldt)) {
if (static_cpu_has(X86_FEATURE_PTI)) {
if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
/*
* Whoops -- either the new LDT isn't mapped
* (if slot == -1) or is mapped into a bogus
* slot (if slot > 1).
*/
clear_LDT();
return;
}
/*
* If page table isolation is enabled, ldt->entries
* will not be mapped in the userspace pagetables.
* Tell the CPU to access the LDT through the alias
* at ldt_slot_va(ldt->slot).
*/
set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
} else {
set_ldt(ldt->entries, ldt->nr_entries);
}
} else {
clear_LDT();
}
#else
clear_LDT();
#endif
@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
static inline void arch_exit_mmap(struct mm_struct *mm)
{
paravirt_arch_exit_mmap(mm);
ldt_arch_exit_mmap(mm);
}
#ifdef CONFIG_X86_64


@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
*/
extern gfp_t __userpte_alloc_gfp;
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Instead of one PGD, we acquire two PGDs. Being order-1, it is
* both 8k in size and 8k-aligned. That lets us just flip bit 12
* in a pointer to swap between the two 4k halves.
*/
#define PGD_ALLOCATION_ORDER 1
#else
#define PGD_ALLOCATION_ORDER 0
#endif
/*
* Allocate and free page tables.
*/


@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
void ptdump_walk_pgd_level_checkwx(void);
#ifdef CONFIG_DEBUG_WX
@ -846,7 +847,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
static inline int p4d_bad(p4d_t p4d)
{
return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
ignore_flags |= _PAGE_NX;
return (p4d_flags(p4d) & ~ignore_flags) != 0;
}
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
@ -880,7 +886,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
static inline int pgd_bad(pgd_t pgd)
{
return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
unsigned long ignore_flags = _PAGE_USER;
if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
ignore_flags |= _PAGE_NX;
return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
@ -909,7 +920,11 @@ static inline int pgd_none(pgd_t pgd)
* pgd_offset() returns a (pgd_t *)
* pgd_index() is used get the offset into the pgd page's array of pgd_t's;
*/
#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
/*
* a shortcut to get a pgd_t in a given mm
*/
#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
/*
* a shortcut which implies the use of the kernel's pgd, instead
* of a process's
@ -1111,7 +1126,14 @@ static inline int pud_write(pud_t pud)
*/
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_PAGE_TABLE_ISOLATION
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/* Clone the user space pgd as well */
memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
count * sizeof(pgd_t));
#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)


@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
#endif
}
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
* (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
* the user one is in the last 4k. To switch between them, you
* just need to flip the 12th bit in their addresses.
*/
#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
/*
* This generates better code than the inline assembly in
* __set_bit().
*/
static inline void *ptr_set_bit(void *ptr, int bit)
{
unsigned long __ptr = (unsigned long)ptr;
__ptr |= BIT(bit);
return (void *)__ptr;
}
static inline void *ptr_clear_bit(void *ptr, int bit)
{
unsigned long __ptr = (unsigned long)ptr;
__ptr &= ~BIT(bit);
return (void *)__ptr;
}
static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
{
return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
}
static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
{
return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
}
static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
{
return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
}
static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
{
return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
/*
* Page table pages are page-aligned. The lower half of the top
* level is used for userspace and the top half for the kernel.
*
* Returns true for parts of the PGD that map userspace and
* false for the parts that map the kernel.
*/
static inline bool pgdp_maps_userspace(void *__ptr)
{
unsigned long ptr = (unsigned long)__ptr;
return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
}
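
A user-space sketch of the same lower-half check (illustration only, not part of the patch; the PGD page address and example virtual addresses are made up, and PGDIR_SHIFT == 39 with 8-byte entries is assumed for 4-level paging):

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PGDIR_SHIFT	39	/* assumption: 4-level paging */

/* Same check as pgdp_maps_userspace() above. */
static bool maps_userspace(unsigned long ptr)
{
	return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
}

int main(void)
{
	unsigned long pgd_page = 0x1000000;		/* made-up 4k-aligned PGD page */
	uint64_t user_va = 0x00007f0000000000ULL;	/* typical mmap address */
	uint64_t kern_va = 0xffffffff81000000ULL;	/* typical kernel text address */

	/* pgd_index() is (va >> PGDIR_SHIFT) & 511; each entry is 8 bytes. */
	unsigned long user_slot = pgd_page + ((user_va >> PGDIR_SHIFT) & 511) * 8;
	unsigned long kern_slot = pgd_page + ((kern_va >> PGDIR_SHIFT) & 511) * 8;

	printf("user VA   -> entry offset %#lx -> user half: %d\n",
	       user_slot & ~PAGE_MASK, maps_userspace(user_slot));
	printf("kernel VA -> entry offset %#lx -> user half: %d\n",
	       kern_slot & ~PAGE_MASK, maps_userspace(kern_slot));
	return 0;
}
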
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
/*
* Take a PGD location (pgdp) and a pgd value that needs to be set there.
* Populates the user and returns the resulting PGD that must be set in
* the kernel copy of the page tables.
*/
static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
if (!static_cpu_has(X86_FEATURE_PTI))
return pgd;
return __pti_set_user_pgd(pgdp, pgd);
}
#else
static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
return pgd;
}
#endif
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
#else
*p4dp = p4d;
#endif
}
static inline void native_p4d_clear(p4d_t *p4d)
@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
*pgdp = pti_set_user_pgd(pgdp, pgd);
#else
*pgdp = pgd;
#endif
}
static inline void native_pgd_clear(pgd_t *pgd)


@ -75,17 +75,27 @@ typedef struct { pteval_t pte; } pte_t;
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
/*
* See Documentation/x86/x86_64/mm.txt for a description of the memory map.
*
* Be very careful vs. KASLR when changing anything here. The KASLR address
* range must not overlap with anything except the KASAN shadow area, which
* is correct as KASAN disables KASLR.
*/
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
#ifdef CONFIG_X86_5LEVEL
# define VMALLOC_SIZE_TB _AC(16384, UL)
# define __VMALLOC_BASE _AC(0xff92000000000000, UL)
# define VMALLOC_SIZE_TB _AC(12800, UL)
# define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
# define LDT_PGD_ENTRY _AC(-112, UL)
# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#else
# define VMALLOC_SIZE_TB _AC(32, UL)
# define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
# define LDT_PGD_ENTRY _AC(-3, UL)
# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#endif
#ifdef CONFIG_RANDOMIZE_MEMORY
@ -100,13 +110,13 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */
#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
#define CPU_ENTRY_AREA_PGD _AC(-3, UL)
#define CPU_ENTRY_AREA_PGD _AC(-4, UL)
#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT)
#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))


@ -38,6 +38,11 @@
#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
#define CR3_PCID_MASK 0xFFFull
#define CR3_NOFLUSH BIT_ULL(63)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define X86_CR3_PTI_SWITCH_BIT 11
#endif
#else
/*
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save


@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
#else
/*
* User space process size. 47bits minus one guard page. The guard
* page is necessary on Intel CPUs: if a SYSCALL instruction is at
* the highest possible canonical userspace address, then that
* syscall will enter the kernel with a non-canonical return
* address, and SYSRET will explode dangerously. We avoid this
* particular problem by preventing anything from being mapped
* at the maximum canonical address.
* User space process size. This is the first address outside the user range.
* There are a few constraints that determine this:
*
* On Intel CPUs, if a SYSCALL instruction is at the highest canonical
* address, then that syscall will enter the kernel with a
* non-canonical return address, and SYSRET will explode dangerously.
* We avoid this particular problem by preventing anything executable
* from being mapped at the maximum canonical address.
*
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the
* CPUs malfunction if they execute code from the highest canonical page.
* They'll speculate right off the end of the canonical space, and
* bad things happen. This is worked around in the same way as the
* Intel problem.
*
* With page table isolation enabled, we map the LDT in ... [stay tuned]
*/
#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
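
A quick worked value for the define above (illustration only, not part of the patch; __VIRTUAL_MASK_SHIFT == 47 for 4-level paging is an assumption here):

#include <stdio.h>
#include <stdint.h>

#define __VIRTUAL_MASK_SHIFT	47	/* assumption: 4-level paging */
#define PAGE_SIZE		4096ULL

int main(void)
{
	/* One guard page below the highest canonical user address. */
	uint64_t task_size_max = (1ULL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE;

	printf("TASK_SIZE_MAX = %#llx\n", (unsigned long long)task_size_max);
	return 0;
}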


@ -0,0 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef _ASM_X86_PTI_H
#define _ASM_X86_PTI_H
#ifndef __ASSEMBLY__
#ifdef CONFIG_PAGE_TABLE_ISOLATION
extern void pti_init(void);
extern void pti_check_boottime_disable(void);
#else
static inline void pti_check_boottime_disable(void) { }
#endif
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PTI_H */


@ -10,38 +10,90 @@
#include <asm/special_insns.h>
#include <asm/smp.h>
#include <asm/invpcid.h>
#include <asm/pti.h>
#include <asm/processor-flags.h>
static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
/*
* Bump the generation count. This also serves as a full barrier
* that synchronizes with switch_mm(): callers are required to order
* their read of mm_cpumask after their writes to the paging
* structures.
*/
return atomic64_inc_return(&mm->context.tlb_gen);
}
/*
* The x86 feature is called PCID (Process Context IDentifier). It is similar
* to what is traditionally called ASID on the RISC processors.
*
* We don't use the traditional ASID implementation, where each process/mm gets
* its own ASID and flush/restart when we run out of ASID space.
*
* Instead we have a small per-cpu array of ASIDs and cache the last few mm's
* that came by on this CPU, allowing cheaper switch_mm between processes on
* this CPU.
*
* We end up with different spaces for different things. To avoid confusion we
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
* the canonical identifier for an mm
*
* kPCID - [1, TLB_NR_DYN_ASIDS]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
* uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
*
*/
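
A user-space sketch of the numbering described above (illustration only, not part of the patch; it mirrors the arithmetic that kern_pcid()/user_pcid() below perform when PTI is enabled):

#include <stdio.h>
#include <stdint.h>

#define X86_CR3_PTI_SWITCH_BIT	11	/* from the processor-flags.h hunk in this series */
#define TLB_NR_DYN_ASIDS	6

static uint16_t kpcid(uint16_t asid)
{
	return asid + 1;					/* PCID 0 is reserved */
}

static uint16_t upcid(uint16_t asid)
{
	return kpcid(asid) | (1 << X86_CR3_PTI_SWITCH_BIT);	/* kPCID + 2048 */
}

int main(void)
{
	for (int asid = 0; asid < TLB_NR_DYN_ASIDS; asid++)
		printf("ASID %d -> kPCID %d, uPCID %d\n",
		       asid, kpcid(asid), upcid(asid));
	return 0;
}
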
/* There are 12 bits of space for ASIDS in CR3 */
#define CR3_HW_ASID_BITS 12
/*
* When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
*/
#define PTI_CONSUMED_ASID_BITS 0
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS 1
#else
# define PTI_CONSUMED_PCID_BITS 0
#endif
#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
/*
* ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
* for them being zero-based. Another -1 is because ASID 0 is reserved for
* for them being zero-based. Another -1 is because PCID 0 is reserved for
* use by non-PCID-aware users.
*/
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
/*
* 6 because 6 should be plenty and struct tlb_state will fit in two cache
* lines.
*/
#define TLB_NR_DYN_ASIDS 6
/*
* Given @asid, compute kPCID
*/
static inline u16 kern_pcid(u16 asid)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Make sure that the dynamic ASID space does not conflict with the
* bit we are using to switch between user and kernel ASIDs.
*/
BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
/*
* The ASID being passed in here should have respected the
* MAX_ASID_AVAILABLE and thus never have the switch bit set.
*/
VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
#endif
/*
* The dynamically-assigned ASIDs that get passed in are small
* (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
* so do not bother to clear it.
*
* If PCID is on, ASID-aware code paths put the ASID+1 into the
* PCID bits. This serves two purposes. It prevents a nasty
* situation in which PCID-unaware code saves CR3, loads some other
@ -53,6 +105,18 @@ static inline u16 kern_pcid(u16 asid)
return asid + 1;
}
/*
* Given @asid, compute uPCID
*/
static inline u16 user_pcid(u16 asid)
{
u16 ret = kern_pcid(asid);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
#endif
return ret;
}
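To make the three namespaces above concrete, here is a purely illustrative walk-through (not part of the diff; it assumes CONFIG_PAGE_TABLE_ISOLATION=y and that X86_CR3_PTI_SWITCH_BIT is bit 11, which is what the "+ 2048" in the comment implies):

	ASID 0 -> kern_pcid(0) == 1, user_pcid(0) == 1 | (1 << 11) == 0x801 (2049)
	ASID 5 -> kern_pcid(5) == 6, user_pcid(5) == 6 | (1 << 11) == 0x806 (2054)

The kernel half of an mm therefore runs with a small PCID and the user half with the same PCID plus bit 11 set, so the entry code can switch between the two by flipping one bit in CR3.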
struct pgd_t;
static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
{
@ -95,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
return !static_cpu_has(X86_FEATURE_PCID);
}
/*
* 6 because 6 should be plenty and struct tlb_state will fit in
* two cache lines.
*/
#define TLB_NR_DYN_ASIDS 6
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
@ -134,6 +192,24 @@ struct tlb_state {
*/
bool is_lazy;
/*
* If set we changed the page tables in such a way that we
* needed an invalidation of all contexts (aka. PCIDs / ASIDs).
* This tells us to go invalidate all the non-loaded ctxs[]
* on the next context switch.
*
* The current ctx was kept up-to-date as it ran and does not
* need to be invalidated.
*/
bool invalidate_other;
/*
* Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
* the corresponding user PCID needs a flush next time we
* switch to it; see SWITCH_TO_USER_CR3.
*/
unsigned short user_pcid_flush_mask;
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
* disabling interrupts when modifying either one.
@ -211,6 +287,14 @@ static inline unsigned long cr4_read_shadow(void)
return this_cpu_read(cpu_tlbstate.cr4);
}
/*
* Mark all other ASIDs as invalid, preserves the current.
*/
static inline void invalidate_other_asid(void)
{
this_cpu_write(cpu_tlbstate.invalidate_other, true);
}
/*
* Save some of cr4 feature set we're using (e.g. Pentium 4MB
* enable and PPro Global page enable), so that any CPU's that boot
@ -230,19 +314,48 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
extern void initialize_tlbstate_and_flush(void);
/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*
* See SWITCH_TO_USER_CR3.
*/
static inline void invalidate_user_asid(u16 asid)
{
/* There is no user ASID if address space separation is off */
if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
return;
/*
* We only have a single ASID if PCID is off and the CR3
* write will have flushed it.
*/
if (!cpu_feature_enabled(X86_FEATURE_PCID))
return;
if (!static_cpu_has(X86_FEATURE_PTI))
return;
__set_bit(kern_pcid(asid),
(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}
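The bit set here is consumed on the way back to user space; the real consumer is the SWITCH_TO_USER_CR3 assembly macro mentioned in the comment, not C code. Purely as an illustration of that decision (hypothetical helper name, and it ignores how the user CR3 value itself is assembled):

	/* Illustrative sketch only; the real logic lives in the entry assembly. */
	static inline bool user_asid_needs_flush(u16 asid)
	{
		return __test_and_clear_bit(kern_pcid(asid),
			(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
	}

If this returns false, the eventual CR3 write can carry X86_CR3_PCID_NOFLUSH and keep the user TLB entries; if it returns true, the bit is cleared and a flushing CR3 write is done instead.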
/*
* flush the entire current user mapping
*/
static inline void __native_flush_tlb(void)
{
/*
* If current->mm == NULL then we borrow a mm which may change during a
* task switch and therefore we must not be preempted while we write CR3
* back:
* Preemption or interrupts must be disabled to protect the access
* to the per CPU variable and to prevent being preempted between
* read_cr3() and write_cr3().
*/
preempt_disable();
WARN_ON_ONCE(preemptible());
invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* If current->mm == NULL then the read_cr3() "borrows" an mm */
native_write_cr3(__native_read_cr3());
preempt_enable();
}
/*
@ -256,6 +369,8 @@ static inline void __native_flush_tlb_global(void)
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
*
* Note, this works with CR4.PCIDE=0 or 1.
*/
invpcid_flush_all();
return;
@ -282,7 +397,21 @@ static inline void __native_flush_tlb_global(void)
*/
static inline void __native_flush_tlb_single(unsigned long addr)
{
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/*
* Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
* Just use invalidate_user_asid() in case we are called early.
*/
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
invalidate_user_asid(loaded_mm_asid);
else
invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
}
/*
@ -298,14 +427,6 @@ static inline void __flush_tlb_all(void)
*/
__flush_tlb();
}
/*
* Note: if we somehow had PCID but not PGE, then this wouldn't work --
* we'd end up flushing kernel translations for the current ASID but
* we might fail to flush kernel translations for other cached ASIDs.
*
* To avoid this issue, we force PCID off if PGE is off.
*/
}
/*
@ -315,6 +436,16 @@ static inline void __flush_tlb_one(unsigned long addr)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/*
* __flush_tlb_single() will have cleared the TLB entry for this ASID,
* but since kernel space is replicated across all, we must also
* invalidate all others.
*/
invalidate_other_asid();
}
#define TLB_FLUSH_ALL -1UL
@ -375,6 +506,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
/*
* Bump the generation count. This also serves as a full barrier
* that synchronizes with switch_mm(): callers are required to order
* their read of mm_cpumask after their writes to the paging
* structures.
*/
return atomic64_inc_return(&mm->context.tlb_gen);
}
static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm)
{


@ -56,18 +56,27 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
/*
* WARNING: The entire pt_regs may not be safe to dereference. In some cases,
* only the iret frame registers are accessible. Use with caution!
* If 'partial' returns true, only the iret frame registers are valid.
*/
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
bool *partial)
{
if (unwind_done(state))
return NULL;
if (partial) {
#ifdef CONFIG_UNWINDER_ORC
*partial = !state->full_regs;
#else
*partial = false;
#endif
}
return state->regs;
}
#else
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
bool *partial)
{
return NULL;
}


@ -7,6 +7,7 @@
#ifdef CONFIG_X86_VSYSCALL_EMULATION
extern void map_vsyscall(void);
extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
/*
* Called on instruction fetch fault in vsyscall page.


@ -78,7 +78,12 @@
#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
#define X86_CR3_PCID_BITS 12
#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
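For reference (not part of the diff): the new derivation keeps the mask's value unchanged, (1UL << 12) - 1 == 0xfff; it simply expresses it through X86_CR3_PCID_BITS so the TLB code introduced in this series can reuse the same bit count.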
/*
* Intel CPU features in CR4


@ -17,6 +17,7 @@
#include <asm/sigframe.h>
#include <asm/bootparam.h>
#include <asm/suspend.h>
#include <asm/tlbflush.h>
#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@ -94,6 +95,9 @@ void common(void) {
BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
/* TLB state for the entry code */
OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
/* Layout info for cpu_entry_area */
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);


@ -22,7 +22,7 @@ obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
obj-y += aperfmperf.o
obj-y += cpuid-deps.o
obj-$(CONFIG_PROC_FS) += proc.o


@ -14,6 +14,8 @@
#include <linux/percpu.h>
#include <linux/smp.h>
#include "cpu.h"
struct aperfmperf_sample {
unsigned int khz;
ktime_t time;
@ -24,7 +26,7 @@ struct aperfmperf_sample {
static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
#define APERFMPERF_CACHE_THRESHOLD_MS 10
#define APERFMPERF_REFRESH_DELAY_MS 20
#define APERFMPERF_REFRESH_DELAY_MS 10
#define APERFMPERF_STALE_THRESHOLD_MS 1000
/*
@ -38,14 +40,8 @@ static void aperfmperf_snapshot_khz(void *dummy)
u64 aperf, aperf_delta;
u64 mperf, mperf_delta;
struct aperfmperf_sample *s = this_cpu_ptr(&samples);
ktime_t now = ktime_get();
s64 time_delta = ktime_ms_delta(now, s->time);
unsigned long flags;
/* Don't bother re-computing within the cache threshold time. */
if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
return;
local_irq_save(flags);
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
@ -61,31 +57,68 @@ static void aperfmperf_snapshot_khz(void *dummy)
if (mperf_delta == 0)
return;
s->time = now;
s->time = ktime_get();
s->aperf = aperf;
s->mperf = mperf;
/* If the previous iteration was too long ago, discard it. */
if (time_delta > APERFMPERF_STALE_THRESHOLD_MS)
s->khz = 0;
else
s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
}
unsigned int arch_freq_get_on_cpu(int cpu)
static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
{
unsigned int khz;
s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
/* Don't bother re-computing within the cache threshold time. */
if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
return true;
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
/* Return false if the previous iteration was too long ago. */
return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
}
unsigned int aperfmperf_get_khz(int cpu)
{
if (!cpu_khz)
return 0;
if (!static_cpu_has(X86_FEATURE_APERFMPERF))
return 0;
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
khz = per_cpu(samples.khz, cpu);
if (khz)
return khz;
aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
return per_cpu(samples.khz, cpu);
}
void arch_freq_prepare_all(void)
{
ktime_t now = ktime_get();
bool wait = false;
int cpu;
if (!cpu_khz)
return;
if (!static_cpu_has(X86_FEATURE_APERFMPERF))
return;
for_each_online_cpu(cpu)
if (!aperfmperf_snapshot_cpu(cpu, now, false))
wait = true;
if (wait)
msleep(APERFMPERF_REFRESH_DELAY_MS);
}
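As a purely illustrative number for the ratio computed in aperfmperf_snapshot_khz() above: with cpu_khz = 2,400,000 (a 2.4 GHz TSC/base clock), an APERF delta of 150,000 and an MPERF delta of 100,000 over the sample window, s->khz = 2,400,000 * 150,000 / 100,000 = 3,600,000, i.e. the core effectively ran at 3.6 GHz (turbo) during that window.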
unsigned int arch_freq_get_on_cpu(int cpu)
{
if (!cpu_khz)
return 0;
if (!static_cpu_has(X86_FEATURE_APERFMPERF))
return 0;
if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
return per_cpu(samples.khz, cpu);
msleep(APERFMPERF_REFRESH_DELAY_MS);
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);


@ -898,6 +898,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
}
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
if (c->x86_vendor != X86_VENDOR_AMD)
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
fpu__init_system(c);
#ifdef CONFIG_X86_32
@ -1335,7 +1339,10 @@ void syscall_init(void)
(entry_SYSCALL_64_trampoline - _entry_trampoline);
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
if (static_cpu_has(X86_FEATURE_PTI))
wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
else
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);


@ -47,4 +47,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
#endif /* ARCH_X86_CPU_H */


@ -470,6 +470,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size,
#define F14H_MPB_MAX_SIZE 1824
#define F15H_MPB_MAX_SIZE 4096
#define F16H_MPB_MAX_SIZE 3458
#define F17H_MPB_MAX_SIZE 3200
switch (family) {
case 0x14:
@ -481,6 +482,9 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size,
case 0x16:
max_size = F16H_MPB_MAX_SIZE;
break;
case 0x17:
max_size = F17H_MPB_MAX_SIZE;
break;
default:
max_size = F1XH_MPB_MAX_SIZE;
break;


@ -5,6 +5,8 @@
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
#include "cpu.h"
/*
* Get CPU information for use by the procfs.
*/
@ -78,8 +80,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
unsigned int freq = cpufreq_quick_get(cpu);
unsigned int freq = aperfmperf_get_khz(cpu);
if (!freq)
freq = cpufreq_quick_get(cpu);
if (!freq)
freq = cpu_khz;
seq_printf(m, "cpu MHz\t\t: %u.%03u\n",


@ -76,12 +76,23 @@ void show_iret_regs(struct pt_regs *regs)
regs->sp, regs->flags);
}
static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
bool partial)
{
if (on_stack(info, regs, sizeof(*regs)))
/*
* These on_stack() checks aren't strictly necessary: the unwind code
* has already validated the 'regs' pointer. The checks are done for
* ordering reasons: if the registers are on the next stack, we don't
* want to print them out yet. Otherwise they'll be shown as part of
* the wrong stack. Later, when show_trace_log_lvl() switches to the
* next stack, this function will be called again with the same regs so
* they can be printed in the right context.
*/
if (!partial && on_stack(info, regs, sizeof(*regs))) {
__show_regs(regs, 0);
else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
IRET_FRAME_SIZE)) {
} else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
IRET_FRAME_SIZE)) {
/*
* When an interrupt or exception occurs in entry code, the
* full pt_regs might not have been saved yet. In that case
@ -98,11 +109,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
struct stack_info stack_info = {0};
unsigned long visit_mask = 0;
int graph_idx = 0;
bool partial;
printk("%sCall Trace:\n", log_lvl);
unwind_start(&state, task, regs, stack);
stack = stack ? : get_stack_pointer(task, regs);
regs = unwind_get_entry_regs(&state, &partial);
/*
* Iterate through the stacks, starting with the current stack pointer.
@ -120,7 +133,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - hardirq stack
* - entry stack
*/
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name;
if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
@ -140,7 +153,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%s <%s>\n", log_lvl, stack_name);
if (regs)
show_regs_safe(&stack_info, regs);
show_regs_if_on_stack(&stack_info, regs, partial);
/*
* Scan the stack, printing any text addresses we find. At the
@ -164,7 +177,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/*
* Don't print regs->ip again if it was already printed
* by show_regs_safe() below.
* by show_regs_if_on_stack().
*/
if (regs && stack == &regs->ip)
goto next;
@ -199,9 +212,9 @@ next:
unwind_next_frame(&state);
/* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state);
regs = unwind_get_entry_regs(&state, &partial);
if (regs)
show_regs_safe(&stack_info, regs);
show_regs_if_on_stack(&stack_info, regs, partial);
}
if (stack_name)
@ -297,11 +310,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
unsigned long sp;
#endif
printk(KERN_DEFAULT
"%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
"%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "");
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
(boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
if (notify_die(DIE_OOPS, str, regs, err,
current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)


@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
.balign PAGE_SIZE; \
GLOBAL(name)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Each PGD needs to be 8k long and 8k aligned. We do not
* ever go out to userspace with these, so we do not
* strictly *need* the second page, but this allows us to
* have a single set_pgd() implementation that does not
* need to worry about whether it has 4k or 8k to work
* with.
*
* This ensures PGDs are 8k long:
*/
#define PTI_USER_PGD_FILL 512
/* This ensures they are 8k-aligned: */
#define NEXT_PGD_PAGE(name) \
.balign 2 * PAGE_SIZE; \
GLOBAL(name)
#else
#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
#define PTI_USER_PGD_FILL 0
#endif
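The user half of the 8k PGD set up here is reached elsewhere in this series by toggling bit PAGE_SHIFT (bit 12) of the PGD pointer; a simplified sketch of that helper, shown only for context (the real helper is kernel_to_user_pgdp(), defined elsewhere in the series, and the name below is hypothetical):

	/* Illustrative sketch: kernel PGD in the first 4k page, user PGD in the second. */
	static inline pgd_t *sketch_kernel_to_user_pgdp(pgd_t *pgdp)
	{
		return (pgd_t *)((unsigned long)pgdp | (1UL << PAGE_SHIFT));
	}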
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@ -350,13 +371,14 @@ GLOBAL(name)
.endr
__INITDATA
NEXT_PAGE(early_top_pgt)
NEXT_PGD_PAGE(early_top_pgt)
.fill 511,8,0
#ifdef CONFIG_X86_5LEVEL
.quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif
.fill PTI_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PAGE(init_top_pgt)
NEXT_PGD_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#else
NEXT_PAGE(init_top_pgt)
NEXT_PGD_PAGE(init_top_pgt)
.fill 512,8,0
.fill PTI_USER_PGD_FILL,8,0
#endif
#ifdef CONFIG_X86_5LEVEL


@ -24,6 +24,7 @@
#include <linux/uaccess.h>
#include <asm/ldt.h>
#include <asm/tlb.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
static void flush_ldt(void *__mm)
{
struct mm_struct *mm = __mm;
mm_context_t *pc;
if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return;
pc = &mm->context;
set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
load_mm_ldt(mm);
refresh_ldt_segments();
}
@ -94,10 +93,126 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
return NULL;
}
/* The new LDT isn't aliased for PTI yet. */
new_ldt->slot = -1;
new_ldt->nr_entries = num_entries;
return new_ldt;
}
/*
* If PTI is enabled, this maps the LDT into the kernelmode and
* usermode tables for the given mm.
*
* There is no corresponding unmap function. Even if the LDT is freed, we
* leave the PTEs around until the slot is reused or the mm is destroyed.
* This is harmless: the LDT is always in ordinary memory, and no one will
* access the freed slot.
*
* If we wanted to unmap freed LDTs, we'd also need to do a flush to make
* it useful, and the flush would slow down modify_ldt().
*/
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
bool is_vmalloc, had_top_level_entry;
unsigned long va;
spinlock_t *ptl;
pgd_t *pgd;
int i;
if (!static_cpu_has(X86_FEATURE_PTI))
return 0;
/*
* Any given ldt_struct should have map_ldt_struct() called at most
* once.
*/
WARN_ON(ldt->slot != -1);
/*
* Did we already have the top level entry allocated? We can't
* use pgd_none() for this because it doesn't do anything on
* 4-level page table kernels.
*/
pgd = pgd_offset(mm, LDT_BASE_ADDR);
had_top_level_entry = (pgd->pgd != 0);
is_vmalloc = is_vmalloc_addr(ldt->entries);
for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
unsigned long offset = i << PAGE_SHIFT;
const void *src = (char *)ldt->entries + offset;
unsigned long pfn;
pte_t pte, *ptep;
va = (unsigned long)ldt_slot_va(slot) + offset;
pfn = is_vmalloc ? vmalloc_to_pfn(src) :
page_to_pfn(virt_to_page(src));
/*
* Treat the PTI LDT range as a *userspace* range.
* get_locked_pte() will allocate all needed pagetables
* and account for them in this mm.
*/
ptep = get_locked_pte(mm, va, &ptl);
if (!ptep)
return -ENOMEM;
/*
* Map it RO so this easy-to-find address is not a primary
* target via some kernel interface that misses a
* permission check.
*/
pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
set_pte_at(mm, va, ptep, pte);
pte_unmap_unlock(ptep, ptl);
}
if (mm->context.ldt) {
/*
* We already had an LDT. The top-level entry should already
* have been allocated and synchronized with the usermode
* tables.
*/
WARN_ON(!had_top_level_entry);
if (static_cpu_has(X86_FEATURE_PTI))
WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
} else {
/*
* This is the first time we're mapping an LDT for this process.
* Sync the pgd to the usermode tables.
*/
WARN_ON(had_top_level_entry);
if (static_cpu_has(X86_FEATURE_PTI)) {
WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
set_pgd(kernel_to_user_pgdp(pgd), *pgd);
}
}
va = (unsigned long)ldt_slot_va(slot);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
ldt->slot = slot;
#endif
return 0;
}
static void free_ldt_pgtables(struct mm_struct *mm)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
struct mmu_gather tlb;
unsigned long start = LDT_BASE_ADDR;
unsigned long end = start + (1UL << PGDIR_SHIFT);
if (!static_cpu_has(X86_FEATURE_PTI))
return;
tlb_gather_mmu(&tlb, mm, start, end);
free_pgd_range(&tlb, start, end, start, end);
tlb_finish_mmu(&tlb, start, end);
#endif
}
/* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt)
{
@ -156,6 +271,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt);
retval = map_ldt_struct(mm, new_ldt, 0);
if (retval) {
free_ldt_pgtables(mm);
free_ldt_struct(new_ldt);
goto out_unlock;
}
mm->context.ldt = new_ldt;
out_unlock:
@ -174,6 +295,11 @@ void destroy_context_ldt(struct mm_struct *mm)
mm->context.ldt = NULL;
}
void ldt_arch_exit_mmap(struct mm_struct *mm)
{
free_ldt_pgtables(mm);
}
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
struct mm_struct *mm = current->mm;
@ -287,6 +413,25 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt);
/*
* If we are using PTI, map the new LDT into the userspace pagetables.
* If there is already an LDT, use the other slot so that other CPUs
* will continue to use the old LDT until install_ldt() switches
* them over to the new LDT.
*/
error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
if (error) {
/*
* This can only fail for the first LDT setup. If an LDT is
* already installed then the PTE page is already
* populated. Mop up a half populated page table.
*/
if (!WARN_ON_ONCE(old_ldt))
free_ldt_pgtables(mm);
free_ldt_struct(new_ldt);
goto out_unlock;
}
install_ldt(mm, new_ldt);
free_ldt_struct(old_ldt);
error = 0;


@ -48,8 +48,6 @@ static void load_segments(void)
"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
"\tmovl %%eax,%%ds\n"
"\tmovl %%eax,%%es\n"
"\tmovl %%eax,%%fs\n"
"\tmovl %%eax,%%gs\n"
"\tmovl %%eax,%%ss\n"
: : : "eax", "memory");
#undef STR
@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image)
* The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
set_gdt(phys_to_virt(0), 0);
idt_invalidate(phys_to_virt(0));
set_gdt(phys_to_virt(0), 0);
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,


@ -47,7 +47,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
.x86_tss = {
/*
* .sp0 is only used when entering ring 0 from a lower


@ -128,25 +128,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
spin_lock_irqsave(&rtc_lock, flags);
CMOS_WRITE(0xa, 0xf);
spin_unlock_irqrestore(&rtc_lock, flags);
local_flush_tlb();
pr_debug("1.\n");
*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
start_eip >> 4;
pr_debug("2.\n");
*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
start_eip & 0xf;
pr_debug("3.\n");
}
static inline void smpboot_restore_warm_reset_vector(void)
{
unsigned long flags;
/*
* Install writable page 0 entry to set BIOS data area.
*/
local_flush_tlb();
/*
* Paranoid: Set warm reset code and vector here back
* to default values.


@ -98,7 +98,7 @@ static int __save_stack_trace_reliable(struct stack_trace *trace,
for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state);
unwind_next_frame(&state)) {
regs = unwind_get_entry_regs(&state);
regs = unwind_get_entry_regs(&state, NULL);
if (regs) {
/*
* Kernel mode registers on the stack indicate an


@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
cpu = get_cpu();
while (n-- > 0) {
if (LDT_empty(info) || LDT_zero(info)) {
if (LDT_empty(info) || LDT_zero(info))
memset(desc, 0, sizeof(*desc));
} else {
else
fill_ldt(desc, info);
/*
* Always set the accessed bit so that the CPU
* doesn't try to write to the (read-only) GDT.
*/
desc->type |= 1;
}
++info;
++desc;
}


@ -361,7 +361,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
*
* No need for ist_enter here because we don't use RCU.
*/
if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
regs->cs == __KERNEL_CS &&
regs->ip == (unsigned long)native_irq_return_iret)
{


@ -61,11 +61,17 @@ jiffies_64 = jiffies;
. = ALIGN(HPAGE_SIZE); \
__end_rodata_hpage_align = .;
#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
#else
#define X64_ALIGN_RODATA_BEGIN
#define X64_ALIGN_RODATA_END
#define ALIGN_ENTRY_TEXT_BEGIN
#define ALIGN_ENTRY_TEXT_END
#endif
PHDRS {
@ -102,8 +108,10 @@ SECTIONS
CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
ALIGN_ENTRY_TEXT_BEGIN
ENTRY_TEXT
IRQENTRY_TEXT
ALIGN_ENTRY_TEXT_END
SOFTIRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)


@ -43,9 +43,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o


@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
}
static void percpu_setup_debug_store(int cpu)
{
#ifdef CONFIG_CPU_SUP_INTEL
int npages;
void *cea;
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;
cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
npages = sizeof(struct debug_store) / PAGE_SIZE;
BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
PAGE_KERNEL);
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
/*
* Force the population of PMDs for not yet allocated per cpu
* memory like debug store buffers.
*/
npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
for (; npages; npages--, cea += PAGE_SIZE)
cea_set_pte(cea, 0, PAGE_NONE);
#endif
}
/* Setup the fixmap mappings only once per-processor */
static void __init setup_cpu_entry_area(int cpu)
{
@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu)
cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
__pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
#endif
percpu_setup_debug_store(cpu);
}
static __init void setup_cpu_entry_area_ptes(void)


@ -5,7 +5,7 @@
static int ptdump_show(struct seq_file *m, void *v)
{
ptdump_walk_pgd_level(m, NULL);
ptdump_walk_pgd_level_debugfs(m, NULL, false);
return 0;
}
@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
.release = single_release,
};
static struct dentry *pe;
static int ptdump_show_curknl(struct seq_file *m, void *v)
{
if (current->mm->pgd) {
down_read(&current->mm->mmap_sem);
ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
up_read(&current->mm->mmap_sem);
}
return 0;
}
static int ptdump_open_curknl(struct inode *inode, struct file *filp)
{
return single_open(filp, ptdump_show_curknl, NULL);
}
static const struct file_operations ptdump_curknl_fops = {
.owner = THIS_MODULE,
.open = ptdump_open_curknl,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
#ifdef CONFIG_PAGE_TABLE_ISOLATION
static struct dentry *pe_curusr;
static int ptdump_show_curusr(struct seq_file *m, void *v)
{
if (current->mm->pgd) {
down_read(&current->mm->mmap_sem);
ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
up_read(&current->mm->mmap_sem);
}
return 0;
}
static int ptdump_open_curusr(struct inode *inode, struct file *filp)
{
return single_open(filp, ptdump_show_curusr, NULL);
}
static const struct file_operations ptdump_curusr_fops = {
.owner = THIS_MODULE,
.open = ptdump_open_curusr,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
#endif
static struct dentry *dir, *pe_knl, *pe_curknl;
static int __init pt_dump_debug_init(void)
{
pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
&ptdump_fops);
if (!pe)
dir = debugfs_create_dir("page_tables", NULL);
if (!dir)
return -ENOMEM;
pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
&ptdump_fops);
if (!pe_knl)
goto err;
pe_curknl = debugfs_create_file("current_kernel", 0400,
dir, NULL, &ptdump_curknl_fops);
if (!pe_curknl)
goto err;
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pe_curusr = debugfs_create_file("current_user", 0400,
dir, NULL, &ptdump_curusr_fops);
if (!pe_curusr)
goto err;
#endif
return 0;
err:
debugfs_remove_recursive(dir);
return -ENOMEM;
}
static void __exit pt_dump_debug_exit(void)
{
debugfs_remove_recursive(pe);
debugfs_remove_recursive(dir);
}
module_init(pt_dump_debug_init);
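Usage note (assuming debugfs is mounted at the usual /sys/kernel/debug): the dumps move from the old single /sys/kernel/debug/kernel_page_tables entry to /sys/kernel/debug/page_tables/kernel, .../page_tables/current_kernel and, on PTI kernels, .../page_tables/current_user, the last one showing the shadow user copy of the current mm's page tables.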


@ -52,6 +52,9 @@ enum address_markers_idx {
USER_SPACE_NR = 0,
KERNEL_SPACE_NR,
LOW_KERNEL_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
LDT_NR,
#endif
VMALLOC_START_NR,
VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
@ -59,6 +62,9 @@ enum address_markers_idx {
KASAN_SHADOW_END_NR,
#endif
CPU_ENTRY_AREA_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
LDT_NR,
#endif
#ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
#endif
@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
#ifdef CONFIG_KASAN
[KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
[KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
[LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
#endif
[CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
@ -467,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
}
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
bool checkwx)
bool checkwx, bool dmesg)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_top_pgt;
@ -480,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
if (pgd) {
start = pgd;
st.to_dmesg = true;
st.to_dmesg = dmesg;
}
st.check_wx = checkwx;
@ -518,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
ptdump_walk_pgd_level_core(m, pgd, false);
ptdump_walk_pgd_level_core(m, pgd, false, true);
}
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
if (user && static_cpu_has(X86_FEATURE_PTI))
pgd = kernel_to_user_pgdp(pgd);
#endif
ptdump_walk_pgd_level_core(m, pgd, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
static void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgd_t *pgd = (pgd_t *) &init_top_pgt;
if (!static_cpu_has(X86_FEATURE_PTI))
return;
pr_info("x86/mm: Checking user space page tables\n");
pgd = kernel_to_user_pgdp(pgd);
ptdump_walk_pgd_level_core(NULL, pgd, true, false);
#endif
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
void ptdump_walk_pgd_level_checkwx(void)
{
ptdump_walk_pgd_level_core(NULL, NULL, true);
ptdump_walk_pgd_level_core(NULL, NULL, true, false);
ptdump_walk_user_pgd_level_checkwx();
}
static int __init pt_dump_init(void)


@ -20,6 +20,7 @@
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
#include <asm/pti.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@ -161,6 +162,12 @@ struct map_range {
static int page_size_mask;
static void enable_global_pages(void)
{
if (!static_cpu_has(X86_FEATURE_PTI))
__supported_pte_mask |= _PAGE_GLOBAL;
}
static void __init probe_page_size_mask(void)
{
/*
@ -179,11 +186,11 @@ static void __init probe_page_size_mask(void)
cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */
__supported_pte_mask &= ~_PAGE_GLOBAL;
if (boot_cpu_has(X86_FEATURE_PGE)) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
} else
__supported_pte_mask &= ~_PAGE_GLOBAL;
enable_global_pages();
}
/* Enable 1 GB linear kernel mappings if available: */
if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@ -196,34 +203,44 @@ static void __init probe_page_size_mask(void)
static void setup_pcid(void)
{
#ifdef CONFIG_X86_64
if (boot_cpu_has(X86_FEATURE_PCID)) {
if (boot_cpu_has(X86_FEATURE_PGE)) {
/*
* This can't be cr4_set_bits_and_update_boot() --
* the trampoline code can't handle CR4.PCIDE and
* it wouldn't do any good anyway. Despite the name,
* cr4_set_bits_and_update_boot() doesn't actually
* cause the bits in question to remain set all the
* way through the secondary boot asm.
*
* Instead, we brute-force it and set CR4.PCIDE
* manually in start_secondary().
*/
cr4_set_bits(X86_CR4_PCIDE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't
* work if PCID is on but PGE is not. Since that
* combination doesn't exist on real hardware, there's
* no reason to try to fully support it, but it's
* polite to avoid corrupting data if we're on
* an improperly configured VM.
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
}
if (!IS_ENABLED(CONFIG_X86_64))
return;
if (!boot_cpu_has(X86_FEATURE_PCID))
return;
if (boot_cpu_has(X86_FEATURE_PGE)) {
/*
* This can't be cr4_set_bits_and_update_boot() -- the
* trampoline code can't handle CR4.PCIDE and it wouldn't
* do any good anyway. Despite the name,
* cr4_set_bits_and_update_boot() doesn't actually cause
* the bits in question to remain set all the way through
* the secondary boot asm.
*
* Instead, we brute-force it and set CR4.PCIDE manually in
* start_secondary().
*/
cr4_set_bits(X86_CR4_PCIDE);
/*
* INVPCID's single-context modes (2/3) only work if we set
* X86_CR4_PCIDE, *and* we have INVPCID support. It's unusable
* on systems that have X86_CR4_PCIDE clear, or that have
* no INVPCID support at all.
*/
if (boot_cpu_has(X86_FEATURE_INVPCID))
setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't work if
* PCID is on but PGE is not. Since that combination
* doesn't exist on real hardware, there's no reason to try
* to fully support it, but it's polite to avoid corrupting
* data if we're on an improperly configured VM.
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
}
#endif
}
#ifdef CONFIG_X86_32
@ -624,6 +641,7 @@ void __init init_mem_mapping(void)
{
unsigned long end;
pti_check_boottime_disable();
probe_page_size_mask();
setup_pcid();
@ -847,12 +865,12 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{


@ -34,25 +34,14 @@
#define TB_SHIFT 40
/*
* Virtual address start and end range for randomization. The end changes base
* on configuration to have the highest amount of space for randomization.
* It increases the possible random position for each randomized region.
* Virtual address start and end range for randomization.
*
* You need to add an if/def entry if you introduce a new memory region
* compatible with KASLR. Your entry must be in logical order with memory
* layout. For example, ESPFIX is before EFI because its virtual address is
* before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
* ensure that this order is correct and won't be changed.
* The end address could depend on more configuration options to make the
* highest amount of space for randomization available, but that's too hard
* to keep straight and caused issues already.
*/
static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
#if defined(CONFIG_X86_ESPFIX64)
static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
#elif defined(CONFIG_EFI)
static const unsigned long vaddr_end = EFI_VA_END;
#else
static const unsigned long vaddr_end = __START_KERNEL_map;
#endif
static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
/* Default values */
unsigned long page_offset_base = __PAGE_OFFSET_BASE;
@ -101,15 +90,12 @@ void __init kernel_randomize_memory(void)
unsigned long remain_entropy;
/*
* All these BUILD_BUG_ON checks ensures the memory layout is
* consistent with the vaddr_start/vaddr_end variables.
* These BUILD_BUG_ON checks ensure the memory layout is consistent
* with the vaddr_start/vaddr_end variables. These checks are very
* limited....
*/
BUILD_BUG_ON(vaddr_start >= vaddr_end);
BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) &&
vaddr_end >= EFI_VA_END);
BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) ||
IS_ENABLED(CONFIG_EFI)) &&
vaddr_end >= __START_KERNEL_map);
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
if (!kaslr_memory_enabled())


@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
#else
static inline pgd_t *_pgd_alloc(void)
{
return (pgd_t *)__get_free_page(PGALLOC_GFP);
return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
free_page((unsigned long)pgd);
free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */
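The switch from __get_free_page() to __get_free_pages(..., PGD_ALLOCATION_ORDER) is what makes room for the user half of the PGD. For context, this mirrors how the constant is defined elsewhere in this series (shown here only as an illustration):

	#ifdef CONFIG_PAGE_TABLE_ISOLATION
	#define PGD_ALLOCATION_ORDER 1	/* order-1: 8k, kernel PGD + user PGD */
	#else
	#define PGD_ALLOCATION_ORDER 0	/* order-0: the usual single 4k PGD */
	#endif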

arch/x86/mm/pti.c (new file, 388 lines)

@ -0,0 +1,388 @@
/*
* Copyright(c) 2017 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* This code is based in part on work published here:
*
* https://github.com/IAIK/KAISER
*
* The original work was written by and signed off for the Linux
* kernel by:
*
* Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
* Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
* Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
* Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
*
* Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
* Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
* Andy Lutomirski <luto@amacapital.net>
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <asm/cpufeature.h>
#include <asm/hypervisor.h>
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
/* Backporting helper */
#ifndef __GFP_NOTRACK
#define __GFP_NOTRACK 0
#endif
static void __init pti_print_if_insecure(const char *reason)
{
if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
pr_info("%s\n", reason);
}
static void __init pti_print_if_secure(const char *reason)
{
if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
pr_info("%s\n", reason);
}
void __init pti_check_boottime_disable(void)
{
char arg[5];
int ret;
if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
pti_print_if_insecure("disabled on XEN PV.");
return;
}
ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
if (ret > 0) {
if (ret == 3 && !strncmp(arg, "off", 3)) {
pti_print_if_insecure("disabled on command line.");
return;
}
if (ret == 2 && !strncmp(arg, "on", 2)) {
pti_print_if_secure("force enabled on command line.");
goto enable;
}
if (ret == 4 && !strncmp(arg, "auto", 4))
goto autosel;
}
if (cmdline_find_option_bool(boot_command_line, "nopti")) {
pti_print_if_insecure("disabled on command line.");
return;
}
autosel:
if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
return;
enable:
setup_force_cpu_cap(X86_FEATURE_PTI);
}
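Putting the parsing above together, the boot-time knobs end up as follows (usage summary, stated from the code above rather than quoting any documentation):

	pti=on		force-enable, even if the CPU is not marked X86_BUG_CPU_MELTDOWN
	pti=off		disable
	pti=auto	default: enable only when the CPU is affected
	nopti		older spelling of pti=off

and a Xen PV guest always gets PTI disabled before the command line is even considered.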
pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
/*
* Changes to the high (kernel) portion of the kernelmode page
* tables are not automatically propagated to the usermode tables.
*
* Users should keep in mind that, unlike the kernelmode tables,
* there is no vmalloc_fault equivalent for the usermode tables.
* Top-level entries added to init_mm's usermode pgd after boot
* will not be automatically propagated to other mms.
*/
if (!pgdp_maps_userspace(pgdp))
return pgd;
/*
* The user page tables get the full PGD, accessible from
* userspace:
*/
kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
/*
* If this is normal user memory, make it NX in the kernel
* pagetables so that, if we somehow screw up and return to
* usermode with the kernel CR3 loaded, we'll get a page fault
* instead of allowing user code to execute with the wrong CR3.
*
* As exceptions, we don't set NX if:
* - _PAGE_USER is not set. This could be an executable
* EFI runtime mapping or something similar, and the kernel
* may execute from it
* - we don't have NX support
* - we're clearing the PGD (i.e. the new pgd is not present).
*/
if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
(__supported_pte_mask & _PAGE_NX))
pgd.pgd |= _PAGE_NX;
/* return the copy of the PGD we want the kernel to use: */
return pgd;
}
/*
* Walk the user copy of the page tables (optionally) trying to allocate
* page table pages on the way down.
*
* Returns a pointer to a P4D on success, or NULL on failure.
*/
static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
{
pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
if (address < PAGE_OFFSET) {
WARN_ONCE(1, "attempt to walk user address\n");
return NULL;
}
if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
if (!new_p4d_page)
return NULL;
if (pgd_none(*pgd)) {
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
new_p4d_page = 0;
}
if (new_p4d_page)
free_page(new_p4d_page);
}
BUILD_BUG_ON(pgd_large(*pgd) != 0);
return p4d_offset(pgd, address);
}
/*
* Walk the user copy of the page tables (optionally) trying to allocate
* page table pages on the way down.
*
* Returns a pointer to a PMD on success, or NULL on failure.
*/
static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
pud_t *pud;
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
if (!new_pud_page)
return NULL;
if (p4d_none(*p4d)) {
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
new_pud_page = 0;
}
if (new_pud_page)
free_page(new_pud_page);
}
pud = pud_offset(p4d, address);
/* The user page tables do not use large mappings: */
if (pud_large(*pud)) {
WARN_ON(1);
return NULL;
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
if (!new_pmd_page)
return NULL;
if (pud_none(*pud)) {
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
new_pmd_page = 0;
}
if (new_pmd_page)
free_page(new_pmd_page);
}
return pmd_offset(pud, address);
}
#ifdef CONFIG_X86_VSYSCALL_EMULATION
/*
* Walk the shadow copy of the page tables (optionally) trying to allocate
* page table pages on the way down. Does not support large pages.
*
* Note: this is only used when mapping *new* kernel data into the
* user/shadow page tables. It is never used for userspace data.
*
* Returns a pointer to a PTE on success, or NULL on failure.
*/
static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
pte_t *pte;
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
WARN_ON(1);
return NULL;
}
if (pmd_none(*pmd)) {
unsigned long new_pte_page = __get_free_page(gfp);
if (!new_pte_page)
return NULL;
if (pmd_none(*pmd)) {
set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
new_pte_page = 0;
}
if (new_pte_page)
free_page(new_pte_page);
}
pte = pte_offset_kernel(pmd, address);
if (pte_flags(*pte) & _PAGE_USER) {
WARN_ONCE(1, "attempt to walk to user pte\n");
return NULL;
}
return pte;
}
static void __init pti_setup_vsyscall(void)
{
pte_t *pte, *target_pte;
unsigned int level;
pte = lookup_address(VSYSCALL_ADDR, &level);
if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
return;
target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
if (WARN_ON(!target_pte))
return;
*target_pte = *pte;
set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
}
#else
static void __init pti_setup_vsyscall(void) { }
#endif
static void __init
pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
{
unsigned long addr;
/*
* Clone the populated PMDs which cover start to end. These PMD areas
* can have holes.
*/
for (addr = start; addr < end; addr += PMD_SIZE) {
pmd_t *pmd, *target_pmd;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pgd = pgd_offset_k(addr);
if (WARN_ON(pgd_none(*pgd)))
return;
p4d = p4d_offset(pgd, addr);
if (WARN_ON(p4d_none(*p4d)))
return;
pud = pud_offset(p4d, addr);
if (pud_none(*pud))
continue;
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
continue;
target_pmd = pti_user_pagetable_walk_pmd(addr);
if (WARN_ON(!target_pmd))
return;
/*
* Copy the PMD. That is, the kernelmode and usermode
* tables will share the last-level page tables of this
* address range
*/
*target_pmd = pmd_clear_flags(*pmd, clear);
}
}
/*
* Clone a single p4d (i.e. a top-level entry on 4-level systems and a
* next-level entry on 5-level systems).
*/
static void __init pti_clone_p4d(unsigned long addr)
{
p4d_t *kernel_p4d, *user_p4d;
pgd_t *kernel_pgd;
user_p4d = pti_user_pagetable_walk_p4d(addr);
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
}
/*
* Clone the CPU_ENTRY_AREA into the user space visible page table.
*/
static void __init pti_clone_user_shared(void)
{
pti_clone_p4d(CPU_ENTRY_AREA_BASE);
}
/*
* Clone the ESPFIX P4D into the user space visible page table.
*/
static void __init pti_setup_espfix64(void)
{
#ifdef CONFIG_X86_ESPFIX64
pti_clone_p4d(ESPFIX_BASE_ADDR);
#endif
}
/*
* Clone the populated PMDs of the entry and irqentry text and force it RO.
*/
static void __init pti_clone_entry_text(void)
{
pti_clone_pmds((unsigned long) __entry_text_start,
(unsigned long) __irqentry_text_end,
_PAGE_RW | _PAGE_GLOBAL);
}
/*
* Initialize kernel page table isolation
*/
void __init pti_init(void)
{
if (!static_cpu_has(X86_FEATURE_PTI))
return;
pr_info("enabled\n");
pti_clone_user_shared();
pti_clone_entry_text();
pti_setup_espfix64();
pti_setup_vsyscall();
}


@ -28,6 +28,38 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
/*
* We get here when we do something requiring a TLB invalidation
* but could not go invalidate all of the contexts. We do the
* necessary invalidation by clearing out the 'ctx_id' which
* forces a TLB flush when the context is loaded.
*/
void clear_asid_other(void)
{
u16 asid;
/*
* This is only expected to be set if we have disabled
* kernel _PAGE_GLOBAL pages.
*/
if (!static_cpu_has(X86_FEATURE_PTI)) {
WARN_ON_ONCE(1);
return;
}
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
/* Do not need to flush the current asid */
if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
continue;
/*
* Make sure the next time we go to switch to
* this asid, we do a flush:
*/
this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
}
this_cpu_write(cpu_tlbstate.invalidate_other, false);
}
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
return;
}
if (this_cpu_read(cpu_tlbstate.invalidate_other))
clear_asid_other();
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
next->context.ctx_id)
@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
*need_flush = true;
}
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
unsigned long new_mm_cr3;
if (need_flush) {
invalidate_user_asid(new_asid);
new_mm_cr3 = build_cr3(pgdir, new_asid);
} else {
new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
}
/*
* Caution: many callers of this function expect
* that load_cr3() is serializing and orders TLB
* fills with respect to the mm_cpumask writes.
*/
write_cr3(new_mm_cr3);
}
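For a concrete picture of the two branches above (illustrative only, assuming PCID is enabled): with a PGD at physical address 0x145600000 and new_asid == 2, build_cr3() produces 0x145600003 (kern_pcid(2) == 3 in the low PCID bits), which flushes TLB entries tagged with PCID 3 when written; build_cr3_noflush() produces the same value with bit 63 (X86_CR3_PCID_NOFLUSH) also set, telling the CPU to keep those entries because the ASID was found to be up to date.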
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
write_cr3(build_cr3(next->pgd, new_asid));
load_new_mm_cr3(next->pgd, new_asid, true);
/*
* NB: This gets called via leave_mm() in the idle path
@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
write_cr3(build_cr3_noflush(next->pgd, new_asid));
load_new_mm_cr3(next->pgd, new_asid, false);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);


@ -195,6 +195,9 @@ static pgd_t *efi_pgd;
* because we want to avoid inserting EFI region mappings (EFI_VA_END
* to EFI_VA_START) into the standard kernel page tables. Everything
* else can be shared, see efi_sync_low_kernel_mappings().
*
* We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
* allocation.
*/
int __init efi_alloc_page_tables(void)
{
@ -207,7 +210,7 @@ int __init efi_alloc_page_tables(void)
return 0;
gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
if (!efi_pgd)
return -ENOMEM;


@ -592,7 +592,18 @@ static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff,
/*
* Update the first page pointer to skip over the CSH header.
*/
cap_info->pages[0] += csh->headersize;
cap_info->phys[0] += csh->headersize;
/*
* cap_info->capsule should point at a virtual mapping of the entire
* capsule, starting at the capsule header. Our image has the Quark
* security header prepended, so we cannot rely on the default vmap()
* mapping created by the generic capsule code.
* Given that the Quark firmware does not appear to care about the
* virtual mapping, let's just point cap_info->capsule at our copy
* of the capsule header.
*/
cap_info->capsule = &cap_info->header;
return 1;
}


@ -12,22 +12,29 @@
#include "blk.h"
/*
* Append a bio to a passthrough request. Only works can be merged into
* the request based on the driver constraints.
* Append a bio to a passthrough request. Only works if the bio can be merged
* into the request based on the driver constraints.
*/
int blk_rq_append_bio(struct request *rq, struct bio *bio)
int blk_rq_append_bio(struct request *rq, struct bio **bio)
{
blk_queue_bounce(rq->q, &bio);
struct bio *orig_bio = *bio;
blk_queue_bounce(rq->q, bio);
if (!rq->bio) {
blk_rq_bio_prep(rq->q, rq, bio);
blk_rq_bio_prep(rq->q, rq, *bio);
} else {
if (!ll_back_merge_fn(rq->q, rq, bio))
if (!ll_back_merge_fn(rq->q, rq, *bio)) {
if (orig_bio != *bio) {
bio_put(*bio);
*bio = orig_bio;
}
return -EINVAL;
}
rq->biotail->bi_next = bio;
rq->biotail = bio;
rq->__data_len += bio->bi_iter.bi_size;
rq->biotail->bi_next = *bio;
rq->biotail = *bio;
rq->__data_len += (*bio)->bi_iter.bi_size;
}
return 0;
@ -80,14 +87,12 @@ static int __blk_rq_map_user_iov(struct request *rq,
* We link the bounce buffer in and could have to traverse it
* later so we have to get a ref to prevent it from being freed
*/
ret = blk_rq_append_bio(rq, bio);
bio_get(bio);
ret = blk_rq_append_bio(rq, &bio);
if (ret) {
bio_endio(bio);
__blk_rq_unmap_user(orig_bio);
bio_put(bio);
return ret;
}
bio_get(bio);
return 0;
}
@ -220,7 +225,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
int do_copy = 0;
struct bio *bio;
struct bio *bio, *orig_bio;
int ret;
if (len > (queue_max_hw_sectors(q) << 9))
@ -243,10 +248,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
if (do_copy)
rq->rq_flags |= RQF_COPY_USER;
ret = blk_rq_append_bio(rq, bio);
orig_bio = bio;
ret = blk_rq_append_bio(rq, &bio);
if (unlikely(ret)) {
/* request is too big */
bio_put(bio);
bio_put(orig_bio);
return ret;
}
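Since blk_rq_append_bio() is exported, out-of-tree or backported callers written against the old prototype need the same treatment as the two in-tree call sites above. A hypothetical caller fragment, shown only to illustrate the new contract:

	struct bio *bio = ...;			/* built by the caller as before */
	struct bio *orig_bio = bio;

	ret = blk_rq_append_bio(rq, &bio);	/* was: blk_rq_append_bio(rq, bio) */
	if (ret) {
		bio_put(orig_bio);		/* mirrors the blk_rq_map_kern() hunk above */
		return ret;
	}
	/* on success, 'bio' may now point at a bounce bio */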


@ -200,6 +200,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
unsigned i = 0;
bool bounce = false;
int sectors = 0;
bool passthrough = bio_is_passthrough(*bio_orig);
bio_for_each_segment(from, *bio_orig, iter) {
if (i++ < BIO_MAX_PAGES)
@ -210,13 +211,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
if (!bounce)
return;
if (sectors < bio_sectors(*bio_orig)) {
if (!passthrough && sectors < bio_sectors(*bio_orig)) {
bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
bio_chain(bio, *bio_orig);
generic_make_request(*bio_orig);
*bio_orig = bio;
}
bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
bounce_bio_set);
bio_for_each_segment_all(to, bio, i) {
struct page *page = to->bv_page;


@ -610,6 +610,11 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
algt->mask));
if (IS_ERR(poly))
return PTR_ERR(poly);
poly_hash = __crypto_hash_alg_common(poly);
err = -EINVAL;
if (poly_hash->digestsize != POLY1305_DIGEST_SIZE)
goto out_put_poly;
err = -ENOMEM;
inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
@ -618,7 +623,6 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
ctx = aead_instance_ctx(inst);
ctx->saltlen = CHACHAPOLY_IV_SIZE - ivsize;
poly_hash = __crypto_hash_alg_common(poly);
err = crypto_init_ahash_spawn(&ctx->poly, poly_hash,
aead_crypto_instance(inst));
if (err)


@ -254,6 +254,14 @@ static void pcrypt_aead_exit_tfm(struct crypto_aead *tfm)
crypto_free_aead(ctx->child);
}
static void pcrypt_free(struct aead_instance *inst)
{
struct pcrypt_instance_ctx *ctx = aead_instance_ctx(inst);
crypto_drop_aead(&ctx->spawn);
kfree(inst);
}
static int pcrypt_init_instance(struct crypto_instance *inst,
struct crypto_alg *alg)
{
@ -319,6 +327,8 @@ static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb,
inst->alg.encrypt = pcrypt_aead_encrypt;
inst->alg.decrypt = pcrypt_aead_decrypt;
inst->free = pcrypt_free;
err = aead_register_instance(tmpl, inst);
if (err)
goto out_drop_aead;
@ -349,14 +359,6 @@ static int pcrypt_create(struct crypto_template *tmpl, struct rtattr **tb)
return -EINVAL;
}
static void pcrypt_free(struct crypto_instance *inst)
{
struct pcrypt_instance_ctx *ctx = crypto_instance_ctx(inst);
crypto_drop_aead(&ctx->spawn);
kfree(inst);
}
static int pcrypt_cpumask_change_notify(struct notifier_block *self,
unsigned long val, void *data)
{
@ -469,7 +471,6 @@ static void pcrypt_fini_padata(struct padata_pcrypt *pcrypt)
static struct crypto_template pcrypt_tmpl = {
.name = "pcrypt",
.create = pcrypt_create,
.free = pcrypt_free,
.module = THIS_MODULE,
};
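The pcrypt change swaps the template-level .free(), which receives a struct crypto_instance pointer, for the aead_instance .free() callback, so the instance is released through the pointer that was actually allocated rather than through an embedded member. The underlying rule is generic: never free through a pointer to a member that lives at a non-zero offset inside the allocation. A small standalone illustration of that rule (an analogy only; it does not claim to reproduce the crypto API's exact structure layout):

	#include <stdio.h>
	#include <stdlib.h>
	#include <stddef.h>

	struct inner { int x; };

	struct outer {
		long header;		/* pushes 'base' to a non-zero offset */
		struct inner base;
	};

	int main(void)
	{
		struct outer *o = malloc(sizeof(*o));
		struct inner *i = &o->base;

		/*
		 * free(i) would be undefined behaviour: 'i' was never returned
		 * by malloc(). Recover the enclosing object first (what the
		 * kernel's container_of() does) and free that.
		 */
		struct outer *back =
			(struct outer *)((char *)i - offsetof(struct outer, base));

		printf("inner lives at offset %zu\n", offsetof(struct outer, base));
		free(back);
		return 0;
	}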


@ -186,6 +186,11 @@ static void cache_associativity(struct cacheinfo *this_leaf)
this_leaf->ways_of_associativity = (size / nr_sets) / line_size;
}
static bool cache_node_is_unified(struct cacheinfo *this_leaf)
{
return of_property_read_bool(this_leaf->of_node, "cache-unified");
}
static void cache_of_override_properties(unsigned int cpu)
{
int index;
@ -194,6 +199,14 @@ static void cache_of_override_properties(unsigned int cpu)
for (index = 0; index < cache_leaves(cpu); index++) {
this_leaf = this_cpu_ci->info_list + index;
/*
* init_cache_level must setup the cache level correctly
* overriding the architecturally specified levels, so
* if type is NONE at this stage, it should be unified
*/
if (this_leaf->type == CACHE_TYPE_NOCACHE &&
cache_node_is_unified(this_leaf))
this_leaf->type = CACHE_TYPE_UNIFIED;
cache_size(this_leaf);
cache_get_line_size(this_leaf);
cache_nr_sets(this_leaf);


@ -178,6 +178,7 @@ static struct bus_type sunxi_rsb_bus = {
.match = sunxi_rsb_device_match,
.probe = sunxi_rsb_device_probe,
.remove = sunxi_rsb_device_remove,
.uevent = of_device_uevent_modalias,
};
static void sunxi_rsb_dev_release(struct device *dev)


@ -5,6 +5,7 @@ config CRYPTO_DEV_CHELSIO
select CRYPTO_SHA256
select CRYPTO_SHA512
select CRYPTO_AUTHENC
select CRYPTO_GF128MUL
---help---
The Chelsio Crypto Co-processor driver for T6 adapters.


@ -1625,6 +1625,7 @@ static int queue_cache_init(void)
CWQ_ENTRY_SIZE, 0, NULL);
if (!queue_cache[HV_NCS_QTYPE_CWQ - 1]) {
kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_MAU - 1]);
queue_cache[HV_NCS_QTYPE_MAU - 1] = NULL;
return -ENOMEM;
}
return 0;
@ -1634,6 +1635,8 @@ static void queue_cache_destroy(void)
{
kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_MAU - 1]);
kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_CWQ - 1]);
queue_cache[HV_NCS_QTYPE_MAU - 1] = NULL;
queue_cache[HV_NCS_QTYPE_CWQ - 1] = NULL;
}
static long spu_queue_register_workfn(void *arg)
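Both n2 hunks pair kmem_cache_destroy() with clearing the stored pointer, so a later destroy or a retried init never sees a stale cache pointer; because kmem_cache_destroy(NULL) is a no-op, the teardown also becomes safe to repeat. The same idiom in a runnable userspace miniature (free(NULL) is likewise defined to do nothing):

	#include <stdlib.h>

	static char *buf;

	static void ctx_teardown(void)
	{
		free(buf);	/* free(NULL) is a defined no-op */
		buf = NULL;	/* so a second teardown cannot double-free */
	}

	int main(void)
	{
		buf = malloc(64);
		ctx_teardown();
		ctx_teardown();	/* harmless thanks to the NULL reset */
		return 0;
	}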


@ -20,10 +20,6 @@
#define NO_FURTHER_WRITE_ACTION -1
#ifndef phys_to_page
#define phys_to_page(x) pfn_to_page((x) >> PAGE_SHIFT)
#endif
/**
* efi_free_all_buff_pages - free all previous allocated buffer pages
* @cap_info: pointer to current instance of capsule_info structure
@ -35,7 +31,7 @@
static void efi_free_all_buff_pages(struct capsule_info *cap_info)
{
while (cap_info->index > 0)
__free_page(phys_to_page(cap_info->pages[--cap_info->index]));
__free_page(cap_info->pages[--cap_info->index]);
cap_info->index = NO_FURTHER_WRITE_ACTION;
}
@ -71,6 +67,14 @@ int __efi_capsule_setup_info(struct capsule_info *cap_info)
cap_info->pages = temp_page;
temp_page = krealloc(cap_info->phys,
pages_needed * sizeof(phys_addr_t *),
GFP_KERNEL | __GFP_ZERO);
if (!temp_page)
return -ENOMEM;
cap_info->phys = temp_page;
return 0;
}
@ -105,9 +109,24 @@ int __weak efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
**/
static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info)
{
bool do_vunmap = false;
int ret;
ret = efi_capsule_update(&cap_info->header, cap_info->pages);
/*
* cap_info->capsule may have been assigned already by a quirk
* handler, so only overwrite it if it is NULL
*/
if (!cap_info->capsule) {
cap_info->capsule = vmap(cap_info->pages, cap_info->index,
VM_MAP, PAGE_KERNEL);
if (!cap_info->capsule)
return -ENOMEM;
do_vunmap = true;
}
ret = efi_capsule_update(cap_info->capsule, cap_info->phys);
if (do_vunmap)
vunmap(cap_info->capsule);
if (ret) {
pr_err("capsule update failed\n");
return ret;
@ -165,10 +184,12 @@ static ssize_t efi_capsule_write(struct file *file, const char __user *buff,
goto failed;
}
cap_info->pages[cap_info->index++] = page_to_phys(page);
cap_info->pages[cap_info->index] = page;
cap_info->phys[cap_info->index] = page_to_phys(page);
cap_info->page_bytes_remain = PAGE_SIZE;
cap_info->index++;
} else {
page = phys_to_page(cap_info->pages[cap_info->index - 1]);
page = cap_info->pages[cap_info->index - 1];
}
kbuff = kmap(page);
@ -252,6 +273,7 @@ static int efi_capsule_release(struct inode *inode, struct file *file)
struct capsule_info *cap_info = file->private_data;
kfree(cap_info->pages);
kfree(cap_info->phys);
kfree(file->private_data);
file->private_data = NULL;
return 0;
@ -281,6 +303,13 @@ static int efi_capsule_open(struct inode *inode, struct file *file)
return -ENOMEM;
}
cap_info->phys = kzalloc(sizeof(void *), GFP_KERNEL);
if (!cap_info->phys) {
kfree(cap_info->pages);
kfree(cap_info);
return -ENOMEM;
}
file->private_data = cap_info;
return 0;
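Taken together, the capsule-loader hunks keep two parallel per-capsule arrays — struct page pointers for kmap()/__free_page() on the kernel side and phys_addr_t values for the firmware call — and efi_capsule_submit_update() only builds a temporary virtual mapping when no quirk has already supplied cap_info->capsule. Reduced to its skeleton (field names as in the hunks above), the conditional-map pattern is:

	bool do_vunmap = false;

	if (!cap_info->capsule) {
		/* nobody mapped the capsule yet: build a throwaway mapping */
		cap_info->capsule = vmap(cap_info->pages, cap_info->index,
					 VM_MAP, PAGE_KERNEL);
		if (!cap_info->capsule)
			return -ENOMEM;
		do_vunmap = true;	/* we created it, we tear it down */
	}

	ret = efi_capsule_update(cap_info->capsule, cap_info->phys);

	if (do_vunmap)
		vunmap(cap_info->capsule);	/* a quirk-supplied pointer is not ours to unmap */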


@ -1074,7 +1074,7 @@ void acpi_gpiochip_add(struct gpio_chip *chip)
}
if (!chip->names)
devprop_gpiochip_set_names(chip);
devprop_gpiochip_set_names(chip, dev_fwnode(chip->parent));
acpi_gpiochip_request_regions(acpi_gpio);
acpi_gpiochip_scan_gpios(acpi_gpio);


@ -19,30 +19,27 @@
/**
* devprop_gpiochip_set_names - Set GPIO line names using device properties
* @chip: GPIO chip whose lines should be named, if possible
* @fwnode: Property Node containing the gpio-line-names property
*
* Looks for device property "gpio-line-names" and if it exists assigns
* GPIO line names for the chip. The memory allocated for the assigned
* names belong to the underlying firmware node and should not be released
* by the caller.
*/
void devprop_gpiochip_set_names(struct gpio_chip *chip)
void devprop_gpiochip_set_names(struct gpio_chip *chip,
const struct fwnode_handle *fwnode)
{
struct gpio_device *gdev = chip->gpiodev;
const char **names;
int ret, i;
if (!chip->parent) {
dev_warn(&gdev->dev, "GPIO chip parent is NULL\n");
return;
}
ret = device_property_read_string_array(chip->parent, "gpio-line-names",
ret = fwnode_property_read_string_array(fwnode, "gpio-line-names",
NULL, 0);
if (ret < 0)
return;
if (ret != gdev->ngpio) {
dev_warn(chip->parent,
dev_warn(&gdev->dev,
"names %d do not match number of GPIOs %d\n", ret,
gdev->ngpio);
return;
@ -52,10 +49,10 @@ void devprop_gpiochip_set_names(struct gpio_chip *chip)
if (!names)
return;
ret = device_property_read_string_array(chip->parent, "gpio-line-names",
ret = fwnode_property_read_string_array(fwnode, "gpio-line-names",
names, gdev->ngpio);
if (ret < 0) {
dev_warn(chip->parent, "failed to read GPIO line names\n");
dev_warn(&gdev->dev, "failed to read GPIO line names\n");
kfree(names);
return;
}


@ -493,7 +493,8 @@ int of_gpiochip_add(struct gpio_chip *chip)
/* If the chip defines names itself, these take precedence */
if (!chip->names)
devprop_gpiochip_set_names(chip);
devprop_gpiochip_set_names(chip,
of_fwnode_handle(chip->of_node));
of_node_get(chip->of_node);


@ -224,7 +224,8 @@ static inline int gpio_chip_hwgpio(const struct gpio_desc *desc)
return desc - &desc->gdev->descs[0];
}
void devprop_gpiochip_set_names(struct gpio_chip *chip);
void devprop_gpiochip_set_names(struct gpio_chip *chip,
const struct fwnode_handle *fwnode);
/* With descriptor prefix */
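devprop_gpiochip_set_names() now receives the fwnode to query explicitly, so the ACPI path (dev_fwnode(chip->parent)) and the DT path (of_fwnode_handle(chip->of_node)) share one implementation. It relies on the usual two-call idiom for string-array properties: a NULL buffer first to learn how many strings exist, then a second call into a buffer of exactly that size. A condensed sketch of that idiom (error handling trimmed, buffer sizing simplified):

	const char **names;
	int count;

	/* a NULL buffer asks only for the number of strings present */
	count = fwnode_property_read_string_array(fwnode, "gpio-line-names",
						  NULL, 0);
	if (count < 0)
		return;			/* property absent or malformed */

	names = kcalloc(count, sizeof(*names), GFP_KERNEL);
	if (!names)
		return;

	/* the second call copies the string pointers into 'names' */
	if (fwnode_property_read_string_array(fwnode, "gpio-line-names",
					      names, count) < 0)
		kfree(names);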


@ -6944,6 +6944,7 @@ enum {
#define RESET_PCH_HANDSHAKE_ENABLE (1<<4)
#define GEN8_CHICKEN_DCPR_1 _MMIO(0x46430)
#define SKL_SELECT_ALTERNATE_DC_EXIT (1<<30)
#define MASK_WAKEMEM (1<<13)
#define SKL_DFSM _MMIO(0x51000)
@ -8475,6 +8476,7 @@ enum skl_power_gate {
#define BXT_CDCLK_CD2X_DIV_SEL_2 (2<<22)
#define BXT_CDCLK_CD2X_DIV_SEL_4 (3<<22)
#define BXT_CDCLK_CD2X_PIPE(pipe) ((pipe)<<20)
#define CDCLK_DIVMUX_CD_OVERRIDE (1<<19)
#define BXT_CDCLK_CD2X_PIPE_NONE BXT_CDCLK_CD2X_PIPE(3)
#define BXT_CDCLK_SSA_PRECHARGE_ENABLE (1<<16)
#define CDCLK_FREQ_DECIMAL_MASK (0x7ff)


@ -859,16 +859,10 @@ static void skl_set_preferred_cdclk_vco(struct drm_i915_private *dev_priv,
static void skl_dpll0_enable(struct drm_i915_private *dev_priv, int vco)
{
int min_cdclk = skl_calc_cdclk(0, vco);
u32 val;
WARN_ON(vco != 8100000 && vco != 8640000);
/* select the minimum CDCLK before enabling DPLL 0 */
val = CDCLK_FREQ_337_308 | skl_cdclk_decimal(min_cdclk);
I915_WRITE(CDCLK_CTL, val);
POSTING_READ(CDCLK_CTL);
/*
* We always enable DPLL0 with the lowest link rate possible, but still
* taking into account the VCO required to operate the eDP panel at the
@ -922,7 +916,7 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv,
{
int cdclk = cdclk_state->cdclk;
int vco = cdclk_state->vco;
u32 freq_select, pcu_ack;
u32 freq_select, pcu_ack, cdclk_ctl;
int ret;
WARN_ON((cdclk == 24000) != (vco == 0));
@ -939,7 +933,7 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv,
return;
}
/* set CDCLK_CTL */
/* Choose frequency for this cdclk */
switch (cdclk) {
case 450000:
case 432000:
@ -967,10 +961,33 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv,
dev_priv->cdclk.hw.vco != vco)
skl_dpll0_disable(dev_priv);
cdclk_ctl = I915_READ(CDCLK_CTL);
if (dev_priv->cdclk.hw.vco != vco) {
/* Wa Display #1183: skl,kbl,cfl */
cdclk_ctl &= ~(CDCLK_FREQ_SEL_MASK | CDCLK_FREQ_DECIMAL_MASK);
cdclk_ctl |= freq_select | skl_cdclk_decimal(cdclk);
I915_WRITE(CDCLK_CTL, cdclk_ctl);
}
/* Wa Display #1183: skl,kbl,cfl */
cdclk_ctl |= CDCLK_DIVMUX_CD_OVERRIDE;
I915_WRITE(CDCLK_CTL, cdclk_ctl);
POSTING_READ(CDCLK_CTL);
if (dev_priv->cdclk.hw.vco != vco)
skl_dpll0_enable(dev_priv, vco);
I915_WRITE(CDCLK_CTL, freq_select | skl_cdclk_decimal(cdclk));
/* Wa Display #1183: skl,kbl,cfl */
cdclk_ctl &= ~(CDCLK_FREQ_SEL_MASK | CDCLK_FREQ_DECIMAL_MASK);
I915_WRITE(CDCLK_CTL, cdclk_ctl);
cdclk_ctl |= freq_select | skl_cdclk_decimal(cdclk);
I915_WRITE(CDCLK_CTL, cdclk_ctl);
/* Wa Display #1183: skl,kbl,cfl */
cdclk_ctl &= ~CDCLK_DIVMUX_CD_OVERRIDE;
I915_WRITE(CDCLK_CTL, cdclk_ctl);
POSTING_READ(CDCLK_CTL);
/* inform PCU of the change */


@ -598,6 +598,11 @@ void gen9_enable_dc5(struct drm_i915_private *dev_priv)
DRM_DEBUG_KMS("Enabling DC5\n");
/* Wa Display #1183: skl,kbl,cfl */
if (IS_GEN9_BC(dev_priv))
I915_WRITE(GEN8_CHICKEN_DCPR_1, I915_READ(GEN8_CHICKEN_DCPR_1) |
SKL_SELECT_ALTERNATE_DC_EXIT);
gen9_set_dc_state(dev_priv, DC_STATE_EN_UPTO_DC5);
}
@ -625,6 +630,11 @@ void skl_disable_dc6(struct drm_i915_private *dev_priv)
{
DRM_DEBUG_KMS("Disabling DC6\n");
/* Wa Display #1183: skl,kbl,cfl */
if (IS_GEN9_BC(dev_priv))
I915_WRITE(GEN8_CHICKEN_DCPR_1, I915_READ(GEN8_CHICKEN_DCPR_1) |
SKL_SELECT_ALTERNATE_DC_EXIT);
gen9_set_dc_state(dev_priv, DC_STATE_DISABLE);
}
@ -1786,6 +1796,7 @@ void intel_display_power_put(struct drm_i915_private *dev_priv,
GLK_DISPLAY_POWERWELL_2_POWER_DOMAINS | \
BIT_ULL(POWER_DOMAIN_MODESET) | \
BIT_ULL(POWER_DOMAIN_AUX_A) | \
BIT_ULL(POWER_DOMAIN_GMBUS) | \
BIT_ULL(POWER_DOMAIN_INIT))
#define CNL_DISPLAY_POWERWELL_2_POWER_DOMAINS ( \


@ -386,6 +386,9 @@ int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev)
if (ret)
return ret;
if (!qp->qp_sec)
return 0;
mutex_lock(&real_qp->qp_sec->mutex);
ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys,
qp->qp_sec);


@ -2085,8 +2085,8 @@ int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file,
return -EOPNOTSUPP;
if (ucore->inlen > sizeof(cmd)) {
if (ib_is_udata_cleared(ucore, sizeof(cmd),
ucore->inlen - sizeof(cmd)))
if (!ib_is_udata_cleared(ucore, sizeof(cmd),
ucore->inlen - sizeof(cmd)))
return -EOPNOTSUPP;
}
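The original condition was inverted: the intent of this ABI-extension idiom is to accept a userspace command that is larger than the kernel's structure only when every trailing byte the kernel does not understand is zero, and to reject it otherwise, so the error path must trigger when ib_is_udata_cleared() returns false. The rule in isolation, as a runnable miniature (the helper names here are illustrative, not the IB core API):

	#include <errno.h>
	#include <stdbool.h>
	#include <stddef.h>

	/* true if every byte in buf[from..len) is zero */
	static bool tail_is_cleared(const unsigned char *buf, size_t from, size_t len)
	{
		for (size_t i = from; i < len; i++)
			if (buf[i])
				return false;
		return true;
	}

	/* accept an over-long command only if the unknown tail is all zero */
	static int check_cmd(const unsigned char *cmd, size_t inlen, size_t known)
	{
		if (inlen > known && !tail_is_cleared(cmd, known, inlen))
			return -EOPNOTSUPP;
		return 0;
	}

	int main(void)
	{
		unsigned char ok[8]  = { 1, 2, 3, 4, 0, 0, 0, 0 };
		unsigned char bad[8] = { 1, 2, 3, 4, 0, 9, 0, 0 };

		return (check_cmd(ok, 8, 4) == 0 &&
			check_cmd(bad, 8, 4) == -EOPNOTSUPP) ? 0 : 1;
	}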


@ -1400,7 +1400,8 @@ int ib_close_qp(struct ib_qp *qp)
spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
atomic_dec(&real_qp->usecnt);
ib_close_shared_qp_security(qp->qp_sec);
if (qp->qp_sec)
ib_close_shared_qp_security(qp->qp_sec);
kfree(qp);
return 0;


@ -586,10 +586,10 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
ret = -EAGAIN;
goto skip_cqe;
}
if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
if (unlikely(!CQE_STATUS(hw_cqe) &&
CQE_WRID_MSN(hw_cqe) != wq->rq.msn)) {
t4_set_wq_in_error(wq);
hw_cqe->header |= htonl(CQE_STATUS_V(T4_ERR_MSN));
goto proc_cqe;
hw_cqe->header |= cpu_to_be32(CQE_STATUS_V(T4_ERR_MSN));
}
goto proc_cqe;
}


@ -1129,7 +1129,6 @@ struct hfi1_devdata {
u16 pcie_lnkctl;
u16 pcie_devctl2;
u32 pci_msix0;
u32 pci_lnkctl3;
u32 pci_tph2;
/*


@ -411,15 +411,12 @@ int restore_pci_variables(struct hfi1_devdata *dd)
if (ret)
goto error;
ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
dd->pci_lnkctl3);
if (ret)
goto error;
ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
if (ret)
goto error;
if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) {
ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2,
dd->pci_tph2);
if (ret)
goto error;
}
return 0;
error:
@ -469,15 +466,12 @@ int save_pci_variables(struct hfi1_devdata *dd)
if (ret)
goto error;
ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
&dd->pci_lnkctl3);
if (ret)
goto error;
ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
if (ret)
goto error;
if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) {
ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2,
&dd->pci_tph2);
if (ret)
goto error;
}
return 0;
error:


@ -1415,6 +1415,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
}
INIT_LIST_HEAD(&context->vma_private_list);
mutex_init(&context->vma_private_list_mutex);
INIT_LIST_HEAD(&context->db_page_list);
mutex_init(&context->db_page_mutex);
@ -1576,7 +1577,9 @@ static void mlx5_ib_vma_close(struct vm_area_struct *area)
* mlx5_ib_disassociate_ucontext().
*/
mlx5_ib_vma_priv_data->vma = NULL;
mutex_lock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
list_del(&mlx5_ib_vma_priv_data->list);
mutex_unlock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
kfree(mlx5_ib_vma_priv_data);
}
@ -1596,10 +1599,13 @@ static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
return -ENOMEM;
vma_prv->vma = vma;
vma_prv->vma_private_list_mutex = &ctx->vma_private_list_mutex;
vma->vm_private_data = vma_prv;
vma->vm_ops = &mlx5_ib_vm_ops;
mutex_lock(&ctx->vma_private_list_mutex);
list_add(&vma_prv->list, vma_head);
mutex_unlock(&ctx->vma_private_list_mutex);
return 0;
}
@ -1642,6 +1648,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
* mlx5_ib_vma_close.
*/
down_write(&owning_mm->mmap_sem);
mutex_lock(&context->vma_private_list_mutex);
list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
list) {
vma = vma_private->vma;
@ -1656,6 +1663,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
list_del(&vma_private->list);
kfree(vma_private);
}
mutex_unlock(&context->vma_private_list_mutex);
up_write(&owning_mm->mmap_sem);
mmput(owning_mm);
put_task_struct(owning_process);


@ -115,6 +115,8 @@ enum {
struct mlx5_ib_vma_private_data {
struct list_head list;
struct vm_area_struct *vma;
/* protect vma_private_list add/del */
struct mutex *vma_private_list_mutex;
};
struct mlx5_ib_ucontext {
@ -129,6 +131,8 @@ struct mlx5_ib_ucontext {
/* Transport Domain number */
u32 tdn;
struct list_head vma_private_list;
/* protect vma_private_list add/del */
struct mutex vma_private_list_mutex;
unsigned long upd_xlt_page;
/* protect ODP/KSM */
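The mlx5 fix puts a context-wide mutex around every list_add()/list_del()/walk of vma_private_list, because mlx5_ib_vma_close() can run in a different task than mlx5_ib_disassociate_ucontext(); each entry also keeps a back-pointer to that mutex so the close path can lock the right context without having it passed in. The shape of that pattern as a kernel-style sketch (structure and function names are generic stand-ins, and the context is assumed to run mutex_init()/INIT_LIST_HEAD() at creation time):

	#include <linux/list.h>
	#include <linux/mutex.h>
	#include <linux/slab.h>

	struct ctx {
		struct list_head vma_private_list;
		struct mutex vma_private_list_mutex;	/* protects add/del/walk */
	};

	struct vma_priv {
		struct list_head list;
		struct mutex *vma_private_list_mutex;	/* back-pointer to the ctx mutex */
	};

	static void vma_priv_add(struct ctx *c, struct vma_priv *p)
	{
		p->vma_private_list_mutex = &c->vma_private_list_mutex;
		mutex_lock(&c->vma_private_list_mutex);
		list_add(&p->list, &c->vma_private_list);
		mutex_unlock(&c->vma_private_list_mutex);
	}

	static void vma_priv_del(struct vma_priv *p)
	{
		/* may be called without the ctx at hand, hence the back-pointer */
		mutex_lock(p->vma_private_list_mutex);
		list_del(&p->list);
		mutex_unlock(p->vma_private_list_mutex);
		kfree(p);
	}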


@ -1613,7 +1613,7 @@ static int elantech_set_properties(struct elantech_data *etd)
case 5:
etd->hw_version = 3;
break;
case 6 ... 14:
case 6 ... 15:
etd->hw_version = 4;
break;
default:


@ -1611,13 +1611,15 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain)
domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap;
domain->geometry.aperture_end = (1UL << ias) - 1;
domain->geometry.force_aperture = true;
smmu_domain->pgtbl_ops = pgtbl_ops;
ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg);
if (ret < 0)
if (ret < 0) {
free_io_pgtable_ops(pgtbl_ops);
return ret;
}
return ret;
smmu_domain->pgtbl_ops = pgtbl_ops;
return 0;
}
static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid)
@ -1644,7 +1646,7 @@ static __le64 *arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid)
static void arm_smmu_install_ste_for_dev(struct iommu_fwspec *fwspec)
{
int i;
int i, j;
struct arm_smmu_master_data *master = fwspec->iommu_priv;
struct arm_smmu_device *smmu = master->smmu;
@ -1652,6 +1654,13 @@ static void arm_smmu_install_ste_for_dev(struct iommu_fwspec *fwspec)
u32 sid = fwspec->ids[i];
__le64 *step = arm_smmu_get_step_for_sid(smmu, sid);
/* Bridged PCI devices may end up with duplicated IDs */
for (j = 0; j < i; j++)
if (fwspec->ids[j] == sid)
break;
if (j < i)
continue;
arm_smmu_write_strtab_ent(smmu, sid, step, &master->ste);
}
}
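Bridged PCI devices can end up with the same Stream ID listed more than once in fwspec->ids, so the install loop now checks whether the current ID already appeared earlier in the array and skips it instead of programming the same stream table entry twice. The dedup idiom itself — scan the already-processed prefix, skip on a match — as a runnable miniature:

	#include <stdio.h>

	int main(void)
	{
		unsigned int ids[] = { 5, 7, 5, 9, 7 };
		int n = sizeof(ids) / sizeof(ids[0]);

		for (int i = 0; i < n; i++) {
			int j;

			/* look for the same value earlier in the array */
			for (j = 0; j < i; j++)
				if (ids[j] == ids[i])
					break;
			if (j < i)
				continue;	/* duplicate: already handled */

			printf("programming id %u\n", ids[i]);
		}
		return 0;
	}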


@ -950,6 +950,7 @@ static void prepare_start_command(struct pxa3xx_nand_info *info, int command)
switch (command) {
case NAND_CMD_READ0:
case NAND_CMD_READOOB:
case NAND_CMD_PAGEPROG:
info->use_ecc = 1;
break;


@ -167,7 +167,7 @@ static void bcm_sf2_gphy_enable_set(struct dsa_switch *ds, bool enable)
reg = reg_readl(priv, REG_SPHY_CNTRL);
if (enable) {
reg |= PHY_RESET;
reg &= ~(EXT_PWR_DOWN | IDDQ_BIAS | CK25_DIS);
reg &= ~(EXT_PWR_DOWN | IDDQ_BIAS | IDDQ_GLOBAL_PWR | CK25_DIS);
reg_writel(priv, reg, REG_SPHY_CNTRL);
udelay(21);
reg = reg_readl(priv, REG_SPHY_CNTRL);


@ -1875,7 +1875,7 @@ static int bnxt_poll_work(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
* here forever if we consistently cannot allocate
* buffers.
*/
else if (rc == -ENOMEM)
else if (rc == -ENOMEM && budget)
rx_pkts++;
else if (rc == -EBUSY) /* partial completion */
break;
@ -1961,7 +1961,7 @@ static int bnxt_poll_nitroa0(struct napi_struct *napi, int budget)
cpu_to_le32(RX_CMPL_ERRORS_CRC_ERROR);
rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event);
if (likely(rc == -EIO))
if (likely(rc == -EIO) && budget)
rx_pkts++;
else if (rc == -EBUSY) /* partial completion */
break;


@ -14227,7 +14227,9 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu)
/* Reset PHY, otherwise the read DMA engine will be in a mode that
* breaks all requests to 256 bytes.
*/
if (tg3_asic_rev(tp) == ASIC_REV_57766)
if (tg3_asic_rev(tp) == ASIC_REV_57766 ||
tg3_asic_rev(tp) == ASIC_REV_5717 ||
tg3_asic_rev(tp) == ASIC_REV_5719)
reset_phy = true;
err = tg3_restart_hw(tp, reset_phy);


@ -818,6 +818,12 @@ static void fec_enet_bd_init(struct net_device *dev)
for (i = 0; i < txq->bd.ring_size; i++) {
/* Initialize the BD for every fragment in the page. */
bdp->cbd_sc = cpu_to_fec16(0);
if (bdp->cbd_bufaddr &&
!IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr)))
dma_unmap_single(&fep->pdev->dev,
fec32_to_cpu(bdp->cbd_bufaddr),
fec16_to_cpu(bdp->cbd_datlen),
DMA_TO_DEVICE);
if (txq->tx_skbuff[i]) {
dev_kfree_skb_any(txq->tx_skbuff[i]);
txq->tx_skbuff[i] = NULL;


@ -344,7 +344,8 @@ static int orion_mdio_probe(struct platform_device *pdev)
dev->regs + MVMDIO_ERR_INT_MASK);
} else if (dev->err_interrupt == -EPROBE_DEFER) {
return -EPROBE_DEFER;
ret = -EPROBE_DEFER;
goto out_mdio;
}
if (pdev->dev.of_node)
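In the mvmdio hunk, returning -EPROBE_DEFER directly skipped the driver's unwind path, leaving resources acquired earlier in probe (such as the already-enabled clock) unreleased; routing the deferral through the existing out_mdio label makes it clean up like every other error. The general goto-unwind shape for probe-style init, as a runnable miniature (plain allocations stand in for clocks and register mappings, and a real probe would of course keep them on success):

	#include <stdio.h>
	#include <stdlib.h>

	static int fake_probe(int fail_late)
	{
		char *regs, *clk;
		int ret;

		regs = malloc(16);		/* stand-in for mapping registers */
		if (!regs)
			return -1;

		clk = malloc(16);		/* stand-in for enabling a clock */
		if (!clk) {
			ret = -1;
			goto out_regs;
		}

		/*
		 * A late, recoverable failure (think -EPROBE_DEFER) must not
		 * simply 'return': it has to unwind through the same labels
		 * as every other error path.
		 */
		if (fail_late) {
			ret = -2;
			goto out_clk;
		}

		printf("probe ok\n");
		free(clk);			/* miniature only: avoid leaking */
		free(regs);
		return 0;

	out_clk:
		free(clk);
	out_regs:
		free(regs);
		return ret;
	}

	int main(void)
	{
		return (fake_probe(0) == 0 && fake_probe(1) == -2) ? 0 : 1;
	}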


@ -362,7 +362,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
case MLX5_CMD_OP_QUERY_VPORT_COUNTER:
case MLX5_CMD_OP_ALLOC_Q_COUNTER:
case MLX5_CMD_OP_QUERY_Q_COUNTER:
case MLX5_CMD_OP_SET_RATE_LIMIT:
case MLX5_CMD_OP_SET_PP_RATE_LIMIT:
case MLX5_CMD_OP_QUERY_RATE_LIMIT:
case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT:
@ -505,7 +505,7 @@ const char *mlx5_command_str(int command)
MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER);
MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER);
MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER);
MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT);
MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT);
MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT);
MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT);
MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT);


@ -590,6 +590,7 @@ struct mlx5e_channel {
struct mlx5_core_dev *mdev;
struct mlx5e_tstamp *tstamp;
int ix;
int cpu;
};
struct mlx5e_channels {


@ -71,11 +71,6 @@ struct mlx5e_channel_param {
struct mlx5e_cq_param icosq_cq;
};
static int mlx5e_get_node(struct mlx5e_priv *priv, int ix)
{
return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix);
}
static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
{
return MLX5_CAP_GEN(mdev, striding_rq) &&
@ -452,17 +447,16 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
int mtt_sz = mlx5e_get_wqe_mtt_sz();
int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1;
int node = mlx5e_get_node(c->priv, c->ix);
int i;
rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
GFP_KERNEL, node);
GFP_KERNEL, cpu_to_node(c->cpu));
if (!rq->mpwqe.info)
goto err_out;
/* We allocate more than mtt_sz as we will align the pointer */
rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz,
GFP_KERNEL, node);
rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL,
cpu_to_node(c->cpu));
if (unlikely(!rq->mpwqe.mtt_no_align))
goto err_free_wqe_info;
@ -570,7 +564,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
int err;
int i;
rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
rqp->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
&rq->wq_ctrl);
@ -636,8 +630,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
default: /* MLX5_WQ_TYPE_LINKED_LIST */
rq->wqe.frag_info =
kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
GFP_KERNEL,
mlx5e_get_node(c->priv, c->ix));
GFP_KERNEL, cpu_to_node(c->cpu));
if (!rq->wqe.frag_info) {
err = -ENOMEM;
goto err_rq_wq_destroy;
@ -1007,13 +1000,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
sq->uar_map = mdev->mlx5e_res.bfreg.map;
sq->min_inline_mode = params->tx_min_inline_mode;
param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
param->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
if (err)
return err;
sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix));
err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu));
if (err)
goto err_sq_wq_destroy;
@ -1060,13 +1053,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
sq->channel = c;
sq->uar_map = mdev->mlx5e_res.bfreg.map;
param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
param->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
if (err)
return err;
sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix));
err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu));
if (err)
goto err_sq_wq_destroy;
@ -1132,13 +1125,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
if (MLX5_IPSEC_DEV(c->priv->mdev))
set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
param->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
if (err)
return err;
sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix));
err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu));
if (err)
goto err_sq_wq_destroy;
@ -1510,8 +1503,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c,
struct mlx5_core_dev *mdev = c->priv->mdev;
int err;
param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix);
param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
param->wq.buf_numa_node = cpu_to_node(c->cpu);
param->wq.db_numa_node = cpu_to_node(c->cpu);
param->eq_ix = c->ix;
err = mlx5e_alloc_cq_common(mdev, param, cq);
@ -1610,6 +1603,11 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq)
mlx5e_free_cq(cq);
}
static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
{
return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
}
static int mlx5e_open_tx_cqs(struct mlx5e_channel *c,
struct mlx5e_params *params,
struct mlx5e_channel_param *cparam)
@ -1758,12 +1756,13 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
{
struct mlx5e_cq_moder icocq_moder = {0, 0};
struct net_device *netdev = priv->netdev;
int cpu = mlx5e_get_cpu(priv, ix);
struct mlx5e_channel *c;
unsigned int irq;
int err;
int eqn;
c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix));
c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
if (!c)
return -ENOMEM;
@ -1771,6 +1770,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
c->mdev = priv->mdev;
c->tstamp = &priv->tstamp;
c->ix = ix;
c->cpu = cpu;
c->pdev = &priv->mdev->pdev->dev;
c->netdev = priv->netdev;
c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
@ -1859,8 +1859,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c)
for (tc = 0; tc < c->num_tc; tc++)
mlx5e_activate_txqsq(&c->sq[tc]);
mlx5e_activate_rq(&c->rq);
netif_set_xps_queue(c->netdev,
mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix);
netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix);
}
static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
@ -3554,6 +3553,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
struct sk_buff *skb,
netdev_features_t features)
{
unsigned int offset = 0;
struct udphdr *udph;
u8 proto;
u16 port;
@ -3563,7 +3563,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
proto = ip_hdr(skb)->protocol;
break;
case htons(ETH_P_IPV6):
proto = ipv6_hdr(skb)->nexthdr;
proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL);
break;
default:
goto out;
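The last hunk replaces a direct read of ipv6_hdr(skb)->nexthdr with ipv6_find_hdr(): the nexthdr field only names the first header after the fixed IPv6 header, so for packets carrying extension headers it identifies the extension rather than the transport protocol. ipv6_find_hdr() with a negative target walks the whole extension chain and reports the final protocol. A kernel-style sketch of that call (the surrounding function context is elided):

	unsigned int offset = 0;
	int proto;

	/*
	 * target == -1: skip over any extension headers (hop-by-hop,
	 * routing, ...) and return the protocol of the last header,
	 * leaving 'offset' at its start.
	 */
	proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL);
	if (proto < 0)
		goto out;		/* truncated or malformed header chain */

	if (proto != IPPROTO_UDP)
		goto out;		/* only UDP-based tunnels matter here */
	/* ... the UDP destination port is inspected next ... */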

Some files were not shown because too many files have changed in this diff.