author     Jiri Kosina <jkosina@suse.cz>  2017-12-29 11:25:37 +0100
committer  Jiri Kosina <jkosina@suse.cz>  2017-12-29 11:25:37 +0100
commit     9d1cfb8cff865bee539ebf307c1cd13bebd892c7 (patch)
tree       0e31a953ec800653a6059f1a20afffdaf249d281
parent     689902dd104c29534a82563b40474aacaf42c3fd (diff)
parent     41927953efda74f2e1f70e5c6b79fbc91d102517 (diff)

Merge branch 'users/jkosina/SLE11-SP4/pti' into SLE11-SP4_EMBARGO  [rpm-3.0.101-108.21]

Conflicts:
	blacklist.conf
	series.conf

suse-commit: 5f5299b1066bc526314a409fa40cd5bbc86c43dd
-rw-r--r--  Documentation/kernel-parameters.txt | 17
-rw-r--r--  arch/powerpc/include/asm/exception-64s.h | 58
-rw-r--r--  arch/powerpc/include/asm/feature-fixups.h | 16
-rw-r--r--  arch/powerpc/include/asm/reg.h | 2
-rw-r--r--  arch/powerpc/include/asm/setup.h | 7
-rw-r--r--  arch/powerpc/include/asm/system.h | 2
-rw-r--r--  arch/powerpc/kernel/entry_64.S | 37
-rw-r--r--  arch/powerpc/kernel/exceptions-64s.S | 57
-rw-r--r--  arch/powerpc/kernel/setup.h | 1
-rw-r--r--  arch/powerpc/kernel/setup_64.c | 76
-rw-r--r--  arch/powerpc/kernel/sysfs.c | 40
-rw-r--r--  arch/powerpc/kernel/vmlinux.lds.S | 9
-rw-r--r--  arch/powerpc/kvm/book3s_rmhandlers.S | 4
-rw-r--r--  arch/powerpc/lib/feature-fixups.c | 27
-rw-r--r--  arch/powerpc/platforms/powernv/setup.c | 376
-rw-r--r--  arch/powerpc/platforms/pseries/setup.c | 2
-rw-r--r--  arch/s390/include/asm/alternative.h | 149
-rw-r--r--  arch/s390/include/asm/processor.h | 1
-rw-r--r--  arch/s390/include/asm/system.h | 8
-rw-r--r--  arch/s390/kernel/Makefile | 2
-rw-r--r--  arch/s390/kernel/alternative.c | 123
-rw-r--r--  arch/s390/kernel/entry.S | 73
-rw-r--r--  arch/s390/kernel/entry64.S | 76
-rw-r--r--  arch/s390/kernel/ipl.c | 1
-rw-r--r--  arch/s390/kernel/module.c | 13
-rw-r--r--  arch/s390/kernel/setup.c | 3
-rw-r--r--  arch/s390/kernel/smp.c | 2
-rw-r--r--  arch/s390/kernel/vmlinux.lds.S | 26
-rw-r--r--  arch/um/sys-i386/shared/sysdep/system.h | 1
-rw-r--r--  arch/um/sys-x86_64/shared/sysdep/system.h | 1
-rw-r--r--  arch/x86/boot/compressed/misc.h | 1
-rw-r--r--  arch/x86/ia32/ia32entry.S | 27
-rw-r--r--  arch/x86/include/asm/alternative-asm.h | 35
-rw-r--r--  arch/x86/include/asm/cmdline.h | 8
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 12
-rw-r--r--  arch/x86/include/asm/desc.h | 4
-rw-r--r--  arch/x86/include/asm/fixmap.h | 3
-rw-r--r--  arch/x86/include/asm/hardirq.h | 6
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 4
-rw-r--r--  arch/x86/include/asm/kaiser.h | 143
-rw-r--r--  arch/x86/include/asm/mmu.h | 6
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 75
-rw-r--r--  arch/x86/include/asm/msr-index.h | 9
-rw-r--r--  arch/x86/include/asm/msr.h | 15
-rw-r--r--  arch/x86/include/asm/nops.h | 4
-rw-r--r--  arch/x86/include/asm/pgtable.h | 27
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 24
-rw-r--r--  arch/x86/include/asm/pgtable_types.h | 35
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 3
-rw-r--r--  arch/x86/include/asm/processor.h | 5
-rw-r--r--  arch/x86/include/asm/proto.h | 1
-rw-r--r--  arch/x86/include/asm/spec_ctrl.h | 108
-rw-r--r--  arch/x86/include/asm/system.h | 4
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 191
-rw-r--r--  arch/x86/include/asm/vsyscall.h | 1
-rw-r--r--  arch/x86/include/asm/vvar.h | 22
-rw-r--r--  arch/x86/include/mach-xen/asm/desc.h | 4
-rw-r--r--  arch/x86/include/mach-xen/asm/fixmap.h | 3
-rw-r--r--  arch/x86/include/mach-xen/asm/mmu_context.h | 109
-rw-r--r--  arch/x86/include/mach-xen/asm/pgtable.h | 27
-rw-r--r--  arch/x86/include/mach-xen/asm/pgtable_64.h | 24
-rw-r--r--  arch/x86/include/mach-xen/asm/pgtable_types.h | 35
-rw-r--r--  arch/x86/include/mach-xen/asm/processor.h | 4
-rw-r--r--  arch/x86/include/mach-xen/asm/system.h | 1
-rw-r--r--  arch/x86/include/mach-xen/asm/tlbflush.h | 61
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 15
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 8
-rw-r--r--  arch/x86/kernel/cpu/common-xen.c | 62
-rw-r--r--  arch/x86/kernel/cpu/common.c | 95
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 3
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 54
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 2
-rw-r--r--  arch/x86/kernel/cpu/spec_ctrl.c | 98
-rw-r--r--  arch/x86/kernel/entry_64-xen.S | 75
-rw-r--r--  arch/x86/kernel/entry_64.S | 217
-rw-r--r--  arch/x86/kernel/espfix_64.c | 10
-rw-r--r--  arch/x86/kernel/head_64.S | 29
-rw-r--r--  arch/x86/kernel/hpet.c | 5
-rw-r--r--  arch/x86/kernel/init_task.c | 5
-rw-r--r--  arch/x86/kernel/irqinit.c | 2
-rw-r--r--  arch/x86/kernel/ldt-xen.c | 26
-rw-r--r--  arch/x86/kernel/ldt.c | 25
-rw-r--r--  arch/x86/kernel/microcode_core.c | 6
-rw-r--r--  arch/x86/kernel/process.c | 10
-rw-r--r--  arch/x86/kernel/process_64.c | 2
-rw-r--r--  arch/x86/kernel/reboot.c | 6
-rw-r--r--  arch/x86/kernel/setup.c | 7
-rw-r--r--  arch/x86/kernel/smpboot.c | 5
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 28
-rw-r--r--  arch/x86/kernel/vsyscall_64-xen.c | 9
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 9
-rw-r--r--  arch/x86/kvm/svm.c | 52
-rw-r--r--  arch/x86/kvm/vmx.c | 34
-rw-r--r--  arch/x86/kvm/x86.c | 23
-rw-r--r--  arch/x86/lib/Makefile | 2
-rw-r--r--  arch/x86/lib/cmdline.c | 189
-rw-r--r--  arch/x86/mm/Makefile | 6
-rw-r--r--  arch/x86/mm/init-xen.c | 2
-rw-r--r--  arch/x86/mm/init.c | 2
-rw-r--r--  arch/x86/mm/init_64-xen.c | 10
-rw-r--r--  arch/x86/mm/init_64.c | 10
-rw-r--r--  arch/x86/mm/kaiser.c | 437
-rw-r--r--  arch/x86/mm/pgtable-xen.c | 3
-rw-r--r--  arch/x86/mm/pgtable.c | 27
-rw-r--r--  arch/x86/mm/tlb-xen.c | 127
-rw-r--r--  arch/x86/mm/tlb.c | 113
-rw-r--r--  arch/x86/xen/enlighten.c | 6
-rw-r--r--  drivers/media/video/uvc/uvc_v4l2.c | 2
-rw-r--r--  drivers/net/wireless/ath/carl9170/main.c | 2
-rw-r--r--  drivers/net/wireless/p54/main.c | 2
-rw-r--r--  fs/udf/misc.c | 7
-rw-r--r--  include/asm-generic/vmlinux.lds.h | 7
-rw-r--r--  include/linux/fdtable.h | 7
-rw-r--r--  include/linux/kaiser.h | 52
-rw-r--r--  include/linux/mmu_context.h | 9
-rw-r--r--  include/linux/percpu-defs.h | 32
-rw-r--r--  include/linux/system.h | 10
-rw-r--r--  init/main.c | 2
-rw-r--r--  kernel/fork.c | 6
-rw-r--r--  kernel/sched.c | 6
-rw-r--r--  mm/mmu_context.c | 2
-rw-r--r--  net/core/filter.c | 3
-rw-r--r--  security/Kconfig | 10
124 files changed, 3807 insertions, 460 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index b216460fb730..ecdddeb3ca7a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1775,6 +1775,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
Disable SMEP (Supervisor Mode Execution Protection)
even if it is supported by processor.
+ nospec [X86]
+ Disable indirect branch restricted speculation and
+ indirect branch prediction barrier to avoid performance
+ penalties in trusted environments.
+
noexec32 [X86-64]
This affects only 32-bit executables.
noexec32=on: enable non-executable mappings (default)
@@ -1845,8 +1850,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nointroute [IA-64]
+ noinvpcid [X86] Disable the INVPCID cpu feature.
+
nojitter [IA64] Disables jitter checking for ITC timers.
+ nopti [X86-64] Disable KAISER isolation of kernel from user.
+
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
@@ -1869,6 +1878,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nopat [X86] Disable PAT (page attribute table extension of
pagetables) support.
+ nopcid [X86-64] Disable the PCID cpu feature.
+
norandmaps Don't use address space randomization. Equivalent to
echo 0 > /proc/sys/kernel/randomize_va_space
@@ -2296,6 +2307,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
pt. [PARIDE]
See Documentation/blockdev/paride.txt.
+ pti= [X86_64]
+ Control KAISER user/kernel address space isolation:
+ on - enable
+ off - disable
+ auto - default setting
+
pty.legacy_count=
[KNL] Number of legacy pty's. Overwrites compiled-in
default number.
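
The options documented above are consumed very early in boot. As a rough illustration only -- the actual pti=/nopti parsing added by this series lives elsewhere and is not shown in this excerpt -- a boot option of this kind is usually wired up through an early_param() handler along these lines; the enum and handler names here are made up:

/* Hypothetical sketch of a pti= handler; illustrative, not from this patch. */
#include <linux/init.h>
#include <linux/string.h>
#include <linux/errno.h>

enum pti_mode { PTI_AUTO, PTI_FORCE_OFF, PTI_FORCE_ON };
static enum pti_mode pti_mode = PTI_AUTO;

static int __init pti_setup(char *str)
{
    if (!str)
        return -EINVAL;
    if (!strcmp(str, "on"))
        pti_mode = PTI_FORCE_ON;        /* pti=on: always isolate */
    else if (!strcmp(str, "off"))
        pti_mode = PTI_FORCE_OFF;       /* pti=off: never isolate */
    else if (!strcmp(str, "auto"))
        pti_mode = PTI_AUTO;            /* default setting */
    else
        return -EINVAL;
    return 0;
}
early_param("pti", pti_setup);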
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index f9fa656e141b..1a91dec97b67 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -35,6 +35,8 @@
* implementations as possible.
*/
+#include <asm/bug.h>
+
#define EX_R9 0
#define EX_R10 8
#define EX_R11 16
@@ -50,6 +52,58 @@
#define EX_PPR 88 /* SMT thread status register (priority) */
/*
+ * The nop instruction allows a secure memory protection instruction to be
+ * inserted with the rfi flush fixup.
+ */
+#define PREPARE_RFI_TO_USER \
+ RFI_FLUSH_FIXUP_SECTION; \
+ nop
+
+#define PREPARE_RFI_TO_GUEST \
+ RFI_FLUSH_FIXUP_SECTION; \
+ nop
+
+#define DEBUG_RFI
+
+#ifdef DEBUG_RFI
+#define CHECK_TARGET_MSR_PR(srr_reg, expected_pr) \
+ SET_SCRATCH0(r3); \
+ mfspr r3,srr_reg; \
+ extrdi r3,r3,1,63-MSR_PR_LG; \
+666: tdnei r3,expected_pr; \
+ EMIT_BUG_ENTRY 666b,__FILE__,__LINE__,0; \
+ GET_SCRATCH0(r3);
+#else
+#define CHECK_TARGET_MSR_PR(expected)
+#endif
+
+#define RFI_TO_KERNEL \
+ CHECK_TARGET_MSR_PR(SPRN_SRR1, 0); \
+ rfid
+
+#define RFI_TO_USER \
+ CHECK_TARGET_MSR_PR(SPRN_SRR1, 1); \
+ PREPARE_RFI_TO_USER; \
+ rfid
+
+#define RFI_TO_GUEST \
+ PREPARE_RFI_TO_GUEST; \
+ rfid
+
+#define HRFI_TO_KERNEL \
+ CHECK_TARGET_MSR_PR(SPRN_HSRR1, 0); \
+ hrfid
+
+#define HRFI_TO_USER \
+ CHECK_TARGET_MSR_PR(SPRN_HSRR1, 1); \
+ PREPARE_RFI_TO_USER; \
+ hrfid
+
+#define HRFI_TO_GUEST \
+ PREPARE_RFI_TO_GUEST; \
+ hrfid
+
+/*
* We're short on space and time in the exception prolog, so we can't
* use the normal SET_REG_IMMEDIATE macro. Normally we just need the
* low halfword of the address, but for Kdump we need the whole low
@@ -122,7 +176,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,943)
mtspr SPRN_##h##SRR0,r12; \
mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \
mtspr SPRN_##h##SRR1,r10; \
- h##rfid; \
+ h##rfid; /* should be h##RFI_TO_KERNEL but run out of space */ \
b . /* prevent speculative execution */
#define EXCEPTION_PROLOG_PSERIES_1(label, h) \
__EXCEPTION_PROLOG_PSERIES_1(label, h)
@@ -241,7 +295,7 @@ label##_hv: \
mtspr SPRN_##h##SRR0,r12; \
mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \
mtspr SPRN_##h##SRR1,r10; \
- h##rfid; \
+ h##RFI_TO_KERNEL; \
b . /* prevent speculative execution */
#define _MASKABLE_EXCEPTION_PSERIES(vec, label, h) \
__MASKABLE_EXCEPTION_PSERIES(vec, label, h)
diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
index 9a67a38bf7b9..6ef97605fa64 100644
--- a/arch/powerpc/include/asm/feature-fixups.h
+++ b/arch/powerpc/include/asm/feature-fixups.h
@@ -184,4 +184,20 @@ label##3: \
FTR_ENTRY_OFFSET label##1b-label##3b; \
.popsection;
+#define RFI_FLUSH_FIXUP_SECTION \
+951: \
+ .pushsection __rfi_flush_fixup,"a"; \
+ .align 2; \
+952: \
+ FTR_ENTRY_OFFSET 951b-952b; \
+ .popsection;
+
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup;
+extern void do_rfi_flush_fixups(bool enable, unsigned int insn);
+#endif
+
#endif /* __ASM_POWERPC_FEATURE_FIXUPS_H */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index d0e3509e3119..b236b036bdd4 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1034,8 +1034,10 @@
#define PV_630p 0x0041
#define PV_970MP 0x0044
#define PV_970GX 0x0045
+#define PVR_POWER7 0x003F
#define PVR_POWER7p 0x004A
#define PVR_POWER8E 0x004B
+#define PVR_POWER8NVL 0x004C
#define PVR_POWER8 0x004D
#define PV_BE 0x0070
#define PV_PA6T 0x0090
diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
index dae19342f0b9..4130653083b5 100644
--- a/arch/powerpc/include/asm/setup.h
+++ b/arch/powerpc/include/asm/setup.h
@@ -3,4 +3,11 @@
#include <asm-generic/setup.h>
+#ifndef __ASSEMBLY__
+
+void rfi_flush_enable(bool enable);
+void __init setup_rfi_flush(void);
+
+#endif /* !__ASSEMBLY__ */
+
#endif /* _ASM_POWERPC_SETUP_H */
diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
index 79881269ed2a..15fcbe2bff39 100644
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -42,6 +42,8 @@
#define set_mb(var, value) do { var = value; mb(); } while (0)
+#define gmb() asm volatile("ori 31,31,0")
+
#ifdef __KERNEL__
#define AT_VECTOR_SIZE_ARCH 6 /* entries in ARCH_DLINFO */
#ifdef CONFIG_SMP
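
The gmb() barrier added above ("ori 31,31,0", treated by the hardware/firmware as a speculation barrier) is meant to sit between a user-controlled bounds check and the dependent memory access -- presumably the pattern behind the small net/core/filter.c and include/linux/fdtable.h changes listed in the diffstat, which are not shown here. A minimal sketch of that usage, with hypothetical names:

/* Sketch only: table/idx/size are invented names; in kernel code gmb()
 * is picked up via the system.h headers touched by this patch. */
static int read_entry(const int *table, unsigned long idx, unsigned long size)
{
    int val = 0;

    if (idx < size) {
        gmb();              /* keep speculation from running past the check */
        val = table[idx];   /* idx is architecturally validated here */
    }
    return val;
}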
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index d8a74c7a3489..5226630a109f 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -32,6 +32,9 @@
#include <asm/ptrace.h>
#include <asm/irqflags.h>
#include <asm/ftrace.h>
+#ifdef CONFIG_PPC_BOOK3S
+#include <asm/exception-64s.h>
+#endif
/*
* System calls.
@@ -242,13 +245,23 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS)
ACCOUNT_CPU_USER_EXIT(r11, r12)
HMT_MEDIUM_LOW_HAS_PPR
ld r13,GPR13(r1) /* only restore r13 if returning to usermode */
+ ld r2,GPR2(r1)
+ ld r1,GPR1(r1)
+ mtlr r4
+ mtcr r5
+ mtspr SPRN_SRR0,r7
+ mtspr SPRN_SRR1,r8
+ RFI_TO_USER
+ b . /* prevent speculative execution */
+
+ /* exit to kernel */
1: ld r2,GPR2(r1)
ld r1,GPR1(r1)
mtlr r4
mtcr r5
mtspr SPRN_SRR0,r7
mtspr SPRN_SRR1,r8
- RFI
+ RFI_TO_KERNEL
b . /* prevent speculative execution */
syscall_error:
@@ -708,7 +721,7 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ACCOUNT_CPU_USER_EXIT(r2, r4)
REST_GPR(13, r1)
-1:
+
mtspr SPRN_SRR1,r3
ld r2,_CCR(r1)
@@ -721,8 +734,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ld r3,GPR3(r1)
ld r4,GPR4(r1)
ld r1,GPR1(r1)
+ RFI_TO_USER
+ b . /* prevent speculative execution */
+
+1: mtspr SPRN_SRR1,r3
+
+ ld r2,_CCR(r1)
+ mtcrf 0xFF,r2
+ ld r2,_NIP(r1)
+ mtspr SPRN_SRR0,r2
- rfid
+ ld r0,GPR0(r1)
+ ld r2,GPR2(r1)
+ ld r3,GPR3(r1)
+ ld r4,GPR4(r1)
+ ld r1,GPR1(r1)
+ RFI_TO_KERNEL
b . /* prevent speculative execution */
#endif /* CONFIG_PPC_BOOK3E */
@@ -906,7 +933,7 @@ _GLOBAL(enter_rtas)
mtspr SPRN_SRR0,r5
mtspr SPRN_SRR1,r6
- rfid
+ RFI_TO_KERNEL
b . /* prevent speculative execution */
_STATIC(rtas_return_loc)
@@ -929,7 +956,7 @@ _STATIC(rtas_return_loc)
mtspr SPRN_SRR0,r3
mtspr SPRN_SRR1,r4
- rfid
+ RFI_TO_KERNEL
b . /* prevent speculative execution */
.align 3
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 2c4ffe3bdd34..8dd3da8d8cc0 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -224,14 +224,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
mtspr SPRN_SRR0,r10
ld r10,PACAKMSR(r13)
mtspr SPRN_SRR1,r10
- rfid
+ RFI_TO_KERNEL
b . /* prevent speculative execution */
/* Fast LE/BE switch system call */
1: mfspr r12,SPRN_SRR1
xori r12,r12,MSR_LE
mtspr SPRN_SRR1,r12
- rfid /* return to userspace */
+ RFI_TO_USER /* return to userspace */
b .
STD_EXCEPTION_PSERIES(0xd00, 0xd00, single_step)
@@ -312,7 +312,7 @@ masked_interrupt:
mtspr SPRN_SRR1,r10
ld r10,PACA_EXGEN+EX_R10(r13)
GET_SCRATCH0(r13)
- rfid
+ RFI_TO_KERNEL
b .
masked_Hinterrupt:
@@ -325,7 +325,7 @@ masked_Hinterrupt:
mtspr SPRN_HSRR1,r10
ld r10,PACA_EXGEN+EX_R10(r13)
GET_SCRATCH0(r13)
- hrfid
+ HRFI_TO_KERNEL
b .
#ifndef CONFIG_BIGMEM
@@ -382,7 +382,7 @@ slb_miss_user_pseries:
mtspr SRR0,r12
mfspr r12,SRR1 /* and SRR1 */
mtspr SRR1,r10
- rfid
+ RFI_TO_KERNEL
b . /* prevent spec. execution */
#endif /* __DISABLED__ */
@@ -591,7 +591,7 @@ slb_miss_user_common:
ld r11,PACA_EXGEN+EX_R11(r13)
ld r12,PACA_EXGEN+EX_R12(r13)
ld r13,PACA_EXGEN+EX_R13(r13)
- rfid
+ RFI_TO_USER
b .
slb_miss_fault:
@@ -649,6 +649,31 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
andi. r10,r12,MSR_RI /* check for unrecoverable exception */
beq- 2f
+ andi. r10,r12,MSR_PR /* check for userspace exception */
+ beq 1f /* returning to kernel */
+
+.machine push
+.machine "power4"
+ mtcrf 0x80,r9
+ mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */
+.machine pop
+
+#ifdef CONFIG_PPC_ISERIES
+BEGIN_FW_FTR_SECTION
+ mtspr SPRN_SRR0,r11
+ mtspr SPRN_SRR1,r12
+END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
+#endif /* CONFIG_PPC_ISERIES */
+ RESTORE_PPR_PACA(PACA_EXSLB, r9)
+ ld r9,PACA_EXSLB+EX_R9(r13)
+ ld r10,PACA_EXSLB+EX_R10(r13)
+ ld r11,PACA_EXSLB+EX_R11(r13)
+ ld r12,PACA_EXSLB+EX_R12(r13)
+ ld r13,PACA_EXSLB+EX_R13(r13)
+ RFI_TO_USER
+ b . /* prevent speculative execution */
+
+1:
.machine push
.machine "power4"
mtcrf 0x80,r9
@@ -667,9 +692,10 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
ld r11,PACA_EXSLB+EX_R11(r13)
ld r12,PACA_EXSLB+EX_R12(r13)
ld r13,PACA_EXSLB+EX_R13(r13)
- rfid
+ RFI_TO_KERNEL
b . /* prevent speculative execution */
+
2:
#ifdef CONFIG_PPC_ISERIES
BEGIN_FW_FTR_SECTION
@@ -682,7 +708,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES)
mtspr SPRN_SRR0,r10
ld r10,PACAKMSR(r13)
mtspr SPRN_SRR1,r10
- rfid
+ RFI_TO_KERNEL
b .
unrecov_slb:
@@ -850,9 +876,20 @@ fast_exception_return:
mtspr SPRN_SRR1,r12
mtspr SPRN_SRR0,r11
+ mfcr r11
+ andi. r12,r12,__MASK(MSR_PR_LG)
+ beq 3f
+
+ mtcr r11
+ REST_4GPRS(10, r1)
+ ld r1,GPR1(r1)
+ RFI_TO_USER
+ b . /* prevent speculative execution */
+
+3: mtcr r11
REST_4GPRS(10, r1)
ld r1,GPR1(r1)
- rfid
+ RFI_TO_KERNEL
b . /* prevent speculative execution */
unrecov_fer:
@@ -1105,7 +1142,7 @@ _GLOBAL(do_stab_bolted)
ld r11,PACA_EXSLB+EX_R11(r13)
ld r12,PACA_EXSLB+EX_R12(r13)
ld r13,PACA_EXSLB+EX_R13(r13)
- rfid
+ RFI_TO_USER
b . /* prevent speculative execution */
#endif
diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h
index 4c67ad7fae08..134a46183b78 100644
--- a/arch/powerpc/kernel/setup.h
+++ b/arch/powerpc/kernel/setup.h
@@ -5,5 +5,6 @@ void check_for_initrd(void);
void do_init_bootmem(void);
void setup_panic(void);
extern int do_early_xmon;
+extern bool rfi_flush;
#endif /* _POWERPC_KERNEL_SETUP_H */
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 1408118d5cb1..bffd0c6c115a 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -673,3 +673,79 @@ struct ppc_pci_io ppc_pci_io;
EXPORT_SYMBOL(ppc_pci_io);
#endif /* CONFIG_PPC_INDIRECT_IO */
+#ifdef CONFIG_PPC_BOOK3S_64
+enum l1d_flush_type {
+ L1D_FLUSH_NONE,
+ L1D_FLUSH_ORI,
+ L1D_FLUSH_MTTRIG,
+};
+
+enum l1d_flush_type l1d_flush_type;
+
+bool rfi_flush;
+
+static void do_rfi_flush(void *val)
+{
+ switch (l1d_flush_type) {
+ case L1D_FLUSH_ORI:
+ asm volatile("ori 30,30,0" ::: "memory");
+ break;
+ case L1D_FLUSH_MTTRIG:
+ asm volatile("mtspr 882,0" ::: "memory");
+ break;
+ default:
+ break;
+ }
+}
+
+void rfi_flush_enable(bool enable)
+{
+ unsigned int insn;
+
+ if (rfi_flush == enable)
+ return;
+
+ switch (l1d_flush_type) {
+ case L1D_FLUSH_ORI:
+ insn = 0x63de0000;
+ break;
+ case L1D_FLUSH_MTTRIG:
+ insn = 0x7c12dba6;
+ break;
+ default:
+ printk("Secure memory protection not enabled! System is vulnerable to local exploit. Update firmware.\n");
+ return;
+ }
+
+ do_rfi_flush_fixups(enable, insn);
+
+ if (enable)
+ on_each_cpu(do_rfi_flush, NULL, 1);
+
+ rfi_flush = enable;
+}
+
+/* This tries to guess the cpu characteristics based on the PVR. */
+static bool get_cpu_characteristics(void)
+{
+ if (__is_processor(PVR_POWER7) || __is_processor(PVR_POWER7p))
+ l1d_flush_type = L1D_FLUSH_NONE;
+ else if (__is_processor(PVR_POWER8E) ||
+ __is_processor(PVR_POWER8))
+ l1d_flush_type = L1D_FLUSH_ORI;
+ else {
+ /* unknown CPU */
+ l1d_flush_type = L1D_FLUSH_NONE;
+ return false;
+ }
+
+ return true;
+}
+
+void __init setup_rfi_flush(void)
+{
+ if (get_cpu_characteristics())
+ rfi_flush_enable(true);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
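
For reference, the two magic constants patched in by rfi_flush_enable() are simply the encodings of the instructions that do_rfi_flush() executes inline. A small standalone check (not part of the patch) that reproduces the encodings:

/* Sanity check of the L1D-flush opcode constants used above. */
#include <assert.h>
#include <stdint.h>

static uint32_t ppc_ori(unsigned rs, unsigned ra, unsigned ui)
{
    return (24u << 26) | (rs << 21) | (ra << 16) | ui;
}

static uint32_t ppc_mtspr(unsigned spr, unsigned rs)
{
    return (31u << 26) | (rs << 21) |
           ((spr & 0x1f) << 16) | ((spr >> 5) << 11) | (467u << 1);
}

int main(void)
{
    assert(ppc_ori(30, 30, 0) == 0x63de0000);   /* L1D_FLUSH_ORI:    ori 30,30,0 */
    assert(ppc_mtspr(882, 0) == 0x7c12dba6);    /* L1D_FLUSH_MTTRIG: mtspr 882,0 */
    return 0;
}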
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 1433c0a4f2da..e499370a0e55 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -21,6 +21,7 @@
#include <asm/system.h>
#include "cacheinfo.h"
+#include "setup.h"
#ifdef CONFIG_PPC64
#include <asm/paca.h>
@@ -196,6 +197,44 @@ static SYSDEV_ATTR(spurr, 0600, show_spurr, NULL);
static SYSDEV_ATTR(purr, 0600, show_purr, store_purr);
static SYSDEV_ATTR(pir, 0400, show_pir, NULL);
+#ifdef CONFIG_PPC_BOOK3S_64
+static ssize_t show_rfi_flush(struct sysdev_class *class,
+ struct sysdev_class_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", rfi_flush ? 1 : 0);
+}
+
+static ssize_t __used store_rfi_flush(struct sysdev_class *class,
+ struct sysdev_class_attribute *attr, const char *buf,
+ size_t count)
+{
+ int val;
+ int ret = 0;
+
+ ret = sscanf(buf, "%d", &val);
+ if (ret != 1)
+ return -EINVAL;
+
+ if (val == 1)
+ rfi_flush_enable(true);
+ else if (val == 0)
+ rfi_flush_enable(false);
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static SYSDEV_CLASS_ATTR(rfi_flush, 0600,
+ show_rfi_flush, store_rfi_flush);
+
+static void sysfs_create_rfi_flush(void)
+{
+ sysfs_create_file(&cpu_sysdev_class.kset.kobj,
+ &attr_rfi_flush.attr);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
static unsigned long dscr_default;
static void read_dscr(void *val)
@@ -691,6 +730,7 @@ static int __init topology_init(void)
}
#ifdef CONFIG_PPC64
sysfs_create_dscr_default();
+ sysfs_create_rfi_flush();
#endif /* CONFIG_PPC64 */
return 0;
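
Since the attribute hangs off the cpu sysdev class, it should show up as /sys/devices/system/cpu/rfi_flush once the kernel is booted (path assumed from the kobject used above). A small userspace sketch, purely illustrative, that reads the current state:

/* Userspace-only illustration; not part of the kernel patch. */
#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/sys/devices/system/cpu/rfi_flush", "r");
    int state;

    if (!f || fscanf(f, "%d", &state) != 1) {
        perror("rfi_flush");
        return 1;
    }
    printf("rfi_flush: %d\n", state);   /* writing 0/1 here as root toggles it */
    fclose(f);
    return 0;
}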
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 3e8fe4b832fd..a4916671de52 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -72,6 +72,15 @@ SECTIONS
/* Read-only data */
RODATA
+#ifdef CONFIG_PPC64
+ . = ALIGN(8);
+ __rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) {
+ __start___rfi_flush_fixup = .;
+ *(__rfi_flush_fixup)
+ __stop___rfi_flush_fixup = .;
+ }
+#endif
+
EXCEPTION_TABLE(0)
NOTES :kernel :notes
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 1a1b34487e71..3365c9219892 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -162,7 +162,7 @@ kvmppc_handler_skip_ins:
GET_SCRATCH0(r13)
/* And get back into the code */
- RFI
+ RFI_TO_GUEST
/*
* This trampoline brings us back to a real mode handler
@@ -199,7 +199,7 @@ _GLOBAL(kvmppc_rmcall)
sync
mtsrr0 r3
mtsrr1 r4
- RFI
+ RFI_TO_KERNEL
#if defined(CONFIG_PPC_BOOK3S_32)
#define STACK_LR INT_FRAME_SIZE+4
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index 7a8a7487cee8..3e91e2ffc1d0 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -113,6 +113,33 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end)
}
}
+#ifdef CONFIG_PPC_BOOK3S_64
+void do_rfi_flush_fixups(bool enable, unsigned int insn)
+{
+ long *start, *end;
+ unsigned int *dest;
+ int i;
+
+ start = PTRRELOC(&__start___rfi_flush_fixup),
+ end = PTRRELOC(&__stop___rfi_flush_fixup);
+
+ for (i = 0; start < end; start++, i++) {
+ dest = (void *)start + *start;
+
+ pr_devel("RFI FLUSH FIXUP %s %lx\n", enable ? "enable" : "disable", (unsigned long)start);
+ if (!enable) {
+ pr_devel("patching dest %lx\n", (unsigned long)dest);
+ patch_instruction(dest, PPC_INST_NOP);
+ } else {
+ pr_devel("patching dest %lx\n", (unsigned long)dest);
+ patch_instruction(dest, insn);
+ }
+ }
+
+ printk(KERN_DEBUG "rfi-fixups: patched %d locations\n", i);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
{
long *start, *end;
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
new file mode 100644
index 000000000000..320a6a74af3f
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -0,0 +1,376 @@
+/*
+ * PowerNV setup code.
+ *
+ * Copyright 2011 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/tty.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/interrupt.h>
+#include <linux/bug.h>
+#include <linux/pci.h>
+#include <linux/cpufreq.h>
+
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/xics.h>
+#include <asm/xive.h>
+#include <asm/opal.h>
+#include <asm/kexec.h>
+#include <asm/smp.h>
+#include <asm/tm.h>
+
+#include "powernv.h"
+
+static void __init pnv_setup_arch(void)
+{
+ set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
+
+ setup_rfi_flush();
+
+ /* Initialize SMP */
+ pnv_smp_init();
+
+ /* Setup PCI */
+ pnv_pci_init();
+
+ /* Setup RTC and NVRAM callbacks */
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ opal_nvram_init();
+
+ /* Enable NAP mode */
+ powersave_nap = 1;
+
+ /* XXX PMCS */
+}
+
+static void __init pnv_init(void)
+{
+ /*
+ * Initialize the LPC bus now so that legacy serial
+ * ports can be found on it
+ */
+ opal_lpc_init();
+
+#ifdef CONFIG_HVC_OPAL
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ hvc_opal_init_early();
+ else
+#endif
+ add_preferred_console("hvc", 0, NULL);
+}
+
+static void __init pnv_init_IRQ(void)
+{
+ /* Try using a XIVE if available, otherwise use a XICS */
+ if (!xive_native_init())
+ xics_init();
+
+ WARN_ON(!ppc_md.get_irq);
+}
+
+static void pnv_show_cpuinfo(struct seq_file *m)
+{
+ struct device_node *root;
+ const char *model = "";
+
+ root = of_find_node_by_path("/");
+ if (root)
+ model = of_get_property(root, "model", NULL);
+ seq_printf(m, "machine\t\t: PowerNV %s\n", model);
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ seq_printf(m, "firmware\t: OPAL\n");
+ else
+ seq_printf(m, "firmware\t: BML\n");
+ of_node_put(root);
+ if (radix_enabled())
+ seq_printf(m, "MMU\t\t: Radix\n");
+ else
+ seq_printf(m, "MMU\t\t: Hash\n");
+}
+
+static void pnv_prepare_going_down(void)
+{
+ /*
+ * Disable all notifiers from OPAL, we can't
+ * service interrupts anymore anyway
+ */
+ opal_event_shutdown();
+
+ /* Soft disable interrupts */
+ local_irq_disable();
+
+ /*
+ * Return secondary CPUs to firwmare if a flash update
+ * is pending otherwise we will get all sort of error
+ * messages about CPU being stuck etc.. This will also
+ * have the side effect of hard disabling interrupts so
+ * past this point, the kernel is effectively dead.
+ */
+ opal_flash_term_callback();
+}
+
+static void __noreturn pnv_restart(char *cmd)
+{
+ long rc = OPAL_BUSY;
+
+ pnv_prepare_going_down();
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_cec_reboot();
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ for (;;)
+ opal_poll_events(NULL);
+}
+
+static void __noreturn pnv_power_off(void)
+{
+ long rc = OPAL_BUSY;
+
+ pnv_prepare_going_down();
+
+ while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+ rc = opal_cec_power_down(0);
+ if (rc == OPAL_BUSY_EVENT)
+ opal_poll_events(NULL);
+ else
+ mdelay(10);
+ }
+ for (;;)
+ opal_poll_events(NULL);
+}
+
+static void __noreturn pnv_halt(void)
+{
+ pnv_power_off();
+}
+
+static void pnv_progress(char *s, unsigned short hex)
+{
+}
+
+static void pnv_shutdown(void)
+{
+ /* Let the PCI code clear up IODA tables */
+ pnv_pci_shutdown();
+
+ /*
+ * Stop OPAL activity: Unregister all OPAL interrupts so they
+ * don't fire up while we kexec and make sure all potentially
+ * DMA'ing ops are complete (such as dump retrieval).
+ */
+ opal_shutdown();
+}
+
+#ifdef CONFIG_KEXEC_CORE
+static void pnv_kexec_wait_secondaries_down(void)
+{
+ int my_cpu, i, notified = -1;
+
+ my_cpu = get_cpu();
+
+ for_each_online_cpu(i) {
+ uint8_t status;
+ int64_t rc, timeout = 1000;
+
+ if (i == my_cpu)
+ continue;
+
+ for (;;) {
+ rc = opal_query_cpu_status(get_hard_smp_processor_id(i),
+ &status);
+ if (rc != OPAL_SUCCESS || status != OPAL_THREAD_STARTED)
+ break;
+ barrier();
+ if (i != notified) {
+ printk(KERN_INFO "kexec: waiting for cpu %d "
+ "(physical %d) to enter OPAL\n",
+ i, paca[i].hw_cpu_id);
+ notified = i;
+ }
+
+ /*
+ * On crash secondaries might be unreachable or hung,
+ * so timeout if we've waited too long
+ * */
+ mdelay(1);
+ if (timeout-- == 0) {
+ printk(KERN_ERR "kexec: timed out waiting for "
+ "cpu %d (physical %d) to enter OPAL\n",
+ i, paca[i].hw_cpu_id);
+ break;
+ }
+ }
+ }
+}
+
+static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
+{
+ u64 reinit_flags;
+
+ if (xive_enabled())
+ xive_kexec_teardown_cpu(secondary);
+ else
+ xics_kexec_teardown_cpu(secondary);
+
+ /* On OPAL, we return all CPUs to firmware */
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return;
+
+ if (secondary) {
+ /* Return secondary CPUs to firmware on OPAL v3 */
+ mb();
+ get_paca()->kexec_state = KEXEC_STATE_REAL_MODE;
+ mb();
+
+ /* Return the CPU to OPAL */
+ opal_return_cpu();
+ } else {
+ /* Primary waits for the secondaries to have reached OPAL */
+ pnv_kexec_wait_secondaries_down();
+
+ /* Switch XIVE back to emulation mode */
+ if (xive_enabled())
+ xive_shutdown();
+
+ /*
+ * We might be running as little-endian - now that interrupts
+ * are disabled, reset the HILE bit to big-endian so we don't
+ * take interrupts in the wrong endian later
+ *
+ * We reinit to enable both radix and hash on P9 to ensure
+ * the mode used by the next kernel is always supported.
+ */
+ reinit_flags = OPAL_REINIT_CPUS_HILE_BE;
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX |
+ OPAL_REINIT_CPUS_MMU_HASH;
+ opal_reinit_cpus(reinit_flags);
+ }
+}
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+static unsigned long pnv_memory_block_size(void)
+{
+ /*
+ * We map the kernel linear region with 1GB large pages on radix. For
+ * memory hot unplug to work our memory block size must be at least
+ * this size.
+ */
+ if (radix_enabled())
+ return 1UL * 1024 * 1024 * 1024;
+ else
+ return 256UL * 1024 * 1024;
+}
+#endif
+
+static void __init pnv_setup_machdep_opal(void)
+{
+ ppc_md.get_boot_time = opal_get_boot_time;
+ ppc_md.restart = pnv_restart;
+ pm_power_off = pnv_power_off;
+ ppc_md.halt = pnv_halt;
+ /* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */
+ ppc_md.machine_check_exception = opal_machine_check;
+ ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
+ ppc_md.hmi_exception_early = opal_hmi_exception_early;
+ ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
+}
+
+static int __init pnv_probe(void)
+{
+ if (!of_machine_is_compatible("ibm,powernv"))
+ return 0;
+
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ pnv_setup_machdep_opal();
+
+ pr_debug("PowerNV detected !\n");
+
+ pnv_init();
+
+ return 1;
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void __init pnv_tm_init(void)
+{
+ if (!firmware_has_feature(FW_FEATURE_OPAL) ||
+ !pvr_version_is(PVR_POWER9) ||
+ early_cpu_has_feature(CPU_FTR_TM))
+ return;
+
+ if (opal_reinit_cpus(OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) != OPAL_SUCCESS)
+ return;
+
+ pr_info("Enabling TM (Transactional Memory) with Suspend Disabled\n");
+ cur_cpu_spec->cpu_features |= CPU_FTR_TM;
+ /* Make sure "normal" HTM is off (it should be) */
+ cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_HTM;
+ /* Turn on no suspend mode, and HTM no SC */
+ cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_HTM_NO_SUSPEND | \
+ PPC_FEATURE2_HTM_NOSC;
+ tm_suspend_disabled = true;
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+/*
+ * Returns the cpu frequency for 'cpu' in Hz. This is used by
+ * /proc/cpuinfo
+ */
+static unsigned long pnv_get_proc_freq(unsigned int cpu)
+{
+ unsigned long ret_freq;
+
+ ret_freq = cpufreq_get(cpu) * 1000ul;
+
+ /*
+ * If the backend cpufreq driver does not exist,
+ * then fallback to old way of reporting the clockrate.
+ */
+ if (!ret_freq)
+ ret_freq = ppc_proc_freq;
+ return ret_freq;
+}
+
+define_machine(powernv) {
+ .name = "PowerNV",
+ .probe = pnv_probe,
+ .setup_arch = pnv_setup_arch,
+ .init_IRQ = pnv_init_IRQ,
+ .show_cpuinfo = pnv_show_cpuinfo,
+ .get_proc_freq = pnv_get_proc_freq,
+ .progress = pnv_progress,
+ .machine_shutdown = pnv_shutdown,
+ .power_save = NULL,
+ .calibrate_decr = generic_calibrate_decr,
+#ifdef CONFIG_KEXEC_CORE
+ .kexec_cpu_down = pnv_kexec_cpu_down,
+#endif
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+ .memory_block_size = pnv_memory_block_size,
+#endif
+};
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 1b288d678235..72aa85ffb371 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -386,6 +386,8 @@ static void __init pSeries_setup_arch(void)
fwnmi_init();
+ setup_rfi_flush();
+
/* Find and initialize PCI host bridges */
init_pci_config_tokens();
find_and_init_phbs();
diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
new file mode 100644
index 000000000000..da7e097fbaf4
--- /dev/null
+++ b/arch/s390/include/asm/alternative.h
@@ -0,0 +1,149 @@
+#ifndef _ASM_S390_ALTERNATIVE_H
+#define _ASM_S390_ALTERNATIVE_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/stringify.h>
+
+struct alt_instr {
+ s32 instr_offset; /* original instruction */
+ s32 repl_offset; /* offset to replacement instruction */
+ u16 facility; /* facility bit set for replacement */
+ u8 instrlen; /* length of original instruction */
+ u8 replacementlen; /* length of new instruction */
+} __packed;
+
+extern void apply_alternative_instructions(void);
+extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+
+/*
+ * |661: |662: |6620 |663:
+ * +-----------+---------------------+
+ * | oldinstr | oldinstr_padding |
+ * | +----------+----------+
+ * | | | |
+ * | | >6 bytes |6/4/2 nops|
+ * | |6 bytes jg----------->
+ * +-----------+---------------------+
+ * ^^ static padding ^^
+ *
+ * .altinstr_replacement section
+ * +---------------------+-----------+
+ * |6641: |6651:
+ * | alternative instr 1 |
+ * +-----------+---------+- - - - - -+
+ * |6642: |6652: |
+ * | alternative instr 2 | padding
+ * +---------------------+- - - - - -+
+ * ^ runtime ^
+ *
+ * .altinstructions section
+ * +---------------------------------+
+ * | alt_instr entries for each |
+ * | alternative instr |
+ * +---------------------------------+
+ */
+
+#define b_altinstr(num) "664"#num
+#define e_altinstr(num) "665"#num
+
+#define e_oldinstr_pad_end "663"
+#define oldinstr_len "662b-661b"
+#define oldinstr_total_len e_oldinstr_pad_end"b-661b"
+#define altinstr_len(num) e_altinstr(num)"b-"b_altinstr(num)"b"
+#define oldinstr_pad_len(num) \
+ "-(((" altinstr_len(num) ")-(" oldinstr_len ")) > 0) * " \
+ "((" altinstr_len(num) ")-(" oldinstr_len "))"
+
+#define INSTR_LEN_SANITY_CHECK(len) \
+ ".if " len " > 254\n" \
+ "\t.error \"cpu alternatives does not support instructions " \
+ "blocks > 254 bytes\"\n" \
+ ".endif\n" \
+ ".if (" len ") %% 2\n" \
+ "\t.error \"cpu alternatives instructions length is odd\"\n" \
+ ".endif\n"
+
+#define OLDINSTR_PADDING(oldinstr, num) \
+ ".if " oldinstr_pad_len(num) " > 6\n" \
+ "\tjg " e_oldinstr_pad_end "f\n" \
+ "6620:\n" \
+ "\t.fill (" oldinstr_pad_len(num) " - (6620b-662b)) / 2, 2, 0x0700\n" \
+ ".else\n" \
+ "\t.fill " oldinstr_pad_len(num) " / 6, 6, 0xc0040000\n" \
+ "\t.fill " oldinstr_pad_len(num) " %% 6 / 4, 4, 0x47000000\n" \
+ "\t.fill " oldinstr_pad_len(num) " %% 6 %% 4 / 2, 2, 0x0700\n" \
+ ".endif\n"
+
+#define OLDINSTR(oldinstr, num) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ OLDINSTR_PADDING(oldinstr, num) \
+ e_oldinstr_pad_end ":\n" \
+ INSTR_LEN_SANITY_CHECK(oldinstr_len)
+
+#define OLDINSTR_2(oldinstr, num1, num2) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ".if " altinstr_len(num1) " < " altinstr_len(num2) "\n" \
+ OLDINSTR_PADDING(oldinstr, num2) \
+ ".else\n" \
+ OLDINSTR_PADDING(oldinstr, num1) \
+ ".endif\n" \
+ e_oldinstr_pad_end ":\n" \
+ INSTR_LEN_SANITY_CHECK(oldinstr_len)
+
+#define ALTINSTR_ENTRY(facility, num) \
+ "\t.long 661b - .\n" /* old instruction */ \
+ "\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \
+ "\t.word " __stringify(facility) "\n" /* facility bit */ \
+ "\t.byte " oldinstr_total_len "\n" /* source len */ \
+ "\t.byte " altinstr_len(num) "\n" /* alt instruction len */
+
+#define ALTINSTR_REPLACEMENT(altinstr, num) /* replacement */ \
+ b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" \
+ INSTR_LEN_SANITY_CHECK(altinstr_len(num))
+
+/* alternative assembly primitive: */
+#define ALTERNATIVE(oldinstr, altinstr, facility) \
+ ".pushsection .altinstr_replacement, \"ax\"\n" \
+ ALTINSTR_REPLACEMENT(altinstr, 1) \
+ ".popsection\n" \
+ OLDINSTR(oldinstr, 1) \
+ ".pushsection .altinstructions,\"a\"\n" \
+ ALTINSTR_ENTRY(facility, 1) \
+ ".popsection\n"
+
+#define ALTERNATIVE_2(oldinstr, altinstr1, facility1, altinstr2, facility2)\
+ ".pushsection .altinstr_replacement, \"ax\"\n" \
+ ALTINSTR_REPLACEMENT(altinstr1, 1) \
+ ALTINSTR_REPLACEMENT(altinstr2, 2) \
+ ".popsection\n" \
+ OLDINSTR_2(oldinstr, 1, 2) \
+ ".pushsection .altinstructions,\"a\"\n" \
+ ALTINSTR_ENTRY(facility1, 1) \
+ ALTINSTR_ENTRY(facility2, 2) \
+ ".popsection\n"
+
+/*
+ * Alternative instructions for different CPU types or capabilities.
+ *
+ * This allows to use optimized instructions even on generic binary
+ * kernels.
+ *
+ * oldinstr is padded with jump and nops at compile time if altinstr is
+ * longer. altinstr is padded with jump and nops at run-time during patching.
+ *
+ * For non barrier like inlines please define new variants
+ * without volatile and memory clobber.
+ */
+#define alternative(oldinstr, altinstr, facility) \
+ asm volatile(ALTERNATIVE(oldinstr, altinstr, facility) : : : "memory")
+
+#define alternative_2(oldinstr, altinstr1, facility1, altinstr2, facility2) \
+ asm volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1, \
+ altinstr2, facility2) ::: "memory")
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_S390_ALTERNATIVE_H */
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index f2a2ceb4d976..9f0c00b95bff 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -34,6 +34,7 @@ static inline void get_cpu_id(struct cpuid *ptr)
extern void s390_adjust_jiffies(void);
extern int get_cpu_capability(unsigned int *);
+extern void __bpon(void);
/*
* User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit.
diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h
index 353a3b2a6cef..c5f67a6a5743 100644
--- a/arch/s390/include/asm/system.h
+++ b/arch/s390/include/asm/system.h
@@ -11,6 +11,7 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
+#include <asm/alternative.h>
#include <asm/types.h>
#include <asm/ptrace.h>
#include <asm/setup.h>
@@ -176,6 +177,13 @@ extern int copy_from_user_real(void *dest, void __user *src, size_t count);
* all memory ops have completed wrt other CPU's ( see 7-15 POP DJB ).
*/
+static inline void gmb(void)
+{
+ asm volatile(
+ ALTERNATIVE("", ".long 0xb2e8f000", 81)
+ : : : "memory");
+}
+
#define eieio() asm volatile("bcr 15,0" : : : "memory")
#define SYNC_OTHER_CORES(x) eieio()
#define mb() eieio()
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 3e9abc26d558..610745ec2599 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -23,7 +23,7 @@ CFLAGS_sysinfo.o += -Iinclude/math-emu -Iarch/s390/math-emu -w
obj-y := bitmap.o traps.o time.o process.o base.o early.o setup.o vtime.o \
processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o \
debug.o irq.o ipl.o dis.o diag.o mem_detect.o sclp.o vdso.o \
- sysinfo.o jump_label.o lgr.o os_info.o
+ sysinfo.o jump_label.o lgr.o os_info.o alternative.o
obj-y += $(if $(CONFIG_64BIT),entry64.o,entry.o)
obj-y += $(if $(CONFIG_64BIT),reipl64.o,reipl.o)
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
new file mode 100644
index 000000000000..846ece762533
--- /dev/null
+++ b/arch/s390/kernel/alternative.c
@@ -0,0 +1,123 @@
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <asm/alternative.h>
+
+#define MAX_PATCH_LEN (255 - 1)
+
+static int __initdata_or_module alt_instr_disabled;
+
+static int __init disable_alternative_instructions(char *str)
+{
+ alt_instr_disabled = 1;
+ return 0;
+}
+
+early_param("noaltinstr", disable_alternative_instructions);
+
+extern struct alt_instr __alt_nobp[], __alt_nobp_end[];
+static int __init nobp_setup(char *str)
+{
+ bool enabled;
+ int rc;
+
+ rc = strtobool(str, &enabled);
+ if (!rc && enabled)
+ apply_alternatives(__alt_nobp, __alt_nobp_end);
+ return rc;
+}
+__setup("nobp=", nobp_setup);
+
+struct brcl_insn {
+ u16 opc;
+ s32 disp;
+} __packed;
+
+static u16 __initdata_or_module nop16 = 0x0700;
+static u32 __initdata_or_module nop32 = 0x47000000;
+static struct brcl_insn __initdata_or_module nop48 = {
+ 0xc004, 0
+};
+
+static const void * __initdata_or_module nops[] = {
+ &nop16,
+ &nop32,
+ &nop48
+};
+
+static void __init_or_module add_jump_padding(void *insns, unsigned int len)
+{
+ struct brcl_insn brcl = {
+ 0xc0f4,
+ len / 2
+ };
+
+ memcpy(insns, &brcl, sizeof(brcl));
+ insns += sizeof(brcl);
+ len -= sizeof(brcl);
+
+ while (len > 0) {
+ memcpy(insns, &nop16, 2);
+ insns += 2;
+ len -= 2;
+ }
+}
+
+static void __init_or_module add_padding(void *insns, unsigned int len)
+{
+ if (len > 6)
+ add_jump_padding(insns, len);
+ else if (len >= 2)
+ memcpy(insns, nops[len / 2 - 1], len);
+}
+
+static void __init_or_module __apply_alternatives(struct alt_instr *start,
+ struct alt_instr *end)
+{
+ struct alt_instr *a;
+ u8 *instr, *replacement;
+ u8 insnbuf[MAX_PATCH_LEN];
+
+ /*
+ * The scan order should be from start to end. A later scanned
+ * alternative code can overwrite previously scanned alternative code.
+ */
+ for (a = start; a < end; a++) {
+ int insnbuf_sz = 0;
+
+ instr = (u8 *)&a->instr_offset + a->instr_offset;
+ replacement = (u8 *)&a->repl_offset + a->repl_offset;
+
+ if (!test_facility(a->facility))
+ continue;
+
+ if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) {
+ WARN_ONCE(1, "cpu alternatives instructions length is "
+ "odd, skipping patching\n");
+ continue;
+ }
+
+ memcpy(insnbuf, replacement, a->replacementlen);
+ insnbuf_sz = a->replacementlen;
+
+ if (a->instrlen > a->replacementlen) {
+ add_padding(insnbuf + a->replacementlen,
+ a->instrlen - a->replacementlen);
+ insnbuf_sz += a->instrlen - a->replacementlen;
+ }
+
+ probe_kernel_write(instr, insnbuf, insnbuf_sz);
+ }
+}
+
+void __init_or_module apply_alternatives(struct alt_instr *start,
+ struct alt_instr *end)
+{
+ if (!alt_instr_disabled)
+ __apply_alternatives(start, end);
+}
+
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+void __init apply_alternative_instructions(void)
+{
+ apply_alternatives(__alt_instructions, __alt_instructions_end);
+}
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 48aa683c6311..53affe348d64 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -172,24 +172,47 @@ STACK_SIZE = 1 << STACK_SHIFT
stm %r0,%r11,SP_R0(%r15) # store gprs %r0-%r11 to kernel stack
.endm
- .macro RESTORE_ALL psworg,sync
- mvc \psworg(8),SP_PSW(%r15) # move user PSW to lowcore
- .if !\sync
- ni \psworg+1,0xfd # clear wait state bit
- .endif
- lm %r0,%r15,SP_R0(%r15) # load gprs 0-15 of user
- stpt __LC_EXIT_TIMER
- lpsw \psworg # back to caller
- .endm
-
.macro REENABLE_IRQS
mvc __SF_EMPTY(1,%r15),SP_PSW(%r15)
ni __SF_EMPTY(%r15),0xbf
ssm __SF_EMPTY(%r15)
.endm
+ .macro BPOFF
+ .pushsection .altinstr_replacement, "ax"
+660: .long 0xb2e8c000
+ .popsection
+661: .long 0x47000000
+ .pushsection .altnobp, "a"
+ .long 661b - .
+ .long 660b - .
+ .word 82
+ .byte 4
+ .byte 4
+ .popsection
+ .endm
+
+ .macro BPON
+ .pushsection .altinstr_replacement, "ax"
+662: .long 0xb2e8d000
+ .popsection
+663: .long 0x47000000
+ .pushsection .altnobp, "a"
+ .long 663b - .
+ .long 662b - .
+ .word 82
+ .byte 4
+ .byte 4
+ .popsection
+ .endm
+
.section .kprobes.text, "ax"
+ENTRY(__bpon)
+ .globl __bpon
+ BPON
+ br %r14
+
/*
* Scheduler resume function, called by switch_to
* gpr2 = (task_struct *) prev
@@ -229,6 +252,7 @@ system_call:
stpt __LC_SYNC_ENTER_TIMER
sysc_saveall:
SAVE_ALL_SVC __LC_SVC_OLD_PSW,__LC_SAVE_AREA
+ BPOFF
CREATE_STACK_FRAME __LC_SAVE_AREA
mvc SP_PSW(8,%r15),__LC_SVC_OLD_PSW
mvc SP_ILC(4,%r15),__LC_SVC_ILC
@@ -264,7 +288,11 @@ sysc_tif:
tm __TI_flags+3(%r12),_TIF_WORK_SVC
bnz BASED(sysc_work) # there is work to do (signals etc.)
sysc_restore:
- RESTORE_ALL __LC_RETURN_PSW,1
+ mvc __LC_RETURN_PSW(8),SP_PSW(%r15)
+ lm %r0,%r15,SP_R0(%r15)
+ BPON
+ stpt __LC_EXIT_TIMER
+ lpsw __LC_RETURN_PSW
sysc_done:
#
@@ -454,6 +482,7 @@ pgm_check_handler:
* for LPSW?).
*/
stpt __LC_SYNC_ENTER_TIMER
+ BPOFF
SAVE_ALL_BASE __LC_SAVE_AREA
tm __LC_PGM_INT_CODE+1,0x80 # check whether we got a per exception
bnz BASED(pgm_per) # got per exception -> special case
@@ -569,6 +598,7 @@ kernel_per:
io_int_handler:
stck __LC_INT_CLOCK
stpt __LC_ASYNC_ENTER_TIMER
+ BPOFF
SAVE_ALL_ASYNC __LC_IO_OLD_PSW,__LC_SAVE_AREA+16
CREATE_STACK_FRAME __LC_SAVE_AREA+16
mvc SP_PSW(8,%r15),0(%r12) # move user PSW to stack
@@ -590,7 +620,15 @@ io_tif:
tm __TI_flags+3(%r12),_TIF_WORK_INT
bnz BASED(io_work) # there is work to do (signals etc.)
io_restore:
- RESTORE_ALL __LC_RETURN_PSW,0
+ mvc __LC_RETURN_PSW(8),SP_PSW(%r15)
+ ni __LC_RETURN_PSW+1,0xfd
+ tm SP_PSW+1(%r15),0x01 # interrupting from user ?
+ bno BASED(io_exit_kernel)
+ BPON
+io_exit_kernel:
+ lm %r0,%r15,SP_R0(%r15)
+ stpt __LC_EXIT_TIMER
+ lpsw __LC_RETURN_PSW
io_done:
#
@@ -707,6 +745,7 @@ io_notify_resume:
ext_int_handler:
stck __LC_INT_CLOCK
stpt __LC_ASYNC_ENTER_TIMER
+ BPOFF
SAVE_ALL_ASYNC __LC_EXT_OLD_PSW,__LC_SAVE_AREA+16
CREATE_STACK_FRAME __LC_SAVE_AREA+16
mvc SP_PSW(8,%r15),0(%r12) # move user PSW to stack
@@ -734,6 +773,7 @@ __critical_end:
.globl mcck_int_handler
mcck_int_handler:
stck __LC_MCCK_CLOCK
+ BPOFF
spt __LC_CPU_TIMER_SAVE_AREA # revalidate cpu timer
lm %r0,%r15,__LC_GPREGS_SAVE_AREA # revalidate gprs
SAVE_ALL_BASE __LC_SAVE_AREA+32
@@ -803,15 +843,12 @@ mcck_no_vtime:
mcck_return:
mvc __LC_RETURN_MCCK_PSW(8),SP_PSW(%r15) # move return PSW
ni __LC_RETURN_MCCK_PSW+1,0xfd # clear wait state bit
+ lm %r0,%r15,SP_R0(%r15) # load gprs 0-15
tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
bno BASED(0f)
- lm %r0,%r15,SP_R0(%r15) # load gprs 0-15
+ BPON
stpt __LC_EXIT_TIMER
- lpsw __LC_RETURN_MCCK_PSW # back to caller
-0: lm %r0,%r15,SP_R0(%r15) # load gprs 0-15
- lpsw __LC_RETURN_MCCK_PSW # back to caller
-
- RESTORE_ALL __LC_RETURN_MCCK_PSW,0
+0: lpsw __LC_RETURN_MCCK_PSW
/*
* Restart interruption handler, kick starter for additional CPUs
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index ecc27826ad26..5365d887f264 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -184,19 +184,6 @@ _TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING)
stmg %r0,%r10,SP_R0(%r15) # store gprs %r0-%r10 to kernel stack
.endm
- .macro RESTORE_ALL psworg,sync
- mvc \psworg(16),SP_PSW(%r15) # move user PSW to lowcore
- .if !\sync
- ni \psworg+1,0xfd # clear wait state bit
- .endif
- lg %r14,__LC_VDSO_PER_CPU
- lmg %r0,%r13,SP_R0(%r15) # load gprs 0-13 of user
- stpt __LC_EXIT_TIMER
- mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
- lmg %r14,%r15,SP_R14(%r15) # load grps 14-15 of user
- lpswe \psworg # back to caller
- .endm
-
.macro LAST_BREAK
srag %r10,%r11,23
jz 0f
@@ -210,8 +197,41 @@ _TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING)
ssm __SF_EMPTY(%r15)
.endm
+ .macro BPOFF
+ .pushsection .altinstr_replacement, "ax"
+660: .long 0xb2e8c000
+ .popsection
+661: .long 0x47000000
+ .pushsection .altnobp, "a"
+ .long 661b - .
+ .long 660b - .
+ .word 82
+ .byte 4
+ .byte 4
+ .popsection
+ .endm
+
+ .macro BPON
+ .pushsection .altinstr_replacement, "ax"
+662: .long 0xb2e8d000
+ .popsection
+663: .long 0x47000000
+ .pushsection .altnobp, "a"
+ .long 663b - .
+ .long 662b - .
+ .word 82
+ .byte 4
+ .byte 4
+ .popsection
+ .endm
+
.section .kprobes.text, "ax"
+ENTRY(__bpon)
+ .globl __bpon
+ BPON
+ br %r14
+
/*
* Scheduler resume function, called by switch_to
* gpr2 = (task_struct *) prev
@@ -250,6 +270,7 @@ system_call:
stpt __LC_SYNC_ENTER_TIMER
sysc_saveall:
SAVE_ALL_SVC __LC_SVC_OLD_PSW,__LC_SAVE_AREA
+ BPOFF
CREATE_STACK_FRAME __LC_SAVE_AREA
mvc SP_PSW(16,%r15),__LC_SVC_OLD_PSW
mvc SP_ILC(4,%r15),__LC_SVC_ILC
@@ -292,7 +313,14 @@ sysc_tif:
tm __TI_flags+7(%r12),_TIF_WORK_SVC
jnz sysc_work # there is work to do (signals etc.)
sysc_restore:
- RESTORE_ALL __LC_RETURN_PSW,1
+ mvc __LC_RETURN_PSW(16),SP_PSW(%r15)
+ lg %r14,__LC_VDSO_PER_CPU
+ lmg %r0,%r13,SP_R0(%r15)
+ BPON
+ stpt __LC_EXIT_TIMER
+ mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
+ lmg %r14,%r15,SP_R14(%r15)
+ lpswe __LC_RETURN_PSW
sysc_done:
#
@@ -471,6 +499,7 @@ pgm_check_handler:
* for LPSW?).
*/
stpt __LC_SYNC_ENTER_TIMER
+ BPOFF
tm __LC_PGM_INT_CODE+1,0x80 # check whether we got a per exception
jnz pgm_per # got per exception -> special case
SAVE_ALL_PGM __LC_PGM_OLD_PSW,__LC_SAVE_AREA
@@ -598,6 +627,7 @@ kernel_per:
io_int_handler:
stck __LC_INT_CLOCK
stpt __LC_ASYNC_ENTER_TIMER
+ BPOFF
SAVE_ALL_ASYNC __LC_IO_OLD_PSW,__LC_SAVE_AREA+40
CREATE_STACK_FRAME __LC_SAVE_AREA+40
mvc SP_PSW(16,%r15),0(%r12) # move user PSW to stack
@@ -620,7 +650,18 @@ io_tif:
tm __TI_flags+7(%r12),_TIF_WORK_INT
jnz io_work # there is work to do (signals etc.)
io_restore:
- RESTORE_ALL __LC_RETURN_PSW,0
+ mvc __LC_RETURN_PSW(16),SP_PSW(%r15)
+ ni __LC_RETURN_PSW+1,0xfd
+ lg %r14,__LC_VDSO_PER_CPU
+ lmg %r0,%r13,SP_R0(%r15)
+ tm SP_PSW+1(%r15),0x01 # returning to user ?
+ jno io_exit_kernel
+ BPON
+ stpt __LC_EXIT_TIMER
+ mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
+io_exit_kernel:
+ lmg %r14,%r15,SP_R14(%r15)
+ lpswe __LC_RETURN_PSW
io_done:
#
@@ -733,6 +774,7 @@ io_notify_resume:
ext_int_handler:
stck __LC_INT_CLOCK
stpt __LC_ASYNC_ENTER_TIMER
+ BPOFF
SAVE_ALL_ASYNC __LC_EXT_OLD_PSW,__LC_SAVE_AREA+40
CREATE_STACK_FRAME __LC_SAVE_AREA+40
mvc SP_PSW(16,%r15),0(%r12) # move user PSW to stack
@@ -763,6 +805,7 @@ __critical_end:
.globl mcck_int_handler
mcck_int_handler:
stck __LC_MCCK_CLOCK
+ BPOFF
la %r1,4095 # revalidate r1
spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # revalidate cpu timer
lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs
@@ -838,6 +881,7 @@ mcck_return:
lmg %r0,%r15,SP_R0(%r15) # load gprs 0-15
tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
jno 0f
+ BPON
stpt __LC_EXIT_TIMER
0: lpswe __LC_RETURN_MCCK_PSW # back to caller
mcck_done:
@@ -1100,8 +1144,10 @@ sie_loop:
jnz sie_exit
lg %r14,__SF_EMPTY(%r15) # get control block pointer
SPP __SF_EMPTY(%r15) # set guest id
+ BPON
sie 0(%r14)
sie_done:
+ BPOFF
SPP __LC_CMF_HPP # set host id
lg %r14,__LC_THREAD_INFO # pointer thread_info struct
.globl sie_exit
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index ff5a6ca090a6..07d3dc111aa2 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -562,6 +562,7 @@ out:
static void __ipl_run(void *unused)
{
+ __bpon();
diag308(DIAG308_IPL, NULL);
if (MACHINE_IS_VM)
__cpcmd("IPL", NULL, 0, NULL);
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index d90d82a50806..74371e5ac784 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -32,6 +32,7 @@
#include <linux/kernel.h>
#include <linux/moduleloader.h>
#include <linux/bug.h>
+#include <asm/alternative.h>
#if 0
#define DEBUGP printk
@@ -407,6 +408,18 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
+ const Elf_Shdr *s;
+ char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
+ if (!strcmp(".altinstructions", secstrings + s->sh_name)) {
+ /* patch .altinstructions */
+ void *aseg = (void *)s->sh_addr;
+
+ apply_alternatives(aseg, aseg + s->sh_size);
+ }
+ }
+
vfree(me->arch.syminfo);
me->arch.syminfo = NULL;
return 0;
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 7e3974aeae33..349867ada25c 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -47,6 +47,7 @@
#include <linux/memory.h>
#include <linux/compat.h>
+#include <asm/alternative.h>
#include <asm/ipl.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -1115,6 +1116,8 @@ setup_arch(char **cmdline_p)
conmode_default();
set_preferred_console();
+ apply_alternative_instructions();
+
/* Setup zfcpdump support */
setup_zfcpdump(console_devno);
}
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 6dce1d700340..3834275128ac 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -130,6 +130,7 @@ void smp_switch_to_ipl_cpu(void (*func)(void *), void *data)
struct pt_regs *regs;
unsigned long sp;
+ __bpon();
if (smp_processor_id() == 0)
func(data);
__load_psw_mask(PSW_BASE_BITS | PSW_DEFAULT_KEY);
@@ -784,6 +785,7 @@ void __cpu_die(unsigned int cpu)
void __noreturn cpu_die(void)
{
idle_task_exit();
+ __bpon();
while (sigp(smp_processor_id(), sigp_stop) == sigp_busy)
cpu_relax();
for (;;);
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index e4c79ebb40e6..27325a2d063e 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -68,6 +68,32 @@ SECTIONS
INIT_TEXT_SECTION(PAGE_SIZE)
/*
+ * struct alt_inst entries. From the header (alternative.h):
+ * "Alternative instructions for different CPU types or capabilities"
+ * Think locking instructions on spinlocks.
+ * Note, that it is a part of __init region.
+ */
+ . = ALIGN(8);
+ .altinstructions : {
+ __alt_instructions = .;
+ *(.altinstructions)
+ __alt_instructions_end = .;
+ __alt_nobp = .;
+ *(.altnobp)
+ __alt_nobp_end = . ;
+ }
+
+ /*
+ * And here are the replacement instructions. The linker sticks
+ * them as binary blobs. The .altinstructions has enough data to
+ * get the address and the length of them to patch the kernel safely.
+ * Note, that it is a part of __init region.
+ */
+ .altinstr_replacement : {
+ *(.altinstr_replacement)
+ }
+
+ /*
* .exit.text is discarded at runtime, not link time,
* to deal with references from __bug_table
*/
diff --git a/arch/um/sys-i386/shared/sysdep/system.h b/arch/um/sys-i386/shared/sysdep/system.h
index d1b93c436200..f131bf8934aa 100644
--- a/arch/um/sys-i386/shared/sysdep/system.h
+++ b/arch/um/sys-i386/shared/sysdep/system.h
@@ -125,7 +125,6 @@ void default_idle(void);
*/
static inline void rdtsc_barrier(void)
{
- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
}
diff --git a/arch/um/sys-x86_64/shared/sysdep/system.h b/arch/um/sys-x86_64/shared/sysdep/system.h
index d1b93c436200..f131bf8934aa 100644
--- a/arch/um/sys-x86_64/shared/sysdep/system.h
+++ b/arch/um/sys-x86_64/shared/sysdep/system.h
@@ -125,7 +125,6 @@ void default_idle(void);
*/
static inline void rdtsc_barrier(void)
{
- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 3f19c81a6203..31b44e5874ed 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -7,6 +7,7 @@
* we just keep it from happening
*/
#undef CONFIG_PARAVIRT
+#undef KAISER
#ifdef CONFIG_X86_32
#define _ASM_X86_DESC_H 1
#endif
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index aff00b8182a8..57775ebe251b 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -12,8 +12,12 @@
#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
+#include <asm/pgtable_types.h>
+#include <asm/cpufeature.h>
+#include <asm/kaiser.h>
#include <asm/irqflags.h>
#include <linux/linkage.h>
+#include <asm/spec_ctrl.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -120,6 +124,7 @@ ENTRY(ia32_sysenter_target)
CFI_DEF_CFA rsp,0
CFI_REGISTER rsp,rbp
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
movq PER_CPU_VAR(kernel_stack), %rsp
addq $(KERNEL_STACK_OFFSET),%rsp
/*
@@ -143,6 +148,10 @@ ENTRY(ia32_sysenter_target)
CFI_REL_OFFSET rip,0
pushq_cfi %rax
cld
+
+ ENABLE_IBRS
+ STUFF_RSB
+
SAVE_ARGS 0,0,1
/* no need to do an access_ok check here because rbp has been
32bit zero extended */
@@ -183,6 +192,10 @@ sysexit_from_sys_call:
popq_cfi %rcx /* User %esp */
CFI_REGISTER rsp,rcx
TRACE_IRQS_ON
+
+ DISABLE_IBRS
+
+ SWITCH_USER_CR3
ENABLE_INTERRUPTS_SYSEXIT32
#ifdef CONFIG_AUDITSYSCALL
@@ -281,6 +294,7 @@ ENTRY(ia32_cstar_target)
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
movl %esp,%r8d
CFI_REGISTER rsp,r8
movq PER_CPU_VAR(kernel_stack),%rsp
@@ -302,6 +316,10 @@ ENTRY(ia32_cstar_target)
/*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
movq %r8,RSP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rsp,RSP-ARGOFFSET
+
+ ENABLE_IBRS
+ STUFF_RSB
+
/* no need to do an access_ok check here because r8 has been
32bit zero extended */
/* hardware stack frame is complete now */
@@ -337,6 +355,10 @@ sysretl_from_sys_call:
xorq %r9,%r9
xorq %r8,%r8
TRACE_IRQS_ON
+
+ DISABLE_IBRS
+
+ SWITCH_USER_CR3
movl RSP-ARGOFFSET(%rsp),%esp
CFI_RESTORE rsp
USERGS_SYSRET32
@@ -409,6 +431,7 @@ ENTRY(ia32_syscall)
CFI_REL_OFFSET rip,RIP-RIP
PARAVIRT_ADJUST_EXCEPTION_FRAME
SWAPGS
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* No need to follow this irqs on/off section: the syscall
* disabled irqs and here we enable it straight after entry:
@@ -423,6 +446,10 @@ ENTRY(ia32_syscall)
GET_THREAD_INFO(%r10)
orl $TS_COMPAT,TI_status(%r10)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+
+ ENABLE_IBRS
+ STUFF_RSB
+
jnz ia32_tracesys
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
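Taken together, these hunks give the 32-bit compat entry points the same shape as the native 64-bit syscall paths elsewhere in this series: leave the shadow (user) CR3 before touching kernel data, then enable IBRS and stuff the RSB once the registers are saved, and undo both on the way back to user mode. As a rough C-level sketch, using only helpers this series declares (x86_enable_ibrs()/x86_disable_ibrs() in spec_ctrl.h, stuff_rsb() in proto.h); the CR3 switch has no C counterpart and is omitted, and the example_* wrappers are made up for illustration:

    #include <asm/spec_ctrl.h>   /* x86_enable_ibrs(), x86_disable_ibrs() */
    #include <asm/proto.h>       /* stuff_rsb() */

    /* Roughly what ENABLE_IBRS + STUFF_RSB do after entry. */
    static inline void example_compat_entry_mitigations(void)
    {
            x86_enable_ibrs();   /* restrict indirect branch speculation */
            stuff_rsb();         /* overwrite user-primed RSB entries */
    }

    /* Roughly what DISABLE_IBRS does before SYSEXIT/SYSRET back to user. */
    static inline void example_compat_exit_mitigations(void)
    {
            x86_disable_ibrs();
    }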
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 94d420b360d1..e6c4644fcd39 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -24,4 +24,39 @@
.byte \alt_len
.endm
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+ \oldinstr
+141:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature,141b-140b,144f-143f
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr
+144:
+ .popsection
+.endm
+
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+ \oldinstr
+141:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature1,141b-140b,144f-143f
+ altinstruction_entry 140b,144f,\feature2,141b-140b,145f-144f
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr1
+144:
+ \newinstr2
+145:
+ .popsection
+.endm
+
#endif /* __ASSEMBLY__ */
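These assembly-side ALTERNATIVE/ALTERNATIVE_2 macros are the counterpart of the C-level alternative() interface this kernel already has, which the rdtsc_barrier() hunks above keep using. A minimal C sketch of that existing pattern, for reference (the wrapper name is made up):

    #include <asm/alternative.h>   /* alternative() */
    #include <asm/cpufeature.h>    /* X86_FEATURE_* */
    #include <asm/nops.h>          /* ASM_NOP3 */

    /*
     * Illustrative only: emit a 3-byte NOP by default and let the boot-time
     * patcher replace it with LFENCE when the CPU sets LFENCE_RDTSC -- the
     * same thing rdtsc_barrier() does once the MFENCE variant is dropped.
     */
    static inline void example_speculation_barrier(void)
    {
            alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
    }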
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
new file mode 100644
index 000000000000..84ae170bc3d0
--- /dev/null
+++ b/arch/x86/include/asm/cmdline.h
@@ -0,0 +1,8 @@
+#ifndef _ASM_X86_CMDLINE_H
+#define _ASM_X86_CMDLINE_H
+
+int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+ char *buffer, int bufsize);
+
+#endif /* _ASM_X86_CMDLINE_H */
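These helpers let very early boot code scan the raw command line before the normal __setup()/early_param() machinery is available. A hedged sketch of how a KAISER boot-time check might consume them; the option names and the policy below are illustrative, not the series' authoritative parser:

    #include <linux/init.h>
    #include <asm/cmdline.h>

    /* Hypothetical: decide at boot whether KAISER should stay enabled. */
    static int __init example_want_kaiser(const char *cmdline)
    {
            if (cmdline_find_option_bool(cmdline, "nopti") ||
                cmdline_find_option_bool(cmdline, "nokaiser"))
                    return 0;   /* explicitly disabled on the command line */
            return 1;           /* default: keep the mitigation on */
    }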
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 7930344f167b..0058fe4f4334 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -85,11 +85,14 @@
#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */
#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */
#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */
-#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */
+/* free, was #define X86_FEATURE_MFENCE_RDTSC (3*32+17) * "" Mfence synchronizes RDTSC */
#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
- /* 21 available, was AMD_C1E */
+
+/* Upstream has this in word 13; it lives here to avoid breaking kABI. */
+#define X86_FEATURE_IBPB (3*32+21) /* Indirect Branch Prediction Barrier */
+
#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
@@ -177,6 +180,11 @@
#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */
#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */
+#define X86_FEATURE_INVPCID_SINGLE (7*32+ 8) /* Effectively INVPCID && CR4.PCIDE=1 */
+#define X86_FEATURE_SPEC_CTRL ( 7*32+19) /* Control Speculation Control */
+
+/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
+#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
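Because X86_FEATURE_IBPB is parked in the Linux-defined word 3 here (rather than word 13 as upstream) purely to preserve the kABI-visible layout, testing it looks exactly like testing any other synthetic flag. Minimal sketch (the wrapper name is made up):

    #include <asm/cpufeature.h>

    /* Illustrative only: the relocation does not change how the bit is used. */
    static inline int example_have_ibpb(void)
    {
            return boot_cpu_has(X86_FEATURE_IBPB);
    }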
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 1af4a05e27ba..b6b33a3bf12a 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -40,7 +40,11 @@ struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
+#ifdef __GENKSYMS__
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+#else
+DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
+#endif
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 06ec9d1c245f..ac4256fcac7e 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -78,6 +78,9 @@ enum fixed_addresses {
VSYSCALL_LAST_PAGE,
VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+#ifndef __GENKSYMS__
+ VVAR_PAGE,
+#endif
VSYSCALL_HPET,
#endif
FIX_DBGP_BASE,
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 13b7cd86ed8f..1a7a8a3b027b 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -21,12 +21,12 @@ typedef struct {
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
-#ifndef CONFIG_XEN
+#endif
+#if !defined(CONFIG_XEN)
unsigned int irq_tlb_count;
-#else
+#elif defined(CONFIG_SMP)
unsigned int irq_lock_count;
#endif
-#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 169239574aee..66fb01020dea 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -176,7 +176,11 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
#endif
typedef int vector_irq_t[NR_VECTORS];
+#ifndef __GENKSYMS__
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
+#else
DECLARE_PER_CPU(vector_irq_t, vector_irq);
+#endif
extern void setup_vector_irq(int cpu);
#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644
index 000000000000..b74412641555
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,143 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+#include <asm/nops.h>
+#include <asm/processor-flags.h> /* For PCID constants */
+
+/*
+ * This file includes the definitions for the KAISER feature.
+ * KAISER is a countermeasure against x86_64 side-channel attacks on
+ * the kernel virtual memory. It has a shadow pgd for every process: the
+ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
+ * user memory. Within a kernel context switch, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode,
+ * the shadow pgd is enabled. By this, the virtual memory caches are freed,
+ * and the user may not attack the whole kernel memory.
+ *
+ * A minimalistic kernel mapping holds the parts needed to be mapped in user
+ * mode, such as the entry/exit functions of the user space, or the stacks.
+ */
+
+#define KAISER_SHADOW_PGD_OFFSET 0x1000
+
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+ALTERNATIVE ASM_NOP5, "bts $63, \reg", X86_FEATURE_PCID
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg regb
+/*
+ * regb must be the low byte portion of reg: because we have arranged
+ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
+ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
+ * not enabled): so that the one register can update both memory and cr3.
+ */
+movq %cr3, \reg
+orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
+js 9f
+/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
+movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
+9:
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KAISER
+pushq %rax
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+.Lend_\@:
+.endm
+
+.macro SWITCH_USER_CR3
+ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KAISER
+pushq %rax
+_SWITCH_TO_USER_CR3 %rax %al
+popq %rax
+.Lend_\@:
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KAISER
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+.Lend_\@:
+.endm
+
+#else /* CONFIG_KAISER */
+
+.macro SWITCH_KERNEL_CR3
+.endm
+.macro SWITCH_USER_CR3
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_KAISER
+/*
+ * Upon kernel/user mode switch, it may happen that the address
+ * space has to be switched before the registers have been
+ * stored. To change the address space, another register is
+ * needed. A register therefore has to be stored/restored.
+*/
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
+extern int kaiser_enabled;
+extern void __init kaiser_check_boottime_disable(void);
+#else
+#define kaiser_enabled 0
+static inline void __init kaiser_check_boottime_disable(void) {}
+#endif /* CONFIG_KAISER */
+
+/*
+ * Kaiser function prototypes are needed even when CONFIG_KAISER is not set,
+ * so as to build with tests on kaiser_enabled instead of #ifdefs.
+ */
+
+/**
+ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ * @flags: The mapping flags of the pages
+ *
+ * The mapping is global, so no additional
+ * synchronization is required. The pages have to be
+ * unmapped manually again once they are no longer needed.
+ */
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+/**
+ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ * kaiser_init - Initialize the shadow mapping
+ *
+ * Most parts of the shadow mapping can be mapped upon boot
+ * time. Only per-process things like the thread stacks
+ * or a new LDT have to be mapped at runtime. These boot-
+ * time mappings are permanent and never unmapped.
+ */
+extern void kaiser_init(void);
+
+#endif /* __ASSEMBLY */
+
+#endif /* _ASM_X86_KAISER_H */
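kaiser_add_mapping()/kaiser_remove_mapping() are the interface the rest of the series uses to expose selected kernel pages in the shadow (user) page tables. A hedged usage sketch; the symbol name, the __PAGE_KERNEL flags and the error handling below are illustrative only:

    #include <linux/init.h>
    #include <asm/kaiser.h>
    #include <asm/page.h>
    #include <asm/pgtable_types.h>

    /* Hypothetical page that must stay visible while still on the user CR3. */
    extern char example_entry_page[];

    static int __init example_map_entry_page(void)
    {
            int ret;

            ret = kaiser_add_mapping((unsigned long)example_entry_page,
                                     PAGE_SIZE, __PAGE_KERNEL);
            if (ret)
                    return ret;
            /*
             * The mapping is permanent unless the page is torn down later,
             * in which case it must be dropped explicitly:
             *   kaiser_remove_mapping((unsigned long)example_entry_page,
             *                         PAGE_SIZE);
             */
            return 0;
    }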
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index e9b162ac1415..7d53c07eeb06 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -24,12 +24,8 @@ typedef struct {
void *vdso;
} mm_context_t;
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+#ifndef CONFIG_XEN
void leave_mm(int cpu);
-#else
-static inline void leave_mm(int cpu)
-{
-}
#endif
#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index d7d36c81e4cd..7b1f69318546 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -68,81 +68,16 @@ void destroy_context(struct mm_struct *mm);
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
-#ifdef CONFIG_SMP
if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
-#endif
}
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
-{
- unsigned cpu = smp_processor_id();
+extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
- if (likely(prev != next)) {
-#ifdef CONFIG_SMP
- percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- percpu_write(cpu_tlbstate.active_mm, next);
-#endif
- cpumask_set_cpu(cpu, mm_cpumask(next));
-
- /*
- * Re-load page tables.
- *
- * This logic has an ordering constraint:
- *
- * CPU 0: Write to a PTE for 'next'
- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
- * CPU 1: set bit 1 in next's mm_cpumask
- * CPU 1: load from the PTE that CPU 0 writes (implicit)
- *
- * We need to prevent an outcome in which CPU 1 observes
- * the new PTE value and CPU 0 observes bit 1 clear in
- * mm_cpumask. (If that occurs, then the IPI will never
- * be sent, and CPU 0's TLB will contain a stale entry.)
- *
- * The bad outcome can occur if either CPU's load is
- * reordered before that CPU's store, so both CPUs much
- * execute full barriers to prevent this from happening.
- *
- * Thus, switch_mm needs a full barrier between the
- * store to mm_cpumask and any operation that could load
- * from next->pgd. This barrier synchronizes with
- * remote TLB flushers. Fortunately, load_cr3 is
- * serializing and thus acts as a full barrier.
- *
- */
- load_cr3(next->pgd);
-
- /* stop flush ipis for the previous mm */
- cpumask_clear_cpu(cpu, mm_cpumask(prev));
-
- /*
- * load the LDT, if the LDT is different:
- */
- if (unlikely(prev->context.ldt != next->context.ldt))
- load_mm_ldt(next);
- }
-#ifdef CONFIG_SMP
- else {
- percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
-
- if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
- /* We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- *
- * As above, this is a barrier that forces
- * TLB repopulation to be ordered after the
- * store to mm_cpumask.
- */
- load_cr3(next->pgd);
- load_mm_ldt(next);
- }
- }
-#endif
-}
+extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
+#define switch_mm_irqs_off switch_mm_irqs_off
#define activate_mm(prev, next) \
do { \
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 486b5009a544..7827df7e3890 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -32,6 +32,9 @@
#define EFER_FFXSR (1<<_EFER_FFXSR)
/* Intel MSRs. Some also available on other CPUs */
+#define MSR_IA32_SPEC_CTRL 0x00000048
+#define MSR_IA32_PRED_CMD 0x00000049
+
#define MSR_IA32_PERFCTR0 0x000000c1
#define MSR_IA32_PERFCTR1 0x000000c2
#define MSR_FSB_FREQ 0x000000cd
@@ -149,6 +152,8 @@
/* Fam 15h MSRs */
#define MSR_F15H_PERF_CTL 0xc0010200
#define MSR_F15H_PERF_CTR 0xc0010201
+#define MSR_F15H_IC_CFG 0xc0011021
+#define MSR_F15H_IC_CFG_DIS_IND (1 << 14)
/* Fam 10h MSRs */
#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
@@ -158,6 +163,8 @@
#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
#define FAM10H_MMIO_CONF_BASE_SHIFT 20
#define MSR_FAM10H_NODE_ID 0xc001100c
+#define MSR_F10H_DECFG 0xc0011029
+#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1
/* K8 MSRs */
#define MSR_K8_TOP_MEM1 0xc001001a
@@ -233,6 +240,8 @@
#define FEATURE_CONTROL_LOCKED (1<<0)
#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
+#define FEATURE_ENABLE_IBRS (1<<0)
+#define FEATURE_SET_IBPB (1<<0)
#define MSR_IA32_APICBASE 0x0000001b
#define MSR_IA32_APICBASE_BSP (1<<8)
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 8fa63b62691f..10bda36d7407 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -135,6 +135,21 @@ static inline unsigned long long native_read_pmc(int counter)
return EAX_EDX_VAL(val, low, high);
}
+#define native_rdmsr(msr, val1, val2) \
+do { \
+ u64 __val = native_read_msr((msr)); \
+ (void)((val1) = (u32)__val); \
+ (void)((val2) = (u32)(__val >> 32)); \
+} while (0)
+
+#define native_wrmsr(msr, low, high) \
+ native_write_msr(msr, low, high)
+
+#define native_wrmsrl(msr, val) \
+ native_write_msr((msr), \
+ (u32)((u64)(val)), \
+ (u32)((u64)(val) >> 32))
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
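The native_* accessors bypass the paravirt indirection, which matters for the IBRS/IBPB writes added later in this series: they must hit the MSR directly and cheaply. A minimal sketch; the helper names are made up:

    #include <linux/types.h>
    #include <asm/msr.h>
    #include <asm/msr-index.h>

    /* Hypothetical helper: turn IBRS on via the raw MSR path. */
    static inline void example_set_ibrs(void)
    {
            native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
    }

    /* Hypothetical helper: read the same MSR back as two 32-bit halves. */
    static inline void example_read_spec_ctrl(u32 *lo, u32 *hi)
    {
            native_rdmsr(MSR_IA32_SPEC_CTRL, *lo, *hi);
    }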
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index 405b4032a60b..4e0b9846fc6c 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -87,7 +87,11 @@
#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0
#define P6_NOP5_ATOMIC P6_NOP5
+#ifdef __ASSEMBLY__
+#define _ASM_MK_NOP(x) __stringify(.byte x)
+#else
#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n"
+#endif
#if defined(CONFIG_MK7)
#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 9d8c44a5c064..f024df4f048a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -17,6 +17,11 @@
#ifndef __ASSEMBLY__
#include <asm/x86_init.h>
+#ifdef CONFIG_KAISER
+extern int kaiser_enabled;
+#else
+#define kaiser_enabled 0
+#endif
/*
* ZERO_PAGE is a global shared page that is always zero: used
@@ -571,7 +576,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
static inline int pgd_bad(pgd_t pgd)
{
- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ pgdval_t ignore_flags = _PAGE_USER;
+ /*
+ * We set NX on KAISER pgds that map userspace memory so
+ * that userspace can not meaningfully use the kernel
+ * page table by accident; it will fault on the first
+ * instruction it tries to run. See native_set_pgd().
+ */
+ if (kaiser_enabled)
+ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
@@ -771,7 +786,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
*/
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
- memcpy(dst, src, count * sizeof(pgd_t));
+ memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+ if (kaiser_enabled) {
+ /* Clone the shadow pgd part as well */
+ memcpy(native_get_shadow_pgd(dst),
+ native_get_shadow_pgd(src),
+ count * sizeof(pgd_t));
+ }
+#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 975f709e09ae..a4a2b700a4e6 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -105,9 +105,31 @@ static inline void native_pud_clear(pud_t *pud)
native_set_pud(pud, native_make_pud(0));
}
+#ifdef CONFIG_KAISER
+extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+#ifdef CONFIG_DEBUG_VM
+ /* linux/mmdebug.h may not have been included at this point */
+ BUG_ON(!kaiser_enabled);
+#endif
+ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+}
+#else
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+ return NULL;
+}
+#endif /* CONFIG_KAISER */
+
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- *pgdp = pgd;
+ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
}
static inline void native_pgd_clear(pgd_t *pgd)
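native_get_shadow_pgd() works because, with KAISER enabled, the pgd is handed out as an order-1 (two page) block: the kernel pgd fills the first page and the shadow/user pgd the second, so setting the PAGE_SIZE bit in an entry's address lands on the shadow copy of the same slot. The allocation itself happens in a part of the series not shown in this hunk; the sketch below only illustrates that layout assumption and is not the real allocator:

    #include <linux/gfp.h>
    #include <asm/pgtable.h>

    /* Sketch only -- the real allocation lives in pgd_alloc() elsewhere. */
    static pgd_t *example_pgd_pair_alloc(void)
    {
            /* two physically contiguous, page-aligned pages */
            unsigned long pages = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);

            if (!pages)
                    return NULL;
            /* page 0: kernel pgd, page 1: shadow (user) pgd */
            return (pgd_t *)pages;
    }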
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index a21ebfdcda97..c6e88747e71f 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -62,7 +62,7 @@
#endif
#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
@@ -74,6 +74,33 @@
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
+
+#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
+/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
+#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH (0)
+#define X86_CR3_PCID_USER_FLUSH (0)
+#define X86_CR3_PCID_KERN_NOFLUSH (0)
+#define X86_CR3_PCID_USER_NOFLUSH (0)
+#endif
+
#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
#define _PAGE_CACHE_WB (0)
#define _PAGE_CACHE_WC (_PAGE_PWT)
@@ -107,7 +134,8 @@
#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
-#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
+#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT)
#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
@@ -129,7 +157,8 @@
#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
+#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE)
#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
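The ASID constants encode the KAISER CR3 convention: the kernel runs with ASID 0 on the normal pgd, user mode runs with ASID 0x80 on the shadow pgd at +4K, and bit 63 (NOFLUSH) is set whenever the existing TLB contents may be kept. A hedged sketch of how a user-mode CR3 value is composed from a kernel one; the helper is hypothetical and ignores the per-cpu deferred-flush bookkeeping the real exit macro performs:

    #include <asm/kaiser.h>           /* KAISER_SHADOW_PGD_OFFSET */
    #include <asm/pgtable_types.h>    /* X86_CR3_PCID_ASID_USER */
    #include <asm/processor-flags.h>  /* X86_CR3_PCID_NOFLUSH */

    static inline unsigned long example_user_cr3(unsigned long kernel_cr3,
                                                 int keep_tlb)
    {
            unsigned long cr3 = kernel_cr3
                              | KAISER_SHADOW_PGD_OFFSET
                              | X86_CR3_PCID_ASID_USER;

            if (keep_tlb)
                    cr3 |= X86_CR3_PCID_NOFLUSH;  /* only valid with PCID on */
            return cr3;
    }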
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 0bed21129e7d..360e80d0d217 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -43,7 +43,8 @@
*/
#define X86_CR3_PWT 0x00000008 /* Page Write Through */
#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
-#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH (_AC(1,ULL) << X86_CR3_PCID_NOFLUSH_BIT)
/*
* Intel CPU features in CR4
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5b0206b2af76..2eec9eaa9f50 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -266,7 +266,11 @@ struct tss_struct {
} ____cacheline_aligned;
+#ifndef __GENKSYMS__
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss);
+#else
DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+#endif
/*
* Save the original ist values for checking stack pointers during debugging
@@ -414,6 +418,7 @@ struct stack_canary {
};
DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif
+static inline void stuff_rsb(void) {}
#endif /* X86_64 */
extern unsigned int xstate_size;
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 6f414ed88620..4751413306ba 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -18,6 +18,7 @@ void syscall32_cpu_init(void);
void x86_configure_nx(void);
void x86_report_nx(void);
+void stuff_rsb(void);
extern int reboot_force;
diff --git a/arch/x86/include/asm/spec_ctrl.h b/arch/x86/include/asm/spec_ctrl.h
new file mode 100644
index 000000000000..96f61a82396d
--- /dev/null
+++ b/arch/x86/include/asm/spec_ctrl.h
@@ -0,0 +1,108 @@
+#ifndef _ASM_X86_SPEC_CTRL_H
+#define _ASM_X86_SPEC_CTRL_H
+
+#include <linux/stringify.h>
+#include <asm/msr-index.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+
+#ifdef __ASSEMBLY__
+
+.macro __ENABLE_IBRS_CLOBBER
+ movl $MSR_IA32_SPEC_CTRL, %ecx
+ xorl %edx, %edx
+ movl $FEATURE_ENABLE_IBRS, %eax
+ wrmsr
+.endm
+
+.macro ENABLE_IBRS_CLOBBER
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SPEC_CTRL
+ call x86_ibrs_enabled
+ test %eax, %eax
+ jz .Llfence_\@
+
+ __ENABLE_IBRS_CLOBBER
+ jmp .Lend_\@
+
+.Llfence_\@:
+ lfence
+.Lend_\@:
+.endm
+
+
+.macro ENABLE_IBRS
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SPEC_CTRL
+
+ pushq %rax
+
+ call x86_ibrs_enabled
+ test %eax, %eax
+ jz .Llfence_\@
+
+ pushq %rcx
+ pushq %rdx
+ __ENABLE_IBRS_CLOBBER
+ popq %rdx
+ popq %rcx
+
+ jmp .Lpop_\@
+
+.Llfence_\@:
+ lfence
+
+.Lpop_\@:
+ popq %rax
+
+.Lend_\@:
+.endm
+
+
+.macro DISABLE_IBRS
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_SPEC_CTRL
+
+ pushq %rax
+
+ call x86_ibrs_enabled
+ test %eax, %eax
+ jz .Llfence_\@
+
+ pushq %rcx
+ pushq %rdx
+ movl $MSR_IA32_SPEC_CTRL, %ecx
+ xorl %edx, %edx
+ xorl %eax, %eax
+ wrmsr
+ popq %rdx
+ popq %rcx
+
+ jmp .Lpop_\@
+
+.Llfence_\@:
+ lfence
+
+.Lpop_\@:
+ popq %rax
+
+.Lend_\@:
+.endm
+
+.macro STUFF_RSB
+ ALTERNATIVE "call stuff_rsb", "", X86_FEATURE_SMEP
+.endm
+
+#else /* __ASSEMBLY__ */
+void x86_enable_ibrs(void);
+void x86_disable_ibrs(void);
+void stuff_RSB(void);
+unsigned int x86_ibrs_enabled(void);
+unsigned int x86_ibpb_enabled(void);
+void x86_spec_check(void);
+
+static inline void x86_ibp_barrier(void)
+{
+ if (x86_ibpb_enabled())
+ native_wrmsrl(MSR_IA32_PRED_CMD, FEATURE_SET_IBPB);
+}
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_SPEC_CTRL_H */
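Besides the entry-path IBRS macros, the header exposes x86_ibp_barrier() for C code; its natural consumer is the task/mm switch path, where an IBPB keeps one process's branch history from steering another's indirect branches. Where exactly the call lands is not part of this hunk, so the following call site is only a sketch:

    #include <linux/mm_types.h>
    #include <asm/spec_ctrl.h>

    /* Hypothetical call site: issue an IBPB when changing address spaces. */
    static inline void example_switch_mm_ibpb(struct mm_struct *prev,
                                              struct mm_struct *next)
    {
            if (prev != next)
                    x86_ibp_barrier();  /* no-op unless IBPB is enabled */
    }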
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 3ecb79853d21..538f30c2783a 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -431,6 +431,9 @@ void stop_this_cpu(void *dummy);
#define wmb() asm volatile("sfence" ::: "memory")
#endif
+/* there is no alternative_2 in 3.0, so reuse rdtsc_barrier */
+#define gmb() rdtsc_barrier()
+
/**
* read_barrier_depends - Flush all pending reads that subsequents reads
* depend on.
@@ -516,7 +519,6 @@ void stop_this_cpu(void *dummy);
*/
static __always_inline void rdtsc_barrier(void)
{
- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
}
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 042f6e115940..e6809760331d 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -6,6 +6,57 @@
#include <asm/processor.h>
#include <asm/system.h>
+#ifndef __GENKSYMS__
+#include <asm/smp.h>
+#endif
+
+static inline void __invpcid(unsigned long pcid, unsigned long addr,
+ unsigned long type)
+{
+ struct { u64 d[2]; } desc = { { pcid, addr } };
+
+ /*
+ * The memory clobber is because the whole point is to invalidate
+ * stale TLB entries and, especially if we're flushing global
+ * mappings, we don't want the compiler to reorder any subsequent
+ * memory accesses before the TLB flush.
+ *
+ * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
+ * invpcid (%rcx), %rax in long mode.
+ */
+ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
+ : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+}
+
+#define INVPCID_TYPE_INDIV_ADDR 0
+#define INVPCID_TYPE_SINGLE_CTXT 1
+#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
+#define INVPCID_TYPE_ALL_NON_GLOBAL 3
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invpcid_flush_one(unsigned long pcid,
+ unsigned long addr)
+{
+ __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invpcid_flush_single_context(unsigned long pcid)
+{
+ __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invpcid_flush_all(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invpcid_flush_all_nonglobals(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+}
#ifdef CONFIG_PARAVIRT_MMU
#include <asm/paravirt.h>
@@ -15,8 +66,36 @@
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif
+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_KAISER
+extern int kaiser_enabled;
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+#define kaiser_enabled 0
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
+
+
static inline void __native_flush_tlb(void)
{
+ if (this_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all_nonglobals();
+ return;
+ }
+ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
+ kaiser_flush_tlb_on_return_to_user();
native_write_cr3(native_read_cr3());
}
@@ -25,6 +104,23 @@ static inline void __native_flush_tlb_global(void)
unsigned long flags;
unsigned long cr4;
+ if (kaiser_enabled) {
+ /* Globals are not used at all */
+ __native_flush_tlb();
+ return;
+ }
+
+ if (this_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+ * to CR4 sandwiched inside an IRQ flag save/restore.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all();
+ return;
+ }
+
/*
* Read-modify-write to CR4 - protect it from preemption and
* from interrupts. (Use the raw variant because this code can
@@ -33,17 +129,48 @@ static inline void __native_flush_tlb_global(void)
raw_local_irq_save(flags);
cr4 = native_read_cr4();
- /* clear PGE */
- native_write_cr4(cr4 & ~X86_CR4_PGE);
- /* write old PGE again and flush TLBs */
- native_write_cr4(cr4);
+ if (cr4 & X86_CR4_PGE) {
+ /* clear PGE and flush TLB of all entries */
+ native_write_cr4(cr4 & ~X86_CR4_PGE);
+ /* restore PGE as it was before */
+ native_write_cr4(cr4);
+ } else {
+ native_write_cr3(native_read_cr3());
+ }
raw_local_irq_restore(flags);
}
static inline void __native_flush_tlb_single(unsigned long addr)
{
- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ /*
+ * SIMICS #GP's if you run INVPCID with type 2/3
+ * and X86_CR4_PCIDE clear. Shame!
+ *
+ * The ASIDs used below are hard-coded. But, we must not
+ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
+ * invlpg in the case we are called early.
+ */
+
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
+ kaiser_flush_tlb_on_return_to_user();
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ return;
+ }
+ /* Flush the address out of both PCIDs. */
+ /*
+ * An optimization here might be to determine addresses
+ * that are only kernel-mapped and only flush the kernel
+ * ASID. But, userspace flushes are probably much more
+ * important performance-wise.
+ *
+ * Make sure to do only a single invpcid when KAISER is
+ * disabled and we have only a single ASID.
+ */
+ if (kaiser_enabled)
+ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
}
static inline void __flush_tlb_all(void)
@@ -52,6 +179,14 @@ static inline void __flush_tlb_all(void)
__flush_tlb_global();
else
__flush_tlb();
+
+ /*
+ * Note: if we somehow had PCID but not PGE, then this wouldn't work --
+ * we'd end up flushing kernel translations for the current ASID but
+ * we might fail to flush kernel translations for other cached ASIDs.
+ *
+ * To avoid this issue, we force PCID off if PGE is off.
+ */
}
static inline void __flush_tlb_one(unsigned long addr)
@@ -81,52 +216,8 @@ static inline void __flush_tlb_one(unsigned long addr)
*
* ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up.
- *
- * x86-64 can only flush individual pages or full VMs. For a range flush
- * we always do the full VM. Might be worth trying if for a small
- * range a few INVLPGs in a row are a win.
*/
-#ifndef CONFIG_SMP
-
-#define flush_tlb() __flush_tlb()
-#define flush_tlb_all() __flush_tlb_all()
-#define local_flush_tlb() __flush_tlb()
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
- if (mm == current->active_mm)
- __flush_tlb();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
- unsigned long addr)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_one(addr);
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb();
-}
-
-static inline void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long va)
-{
-}
-
-static inline void reset_lazy_tlbstate(void)
-{
-}
-
-#else /* SMP */
-
-#include <asm/smp.h>
-
#define local_flush_tlb() __flush_tlb()
extern void flush_tlb_all(void);
@@ -160,8 +251,6 @@ static inline void reset_lazy_tlbstate(void)
percpu_write(cpu_tlbstate.active_mm, &init_mm);
}
-#endif /* SMP */
-
#ifndef CONFIG_PARAVIRT_MMU
#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(mask, mm, va)
#endif
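The four invpcid_flush_*() wrappers map one-to-one onto the INVPCID descriptor types. A trivial sketch of choosing between the two whole-TLB variants, mirroring the decision the rewritten __native_flush_tlb() and __native_flush_tlb_global() above make (the wrapper name is made up):

    #include <asm/tlbflush.h>

    /* Illustrative only; assumes INVPCID is available on this CPU. */
    static inline void example_flush_everything(int include_globals)
    {
            if (include_globals)
                    invpcid_flush_all();            /* type 2 */
            else
                    invpcid_flush_all_nonglobals(); /* type 3 */
    }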
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d55597351f6a..98ffd5390845 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -26,6 +26,7 @@ enum vsyscall_num {
/* kernel space (writeable) */
extern int vgetcpu_mode;
extern struct timezone sys_tz;
+extern unsigned long vsyscall_pgprot;
#include <asm/vvar.h>
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 341b3559452b..a3ef73fcef6d 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -10,15 +10,14 @@
* In normal kernel code, they are used like any other variable.
* In user code, they are accessed through the VVAR macro.
*
- * Each of these variables lives in the vsyscall page, and each
- * one needs a unique offset within the little piece of the page
- * reserved for vvars. Specify that offset in DECLARE_VVAR.
- * (There are 896 bytes available. If you mess up, the linker will
- * catch it.)
+ * These variables live in a page of kernel data that has an extra RO
+ * mapping for userspace. Each variable needs a unique offset within
+ * that page; specify that offset with the DECLARE_VVAR macro. (If
+ * you mess up, the linker will catch it.)
*/
-/* Offset of vars within vsyscall page */
-#define VSYSCALL_VARS_OFFSET (3072 + 128)
+/* Base address of vvars. This is not ABI. */
+#define VVAR_ADDRESS (-10*1024*1024 - 4096)
#if defined(__VVAR_KERNEL_LDS)
@@ -26,17 +25,17 @@
* right place.
*/
#define DECLARE_VVAR(offset, type, name) \
- EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset)
+ EMIT_VVAR(name, offset)
#else
#define DECLARE_VVAR(offset, type, name) \
static type const * const vvaraddr_ ## name = \
- (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset));
+ (void *)(VVAR_ADDRESS + (offset));
#define DEFINE_VVAR(type, name) \
- type __vvar_ ## name \
- __attribute__((section(".vsyscall_var_" #name), aligned(16)))
+ type name \
+ __attribute__((section(".vvar_" #name), aligned(16)))
#define VVAR(name) (*vvaraddr_ ## name)
@@ -49,4 +48,3 @@ DECLARE_VVAR(8, int, vgetcpu_mode)
DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
#undef DECLARE_VVAR
-#undef VSYSCALL_VARS_OFFSET
diff --git a/arch/x86/include/mach-xen/asm/desc.h b/arch/x86/include/mach-xen/asm/desc.h
index 336d6208c855..60c5aa946a41 100644
--- a/arch/x86/include/mach-xen/asm/desc.h
+++ b/arch/x86/include/mach-xen/asm/desc.h
@@ -42,7 +42,11 @@ struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
+#ifdef __GENKSYMS__
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+#else
+DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
+#endif
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
diff --git a/arch/x86/include/mach-xen/asm/fixmap.h b/arch/x86/include/mach-xen/asm/fixmap.h
index 1fcb0c29ffcb..0f308d169fb8 100644
--- a/arch/x86/include/mach-xen/asm/fixmap.h
+++ b/arch/x86/include/mach-xen/asm/fixmap.h
@@ -77,6 +77,9 @@ enum fixed_addresses {
VSYSCALL_LAST_PAGE,
VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+ ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+#ifndef __GENKSYMS__
+ VVAR_PAGE,
+#endif
VSYSCALL_HPET,
#endif
FIX_DBGP_BASE,
diff --git a/arch/x86/include/mach-xen/asm/mmu_context.h b/arch/x86/include/mach-xen/asm/mmu_context.h
index b3ca4abcfb38..122e14e92962 100644
--- a/arch/x86/include/mach-xen/asm/mmu_context.h
+++ b/arch/x86/include/mach-xen/asm/mmu_context.h
@@ -73,7 +73,7 @@ void destroy_context(struct mm_struct *mm);
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+#ifndef CONFIG_XEN /* XEN: no lazy tlb */
if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
@@ -121,109 +121,12 @@ static inline void __prepare_arch_switch(void)
#endif
}
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
-{
- unsigned cpu = smp_processor_id();
- struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op;
-#ifdef CONFIG_X86_64
- pgd_t *upgd;
-#endif
-
- if (likely(prev != next)) {
- BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
- !PagePinned(virt_to_page(next->pgd)));
-
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
- percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- percpu_write(cpu_tlbstate.active_mm, next);
-#endif
- cpumask_set_cpu(cpu, mm_cpumask(next));
-
- /*
- * Re-load page tables: load_cr3(next->pgd).
- *
- * This logic has an ordering constraint:
- *
- * CPU 0: Write to a PTE for 'next'
- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
- * CPU 1: set bit 1 in next's mm_cpumask
- * CPU 1: load from the PTE that CPU 0 writes (implicit)
- *
- * We need to prevent an outcome in which CPU 1 observes
- * the new PTE value and CPU 0 observes bit 1 clear in
- * mm_cpumask. (If that occurs, then the IPI will never
- * be sent, and CPU 0's TLB will contain a stale entry.)
- *
- * The bad outcome can occur if either CPU's load is
- * reordered before that CPU's store, so both CPUs much
- * execute full barriers to prevent this from happening.
- *
- * Thus, switch_mm needs a full barrier between the
- * store to mm_cpumask and any operation that could load
- * from next->pgd. This barrier synchronizes with
- * remote TLB flushers. Fortunately, load_cr3 is
- * serializing and thus acts as a full barrier.
- *
- */
- op->cmd = MMUEXT_NEW_BASEPTR;
- op->arg1.mfn = virt_to_mfn(next->pgd);
- op++;
-
- /* xen_new_user_pt(next->pgd) */
-#ifdef CONFIG_X86_64
- op->cmd = MMUEXT_NEW_USER_BASEPTR;
- upgd = __user_pgd(next->pgd);
- op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0;
- op++;
-#endif
+extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
- /*
- * load the LDT, if the LDT is different:
- */
- if (unlikely(prev->context.ldt != next->context.ldt)) {
- /* load_mm_ldt(next) */
- const struct ldt_struct *ldt;
-
- /* lockless_dereference synchronizes with smp_store_release */
- ldt = lockless_dereference(next->context.ldt);
- op->cmd = MMUEXT_SET_LDT;
- if (unlikely(ldt)) {
- op->arg1.linear_addr = (long)ldt->entries;
- op->arg2.nr_ents = ldt->size;
- } else {
- op->arg1.linear_addr = 0;
- op->arg2.nr_ents = 0;
- }
- op++;
- }
-
- BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
-
- /* stop TLB flushes for the previous mm */
- cpumask_clear_cpu(cpu, mm_cpumask(prev));
- }
-#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
- else {
- percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
-
- if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
- /* We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- *
- * As above, this is a barrier that forces
- * TLB repopulation to be ordered after the
- * store to mm_cpumask.
- */
- load_cr3(next->pgd);
- xen_new_user_pt(next->pgd);
- load_mm_ldt(next);
- }
- }
-#endif
-}
+extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
+#define switch_mm_irqs_off switch_mm_irqs_off
#define activate_mm(prev, next) \
do { \
diff --git a/arch/x86/include/mach-xen/asm/pgtable.h b/arch/x86/include/mach-xen/asm/pgtable.h
index e14ba006b988..287b3848d6de 100644
--- a/arch/x86/include/mach-xen/asm/pgtable.h
+++ b/arch/x86/include/mach-xen/asm/pgtable.h
@@ -17,6 +17,11 @@
#ifndef __ASSEMBLY__
#include <asm/x86_init.h>
+#ifdef CONFIG_KAISER
+extern int kaiser_enabled;
+#else
+#define kaiser_enabled 0
+#endif
/*
* ZERO_PAGE is a global shared page that is always zero: used
@@ -580,7 +585,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
static inline int pgd_bad(pgd_t pgd)
{
- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ pgdval_t ignore_flags = _PAGE_USER;
+ /*
+ * We set NX on KAISER pgds that map userspace memory so
+ * that userspace can not meaningfully use the kernel
+ * page table by accident; it will fault on the first
+ * instruction it tries to run. See xen_set_pgd().
+ */
+ if (kaiser_enabled)
+ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
@@ -810,7 +825,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
*/
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
- memcpy(dst, src, count * sizeof(pgd_t));
+ memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+ if (kaiser_enabled) {
+ /* Clone the shadow pgd part as well */
+ memcpy(xen_get_shadow_pgd(dst),
+ xen_get_shadow_pgd(src),
+ count * sizeof(pgd_t));
+ }
+#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/mach-xen/asm/pgtable_64.h b/arch/x86/include/mach-xen/asm/pgtable_64.h
index f58b2ef22f40..f069ff96cee8 100644
--- a/arch/x86/include/mach-xen/asm/pgtable_64.h
+++ b/arch/x86/include/mach-xen/asm/pgtable_64.h
@@ -115,9 +115,31 @@ static inline pgd_t *__user_pgd(pgd_t *pgd)
+ ((unsigned long)pgd & ~PAGE_MASK));
}
+#ifdef CONFIG_KAISER
+extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline pgd_t *xen_get_shadow_pgd(pgd_t *pgdp)
+{
+#ifdef CONFIG_DEBUG_VM
+ /* linux/mmdebug.h may not have been included at this point */
+ BUG_ON(!kaiser_enabled);
+#endif
+ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+}
+#else
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+static inline pgd_t *xen_get_shadow_pgd(pgd_t *pgdp)
+{
+ return NULL;
+}
+#endif /* CONFIG_KAISER */
+
static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- xen_l4_entry_update(pgdp, pgd);
+ xen_l4_entry_update(pgdp, kaiser_set_shadow_pgd(pgdp, pgd));
}
#define xen_pgd_clear(pgd) \
diff --git a/arch/x86/include/mach-xen/asm/pgtable_types.h b/arch/x86/include/mach-xen/asm/pgtable_types.h
index e0ac1f06594d..dbf6b275d640 100644
--- a/arch/x86/include/mach-xen/asm/pgtable_types.h
+++ b/arch/x86/include/mach-xen/asm/pgtable_types.h
@@ -62,7 +62,7 @@
#endif
#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#ifndef __ASSEMBLY__
#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
@@ -82,6 +82,33 @@ extern unsigned int __kernel_page_user;
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
+
+#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
+/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
+#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH (0)
+#define X86_CR3_PCID_USER_FLUSH (0)
+#define X86_CR3_PCID_KERN_NOFLUSH (0)
+#define X86_CR3_PCID_USER_NOFLUSH (0)
+#endif
+
/*
* PAT settings are part of the hypervisor interface, which sets the
* MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
@@ -121,7 +148,8 @@ extern unsigned int __kernel_page_user;
#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
-#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
+#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT)
#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
@@ -143,7 +171,8 @@ extern unsigned int __kernel_page_user;
#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
+#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE)
#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
diff --git a/arch/x86/include/mach-xen/asm/processor.h b/arch/x86/include/mach-xen/asm/processor.h
index 03329cbcacf6..7e764714ffd5 100644
--- a/arch/x86/include/mach-xen/asm/processor.h
+++ b/arch/x86/include/mach-xen/asm/processor.h
@@ -283,7 +283,11 @@ struct tss_struct {
} ____cacheline_aligned;
+#ifndef __GENKSYMS__
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss);
+#else
DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+#endif
/*
* Save the original ist values for checking stack pointers during debugging
diff --git a/arch/x86/include/mach-xen/asm/system.h b/arch/x86/include/mach-xen/asm/system.h
index b2e67609ce12..ffedcd5d4d7c 100644
--- a/arch/x86/include/mach-xen/asm/system.h
+++ b/arch/x86/include/mach-xen/asm/system.h
@@ -554,7 +554,6 @@ void stop_this_cpu(void *dummy);
*/
static __always_inline void rdtsc_barrier(void)
{
- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
}
diff --git a/arch/x86/include/mach-xen/asm/tlbflush.h b/arch/x86/include/mach-xen/asm/tlbflush.h
index 26eaf2e2aa6c..a2bb03743154 100644
--- a/arch/x86/include/mach-xen/asm/tlbflush.h
+++ b/arch/x86/include/mach-xen/asm/tlbflush.h
@@ -6,6 +6,27 @@
#include <asm/processor.h>
#include <asm/system.h>
+#ifndef __GENKSYMS__
+#include <asm/smp.h>
+#endif
+
+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_KAISER
+extern int kaiser_enabled;
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+#define kaiser_enabled 0
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
#define __flush_tlb() xen_tlb_flush()
#define __flush_tlb_global() xen_tlb_flush()
@@ -39,46 +60,8 @@ static inline void __flush_tlb_one(unsigned long addr)
*
* ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up.
- *
- * x86-64 can only flush individual pages or full VMs. For a range flush
- * we always do the full VM. Might be worth trying if for a small
- * range a few INVLPGs in a row are a win.
*/
-#ifndef CONFIG_SMP
-
-#define flush_tlb() __flush_tlb()
-#define flush_tlb_all() __flush_tlb_all()
-#define local_flush_tlb() __flush_tlb()
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
- if (mm == current->active_mm)
- __flush_tlb();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
- unsigned long addr)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_one(addr);
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb();
-}
-
-static inline void reset_lazy_tlbstate(void)
-{
-}
-
-#else /* SMP */
-
-#include <asm/smp.h>
-
#define local_flush_tlb() __flush_tlb()
#define flush_tlb_all xen_tlb_flush_all
@@ -111,8 +94,6 @@ static inline void reset_lazy_tlbstate(void)
}
#endif
-#endif /* SMP */
-
static inline void flush_tlb_kernel_range(unsigned long start,
unsigned long end)
{
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index edd4deb99cf3..674c0f29fbf4 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -16,6 +16,7 @@ obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += proc.o capflags.o powerflags.o common.o
obj-y += vmware.o hypervisor.o sched.o mshyperv.o
obj-y += rdrand.o
+obj-y += spec_ctrl.o
obj-$(CONFIG_X86_32) += bugs.o
obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a84e4483859b..00b914189555 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -8,6 +8,7 @@
#include <asm/apic.h>
#include <asm/cpu.h>
#include <asm/pci-direct.h>
+#include <asm/spec_ctrl.h>
#ifdef CONFIG_X86_64
# include <asm/numa_64.h>
@@ -629,8 +630,17 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_K8);
if (cpu_has_xmm2) {
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+ /*
+ * Use LFENCE for execution serialization. On families which
+ * don't have that MSR, LFENCE is already serialized.
+ */
+ if (c->x86 > 0xf)
+ rdmsrl(MSR_F10H_DECFG, value);
+ value |= MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT;
+ wrmsrl(MSR_F10H_DECFG, value);
+
+ /* LFENCE with MSR_F10H_DECFG[1]=1 stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
}
#ifdef CONFIG_X86_64
@@ -693,6 +703,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);
}
}
+ x86_spec_check();
#endif
}
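init_amd() above makes LFENCE dispatch-serializing on family 10h and later so it can stand in for MFENCE as the RDTSC/speculation barrier. A standalone sketch of that MSR update, assuming the MSR is present and treating MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT as a bit index, as its definition added to msr-index.h suggests:

    #include <linux/types.h>
    #include <asm/msr.h>
    #include <asm/msr-index.h>

    /* Sketch only: set DE_CFG[1] so LFENCE waits for older instructions. */
    static void example_make_lfence_serializing(void)
    {
            u64 value;

            rdmsrl(MSR_F10H_DECFG, value);
            value |= 1ULL << MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT;
            wrmsrl(MSR_F10H_DECFG, value);
    }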
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 8e69ea1e20be..f5a31ccdf7b9 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -161,6 +161,14 @@ static void __init check_config(void)
void __init check_bugs(void)
{
+#ifdef CONFIG_X86_32
+ /*
+ * Regardless of whether PCID is enumerated, the SDM says
+ * that it can't be enabled in 32-bit mode.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
+
identify_boot_cpu();
#ifndef CONFIG_SMP
printk(KERN_INFO "CPU: ");
diff --git a/arch/x86/kernel/cpu/common-xen.c b/arch/x86/kernel/cpu/common-xen.c
index f4fb4679bea8..1f00603ce637 100644
--- a/arch/x86/kernel/cpu/common-xen.c
+++ b/arch/x86/kernel/cpu/common-xen.c
@@ -92,7 +92,11 @@ static const struct cpu_dev __cpuinitconst default_cpu = {
static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+#ifndef __GENKSYMS__
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
+#else
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+#endif
#ifdef CONFIG_X86_64
/*
* We need valid kernel segments for data and code in long mode too
@@ -296,6 +300,49 @@ static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
}
}
+static void setup_pcid(struct cpuinfo_x86 *c)
+{
+#ifndef CONFIG_XEN
+ if (cpu_has(c, X86_FEATURE_PCID)) {
+#ifdef CONFIG_X86_64
+ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
+ /*
+ * Regardless of whether PCID is enumerated, the
+ * SDM says that it can't be enabled in 32-bit mode.
+ */
+ set_in_cr4(X86_CR4_PCIDE);
+ /*
+ * INVPCID has two "groups" of types:
+ * 1/2: Invalidate an individual address
+ * 3/4: Invalidate all contexts
+ *
+ * 1/2 take a PCID, but 3/4 do not. So, 3/4
+ * ignore the PCID argument in the descriptor.
+ * But, we have to be careful not to call 1/2
+ * with an actual non-zero PCID in them before
+ * we do the above set_in_cr4().
+ */
+ if (cpu_has(c, X86_FEATURE_INVPCID))
+ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
+ }
+#else
+ /*
+ * flush_tlb_all(), as currently implemented, won't
+ * work if PCID is on but PGE is not. Since that
+ * combination doesn't exist on real hardware, there's
+ * no reason to try to fully support it, but it's
+ * polite to avoid corrupting data if we're on
+ * an improperly configured VM.
+ */
+ clear_cpu_cap(c, X86_FEATURE_PCID);
+#endif
+ }
+#else
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ setup_clear_cpu_cap(X86_FEATURE_INVPCID);
+#endif
+}
+
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
@@ -608,6 +655,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
c->x86_cache_alignment = c->x86_clflush_size;
}
}
+ kaiser_setup_pcid();
}
void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
@@ -896,6 +944,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
+ /* Set up PCID */
+ setup_pcid(c);
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
@@ -1132,7 +1183,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
#endif
@@ -1251,6 +1302,15 @@ void __cpuinit cpu_init(void)
struct task_struct *me;
int cpu;
+ if (!kaiser_enabled) {
+ /*
+ * secondary_startup_64() deferred setting PGE in cr4:
+ * init_memory_mapping() sets it on the boot cpu,
+ * but it needs to be set on each secondary cpu.
+ */
+ set_in_cr4(X86_CR4_PGE);
+ }
+
cpu = stack_smp_processor_id();
/* CPU 0 is initialised in head64.c */
if (cpu != 0)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4ee9034db13b..abed67096e16 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -84,7 +84,11 @@ static const struct cpu_dev __cpuinitconst default_cpu = {
static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+#ifndef __GENKSYMS__
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
+#else
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+#endif
#ifdef CONFIG_X86_64
/*
* We need valid kernel segments for data and code in long mode too
@@ -155,6 +159,40 @@ static int __init x86_xsaveopt_setup(char *s)
}
__setup("noxsaveopt", x86_xsaveopt_setup);
+#ifdef CONFIG_X86_64
+static int __init x86_pcid_setup(char *s)
+{
+ /* require an exact match without trailing characters */
+ if (strlen(s))
+ return 0;
+
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ pr_info("nopcid: PCID feature disabled\n");
+ return 1;
+}
+__setup("nopcid", x86_pcid_setup);
+#endif
+
+static int __init x86_noinvpcid_setup(char *s)
+{
+ /* noinvpcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
+
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_INVPCID))
+ return 0;
+
+ setup_clear_cpu_cap(X86_FEATURE_INVPCID);
+ pr_info("noinvpcid: INVPCID feature disabled\n");
+ return 0;
+}
+early_param("noinvpcid", x86_noinvpcid_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override __cpuinitdata = -1;
static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -276,6 +314,44 @@ static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
}
}
+static void setup_pcid(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_PCID)) {
+#ifdef CONFIG_X86_64
+ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
+ /*
+ * Regardless of whether PCID is enumerated, the
+ * SDM says that it can't be enabled in 32-bit mode.
+ */
+ set_in_cr4(X86_CR4_PCIDE);
+ /*
+ * INVPCID has two "groups" of types:
+ * 1/2: Invalidate an individual address
+ * 3/4: Invalidate all contexts
+ *
+ * 1/2 take a PCID, but 3/4 do not. So, 3/4
+ * ignore the PCID argument in the descriptor.
+ * But, we have to be careful not to call 1/2
+ * with an actual non-zero PCID in them before
+ * we do the above set_in_cr4().
+ */
+ if (cpu_has(c, X86_FEATURE_INVPCID))
+ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
+ }
+#else
+ /*
+ * Regardless of whether PCID is enumerated, the
+ * SDM says that it can't be enabled in 32-bit mode.
+ */
+ clear_cpu_cap(c, X86_FEATURE_PCID);
+#endif
+ }
+}
+
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
@@ -352,8 +428,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
-__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
-__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_cleared[NCAPINTS];
+__u32 cpu_caps_set[NCAPINTS];
void load_percpu_segment(int cpu)
{
@@ -558,6 +634,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
c->x86_cache_alignment = c->x86_clflush_size;
}
}
+ kaiser_setup_pcid();
}
void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
@@ -833,6 +910,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
+ /* Set up PCID */
+ setup_pcid(c);
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
@@ -1055,7 +1135,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
@@ -1155,6 +1235,15 @@ void __cpuinit cpu_init(void)
int cpu;
int i;
+ if (!kaiser_enabled) {
+ /*
+ * secondary_startup_64() deferred setting PGE in cr4:
+ * init_memory_mapping() sets it on the boot cpu,
+ * but it needs to be set on each secondary cpu.
+ */
+ set_in_cr4(X86_CR4_PGE);
+ }
+
cpu = stack_smp_processor_id();
t = &per_cpu(init_tss, cpu);
oist = &per_cpu(orig_ist, cpu);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 9efe401750e3..907305c40df2 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -14,6 +14,7 @@
#include <asm/msr.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
+#include <asm/spec_ctrl.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -493,6 +494,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
"ENERGY_PERF_BIAS: View and update with"
" cpupower-set(8)\n");
}
+
+ x86_spec_check();
}
#ifdef CONFIG_X86_32
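
A stand-alone sketch (not part of the patch) of the PCID policy the common.c hunks above implement: CR4.PCIDE is only turned on for 64-bit kernels, and only when either hardware PGE is usable or kaiser is enabled. The helper name and the user-space harness are illustrative only.

#include <stdio.h>
#include <stdbool.h>

/* illustrative only: mirrors the setup_pcid() decision shown above */
static bool want_cr4_pcide(bool has_pcid, bool has_pge,
			   bool kaiser_enabled, bool is_64bit)
{
	if (!has_pcid || !is_64bit)
		return false;			/* PCIDE cannot be set in 32-bit mode */
	return has_pge || kaiser_enabled;	/* otherwise flush_tlb_all() would break */
}

int main(void)
{
	printf("PCID+PGE, kaiser off : %d\n", want_cr4_pcide(true, true,  false, true));
	printf("PCID only, kaiser on : %d\n", want_cr4_pcide(true, false, true,  true));
	printf("PCID, 32-bit kernel  : %d\n", want_cr4_pcide(true, true,  true,  false));
	return 0;
}
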
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index e13b51713adf..5a5503325ad7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -2,11 +2,15 @@
#include <linux/types.h>
#include <linux/slab.h>
+#include <asm/kaiser.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include "perf_event.h"
+static
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
+
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
@@ -147,6 +151,39 @@ void fini_debug_store_on_cpu(int cpu)
wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
}
+static void *dsalloc(size_t size, gfp_t flags, int node)
+{
+#ifdef CONFIG_KAISER
+ unsigned int order = get_order(size);
+ struct page *page;
+ unsigned long addr;
+
+ page = alloc_pages_node(node, flags | __GFP_ZERO, order);
+ if (!page)
+ return NULL;
+ addr = (unsigned long)page_address(page);
+ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
+ __free_pages(page, order);
+ addr = 0;
+ }
+ return (void *)addr;
+#else
+ return kmalloc_node(size, flags | __GFP_ZERO, node);
+#endif
+}
+
+static void dsfree(const void *buffer, size_t size)
+{
+#ifdef CONFIG_KAISER
+ if (!buffer)
+ return;
+ kaiser_remove_mapping((unsigned long)buffer, size);
+ free_pages((unsigned long)buffer, get_order(size));
+#else
+ kfree(buffer);
+#endif
+}
+
static int alloc_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -157,7 +194,7 @@ static int alloc_pebs_buffer(int cpu)
if (!x86_pmu.pebs)
return 0;
- buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+ buffer = dsalloc(PEBS_BUFFER_SIZE, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@@ -181,7 +218,7 @@ static void release_pebs_buffer(int cpu)
if (!ds || !x86_pmu.pebs)
return;
- kfree((void *)(unsigned long)ds->pebs_buffer_base);
+ dsfree((void *)(unsigned long)ds->pebs_buffer_base, PEBS_BUFFER_SIZE);
ds->pebs_buffer_base = 0;
}
@@ -195,7 +232,7 @@ static int alloc_bts_buffer(int cpu)
if (!x86_pmu.bts)
return 0;
- buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
+ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@@ -219,19 +256,15 @@ static void release_bts_buffer(int cpu)
if (!ds || !x86_pmu.bts)
return;
- kfree((void *)(unsigned long)ds->bts_buffer_base);
+ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
ds->bts_buffer_base = 0;
}
static int alloc_ds_buffer(int cpu)
{
- int node = cpu_to_node(cpu);
- struct debug_store *ds;
-
- ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
- if (unlikely(!ds))
- return -ENOMEM;
+ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
+ memset(ds, 0, sizeof(*ds));
per_cpu(cpu_hw_events, cpu).ds = ds;
return 0;
@@ -245,7 +278,6 @@ static void release_ds_buffer(int cpu)
return;
per_cpu(cpu_hw_events, cpu).ds = NULL;
- kfree(ds);
}
void release_ds_buffers(void)
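
The dsalloc()/dsfree() pair above is one instance of a pattern this patch uses for every buffer the CPU must be able to reach while the shadow (user) CR3 is live: allocate page-backed memory, add it to the shadow tables, and remove it with the same size before freeing. A minimal sketch of that pairing, assuming only the kaiser_add_mapping()/kaiser_remove_mapping() interfaces visible in this diff; the demo_* names are hypothetical.

#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kaiser.h>

struct demo_buf {
	void	*data;
	size_t	size;
};

static int demo_buf_alloc(struct demo_buf *b, size_t size, int node)
{
	struct page *page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
					     get_order(size));
	if (!page)
		return -ENOMEM;
	b->data = page_address(page);
	b->size = size;
	/* make the buffer visible in the shadow page tables as well */
	if (kaiser_add_mapping((unsigned long)b->data, size, __PAGE_KERNEL) < 0) {
		__free_pages(page, get_order(size));
		b->data = NULL;
		return -ENOMEM;
	}
	return 0;
}

static void demo_buf_free(struct demo_buf *b)
{
	if (!b->data)
		return;
	/* same size as at map time: the shadow tables keep no record of it */
	kaiser_remove_mapping((unsigned long)b->data, b->size);
	free_pages((unsigned long)b->data, get_order(b->size));
	b->data = NULL;
}
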
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index bccb8a8888e9..d8309c1673e0 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -41,6 +41,8 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
{ X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
#ifndef CONFIG_XEN
+ { X86_FEATURE_SPEC_CTRL, CR_EDX,26, 0x00000007, 0 },
+ { X86_FEATURE_IBPB, CR_EBX,12, 0x80000008, 0 },
{ X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
{ X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
{ X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
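
The two table entries added above correspond to CPUID.(EAX=7,ECX=0):EDX[26] (IBRS/IBPB, "SPEC_CTRL") and CPUID.80000008H:EBX[12] (IBPB on AMD), the same leaves x86_spec_check() probes in the new spec_ctrl.c below. A stand-alone probe, assuming a GCC/clang <cpuid.h> that provides __get_cpuid_count():

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.(EAX=7,ECX=0):EDX[26] - IBRS/IBPB ("SPEC_CTRL") */
	if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		printf("SPEC_CTRL: %s\n", (edx & (1u << 26)) ? "yes" : "no");

	/* CPUID.80000008H:EBX[12] - IBPB (AMD) */
	if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		printf("IBPB:      %s\n", (ebx & (1u << 12)) ? "yes" : "no");

	return 0;
}
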
diff --git a/arch/x86/kernel/cpu/spec_ctrl.c b/arch/x86/kernel/cpu/spec_ctrl.c
new file mode 100644
index 000000000000..e5a689196392
--- /dev/null
+++ b/arch/x86/kernel/cpu/spec_ctrl.c
@@ -0,0 +1,98 @@
+/*
+ * Speculation control stuff
+ *
+ */
+#include <linux/module.h>
+
+#include <asm/msr.h>
+#include <asm/proto.h>
+#include <asm/processor.h>
+#include <asm/spec_ctrl.h>
+
+/*
+ * Kept as plain ints to leave room for more flags if needed later.
+ */
+static unsigned int ibrs_state = 0;
+static unsigned int ibpb_state = 0;
+
+unsigned int notrace x86_ibrs_enabled(void)
+{
+ return ibrs_state;
+}
+EXPORT_SYMBOL_GPL(x86_ibrs_enabled);
+
+unsigned int notrace x86_ibpb_enabled(void)
+{
+ return ibpb_state;
+}
+EXPORT_SYMBOL_GPL(x86_ibpb_enabled);
+
+void x86_disable_ibrs(void)
+{
+ if (x86_ibrs_enabled())
+ native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+}
+EXPORT_SYMBOL_GPL(x86_disable_ibrs);
+
+void x86_enable_ibrs(void)
+{
+ if (x86_ibrs_enabled())
+ native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
+}
+EXPORT_SYMBOL_GPL(x86_enable_ibrs);
+
+/*
+ * Do this indirection as otherwise we'd need to backport the
+ * EXPORT_SYMBOL_GPL() for asm stuff.
+ */
+void stuff_RSB(void)
+{
+#ifndef CONFIG_XEN
+ stuff_rsb();
+#endif
+}
+EXPORT_SYMBOL_GPL(stuff_RSB);
+
+/*
+ * Called after upgrading microcode, check CPUID directly.
+ */
+void x86_spec_check(void)
+{
+ if (cpuid_edx(7) & BIT(26)) {
+ ibrs_state = 1;
+ ibpb_state = 1;
+
+ setup_force_cpu_cap(X86_FEATURE_SPEC_CTRL);
+ }
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (cpuid_ebx(0x80000008) & BIT(12)) {
+ ibpb_state = 1;
+ } else {
+ u64 val;
+
+ switch (boot_cpu_data.x86) {
+ case 0x10:
+ case 0x12:
+ case 0x16:
+ pr_info_once("Disabling indirect branch predictor support\n");
+ rdmsrl(MSR_F15H_IC_CFG, val);
+ val |= MSR_F15H_IC_CFG_DIS_IND;
+ wrmsrl(MSR_F15H_IC_CFG, val);
+ break;
+ }
+ ibpb_state = 0;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(x86_spec_check);
+
+static int __init nospec(char *str)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
+ ibrs_state = 0;
+ ibpb_state = 0;
+
+ return 0;
+}
+early_param("nospec", nospec);
diff --git a/arch/x86/kernel/entry_64-xen.S b/arch/x86/kernel/entry_64-xen.S
index 49693e930107..6bed7c967c62 100644
--- a/arch/x86/kernel/entry_64-xen.S
+++ b/arch/x86/kernel/entry_64-xen.S
@@ -57,6 +57,9 @@
#include <asm/ftrace.h>
#include <asm/percpu.h>
#include <asm/pgtable_types.h>
+#include <asm/alternative-asm.h>
+#include <asm/cpufeature.h>
+#include <asm/kaiser.h>
#include <xen/interface/xen.h>
#include <xen/interface/features.h>
@@ -354,6 +357,7 @@ ENTRY(save_args)
testl $3, CS(%rdi)
je 1f
SWAPGS
+ SWITCH_KERNEL_CR3
/*
* irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
@@ -394,6 +398,12 @@ END(save_rest)
#ifndef CONFIG_XEN
/* save complete stack frame */
.pushsection .kprobes.text, "ax"
+/*
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
+ */
ENTRY(save_paranoid)
XCPT_FRAME offset=ORIG_RAX-R15+8
cld
@@ -419,7 +429,26 @@ ENTRY(save_paranoid)
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx,%ebx
-1: ret
+1:
+#ifdef CONFIG_KAISER
+ /*
+ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+ * unconditionally, but we need to find out whether the reverse
+ * should be done on return (conveyed to paranoid_exit in %ebx).
+ */
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ testl $KAISER_SHADOW_PGD_OFFSET, %eax
+ jz 2f
+ orl $2, %ebx
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+ movq %rax, %cr3
+2:
+#endif
+ ret
CFI_ENDPROC
END(save_paranoid)
.popsection
@@ -1152,30 +1181,41 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
* is fundamentally NMI-unsafe. (we cannot change the soft and
* hard flags at once, atomically)
*/
-
- /* ebx: no swapgs flag */
+/*
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
+ */
ENTRY(paranoid_exit)
DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore
- testl $3,CS(%rsp)
- jnz paranoid_userspace
-paranoid_swapgs:
+ movq %rbx, %r12 /* paranoid_userspace uses %ebx */
+ testl $3, CS(%rsp)
+ jnz paranoid_userspace
+paranoid_kernel:
+ movq %r12, %rbx /* restore after paranoid_userspace */
TRACE_IRQS_IRETQ 0
+#ifdef CONFIG_KAISER
+ /* No ALTERNATIVE for X86_FEATURE_KAISER: save_paranoid sets %ebx */
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz paranoid_exit_no_switch
+ SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+ testl $1, %ebx /* swapgs needed? */
+ jnz paranoid_exit_no_swapgs
SWAPGS_UNSAFE_STACK
+paranoid_exit_no_swapgs:
RESTORE_ALL 8
- jmp irq_return
-paranoid_restore:
- TRACE_IRQS_IRETQ 0
- RESTORE_ALL 8
- jmp irq_return
+ jmp irq_return
+
paranoid_userspace:
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs
+ jz paranoid_kernel
movq %rsp,%rdi /* &pt_regs */
call sync_regs
movq %rax,%rsp /* switch stack for scheduling */
@@ -1225,6 +1265,13 @@ ENTRY(error_entry)
movq %r14, R14+8(%rsp)
movq %r15, R15+8(%rsp)
#ifndef CONFIG_XEN
+ /*
+ * error_entry() always returns with a kernel gsbase and
+ * CR3. We must also have a kernel CR3/gsbase before
+ * calling TRACE_IRQS_*. Just unconditionally switch to
+ * the kernel CR3 here.
+ */
+ SWITCH_KERNEL_CR3
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
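
A plain-C restatement (not part of the patch) of the two-bit %ebx protocol save_paranoid hands to paranoid_exit above: bit 0 set means "no swapgs needed", bit 1 set means "SWITCH_USER_CR3 needed".

#include <stdbool.h>
#include <stdio.h>

struct paranoid_exit_plan {
	bool do_swapgs;		/* bit 0 clear => SWAPGS on the way out    */
	bool switch_user_cr3;	/* bit 1 set   => SWITCH_USER_CR3 on exit  */
};

static struct paranoid_exit_plan decode_ebx(unsigned int ebx)
{
	struct paranoid_exit_plan p = {
		.do_swapgs	 = !(ebx & 1),
		.switch_user_cr3 = !!(ebx & 2),
	};
	return p;
}

int main(void)
{
	for (unsigned int ebx = 0; ebx < 4; ebx++) {
		struct paranoid_exit_plan p = decode_ebx(ebx);
		printf("ebx=%u: swapgs=%d switch_user_cr3=%d\n",
		       ebx, p.do_swapgs, p.switch_user_cr3);
	}
	return 0;
}
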
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b03c44a000d0..e84e599ce6e8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,9 @@
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/pgtable_types.h>
+#include <asm/cpufeature.h>
+#include <asm/kaiser.h>
+#include <asm/spec_ctrl.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -328,6 +331,9 @@ ENTRY(save_args)
testl $3, CS(%rdi)
je 1f
SWAPGS
+ SWITCH_KERNEL_CR3
+ ENABLE_IBRS
+ STUFF_RSB
/*
* irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
@@ -366,6 +372,12 @@ END(save_rest)
/* save complete stack frame */
.pushsection .kprobes.text, "ax"
+/*
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
+ */
ENTRY(save_paranoid)
XCPT_FRAME offset=ORIG_RAX-R15+8
cld
@@ -384,6 +396,10 @@ ENTRY(save_paranoid)
movq %r13, R13+8(%rsp)
movq %r14, R14+8(%rsp)
movq %r15, R15+8(%rsp)
+
+ /* Do the stuffing unconditionally from user/kernel to be safe */
+ STUFF_RSB
+
movl $1,%ebx
movl $MSR_GS_BASE,%ecx
rdmsr
@@ -391,7 +407,29 @@ ENTRY(save_paranoid)
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx,%ebx
-1: ret
+1:
+#ifdef CONFIG_KAISER
+ /*
+ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+ * unconditionally, but we need to find out whether the reverse
+ * should be done on return (conveyed to paranoid_exit in %ebx).
+ */
+ ALTERNATIVE "jmp 2f", "", X86_FEATURE_KAISER
+ movq %cr3, %rax
+ testl $KAISER_SHADOW_PGD_OFFSET, %eax
+ jz 2f
+ orl $2, %ebx
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+ ALTERNATIVE ASM_NOP5, "bts $63, %rax", X86_FEATURE_PCID
+ movq %rax, %cr3
+2:
+#endif
+
+ ENABLE_IBRS
+ ret
CFI_ENDPROC
END(save_paranoid)
.popsection
@@ -468,6 +506,7 @@ ENTRY(system_call)
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
@@ -487,6 +526,10 @@ ENTRY(system_call_after_swapgs)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
GET_THREAD_INFO(%rcx)
+
+ ENABLE_IBRS
+ STUFF_RSB
+
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
jnz tracesys
system_call_fastpath:
@@ -519,6 +562,16 @@ sysret_check:
CFI_REGISTER rip,rcx
RESTORE_ARGS 1,-ARG_SKIP,0
/*CFI_REGISTER rflags,r11*/
+
+ DISABLE_IBRS
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+ SWITCH_USER_CR3
movq PER_CPU_VAR(old_rsp), %rsp
USERGS_SYSRET64
@@ -858,6 +911,17 @@ retint_swapgs: /* return to user-space */
*/
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+
+ DISABLE_IBRS
+
+ SWITCH_USER_CR3
SWAPGS
jmp restore_args
@@ -898,6 +962,11 @@ native_irq_return_ldt:
pushq_cfi %rax
pushq_cfi %rdi
SWAPGS
+ SWITCH_KERNEL_CR3
+
+ ENABLE_IBRS
+ STUFF_RSB
+
movq PER_CPU_VAR(espfix_waddr),%rdi
movq %rax,(0*8)(%rdi) /* RAX */
movq (2*8)(%rsp),%rax /* RIP */
@@ -913,6 +982,10 @@ native_irq_return_ldt:
andl $0xffff0000,%eax
popq_cfi %rdi
orq PER_CPU_VAR(espfix_stack),%rax
+
+ DISABLE_IBRS
+
+ SWITCH_USER_CR3
SWAPGS
movq %rax,%rsp
popq_cfi %rax
@@ -1447,30 +1520,44 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
* is fundamentally NMI-unsafe. (we cannot change the soft and
* hard flags at once, atomically)
*/
-
- /* ebx: no swapgs flag */
+/*
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
+ */
ENTRY(paranoid_exit)
DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore
- testl $3,CS(%rsp)
- jnz paranoid_userspace
-paranoid_swapgs:
+ movq %rbx, %r12 /* paranoid_userspace uses %ebx */
+ testl $3, CS(%rsp)
+ jnz paranoid_userspace
+paranoid_kernel:
+ movq %r12, %rbx /* restore after paranoid_userspace */
TRACE_IRQS_IRETQ 0
+
+ DISABLE_IBRS
+
+#ifdef CONFIG_KAISER
+ /* No ALTERNATIVE for X86_FEATURE_KAISER: save_paranoid sets %ebx */
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz paranoid_exit_no_switch
+ SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+ testl $1, %ebx /* swapgs needed? */
+ jnz paranoid_exit_no_swapgs
SWAPGS_UNSAFE_STACK
+paranoid_exit_no_swapgs:
RESTORE_ALL 8
- jmp irq_return
-paranoid_restore:
- TRACE_IRQS_IRETQ 0
- RESTORE_ALL 8
- jmp irq_return
+ jmp irq_return
+
paranoid_userspace:
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%ebx
andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs
+ jz paranoid_kernel
movq %rsp,%rdi /* &pt_regs */
call sync_regs
movq %rax,%rsp /* switch stack for scheduling */
@@ -1518,6 +1605,19 @@ ENTRY(error_entry)
movq %r13, R13+8(%rsp)
movq %r14, R14+8(%rsp)
movq %r15, R15+8(%rsp)
+
+ STUFF_RSB
+
+ /*
+ * error_entry() always returns with a kernel gsbase and
+ * CR3. We must also have a kernel CR3/gsbase before
+ * calling TRACE_IRQS_*. Just unconditionally switch to
+ * the kernel CR3 here.
+ */
+ SWITCH_KERNEL_CR3
+
+ ENABLE_IBRS
+
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
@@ -1664,6 +1764,8 @@ ENTRY(nmi)
*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
+
cld
movq %rsp, %rdx
movq PER_CPU_VAR(kernel_stack), %rsp
@@ -1690,6 +1792,8 @@ ENTRY(nmi)
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+ ENABLE_IBRS
+
/*
* At this point we no longer need to worry about stack damage
* due to nesting -- we're on the normal thread stack and we're
@@ -1700,11 +1804,14 @@ ENTRY(nmi)
movq $-1, %rsi
call do_nmi
+ DISABLE_IBRS
+
/*
* Return back to user mode. We must *not* do the normal exit
* work, because we don't want to enable interrupts. Fortunately,
* do_nmi doesn't modify pt_regs.
*/
+ SWITCH_USER_CR3
SWAPGS
/*
@@ -1863,10 +1970,16 @@ restart_nmi:
je 1f
movq %r12, %cr2
1:
-
- testl %ebx,%ebx /* swapgs needed? */
- jnz nmi_restore
+
+#ifdef CONFIG_KAISER
+ /* No ALTERNATIVE for X86_FEATURE_KAISER: save_paranoid sets %ebx */
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz nmi_swapgs
+ SWITCH_USER_CR3
nmi_swapgs:
+#endif
+ testl $1,%ebx /* swapgs needed? */
+ jnz nmi_restore
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_ALL 8
@@ -1903,6 +2016,76 @@ ENTRY(ignore_sysret)
CFI_ENDPROC
END(ignore_sysret)
+ENTRY(stuff_rsb)
+ call 1f
+ pause
+1: call 2f
+ pause
2: call 3f
+ pause
+3: call 4f
+ pause
+4: call 5f
+ pause
+5: call 6f
+ pause
+6: call 7f
+ pause
+7: call 8f
+ pause
+8: call 9f
+ pause
+9: call 10f
+ pause
+10: call 11f
+ pause
+11: call 12f
+ pause
+12: call 13f
+ pause
+13: call 14f
+ pause
+14: call 15f
+ pause
+15: call 16f
+ pause
+16: call 17f
+ pause
+17: call 18f
+ pause
+18: call 19f
+ pause
+19: call 20f
+ pause
+20: call 21f
+ pause
+21: call 22f
+ pause
+22: call 23f
+ pause
+23: call 24f
+ pause
+24: call 25f
+ pause
+25: call 26f
+ pause
+26: call 27f
+ pause
+27: call 28f
+ pause
+28: call 29f
+ pause
+29: call 30f
+ pause
+30: call 31f
+ pause
+31: call 32f
+ pause
+32:
+ add $(32*8), %rsp
+ ret
+END(stuff_rsb)
+
/*
* End of kprobes section
*/
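
An aside, not part of the patch: each of the 32 call/pause pairs in stuff_rsb above plants a return address that points at a pause speculation trap, so a later speculative ret that consumes the Return Stack Buffer lands somewhere harmless; 32 iterations typically cover the deepest RSB on affected parts, and the closing add must discard exactly the return addresses that were pushed.

#include <assert.h>	/* C11 static_assert */

enum {
	RSB_STUFF_CALLS = 32,	/* matches the unrolled call/pause pairs above */
	RET_ADDR_BYTES  = 8	/* one 64-bit return address pushed per call  */
};

static_assert(RSB_STUFF_CALLS * RET_ADDR_BYTES == 32 * 8,
	      "the trailing 'add $(32*8), %rsp' must pop everything stuff_rsb pushed");
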
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 8a64da36310f..663b5f2eed42 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -40,6 +40,7 @@
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
+#include <asm/kaiser.h>
/*
* Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -128,7 +129,14 @@ void __init init_espfix_bsp(void)
/* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
-
+ /*
+ * Just copy the top-level PGD that is mapping the espfix
+ * area to ensure it is mapped into the shadow user page
+ * tables.
+ */
+ if (kaiser_enabled)
+ set_pgd(native_get_shadow_pgd(pgd_p),
+ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
/* Randomize the locations */
init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 9eaf89345eb6..8eef0c2067e4 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -160,8 +160,8 @@ ENTRY(secondary_startup_64)
* after the boot processor executes this code.
*/
- /* Enable PAE mode and PGE */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %eax
+ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
+ movl $(X86_CR4_PAE | X86_CR4_PSE), %eax
movq %rax, %cr4
/* Setup early boot stage 4 level pagetables. */
@@ -343,6 +343,27 @@ early_idt_ripmsg:
.balign PAGE_SIZE; \
ENTRY(name)
+#ifdef CONFIG_KAISER
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL 512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL 0
+#endif
+
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -358,13 +379,14 @@ ENTRY(name)
* 0xffffffff80000000 to physical address 0x000000. (always using
* 2Mbyte large pages provided by PAE mode)
*/
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -390,6 +412,7 @@ NEXT_PAGE(level2_ident_pgt)
* Don't set NX because code runs from these pages.
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level2_kernel_pgt)
/*
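
The padding above is what makes the shadow-PGD lookup cheap: KAISER_USER_PGD_FILL adds 512 * 8 = 4096 zero bytes, exactly one extra 4 KiB page, and NEXT_PGD_PAGE keeps the pair 8 KiB aligned. A sketch, assuming the pgtable headers changed elsewhere in this patch follow the usual KAISER layout; the demo_ helper is illustrative and only mirrors what the native_get_shadow_pgd() used in the espfix hunk above is expected to do.

#include <asm/page.h>		/* PAGE_SIZE */
#include <asm/pgtable_types.h>	/* pgd_t */

/* illustrative: the user/shadow PGD is the page right after the kernel PGD */
static inline pgd_t *demo_shadow_pgd(pgd_t *kernel_pgd)
{
	/* the PGD pair is 2*PAGE_SIZE aligned, so this selects the second page */
	return (pgd_t *)((unsigned long)kernel_pgd | PAGE_SIZE);
}
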
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 2dfc5cb5aacd..fe7626b1167a 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -10,6 +10,7 @@
#include <linux/cpu.h>
#include <linux/pm.h>
#include <linux/io.h>
+#include <linux/kaiser.h>
#include <asm/fixmap.h>
#include <asm/i8253.h>
@@ -99,7 +100,9 @@ static inline void hpet_set_mapping(void)
{
hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
#ifdef CONFIG_X86_64
- __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+ __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
+ kaiser_add_mapping(__fix_to_virt(VSYSCALL_HPET), PAGE_SIZE,
+ __PAGE_KERNEL_VVAR_NOCACHE);
#endif
}
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index c50f8639545b..c0eed0ccc7f8 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -39,5 +39,10 @@ EXPORT_SYMBOL(init_task);
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
+#if defined(__GENKSYMS__)
DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+#else
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, init_tss) = INIT_TSS;
#endif
+#endif
+
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 05e4da281025..961e52b71154 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -85,7 +85,7 @@ static struct irqaction irq2 = {
.flags = IRQF_NO_THREAD,
};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
[0 ... NR_VECTORS - 1] = -1,
};
diff --git a/arch/x86/kernel/ldt-xen.c b/arch/x86/kernel/ldt-xen.c
index c822a047e2e8..d0c2212e5a90 100644
--- a/arch/x86/kernel/ldt-xen.c
+++ b/arch/x86/kernel/ldt-xen.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
+#include <linux/kaiser.h>
#include <asm/system.h>
#include <asm/ldt.h>
@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
set_ldt(ldt->entries, ldt->size);
}
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(ldt->entries);
+ else
+ free_page((unsigned long)ldt->entries);
+ kfree(ldt);
+}
+
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(int size)
{
struct ldt_struct *new_ldt;
int alloc_size;
+ int ret;
if (size > LDT_ENTRIES)
return NULL;
@@ -65,7 +76,14 @@ static struct ldt_struct *alloc_ldt_struct(int size)
kfree(new_ldt);
return NULL;
}
+
+ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+ __PAGE_KERNEL);
new_ldt->size = size;
+ if (ret) {
+ __free_ldt_struct(new_ldt);
+ return NULL;
+ }
return new_ldt;
}
@@ -96,13 +114,11 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt))
return;
+ kaiser_remove_mapping((unsigned long)ldt->entries,
+ ldt->size * LDT_ENTRY_SIZE);
make_pages_writable(ldt->entries, PFN_UP(ldt->size * LDT_ENTRY_SIZE),
XENFEAT_writable_descriptor_tables);
- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(ldt->entries);
- else
- free_page((unsigned long)ldt->entries);
- kfree(ldt);
+ __free_ldt_struct(ldt);
}
/*
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index f72f6ed6b619..9dab02470fc4 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
+#include <linux/kaiser.h>
#include <asm/system.h>
#include <asm/ldt.h>
@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
set_ldt(ldt->entries, ldt->size);
}
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(ldt->entries);
+ else
+ kfree(ldt->entries);
+ kfree(ldt);
+}
+
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(int size)
{
struct ldt_struct *new_ldt;
int alloc_size;
+ int ret;
if (size > LDT_ENTRIES)
return NULL;
@@ -65,7 +76,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
return NULL;
}
+ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+ __PAGE_KERNEL);
new_ldt->size = size;
+ if (ret) {
+ __free_ldt_struct(new_ldt);
+ return NULL;
+ }
return new_ldt;
}
@@ -95,12 +112,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt))
return;
+ kaiser_remove_mapping((unsigned long)ldt->entries,
+ ldt->size * LDT_ENTRY_SIZE);
paravirt_free_ldt(ldt->entries, ldt->size);
- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(ldt->entries);
- else
- kfree(ldt->entries);
- kfree(ldt);
+ __free_ldt_struct(ldt);
}
/*
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index c598a6c8fedd..99ee721e8068 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -87,6 +87,7 @@
#include <asm/microcode.h>
#include <asm/processor.h>
#include <asm/perf_event.h>
+#include <asm/spec_ctrl.h>
MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -321,8 +322,11 @@ static ssize_t reload_store(struct sys_device *dev,
if (!ret)
ret = tmp_ret;
}
- if (!ret)
+ if (!ret) {
perf_check_microcode();
+ x86_spec_check();
+ }
+
mutex_unlock(&microcode_mutex);
put_online_cpus();
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3a40deb0dd22..98e748611f19 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -22,6 +22,7 @@
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/debugreg.h>
+#include <asm/spec_ctrl.h>
struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);
@@ -432,10 +433,14 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
+ x86_disable_ibrs();
+
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__mwait(ax, cx);
+
+ x86_enable_ibrs();
}
}
@@ -448,12 +453,17 @@ static void mwait_idle(void)
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
+ x86_disable_ibrs();
+
__monitor((void *)&current_thread_info()->flags, 0, 0);
smp_mb();
if (!need_resched())
__sti_mwait(0, 0);
else
local_irq_enable();
+
+ x86_enable_ibrs();
+
trace_power_end(smp_processor_id());
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
} else
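
A sketch of the shape the process.c hunks above give the mwait idle paths, with the rest of the idle loop stripped away; the usual rationale is that leaving IBRS set while a thread sits in mwait throttles its hyperthread sibling, so it is dropped for the wait and re-armed afterwards. demo_mwait_idle_block() is illustrative; the IBRS helpers are the ones exported by spec_ctrl.c earlier in this patch.

#include <linux/sched.h>	/* need_resched(), current_thread_info() */
#include <asm/processor.h>	/* __monitor(), __mwait() in this tree */
#include <asm/system.h>		/* smp_mb() */
#include <asm/spec_ctrl.h>

static void demo_mwait_idle_block(unsigned long ax, unsigned long cx)
{
	x86_disable_ibrs();	/* don't penalise the sibling while sleeping */

	__monitor((void *)&current_thread_info()->flags, 0, 0);
	smp_mb();
	if (!need_resched())
		__mwait(ax, cx);

	x86_enable_ibrs();	/* back under IBRS before doing real work */
}
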
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 1e2a86545f7e..d56335cc6c5e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -53,7 +53,7 @@
asmlinkage extern void ret_from_fork(void);
-DEFINE_PER_CPU(unsigned long, old_rsp);
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d04115afa311..686a2a5fd74a 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -377,6 +377,12 @@ void machine_real_restart(unsigned int type)
lowmem_gdt[1] =
GDT_ENTRY(0x009b, restart_pa, 0xffff);
+#ifdef CONFIG_X86_64
+ /* Exiting long mode will fail if CR4.PCIDE is set. */
+ if (static_cpu_has(X86_FEATURE_PCID))
+ clear_in_cr4(X86_CR4_PCIDE);
+#endif
+
/* Jump to the identity-mapped low memory code */
restart_lowmem(type);
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4d20a30930d4..143a2e726cb3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -114,6 +114,7 @@
#include <asm/mce.h>
#include <asm/alternative.h>
#include <asm/prom.h>
+#include <asm/kaiser.h>
/*
* end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -915,6 +916,12 @@ void __init setup_arch(char **cmdline_p)
*/
init_hypervisor_platform();
+ /*
+ * This needs to happen right after XENPV has been set up on Xen,
+ * and before cleanup_highmap() below looks at kaiser_enabled.
+ */
+ kaiser_check_boottime_disable();
+
x86_init.resources.probe_roms();
/* after parse_early_param, so could debug it */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 4c852c68beed..3734d3919b66 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -71,6 +71,7 @@
#include <asm/smpboot_hooks.h>
#include <asm/i8259.h>
+#include <asm/spec_ctrl.h>
/* State of each CPU */
DEFINE_PER_CPU(int, cpu_state) = { 0 };
@@ -1432,8 +1433,12 @@ void native_play_dead(void)
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
+ x86_disable_ibrs();
+
mwait_play_dead(); /* Only returns on failure */
hlt_play_dead();
+
+ x86_enable_ibrs();
}
#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 55b755289e10..048e33ff8ca8 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -167,12 +167,6 @@ SECTIONS
#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
- ADDR(.vsyscall_0) + offset \
- : AT(VLOAD(.vsyscall_var_ ## x)) { \
- *(.vsyscall_var_ ## x) \
- } \
- x = VVIRT(.vsyscall_var_ ## x);
. = ALIGN(4096);
__vsyscall_0 = .;
@@ -198,19 +192,31 @@ SECTIONS
*(.vsyscall_3)
}
-#define __VVAR_KERNEL_LDS
-#include <asm/vvar.h>
-#undef __VVAR_KERNEL_LDS
-
- . = __vsyscall_0 + PAGE_SIZE;
+ . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE);
#undef VSYSCALL_ADDR
#undef VLOAD_OFFSET
#undef VLOAD
#undef VVIRT_OFFSET
#undef VVIRT
+
+ __vvar_page = .;
+
+ .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
+
+ /* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) \
+ . = offset; \
+ *(.vvar_ ## name)
+#define __VVAR_KERNEL_LDS
+#include <asm/vvar.h>
+#undef __VVAR_KERNEL_LDS
#undef EMIT_VVAR
+ } :data
+
+ . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
+
#endif /* CONFIG_X86_64 */
/* Init code and data - will be freed after init */
diff --git a/arch/x86/kernel/vsyscall_64-xen.c b/arch/x86/kernel/vsyscall_64-xen.c
index 911b7a270bfb..12176fc484d9 100644
--- a/arch/x86/kernel/vsyscall_64-xen.c
+++ b/arch/x86/kernel/vsyscall_64-xen.c
@@ -49,6 +49,8 @@
__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
#define __syscall_clobber "r11","cx","memory"
+unsigned long vsyscall_pgprot = __PAGE_KERNEL_VSYSCALL;
+
DEFINE_VVAR(int, vgetcpu_mode);
DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
{
@@ -284,9 +286,14 @@ void __init map_vsyscall(void)
{
extern char __vsyscall_0;
unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
+ extern char __vvar_page;
+ unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
- __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, __pgprot(vsyscall_pgprot));
+ __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
+ (unsigned long)VVAR_ADDRESS);
}
static int __init vsyscall_init(void)
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3e682184d76c..8f4497a0e8ba 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -49,6 +49,8 @@
__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
#define __syscall_clobber "r11","cx","memory"
+unsigned long vsyscall_pgprot = __PAGE_KERNEL_VSYSCALL;
+
DEFINE_VVAR(int, vgetcpu_mode);
DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
{
@@ -284,9 +286,14 @@ void __init map_vsyscall(void)
{
extern char __vsyscall_0;
unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
+ extern char __vvar_page;
+ unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
- __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, __pgprot(vsyscall_pgprot));
+ __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
+ (unsigned long)VVAR_ADDRESS);
}
static int __init vsyscall_init(void)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ede1c237acf8..28f4724816d9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -33,6 +33,7 @@
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/kvm_para.h>
+#include <asm/spec_ctrl.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -128,6 +129,8 @@ struct vcpu_svm {
u64 next_rip;
+ u64 spec_ctrl;
+
u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
struct {
u16 fs;
@@ -170,6 +173,8 @@ static struct svm_direct_access_msrs {
{ .index = MSR_CSTAR, .always = true },
{ .index = MSR_SYSCALL_MASK, .always = true },
#endif
+ { .index = MSR_IA32_SPEC_CTRL, .always = true },
+ { .index = MSR_IA32_PRED_CMD, .always = true },
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
@@ -382,6 +387,8 @@ struct svm_cpu_data {
struct kvm_ldttss_desc *tss_desc;
struct page *save_area;
+
+ struct vmcb *current_vmcb;
};
static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
@@ -1286,11 +1293,18 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
+
+ /*
+ * The VMCB could be recycled, causing a false negative in svm_vcpu_load;
+ * block speculative execution.
+ */
+ x86_ibp_barrier();
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
int i;
if (unlikely(cpu != vcpu->cpu)) {
@@ -1313,6 +1327,11 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
__get_cpu_var(current_tsc_ratio) = svm->tsc_ratio;
wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
}
+
+ if (sd->current_vmcb != svm->vmcb) {
+ sd->current_vmcb = svm->vmcb;
+ x86_ibp_barrier();
+ }
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
@@ -3053,6 +3072,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
case MSR_VM_CR:
*data = svm->nested.vm_cr_msr;
break;
+ case MSR_IA32_SPEC_CTRL:
+ *data = svm->spec_ctrl;
+ break;
case MSR_IA32_UCODE_REV:
*data = 0x01000065;
break;
@@ -3168,6 +3190,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_VM_IGNNE:
pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
+ case MSR_IA32_SPEC_CTRL:
+ svm->spec_ctrl = data;
+ break;
default:
return kvm_set_msr_common(vcpu, msr);
}
@@ -3805,6 +3830,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
local_irq_enable();
+ if (x86_ibrs_enabled() && (svm->spec_ctrl != FEATURE_ENABLE_IBRS))
+ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+
asm volatile (
"push %%"R"bp; \n\t"
"mov %c[rbx](%[svm]), %%"R"bx \n\t"
@@ -3849,6 +3877,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
"mov %%r14, %c[r14](%[svm]) \n\t"
"mov %%r15, %c[r15](%[svm]) \n\t"
#endif
+ /* Clear host registers (marked as clobbered so it's safe) */
+ "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
+ "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
+ "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
+ "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
+ "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
+#ifdef CONFIG_X86_64
+ "xor %%r8, %%r8 \n\t"
+ "xor %%r9, %%r9 \n\t"
+ "xor %%r10, %%r10 \n\t"
+ "xor %%r11, %%r11 \n\t"
+ "xor %%r12, %%r12 \n\t"
+ "xor %%r13, %%r13 \n\t"
+ "xor %%r14, %%r14 \n\t"
+ "xor %%r15, %%r15 \n\t"
+#endif
"pop %%"R"bp"
:
: [svm]"a"(svm),
@@ -3876,6 +3920,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
+ if (x86_ibrs_enabled()) {
+ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+ if (svm->spec_ctrl != FEATURE_ENABLE_IBRS)
+ wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
+ }
+
+ stuff_RSB();
+
#ifdef CONFIG_X86_64
wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
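
Pulled out as a sketch (not new code in the patch): the SVM hunks above install the guest's IA32_SPEC_CTRL just before VMRUN and read it back afterwards, forcing IBRS back on for the host and refilling the RSB so nothing guest-controlled survives the exit. The helpers are the ones this patch adds in spec_ctrl.c; demo_spec_ctrl_around_vmrun() is illustrative.

#include <linux/types.h>
#include <asm/msr.h>
#include <asm/spec_ctrl.h>

/* illustrative only; mirrors the svm_vcpu_run() changes above */
static void demo_spec_ctrl_around_vmrun(u64 *guest_spec_ctrl)
{
	if (x86_ibrs_enabled() && *guest_spec_ctrl != FEATURE_ENABLE_IBRS)
		wrmsrl(MSR_IA32_SPEC_CTRL, *guest_spec_ctrl);

	/* ... VMRUN and guest execution happen here ... */

	if (x86_ibrs_enabled()) {
		rdmsrl(MSR_IA32_SPEC_CTRL, *guest_spec_ctrl);
		if (*guest_spec_ctrl != FEATURE_ENABLE_IBRS)
			wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS);
	}
	stuff_RSB();	/* drop any return targets the guest trained */
}
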
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e765dbf90ce7..a7d7d032d72f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -39,6 +39,9 @@
#include <asm/mce.h>
#include <asm/i387.h>
#include <asm/xcr.h>
+#include <asm/msr-index.h>
+#include <asm/spec_ctrl.h>
+#include <asm/proto.h>
#include "trace.h"
@@ -463,6 +466,8 @@ struct vcpu_vmx {
/* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested;
+
+ u64 spec_ctrl;
};
enum segment_cache_field {
@@ -1605,6 +1610,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
+
+ x86_ibp_barrier();
}
if (vmx->loaded_vmcs->cpu != cpu) {
@@ -2266,6 +2273,7 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
*/
static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 data;
struct shared_msr_entry *msr;
@@ -2283,8 +2291,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
data = vmcs_readl(GUEST_GS_BASE);
break;
case MSR_KERNEL_GS_BASE:
- vmx_load_host_state(to_vmx(vcpu));
- data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+ vmx_load_host_state(vmx);
+ data = vmx->msr_guest_kernel_gs_base;
break;
#endif
case MSR_EFER:
@@ -2292,6 +2300,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
case MSR_IA32_TSC:
data = guest_read_tsc();
break;
+ case MSR_IA32_SPEC_CTRL:
+ data = vmx->spec_ctrl;
+ break;
case MSR_IA32_SYSENTER_CS:
data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -2302,16 +2313,16 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_TSC_AUX:
- if (!to_vmx(vcpu)->rdtscp_enabled)
+ if (!vmx->rdtscp_enabled)
return 1;
/* Otherwise falls through */
default:
- vmx_load_host_state(to_vmx(vcpu));
+ vmx_load_host_state(vmx);
if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
return 0;
- msr = find_msr_entry(to_vmx(vcpu), msr_index);
+ msr = find_msr_entry(vmx, msr_index);
if (msr) {
- vmx_load_host_state(to_vmx(vcpu));
+ vmx_load_host_state(vmx);
data = msr->data;
break;
}
@@ -2366,6 +2377,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
+ case MSR_IA32_SPEC_CTRL:
+ vmx->spec_ctrl = msr_info->data;
+ break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -6725,6 +6739,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ if (x86_ibrs_enabled())
+ add_atomic_switch_msr(vmx, MSR_IA32_SPEC_CTRL,
+ vmx->spec_ctrl, FEATURE_ENABLE_IBRS);
+
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
@@ -6825,6 +6843,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
+ stuff_RSB();
+
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
| (1 << VCPU_EXREG_RFLAGS)
| (1 << VCPU_EXREG_CPL)
@@ -8082,6 +8102,8 @@ static int __init vmx_init(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SPEC_CTRL, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_PRED_CMD, false);
memcpy(vmx_msr_bitmap_legacy_x2apic,
vmx_msr_bitmap_legacy, PAGE_SIZE);
memcpy(vmx_msr_bitmap_longmode_x2apic,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cdce2fbabea7..998e368219e4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -904,7 +904,8 @@ static u32 msrs_to_save[] = {
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_SPEC_CTRL,
};
static unsigned num_msrs_to_save;
@@ -2549,6 +2550,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(RTM) | F(FSGSBASE) | F(ERMS) |
f_invpcid;
+ const u32 kvm_cpuid_7_0_edx_x86_features = F(SPEC_CTRL);
+
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
do_cpuid_1_ent(entry, function, index);
@@ -2612,7 +2615,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ebx = 0;
entry->eax = 0;
entry->ecx = 0;
- entry->edx = 0;
+ entry->edx &= kvm_cpuid_7_0_edx_x86_features;
break;
}
/* function 0xb has additional index. */
@@ -2674,6 +2677,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= kvm_supported_word6_x86_features;
cpuid_mask(&entry->ecx, 6);
break;
+
+ case 0x80000008:
+ entry->eax = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+
+ /*
+ * cpuid 0x80000008.0.ebx
+ *
+ * Boris: hardcode due to prior kABI fix.
+ */
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ entry->ebx |= (1 << 12);
+
+ break;
+
/*Add support for Centaur's CPUID instruction*/
case 0xC0000000:
/*Just support up to 0xC0000004 now*/
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index a34c2d9d1860..bb850f84f449 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -17,7 +17,7 @@ clean-files := inat-tables.c
obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
obj-$(CONFIG_XEN) += cache-smp.o
-lib-y := delay.o
+lib-y := delay.o cmdline.o
lib-y += thunk_$(BITS).o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
new file mode 100644
index 000000000000..a744506856b1
--- /dev/null
+++ b/arch/x86/lib/cmdline.c
@@ -0,0 +1,189 @@
+/*
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * Misc librarized functions for cmdline poking.
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <asm/setup.h>
+
+static inline int myisspace(u8 c)
+{
+ return c <= ' '; /* Close enough approximation */
+}
+
+/**
+ * Find a boolean option (like quiet,noapic,nosmp....)
+ *
+ * @cmdline: the cmdline string
+ * @option: option string to look for
+ *
+ * Returns the position of that @option (starts counting with 1)
+ * or 0 on not found.
+ */
+int cmdline_find_option_bool(const char *cmdline, const char *option)
+{
+ char c;
+ int len, pos = 0, wstart = 0;
+ const char *opptr = NULL;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE);
+ if (!len)
+ return 0;
+
+ while (len--) {
+ c = *(char *)cmdline++;
+ pos++;
+
+ switch (state) {
+ case st_wordstart:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ wstart = pos;
+ /* fall through */
+
+ case st_wordcmp:
+ if (!*opptr)
+ if (!c || myisspace(c))
+ return wstart;
+ else
+ state = st_wordskip;
+ else if (!c)
+ return 0;
+ else if (c != *opptr++)
+ state = st_wordskip;
+ else if (!len) /* last word and is matching */
+ return wstart;
+ break;
+
+ case st_wordskip:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ state = st_wordstart;
+ break;
+ }
+ }
+
+ return 0; /* Buffer overrun */
+}
+
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of if it was
+ * truncated to fit in the buffer), or -1 on not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+ const char *option, char *buffer, int bufsize)
+{
+ char c;
+ int pos = 0, len = -1;
+ const char *opptr = NULL;
+ char *bufptr = buffer;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ st_bufcpy, /* Copying this to buffer */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ /*
+ * This 'pos' check ensures we do not overrun
+ * a non-NULL-terminated 'cmdline'
+ */
+ while (pos++ < max_cmdline_size) {
+ c = *(char *)cmdline++;
+ if (!c)
+ break;
+
+ switch (state) {
+ case st_wordstart:
+ if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ /* fall through */
+
+ case st_wordcmp:
+ if ((c == '=') && !*opptr) {
+ /*
+ * We matched all the way to the end of the
+ * option we were looking for, prepare to
+ * copy the argument.
+ */
+ len = 0;
+ bufptr = buffer;
+ state = st_bufcpy;
+ break;
+ } else if (c == *opptr++) {
+ /*
+ * We are currently matching, so continue
+ * to the next character on the cmdline.
+ */
+ break;
+ }
+ state = st_wordskip;
+ /* fall through */
+
+ case st_wordskip:
+ if (myisspace(c))
+ state = st_wordstart;
+ break;
+
+ case st_bufcpy:
+ if (myisspace(c)) {
+ state = st_wordstart;
+ } else {
+ /*
+ * Increment len, but don't overrun the
+ * supplied buffer and leave room for the
+ * NULL terminator.
+ */
+ if (++len < bufsize)
+ *bufptr++ = c;
+ }
+ break;
+ }
+ }
+
+ if (bufsize)
+ *bufptr = '\0';
+
+ return len;
+}
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+ int bufsize)
+{
+ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
+ buffer, bufsize);
+}
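
Usage sketch, mirroring kaiser_check_boottime_disable() further down in this patch: cmdline_find_option() copies the argument of "pti=..." into a small buffer and returns its length, while cmdline_find_option_bool() only reports whether a bare "nopti" is present. demo_parse_pti() is illustrative.

#include <linux/string.h>
#include <asm/cmdline.h>

/* returns 1 if page-table isolation should stay enabled, 0 if not */
static int demo_parse_pti(const char *cmdline)
{
	char arg[5];
	int ret = cmdline_find_option(cmdline, "pti", arg, sizeof(arg));

	if (ret > 0 && !strncmp(arg, "off", 3))
		return 0;				/* pti=off */
	if (cmdline_find_option_bool(cmdline, "nopti"))
		return 0;				/* bare nopti */
	return 1;					/* default: enabled */
}
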
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index ef634b722908..cc085a9c249b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o gup.o setup_nx.o
+ pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
@@ -7,7 +7,6 @@ CFLAGS_physaddr.o := $(nostackp)
CFLAGS_setup_nx.o := $(nostackp)
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
-obj-$(CONFIG_SMP) += tlb.o
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
@@ -28,8 +27,9 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_XEN) += hypervisor.o
-disabled-obj-$(CONFIG_XEN) := gup.o tlb.o
+disabled-obj-$(CONFIG_XEN) := gup.o
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_MEMTEST) += memtest.o
+obj-$(CONFIG_KAISER) += kaiser.o
diff --git a/arch/x86/mm/init-xen.c b/arch/x86/mm/init-xen.c
index 5e7626fafe53..b5b0be24aab3 100644
--- a/arch/x86/mm/init-xen.c
+++ b/arch/x86/mm/init-xen.c
@@ -179,7 +179,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
set_in_cr4(X86_CR4_PSE);
/* Enable PGE if available */
- if (cpu_has_pge) {
+ if (cpu_has_pge && !kaiser_enabled) {
set_in_cr4(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 3e8bfea191e6..b733607ae18b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -187,7 +187,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
set_in_cr4(X86_CR4_PSE);
/* Enable PGE if available */
- if (cpu_has_pge) {
+ if (cpu_has_pge && !kaiser_enabled) {
set_in_cr4(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
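
The reason for the !kaiser_enabled guard above: a global TLB entry survives ordinary CR3 writes, so if kernel pages stayed _PAGE_GLOBAL the kernel-to-shadow CR3 switches added by the entry code would not actually hide them. With kaiser on, CR4.PGE stays clear and _PAGE_GLOBAL never enters __supported_pte_mask. A trivial stand-alone illustration of that mask decision (DEMO_PAGE_GLOBAL is just the x86 PTE 'G' bit):

#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_GLOBAL	(1ULL << 8)	/* x86 PTE 'G' bit */

int main(void)
{
	uint64_t supported_pte_mask = ~DEMO_PAGE_GLOBAL;	/* default: no globals */
	int cpu_has_pge = 1, kaiser_enabled = 1;

	if (cpu_has_pge && !kaiser_enabled)
		supported_pte_mask |= DEMO_PAGE_GLOBAL;

	printf("global kernel mappings allowed: %s\n",
	       (supported_pte_mask & DEMO_PAGE_GLOBAL) ? "yes" : "no");
	return 0;
}
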
diff --git a/arch/x86/mm/init_64-xen.c b/arch/x86/mm/init_64-xen.c
index 90a58c247596..26e3ec11dbe7 100644
--- a/arch/x86/mm/init_64-xen.c
+++ b/arch/x86/mm/init_64-xen.c
@@ -480,6 +480,16 @@ void __init cleanup_highmap(void)
continue;
if (vaddr < (unsigned long) _text || vaddr > end)
set_pmd(pmd, __pmd(0));
+ else if (kaiser_enabled) {
+ /*
+ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
+ * clear that now. This is not important, so long as
+ * CR4.PGE remains clear, but it removes an anomaly.
+ * Physical mapping setup below avoids _PAGE_GLOBAL
+ * by use of massage_pgprot() inside pfn_pte() etc.
+ */
+ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
+ }
}
}
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b7aa3c9eb407..943dad47f648 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -312,6 +312,16 @@ void __init cleanup_highmap(void)
continue;
if (vaddr < (unsigned long) _text || vaddr > end)
set_pmd(pmd, __pmd(0));
+ else if (kaiser_enabled) {
+ /*
+ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
+ * clear that now. This is not important, so long as
+ * CR4.PGE remains clear, but it removes an anomaly.
+ * Physical mapping setup below avoids _PAGE_GLOBAL
+ * by use of massage_pgprot() inside pfn_pte() etc.
+ */
+ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
+ }
}
}
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644
index 000000000000..ed70ded22479
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,437 @@
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+extern struct mm_struct init_mm;
+
+#include <asm/kaiser.h>
+#include <asm/tlbflush.h> /* to verify its kaiser declarations */
+#include <asm/vsyscall.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#include <asm/asm-offsets.h>
+#include <asm/cmdline.h>
+
+int kaiser_enabled __read_mostly = 1;
+EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
+
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+extern char __sched_text_start[], __irqentry_text_start[], __irqentry_text_end[];
+
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3. It would take
+ * another register. So, we use a memory reference to these instead.
+ *
+ * This is also handy because systems that do not support PCIDs
+ * just end up or'ing a 0 into their CR3, which does no harm.
+ */
+DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes. No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte(). We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+/*
+ * Returns -1 on error.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(vaddr);
+ /*
+ * We made all the kernel PGDs present in kaiser_init().
+ * We expect them to stay that way.
+ */
+ BUG_ON(pgd_none(*pgd));
+ /*
+ * PGDs are either 512GB or 128TB on all x86_64
+ * configurations. We don't handle these.
+ */
+ BUG_ON(pgd_large(*pgd));
+
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (pud_large(*pud))
+ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (pmd_large(*pmd))
+ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (pte_none(*pte)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address)
+{
+ pmd_t *pmd;
+ pud_t *pud;
+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+ if (pgd_none(*pgd)) {
+ WARN_ONCE(1, "All shadow pgds should have been populated");
+ return NULL;
+ }
+ BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+ pud = pud_offset(pgd, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pud_large(*pud)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pud_none(*pud)) {
+ unsigned long new_pmd_page = __get_free_page(gfp);
+ if (!new_pmd_page)
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+ if (pud_none(*pud))
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(new_pmd_page)));
+ else
+ free_page(new_pmd_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ pmd = pmd_offset(pud, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pmd_large(*pmd)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pmd_none(*pmd)) {
+ unsigned long new_pte_page = __get_free_page(gfp);
+ if (!new_pte_page)
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+ if (pmd_none(*pmd))
+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(new_pte_page)));
+ else
+ free_page(new_pte_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ return pte_offset_kernel(pmd, address);
+}
+
+static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+ unsigned long flags)
+{
+ int ret = 0;
+ pte_t *pte;
+ unsigned long start_addr = (unsigned long )__start_addr;
+ unsigned long address = start_addr & PAGE_MASK;
+ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+ unsigned long target_address;
+
+ /*
+ * It is convenient for callers to pass in __PAGE_KERNEL etc,
+ * and there is no actual harm from setting _PAGE_GLOBAL, so
+ * long as CR4.PGE is not set. But it is nonetheless troubling
+ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
+ * requires that not to be #defined to 0): so mask it off here.
+ */
+ flags &= ~_PAGE_GLOBAL;
+
+ if (flags & _PAGE_USER)
+ BUG_ON(address < FIXADDR_START || end_addr >= FIXADDR_TOP);
+
+ for (; address < end_addr; address += PAGE_SIZE) {
+ target_address = get_pa_from_mapping(address);
+ if (target_address == -1) {
+ ret = -EIO;
+ break;
+ }
+ pte = kaiser_pagetable_walk(address);
+ if (!pte) {
+ ret = -ENOMEM;
+ break;
+ }
+ if (pte_none(*pte)) {
+ set_pte(pte, __pte(flags | target_address));
+ } else {
+ pte_t tmp;
+ set_pte(&tmp, __pte(flags | target_address));
+ WARN_ON_ONCE(!pte_same(*pte, tmp));
+ }
+ }
+ return ret;
+}
+
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+ unsigned long size = end - start;
+
+ return kaiser_add_user_map(start, size, flags);
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated. This ensures that all processes that get
+ * forked have the same entries. This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
+{
+ pgd_t *pgd;
+ int i = 0;
+
+ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+ pgd_t new_pgd;
+ pud_t *pud = pud_alloc_one(&init_mm,
+ PAGE_OFFSET + i * PGDIR_SIZE);
+ if (!pud) {
+ WARN_ON(1);
+ break;
+ }
+ new_pgd = __pgd(_PAGE_TABLE |__pa(pud));
+ /*
+ * Make sure not to stomp on some other pgd entry.
+ */
+ if (!pgd_none(pgd[i])) {
+ WARN_ON(1);
+ continue;
+ }
+ set_pgd(pgd + i, new_pgd);
+ }
+}
+
+#define kaiser_add_user_map_early(start, size, flags) do { \
+ int __ret = kaiser_add_user_map(start, size, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
+ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+void __init kaiser_check_boottime_disable(void)
+{
+ bool enable = true;
+ char arg[5];
+ int ret;
+
+ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+ if (ret > 0) {
+ if (!strncmp(arg, "on", 2))
+ goto enable;
+
+ if (!strncmp(arg, "off", 3))
+ goto disable;
+
+ if (!strncmp(arg, "auto", 4))
+ goto skip;
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "nopti"))
+ goto disable;
+
+skip:
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ goto disable;
+
+enable:
+ if (enable)
+ setup_force_cpu_cap(X86_FEATURE_KAISER);
+
+ return;
+
+disable:
+ pr_info("Kernel/User page tables isolation: disabled\n");
+ kaiser_enabled = 0;
+ setup_clear_cpu_cap(X86_FEATURE_KAISER);
+}
+
+/*
+ * If anything in here fails, we will likely crash on one of the
+ * first kernel->user transitions and init will die. But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it. If we BUG_ON() here, we run
+ * the risk of doing so before we have good console output.
+ */
+void __init kaiser_init(void)
+{
+ int cpu;
+
+ if (!kaiser_enabled)
+ return;
+
+ kaiser_init_all_pgds();
+
+ for_each_possible_cpu(cpu) {
+ void *percpu_vaddr = __per_cpu_user_mapped_start +
+ per_cpu_offset(cpu);
+ unsigned long percpu_sz = __per_cpu_user_mapped_end -
+ __per_cpu_user_mapped_start;
+ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+ __PAGE_KERNEL);
+ }
+
+ /*
+ * Map the entry/exit text section (and things it transitively needs
+ * before switching CR3), which is needed at switches from user to and
+ * from kernel.
+ */
+ kaiser_add_user_map_ptrs_early(__sched_text_start, __entry_text_end,
+ __PAGE_KERNEL_RX);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+ __irqentry_text_end,
+ __PAGE_KERNEL_RX);
+#endif
+ kaiser_add_user_map_early((void *)idt_descr.address,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL_RO);
+ kaiser_add_user_map_early((void *)VVAR_ADDRESS, PAGE_SIZE,
+ __PAGE_KERNEL_VVAR);
+ kaiser_add_user_map_early((void *)VSYSCALL_START, PAGE_SIZE,
+ vsyscall_pgprot);
+}
+
+/* Add a mapping to the shadow mapping, and synchronize the mappings */
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+ if (!kaiser_enabled)
+ return 0;
+ return kaiser_add_user_map((const void *)addr, size, flags);
+}
+
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+ unsigned long end = start + size;
+ unsigned long addr;
+ pte_t *pte;
+
+ if (!kaiser_enabled)
+ return;
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ pte = kaiser_pagetable_walk(addr);
+ if (pte)
+ set_pte(pte, __pte(0));
+ }
+}
+
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(pgd_t *pgdp)
+{
+ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
+}
+
+pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ if (!kaiser_enabled)
+ return pgd;
+ /*
+ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
+ * skip cases like kexec and EFI which make temporary low mappings.
+ */
+ if (pgd.pgd & _PAGE_USER) {
+ if (is_userspace_pgd(pgdp)) {
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ /*
+ * Even if the entry is *mapping* userspace, ensure
+ * that userspace can not use it. This way, if we
+ * get out to userspace running on the kernel CR3,
+ * userspace will crash instead of running.
+ */
+ pgd.pgd |= _PAGE_NX;
+ }
+ } else if (!pgd.pgd) {
+ /*
+ * pgd_clear() cannot check _PAGE_USER, and is even used to
+ * clear corrupted pgd entries: so just rely on cases like
+ * kexec and EFI never to be using pgd_clear().
+ */
+ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
+ is_userspace_pgd(pgdp))
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ }
+ return pgd;
+}
+
+void kaiser_setup_pcid(void)
+{
+ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
+
+ if (this_cpu_has(X86_FEATURE_PCID))
+ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
+ /*
+ * These variables are used by the entry/exit
+ * code to change PCID and pgd and TLB flushing.
+ */
+ this_cpu_write(x86_cr3_pcid_user, user_cr3);
+}
+
+/*
+ * Make a note that this cpu will need to flush USER tlb on return to user.
+ * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling:
+ * if the cpu does not, then the NOFLUSH bit will never have been set.
+ */
+void kaiser_flush_tlb_on_return_to_user(void)
+{
+ this_cpu_write(x86_cr3_pcid_user,
+ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+}
+EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
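For reference, the per-cpu x86_cr3_pcid_user value built above is consumed by the exit-to-user assembly (kaiser's SWITCH_USER_CR3 / _SWITCH_TO_USER_CR3 macros, which are not part of this hunk). A rough C model of that consumption, assuming KAISER_SHADOW_PGD_OFFSET is the 4k offset from the kernel pgd page to its shadow half; the function name is hypothetical and exists only for illustration:

/* Hypothetical model -- the real work is done in assembly on the
 * kernel->user exit path, not by a C function like this. */
static inline void sketch_switch_to_user_cr3(void)
{
	unsigned long cr3 = read_cr3();	/* kernel pgd, kernel PCID */

	/*
	 * OR in the per-cpu value: the shadow-pgd offset (assumed 4k),
	 * the user PCID, and possibly the NOFLUSH bit (bit 63).  A
	 * memory operand is needed because bit 63 cannot be encoded in
	 * an immediate "or" (see the comment above x86_cr3_pcid_user).
	 */
	cr3 |= this_cpu_read(x86_cr3_pcid_user);
	write_cr3(cr3);
}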
diff --git a/arch/x86/mm/pgtable-xen.c b/arch/x86/mm/pgtable-xen.c
index 40252e8df9c8..d593db2893c1 100644
--- a/arch/x86/mm/pgtable-xen.c
+++ b/arch/x86/mm/pgtable-xen.c
@@ -10,7 +10,7 @@
#include <asm/hypervisor.h>
#include <asm/mmu_context.h>
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -951,6 +951,7 @@ void xen_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+ case VVAR_PAGE:
pte = pfn_pte(phys >> PAGE_SHIFT, flags);
set_pte_vaddr_pud(level3_user_pgt, address, pte);
break;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7353de3d98a7..f2763a6997c7 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -5,7 +5,7 @@
#include <asm/tlb.h>
#include <asm/fixmap.h>
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -253,12 +253,31 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
}
}
+/*
+ * Instead of one pgd, Kaiser acquires two pgds: the allocation is
+ * order-1, so it is both 8k in size and 8k-aligned. That lets us
+ * just flip bit 12 in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER kaiser_enabled
+
+static inline pgd_t *_pgd_alloc(void)
+{
+ /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
+ PGD_ALLOCATION_ORDER);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+}
+
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
pmd_t *pmds[PREALLOCATED_PMDS];
- pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
+ pgd = _pgd_alloc();
if (pgd == NULL)
goto out;
@@ -288,7 +307,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
out_free_pmds:
free_pmds(pmds);
out_free_pgd:
- free_page((unsigned long)pgd);
+ _pgd_free(pgd);
out:
return NULL;
}
@@ -298,7 +317,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
pgd_mop_up_pmds(mm, pgd);
pgd_dtor(pgd);
paravirt_pgd_free(mm, pgd);
- free_page((unsigned long)pgd);
+ _pgd_free(pgd);
}
/*
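Given the order-1, 8k-aligned allocation above, native_get_shadow_pgd() (used throughout arch/x86/mm/kaiser.c and defined in arch/x86/include/asm/pgtable_64.h, not shown in this hunk) presumably reduces to the bit-12 flip described in the comment. A minimal sketch under that assumption, with a hypothetical name:

/* Hypothetical reconstruction, not the definition used by this series. */
static inline pgd_t *sketch_shadow_pgd(pgd_t *pgdp)
{
	/*
	 * Assuming the kernel copy of the pgd occupies the low 4k of the
	 * order-1 allocation and the user/shadow copy the high 4k,
	 * toggling bit 12 (PAGE_SIZE) in the pointer moves between them.
	 */
	return (pgd_t *)((unsigned long)pgdp ^ PAGE_SIZE);
}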
diff --git a/arch/x86/mm/tlb-xen.c b/arch/x86/mm/tlb-xen.c
new file mode 100644
index 000000000000..6820a5749fef
--- /dev/null
+++ b/arch/x86/mm/tlb-xen.c
@@ -0,0 +1,127 @@
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/cache.h>
+
+void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ switch_mm_irqs_off(prev, next, tsk);
+ local_irq_restore(flags);
+}
+
+void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned cpu = smp_processor_id();
+ struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op;
+#ifdef CONFIG_X86_64
+ pgd_t *upgd;
+#endif
+
+ if (likely(prev != next)) {
+ BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
+ !PagePinned(virt_to_page(next->pgd)));
+
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ percpu_write(cpu_tlbstate.active_mm, next);
+#endif
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /*
+ * Re-load page tables: load_cr3(next->pgd).
+ *
+ * This logic has an ordering constraint:
+ *
+ * CPU 0: Write to a PTE for 'next'
+ * CPU 0: load bit 1 in mm_cpumask. If nonzero, send IPI.
+ * CPU 1: set bit 1 in next's mm_cpumask
+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
+ *
+ * We need to prevent an outcome in which CPU 1 observes
+ * the new PTE value and CPU 0 observes bit 1 clear in
+ * mm_cpumask. (If that occurs, then the IPI will never
+ * be sent, and CPU 1's TLB will contain a stale entry.)
+ *
+ * The bad outcome can occur if either CPU's load is
+ * reordered before that CPU's store, so both CPUs must
+ * execute full barriers to prevent this from happening.
+ *
+ * Thus, switch_mm needs a full barrier between the
+ * store to mm_cpumask and any operation that could load
+ * from next->pgd. This barrier synchronizes with
+ * remote TLB flushers. Fortunately, load_cr3 is
+ * serializing and thus acts as a full barrier.
+ *
+ */
+ op->cmd = MMUEXT_NEW_BASEPTR;
+ op->arg1.mfn = virt_to_mfn(next->pgd);
+ op++;
+
+ /* xen_new_user_pt(next->pgd) */
+#ifdef CONFIG_X86_64
+ op->cmd = MMUEXT_NEW_USER_BASEPTR;
+ upgd = __user_pgd(next->pgd);
+ op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0;
+ op++;
+#endif
+
+ /*
+ * load the LDT, if the LDT is different:
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt)) {
+ /* load_mm_ldt(next) */
+ const struct ldt_struct *ldt;
+
+ /* lockless_dereference synchronizes with smp_store_release */
+ ldt = lockless_dereference(next->context.ldt);
+ op->cmd = MMUEXT_SET_LDT;
+ if (unlikely(ldt)) {
+ op->arg1.linear_addr = (long)ldt->entries;
+ op->arg2.nr_ents = ldt->size;
+ } else {
+ op->arg1.linear_addr = 0;
+ op->arg2.nr_ents = 0;
+ }
+ op++;
+ }
+
+ BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF));
+
+ /* stop TLB flushes for the previous mm */
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
+ }
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+ else {
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
+
+ if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
+ /*
+ * We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure to use no freed page tables.
+ *
+ * As above, this is a barrier that forces
+ * TLB repopulation to be ordered after the
+ * store to mm_cpumask.
+ */
+ load_cr3(next->pgd);
+ xen_new_user_pt(next->pgd);
+ load_mm_ldt(next);
+ }
+ }
+#endif
+}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 15cac5a86d1d..20ae46f7485a 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,12 +12,44 @@
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
+#include <asm/kaiser.h>
+#include <asm/spec_ctrl.h>
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
= { &init_mm, 0, };
+static void load_new_mm_cr3(pgd_t *pgdir)
+{
+ unsigned long new_mm_cr3 = __pa(pgdir);
+
+ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) {
+ /*
+ * We reuse the same PCID for different tasks, so we must
+ * flush all the entries for the PCID out when we change tasks.
+ * Flush KERN below, flush USER when returning to userspace in
+ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
+ *
+ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
+ * do it here, but can only be used if X86_FEATURE_INVPCID is
+ * available - and many machines support pcid without invpcid.
+ *
+ * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0;
+ * but keep that line in there in case something changes.
+ */
+ new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
+ kaiser_flush_tlb_on_return_to_user();
+ }
+
+ /*
+ * Caution: many callers of this function expect
+ * that load_new_mm_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask writes.
+ */
+ write_cr3(new_mm_cr3);
+}
+
/*
- * Smarter SMP flushing macros.
+ * TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
*
* These mean you can really definitely utterly forget about
@@ -65,10 +97,87 @@ void leave_mm(int cpu)
BUG();
cpumask_clear_cpu(cpu,
mm_cpumask(percpu_read(cpu_tlbstate.active_mm)));
- load_cr3(swapper_pg_dir);
+ load_new_mm_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
+void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ switch_mm_irqs_off(prev, next, tsk);
+ local_irq_restore(flags);
+}
+
+void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned cpu = smp_processor_id();
+
+ if (likely(prev != next)) {
+ x86_ibp_barrier();
+
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ percpu_write(cpu_tlbstate.active_mm, next);
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /*
+ * Re-load page tables.
+ *
+ * This logic has an ordering constraint:
+ *
+ * CPU 0: Write to a PTE for 'next'
+ * CPU 0: load bit 1 in mm_cpumask. If nonzero, send IPI.
+ * CPU 1: set bit 1 in next's mm_cpumask
+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
+ *
+ * We need to prevent an outcome in which CPU 1 observes
+ * the new PTE value and CPU 0 observes bit 1 clear in
+ * mm_cpumask. (If that occurs, then the IPI will never
+ * be sent, and CPU 1's TLB will contain a stale entry.)
+ *
+ * The bad outcome can occur if either CPU's load is
+ * reordered before that CPU's store, so both CPUs must
+ * execute full barriers to prevent this from happening.
+ *
+ * Thus, switch_mm needs a full barrier between the
+ * store to mm_cpumask and any operation that could load
+ * from next->pgd. TLB fills are special and can happen
+ * due to instruction fetches or for no reason at all,
+ * and neither LOCK nor MFENCE orders them.
+ * Fortunately, load_new_mm_cr3() is serializing
+ * and gives the ordering guarantee we need.
+ */
+ load_new_mm_cr3(next->pgd);
+
+ /* stop flush ipis for the previous mm */
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
+
+ /*
+ * load the LDT, if the LDT is different:
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt))
+ load_mm_ldt(next);
+ } else {
+ percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
+
+ if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
+ /* We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure to use no freed page tables.
+ *
+ * As above, load_new_mm_cr3() is serializing and orders
+ * TLB fills with respect to the mm_cpumask write.
+ */
+ load_new_mm_cr3(next->pgd);
+ load_mm_ldt(next);
+ }
+ }
+}
+
/*
*
* The flush IPI assumes that a thread switch happens in this order:
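The ordering argument in switch_mm_irqs_off() above boils down to the classic two-CPU store-then-load pattern. A sketch of the race it rules out, with purely illustrative labels:

/*
 *   CPU 0 (flusher)                    CPU 1 (switching to 'next')
 *   ---------------                    ---------------------------
 *   write PTE for 'next'               set this cpu's bit in mm_cpumask
 *   full barrier                       full barrier (load_new_mm_cr3())
 *   read mm_cpumask; if CPU 1's        read the PTE (implicit TLB fill)
 *   bit is set, send a flush IPI
 *
 * If either CPU's read were allowed to pass its own write, CPU 0 could
 * miss the freshly set bit while CPU 1 fills its TLB from the old PTE:
 * no flush IPI is sent and the stale translation survives.
 */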
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f64648017e07..10ad57cc04e3 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -270,6 +270,12 @@ static void __init xen_init_cpuid_mask(void)
(1 << X86_FEATURE_MTRR) | /* disable MTRR */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
+ /*
+ * Xen PV would need some work to support PCID: CR3 handling as well
+ * as xen_flush_tlb_others() would need updating.
+ */
+ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_PCID % 32)); /* disable PCID */
+
if (!xen_initial_domain())
cpuid_leaf1_edx_mask &=
~((1 << X86_FEATURE_APIC) | /* disable local APIC */
diff --git a/drivers/media/video/uvc/uvc_v4l2.c b/drivers/media/video/uvc/uvc_v4l2.c
index e582381bacfc..24ad87cf5d62 100644
--- a/drivers/media/video/uvc/uvc_v4l2.c
+++ b/drivers/media/video/uvc/uvc_v4l2.c
@@ -21,6 +21,7 @@
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/wait.h>
+#include <linux/system.h>
#include <asm/atomic.h>
#include <media/v4l2-common.h>
@@ -725,6 +726,7 @@ static long uvc_v4l2_do_ioctl(struct file *file, unsigned int cmd, void *arg)
}
pin = iterm->id;
} else if (index < selector->bNrInPins) {
+ gmb();
pin = selector->baSourceID[index];
list_for_each_entry(iterm, &chain->entities, chain) {
if (!UVC_ENTITY_IS_ITERM(iterm))
diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c
index b54966c1dcf2..11ee6586d9c8 100644
--- a/drivers/net/wireless/ath/carl9170/main.c
+++ b/drivers/net/wireless/ath/carl9170/main.c
@@ -42,6 +42,7 @@
#include <linux/module.h>
#include <linux/etherdevice.h>
#include <linux/random.h>
+#include <linux/system.h>
#include <net/mac80211.h>
#include <net/cfg80211.h>
#include "hw.h"
@@ -1263,6 +1264,7 @@ static int carl9170_op_conf_tx(struct ieee80211_hw *hw, u16 queue,
mutex_lock(&ar->mutex);
if (queue < ar->hw->queues) {
+ gmb();
memcpy(&ar->edcf[ar9170_qmap[queue]], param, sizeof(*param));
ret = carl9170_set_qos(ar);
} else {
diff --git a/drivers/net/wireless/p54/main.c b/drivers/net/wireless/p54/main.c
index a5a6d9e647bb..159a31454224 100644
--- a/drivers/net/wireless/p54/main.c
+++ b/drivers/net/wireless/p54/main.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/firmware.h>
#include <linux/etherdevice.h>
+#include <linux/system.h>
#include <net/mac80211.h>
@@ -361,6 +362,7 @@ static int p54_conf_tx(struct ieee80211_hw *dev, u16 queue,
mutex_lock(&priv->conf_mutex);
if (queue < dev->queues) {
+ gmb();
P54_SET_QUEUE(priv->qos_params[queue], params->aifs,
params->cw_min, params->cw_max, params->txop);
ret = p54_set_edcf(priv);
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 9215700c00a4..549bf7c770b8 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -25,6 +25,7 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/crc-itu-t.h>
+#include <linux/system.h>
#include "udf_i.h"
#include "udf_sb.h"
@@ -105,6 +106,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
iinfo->i_lenEAttr) {
uint32_t aal =
le32_to_cpu(eahd->appAttrLocation);
+
+ gmb();
memmove(&ea[offset - aal + size],
&ea[aal], offset - aal);
offset -= aal;
@@ -115,6 +118,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
iinfo->i_lenEAttr) {
uint32_t ial =
le32_to_cpu(eahd->impAttrLocation);
+
+ gmb();
memmove(&ea[offset - ial + size],
&ea[ial], offset - ial);
offset -= ial;
@@ -126,6 +131,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
iinfo->i_lenEAttr) {
uint32_t aal =
le32_to_cpu(eahd->appAttrLocation);
+
+ gmb();
memmove(&ea[offset - aal + size],
&ea[aal], offset - aal);
offset -= aal;
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 2753589092bc..ae7ea3a422e1 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -695,7 +695,14 @@
*/
#define PERCPU_INPUT(cacheline) \
VMLINUX_SYMBOL(__per_cpu_start) = .; \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
*(.data..percpu..first) \
+ . = ALIGN(cacheline); \
+ *(.data..percpu..user_mapped) \
+ *(.data..percpu..user_mapped..shared_aligned) \
+ . = ALIGN(PAGE_SIZE); \
+ *(.data..percpu..user_mapped..page_aligned) \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
. = ALIGN(PAGE_SIZE); \
*(.data..percpu..page_aligned) \
. = ALIGN(cacheline); \
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 133c0ba25e30..c45f00dcb079 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -12,6 +12,9 @@
#include <linux/types.h>
#include <linux/init.h>
#include <linux/fs.h>
+#ifndef __GENKSYMS__
+#include <linux/system.h>
+#endif
#include <asm/atomic.h>
@@ -86,8 +89,10 @@ static inline struct file * fcheck_files(struct files_struct *files, unsigned in
struct file * file = NULL;
struct fdtable *fdt = files_fdtable(files);
- if (fd < fdt->max_fds)
+ if (fd < fdt->max_fds) {
+ gmb();
file = rcu_dereference_check_fdtable(files, fdt->fd[fd]);
+ }
return file;
}
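The fdtable change above is the pattern this series applies to bounds-checked, attacker-influenced indices: check the bound first, then place gmb() before the dependent access so the load is not issued speculatively with an out-of-range index (gmb() falls back to a no-op in include/linux/system.h, added further down, when the architecture does not provide one). A minimal sketch with hypothetical names:

#include <linux/system.h>	/* gmb() */

static int sketch_read_entry(const int *table, unsigned int nr_entries,
			     unsigned int idx)
{
	int val = 0;

	if (idx < nr_entries) {
		gmb();		/* barrier against a speculative OOB load */
		val = table[idx];
	}
	return val;
}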
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
new file mode 100644
index 000000000000..4a4d6d911a14
--- /dev/null
+++ b/include/linux/kaiser.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_KAISER_H
+#define _LINUX_KAISER_H
+
+#ifdef CONFIG_KAISER
+#include <asm/kaiser.h>
+
+static inline int kaiser_map_thread_stack(void *stack)
+{
+ /*
+ * Map that page of kernel stack on which we enter from user context.
+ */
+ return kaiser_add_mapping((unsigned long)stack +
+ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
+}
+
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+ /*
+ * Note: may be called even when kaiser_map_thread_stack() failed.
+ */
+ kaiser_remove_mapping((unsigned long)stack +
+ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
+}
+#else
+
+/*
+ * These stubs are used whenever CONFIG_KAISER is off, which
+ * includes architectures that support KAISER, but have it disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr,
+ unsigned long size, unsigned long flags)
+{
+ return 0;
+}
+static inline void kaiser_remove_mapping(unsigned long start,
+ unsigned long size)
+{
+}
+static inline int kaiser_map_thread_stack(void *stack)
+{
+ return 0;
+}
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+}
+
+#endif /* !CONFIG_KAISER */
+#endif /* _LINUX_KAISER_H */
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index 70fffeba7495..66be4424daf0 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -1,9 +1,18 @@
#ifndef _LINUX_MMU_CONTEXT_H
#define _LINUX_MMU_CONTEXT_H
+#ifndef __GENKSYMS__
+#include <asm/mmu_context.h>
+#endif
+
struct mm_struct;
void use_mm(struct mm_struct *mm);
void unuse_mm(struct mm_struct *mm);
+/* Architectures that care about IRQ state in switch_mm can override this. */
+#ifndef switch_mm_irqs_off
+# define switch_mm_irqs_off switch_mm
+#endif
+
#endif
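The #ifndef __GENKSYMS__ guard used here (and in include/linux/fdtable.h and kernel/sched.c) hides the new include from genksyms, so the checksums computed for exported symbols, and hence the kABI, are not perturbed by the backport. The general form of the pattern, with a hypothetical header name:

#ifndef __GENKSYMS__			/* keep symbol-version CRCs unchanged */
#include <linux/newly-needed.h>		/* hypothetical header */
#endif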
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 27ef6b190ea6..56f5eeb78d1d 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -28,6 +28,12 @@
(void)__vpp_verify; \
} while (0)
+#ifdef CONFIG_KAISER
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
/*
* s390 and alpha modules require percpu variables to be defined as
* weak to force the compiler to generate GOT based external
@@ -90,6 +96,12 @@
#define DEFINE_PER_CPU(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "")
+#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
/*
* Declaration/definition used for per-CPU variables that must come first in
* the set of variables.
@@ -119,6 +131,14 @@
DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
____cacheline_aligned_in_smp
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
#define DECLARE_PER_CPU_ALIGNED(type, name) \
DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
____cacheline_aligned
@@ -137,11 +157,21 @@
#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
__aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
/*
* Declaration/definition used for per-CPU variables that must be read mostly.
*/
-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
+#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
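The *_USER_MAPPED variants place their variables in the ..user_mapped percpu sections that the vmlinux.lds.h change above brackets with __per_cpu_user_mapped_start/_end, which kaiser_init() then maps into the shadow page tables. A hypothetical usage sketch (not a variable from this patch):

/* Per-cpu data that the entry code must be able to reach while still on
 * the user (shadow) page tables; illustrative only. */
DEFINE_PER_CPU_USER_MAPPED(unsigned long, example_entry_scratch);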
diff --git a/include/linux/system.h b/include/linux/system.h
new file mode 100644
index 000000000000..3303d28d2ccf
--- /dev/null
+++ b/include/linux/system.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_SYSTEM_H
+#define _LINUX_SYSTEM_H
+
+#include <asm/system.h>
+
+#ifndef gmb
+#define gmb() do { } while (0)
+#endif
+
+#endif
diff --git a/init/main.c b/init/main.c
index 63fd1b6f0655..4dae0bf479ef 100644
--- a/init/main.c
+++ b/init/main.c
@@ -70,6 +70,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/random.h>
+#include <linux/kaiser.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -477,6 +478,7 @@ static void __init mm_init(void)
percpu_init_late();
pgtable_cache_init();
vmalloc_init();
+ kaiser_init();
}
asmlinkage void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 75107a4eda40..198b7172f04b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -59,6 +59,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
+#include <linux/kaiser.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
@@ -137,6 +138,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
static inline void free_thread_info(struct thread_info *ti)
{
+ kaiser_unmap_thread_stack(ti);
free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
#endif
@@ -286,6 +288,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
if (err)
goto out;
+ err = kaiser_map_thread_stack(tsk->stack);
+ if (err)
+ goto out;
+
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
diff --git a/kernel/sched.c b/kernel/sched.c
index 33c88e3b2a50..85283b15a24b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,11 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
+#ifdef __GENKSYMS__
#include <asm/mmu_context.h>
+#else
+#include <linux/mmu_context.h>
+#endif
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
@@ -3444,7 +3448,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
- switch_mm(oldmm, mm, next);
+ switch_mm_irqs_off(oldmm, mm, next);
if (!prev->mm) {
prev->active_mm = NULL;
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 9e82e937000e..184b46b36018 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -4,9 +4,9 @@
*/
#include <linux/mm.h>
+#include <linux/sched.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
-#include <linux/sched.h>
#include <asm/mmu_context.h>
diff --git a/net/core/filter.c b/net/core/filter.c
index 7769706e1c1d..2640bec07680 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -39,6 +39,7 @@
#include <linux/filter.h>
#include <linux/reciprocal_div.h>
#include <linux/ratelimit.h>
+#include <linux/system.h>
#include <linux/seccomp.h>
/* No hurry in this branch */
@@ -275,9 +276,11 @@ load_b:
X = K;
continue;
case BPF_S_LD_MEM:
+ gmb();
A = mem[K];
continue;
case BPF_S_LDX_MEM:
+ gmb();
X = mem[K];
continue;
case BPF_S_MISC_TAX:
diff --git a/security/Kconfig b/security/Kconfig
index b2f1083a74f4..92181d54f2cd 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -96,6 +96,16 @@ config SECURITY
If you are unsure how to answer this question, answer N.
+config KAISER
+ bool "Remove the kernel mapping in user mode"
+ default y
+ depends on X86_64 && SMP && !XEN
+ help
+ This enforces strict kernel and user space isolation, in order
+ to close hardware side channels on kernel address information.
+
+ If you are unsure how to answer this question, answer Y.
+
config SECURITYFS
bool "Enable the securityfs filesystem"
help
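Beyond CONFIG_KAISER, runtime behaviour is controlled by the boot command line options parsed in kaiser_check_boottime_disable() above:

    pti=on      force kernel/user page table isolation on
    pti=off     force it off
    pti=auto    default heuristic (currently: disabled on AMD CPUs, enabled otherwise)
    nopti       equivalent to pti=off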