Home Home > GIT Browse > openSUSE-15.0
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2018-01-12 23:10:06 +0100
committerJiri Kosina <jkosina@suse.cz>2018-01-12 23:10:06 +0100
commit4b687b0b4637adc8a0c21929b41663c27db3d479 (patch)
tree42e9d0502de59979480e332d5e1b7a13e319666a
parent83abbc7c53a4cbb40dbcb33ef4a201b75331f568 (diff)
parentb14d4097b90cbc2f157d4170744a705c0dc446c4 (diff)
Merge remote-tracking branch 'origin/users/bpetkov/SLE15/for-next' into SLE15
Pull x86 entry patches from Borislav Petkov in preparations for PTI merge. suse-commit: 967e232081d278ef43d3d677c809c1ee29bf64db
-rw-r--r--arch/x86/entry/entry_32.S6
-rw-r--r--arch/x86/entry/entry_64.S202
-rw-r--r--arch/x86/entry/entry_64_compat.S14
-rw-r--r--arch/x86/hyperv/hv_init.c2
-rw-r--r--arch/x86/include/asm/cpufeature.h2
-rw-r--r--arch/x86/include/asm/desc.h11
-rw-r--r--arch/x86/include/asm/fixmap.h68
-rw-r--r--arch/x86/include/asm/hypervisor.h53
-rw-r--r--arch/x86/include/asm/irqflags.h3
-rw-r--r--arch/x86/include/asm/kdebug.h1
-rw-r--r--arch/x86/include/asm/paravirt.h9
-rw-r--r--arch/x86/include/asm/processor.h59
-rw-r--r--arch/x86/include/asm/stacktrace.h3
-rw-r--r--arch/x86/include/asm/switch_to.h8
-rw-r--r--arch/x86/include/asm/thread_info.h2
-rw-r--r--arch/x86/include/asm/traps.h1
-rw-r--r--arch/x86/include/asm/unwind.h20
-rw-r--r--arch/x86/include/asm/x86_init.h24
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/asm-offsets.c6
-rw-r--r--arch/x86/kernel/asm-offsets_32.c9
-rw-r--r--arch/x86/kernel/asm-offsets_64.c4
-rw-r--r--arch/x86/kernel/cpu/common.c170
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c64
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c6
-rw-r--r--arch/x86/kernel/cpu/vmware.c8
-rw-r--r--arch/x86/kernel/doublefault.c36
-rw-r--r--arch/x86/kernel/dumpstack.c88
-rw-r--r--arch/x86/kernel/dumpstack_32.c6
-rw-r--r--arch/x86/kernel/dumpstack_64.c6
-rw-r--r--arch/x86/kernel/ioport.c2
-rw-r--r--arch/x86/kernel/irq_64.c4
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c2
-rw-r--r--arch/x86/kernel/process.c19
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c15
-rw-r--r--arch/x86/kernel/stacktrace.c2
-rw-r--r--arch/x86/kernel/traps.c69
-rw-r--r--arch/x86/kernel/unwind_orc.c88
-rw-r--r--arch/x86/kernel/vmlinux.lds.S9
-rw-r--r--arch/x86/kernel/x86_init.c9
-rw-r--r--arch/x86/kvm/vmx.c2
-rw-r--r--arch/x86/lib/delay.c4
-rw-r--r--arch/x86/mm/init.c2
-rw-r--r--arch/x86/power/cpu.c16
-rw-r--r--arch/x86/xen/enlighten_hvm.c12
-rw-r--r--arch/x86/xen/enlighten_pv.c8
-rw-r--r--arch/x86/xen/mmu_pv.c2
-rw-r--r--arch/x86/xen/xen-asm_64.S41
-rw-r--r--drivers/hv/vmbus_drv.c2
-rw-r--r--drivers/input/mouse/vmmouse.c10
-rw-r--r--drivers/misc/vmw_balloon.c2
-rw-r--r--include/linux/hypervisor.h8
54 files changed, 843 insertions, 386 deletions
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 71c7cf4e7c11..91a2ef302c7f 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -956,7 +956,8 @@ ENTRY(debug)
movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */
- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx
jb .Ldebug_from_sysenter_stack
@@ -999,7 +1000,8 @@ ENTRY(nmi)
movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */
- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx
jb .Lnmi_from_sysenter_stack
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index bbc14b5379d3..1371359c2f5e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -135,6 +135,64 @@ END(native_usergs_sysret64)
* with them due to bugs in both AMD and Intel CPUs.
*/
+ .pushsection .entry_trampoline, "ax"
+
+/*
+ * The code in here gets remapped into cpu_entry_area's trampoline. This means
+ * that the assembler and linker have the wrong idea as to where this code
+ * lives (and, in fact, it's mapped more than once, so it's not even at a
+ * fixed address). So we can't reference any symbols outside the entry
+ * trampoline and expect it to work.
+ *
+ * Instead, we carefully abuse %rip-relative addressing.
+ * _entry_trampoline(%rip) refers to the start of the remapped) entry
+ * trampoline. We can thus find cpu_entry_area with this macro:
+ */
+
+#define CPU_ENTRY_AREA \
+ _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \
+ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
+
+ENTRY(entry_SYSCALL_64_trampoline)
+ UNWIND_HINT_EMPTY
+ swapgs
+
+ /* Stash the user RSP. */
+ movq %rsp, RSP_SCRATCH
+
+ /* Load the top of the task stack into RSP */
+ movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
+
+ /* Start building the simulated IRET frame. */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq RSP_SCRATCH /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+
+ /*
+ * x86 lacks a near absolute jump, and we can't jump to the real
+ * entry text with a relative jump. We could push the target
+ * address and then use retq, but this destroys the pipeline on
+ * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
+ * spill RDI and restore it in a second-stage trampoline.
+ */
+ pushq %rdi
+ movq $entry_SYSCALL_64_stage2, %rdi
+ jmp *%rdi
+END(entry_SYSCALL_64_trampoline)
+
+ .popsection
+
+ENTRY(entry_SYSCALL_64_stage2)
+ UNWIND_HINT_EMPTY
+ popq %rdi
+ jmp entry_SYSCALL_64_after_hwframe
+END(entry_SYSCALL_64_stage2)
+
ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
/*
@@ -142,25 +200,18 @@ ENTRY(entry_SYSCALL_64)
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
- SWAPGS_UNSAFE_STACK
- /*
- * A hypervisor implementation might want to use a label
- * after the swapgs, so that it can do the swapgs
- * for the guest and jump here on syscall.
- */
-GLOBAL(entry_SYSCALL_64_after_swapgs)
+ swapgs
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- TRACE_IRQS_OFF
-
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+GLOBAL(entry_SYSCALL_64_after_hwframe)
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -174,6 +225,8 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
UNWIND_HINT_REGS extra=0
+ TRACE_IRQS_OFF
+
/*
* If we need to do entry work or if we guess we'll need to do
* exit work, go straight to the slow path.
@@ -329,8 +382,24 @@ syscall_return_via_sysret:
popq %rsi /* skip rcx */
popq %rdx
popq %rsi
+
+ /*
+ * Now all regs are restored except RSP and RDI.
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+ pushq RSP-RDI(%rdi) /* RSP */
+ pushq (%rdi) /* RDI */
+
+ /*
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+
popq %rdi
- movq RSP-ORIG_RAX(%rsp), %rsp
+ popq %rsp
USERGS_SYSRET64
END(entry_SYSCALL_64)
@@ -465,12 +534,13 @@ END(irq_entries_start)
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
- pushfq
- testl $X86_EFLAGS_IF, (%rsp)
+ pushq %rax
+ SAVE_FLAGS(CLBR_RAX)
+ testl $X86_EFLAGS_IF, %eax
jz .Lokay_\@
ud2
.Lokay_\@:
- addq $8, %rsp
+ popq %rax
#endif
.endm
@@ -562,6 +632,13 @@ END(irq_entries_start)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
cld
+
+ testb $3, CS-ORIG_RAX(%rsp)
+ jz 1f
+ SWAPGS
+ call switch_to_thread_stack
+1:
+
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
@@ -571,12 +648,8 @@ END(irq_entries_start)
jz 1f
/*
- * IRQ from user mode. Switch to kernel gsbase and inform context
- * tracking that we're in kernel mode.
- */
- SWAPGS
-
- /*
+ * IRQ from user mode.
+ *
* We need to tell lockdep that IRQs are off. We can't do this until
* we fix gsbase, and we should do it before enter_from_user_mode
* (which can take locks). Since TRACE_IRQS_OFF idempotent,
@@ -629,10 +702,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
ud2
1:
#endif
- SWAPGS
POP_EXTRA_REGS
- POP_C_REGS
- addq $8, %rsp /* skip regs->orig_ax */
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+
+ /*
+ * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+ /* Copy the IRET frame to the trampoline stack. */
+ pushq 6*8(%rdi) /* SS */
+ pushq 5*8(%rdi) /* RSP */
+ pushq 4*8(%rdi) /* EFLAGS */
+ pushq 3*8(%rdi) /* CS */
+ pushq 2*8(%rdi) /* RIP */
+
+ /* Push user RDI on the trampoline stack. */
+ pushq (%rdi)
+
+ /*
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+
+ /* Restore RDI. */
+ popq %rdi
+ SWAPGS
INTERRUPT_RETURN
@@ -845,7 +949,33 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
/*
* Exception entry points.
*/
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+
+/*
+ * Switch to the thread stack. This is called with the IRET frame and
+ * orig_ax on the stack. (That is, RDI..R12 are not on the stack and
+ * space has not been allocated for them.)
+ */
+ENTRY(switch_to_thread_stack)
+ UNWIND_HINT_FUNC
+
+ pushq %rdi
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
+
+ pushq 7*8(%rdi) /* regs->ss */
+ pushq 6*8(%rdi) /* regs->rsp */
+ pushq 5*8(%rdi) /* regs->eflags */
+ pushq 4*8(%rdi) /* regs->cs */
+ pushq 3*8(%rdi) /* regs->ip */
+ pushq 2*8(%rdi) /* regs->orig_ax */
+ pushq 8(%rdi) /* return address */
+ UNWIND_HINT_FUNC
+
+ movq (%rdi), %rdi
+ ret
+END(switch_to_thread_stack)
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
@@ -864,11 +994,12 @@ ENTRY(\sym)
ALLOC_PT_GPREGS_ON_STACK
- .if \paranoid
- .if \paranoid == 1
+ .if \paranoid < 2
testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
- jnz 1f
+ jnz .Lfrom_usermode_switch_stack_\@
.endif
+
+ .if \paranoid
call paranoid_entry
.else
call error_entry
@@ -910,20 +1041,15 @@ ENTRY(\sym)
jmp error_exit
.endif
- .if \paranoid == 1
+ .if \paranoid < 2
/*
- * Paranoid entry from userspace. Switch stacks and treat it
+ * Entry from userspace. Switch stacks and treat it
* as a normal entry. This means that paranoid handlers
* run in real process context if user_mode(regs).
*/
-1:
+.Lfrom_usermode_switch_stack_\@:
call error_entry
-
- movq %rsp, %rdi /* pt_regs pointer */
- call sync_regs
- movq %rax, %rsp /* switch stack */
-
movq %rsp, %rdi /* pt_regs pointer */
.if \has_error_code
@@ -1195,6 +1321,14 @@ ENTRY(error_entry)
SWAPGS
.Lerror_entry_from_usermode_after_swapgs:
+ /* Put us onto the real thread stack. */
+ popq %r12 /* save return addr in %12 */
+ movq %rsp, %rdi /* arg0 = pt_regs pointer */
+ call sync_regs
+ movq %rax, %rsp /* switch stack */
+ ENCODE_FRAME_POINTER
+ pushq %r12
+
/*
* We need to tell lockdep that IRQs are off. We can't do this until
* we fix gsbase, and we should do it before enter_from_user_mode
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1c771a55b68..2270601b6218 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -47,7 +47,7 @@
*/
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
- SWAPGS_UNSAFE_STACK
+ SWAPGS
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat)
*/
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
- SWAPGS_UNSAFE_STACK
+ swapgs
/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- /* Zero-extending 32-bit regs, do not remove */
- movl %eax, %eax
-
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
pushq %r8 /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER32_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+GLOBAL(entry_SYSCALL_compat_after_hwframe)
+ movl %eax, %eax /* discard orig_ax high bits */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -306,8 +305,11 @@ ENTRY(entry_INT80_compat)
*/
movl %eax, %eax
- /* Construct struct pt_regs on stack (iret frame is already on stack) */
pushq %rax /* pt_regs->orig_ax */
+
+ /* switch to thread stack expects orig_ax to be pushed */
+ call switch_to_thread_stack
+
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 885457aa2a15..935d347c47a5 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -115,7 +115,7 @@ void hyperv_init(void)
__u8 d1 = 0x10; /* SuSE */
__u16 d2 = 0x0; /* -d of a.b.c-d */
- if (x86_hyper != &x86_hyper_ms_hyperv)
+ if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return;
/* Allocate percpu VP index */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index d59c15c3defd..638ebbf482c0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -135,6 +135,8 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0)
+#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
+
#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
/*
* Static testing of CPU features. Used the same as boot_cpu_has().
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index d0a21b12dd58..4c730d1caa3c 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -58,17 +58,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
return this_cpu_ptr(&gdt_page)->gdt;
}
-/* Get the fixmap index for a specific processor */
-static inline unsigned int get_cpu_gdt_ro_index(int cpu)
-{
- return FIX_GDT_REMAP_BEGIN + cpu;
-}
-
/* Provide the fixmap address of the remapped GDT */
static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
{
- unsigned int idx = get_cpu_gdt_ro_index(cpu);
- return (struct desc_struct *)__fix_to_virt(idx);
+ return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
}
/* Provide the current read-only GDT */
@@ -205,7 +198,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t
#endif
}
-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
{
struct desc_struct *d = get_cpu_gdt_rw(cpu);
tss_desc tss;
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index dcd9fb55e679..4c6d4a199ba1 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -44,6 +44,45 @@ extern unsigned long __FIXADDR_TOP;
PAGE_SIZE)
#endif
+/*
+ * cpu_entry_area is a percpu region in the fixmap that contains things
+ * needed by the CPU and early entry/exit code. Real types aren't used
+ * for all fields here to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+ char gdt[PAGE_SIZE];
+
+ /*
+ * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
+ * a a read-only guard page.
+ */
+ struct SYSENTER_stack_page SYSENTER_stack_page;
+
+ /*
+ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
+ * we need task switches to work, and task switches write to the TSS.
+ */
+ struct tss_struct tss;
+
+ char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+ /*
+ * Exception stacks used for IST entries.
+ *
+ * In the future, this should have a separate slot for each stack
+ * with guard pages between them.
+ */
+ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+};
+
+#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
+
+extern void setup_cpu_entry_areas(void);
/*
* Here we define all the compile-time 'special' virtual
@@ -101,8 +140,8 @@ enum fixed_addresses {
FIX_LNW_VRTC,
#endif
/* Fixmap entries to remap the GDTs, one per processor. */
- FIX_GDT_REMAP_BEGIN,
- FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+ FIX_CPU_ENTRY_AREA_TOP,
+ FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
__end_of_permanent_fixed_addresses,
@@ -185,5 +224,30 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
void __early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags);
+static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
+{
+ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+ return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
+}
+
+#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \
+ BUILD_BUG_ON(offset % PAGE_SIZE != 0); \
+ __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \
+ })
+
+#define get_cpu_entry_area_index(cpu, field) \
+ __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
+
+static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+ return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
+}
+
+static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
+{
+ return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
+}
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 0ead9dbb9130..96aa6b9884dc 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,14 +20,22 @@
#ifndef _ASM_X86_HYPERVISOR_H
#define _ASM_X86_HYPERVISOR_H
+/* x86 hypervisor types */
+enum x86_hypervisor_type {
+ X86_HYPER_NATIVE = 0,
+ X86_HYPER_VMWARE,
+ X86_HYPER_MS_HYPERV,
+ X86_HYPER_XEN_PV,
+ X86_HYPER_XEN_HVM,
+ X86_HYPER_KVM,
+};
+
#ifdef CONFIG_HYPERVISOR_GUEST
#include <asm/kvm_para.h>
+#include <asm/x86_init.h>
#include <asm/xen/hypervisor.h>
-/*
- * x86 hypervisor information
- */
struct hypervisor_x86 {
/* Hypervisor name */
const char *name;
@@ -35,40 +43,27 @@ struct hypervisor_x86 {
/* Detection routine */
uint32_t (*detect)(void);
- /* Platform setup (run once per boot) */
- void (*init_platform)(void);
-
- /* X2APIC detection (run once per boot) */
- bool (*x2apic_available)(void);
+ /* Hypervisor type */
+ enum x86_hypervisor_type type;
- /* pin current vcpu to specified physical cpu (run rarely) */
- void (*pin_vcpu)(int);
+ /* init time callbacks */
+ struct x86_hyper_init init;
- /* called during init_mem_mapping() to setup early mappings. */
- void (*init_mem_mapping)(void);
+ /* runtime callbacks */
+ struct x86_hyper_runtime runtime;
};
-extern const struct hypervisor_x86 *x86_hyper;
-
-/* Recognized hypervisors */
-extern const struct hypervisor_x86 x86_hyper_vmware;
-extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
-extern const struct hypervisor_x86 x86_hyper_xen_pv;
-extern const struct hypervisor_x86 x86_hyper_xen_hvm;
-extern const struct hypervisor_x86 x86_hyper_kvm;
-
+extern enum x86_hypervisor_type x86_hyper_type;
extern void init_hypervisor_platform(void);
-extern bool hypervisor_x2apic_available(void);
-extern void hypervisor_pin_vcpu(int cpu);
-
-static inline void hypervisor_init_mem_mapping(void)
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
{
- if (x86_hyper && x86_hyper->init_mem_mapping)
- x86_hyper->init_mem_mapping();
+ return x86_hyper_type == type;
}
#else
static inline void init_hypervisor_platform(void) { }
-static inline bool hypervisor_x2apic_available(void) { return false; }
-static inline void hypervisor_init_mem_mapping(void) { }
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+ return type == X86_HYPER_NATIVE;
+}
#endif /* CONFIG_HYPERVISOR_GUEST */
#endif /* _ASM_X86_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index ac7692dcfa2e..d937781e1047 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -141,6 +141,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
swapgs; \
sysretl
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(x) pushfq; popq %rax
+#endif
#else
#define INTERRUPT_RETURN iret
#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 29a594a3b82a..2a7769dd8fa2 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -25,6 +25,7 @@ extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_stack_regs(struct pt_regs *regs);
extern void __show_regs(struct pt_regs *regs, int all);
+extern void show_iret_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 3ffd9fa96ce8..811bdd289475 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -965,6 +965,15 @@ extern void default_banner(void);
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(clobbers) \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
+ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \
+ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+#endif
+
#endif /* CONFIG_X86_32 */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 0a6597d057d3..7a3718b76dad 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -161,9 +161,9 @@ enum cpuid_regs_idx {
extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;
-extern struct tss_struct doublefault_tss;
-extern __u32 cpu_caps_cleared[NCAPINTS];
-extern __u32 cpu_caps_set[NCAPINTS];
+extern struct x86_hw_tss doublefault_tss;
+extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
#ifdef CONFIG_SMP
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
@@ -251,6 +251,11 @@ static inline void load_cr3(pgd_t *pgdir)
write_cr3(__sme_pa(pgdir));
}
+/*
+ * Note that while the legacy 'TSS' name comes from 'Task State Segment',
+ * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
+ * unrelated to the task-switch mechanism:
+ */
#ifdef CONFIG_X86_32
/* This is the TSS defined by the hardware. */
struct x86_hw_tss {
@@ -303,7 +308,13 @@ struct x86_hw_tss {
struct x86_hw_tss {
u32 reserved1;
u64 sp0;
+
+ /*
+ * We store cpu_current_top_of_stack in sp1 so it's always accessible.
+ * Linux does not use ring 1, so sp1 is not otherwise needed.
+ */
u64 sp1;
+
u64 sp2;
u64 reserved2;
u64 ist[7];
@@ -321,12 +332,22 @@ struct x86_hw_tss {
#define IO_BITMAP_BITS 65536
#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
+#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
#define INVALID_IO_BITMAP_OFFSET 0x8000
+struct SYSENTER_stack {
+ unsigned long words[64];
+};
+
+struct SYSENTER_stack_page {
+ struct SYSENTER_stack stack;
+} __aligned(PAGE_SIZE);
+
struct tss_struct {
/*
- * The hardware state:
+ * The fixed hardware portion. This must not cross a page boundary
+ * at risk of violating the SDM's advice and potentially triggering
+ * errata.
*/
struct x86_hw_tss x86_tss;
@@ -337,18 +358,9 @@ struct tss_struct {
* be within the limit.
*/
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+} __aligned(PAGE_SIZE);
-#ifdef CONFIG_X86_32
- /*
- * Space for the temporary SYSENTER stack.
- */
- unsigned long SYSENTER_stack_canary;
- unsigned long SYSENTER_stack[64];
-#endif
-
-} ____cacheline_aligned;
-
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
/*
* sizeof(unsigned long) coming from an extra "long" at the end
@@ -362,6 +374,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#else
+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
#endif
/*
@@ -521,7 +536,7 @@ static inline void native_set_iopl_mask(unsigned mask)
static inline void
native_load_sp0(unsigned long sp0)
{
- this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
static inline void native_swapgs(void)
@@ -533,12 +548,12 @@ static inline void native_swapgs(void)
static inline unsigned long current_top_of_stack(void)
{
-#ifdef CONFIG_X86_64
- return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
-#else
- /* sp0 on x86_32 is special in and around vm86 mode. */
+ /*
+ * We can't read directly from tss.sp0: sp0 on x86_32 is special in
+ * and around vm86 mode and sp0 on x86_64 is special because of the
+ * entry trampoline.
+ */
return this_cpu_read_stable(cpu_current_top_of_stack);
-#endif
}
static inline bool on_thread_stack(void)
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 2e41c50ddf47..95f999576131 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -15,6 +15,7 @@ enum stack_type {
STACK_TYPE_TASK,
STACK_TYPE_IRQ,
STACK_TYPE_SOFTIRQ,
+ STACK_TYPE_SYSENTER,
STACK_TYPE_EXCEPTION,
STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
};
@@ -27,6 +28,8 @@ struct stack_info {
bool in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info);
+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
+
int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask);
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 010cd6e4eafc..cfb6dfe4c457 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -78,10 +78,10 @@ do { \
static inline void refresh_sysenter_cs(struct thread_struct *thread)
{
/* Only happens when SEP is enabled, no need to test "SEP"arately: */
- if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+ if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
return;
- this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+ this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}
#endif
@@ -89,10 +89,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
/* This is used when switching tasks or entering/exiting vm86 mode. */
static inline void update_sp0(struct task_struct *task)
{
+ /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
#ifdef CONFIG_X86_32
load_sp0(task->thread.sp0);
#else
- load_sp0(task_top_of_stack(task));
+ if (static_cpu_has(X86_FEATURE_XENPV))
+ load_sp0(task_top_of_stack(task));
#endif
}
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 90e1f9b84534..49cb02d0597f 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -203,7 +203,7 @@ static inline int arch_within_stack_frames(const void * const stack,
#else /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
#endif
#endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 9fc6a94cfea6..1ba3381f5534 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -92,7 +92,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
dotraplinkage void do_stack_segment(struct pt_regs *, long);
#ifdef CONFIG_X86_64
dotraplinkage void do_double_fault(struct pt_regs *, long);
-asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 35d67dc7b69f..e1c1cb5019bc 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -6,6 +6,9 @@
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
+#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
+#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
+
struct unwind_state {
struct stack_info stack_info;
unsigned long stack_mask;
@@ -51,15 +54,28 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
}
#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+/*
+ * If 'partial' returns true, only the iret frame registers are valid.
+ */
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+ bool *partial)
{
if (unwind_done(state))
return NULL;
+ if (partial) {
+#ifdef CONFIG_UNWINDER_ORC
+ *partial = !state->full_regs;
+#else
+ *partial = false;
+#endif
+ }
+
return state->regs;
}
#else
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+ bool *partial)
{
return NULL;
}
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 7ba7e90a9ad6..4d95e5a13c0b 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -114,6 +114,18 @@ struct x86_init_pci {
};
/**
+ * struct x86_hyper_init - x86 hypervisor init functions
+ * @init_platform: platform setup
+ * @x2apic_available: X2APIC detection
+ * @init_mem_mapping: setup early mappings during init_mem_mapping()
+ */
+struct x86_hyper_init {
+ void (*init_platform)(void);
+ bool (*x2apic_available)(void);
+ void (*init_mem_mapping)(void);
+};
+
+/**
* struct x86_init_ops - functions for platform specific setup
*
*/
@@ -126,6 +138,7 @@ struct x86_init_ops {
struct x86_init_timers timers;
struct x86_init_iommu iommu;
struct x86_init_pci pci;
+ struct x86_hyper_init hyper;
};
/**
@@ -199,6 +212,15 @@ struct x86_legacy_features {
};
/**
+ * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks
+ *
+ * @pin_vcpu: pin current vcpu to specified physical cpu (run rarely)
+ */
+struct x86_hyper_runtime {
+ void (*pin_vcpu)(int cpu);
+};
+
+/**
* struct x86_platform_ops - platform specific runtime functions
* @calibrate_cpu: calibrate CPU
* @calibrate_tsc: calibrate TSC, if different from CPU
@@ -217,6 +239,7 @@ struct x86_legacy_features {
* possible in x86_early_init_platform_quirks() by
* only using the current x86_hardware_subarch
* semantics.
+ * @hyper: x86 hypervisor specific runtime callbacks
*/
struct x86_platform_ops {
unsigned long (*calibrate_cpu)(void);
@@ -232,6 +255,7 @@ struct x86_platform_ops {
void (*apic_post_init)(void);
struct x86_legacy_features legacy;
void (*set_legacy_features)(void);
+ struct x86_hyper_runtime hyper;
};
struct pci_dev;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2d75faf743f2..b7651a44bee8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1573,7 +1573,7 @@ static __init void try_to_enable_x2apic(int remap_mode)
* under KVM
*/
if (max_physical_apicid > 255 ||
- !hypervisor_x2apic_available()) {
+ !x86_init.hyper.x2apic_available()) {
pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
x2apic_disable();
return;
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index de827d6ac8c2..40c3fab107ac 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -92,4 +92,10 @@ void common(void) {
BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+ /* Layout info for cpu_entry_area */
+ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+ OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
+ DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 880aa093268d..c4f23da7a0f0 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -49,13 +49,8 @@ void foo(void)
BLANK();
/* Offset from the sysenter stack to tss.sp0 */
- DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
- offsetofend(struct tss_struct, SYSENTER_stack));
-
- /* Offset from cpu_tss to SYSENTER_stack */
- OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
- /* Size of SYSENTER_stack */
- DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+ DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+ offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
#ifdef CONFIG_CC_STACKPROTECTOR
BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index cf42206926af..048f68ff3396 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -22,6 +22,9 @@ int main(void)
#ifdef CONFIG_PARAVIRT
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+#ifdef CONFIG_DEBUG_ENTRY
+ OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
+#endif
BLANK();
#endif
@@ -62,6 +65,7 @@ int main(void)
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+ OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
BLANK();
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 841199ee801f..c5e21716f033 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -434,8 +434,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
-__u32 cpu_caps_cleared[NCAPINTS];
-__u32 cpu_caps_set[NCAPINTS];
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
void load_percpu_segment(int cpu)
{
@@ -448,27 +448,116 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}
-/* Setup the fixmap mapping only once per-processor */
-static inline void setup_fixmap_gdt(int cpu)
+#ifdef CONFIG_X86_32
+/* The 32-bit entry code needs to find cpu_entry_area. */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+#endif
+
+#ifdef CONFIG_X86_64
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+};
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
+ SYSENTER_stack_storage);
+
+static void __init
+set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
+{
+ for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
+ __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+/* Setup the fixmap mappings only once per-processor */
+static void __init setup_cpu_entry_area(int cpu)
{
#ifdef CONFIG_X86_64
- /* On 64-bit systems, we use a read-only fixmap GDT. */
- pgprot_t prot = PAGE_KERNEL_RO;
+ extern char _entry_trampoline[];
+
+ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+ pgprot_t gdt_prot = PAGE_KERNEL_RO;
+ pgprot_t tss_prot = PAGE_KERNEL_RO;
#else
/*
* On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
- * a task gate needs to change an available TSS to busy. If the GDT
- * is read-only, that will triple fault.
+ * a task gate needs to change an available TSS to busy. If the
+ * GDT is read-only, that will triple fault. The TSS cannot be
+ * read-only because the CPU writes to it on task switches.
*
- * On Xen PV, the GDT must be read-only because the hypervisor requires
- * it.
+ * On Xen PV, the GDT must be read-only because the hypervisor
+ * requires it.
*/
- pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL;
+ pgprot_t tss_prot = PAGE_KERNEL;
#endif
- __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
+ __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
+ per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
+ PAGE_KERNEL);
+
+ /*
+ * The Intel SDM says (Volume 3, 7.2.1):
+ *
+ * Avoid placing a page boundary in the part of the TSS that the
+ * processor reads during a task switch (the first 104 bytes). The
+ * processor may not correctly perform address translations if a
+ * boundary occurs in this area. During a task switch, the processor
+ * reads and writes into the first 104 bytes of each TSS (using
+ * contiguous physical addresses beginning with the physical address
+ * of the first byte of the TSS). So, after TSS access begins, if
+ * part of the 104 bytes is not physically contiguous, the processor
+ * will access incorrect information without generating a page-fault
+ * exception.
+ *
+ * There are also a lot of errata involving the TSS spanning a page
+ * boundary. Assert that we're not doing that.
+ */
+ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
+ &per_cpu(cpu_tss_rw, cpu),
+ sizeof(struct tss_struct) / PAGE_SIZE,
+ tss_prot);
+
+#ifdef CONFIG_X86_32
+ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+ BUILD_BUG_ON(sizeof(exception_stacks) !=
+ sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
+ &per_cpu(exception_stacks, cpu),
+ sizeof(exception_stacks) / PAGE_SIZE,
+ PAGE_KERNEL);
+
+ __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
+ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ setup_cpu_entry_area(cpu);
}
/* Load the original GDT from the per-cpu structure */
@@ -705,7 +794,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
{
int i;
- for (i = 0; i < NCAPINTS; i++) {
+ for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
c->x86_capability[i] &= ~cpu_caps_cleared[i];
c->x86_capability[i] |= cpu_caps_set[i];
}
@@ -1199,7 +1288,7 @@ void enable_sep_cpu(void)
return;
cpu = get_cpu();
- tss = &per_cpu(cpu_tss, cpu);
+ tss = &per_cpu(cpu_tss_rw, cpu);
/*
* We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1208,11 +1297,7 @@ void enable_sep_cpu(void)
tss->x86_tss.ss1 = __KERNEL_CS;
wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
-
- wrmsr(MSR_IA32_SYSENTER_ESP,
- (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
- 0);
-
+ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
put_cpu();
@@ -1317,25 +1402,19 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
-/*
- * Special IST stacks which the CPU switches to when it calls
- * an IST-marked descriptor entry. Up to 7 stacks (hardware
- * limit), all of them are 4K, except the debug stack which
- * is 8K.
- */
-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
- [DEBUG_STACK - 1] = DEBUG_STKSZ
-};
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
-
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
+ extern char _entry_trampoline[];
+ extern char entry_SYSCALL_64_trampoline[];
+
+ int cpu = smp_processor_id();
+ unsigned long SYSCALL64_entry_trampoline =
+ (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
+ (entry_SYSCALL_64_trampoline - _entry_trampoline);
+
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@@ -1346,7 +1425,7 @@ void syscall_init(void)
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@ -1490,7 +1569,7 @@ void cpu_init(void)
if (cpu)
load_ucode_ap();
- t = &per_cpu(cpu_tss, cpu);
+ t = &per_cpu(cpu_tss_rw, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
@@ -1529,7 +1608,7 @@ void cpu_init(void)
* set up and load the per-CPU TSS
*/
if (!oist->ist[0]) {
- char *estacks = per_cpu(exception_stacks, cpu);
+ char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
estacks += exception_stack_sizes[v];
@@ -1540,7 +1619,7 @@ void cpu_init(void)
}
}
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
/*
* <= is required because the CPU will access up to
@@ -1555,11 +1634,12 @@ void cpu_init(void)
enter_lazy_tlb(&init_mm, me);
/*
- * Initialize the TSS. Don't bother initializing sp0, as the initial
- * task never enters user mode.
+ * Initialize the TSS. sp0 points to the entry trampoline stack
+ * regardless of what task is running.
*/
- set_tss_desc(cpu, t);
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
+ load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
load_mm_ldt(&init_mm);
@@ -1571,7 +1651,6 @@ void cpu_init(void)
if (is_uv_system())
uv_cpu_init();
- setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
@@ -1581,7 +1660,7 @@ void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(cpu_tss, cpu);
+ struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
wait_for_master_cpu(cpu);
@@ -1615,12 +1694,12 @@ void cpu_init(void)
* Initialize the TSS. Don't bother initializing sp0, as the initial
* task never enters user mode.
*/
- set_tss_desc(cpu, t);
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
load_mm_ldt(&init_mm);
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
#ifdef CONFIG_DOUBLEFAULT
/* Set up doublefault TSS pointer in the GDT */
@@ -1632,7 +1711,6 @@ void cpu_init(void)
fpu__init_cpu();
- setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
#endif
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 4fa90006ac68..bea8d3e24f50 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -26,6 +26,12 @@
#include <asm/processor.h>
#include <asm/hypervisor.h>
+extern const struct hypervisor_x86 x86_hyper_vmware;
+extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+extern const struct hypervisor_x86 x86_hyper_xen_pv;
+extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+extern const struct hypervisor_x86 x86_hyper_kvm;
+
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
#ifdef CONFIG_XEN_PV
@@ -41,54 +47,52 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
#endif
};
-const struct hypervisor_x86 *x86_hyper;
-EXPORT_SYMBOL(x86_hyper);
+enum x86_hypervisor_type x86_hyper_type;
+EXPORT_SYMBOL(x86_hyper_type);
-static inline void __init
+static inline const struct hypervisor_x86 * __init
detect_hypervisor_vendor(void)
{
- const struct hypervisor_x86 *h, * const *p;
+ const struct hypervisor_x86 *h = NULL, * const *p;
uint32_t pri, max_pri = 0;
for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
- h = *p;
- pri = h->detect();
- if (pri != 0 && pri > max_pri) {
+ pri = (*p)->detect();
+ if (pri > max_pri) {
max_pri = pri;
- x86_hyper = h;
+ h = *p;
}
}
- if (max_pri)
- pr_info("Hypervisor detected: %s\n", x86_hyper->name);
+ if (h)
+ pr_info("Hypervisor detected: %s\n", h->name);
+
+ return h;
}
-void __init init_hypervisor_platform(void)
+static void __init copy_array(const void *src, void *target, unsigned int size)
{
+ unsigned int i, n = size / sizeof(void *);
+ const void * const *from = (const void * const *)src;
+ const void **to = (const void **)target;
- detect_hypervisor_vendor();
-
- if (!x86_hyper)
- return;
-
- if (x86_hyper->init_platform)
- x86_hyper->init_platform();
+ for (i = 0; i < n; i++)
+ if (from[i])
+ to[i] = from[i];
}
-bool __init hypervisor_x2apic_available(void)
+void __init init_hypervisor_platform(void)
{
- return x86_hyper &&
- x86_hyper->x2apic_available &&
- x86_hyper->x2apic_available();
-}
+ const struct hypervisor_x86 *h;
-void hypervisor_pin_vcpu(int cpu)
-{
- if (!x86_hyper)
+ h = detect_hypervisor_vendor();
+
+ if (!h)
return;
- if (x86_hyper->pin_vcpu)
- x86_hyper->pin_vcpu(cpu);
- else
- WARN_ONCE(1, "vcpu pinning requested but not supported!\n");
+ copy_array(&h->init, &x86_init.hyper, sizeof(h->init));
+ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime));
+
+ x86_hyper_type = h->type;
+ x86_init.hyper.init_platform();
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 42664f944cbc..1feb22440acb 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -259,9 +259,9 @@ static void __init ms_hyperv_init_platform(void)
#endif
}
-const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
- .init_platform = ms_hyperv_init_platform,
+ .type = X86_HYPER_MS_HYPERV,
+ .init.init_platform = ms_hyperv_init_platform,
};
-EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 40ed26852ebd..8e005329648b 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -205,10 +205,10 @@ static bool __init vmware_legacy_x2apic_available(void)
(eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
}
-const __refconst struct hypervisor_x86 x86_hyper_vmware = {
+const __initconst struct hypervisor_x86 x86_hyper_vmware = {
.name = "VMware",
.detect = vmware_platform,
- .init_platform = vmware_platform_setup,
- .x2apic_available = vmware_legacy_x2apic_available,
+ .type = X86_HYPER_VMWARE,
+ .init.init_platform = vmware_platform_setup,
+ .init.x2apic_available = vmware_legacy_x2apic_available,
};
-EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index f9c324e08d85..a9fe79d49d39 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -49,25 +49,23 @@ static void doublefault_fn(void)
cpu_relax();
}
-struct tss_struct doublefault_tss __cacheline_aligned = {
- .x86_tss = {
- .sp0 = STACK_START,
- .ss0 = __KERNEL_DS,
- .ldt = 0,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
-
- .ip = (unsigned long) doublefault_fn,
- /* 0x2 bit is always set */
- .flags = X86_EFLAGS_SF | 0x2,
- .sp = STACK_START,
- .es = __USER_DS,
- .cs = __KERNEL_CS,
- .ss = __KERNEL_DS,
- .ds = __USER_DS,
- .fs = __KERNEL_PERCPU,
-
- .__cr3 = __pa_nodebug(swapper_pg_dir),
- }
+struct x86_hw_tss doublefault_tss __cacheline_aligned = {
+ .sp0 = STACK_START,
+ .ss0 = __KERNEL_DS,
+ .ldt = 0,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+
+ .ip = (unsigned long) doublefault_fn,
+ /* 0x2 bit is always set */
+ .flags = X86_EFLAGS_SF | 0x2,
+ .sp = STACK_START,
+ .es = __USER_DS,
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ds = __USER_DS,
+ .fs = __KERNEL_PERCPU,
+
+ .__cr3 = __pa_nodebug(swapper_pg_dir),
};
/* dummy for do_double_fault() call */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index bd265a4cf108..8271bbf2a4c3 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -43,6 +43,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true;
}
+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
+{
+ struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
+
+ void *begin = ss;
+ void *end = ss + 1;
+
+ if ((void *)stack < begin || (void *)stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_SYSENTER;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = NULL;
+
+ return true;
+}
+
static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl)
{
@@ -50,6 +68,39 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
+void show_iret_regs(struct pt_regs *regs)
+{
+ printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
+ printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
+ regs->sp, regs->flags);
+}
+
+static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
+ bool partial)
+{
+ /*
+ * These on_stack() checks aren't strictly necessary: the unwind code
+ * has already validated the 'regs' pointer. The checks are done for
+ * ordering reasons: if the registers are on the next stack, we don't
+ * want to print them out yet. Otherwise they'll be shown as part of
+ * the wrong stack. Later, when show_trace_log_lvl() switches to the
+ * next stack, this function will be called again with the same regs so
+ * they can be printed in the right context.
+ */
+ if (!partial && on_stack(info, regs, sizeof(*regs))) {
+ __show_regs(regs, 0);
+
+ } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+ IRET_FRAME_SIZE)) {
+ /*
+ * When an interrupt or exception occurs in entry code, the
+ * full pt_regs might not have been saved yet. In that case
+ * just print the iret frame.
+ */
+ show_iret_regs(regs);
+ }
+}
+
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl)
{
@@ -57,6 +108,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
struct stack_info stack_info = {0};
unsigned long visit_mask = 0;
int graph_idx = 0;
+ bool partial;
printk("%sCall Trace:\n", log_lvl);
@@ -71,31 +123,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - task stack
* - interrupt stack
* - HW exception stacks (double fault, nmi, debug, mce)
+ * - SYSENTER stack
*
- * x86-32 can have up to three stacks:
+ * x86-32 can have up to four stacks:
* - task stack
* - softirq stack
* - hardirq stack
+ * - SYSENTER stack
*/
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name;
- /*
- * If we overflowed the task stack into a guard page, jump back
- * to the bottom of the usable stack.
- */
- if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
- stack = task_stack_page(task);
-
- if (get_stack_info(stack, task, &stack_info, &visit_mask))
- break;
+ if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
+ if (get_stack_info(stack, task, &stack_info, &visit_mask))
+ break;
+ }
stack_name = stack_type_name(stack_info.type);
if (stack_name)
printk("%s <%s>\n", log_lvl, stack_name);
- if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
- __show_regs(regs, 0);
+ if (regs)
+ show_regs_if_on_stack(&stack_info, regs, partial);
/*
* Scan the stack, printing any text addresses we find. At the
@@ -119,7 +175,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/*
* Don't print regs->ip again if it was already printed
- * by __show_regs() below.
+ * by show_regs_if_on_stack().
*/
if (regs && stack == &regs->ip)
goto next;
@@ -154,9 +210,9 @@ next:
unwind_next_frame(&state);
/* if the frame has entry regs, print them */
- regs = unwind_get_entry_regs(&state);
- if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
- __show_regs(regs, 0);
+ regs = unwind_get_entry_regs(&state, &partial);
+ if (regs)
+ show_regs_if_on_stack(&stack_info, regs, partial);
}
if (stack_name)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 4f0481474903..c35d54f38ccc 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -25,6 +25,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_SOFTIRQ)
return "SOFTIRQ";
+ if (type == STACK_TYPE_SYSENTER)
+ return "SYSENTER";
+
return NULL;
}
@@ -92,6 +95,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (task != current)
goto unknown;
+ if (in_sysenter_stack(stack, info))
+ goto recursion_check;
+
if (in_hardirq_stack(stack, info))
goto recursion_check;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 225af4184f06..16ceab1132e8 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -36,6 +36,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_IRQ)
return "IRQ";
+ if (type == STACK_TYPE_SYSENTER)
+ return "SYSENTER";
+
if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
return exception_stack_names[type - STACK_TYPE_EXCEPTION];
@@ -114,6 +117,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_irq_stack(stack, info))
goto recursion_check;
+ if (in_sysenter_stack(stack, info))
+ goto recursion_check;
+
goto unknown;
recursion_check:
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 4a613fed94b6..d13777d49d8b 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -66,7 +66,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
- tss = &per_cpu(cpu_tss, get_cpu());
+ tss = &per_cpu(cpu_tss_rw, get_cpu());
if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 3be74fbdeff2..feca14980e32 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -56,10 +56,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (regs->sp >= estack_top && regs->sp <= estack_bottom)
return;
- WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
+ WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
current->comm, curbase, regs->sp,
irq_stack_top, irq_stack_bottom,
- estack_top, estack_bottom);
+ estack_top, estack_bottom, (void *)regs->ip);
if (sysctl_panic_on_stackoverflow)
panic("low stack detected by irq handler - check messages\n");
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index dcc0154f6871..a83d4bbaff83 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -566,12 +566,12 @@ static uint32_t __init kvm_detect(void)
return kvm_cpuid_base();
}
-const struct hypervisor_x86 x86_hyper_kvm __refconst = {
+const __initconst struct hypervisor_x86 x86_hyper_kvm = {
.name = "KVM",
.detect = kvm_detect,
- .x2apic_available = kvm_para_available,
+ .type = X86_HYPER_KVM,
+ .init.x2apic_available = kvm_para_available,
};
-EXPORT_SYMBOL_GPL(x86_hyper_kvm);
static __init int activate_jump_labels(void)
{
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 11aaf1eaa0e4..c354833342bd 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff8a9acbcf8b..0a17cbcdf64c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -46,7 +46,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
.x86_tss = {
/*
* .sp0 is only used when entering ring 0 from a lower
@@ -55,6 +55,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
* Poison it.
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
+
+#ifdef CONFIG_X86_64
+ /*
+ * .sp1 is cpu_current_top_of_stack. The init task never
+ * runs user code, but cpu_current_top_of_stack should still
+ * be well defined before the first context switch.
+ */
+ .sp1 = TOP_OF_INIT_STACK,
+#endif
+
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
@@ -70,11 +80,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
*/
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
-#ifdef CONFIG_X86_32
- .SYSENTER_stack_canary = STACK_END_MAGIC,
-#endif
};
-EXPORT_PER_CPU_SYMBOL(cpu_tss);
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -103,7 +110,7 @@ void exit_thread(struct task_struct *tsk)
struct fpu *fpu = &t->fpu;
if (bp) {
- struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c0d60420466c..784ff9147172 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b08b9b6c40eb..c75466232016 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -69,10 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
- printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
- (void *)regs->ip);
- printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
- regs->sp, regs->flags);
+ show_iret_regs(regs);
+
if (regs->orig_ax != -1)
pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
else
@@ -89,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
regs->r13, regs->r14, regs->r15);
+ if (!all)
+ return;
+
asm("movl %%ds,%0" : "=r" (ds));
asm("movl %%cs,%0" : "=r" (cs));
asm("movl %%es,%0" : "=r" (es));
@@ -99,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
rdmsrl(MSR_GS_BASE, gs);
rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
- if (!all)
- return;
-
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = __read_cr3();
@@ -401,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
@@ -463,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Switch the PDA and FPU contexts.
*/
this_cpu_write(current_task, next_p);
+ this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
/* Reload sp0. */
update_sp0(next_p);
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index b9ba5b972a47..ae90592118d1 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -102,7 +102,7 @@ __save_stack_trace_reliable(struct stack_trace *trace,
for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state);
unwind_next_frame(&state)) {
- regs = unwind_get_entry_regs(&state);
+ regs = unwind_get_entry_regs(&state, NULL);
if (regs) {
/* Success path for user tasks */
if (user_mode(regs))
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 0347ed41c92d..976694564c07 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -356,9 +356,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
/*
* If IRET takes a non-IST fault on the espfix64 stack, then we
- * end up promoting it to a doublefault. In that case, modify
- * the stack to make it look like we just entered the #GP
- * handler from user space, similar to bad_iret.
+ * end up promoting it to a doublefault. In that case, take
+ * advantage of the fact that we're not using the normal (TSS.sp0)
+ * stack right now. We can write a fake #GP(0) frame at TSS.sp0
+ * and then modify our own IRET frame so that, when we return,
+ * we land directly at the #GP(0) vector with the stack already
+ * set up according to its expectations.
+ *
+ * The net result is that our #GP handler will think that we
+ * entered from usermode with the bad user context.
*
* No need for ist_enter here because we don't use RCU.
*/
@@ -366,13 +372,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
regs->cs == __KERNEL_CS &&
regs->ip == (unsigned long)native_irq_return_iret)
{
- struct pt_regs *normal_regs = task_pt_regs(current);
+ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
- /* Fake a #GP(0) from userspace. */
- memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
- normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
+ /*
+ * regs->sp points to the failing IRET frame on the
+ * ESPFIX64 stack. Copy it to the entry stack. This fills
+ * in gpregs->ss through gpregs->ip.
+ *
+ */
+ memmove(&gpregs->ip, (void *)regs->sp, 5*8);
+ gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
+
+ /*
+ * Adjust our frame so that we return straight to the #GP
+ * vector with the expected RSP value. This is safe because
+ * we won't enable interupts or schedule before we invoke
+ * general_protection, so nothing will clobber the stack
+ * frame we just set up.
+ */
regs->ip = (unsigned long)general_protection;
- regs->sp = (unsigned long)&normal_regs->orig_ax;
+ regs->sp = (unsigned long)&gpregs->orig_ax;
return;
}
@@ -397,7 +416,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
*
* Processors update CR2 whenever a page fault is detected. If a
* second page fault occurs while an earlier page fault is being
- * deliv- ered, the faulting linear address of the second fault will
+ * delivered, the faulting linear address of the second fault will
* overwrite the contents of CR2 (replacing the previous
* address). These updates to CR2 occur even if the page fault
* results in a double fault or occurs during the delivery of a
@@ -608,14 +627,15 @@ NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
- * Help handler running on IST stack to switch off the IST stack if the
- * interrupted code was in user mode. The actual stack switch is done in
- * entry_64.S
+ * Help handler running on a per-cpu (IST or entry trampoline) stack
+ * to switch to the normal thread stack if the interrupted code was in
+ * user mode. The actual stack switch is done in entry_64.S
*/
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
- struct pt_regs *regs = task_pt_regs(current);
- *regs = *eregs;
+ struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
+ if (regs != eregs)
+ *regs = *eregs;
return regs;
}
NOKPROBE_SYMBOL(sync_regs);
@@ -631,13 +651,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
/*
* This is called from entry_64.S early in handling a fault
* caused by a bad iret to user mode. To handle the fault
- * correctly, we want move our stack frame to task_pt_regs
- * and we want to pretend that the exception came from the
- * iret target.
+ * correctly, we want to move our stack frame to where it would
+ * be had we entered directly on the entry stack (rather than
+ * just below the IRET frame) and we want to pretend that the
+ * exception came from the IRET target.
*/
struct bad_iret_stack *new_stack =
- container_of(task_pt_regs(current),
- struct bad_iret_stack, regs);
+ (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Copy the IRET target to the new stack. */
memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
@@ -802,14 +822,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec();
exit:
-#if defined(CONFIG_X86_32)
- /*
- * This is the most likely code path that involves non-trivial use
- * of the SYSENTER stack. Check that we haven't overrun it.
- */
- WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
- "Overran or corrupted SYSENTER stack\n");
-#endif
ist_exit(regs);
}
NOKPROBE_SYMBOL(do_debug);
@@ -977,6 +989,9 @@ void __init trap_init(void)
early_iounmap(p, 4);
#endif
+ /* Init cpu_entry_area before IST entries are set up */
+ setup_cpu_entry_areas();
+
set_intr_gate(X86_TRAP_DE, divide_error);
set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
/* int4 can be called from all */
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index a3f973b2c97a..be86a865087a 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
return NULL;
}
-static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
+static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
size_t len)
{
struct stack_info *info = &state->stack_info;
+ void *addr = (void *)_addr;
- /*
- * If the address isn't on the current stack, switch to the next one.
- *
- * We may have to traverse multiple stacks to deal with the possibility
- * that info->next_sp could point to an empty stack and the address
- * could be on a subsequent stack.
- */
- while (!on_stack(info, (void *)addr, len))
- if (get_stack_info(info->next_sp, state->task, info,
- &state->stack_mask))
- return false;
+ if (!on_stack(info, addr, len) &&
+ (get_stack_info(addr, state->task, info, &state->stack_mask)))
+ return false;
return true;
}
@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
return true;
}
-#define REGS_SIZE (sizeof(struct pt_regs))
-#define SP_OFFSET (offsetof(struct pt_regs, sp))
-#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
-#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
-
static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
- unsigned long *ip, unsigned long *sp, bool full)
+ unsigned long *ip, unsigned long *sp)
{
- size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
- size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
- struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
-
- if (IS_ENABLED(CONFIG_X86_64)) {
- if (!stack_access_ok(state, addr, regs_size))
- return false;
+ struct pt_regs *regs = (struct pt_regs *)addr;
- *ip = regs->ip;
- *sp = regs->sp;
+ /* x86-32 support will be more complicated due to the &regs->sp hack */
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
- return true;
- }
-
- if (!stack_access_ok(state, addr, sp_offset))
+ if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
return false;
*ip = regs->ip;
+ *sp = regs->sp;
+ return true;
+}
- if (user_mode(regs)) {
- if (!stack_access_ok(state, addr + sp_offset,
- REGS_SIZE - SP_OFFSET))
- return false;
+static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
+ unsigned long *ip, unsigned long *sp)
+{
+ struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
- *sp = regs->sp;
- } else
- *sp = (unsigned long)&regs->sp;
+ if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
+ return false;
+ *ip = regs->ip;
+ *sp = regs->sp;
return true;
}
@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state)
unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
enum stack_type prev_type = state->stack_info.type;
struct orc_entry *orc;
- struct pt_regs *ptregs;
bool indirect = false;
if (unwind_done(state))
@@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_TYPE_REGS:
- if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_TYPE_REGS_IRET:
- if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
+ if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference iret registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
}
- ptregs = container_of((void *)sp, struct pt_regs, ip);
- if ((unsigned long)ptregs >= prev_sp &&
- on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
- state->regs = ptregs;
- state->full_regs = false;
- } else
- state->regs = NULL;
-
+ state->regs = (void *)sp - IRET_FRAME_OFFSET;
+ state->full_regs = false;
state->signal = true;
break;
@@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
}
if (get_stack_info((unsigned long *)state->sp, state->task,
- &state->stack_info, &state->stack_mask))
- return;
+ &state->stack_info, &state->stack_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
+ if (get_stack_info(next_page, state->task, &state->stack_info,
+ &state->stack_mask))
+ return;
+ }
/*
* The caller can provide the address of the first frame directly
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f05f00acac89..423aa36f0150 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -106,6 +106,15 @@ SECTIONS
SOFTIRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)
+
+#ifdef CONFIG_X86_64
+ . = ALIGN(PAGE_SIZE);
+ _entry_trampoline = .;
+ *(.entry_trampoline)
+ . = ALIGN(PAGE_SIZE);
+ ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
+#endif
+
/* End of text section */
_etext = .;
} :text = 0x9090
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a088b2c47f73..5b2d10c1973a 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -28,6 +28,8 @@ void x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
int __init iommu_init_noop(void) { return 0; }
void iommu_shutdown_noop(void) { }
+bool __init bool_x86_init_noop(void) { return false; }
+void x86_op_int_noop(int cpu) { }
/*
* The platform setup functions are preset with the default functions
@@ -81,6 +83,12 @@ struct x86_init_ops x86_init __initdata = {
.init_irq = x86_default_pci_init_irq,
.fixup_irqs = x86_default_pci_fixup_irqs,
},
+
+ .hyper = {
+ .init_platform = x86_init_noop,
+ .x2apic_available = bool_x86_init_noop,
+ .init_mem_mapping = x86_init_noop,
+ },
};
struct x86_cpuinit_ops x86_cpuinit = {
@@ -101,6 +109,7 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.get_nmi_reason = default_get_nmi_reason,
.save_sched_clock_state = tsc_save_sched_clock_state,
.restore_sched_clock_state = tsc_restore_sched_clock_state,
+ .hyper.pin_vcpu = x86_op_int_noop,
};
EXPORT_SYMBOL_GPL(x86_platform);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1a0c301859d7..c1613dc5b764 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2274,7 +2274,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* processors. See 22.2.4.
*/
vmcs_writel(HOST_TR_BASE,
- (unsigned long)this_cpu_ptr(&cpu_tss));
+ (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
/*
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 29df077cb089..cf2ac227c2ac 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -106,10 +106,10 @@ static void delay_mwaitx(unsigned long __loops)
delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
/*
- * Use cpu_tss as a cacheline-aligned, seldomly
+ * Use cpu_tss_rw as a cacheline-aligned, seldomly
* accessed per-cpu variable as the monitor target.
*/
- __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
+ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
/*
* AMD, like Intel, supports the EAX hint and EAX=0xf
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c6cb03eedab7..98f1ada68976 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -637,7 +637,7 @@ void __init init_mem_mapping(void)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
- hypervisor_init_mem_mapping();
+ x86_init.hyper.init_mem_mapping();
early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 78459a6d455a..2a717e023c9f 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -160,17 +160,19 @@ static void do_fpu_end(void)
static void fix_processor_context(void)
{
int cpu = smp_processor_id();
- struct tss_struct *t = &per_cpu(cpu_tss, cpu);
#ifdef CONFIG_X86_64
struct desc_struct *desc = get_cpu_gdt_rw(cpu);
tss_desc tss;
#endif
- set_tss_desc(cpu, t); /*
- * This just modifies memory; should not be
- * necessary. But... This is necessary, because
- * 386 hardware has concept of busy TSS or some
- * similar stupidity.
- */
+
+ /*
+ * We need to reload TR, which requires that we change the
+ * GDT entry to indicate "available" first.
+ *
+ * XXX: This could probably all be replaced by a call to
+ * force_reload_TR().
+ */
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
#ifdef CONFIG_X86_64
memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index d1d37631d9c0..e25de9276ce6 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -235,12 +235,12 @@ static uint32_t __init xen_platform_hvm(void)
return xen_cpuid_base();
}
-const struct hypervisor_x86 x86_hyper_xen_hvm = {
+const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = {
.name = "Xen HVM",
.detect = xen_platform_hvm,
- .init_platform = xen_hvm_guest_init,
- .pin_vcpu = xen_pin_vcpu,
- .x2apic_available = xen_x2apic_para_available,
- .init_mem_mapping = xen_hvm_init_mem_mapping,
+ .type = X86_HYPER_XEN_HVM,
+ .init.init_platform = xen_hvm_guest_init,
+ .init.x2apic_available = xen_x2apic_para_available,
+ .init.init_mem_mapping = xen_hvm_init_mem_mapping,
+ .runtime.pin_vcpu = xen_pin_vcpu,
};
-EXPORT_SYMBOL(x86_hyper_xen_hvm);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index fe5b9b2e653d..2a909d0cba1b 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -840,7 +840,7 @@ static void xen_load_sp0(unsigned long sp0)
mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
- this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
void xen_set_iopl_mask(unsigned mask)
@@ -1518,9 +1518,9 @@ static uint32_t __init xen_platform_pv(void)
return 0;
}
-const struct hypervisor_x86 x86_hyper_xen_pv = {
+const __initconst struct hypervisor_x86 x86_hyper_xen_pv = {
.name = "Xen PV",
.detect = xen_platform_pv,
- .pin_vcpu = xen_pin_vcpu,
+ .type = X86_HYPER_XEN_PV,
+ .runtime.pin_vcpu = xen_pin_vcpu,
};
-EXPORT_SYMBOL(x86_hyper_xen_pv);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 13a085df737f..e3ffdd656726 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2342,7 +2342,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
- case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
+ case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 41d3e7fcc914..e1a90f98790c 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -113,34 +113,47 @@ RELOC(xen_sysret64, 1b+1)
* rip
* r11
* rsp->rcx
- *
- * In all the entrypoints, we undo all that to make it look like a
- * CPU-generated syscall/sysenter and jump to the normal entrypoint.
*/
-.macro undo_xen_syscall
- mov 0*8(%rsp), %rcx
- mov 1*8(%rsp), %r11
- mov 5*8(%rsp), %rsp
-.endm
-
/* Normal 64-bit system call target */
ENTRY(xen_syscall_target)
- undo_xen_syscall
- jmp entry_SYSCALL_64_after_swapgs
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER_DS and __USER_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER_DS, 4*8(%rsp)
+ movq $__USER_CS, 1*8(%rsp)
+
+ jmp entry_SYSCALL_64_after_hwframe
ENDPROC(xen_syscall_target)
#ifdef CONFIG_IA32_EMULATION
/* 32-bit compat syscall target */
ENTRY(xen_syscall32_target)
- undo_xen_syscall
- jmp entry_SYSCALL_compat
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER32_CS, 1*8(%rsp)
+
+ jmp entry_SYSCALL_compat_after_hwframe
ENDPROC(xen_syscall32_target)
/* 32-bit compat sysenter target */
ENTRY(xen_sysenter_target)
- undo_xen_syscall
+ mov 0*8(%rsp), %rcx
+ mov 1*8(%rsp), %r11
+ mov 5*8(%rsp), %rsp
jmp entry_SYSENTER_compat
ENDPROC(xen_sysenter_target)
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 4f3faf513ba3..610223f0e945 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1717,7 +1717,7 @@ static int __init hv_acpi_init(void)
{
int ret, t;
- if (x86_hyper != &x86_hyper_ms_hyperv)
+ if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return -ENODEV;
init_completion(&probe_event);
diff --git a/drivers/input/mouse/vmmouse.c b/drivers/input/mouse/vmmouse.c
index 0f586780ceb4..1ae5c1ef3f5b 100644
--- a/drivers/input/mouse/vmmouse.c
+++ b/drivers/input/mouse/vmmouse.c
@@ -316,11 +316,9 @@ static int vmmouse_enable(struct psmouse *psmouse)
/*
* Array of supported hypervisors.
*/
-static const struct hypervisor_x86 *vmmouse_supported_hypervisors[] = {
- &x86_hyper_vmware,
-#ifdef CONFIG_KVM_GUEST
- &x86_hyper_kvm,
-#endif
+static enum x86_hypervisor_type vmmouse_supported_hypervisors[] = {
+ X86_HYPER_VMWARE,
+ X86_HYPER_KVM,
};
/**
@@ -331,7 +329,7 @@ static bool vmmouse_check_hypervisor(void)
int i;
for (i = 0; i < ARRAY_SIZE(vmmouse_supported_hypervisors); i++)
- if (vmmouse_supported_hypervisors[i] == x86_hyper)
+ if (vmmouse_supported_hypervisors[i] == x86_hyper_type)
return true;
return false;
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 1e688bfec567..9047c0a529b2 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -1271,7 +1271,7 @@ static int __init vmballoon_init(void)
* Check if we are running on VMware's hypervisor and bail out
* if we are not.
*/
- if (x86_hyper != &x86_hyper_vmware)
+ if (x86_hyper_type != X86_HYPER_VMWARE)
return -ENODEV;
for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
diff --git a/include/linux/hypervisor.h b/include/linux/hypervisor.h
index 3fa5ef2b3759..35e170ca87a8 100644
--- a/include/linux/hypervisor.h
+++ b/include/linux/hypervisor.h
@@ -6,8 +6,12 @@
* Juergen Gross <jgross@suse.com>
*/
-#ifdef CONFIG_HYPERVISOR_GUEST
-#include <asm/hypervisor.h>
+#ifdef CONFIG_X86
+#include <asm/x86_init.h>
+static inline void hypervisor_pin_vcpu(int cpu)
+{
+ x86_platform.hyper.pin_vcpu(cpu);
+}
#else
static inline void hypervisor_pin_vcpu(int cpu)
{