author		Miroslav Benes <mbenes@suse.cz>		2018-08-09 13:38:01 +0200
committer	Miroslav Benes <mbenes@suse.cz>		2018-08-09 13:38:01 +0200
commit		25ada3d1dd0be24ff2dae0ce239eb62d63a963dd (patch)
tree		95246fbdb0a227d696d69b0f911f5925a0d0712e
parent		24ce5f87af9dfc1d8d0354ffc45b7fe4a595cde8 (diff)
parent		17fe293af24fc62b6c011b34fc10a83d682c8d69 (diff)
Merge branch 'bsc#1099306_v2_12.3_a' into SLE12-SP3_Update_2_EMBARGO
-rw-r--r--	bsc1099306/bsc1099306.h				  24
-rw-r--r--	bsc1099306/kgr_patch_bsc1099306.c		 153
-rw-r--r--	bsc1099306/kgr_patch_bsc1099306.h		  28
-rw-r--r--	bsc1099306/kgr_patch_bsc1099306_irq.c		 108
-rw-r--r--	bsc1099306/kgr_patch_bsc1099306_irq.h		  15
-rw-r--r--	bsc1099306/kgr_patch_bsc1099306_kvm.c		2089
-rw-r--r--	bsc1099306/kgr_patch_bsc1099306_kvm.h		  51
-rw-r--r--	bsc1099306/kgr_patch_bsc1099306_kvm_intel.c	1325
-rw-r--r--	bsc1099306/kgr_patch_bsc1099306_kvm_intel.h	  29
9 files changed, 3822 insertions, 0 deletions
diff --git a/bsc1099306/bsc1099306.h b/bsc1099306/bsc1099306.h
new file mode 100644
index 0000000..1f5860d
--- /dev/null
+++ b/bsc1099306/bsc1099306.h
@@ -0,0 +1,24 @@
+#ifndef _BSC1099306_H
+#define _BSC1099306_H
+
+#include <linux/percpu.h>
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+
+DECLARE_PER_CPU(bool, kgr_kvm_cpu_l1tf_flush_l1d);
+
+struct kvm_vcpu;
+
+static inline bool kgr_never_needs_l1d_flush(void)
+{
+ return (static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+ boot_cpu_data.x86_vendor != X86_VENDOR_INTEL);
+}
+
+void kgr_set_vcpu_unconfined(struct kvm_vcpu *vcpu, gfp_t gfp_flags);
+
+bool kgr_get_and_clear_vcpu_unconfined(struct kvm_vcpu *vcpu, gfp_t gfp_flags);
+
+void kgr_free_vcpu_unconfined_shadow(struct kvm_vcpu *vcpu);
+
+#endif /* _BSC1099306_H */
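
The header above is the glue between the patched parts: it declares the per-vCPU "unconfined" flag (backed by a shadow variable) and the per-CPU L1D flush hint. A minimal sketch of the intended call pattern follows; it is illustrative only, the real call sites are in the kvm and kvm_intel files added further down:

	/* illustrative only -- these function names are not part of the patch */
	static void example_mark_unconfined(struct kvm_vcpu *vcpu)
	{
		/* guest-controlled data was touched outside the VMX sandbox */
		kgr_set_vcpu_unconfined(vcpu, GFP_KERNEL);
	}

	static bool example_before_vmentry(struct kvm_vcpu *vcpu)
	{
		/* true means: flush the L1D cache before reentering the guest */
		return kgr_get_and_clear_vcpu_unconfined(vcpu, GFP_ATOMIC);
	}
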
diff --git a/bsc1099306/kgr_patch_bsc1099306.c b/bsc1099306/kgr_patch_bsc1099306.c
new file mode 100644
index 0000000..2c92f05
--- /dev/null
+++ b/bsc1099306/kgr_patch_bsc1099306.c
@@ -0,0 +1,153 @@
+/*
+ * kgraft_patch_bsc1099306
+ *
+ * Fix for CVE-2018-3646, bsc#1099306
+ *
+ * Upstream commit:
+ * none yet
+ *
+ * SLE12 commit:
+ * none yet
+ *
+ * SLE12-SP1 commit
+ * none yet
+ *
+ * SLE12-SP2 commit:
+ * none yet
+ *
+ * SLE12-SP3 commits:
+ * aa543fea2fef31c3c74941162f618172c3ecdce7
+ * 3d1482acbe6776b27c687c0b5f50e5930119ee9b
+ * 3eea10e53b60be7e52b4c63374744787ecb97076
+ * 9c2cd4dbd4f1c173392730e3b7da6cde3bcff086
+ * 68a781200780b424c97fd1f44f900c81adf810f9
+ *
+ * SLE15 commits:
+ * bc19bc75fbb7400a0c4f8b2d85f4371b32c030c1
+ * e3ea7f331bdd5b9e582cf225efb1890cee2d85e3
+ * 7ffd8ffea589eb271069de635a78d3efa28c4264
+ * e9c295528cd8681eb60ffae07fbc287f3b10ba31
+ * 4a9535617e62a37dd477fd5e2b51f86c1f0dcfc5
+ * 85644246fb14317688b1d5a4048d6936e1f5dfb9
+ * (4b6a8a97cf42cd6008ec782b43461180cf80019c)
+ * (251e13546c70bb432f98319f4f39f3125a30f725)
+ * (4c4aea41570747f6f30cf11ddcb341b554d93fa2)
+ * (6af910f2646c32fd521bc780c4d252c341e22274)
+ * (0c0fd378df4f4ed5a575b3a4eec0bed7996a35ba)
+ *
+ * Copyright (c) 2018 SUSE
+ * Author: Nicolai Stange <nstange@suse.de>
+ *
+ * Based on the original Linux kernel code. Other copyrights apply.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM)
+
+#include <linux/kernel.h>
+#include "shadow.h"
+#include "kgr_patch_bsc1099306.h"
+#include "bsc1099306.h"
+
+#define KGR_BSC1099306_SHADOW_VCPU_UNCONFINED_ID \
+ KGR_SHADOW_ID(1099306, 0)
+
+
+void kgr_set_vcpu_unconfined(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
+{
+ bool *unconfined;
+
+ if (kgr_never_needs_l1d_flush())
+ return;
+
+ unconfined = klp_shadow_get_or_alloc
+ (vcpu, KGR_BSC1099306_SHADOW_VCPU_UNCONFINED_ID,
+ NULL, sizeof(bool), gfp_flags);
+ if (!unconfined)
+ return;
+ *unconfined = true;
+}
+
+bool kgr_get_and_clear_vcpu_unconfined(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
+{
+ bool *unconfined;
+ bool val = true;
+
+ if (kgr_never_needs_l1d_flush())
+ return false;
+
+ /*
+ * If there's no shadow variable yet or if the allocation
+ * fails, default to 'unconfined'.
+ */
+ unconfined = klp_shadow_get_or_alloc
+ (vcpu, KGR_BSC1099306_SHADOW_VCPU_UNCONFINED_ID,
+ &val, sizeof(bool), gfp_flags);
+ if (!unconfined)
+ return true;
+
+ val = *unconfined;
+ *unconfined = false;
+ return val;
+}
+
+void kgr_free_vcpu_unconfined_shadow(struct kvm_vcpu *vcpu)
+{
+ if (kgr_never_needs_l1d_flush())
+ return;
+ klp_shadow_free(vcpu, KGR_BSC1099306_SHADOW_VCPU_UNCONFINED_ID);
+}
+
+
+
+int kgr_patch_bsc1099306_init(void)
+{
+ int ret;
+
+ ret = __kgr_patch_bsc1099306_irq_init();
+ if (ret)
+ return ret;
+
+ ret = __kgr_patch_bsc1099306_kvm_init();
+ if (ret)
+ return ret;
+
+ ret = __kgr_patch_bsc1099306_kvm_intel_init();
+ if (ret) {
+ __kgr_patch_bsc1099306_kvm_cleanup();
+ return ret;
+ }
+
+ if (!kgr_never_needs_l1d_flush()) {
+ ret = kgr_shadow_init();
+ if (ret) {
+ __kgr_patch_bsc1099306_kvm_cleanup();
+ __kgr_patch_bsc1099306_kvm_intel_cleanup();
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+void kgr_patch_bsc1099306_cleanup(void)
+{
+ if (!kgr_never_needs_l1d_flush())
+ kgr_shadow_cleanup();
+ __kgr_patch_bsc1099306_kvm_intel_cleanup();
+ __kgr_patch_bsc1099306_kvm_cleanup();
+}
+
+#endif /* IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM) */
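
The three helpers above condense to the shadow-variable pattern sketched below: attach a boolean to an arbitrary live object, flip it, and free it with the object. The sketch assumes the (obj, id, data, size, gfp) calling convention of the shadow.h bundled with this patch series, which is what the code above uses:

	/* sketch of the shadow-variable lifecycle implemented above */
	static void shadow_flag_example(void *obj)
	{
		bool *flag;

		flag = klp_shadow_get_or_alloc(obj, KGR_SHADOW_ID(1099306, 0),
					       NULL, sizeof(bool), GFP_KERNEL);
		if (flag)
			*flag = true;	/* tag the object */

		/* ... once the object is torn down ... */
		klp_shadow_free(obj, KGR_SHADOW_ID(1099306, 0));
	}
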
diff --git a/bsc1099306/kgr_patch_bsc1099306.h b/bsc1099306/kgr_patch_bsc1099306.h
new file mode 100644
index 0000000..1331c87
--- /dev/null
+++ b/bsc1099306/kgr_patch_bsc1099306.h
@@ -0,0 +1,28 @@
+#ifndef _KGR_PATCH_BSC1099306_H
+#define _KGR_PATCH_BSC1099306_H
+
+#if IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM)
+
+#include "kgr_patch_bsc1099306_irq.h"
+#include "kgr_patch_bsc1099306_kvm.h"
+#include "kgr_patch_bsc1099306_kvm_intel.h"
+
+int kgr_patch_bsc1099306_init(void);
+void kgr_patch_bsc1099306_cleanup(void);
+
+#define KGR_PATCH_BSC1099306_FUNCS \
+ __KGR_PATCH_BSC1099306_IRQ_FUNCS \
+ __KGR_PATCH_BSC1099306_KVM_FUNCS \
+ __KGR_PATCH_BSC1099306_KVM_INTEL_FUNCS
+
+#else /* !(IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM)) */
+
+static inline int kgr_patch_bsc1099306_init(void) { return 0; }
+static inline void kgr_patch_bsc1099306_cleanup(void) {}
+
+#define KGR_PATCH_BSC1099306_FUNCS
+
+
+#endif /* IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM) */
+
+#endif /* _KGR_PATCH_BSC1099306_H */
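
This header is what the top-level live patch module (kgr_patch_main.c, not part of this diff) consumes: it calls kgr_patch_bsc1099306_init()/kgr_patch_bsc1099306_cleanup() from its module init/exit and splices KGR_PATCH_BSC1099306_FUNCS into its kgr_patch function table. A rough sketch of that consumer, with illustrative names and the usual kGraft descriptor layout (the real file may differ):

	static struct kgr_patch patch = {
		.name = "kgr_patch_bsc1099306_example",	/* illustrative */
		.owner = THIS_MODULE,
		.patches = {
			KGR_PATCH_BSC1099306_FUNCS
			{ }
		}
	};

	static int __init kgr_patch_init(void)
	{
		int ret = kgr_patch_bsc1099306_init();

		if (ret)
			return ret;
		ret = kgr_patch_kernel(&patch);
		if (ret)
			kgr_patch_bsc1099306_cleanup();
		return ret;
	}
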
diff --git a/bsc1099306/kgr_patch_bsc1099306_irq.c b/bsc1099306/kgr_patch_bsc1099306_irq.c
new file mode 100644
index 0000000..8dd8119
--- /dev/null
+++ b/bsc1099306/kgr_patch_bsc1099306_irq.c
@@ -0,0 +1,108 @@
+/*
+ * kgraft_patch_bsc1099306_irq
+ *
+ * Fix for CVE-2018-3646 (irq_enter() part), bsc#1099306
+ *
+ * Copyright (c) 2018 SUSE
+ * Author: Nicolai Stange <nstange@suse.de>
+ *
+ * Based on the original Linux kernel code. Other copyrights apply.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM)
+
+#include <linux/kernel.h>
+#include <linux/kallsyms.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/preempt.h>
+#include <linux/tick.h>
+#include <linux/bottom_half.h>
+#include <linux/vtime.h>
+#include <linux/hardirq.h>
+#include "kgr_patch_bsc1099306_irq.h"
+#include "bsc1099306.h"
+
+
+#if !IS_ENABLED(CONFIG_TICK_ONESHOT)
+#error "Live patch supports only CONFIG_TICK_ONESHOT=y."
+#endif
+
+static void (*kgr_rcu_irq_enter)(void);
+static void (*kgr_tick_irq_enter)(void);
+
+static struct {
+ char *name;
+ void **addr;
+} kgr_funcs[] = {
+ { "rcu_irq_enter", (void *)&kgr_rcu_irq_enter },
+ { "tick_irq_enter", (void *)&kgr_tick_irq_enter },
+};
+
+
+DEFINE_PER_CPU(bool, kgr_kvm_cpu_l1tf_flush_l1d);
+
+
+/* patched */
+void kgr_irq_enter(void)
+{
+ kgr_rcu_irq_enter();
+ if (is_idle_task(current) && !in_interrupt()) {
+ /*
+ * Prevent raise_softirq from needlessly waking up ksoftirqd
+ * here, as softirq will be serviced on return from interrupt.
+ */
+ local_bh_disable();
+ kgr_tick_irq_enter();
+ _local_bh_enable();
+ }
+
+ __irq_enter();
+
+ /*
+ * Fix CVE-2018-3646
+ * +1 line
+ */
+ __this_cpu_write(kgr_kvm_cpu_l1tf_flush_l1d, true);
+}
+
+
+static int kgr_patch_bsc1099306_irq_kallsyms(void)
+{
+ unsigned long addr;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(kgr_funcs); i++) {
+ /* mod_find_symname would be nice, but it is not exported */
+ addr = kallsyms_lookup_name(kgr_funcs[i].name);
+ if (!addr) {
+ pr_err("kgraft-patch: symbol %s not resolved\n",
+ kgr_funcs[i].name);
+ return -ENOENT;
+ }
+
+ *(kgr_funcs[i].addr) = (void *)addr;
+ }
+
+ return 0;
+}
+
+int __kgr_patch_bsc1099306_irq_init(void)
+{
+ return kgr_patch_bsc1099306_irq_kallsyms();
+}
+
+#endif
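
The write at the end of kgr_irq_enter() is one half of a plain per-CPU flag protocol: this file defines and sets kgr_kvm_cpu_l1tf_flush_l1d, while the L1D-flush helper in the kvm part below reads and clears it. The same pattern in isolation, as a self-contained sketch:

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(bool, needs_flush);

	static void producer(void)		/* e.g. interrupt entry */
	{
		__this_cpu_write(needs_flush, true);
	}

	static bool consumer(void)		/* e.g. before VM entry */
	{
		bool flush = __this_cpu_read(needs_flush);

		__this_cpu_write(needs_flush, false);
		return flush;
	}
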
diff --git a/bsc1099306/kgr_patch_bsc1099306_irq.h b/bsc1099306/kgr_patch_bsc1099306_irq.h
new file mode 100644
index 0000000..b73a031
--- /dev/null
+++ b/bsc1099306/kgr_patch_bsc1099306_irq.h
@@ -0,0 +1,15 @@
+#ifndef _KGR_PATCH_BSC1099306_IRQ_H
+#define _KGR_PATCH_BSC1099306_IRQ_H
+
+#if IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM)
+
+int __kgr_patch_bsc1099306_irq_init(void);
+
+void kgr_irq_enter(void);
+
+#define __KGR_PATCH_BSC1099306_IRQ_FUNCS \
+ KGR_PATCH(irq_enter, kgr_irq_enter), \
+
+#endif /* IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM) */
+
+#endif /* _KGR_PATCH_BSC1099306_IRQ_H */
diff --git a/bsc1099306/kgr_patch_bsc1099306_kvm.c b/bsc1099306/kgr_patch_bsc1099306_kvm.c
new file mode 100644
index 0000000..8f9393b
--- /dev/null
+++ b/bsc1099306/kgr_patch_bsc1099306_kvm.c
@@ -0,0 +1,2089 @@
+/*
+ * kgraft_patch_bsc1099306_kvm
+ *
+ * Fix for CVE-2018-3646 (kvm.ko part), bsc#1099306
+ *
+ * Copyright (c) 2018 SUSE
+ * Author: Nicolai Stange <nstange@suse.de>
+ *
+ * Based on the original Linux kernel code. Other copyrights apply.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#if IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM)
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/tracepoint.h>
+#include <linux/kvm_host.h>
+#include <asm/pvclock.h>
+#include <linux/bitmap.h>
+#include <linux/srcu.h>
+#include <linux/sched.h>
+#include <asm/fpu/internal.h>
+#include <asm/debugreg.h>
+#include <linux/profile.h>
+#include <kvm/iodev.h>
+#include <asm/kvm_emulate.h>
+#include "kgr_patch_bsc1099306_kvm.h"
+#include "bsc1099306.h"
+
+#if !IS_MODULE(CONFIG_KVM)
+#error "Live patch supports only CONFIG_KVM=m"
+#endif
+
+#define KGR_PATCHED_MODULE "kvm"
+
+
+#define __KGR_DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
+ static struct tracepoint *kgr__tracepoint_##name; \
+ static inline void kgr_trace_##name(proto) \
+ { \
+ if (unlikely(static_key_enabled(&kgr__tracepoint_##name->key))) \
+ __DO_TRACE(kgr__tracepoint_##name, \
+ TP_PROTO(data_proto), \
+ TP_ARGS(data_args), \
+ TP_CONDITION(cond),,); \
+ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
+ rcu_read_lock_sched_notrace(); \
+ rcu_dereference_sched(kgr__tracepoint_##name->funcs); \
+ rcu_read_unlock_sched_notrace(); \
+ } \
+ } \
+
+#define KGR_DECLARE_TRACE(name, proto, args) \
+ __KGR_DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \
+ cpu_online(raw_smp_processor_id()), \
+ PARAMS(void *__data, proto), \
+ PARAMS(__data, args))
+
+#define KGR_TRACE_EVENT(name, proto, args) \
+ KGR_DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+
+
+/* see arch/x86/kvm/trace.h */
+KGR_TRACE_EVENT(kvm_pvclock_update,
+ TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock),
+ TP_ARGS(vcpu_id, pvclock)
+);
+
+KGR_TRACE_EVENT(kvm_inj_exception,
+ TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
+ TP_ARGS(exception, has_error, error_code)
+);
+
+KGR_TRACE_EVENT(kvm_entry,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id)
+);
+
+KGR_TRACE_EVENT(kvm_emulate_insn,
+ TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
+ TP_ARGS(vcpu, failed)
+);
+
+
+struct kvm_x86_ops **kgr_kvm_x86_ops;
+static unsigned long __percpu *kgr_cpu_tsc_khz;
+static bool *kgr_kvm_has_tsc_control;
+static struct static_key *kgr_kvm_no_apic_vcpu;
+static struct static_key_deferred *kgr_apic_hw_disabled;
+static struct static_key_deferred *kgr_apic_sw_disabled;
+static u64 *kgr_host_xcr0;
+
+static int (*kgr_kvm_apic_accept_pic_intr)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_mmu_unload)(struct kvm_vcpu *vcpu);
+static void (*kgr__kvm_migrate_timers)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_gen_update_masterclock)(struct kvm *kvm);
+static u64 (*kgr_kvm_read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
+static void (*kgr_kvm_get_time_scale)(uint32_t scaled_khz, uint32_t base_khz,
+ s8 *pshift, u32 *pmultiplier);
+static int (*kgr_kvm_write_guest_cached)(struct kvm *kvm,
+ struct gfn_to_hva_cache *ghc,
+ void *data, unsigned long len);
+static int (*kgr_kvm_read_guest_cached)(struct kvm *kvm,
+ struct gfn_to_hva_cache *ghc,
+ void *data, unsigned long len);
+static void (*kgr_kvm_mmu_sync_roots)(struct kvm_vcpu *vcpu);
+static void (*kgr_process_nmi)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_pmu_handle_event)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_pmu_deliver_pmi)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_scan_ioapic_routes)(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+static void (*kgr_kvm_ioapic_scan_entry)(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+static void (*kgr_kvm_vcpu_reload_apic_access_page)(struct kvm_vcpu *vcpu);
+static int (*kgr_kvm_lapic_find_highest_irr)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_apic_accept_events)(struct kvm_vcpu *vcpu);
+static int (*kgr_exception_type)(int vector);
+static unsigned long (*kgr_kvm_get_rflags)(struct kvm_vcpu *vcpu);
+static void (*kgr__kvm_set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+static void (*kgr_kvm_update_dr7)(struct kvm_vcpu *vcpu);
+static void (*kgr_process_smi)(struct kvm_vcpu *vcpu);
+static int (*kgr_kvm_cpu_has_injectable_intr)(struct kvm_vcpu *v);
+static int (*kgr_kvm_cpu_get_interrupt)(struct kvm_vcpu *v);
+static void (*kgr_update_cr8_intercept)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_lapic_sync_to_vapic)(struct kvm_vcpu *vcpu);
+static int (*kgr_kvm_mmu_load)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_load_guest_fpu)(struct kvm_vcpu *vcpu);
+static void (*kgr_wait_lapic_expire)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_update_dr7)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_lapic_sync_from_vapic)(struct kvm_vcpu *vcpu);
+static void (*kgr_init_emulate_ctxt)(struct kvm_vcpu *vcpu);
+static unsigned long (*kgr_kvm_get_linear_rip)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_queue_exception)(struct kvm_vcpu *vcpu, unsigned nr);
+static int (*kgr_x86_decode_insn)(struct x86_emulate_ctxt *ctxt, void *insn,
+ int insn_len);
+static gpa_t (*kgr_kvm_mmu_gva_to_gpa_write)(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+static kvm_pfn_t (*kgr_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
+static void (*kgr_kvm_release_pfn_clean)(kvm_pfn_t pfn);
+static int (*kgr_kvm_mmu_unprotect_page)(struct kvm *kvm, gfn_t gfn);
+static int (*kgr_handle_emulation_failure)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+static bool (*kgr_x86_page_table_writing_insn)(struct x86_emulate_ctxt *ctxt);
+static void
+(*kgr_emulator_invalidate_register_cache)(struct x86_emulate_ctxt *ctxt);
+static int (*kgr_x86_emulate_insn)(struct x86_emulate_ctxt *ctxt);
+static void (*kgr_kvm_smm_changed)(struct kvm_vcpu *vcpu);
+static int (*kgr_complete_emulated_pio)(struct kvm_vcpu *vcpu);
+static int (*kgr_complete_emulated_mmio)(struct kvm_vcpu *vcpu);
+static int (*kgr_kvm_arch_vcpu_runnable)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_vcpu_block)(struct kvm_vcpu *vcpu);
+static int (*kgr_kvm_cpu_has_pending_timer)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_inject_pending_timer_irqs)(struct kvm_vcpu *vcpu);
+static int (*kgr_kvm_vcpu_ready_for_interrupt_injection)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_check_async_pf_completion)(struct kvm_vcpu *vcpu);
+static int (*kgr_kvm_set_cr8)(struct kvm_vcpu *vcpu, unsigned long cr8);
+static unsigned long (*kgr_kvm_get_cr8)(struct kvm_vcpu *vcpu);
+static u64 (*kgr_kvm_get_apic_base)(struct kvm_vcpu *vcpu);
+static void (*kgr_kvm_queue_exception_e)(struct kvm_vcpu *vcpu, unsigned nr,
+ u32 error_code);
+static int (*kgr_kvm_vcpu_write_guest)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *data, unsigned long len);
+
+static struct {
+ char *name;
+ void **addr;
+} kgr_funcs[] = {
+ { "kvm:__tracepoint_kvm_pvclock_update",
+ (void *)&kgr__tracepoint_kvm_pvclock_update },
+ { "kvm:__tracepoint_kvm_inj_exception",
+ (void *)&kgr__tracepoint_kvm_inj_exception },
+ { "kvm:__tracepoint_kvm_entry",
+ (void *)&kgr__tracepoint_kvm_entry },
+ { "kvm:__tracepoint_kvm_emulate_insn",
+ (void *)&kgr__tracepoint_kvm_emulate_insn },
+ { "kvm:kvm_x86_ops", (void *)&kgr_kvm_x86_ops },
+ { "kvm:cpu_tsc_khz", (void *)&kgr_cpu_tsc_khz },
+ { "kvm:kvm_has_tsc_control", (void *)&kgr_kvm_has_tsc_control },
+ { "kvm:kvm_no_apic_vcpu", (void *)&kgr_kvm_no_apic_vcpu },
+ { "kvm:apic_hw_disabled", (void *)&kgr_apic_hw_disabled },
+ { "kvm:apic_sw_disabled", (void *)&kgr_apic_sw_disabled },
+ { "kvm:host_xcr0", (void *)&kgr_host_xcr0 },
+ { "kvm:kvm_apic_accept_pic_intr",
+ (void *)&kgr_kvm_apic_accept_pic_intr },
+ { "kvm:kvm_mmu_unload", (void *)&kgr_kvm_mmu_unload },
+ { "kvm:__kvm_migrate_timers", (void *)&kgr__kvm_migrate_timers },
+ { "kvm:kvm_gen_update_masterclock",
+ (void *)&kgr_kvm_gen_update_masterclock },
+ { "kvm:kvm_read_l1_tsc", (void *)&kgr_kvm_read_l1_tsc },
+ { "kvm:kvm_get_time_scale", (void *)&kgr_kvm_get_time_scale },
+ { "kvm:kvm_write_guest_cached", (void *)&kgr_kvm_write_guest_cached },
+ { "kvm:kvm_read_guest_cached", (void *)&kgr_kvm_read_guest_cached },
+ { "kvm:kvm_mmu_sync_roots", (void *)&kgr_kvm_mmu_sync_roots },
+ { "kvm:process_nmi", (void *)&kgr_process_nmi },
+ { "kvm:kvm_pmu_handle_event", (void *)&kgr_kvm_pmu_handle_event },
+ { "kvm:kvm_pmu_deliver_pmi", (void *)&kgr_kvm_pmu_deliver_pmi },
+ { "kvm:kvm_scan_ioapic_routes", (void *)&kgr_kvm_scan_ioapic_routes },
+ { "kvm:kvm_ioapic_scan_entry", (void *)&kgr_kvm_ioapic_scan_entry },
+ { "kvm:kvm_vcpu_reload_apic_access_page",
+ (void *)&kgr_kvm_vcpu_reload_apic_access_page },
+ { "kvm:kvm_lapic_find_highest_irr",
+ (void *)&kgr_kvm_lapic_find_highest_irr },
+ { "kvm:kvm_apic_accept_events", (void *)&kgr_kvm_apic_accept_events },
+ { "kvm:exception_type", (void *)&kgr_exception_type },
+ { "kvm:kvm_get_rflags", (void *)&kgr_kvm_get_rflags },
+ { "kvm:__kvm_set_rflags", (void *)&kgr__kvm_set_rflags },
+ { "kvm:kvm_update_dr7", (void *)&kgr_kvm_update_dr7 },
+ { "kvm:process_smi", (void *)&kgr_process_smi },
+ { "kvm:kvm_cpu_has_injectable_intr",
+ (void *)&kgr_kvm_cpu_has_injectable_intr },
+ { "kvm:kvm_cpu_get_interrupt", (void *)&kgr_kvm_cpu_get_interrupt },
+ { "kvm:update_cr8_intercept", (void *)&kgr_update_cr8_intercept },
+ { "kvm:kvm_lapic_sync_to_vapic", (void *)&kgr_kvm_lapic_sync_to_vapic },
+ { "kvm:kvm_mmu_load", (void *)&kgr_kvm_mmu_load },
+ { "kvm:kvm_load_guest_fpu", (void *)&kgr_kvm_load_guest_fpu },
+ { "kvm:wait_lapic_expire", (void *)&kgr_wait_lapic_expire },
+ { "kvm:kvm_update_dr7", (void *)&kgr_kvm_update_dr7 },
+ { "kvm:kvm_lapic_sync_from_vapic",
+ (void *)&kgr_kvm_lapic_sync_from_vapic },
+ { "kvm:init_emulate_ctxt", (void *)&kgr_init_emulate_ctxt },
+ { "kvm:kvm_get_linear_rip", (void *)&kgr_kvm_get_linear_rip },
+ { "kvm:kvm_queue_exception", (void *)&kgr_kvm_queue_exception },
+ { "kvm:x86_decode_insn", (void *)&kgr_x86_decode_insn },
+ { "kvm:kvm_mmu_gva_to_gpa_write",
+ (void *)&kgr_kvm_mmu_gva_to_gpa_write },
+ { "kvm:gfn_to_pfn", (void *)&kgr_gfn_to_pfn },
+ { "kvm:kvm_release_pfn_clean", (void *)&kgr_kvm_release_pfn_clean },
+ { "kvm:kvm_mmu_unprotect_page", (void *)&kgr_kvm_mmu_unprotect_page },
+ { "kvm:handle_emulation_failure",
+ (void *)&kgr_handle_emulation_failure },
+ { "kvm:kvm_set_rflags", (void *)&kgr_kvm_set_rflags },
+ { "kvm:x86_page_table_writing_insn",
+ (void *)&kgr_x86_page_table_writing_insn },
+ { "kvm:emulator_invalidate_register_cache",
+ (void *)&kgr_emulator_invalidate_register_cache },
+ { "kvm:x86_emulate_insn", (void *)&kgr_x86_emulate_insn },
+ { "kvm:kvm_smm_changed", (void *)&kgr_kvm_smm_changed },
+ { "kvm:complete_emulated_pio", (void *)&kgr_complete_emulated_pio },
+ { "kvm:complete_emulated_mmio", (void *)&kgr_complete_emulated_mmio },
+ { "kvm:kvm_arch_vcpu_runnable", (void *)&kgr_kvm_arch_vcpu_runnable },
+ { "kvm:kvm_vcpu_block", (void *)&kgr_kvm_vcpu_block },
+ { "kvm:kvm_cpu_has_pending_timer",
+ (void *)&kgr_kvm_cpu_has_pending_timer },
+ { "kvm:kvm_inject_pending_timer_irqs",
+ (void *)&kgr_kvm_inject_pending_timer_irqs },
+ { "kvm:kvm_vcpu_ready_for_interrupt_injection",
+ (void *)&kgr_kvm_vcpu_ready_for_interrupt_injection },
+ { "kvm:kvm_check_async_pf_completion",
+ (void *)&kgr_kvm_check_async_pf_completion },
+ { "kvm:kvm_set_cr8", (void *)&kgr_kvm_set_cr8 },
+ { "kvm:kvm_get_cr8", (void *)&kgr_kvm_get_cr8 },
+ { "kvm:kvm_get_apic_base", (void *)&kgr_kvm_get_apic_base },
+ { "kvm:kvm_queue_exception_e", (void *)&kgr_kvm_queue_exception_e },
+ { "kvm:kvm_vcpu_write_guest", (void *)&kgr_kvm_vcpu_write_guest },
+};
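+
+/*
+ * The "kvm:" prefix marks symbols owned by the kvm.ko module.  A bare
+ * kallsyms_lookup_name() would return the first symbol of that name it
+ * finds, so module-qualified entries are typically resolved by walking
+ * all symbols and matching the owning module as well.  Rough sketch
+ * (illustrative only -- the actual resolver in this patch may differ):
+ *
+ *	struct kgr_find_arg { const char *obj, *sym; unsigned long addr; };
+ *
+ *	static int kgr_find_cb(void *data, const char *name,
+ *			       struct module *mod, unsigned long addr)
+ *	{
+ *		struct kgr_find_arg *arg = data;
+ *
+ *		if (!mod || strcmp(mod->name, arg->obj) ||
+ *		    strcmp(name, arg->sym))
+ *			return 0;
+ *		arg->addr = addr;
+ *		return 1;	(non-zero stops the iteration)
+ *	}
+ *
+ * invoked via kallsyms_on_each_symbol(kgr_find_cb, &arg).
+ */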
+
+
+/* from arch/x86/kvm/irq.h */
+#define KGR_PIC_NUM_PINS 16
+
+struct kvm_kpic_state {
+ u8 last_irr; /* edge detection */
+ u8 irr; /* interrupt request register */
+ u8 imr; /* interrupt mask register */
+ u8 isr; /* interrupt service register */
+ u8 priority_add; /* highest irq priority */
+ u8 irq_base;
+ u8 read_reg_select;
+ u8 poll;
+ u8 special_mask;
+ u8 init_state;
+ u8 auto_eoi;
+ u8 rotate_on_auto_eoi;
+ u8 special_fully_nested_mode;
+ u8 init4; /* true if 4 byte init */
+ u8 elcr; /* PIIX edge/trigger selection */
+ u8 elcr_mask;
+ u8 isr_ack; /* interrupt ack detection */
+ struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+ spinlock_t lock;
+ bool wakeup_needed;
+ unsigned pending_acks;
+ struct kvm *kvm;
+ struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+ int output; /* intr from master PIC */
+ struct kvm_io_device dev_master;
+ struct kvm_io_device dev_slave;
+ struct kvm_io_device dev_eclr;
+ void (*ack_notifier)(void *opaque, int irq);
+ unsigned long irq_states[KGR_PIC_NUM_PINS];
+};
+
+/* inlined */
+static inline struct kvm_pic *kgr_pic_irqchip(struct kvm *kvm)
+{
+ return kvm->arch.vpic;
+}
+
+/* inlined */
+static inline int kgr_pic_in_kernel(struct kvm *kvm)
+{
+ int ret;
+
+ ret = (kgr_pic_irqchip(kvm) != NULL);
+ return ret;
+}
+
+/* inlined */
+static inline int kgr_irqchip_split(struct kvm *kvm)
+{
+ return kvm->arch.irqchip_split;
+}
+
+/* inlined */
+static inline int kgr_lapic_in_kernel(struct kvm_vcpu *vcpu)
+{
+ /* Same as irqchip_in_kernel(vcpu->kvm), but with less
+ * pointer chasing and no unnecessary memory barriers.
+ */
+ return vcpu->arch.apic != NULL;
+}
+
+
+/* from arch/x86/kvm/lapic.h */
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ u32 timer_mode;
+ u32 timer_mode_mask;
+ u64 tscdeadline;
+ u64 expired_tscdeadline;
+ atomic_t pending; /* accumulated triggered timers */
+};
+
+struct kvm_lapic {
+ unsigned long base_address;
+ struct kvm_io_device dev;
+ struct kvm_timer lapic_timer;
+ u32 divide_count;
+ struct kvm_vcpu *vcpu;
+ bool sw_enabled;
+ bool irr_pending;
+ bool lvt0_in_nmi_mode;
+ /* Number of bits set in ISR. */
+ s16 isr_count;
+ /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
+ int highest_isr_cache;
+ /**
+ * APIC register page. The layout matches the register layout seen by
+ * the guest 1:1, because it is accessed by the vmx microcode.
+ * Note: Only one register, the TPR, is used by the microcode.
+ */
+ void *regs;
+ gpa_t vapic_addr;
+ struct gfn_to_hva_cache vapic_cache;
+ unsigned long pending_events;
+ unsigned int sipi_vector;
+};
+
+/* inlined */
+static inline bool kgr_kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu)
+{
+ if (static_key_enabled(kgr_kvm_no_apic_vcpu))
+ return vcpu->arch.apic;
+ return true;
+}
+
+/* inlined */
+static inline int kgr_kvm_apic_hw_enabled(struct kvm_lapic *apic)
+{
+ if (static_key_enabled(&kgr_apic_hw_disabled->key))
+ return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+ return MSR_IA32_APICBASE_ENABLE;
+}
+
+/* inlined */
+static inline bool kgr_kvm_apic_sw_enabled(struct kvm_lapic *apic)
+{
+ if (static_key_enabled(&kgr_apic_sw_disabled->key))
+ return apic->sw_enabled;
+ return true;
+}
+
+/* inlined */
+static inline bool kgr_kvm_apic_present(struct kvm_vcpu *vcpu)
+{
+ return kgr_kvm_vcpu_has_lapic(vcpu) && kgr_kvm_apic_hw_enabled(vcpu->arch.apic);
+}
+
+/* inlined */
+static inline int kgr_kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+ return kgr_kvm_apic_present(vcpu) && kgr_kvm_apic_sw_enabled(vcpu->arch.apic);
+}
+
+
+/* from arch/x86/kvm/kvm_cache_regs.h */
+#define KGR_KVM_POSSIBLE_CR4_GUEST_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
+
+/* inlined */
+static inline unsigned long kgr_kvm_register_read(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+ (*kgr_kvm_x86_ops)->cache_reg(vcpu, reg);
+
+ return vcpu->arch.regs[reg];
+}
+
+/* inlined */
+static inline void kgr_kvm_register_write(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ vcpu->arch.regs[reg] = val;
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+/* inlined */
+static inline unsigned long kgr_kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+ return kgr_kvm_register_read(vcpu, VCPU_REGS_RIP);
+}
+
+/* inlined */
+static inline void kgr_kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kgr_kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+}
+
+/* inlined */
+static inline ulong kgr_kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KGR_KVM_POSSIBLE_CR4_GUEST_BITS;
+ if (tmask & vcpu->arch.cr4_guest_owned_bits)
+ (*kgr_kvm_x86_ops)->decache_cr4_guest_bits(vcpu);
+ return vcpu->arch.cr4 & mask;
+}
+
+/* inlined */
+static inline bool kgr_is_guest_mode(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_GUEST_MASK;
+}
+
+/* inlined */
+static inline bool kgr_is_smm(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
+
+/* from arch/x86/kvm/mmu.h */
+/* inlined */
+static inline int kgr_kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+ if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+ return 0;
+
+ return kgr_kvm_mmu_load(vcpu);
+}
+
+
+/* from arch/x86/kvm/hyperv.h */
+static inline struct kvm_vcpu_hv_synic *kgr_vcpu_to_synic(struct kvm_vcpu *vcpu)
+{
+ return &vcpu->arch.hyperv.synic;
+}
+
+
+/* from arch/x86/kvm/trace.h */
+#define kgr_trace_kvm_emulate_insn_start(vcpu) kgr_trace_kvm_emulate_insn(vcpu, 0)
+
+
+/* from arch/x86/kvm/x86.h */
+/* inlined */
+static inline void kgr_kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.exception.pending = false;
+}
+
+/* inlined */
+static inline void kgr_kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+ bool soft)
+{
+ vcpu->arch.interrupt.pending = true;
+ vcpu->arch.interrupt.soft = soft;
+ vcpu->arch.interrupt.nr = vector;
+}
+
+/* inlined */
+static inline bool kgr_mmu_is_nested(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
+}
+
+/* inlined */
+static inline bool kgr_vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation;
+}
+
+/* inlined */
+static inline bool kgr_vcpu_match_mmio_gva(struct kvm_vcpu *vcpu,
+ unsigned long gva)
+{
+ if (kgr_vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva &&
+ vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ return true;
+
+ return false;
+}
+
+/* inlined */
+static inline bool kgr_vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ if (kgr_vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn &&
+ vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ return true;
+
+ return false;
+}
+
+/* inlined */
+static inline u64 kgr_get_kernel_ns(void)
+{
+ return ktime_get_boot_ns();
+}
+
+
+/* from arch/x86/kvm/x86.c */
+#define kgr_emul_to_vcpu(ctxt) \
+ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+
+#define KGR_EXCPT_FAULT 0
+#define KGR_EXCPT_TRAP 1
+
+/* inlined */
+static bool kgr_kvm_propagate_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ if (kgr_mmu_is_nested(vcpu) && !fault->nested_page_fault)
+ vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
+ else
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+
+ return fault->nested_page_fault;
+}
+
+/* inlined */
+static void kgr_kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (kgr_kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
+ !vcpu->guest_xcr0_loaded) {
+ /* kvm_set_xcr() also depends on this */
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+ vcpu->guest_xcr0_loaded = 1;
+ }
+}
+
+/* inlined */
+static void kgr_kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->guest_xcr0_loaded) {
+ if (vcpu->arch.xcr0 != *kgr_host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, *kgr_host_xcr0);
+ vcpu->guest_xcr0_loaded = 0;
+ }
+}
+
+/* optimized */
+static void kgr_kvm_update_dr0123(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
+ }
+}
+
+/* optimized */
+static void kgr_kvm_update_dr6(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ (*kgr_kvm_x86_ops)->set_dr6(vcpu, vcpu->arch.dr6);
+}
+
+/* inlined */
+static u64 kgr_compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
+{
+ u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
+ vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
+ tsc += vcpu->arch.this_tsc_write;
+ return tsc;
+}
+
+/* inlined */
+/* line 1564 */
+static inline void kgr_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+ s64 adjustment)
+{
+ (*kgr_kvm_x86_ops)->adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
+
+/* inlined */
+static int kgr_kvm_guest_time_update(struct kvm_vcpu *v)
+{
+ unsigned long flags, this_tsc_khz, tgt_tsc_khz;
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct kvm_arch *ka = &v->kvm->arch;
+ s64 kernel_ns;
+ u64 tsc_timestamp, host_tsc;
+ struct pvclock_vcpu_time_info guest_hv_clock;
+ u8 pvclock_flags;
+ bool use_master_clock;
+
+ kernel_ns = 0;
+ host_tsc = 0;
+
+ /*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ use_master_clock = ka->use_master_clock;
+ if (use_master_clock) {
+ host_tsc = ka->master_cycle_now;
+ kernel_ns = ka->master_kernel_ns;
+ }
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+
+ /* Keep irq disabled to prevent changes to the clock */
+ local_irq_save(flags);
+ this_tsc_khz = *this_cpu_ptr(kgr_cpu_tsc_khz);
+ if (unlikely(this_tsc_khz == 0)) {
+ local_irq_restore(flags);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+ return 1;
+ }
+ if (!use_master_clock) {
+ host_tsc = rdtsc();
+ kernel_ns = kgr_get_kernel_ns();
+ }
+
+ tsc_timestamp = kgr_kvm_read_l1_tsc(v, host_tsc);
+
+ /*
+ * We may have to catch up the TSC to match elapsed wall clock
+ * time for two reasons, even if kvmclock is used.
+ * 1) CPU could have been running below the maximum TSC rate
+ * 2) Broken TSC compensation resets the base at each VCPU
+ * entry to avoid unknown leaps of TSC even when running
+ * again on the same CPU. This may cause apparent elapsed
+ * time to disappear, and the guest to stand still or run
+ * very slowly.
+ */
+ if (vcpu->tsc_catchup) {
+ u64 tsc = kgr_compute_guest_tsc(v, kernel_ns);
+ if (tsc > tsc_timestamp) {
+ kgr_adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
+ tsc_timestamp = tsc;
+ }
+ }
+
+ local_irq_restore(flags);
+
+ if (!vcpu->pv_time_enabled)
+ return 0;
+
+ if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
+ tgt_tsc_khz = *kgr_kvm_has_tsc_control ?
+ vcpu->virtual_tsc_khz : this_tsc_khz;
+ kgr_kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
+ &vcpu->hv_clock.tsc_shift,
+ &vcpu->hv_clock.tsc_to_system_mul);
+ vcpu->hw_tsc_khz = this_tsc_khz;
+ }
+
+ /* With all the info we got, fill in the values */
+ vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
+ vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+ vcpu->last_guest_tsc = tsc_timestamp;
+
+ if (unlikely(kgr_kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ &guest_hv_clock, sizeof(guest_hv_clock))))
+ return 0;
+
+ /* This VCPU is paused, but it's legal for a guest to read another
+ * VCPU's kvmclock, so we really have to follow the specification where
+ * it says that version is odd if data is being modified, and even after
+ * it is consistent.
+ *
+ * Version field updates must be kept separate. This is because
+ * kvm_write_guest_cached might use a "rep movs" instruction, and
+ * writes within a string instruction are weakly ordered. So there
+ * are three writes overall.
+ *
+ * As a small optimization, only write the version field in the first
+ * and third write. The vcpu->pv_time cache is still valid, because the
+ * version field is the first in the struct.
+ */
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ vcpu->hv_clock.version = guest_hv_clock.version + 1;
+ kgr_kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+
+ smp_wmb();
+
+ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+ pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ /* If the host uses TSC clocksource, then it is stable */
+ if (use_master_clock)
+ pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
+ vcpu->hv_clock.flags = pvclock_flags;
+
+ kgr_trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+ kgr_kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
+
+ smp_wmb();
+
+ vcpu->hv_clock.version++;
+ kgr_kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+ return 0;
+}
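+
+/*
+ * For reference (not part of this patch): the guest-side reader pairs
+ * with the odd/even version protocol above roughly as follows, retrying
+ * until the version is even and unchanged across the read:
+ *
+ *	do {
+ *		version = src->version;
+ *		rmb();
+ *		tsc_timestamp = src->tsc_timestamp;
+ *		system_time   = src->system_time;
+ *		rmb();
+ *	} while ((version & 1) || version != src->version);
+ */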
+
+#define KGR_KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
+
+/* inlined */
+static void kgr_kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+ struct kvm *kvm = v->kvm;
+
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work,
+ KGR_KVMCLOCK_UPDATE_DELAY);
+}
+
+/* inlined */
+static void kgr_kvmclock_reset(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pv_time_enabled = false;
+}
+
+/* inlined */
+static void kgr_accumulate_steal_time(struct kvm_vcpu *vcpu)
+{
+ u64 delta;
+
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ vcpu->arch.st.accum_steal = delta;
+}
+
+/* inlined */
+static void kgr_record_steal_time(struct kvm_vcpu *vcpu)
+{
+ kgr_accumulate_steal_time(vcpu);
+
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ if (unlikely(kgr_kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+ return;
+
+ vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
+ vcpu->arch.st.steal.version += 2;
+ vcpu->arch.st.accum_steal = 0;
+
+ kgr_kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+}
+
+/* inlined */
+static int kgr_kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
+{
+ return (!kgr_lapic_in_kernel(vcpu) ||
+ kgr_kvm_apic_accept_pic_intr(vcpu));
+}
+
+/* inlined */
+static void kgr_toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+{
+ u32 int_shadow = (*kgr_kvm_x86_ops)->get_interrupt_shadow(vcpu);
+ /*
+ * an sti; sti; sequence only disable interrupts for the first
+ * instruction. So, if the last instruction, be it emulated or
+ * not, left the system with the INT_STI flag enabled, it
+ * means that the last instruction is an sti. We should not
+ * leave the flag on in this case. The same goes for mov ss
+ */
+ if (int_shadow & mask)
+ mask = 0;
+ if (unlikely(int_shadow || mask)) {
+ (*kgr_kvm_x86_ops)->set_interrupt_shadow(vcpu, mask);
+ if (!mask)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+}
+
+/* inlined */
+static bool kgr_inject_emulated_exception(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ if (ctxt->exception.vector == PF_VECTOR)
+ return kgr_kvm_propagate_fault(vcpu, &ctxt->exception);
+
+ if (ctxt->exception.error_code_valid)
+ kgr_kvm_queue_exception_e(vcpu, ctxt->exception.vector,
+ ctxt->exception.error_code);
+ else
+ kgr_kvm_queue_exception(vcpu, ctxt->exception.vector);
+ return false;
+}
+
+/* optimized */
+static bool kgr_reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+ bool write_fault_to_shadow_pgtable,
+ int emulation_type)
+{
+ gpa_t gpa = cr2;
+ kvm_pfn_t pfn;
+
+ if (emulation_type & EMULTYPE_NO_REEXECUTE)
+ return false;
+
+ if (!vcpu->arch.mmu.direct_map) {
+ /*
+ * Write permission should be allowed since only
+ * write access need to be emulated.
+ */
+ gpa = kgr_kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+
+ /*
+ * If the mapping is invalid in guest, let cpu retry
+ * it to generate fault.
+ */
+ if (gpa == UNMAPPED_GVA)
+ return true;
+ }
+
+ /*
+ * Do not retry the unhandleable instruction if it faults on the
+ * readonly host memory, otherwise it will goto a infinite loop:
+ * retry instruction -> write #PF -> emulation fail -> retry
+ * instruction -> ...
+ */
+ pfn = kgr_gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
+
+ /*
+ * If the instruction failed on the error pfn, it can not be fixed,
+ * report the error to userspace.
+ */
+ if (is_error_noslot_pfn(pfn))
+ return false;
+
+ kgr_kvm_release_pfn_clean(pfn);
+
+ /* The instructions are well-emulated on direct mmu. */
+ if (vcpu->arch.mmu.direct_map) {
+ unsigned int indirect_shadow_pages;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ if (indirect_shadow_pages)
+ kgr_kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ return true;
+ }
+
+ /*
+ * if emulation was due to access to shadowed page table
+ * and it failed try to unshadow page and re-enter the
+ * guest to let CPU execute the instruction.
+ */
+ kgr_kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ /*
+ * If the access faults on its page table, it can not
+ * be fixed by unprotecting shadow page and it should
+ * be reported to userspace.
+ */
+ return !write_fault_to_shadow_pgtable;
+}
+
+/* inlined */
+static bool kgr_retry_instruction(struct x86_emulate_ctxt *ctxt,
+ unsigned long cr2, int emulation_type)
+{
+ struct kvm_vcpu *vcpu = kgr_emul_to_vcpu(ctxt);
+ unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
+
+ last_retry_eip = vcpu->arch.last_retry_eip;
+ last_retry_addr = vcpu->arch.last_retry_addr;
+
+ /*
+ * If the emulation is caused by #PF and it is non-page_table
+ * writing instruction, it means the VM-EXIT is caused by shadow
+ * page protected, we can zap the shadow page and retry this
+ * instruction directly.
+ *
+ * Note: if the guest uses a non-page-table modifying instruction
+ * on the PDE that points to the instruction, then we will unmap
+ * the instruction and go to an infinite loop. So, we cache the
+ * last retried eip and the last fault address, if we meet the eip
+ * and the address again, we can break out of the potential infinite
+ * loop.
+ */
+ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
+
+ if (!(emulation_type & EMULTYPE_RETRY))
+ return false;
+
+ if (kgr_x86_page_table_writing_insn(ctxt))
+ return false;
+
+ if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
+ return false;
+
+ vcpu->arch.last_retry_eip = ctxt->eip;
+ vcpu->arch.last_retry_addr = cr2;
+
+ if (!vcpu->arch.mmu.direct_map)
+ gpa = kgr_kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+
+ kgr_kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ return true;
+}
+
+/* inlined */
+static void kgr_kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
+{
+ unsigned changed = vcpu->arch.hflags ^ emul_flags;
+
+ vcpu->arch.hflags = emul_flags;
+
+ if (changed & HF_SMM_MASK)
+ kgr_kvm_smm_changed(vcpu);
+}
+
+/* optimized */
+static int kgr_kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
+ unsigned long *db)
+{
+ u32 dr6 = 0;
+ int i;
+ u32 enable, rwlen;
+
+ enable = dr7;
+ rwlen = dr7 >> 16;
+ for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
+ if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
+ dr6 |= (1 << i);
+ return dr6;
+}
+
+/* inlined */
+static void kgr_kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ } else {
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
+ kgr_kvm_queue_exception(vcpu, DB_VECTOR);
+ }
+}
+
+/* inlined */
+static bool kgr_kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
+{
+ if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
+ (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
+ struct kvm_run *kvm_run = vcpu->run;
+ unsigned long eip = kgr_kvm_get_linear_rip(vcpu);
+ u32 dr6 = kgr_kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.guest_debug_dr7,
+ vcpu->arch.eff_db);
+
+ if (dr6 != 0) {
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.pc = eip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ return true;
+ }
+ }
+
+ if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
+ !(kgr_kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
+ unsigned long eip = kgr_kvm_get_linear_rip(vcpu);
+ u32 dr6 = kgr_kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.dr7,
+ vcpu->arch.db);
+
+ if (dr6 != 0) {
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= dr6 | DR6_RTM;
+ kgr_kvm_queue_exception(vcpu, DB_VECTOR);
+ *r = EMULATE_DONE;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/* inlined */
+static int kgr_dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
+{
+ return vcpu->run->request_interrupt_window &&
+ likely(!kgr_pic_in_kernel(vcpu->kvm));
+}
+
+/* inlined */
+static void kgr_post_kvm_run_save(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ kvm_run->if_flag = (kgr_kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+ kvm_run->flags = kgr_is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
+ kvm_run->cr8 = kgr_kvm_get_cr8(vcpu);
+ kvm_run->apic_base = kgr_kvm_get_apic_base(vcpu);
+ kvm_run->ready_for_interrupt_injection =
+ kgr_pic_in_kernel(vcpu->kvm) ||
+ kgr_kvm_vcpu_ready_for_interrupt_injection(vcpu);
+}
+
+/* inlined */
+static int kgr_inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
+{
+ int r;
+
+ /* try to reinject previous events if any */
+ if (vcpu->arch.exception.pending) {
+ kgr_trace_kvm_inj_exception(vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code);
+
+ if (kgr_exception_type(vcpu->arch.exception.nr) == KGR_EXCPT_FAULT)
+ kgr__kvm_set_rflags(vcpu, kgr_kvm_get_rflags(vcpu) |
+ X86_EFLAGS_RF);
+
+ if (vcpu->arch.exception.nr == DB_VECTOR &&
+ (vcpu->arch.dr7 & DR7_GD)) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kgr_kvm_update_dr7(vcpu);
+ }
+
+ (*kgr_kvm_x86_ops)->queue_exception(vcpu, vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code,
+ vcpu->arch.exception.reinject);
+ return 0;
+ }
+
+ if (vcpu->arch.nmi_injected) {
+ (*kgr_kvm_x86_ops)->set_nmi(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.interrupt.pending) {
+ (*kgr_kvm_x86_ops)->set_irq(vcpu);
+ return 0;
+ }
+
+ if (kgr_is_guest_mode(vcpu) && (*kgr_kvm_x86_ops)->check_nested_events) {
+ r = (*kgr_kvm_x86_ops)->check_nested_events(vcpu, req_int_win);
+ if (r != 0)
+ return r;
+ }
+
+ /* try to inject new event if pending */
+ if (vcpu->arch.smi_pending && !kgr_is_smm(vcpu)) {
+ vcpu->arch.smi_pending = false;
+ kgr_process_smi(vcpu);
+ } else if (vcpu->arch.nmi_pending && (*kgr_kvm_x86_ops)->nmi_allowed(vcpu)) {
+ --vcpu->arch.nmi_pending;
+ vcpu->arch.nmi_injected = true;
+ (*kgr_kvm_x86_ops)->set_nmi(vcpu);
+ } else if (kgr_kvm_cpu_has_injectable_intr(vcpu)) {
+ /*
+ * Because interrupts can be injected asynchronously, we are
+ * calling check_nested_events again here to avoid a race condition.
+ * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
+ * proposal and current concerns. Perhaps we should be setting
+ * KVM_REQ_EVENT only on certain events and not unconditionally?
+ */
+ if (kgr_is_guest_mode(vcpu) && (*kgr_kvm_x86_ops)->check_nested_events) {
+ r = (*kgr_kvm_x86_ops)->check_nested_events(vcpu, req_int_win);
+ if (r != 0)
+ return r;
+ }
+ if ((*kgr_kvm_x86_ops)->interrupt_allowed(vcpu)) {
+ kgr_kvm_queue_interrupt(vcpu, kgr_kvm_cpu_get_interrupt(vcpu),
+ false);
+ (*kgr_kvm_x86_ops)->set_irq(vcpu);
+ }
+ }
+ return 0;
+}
+
+/* inlined */
+static void kgr_process_smi_request(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+/* inlined */
+static void kgr_vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
+{
+ u64 eoi_exit_bitmap[4];
+
+ if (!kgr_kvm_apic_hw_enabled(vcpu->arch.apic))
+ return;
+
+ bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
+
+ if (kgr_irqchip_split(vcpu->kvm))
+ kgr_kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+ else {
+ if (vcpu->arch.apicv_active)
+ (*kgr_kvm_x86_ops)->sync_pir_to_irr(vcpu);
+ kgr_kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+ }
+ bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
+ kgr_vcpu_to_synic(vcpu)->vec_bitmap, 256);
+ (*kgr_kvm_x86_ops)->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+}
+
+/* inlined */
+static void kgr_kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ (*kgr_kvm_x86_ops)->tlb_flush(vcpu);
+}
+
+/* inlined */
+static inline int kgr_vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+ if (!kgr_kvm_arch_vcpu_runnable(vcpu) &&
+ (!(*kgr_kvm_x86_ops)->pre_block || (*kgr_kvm_x86_ops)->pre_block(vcpu) == 0)) {
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kgr_kvm_vcpu_block(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ if ((*kgr_kvm_x86_ops)->post_block)
+ (*kgr_kvm_x86_ops)->post_block(vcpu);
+
+ if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+ return 1;
+ }
+
+ kgr_kvm_apic_accept_events(vcpu);
+ switch(vcpu->arch.mp_state) {
+ case KVM_MP_STATE_HALTED:
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state =
+ KVM_MP_STATE_RUNNABLE;
+ case KVM_MP_STATE_RUNNABLE:
+ vcpu->arch.apf.halted = false;
+ break;
+ case KVM_MP_STATE_INIT_RECEIVED:
+ break;
+ default:
+ return -EINTR;
+ break;
+ }
+ return 1;
+}
+
+/* inlined */
+static inline bool kgr_kvm_vcpu_running(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted);
+}
+
+
+/* from arch/x86/kvm/mmu.c */
+/* inlined */
+static bool kgr_is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ if (vcpu->arch.mmu.direct_map || kgr_mmu_is_nested(vcpu))
+ return kgr_vcpu_match_mmio_gpa(vcpu, addr);
+
+ return kgr_vcpu_match_mmio_gva(vcpu, addr);
+}
+
+
+
+/* New, L1d flushing infrastructure */
+/*
+ * The L1D cache is 32 KiB on Skylake, but to flush it we have to read in
+ * 64 KiB because the replacement algorithm is not exactly LRU.
+ */
+#define KGR_L1D_CACHE_ORDER 4
+static void *__read_mostly kgr_empty_zero_pages;
+
+#define KGR_MSR_IA32_FLUSH_L1D 0x10b
+#define KGR_MSR_IA32_FLUSH_L1D_VALUE 0x00000001
+
+static bool kgr_has_l1d_flush_msr = false;
+
+static int kgr_kvm_l1d_flush_init(void)
+{
+ struct page *page;
+ unsigned int max_level;
+ unsigned int i;
+
+ if (kgr_never_needs_l1d_flush())
+ return 0;
+
+ /*
+ * Open code the MSR_IA32_FLUSH_L1D detection (corresponds to
+ * X86_FEATURE_FLUSH_L1D upstream).
+ *
+ * cpuid level is 0x00000007, sublevel is 0.
+ *
+ * Result can be found in bit 28 of %edx.
+ */
+ max_level = cpuid_eax(0);
+ if (max_level >= 0x00000007u) {
+ unsigned int edx;
+
+ edx = cpuid_edx(0x00000007u);
+ if (edx & (1u << 28)) {
+ kgr_has_l1d_flush_msr = true;
+ return 0;
+ }
+ }
+
+ /*
+ * No MSR_IA32_FLUSH_L1D available, use the slow method.
+ */
+ page = alloc_pages(GFP_KERNEL, KGR_L1D_CACHE_ORDER);
+ if (!page)
+ return -ENOMEM;
+ kgr_empty_zero_pages = page_address(page);
+
+ /*
+ * Initialize each page with a different pattern in
+ * order to protect against KSM in the nested
+ * virtualization case.
+ */
+ for (i = 0; i < 1u << KGR_L1D_CACHE_ORDER; ++i) {
+ memset(kgr_empty_zero_pages + i * PAGE_SIZE, i + 1,
+ PAGE_SIZE);
+ }
+
+ return 0;
+}
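+
+/*
+ * The same detection with the CPUID subleaf made explicit (illustrative
+ * only; cpuid_edx() above already defaults the subleaf to 0):
+ *
+ *	unsigned int eax, ebx, ecx, edx;
+ *
+ *	cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+ *	if (edx & (1u << 28))
+ *		kgr_has_l1d_flush_msr = true;
+ */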
+
+static void kgr_kvm_l1d_flush_cleanup(void)
+{
+ if (!kgr_empty_zero_pages)
+ return;
+ free_pages((unsigned long)kgr_empty_zero_pages, KGR_L1D_CACHE_ORDER);
+}
+
+static void kgr_kvm_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size;
+ bool need_l1d_flush;
+
+ need_l1d_flush = kgr_get_and_clear_vcpu_unconfined(vcpu, GFP_ATOMIC);
+
+ need_l1d_flush |= __this_cpu_read(kgr_kvm_cpu_l1tf_flush_l1d);
+ __this_cpu_write(kgr_kvm_cpu_l1tf_flush_l1d, false);
+
+ if (!need_l1d_flush)
+ return;
+
+ if (kgr_has_l1d_flush_msr) {
+ wrmsrl_safe(KGR_MSR_IA32_FLUSH_L1D,
+ KGR_MSR_IA32_FLUSH_L1D_VALUE);
+ return;
+ }
+
+ /* Should not happen, but be safe. */
+ if (WARN_ON_ONCE(!kgr_empty_zero_pages))
+ return;
+
+ /* FIXME: could this be boot_cpu_data.x86_cache_size * 2? */
+ size = PAGE_SIZE << KGR_L1D_CACHE_ORDER;
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n\t"
+ "11: \n\t"
+ "movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %1\n\t"
+ "jne 11b\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n\t"
+ "12:\n\t"
+ "movzbl (%0, %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %1\n\t"
+ "jne 12b\n\t"
+ "lfence\n\t"
+ : : "r" (kgr_empty_zero_pages), "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
+
+/* patched */
+int kgr_kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ struct kvm_vcpu *vcpu = kgr_emul_to_vcpu(ctxt);
+ void *data = val;
+ int r = X86EMUL_CONTINUE;
+
+ /*
+ * Fix CVE-2018-3646
+ * +2 lines
+ */
+ kgr_set_vcpu_unconfined(vcpu, GFP_KERNEL);
+
+ while (bytes) {
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
+ PFERR_WRITE_MASK,
+ exception);
+ unsigned offset = addr & (PAGE_SIZE-1);
+ unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
+ int ret;
+
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
+ ret = kgr_kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
+ if (ret < 0) {
+ r = X86EMUL_IO_NEEDED;
+ goto out;
+ }
+
+ bytes -= towrite;
+ data += towrite;
+ addr += towrite;
+ }
+out:
+ return r;
+}
+
+/* patched */
+int kgr_x86_emulate_instruction(struct kvm_vcpu *vcpu,
+ unsigned long cr2,
+ int emulation_type,
+ void *insn,
+ int insn_len)
+{
+ int r;
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ bool writeback = true;
+ bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+
+ /*
+ * Fix CVE-2018-3646
+ * +2 lines
+ */
+ kgr_set_vcpu_unconfined(vcpu, GFP_KERNEL);
+
+ /*
+ * Clear write_fault_to_shadow_pgtable here to ensure it is
+ * never reused.
+ */
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
+ kgr_kvm_clear_exception_queue(vcpu);
+
+ if (!(emulation_type & EMULTYPE_NO_DECODE)) {
+ kgr_init_emulate_ctxt(vcpu);
+
+ /*
+ * We will reenter on the same instruction since
+ * we do not set complete_userspace_io. This does not
+ * handle watchpoints yet, those would be handled in
+ * the emulate_ops.
+ */
+ if (kgr_kvm_vcpu_check_breakpoint(vcpu, &r))
+ return r;
+
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
+
+ r = kgr_x86_decode_insn(ctxt, insn, insn_len);
+
+ kgr_trace_kvm_emulate_insn_start(vcpu);
+ ++vcpu->stat.insn_emulation;
+ if (r != EMULATION_OK) {
+ if (emulation_type & EMULTYPE_TRAP_UD)
+ return EMULATE_FAIL;
+ if (kgr_reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+ emulation_type))
+ return EMULATE_DONE;
+ if (emulation_type & EMULTYPE_SKIP)
+ return EMULATE_FAIL;
+ return kgr_handle_emulation_failure(vcpu);
+ }
+ }
+
+ if (emulation_type & EMULTYPE_SKIP) {
+ kgr_kvm_rip_write(vcpu, ctxt->_eip);
+ if (ctxt->eflags & X86_EFLAGS_RF)
+ kgr_kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
+ return EMULATE_DONE;
+ }
+
+ if (kgr_retry_instruction(ctxt, cr2, emulation_type))
+ return EMULATE_DONE;
+
+ /* this is needed for vmware backdoor interface to work since it
+ changes registers values during IO operation */
+ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
+ kgr_emulator_invalidate_register_cache(ctxt);
+ }
+
+restart:
+ r = kgr_x86_emulate_insn(ctxt);
+
+ if (r == EMULATION_INTERCEPTED)
+ return EMULATE_DONE;
+
+ if (r == EMULATION_FAILED) {
+ if (kgr_reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+ emulation_type))
+ return EMULATE_DONE;
+
+ return kgr_handle_emulation_failure(vcpu);
+ }
+
+ if (ctxt->have_exception) {
+ r = EMULATE_DONE;
+ if (kgr_inject_emulated_exception(vcpu))
+ return r;
+ } else if (vcpu->arch.pio.count) {
+ if (!vcpu->arch.pio.in) {
+ /* FIXME: return into emulator if single-stepping. */
+ vcpu->arch.pio.count = 0;
+ } else {
+ writeback = false;
+ vcpu->arch.complete_userspace_io = kgr_complete_emulated_pio;
+ }
+ r = EMULATE_USER_EXIT;
+ } else if (vcpu->mmio_needed) {
+ if (!vcpu->mmio_is_write)
+ writeback = false;
+ r = EMULATE_USER_EXIT;
+ vcpu->arch.complete_userspace_io = kgr_complete_emulated_mmio;
+ } else if (r == EMULATION_RESTART)
+ goto restart;
+ else
+ r = EMULATE_DONE;
+
+ if (writeback) {
+ unsigned long rflags = (*kgr_kvm_x86_ops)->get_rflags(vcpu);
+ bool tf = ((ctxt->regs_valid & (1U << VCPU_REGS_TF)) != 0);
+
+ ctxt->regs_valid &= ~(1U << VCPU_REGS_TF);
+ kgr_toggle_interruptibility(vcpu, ctxt->interruptibility);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ if (vcpu->arch.hflags != ctxt->emul_flags)
+ kgr_kvm_set_hflags(vcpu, ctxt->emul_flags);
+ kgr_kvm_rip_write(vcpu, ctxt->eip);
+ if (r == EMULATE_DONE &&
+ (tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ kgr_kvm_vcpu_do_singlestep(vcpu, &r);
+ if (!ctxt->have_exception ||
+ kgr_exception_type(ctxt->exception.vector) == KGR_EXCPT_TRAP)
+ kgr__kvm_set_rflags(vcpu, ctxt->eflags);
+
+ /*
+ * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
+ * do nothing, and it will be requested again as soon as
+ * the shadow expires. But we still need to check here,
+ * because POPF has no interrupt shadow.
+ */
+ if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ } else
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
+
+ return r;
+}
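+
+/*
+ * The kgr_set_vcpu_unconfined() call marked above is the functional
+ * change in this function: instruction emulation may pull host data into
+ * the L1D cache, so the vCPU is flagged for an L1D flush before the next
+ * VM entry (performed in kgr_vcpu_enter_guest() below).
+ */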
+
+/* patched, not inlined, but caller also patched */
+static int kgr_vcpu_enter_guest(struct kvm_vcpu *vcpu)
+{
+ int r;
+ bool req_int_win =
+ kgr_dm_request_for_irq_injection(vcpu) &&
+ kgr_kvm_cpu_accept_dm_intr(vcpu);
+
+ bool req_immediate_exit = false;
+
+ if (vcpu->requests) {
+ if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
+ kgr_kvm_mmu_unload(vcpu);
+ if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
+ kgr__kvm_migrate_timers(vcpu);
+ if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+ kgr_kvm_gen_update_masterclock(vcpu->kvm);
+ if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
+ kgr_kvm_gen_kvmclock_update(vcpu);
+ if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
+ r = kgr_kvm_guest_time_update(vcpu);
+ if (unlikely(r))
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
+ kgr_kvm_mmu_sync_roots(vcpu);
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+ kgr_kvm_vcpu_flush_tlb(vcpu);
+ if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
+ vcpu->fpu_active = 0;
+ (*kgr_kvm_x86_ops)->fpu_deactivate(vcpu);
+ }
+ if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
+ /* Page is swapped out. Do synthetic halt */
+ vcpu->arch.apf.halted = true;
+ r = 1;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+ kgr_record_steal_time(vcpu);
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ kgr_process_smi_request(vcpu);
+ if (kvm_check_request(KVM_REQ_NMI, vcpu))
+ kgr_process_nmi(vcpu);
+ if (kvm_check_request(KVM_REQ_PMU, vcpu))
+ kgr_kvm_pmu_handle_event(vcpu);
+ if (kvm_check_request(KVM_REQ_PMI, vcpu))
+ kgr_kvm_pmu_deliver_pmi(vcpu);
+ if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
+ BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
+ if (test_bit(vcpu->arch.pending_ioapic_eoi,
+ vcpu->arch.ioapic_handled_vectors)) {
+ vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
+ vcpu->run->eoi.vector =
+ vcpu->arch.pending_ioapic_eoi;
+ r = 0;
+ goto out;
+ }
+ }
+ if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
+ kgr_vcpu_scan_ioapic(vcpu);
+ if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
+ kgr_kvm_vcpu_reload_apic_access_page(vcpu);
+ if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ r = 0;
+ goto out;
+ }
+ }
+
+ /*
+ * KVM_REQ_EVENT is not set when posted interrupts are set by
+ * VT-d hardware, so we have to update RVI unconditionally.
+ */
+ if (kgr_kvm_lapic_enabled(vcpu)) {
+ /*
+ * Update architecture specific hints for APIC
+ * virtual interrupt delivery.
+ */
+ if (vcpu->arch.apicv_active)
+ (*kgr_kvm_x86_ops)->hwapic_irr_update(vcpu,
+ kgr_kvm_lapic_find_highest_irr(vcpu));
+ }
+
+ if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+ kgr_kvm_apic_accept_events(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ r = 1;
+ goto out;
+ }
+
+ if (kgr_inject_pending_event(vcpu, req_int_win) != 0)
+ req_immediate_exit = true;
+ else {
+ /* Enable NMI/IRQ window open exits if needed.
+ *
+ * SMIs have two cases: 1) they can be nested, and
+ * then there is nothing to do here because RSM will
+ * cause a vmexit anyway; 2) or the SMI can be pending
+ * because inject_pending_event has completed the
+ * injection of an IRQ or NMI from the previous vmexit,
+ * and then we request an immediate exit to inject the SMI.
+ */
+ if (vcpu->arch.smi_pending && !kgr_is_smm(vcpu))
+ req_immediate_exit = true;
+ if (vcpu->arch.nmi_pending)
+ (*kgr_kvm_x86_ops)->enable_nmi_window(vcpu);
+ if (kgr_kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
+ (*kgr_kvm_x86_ops)->enable_irq_window(vcpu);
+ }
+
+ if (kgr_kvm_lapic_enabled(vcpu)) {
+ kgr_update_cr8_intercept(vcpu);
+ kgr_kvm_lapic_sync_to_vapic(vcpu);
+ }
+ }
+
+ r = kgr_kvm_mmu_reload(vcpu);
+ if (unlikely(r)) {
+ goto cancel_injection;
+ }
+
+ preempt_disable();
+
+ (*kgr_kvm_x86_ops)->prepare_guest_switch(vcpu);
+
+ if (vcpu->fpu_active)
+ kgr_kvm_load_guest_fpu(vcpu);
+ vcpu->mode = IN_GUEST_MODE;
+
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+
+	/* We should set ->mode before checking ->requests;
+	 * see the comment in make_all_cpus_request.
+	 */
+ smp_mb__after_srcu_read_unlock();
+
+ local_irq_disable();
+
+ if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
+ || need_resched() || signal_pending(current)) {
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ smp_wmb();
+ local_irq_enable();
+ preempt_enable();
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = 1;
+ goto cancel_injection;
+ }
+
+ kgr_kvm_load_guest_xcr0(vcpu);
+
+ if (req_immediate_exit) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ smp_send_reschedule(vcpu->cpu);
+ }
+
+ kgr_trace_kvm_entry(vcpu->vcpu_id);
+ kgr_wait_lapic_expire(vcpu);
+ __kvm_guest_enter();
+
+ if (unlikely(vcpu->arch.switch_db_regs)) {
+ set_debugreg(0, 7);
+ set_debugreg(vcpu->arch.eff_db[0], 0);
+ set_debugreg(vcpu->arch.eff_db[1], 1);
+ set_debugreg(vcpu->arch.eff_db[2], 2);
+ set_debugreg(vcpu->arch.eff_db[3], 3);
+ set_debugreg(vcpu->arch.dr6, 6);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
+ }
+
+ /*
+ * Fix CVE-2018-3646
+ * +3 lines
+ */
+ if (!kgr_never_needs_l1d_flush())
+ kgr_kvm_l1d_flush(vcpu);
+
+ (*kgr_kvm_x86_ops)->run(vcpu);
+
+ /*
+ * Do this here before restoring debug registers on the host. And
+ * since we do this before handling the vmexit, a DR access vmexit
+ * can (a) read the correct value of the debug registers, (b) set
+ * KVM_DEBUGREG_WONT_EXIT again.
+ */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
+ WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+ (*kgr_kvm_x86_ops)->sync_dirty_debug_regs(vcpu);
+ kgr_kvm_update_dr0123(vcpu);
+ kgr_kvm_update_dr6(vcpu);
+ kgr_kvm_update_dr7(vcpu);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
+ }
+
+ /*
+ * If the guest has used debug registers, at least dr7
+ * will be disabled while returning to the host.
+ * If we don't have active breakpoints in the host, we don't
+ * care about the messed up debug address registers. But if
+ * we have some of them active, restore the old state.
+ */
+ if (hw_breakpoint_active())
+ hw_breakpoint_restore();
+
+ vcpu->arch.last_guest_tsc = kgr_kvm_read_l1_tsc(vcpu, rdtsc());
+
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ smp_wmb();
+
+ kgr_kvm_put_guest_xcr0(vcpu);
+
+ /* Interrupt is enabled by handle_external_intr() */
+ (*kgr_kvm_x86_ops)->handle_external_intr(vcpu);
+
+ ++vcpu->stat.exits;
+
+ /*
+ * We must have an instruction between local_irq_enable() and
+ * kvm_guest_exit(), so the timer interrupt isn't delayed by
+ * the interrupt shadow. The stat.exits increment will do nicely.
+ * But we need to prevent reordering, hence this barrier():
+ */
+ barrier();
+
+ kvm_guest_exit();
+
+ preempt_enable();
+
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ /*
+ * Profile KVM exit RIPs:
+ */
+ if (unlikely(prof_on == KVM_PROFILING)) {
+ unsigned long rip = kgr_kvm_rip_read(vcpu);
+ profile_hit(KVM_PROFILING, (void *)rip);
+ }
+
+ if (unlikely(vcpu->arch.tsc_always_catchup))
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ if (vcpu->arch.apic_attention)
+ kgr_kvm_lapic_sync_from_vapic(vcpu);
+
+ r = (*kgr_kvm_x86_ops)->handle_exit(vcpu);
+ return r;
+
+cancel_injection:
+ (*kgr_kvm_x86_ops)->cancel_injection(vcpu);
+ if (unlikely(vcpu->arch.apic_attention))
+ kgr_kvm_lapic_sync_from_vapic(vcpu);
+out:
+ return r;
+}
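+
+/*
+ * Placement note for the CVE-2018-3646 hunk above: the L1D flush is
+ * issued with interrupts disabled, immediately before
+ * (*kgr_kvm_x86_ops)->run(), i.e. after the last point at which host
+ * code could have repopulated the cache ahead of the VM entry.
+ */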
+
+/* patched, inlined */
+static int kgr_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ int r;
+ struct kvm *kvm = vcpu->kvm;
+
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ /*
+ * Fix CVE-2018-3646
+ * +1 line
+ */
+ kgr_set_vcpu_unconfined(vcpu, GFP_KERNEL);
+
+ for (;;) {
+ if (kgr_kvm_vcpu_running(vcpu)) {
+ r = kgr_vcpu_enter_guest(vcpu);
+ } else {
+ r = kgr_vcpu_block(kvm, vcpu);
+ }
+
+ if (r <= 0)
+ break;
+
+ clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ if (kgr_kvm_cpu_has_pending_timer(vcpu))
+ kgr_kvm_inject_pending_timer_irqs(vcpu);
+
+ if (kgr_dm_request_for_irq_injection(vcpu) &&
+ kgr_kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
+ r = 0;
+ vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+ ++vcpu->stat.request_irq_exits;
+ break;
+ }
+
+ kgr_kvm_check_async_pf_completion(vcpu);
+
+ if (signal_pending(current)) {
+ r = -EINTR;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ break;
+ }
+ if (need_resched()) {
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ cond_resched();
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ }
+ }
+
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+
+ return r;
+}
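+
+/*
+ * The vCPU is also marked unconfined when the run loop is (re)entered
+ * from the ioctl path, since arbitrary host code has run since the last
+ * VM exit and may have left sensitive data in the L1D.
+ */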
+
+/* patched, calls inlined vcpu_run() */
+int kgr_kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ struct fpu *fpu = &current->thread.fpu;
+ int r;
+ sigset_t sigsaved;
+
+ fpu__activate_curr(fpu);
+
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+ if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
+ kgr_kvm_vcpu_block(vcpu);
+ kgr_kvm_apic_accept_events(vcpu);
+ clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+ r = -EAGAIN;
+ goto out;
+ }
+
+ /* re-sync apic's tpr */
+ if (!kgr_lapic_in_kernel(vcpu)) {
+ if (kgr_kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
+ r = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (unlikely(vcpu->arch.complete_userspace_io)) {
+ int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
+ vcpu->arch.complete_userspace_io = NULL;
+ r = cui(vcpu);
+ if (r <= 0)
+ goto out;
+ } else
+ WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
+
+ r = kgr_vcpu_run(vcpu);
+
+out:
+ kgr_post_kvm_run_save(vcpu);
+ if (vcpu->sigset_active)
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+ return r;
+}
+
+/* patched */
+void kgr_kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ /*
+ * Fix CVE-2018-3646
+ * +1 line
+ */
+ kgr_set_vcpu_unconfined(vcpu, GFP_ATOMIC);
+ (*kgr_kvm_x86_ops)->sched_in(vcpu, cpu);
+}
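+
+/*
+ * kvm_arch_sched_in() is called from a preempt notifier and must not
+ * sleep, hence GFP_ATOMIC here in contrast to the GFP_KERNEL callers
+ * above. Being scheduled in again implies other tasks may have run on
+ * this CPU, so the vCPU is marked for a flush.
+ */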
+
+/* patched */
+void kgr_kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
+
+ kgr_kvmclock_reset(vcpu);
+
+ /*
+ * Fix CVE-2018-3646
+ * +1 line
+ */
+ kgr_free_vcpu_unconfined_shadow(vcpu);
+ (*kgr_kvm_x86_ops)->vcpu_free(vcpu);
+ free_cpumask_var(wbinvd_dirty_mask);
+}
+
+/* patched */
+struct kvm_vcpu *kgr_kvm_arch_vcpu_create(struct kvm *kvm,
+ unsigned int id)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+ printk_once(KERN_WARNING
+ "kvm: SMP vm created on host with unstable TSC; "
+ "guest TSC will not be reliable\n");
+
+ vcpu = (*kgr_kvm_x86_ops)->vcpu_create(kvm, id);
+ /*
+ * Fix CVE-2018-3646
+ * +2 lines
+ */
+ if (!IS_ERR(vcpu))
+ kgr_set_vcpu_unconfined(vcpu, GFP_KERNEL);
+
+ return vcpu;
+}
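+
+/*
+ * A freshly created vCPU starts out marked unconfined as well; the
+ * IS_ERR() check keeps the marking away from error pointers returned by
+ * ->vcpu_create().
+ */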
+
+/* patched */
+int kgr_kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+ void *insn, int insn_len)
+{
+ int r, emulation_type = EMULTYPE_RETRY;
+ enum emulation_result er;
+
+ /*
+ * Fix CVE-2018-3646
+ * +1 line
+ */
+ kgr_set_vcpu_unconfined(vcpu, GFP_KERNEL);
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+ if (r < 0)
+ goto out;
+
+ if (!r) {
+ r = 1;
+ goto out;
+ }
+
+ if (kgr_is_mmio_page_fault(vcpu, cr2))
+ emulation_type = 0;
+
+ er = kgr_x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
+
+ switch (er) {
+ case EMULATE_DONE:
+ return 1;
+ case EMULATE_USER_EXIT:
+ ++vcpu->stat.mmio_exits;
+ /* fall through */
+ case EMULATE_FAIL:
+ return 0;
+ default:
+ BUG();
+ }
+out:
+ return r;
+}
+
+
+static int kgr_patch_bsc1099306_kvm_kallsyms(void)
+{
+ unsigned long addr;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(kgr_funcs); i++) {
+ /* mod_find_symname would be nice, but it is not exported */
+ addr = kallsyms_lookup_name(kgr_funcs[i].name);
+ if (!addr) {
+ pr_err("kgraft-patch: symbol %s not resolved\n",
+ kgr_funcs[i].name);
+ return -ENOENT;
+ }
+
+ *(kgr_funcs[i].addr) = (void *)addr;
+ }
+
+ return 0;
+}
+
+static int kgr_patch_bsc1099306_kvm_module_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct module *mod = data;
+ int ret;
+
+ if (action != MODULE_STATE_COMING || strcmp(mod->name, KGR_PATCHED_MODULE))
+ return 0;
+
+ ret = kgr_patch_bsc1099306_kvm_kallsyms();
+ WARN(ret, "kgraft-patch: delayed kallsyms lookup failed. System is broken and can crash.\n");
+
+ return ret;
+}
+
+static struct notifier_block kgr_patch_bsc1099306_kvm_module_nb = {
+ .notifier_call = kgr_patch_bsc1099306_kvm_module_notify,
+ .priority = INT_MIN+1,
+};
+
+int __kgr_patch_bsc1099306_kvm_init(void)
+{
+ int ret;
+
+ ret = kgr_kvm_l1d_flush_init();
+ if (ret)
+ return ret;
+
+ mutex_lock(&module_mutex);
+ if (find_module(KGR_PATCHED_MODULE)) {
+ ret = kgr_patch_bsc1099306_kvm_kallsyms();
+ if (ret)
+ goto out;
+ }
+
+ ret = register_module_notifier(&kgr_patch_bsc1099306_kvm_module_nb);
+out:
+ mutex_unlock(&module_mutex);
+ if (ret)
+ kgr_kvm_l1d_flush_cleanup();
+ return ret;
+}
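+
+/*
+ * The locking above closes the race between "kvm.ko already loaded" and
+ * "kvm.ko loads later": the immediate kallsyms resolution and the
+ * registration of the MODULE_STATE_COMING notifier both happen under
+ * module_mutex, so a concurrently loading kvm.ko cannot slip through
+ * unnoticed.
+ */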
+
+void __kgr_patch_bsc1099306_kvm_cleanup(void)
+{
+ unregister_module_notifier(&kgr_patch_bsc1099306_kvm_module_nb);
+ kgr_kvm_l1d_flush_cleanup();
+}
+
+#endif /* IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM) */
diff --git a/bsc1099306/kgr_patch_bsc1099306_kvm.h b/bsc1099306/kgr_patch_bsc1099306_kvm.h
new file mode 100644
index 0000000..9f27f36
--- /dev/null
+++ b/bsc1099306/kgr_patch_bsc1099306_kvm.h
@@ -0,0 +1,51 @@
+#ifndef _KGR_PATCH_BSC1099306_KVM_H
+#define _KGR_PATCH_BSC1099306_KVM_H
+
+#if IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM)
+
+#include <linux/kvm_types.h>
+
+int __kgr_patch_bsc1099306_kvm_init(void);
+void __kgr_patch_bsc1099306_kvm_cleanup(void);
+
+
+struct x86_emulate_ctxt;
+struct x86_exception;
+
+int kgr_kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val,
+ unsigned int bytes,
+ struct x86_exception *exception);
+int kgr_x86_emulate_instruction(struct kvm_vcpu *vcpu,
+ unsigned long cr2,
+ int emulation_type,
+ void *insn,
+ int insn_len);
+int kgr_kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+void kgr_kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu);
+void kgr_kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
+struct kvm_vcpu *kgr_kvm_arch_vcpu_create(struct kvm *kvm,
+ unsigned int id);
+int kgr_kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+ void *insn, int insn_len);
+
+#define __KGR_PATCH_BSC1099306_KVM_FUNCS \
+ KGR_PATCH_OBJ(kvm_write_guest_virt_system, \
+ kgr_kvm_write_guest_virt_system, "kvm"), \
+ KGR_PATCH_OBJ(x86_emulate_instruction, \
+ kgr_x86_emulate_instruction, "kvm"), \
+ KGR_PATCH_OBJ(kvm_arch_vcpu_ioctl_run, \
+ kgr_kvm_arch_vcpu_ioctl_run, "kvm"), \
+ KGR_PATCH_OBJ(kvm_arch_sched_in, \
+ kgr_kvm_arch_sched_in, "kvm"), \
+ KGR_PATCH_OBJ(kvm_arch_vcpu_free, kgr_kvm_arch_vcpu_free, \
+ "kvm"), \
+ KGR_PATCH_OBJ(kvm_arch_vcpu_create, kgr_kvm_arch_vcpu_create, \
+ "kvm"), \
+ KGR_PATCH_OBJ(kvm_mmu_page_fault, kgr_kvm_mmu_page_fault, \
+ "kvm"), \
+
+
+#endif /* IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM) */
+
+#endif /* _KGR_PATCH_BSC1099306_KVM_H */
diff --git a/bsc1099306/kgr_patch_bsc1099306_kvm_intel.c b/bsc1099306/kgr_patch_bsc1099306_kvm_intel.c
new file mode 100644
index 0000000..6641f8c
--- /dev/null
+++ b/bsc1099306/kgr_patch_bsc1099306_kvm_intel.c
@@ -0,0 +1,1325 @@
+/*
+ * kgraft_patch_bsc1099306_kvm_intel
+ *
+ * Fix for CVE-2018-3646 (kvm_intel.ko part), bsc#1099306
+ *
+ * Copyright (c) 2018 SUSE
+ * Author: Nicolai Stange <nstange@suse.de>
+ *
+ * Based on the original Linux kernel code. Other copyrights apply.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM)
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/kvm_host.h>
+#include <asm/vmx.h>
+#include <linux/highmem.h>
+#include <asm/segment.h>
+#include "kgr_patch_bsc1099306_kvm_intel.h"
+#include "bsc1099306.h"
+
+#if !IS_MODULE(CONFIG_KVM_INTEL)
+#error "Live patch supports only CONFIG_KVM_INTEL=m"
+#endif
+
+#define KGR_PATCHED_MODULE "kvm_intel"
+
+
+struct vcpu_vmx;
+struct vmcs12;
+
+static bool *kgr_kvm_rebooting;
+static asmlinkage void (*kgr_kvm_spurious_fault)(void);
+struct page* (*kgr_kvm_vcpu_gfn_to_page)(struct kvm_vcpu *vcpu, gfn_t gfn);
+static void (*kgr_kvm_release_page_dirty)(struct page *page);
+static void (*kgr_kvm_release_page_clean)(struct page *page);
+static bool (*kgr_kvm_valid_efer)(struct kvm_vcpu *vcpu, u64 efer);
+static int (*kgr_kvm_vcpu_halt)(struct kvm_vcpu *vcpu);
+static int (*kgr_kvm_set_shared_msr)(unsigned slot, u64 value, u64 mask);
+
+static bool *kgr_enable_ept;
+static bool *kgr_enable_shadow_vmcs;
+static int (*kgr_nested_vmx_check_permission)(struct kvm_vcpu *vcpu);
+static void (*kgr_nested_vmx_failInvalid)(struct kvm_vcpu *vcpu);
+static void (*kgr_skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+static void (*kgr_copy_shadow_to_vmcs12)(struct vcpu_vmx *vmx);
+static void (*kgr_nested_vmx_failValid)(struct kvm_vcpu *vcpu,
+ u32 vm_instruction_error);
+static int (*kgr_nested_vmx_check_msr_switch)(struct kvm_vcpu *vcpu,
+ unsigned long count_field,
+ unsigned long addr_field);
+static void (*kgr_nested_vmx_entry_failure)(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ u32 reason,
+ unsigned long qualification);
+static struct vmcs* (*kgr_alloc_vmcs_cpu)(int cpu);
+static void (*kgr_vmcs_clear)(struct vmcs *vmcs);
+static void (*kgr_vmx_vcpu_put)(struct kvm_vcpu *vcpu);
+static void (*kgr_vmx_vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+static void (*kgr_prepare_vmcs02)(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12);
+static u32 (*kgr_nested_vmx_load_msr)(struct kvm_vcpu *vcpu, u64 gpa,
+ u32 count);
+static void (*kgr_vmx_load_vmcs01)(struct kvm_vcpu *vcpu);
+static void (*kgr_vmcs_writel)(unsigned long field, unsigned long value);
+
+
+static struct {
+ char *name;
+ void **addr;
+} kgr_funcs[] = {
+ { "kvm:kvm_rebooting", (void *)&kgr_kvm_rebooting },
+ { "kvm:kvm_spurious_fault", (void *)&kgr_kvm_spurious_fault },
+ { "kvm:kvm_vcpu_gfn_to_page", (void *)&kgr_kvm_vcpu_gfn_to_page },
+ { "kvm:kvm_release_page_dirty", (void *)&kgr_kvm_release_page_dirty },
+ { "kvm:kvm_release_page_clean", (void *)&kgr_kvm_release_page_clean },
+ { "kvm:kvm_valid_efer", (void *)&kgr_kvm_valid_efer },
+ { "kvm:kvm_vcpu_halt", (void *)&kgr_kvm_vcpu_halt },
+ { "kvm:kvm_set_shared_msr", (void *)&kgr_kvm_set_shared_msr },
+
+ { "kvm_intel:enable_ept", (void *)&kgr_enable_ept },
+ { "kvm_intel:enable_shadow_vmcs", (void *)&kgr_enable_shadow_vmcs },
+ { "kvm_intel:nested_vmx_check_permission",
+ (void *)&kgr_nested_vmx_check_permission },
+ { "kvm_intel:nested_vmx_failInvalid",
+ (void *)&kgr_nested_vmx_failInvalid },
+ { "kvm_intel:skip_emulated_instruction",
+ (void *)&kgr_skip_emulated_instruction },
+ { "kvm_intel:copy_shadow_to_vmcs12",
+ (void *)&kgr_copy_shadow_to_vmcs12 },
+ { "kvm_intel:nested_vmx_failValid",
+ (void *)&kgr_nested_vmx_failValid },
+ { "kvm_intel:nested_vmx_check_msr_switch",
+ (void *)&kgr_nested_vmx_check_msr_switch },
+ { "kvm_intel:nested_vmx_entry_failure",
+ (void *)&kgr_nested_vmx_entry_failure },
+ { "kvm_intel:alloc_vmcs_cpu", (void *)&kgr_alloc_vmcs_cpu },
+ { "kvm_intel:vmcs_clear", (void *)&kgr_vmcs_clear },
+ { "kvm_intel:vmx_vcpu_put", (void *)&kgr_vmx_vcpu_put },
+ { "kvm_intel:vmx_vcpu_load", (void *)&kgr_vmx_vcpu_load },
+ { "kvm_intel:prepare_vmcs02", (void *)&kgr_prepare_vmcs02 },
+ { "kvm_intel:nested_vmx_load_msr", (void *)&kgr_nested_vmx_load_msr },
+ { "kvm_intel:vmx_load_vmcs01", (void *)&kgr_vmx_load_vmcs01 },
+ { "kvm_intel:vmcs_writel", (void *)&kgr_vmcs_writel },
+};
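+
+/*
+ * The "module:symbol" form restricts kallsyms_lookup_name() to symbols
+ * belonging to the named module and also allows resolving symbols the
+ * modules do not export.  Patched code then always goes through the
+ * kgr_* pointer (e.g. "if (!*kgr_enable_ept)") rather than referencing
+ * the symbol directly, so the live patch carries no hard module
+ * dependency on kvm.ko or kvm_intel.ko.
+ */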
+
+
+/* from arch/x86/include/asm/kvm_host.h */
+/*
+ * Avoid module dependency on kvm.ko.
+ *
+ * This is the original macro but with the load from kvm_rebooting
+ * replaced by an indirect load from *kgr_kvm_rebooting and the call
+ * to kvm_spurious_fault() replaced by an indirect call to
+ * *kgr_kvm_spurious_fault.
+ */
+#define kgr____kvm_handle_fault_on_reboot(insn, cleanup_insn) \
+ "666: " insn "\n\t" \
+ "668: \n\t" \
+ ".pushsection .fixup, \"ax\" \n" \
+ "667: \n\t" \
+ cleanup_insn "\n\t" \
+ "pushq %%rax \n\t" \
+ "movq kgr_kvm_rebooting, %%rax\n\t" \
+ "cmpb $0, (%%rax) \n\t" \
+ "popq %%rax \n\t" \
+ "jne 668b \n\t" \
+ __ASM_SIZE(push) " $666b \n\t" \
+ "movq kgr_kvm_spurious_fault, %%rax \n\t" \
+ "call *%%rax\n\t" \
+ ".popsection \n\t" \
+ _ASM_EXTABLE(666b, 667b)
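+
+/*
+ * Unlike the original macro, the indirect variant above needs a scratch
+ * register for the kvm_rebooting load, hence the extra pushq/popq of
+ * %rax around the check in the fixup path.
+ */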
+
+
+/* from arch/x86/kvm/x86.h */
+/* inlined */
+static inline int kgr_is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return vcpu->arch.efer & EFER_LMA;
+#else
+ return 0;
+#endif
+}
+
+
+/* from arch/x86/kvm/cpuid.h */
+/* inlined */
+static inline int kgr_cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.maxphyaddr;
+}
+
+
+/* from arch/x86/kvm/kvm_cache_regs.h */
+static inline void kgr_enter_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags |= HF_GUEST_MASK;
+}
+
+static inline void kgr_leave_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags &= ~HF_GUEST_MASK;
+}
+
+
+/* from arch/x86/kvm/vmx.c */
+#define kgr__ex_clear(x, reg) \
+ kgr____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
+
+#define KGR_NR_AUTOLOAD_MSRS 8
+#define KGR_VMCS02_POOL_SIZE 1
+
+struct loaded_vmcs {
+ struct vmcs *vmcs;
+ int cpu;
+ int launched;
+ struct list_head loaded_vmcss_on_cpu_link;
+};
+
+struct shared_msr_entry {
+ unsigned index;
+ u64 data;
+ u64 mask;
+};
+
+typedef u64 natural_width;
+struct __packed vmcs12 {
+ /* According to the Intel spec, a VMCS region must start with the
+ * following two fields. Then follow implementation-specific data.
+ */
+ u32 revision_id;
+ u32 abort;
+
+ u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+ u32 padding[7]; /* room for future expansion */
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+ u64 vm_exit_msr_store_addr;
+ u64 vm_exit_msr_load_addr;
+ u64 vm_entry_msr_load_addr;
+ u64 tsc_offset;
+ u64 virtual_apic_page_addr;
+ u64 apic_access_addr;
+ u64 posted_intr_desc_addr;
+ u64 ept_pointer;
+ u64 eoi_exit_bitmap0;
+ u64 eoi_exit_bitmap1;
+ u64 eoi_exit_bitmap2;
+ u64 eoi_exit_bitmap3;
+ u64 xss_exit_bitmap;
+ u64 guest_physical_address;
+ u64 vmcs_link_pointer;
+ u64 guest_ia32_debugctl;
+ u64 guest_ia32_pat;
+ u64 guest_ia32_efer;
+ u64 guest_ia32_perf_global_ctrl;
+ u64 guest_pdptr0;
+ u64 guest_pdptr1;
+ u64 guest_pdptr2;
+ u64 guest_pdptr3;
+ u64 guest_bndcfgs;
+ u64 host_ia32_pat;
+ u64 host_ia32_efer;
+ u64 host_ia32_perf_global_ctrl;
+ u64 padding64[8]; /* room for future expansion */
+ /*
+ * To allow migration of L1 (complete with its L2 guests) between
+ * machines of different natural widths (32 or 64 bit), we cannot have
+	 * unsigned long fields with no explicit size. We use u64 (aliased
+ * natural_width) instead. Luckily, x86 is little-endian.
+ */
+ natural_width cr0_guest_host_mask;
+ natural_width cr4_guest_host_mask;
+ natural_width cr0_read_shadow;
+ natural_width cr4_read_shadow;
+ natural_width cr3_target_value0;
+ natural_width cr3_target_value1;
+ natural_width cr3_target_value2;
+ natural_width cr3_target_value3;
+ natural_width exit_qualification;
+ natural_width guest_linear_address;
+ natural_width guest_cr0;
+ natural_width guest_cr3;
+ natural_width guest_cr4;
+ natural_width guest_es_base;
+ natural_width guest_cs_base;
+ natural_width guest_ss_base;
+ natural_width guest_ds_base;
+ natural_width guest_fs_base;
+ natural_width guest_gs_base;
+ natural_width guest_ldtr_base;
+ natural_width guest_tr_base;
+ natural_width guest_gdtr_base;
+ natural_width guest_idtr_base;
+ natural_width guest_dr7;
+ natural_width guest_rsp;
+ natural_width guest_rip;
+ natural_width guest_rflags;
+ natural_width guest_pending_dbg_exceptions;
+ natural_width guest_sysenter_esp;
+ natural_width guest_sysenter_eip;
+ natural_width host_cr0;
+ natural_width host_cr3;
+ natural_width host_cr4;
+ natural_width host_fs_base;
+ natural_width host_gs_base;
+ natural_width host_tr_base;
+ natural_width host_gdtr_base;
+ natural_width host_idtr_base;
+ natural_width host_ia32_sysenter_esp;
+ natural_width host_ia32_sysenter_eip;
+ natural_width host_rsp;
+ natural_width host_rip;
+ natural_width paddingl[8]; /* room for future expansion */
+ u32 pin_based_vm_exec_control;
+ u32 cpu_based_vm_exec_control;
+ u32 exception_bitmap;
+ u32 page_fault_error_code_mask;
+ u32 page_fault_error_code_match;
+ u32 cr3_target_count;
+ u32 vm_exit_controls;
+ u32 vm_exit_msr_store_count;
+ u32 vm_exit_msr_load_count;
+ u32 vm_entry_controls;
+ u32 vm_entry_msr_load_count;
+ u32 vm_entry_intr_info_field;
+ u32 vm_entry_exception_error_code;
+ u32 vm_entry_instruction_len;
+ u32 tpr_threshold;
+ u32 secondary_vm_exec_control;
+ u32 vm_instruction_error;
+ u32 vm_exit_reason;
+ u32 vm_exit_intr_info;
+ u32 vm_exit_intr_error_code;
+ u32 idt_vectoring_info_field;
+ u32 idt_vectoring_error_code;
+ u32 vm_exit_instruction_len;
+ u32 vmx_instruction_info;
+ u32 guest_es_limit;
+ u32 guest_cs_limit;
+ u32 guest_ss_limit;
+ u32 guest_ds_limit;
+ u32 guest_fs_limit;
+ u32 guest_gs_limit;
+ u32 guest_ldtr_limit;
+ u32 guest_tr_limit;
+ u32 guest_gdtr_limit;
+ u32 guest_idtr_limit;
+ u32 guest_es_ar_bytes;
+ u32 guest_cs_ar_bytes;
+ u32 guest_ss_ar_bytes;
+ u32 guest_ds_ar_bytes;
+ u32 guest_fs_ar_bytes;
+ u32 guest_gs_ar_bytes;
+ u32 guest_ldtr_ar_bytes;
+ u32 guest_tr_ar_bytes;
+ u32 guest_interruptibility_info;
+ u32 guest_activity_state;
+ u32 guest_sysenter_cs;
+ u32 host_ia32_sysenter_cs;
+ u32 vmx_preemption_timer_value;
+ u32 padding32[7]; /* room for future expansion */
+ u16 virtual_processor_id;
+ u16 posted_intr_nv;
+ u16 guest_es_selector;
+ u16 guest_cs_selector;
+ u16 guest_ss_selector;
+ u16 guest_ds_selector;
+ u16 guest_fs_selector;
+ u16 guest_gs_selector;
+ u16 guest_ldtr_selector;
+ u16 guest_tr_selector;
+ u16 guest_intr_status;
+ u16 host_es_selector;
+ u16 host_cs_selector;
+ u16 host_ss_selector;
+ u16 host_ds_selector;
+ u16 host_fs_selector;
+ u16 host_gs_selector;
+ u16 host_tr_selector;
+};
+
+struct vmcs02_list {
+ struct list_head list;
+ gpa_t vmptr;
+ struct loaded_vmcs vmcs02;
+};
+
+struct nested_vmx {
+ /* Has the level1 guest done vmxon? */
+ bool vmxon;
+ gpa_t vmxon_ptr;
+
+ /* The guest-physical address of the current VMCS L1 keeps for L2 */
+ gpa_t current_vmptr;
+ /* The host-usable pointer to the above */
+ struct page *current_vmcs12_page;
+ struct vmcs12 *current_vmcs12;
+ struct vmcs *current_shadow_vmcs;
+ /*
+ * Indicates if the shadow vmcs must be updated with the
+	 * data held by vmcs12
+ */
+ bool sync_shadow_vmcs;
+
+ /* vmcs02_list cache of VMCSs recently used to run L2 guests */
+ struct list_head vmcs02_pool;
+ int vmcs02_num;
+ u64 vmcs01_tsc_offset;
+ bool change_vmcs01_virtual_x2apic_mode;
+ /* L2 must run next, and mustn't decide to exit to L1. */
+ bool nested_run_pending;
+ /*
+ * Guest pages referred to in vmcs02 with host-physical pointers, so
+ * we must keep them pinned while L2 runs.
+ */
+ struct page *apic_access_page;
+ struct page *virtual_apic_page;
+ struct page *pi_desc_page;
+ struct pi_desc *pi_desc;
+ bool pi_pending;
+ u16 posted_intr_nv;
+
+ unsigned long *msr_bitmap;
+
+ struct hrtimer preemption_timer;
+ bool preemption_timer_expired;
+
+ /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
+ u64 vmcs01_debugctl;
+
+ u16 vpid02;
+ u16 last_vpid;
+
+ u32 nested_vmx_procbased_ctls_low;
+ u32 nested_vmx_procbased_ctls_high;
+ u32 nested_vmx_true_procbased_ctls_low;
+ u32 nested_vmx_secondary_ctls_low;
+ u32 nested_vmx_secondary_ctls_high;
+ u32 nested_vmx_pinbased_ctls_low;
+ u32 nested_vmx_pinbased_ctls_high;
+ u32 nested_vmx_exit_ctls_low;
+ u32 nested_vmx_exit_ctls_high;
+ u32 nested_vmx_true_exit_ctls_low;
+ u32 nested_vmx_entry_ctls_low;
+ u32 nested_vmx_entry_ctls_high;
+ u32 nested_vmx_true_entry_ctls_low;
+ u32 nested_vmx_misc_low;
+ u32 nested_vmx_misc_high;
+ u32 nested_vmx_ept_caps;
+ u32 nested_vmx_vpid_caps;
+};
+
+struct pi_desc {
+ u32 pir[8]; /* Posted interrupt requested */
+ union {
+ struct {
+ /* bit 256 - Outstanding Notification */
+ u16 on : 1,
+ /* bit 257 - Suppress Notification */
+ sn : 1,
+ /* bit 271:258 - Reserved */
+ rsvd_1 : 14;
+ /* bit 279:272 - Notification Vector */
+ u8 nv;
+ /* bit 287:280 - Reserved */
+ u8 rsvd_2;
+ /* bit 319:288 - Notification Destination */
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
+} __aligned(64);
+
+struct vcpu_vmx {
+ struct kvm_vcpu vcpu;
+ unsigned long host_rsp;
+ u8 fail;
+ bool nmi_known_unmasked;
+ u32 exit_intr_info;
+ u32 idt_vectoring_info;
+ ulong rflags;
+ struct shared_msr_entry *guest_msrs;
+ int nmsrs;
+ int save_nmsrs;
+ unsigned long host_idt_base;
+#ifdef CONFIG_X86_64
+ u64 msr_host_kernel_gs_base;
+ u64 msr_guest_kernel_gs_base;
+#endif
+ u32 vm_entry_controls_shadow;
+ u32 vm_exit_controls_shadow;
+ /*
+ * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+ * non-nested (L1) guest, it always points to vmcs01. For a nested
+ * guest (L2), it points to a different VMCS.
+ */
+ struct loaded_vmcs vmcs01;
+ struct loaded_vmcs *loaded_vmcs;
+ bool __launched; /* temporary, used in vmx_vcpu_run */
+ struct msr_autoload {
+ unsigned nr;
+ struct vmx_msr_entry guest[KGR_NR_AUTOLOAD_MSRS];
+ struct vmx_msr_entry host[KGR_NR_AUTOLOAD_MSRS];
+ } msr_autoload;
+ struct {
+ int loaded;
+ u16 fs_sel, gs_sel, ldt_sel;
+#ifdef CONFIG_X86_64
+ u16 ds_sel, es_sel;
+#endif
+ int gs_ldt_reload_needed;
+ int fs_reload_needed;
+ u64 msr_host_bndcfgs;
+ unsigned long vmcs_host_cr4; /* May not match real cr4 */
+ } host_state;
+ struct {
+ int vm86_active;
+ ulong save_rflags;
+ struct kvm_segment segs[8];
+ } rmode;
+ struct {
+ u32 bitmask; /* 4 bits per segment (1 bit per field) */
+ struct kvm_save_segment {
+ u16 selector;
+ unsigned long base;
+ u32 limit;
+ u32 ar;
+ } seg[8];
+ } segment_cache;
+ int vpid;
+ bool emulation_required;
+
+ /* Support for vnmi-less CPUs */
+ int soft_vnmi_blocked;
+ ktime_t entry_time;
+ s64 vnmi_blocked_time;
+ u32 exit_reason;
+
+ /* Posted interrupt descriptor */
+ struct pi_desc pi_desc;
+
+ /* Support for a guest hypervisor (nested VMX) */
+ struct nested_vmx nested;
+
+ /* Dynamic PLE window. */
+ int ple_window;
+ bool ple_window_dirty;
+
+ /* Support for PML */
+#define PML_ENTITY_NUM 512
+ struct page *pml_pg;
+
+ u64 current_tsc_ratio;
+
+ bool guest_pkru_valid;
+ u32 guest_pkru;
+ u32 host_pkru;
+
+ /*
+ * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+ * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+ * in msr_ia32_feature_control_valid_bits.
+ */
+ u64 msr_ia32_feature_control;
+ u64 msr_ia32_feature_control_valid_bits;
+};
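+
+/*
+ * The structure layouts above are duplicated from the patched kernel's
+ * vmx.c; kgr_to_vmx() below does a container_of() on vcpu pointers that
+ * were allocated by the running kvm_intel.ko, so these definitions have
+ * to stay in sync with that module's actual layout.
+ */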
+
+/* inlined */
+static inline struct vcpu_vmx *kgr_to_vmx(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_vmx, vcpu);
+}
+
+/* inlined */
+static inline struct vmcs12 *kgr_get_vmcs12(struct kvm_vcpu *vcpu)
+{
+ return kgr_to_vmx(vcpu)->nested.current_vmcs12;
+}
+
+/* inlined */
+static struct page *kgr_nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+ struct page *page = kgr_kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return NULL;
+
+ return page;
+}
+
+/* inlined */
+static void kgr_nested_release_page(struct page *page)
+{
+ kgr_kvm_release_page_dirty(page);
+}
+
+/* inlined */
+static void kgr_nested_release_page_clean(struct page *page)
+{
+ kgr_kvm_release_page_clean(page);
+}
+
+/* inlined */
+static inline bool kgr_nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+{
+ return vmcs12->cpu_based_vm_exec_control & bit;
+}
+
+/* inlined */
+static inline bool kgr_nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+{
+ return (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+ (vmcs12->secondary_vm_exec_control & bit);
+}
+
+/* inlined */
+static inline bool kgr_nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+ return kgr_nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
+/* inlined */
+static inline bool kgr_nested_cpu_has_vpid(struct vmcs12 *vmcs12)
+{
+ return kgr_nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
+}
+
+/* inlined */
+static inline bool kgr_nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+ return kgr_nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
+/* inlined */
+static inline bool kgr_nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+ return kgr_nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
+/* inlined */
+static inline bool kgr_nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
+/* inlined */
+static inline void kgr_loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
+{
+ kgr_vmcs_clear(loaded_vmcs->vmcs);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
+}
+
+/* inlined */
+static __always_inline unsigned long kgr_vmcs_readl(unsigned long field)
+{
+ unsigned long value;
+
+ asm volatile (kgr__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
+ : [thunk_target] "=a"(value) : "d"(field) : "cc");
+ return value;
+}
+
+/* inlined */
+static __always_inline u32 kgr_vmcs_read32(unsigned long field)
+{
+ return kgr_vmcs_readl(field);
+}
+
+/* inlined */
+static __always_inline u64 kgr_vmcs_read64(unsigned long field)
+{
+#ifdef CONFIG_X86_64
+ return kgr_vmcs_readl(field);
+#else
+ return kgr_vmcs_readl(field) | ((u64)kgr_vmcs_readl(field+1) << 32);
+#endif
+}
+
+/* inlined */
+static void kgr_vmcs_write16(unsigned long field, u16 value)
+{
+ kgr_vmcs_writel(field, value);
+}
+
+/* inlined */
+static void kgr_vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
+/* inlined */
+static inline bool kgr_vmx_control_verify(u32 control, u32 low, u32 high)
+{
+	/*
+	 * Bits that are 0 in 'high' must be 0 in control, and bits that
+	 * are 1 in 'low' must be 1 in control.
+	 */
+ return ((control & high) | low) == control;
+}
+
+#define KGR_VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define KGR_VMXON_CR4_ALWAYSON X86_CR4_VMXE
+
+/* inlined */
+static struct vmcs *kgr_alloc_vmcs(void)
+{
+ return kgr_alloc_vmcs_cpu(raw_smp_processor_id());
+}
+
+/* inlined */
+static bool kgr_page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return PAGE_ALIGNED(gpa) && !(gpa >> kgr_cpuid_maxphyaddr(vcpu));
+}
+
+/* inlined */
+static bool kgr_nested_exit_on_intr(struct kvm_vcpu *vcpu)
+{
+ return kgr_get_vmcs12(vcpu)->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK;
+}
+
+/* inlined */
+static bool kgr_nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
+{
+ return kgr_get_vmcs12(vcpu)->vm_exit_controls &
+ VM_EXIT_ACK_INTR_ON_EXIT;
+}
+
+/* inlined */
+static bool kgr_nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ unsigned long always_on = KGR_VMXON_CR0_ALWAYSON;
+ struct vmcs12 *vmcs12 = kgr_get_vmcs12(vcpu);
+
+ if (kgr_to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST &&
+ kgr_nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+ always_on &= ~(X86_CR0_PE | X86_CR0_PG);
+ return (val & always_on) == always_on;
+}
+
+/* inlined */
+static struct loaded_vmcs *kgr_nested_get_current_vmcs02(struct vcpu_vmx *vmx)
+{
+ struct vmcs02_list *item;
+ list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+ if (item->vmptr == vmx->nested.current_vmptr) {
+ list_move(&item->list, &vmx->nested.vmcs02_pool);
+ return &item->vmcs02;
+ }
+
+ if (vmx->nested.vmcs02_num >= max(KGR_VMCS02_POOL_SIZE, 1)) {
+ /* Recycle the least recently used VMCS. */
+ item = list_entry(vmx->nested.vmcs02_pool.prev,
+ struct vmcs02_list, list);
+ item->vmptr = vmx->nested.current_vmptr;
+ list_move(&item->list, &vmx->nested.vmcs02_pool);
+ return &item->vmcs02;
+ }
+
+ /* Create a new VMCS */
+ item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+ if (!item)
+ return NULL;
+ item->vmcs02.vmcs = kgr_alloc_vmcs();
+ if (!item->vmcs02.vmcs) {
+ kfree(item);
+ return NULL;
+ }
+ kgr_loaded_vmcs_init(&item->vmcs02);
+ item->vmptr = vmx->nested.current_vmptr;
+ list_add(&(item->list), &(vmx->nested.vmcs02_pool));
+ vmx->nested.vmcs02_num++;
+ return &item->vmcs02;
+}
+
+/* inlined */
+static int kgr_nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = kgr_to_vmx(vcpu);
+ if (vmx->nested.current_vmptr == -1ull) {
+ kgr_nested_vmx_failInvalid(vcpu);
+ kgr_skip_emulated_instruction(vcpu);
+ return 0;
+ }
+ return 1;
+}
+
+/* inlined */
+static bool kgr_nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = kgr_to_vmx(vcpu);
+ int maxphyaddr = kgr_cpuid_maxphyaddr(vcpu);
+
+ if (kgr_nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
+ vmcs12->apic_access_addr >> maxphyaddr)
+ return false;
+
+ /*
+ * Translate L1 physical address to host physical
+ * address for vmcs02. Keep the page pinned, so this
+ * physical address remains valid. We keep a reference
+ * to it so we can release it later.
+ */
+ if (vmx->nested.apic_access_page) /* shouldn't happen */
+ kgr_nested_release_page(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page =
+ kgr_nested_get_page(vcpu, vmcs12->apic_access_addr);
+ }
+
+ if (kgr_nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
+ vmcs12->virtual_apic_page_addr >> maxphyaddr)
+ return false;
+
+ if (vmx->nested.virtual_apic_page) /* shouldn't happen */
+ kgr_nested_release_page(vmx->nested.virtual_apic_page);
+ vmx->nested.virtual_apic_page =
+ kgr_nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
+
+ /*
+ * Failing the vm entry is _not_ what the processor does
+ * but it's basically the only possibility we have.
+ * We could still enter the guest if CR8 load exits are
+ * enabled, CR8 store exits are enabled, and virtualize APIC
+ * access is disabled; in this case the processor would never
+ * use the TPR shadow and we could simply clear the bit from
+ * the execution control. But such a configuration is useless,
+ * so let's keep the code simple.
+ */
+ if (!vmx->nested.virtual_apic_page)
+ return false;
+ }
+
+ if (kgr_nested_cpu_has_posted_intr(vmcs12)) {
+ if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
+ vmcs12->posted_intr_desc_addr >> maxphyaddr)
+ return false;
+
+ if (vmx->nested.pi_desc_page) { /* shouldn't happen */
+ kunmap(vmx->nested.pi_desc_page);
+ kgr_nested_release_page(vmx->nested.pi_desc_page);
+ }
+ vmx->nested.pi_desc_page =
+ kgr_nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
+ if (!vmx->nested.pi_desc_page)
+ return false;
+
+ vmx->nested.pi_desc =
+ (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
+ if (!vmx->nested.pi_desc) {
+ kgr_nested_release_page_clean(vmx->nested.pi_desc_page);
+ return false;
+ }
+ vmx->nested.pi_desc =
+ (struct pi_desc *)((void *)vmx->nested.pi_desc +
+ (unsigned long)(vmcs12->posted_intr_desc_addr &
+ (PAGE_SIZE - 1)));
+ }
+
+ return true;
+}
+
+/* inlined */
+static int kgr_nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!kgr_nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return 0;
+
+ if (!kgr_page_address_valid(vcpu, vmcs12->msr_bitmap))
+ return -EINVAL;
+
+ return 0;
+}
+
+/* inlined */
+static int kgr_nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!kgr_nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ !kgr_nested_cpu_has_apic_reg_virt(vmcs12) &&
+ !kgr_nested_cpu_has_vid(vmcs12) &&
+ !kgr_nested_cpu_has_posted_intr(vmcs12))
+ return 0;
+
+ /*
+ * If virtualize x2apic mode is enabled,
+ * virtualize apic access must be disabled.
+ */
+ if (kgr_nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ kgr_nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ return -EINVAL;
+
+ /*
+ * If virtual interrupt delivery is enabled,
+ * we must exit on external interrupts.
+ */
+ if (kgr_nested_cpu_has_vid(vmcs12) &&
+ !kgr_nested_exit_on_intr(vcpu))
+ return -EINVAL;
+
+ /*
+ * bits 15:8 should be zero in posted_intr_nv,
+ * the descriptor address has been already checked
+	 * the descriptor address has already been checked
+ */
+ if (kgr_nested_cpu_has_posted_intr(vmcs12) &&
+ (!kgr_nested_cpu_has_vid(vmcs12) ||
+ !kgr_nested_exit_intr_ack_set(vcpu) ||
+ vmcs12->posted_intr_nv & 0xff00))
+ return -EINVAL;
+
+ /* tpr shadow is needed by all apicv features. */
+ if (!kgr_nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return -EINVAL;
+
+ return 0;
+}
+
+/* inlined */
+static int kgr_nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (vmcs12->vm_exit_msr_load_count == 0 &&
+ vmcs12->vm_exit_msr_store_count == 0 &&
+ vmcs12->vm_entry_msr_load_count == 0)
+ return 0; /* Fast path */
+ if (kgr_nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
+ VM_EXIT_MSR_LOAD_ADDR) ||
+ kgr_nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
+ VM_EXIT_MSR_STORE_ADDR) ||
+ kgr_nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
+ VM_ENTRY_MSR_LOAD_ADDR))
+ return -EINVAL;
+ return 0;
+}
+
+
+
+/* patched */
+void kgr_vmx_save_host_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = kgr_to_vmx(vcpu);
+ int i;
+
+ /*
+ * Fix CVE-2018-3646
+ * +3 lines
+ *
+ * Note: we can't rely on the kgr_enable_ept pointer being
+ * valid when executing in the context of kvm.ko. Hence we
+ * must either check it here or live with L1D flushes on
+	 * !enable_ept hosts. With EPT disabled the guest never controls
+	 * the physical addresses walked by the hardware, so L1TF is not
+	 * exploitable and the flush would be pure overhead.
+	 */
+ if (!*kgr_enable_ept)
+ kgr_get_and_clear_vcpu_unconfined(vcpu, GFP_ATOMIC);
+
+ if (vmx->host_state.loaded)
+ return;
+
+ vmx->host_state.loaded = 1;
+ /*
+ * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
+ * allow segment selectors with cpl > 0 or ti == 1.
+ */
+ vmx->host_state.ldt_sel = kvm_read_ldt();
+ vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
+ savesegment(fs, vmx->host_state.fs_sel);
+ if (!(vmx->host_state.fs_sel & 7)) {
+ kgr_vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
+ vmx->host_state.fs_reload_needed = 0;
+ } else {
+ kgr_vmcs_write16(HOST_FS_SELECTOR, 0);
+ vmx->host_state.fs_reload_needed = 1;
+ }
+ savesegment(gs, vmx->host_state.gs_sel);
+ if (!(vmx->host_state.gs_sel & 7))
+ kgr_vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
+ else {
+ kgr_vmcs_write16(HOST_GS_SELECTOR, 0);
+ vmx->host_state.gs_ldt_reload_needed = 1;
+ }
+
+#ifdef CONFIG_X86_64
+ savesegment(ds, vmx->host_state.ds_sel);
+ savesegment(es, vmx->host_state.es_sel);
+#endif
+
+#ifdef CONFIG_X86_64
+ kgr_vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
+ kgr_vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
+#else
+ kgr_vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
+ kgr_vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
+#endif
+
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ if (kgr_is_long_mode(&vmx->vcpu))
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#endif
+ if (boot_cpu_has(X86_FEATURE_MPX))
+ rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
+ for (i = 0; i < vmx->save_nmsrs; ++i)
+ kgr_kvm_set_shared_msr(vmx->guest_msrs[i].index,
+ vmx->guest_msrs[i].data,
+ vmx->guest_msrs[i].mask);
+}
+
+/* patched */
+void kgr_vmx_handle_external_intr(struct kvm_vcpu *vcpu)
+{
+ u32 exit_intr_info = kgr_vmcs_read32(VM_EXIT_INTR_INFO);
+
+ /*
+ * If external interrupt exists, IF bit is set in rflags/eflags on the
+ * interrupt stack frame, and interrupt will be enabled on a return
+ * from interrupt handler.
+ */
+ if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
+ == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
+ unsigned int vector;
+ unsigned long entry;
+ gate_desc *desc;
+ struct vcpu_vmx *vmx = kgr_to_vmx(vcpu);
+#ifdef CONFIG_X86_64
+ unsigned long tmp;
+#endif
+
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ desc = (gate_desc *)vmx->host_idt_base + vector;
+ entry = gate_offset(*desc);
+ asm volatile(
+#ifdef CONFIG_X86_64
+ "mov %%" _ASM_SP ", %[sp]\n\t"
+ "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
+ "push $%c[ss]\n\t"
+ "push %[sp]\n\t"
+#endif
+ "pushf\n\t"
+ "orl $0x200, (%%" _ASM_SP ")\n\t"
+ __ASM_SIZE(push) " $%c[cs]\n\t"
+ "call *%[entry]\n\t"
+ :
+#ifdef CONFIG_X86_64
+ [sp]"=&r"(tmp)
+#endif
+ :
+ [entry]"r"(entry),
+ [ss]"i"(__KERNEL_DS),
+ [cs]"i"(__KERNEL_CS)
+ );
+
+ /*
+ * Fix CVE-2018-3646
+ * +1 line
+ */
+ kgr_set_vcpu_unconfined(vcpu, GFP_ATOMIC);
+
+ } else
+ local_irq_enable();
+}
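+
+/*
+ * The marking added above covers the external-interrupt exit path: the
+ * host interrupt handler invoked through the constructed stack frame may
+ * have pulled sensitive host data into the L1D, so the next VM entry is
+ * made to flush it.
+ */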
+
+/* patched, optimized */
+static int kgr_nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = kgr_to_vmx(vcpu);
+ int cpu;
+ struct loaded_vmcs *vmcs02;
+ bool ia32e;
+ u32 msr_entry_idx;
+
+ if (!kgr_nested_vmx_check_permission(vcpu) ||
+ !kgr_nested_vmx_check_vmcs12(vcpu))
+ return 1;
+
+ kgr_skip_emulated_instruction(vcpu);
+ vmcs12 = kgr_get_vmcs12(vcpu);
+
+ if (*kgr_enable_shadow_vmcs)
+ kgr_copy_shadow_to_vmcs12(vmx);
+
+ /*
+ * The nested entry process starts with enforcing various prerequisites
+	 * on vmcs12 as required by the Intel SDM, and acts appropriately when
+ * they fail: As the SDM explains, some conditions should cause the
+ * instruction to fail, while others will cause the instruction to seem
+ * to succeed, but return an EXIT_REASON_INVALID_STATE.
+ * To speed up the normal (success) code path, we should avoid checking
+ * for misconfigurations which will anyway be caught by the processor
+ * when using the merged vmcs02.
+ */
+ if (vmcs12->launch_state == launch) {
+ kgr_nested_vmx_failValid(vcpu,
+ launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+ : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+ return 1;
+ }
+
+ if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
+ kgr_nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ if (!kgr_nested_get_vmcs12_pages(vcpu, vmcs12)) {
+ kgr_nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ if (kgr_nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
+ kgr_nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ if (kgr_nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
+ kgr_nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ if (kgr_nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
+ kgr_nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ if (!kgr_vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
+ vmx->nested.nested_vmx_true_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high) ||
+ !kgr_vmx_control_verify(vmcs12->secondary_vm_exec_control,
+ vmx->nested.nested_vmx_secondary_ctls_low,
+ vmx->nested.nested_vmx_secondary_ctls_high) ||
+ !kgr_vmx_control_verify(vmcs12->pin_based_vm_exec_control,
+ vmx->nested.nested_vmx_pinbased_ctls_low,
+ vmx->nested.nested_vmx_pinbased_ctls_high) ||
+ !kgr_vmx_control_verify(vmcs12->vm_exit_controls,
+ vmx->nested.nested_vmx_true_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high) ||
+ !kgr_vmx_control_verify(vmcs12->vm_entry_controls,
+ vmx->nested.nested_vmx_true_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high))
+ {
+ kgr_nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ if (((vmcs12->host_cr0 & KGR_VMXON_CR0_ALWAYSON) != KGR_VMXON_CR0_ALWAYSON) ||
+ ((vmcs12->host_cr4 & KGR_VMXON_CR4_ALWAYSON) != KGR_VMXON_CR4_ALWAYSON)) {
+ kgr_nested_vmx_failValid(vcpu,
+ VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+ return 1;
+ }
+
+ if (!kgr_nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
+ ((vmcs12->guest_cr4 & KGR_VMXON_CR4_ALWAYSON) != KGR_VMXON_CR4_ALWAYSON)) {
+ kgr_nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+ return 1;
+ }
+ if (vmcs12->vmcs_link_pointer != -1ull) {
+ kgr_nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
+ return 1;
+ }
+
+ /*
+ * If the load IA32_EFER VM-entry control is 1, the following checks
+ * are performed on the field for the IA32_EFER MSR:
+ * - Bits reserved in the IA32_EFER MSR must be 0.
+ * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
+ * the IA-32e mode guest VM-exit control. It must also be identical
+ * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
+ * CR0.PG) is 1.
+ */
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
+ ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
+ if (!kgr_kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
+ ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
+ ((vmcs12->guest_cr0 & X86_CR0_PG) &&
+ ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
+ kgr_nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+ return 1;
+ }
+ }
+
+ /*
+ * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+ * IA32_EFER MSR must be 0 in the field for that register. In addition,
+ * the values of the LMA and LME bits in the field must each be that of
+ * the host address-space size VM-exit control.
+ */
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+ ia32e = (vmcs12->vm_exit_controls &
+ VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+ if (!kgr_kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
+ ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
+ ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
+ kgr_nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+ return 1;
+ }
+ }
+
+ /*
+ * We're finally done with prerequisite checking, and can start with
+ * the nested entry.
+ */
+
+ vmcs02 = kgr_nested_get_current_vmcs02(vmx);
+ if (!vmcs02)
+ return -ENOMEM;
+
+ kgr_enter_guest_mode(vcpu);
+
+ vmx->nested.vmcs01_tsc_offset = kgr_vmcs_read64(TSC_OFFSET);
+
+ if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+ vmx->nested.vmcs01_debugctl = kgr_vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+ cpu = get_cpu();
+ vmx->loaded_vmcs = vmcs02;
+ kgr_vmx_vcpu_put(vcpu);
+ kgr_vmx_vcpu_load(vcpu, cpu);
+ vcpu->cpu = cpu;
+ put_cpu();
+
+ kgr_vmx_segment_cache_clear(vmx);
+
+ kgr_prepare_vmcs02(vcpu, vmcs12);
+
+ msr_entry_idx = kgr_nested_vmx_load_msr(vcpu,
+ vmcs12->vm_entry_msr_load_addr,
+ vmcs12->vm_entry_msr_load_count);
+ if (msr_entry_idx) {
+ kgr_leave_guest_mode(vcpu);
+ kgr_vmx_load_vmcs01(vcpu);
+ kgr_nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
+ return 1;
+ }
+
+ vmcs12->launch_state = 1;
+
+ /*
+ * Fix CVE-2018-3646
+ * +3 lines
+ */
+ /* Hide L1D cache contents from the nested guest. */
+ kgr_set_vcpu_unconfined(vcpu, GFP_KERNEL);
+
+ if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
+ return kgr_kvm_vcpu_halt(vcpu);
+
+ vmx->nested.nested_run_pending = 1;
+
+ /*
+ * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
+ * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
+ * returned as far as L1 is concerned. It will only return (and set
+ * the success flag) when L2 exits (see nested_vmx_vmexit()).
+ */
+ return 1;
+}
+
+/* patched, calls nested_vmx_run() */
+int kgr_handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+ return kgr_nested_vmx_run(vcpu, true);
+}
+
+/* patched, calls nested_vmx_run() */
+int kgr_handle_vmresume(struct kvm_vcpu *vcpu)
+{
+ return kgr_nested_vmx_run(vcpu, false);
+}
+
+
+static int kgr_patch_bsc1099306_kvm_intel_kallsyms(void)
+{
+ unsigned long addr;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(kgr_funcs); i++) {
+ /* mod_find_symname would be nice, but it is not exported */
+ addr = kallsyms_lookup_name(kgr_funcs[i].name);
+ if (!addr) {
+ pr_err("kgraft-patch: symbol %s not resolved\n",
+ kgr_funcs[i].name);
+ return -ENOENT;
+ }
+
+ *(kgr_funcs[i].addr) = (void *)addr;
+ }
+
+ return 0;
+}
+
+static int
+kgr_patch_bsc1099306_kvm_intel_module_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct module *mod = data;
+ int ret;
+
+ if (action != MODULE_STATE_COMING || strcmp(mod->name, KGR_PATCHED_MODULE))
+ return 0;
+
+ ret = kgr_patch_bsc1099306_kvm_intel_kallsyms();
+ WARN(ret, "kgraft-patch: delayed kallsyms lookup failed. System is broken and can crash.\n");
+
+ return ret;
+}
+
+static struct notifier_block kgr_patch_bsc1099306_kvm_intel_module_nb = {
+ .notifier_call = kgr_patch_bsc1099306_kvm_intel_module_notify,
+ .priority = INT_MIN+1,
+};
+
+int __kgr_patch_bsc1099306_kvm_intel_init(void)
+{
+ int ret;
+
+ mutex_lock(&module_mutex);
+ if (find_module(KGR_PATCHED_MODULE)) {
+ ret = kgr_patch_bsc1099306_kvm_intel_kallsyms();
+ if (ret)
+ goto out;
+ }
+
+ ret = register_module_notifier(&kgr_patch_bsc1099306_kvm_intel_module_nb);
+out:
+ mutex_unlock(&module_mutex);
+ return ret;
+}
+
+void __kgr_patch_bsc1099306_kvm_intel_cleanup(void)
+{
+ unregister_module_notifier(&kgr_patch_bsc1099306_kvm_intel_module_nb);
+}
+
+#endif /* IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM) */
diff --git a/bsc1099306/kgr_patch_bsc1099306_kvm_intel.h b/bsc1099306/kgr_patch_bsc1099306_kvm_intel.h
new file mode 100644
index 0000000..dffe4d9
--- /dev/null
+++ b/bsc1099306/kgr_patch_bsc1099306_kvm_intel.h
@@ -0,0 +1,29 @@
+#ifndef _KGR_PATCH_BSC1099306_KVM_INTEL_H
+#define _KGR_PATCH_BSC1099306_KVM_INTEL_H
+
+#if IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM)
+
+int __kgr_patch_bsc1099306_kvm_intel_init(void);
+void __kgr_patch_bsc1099306_kvm_intel_cleanup(void);
+
+
+struct kvm_vcpu;
+void kgr_vmx_save_host_state(struct kvm_vcpu *vcpu);
+void kgr_vmx_handle_external_intr(struct kvm_vcpu *vcpu);
+int kgr_handle_vmlaunch(struct kvm_vcpu *vcpu);
+int kgr_handle_vmresume(struct kvm_vcpu *vcpu);
+
+#define __KGR_PATCH_BSC1099306_KVM_INTEL_FUNCS \
+ KGR_PATCH_OBJ(vmx_save_host_state, \
+ kgr_vmx_save_host_state, "kvm_intel"), \
+ KGR_PATCH_OBJ(vmx_handle_external_intr, \
+ kgr_vmx_handle_external_intr, "kvm_intel"), \
+ KGR_PATCH_OBJ(handle_vmlaunch, \
+ kgr_handle_vmlaunch, "kvm_intel"), \
+ KGR_PATCH_OBJ(handle_vmresume, \
+ kgr_handle_vmresume, "kvm_intel"), \
+
+
+#endif /* IS_ENABLED(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM) */
+
+#endif /* _KGR_PATCH_BSC1099306_KVM_INTEL_H */