Home Home > GIT Browse > stable-xen
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOlaf Hering <ohering@suse.de>2019-11-07 14:49:14 +0100
committerOlaf Hering <ohering@suse.de>2019-11-07 14:49:14 +0100
commit3ebf2ab8d045b2f786755224ca2e9c7ec9f9d238 (patch)
tree6160ccedc8e87e6e12eb151d3c82a7c7159ddbef
parent06f2412e4d37a366b48889dd2013d09c39e20647 (diff)
parent276a306cd573d27427f5150fe97cdfcd76dcfc4c (diff)
Merge remote-tracking branch 'kerncvs/SLE15-SP1_EMBARGO' into SLE15-SP1-AZURE_EMBARGOrpm-4.12.14-8.19--sle15-sp1-updatesrpm-4.12.14-8.19
-rw-r--r--Documentation/ABI/testing/sysfs-devices-system-cpu2
-rw-r--r--Documentation/admin-guide/hw-vuln/tsx_async_abort.rst276
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt84
-rw-r--r--Documentation/x86/index.rst1
-rw-r--r--Documentation/x86/tsx_async_abort.rst117
-rw-r--r--arch/x86/Kconfig45
-rw-r--r--arch/x86/include/asm/cpufeatures.h3
-rw-r--r--arch/x86/include/asm/kvm_host.h13
-rw-r--r--arch/x86/include/asm/msr-index.h16
-rw-r--r--arch/x86/include/asm/nospec-branch.h4
-rw-r--r--arch/x86/include/asm/processor.h7
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/bugs.c157
-rw-r--r--arch/x86/kernel/cpu/common.c96
-rw-r--r--arch/x86/kernel/cpu/cpu.h18
-rw-r--r--arch/x86/kernel/cpu/intel.c5
-rw-r--r--arch/x86/kernel/cpu/tsx.c140
-rw-r--r--arch/x86/kvm/cpuid.c17
-rw-r--r--arch/x86/kvm/mmu.c264
-rw-r--r--arch/x86/kvm/mmu.h4
-rw-r--r--arch/x86/kvm/paging_tmpl.h29
-rw-r--r--arch/x86/kvm/svm.c10
-rw-r--r--arch/x86/kvm/vmx.c14
-rw-r--r--arch/x86/kvm/x86.c48
-rw-r--r--drivers/base/cpu.c17
-rw-r--r--drivers/gpu/drm/i915/i915_cmd_parser.c436
-rw-r--r--drivers/gpu/drm/i915/i915_drv.c5
-rw-r--r--drivers/gpu/drm/i915/i915_drv.h31
-rw-r--r--drivers/gpu/drm/i915/i915_gem.c24
-rw-r--r--drivers/gpu/drm/i915/i915_gem_context.c5
-rw-r--r--drivers/gpu/drm/i915/i915_gem_context.h6
-rw-r--r--drivers/gpu/drm/i915/i915_gem_execbuffer.c110
-rw-r--r--drivers/gpu/drm/i915/i915_gem_gtt.c3
-rw-r--r--drivers/gpu/drm/i915/i915_reg.h10
-rw-r--r--drivers/gpu/drm/i915/intel_drv.h3
-rw-r--r--drivers/gpu/drm/i915/intel_pm.c115
-rw-r--r--drivers/gpu/drm/i915/intel_ringbuffer.h17
-rw-r--r--include/linux/cpu.h5
-rw-r--r--include/linux/kvm_host.h6
-rw-r--r--virt/kvm/kvm_main.c114
40 files changed, 2037 insertions, 242 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 87c8f75c15f0..b2c1c41f48cd 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -381,6 +381,8 @@ What: /sys/devices/system/cpu/vulnerabilities
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
/sys/devices/system/cpu/vulnerabilities/l1tf
/sys/devices/system/cpu/vulnerabilities/mds
+ /sys/devices/system/cpu/vulnerabilities/tsx_async_abort
+ /sys/devices/system/cpu/vulnerabilities/itlb_multihit
Date: January 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Information about CPU vulnerabilities
diff --git a/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
new file mode 100644
index 000000000000..fddbd7579c53
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
@@ -0,0 +1,276 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+TAA - TSX Asynchronous Abort
+======================================
+
+TAA is a hardware vulnerability that allows unprivileged speculative access to
+data which is available in various CPU internal buffers by using asynchronous
+aborts within an Intel TSX transactional region.
+
+Affected processors
+-------------------
+
+This vulnerability only affects Intel processors that support Intel
+Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8)
+is 0 in the IA32_ARCH_CAPABILITIES MSR. On processors where the MDS_NO bit
+(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations
+also mitigate against TAA.
+
+Whether a processor is affected or not can be read out from the TAA
+vulnerability file in sysfs. See :ref:`tsx_async_abort_sys_info`.
+
+Related CVEs
+------------
+
+The following CVE entry is related to this TAA issue:
+
+ ============== ===== ===================================================
+ CVE-2019-11135 TAA TSX Asynchronous Abort (TAA) condition on some
+ microprocessors utilizing speculative execution may
+ allow an authenticated user to potentially enable
+ information disclosure via a side channel with
+ local access.
+ ============== ===== ===================================================
+
+Problem
+-------
+
+When performing store, load or L1 refill operations, processors write
+data into temporary microarchitectural structures (buffers). The data in
+those buffers can be forwarded to load operations as an optimization.
+
+Intel TSX is an extension to the x86 instruction set architecture that adds
+hardware transactional memory support to improve performance of multi-threaded
+software. TSX lets the processor expose and exploit concurrency hidden in an
+application due to dynamically avoiding unnecessary synchronization.
+
+TSX supports atomic memory transactions that are either committed (success) or
+aborted. During an abort, operations that happened within the transactional region
+are rolled back. An asynchronous abort takes place, among other options, when a
+different thread accesses a cache line that is also used within the transactional
+region when that access might lead to a data race.
+
+Immediately after an uncompleted asynchronous abort, certain speculatively
+executed loads may read data from those internal buffers and pass it to dependent
+operations. This can be then used to infer the value via a cache side channel
+attack.
+
+Because the buffers are potentially shared between Hyper-Threads cross
+Hyper-Thread attacks are possible.
+
+The victim of a malicious actor does not need to make use of TSX. Only the
+attacker needs to begin a TSX transaction and raise an asynchronous abort
+which in turn potenitally leaks data stored in the buffers.
+
+More detailed technical information is available in the TAA specific x86
+architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`.
+
+
+Attack scenarios
+----------------
+
+Attacks against the TAA vulnerability can be implemented from unprivileged
+applications running on hosts or guests.
+
+As for MDS, the attacker has no control over the memory addresses that can
+be leaked. Only the victim is responsible for bringing data to the CPU. As
+a result, the malicious actor has to sample as much data as possible and
+then postprocess it to try to infer any useful information from it.
+
+A potential attacker only has read access to the data. Also, there is no direct
+privilege escalation by using this technique.
+
+
+.. _tsx_async_abort_sys_info:
+
+TAA system information
+-----------------------
+
+The Linux kernel provides a sysfs interface to enumerate the current TAA status
+of mitigated systems. The relevant sysfs file is:
+
+/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
+
+The possible values in this file are:
+
+.. list-table::
+
+ * - 'Vulnerable'
+ - The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied.
+ * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
+ - The system tries to clear the buffers but the microcode might not support the operation.
+ * - 'Mitigation: Clear CPU buffers'
+ - The microcode has been updated to clear the buffers. TSX is still enabled.
+ * - 'Mitigation: TSX disabled'
+ - TSX is disabled.
+ * - 'Not affected'
+ - The CPU is not affected by this issue.
+
+.. _ucode_needed:
+
+Best effort mitigation mode
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If the processor is vulnerable, but the availability of the microcode-based
+mitigation mechanism is not advertised via CPUID the kernel selects a best
+effort mitigation mode. This mode invokes the mitigation instructions
+without a guarantee that they clear the CPU buffers.
+
+This is done to address virtualization scenarios where the host has the
+microcode update applied, but the hypervisor is not yet updated to expose the
+CPUID to the guest. If the host has updated microcode the protection takes
+effect; otherwise a few CPU cycles are wasted pointlessly.
+
+The state in the tsx_async_abort sysfs file reflects this situation
+accordingly.
+
+
+Mitigation mechanism
+--------------------
+
+The kernel detects the affected CPUs and the presence of the microcode which is
+required. If a CPU is affected and the microcode is available, then the kernel
+enables the mitigation by default.
+
+
+The mitigation can be controlled at boot time via a kernel command line option.
+See :ref:`taa_mitigation_control_command_line`.
+
+.. _virt_mechanism:
+
+Virtualization mitigation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Affected systems where the host has TAA microcode and TAA is mitigated by
+having disabled TSX previously, are not vulnerable regardless of the status
+of the VMs.
+
+In all other cases, if the host either does not have the TAA microcode or
+the kernel is not mitigated, the system might be vulnerable.
+
+
+.. _taa_mitigation_control_command_line:
+
+Mitigation control on the kernel command line
+---------------------------------------------
+
+The kernel command line allows to control the TAA mitigations at boot time with
+the option "tsx_async_abort=". The valid arguments for this option are:
+
+ ============ =============================================================
+ off This option disables the TAA mitigation on affected platforms.
+ If the system has TSX enabled (see next parameter) and the CPU
+ is affected, the system is vulnerable.
+
+ full TAA mitigation is enabled. If TSX is enabled, on an affected
+ system it will clear CPU buffers on ring transitions. On
+ systems which are MDS-affected and deploy MDS mitigation,
+ TAA is also mitigated. Specifying this option on those
+ systems will have no effect.
+
+ full,nosmt The same as tsx_async_abort=full, with SMT disabled on
+ vulnerable CPUs that have TSX enabled. This is the complete
+ mitigation. When TSX is disabled, SMT is not disabled because
+ CPU is not vulnerable to cross-thread TAA attacks.
+ ============ =============================================================
+
+Not specifying this option is equivalent to "tsx_async_abort=full".
+
+The kernel command line also allows to control the TSX feature using the
+parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used
+to control the TSX feature and the enumeration of the TSX feature bits (RTM
+and HLE) in CPUID.
+
+The valid options are:
+
+ ============ =============================================================
+ off Disables TSX on the system.
+
+ Note that this option takes effect only on newer CPUs which are
+ not vulnerable to MDS, i.e., have MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1
+ and which get the new IA32_TSX_CTRL MSR through a microcode
+ update. This new MSR allows for the reliable deactivation of
+ the TSX functionality.
+
+ on Enables TSX.
+
+ Although there are mitigations for all known security
+ vulnerabilities, TSX has been known to be an accelerator for
+ several previous speculation-related CVEs, and so there may be
+ unknown security risks associated with leaving it enabled.
+
+ auto Disables TSX if X86_BUG_TAA is present, otherwise enables TSX
+ on the system.
+ ============ =============================================================
+
+Not specifying this option is equivalent to "tsx=off".
+
+The following combinations of the "tsx_async_abort" and "tsx" are possible. For
+affected platforms tsx=auto is equivalent to tsx=off and the result will be:
+
+ ========= ========================== =========================================
+ tsx=on tsx_async_abort=full The system will use VERW to clear CPU
+ buffers. Cross-thread attacks are still
+ possible on SMT machines.
+ tsx=on tsx_async_abort=full,nosmt As above, cross-thread attacks on SMT
+ mitigated.
+ tsx=on tsx_async_abort=off The system is vulnerable.
+ tsx=off tsx_async_abort=full TSX might be disabled if microcode
+ provides a TSX control MSR. If so,
+ system is not vulnerable.
+ tsx=off tsx_async_abort=full,nosmt Ditto
+ tsx=off tsx_async_abort=off ditto
+ ========= ========================== =========================================
+
+
+For unaffected platforms "tsx=on" and "tsx_async_abort=full" does not clear CPU
+buffers. For platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0)
+"tsx" command line argument has no effect.
+
+For the affected platforms below table indicates the mitigation status for the
+combinations of CPUID bit MD_CLEAR and IA32_ARCH_CAPABILITIES MSR bits MDS_NO
+and TSX_CTRL_MSR.
+
+ ======= ========= ============= ========================================
+ MDS_NO MD_CLEAR TSX_CTRL_MSR Status
+ ======= ========= ============= ========================================
+ 0 0 0 Vulnerable (needs microcode)
+ 0 1 0 MDS and TAA mitigated via VERW
+ 1 1 0 MDS fixed, TAA vulnerable if TSX enabled
+ because MD_CLEAR has no meaning and
+ VERW is not guaranteed to clear buffers
+ 1 X 1 MDS fixed, TAA can be mitigated by
+ VERW or TSX_CTRL_MSR
+ ======= ========= ============= ========================================
+
+Mitigation selection guide
+--------------------------
+
+1. Trusted userspace and guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If all user space applications are from a trusted source and do not execute
+untrusted code which is supplied externally, then the mitigation can be
+disabled. The same applies to virtualized environments with trusted guests.
+
+
+2. Untrusted userspace and guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If there are untrusted applications or guests on the system, enabling TSX
+might allow a malicious actor to leak data from the host or from other
+processes running on the same physical core.
+
+If the microcode is available and the TSX is disabled on the host, attacks
+are prevented in a virtualized environment as well, even if the VMs do not
+explicitly enable the mitigation.
+
+
+.. _taa_default_mitigations:
+
+Default mitigations
+-------------------
+
+The kernel's default action for vulnerable processors is:
+
+ - Deploy TSX disable mitigation (tsx_async_abort=full tsx=off).
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index c3a6afc97d82..9c77cc3e1dc8 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1860,6 +1860,23 @@
KVM MMU at runtime.
Default is 0 (off)
+ kvm.nx_huge_pages=
+ [KVM] Controls the sw workaround for bug
+ X86_BUG_ITLB_MULTIHIT.
+ force : Always deploy workaround.
+ off : Never deploy workaround.
+ auto : Default. Deploy workaround based on presence of
+ X86_BUG_ITLB_MULTIHIT.
+
+ If the sw workaround is enabled for the host, guests
+ need not enable it for nested guests.
+
+ kvm.nx_huge_pages_recovery_ratio=
+ [KVM] Controls how many 4KiB pages are periodically zapped
+ back to huge pages. 0 disables the recovery, otherwise if
+ the value is N KVM will zap 1/Nth of the 4KiB pages every
+ minute. The default is 60.
+
kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
Default is 1 (enabled)
@@ -2423,6 +2440,7 @@
spec_store_bypass_disable=off [X86,PPC]
l1tf=off [X86]
mds=off [X86]
+ tsx_async_abort=off [X86]
auto (default)
Mitigate all CPU vulnerabilities, but leave SMT
@@ -2438,6 +2456,7 @@
be fully mitigated, even if it means losing SMT.
Equivalent to: l1tf=flush,nosmt [X86]
mds=full,nosmt [X86]
+ tsx_async_abort=full,nosmt [X86]
mminit_loglevel=
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
@@ -4502,6 +4521,71 @@
platforms where RDTSC is slow and this accounting
can add overhead.
+ tsx= [X86] Control Transactional Synchronization
+ Extensions (TSX) feature in Intel processors that
+ support TSX control.
+
+ This parameter controls the TSX feature. The options are:
+
+ on - Enable TSX on the system. Although there are
+ mitigations for all known security vulnerabilities,
+ TSX has been known to be an accelerator for
+ several previous speculation-related CVEs, and
+ so there may be unknown security risks associated
+ with leaving it enabled.
+
+ off - Disable TSX on the system. (Note that this
+ option takes effect only on newer CPUs which are
+ not vulnerable to MDS, i.e., have
+ MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get
+ the new IA32_TSX_CTRL MSR through a microcode
+ update. This new MSR allows for the reliable
+ deactivation of the TSX functionality.)
+
+ auto - Disable TSX if X86_BUG_TAA is present,
+ otherwise enable TSX on the system.
+
+ Not specifying this option is equivalent to tsx=off.
+
+ See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
+ for more details.
+
+ tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async
+ Abort (TAA) vulnerability.
+
+ Similar to Micro-architectural Data Sampling (MDS)
+ certain CPUs that support Transactional
+ Synchronization Extensions (TSX) are vulnerable to an
+ exploit against CPU internal buffers which can forward
+ information to a disclosure gadget under certain
+ conditions.
+
+ In vulnerable processors, the speculatively forwarded
+ data can be used in a cache side channel attack, to
+ access data to which the attacker does not have direct
+ access.
+
+ This parameter controls the TAA mitigation. The
+ options are:
+
+ full - Enable TAA mitigation on vulnerable CPUs
+ if TSX is enabled.
+
+ full,nosmt - Enable TAA mitigation and disable SMT on
+ vulnerable CPUs. If TSX is disabled, SMT
+ is not disabled because CPU is not
+ vulnerable to cross-thread TAA attacks.
+ off - Unconditionally disable TAA mitigation
+
+ Not specifying this option is equivalent to
+ tsx_async_abort=full. On CPUs which are MDS affected
+ and deploy MDS mitigation, TAA mitigation is not
+ required and doesn't provide any additional
+ mitigation.
+
+ For details see:
+ Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
+
turbografx.map[2|3]= [HW,JOY]
TurboGraFX parallel port interface
Format:
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index ef389dcf1b1d..0780d55c5aa8 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -6,3 +6,4 @@ x86 architecture specifics
:maxdepth: 1
mds
+ tsx_async_abort
diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/x86/tsx_async_abort.rst
new file mode 100644
index 000000000000..583ddc185ba2
--- /dev/null
+++ b/Documentation/x86/tsx_async_abort.rst
@@ -0,0 +1,117 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+TSX Async Abort (TAA) mitigation
+================================
+
+.. _tsx_async_abort:
+
+Overview
+--------
+
+TSX Async Abort (TAA) is a side channel attack on internal buffers in some
+Intel processors similar to Microachitectural Data Sampling (MDS). In this
+case certain loads may speculatively pass invalid data to dependent operations
+when an asynchronous abort condition is pending in a Transactional
+Synchronization Extensions (TSX) transaction. This includes loads with no
+fault or assist condition. Such loads may speculatively expose stale data from
+the same uarch data structures as in MDS, with same scope of exposure i.e.
+same-thread and cross-thread. This issue affects all current processors that
+support TSX.
+
+Mitigation strategy
+-------------------
+
+a) TSX disable - one of the mitigations is to disable TSX. A new MSR
+IA32_TSX_CTRL will be available in future and current processors after
+microcode update which can be used to disable TSX. In addition, it
+controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID.
+
+b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this
+vulnerability. More details on this approach can be found in
+:ref:`Documentation/admin-guide/hw-vuln/mds.rst <mds>`.
+
+Kernel internal mitigation modes
+--------------------------------
+
+ ============= ============================================================
+ off Mitigation is disabled. Either the CPU is not affected or
+ tsx_async_abort=off is supplied on the kernel command line.
+
+ tsx disabled Mitigation is enabled. TSX feature is disabled by default at
+ bootup on processors that support TSX control.
+
+ verw Mitigation is enabled. CPU is affected and MD_CLEAR is
+ advertised in CPUID.
+
+ ucode needed Mitigation is enabled. CPU is affected and MD_CLEAR is not
+ advertised in CPUID. That is mainly for virtualization
+ scenarios where the host has the updated microcode but the
+ hypervisor does not expose MD_CLEAR in CPUID. It's a best
+ effort approach without guarantee.
+ ============= ============================================================
+
+If the CPU is affected and the "tsx_async_abort" kernel command line parameter is
+not provided then the kernel selects an appropriate mitigation depending on the
+status of RTM and MD_CLEAR CPUID bits.
+
+Below tables indicate the impact of tsx=on|off|auto cmdline options on state of
+TAA mitigation, VERW behavior and TSX feature for various combinations of
+MSR_IA32_ARCH_CAPABILITIES bits.
+
+1. "tsx=off"
+
+========= ========= ============ ============ ============== =================== ======================
+MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=off
+---------------------------------- -------------------------------------------------------------------------
+TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation
+ after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full
+========= ========= ============ ============ ============== =================== ======================
+ 0 0 0 HW default Yes Same as MDS Same as MDS
+ 0 0 1 Invalid case Invalid case Invalid case Invalid case
+ 0 1 0 HW default No Need ucode update Need ucode update
+ 0 1 1 Disabled Yes TSX disabled TSX disabled
+ 1 X 1 Disabled X None needed None needed
+========= ========= ============ ============ ============== =================== ======================
+
+2. "tsx=on"
+
+========= ========= ============ ============ ============== =================== ======================
+MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=on
+---------------------------------- -------------------------------------------------------------------------
+TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation
+ after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full
+========= ========= ============ ============ ============== =================== ======================
+ 0 0 0 HW default Yes Same as MDS Same as MDS
+ 0 0 1 Invalid case Invalid case Invalid case Invalid case
+ 0 1 0 HW default No Need ucode update Need ucode update
+ 0 1 1 Enabled Yes None Same as MDS
+ 1 X 1 Enabled X None needed None needed
+========= ========= ============ ============ ============== =================== ======================
+
+3. "tsx=auto"
+
+========= ========= ============ ============ ============== =================== ======================
+MSR_IA32_ARCH_CAPABILITIES bits Result with cmdline tsx=auto
+---------------------------------- -------------------------------------------------------------------------
+TAA_NO MDS_NO TSX_CTRL_MSR TSX state VERW can clear TAA mitigation TAA mitigation
+ after bootup CPU buffers tsx_async_abort=off tsx_async_abort=full
+========= ========= ============ ============ ============== =================== ======================
+ 0 0 0 HW default Yes Same as MDS Same as MDS
+ 0 0 1 Invalid case Invalid case Invalid case Invalid case
+ 0 1 0 HW default No Need ucode update Need ucode update
+ 0 1 1 Disabled Yes TSX disabled TSX disabled
+ 1 X 1 Enabled X None needed None needed
+========= ========= ============ ============ ============== =================== ======================
+
+In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that
+indicates whether MSR_IA32_TSX_CTRL is supported.
+
+There are two control bits in IA32_TSX_CTRL MSR:
+
+ Bit 0: When set it disables the Restricted Transactional Memory (RTM)
+ sub-feature of TSX (will force all transactions to abort on the
+ XBEGIN instruction).
+
+ Bit 1: When set it disables the enumeration of the RTM and HLE feature
+ (i.e. it will make CPUID(EAX=7).EBX{bit4} and
+ CPUID(EAX=7).EBX{bit11} read as 0).
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 27f25a95b86d..4947907a487b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1851,6 +1851,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
If unsure, say y.
+choice
+ prompt "TSX enable mode"
+ depends on CPU_SUP_INTEL
+ default X86_INTEL_TSX_MODE_OFF
+ help
+ Intel's TSX (Transactional Synchronization Extensions) feature
+ allows to optimize locking protocols through lock elision which
+ can lead to a noticeable performance boost.
+
+ On the other hand it has been shown that TSX can be exploited
+ to form side channel attacks (e.g. TAA) and chances are there
+ will be more of those attacks discovered in the future.
+
+ Therefore TSX is not enabled by default (aka tsx=off). An admin
+ might override this decision by tsx=on the command line parameter.
+ Even with TSX enabled, the kernel will attempt to enable the best
+ possible TAA mitigation setting depending on the microcode available
+ for the particular machine.
+
+ This option allows to set the default tsx mode between tsx=on, =off
+ and =auto. See Documentation/admin-guide/kernel-parameters.txt for more
+ details.
+
+ Say off if not sure, auto if TSX is in use but it should be used on safe
+ platforms or on if TSX is in use and the security aspect of tsx is not
+ relevant.
+
+config X86_INTEL_TSX_MODE_OFF
+ bool "off"
+ help
+ TSX is disabled if possible - equals to tsx=off command line parameter.
+
+config X86_INTEL_TSX_MODE_ON
+ bool "on"
+ help
+ TSX is always enabled on TSX capable HW - equals the tsx=on command
+ line parameter.
+
+config X86_INTEL_TSX_MODE_AUTO
+ bool "auto"
+ help
+ TSX is enabled on TSX capable HW that is believed to be safe against
+ side channel attacks- equals the tsx=auto command line parameter.
+endchoice
+
config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 4a6afd5c729f..665fefb057db 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -395,4 +395,7 @@
#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */
#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
+#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
+#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
+
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7b7a99583adf..6df985da6927 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -308,6 +308,11 @@ struct kvm_mmu_page {
/* Number of writes since the last time traversal visited this page. */
atomic_t write_flooding_count;
+
+#ifndef __GENKSYMS__
+ struct list_head lpage_disallowed_link;
+ bool lpage_disallowed; /* Can't be replaced by an equiv large page */
+#endif
};
struct kvm_pio_request {
@@ -883,6 +888,11 @@ struct kvm_arch {
bool x2apic_broadcast_quirk_disabled;
struct kvm_sev_info sev_info;
+
+#ifndef __GENKSYMS__
+ struct list_head lpage_disallowed_mmu_pages;
+ struct task_struct *nx_lpage_recovery_thread;
+#endif
};
struct kvm_vm_stat {
@@ -897,6 +907,9 @@ struct kvm_vm_stat {
ulong remote_tlb_flush;
ulong lpages;
ulong max_mmu_page_hash_collisions;
+#ifndef __GENKSYMS__
+ ulong nx_lpage_splits;
+#endif
};
struct kvm_vcpu_stat {
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8a47bf9430e9..f6778fe6212c 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -92,6 +92,18 @@
* Microarchitectural Data
* Sampling (MDS) vulnerabilities.
*/
+#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /*
+ * The processor is not susceptible to a
+ * machine check error due to modifying the
+ * code page size along with either the
+ * physical address or cache type
+ * without TLB invalidation.
+ */
+#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */
+#define ARCH_CAP_TAA_NO BIT(8) /*
+ * Not susceptible to
+ * TSX Async Abort (TAA) vulnerabilities.
+ */
#define MSR_IA32_FLUSH_CMD 0x0000010b
#define L1D_FLUSH BIT(0) /*
@@ -102,6 +114,10 @@
#define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e
+#define MSR_IA32_TSX_CTRL 0x00000122
+#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */
+#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */
+
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
#define MSR_IA32_SYSENTER_EIP 0x00000176
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 279c4e44b112..f23c1080e564 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -358,7 +358,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
#include <asm/segment.h>
/**
- * mds_clear_cpu_buffers - Mitigation for MDS vulnerability
+ * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
*
* This uses the otherwise unused and obsolete VERW instruction in
* combination with microcode which triggers a CPU buffer flush when the
@@ -381,7 +381,7 @@ static inline void mds_clear_cpu_buffers(void)
}
/**
- * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability
+ * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
*
* Clear CPU buffers if the corresponding static key is enabled
*/
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2bdbc8ddb20c..53afd7f0ffed 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -993,4 +993,11 @@ enum mds_mitigations {
MDS_MITIGATION_VMWERV,
};
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 8434f6c679a1..e37319a5ea7d 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -28,7 +28,7 @@ obj-y += umwait.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
-obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
+obj-$(CONFIG_CPU_SUP_INTEL) += intel.o tsx.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 9a25e43dd13d..d2beadd0867c 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -31,11 +31,14 @@
#include <asm/hypervisor.h>
#include <asm/e820/api.h>
+#include "cpu.h"
+
static void __init spectre_v1_select_mitigation(void);
static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
+static void __init taa_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
u64 x86_spec_ctrl_base;
@@ -102,6 +105,7 @@ void __init check_bugs(void)
ssb_select_mitigation();
l1tf_select_mitigation();
mds_select_mitigation();
+ taa_select_mitigation();
arch_smt_update();
@@ -390,6 +394,100 @@ static int __init mds_cmdline(char *str)
early_param("mds", mds_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "TAA: " fmt
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
+static bool taa_nosmt __ro_after_init;
+
+static const char * const taa_strings[] = {
+ [TAA_MITIGATION_OFF] = "Vulnerable",
+ [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+ [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
+};
+
+static void __init taa_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /* TSX previously disabled by tsx=off */
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
+ goto out;
+ }
+
+ if (cpu_mitigations_off()) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */
+ if (taa_mitigation == TAA_MITIGATION_OFF)
+ goto out;
+
+ if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ taa_mitigation = TAA_MITIGATION_VERW;
+ else
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+ * A microcode update fixes this behavior to clear CPU buffers. It also
+ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+ * ARCH_CAP_TSX_CTRL_MSR bit.
+ *
+ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
+ * update is required.
+ */
+ ia32_cap = x86_read_arch_cap_msr();
+ if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
+ !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * TSX is enabled, select alternate mitigation for TAA which is
+ * the same as MDS. Enable MDS static branch to clear CPU buffers.
+ *
+ * For guests that can't determine whether the correct microcode is
+ * present on host, enable the mitigation for UCODE_NEEDED as well.
+ */
+ static_branch_enable(&mds_user_clear);
+
+ if (taa_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+
+out:
+ pr_info("%s\n", taa_strings[taa_mitigation]);
+}
+
+static int __init tsx_async_abort_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TAA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
enum spectre_v1_mitigation {
@@ -907,6 +1005,7 @@ static void update_mds_branch_idle(void)
}
#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
+#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
void arch_smt_update(void)
{
@@ -939,6 +1038,17 @@ void arch_smt_update(void)
break;
}
+ switch (taa_mitigation) {
+ case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(TAA_MSG_SMT);
+ break;
+ case TAA_MITIGATION_TSX_DISABLED:
+ case TAA_MITIGATION_OFF:
+ break;
+ }
+
mutex_unlock(&spec_ctrl_mutex);
}
@@ -1084,6 +1194,9 @@ static void ssb_select_mitigation(void)
pr_info("%s\n", ssb_strings[ssb_mode]);
}
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
#undef pr_fmt
#define pr_fmt(fmt) "Speculation prctl: " fmt
@@ -1284,11 +1397,24 @@ static ssize_t l1tf_show_state(char *buf)
l1tf_vmx_states[l1tf_vmx_mitigation],
sched_smt_active() ? "vulnerable" : "disabled");
}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ if (itlb_multihit_kvm_mitigation)
+ return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+ else
+ return sprintf(buf, "KVM: Vulnerable\n");
+}
#else
static ssize_t l1tf_show_state(char *buf)
{
return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ return sprintf(buf, "Processor vulnerable\n");
+}
#endif
static ssize_t mds_show_state(char *buf)
@@ -1308,6 +1434,21 @@ static ssize_t mds_show_state(char *buf)
sched_smt_active() ? "vulnerable" : "disabled");
}
+static ssize_t tsx_async_abort_show_state(char *buf)
+{
+ if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
+ (taa_mitigation == TAA_MITIGATION_OFF))
+ return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sprintf(buf, "%s; SMT Host state unknown\n",
+ taa_strings[taa_mitigation]);
+ }
+
+ return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
static char *stibp_state(void)
{
if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
@@ -1378,6 +1519,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_MDS:
return mds_show_state(buf);
+ case X86_BUG_TAA:
+ return tsx_async_abort_show_state(buf);
+
+ case X86_BUG_ITLB_MULTIHIT:
+ return itlb_multihit_show_state(buf);
+
default:
break;
}
@@ -1414,4 +1561,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu
{
return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
}
+
+ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
+}
+
+ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
+}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index aa9558a3ee1e..1532f3ded735 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -926,13 +926,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#endif
}
-#define NO_SPECULATION BIT(0)
-#define NO_MELTDOWN BIT(1)
-#define NO_SSB BIT(2)
-#define NO_L1TF BIT(3)
-#define NO_MDS BIT(4)
-#define MSBDS_ONLY BIT(5)
-#define NO_SWAPGS BIT(6)
+#define NO_SPECULATION BIT(0)
+#define NO_MELTDOWN BIT(1)
+#define NO_SSB BIT(2)
+#define NO_L1TF BIT(3)
+#define NO_MDS BIT(4)
+#define MSBDS_ONLY BIT(5)
+#define NO_SWAPGS BIT(6)
+#define NO_ITLB_MULTIHIT BIT(7)
#define VULNWL(_vendor, _family, _model, _whitelist) \
{ X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
@@ -953,26 +954,26 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION),
- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION),
- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION),
- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION),
- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION),
-
- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
+ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
VULNWL_INTEL(CORE_YONAH, NO_SSB),
- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
+ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS),
- VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS),
- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS),
+ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
/*
* Technically, swapgs isn't serializing on AMD (despite it previously
@@ -982,15 +983,17 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
* good enough for our purposes.
*/
+ VULNWL_INTEL(ATOM_TREMONT_X, NO_ITLB_MULTIHIT),
+
/* AMD Family 0xf - 0x12 */
- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS),
- VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS),
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
{}
};
@@ -1001,19 +1004,30 @@ static bool __init cpu_matches(unsigned long which)
return m && !!(m->driver_data & which);
}
-static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+u64 x86_read_arch_cap_msr(void)
{
u64 ia32_cap = 0;
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+
+ return ia32_cap;
+}
+
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
+ if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
+
if (cpu_matches(NO_SPECULATION))
return;
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
- if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
-
if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
@@ -1030,6 +1044,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
if (!cpu_matches(NO_SWAPGS))
setup_force_cpu_bug(X86_BUG_SWAPGS);
+ /*
+ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when:
+ * - TSX is supported or
+ * - TSX_CTRL is present
+ *
+ * TSX_CTRL check is needed for cases when TSX could be disabled before
+ * the kernel boot e.g. kexec.
+ * TSX_CTRL check alone is not sufficient for cases when the microcode
+ * update is not present or running as guest that don't get TSX_CTRL.
+ */
+ if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
+ (cpu_has(c, X86_FEATURE_RTM) ||
+ (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
+ setup_force_cpu_bug(X86_BUG_TAA);
+
if (cpu_matches(NO_MELTDOWN))
return;
@@ -1434,6 +1463,7 @@ void __init identify_boot_cpu(void)
enable_sep_cpu();
#endif
cpu_detect_tlb(&boot_cpu_data);
+ tsx_init();
}
void identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index de8929a147b0..924a92f904dd 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -44,6 +44,22 @@ struct _tlb_table {
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
+#ifdef CONFIG_CPU_SUP_INTEL
+enum tsx_ctrl_states {
+ TSX_CTRL_ENABLE,
+ TSX_CTRL_DISABLE,
+ TSX_CTRL_NOT_SUPPORTED,
+};
+
+extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
+
+extern void __init tsx_init(void);
+extern void tsx_enable(void);
+extern void tsx_disable(void);
+#else
+static inline void tsx_init(void) { }
+#endif /* CONFIG_CPU_SUP_INTEL */
+
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
extern void x86_spec_ctrl_setup_ap(void);
@@ -53,4 +69,6 @@ extern int detect_ht_early(struct cpuinfo_x86 *c);
extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c);
void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id);
+extern u64 x86_read_arch_cap_msr(void);
+
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b5b188114520..a6b912a48280 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -709,6 +709,11 @@ static void init_intel(struct cpuinfo_x86 *c)
init_intel_energy_perf(c);
init_intel_misc_features(c);
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
new file mode 100644
index 000000000000..3e20d322bc98
--- /dev/null
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Transactional Synchronization Extensions (TSX) control.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author:
+ * Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <asm/cmdline.h>
+
+#include "cpu.h"
+
+enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
+
+void tsx_disable(void)
+{
+ u64 tsx;
+
+ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Force all transactions to immediately abort */
+ tsx |= TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is not enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * do not waste resources trying TSX transactions that
+ * will always abort.
+ */
+ tsx |= TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+void tsx_enable(void)
+{
+ u64 tsx;
+
+ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Enable the RTM feature in the cpu */
+ tsx &= ~TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * can enumerate and use the TSX feature.
+ */
+ tsx &= ~TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static bool __init tsx_ctrl_is_supported(void)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+ /*
+ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
+}
+
+static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ return TSX_CTRL_DISABLE;
+
+ return TSX_CTRL_ENABLE;
+}
+
+void __init tsx_init(void)
+{
+ char arg[5] = {};
+ int ret;
+
+ if (!tsx_ctrl_is_supported())
+ return;
+
+ ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
+ if (ret >= 0) {
+ if (!strcmp(arg, "on")) {
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ } else if (!strcmp(arg, "off")) {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ } else if (!strcmp(arg, "auto")) {
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+ } else {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ pr_err("tsx: invalid option, defaulting to off\n");
+ }
+ } else {
+ /* tsx= not provided */
+ if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO))
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+ else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF))
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ else
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ }
+
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
+ tsx_disable();
+
+ /*
+ * tsx_disable() will change the state of the
+ * RTM CPUID bit. Clear it here since it is now
+ * expected to be not set.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
+
+ /*
+ * HW defaults TSX to be enabled at bootup.
+ * We may still need the TSX enable support
+ * during init for special cases like
+ * kexec after TSX is disabled.
+ */
+ tsx_enable();
+
+ /*
+ * tsx_enable() will change the state of the
+ * RTM CPUID bit. Force it here since it is now
+ * expected to be set.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RTM);
+ }
+}
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 043716e546ad..4adbf19759a3 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -485,11 +485,18 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* PKU is not yet implemented for shadow paging. */
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
- entry->edx &= kvm_cpuid_7_0_edx_x86_features;
- cpuid_mask(&entry->edx, CPUID_7_EDX);
- /*
- * We emulate ARCH_CAPABILITIES in software even
- * if the host doesn't support it.
+
+ entry->edx &= kvm_cpuid_7_0_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_7_EDX);
+ if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS))
+ entry->edx |= F(SPEC_CTRL);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ entry->edx |= F(INTEL_STIBP);
+ if (boot_cpu_has(X86_FEATURE_SSBD))
+ entry->edx |= F(SPEC_CTRL_SSBD);
+ /*
+ * We emulate ARCH_CAPABILITIES in software even
+ * if the host doesn't support it.
*/
entry->edx |= F(ARCH_CAPABILITIES);
} else {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb8a1045393b..e3180a65ee19 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -40,6 +40,7 @@
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
+#include <linux/kthread.h>
#include <asm/page.h>
#include <asm/pat.h>
@@ -49,6 +50,30 @@
#include <asm/kvm_page_track.h>
#include "trace.h"
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+ .set = set_nx_huge_pages,
+ .get = param_get_bool,
+};
+
+static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+ .set = set_nx_huge_pages_recovery_ratio,
+ .get = param_get_uint,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+ &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+
/*
* When setting this variable to true it enables Two-Dimensional-Paging
* where the hardware walks 2 page tables:
@@ -267,6 +292,11 @@ static inline bool spte_ad_enabled(u64 spte)
return !(spte & shadow_acc_track_value);
}
+static bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
+}
+
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
@@ -1079,6 +1109,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_disallow_lpage(slot, gfn);
}
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (sp->lpage_disallowed)
+ return;
+
+ ++kvm->stat.nx_lpage_splits;
+ list_add_tail(&sp->lpage_disallowed_link,
+ &kvm->arch.lpage_disallowed_mmu_pages);
+ sp->lpage_disallowed = true;
+}
+
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
@@ -1096,6 +1137,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ --kvm->stat.nx_lpage_splits;
+ sp->lpage_disallowed = false;
+ list_del(&sp->lpage_disallowed_link);
+}
+
static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
@@ -2643,6 +2691,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
kvm_reload_remote_mmus(kvm);
}
+ if (sp->lpage_disallowed)
+ unaccount_huge_nx_page(kvm, sp);
+
sp->role.invalid = 1;
return ret;
}
@@ -2808,6 +2859,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (!speculative)
spte |= spte_shadow_accessed_mask(spte);
+ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled()) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
@@ -3021,9 +3077,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+ int level = *levelp;
+ u64 spte = *it.sptep;
+
+ if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+ is_nx_huge_page_enabled() &&
+ is_shadow_present_pte(spte) &&
+ !is_large_pte(spte)) {
+ /*
+ * A small SPTE exists for this pfn, but FNAME(fetch)
+ * and __direct_map would like to create a large PTE
+ * instead: just force them to go down another level,
+ * patching back for them into pfn the next 9 bits of
+ * the address.
+ */
+ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+ *pfnp |= gfn & page_mask;
+ (*levelp)--;
+ }
+}
+
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
int map_writable, int level, kvm_pfn_t pfn,
- bool prefault)
+ bool prefault, bool lpage_disallowed)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
@@ -3036,6 +3115,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
trace_kvm_mmu_spte_requested(gpa, level, pfn);
for_each_shadow_entry(vcpu, gpa, it) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == level)
break;
@@ -3046,6 +3131,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
it.level - 1, true, ACC_ALL);
link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
}
}
@@ -3345,11 +3432,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
{
int r;
int level;
- bool force_pt_level = false;
+ bool force_pt_level;
kvm_pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ force_pt_level = lpage_disallowed;
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
/*
@@ -3383,7 +3473,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
+ r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+ prefault, false);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
@@ -3932,6 +4023,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
bool map_writable;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -3942,8 +4035,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;
- force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
- PT_DIRECTORY_LEVEL);
+ force_pt_level =
+ lpage_disallowed ||
+ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
level = mapping_level(vcpu, gfn, &force_pt_level);
if (likely(!force_pt_level)) {
if (level > PT_DIRECTORY_LEVEL &&
@@ -3972,7 +4066,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+ prefault, lpage_disallowed);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
@@ -5553,10 +5648,54 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
+static void __set_nx_huge_pages(bool val)
+{
+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+ bool old_val = nx_huge_pages;
+ bool new_val;
+
+ /* In "auto" mode deploy workaround only if CPU has the bug. */
+ if (sysfs_streq(val, "off"))
+ new_val = 0;
+ else if (sysfs_streq(val, "force"))
+ new_val = 1;
+ else if (sysfs_streq(val, "auto"))
+ new_val = boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT);
+ else if (strtobool(val, &new_val) < 0)
+ return -EINVAL;
+
+ __set_nx_huge_pages(new_val);
+
+ if (new_val != old_val) {
+ struct kvm *kvm;
+ int idx;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ idx = srcu_read_lock(&kvm->srcu);
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ }
+ mutex_unlock(&kvm_lock);
+ }
+
+ return 0;
+}
+
int kvm_mmu_module_init(void)
{
int ret = -ENOMEM;
+ if (nx_huge_pages == -1)
+ __set_nx_huge_pages(boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT));
+
kvm_mmu_reset_all_pte_masks();
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
@@ -5623,3 +5762,116 @@ void kvm_mmu_module_exit(void)
unregister_shrinker(&mmu_shrinker);
mmu_audit_disable();
}
+
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+{
+ unsigned int old_val;
+ int err;
+
+ old_val = nx_huge_pages_recovery_ratio;
+ err = param_set_uint(val, kp);
+ if (err)
+ return err;
+
+ if (READ_ONCE(nx_huge_pages) &&
+ !old_val && nx_huge_pages_recovery_ratio) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+
+ mutex_unlock(&kvm_lock);
+ }
+
+ return err;
+}
+
+static void kvm_recover_nx_lpages(struct kvm *kvm)
+{
+ int rcu_idx;
+ struct kvm_mmu_page *sp;
+ unsigned int ratio;
+ LIST_HEAD(invalid_list);
+ ulong to_zap;
+
+ rcu_idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+ /*
+ * We use a separate list instead of just using active_mmu_pages
+ * because the number of lpage_disallowed pages is expected to
+ * be relatively small compared to the total.
+ */
+ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+ struct kvm_mmu_page,
+ lpage_disallowed_link);
+ WARN_ON_ONCE(!sp->lpage_disallowed);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ WARN_ON_ONCE(sp->lpage_disallowed);
+
+ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ if (to_zap)
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_lpage_recovery_timeout(u64 start_time)
+{
+ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
+ ? start_time + 60 * HZ - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
+}
+
+static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+ u64 start_time;
+ long remaining_time;
+
+ while (true) {
+ start_time = get_jiffies_64();
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop() && remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ set_current_state(TASK_RUNNING);
+
+ if (kthread_should_stop())
+ return 0;
+
+ kvm_recover_nx_lpages(kvm);
+ }
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ int err;
+
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+ "kvm-nx-lpage-recovery",
+ &kvm->arch.nx_lpage_recovery_thread);
+ if (!err)
+ kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+
+ return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.nx_lpage_recovery_thread)
+ kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index de14d69901cc..076cc8869100 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -193,4 +193,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn);
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6a6625dd62de..3473683a5283 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -589,13 +589,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int write_fault, int hlevel,
- kvm_pfn_t pfn, bool map_writable, bool prefault)
+ kvm_pfn_t pfn, bool map_writable, bool prefault,
+ bool lpage_disallowed)
{
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
int top_level, ret;
- gfn_t base_gfn;
+ gfn_t gfn, base_gfn;
direct_access = gw->pte_access;
@@ -640,13 +641,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(vcpu, it.sptep, sp);
}
- base_gfn = gw->gfn;
+ /*
+ * FNAME(page_fault) might have clobbered the bottom bits of
+ * gw->gfn, restore them from the virtual address.
+ */
+ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+ base_gfn = gfn;
trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
clear_sp_write_flooding_count(it.sptep);
- base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
+
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == hlevel)
break;
@@ -658,6 +671,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
it.level - 1, true, direct_access);
link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
}
}
@@ -734,9 +749,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int r;
kvm_pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
- bool force_pt_level = false;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ bool force_pt_level = lpage_disallowed;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -826,7 +843,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
- level, pfn, map_writable, prefault);
+ level, pfn, map_writable, prefault, lpage_disallowed);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e0ea43713678..ef1aaf96d35a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -705,8 +705,14 @@ static int get_npt_level(struct kvm_vcpu *vcpu)
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
vcpu->arch.efer = efer;
- if (!npt_enabled && !(efer & EFER_LMA))
- efer &= ~EFER_LME;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5c4fef9af0e9..235f157b6c2a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2255,17 +2255,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
- if (!enable_ept) {
- /*
- * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
- * host CPUID is more efficient than testing guest CPUID
- * or CR4. Host SMEP is anyway a requirement for guest SMEP.
- */
- if (boot_cpu_has(X86_FEATURE_SMEP))
- guest_efer |= EFER_NX;
- else if (!(guest_efer & EFER_NX))
- ignore_bits |= EFER_NX;
- }
+ /* Shadow paging assumes NX to be available. */
+ if (!enable_ept)
+ guest_efer |= EFER_NX;
/*
* LMA and LME handled by hardware; SCE meaningless outside long mode.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 20fe7485472f..6721fdf24fe6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -195,6 +195,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages, .mode = 0444) },
+ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
{ "max_mmu_page_hash_collisions",
VM_STAT(max_mmu_page_hash_collisions) },
{ NULL }
@@ -1082,8 +1083,44 @@ u64 kvm_get_arch_capabilities(void)
if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ data |= ARCH_CAP_RDCL_NO;
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ data |= ARCH_CAP_SSB_NO;
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ data |= ARCH_CAP_MDS_NO;
+
+ /*
+ * On TAA affected systems, export MDS_NO=0 when:
+ * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
+ * - Updated microcode is present. This is detected by
+ * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
+ * that VERW clears CPU buffers.
+ *
+ * When MDS_NO=0 is exported, guests deploy clear CPU buffer
+ * mitigation and don't complain:
+ *
+ * "Vulnerable: Clear CPU buffers attempted, no microcode"
+ *
+ * If TSX is disabled on the system, guests are also mitigated against
+ * TAA and clear CPU buffer mitigation is not required for guests.
+ */
+ if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) &&
+ (data & ARCH_CAP_TSX_CTRL_MSR))
+ data &= ~ARCH_CAP_MDS_NO;
+
+
+ /*
+ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+ * the nested hypervisor runs with NX huge pages. If it is not,
+ * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
+ * L1 guests, so it need not worry about its own (L2) guests.
+ */
+ data |= ARCH_CAP_PSCHANGE_MC_NO;
+
return data;
}
+
EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
@@ -8425,6 +8462,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
@@ -8454,6 +8492,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return 0;
}
+int kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return kvm_mmu_post_init_vm(kvm);
+}
+
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
int r;
@@ -8557,6 +8600,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
}
EXPORT_SYMBOL_GPL(x86_set_memory_region);
+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+ kvm_mmu_pre_destroy_vm(kvm);
+}
+
void kvm_arch_destroy_vm(struct kvm *kvm)
{
if (current->mm == kvm->mm) {
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 32b52e6bd13b..0272f66db5ac 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -539,12 +539,27 @@ ssize_t __weak cpu_show_mds(struct device *dev,
return sprintf(buf, "Not affected\n");
}
+ssize_t __weak cpu_show_tsx_async_abort(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "Not affected\n");
+}
+
+ssize_t __weak cpu_show_itlb_multihit(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "Not affected\n");
+}
+
static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);
+static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL);
+static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL);
static struct attribute *cpu_root_vulnerabilities_attrs[] = {
&dev_attr_meltdown.attr,
@@ -553,6 +568,8 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
&dev_attr_spec_store_bypass.attr,
&dev_attr_l1tf.attr,
&dev_attr_mds.attr,
+ &dev_attr_tsx_async_abort.attr,
+ &dev_attr_itlb_multihit.attr,
NULL
};
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 95478db9998b..5c2ae816ac32 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -51,13 +51,11 @@
* granting userspace undue privileges. There are three categories of privilege.
*
* First, commands which are explicitly defined as privileged or which should
- * only be used by the kernel driver. The parser generally rejects such
- * commands, though it may allow some from the drm master process.
+ * only be used by the kernel driver. The parser rejects such commands
*
* Second, commands which access registers. To support correct/enhanced
* userspace functionality, particularly certain OpenGL extensions, the parser
- * provides a whitelist of registers which userspace may safely access (for both
- * normal and drm master processes).
+ * provides a whitelist of registers which userspace may safely access
*
* Third, commands which access privileged memory (i.e. GGTT, HWS page, etc).
* The parser always rejects such commands.
@@ -82,9 +80,9 @@
* in the per-engine command tables.
*
* Other command table entries map fairly directly to high level categories
- * mentioned above: rejected, master-only, register whitelist. The parser
- * implements a number of checks, including the privileged memory checks, via a
- * general bitmasking mechanism.
+ * mentioned above: rejected, register whitelist. The parser implements a number
+ * of checks, including the privileged memory checks, via a general bitmasking
+ * mechanism.
*/
/*
@@ -102,8 +100,6 @@ struct drm_i915_cmd_descriptor {
* CMD_DESC_REJECT: The command is never allowed
* CMD_DESC_REGISTER: The command should be checked against the
* register whitelist for the appropriate ring
- * CMD_DESC_MASTER: The command is allowed if the submitting process
- * is the DRM master
*/
u32 flags;
#define CMD_DESC_FIXED (1<<0)
@@ -111,7 +107,6 @@ struct drm_i915_cmd_descriptor {
#define CMD_DESC_REJECT (1<<2)
#define CMD_DESC_REGISTER (1<<3)
#define CMD_DESC_BITMASK (1<<4)
-#define CMD_DESC_MASTER (1<<5)
/*
* The command's unique identification bits and the bitmask to get them.
@@ -192,7 +187,7 @@ struct drm_i915_cmd_table {
#define CMD(op, opm, f, lm, fl, ...) \
{ \
.flags = (fl) | ((f) ? CMD_DESC_FIXED : 0), \
- .cmd = { (op), ~0u << (opm) }, \
+ .cmd = { (op & ~0u << (opm)), ~0u << (opm) }, \
.length = { (lm) }, \
__VA_ARGS__ \
}
@@ -207,14 +202,13 @@ struct drm_i915_cmd_table {
#define R CMD_DESC_REJECT
#define W CMD_DESC_REGISTER
#define B CMD_DESC_BITMASK
-#define M CMD_DESC_MASTER
/* Command Mask Fixed Len Action
---------------------------------------------------------- */
-static const struct drm_i915_cmd_descriptor common_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_common_cmds[] = {
CMD( MI_NOOP, SMI, F, 1, S ),
CMD( MI_USER_INTERRUPT, SMI, F, 1, R ),
- CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, M ),
+ CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, R ),
CMD( MI_ARB_CHECK, SMI, F, 1, S ),
CMD( MI_REPORT_HEAD, SMI, F, 1, S ),
CMD( MI_SUSPEND_FLUSH, SMI, F, 1, S ),
@@ -244,7 +238,7 @@ static const struct drm_i915_cmd_descriptor common_cmds[] = {
CMD( MI_BATCH_BUFFER_START, SMI, !F, 0xFF, S ),
};
-static const struct drm_i915_cmd_descriptor render_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_render_cmds[] = {
CMD( MI_FLUSH, SMI, F, 1, S ),
CMD( MI_ARB_ON_OFF, SMI, F, 1, R ),
CMD( MI_PREDICATE, SMI, F, 1, S ),
@@ -311,7 +305,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = {
CMD( MI_URB_ATOMIC_ALLOC, SMI, F, 1, S ),
CMD( MI_SET_APPID, SMI, F, 1, S ),
CMD( MI_RS_CONTEXT, SMI, F, 1, S ),
- CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, M ),
+ CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, R ),
CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ),
CMD( MI_LOAD_REGISTER_REG, SMI, !F, 0xFF, W,
.reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 } ),
@@ -328,7 +322,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = {
CMD( GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS, S3D, !F, 0x1FF, S ),
};
-static const struct drm_i915_cmd_descriptor video_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_video_cmds[] = {
CMD( MI_ARB_ON_OFF, SMI, F, 1, R ),
CMD( MI_SET_APPID, SMI, F, 1, S ),
CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B,
@@ -372,7 +366,7 @@ static const struct drm_i915_cmd_descriptor video_cmds[] = {
CMD( MFX_WAIT, SMFX, F, 1, S ),
};
-static const struct drm_i915_cmd_descriptor vecs_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_vecs_cmds[] = {
CMD( MI_ARB_ON_OFF, SMI, F, 1, R ),
CMD( MI_SET_APPID, SMI, F, 1, S ),
CMD( MI_STORE_DWORD_IMM, SMI, !F, 0xFF, B,
@@ -410,7 +404,7 @@ static const struct drm_i915_cmd_descriptor vecs_cmds[] = {
}}, ),
};
-static const struct drm_i915_cmd_descriptor blt_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_blt_cmds[] = {
CMD( MI_DISPLAY_FLIP, SMI, !F, 0xFF, R ),
CMD( MI_STORE_DWORD_IMM, SMI, !F, 0x3FF, B,
.bits = {{
@@ -444,10 +438,64 @@ static const struct drm_i915_cmd_descriptor blt_cmds[] = {
};
static const struct drm_i915_cmd_descriptor hsw_blt_cmds[] = {
- CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, M ),
+ CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, R ),
CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, R ),
};
+/*
+ * For Gen9 we can still rely on the h/w to enforce cmd security, and only
+ * need to re-enforce the register access checks. We therefore only need to
+ * teach the cmdparser how to find the end of each command, and identify
+ * register accesses. The table doesn't need to reject any commands, and so
+ * the only commands listed here are:
+ * 1) Those that touch registers
+ * 2) Those that do not have the default 8-bit length
+ *
+ * Note that the default MI length mask chosen for this table is 0xFF, not
+ * the 0x3F used on older devices. This is because the vast majority of MI
+ * cmds on Gen9 use a standard 8-bit Length field.
+ * All the Gen9 blitter instructions are standard 0xFF length mask, and
+ * none allow access to non-general registers, so in fact no BLT cmds are
+ * included in the table at all.
+ *
+ */
+static const struct drm_i915_cmd_descriptor gen9_blt_cmds[] = {
+ CMD( MI_NOOP, SMI, F, 1, S ),
+ CMD( MI_USER_INTERRUPT, SMI, F, 1, S ),
+ CMD( MI_WAIT_FOR_EVENT, SMI, F, 1, S ),
+ CMD( MI_FLUSH, SMI, F, 1, S ),
+ CMD( MI_ARB_CHECK, SMI, F, 1, S ),
+ CMD( MI_REPORT_HEAD, SMI, F, 1, S ),
+ CMD( MI_ARB_ON_OFF, SMI, F, 1, S ),
+ CMD( MI_SUSPEND_FLUSH, SMI, F, 1, S ),
+ CMD( MI_LOAD_SCAN_LINES_INCL, SMI, !F, 0x3F, S ),
+ CMD( MI_LOAD_SCAN_LINES_EXCL, SMI, !F, 0x3F, S ),
+ CMD( MI_STORE_DWORD_IMM, SMI, !F, 0x3FF, S ),
+ CMD( MI_LOAD_REGISTER_IMM(1), SMI, !F, 0xFF, W,
+ .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 2 } ),
+ CMD( MI_UPDATE_GTT, SMI, !F, 0x3FF, S ),
+ CMD( MI_STORE_REGISTER_MEM_GEN8, SMI, F, 4, W,
+ .reg = { .offset = 1, .mask = 0x007FFFFC } ),
+ CMD( MI_FLUSH_DW, SMI, !F, 0x3F, S ),
+ CMD( MI_LOAD_REGISTER_MEM_GEN8, SMI, F, 4, W,
+ .reg = { .offset = 1, .mask = 0x007FFFFC } ),
+ CMD( MI_LOAD_REGISTER_REG, SMI, !F, 0xFF, W,
+ .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 } ),
+
+ /*
+ * We allow BB_START but apply further checks. We just sanitize the
+ * basic fields here.
+ */
+#define MI_BB_START_OPERAND_MASK GENMASK(SMI-1, 0)
+#define MI_BB_START_OPERAND_EXPECT (MI_BATCH_PPGTT_HSW | 1)
+ CMD( MI_BATCH_BUFFER_START_GEN8, SMI, !F, 0xFF, B,
+ .bits = {{
+ .offset = 0,
+ .mask = MI_BB_START_OPERAND_MASK,
+ .expected = MI_BB_START_OPERAND_EXPECT,
+ }}, ),
+};
+
static const struct drm_i915_cmd_descriptor noop_desc =
CMD(MI_NOOP, SMI, F, 1, S);
@@ -461,40 +509,44 @@ static const struct drm_i915_cmd_descriptor noop_desc =
#undef R
#undef W
#undef B
-#undef M
-static const struct drm_i915_cmd_table gen7_render_cmds[] = {
- { common_cmds, ARRAY_SIZE(common_cmds) },
- { render_cmds, ARRAY_SIZE(render_cmds) },
+static const struct drm_i915_cmd_table gen7_render_cmd_table[] = {
+ { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+ { gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) },
};
-static const struct drm_i915_cmd_table hsw_render_ring_cmds[] = {
- { common_cmds, ARRAY_SIZE(common_cmds) },
- { render_cmds, ARRAY_SIZE(render_cmds) },
+static const struct drm_i915_cmd_table hsw_render_ring_cmd_table[] = {
+ { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+ { gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) },
{ hsw_render_cmds, ARRAY_SIZE(hsw_render_cmds) },
};
-static const struct drm_i915_cmd_table gen7_video_cmds[] = {
- { common_cmds, ARRAY_SIZE(common_cmds) },
- { video_cmds, ARRAY_SIZE(video_cmds) },
+static const struct drm_i915_cmd_table gen7_video_cmd_table[] = {
+ { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+ { gen7_video_cmds, ARRAY_SIZE(gen7_video_cmds) },
};
-static const struct drm_i915_cmd_table hsw_vebox_cmds[] = {
- { common_cmds, ARRAY_SIZE(common_cmds) },
- { vecs_cmds, ARRAY_SIZE(vecs_cmds) },
+static const struct drm_i915_cmd_table hsw_vebox_cmd_table[] = {
+ { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+ { gen7_vecs_cmds, ARRAY_SIZE(gen7_vecs_cmds) },
};
-static const struct drm_i915_cmd_table gen7_blt_cmds[] = {
- { common_cmds, ARRAY_SIZE(common_cmds) },
- { blt_cmds, ARRAY_SIZE(blt_cmds) },
+static const struct drm_i915_cmd_table gen7_blt_cmd_table[] = {
+ { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+ { gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) },
};
-static const struct drm_i915_cmd_table hsw_blt_ring_cmds[] = {
- { common_cmds, ARRAY_SIZE(common_cmds) },
- { blt_cmds, ARRAY_SIZE(blt_cmds) },
+static const struct drm_i915_cmd_table hsw_blt_ring_cmd_table[] = {
+ { gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+ { gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) },
{ hsw_blt_cmds, ARRAY_SIZE(hsw_blt_cmds) },
};
+static const struct drm_i915_cmd_table gen9_blt_cmd_table[] = {
+ { gen9_blt_cmds, ARRAY_SIZE(gen9_blt_cmds) },
+};
+
+
/*
* Register whitelists, sorted by increasing register offset.
*/
@@ -610,17 +662,27 @@ static const struct drm_i915_reg_descriptor gen7_blt_regs[] = {
REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE),
};
-static const struct drm_i915_reg_descriptor ivb_master_regs[] = {
- REG32(FORCEWAKE_MT),
- REG32(DERRMR),
- REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_A)),
- REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_B)),
- REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_C)),
-};
-
-static const struct drm_i915_reg_descriptor hsw_master_regs[] = {
- REG32(FORCEWAKE_MT),
- REG32(DERRMR),
+static const struct drm_i915_reg_descriptor gen9_blt_regs[] = {
+ REG64_IDX(RING_TIMESTAMP, RENDER_RING_BASE),
+ REG64_IDX(RING_TIMESTAMP, BSD_RING_BASE),
+ REG32(BCS_SWCTRL),
+ REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE),
+ REG64_IDX(BCS_GPR, 0),
+ REG64_IDX(BCS_GPR, 1),
+ REG64_IDX(BCS_GPR, 2),
+ REG64_IDX(BCS_GPR, 3),
+ REG64_IDX(BCS_GPR, 4),
+ REG64_IDX(BCS_GPR, 5),
+ REG64_IDX(BCS_GPR, 6),
+ REG64_IDX(BCS_GPR, 7),
+ REG64_IDX(BCS_GPR, 8),
+ REG64_IDX(BCS_GPR, 9),
+ REG64_IDX(BCS_GPR, 10),
+ REG64_IDX(BCS_GPR, 11),
+ REG64_IDX(BCS_GPR, 12),
+ REG64_IDX(BCS_GPR, 13),
+ REG64_IDX(BCS_GPR, 14),
+ REG64_IDX(BCS_GPR, 15),
};
#undef REG64
@@ -629,28 +691,27 @@ static const struct drm_i915_reg_descriptor hsw_master_regs[] = {
struct drm_i915_reg_table {
const struct drm_i915_reg_descriptor *regs;
int num_regs;
- bool master;
};
static const struct drm_i915_reg_table ivb_render_reg_tables[] = {
- { gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false },
- { ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true },
+ { gen7_render_regs, ARRAY_SIZE(gen7_render_regs) },
};
static const struct drm_i915_reg_table ivb_blt_reg_tables[] = {
- { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false },
- { ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true },
+ { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) },
};
static const struct drm_i915_reg_table hsw_render_reg_tables[] = {
- { gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false },
- { hsw_render_regs, ARRAY_SIZE(hsw_render_regs), false },
- { hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true },
+ { gen7_render_regs, ARRAY_SIZE(gen7_render_regs) },
+ { hsw_render_regs, ARRAY_SIZE(hsw_render_regs) },
};
static const struct drm_i915_reg_table hsw_blt_reg_tables[] = {
- { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false },
- { hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true },
+ { gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) },
+};
+
+static const struct drm_i915_reg_table gen9_blt_reg_tables[] = {
+ { gen9_blt_regs, ARRAY_SIZE(gen9_blt_regs) },
};
static u32 gen7_render_get_cmd_length_mask(u32 cmd_header)
@@ -708,6 +769,17 @@ static u32 gen7_blt_get_cmd_length_mask(u32 cmd_header)
return 0;
}
+static u32 gen9_blt_get_cmd_length_mask(u32 cmd_header)
+{
+ u32 client = cmd_header >> INSTR_CLIENT_SHIFT;
+
+ if (client == INSTR_MI_CLIENT || client == INSTR_BC_CLIENT)
+ return 0xFF;
+
+ DRM_DEBUG_DRIVER("CMD: Abnormal blt cmd length! 0x%08X\n", cmd_header);
+ return 0;
+}
+
static bool validate_cmds_sorted(const struct intel_engine_cs *engine,
const struct drm_i915_cmd_table *cmd_tables,
int cmd_table_count)
@@ -865,18 +937,19 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
int cmd_table_count;
int ret;
- if (!IS_GEN7(engine->i915))
+ if (!IS_GEN7(engine->i915) && !(IS_GEN9(engine->i915) &&
+ engine->id == BCS))
return;
switch (engine->id) {
case RCS:
if (IS_HASWELL(engine->i915)) {
- cmd_tables = hsw_render_ring_cmds;
+ cmd_tables = hsw_render_ring_cmd_table;
cmd_table_count =
- ARRAY_SIZE(hsw_render_ring_cmds);
+ ARRAY_SIZE(hsw_render_ring_cmd_table);
} else {
- cmd_tables = gen7_render_cmds;
- cmd_table_count = ARRAY_SIZE(gen7_render_cmds);
+ cmd_tables = gen7_render_cmd_table;
+ cmd_table_count = ARRAY_SIZE(gen7_render_cmd_table);
}
if (IS_HASWELL(engine->i915)) {
@@ -886,36 +959,46 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
engine->reg_tables = ivb_render_reg_tables;
engine->reg_table_count = ARRAY_SIZE(ivb_render_reg_tables);
}
-
engine->get_cmd_length_mask = gen7_render_get_cmd_length_mask;
break;
case VCS:
- cmd_tables = gen7_video_cmds;
- cmd_table_count = ARRAY_SIZE(gen7_video_cmds);
+ cmd_tables = gen7_video_cmd_table;
+ cmd_table_count = ARRAY_SIZE(gen7_video_cmd_table);
engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
break;
case BCS:
- if (IS_HASWELL(engine->i915)) {
- cmd_tables = hsw_blt_ring_cmds;
- cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds);
+ engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
+ if (IS_GEN9(engine->i915)) {
+ cmd_tables = gen9_blt_cmd_table;
+ cmd_table_count = ARRAY_SIZE(gen9_blt_cmd_table);
+ engine->get_cmd_length_mask =
+ gen9_blt_get_cmd_length_mask;
+
+ /* BCS Engine unsafe without parser */
+ engine->flags |= I915_ENGINE_REQUIRES_CMD_PARSER;
+ } else if (IS_HASWELL(engine->i915)) {
+ cmd_tables = hsw_blt_ring_cmd_table;
+ cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmd_table);
} else {
- cmd_tables = gen7_blt_cmds;
- cmd_table_count = ARRAY_SIZE(gen7_blt_cmds);
+ cmd_tables = gen7_blt_cmd_table;
+ cmd_table_count = ARRAY_SIZE(gen7_blt_cmd_table);
}
- if (IS_HASWELL(engine->i915)) {
+ if (IS_GEN9(engine->i915)) {
+ engine->reg_tables = gen9_blt_reg_tables;
+ engine->reg_table_count =
+ ARRAY_SIZE(gen9_blt_reg_tables);
+ } else if (IS_HASWELL(engine->i915)) {
engine->reg_tables = hsw_blt_reg_tables;
engine->reg_table_count = ARRAY_SIZE(hsw_blt_reg_tables);
} else {
engine->reg_tables = ivb_blt_reg_tables;
engine->reg_table_count = ARRAY_SIZE(ivb_blt_reg_tables);
}
-
- engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
break;
case VECS:
- cmd_tables = hsw_vebox_cmds;
- cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds);
+ cmd_tables = hsw_vebox_cmd_table;
+ cmd_table_count = ARRAY_SIZE(hsw_vebox_cmd_table);
/* VECS can use the same length_mask function as VCS */
engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
break;
@@ -941,7 +1024,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
return;
}
- engine->flags |= I915_ENGINE_NEEDS_CMD_PARSER;
+ engine->flags |= I915_ENGINE_USING_CMD_PARSER;
}
/**
@@ -953,7 +1036,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
*/
void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine)
{
- if (!intel_engine_needs_cmd_parser(engine))
+ if (!intel_engine_using_cmd_parser(engine))
return;
fini_hash_table(engine);
@@ -1027,22 +1110,16 @@ __find_reg(const struct drm_i915_reg_descriptor *table, int count, u32 addr)
}
static const struct drm_i915_reg_descriptor *
-find_reg(const struct intel_engine_cs *engine, bool is_master, u32 addr)
+find_reg(const struct intel_engine_cs *engine, u32 addr)
{
const struct drm_i915_reg_table *table = engine->reg_tables;
+ const struct drm_i915_reg_descriptor *reg = NULL;
int count = engine->reg_table_count;
- for (; count > 0; ++table, --count) {
- if (!table->master || is_master) {
- const struct drm_i915_reg_descriptor *reg;
+ for (; !reg && (count > 0); ++table, --count)
+ reg = __find_reg(table->regs, table->num_regs, addr);
- reg = __find_reg(table->regs, table->num_regs, addr);
- if (reg != NULL)
- return reg;
- }
- }
-
- return NULL;
+ return reg;
}
/* Returns a vmap'd pointer to dst_obj, which the caller must unmap */
@@ -1127,8 +1204,7 @@ unpin_src:
static bool check_cmd(const struct intel_engine_cs *engine,
const struct drm_i915_cmd_descriptor *desc,
- const u32 *cmd, u32 length,
- const bool is_master)
+ const u32 *cmd, u32 length)
{
if (desc->flags & CMD_DESC_SKIP)
return true;
@@ -1138,12 +1214,6 @@ static bool check_cmd(const struct intel_engine_cs *engine,
return false;
}
- if ((desc->flags & CMD_DESC_MASTER) && !is_master) {
- DRM_DEBUG_DRIVER("CMD: Rejected master-only command: 0x%08X\n",
- *cmd);
- return false;
- }
-
if (desc->flags & CMD_DESC_REGISTER) {
/*
* Get the distance between individual register offset
@@ -1157,7 +1227,7 @@ static bool check_cmd(const struct intel_engine_cs *engine,
offset += step) {
const u32 reg_addr = cmd[offset] & desc->reg.mask;
const struct drm_i915_reg_descriptor *reg =
- find_reg(engine, is_master, reg_addr);
+ find_reg(engine, reg_addr);
if (!reg) {
DRM_DEBUG_DRIVER("CMD: Rejected register 0x%08X in command: 0x%08X (%s)\n",
@@ -1235,16 +1305,113 @@ static bool check_cmd(const struct intel_engine_cs *engine,
return true;
}
+static int check_bbstart(const struct i915_gem_context *ctx,
+ u32 *cmd, u32 offset, u32 length,
+ u32 batch_len,
+ u64 batch_start,
+ u64 shadow_batch_start)
+{
+ u64 jump_offset, jump_target;
+ u32 target_cmd_offset, target_cmd_index;
+
+ /* For igt compatibility on older platforms */
+ if (CMDPARSER_USES_GGTT(ctx->i915)) {
+ DRM_DEBUG("CMD: Rejecting BB_START for ggtt based submission\n");
+ return -EACCES;
+ }
+
+ if (length != 3) {
+ DRM_DEBUG("CMD: Recursive BB_START with bad length(%u)\n",
+ length);
+ return -EINVAL;
+ }
+
+ jump_target = *(u64*)(cmd+1);
+ jump_offset = jump_target - batch_start;
+
+ /*
+ * Any underflow of jump_target is guaranteed to be outside the range
+ * of a u32, so >= test catches both too large and too small
+ */
+ if (jump_offset >= batch_len) {
+ DRM_DEBUG("CMD: BB_START to 0x%llx jumps out of BB\n",
+ jump_target);
+ return -EINVAL;
+ }
+
+ /*
+ * This cannot overflow a u32 because we already checked jump_offset
+ * is within the BB, and the batch_len is a u32
+ */
+ target_cmd_offset = lower_32_bits(jump_offset);
+ target_cmd_index = target_cmd_offset / sizeof(u32);
+
+ *(u64*)(cmd + 1) = shadow_batch_start + target_cmd_offset;
+
+ if (target_cmd_index == offset)
+ return 0;
+
+ if (ctx->jump_whitelist_cmds <= target_cmd_index) {
+ DRM_DEBUG("CMD: Rejecting BB_START - truncated whitelist array\n");
+ return -EINVAL;
+ } else if (!test_bit(target_cmd_index, ctx->jump_whitelist)) {
+ DRM_DEBUG("CMD: BB_START to 0x%llx not a previously executed cmd\n",
+ jump_target);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void init_whitelist(struct i915_gem_context *ctx, u32 batch_len)
+{
+ const u32 batch_cmds = DIV_ROUND_UP(batch_len, sizeof(u32));
+ const u32 exact_size = BITS_TO_LONGS(batch_cmds);
+ u32 next_size = BITS_TO_LONGS(roundup_pow_of_two(batch_cmds));
+ unsigned long *next_whitelist;
+
+ if (CMDPARSER_USES_GGTT(ctx->i915))
+ return;
+
+ if (batch_cmds <= ctx->jump_whitelist_cmds) {
+ memset(ctx->jump_whitelist, 0, exact_size * sizeof(u32));
+ return;
+ }
+
+again:
+ next_whitelist = kcalloc(next_size, sizeof(long), GFP_KERNEL);
+ if (next_whitelist) {
+ kfree(ctx->jump_whitelist);
+ ctx->jump_whitelist = next_whitelist;
+ ctx->jump_whitelist_cmds =
+ next_size * BITS_PER_BYTE * sizeof(long);
+ return;
+ }
+
+ if (next_size > exact_size) {
+ next_size = exact_size;
+ goto again;
+ }
+
+ DRM_DEBUG("CMD: Failed to extend whitelist. BB_START may be disallowed\n");
+ memset(ctx->jump_whitelist, 0,
+ BITS_TO_LONGS(ctx->jump_whitelist_cmds) * sizeof(u32));
+
+ return;
+}
+
#define LENGTH_BIAS 2
/**
* i915_parse_cmds() - parse a submitted batch buffer for privilege violations
+ * @ctx: the context in which the batch is to execute
* @engine: the engine on which the batch is to execute
* @batch_obj: the batch buffer in question
- * @shadow_batch_obj: copy of the batch buffer in question
+ * @batch_start: Canonical base address of batch
* @batch_start_offset: byte offset in the batch at which execution starts
* @batch_len: length of the commands in batch_obj
- * @is_master: is the submitting process the drm master?
+ * @shadow_batch_obj: copy of the batch buffer in question
+ * @shadow_batch_start: Canonical base address of shadow_batch_obj
*
* Parses the specified batch buffer looking for privilege violations as
* described in the overview.
@@ -1252,14 +1419,17 @@ static bool check_cmd(const struct intel_engine_cs *engine,
* Return: non-zero if the parser finds violations or otherwise fails; -EACCES
* if the batch appears legal but should use hardware parsing
*/
-int intel_engine_cmd_parser(struct intel_engine_cs *engine,
+
+int intel_engine_cmd_parser(struct i915_gem_context *ctx,
+ struct intel_engine_cs *engine,
struct drm_i915_gem_object *batch_obj,
- struct drm_i915_gem_object *shadow_batch_obj,
+ u64 batch_start,
u32 batch_start_offset,
u32 batch_len,
- bool is_master)
+ struct drm_i915_gem_object *shadow_batch_obj,
+ u64 shadow_batch_start)
{
- u32 *cmd, *batch_end;
+ u32 *cmd, *batch_end, offset = 0;
struct drm_i915_cmd_descriptor default_desc = noop_desc;
const struct drm_i915_cmd_descriptor *desc = &default_desc;
bool needs_clflush_after = false;
@@ -1273,6 +1443,8 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
return PTR_ERR(cmd);
}
+ init_whitelist(ctx, batch_len);
+
/*
* We use the batch length as size because the shadow object is as
* large or larger and copy_batch() will write MI_NOPs to the extra
@@ -1282,31 +1454,15 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
do {
u32 length;
- if (*cmd == MI_BATCH_BUFFER_END) {
- if (needs_clflush_after) {
- void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping);
- drm_clflush_virt_range(ptr,
- (void *)(cmd + 1) - ptr);
- }
+ if (*cmd == MI_BATCH_BUFFER_END)
break;
- }
desc = find_cmd(engine, *cmd, desc, &default_desc);
if (!desc) {
DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
*cmd);
ret = -EINVAL;
- break;
- }
-
- /*
- * If the batch buffer contains a chained batch, return an
- * error that tells the caller to abort and dispatch the
- * workload as a non-secure batch.
- */
- if (desc->cmd.value == MI_BATCH_BUFFER_START) {
- ret = -EACCES;
- break;
+ goto err;
}
if (desc->flags & CMD_DESC_FIXED)
@@ -1320,22 +1476,43 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
length,
batch_end - cmd);
ret = -EINVAL;
- break;
+ goto err;
}
- if (!check_cmd(engine, desc, cmd, length, is_master)) {
+ if (!check_cmd(engine, desc, cmd, length)) {
ret = -EACCES;
+ goto err;
+ }
+
+ if (desc->cmd.value == MI_BATCH_BUFFER_START) {
+ ret = check_bbstart(ctx, cmd, offset, length,
+ batch_len, batch_start,
+ shadow_batch_start);
+
+ if (ret)
+ goto err;
break;
}
+ if (ctx->jump_whitelist_cmds > offset)
+ set_bit(offset, ctx->jump_whitelist);
+
cmd += length;
+ offset += length;
if (cmd >= batch_end) {
DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
ret = -EINVAL;
- break;
+ goto err;
}
} while (1);
+ if (needs_clflush_after) {
+ void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping);
+
+ drm_clflush_virt_range(ptr, (void *)(cmd + 1) - ptr);
+ }
+
+err:
i915_gem_object_unpin_map(shadow_batch_obj);
return ret;
}
@@ -1357,7 +1534,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv)
/* If the command parser is not enabled, report 0 - unsupported */
for_each_engine(engine, dev_priv, id) {
- if (intel_engine_needs_cmd_parser(engine)) {
+ if (intel_engine_using_cmd_parser(engine)) {
active = true;
break;
}
@@ -1382,6 +1559,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv)
* the parser enabled.
* 9. Don't whitelist or handle oacontrol specially, as ownership
* for oacontrol state is moving to i915-perf.
+ * 10. Support for Gen9 BCS Parsing
*/
- return 9;
+ return 10;
}
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 0c1702b1ed34..c3e98d1f05a5 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -356,7 +356,7 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data,
value = HAS_LEGACY_SEMAPHORES(dev_priv);
break;
case I915_PARAM_HAS_SECURE_BATCHES:
- value = capable(CAP_SYS_ADMIN);
+ value = HAS_SECURE_BATCHES(dev_priv) && capable(CAP_SYS_ADMIN);
break;
case I915_PARAM_CMD_PARSER_VERSION:
value = i915_cmd_parser_get_version(dev_priv);
@@ -1658,6 +1658,7 @@ static int i915_drm_suspend_late(struct drm_device *dev, bool hibernation)
i915_gem_suspend_late(dev_priv);
intel_display_set_init_power(dev_priv, false);
+ i915_rc6_ctx_wa_suspend(dev_priv);
intel_uncore_suspend(dev_priv);
/*
@@ -1884,6 +1885,8 @@ static int i915_drm_resume_early(struct drm_device *dev)
else
intel_display_set_init_power(dev_priv, true);
+ i915_rc6_ctx_wa_resume(dev_priv);
+
intel_engines_sanitize(dev_priv);
enable_rpm_wakeref_asserts(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a8b6bc8f85a2..b7f23c3fe5b9 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -801,6 +801,7 @@ struct intel_rps {
struct intel_rc6 {
bool enabled;
+ bool ctx_corrupted;
u64 prev_hw_residency[4];
u64 cur_residency[4];
};
@@ -2587,6 +2588,12 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
#define IS_GEN9_LP(dev_priv) (IS_GEN9(dev_priv) && IS_LP(dev_priv))
#define IS_GEN9_BC(dev_priv) (IS_GEN9(dev_priv) && !IS_LP(dev_priv))
+/*
+ * The Gen7 cmdparser copies the scanned buffer to the ggtt for execution
+ * All later gens can run the final buffer from the ppgtt
+ */
+#define CMDPARSER_USES_GGTT(dev_priv) IS_GEN7(dev_priv)
+
#define ENGINE_MASK(id) BIT(id)
#define RENDER_RING ENGINE_MASK(RCS)
#define BSD_RING ENGINE_MASK(VCS)
@@ -2608,6 +2615,8 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
#define HAS_LEGACY_SEMAPHORES(dev_priv) IS_GEN7(dev_priv)
+#define HAS_SECURE_BATCHES(dev_priv) (INTEL_GEN(dev_priv) < 6)
+
#define HAS_LLC(dev_priv) ((dev_priv)->info.has_llc)
#define HAS_SNOOP(dev_priv) ((dev_priv)->info.has_snoop)
#define HAS_EDRAM(dev_priv) (!!((dev_priv)->edram_cap & EDRAM_ENABLED))
@@ -2640,10 +2649,12 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
/* Early gen2 have a totally busted CS tlb and require pinned batches. */
#define HAS_BROKEN_CS_TLB(dev_priv) (IS_I830(dev_priv) || IS_I845G(dev_priv))
+#define NEEDS_RC6_CTX_CORRUPTION_WA(dev_priv) \
+ (IS_BROADWELL(dev_priv) || INTEL_GEN(dev_priv) == 9)
+
/* WaRsDisableCoarsePowerGating:skl,cnl */
#define NEEDS_WaRsDisableCoarsePowerGating(dev_priv) \
- (IS_CANNONLAKE(dev_priv) || \
- IS_SKL_GT3(dev_priv) || IS_SKL_GT4(dev_priv))
+ (IS_CANNONLAKE(dev_priv) || INTEL_GEN(dev_priv) == 9)
#define HAS_GMBUS_IRQ(dev_priv) (INTEL_GEN(dev_priv) >= 4)
#define HAS_GMBUS_BURST_READ(dev_priv) (INTEL_GEN(dev_priv) >= 10 || \
@@ -3036,6 +3047,14 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
u64 alignment,
u64 flags);
+struct i915_vma * __must_check
+i915_gem_object_pin(struct drm_i915_gem_object *obj,
+ struct i915_address_space *vm,
+ const struct i915_ggtt_view *view,
+ u64 size,
+ u64 alignment,
+ u64 flags);
+
int i915_gem_object_unbind(struct drm_i915_gem_object *obj);
void i915_gem_release_mmap(struct drm_i915_gem_object *obj);
@@ -3429,12 +3448,14 @@ const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv);
void intel_engine_init_cmd_parser(struct intel_engine_cs *engine);
void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine);
-int intel_engine_cmd_parser(struct intel_engine_cs *engine,
+int intel_engine_cmd_parser(struct i915_gem_context *cxt,
+ struct intel_engine_cs *engine,
struct drm_i915_gem_object *batch_obj,
- struct drm_i915_gem_object *shadow_batch_obj,
+ u64 user_batch_start,
u32 batch_start_offset,
u32 batch_len,
- bool is_master);
+ struct drm_i915_gem_object *shadow_batch_obj,
+ u64 shadow_batch_start);
/* i915_perf.c */
extern void i915_perf_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 0f8979859d66..cbbc33fb482f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -174,6 +174,11 @@ static u32 __i915_gem_park(struct drm_i915_private *i915)
if (INTEL_GEN(i915) >= 6)
gen6_rps_idle(i915);
+ if (NEEDS_RC6_CTX_CORRUPTION_WA(i915)) {
+ i915_rc6_ctx_wa_check(i915);
+ intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
+ }
+
intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ);
intel_runtime_pm_put(i915);
@@ -220,6 +225,9 @@ void i915_gem_unpark(struct drm_i915_private *i915)
*/
intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
+ if (NEEDS_RC6_CTX_CORRUPTION_WA(i915))
+ intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
+
i915->gt.awake = true;
if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
i915->gt.epoch = 1;
@@ -4421,6 +4429,20 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
{
struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
struct i915_address_space *vm = &dev_priv->ggtt.vm;
+
+ return i915_gem_object_pin(obj, vm, view, size, alignment,
+ flags | PIN_GLOBAL);
+}
+
+struct i915_vma *
+i915_gem_object_pin(struct drm_i915_gem_object *obj,
+ struct i915_address_space *vm,
+ const struct i915_ggtt_view *view,
+ u64 size,
+ u64 alignment,
+ u64 flags)
+{
+ struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
struct i915_vma *vma;
int ret;
@@ -4484,7 +4506,7 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
return ERR_PTR(ret);
}
- ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
+ ret = i915_vma_pin(vma, size, alignment, flags);
if (ret)
return ERR_PTR(ret);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 1539816ff064..ccbb8d135427 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -214,6 +214,8 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
release_hw_id(ctx);
i915_ppgtt_put(ctx->ppgtt);
+ kfree(ctx->jump_whitelist);
+
for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) {
struct intel_context *ce = &ctx->__engine[n];
@@ -384,6 +386,9 @@ __create_hw_context(struct drm_i915_private *dev_priv,
ctx->ggtt_offset_bias = dev_priv->ggtt.pin_bias;
+ ctx->jump_whitelist = NULL;
+ ctx->jump_whitelist_cmds = 0;
+
return ctx;
err_pid:
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 8379dd501e63..4b701c3034e1 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -191,6 +191,12 @@ struct i915_gem_context {
/** remap_slice: Bitmask of cache lines that need remapping */
u8 remap_slice;
+ /** jump_whitelist: Bit array for tracking cmds during cmdparsing */
+ unsigned long *jump_whitelist;
+
+ /** jump_whitelist_cmds: No of cmd slots available */
+ u32 jump_whitelist_cmds;
+
/** handles_vma: rbtree to look up our context specific obj/vma for
* the user handle. (user handles are per fd, but the binding is
* per vm, which may be one per context or shared with the global GTT)
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 4cb5520a0941..fe700aa2e94a 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -309,7 +309,9 @@ static inline u64 gen8_noncanonical_addr(u64 address)
static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
{
- return intel_engine_needs_cmd_parser(eb->engine) && eb->batch_len;
+ return intel_engine_requires_cmd_parser(eb->engine) ||
+ (intel_engine_using_cmd_parser(eb->engine) &&
+ eb->args->batch_len);
}
static int eb_create(struct i915_execbuffer *eb)
@@ -1895,10 +1897,38 @@ static int i915_reset_gen7_sol_offsets(struct i915_request *rq)
return 0;
}
-static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
+static struct i915_vma *
+shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj)
+{
+ struct drm_i915_private *dev_priv = eb->i915;
+ struct i915_address_space *vm;
+ u64 flags;
+
+ /*
+ * PPGTT backed shadow buffers must be mapped RO, to prevent
+ * post-scan tampering
+ */
+ if (CMDPARSER_USES_GGTT(dev_priv)) {
+ flags = PIN_GLOBAL;
+ vm = &dev_priv->ggtt.vm;
+ } else if (eb->vm->has_read_only) {
+ flags = PIN_USER;
+ vm = eb->vm;
+ i915_gem_object_set_readonly(obj);
+ } else {
+ DRM_DEBUG("Cannot prevent post-scan tampering without RO capable vm\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ return i915_gem_object_pin(obj, vm, NULL, 0, 0, flags);
+}
+
+static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
{
struct drm_i915_gem_object *shadow_batch_obj;
struct i915_vma *vma;
+ u64 batch_start;
+ u64 shadow_batch_start;
int err;
shadow_batch_obj = i915_gem_batch_pool_get(&eb->engine->batch_pool,
@@ -1906,30 +1936,55 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
if (IS_ERR(shadow_batch_obj))
return ERR_CAST(shadow_batch_obj);
- err = intel_engine_cmd_parser(eb->engine,
+ vma = shadow_batch_pin(eb, shadow_batch_obj);
+ if (IS_ERR(vma))
+ goto out;
+
+ batch_start = gen8_canonical_addr(eb->batch->node.start) +
+ eb->batch_start_offset;
+
+ shadow_batch_start = gen8_canonical_addr(vma->node.start);
+
+ err = intel_engine_cmd_parser(eb->ctx,
+ eb->engine,
eb->batch->obj,
- shadow_batch_obj,
+ batch_start,
eb->batch_start_offset,
eb->batch_len,
- is_master);
+ shadow_batch_obj,
+ shadow_batch_start);
+
if (err) {
- if (err == -EACCES) /* unhandled chained batch */
+ i915_vma_unpin(vma);
+
+ /*
+ * Unsafe GGTT-backed buffers can still be submitted safely
+ * as non-secure.
+ * For PPGTT backing however, we have no choice but to forcibly
+ * reject unsafe buffers
+ */
+ if (CMDPARSER_USES_GGTT(eb->i915) && (err == -EACCES))
+ /* Execute original buffer non-secure */
vma = NULL;
else
vma = ERR_PTR(err);
- goto out;
- }
- vma = i915_gem_object_ggtt_pin(shadow_batch_obj, NULL, 0, 0, 0);
- if (IS_ERR(vma))
goto out;
+ }
eb->vma[eb->buffer_count] = i915_vma_get(vma);
eb->flags[eb->buffer_count] =
__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF;
vma->exec_flags = &eb->flags[eb->buffer_count];
eb->buffer_count++;
+ eb->batch_start_offset = 0;
+ eb->batch = vma;
+
+ if (CMDPARSER_USES_GGTT(eb->i915))
+ eb->batch_flags |= I915_DISPATCH_SECURE;
+ /* We should not have changed overall length */
+ GEM_BUG_ON(eb->batch_len != eb->batch->size - eb->batch_start_offset);
out:
i915_gem_object_unpin_pages(shadow_batch_obj);
return vma;
@@ -2179,6 +2234,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
struct drm_i915_gem_exec_object2 *exec,
struct drm_syncobj **fences)
{
+ struct drm_i915_private *i915 = to_i915(dev);
struct i915_execbuffer eb;
struct dma_fence *in_fence = NULL;
struct sync_file *out_fence = NULL;
@@ -2189,7 +2245,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS &
~__EXEC_OBJECT_UNKNOWN_FLAGS);
- eb.i915 = to_i915(dev);
+ eb.i915 = i915;
eb.file = file;
eb.args = args;
if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC))
@@ -2211,8 +2267,15 @@ i915_gem_do_execbuffer(struct drm_device *dev,
eb.batch_flags = 0;
if (args->flags & I915_EXEC_SECURE) {
+ if (INTEL_GEN(i915) >= 11)
+ return -ENODEV;
+
+ /* Return -EPERM to trigger fallback code on old binaries. */
+ if (!HAS_SECURE_BATCHES(i915))
+ return -EPERM;
+
if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ return -EPERM;
eb.batch_flags |= I915_DISPATCH_SECURE;
}
@@ -2299,34 +2362,19 @@ i915_gem_do_execbuffer(struct drm_device *dev,
goto err_vma;
}
+ if (eb.batch_len == 0)
+ eb.batch_len = eb.batch->size - eb.batch_start_offset;
+
if (eb_use_cmdparser(&eb)) {
struct i915_vma *vma;
- vma = eb_parse(&eb, drm_is_current_master(file));
+ vma = eb_parse(&eb);
if (IS_ERR(vma)) {
err = PTR_ERR(vma);
goto err_vma;
}
-
- if (vma) {
- /*
- * Batch parsed and accepted:
- *
- * Set the DISPATCH_SECURE bit to remove the NON_SECURE
- * bit from MI_BATCH_BUFFER_START commands issued in
- * the dispatch_execbuffer implementations. We
- * specifically don't want that set on batches the
- * command parser has accepted.
- */
- eb.batch_flags |= I915_DISPATCH_SECURE;
- eb.batch_start_offset = 0;
- eb.batch = vma;
- }
}
- if (eb.batch_len == 0)
- eb.batch_len = eb.batch->size - eb.batch_start_offset;
-
/*
* snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure
* batch" bit. Hence we need to pin secure batches into the global gtt.
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 8b71597b75e1..2db449752e6e 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -158,7 +158,8 @@ int intel_sanitize_enable_ppgtt(struct drm_i915_private *dev_priv,
if (enable_ppgtt == 0 && INTEL_GEN(dev_priv) < 9)
return 0;
- if (enable_ppgtt == 1)
+ /* Full PPGTT is required by the Gen9 cmdparser */
+ if (enable_ppgtt == 1 && INTEL_GEN(dev_priv) != 9)
return 1;
if (enable_ppgtt == 2 && has_full_ppgtt)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index c014c1ba7e8a..09146cd130b7 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -389,6 +389,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
#define ECOCHK_PPGTT_WT_HSW (0x2 << 3)
#define ECOCHK_PPGTT_WB_HSW (0x3 << 3)
+#define GEN8_RC6_CTX_INFO _MMIO(0x8504)
+
#define GAC_ECO_BITS _MMIO(0x14090)
#define ECOBITS_SNB_BIT (1 << 13)
#define ECOBITS_PPGTT_CACHE64B (3 << 8)
@@ -473,6 +475,10 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
*/
#define BCS_SWCTRL _MMIO(0x22200)
+/* There are 16 GPR registers */
+#define BCS_GPR(n) _MMIO(0x22600 + (n) * 8)
+#define BCS_GPR_UDW(n) _MMIO(0x22600 + (n) * 8 + 4)
+
#define GPGPU_THREADS_DISPATCHED _MMIO(0x2290)
#define GPGPU_THREADS_DISPATCHED_UDW _MMIO(0x2290 + 4)
#define HS_INVOCATION_COUNT _MMIO(0x2300)
@@ -6968,6 +6974,10 @@ enum {
#define SKL_CSR_DC5_DC6_COUNT _MMIO(0x8002C)
#define BXT_CSR_DC3_DC5_COUNT _MMIO(0x80038)
+/* Display Internal Timeout Register */
+#define RM_TIMEOUT _MMIO(0x42060)
+#define MMIO_TIMEOUT_US(us) ((us) << 0)
+
/* interrupts */
#define DE_MASTER_IRQ_CONTROL (1 << 31)
#define DE_SPRITEB_FLIP_DONE (1 << 29)
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 5d4df8bc94bc..18572faeb988 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -2075,6 +2075,9 @@ void intel_sanitize_gt_powersave(struct drm_i915_private *dev_priv);
void intel_enable_gt_powersave(struct drm_i915_private *dev_priv);
void intel_disable_gt_powersave(struct drm_i915_private *dev_priv);
void intel_suspend_gt_powersave(struct drm_i915_private *dev_priv);
+bool i915_rc6_ctx_wa_check(struct drm_i915_private *i915);
+void i915_rc6_ctx_wa_suspend(struct drm_i915_private *i915);
+void i915_rc6_ctx_wa_resume(struct drm_i915_private *i915);
void gen6_rps_busy(struct drm_i915_private *dev_priv);
void gen6_rps_reset_ei(struct drm_i915_private *dev_priv);
void gen6_rps_idle(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index efb2c42b17e1..a1a42bff8f2e 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -114,6 +114,14 @@ static void bxt_init_clock_gating(struct drm_i915_private *dev_priv)
*/
I915_WRITE(GEN9_CLKGATE_DIS_0, I915_READ(GEN9_CLKGATE_DIS_0) |
PWM1_GATING_DIS | PWM2_GATING_DIS);
+
+ /*
+ * Lower the display internal timeout.
+ * This is needed to avoid any hard hangs when DSI port PLL
+ * is off and a MMIO access is attempted by any privilege
+ * application, using batch buffers or any other means.
+ */
+ I915_WRITE(RM_TIMEOUT, MMIO_TIMEOUT_US(950));
}
static void glk_init_clock_gating(struct drm_i915_private *dev_priv)
@@ -8194,6 +8202,95 @@ static void intel_init_emon(struct drm_i915_private *dev_priv)
dev_priv->ips.corr = (lcfuse & LCFUSE_HIV_MASK);
}
+static bool i915_rc6_ctx_corrupted(struct drm_i915_private *dev_priv)
+{
+ return !I915_READ(GEN8_RC6_CTX_INFO);
+}
+
+static void i915_rc6_ctx_wa_init(struct drm_i915_private *i915)
+{
+ if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915))
+ return;
+
+ if (i915_rc6_ctx_corrupted(i915)) {
+ DRM_INFO("RC6 context corrupted, disabling runtime power management\n");
+ i915->gt_pm.rc6.ctx_corrupted = true;
+ intel_runtime_pm_get(i915);
+ }
+}
+
+static void i915_rc6_ctx_wa_cleanup(struct drm_i915_private *i915)
+{
+ if (i915->gt_pm.rc6.ctx_corrupted) {
+ intel_runtime_pm_put(i915);
+ i915->gt_pm.rc6.ctx_corrupted = false;
+ }
+}
+
+/**
+ * i915_rc6_ctx_wa_suspend - system suspend sequence for the RC6 CTX WA
+ * @i915: i915 device
+ *
+ * Perform any steps needed to clean up the RC6 CTX WA before system suspend.
+ */
+void i915_rc6_ctx_wa_suspend(struct drm_i915_private *i915)
+{
+ if (i915->gt_pm.rc6.ctx_corrupted)
+ intel_runtime_pm_put(i915);
+}
+
+/**
+ * i915_rc6_ctx_wa_resume - system resume sequence for the RC6 CTX WA
+ * @i915: i915 device
+ *
+ * Perform any steps needed to re-init the RC6 CTX WA after system resume.
+ */
+void i915_rc6_ctx_wa_resume(struct drm_i915_private *i915)
+{
+ if (!i915->gt_pm.rc6.ctx_corrupted)
+ return;
+
+ if (i915_rc6_ctx_corrupted(i915)) {
+ intel_runtime_pm_get(i915);
+ return;
+ }
+
+ DRM_INFO("RC6 context restored, re-enabling runtime power management\n");
+ i915->gt_pm.rc6.ctx_corrupted = false;
+}
+
+static void intel_disable_rc6(struct drm_i915_private *dev_priv);
+
+/**
+ * i915_rc6_ctx_wa_check - check for a new RC6 CTX corruption
+ * @i915: i915 device
+ *
+ * Check if an RC6 CTX corruption has happened since the last check and if so
+ * disable RC6 and runtime power management.
+ *
+ * Return false if no context corruption has happened since the last call of
+ * this function, true otherwise.
+*/
+bool i915_rc6_ctx_wa_check(struct drm_i915_private *i915)
+{
+ if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915))
+ return false;
+
+ if (i915->gt_pm.rc6.ctx_corrupted)
+ return false;
+
+ if (!i915_rc6_ctx_corrupted(i915))
+ return false;
+
+ DRM_NOTE("RC6 context corruption, disabling runtime power management\n");
+
+ intel_disable_rc6(i915);
+ i915->gt_pm.rc6.ctx_corrupted = true;
+ intel_runtime_pm_get_noresume(i915);
+
+ return true;
+}
+
void intel_init_gt_powersave(struct drm_i915_private *dev_priv)
{
struct intel_rps *rps = &dev_priv->gt_pm.rps;
@@ -8209,6 +8306,8 @@ void intel_init_gt_powersave(struct drm_i915_private *dev_priv)
mutex_lock(&dev_priv->pcu_lock);
+ i915_rc6_ctx_wa_init(dev_priv);
+
/* Initialize RPS limits (for userspace) */
if (IS_CHERRYVIEW(dev_priv))
cherryview_init_gt_powersave(dev_priv);
@@ -8255,6 +8354,8 @@ void intel_cleanup_gt_powersave(struct drm_i915_private *dev_priv)
if (IS_VALLEYVIEW(dev_priv))
valleyview_cleanup_gt_powersave(dev_priv);
+ i915_rc6_ctx_wa_cleanup(dev_priv);
+
if (!HAS_RC6(dev_priv))
intel_runtime_pm_put(dev_priv);
}
@@ -8299,7 +8400,7 @@ static inline void intel_disable_llc_pstate(struct drm_i915_private *i915)
i915->gt_pm.llc_pstate.enabled = false;
}
-static void intel_disable_rc6(struct drm_i915_private *dev_priv)
+static void __intel_disable_rc6(struct drm_i915_private *dev_priv)
{
lockdep_assert_held(&dev_priv->pcu_lock);
@@ -8318,6 +8419,13 @@ static void intel_disable_rc6(struct drm_i915_private *dev_priv)
dev_priv->gt_pm.rc6.enabled = false;
}
+static void intel_disable_rc6(struct drm_i915_private *dev_priv)
+{
+ mutex_lock(&dev_priv->pcu_lock);
+ __intel_disable_rc6(dev_priv);
+ mutex_unlock(&dev_priv->pcu_lock);
+}
+
static void intel_disable_rps(struct drm_i915_private *dev_priv)
{
lockdep_assert_held(&dev_priv->pcu_lock);
@@ -8343,7 +8451,7 @@ void intel_disable_gt_powersave(struct drm_i915_private *dev_priv)
{
mutex_lock(&dev_priv->pcu_lock);
- intel_disable_rc6(dev_priv);
+ __intel_disable_rc6(dev_priv);
intel_disable_rps(dev_priv);
if (HAS_LLC(dev_priv))
intel_disable_llc_pstate(dev_priv);
@@ -8370,6 +8478,9 @@ static void intel_enable_rc6(struct drm_i915_private *dev_priv)
if (dev_priv->gt_pm.rc6.enabled)
return;
+ if (dev_priv->gt_pm.rc6.ctx_corrupted)
+ return;
+
if (IS_CHERRYVIEW(dev_priv))
cherryview_enable_rc6(dev_priv);
else if (IS_VALLEYVIEW(dev_priv))
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index b6aa8764af2a..44095cd53f28 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -570,9 +570,10 @@ struct intel_engine_cs {
struct intel_engine_hangcheck hangcheck;
-#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
-#define I915_ENGINE_SUPPORTS_STATS BIT(1)
-#define I915_ENGINE_HAS_PREEMPTION BIT(2)
+#define I915_ENGINE_USING_CMD_PARSER BIT(0)
+#define I915_ENGINE_SUPPORTS_STATS BIT(1)
+#define I915_ENGINE_HAS_PREEMPTION BIT(2)
+#define I915_ENGINE_REQUIRES_CMD_PARSER BIT(3)
unsigned int flags;
/*
@@ -633,9 +634,15 @@ struct intel_engine_cs {
};
static inline bool
-intel_engine_needs_cmd_parser(const struct intel_engine_cs *engine)
+intel_engine_using_cmd_parser(const struct intel_engine_cs *engine)
{
- return engine->flags & I915_ENGINE_NEEDS_CMD_PARSER;
+ return engine->flags & I915_ENGINE_USING_CMD_PARSER;
+}
+
+static inline bool
+intel_engine_requires_cmd_parser(const struct intel_engine_cs *engine)
+{
+ return engine->flags & I915_ENGINE_REQUIRES_CMD_PARSER;
}
static inline bool
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index fea5801b1def..fdb6bf046fd4 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -58,6 +58,11 @@ extern ssize_t cpu_show_l1tf(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_mds(struct device *dev,
struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_tsx_async_abort(struct device *dev,
+ struct device_attribute *attr,
+ char *buf);
+extern ssize_t cpu_show_itlb_multihit(struct device *dev,
+ struct device_attribute *attr, char *buf);
extern __printf(4, 5)
struct device *cpu_device_create(struct device *parent, void *drvdata,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3aeb1ce4de38..18a92575f291 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1285,4 +1285,10 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
#endif /* CONFIG_HAVE_KVM_NO_POLL */
+typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+ uintptr_t data, const char *name,
+ struct task_struct **thread_ptr);
+
#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ab2ca4758055..f61dba09c23b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -51,6 +51,7 @@
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
+#include <linux/kthread.h>
#include <asm/processor.h>
#include <asm/io.h>
@@ -669,6 +670,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
return 0;
}
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
static struct kvm *kvm_create_vm(unsigned long type)
{
int r, i;
@@ -723,11 +741,15 @@ static struct kvm *kvm_create_vm(unsigned long type)
rcu_assign_pointer(kvm->buses[i],
kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
if (!kvm->buses[i])
- goto out_err;
+ goto out_err_no_mmu_notifier;
}
r = kvm_init_mmu_notifier(kvm);
if (r)
+ goto out_err_no_mmu_notifier;
+
+ r = kvm_arch_post_init_vm(kvm);
+ if (r)
goto out_err;
mutex_lock(&kvm_lock);
@@ -739,6 +761,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
return kvm;
out_err:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+ if (kvm->mmu_notifier.ops)
+ mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
cleanup_srcu_struct(&kvm->srcu);
@@ -781,6 +808,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
mutex_lock(&kvm_lock);
list_del(&kvm->vm_list);
mutex_unlock(&kvm_lock);
+ kvm_arch_pre_destroy_vm(kvm);
+
kvm_free_irq_routing(kvm);
for (i = 0; i < KVM_NR_BUSES; i++) {
struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
@@ -4209,3 +4238,86 @@ void kvm_exit(void)
kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);
+
+struct kvm_vm_worker_thread_context {
+ struct kvm *kvm;
+ struct task_struct *parent;
+ struct completion init_done;
+ kvm_vm_thread_fn_t thread_fn;
+ uintptr_t data;
+ int err;
+};
+
+static int kvm_vm_worker_thread(void *context)
+{
+ /*
+ * The init_context is allocated on the stack of the parent thread, so
+ * we have to locally copy anything that is needed beyond initialization
+ */
+ struct kvm_vm_worker_thread_context *init_context = context;
+ struct kvm *kvm = init_context->kvm;
+ kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+ uintptr_t data = init_context->data;
+ int err;
+
+ err = kthread_park(current);
+ /* kthread_park(current) is never supposed to return an error */
+ WARN_ON(err != 0);
+ if (err)
+ goto init_complete;
+
+ err = cgroup_attach_task_all(init_context->parent, current);
+ if (err) {
+ kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+ __func__, err);
+ goto init_complete;
+ }
+
+ set_user_nice(current, task_nice(init_context->parent));
+
+init_complete:
+ init_context->err = err;
+ complete(&init_context->init_done);
+ init_context = NULL;
+
+ if (err)
+ return err;
+
+ /* Wait to be woken up by the spawner before proceeding. */
+ kthread_parkme();
+
+ if (!kthread_should_stop())
+ err = thread_fn(kvm, data);
+
+ return err;
+}
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+ uintptr_t data, const char *name,
+ struct task_struct **thread_ptr)
+{
+ struct kvm_vm_worker_thread_context init_context = {};
+ struct task_struct *thread;
+
+ *thread_ptr = NULL;
+ init_context.kvm = kvm;
+ init_context.parent = current;
+ init_context.thread_fn = thread_fn;
+ init_context.data = data;
+ init_completion(&init_context.init_done);
+
+ thread = kthread_run(kvm_vm_worker_thread, &init_context,
+ "%s-%d", name, task_pid_nr(current));
+ if (IS_ERR(thread))
+ return PTR_ERR(thread);
+
+ /* kthread_run is never supposed to return NULL */
+ WARN_ON(thread == NULL);
+
+ wait_for_completion(&init_context.init_done);
+
+ if (!init_context.err)
+ *thread_ptr = thread;
+
+ return init_context.err;
+}