Home Home > GIT Browse
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohannes Thumshirn <jthumshirn@suse.de>2018-07-26 12:49:16 +0200
committerJohannes Thumshirn <jthumshirn@suse.de>2018-07-26 12:49:16 +0200
commit90c2761e7acdaea6f1e92e9c952fa558eac09ce3 (patch)
tree1f485e48a9ede836f1c0fd754ffbde304eca20b9
parent4cccb8ee6c76d724fefe0ac2421f5ea49df00f0d (diff)
parent2adbcf53bfc59168cd98622d45154b57d04cdded (diff)
Merge remote-tracking branch 'origin/users/jroedel/SLE12-SP4/for-next' into SLE12-SP4
Pull KVM fixes from Joerg Roedel. Conflicts: patches.drivers/nvme-move-init-of-keep_alive-work-item-to-controller.patch suse-commit: 1d68293567e57b0be893b2e2d224714f59dbbb81
-rw-r--r--arch/x86/include/asm/svm.h3
-rw-r--r--arch/x86/kvm/svm.c107
-rw-r--r--arch/x86/kvm/vmx.c81
-rw-r--r--arch/x86/kvm/x86.h38
-rw-r--r--drivers/iommu/amd_iommu.c229
-rw-r--r--drivers/iommu/dmar.c2
-rw-r--r--drivers/iommu/intel-iommu.c205
-rw-r--r--drivers/iommu/iova.c183
-rw-r--r--include/linux/iova.h67
9 files changed, 458 insertions, 457 deletions
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index e9899ed67c70..2f2454484e86 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -59,7 +59,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 intercept_dr;
u32 intercept_exceptions;
u64 intercept;
- u8 reserved_1[42];
+ u8 reserved_1[40];
+ u16 pause_filter_thresh;
u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b0bc2ff94901..80781eec2318 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -281,6 +281,54 @@ static bool npt_enabled = true;
static bool npt_enabled;
#endif
+/*
+ * These 2 parameters configure the controls for Pause-Loop Exiting:
+ * pause_filter_count: On processors that support Pause filtering(indicated
+ * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
+ * count value. On VMRUN this value is loaded into an internal counter.
+ * Each time a pause instruction is executed, this counter is decremented
+ * until it reaches zero at which time a #VMEXIT is generated if pause
+ * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
+ * Intercept Filtering for more details.
+ * A non-zero value also indicates that PLE logic is enabled.
+ *
+ * pause_filter_thresh: In addition, some processor families support advanced
+ * pause filtering (indicated by CPUID Fn8000_000A_EDX), an upper bound
+ * on the amount of time a guest is allowed to execute in a pause loop.
+ * In this mode, a 16-bit pause filter threshold field is added in the
+ * VMCB. The threshold value is a cycle count that is used to reset the
+ * pause counter. As with simple pause filtering, VMRUN loads the pause
+ * count value from VMCB into an internal counter. Then, on each pause
+ * instruction the hardware checks the elapsed number of cycles since
+ * the most recent pause instruction against the pause filter threshold.
+ * If the elapsed cycle count is greater than the pause filter threshold,
+ * then the internal pause count is reloaded from the VMCB and execution
+ * continues. If the elapsed cycle count is less than the pause filter
+ * threshold, then the internal pause count is decremented. If the count
+ * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
+ * triggered. If advanced pause filtering is supported and pause filter
+ * threshold field is set to zero, the filter will operate in the simpler,
+ * count only mode.
+ */
+
+static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
+module_param(pause_filter_thresh, ushort, 0444);
+
+static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
+module_param(pause_filter_count, ushort, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(pause_filter_count_grow, ushort, 0444);
+
+/* Default resets per-vcpu window every exit to pause_filter_count. */
+static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(pause_filter_count_shrink, ushort, 0444);
+
+/* Default is the maximum of the 16-bit VMCB pause_filter_count field. */
+static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+module_param(pause_filter_count_max, ushort, 0444);
+
/* allow nested paging (virtualized MMU) for all guests */
static int npt = true;
module_param(npt, int, S_IRUGO);
@@ -1175,6 +1223,42 @@ err:
return rc;
}
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count = __grow_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_grow,
+ pause_filter_count_max);
+
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_grow(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count =
+ __shrink_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_shrink,
+ pause_filter_count);
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_shrink(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -1205,6 +1289,14 @@ static __init int svm_hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 32;
}
+ /* Check for pause filtering support */
+ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ pause_filter_count = 0;
+ pause_filter_thresh = 0;
+ } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+ pause_filter_thresh = 0;
+ }
+
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
@@ -1451,9 +1543,13 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
- if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
- control->pause_filter_count = 3000;
+ if (pause_filter_count) {
+ control->pause_filter_count = pause_filter_count;
+ if (pause_filter_thresh)
+ control->pause_filter_thresh = pause_filter_thresh;
set_intercept(svm, INTERCEPT_PAUSE);
+ } else {
+ clr_intercept(svm, INTERCEPT_PAUSE);
}
if (kvm_vcpu_apicv_active(&svm->vcpu))
@@ -4212,6 +4308,9 @@ static int pause_interception(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
bool in_kernel = (svm_get_cpl(vcpu) == 0);
+ if (pause_filter_thresh)
+ grow_ple_window(vcpu);
+
kvm_vcpu_on_spin(vcpu, in_kernel);
return 1;
}
@@ -4585,6 +4684,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
+ pr_err("%-20s%d\n", "pause filter threshold:",
+ control->pause_filter_thresh);
pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
@@ -5893,6 +5994,8 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
+ if (pause_filter_thresh)
+ shrink_ple_window(vcpu);
}
static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ca551bd3c964..ec4658f987cb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -166,31 +166,22 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
* Time is measured based on a counter that runs at the same rate as the TSC,
* refer SDM volume 3b section 21.6.13 & 22.1.3.
*/
-#define KVM_VMX_DEFAULT_PLE_GAP 128
-#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
-#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2
-#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
-#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \
- INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
+static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
-static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
-module_param(ple_gap, int, S_IRUGO);
-
-static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
-module_param(ple_window, int, S_IRUGO);
+static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, uint, 0444);
/* Default doubles per-vcpu window every exit. */
-static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
-module_param(ple_window_grow, int, S_IRUGO);
+static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(ple_window_grow, uint, 0444);
/* Default resets per-vcpu window every exit to ple_window. */
-static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
-module_param(ple_window_shrink, int, S_IRUGO);
+static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(ple_window_shrink, uint, 0444);
/* Default is to compute the maximum so we can never overflow. */
-static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
-static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
-module_param(ple_window_max, int, S_IRUGO);
+static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
+module_param(ple_window_max, uint, 0444);
extern const ulong vmx_return;
@@ -6856,40 +6847,14 @@ out:
return ret;
}
-static int __grow_ple_window(int val)
-{
- if (ple_window_grow < 1)
- return ple_window;
-
- val = min(val, ple_window_actual_max);
-
- if (ple_window_grow < ple_window)
- val *= ple_window_grow;
- else
- val += ple_window_grow;
-
- return val;
-}
-
-static int __shrink_ple_window(int val, int modifier, int minimum)
-{
- if (modifier < 1)
- return ple_window;
-
- if (modifier < ple_window)
- val /= modifier;
- else
- val -= modifier;
-
- return max(val, minimum);
-}
-
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int old = vmx->ple_window;
- vmx->ple_window = __grow_ple_window(old);
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
if (vmx->ple_window != old)
vmx->ple_window_dirty = true;
@@ -6902,8 +6867,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
int old = vmx->ple_window;
- vmx->ple_window = __shrink_ple_window(old,
- ple_window_shrink, ple_window);
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
if (vmx->ple_window != old)
vmx->ple_window_dirty = true;
@@ -6912,21 +6878,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
/*
- * ple_window_actual_max is computed to be one grow_ple_window() below
- * ple_window_max. (See __grow_ple_window for the reason.)
- * This prevents overflows, because ple_window_max is int.
- * ple_window_max effectively rounded down to a multiple of ple_window_grow in
- * this process.
- * ple_window_max is also prevented from setting vmx->ple_window < ple_window.
- */
-static void update_ple_window_actual_max(void)
-{
- ple_window_actual_max =
- __shrink_ple_window(max(ple_window_max, ple_window),
- ple_window_grow, INT_MIN);
-}
-
-/*
* Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
*/
static void wakeup_handler(void)
@@ -7055,8 +7006,6 @@ static __init int hardware_setup(void)
else
kvm_disable_tdp();
- update_ple_window_actual_max();
-
/*
* Only enable PML when hardware supports PML feature, and both EPT
* and EPT A/D bit features are enabled -- PML depends on them to work.
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2874fb6bc978..013f81b241d9 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -7,6 +7,44 @@
#include <asm/pvclock.h>
#include "kvm_cache_regs.h"
+#define KVM_DEFAULT_PLE_GAP 128
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+#define KVM_DEFAULT_PLE_WINDOW_GROW 2
+#define KVM_DEFAULT_PLE_WINDOW_SHRINK 0
+#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX UINT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW 3000
+
+static inline unsigned int __grow_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int max)
+{
+ u64 ret = val;
+
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ ret *= modifier;
+ else
+ ret += modifier;
+
+ return min(ret, (u64)max);
+}
+
+static inline unsigned int __shrink_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int min)
+{
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ val /= modifier;
+ else
+ val -= modifier;
+
+ return max(val, min);
+}
+
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index fb78758eac48..94c0ad33e4fd 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -112,20 +112,7 @@ struct kmem_cache *amd_iommu_irq_cache;
static void update_domain(struct protection_domain *domain);
static int protection_domain_init(struct protection_domain *domain);
static void detach_device(struct device *dev);
-
-#define FLUSH_QUEUE_SIZE 256
-
-struct flush_queue_entry {
- unsigned long iova_pfn;
- unsigned long pages;
- u64 counter; /* Flush counter when this entry was added to the queue */
-};
-
-struct flush_queue {
- struct flush_queue_entry *entries;
- unsigned head, tail;
- spinlock_t lock;
-};
+static void iova_domain_flush_tlb(struct iova_domain *iovad);
/*
* Data container for a dma_ops specific protection domain
@@ -136,36 +123,6 @@ struct dma_ops_domain {
/* IOVA RB-Tree */
struct iova_domain iovad;
-
- struct flush_queue __percpu *flush_queue;
-
- /*
- * We need two counter here to be race-free wrt. IOTLB flushing and
- * adding entries to the flush queue.
- *
- * The flush_start_cnt is incremented _before_ the IOTLB flush starts.
- * New entries added to the flush ring-buffer get their 'counter' value
- * from here. This way we can make sure that entries added to the queue
- * (or other per-cpu queues of the same domain) while the TLB is about
- * to be flushed are not considered to be flushed already.
- */
- atomic64_t flush_start_cnt;
-
- /*
- * The flush_finish_cnt is incremented when an IOTLB flush is complete.
- * This value is always smaller than flush_start_cnt. The queue_add
- * function frees all IOVAs that have a counter value smaller than
- * flush_finish_cnt. This makes sure that we only free IOVAs that are
- * flushed out of the IOTLB of the domain.
- */
- atomic64_t flush_finish_cnt;
-
- /*
- * Timer to make sure we don't keep IOVAs around unflushed
- * for too long
- */
- struct timer_list flush_timer;
- atomic_t flush_timer_on;
};
static struct iova_domain reserved_iova_ranges;
@@ -1773,178 +1730,19 @@ static void free_gcr3_table(struct protection_domain *domain)
free_page((unsigned long)domain->gcr3_tbl);
}
-static void dma_ops_domain_free_flush_queue(struct dma_ops_domain *dom)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct flush_queue *queue;
-
- queue = per_cpu_ptr(dom->flush_queue, cpu);
- kfree(queue->entries);
- }
-
- free_percpu(dom->flush_queue);
-
- dom->flush_queue = NULL;
-}
-
-static int dma_ops_domain_alloc_flush_queue(struct dma_ops_domain *dom)
-{
- int cpu;
-
- atomic64_set(&dom->flush_start_cnt, 0);
- atomic64_set(&dom->flush_finish_cnt, 0);
-
- dom->flush_queue = alloc_percpu(struct flush_queue);
- if (!dom->flush_queue)
- return -ENOMEM;
-
- /* First make sure everything is cleared */
- for_each_possible_cpu(cpu) {
- struct flush_queue *queue;
-
- queue = per_cpu_ptr(dom->flush_queue, cpu);
- queue->head = 0;
- queue->tail = 0;
- queue->entries = NULL;
- }
-
- /* Now start doing the allocation */
- for_each_possible_cpu(cpu) {
- struct flush_queue *queue;
-
- queue = per_cpu_ptr(dom->flush_queue, cpu);
- queue->entries = kzalloc(FLUSH_QUEUE_SIZE * sizeof(*queue->entries),
- GFP_KERNEL);
- if (!queue->entries) {
- dma_ops_domain_free_flush_queue(dom);
- return -ENOMEM;
- }
-
- spin_lock_init(&queue->lock);
- }
-
- return 0;
-}
-
static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom)
{
- atomic64_inc(&dom->flush_start_cnt);
domain_flush_tlb(&dom->domain);
domain_flush_complete(&dom->domain);
- atomic64_inc(&dom->flush_finish_cnt);
}
-static inline bool queue_ring_full(struct flush_queue *queue)
+static void iova_domain_flush_tlb(struct iova_domain *iovad)
{
- assert_spin_locked(&queue->lock);
-
- return (((queue->tail + 1) % FLUSH_QUEUE_SIZE) == queue->head);
-}
+ struct dma_ops_domain *dom;
-#define queue_ring_for_each(i, q) \
- for (i = (q)->head; i != (q)->tail; i = (i + 1) % FLUSH_QUEUE_SIZE)
-
-static inline unsigned queue_ring_add(struct flush_queue *queue)
-{
- unsigned idx = queue->tail;
-
- assert_spin_locked(&queue->lock);
- queue->tail = (idx + 1) % FLUSH_QUEUE_SIZE;
-
- return idx;
-}
-
-static inline void queue_ring_remove_head(struct flush_queue *queue)
-{
- assert_spin_locked(&queue->lock);
- queue->head = (queue->head + 1) % FLUSH_QUEUE_SIZE;
-}
-
-static void queue_ring_free_flushed(struct dma_ops_domain *dom,
- struct flush_queue *queue)
-{
- u64 counter = atomic64_read(&dom->flush_finish_cnt);
- int idx;
-
- queue_ring_for_each(idx, queue) {
- /*
- * This assumes that counter values in the ring-buffer are
- * monotonously rising.
- */
- if (queue->entries[idx].counter >= counter)
- break;
-
- free_iova_fast(&dom->iovad,
- queue->entries[idx].iova_pfn,
- queue->entries[idx].pages);
-
- queue_ring_remove_head(queue);
- }
-}
-
-static void queue_add(struct dma_ops_domain *dom,
- unsigned long address, unsigned long pages)
-{
- struct flush_queue *queue;
- unsigned long flags;
- int idx;
-
- pages = __roundup_pow_of_two(pages);
- address >>= PAGE_SHIFT;
-
- queue = get_cpu_ptr(dom->flush_queue);
- spin_lock_irqsave(&queue->lock, flags);
-
- /*
- * First remove the enries from the ring-buffer that are already
- * flushed to make the below queue_ring_full() check less likely
- */
- queue_ring_free_flushed(dom, queue);
-
- /*
- * When ring-queue is full, flush the entries from the IOTLB so
- * that we can free all entries with queue_ring_free_flushed()
- * below.
- */
- if (queue_ring_full(queue)) {
- dma_ops_domain_flush_tlb(dom);
- queue_ring_free_flushed(dom, queue);
- }
-
- idx = queue_ring_add(queue);
-
- queue->entries[idx].iova_pfn = address;
- queue->entries[idx].pages = pages;
- queue->entries[idx].counter = atomic64_read(&dom->flush_start_cnt);
-
- spin_unlock_irqrestore(&queue->lock, flags);
-
- if (atomic_cmpxchg(&dom->flush_timer_on, 0, 1) == 0)
- mod_timer(&dom->flush_timer, jiffies + msecs_to_jiffies(10));
-
- put_cpu_ptr(dom->flush_queue);
-}
-
-static void queue_flush_timeout(unsigned long data)
-{
- struct dma_ops_domain *dom = (struct dma_ops_domain *)data;
- int cpu;
-
- atomic_set(&dom->flush_timer_on, 0);
+ dom = container_of(iovad, struct dma_ops_domain, iovad);
dma_ops_domain_flush_tlb(dom);
-
- for_each_possible_cpu(cpu) {
- struct flush_queue *queue;
- unsigned long flags;
-
- queue = per_cpu_ptr(dom->flush_queue, cpu);
- spin_lock_irqsave(&queue->lock, flags);
- queue_ring_free_flushed(dom, queue);
- spin_unlock_irqrestore(&queue->lock, flags);
- }
}
/*
@@ -1958,11 +1756,6 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
del_domain_from_list(&dom->domain);
- if (timer_pending(&dom->flush_timer))
- del_timer(&dom->flush_timer);
-
- dma_ops_domain_free_flush_queue(dom);
-
put_iova_domain(&dom->iovad);
free_pagetable(&dom->domain);
@@ -1998,16 +1791,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(void)
init_iova_domain(&dma_dom->iovad, PAGE_SIZE,
IOVA_START_PFN, DMA_32BIT_PFN);
- /* Initialize reserved ranges */
- copy_reserved_iova(&reserved_iova_ranges, &dma_dom->iovad);
-
- if (dma_ops_domain_alloc_flush_queue(dma_dom))
+ if (init_iova_flush_queue(&dma_dom->iovad, iova_domain_flush_tlb, NULL))
goto free_dma_dom;
- setup_timer(&dma_dom->flush_timer, queue_flush_timeout,
- (unsigned long)dma_dom);
-
- atomic_set(&dma_dom->flush_timer_on, 0);
+ /* Initialize reserved ranges */
+ copy_reserved_iova(&reserved_iova_ranges, &dma_dom->iovad);
add_domain_to_list(&dma_dom->domain);
@@ -2614,7 +2402,8 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
domain_flush_tlb(&dma_dom->domain);
domain_flush_complete(&dma_dom->domain);
} else {
- queue_add(dma_dom, dma_addr, pages);
+ pages = __roundup_pow_of_two(pages);
+ queue_iova(&dma_dom->iovad, dma_addr >> PAGE_SHIFT, pages, 0);
}
}
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index a79c826991c1..7f85256501c0 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1346,7 +1346,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
if (mask) {
BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
- addr |= (1 << (VTD_PAGE_SHIFT + mask - 1)) - 1;
+ addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
} else
desc.high = QI_DEV_IOTLB_ADDR(addr);
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 919ede7e06d6..4a1e518a15df 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -458,31 +458,6 @@ static LIST_HEAD(dmar_rmrr_units);
#define for_each_rmrr_units(rmrr) \
list_for_each_entry(rmrr, &dmar_rmrr_units, list)
-static void flush_unmaps_timeout(unsigned long data);
-
-struct deferred_flush_entry {
- unsigned long iova_pfn;
- unsigned long nrpages;
- struct dmar_domain *domain;
- struct page *freelist;
-};
-
-#define HIGH_WATER_MARK 250
-struct deferred_flush_table {
- int next;
- struct deferred_flush_entry entries[HIGH_WATER_MARK];
-};
-
-struct deferred_flush_data {
- spinlock_t lock;
- int timer_on;
- struct timer_list timer;
- long size;
- struct deferred_flush_table *tables;
-};
-
-DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
-
/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;
@@ -1309,6 +1284,13 @@ static void dma_free_pagelist(struct page *freelist)
}
}
+static void iova_entry_free(unsigned long data)
+{
+ struct page *freelist = (struct page *)data;
+
+ dma_free_pagelist(freelist);
+}
+
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
@@ -1621,6 +1603,25 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
iommu_flush_dev_iotlb(domain, addr, mask);
}
+static void iommu_flush_iova(struct iova_domain *iovad)
+{
+ struct dmar_domain *domain;
+ int idx;
+
+ domain = container_of(iovad, struct dmar_domain, iovad);
+
+ for_each_domain_iommu(idx, domain) {
+ struct intel_iommu *iommu = g_iommus[idx];
+ u16 did = domain->iommu_did[iommu->seq_id];
+
+ iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+
+ if (!cap_caching_mode(iommu->cap))
+ iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
+ 0, MAX_AGAW_PFN_WIDTH);
+ }
+}
+
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
u32 pmen;
@@ -1931,9 +1932,16 @@ static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
{
int adjust_width, agaw;
unsigned long sagaw;
+ int err;
init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
DMA_32BIT_PFN);
+
+ err = init_iova_flush_queue(&domain->iovad,
+ iommu_flush_iova, iova_entry_free);
+ if (err)
+ return err;
+
domain_reserve_special_ranges(domain);
/* calculate AGAW */
@@ -1985,14 +1993,6 @@ static void domain_exit(struct dmar_domain *domain)
if (!domain)
return;
- /* Flush any lazy unmaps that may reference this domain */
- if (!intel_iommu_strict) {
- int cpu;
-
- for_each_possible_cpu(cpu)
- flush_unmaps_timeout(cpu);
- }
-
/* Remove associated devices and clear attached or cached domains */
rcu_read_lock();
domain_remove_dev_info(domain);
@@ -3205,7 +3205,7 @@ static int __init init_dmars(void)
bool copied_tables = false;
struct device *dev;
struct intel_iommu *iommu;
- int i, ret, cpu;
+ int i, ret;
/*
* for each drhd
@@ -3238,22 +3238,6 @@ static int __init init_dmars(void)
goto error;
}
- for_each_possible_cpu(cpu) {
- struct deferred_flush_data *dfd = per_cpu_ptr(&deferred_flush,
- cpu);
-
- dfd->tables = kzalloc(g_num_of_iommus *
- sizeof(struct deferred_flush_table),
- GFP_KERNEL);
- if (!dfd->tables) {
- ret = -ENOMEM;
- goto free_g_iommus;
- }
-
- spin_lock_init(&dfd->lock);
- setup_timer(&dfd->timer, flush_unmaps_timeout, cpu);
- }
-
for_each_active_iommu(iommu, drhd) {
g_iommus[iommu->seq_id] = iommu;
@@ -3436,10 +3420,9 @@ free_iommu:
disable_dmar_iommu(iommu);
free_dmar_iommu(iommu);
}
-free_g_iommus:
- for_each_possible_cpu(cpu)
- kfree(per_cpu_ptr(&deferred_flush, cpu)->tables);
+
kfree(g_iommus);
+
error:
return ret;
}
@@ -3656,118 +3639,6 @@ static dma_addr_t intel_map_page(struct device *dev, struct page *page,
dir, *dev->dma_mask);
}
-static void flush_unmaps(struct deferred_flush_data *flush_data)
-{
- int i, j;
-
- flush_data->timer_on = 0;
-
- /* just flush them all */
- for (i = 0; i < g_num_of_iommus; i++) {
- struct intel_iommu *iommu = g_iommus[i];
- struct deferred_flush_table *flush_table =
- &flush_data->tables[i];
- if (!iommu)
- continue;
-
- if (!flush_table->next)
- continue;
-
- /* In caching mode, global flushes turn emulation expensive */
- if (!cap_caching_mode(iommu->cap))
- iommu->flush.flush_iotlb(iommu, 0, 0, 0,
- DMA_TLB_GLOBAL_FLUSH);
- for (j = 0; j < flush_table->next; j++) {
- unsigned long mask;
- struct deferred_flush_entry *entry =
- &flush_table->entries[j];
- unsigned long iova_pfn = entry->iova_pfn;
- unsigned long nrpages = entry->nrpages;
- struct dmar_domain *domain = entry->domain;
- struct page *freelist = entry->freelist;
-
- /* On real hardware multiple invalidations are expensive */
- if (cap_caching_mode(iommu->cap))
- iommu_flush_iotlb_psi(iommu, domain,
- mm_to_dma_pfn(iova_pfn),
- nrpages, !freelist, 0);
- else {
- mask = ilog2(nrpages);
- iommu_flush_dev_iotlb(domain,
- (uint64_t)iova_pfn << PAGE_SHIFT, mask);
- }
- free_iova_fast(&domain->iovad, iova_pfn, nrpages);
- if (freelist)
- dma_free_pagelist(freelist);
- }
- flush_table->next = 0;
- }
-
- flush_data->size = 0;
-}
-
-static void flush_unmaps_timeout(unsigned long cpuid)
-{
- struct deferred_flush_data *flush_data = per_cpu_ptr(&deferred_flush, cpuid);
- unsigned long flags;
-
- spin_lock_irqsave(&flush_data->lock, flags);
- flush_unmaps(flush_data);
- spin_unlock_irqrestore(&flush_data->lock, flags);
-}
-
-static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
- unsigned long nrpages, struct page *freelist)
-{
- unsigned long flags;
- int entry_id, iommu_id;
- struct intel_iommu *iommu;
- struct deferred_flush_entry *entry;
- struct deferred_flush_data *flush_data;
- unsigned int cpuid;
-
- cpuid = get_cpu();
- flush_data = per_cpu_ptr(&deferred_flush, cpuid);
-
- /* Flush all CPUs' entries to avoid deferring too much. If
- * this becomes a bottleneck, can just flush us, and rely on
- * flush timer for the rest.
- */
- if (flush_data->size == HIGH_WATER_MARK) {
- int cpu;
-
- for_each_online_cpu(cpu)
- flush_unmaps_timeout(cpu);
- }
-
- spin_lock_irqsave(&flush_data->lock, flags);
-
- /* Need to check that again after we own the lock */
- if (unlikely(flush_data->size == HIGH_WATER_MARK))
- flush_unmaps(flush_data);
-
- iommu = domain_get_iommu(dom);
- iommu_id = iommu->seq_id;
-
- entry_id = flush_data->tables[iommu_id].next;
- ++(flush_data->tables[iommu_id].next);
-
- entry = &flush_data->tables[iommu_id].entries[entry_id];
- entry->domain = dom;
- entry->iova_pfn = iova_pfn;
- entry->nrpages = nrpages;
- entry->freelist = freelist;
-
- if (!flush_data->timer_on) {
- mod_timer(&flush_data->timer, jiffies + msecs_to_jiffies(10));
- flush_data->timer_on = 1;
- }
- flush_data->size++;
- spin_unlock_irqrestore(&flush_data->lock, flags);
-
- put_cpu();
-}
-
static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
{
struct dmar_domain *domain;
@@ -3803,7 +3674,8 @@ static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
dma_free_pagelist(freelist);
} else {
- add_unmap(domain, iova_pfn, nrpages, freelist);
+ queue_iova(&domain->iovad, iova_pfn, nrpages,
+ (unsigned long)freelist);
/*
* queue up the release of the unmap to save the 1/6th of the
* cpu used up by the iotlb flush operation...
@@ -4737,7 +4609,6 @@ static void free_all_cpu_cached_iovas(unsigned int cpu)
static int intel_iommu_cpu_dead(unsigned int cpu)
{
free_all_cpu_cached_iovas(cpu);
- flush_unmaps_timeout(cpu);
return 0;
}
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 5c88ba70e4e0..983faf343ea1 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -31,6 +31,8 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad,
unsigned long limit_pfn);
static void init_iova_rcaches(struct iova_domain *iovad);
static void free_iova_rcaches(struct iova_domain *iovad);
+static void fq_destroy_all_entries(struct iova_domain *iovad);
+static void fq_flush_timeout(unsigned long data);
void
init_iova_domain(struct iova_domain *iovad, unsigned long granule,
@@ -49,10 +51,61 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
iovad->granule = granule;
iovad->start_pfn = start_pfn;
iovad->dma_32bit_pfn = pfn_32bit;
+ iovad->flush_cb = NULL;
+ iovad->fq = NULL;
init_iova_rcaches(iovad);
}
EXPORT_SYMBOL_GPL(init_iova_domain);
+static void free_iova_flush_queue(struct iova_domain *iovad)
+{
+ if (!iovad->fq)
+ return;
+
+ if (timer_pending(&iovad->fq_timer))
+ del_timer(&iovad->fq_timer);
+
+ fq_destroy_all_entries(iovad);
+
+ free_percpu(iovad->fq);
+
+ iovad->fq = NULL;
+ iovad->flush_cb = NULL;
+ iovad->entry_dtor = NULL;
+}
+
+int init_iova_flush_queue(struct iova_domain *iovad,
+ iova_flush_cb flush_cb, iova_entry_dtor entry_dtor)
+{
+ int cpu;
+
+ atomic64_set(&iovad->fq_flush_start_cnt, 0);
+ atomic64_set(&iovad->fq_flush_finish_cnt, 0);
+
+ iovad->fq = alloc_percpu(struct iova_fq);
+ if (!iovad->fq)
+ return -ENOMEM;
+
+ iovad->flush_cb = flush_cb;
+ iovad->entry_dtor = entry_dtor;
+
+ for_each_possible_cpu(cpu) {
+ struct iova_fq *fq;
+
+ fq = per_cpu_ptr(iovad->fq, cpu);
+ fq->head = 0;
+ fq->tail = 0;
+
+ spin_lock_init(&fq->lock);
+ }
+
+ setup_timer(&iovad->fq_timer, fq_flush_timeout, (unsigned long)iovad);
+ atomic_set(&iovad->fq_timer_on, 0);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(init_iova_flush_queue);
+
static struct rb_node *
__get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
{
@@ -427,6 +480,135 @@ free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
}
EXPORT_SYMBOL_GPL(free_iova_fast);
+#define fq_ring_for_each(i, fq) /* visit slots head .. tail-1, wrapping at IOVA_FQ_SIZE */ \
+ for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % IOVA_FQ_SIZE)
+
+/*
+ * Report whether @fq has no free slot left. One slot is always kept unused:
+ * the ring counts as full when advancing tail would make it equal to head.
+ * The caller must hold fq->lock.
+ */
+static inline bool fq_full(struct iova_fq *fq)
+{
+	unsigned next_tail;
+
+	assert_spin_locked(&fq->lock);
+
+	next_tail = (fq->tail + 1) % IOVA_FQ_SIZE;
+
+	return next_tail == fq->head;
+}
+
+/*
+ * Claim the slot at the current tail of @fq and advance tail by one
+ * (modulo IOVA_FQ_SIZE). Returns the index of the claimed slot. Does not
+ * check for a full ring itself. The caller must hold fq->lock.
+ */
+static inline unsigned fq_ring_add(struct iova_fq *fq)
+{
+	unsigned slot;
+
+	assert_spin_locked(&fq->lock);
+
+	slot = fq->tail;
+	fq->tail = (slot + 1) % IOVA_FQ_SIZE;
+
+	return slot;
+}
+
+/*
+ * Release, oldest first, every entry of @fq whose stamp is below the
+ * domain's finished-flush count, and stop at the first entry that is not
+ * yet covered by a finished flush. For each released entry the optional
+ * entry_dtor is run and the range is handed to free_iova_fast().
+ * The caller must hold fq->lock.
+ */
+static void fq_ring_free(struct iova_domain *iovad, struct iova_fq *fq)
+{
+	u64 finished = atomic64_read(&iovad->fq_flush_finish_cnt);
+	struct iova_fq_entry *entry;
+
+	assert_spin_locked(&fq->lock);
+
+	while (fq->head != fq->tail) {
+		entry = &fq->entries[fq->head];
+
+		/* Queued during or after the last finished flush: keep it. */
+		if (entry->counter >= finished)
+			break;
+
+		if (iovad->entry_dtor)
+			iovad->entry_dtor(entry->data);
+
+		free_iova_fast(iovad, entry->iova_pfn, entry->pages);
+
+		fq->head = (fq->head + 1) % IOVA_FQ_SIZE;
+	}
+}
+
+static void iova_domain_flush(struct iova_domain *iovad)
+{
+ atomic64_inc(&iovad->fq_flush_start_cnt); /* flush begins; queue_iova() stamps entries with this count */
+ iovad->flush_cb(iovad); /* callback registered via init_iova_flush_queue() */
+ atomic64_inc(&iovad->fq_flush_finish_cnt); /* flush done; fq_ring_free() releases entries stamped below this count */
+}
+
+/*
+ * Call the domain's entry_dtor on every entry still present in any per-CPU
+ * flush queue of @iovad. Returns immediately when no destructor is set.
+ * The queues themselves are left untouched (head/tail are not advanced) and
+ * no queue lock is taken here.
+ */
+static void fq_destroy_all_entries(struct iova_domain *iovad)
+{
+	int cpu;
+
+	/*
+	 * This code runs when the iova_domain is being destroyed, so don't
+	 * bother to free iovas, just call the entry_dtor on all remaining
+	 * entries.
+	 */
+	if (!iovad->entry_dtor)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct iova_fq *fq = per_cpu_ptr(iovad->fq, cpu);
+		/* unsigned, matching fq->head/fq->tail and fq_ring_free() */
+		unsigned idx;
+
+		fq_ring_for_each(idx, fq)
+			iovad->entry_dtor(fq->entries[idx].data);
+	}
+}
+
+/*
+ * Timer handler for the domain's flush timer.
+ * @data: the struct iova_domain pointer that init_iova_flush_queue() passed
+ *        to setup_timer(), carried as an unsigned long.
+ *
+ * Flushes the domain once and then, for every possible CPU, releases the
+ * entries of that CPU's queue that the flush has made reclaimable.
+ */
+static void fq_flush_timeout(unsigned long data)
+{
+	struct iova_domain *domain = (struct iova_domain *)data;
+	struct iova_fq *cpu_fq;
+	unsigned long irq_flags;
+	int cpu;
+
+	/* Clear the flag first so that queue_iova() may re-arm the timer. */
+	atomic_set(&domain->fq_timer_on, 0);
+	iova_domain_flush(domain);
+
+	for_each_possible_cpu(cpu) {
+		cpu_fq = per_cpu_ptr(domain->fq, cpu);
+
+		spin_lock_irqsave(&cpu_fq->lock, irq_flags);
+		fq_ring_free(domain, cpu_fq);
+		spin_unlock_irqrestore(&cpu_fq->lock, irq_flags);
+	}
+}
+
+/*
+ * queue_iova - defer the release of an IOVA range until after a flush
+ * @iovad: domain whose flush queue receives the entry
+ * @pfn:   first pfn of the range, later passed to free_iova_fast()
+ * @pages: size of the range, later passed to free_iova_fast()
+ * @data:  value later passed to the domain's entry_dtor, if one is set
+ *
+ * The entry is appended to the current CPU's queue and stamped with the
+ * current started-flush count. If the flush timer is not marked active it
+ * is armed to fire after IOVA_FQ_TIMEOUT milliseconds.
+ */
+void queue_iova(struct iova_domain *iovad,
+		unsigned long pfn, unsigned long pages,
+		unsigned long data)
+{
+	struct iova_fq *fq = get_cpu_ptr(iovad->fq);
+	struct iova_fq_entry *entry;
+	unsigned long flags;
+
+	spin_lock_irqsave(&fq->lock, flags);
+
+	/*
+	 * Drop whatever has already been flushed out on another CPU first;
+	 * that makes it less likely that the queue turns out to be full.
+	 */
+	fq_ring_free(iovad, fq);
+
+	/* Still no room: flush right now and reclaim again. */
+	if (fq_full(fq)) {
+		iova_domain_flush(iovad);
+		fq_ring_free(iovad, fq);
+	}
+
+	entry = &fq->entries[fq_ring_add(fq)];
+
+	entry->iova_pfn = pfn;
+	entry->pages = pages;
+	entry->data = data;
+	entry->counter = atomic64_read(&iovad->fq_flush_start_cnt);
+
+	spin_unlock_irqrestore(&fq->lock, flags);
+
+	/* Only the caller that flips fq_timer_on from 0 to 1 arms the timer. */
+	if (!atomic_cmpxchg(&iovad->fq_timer_on, 0, 1))
+		mod_timer(&iovad->fq_timer,
+			  jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT));
+
+	put_cpu_ptr(iovad->fq);
+}
+EXPORT_SYMBOL_GPL(queue_iova);
+
/**
* put_iova_domain - destroys the iova doamin
* @iovad: - iova domain in question.
@@ -437,6 +619,7 @@ void put_iova_domain(struct iova_domain *iovad)
struct rb_node *node;
unsigned long flags;
+ free_iova_flush_queue(iovad);
free_iova_rcaches(iovad);
spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
node = rb_first(&iovad->rbroot);
diff --git a/include/linux/iova.h b/include/linux/iova.h
index e0a892ae45c0..d179b9bf7814 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/rbtree.h>
+#include <linux/atomic.h>
#include <linux/dma-mapping.h>
/* iova structure */
@@ -36,6 +37,35 @@ struct iova_rcache {
struct iova_cpu_rcache __percpu *cpu_rcaches;
};
+struct iova_domain;
+
+/* Call-Back from IOVA code into IOMMU drivers */
+typedef void (* iova_flush_cb)(struct iova_domain *domain);
+
+/* Destructor for per-entry data */
+typedef void (* iova_entry_dtor)(unsigned long data);
+
+/* Number of entries per Flush Queue */
+#define IOVA_FQ_SIZE 256
+
+/* Timeout (in ms) after which entries are flushed from the Flush-Queue */
+#define IOVA_FQ_TIMEOUT 10
+
+/* Flush Queue entry for deferred flushing */
+struct iova_fq_entry {
+ unsigned long iova_pfn; /* pfn given to queue_iova(); passed to free_iova_fast() on release */
+ unsigned long pages; /* page count given to queue_iova(); passed to free_iova_fast() on release */
+ unsigned long data; /* opaque value handed to the domain's entry_dtor */
+ u64 counter; /* Flush counter when this entry was added */
+};
+
+/* Per-CPU Flush Queue structure: a ring of IOVA_FQ_SIZE entries */
+struct iova_fq {
+ struct iova_fq_entry entries[IOVA_FQ_SIZE]; /* ring storage, indexed modulo IOVA_FQ_SIZE */
+ unsigned head, tail; /* head: oldest queued entry; tail: next slot to fill; equal when empty */
+ spinlock_t lock; /* taken by queue_iova() and fq_flush_timeout() around ring updates */
+};
+
/* holds all the iova translations for a domain */
struct iova_domain {
spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
@@ -45,6 +75,25 @@ struct iova_domain {
unsigned long start_pfn; /* Lower limit for this domain */
unsigned long dma_32bit_pfn;
struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE]; /* IOVA range caches */
+
+ iova_flush_cb flush_cb; /* Call-Back function to flush IOMMU
+ TLBs */
+
+ iova_entry_dtor entry_dtor; /* IOMMU driver specific destructor for
+ iova entry */
+
+ struct iova_fq __percpu *fq; /* Flush Queue */
+
+ atomic64_t fq_flush_start_cnt; /* Number of TLB flushes that
+ have been started */
+
+ atomic64_t fq_flush_finish_cnt; /* Number of TLB flushes that
+ have been finished */
+
+ struct timer_list fq_timer; /* Timer to regularly empty the
+ flush-queues */
+ atomic_t fq_timer_on; /* 1 when timer is active, 0
+ when not */
};
static inline unsigned long iova_size(struct iova *iova)
@@ -95,6 +144,9 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
bool size_aligned);
void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
unsigned long size);
+void queue_iova(struct iova_domain *iovad,
+ unsigned long pfn, unsigned long pages,
+ unsigned long data);
unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
unsigned long limit_pfn);
struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
@@ -102,6 +154,8 @@ struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to);
void init_iova_domain(struct iova_domain *iovad, unsigned long granule,
unsigned long start_pfn, unsigned long pfn_32bit);
+int init_iova_flush_queue(struct iova_domain *iovad,
+ iova_flush_cb flush_cb, iova_entry_dtor entry_dtor);
struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn);
void put_iova_domain(struct iova_domain *iovad);
struct iova *split_and_remove_iova(struct iova_domain *iovad,
@@ -148,6 +202,12 @@ static inline void free_iova_fast(struct iova_domain *iovad,
{
}
+static inline void queue_iova(struct iova_domain *iovad,
+ unsigned long pfn, unsigned long pages,
+ unsigned long data)
+{ /* Stub variant: intentionally does nothing with its arguments. */
+}
+
static inline unsigned long alloc_iova_fast(struct iova_domain *iovad,
unsigned long size,
unsigned long limit_pfn)
@@ -174,6 +234,13 @@ static inline void init_iova_domain(struct iova_domain *iovad,
{
}
+static inline int init_iova_flush_queue(struct iova_domain *iovad,
+ iova_flush_cb flush_cb,
+ iova_entry_dtor entry_dtor)
+{
+ return -ENODEV; /* stub variant: always fails, no flush queue is ever set up */
+}
+
static inline struct iova *find_iova(struct iova_domain *iovad,
unsigned long pfn)
{