author    Kernel Build Daemon <kbuild@suse.de>  2018-01-17 07:01:09 +0100
committer Kernel Build Daemon <kbuild@suse.de>  2018-01-17 07:01:09 +0100
commit    4908d430bb155c8e60b323cf645a1b27ace583c8 (patch)
tree      40fab581eba264d564a4f8b315dfb8b04b013079
parent    443325a83637809f699e2fe6986fce9fb9bff351 (diff)
parent    7eb0153263c961e6147e49ef5d2a31f0c883b33d (diff)

Merge branch 'SLE15' into openSUSE-15.0
-rw-r--r--  Documentation/x86/x86_64/mm.txt | 26
-rw-r--r--  arch/powerpc/include/asm/mmu_context.h | 5
-rw-r--r--  arch/s390/include/asm/mmu_context.h | 27
-rw-r--r--  arch/um/include/asm/mmu_context.h | 3
-rw-r--r--  arch/unicore32/include/asm/mmu_context.h | 5
-rw-r--r--  arch/x86/entry/entry_32.S | 12
-rw-r--r--  arch/x86/entry/entry_64.S | 4
-rw-r--r--  arch/x86/entry/vsyscall/vsyscall_64.c | 38
-rw-r--r--  arch/x86/include/asm/cpu_entry_area.h | 68
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 9
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 11
-rw-r--r--  arch/x86/include/asm/desc.h | 1
-rw-r--r--  arch/x86/include/asm/espfix.h | 7
-rw-r--r--  arch/x86/include/asm/fixmap.h | 71
-rw-r--r--  arch/x86/include/asm/invpcid.h | 53
-rw-r--r--  arch/x86/include/asm/mmu.h | 4
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 37
-rw-r--r--  arch/x86/include/asm/pgtable_32_types.h | 15
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 55
-rw-r--r--  arch/x86/include/asm/processor.h | 6
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 4
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 136
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 4
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 2
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/common.c | 100
-rw-r--r--  arch/x86/kernel/cpu/cpuid-deps.c | 121
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 13
-rw-r--r--  arch/x86/kernel/dumpstack.c | 11
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 6
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 12
-rw-r--r--  arch/x86/kernel/ldt.c | 68
-rw-r--r--  arch/x86/kernel/smpboot.c | 6
-rw-r--r--  arch/x86/kernel/traps.c | 6
-rw-r--r--  arch/x86/mm/Makefile | 2
-rw-r--r--  arch/x86/mm/cpu_entry_area.c | 138
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 98
-rw-r--r--  arch/x86/mm/init_32.c | 6
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 15
-rw-r--r--  arch/x86/mm/kaslr.c | 32
-rw-r--r--  arch/x86/mm/pgtable_32.c | 1
-rw-r--r--  arch/x86/mm/tlb.c | 10
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c | 2
-rw-r--r--  arch/x86/xen/mmu_pv.c | 2
-rw-r--r--  drivers/cdrom/cdrom.c | 119
-rw-r--r--  drivers/md/bcache/alloc.c | 21
-rw-r--r--  drivers/md/bcache/bcache.h | 20
-rw-r--r--  drivers/md/bcache/btree.c | 22
-rw-r--r--  drivers/md/bcache/btree.h | 2
-rw-r--r--  drivers/md/bcache/closure.c | 17
-rw-r--r--  drivers/md/bcache/closure.h | 10
-rw-r--r--  drivers/md/bcache/extents.c | 2
-rw-r--r--  drivers/md/bcache/journal.c | 7
-rw-r--r--  drivers/md/bcache/request.c | 47
-rw-r--r--  drivers/md/bcache/super.c | 62
-rw-r--r--  drivers/md/bcache/sysfs.c | 47
-rw-r--r--  drivers/md/bcache/util.c | 60
-rw-r--r--  drivers/md/bcache/util.h | 4
-rw-r--r--  drivers/md/bcache/writeback.c | 135
-rw-r--r--  drivers/md/bcache/writeback.h | 27
-rw-r--r--  drivers/scsi/sr.c | 10
-rw-r--r--  include/asm-generic/mm_hooks.h | 5
-rw-r--r--  include/asm-generic/pgtable.h | 5
-rw-r--r--  include/linux/bpf.h | 11
-rw-r--r--  include/linux/bpf_verifier.h | 21
-rw-r--r--  include/linux/delay.h | 12
-rw-r--r--  include/linux/filter.h | 47
-rw-r--r--  include/uapi/linux/bcache.h | 2
-rw-r--r--  init/main.c | 6
-rw-r--r--  kernel/bpf/verifier.c | 754
-rw-r--r--  kernel/fork.c | 3
-rw-r--r--  kernel/trace/bpf_trace.c | 31
-rw-r--r--  net/core/filter.c | 238
-rw-r--r--  sound/core/seq/seq_clientmgr.c | 3
-rw-r--r--  sound/core/seq/seq_clientmgr.h | 1
-rw-r--r--  tools/testing/selftests/x86/ldt_gdt.c | 9
76 files changed, 1823 insertions(+), 1190 deletions(-)
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index b0798e281aa6..3bad9c2977d3 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -1,6 +1,4 @@
-<previous description obsolete, deleted>
-
Virtual memory map with 4 level page tables:
0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
@@ -14,13 +12,16 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
... unused hole ...
+ vaddr_end for KASLR
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space (variable)
+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Virtual memory map with 5 level page tables:
@@ -41,14 +42,16 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space
+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Architecture defines a 64-bit virtual address. Implementations can support
less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
-through to the most-significant implemented bit are set to either all ones
-or all zero. This causes hole between user space and kernel addresses.
+through to the most-significant implemented bit are sign extended.
+This causes hole between user space and kernel addresses if you interpret them
+as unsigned.
The direct mapping covers all memory in the system up to the highest
memory address (this means in some cases it can also include PCI memory
@@ -58,9 +61,6 @@ vmalloc space is lazily synchronized into the different PML4/PML5 pages of
the processes using the page fault handler, with init_top_pgt as
reference.
-Current X86-64 implementations support up to 46 bits of address space (64 TB),
-which is our current limit. This expands into MBZ space in the page tables.
-
We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual
memory window (this size is arbitrary, it can be raised later if needed).
The mappings are not part of any other kernel PGD and are only available
@@ -73,4 +73,6 @@ Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
physical memory, vmalloc/ioremap space and virtual memory map are randomized.
Their order is preserved but their base will be offset early at boot time.
--Andi Kleen, Jul 2004
+Be very careful vs. KASLR when changing anything here. The KASLR address
+range must not overlap with anything except the KASAN shadow area, which is
+correct as KASAN disables KASLR.
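
The sign-extension rule in the hunk above is what makes an address "canonical". As a quick illustration, a minimal user-space sketch (hypothetical helper, 48-bit virtual addresses assumed):

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch: an address is canonical for a 48-bit VA space iff bits
     * 63..47 all replicate bit 47, i.e. sign-extending from bit 47 is
     * a no-op. */
    static bool is_canonical_48(uint64_t addr)
    {
            return ((int64_t)(addr << 16) >> 16) == (int64_t)addr;
    }

Everything failing this check falls into the hole between 0x00007fffffffffff and 0xffff800000000000 that the text describes.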
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index c480f4811aba..effcbddba60b 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -163,9 +163,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
#endif
}
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+ struct mm_struct *mm)
{
+ return 0;
}
static inline void arch_exit_mmap(struct mm_struct *mm)
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 8712e11bead4..24ef6ab2b94b 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -12,6 +12,7 @@
#include <linux/mm_types.h>
#include <asm/tlbflush.h>
#include <asm/ctl_reg.h>
+#include <asm-generic/mm_hooks.h>
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
@@ -131,30 +132,4 @@ static inline void activate_mm(struct mm_struct *prev,
set_user_asce(next);
}
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
-{
-}
-
-static inline void arch_exit_mmap(struct mm_struct *mm)
-{
-}
-
-static inline void arch_unmap(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
-}
-
-static inline void arch_bprm_mm_init(struct mm_struct *mm,
- struct vm_area_struct *vma)
-{
-}
-
-static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
- bool write, bool execute, bool foreign)
-{
- /* by default, allow everything */
- return true;
-}
#endif /* __S390_MMU_CONTEXT_H */
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index b668e351fd6c..fca34b2177e2 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -15,9 +15,10 @@ extern void uml_setup_stubs(struct mm_struct *mm);
/*
* Needed since we do not use the asm-generic/mm_hooks.h:
*/
-static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
uml_setup_stubs(mm);
+ return 0;
}
extern void arch_exit_mmap(struct mm_struct *mm);
static inline void arch_unmap(struct mm_struct *mm,
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index 59b06b48f27d..5c205a9cb5a6 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -81,9 +81,10 @@ do { \
} \
} while (0)
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+ struct mm_struct *mm)
{
+ return 0;
}
static inline void arch_unmap(struct mm_struct *mm,
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 91a2ef302c7f..329d10aa3149 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -957,9 +957,9 @@ ENTRY(debug)
/* Are we currently on the SYSENTER stack? */
movl PER_CPU_VAR(cpu_entry_area), %ecx
- addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
- subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
- cmpl $SIZEOF_SYSENTER_stack, %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
jb .Ldebug_from_sysenter_stack
TRACE_IRQS_OFF
@@ -1001,9 +1001,9 @@ ENTRY(nmi)
/* Are we currently on the SYSENTER stack? */
movl PER_CPU_VAR(cpu_entry_area), %ecx
- addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
- subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
- cmpl $SIZEOF_SYSENTER_stack, %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
jb .Lnmi_from_sysenter_stack
/* Not on SYSENTER stack. */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1371359c2f5e..83be58380317 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -153,8 +153,8 @@ END(native_usergs_sysret64)
_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \
- SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
+#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \
+ SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
ENTRY(entry_SYSCALL_64_trampoline)
UNWIND_HINT_EMPTY
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index ce1d7534fa53..5e56a4ced848 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -36,6 +36,7 @@
#include <asm/unistd.h>
#include <asm/fixmap.h>
#include <asm/traps.h>
+#include <asm/paravirt.h>
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
@@ -137,6 +138,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
WARN_ON_ONCE(address != regs->ip);
+ /* This should be unreachable in NATIVE mode. */
+ if (WARN_ON(vsyscall_mode == NATIVE))
+ return false;
+
if (vsyscall_mode == NONE) {
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall attempted with vsyscall=none");
@@ -328,16 +333,47 @@ int in_gate_area_no_mm(unsigned long addr)
return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
}
+/*
+ * The VSYSCALL page is the only user-accessible page in the kernel address
+ * range. Normally, the kernel page tables can have _PAGE_USER clear, but
+ * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
+ * are enabled.
+ *
+ * Some day we may create a "minimal" vsyscall mode in which we emulate
+ * vsyscalls but leave the page not present. If so, we skip calling
+ * this.
+ */
+static void __init set_vsyscall_pgtable_user_bits(void)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset_k(VSYSCALL_ADDR);
+ set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
+ p4d = p4d_offset(pgd, VSYSCALL_ADDR);
+#if CONFIG_PGTABLE_LEVELS >= 5
+ p4d->p4d |= _PAGE_USER;
+#endif
+ pud = pud_offset(p4d, VSYSCALL_ADDR);
+ set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
+ pmd = pmd_offset(pud, VSYSCALL_ADDR);
+ set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
+}
+
void __init map_vsyscall(void)
{
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
- if (vsyscall_mode != NONE)
+ if (vsyscall_mode != NONE) {
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR);
+ set_vsyscall_pgtable_user_bits();
+ }
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
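
To see why those page-table levels need _PAGE_USER, recall how the legacy ABI is consumed: old binaries call the fixed address directly. A hedged user-space sketch (gettimeofday() is the entry at offset 0 of the vsyscall page; under vsyscall=emulate the call faults and lands in emulate_vsyscall() above):

    #include <stddef.h>
    #include <sys/time.h>

    typedef int (*vgtod_t)(struct timeval *, struct timezone *);

    /* Sketch only: new code should use the vDSO, never this address. */
    static int legacy_vsyscall_gettimeofday(struct timeval *tv)
    {
            vgtod_t vgtod = (vgtod_t)0xffffffffff600000UL;
            return vgtod(tv, NULL);
    }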
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
new file mode 100644
index 000000000000..2fbc69a0916e
--- /dev/null
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _ASM_X86_CPU_ENTRY_AREA_H
+#define _ASM_X86_CPU_ENTRY_AREA_H
+
+#include <linux/percpu-defs.h>
+#include <asm/processor.h>
+
+/*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+ * and early entry/exit code. Real types aren't used for all fields here
+ * to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+ char gdt[PAGE_SIZE];
+
+ /*
+ * The GDT is just below entry_stack and thus serves (on x86_64) as
+ * a a read-only guard page.
+ */
+ struct entry_stack_page entry_stack_page;
+
+ /*
+ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
+ * we need task switches to work, and task switches write to the TSS.
+ */
+ struct tss_struct tss;
+
+ char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+ /*
+ * Exception stacks used for IST entries.
+ *
+ * In the future, this should have a separate slot for each stack
+ * with guard pages between them.
+ */
+ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+};
+
+#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
+#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
+
+DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+
+extern void setup_cpu_entry_areas(void);
+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
+
+#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
+
+#define CPU_ENTRY_AREA_MAP_SIZE \
+ (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE)
+
+extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
+
+static inline struct entry_stack *cpu_entry_stack(int cpu)
+{
+ return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
+}
+
+#endif
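
The macros above imply a simple linear layout: the read-only IDT page at the base, then one cpu_entry_area per possible CPU. A sketch of the address arithmetic, consistent with those macros (get_cpu_entry_area() itself lives in the new arch/x86/mm/cpu_entry_area.c, truncated at the end of this diff):

    /* Sketch: area N starts at CPU_ENTRY_AREA_PER_CPU plus N areas. */
    struct cpu_entry_area *example_get_cpu_entry_area(int cpu)
    {
            unsigned long va = CPU_ENTRY_AREA_PER_CPU +
                               cpu * CPU_ENTRY_AREA_SIZE;
            return (struct cpu_entry_area *)va;
    }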
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 638ebbf482c0..8b9915561ed1 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -125,11 +125,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
-#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
-#define setup_clear_cpu_cap(bit) do { \
- clear_cpu_cap(&boot_cpu_data, bit); \
- set_bit(bit, (unsigned long *)cpu_caps_cleared); \
-} while (0)
+
+extern void setup_clear_cpu_cap(unsigned int bit);
+extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
+
#define setup_force_cpu_cap(bit) do { \
set_cpu_cap(&boot_cpu_data, bit); \
set_bit(bit, (unsigned long *)cpu_caps_set); \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index ac28a2d838bd..1e8b60fda51f 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -21,6 +21,11 @@
* this feature bit is not displayed in /proc/cpuinfo at all.
*/
+/*
+ * When adding new features here that depend on other features,
+ * please update the table in kernel/cpu/cpuid-deps.c
+ */
+
/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
@@ -294,6 +299,12 @@
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
+#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
+#define X86_FEATURE_VPCLMULQDQ (16*32+ 10) /* Carry-Less Multiplication Double Quadword */
+#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */
+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4c730d1caa3c..9bd748dd04d7 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -5,6 +5,7 @@
#include <asm/ldt.h>
#include <asm/mmu.h>
#include <asm/fixmap.h>
+#include <asm/cpu_entry_area.h>
#include <linux/smp.h>
#include <linux/percpu.h>
diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
index ca3ce9ab9385..e7009ac975af 100644
--- a/arch/x86/include/asm/espfix.h
+++ b/arch/x86/include/asm/espfix.h
@@ -1,7 +1,7 @@
#ifndef _ASM_X86_ESPFIX_H
#define _ASM_X86_ESPFIX_H
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_ESPFIX64
#include <asm/percpu.h>
@@ -10,7 +10,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
extern void init_espfix_bsp(void);
extern void init_espfix_ap(int cpu);
-
-#endif /* CONFIG_X86_64 */
+#else
+static inline void init_espfix_ap(int cpu) { }
+#endif
#endif /* _ASM_X86_ESPFIX_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4c6d4a199ba1..2551e863d3c8 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -45,46 +45,6 @@ extern unsigned long __FIXADDR_TOP;
#endif
/*
- * cpu_entry_area is a percpu region in the fixmap that contains things
- * needed by the CPU and early entry/exit code. Real types aren't used
- * for all fields here to avoid circular header dependencies.
- *
- * Every field is a virtual alias of some other allocated backing store.
- * There is no direct allocation of a struct cpu_entry_area.
- */
-struct cpu_entry_area {
- char gdt[PAGE_SIZE];
-
- /*
- * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
- * a a read-only guard page.
- */
- struct SYSENTER_stack_page SYSENTER_stack_page;
-
- /*
- * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
- * we need task switches to work, and task switches write to the TSS.
- */
- struct tss_struct tss;
-
- char entry_trampoline[PAGE_SIZE];
-
-#ifdef CONFIG_X86_64
- /*
- * Exception stacks used for IST entries.
- *
- * In the future, this should have a separate slot for each stack
- * with guard pages between them.
- */
- char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
-#endif
-};
-
-#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
-
-extern void setup_cpu_entry_areas(void);
-
-/*
* Here we define all the compile-time 'special' virtual
* addresses. The point is to have a constant address at
* compile time, but to set the physical address only
@@ -123,7 +83,6 @@ enum fixed_addresses {
FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif
- FIX_RO_IDT, /* Virtual mapping for read-only IDT */
#ifdef CONFIG_X86_32
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
@@ -139,9 +98,6 @@ enum fixed_addresses {
#ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC,
#endif
- /* Fixmap entries to remap the GDTs, one per processor. */
- FIX_CPU_ENTRY_AREA_TOP,
- FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
__end_of_permanent_fixed_addresses,
@@ -176,7 +132,7 @@ enum fixed_addresses {
extern void reserve_top_address(unsigned long reserve);
#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
extern int fixmaps_set;
@@ -224,30 +180,5 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
void __early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags);
-static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
-{
- BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
-
- return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
-}
-
-#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \
- BUILD_BUG_ON(offset % PAGE_SIZE != 0); \
- __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \
- })
-
-#define get_cpu_entry_area_index(cpu, field) \
- __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
-
-static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
-{
- return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
-}
-
-static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
-{
- return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
-}
-
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h
new file mode 100644
index 000000000000..989cfa86de85
--- /dev/null
+++ b/arch/x86/include/asm/invpcid.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INVPCID
+#define _ASM_X86_INVPCID
+
+static inline void __invpcid(unsigned long pcid, unsigned long addr,
+ unsigned long type)
+{
+ struct { u64 d[2]; } desc = { { pcid, addr } };
+
+ /*
+ * The memory clobber is because the whole point is to invalidate
+ * stale TLB entries and, especially if we're flushing global
+ * mappings, we don't want the compiler to reorder any subsequent
+ * memory accesses before the TLB flush.
+ *
+ * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
+ * invpcid (%rcx), %rax in long mode.
+ */
+ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
+ : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+}
+
+#define INVPCID_TYPE_INDIV_ADDR 0
+#define INVPCID_TYPE_SINGLE_CTXT 1
+#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
+#define INVPCID_TYPE_ALL_NON_GLOBAL 3
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invpcid_flush_one(unsigned long pcid,
+ unsigned long addr)
+{
+ __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invpcid_flush_single_context(unsigned long pcid)
+{
+ __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invpcid_flush_all(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invpcid_flush_all_nonglobals(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+}
+
+#endif /* _ASM_X86_INVPCID */
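
For reference, how the four helpers map onto the INVPCID descriptor types, assuming the CPU has X86_FEATURE_INVPCID (sketch; the PCID value and address are made up):

    static void example_invpcid_usage(void)
    {
            invpcid_flush_one(6, 0x7f1234567000UL); /* one VA in PCID 6      */
            invpcid_flush_single_context(6);        /* all of PCID 6         */
            invpcid_flush_all();                    /* all PCIDs + globals   */
            invpcid_flush_all_nonglobals();         /* all PCIDs, no globals */
    }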
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index bb8c597c2248..2d7e852b2dad 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -2,6 +2,7 @@
#define _ASM_X86_MMU_H
#include <linux/spinlock.h>
+#include <linux/rwsem.h>
#include <linux/mutex.h>
#include <linux/atomic.h>
@@ -26,7 +27,8 @@ typedef struct {
atomic64_t tlb_gen;
#ifdef CONFIG_MODIFY_LDT_SYSCALL
- struct ldt_struct *ldt;
+ struct rw_semaphore ldt_usr_sem;
+ struct ldt_struct *ldt;
#endif
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 1db3b4ff1f76..6972a3e9bc71 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -56,11 +56,17 @@ struct ldt_struct {
/*
* Used for LDT copy/destruction.
*/
-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
+static inline void init_new_context_ldt(struct mm_struct *mm)
+{
+ mm->context.ldt = NULL;
+ init_rwsem(&mm->context.ldt_usr_sem);
+}
+int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
-static inline int init_new_context_ldt(struct task_struct *tsk,
- struct mm_struct *mm)
+static inline void init_new_context_ldt(struct mm_struct *mm) { }
+static inline int ldt_dup_context(struct mm_struct *oldmm,
+ struct mm_struct *mm)
{
return 0;
}
@@ -131,18 +137,21 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
+ mutex_init(&mm->context.lock);
+
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0);
- #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
/* pkey 0 is the default and always allocated */
mm->context.pkey_allocation_map = 0x1;
/* -1 means unallocated or invalid */
mm->context.execute_only_pkey = -1;
}
- #endif
- return init_new_context_ldt(tsk, mm);
+#endif
+ init_new_context_ldt(mm);
+ return 0;
}
static inline void destroy_context(struct mm_struct *mm)
{
@@ -175,10 +184,10 @@ do { \
} while (0)
#endif
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
paravirt_arch_dup_mmap(oldmm, mm);
+ return ldt_dup_context(oldmm, mm);
}
static inline void arch_exit_mmap(struct mm_struct *mm)
@@ -292,16 +301,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
-static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
-{
- return __sme_pa(mm->pgd) | asid;
-}
-
-static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
-{
- return __sme_pa(mm->pgd) | asid | CR3_NOFLUSH;
-}
-
/*
* This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) read_cr3().
@@ -311,7 +310,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
*/
static inline unsigned long __get_current_cr3_fast(void)
{
- unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm),
+ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* For now, be very restrictive about when this can be called. */
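
The void-to-int conversion of arch_dup_mmap() across all the architectures in this merge exists so that a failing LDT copy can abort fork(). A hedged sketch of the consumer side (kernel/fork.c is in the diffstat but not shown here; the function name is illustrative):

    /* Sketch: dup_mmap() now propagates arch_dup_mmap() failure, so
     * an -ENOMEM from ldt_dup_context() fails fork() cleanly. */
    static int example_dup_mmap_tail(struct mm_struct *oldmm,
                                     struct mm_struct *mm)
    {
            return arch_dup_mmap(oldmm, mm);
    }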
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 9fb2f2bc8245..67b60e11b70d 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -37,13 +37,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
#define LAST_PKMAP 1024
#endif
-#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
- & PMD_MASK)
+/*
+ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
+ * to avoid include recursion hell
+ */
+#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40)
+
+#define CPU_ENTRY_AREA_BASE \
+ ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
+
+#define PKMAP_BASE \
+ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
#ifdef CONFIG_HIGHMEM
# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
#else
-# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
+# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE)
#endif
#define MODULES_VADDR VMALLOC_START
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 06470da156ba..5a78b467d0df 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -74,33 +74,48 @@ typedef struct { pteval_t pte; } pte_t;
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+/*
+ * See Documentation/x86/x86_64/mm.txt for a description of the memory map.
+ *
+ * Be very careful vs. KASLR when changing anything here. The KASLR address
+ * range must not overlap with anything except the KASAN shadow area, which
+ * is correct as KASAN disables KASLR.
+ */
+#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+
#ifdef CONFIG_X86_5LEVEL
-#define VMALLOC_SIZE_TB _AC(16384, UL)
-#define __VMALLOC_BASE _AC(0xff92000000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
+# define VMALLOC_SIZE_TB _AC(16384, UL)
+# define __VMALLOC_BASE _AC(0xff92000000000000, UL)
+# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
#else
-#define VMALLOC_SIZE_TB _AC(32, UL)
-#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
+# define VMALLOC_SIZE_TB _AC(32, UL)
+# define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
+# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
#endif
+
#ifdef CONFIG_RANDOMIZE_MEMORY
-#define VMALLOC_START vmalloc_base
-#define VMEMMAP_START vmemmap_base
+# define VMALLOC_START vmalloc_base
+# define VMEMMAP_START vmemmap_base
#else
-#define VMALLOC_START __VMALLOC_BASE
-#define VMEMMAP_START __VMEMMAP_BASE
+# define VMALLOC_START __VMALLOC_BASE
+# define VMEMMAP_START __VMEMMAP_BASE
#endif /* CONFIG_RANDOMIZE_MEMORY */
-#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
-#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+
+#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
+
+#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */
-#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
-#define MODULES_LEN (MODULES_END - MODULES_VADDR)
-#define ESPFIX_PGD_ENTRY _AC(-2, UL)
-#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
-#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
-#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
+#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+
+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+
+#define CPU_ENTRY_AREA_PGD _AC(-3, UL)
+#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT)
+
+#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
+#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
#define EARLY_DYNAMIC_PAGE_TABLES 64
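
The new PGD slot is easy to sanity-check against the mm.txt hunk at the top of this diff. Worked out as a comment (4-level paging assumed, where P4D_SHIFT is 39):

    /* Worked check:
     *   ESPFIX_BASE_ADDR    = -2UL << 39 = 0xffffff0000000000
     *   CPU_ENTRY_AREA_BASE = -3UL << 39 = 0xfffffe8000000000
     * which match the "%esp fixup stacks" and "cpu_entry_area mapping"
     * rows in Documentation/x86/x86_64/mm.txt above. */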
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7a3718b76dad..147d363e3958 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -335,12 +335,12 @@ struct x86_hw_tss {
#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
#define INVALID_IO_BITMAP_OFFSET 0x8000
-struct SYSENTER_stack {
+struct entry_stack {
unsigned long words[64];
};
-struct SYSENTER_stack_page {
- struct SYSENTER_stack stack;
+struct entry_stack_page {
+ struct entry_stack stack;
} __aligned(PAGE_SIZE);
struct tss_struct {
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 95f999576131..3b3cc5ba579a 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -15,7 +15,7 @@ enum stack_type {
STACK_TYPE_TASK,
STACK_TYPE_IRQ,
STACK_TYPE_SOFTIRQ,
- STACK_TYPE_SYSENTER,
+ STACK_TYPE_ENTRY,
STACK_TYPE_EXCEPTION,
STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
};
@@ -28,7 +28,7 @@ struct stack_info {
bool in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info);
-bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
+bool in_entry_stack(unsigned long *stack, struct stack_info *info);
int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index c4aed0de565e..77cd69177cfc 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -8,70 +8,66 @@
#include <asm/cpufeature.h>
#include <asm/special_insns.h>
#include <asm/smp.h>
+#include <asm/invpcid.h>
-static inline void __invpcid(unsigned long pcid, unsigned long addr,
- unsigned long type)
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
- struct { u64 d[2]; } desc = { { pcid, addr } };
-
/*
- * The memory clobber is because the whole point is to invalidate
- * stale TLB entries and, especially if we're flushing global
- * mappings, we don't want the compiler to reorder any subsequent
- * memory accesses before the TLB flush.
- *
- * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
- * invpcid (%rcx), %rax in long mode.
+ * Bump the generation count. This also serves as a full barrier
+ * that synchronizes with switch_mm(): callers are required to order
+ * their read of mm_cpumask after their writes to the paging
+ * structures.
*/
- asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
- : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+ return atomic64_inc_return(&mm->context.tlb_gen);
}
-#define INVPCID_TYPE_INDIV_ADDR 0
-#define INVPCID_TYPE_SINGLE_CTXT 1
-#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
-#define INVPCID_TYPE_ALL_NON_GLOBAL 3
-
-/* Flush all mappings for a given pcid and addr, not including globals. */
-static inline void invpcid_flush_one(unsigned long pcid,
- unsigned long addr)
-{
- __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
-}
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS 12
+/*
+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#define PTI_CONSUMED_ASID_BITS 0
-/* Flush all mappings for a given PCID, not including globals. */
-static inline void invpcid_flush_single_context(unsigned long pcid)
-{
- __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
-}
+#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
+/*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+ * for them being zero-based. Another -1 is because ASID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
-/* Flush all mappings, including globals, for all PCIDs. */
-static inline void invpcid_flush_all(void)
+static inline u16 kern_pcid(u16 asid)
{
- __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ /*
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the
+ * PCID bits. This serves two purposes. It prevents a nasty
+ * situation in which PCID-unaware code saves CR3, loads some other
+ * value (with PCID == 0), and then restores CR3, thus corrupting
+ * the TLB for ASID 0 if the saved ASID was nonzero. It also means
+ * that any bugs involving loading a PCID-enabled CR3 with
+ * CR4.PCIDE off will trigger deterministically.
+ */
+ return asid + 1;
}
-/* Flush all mappings for all PCIDs except globals. */
-static inline void invpcid_flush_all_nonglobals(void)
+struct pgd_t;
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
{
- __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+ if (static_cpu_has(X86_FEATURE_PCID)) {
+ return __sme_pa(pgd) | kern_pcid(asid);
+ } else {
+ VM_WARN_ON_ONCE(asid != 0);
+ return __sme_pa(pgd);
+ }
}
-static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
{
- u64 new_tlb_gen;
-
- /*
- * Bump the generation count. This also serves as a full barrier
- * that synchronizes with switch_mm(): callers are required to order
- * their read of mm_cpumask after their writes to the paging
- * structures.
- */
- smp_mb__before_atomic();
- new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
- smp_mb__after_atomic();
-
- return new_tlb_gen;
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID));
+ return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
}
#ifdef CONFIG_PARAVIRT
@@ -233,6 +229,9 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
extern void initialize_tlbstate_and_flush(void);
+/*
+ * flush the entire current user mapping
+ */
static inline void __native_flush_tlb(void)
{
/*
@@ -245,20 +244,12 @@ static inline void __native_flush_tlb(void)
preempt_enable();
}
-static inline void __native_flush_tlb_global_irq_disabled(void)
-{
- unsigned long cr4;
-
- cr4 = this_cpu_read(cpu_tlbstate.cr4);
- /* clear PGE */
- native_write_cr4(cr4 & ~X86_CR4_PGE);
- /* write old PGE again and flush TLBs */
- native_write_cr4(cr4);
-}
-
+/*
+ * flush everything
+ */
static inline void __native_flush_tlb_global(void)
{
- unsigned long flags;
+ unsigned long cr4, flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) {
/*
@@ -276,22 +267,36 @@ static inline void __native_flush_tlb_global(void)
*/
raw_local_irq_save(flags);
- __native_flush_tlb_global_irq_disabled();
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ /* toggle PGE */
+ native_write_cr4(cr4 ^ X86_CR4_PGE);
+ /* write old PGE again and flush TLBs */
+ native_write_cr4(cr4);
raw_local_irq_restore(flags);
}
+/*
+ * flush one page in the user mapping
+ */
static inline void __native_flush_tlb_single(unsigned long addr)
{
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
}
+/*
+ * flush everything
+ */
static inline void __flush_tlb_all(void)
{
- if (boot_cpu_has(X86_FEATURE_PGE))
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
__flush_tlb_global();
- else
+ } else {
+ /*
+ * !PGE -> !PCID (setup_pcid()), thus every flush is total.
+ */
__flush_tlb();
+ }
/*
* Note: if we somehow had PCID but not PGE, then this wouldn't work --
@@ -302,6 +307,9 @@ static inline void __flush_tlb_all(void)
*/
}
+/*
+ * flush one page in the kernel mapping
+ */
static inline void __flush_tlb_one(unsigned long addr)
{
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
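
Putting the CR3 pieces together: the PCID field is the low 12 bits, kern_pcid() stores ASID+1 there, and CR3_NOFLUSH is bit 63. A hedged sketch of what build_cr3_noflush() produces, with the constant spelled out for illustration:

    /* Sketch: CR3 = pgd physical base | (asid + 1) | no-flush bit. */
    static unsigned long example_cr3_noflush(unsigned long pgd_pa, u16 asid)
    {
            return pgd_pa | (unsigned long)(asid + 1) | (1UL << 63);
    }

With asid 5 and a pgd at physical 0x1000, that yields 0x8000000000001006.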
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 40c3fab107ac..25b4832e9c28 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -96,6 +96,6 @@ void common(void) {
/* Layout info for cpu_entry_area */
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
- OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
- DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
+ OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
+ DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index c4f23da7a0f0..4dba34cb777d 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -50,7 +50,7 @@ void foo(void)
/* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
- offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
+ offsetofend(struct cpu_entry_area, entry_stack_page.stack));
#ifdef CONFIG_CC_STACKPROTECTOR
BLANK();
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 1245f98bac1e..483116626be6 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -21,6 +21,7 @@ obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
+obj-y += cpuid-deps.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5967ddf18bdc..6774d9d6dd0d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -482,102 +482,8 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
-#endif
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
- SYSENTER_stack_storage);
-
-static void __init
-set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
-{
- for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
- __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
-}
-
-/* Setup the fixmap mappings only once per-processor */
-static void __init setup_cpu_entry_area(int cpu)
-{
-#ifdef CONFIG_X86_64
- extern char _entry_trampoline[];
-
- /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
- pgprot_t gdt_prot = PAGE_KERNEL_RO;
- pgprot_t tss_prot = PAGE_KERNEL_RO;
-#else
- /*
- * On native 32-bit systems, the GDT cannot be read-only because
- * our double fault handler uses a task gate, and entering through
- * a task gate needs to change an available TSS to busy. If the
- * GDT is read-only, that will triple fault. The TSS cannot be
- * read-only because the CPU writes to it on task switches.
- *
- * On Xen PV, the GDT must be read-only because the hypervisor
- * requires it.
- */
- pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
- PAGE_KERNEL_RO : PAGE_KERNEL;
- pgprot_t tss_prot = PAGE_KERNEL;
-#endif
-
- __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
- per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
- PAGE_KERNEL);
-
- /*
- * The Intel SDM says (Volume 3, 7.2.1):
- *
- * Avoid placing a page boundary in the part of the TSS that the
- * processor reads during a task switch (the first 104 bytes). The
- * processor may not correctly perform address translations if a
- * boundary occurs in this area. During a task switch, the processor
- * reads and writes into the first 104 bytes of each TSS (using
- * contiguous physical addresses beginning with the physical address
- * of the first byte of the TSS). So, after TSS access begins, if
- * part of the 104 bytes is not physically contiguous, the processor
- * will access incorrect information without generating a page-fault
- * exception.
- *
- * There are also a lot of errata involving the TSS spanning a page
- * boundary. Assert that we're not doing that.
- */
- BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
- offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
- BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
- &per_cpu(cpu_tss_rw, cpu),
- sizeof(struct tss_struct) / PAGE_SIZE,
- tss_prot);
-
-#ifdef CONFIG_X86_32
- per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
#endif
-#ifdef CONFIG_X86_64
- BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
- BUILD_BUG_ON(sizeof(exception_stacks) !=
- sizeof(((struct cpu_entry_area *)0)->exception_stacks));
- set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
- &per_cpu(exception_stacks, cpu),
- sizeof(exception_stacks) / PAGE_SIZE,
- PAGE_KERNEL);
-
- __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
- __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
-#endif
-}
-
-void __init setup_cpu_entry_areas(void)
-{
- unsigned int cpu;
-
- for_each_possible_cpu(cpu)
- setup_cpu_entry_area(cpu);
-}
-
/* Load the original GDT from the per-cpu structure */
void load_direct_gdt(int cpu)
{
@@ -1315,7 +1221,7 @@ void enable_sep_cpu(void)
tss->x86_tss.ss1 = __KERNEL_CS;
wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
- wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
+ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
put_cpu();
@@ -1443,7 +1349,7 @@ void syscall_init(void)
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@ -1658,7 +1564,7 @@ void cpu_init(void)
*/
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
- load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
+ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
load_mm_ldt(&init_mm);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
new file mode 100644
index 000000000000..904b0a3c4e53
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -0,0 +1,121 @@
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+struct cpuid_dep {
+ unsigned int feature;
+ unsigned int depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+const static struct cpuid_dep cpuid_deps[] = {
+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM2, X86_FEATURE_XMM },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
+ { X86_FEATURE_AES, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ {}
+};
+
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ /*
+ * Note: This could use the non atomic __*_bit() variants, but the
+ * rest of the cpufeature code uses atomics as well, so keep it for
+ * consistency. Cleanup all of it separately.
+ */
+ if (!c) {
+ clear_cpu_cap(&boot_cpu_data, feature);
+ set_bit(feature, (unsigned long *)cpu_caps_cleared);
+ } else {
+ clear_bit(feature, (unsigned long *)c->x86_capability);
+ }
+}
+
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
+ const struct cpuid_dep *d;
+ bool changed;
+
+ if (WARN_ON(feature >= MAX_FEATURE_BITS))
+ return;
+
+ clear_feature(c, feature);
+
+ /* Collect all features to disable, handling dependencies */
+ memset(disable, 0, sizeof(disable));
+ __set_bit(feature, disable);
+
+ /* Loop until we get a stable state. */
+ do {
+ changed = false;
+ for (d = cpuid_deps; d->feature; d++) {
+ if (!test_bit(d->depends, disable))
+ continue;
+ if (__test_and_set_bit(d->feature, disable))
+ continue;
+
+ changed = true;
+ clear_feature(c, d->feature);
+ }
+ } while (changed);
+}
+
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ do_clear_cpu_cap(c, feature);
+}
+
+void setup_clear_cpu_cap(unsigned int feature)
+{
+ do_clear_cpu_cap(NULL, feature);
+}
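
Usage-wise, the stable-state loop means a single call disables a whole subtree of the table. A sketch, following the cpuid_deps[] entries above:

    static void __init example_disable_xsave(void)
    {
            /* Clears XSAVE, and transitively AVX, AVX2, AVX512F and
             * every AVX512_* / GFNI / VAES entry that depends on them. */
            setup_clear_cpu_cap(X86_FEATURE_XSAVE);
    }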
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index f522415bf9e5..2e6af0f429f0 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -551,15 +551,6 @@ static void print_ucode(struct ucode_cpu_info *uci)
}
#else
-/*
- * Flush global tlb. We only do this in x86_64 where paging has been enabled
- * already and PGE should be enabled as well.
- */
-static inline void flush_tlb_early(void)
-{
- __native_flush_tlb_global_irq_disabled();
-}
-
static inline void print_ucode(struct ucode_cpu_info *uci)
{
struct microcode_intel *mc;
@@ -588,10 +579,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
if (rev != mc->hdr.rev)
return -1;
-#ifdef CONFIG_X86_64
- /* Flush global tlb. This is precaution. */
- flush_tlb_early();
-#endif
uci->cpu_sig.rev = rev;
if (early)
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 8271bbf2a4c3..b978ac83c67e 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,6 +18,7 @@
#include <linux/nmi.h>
#include <linux/sysfs.h>
+#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>
@@ -43,9 +44,9 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true;
}
-bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
+bool in_entry_stack(unsigned long *stack, struct stack_info *info)
{
- struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
+ struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
void *begin = ss;
void *end = ss + 1;
@@ -53,7 +54,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
if ((void *)stack < begin || (void *)stack >= end)
return false;
- info->type = STACK_TYPE_SYSENTER;
+ info->type = STACK_TYPE_ENTRY;
info->begin = begin;
info->end = end;
info->next_sp = NULL;
@@ -123,13 +124,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - task stack
* - interrupt stack
* - HW exception stacks (double fault, nmi, debug, mce)
- * - SYSENTER stack
+ * - entry stack
*
* x86-32 can have up to four stacks:
* - task stack
* - softirq stack
* - hardirq stack
- * - SYSENTER stack
+ * - entry stack
*/
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index c35d54f38ccc..3403ab744fe0 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -25,8 +25,8 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_SOFTIRQ)
return "SOFTIRQ";
- if (type == STACK_TYPE_SYSENTER)
- return "SYSENTER";
+ if (type == STACK_TYPE_ENTRY)
+ return "ENTRY_TRAMPOLINE";
return NULL;
}
@@ -95,7 +95,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (task != current)
goto unknown;
- if (in_sysenter_stack(stack, info))
+ if (in_entry_stack(stack, info))
goto recursion_check;
if (in_hardirq_stack(stack, info))
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 16ceab1132e8..b0462122e483 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -36,8 +36,14 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_IRQ)
return "IRQ";
- if (type == STACK_TYPE_SYSENTER)
- return "SYSENTER";
+ if (type == STACK_TYPE_ENTRY) {
+ /*
+ * On 64-bit, we have a generic entry stack that we
+ * use for all the kernel entry points, including
+ * SYSENTER.
+ */
+ return "ENTRY_TRAMPOLINE";
+ }
if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
return exception_stack_names[type - STACK_TYPE_EXCEPTION];
@@ -117,7 +123,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_irq_stack(stack, info))
goto recursion_check;
- if (in_sysenter_stack(stack, info))
+ if (in_entry_stack(stack, info))
goto recursion_check;
goto unknown;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a870910c8565..c0db465b5ff7 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -4,6 +4,11 @@
* Copyright (C) 2002 Andi Kleen
*
* This handles calls from both 32bit and 64bit mode.
+ *
+ * Lock order:
+ * contex.ldt_usr_sem
+ * mmap_sem
+ * context.lock
*/
#include <linux/errno.h>
@@ -21,7 +26,26 @@
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
-/* context.lock is held for us, so we don't need any locking. */
+static void refresh_ldt_segments(void)
+{
+#ifdef CONFIG_X86_64
+ unsigned short sel;
+
+ /*
+ * Make sure that the cached DS and ES descriptors match the updated
+ * LDT.
+ */
+ savesegment(ds, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(ds, sel);
+
+ savesegment(es, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(es, sel);
+#endif
+}
+
+/* context.lock is held by the task which issued the smp function call */
static void flush_ldt(void *__mm)
{
struct mm_struct *mm = __mm;
@@ -32,6 +56,8 @@ static void flush_ldt(void *__mm)
pc = &mm->context;
set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+
+ refresh_ldt_segments();
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
@@ -76,15 +102,17 @@ static void finalize_ldt_struct(struct ldt_struct *ldt)
paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
}
-/* context.lock is held */
-static void install_ldt(struct mm_struct *current_mm,
- struct ldt_struct *ldt)
+static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
{
+ mutex_lock(&mm->context.lock);
+
/* Synchronizes with lockless_dereference in load_mm_ldt. */
- smp_store_release(&current_mm->context.ldt, ldt);
+ smp_store_release(&mm->context.ldt, ldt);
+
+ /* Activate the LDT for all CPUs using currents mm. */
+ on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
- /* Activate the LDT for all CPUs using current_mm. */
- on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
+ mutex_unlock(&mm->context.lock);
}
static void free_ldt_struct(struct ldt_struct *ldt)
@@ -101,27 +129,20 @@ static void free_ldt_struct(struct ldt_struct *ldt)
}
/*
- * we do not have to muck with descriptors here, that is
- * done in switch_mm() as needed.
+ * Called on fork from arch_dup_mmap(). Just copy the current LDT state,
+ * the new task is not running, so nothing can be installed.
*/
-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
+int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{
struct ldt_struct *new_ldt;
- struct mm_struct *old_mm;
int retval = 0;
- mutex_init(&mm->context.lock);
- old_mm = current->mm;
- if (!old_mm) {
- mm->context.ldt = NULL;
+ if (!old_mm)
return 0;
- }
mutex_lock(&old_mm->context.lock);
- if (!old_mm->context.ldt) {
- mm->context.ldt = NULL;
+ if (!old_mm->context.ldt)
goto out_unlock;
- }
new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) {
@@ -157,7 +178,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
unsigned long entries_size;
int retval;
- mutex_lock(&mm->context.lock);
+ down_read(&mm->context.ldt_usr_sem);
if (!mm->context.ldt) {
retval = 0;
@@ -186,7 +207,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
retval = bytecount;
out_unlock:
- mutex_unlock(&mm->context.lock);
+ up_read(&mm->context.ldt_usr_sem);
return retval;
}
@@ -246,7 +267,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
ldt.avl = 0;
}
- mutex_lock(&mm->context.lock);
+ if (down_write_killable(&mm->context.ldt_usr_sem))
+ return -EINTR;
old_ldt = mm->context.ldt;
old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
@@ -268,7 +290,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
error = 0;
out_unlock:
- mutex_unlock(&mm->context.lock);
+ up_write(&mm->context.ldt_usr_sem);
out:
return error;
}
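
The refresh_ldt_segments() hunk above leans on the layout of x86 segment selectors: bit 2 (the TI bit) picks between GDT and LDT, so only selectors that reference the LDT need a reload after the LDT is rewritten. A minimal sketch of that test; the selector layout is per the SDM, and selector_references_ldt() is a hypothetical helper for illustration:

#include <stdbool.h>

/*
 * An x86 segment selector packs three fields:
 *
 *   15           3   2    1 0
 *  +--------------+----+-----+
 *  |    index     | TI | RPL |
 *  +--------------+----+-----+
 *
 * TI = 0: the selector indexes the GDT
 * TI = 1: the selector indexes the LDT
 */
#define SEGMENT_TI_MASK 0x4	/* mirrors <asm/segment.h> */
#define SEGMENT_LDT	0x4

static bool selector_references_ldt(unsigned short sel)
{
	/* Only LDT-based selectors cache descriptors that the LDT
	 * rewrite may have invalidated. */
	return (sel & SEGMENT_TI_MASK) == SEGMENT_LDT;
}
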
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 63e6230c4a6d..aad2b74bd921 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -989,12 +989,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp;
- /*
- * Enable the espfix hack for this CPU
- */
-#ifdef CONFIG_X86_ESPFIX64
+ /* Enable the espfix hack for this CPU */
init_espfix_ap(cpu);
-#endif
/* So we see what's up */
announce_cpu(cpu, apicid);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 976694564c07..0333e174d564 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -57,6 +57,7 @@
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
+#include <asm/cpu_entry_area.h>
#include <asm/mce.h>
#include <asm/fixmap.h>
#include <asm/mach_traps.h>
@@ -1036,8 +1037,9 @@ void __init trap_init(void)
* "sidt" instruction will not leak the location of the kernel, and
* to defend the IDT against arbitrary memory write vulnerabilities.
* It will be reloaded in cpu_init() */
- __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
- idt_descr.address = fix_to_virt(FIX_RO_IDT);
+ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+ PAGE_KERNEL_RO);
+ idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
/*
* Should be a barrier for any external CPU state:
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index a5bfa6511a4a..61c7d9bd9415 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -9,7 +9,7 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
+ pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o cpu_entry_area.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
new file mode 100644
index 000000000000..21e8b595cbb1
--- /dev/null
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
+
+#ifdef CONFIG_X86_64
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+ return (struct cpu_entry_area *) va;
+}
+EXPORT_SYMBOL(get_cpu_entry_area);
+
+void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+{
+ unsigned long va = (unsigned long) cea_vaddr;
+
+ set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
+}
+
+static void __init
+cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+{
+ for ( ; pages; pages--, cea_vaddr += PAGE_SIZE, ptr += PAGE_SIZE)
+ cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+/* Set up the fixmap mappings only once per processor */
+static void __init setup_cpu_entry_area(int cpu)
+{
+#ifdef CONFIG_X86_64
+ extern char _entry_trampoline[];
+
+ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+ pgprot_t gdt_prot = PAGE_KERNEL_RO;
+ pgprot_t tss_prot = PAGE_KERNEL_RO;
+#else
+ /*
+ * On native 32-bit systems, the GDT cannot be read-only because
+ * our double fault handler uses a task gate, and entering through
+ * a task gate needs to change an available TSS to busy. If the
+ * GDT is read-only, that will triple fault. The TSS cannot be
+ * read-only because the CPU writes to it on task switches.
+ *
+ * On Xen PV, the GDT must be read-only because the hypervisor
+ * requires it.
+ */
+ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+ PAGE_KERNEL_RO : PAGE_KERNEL;
+ pgprot_t tss_prot = PAGE_KERNEL;
+#endif
+
+ cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
+ gdt_prot);
+
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
+ per_cpu_ptr(&entry_stack_storage, cpu), 1,
+ PAGE_KERNEL);
+
+ /*
+ * The Intel SDM says (Volume 3, 7.2.1):
+ *
+ * Avoid placing a page boundary in the part of the TSS that the
+ * processor reads during a task switch (the first 104 bytes). The
+ * processor may not correctly perform address translations if a
+ * boundary occurs in this area. During a task switch, the processor
+ * reads and writes into the first 104 bytes of each TSS (using
+ * contiguous physical addresses beginning with the physical address
+ * of the first byte of the TSS). So, after TSS access begins, if
+ * part of the 104 bytes is not physically contiguous, the processor
+ * will access incorrect information without generating a page-fault
+ * exception.
+ *
+ * There are also a lot of errata involving the TSS spanning a page
+ * boundary. Assert that we're not doing that.
+ */
+ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
+ &per_cpu(cpu_tss_rw, cpu),
+ sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
+
+#ifdef CONFIG_X86_32
+ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+ BUILD_BUG_ON(sizeof(exception_stacks) !=
+ sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
+ &per_cpu(exception_stacks, cpu),
+ sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
+
+ cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
+ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+}
+
+static __init void setup_cpu_entry_area_ptes(void)
+{
+#ifdef CONFIG_X86_32
+ unsigned long start, end;
+
+ BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE);
+ BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
+
+ start = CPU_ENTRY_AREA_BASE;
+ end = start + CPU_ENTRY_AREA_MAP_SIZE;
+
+ for (; start < end; start += PMD_SIZE)
+ populate_extra_pte(start);
+#endif
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+ unsigned int cpu;
+
+ setup_cpu_entry_area_ptes();
+
+ for_each_possible_cpu(cpu)
+ setup_cpu_entry_area(cpu);
+}
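
The TSS BUILD_BUG_ON() in setup_cpu_entry_area() uses an XOR trick: two offsets lie in the same page exactly when their addresses agree in every bit above the page offset, i.e. when (a ^ b) & PAGE_MASK is zero. A self-contained userspace illustration, assuming 4 KiB pages:

#include <assert.h>

#define PAGE_SHIFT 12	/* assumed 4 KiB pages */
#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

/* Two byte offsets share a page iff their high bits match. */
static int same_page(unsigned long a, unsigned long b)
{
	return ((a ^ b) & PAGE_MASK) == 0;
}

int main(void)
{
	assert(same_page(0x1000, 0x1fff));	/* both inside page 1 */
	assert(!same_page(0x1fff, 0x2000));	/* straddles a boundary */
	return 0;
}
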
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0470826d2bdc..3b7720404a9f 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -44,10 +44,12 @@ struct addr_marker {
unsigned long max_lines;
};
-/* indices for address_markers; keep sync'd w/ address_markers below */
+/* Address space marker hints */
+
+#ifdef CONFIG_X86_64
+
enum address_markers_idx {
USER_SPACE_NR = 0,
-#ifdef CONFIG_X86_64
KERNEL_SPACE_NR,
LOW_KERNEL_NR,
VMALLOC_START_NR,
@@ -56,56 +58,74 @@ enum address_markers_idx {
KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR,
#endif
-# ifdef CONFIG_X86_ESPFIX64
+ CPU_ENTRY_AREA_NR,
+#ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
-# endif
+#endif
+#ifdef CONFIG_EFI
+ EFI_END_NR,
+#endif
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
-#else
+ FIXADDR_START_NR,
+ END_OF_SPACE_NR,
+};
+
+static struct addr_marker address_markers[] = {
+ [USER_SPACE_NR] = { 0, "User Space" },
+ [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" },
+ [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" },
+ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
+ [VMEMMAP_START_NR] = { 0UL, "Vmemmap" },
+#ifdef CONFIG_KASAN
+ [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
+ [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
+#endif
+ [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE, "CPU entry Area" },
+#ifdef CONFIG_X86_ESPFIX64
+ [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
+#endif
+#ifdef CONFIG_EFI
+ [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" },
+#endif
+ [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" },
+ [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" },
+ [MODULES_END_NR] = { MODULES_END, "End Modules" },
+ [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" },
+ [END_OF_SPACE_NR] = { -1, NULL }
+};
+
+#else /* CONFIG_X86_64 */
+
+enum address_markers_idx {
+ USER_SPACE_NR = 0,
KERNEL_SPACE_NR,
VMALLOC_START_NR,
VMALLOC_END_NR,
-# ifdef CONFIG_HIGHMEM
+#ifdef CONFIG_HIGHMEM
PKMAP_BASE_NR,
-# endif
- FIXADDR_START_NR,
#endif
+ CPU_ENTRY_AREA_NR,
+ FIXADDR_START_NR,
+ END_OF_SPACE_NR,
};
-/* Address space markers hints */
static struct addr_marker address_markers[] = {
- { 0, "User Space" },
-#ifdef CONFIG_X86_64
- { 0x8000000000000000UL, "Kernel Space" },
- { 0/* PAGE_OFFSET */, "Low Kernel Mapping" },
- { 0/* VMALLOC_START */, "vmalloc() Area" },
- { 0/* VMEMMAP_START */, "Vmemmap" },
-#ifdef CONFIG_KASAN
- { KASAN_SHADOW_START, "KASAN shadow" },
- { KASAN_SHADOW_END, "KASAN shadow end" },
+ [USER_SPACE_NR] = { 0, "User Space" },
+ [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" },
+ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
+ [VMALLOC_END_NR] = { 0UL, "vmalloc() End" },
+#ifdef CONFIG_HIGHMEM
+ [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
#endif
-# ifdef CONFIG_X86_ESPFIX64
- { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
-# endif
-# ifdef CONFIG_EFI
- { EFI_VA_END, "EFI Runtime Services" },
-# endif
- { __START_KERNEL_map, "High Kernel Mapping" },
- { MODULES_VADDR, "Modules" },
- { MODULES_END, "End Modules" },
-#else
- { PAGE_OFFSET, "Kernel Mapping" },
- { 0/* VMALLOC_START */, "vmalloc() Area" },
- { 0/*VMALLOC_END*/, "vmalloc() End" },
-# ifdef CONFIG_HIGHMEM
- { 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
-# endif
- { 0/*FIXADDR_START*/, "Fixmap Area" },
-#endif
- { -1, NULL } /* End of list */
+ [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
+ [FIXADDR_START_NR] = { 0UL, "Fixmap area" },
+ [END_OF_SPACE_NR] = { -1, NULL }
};
+#endif /* !CONFIG_X86_64 */
+
/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
@@ -140,7 +160,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
static const char * const level_name[] =
{ "cr3", "pgd", "pud", "pmd", "pte" };
- if (!pgprot_val(prot)) {
+ if (!(pr & _PAGE_PRESENT)) {
/* Not present */
pt_dump_cont_printf(m, dmsg, " ");
} else {
@@ -506,8 +526,8 @@ static int __init pt_dump_init(void)
address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
+ address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
#endif
-
return 0;
}
__initcall(pt_dump_init);
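
The rewritten marker tables rely on designated initializers, so each array slot stays tied to its enum index even when entries are conditionally compiled, and pt_dump_init() can patch in the addresses that are only known at runtime. A reduced sketch of the pattern, with illustrative names:

enum marker_idx {
	FIRST_NR = 0,
#ifdef SOME_CONFIG
	OPTIONAL_NR,
#endif
	END_NR,
};

struct marker {
	unsigned long start;
	const char *name;
};

static struct marker markers[] = {
	[FIRST_NR]    = { 0x1000UL, "first" },
#ifdef SOME_CONFIG
	[OPTIONAL_NR] = { 0UL,      "optional" },  /* patched at runtime */
#endif
	[END_NR]      = { -1UL,     NULL },
};
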
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8a64a6f2848d..135c9a7898c7 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
#include <asm/setup.h>
#include <asm/set_memory.h>
#include <asm/page_types.h>
+#include <asm/cpu_entry_area.h>
#include <asm/init.h>
#include "mm_internal.h"
@@ -766,6 +767,7 @@ void __init mem_init(void)
mem_init_print_info(NULL);
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
@@ -777,6 +779,10 @@ void __init mem_init(void)
FIXADDR_START, FIXADDR_TOP,
(FIXADDR_TOP - FIXADDR_START) >> 10,
+ CPU_ENTRY_AREA_BASE,
+ CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE,
+ CPU_ENTRY_AREA_MAP_SIZE >> 10,
+
#ifdef CONFIG_HIGHMEM
PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
(LAST_PKMAP*PAGE_SIZE) >> 10,
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 1b8791f859a3..b9c879ed4339 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -12,6 +12,7 @@
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
+#include <asm/cpu_entry_area.h>
extern struct range pfn_mapped[E820_MAX_ENTRIES];
@@ -139,15 +140,17 @@ void __init kasan_init(void)
}
kasan_populate_zero_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
- kasan_mem_to_shadow((void *)__START_KERNEL_map));
+ shadow_cpu_entry_begin);
- vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
- (unsigned long)kasan_mem_to_shadow(_end),
- NUMA_NO_NODE);
+ kasan_populate_zero_shadow(shadow_cpu_entry_end,
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
- kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
- (void *)KASAN_SHADOW_END);
+ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
+ (unsigned long)kasan_mem_to_shadow(_end),
+ early_pfn_to_nid(__pa(_stext)));
+ kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+ (void *)KASAN_SHADOW_END);
load_cr3(init_top_pgt);
__flush_tlb_all();
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index aed206475aa7..68cfd31ba371 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -33,25 +33,14 @@
#define TB_SHIFT 40
/*
- * Virtual address start and end range for randomization. The end changes base
- * on configuration to have the highest amount of space for randomization.
- * It increases the possible random position for each randomized region.
+ * Virtual address start and end range for randomization.
*
- * You need to add an if/def entry if you introduce a new memory region
- * compatible with KASLR. Your entry must be in logical order with memory
- * layout. For example, ESPFIX is before EFI because its virtual address is
- * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
- * ensure that this order is correct and won't be changed.
+ * The end address could depend on more configuration options to make the
+ * highest amount of space for randomization available, but that's too hard
+ * to keep straight and caused issues already.
*/
static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
-
-#if defined(CONFIG_X86_ESPFIX64)
-static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
-#elif defined(CONFIG_EFI)
-static const unsigned long vaddr_end = EFI_VA_END;
-#else
-static const unsigned long vaddr_end = __START_KERNEL_map;
-#endif
+static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
/* Default values */
unsigned long page_offset_base = __PAGE_OFFSET_BASE;
@@ -100,15 +89,12 @@ void __init kernel_randomize_memory(void)
unsigned long remain_entropy;
/*
- * All these BUILD_BUG_ON checks ensures the memory layout is
- * consistent with the vaddr_start/vaddr_end variables.
+ * These BUILD_BUG_ON checks ensure the memory layout is consistent
+ * with the vaddr_start/vaddr_end variables. These checks are very
+ * limited.
*/
BUILD_BUG_ON(vaddr_start >= vaddr_end);
- BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_ESPFIX64) &&
- vaddr_end >= EFI_VA_END);
- BUILD_BUG_ON((IS_ENABLED(CONFIG_X86_ESPFIX64) ||
- IS_ENABLED(CONFIG_EFI)) &&
- vaddr_end >= __START_KERNEL_map);
+ BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
if (!kaslr_memory_enabled())
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b9bd5b8b14fa..77909bae5943 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -9,6 +9,7 @@
#include <linux/pagemap.h>
#include <linux/spinlock.h>
+#include <asm/cpu_entry_area.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index de2deffd20c5..342a131469b9 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -128,7 +128,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
@@ -195,7 +195,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- write_cr3(build_cr3(next, new_asid));
+ write_cr3(build_cr3(next->pgd, new_asid));
/*
* NB: This gets called via leave_mm() in the idle path
@@ -208,7 +208,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- write_cr3(build_cr3_noflush(next, new_asid));
+ write_cr3(build_cr3_noflush(next->pgd, new_asid));
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
@@ -324,7 +324,7 @@ void initialize_tlbstate_and_flush(void)
!(cr4_read_shadow() & X86_CR4_PCIDE));
/* Force ASID 0 and force a TLB flush. */
- write_cr3(build_cr3(mm, 0));
+ write_cr3(build_cr3(mm->pgd, 0));
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
@@ -587,7 +587,7 @@ static void do_kernel_range_flush(void *info)
/* flush range by one by one 'invlpg' */
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
- __flush_tlb_single(addr);
+ __flush_tlb_one(addr);
}
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
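
Passing next->pgd instead of the whole mm makes the CR3 builders usable for page tables that are not attached to an mm, a prerequisite for the later page-table isolation work. Roughly, and hedged as an assumption about the helpers' shape at this point in the series: a CR3 value is the PGD's physical address with the PCID in the low bits, and the no-flush variant additionally sets bit 63 so the CPU keeps that PCID's TLB entries:

#define CR3_NOFLUSH	(1UL << 63)	/* keep this PCID's TLB entries */

static unsigned long sketch_build_cr3(unsigned long pgd_pa, unsigned short asid)
{
	/* The kernel's internal ASID maps to hardware PCID asid + 1;
	 * PCID 0 stays reserved. */
	return pgd_pa | (asid + 1);
}

static unsigned long sketch_build_cr3_noflush(unsigned long pgd_pa,
					      unsigned short asid)
{
	return sketch_build_cr3(pgd_pa, asid) | CR3_NOFLUSH;
}
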
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 2983faab5b18..42bd116c47a3 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -300,7 +300,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
local_flush_tlb();
stat->d_alltlb++;
} else {
- __flush_tlb_one(msg->address);
+ __flush_tlb_single(msg->address);
stat->d_onetlb++;
}
stat->d_requestee++;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index a9befc691617..d3073e51bbef 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2325,7 +2325,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
switch (idx) {
case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
- case FIX_RO_IDT:
#ifdef CONFIG_X86_32
case FIX_WP_TEST:
# ifdef CONFIG_HIGHMEM
@@ -2336,7 +2335,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
- case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 7f503fd272d0..5c1573e0b4c8 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -281,7 +281,9 @@
#include <linux/fcntl.h>
#include <linux/blkdev.h>
#include <linux/times.h>
+#include <linux/delay.h>
#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
#include <scsi/scsi_request.h>
/* used to tell the module to turn on full debugging messages */
@@ -1030,13 +1032,25 @@ static void cdrom_count_tracks(struct cdrom_device_info *cdi, tracktype *tracks)
tracks->cdi, tracks->xa);
}
+static int tray_close(struct cdrom_device_info *cdi)
+{
+ int ret;
+
+ ret = cdi->ops->tray_move(cdi, 0);
+ if (ret)
+ return ret;
+
+ return poll_event_interruptible(CDS_TRAY_OPEN !=
+ cdi->ops->drive_status(cdi, CDSL_CURRENT), 500);
+}
+
static
-int open_for_data(struct cdrom_device_info *cdi)
+int open_for_common(struct cdrom_device_info *cdi, tracktype *tracks)
{
int ret;
const struct cdrom_device_ops *cdo = cdi->ops;
- tracktype tracks;
- cd_dbg(CD_OPEN, "entering open_for_data\n");
+
+ cd_dbg(CD_OPEN, "entering open_for_common\n");
/* Check if the driver can report drive status. If it can, we
can do clever things. If it can't, well, we at least tried! */
if (cdo->drive_status != NULL) {
@@ -1048,7 +1062,9 @@ int open_for_data(struct cdrom_device_info *cdi)
if (CDROM_CAN(CDC_CLOSE_TRAY) &&
cdi->options & CDO_AUTO_CLOSE) {
cd_dbg(CD_OPEN, "trying to close the tray\n");
- ret=cdo->tray_move(cdi,0);
+ ret = tray_close(cdi);
+ if (ret == -ERESTARTSYS)
+ return ret;
if (ret) {
cd_dbg(CD_OPEN, "bummer. tried to close the tray but failed.\n");
/* Ignore the error from the low
@@ -1056,37 +1072,45 @@ int open_for_data(struct cdrom_device_info *cdi)
couldn't close the tray. We only care
that there is no disc in the drive,
since that is the _REAL_ problem here.*/
- ret=-ENOMEDIUM;
- goto clean_up_and_return;
+ return -ENOMEDIUM;
}
} else {
cd_dbg(CD_OPEN, "bummer. this drive can't close the tray.\n");
- ret=-ENOMEDIUM;
- goto clean_up_and_return;
+ return -ENOMEDIUM;
}
/* Ok, the door should be closed now.. Check again */
ret = cdo->drive_status(cdi, CDSL_CURRENT);
- if ((ret == CDS_NO_DISC) || (ret==CDS_TRAY_OPEN)) {
+ if ((ret == CDS_NO_DISC) || (ret == CDS_TRAY_OPEN)) {
cd_dbg(CD_OPEN, "bummer. the tray is still not closed.\n");
cd_dbg(CD_OPEN, "tray might not contain a medium\n");
- ret=-ENOMEDIUM;
- goto clean_up_and_return;
+ return -ENOMEDIUM;
}
cd_dbg(CD_OPEN, "the tray is now closed\n");
}
- /* the door should be closed now, check for the disc */
- ret = cdo->drive_status(cdi, CDSL_CURRENT);
- if (ret!=CDS_DISC_OK) {
- ret = -ENOMEDIUM;
- goto clean_up_and_return;
- }
+ if (ret != CDS_DISC_OK)
+ return -ENOMEDIUM;
}
- cdrom_count_tracks(cdi, &tracks);
- if (tracks.error == CDS_NO_DISC) {
+ cdrom_count_tracks(cdi, tracks);
+ if (tracks->error == CDS_NO_DISC) {
cd_dbg(CD_OPEN, "bummer. no disc.\n");
- ret=-ENOMEDIUM;
- goto clean_up_and_return;
+ return -ENOMEDIUM;
}
+
+ return 0;
+}
+
+static
+int open_for_data(struct cdrom_device_info *cdi)
+{
+ int ret;
+ const struct cdrom_device_ops *cdo = cdi->ops;
+ tracktype tracks;
+
+ cd_dbg(CD_OPEN, "entering open_for_data\n");
+ ret = open_for_common(cdi, &tracks);
+ if (ret)
+ goto clean_up_and_return;
+
/* CD-Players which don't use O_NONBLOCK, workman
* for example, need bit CDO_CHECK_TYPE cleared! */
if (tracks.data==0) {
@@ -1196,53 +1220,17 @@ err:
/* This code is similar to that in open_for_data. The routine is called
whenever an audio play operation is requested.
*/
-static int check_for_audio_disc(struct cdrom_device_info *cdi,
- const struct cdrom_device_ops *cdo)
+static int check_for_audio_disc(struct cdrom_device_info *cdi)
{
int ret;
tracktype tracks;
cd_dbg(CD_OPEN, "entering check_for_audio_disc\n");
if (!(cdi->options & CDO_CHECK_TYPE))
return 0;
- if (cdo->drive_status != NULL) {
- ret = cdo->drive_status(cdi, CDSL_CURRENT);
- cd_dbg(CD_OPEN, "drive_status=%d\n", ret);
- if (ret == CDS_TRAY_OPEN) {
- cd_dbg(CD_OPEN, "the tray is open...\n");
- /* can/may i close it? */
- if (CDROM_CAN(CDC_CLOSE_TRAY) &&
- cdi->options & CDO_AUTO_CLOSE) {
- cd_dbg(CD_OPEN, "trying to close the tray\n");
- ret=cdo->tray_move(cdi,0);
- if (ret) {
- cd_dbg(CD_OPEN, "bummer. tried to close tray but failed.\n");
- /* Ignore the error from the low
- level driver. We don't care why it
- couldn't close the tray. We only care
- that there is no disc in the drive,
- since that is the _REAL_ problem here.*/
- return -ENOMEDIUM;
- }
- } else {
- cd_dbg(CD_OPEN, "bummer. this driver can't close the tray.\n");
- return -ENOMEDIUM;
- }
- /* Ok, the door should be closed now.. Check again */
- ret = cdo->drive_status(cdi, CDSL_CURRENT);
- if ((ret == CDS_NO_DISC) || (ret==CDS_TRAY_OPEN)) {
- cd_dbg(CD_OPEN, "bummer. the tray is still not closed.\n");
- return -ENOMEDIUM;
- }
- if (ret!=CDS_DISC_OK) {
- cd_dbg(CD_OPEN, "bummer. disc isn't ready.\n");
- return -EIO;
- }
- cd_dbg(CD_OPEN, "the tray is now closed\n");
- }
- }
- cdrom_count_tracks(cdi, &tracks);
- if (tracks.error)
- return(tracks.error);
+
+ ret = open_for_common(cdi, &tracks);
+ if (ret)
+ return ret;
if (tracks.audio==0)
return -EMEDIUMTYPE;
@@ -2334,7 +2322,8 @@ static int cdrom_ioctl_closetray(struct cdrom_device_info *cdi)
if (!CDROM_CAN(CDC_CLOSE_TRAY))
return -ENOSYS;
- return cdi->ops->tray_move(cdi, 0);
+
+ return tray_close(cdi);
}
static int cdrom_ioctl_eject_sw(struct cdrom_device_info *cdi,
@@ -2704,7 +2693,7 @@ static int cdrom_ioctl_play_trkind(struct cdrom_device_info *cdi,
if (copy_from_user(&ti, argp, sizeof(ti)))
return -EFAULT;
- ret = check_for_audio_disc(cdi, cdi->ops);
+ ret = check_for_audio_disc(cdi);
if (ret)
return ret;
return cdi->ops->audio_ioctl(cdi, CDROMPLAYTRKIND, &ti);
@@ -2752,7 +2741,7 @@ static int cdrom_ioctl_audioctl(struct cdrom_device_info *cdi,
if (!CDROM_CAN(CDC_PLAY_AUDIO))
return -ENOSYS;
- ret = check_for_audio_disc(cdi, cdi->ops);
+ ret = check_for_audio_disc(cdi);
if (ret)
return ret;
return cdi->ops->audio_ioctl(cdi, cmd, NULL);
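
tray_close() now waits until the drive actually reports a closed tray instead of returning as soon as the tray_move() command is queued. poll_event_interruptible() comes from this series' include/linux/delay.h addition (see the diffstat); its shape is assumed here to be a sleep-and-recheck loop that bails out with -ERESTARTSYS when a signal arrives, which is why the callers above treat that value specially:

#include <linux/delay.h>
#include <linux/sched/signal.h>

/* Assumed shape: re-test @event every @interval ms, interruptibly. */
#define poll_event_interruptible(event, interval) ({		\
	int __ret = 0;						\
	while (!(event)) {					\
		if (signal_pending(current)) {			\
			__ret = -ERESTARTSYS;			\
			break;					\
		}						\
		msleep_interruptible(interval);			\
	}							\
	__ret;							\
})
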
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ca4abe1ccd8d..ac3a331a2c66 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -68,6 +68,8 @@
#include <linux/random.h>
#include <trace/events/bcache.h>
+#define MAX_OPEN_BUCKETS 128
+
/* Bucket heap / gen */
uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
@@ -404,7 +406,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
finish_wait(&ca->set->bucket_wait, &w);
out:
- wake_up_process(ca->alloc_thread);
+ if (ca->alloc_thread)
+ wake_up_process(ca->alloc_thread);
trace_bcache_alloc(ca, reserve);
@@ -439,6 +442,11 @@ out:
b->prio = INITIAL_PRIO;
}
+ if (ca->set->avail_nbuckets > 0) {
+ ca->set->avail_nbuckets--;
+ bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
+ }
+
return r;
}
@@ -446,6 +454,11 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b)
{
SET_GC_MARK(b, 0);
SET_GC_SECTORS_USED(b, 0);
+
+ if (ca->set->avail_nbuckets < ca->set->nbuckets) {
+ ca->set->avail_nbuckets++;
+ bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
+ }
}
void bch_bucket_free(struct cache_set *c, struct bkey *k)
@@ -476,7 +489,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
if (b == -1)
goto err;
- k->ptr[i] = PTR(ca->buckets[b].gen,
+ k->ptr[i] = MAKE_PTR(ca->buckets[b].gen,
bucket_to_sector(c, b),
ca->sb.nr_this_dev);
@@ -598,7 +611,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
/*
* If we had to allocate, we might race and not need to allocate the
- * second time we call find_data_bucket(). If we allocated a bucket but
+ * second time we call pick_data_bucket(). If we allocated a bucket but
* didn't use it, drop the refcount bch_bucket_alloc_set() took:
*/
if (KEY_PTRS(&alloc.key))
@@ -671,7 +684,7 @@ int bch_open_buckets_alloc(struct cache_set *c)
spin_lock_init(&c->data_bucket_lock);
- for (i = 0; i < 6; i++) {
+ for (i = 0; i < MAX_OPEN_BUCKETS; i++) {
struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
if (!b)
return -ENOMEM;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index dee542fff68e..e274082330dc 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -184,6 +184,7 @@
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
+#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/workqueue.h>
@@ -265,9 +266,6 @@ struct bcache_device {
atomic_t *stripe_sectors_dirty;
unsigned long *full_dirty_stripes;
- unsigned long sectors_dirty_last;
- long sectors_dirty_derivative;
-
struct bio_set *bio_split;
unsigned data_csum:1;
@@ -299,7 +297,7 @@ struct cached_dev {
struct semaphore sb_write_mutex;
/* Refcount on the cache set. Always nonzero when we're caching. */
- atomic_t count;
+ refcount_t count;
struct work_struct detach;
/*
@@ -333,6 +331,7 @@ struct cached_dev {
/* Limit number of writeback bios in flight */
struct semaphore in_flight;
struct task_struct *writeback_thread;
+ struct workqueue_struct *writeback_write_wq;
struct keybuf writeback_keys;
@@ -361,12 +360,14 @@ struct cached_dev {
uint64_t writeback_rate_target;
int64_t writeback_rate_proportional;
- int64_t writeback_rate_derivative;
- int64_t writeback_rate_change;
+ int64_t writeback_rate_integral;
+ int64_t writeback_rate_integral_scaled;
+ int32_t writeback_rate_change;
unsigned writeback_rate_update_seconds;
- unsigned writeback_rate_d_term;
+ unsigned writeback_rate_i_term_inverse;
unsigned writeback_rate_p_term_inverse;
+ unsigned writeback_rate_minimum;
};
enum alloc_reserve {
@@ -580,6 +581,7 @@ struct cache_set {
uint8_t need_gc;
struct gc_stat gc_stats;
size_t nbuckets;
+ size_t avail_nbuckets;
struct task_struct *gc_thread;
/* Where in the btree gc currently is */
@@ -805,13 +807,13 @@ do { \
static inline void cached_dev_put(struct cached_dev *dc)
{
- if (atomic_dec_and_test(&dc->count))
+ if (refcount_dec_and_test(&dc->count))
schedule_work(&dc->detach);
}
static inline bool cached_dev_get(struct cached_dev *dc)
{
- if (!atomic_inc_not_zero(&dc->count))
+ if (!refcount_inc_not_zero(&dc->count))
return false;
/* Paired with the mb in cached_dev_attach */
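
Switching dc->count from atomic_t to refcount_t is a hardening change: refcount_t saturates and warns on overflow and on increment-from-zero instead of silently wrapping, so a refcounting bug becomes a loud warning rather than a use-after-free. The get/put pattern used above, reduced to a minimal sketch:

#include <linux/refcount.h>
#include <linux/workqueue.h>

struct dev_ref {
	refcount_t count;
	struct work_struct detach;
};

/* Fails (returns false) once the count has already dropped to zero. */
static bool dev_ref_get(struct dev_ref *d)
{
	return refcount_inc_not_zero(&d->count);
}

/* The final put schedules teardown out of line. */
static void dev_ref_put(struct dev_ref *d)
{
	if (refcount_dec_and_test(&d->count))
		schedule_work(&d->detach);
}
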
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 866dcf78ff8e..126e017c8970 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -806,7 +806,10 @@ int bch_btree_cache_alloc(struct cache_set *c)
c->shrink.scan_objects = bch_mca_scan;
c->shrink.seeks = 4;
c->shrink.batch = c->btree_pages * 2;
- register_shrinker(&c->shrink);
+
+ if (register_shrinker(&c->shrink))
+ pr_warn("bcache: %s: could not register shrinker",
+ __func__);
return 0;
}
@@ -1240,6 +1243,11 @@ void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
__bch_btree_mark_key(c, level, k);
}
+void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats)
+{
+ stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;
+}
+
static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
{
uint8_t stale = 0;
@@ -1651,9 +1659,8 @@ static void btree_gc_start(struct cache_set *c)
mutex_unlock(&c->bucket_lock);
}
-static size_t bch_btree_gc_finish(struct cache_set *c)
+static void bch_btree_gc_finish(struct cache_set *c)
{
- size_t available = 0;
struct bucket *b;
struct cache *ca;
unsigned i;
@@ -1690,6 +1697,7 @@ static size_t bch_btree_gc_finish(struct cache_set *c)
}
rcu_read_unlock();
+ c->avail_nbuckets = 0;
for_each_cache(ca, c, i) {
uint64_t *i;
@@ -1711,18 +1719,16 @@ static size_t bch_btree_gc_finish(struct cache_set *c)
BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
- available++;
+ c->avail_nbuckets++;
}
}
mutex_unlock(&c->bucket_lock);
- return available;
}
static void bch_btree_gc(struct cache_set *c)
{
int ret;
- unsigned long available;
struct gc_stat stats;
struct closure writes;
struct btree_op op;
@@ -1745,14 +1751,14 @@ static void bch_btree_gc(struct cache_set *c)
pr_warn("gc failed!");
} while (ret);
- available = bch_btree_gc_finish(c);
+ bch_btree_gc_finish(c);
wake_up_allocators(c);
bch_time_stats_update(&c->btree_gc_time, start_time);
stats.key_bytes *= sizeof(uint64_t);
stats.data <<= 9;
- stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
+ bch_update_bucket_in_use(c, &stats);
memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
trace_bcache_gc_end(c);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 73da1f5626cb..4073aca09a49 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -305,5 +305,5 @@ void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
struct keybuf_key *bch_keybuf_next(struct keybuf *);
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
struct bkey *, keybuf_pred_fn *);
-
+void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats);
#endif
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 864e673aec39..1841d0359bac 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -64,27 +64,16 @@ EXPORT_SYMBOL(closure_put);
void __closure_wake_up(struct closure_waitlist *wait_list)
{
struct llist_node *list;
- struct closure *cl;
+ struct closure *cl, *t;
struct llist_node *reverse = NULL;
list = llist_del_all(&wait_list->list);
/* We first reverse the list to preserve FIFO ordering and fairness */
-
- while (list) {
- struct llist_node *t = list;
- list = llist_next(list);
-
- t->next = reverse;
- reverse = t;
- }
+ reverse = llist_reverse_order(list);
/* Then do the wakeups */
-
- while (reverse) {
- cl = container_of(reverse, struct closure, list);
- reverse = llist_next(reverse);
-
+ llist_for_each_entry_safe(cl, t, reverse, list) {
closure_set_waiting(cl, 0);
closure_sub(cl, CLOSURE_WAITING + 1);
}
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 1ec84ca81146..00fb314cce57 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -251,6 +251,12 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
static inline void closure_queue(struct closure *cl)
{
struct workqueue_struct *wq = cl->wq;
+ /*
+ * Changes to struct closure, struct work_struct, or a couple of other
+ * structs may leave work.func pointing at the wrong location; the
+ * assertion below catches that at build time.
+ */
+ BUILD_BUG_ON(offsetof(struct closure, fn)
+ != offsetof(struct work_struct, func));
if (wq) {
INIT_WORK(&cl->work, cl->work.func);
BUG_ON(!queue_work(wq, &cl->work));
@@ -312,8 +318,6 @@ static inline void closure_wake_up(struct closure_waitlist *list)
* been dropped with closure_put()), it will resume execution at @fn running out
* of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
*
- * NOTE: This macro expands to a return in the calling function!
- *
* This is because after calling continue_at() you no longer have a ref on @cl,
* and whatever @cl owns may be freed out from under you - a running closure fn
* has a ref on its own closure which continue_at() drops.
@@ -340,8 +344,6 @@ do { \
* Causes @fn to be executed out of @cl, in @wq context (or called directly if
* @wq is NULL).
*
- * NOTE: like continue_at(), this macro expands to a return in the caller!
- *
* The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
* thus it's not safe to touch anything protected by @cl after a
* continue_at_nobarrier().
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 243de0bf15cd..4bf15182c4da 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -584,7 +584,7 @@ static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey
return false;
for (i = 0; i < KEY_PTRS(l); i++)
- if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
+ if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
return false;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 7e1d1c3ba33a..3a07cd26145f 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -169,6 +169,11 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
* find a sequence of buckets with valid journal entries
*/
for (i = 0; i < ca->sb.njournal_buckets; i++) {
+ /*
+ * For correctness we must probe index l == 0 first: the
+ * journal buckets form a circular buffer which might have
+ * wrapped around.
+ */
l = (i * 2654435769U) % ca->sb.njournal_buckets;
if (test_bit(l, bitmap))
@@ -506,7 +511,7 @@ static void journal_reclaim(struct cache_set *c)
continue;
ja->cur_idx = next;
- k->ptr[n++] = PTR(0,
+ k->ptr[n++] = MAKE_PTR(0,
bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
ca->sb.nr_this_dev);
}
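
The probe sequence l = (i * 2654435769U) % njournal_buckets is multiplicative (Fibonacci) hashing: 2654435769 is 2^32 divided by the golden ratio, which scatters successive probes across the buckets, and i = 0 always yields l = 0, which is why the new comment insists that index 0 is tried first. Collisions are possible; the surrounding code tracks visited indices in a bitmap. A standalone sketch that prints the probe order:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t njournal_buckets = 7;	/* example size */
	uint32_t i;

	for (i = 0; i < njournal_buckets; i++) {
		uint32_t l = (i * 2654435769U) % njournal_buckets;

		printf("probe %u -> bucket %u\n", i, l);
	}
	return 0;
}
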
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 0e1463d0c334..92f94d035fe1 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -26,12 +26,12 @@ struct kmem_cache *bch_search_cache;
static void bch_data_insert_start(struct closure *);
-static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
+static unsigned cache_mode(struct cached_dev *dc)
{
return BDEV_CACHE_MODE(&dc->sb);
}
-static bool verify(struct cached_dev *dc, struct bio *bio)
+static bool verify(struct cached_dev *dc)
{
return dc->verify;
}
@@ -196,12 +196,12 @@ static void bch_data_insert_start(struct closure *cl)
struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
struct bio *bio = op->bio, *n;
- if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
- wake_up_gc(op->c);
-
if (op->bypass)
return bch_data_invalidate(cl);
+ if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
+ wake_up_gc(op->c);
+
/*
* Journal writes are marked REQ_PREFLUSH; if the original write was a
* flush, it'll wait on the journal write.
@@ -369,7 +369,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
{
struct cache_set *c = dc->disk.c;
- unsigned mode = cache_mode(dc, bio);
+ unsigned mode = cache_mode(dc);
unsigned sectors, congested = bch_get_congested(c);
struct task_struct *task = current;
struct io *i;
@@ -384,6 +384,14 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
op_is_write(bio_op(bio))))
goto skip;
+ /*
+ * Flag for bypass if the I/O is for read-ahead or background,
+ * unless the read-ahead request is for metadata (e.g., for gfs2).
+ */
+ if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
+ !(bio->bi_opf & REQ_META))
+ goto skip;
+
if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
bio_sectors(bio) & (c->sb.block_size - 1)) {
pr_debug("skipping unaligned io");
@@ -400,12 +408,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
if (!congested && !dc->sequential_cutoff)
goto rescale;
- if (!congested &&
- mode == CACHE_MODE_WRITEBACK &&
- op_is_write(bio->bi_opf) &&
- op_is_sync(bio->bi_opf))
- goto rescale;
-
spin_lock(&dc->io_lock);
hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
@@ -468,6 +470,7 @@ struct search {
unsigned recoverable:1;
unsigned write:1;
unsigned read_dirty_data:1;
+ unsigned cache_missed:1;
unsigned long start_time;
@@ -654,6 +657,7 @@ static inline struct search *search_alloc(struct bio *bio,
s->orig_bio = bio;
s->cache_miss = NULL;
+ s->cache_missed = 0;
s->d = d;
s->recoverable = 1;
s->write = op_is_write(bio_op(bio));
@@ -704,7 +708,14 @@ static void cached_dev_read_error(struct closure *cl)
struct search *s = container_of(cl, struct search, cl);
struct bio *bio = &s->bio.bio;
- if (s->recoverable) {
+ /*
+ * If the read request hit dirty data (s->read_dirty_data is true),
+ * recovering a failed read from the backing device may return
+ * stale data. Read failure recovery is therefore only permitted
+ * when the request hit clean data in the cache device, or when a
+ * cache read race happened.
+ */
+ if (s->recoverable && !s->read_dirty_data) {
/* Retry from the backing device: */
trace_bcache_read_retry(s->orig_bio);
@@ -745,7 +756,7 @@ static void cached_dev_read_done(struct closure *cl)
s->cache_miss = NULL;
}
- if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
+ if (verify(dc) && s->recoverable && !s->read_dirty_data)
bch_data_verify(dc, s->orig_bio);
bio_complete(s);
@@ -765,12 +776,12 @@ static void cached_dev_read_done_bh(struct closure *cl)
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
bch_mark_cache_accounting(s->iop.c, s->d,
- !s->cache_miss, s->iop.bypass);
+ !s->cache_missed, s->iop.bypass);
trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
if (s->iop.status)
continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
- else if (s->iop.bio || verify(dc, &s->bio.bio))
+ else if (s->iop.bio || verify(dc))
continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
else
continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
@@ -784,6 +795,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
struct bio *miss, *cache_bio;
+ s->cache_missed = 1;
+
if (s->cache_miss || s->iop.bypass) {
miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
@@ -897,7 +910,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
s->iop.bypass = true;
if (should_writeback(dc, s->orig_bio,
- cache_mode(dc, bio),
+ cache_mode(dc),
s->iop.bypass)) {
s->iop.bypass = false;
s->iop.writeback = true;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 974d832e54a6..b4d28928dec5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -53,12 +53,15 @@ LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);
static int bcache_major;
-static DEFINE_IDA(bcache_minor);
+static DEFINE_IDA(bcache_device_idx);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;
#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
-#define BCACHE_MINORS 16 /* partition support */
+/* limit on the number of partitions on a single bcache device */
+#define BCACHE_MINORS 128
+/* limit on the number of bcache devices on a single system */
+#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
/* Superblock */
@@ -721,6 +724,16 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
closure_get(&c->caching);
}
+static inline int first_minor_to_idx(int first_minor)
+{
+ return (first_minor/BCACHE_MINORS);
+}
+
+static inline int idx_to_first_minor(int idx)
+{
+ return (idx * BCACHE_MINORS);
+}
+
static void bcache_device_free(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
@@ -734,7 +747,8 @@ static void bcache_device_free(struct bcache_device *d)
if (d->disk && d->disk->queue)
blk_cleanup_queue(d->disk->queue);
if (d->disk) {
- ida_simple_remove(&bcache_minor, d->disk->first_minor);
+ ida_simple_remove(&bcache_device_idx,
+ first_minor_to_idx(d->disk->first_minor));
put_disk(d->disk);
}
@@ -751,7 +765,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
{
struct request_queue *q;
size_t n;
- int minor;
+ int idx;
if (!d->stripe_size)
d->stripe_size = 1 << 31;
@@ -776,25 +790,24 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
if (!d->full_dirty_stripes)
return -ENOMEM;
- minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
- if (minor < 0)
- return minor;
-
- minor *= BCACHE_MINORS;
+ idx = ida_simple_get(&bcache_device_idx, 0,
+ BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
+ if (idx < 0)
+ return idx;
if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
BIOSET_NEED_BVECS |
BIOSET_NEED_RESCUER)) ||
!(d->disk = alloc_disk(BCACHE_MINORS))) {
- ida_simple_remove(&bcache_minor, minor);
+ ida_simple_remove(&bcache_device_idx, idx);
return -ENOMEM;
}
set_capacity(d->disk, sectors);
- snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
+ snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
d->disk->major = bcache_major;
- d->disk->first_minor = minor;
+ d->disk->first_minor = idx_to_first_minor(idx);
d->disk->fops = &bcache_ops;
d->disk->private_data = d;
@@ -889,7 +902,7 @@ static void cached_dev_detach_finish(struct work_struct *w)
closure_init_stack(&cl);
BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
- BUG_ON(atomic_read(&dc->count));
+ BUG_ON(refcount_read(&dc->count));
mutex_lock(&bch_register_lock);
@@ -1016,7 +1029,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
* dc->c must be set before dc->count != 0 - paired with the mb in
* cached_dev_get()
*/
- atomic_set(&dc->count, 1);
+ refcount_set(&dc->count, 1);
/* Block writeback thread, but spawn it */
down_write(&dc->writeback_lock);
@@ -1026,9 +1039,9 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
}
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
- bch_sectors_dirty_init(dc);
+ bch_sectors_dirty_init(&dc->disk);
atomic_set(&dc->has_dirty, 1);
- atomic_inc(&dc->count);
+ refcount_inc(&dc->count);
bch_writeback_queue(dc);
}
@@ -1059,6 +1072,8 @@ static void cached_dev_free(struct closure *cl)
cancel_delayed_work_sync(&dc->writeback_rate_update);
if (!IS_ERR_OR_NULL(dc->writeback_thread))
kthread_stop(dc->writeback_thread);
+ if (dc->writeback_write_wq)
+ destroy_workqueue(dc->writeback_write_wq);
mutex_lock(&bch_register_lock);
@@ -1127,9 +1142,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
if (ret)
return ret;
- set_capacity(dc->disk.disk,
- dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
-
dc->disk.disk->queue->backing_dev_info->ra_pages =
max(dc->disk.disk->queue->backing_dev_info->ra_pages,
q->backing_dev_info->ra_pages);
@@ -1228,6 +1240,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
goto err;
bcache_device_attach(d, c, u - c->uuids);
+ bch_sectors_dirty_init(d);
bch_flash_dev_request_init(d);
add_disk(d->disk);
@@ -1374,9 +1387,6 @@ static void cache_set_flush(struct closure *cl)
struct btree *b;
unsigned i;
- if (!c)
- closure_return(cl);
-
bch_cache_accounting_destroy(&c->accounting);
kobject_put(&c->internal);
@@ -1964,6 +1974,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
else
err = "device busy";
mutex_unlock(&bch_register_lock);
+ if (!IS_ERR(bdev))
+ bdput(bdev);
if (attr == &ksysfs_register_quiet)
goto out;
}
@@ -2083,6 +2095,7 @@ static void bcache_exit(void)
if (bcache_major)
unregister_blkdev(bcache_major, "bcache");
unregister_reboot_notifier(&reboot);
+ mutex_destroy(&bch_register_lock);
}
static int __init bcache_init(void)
@@ -2101,14 +2114,15 @@ static int __init bcache_init(void)
bcache_major = register_blkdev(0, "bcache");
if (bcache_major < 0) {
unregister_reboot_notifier(&reboot);
+ mutex_destroy(&bch_register_lock);
return bcache_major;
}
if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
!(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
- sysfs_create_files(bcache_kobj, files) ||
bch_request_init() ||
- bch_debug_init(bcache_kobj))
+ bch_debug_init(bcache_kobj) ||
+ sysfs_create_files(bcache_kobj, files))
goto err;
return 0;
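
With the idx/minor split, every bcache device owns a fixed block of 128 minor numbers, so device index and first minor convert by a multiply or a divide; with MINORBITS = 20 that caps the system at (1 << 20) / 128 = 8192 bcache devices. A quick userspace check of the mapping, with the constants mirrored from the hunk above:

#include <assert.h>

#define MINORBITS		20
#define BCACHE_MINORS		128
#define BCACHE_DEVICE_IDX_MAX	((1U << MINORBITS) / BCACHE_MINORS)

static int idx_to_first_minor(int idx)   { return idx * BCACHE_MINORS; }
static int first_minor_to_idx(int minor) { return minor / BCACHE_MINORS; }

int main(void)
{
	/* bcache2 gets minors 256..383 for itself and its partitions. */
	assert(idx_to_first_minor(2) == 256);
	assert(first_minor_to_idx(256) == 2);
	assert(BCACHE_DEVICE_IDX_MAX == 8192);
	return 0;
}
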
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index f90f13616980..2290bffd4922 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -81,8 +81,9 @@ rw_attribute(writeback_delay);
rw_attribute(writeback_rate);
rw_attribute(writeback_rate_update_seconds);
-rw_attribute(writeback_rate_d_term);
+rw_attribute(writeback_rate_i_term_inverse);
rw_attribute(writeback_rate_p_term_inverse);
+rw_attribute(writeback_rate_minimum);
read_attribute(writeback_rate_debug);
read_attribute(stripe_size);
@@ -130,15 +131,16 @@ SHOW(__bch_cached_dev)
sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
var_print(writeback_rate_update_seconds);
- var_print(writeback_rate_d_term);
+ var_print(writeback_rate_i_term_inverse);
var_print(writeback_rate_p_term_inverse);
+ var_print(writeback_rate_minimum);
if (attr == &sysfs_writeback_rate_debug) {
char rate[20];
char dirty[20];
char target[20];
char proportional[20];
- char derivative[20];
+ char integral[20];
char change[20];
s64 next_io;
@@ -146,7 +148,7 @@ SHOW(__bch_cached_dev)
bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
bch_hprint(target, dc->writeback_rate_target << 9);
bch_hprint(proportional,dc->writeback_rate_proportional << 9);
- bch_hprint(derivative, dc->writeback_rate_derivative << 9);
+ bch_hprint(integral, dc->writeback_rate_integral_scaled << 9);
bch_hprint(change, dc->writeback_rate_change << 9);
next_io = div64_s64(dc->writeback_rate.next - local_clock(),
@@ -157,11 +159,11 @@ SHOW(__bch_cached_dev)
"dirty:\t\t%s\n"
"target:\t\t%s\n"
"proportional:\t%s\n"
- "derivative:\t%s\n"
+ "integral:\t%s\n"
"change:\t\t%s/sec\n"
"next io:\t%llims\n",
rate, dirty, target, proportional,
- derivative, change, next_io);
+ integral, change, next_io);
}
sysfs_hprint(dirty_data,
@@ -192,7 +194,7 @@ STORE(__cached_dev)
{
struct cached_dev *dc = container_of(kobj, struct cached_dev,
disk.kobj);
- unsigned v = size;
+ ssize_t v = size;
struct cache_set *c;
struct kobj_uevent_env *env;
@@ -213,7 +215,7 @@ STORE(__cached_dev)
dc->writeback_rate.rate, 1, INT_MAX);
d_strtoul_nonzero(writeback_rate_update_seconds);
- d_strtoul(writeback_rate_d_term);
+ d_strtoul(writeback_rate_i_term_inverse);
d_strtoul_nonzero(writeback_rate_p_term_inverse);
d_strtoi_h(sequential_cutoff);
@@ -227,7 +229,7 @@ STORE(__cached_dev)
bch_cached_dev_run(dc);
if (attr == &sysfs_cache_mode) {
- ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
+ v = bch_read_string_list(buf, bch_cache_modes + 1);
if (v < 0)
return v;
@@ -319,7 +321,7 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_writeback_percent,
&sysfs_writeback_rate,
&sysfs_writeback_rate_update_seconds,
- &sysfs_writeback_rate_d_term,
+ &sysfs_writeback_rate_i_term_inverse,
&sysfs_writeback_rate_p_term_inverse,
&sysfs_writeback_rate_debug,
&sysfs_dirty_data,
@@ -615,8 +617,21 @@ STORE(__bch_cache_set)
bch_cache_accounting_clear(&c->accounting);
}
- if (attr == &sysfs_trigger_gc)
+ if (attr == &sysfs_trigger_gc) {
+ /*
+ * The garbage collection thread only runs while sectors_to_gc < 0;
+ * when users write to the sysfs entry trigger_gc, they usually
+ * want to force garbage collection. Setting c->sectors_to_gc to -1
+ * here gives gc_should_run() a chance to let the gc thread run.
+ * Only "a chance": before gc_should_run() is reached,
+ * c->sectors_to_gc may still be set to another positive value, so
+ * writing to trigger_gc does not guarantee that the gc thread
+ * takes effect.
+ */
+ atomic_set(&c->sectors_to_gc, -1);
wake_up_gc(c);
+ }
if (attr == &sysfs_prune_cache) {
struct shrink_control sc;
@@ -732,6 +747,11 @@ static struct attribute *bch_cache_set_internal_files[] = {
};
KTYPE(bch_cache_set_internal);
+static int __bch_cache_cmp(const void *l, const void *r)
+{
+ return *((uint16_t *)r) - *((uint16_t *)l);
+}
+
SHOW(__bch_cache)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
@@ -756,9 +776,6 @@ SHOW(__bch_cache)
CACHE_REPLACEMENT(&ca->sb));
if (attr == &sysfs_priority_stats) {
- int cmp(const void *l, const void *r)
- { return *((uint16_t *) r) - *((uint16_t *) l); }
-
struct bucket *b;
size_t n = ca->sb.nbuckets, i;
size_t unused = 0, available = 0, dirty = 0, meta = 0;
@@ -787,7 +804,7 @@ SHOW(__bch_cache)
p[i] = ca->buckets[i].prio;
mutex_unlock(&ca->set->bucket_lock);
- sort(p, n, sizeof(uint16_t), cmp, NULL);
+ sort(p, n, sizeof(uint16_t), __bch_cache_cmp, NULL);
while (n &&
!cached[n - 1])
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 8c3a938f4bf0..e548b8b51322 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -74,24 +74,44 @@ STRTO_H(strtouint, unsigned int)
STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
+/**
+ * bch_hprint() - format @v as a human-readable string for sysfs
+ *
+ * @buf: the buffer (at least 8 bytes) to format the result into
+ * @v: signed 64-bit integer
+ *
+ * Returns the number of bytes written.
+ */
ssize_t bch_hprint(char *buf, int64_t v)
{
static const char units[] = "?kMGTPEZY";
- char dec[4] = "";
- int u, t = 0;
-
- for (u = 0; v >= 1024 || v <= -1024; u++) {
- t = v & ~(~0 << 10);
- v >>= 10;
- }
-
- if (!u)
- return sprintf(buf, "%llu", v);
-
- if (v < 100 && v > -100)
- snprintf(dec, sizeof(dec), ".%i", t / 100);
-
- return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+ int u = 0, t;
+
+ uint64_t q;
+
+ if (v < 0)
+ q = -v;
+ else
+ q = v;
+
+ /* For as long as the number is more than 3 digits, but at least
+ * once, shift right / divide by 1024. Keep the remainder for
+ * a digit after the decimal point.
+ */
+ do {
+ u++;
+
+ t = q & ~(~0 << 10);
+ q >>= 10;
+ } while (q >= 1000);
+
+ if (v < 0)
+ /* '-', up to 3 digits, '.', 1 digit, 1 character, null;
+ * yields 8 bytes.
+ */
+ return sprintf(buf, "-%llu.%i%c", q, t * 10 / 1024, units[u]);
+ else
+ return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]);
}
ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[],
@@ -212,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
d->next += div_u64(done * NSEC_PER_SEC, d->rate);
- if (time_before64(now + NSEC_PER_SEC, d->next))
- d->next = now + NSEC_PER_SEC;
+ /* Bound the time. Don't let us fall further than 2 seconds behind
+ * (this prevents unnecessary backlog that would make it impossible
+ * to catch up). If we're ahead of the desired writeback rate,
+ * don't let us sleep more than 2.5 seconds (so we can notice/respond
+ * if the control system tells us to speed up!).
+ */
+ if (time_before64(now + NSEC_PER_SEC * 5LLU / 2LLU, d->next))
+ d->next = now + NSEC_PER_SEC * 5LLU / 2LLU;
if (time_after64(now - NSEC_PER_SEC * 2, d->next))
d->next = now - NSEC_PER_SEC * 2;
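
After the rewrite, bch_hprint() always scales by 1024 at least once and always emits one digit after the decimal point, with the remainder t turned into that digit via t * 10 / 1024. A few calls worked through the new code by hand, as a usage sketch:

char buf[16];

bch_hprint(buf, 1536);		/* q = 1,  t = 512 -> "1.5k"  */
bch_hprint(buf, 10 << 20);	/* 10 MiB          -> "10.0M" */
bch_hprint(buf, -1536);		/* sign preserved  -> "-1.5k" */
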
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index cb8d2ccbb6c6..8f509290bb02 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -441,10 +441,10 @@ struct bch_ratelimit {
uint64_t next;
/*
- * Rate at which we want to do work, in units per nanosecond
+ * Rate at which we want to do work, in units per second
* The units here correspond to the units passed to bch_next_delay()
*/
- unsigned rate;
+ uint32_t rate;
};
static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index c49022a8dc9d..9b770b13bdf6 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -21,51 +21,67 @@
static void __update_writeback_rate(struct cached_dev *dc)
{
struct cache_set *c = dc->disk.c;
- uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
+ uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
+ bcache_flash_devs_sectors_dirty(c);
uint64_t cache_dirty_target =
div_u64(cache_sectors * dc->writeback_percent, 100);
-
int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
c->cached_dev_sectors);
- /* PD controller */
-
+ /*
+ * PI controller:
+ * Figures out the amount that should be written per second.
+ *
+ * First, the error (number of sectors that are dirty beyond our
+ * target) is calculated. The error is accumulated (numerically
+ * integrated).
+ *
+ * Then, the proportional value and integral value are scaled
+ * based on configured values. These are stored as inverses to
+ * avoid fixed point math and to make configuration easy; e.g.
+ * the default value of 40 for writeback_rate_p_term_inverse
+ * attempts to write at a rate that would retire all the dirty
+ * blocks in 40 seconds.
+ *
+ * The writeback_rate_i_term_inverse value of 10000 means that 1/10000th
+ * of the error is accumulated in the integral term per second.
+ * This acts as a slow, long-term average that is not subject to
+ * variations in usage like the p term.
+ */
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
- int64_t derivative = dirty - dc->disk.sectors_dirty_last;
- int64_t proportional = dirty - target;
- int64_t change;
-
- dc->disk.sectors_dirty_last = dirty;
-
- /* Scale to sectors per second */
-
- proportional *= dc->writeback_rate_update_seconds;
- proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
-
- derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
-
- derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
- (dc->writeback_rate_d_term /
- dc->writeback_rate_update_seconds) ?: 1, 0);
-
- derivative *= dc->writeback_rate_d_term;
- derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
-
- change = proportional + derivative;
+ int64_t error = dirty - target;
+ int64_t proportional_scaled =
+ div_s64(error, dc->writeback_rate_p_term_inverse);
+ int64_t integral_scaled;
+ uint32_t new_rate;
+
+ if ((error < 0 && dc->writeback_rate_integral > 0) ||
+ (error > 0 && time_before64(local_clock(),
+ dc->writeback_rate.next + NSEC_PER_MSEC))) {
+ /*
+ * Only decrease the integral term if it's more than
+ * zero. Only increase the integral term if the device
+ * is keeping up. (Don't wind up the integral
+ * ineffectively in either case).
+ *
+ * It's necessary to scale this by
+ * writeback_rate_update_seconds to keep the integral
+ * term dimensioned properly.
+ */
+ dc->writeback_rate_integral += error *
+ dc->writeback_rate_update_seconds;
+ }
- /* Don't increase writeback rate if the device isn't keeping up */
- if (change > 0 &&
- time_after64(local_clock(),
- dc->writeback_rate.next + NSEC_PER_MSEC))
- change = 0;
+ integral_scaled = div_s64(dc->writeback_rate_integral,
+ dc->writeback_rate_i_term_inverse);
- dc->writeback_rate.rate =
- clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
- 1, NSEC_PER_MSEC);
+ new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
+ dc->writeback_rate_minimum, NSEC_PER_SEC);
- dc->writeback_rate_proportional = proportional;
- dc->writeback_rate_derivative = derivative;
- dc->writeback_rate_change = change;
+ dc->writeback_rate_proportional = proportional_scaled;
+ dc->writeback_rate_integral_scaled = integral_scaled;
+ dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
+ dc->writeback_rate.rate = new_rate;
dc->writeback_rate_target = target;
}
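
Not part of the patch: a worked example of the PI computation above, using the new defaults set later in this file (p_term_inverse = 40, i_term_inverse = 10000, a 5-second update interval, minimum rate 8). All input values are illustrative.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL

static int64_t clamp64(int64_t v, int64_t lo, int64_t hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

int main(void)
{
	int64_t error = 40000;			/* sectors dirty above target */
	int64_t p_term_inverse = 40;		/* default after this patch */
	int64_t i_term_inverse = 10000;		/* default after this patch */
	int64_t update_seconds = 5;
	int64_t minimum = 8;
	int64_t integral = 0;

	/* One update period in which the backing device is keeping up,
	 * so the error is allowed to accumulate into the integral term. */
	integral += error * update_seconds;

	int64_t proportional_scaled = error / p_term_inverse;	/* 1000 */
	int64_t integral_scaled = integral / i_term_inverse;	/* 20   */
	int64_t new_rate = clamp64(proportional_scaled + integral_scaled,
				   minimum, NSEC_PER_SEC);

	printf("rate = %lld sectors/s\n", (long long)new_rate);	/* 1020 */
	return 0;
}
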
@@ -178,15 +194,23 @@ static void write_dirty(struct closure *cl)
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
- dirty_init(w);
- bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
- io->bio.bi_iter.bi_sector = KEY_START(&w->key);
- bio_set_dev(&io->bio, io->dc->bdev);
- io->bio.bi_end_io = dirty_endio;
+ /*
+ * IO errors are signalled using the dirty bit on the key.
+ * If we failed to read, we should not attempt to write to the
+ * backing device. Instead, immediately go to write_dirty_finish
+ * to clean up.
+ */
+ if (KEY_DIRTY(&w->key)) {
+ dirty_init(w);
+ bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
+ io->bio.bi_iter.bi_sector = KEY_START(&w->key);
+ bio_set_dev(&io->bio, io->dc->bdev);
+ io->bio.bi_end_io = dirty_endio;
- closure_bio_submit(&io->bio, cl);
+ closure_bio_submit(&io->bio, cl);
+ }
- continue_at(cl, write_dirty_finish, system_wq);
+ continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}
static void read_dirty_endio(struct bio *bio)
@@ -206,7 +230,7 @@ static void read_dirty_submit(struct closure *cl)
closure_bio_submit(&io->bio, cl);
- continue_at(cl, write_dirty, system_wq);
+ continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}
static void read_dirty(struct cached_dev *dc)
@@ -416,6 +440,8 @@ static int bch_writeback_thread(void *arg)
struct cached_dev *dc = arg;
bool searched_full_index;
+ bch_ratelimit_reset(&dc->writeback_rate);
+
while (!kthread_should_stop()) {
down_write(&dc->writeback_lock);
if (!atomic_read(&dc->has_dirty) ||
@@ -443,7 +469,6 @@ static int bch_writeback_thread(void *arg)
up_write(&dc->writeback_lock);
- bch_ratelimit_reset(&dc->writeback_rate);
read_dirty(dc);
if (searched_full_index) {
@@ -453,6 +478,8 @@ static int bch_writeback_thread(void *arg)
!kthread_should_stop() &&
!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
delay = schedule_timeout_interruptible(delay);
+
+ bch_ratelimit_reset(&dc->writeback_rate);
}
}
@@ -481,17 +508,15 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
return MAP_CONTINUE;
}
-void bch_sectors_dirty_init(struct cached_dev *dc)
+void bch_sectors_dirty_init(struct bcache_device *d)
{
struct sectors_dirty_init op;
bch_btree_op_init(&op.op, -1);
- op.inode = dc->disk.id;
+ op.inode = d->id;
- bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
+ bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
sectors_dirty_init_fn, 0);
-
- dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -505,16 +530,22 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_percent = 10;
dc->writeback_delay = 30;
dc->writeback_rate.rate = 1024;
+ dc->writeback_rate_minimum = 8;
dc->writeback_rate_update_seconds = 5;
- dc->writeback_rate_d_term = 30;
- dc->writeback_rate_p_term_inverse = 6000;
+ dc->writeback_rate_p_term_inverse = 40;
+ dc->writeback_rate_i_term_inverse = 10000;
INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
+ dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
+ WQ_MEM_RECLAIM, 0);
+ if (!dc->writeback_write_wq)
+ return -ENOMEM;
+
dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
"bcache_writeback");
if (IS_ERR(dc->writeback_thread))
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 629bd1a502fd..7d25bff37a9b 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
return ret;
}
+static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
+{
+ uint64_t i, ret = 0;
+
+ mutex_lock(&bch_register_lock);
+
+ for (i = 0; i < c->nr_uuids; i++) {
+ struct bcache_device *d = c->devices[i];
+
+ if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
+ continue;
+ ret += bcache_dev_sectors_dirty(d);
+ }
+
+ mutex_unlock(&bch_register_lock);
+
+ return ret;
+}
+
static inline unsigned offset_to_stripe(struct bcache_device *d,
uint64_t offset)
{
@@ -57,7 +76,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
if (would_skip)
return false;
- return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
+ return (op_is_sync(bio->bi_opf) ||
+ bio->bi_opf & (REQ_META|REQ_PRIO) ||
+ in_use <= CUTOFF_WRITEBACK);
}
static inline void bch_writeback_queue(struct cached_dev *dc)
@@ -70,7 +91,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
{
if (!atomic_read(&dc->has_dirty) &&
!atomic_xchg(&dc->has_dirty, 1)) {
- atomic_inc(&dc->count);
+ refcount_inc(&dc->count);
if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
@@ -84,7 +105,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
-void bch_sectors_dirty_init(struct cached_dev *dc);
+void bch_sectors_dirty_init(struct bcache_device *);
void bch_cached_dev_writeback_init(struct cached_dev *);
int bch_cached_dev_writeback_start(struct cached_dev *);
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index a8f630213a1a..cd1cd466addb 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -529,6 +529,16 @@ static int sr_block_open(struct block_device *bdev, fmode_t mode)
cd = scsi_cd_get(bdev->bd_disk);
if (cd) {
ret = cdrom_open(&cd->cdi, bdev, mode);
+ /* wait for drive to get ready */
+ if ((ret == -ENOMEDIUM) && !(mode & FMODE_NDELAY))
+ switch (sr_disk_status(&cd->cdi)) {
+ case CDS_NO_DISC:
+ case CDS_NO_INFO:
+ case CDS_AUDIO:
+ break;
+ default: /* looks like data disc was detected */
+ ret = cdrom_open(&cd->cdi, bdev, mode);
+ }
if (ret)
scsi_cd_put(cd);
}
diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h
index 41e5b6784b97..7a2980f4e3e6 100644
--- a/include/asm-generic/mm_hooks.h
+++ b/include/asm-generic/mm_hooks.h
@@ -6,9 +6,10 @@
#ifndef _ASM_GENERIC_MM_HOOKS_H
#define _ASM_GENERIC_MM_HOOKS_H
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+ struct mm_struct *mm)
{
+ return 0;
}
static inline void arch_exit_mmap(struct mm_struct *mm)
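
Since arch_dup_mmap() can now fail, callers are expected to propagate the error. A minimal sketch of that (illustrative names only; the actual kernel/fork.c hunk from this merge is not shown here):

/* Illustrative caller, not the real dup_mmap() change: */
static int dup_mm_arch_state(struct mm_struct *oldmm, struct mm_struct *mm)
{
	int err;

	err = arch_dup_mmap(oldmm, mm);
	if (err)
		return err;	/* e.g. -ENOMEM is now propagated */

	return 0;
}
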
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 4d7bb98f4134..72c26082488a 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -968,6 +968,11 @@ static inline int pmd_clear_huge(pmd_t *pmd)
struct file;
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t *vma_prot);
+
+#ifndef CONFIG_X86_ESPFIX64
+static inline void init_espfix_bsp(void) { }
+#endif
+
#endif /* !__ASSEMBLY__ */
#ifndef io_remap_pfn_range
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 59066a08d558..e07aed080d32 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -47,8 +47,8 @@ struct bpf_map {
u32 max_entries;
u32 map_flags;
u32 pages;
- bool unpriv_array;
u32 id;
+ bool unpriv_array;
struct user_struct *user;
const struct bpf_map_ops *ops;
struct work_struct work;
@@ -147,9 +147,14 @@ struct bpf_prog;
struct bpf_insn_access_aux {
enum bpf_reg_type reg_type;
int ctx_field_size;
- int converted_op_size;
};
+static inline void
+bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size)
+{
+ aux->ctx_field_size = size;
+}
+
struct bpf_verifier_ops {
/* return eBPF function prototype for verification */
const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
@@ -164,7 +169,7 @@ struct bpf_verifier_ops {
u32 (*convert_ctx_access)(enum bpf_access_type type,
const struct bpf_insn *src,
struct bpf_insn *dst,
- struct bpf_prog *prog);
+ struct bpf_prog *prog, u32 *target_size);
int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr);
};
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b8d200f60a40..a9c51d8105a3 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -15,11 +15,11 @@
* In practice this is far bigger than any realistic pointer offset; this limit
* ensures that umax_value + (int)off + (int)size cannot overflow a u64.
*/
-#define BPF_MAX_VAR_OFF (1ULL << 31)
+#define BPF_MAX_VAR_OFF (1 << 29)
/* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO]. This ensures
* that converting umax_value to int cannot overflow.
*/
-#define BPF_MAX_VAR_SIZ INT_MAX
+#define BPF_MAX_VAR_SIZ (1 << 29)
/* Liveness marks, used for registers and spilled-regs (in stack slots).
* Read marks propagate upwards until they find a write mark; they record that
@@ -110,11 +110,24 @@ struct bpf_insn_aux_data {
struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */
};
int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
- int converted_op_size; /* the valid value width after perceived conversion */
+ bool seen; /* this insn was processed by the verifier */
};
#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+struct bpf_verifer_log {
+ u32 level;
+ char *kbuf;
+ char __user *ubuf;
+ u32 len_used;
+ u32 len_total;
+};
+
+static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log)
+{
+ return log->len_used >= log->len_total - 1;
+}
+
struct bpf_verifier_env;
struct bpf_ext_analyzer_ops {
int (*insn_hook)(struct bpf_verifier_env *env,
@@ -139,6 +152,8 @@ struct bpf_verifier_env {
bool allow_ptr_leaks;
bool seen_direct_write;
struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
+
+ struct bpf_verifer_log log;
};
int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
diff --git a/include/linux/delay.h b/include/linux/delay.h
index 2ecb3c46b20a..3b5006fcd000 100644
--- a/include/linux/delay.h
+++ b/include/linux/delay.h
@@ -63,4 +63,16 @@ static inline void ssleep(unsigned int seconds)
msleep(seconds * 1000);
}
+#define poll_event_interruptible(event, interval) ({ \
+ int ret = 0; \
+ while (!(event)) { \
+ if (signal_pending(current)) { \
+ ret = -ERESTARTSYS; \
+ break; \
+ } \
+ msleep_interruptible(interval); \
+ } \
+ ret; \
+})
+
#endif /* defined(_LINUX_DELAY_H) */
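
Not part of the patch: a hypothetical caller of the new helper. struct my_dev and its ready flag are invented for illustration; the usual kernel headers (linux/delay.h, linux/compiler.h) are assumed.

/* Poll a readiness flag every 50 ms, giving up with -ERESTARTSYS
 * if a signal arrives while waiting. */
struct my_dev {
	bool ready;
};

static int my_dev_wait_ready(struct my_dev *dev)
{
	return poll_event_interruptible(READ_ONCE(dev->ready), 50);
}
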
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 59589b2720af..01433de5ba62 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -337,6 +337,22 @@ struct bpf_prog_aux;
bpf_size; \
})
+#define bpf_size_to_bytes(bpf_size) \
+({ \
+ int bytes = -EINVAL; \
+ \
+ if (bpf_size == BPF_B) \
+ bytes = sizeof(u8); \
+ else if (bpf_size == BPF_H) \
+ bytes = sizeof(u16); \
+ else if (bpf_size == BPF_W) \
+ bytes = sizeof(u32); \
+ else if (bpf_size == BPF_DW) \
+ bytes = sizeof(u64); \
+ \
+ bytes; \
+})
+
#define BPF_SIZEOF(type) \
({ \
const int __size = bytes_to_bpf_size(sizeof(type)); \
@@ -351,6 +367,13 @@ struct bpf_prog_aux;
__size; \
})
+#define BPF_LDST_BYTES(insn) \
+ ({ \
+ const int __size = bpf_size_to_bytes(BPF_SIZE(insn->code)); \
+ WARN_ON(__size < 0); \
+ __size; \
+ })
+
#define __BPF_MAP_0(m, v, ...) v
#define __BPF_MAP_1(m, v, t, a, ...) m(t, a)
#define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__)
@@ -401,6 +424,18 @@ struct bpf_prog_aux;
#define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__)
#define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__)
+#define bpf_ctx_range(TYPE, MEMBER) \
+ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1
+#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \
+ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1
+
+#define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \
+ ({ \
+ BUILD_BUG_ON(FIELD_SIZEOF(TYPE, MEMBER) != (SIZE)); \
+ *(PTR_SIZE) = (SIZE); \
+ offsetof(TYPE, MEMBER); \
+ })
+
#ifdef CONFIG_COMPAT
/* A struct sock_filter is architecture independent. */
struct compat_sock_fprog {
@@ -564,6 +599,18 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
return prog->type == BPF_PROG_TYPE_UNSPEC;
}
+static inline bool
+bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default)
+{
+ bool off_ok;
+#ifdef __LITTLE_ENDIAN
+ off_ok = (off & (size_default - 1)) == 0;
+#else
+ off_ok = (off & (size_default - 1)) + size == size_default;
+#endif
+ return off_ok && size <= size_default && (size & (size - 1)) == 0;
+}
+
#define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
#ifdef CONFIG_ARCH_HAS_SET_MEMORY
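
Not part of the patch: what the little-endian branch of bpf_ctx_narrow_access_ok() above accepts for a 4-byte context field, checked in a small userspace program. Only power-of-two sizes up to the field size, at an offset aligned to the field size, pass.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Copy of the __LITTLE_ENDIAN branch of bpf_ctx_narrow_access_ok(). */
static bool narrow_access_ok(uint32_t off, uint32_t size, uint32_t size_default)
{
	bool off_ok = (off & (size_default - 1)) == 0;

	return off_ok && size <= size_default && (size & (size - 1)) == 0;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       narrow_access_ok(0, 1, 4),	/* 1: 1-byte load at field start */
	       narrow_access_ok(0, 2, 4),	/* 1: 2-byte load at field start */
	       narrow_access_ok(1, 1, 4),	/* 0: offset not field-aligned   */
	       narrow_access_ok(0, 3, 4));	/* 0: size not a power of two    */
	return 0;
}
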
diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h
index e3bb0635e94a..3b92610502ca 100644
--- a/include/uapi/linux/bcache.h
+++ b/include/uapi/linux/bcache.h
@@ -90,7 +90,7 @@ PTR_FIELD(PTR_GEN, 0, 8)
#define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1)
-#define PTR(gen, offset, dev) \
+#define MAKE_PTR(gen, offset, dev) \
((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen)
/* Bkey utility code */
diff --git a/init/main.c b/init/main.c
index 87d52ec2ccf9..8f22f27143a2 100644
--- a/init/main.c
+++ b/init/main.c
@@ -485,6 +485,8 @@ static void __init mm_init(void)
pgtable_init();
vmalloc_init();
ioremap_huge_init();
+ /* Should be run before the first non-init thread is created */
+ init_espfix_bsp();
}
asmlinkage __visible void __init start_kernel(void)
@@ -654,10 +656,6 @@ asmlinkage __visible void __init start_kernel(void)
if (efi_enabled(EFI_RUNTIME_SERVICES))
efi_enter_virtual_mode();
#endif
-#ifdef CONFIG_X86_ESPFIX64
- /* Should be run before the first non-init thread is created */
- init_espfix_bsp();
-#endif
thread_stack_cache_init();
cred_init();
fork_init();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0876aed11b07..a2834188b687 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,27 +153,24 @@ struct bpf_call_arg_meta {
int access_size;
};
-/* verbose verifier prints what it's seeing
- * bpf_check() is called under lock, so no race to access these global vars
- */
-static u32 log_level, log_size, log_len;
-static char *log_buf;
-
static DEFINE_MUTEX(bpf_verifier_lock);
/* log_level controls verbosity level of eBPF verifier.
* verbose() is used to dump the verification trace to the log, so the user
* can figure out what's wrong with the program
*/
-static __printf(1, 2) void verbose(const char *fmt, ...)
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+ const char *fmt, ...)
{
+ struct bpf_verifer_log *log = &env->log;
va_list args;
- if (log_level == 0 || log_len >= log_size - 1)
+ if (!log->level || bpf_verifier_log_full(log))
return;
va_start(args, fmt);
- log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ log->len_used += vscnprintf(log->kbuf + log->len_used,
+ log->len_total - log->len_used, fmt, args);
va_end(args);
}
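
Not part of the patch: a userspace sketch of the bounded append that verbose() now performs per struct bpf_verifer_log, with vsnprintf() clamped by hand to stand in for the kernel's vscnprintf(). The "len_total - 1" in bpf_verifier_log_full() reserves room for the terminating NUL.

#include <stdarg.h>
#include <stdio.h>

struct log_buf {
	char *kbuf;
	unsigned int len_used;
	unsigned int len_total;
};

static int log_full(const struct log_buf *log)
{
	return log->len_used >= log->len_total - 1;
}

/* Roughly what verbose(env, ...) does: append until the buffer is full. */
static void log_append(struct log_buf *log, const char *fmt, ...)
{
	va_list args;
	int n;

	if (log_full(log))
		return;

	va_start(args, fmt);
	n = vsnprintf(log->kbuf + log->len_used,
		      log->len_total - log->len_used, fmt, args);
	va_end(args);

	if (n < 0)
		return;
	/* vsnprintf() reports what it wanted to write; clamp to what fit. */
	if ((unsigned int)n > log->len_total - log->len_used - 1)
		n = log->len_total - log->len_used - 1;
	log->len_used += n;
}

int main(void)
{
	char buf[16];
	struct log_buf log = { buf, 0, sizeof(buf) };

	log_append(&log, "R%d=%s ", 1, "ctx");
	log_append(&log, "this line is longer than the buffer");
	printf("%u used: \"%s\"\n", log.len_used, buf);
	return 0;
}
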
@@ -206,7 +203,8 @@ static const char *func_id_name(int id)
return "unknown";
}
-static void print_verifier_state(struct bpf_verifier_state *state)
+static void print_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state)
{
struct bpf_reg_state *reg;
enum bpf_reg_type t;
@@ -217,21 +215,21 @@ static void print_verifier_state(struct bpf_verifier_state *state)
t = reg->type;
if (t == NOT_INIT)
continue;
- verbose(" R%d=%s", i, reg_type_str[t]);
+ verbose(env, " R%d=%s", i, reg_type_str[t]);
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
- verbose("%lld", reg->var_off.value + reg->off);
+ verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- verbose("(id=%d", reg->id);
+ verbose(env, "(id=%d", reg->id);
if (t != SCALAR_VALUE)
- verbose(",off=%d", reg->off);
+ verbose(env, ",off=%d", reg->off);
if (t == PTR_TO_PACKET)
- verbose(",r=%d", reg->range);
+ verbose(env, ",r=%d", reg->range);
else if (t == CONST_PTR_TO_MAP ||
t == PTR_TO_MAP_VALUE ||
t == PTR_TO_MAP_VALUE_OR_NULL)
- verbose(",ks=%d,vs=%d",
+ verbose(env, ",ks=%d,vs=%d",
reg->map_ptr->key_size,
reg->map_ptr->value_size);
if (tnum_is_const(reg->var_off)) {
@@ -239,38 +237,38 @@ static void print_verifier_state(struct bpf_verifier_state *state)
* could be a pointer whose offset is too big
* for reg->off
*/
- verbose(",imm=%llx", reg->var_off.value);
+ verbose(env, ",imm=%llx", reg->var_off.value);
} else {
if (reg->smin_value != reg->umin_value &&
reg->smin_value != S64_MIN)
- verbose(",smin_value=%lld",
+ verbose(env, ",smin_value=%lld",
(long long)reg->smin_value);
if (reg->smax_value != reg->umax_value &&
reg->smax_value != S64_MAX)
- verbose(",smax_value=%lld",
+ verbose(env, ",smax_value=%lld",
(long long)reg->smax_value);
if (reg->umin_value != 0)
- verbose(",umin_value=%llu",
+ verbose(env, ",umin_value=%llu",
(unsigned long long)reg->umin_value);
if (reg->umax_value != U64_MAX)
- verbose(",umax_value=%llu",
+ verbose(env, ",umax_value=%llu",
(unsigned long long)reg->umax_value);
if (!tnum_is_unknown(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(",var_off=%s", tn_buf);
+ verbose(env, ",var_off=%s", tn_buf);
}
}
- verbose(")");
+ verbose(env, ")");
}
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] == STACK_SPILL)
- verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+ verbose(env, " fp%d=%s", -MAX_BPF_STACK + i,
reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
}
- verbose("\n");
+ verbose(env, "\n");
}
static const char *const bpf_class_string[] = {
@@ -325,68 +323,87 @@ static const char *const bpf_jmp_string[16] = {
[BPF_EXIT >> 4] = "exit",
};
-static void print_bpf_insn(const struct bpf_verifier_env *env,
+static void print_bpf_end_insn(struct bpf_verifier_env *env,
+ const struct bpf_insn *insn)
+{
+ verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+ BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+ insn->imm, insn->dst_reg);
+}
+
+static void print_bpf_insn(struct bpf_verifier_env *env,
const struct bpf_insn *insn)
{
u8 class = BPF_CLASS(insn->code);
if (class == BPF_ALU || class == BPF_ALU64) {
- if (BPF_SRC(insn->code) == BPF_X)
- verbose("(%02x) %sr%d %s %sr%d\n",
+ if (BPF_OP(insn->code) == BPF_END) {
+ if (class == BPF_ALU64)
+ verbose(env, "BUG_alu64_%02x\n", insn->code);
+ else
+ print_bpf_end_insn(env, insn);
+ } else if (BPF_OP(insn->code) == BPF_NEG) {
+ verbose(env, "(%02x) r%d = %s-r%d\n",
+ insn->code, insn->dst_reg,
+ class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose(env, "(%02x) %sr%d %s %sr%d\n",
insn->code, class == BPF_ALU ? "(u32) " : "",
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
class == BPF_ALU ? "(u32) " : "",
insn->src_reg);
- else
- verbose("(%02x) %sr%d %s %s%d\n",
+ } else {
+ verbose(env, "(%02x) %sr%d %s %s%d\n",
insn->code, class == BPF_ALU ? "(u32) " : "",
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
class == BPF_ALU ? "(u32) " : "",
insn->imm);
+ }
} else if (class == BPF_STX) {
if (BPF_MODE(insn->code) == BPF_MEM)
- verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+ verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg,
insn->off, insn->src_reg);
else if (BPF_MODE(insn->code) == BPF_XADD)
- verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg, insn->off,
insn->src_reg);
else
- verbose("BUG_%02x\n", insn->code);
+ verbose(env, "BUG_%02x\n", insn->code);
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_st_%02x\n", insn->code);
+ verbose(env, "BUG_st_%02x\n", insn->code);
return;
}
- verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+ verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg,
insn->off, insn->imm);
} else if (class == BPF_LDX) {
if (BPF_MODE(insn->code) != BPF_MEM) {
- verbose("BUG_ldx_%02x\n", insn->code);
+ verbose(env, "BUG_ldx_%02x\n", insn->code);
return;
}
- verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+ verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
insn->code, insn->dst_reg,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->off);
} else if (class == BPF_LD) {
if (BPF_MODE(insn->code) == BPF_ABS) {
- verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+ verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->imm);
} else if (BPF_MODE(insn->code) == BPF_IND) {
- verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->imm);
@@ -401,36 +418,37 @@ static void print_bpf_insn(const struct bpf_verifier_env *env,
if (map_ptr && !env->allow_ptr_leaks)
imm = 0;
- verbose("(%02x) r%d = 0x%llx\n", insn->code,
+ verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
insn->dst_reg, (unsigned long long)imm);
} else {
- verbose("BUG_ld_%02x\n", insn->code);
+ verbose(env, "BUG_ld_%02x\n", insn->code);
return;
}
} else if (class == BPF_JMP) {
u8 opcode = BPF_OP(insn->code);
if (opcode == BPF_CALL) {
- verbose("(%02x) call %s#%d\n", insn->code,
+ verbose(env, "(%02x) call %s#%d\n", insn->code,
func_id_name(insn->imm), insn->imm);
} else if (insn->code == (BPF_JMP | BPF_JA)) {
- verbose("(%02x) goto pc%+d\n",
+ verbose(env, "(%02x) goto pc%+d\n",
insn->code, insn->off);
} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
- verbose("(%02x) exit\n", insn->code);
+ verbose(env, "(%02x) exit\n", insn->code);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+ verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
insn->code, insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
insn->src_reg, insn->off);
} else {
- verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+ verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
insn->code, insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
insn->imm, insn->off);
}
} else {
- verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+ verbose(env, "(%02x) %s\n",
+ insn->code, bpf_class_string[class]);
}
}
@@ -469,7 +487,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
env->head = elem;
env->stack_size++;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
- verbose("BPF program is too complex\n");
+ verbose(env, "BPF program is too complex\n");
goto err;
}
return &elem->st;
@@ -507,10 +525,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
__mark_reg_known(reg, 0);
}
-static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_known_zero(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_known_zero(regs, %u)\n", regno);
+ verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -595,10 +614,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
__mark_reg_unbounded(reg);
}
-static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_unknown(regs, %u)\n", regno);
+ verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -613,10 +633,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg)
reg->type = NOT_INIT;
}
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_not_init(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno)
{
if (WARN_ON(regno >= MAX_BPF_REG)) {
- verbose("mark_reg_not_init(regs, %u)\n", regno);
+ verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
/* Something bad happened, let's kill all regs */
for (regno = 0; regno < MAX_BPF_REG; regno++)
__mark_reg_not_init(regs + regno);
@@ -625,22 +646,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
__mark_reg_not_init(regs + regno);
}
-static void init_reg_state(struct bpf_reg_state *regs)
+static void init_reg_state(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs)
{
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
- mark_reg_not_init(regs, i);
+ mark_reg_not_init(env, regs, i);
regs[i].live = REG_LIVE_NONE;
}
/* frame pointer */
regs[BPF_REG_FP].type = PTR_TO_STACK;
- mark_reg_known_zero(regs, BPF_REG_FP);
+ mark_reg_known_zero(env, regs, BPF_REG_FP);
/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
- mark_reg_known_zero(regs, BPF_REG_1);
+ mark_reg_known_zero(env, regs, BPF_REG_1);
}
enum reg_arg_type {
@@ -674,44 +696,30 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
struct bpf_reg_state *regs = env->cur_state.regs;
if (regno >= MAX_BPF_REG) {
- verbose("R%d is invalid\n", regno);
+ verbose(env, "R%d is invalid\n", regno);
return -EINVAL;
}
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
if (regs[regno].type == NOT_INIT) {
- verbose("R%d !read_ok\n", regno);
+ verbose(env, "R%d !read_ok\n", regno);
return -EACCES;
}
mark_reg_read(&env->cur_state, regno);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
- verbose("frame pointer is read only\n");
+ verbose(env, "frame pointer is read only\n");
return -EACCES;
}
regs[regno].live |= REG_LIVE_WRITTEN;
if (t == DST_OP)
- mark_reg_unknown(regs, regno);
+ mark_reg_unknown(env, regs, regno);
}
return 0;
}
-static int bpf_size_to_bytes(int bpf_size)
-{
- if (bpf_size == BPF_W)
- return 4;
- else if (bpf_size == BPF_H)
- return 2;
- else if (bpf_size == BPF_B)
- return 1;
- else if (bpf_size == BPF_DW)
- return 8;
- else
- return -EINVAL;
-}
-
static bool is_spillable_regtype(enum bpf_reg_type type)
{
switch (type) {
@@ -731,7 +739,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
-static int check_stack_write(struct bpf_verifier_state *state, int off,
+static int check_stack_write(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off,
int size, int value_regno)
{
int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
@@ -744,7 +753,7 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
@@ -779,7 +788,8 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
}
}
-static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
u8 *slot_type;
@@ -789,12 +799,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
if (slot_type[0] == STACK_SPILL) {
if (size != BPF_REG_SIZE) {
- verbose("invalid size of register spill\n");
+ verbose(env, "invalid size of register spill\n");
return -EACCES;
}
for (i = 1; i < BPF_REG_SIZE; i++) {
if (slot_type[i] != STACK_SPILL) {
- verbose("corrupted spill memory\n");
+ verbose(env, "corrupted spill memory\n");
return -EACCES;
}
}
@@ -810,14 +820,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
} else {
for (i = 0; i < size; i++) {
if (slot_type[i] != STACK_MISC) {
- verbose("invalid read from stack off %d+%d size %d\n",
+ verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
}
}
if (value_regno >= 0)
/* have read misc data from the stack */
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, state->regs, value_regno);
return 0;
}
}
@@ -829,7 +839,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
if (off < 0 || size <= 0 || off + size > map->value_size) {
- verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
map->value_size, off, size);
return -EACCES;
}
@@ -848,8 +858,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
*/
- if (log_level)
- print_verifier_state(state);
+ if (env->log.level)
+ print_verifier_state(env, state);
/* The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
@@ -857,13 +867,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* will have a set floor within our range.
*/
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
err = __check_map_access(env, regno, reg->smin_value + off, size);
if (err) {
- verbose("R%d min value is outside of the array range\n", regno);
+ verbose(env, "R%d min value is outside of the array range\n",
+ regno);
return err;
}
@@ -872,13 +883,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* If reg->umax_value + off could overflow, treat that as unbounded too.
*/
if (reg->umax_value >= BPF_MAX_VAR_OFF) {
- verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
regno);
return -EACCES;
}
err = __check_map_access(env, regno, reg->umax_value + off, size);
if (err)
- verbose("R%d max value is outside of the array range\n", regno);
+ verbose(env, "R%d max value is outside of the array range\n",
+ regno);
return err;
}
@@ -916,7 +928,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
struct bpf_reg_state *reg = &regs[regno];
if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
- verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+ verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
off, size, regno, reg->id, reg->off, reg->range);
return -EACCES;
}
@@ -939,13 +951,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
* detail to prove they're safe.
*/
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
}
err = __check_packet_access(env, regno, off, size);
if (err) {
- verbose("R%d offset is outside of the packet\n", regno);
+ verbose(env, "R%d offset is outside of the packet\n", regno);
return err;
}
return err;
@@ -955,7 +967,9 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type)
{
- struct bpf_insn_access_aux info = { .reg_type = *reg_type };
+ struct bpf_insn_access_aux info = {
+ .reg_type = *reg_type,
+ };
/* for analyzer ctx accesses are already validated and converted */
if (env->analyzer_ops)
@@ -963,25 +977,14 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
if (env->prog->aux->ops->is_valid_access &&
env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
- /* a non zero info.ctx_field_size indicates:
- * . For this field, the prog type specific ctx conversion algorithm
- * only supports whole field access.
- * . This ctx access is a candiate for later verifier transformation
- * to load the whole field and then apply a mask to get correct result.
- * a non zero info.converted_op_size indicates perceived actual converted
- * value width in convert_ctx_access.
+ /* A non zero info.ctx_field_size indicates that this field is a
+ * candidate for later verifier transformation to load the whole
+ * field and then apply a mask when accessed with a narrower
+ * access than actual ctx access size. A zero info.ctx_field_size
+ * will only allow for whole field access and rejects any other
+ * type of narrower access.
*/
- if ((info.ctx_field_size && !info.converted_op_size) ||
- (!info.ctx_field_size && info.converted_op_size)) {
- verbose("verifier bug in is_valid_access prog type=%u off=%d size=%d\n",
- env->prog->type, off, size);
- return -EACCES;
- }
-
- if (info.ctx_field_size) {
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
- env->insn_aux_data[insn_idx].converted_op_size = info.converted_op_size;
- }
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
*reg_type = info.reg_type;
/* remember the offset of last byte accessed in ctx */
@@ -990,7 +993,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
return 0;
}
- verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
return -EACCES;
}
@@ -1008,7 +1011,8 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
}
-static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
int off, int size, bool strict)
{
struct tnum reg_off;
@@ -1033,7 +1037,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+ verbose(env,
+ "misaligned packet access off %d+%s+%d+%d size %d\n",
ip_align, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -1041,7 +1046,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
return 0;
}
-static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
const char *pointer_desc,
int off, int size, bool strict)
{
@@ -1056,7 +1062,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("misaligned %saccess off %s+%d+%d size %d\n",
+ verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
pointer_desc, tn_buf, reg->off, off, size);
return -EACCES;
}
@@ -1074,7 +1080,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
switch (reg->type) {
case PTR_TO_PACKET:
/* special case, because of NET_IP_ALIGN */
- return check_pkt_ptr_alignment(reg, off, size, strict);
+ return check_pkt_ptr_alignment(env, reg, off, size, strict);
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
break;
@@ -1092,7 +1098,31 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
default:
break;
}
- return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
+ return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
+ strict);
+}
+
+/* truncate register to smaller size (in bytes)
+ * must be called with size < BPF_REG_SIZE
+ */
+static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
+{
+ u64 mask;
+
+ /* clear high bits in bit representation */
+ reg->var_off = tnum_cast(reg->var_off, size);
+
+ /* fix arithmetic bounds */
+ mask = ((u64)1 << (size * 8)) - 1;
+ if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
+ reg->umin_value &= mask;
+ reg->umax_value &= mask;
+ } else {
+ reg->umin_value = 0;
+ reg->umax_value = mask;
+ }
+ reg->smin_value = reg->umin_value;
+ reg->smax_value = reg->umax_value;
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
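
Not part of the patch: the arithmetic-bounds fix-up of coerce_reg_to_size() in isolation (tnum tracking and the signed-bounds copy omitted). When the bits above the truncation mask differ between umin and umax, truncation can wrap the range, so the bounds collapse to [0, mask].

#include <stdint.h>
#include <stdio.h>

struct bounds {
	uint64_t umin, umax;
};

/* Truncate to `size` bytes, keeping the bounds only when truncation
 * cannot wrap them (size must be < 8 here, as in the kernel helper). */
static void coerce_to_size(struct bounds *b, int size)
{
	uint64_t mask = ((uint64_t)1 << (size * 8)) - 1;

	if ((b->umin & ~mask) == (b->umax & ~mask)) {
		b->umin &= mask;	/* same high bits: bounds survive */
		b->umax &= mask;
	} else {
		b->umin = 0;		/* truncation may wrap: give up */
		b->umax = mask;
	}
}

int main(void)
{
	struct bounds a = { 0x105, 0x1a0 };	/* high bits equal  */
	struct bounds b = { 0x0f0, 0x110 };	/* high bits differ */

	coerce_to_size(&a, 1);
	coerce_to_size(&b, 1);
	printf("a: [0x%llx, 0x%llx]  b: [0x%llx, 0x%llx]\n",
	       (unsigned long long)a.umin, (unsigned long long)a.umax,
	       (unsigned long long)b.umin, (unsigned long long)b.umax);
	/* prints: a: [0x5, 0xa0]  b: [0x0, 0xff] */
	return 0;
}
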
@@ -1124,27 +1154,28 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into map\n", value_regno);
+ verbose(env, "R%d leaks addr into map\n", value_regno);
return -EACCES;
}
err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, state->regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into ctx\n", value_regno);
+ verbose(env, "R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
/* ctx accesses must be at a fixed offset, so that we can
* determine what type of data were returned.
*/
if (reg->off) {
- verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
+ verbose(env,
+ "dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
regno, reg->off, off - reg->off);
return -EACCES;
}
@@ -1152,7 +1183,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("variable ctx access var_off=%s off=%d size=%d",
+ verbose(env,
+ "variable ctx access var_off=%s off=%d size=%d",
tn_buf, off, size);
return -EACCES;
}
@@ -1163,9 +1195,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* the offset is zero.
*/
if (reg_type == SCALAR_VALUE)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, state->regs, value_regno);
else
- mark_reg_known_zero(state->regs, value_regno);
+ mark_reg_known_zero(env, state->regs,
+ value_regno);
state->regs[value_regno].id = 0;
state->regs[value_regno].off = 0;
state->regs[value_regno].range = 0;
@@ -1181,13 +1214,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("variable stack access var_off=%s off=%d size=%d",
+ verbose(env, "variable stack access var_off=%s off=%d size=%d",
tn_buf, off, size);
return -EACCES;
}
off += reg->var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK) {
- verbose("invalid stack off=%d size=%d\n", off, size);
+ verbose(env, "invalid stack off=%d size=%d\n", off,
+ size);
return -EACCES;
}
@@ -1198,38 +1232,39 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!env->allow_ptr_leaks &&
state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
size != BPF_REG_SIZE) {
- verbose("attempt to corrupt spilled pointer on stack\n");
+ verbose(env, "attempt to corrupt spilled pointer on stack\n");
return -EACCES;
}
- err = check_stack_write(state, off, size, value_regno);
+ err = check_stack_write(env, state, off, size,
+ value_regno);
} else {
- err = check_stack_read(state, off, size, value_regno);
+ err = check_stack_read(env, state, off, size,
+ value_regno);
}
} else if (reg->type == PTR_TO_PACKET) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
- verbose("cannot write into packet\n");
+ verbose(env, "cannot write into packet\n");
return -EACCES;
}
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
- verbose("R%d leaks addr into packet\n", value_regno);
+ verbose(env, "R%d leaks addr into packet\n",
+ value_regno);
return -EACCES;
}
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(env, state->regs, value_regno);
} else {
- verbose("R%d invalid mem access '%s'\n",
- regno, reg_type_str[reg->type]);
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str[reg->type]);
return -EACCES;
}
if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
state->regs[value_regno].type == SCALAR_VALUE) {
/* b/h/w load zero-extends, mark upper bits as known 0 */
- state->regs[value_regno].var_off = tnum_cast(
- state->regs[value_regno].var_off, size);
- __update_reg_bounds(&state->regs[value_regno]);
+ coerce_reg_to_size(&state->regs[value_regno], size);
}
return err;
}
@@ -1240,7 +1275,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
insn->imm != 0) {
- verbose("BPF_XADD uses reserved fields\n");
+ verbose(env, "BPF_XADD uses reserved fields\n");
return -EINVAL;
}
@@ -1255,7 +1290,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
return err;
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d leaks addr into mem\n", insn->src_reg);
+ verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
return -EACCES;
}
@@ -1296,7 +1331,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
register_is_null(regs[regno]))
return 0;
- verbose("R%d type=%s expected=%s\n", regno,
+ verbose(env, "R%d type=%s expected=%s\n", regno,
reg_type_str[regs[regno].type],
reg_type_str[PTR_TO_STACK]);
return -EACCES;
@@ -1307,14 +1342,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
- verbose("invalid variable stack read R%d var_off=%s\n",
+ verbose(env, "invalid variable stack read R%d var_off=%s\n",
regno, tn_buf);
return -EACCES;
}
off = regs[regno].off + regs[regno].var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
access_size <= 0) {
- verbose("invalid stack type R%d off=%d access_size=%d\n",
+ verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
regno, off, access_size);
return -EACCES;
}
@@ -1330,7 +1365,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
for (i = 0; i < access_size; i++) {
if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
- verbose("invalid indirect read from stack off %d+%d size %d\n",
+ verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
off, i, access_size);
return -EACCES;
}
@@ -1372,7 +1407,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_ANYTHING) {
if (is_pointer_value(env, regno)) {
- verbose("R%d leaks addr into helper function\n", regno);
+ verbose(env, "R%d leaks addr into helper function\n",
+ regno);
return -EACCES;
}
return 0;
@@ -1380,7 +1416,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (type == PTR_TO_PACKET &&
!may_access_direct_pkt_data(env, meta, BPF_READ)) {
- verbose("helper access to the packet is not allowed\n");
+ verbose(env, "helper access to the packet is not allowed\n");
return -EACCES;
}
@@ -1416,7 +1452,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
} else {
- verbose("unsupported arg_type %d\n", arg_type);
+ verbose(env, "unsupported arg_type %d\n", arg_type);
return -EFAULT;
}
@@ -1434,7 +1470,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
* we have to check map_key here. Otherwise it means
* that kernel subsystem misconfigured verifier
*/
- verbose("invalid map_ptr to access map->key\n");
+ verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
if (type == PTR_TO_PACKET)
@@ -1450,7 +1486,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
- verbose("invalid map_ptr to access map->value\n");
+ verbose(env, "invalid map_ptr to access map->value\n");
return -EACCES;
}
if (type == PTR_TO_PACKET)
@@ -1470,7 +1506,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (regno == 0) {
/* kernel subsystem misconfigured verifier */
- verbose("ARG_CONST_SIZE cannot be first argument\n");
+ verbose(env,
+ "ARG_CONST_SIZE cannot be first argument\n");
return -EACCES;
}
@@ -1487,7 +1524,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
meta = NULL;
if (reg->smin_value < 0) {
- verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
regno);
return -EACCES;
}
@@ -1501,7 +1538,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
}
if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
- verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
regno);
return -EACCES;
}
@@ -1512,12 +1549,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return err;
err_type:
- verbose("R%d type=%s expected=%s\n", regno,
+ verbose(env, "R%d type=%s expected=%s\n", regno,
reg_type_str[type], reg_type_str[expected_type]);
return -EACCES;
}
-static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+static int check_map_func_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map, int func_id)
{
if (!map)
return 0;
@@ -1576,7 +1614,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
return 0;
error:
- verbose("cannot pass map_type %d into func %s#%d\n",
+ verbose(env, "cannot pass map_type %d into func %s#%d\n",
map->map_type, func_id_name(func_id), func_id);
return -EINVAL;
}
@@ -1611,7 +1649,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET ||
regs[i].type == PTR_TO_PACKET_END)
- mark_reg_unknown(regs, i);
+ mark_reg_unknown(env, regs, i);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
@@ -1635,7 +1673,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
/* find function prototype */
if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
- verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
+ verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
+ func_id);
return -EINVAL;
}
@@ -1643,13 +1682,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
fn = env->prog->aux->ops->get_func_proto(func_id);
if (!fn) {
- verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
+ verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
+ func_id);
return -EINVAL;
}
/* eBPF programs must be GPL compatible to use GPL-ed functions */
if (!env->prog->gpl_compatible && fn->gpl_only) {
- verbose("cannot call GPL only function from proprietary program\n");
+ verbose(env, "cannot call GPL only function from proprietary program\n");
return -EINVAL;
}
@@ -1663,7 +1703,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
*/
err = check_raw_mode(fn);
if (err) {
- verbose("kernel subsystem misconfigured func %s#%d\n",
+ verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
return err;
}
@@ -1677,7 +1717,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return err;
if (func_id == BPF_FUNC_tail_call) {
if (meta.map_ptr == NULL) {
- verbose("verifier bug\n");
+ verbose(env, "verifier bug\n");
return -EINVAL;
}
env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
@@ -1703,14 +1743,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(regs, caller_saved[i]);
+ mark_reg_not_init(env, regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
/* update return register (already marked as written above) */
if (fn->ret_type == RET_INTEGER) {
/* sets type to SCALAR_VALUE */
- mark_reg_unknown(regs, BPF_REG_0);
+ mark_reg_unknown(env, regs, BPF_REG_0);
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
@@ -1718,14 +1758,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
- mark_reg_known_zero(regs, BPF_REG_0);
+ mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].off = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
*/
if (meta.map_ptr == NULL) {
- verbose("kernel subsystem misconfigured verifier\n");
+ verbose(env,
+ "kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
@@ -1736,12 +1777,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
else if (insn_aux->map_ptr != meta.map_ptr)
insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
- verbose("unknown return type %d of func %s#%d\n",
+ verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
return -EINVAL;
}
- err = check_map_func_compatibility(meta.map_ptr, func_id);
+ err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
return err;
@@ -1750,14 +1791,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return 0;
}
-static void coerce_reg_to_32(struct bpf_reg_state *reg)
-{
- /* clear high 32 bits */
- reg->var_off = tnum_cast(reg->var_off, 4);
- /* Update bounds */
- __update_reg_bounds(reg);
-}
-
static bool signed_add_overflows(s64 a, s64 b)
{
/* Do the add in u64, where overflow is well-defined */
@@ -1778,6 +1811,41 @@ static bool signed_sub_overflows(s64 a, s64 b)
return res > a;
}
+static bool check_reg_sane_offset(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ enum bpf_reg_type type)
+{
+ bool known = tnum_is_const(reg->var_off);
+ s64 val = reg->var_off.value;
+ s64 smin = reg->smin_value;
+
+ if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
+ verbose(env, "math between %s pointer and %lld is not allowed\n",
+ reg_type_str[type], val);
+ return false;
+ }
+
+ if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "%s pointer offset %d is not allowed\n",
+ reg_type_str[type], reg->off);
+ return false;
+ }
+
+ if (smin == S64_MIN) {
+ verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
+ reg_type_str[type]);
+ return false;
+ }
+
+ if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "value %lld makes %s pointer be out of bounds\n",
+ smin, reg_type_str[type]);
+ return false;
+ }
+
+ return true;
+}
+
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
* Caller should also handle BPF_MOV case separately.
* If we return -EACCES, caller may want to try again treating pointer as a
@@ -1800,39 +1868,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg = &regs[dst];
if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: known but bad sbounds\n");
+ print_verifier_state(env, &env->cur_state);
+ verbose(env,
+ "verifier internal error: known but bad sbounds\n");
return -EINVAL;
}
if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: known but bad ubounds\n");
+ print_verifier_state(env, &env->cur_state);
+ verbose(env,
+ "verifier internal error: known but bad ubounds\n");
return -EINVAL;
}
if (BPF_CLASS(insn->code) != BPF_ALU64) {
/* 32-bit ALU ops on pointers produce (meaningless) scalars */
if (!env->allow_ptr_leaks)
- verbose("R%d 32-bit pointer arithmetic prohibited\n",
+ verbose(env,
+ "R%d 32-bit pointer arithmetic prohibited\n",
dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+ verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
dst);
return -EACCES;
}
if (ptr_reg->type == CONST_PTR_TO_MAP) {
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+ verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_PACKET_END) {
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+ verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
dst);
return -EACCES;
}
@@ -1843,6 +1914,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->type = ptr_reg->type;
dst_reg->id = ptr_reg->id;
+ if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
+ !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
+ return -EINVAL;
+
switch (opcode) {
case BPF_ADD:
/* We can take a fixed offset as long as it doesn't overflow
@@ -1897,7 +1972,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
if (!env->allow_ptr_leaks)
- verbose("R%d tried to subtract pointer from scalar\n",
+ verbose(env, "R%d tried to subtract pointer from scalar\n",
dst);
return -EACCES;
}
@@ -1907,7 +1982,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
*/
if (ptr_reg->type == PTR_TO_STACK) {
if (!env->allow_ptr_leaks)
- verbose("R%d subtraction from stack pointer prohibited\n",
+ verbose(env, "R%d subtraction from stack pointer prohibited\n",
dst);
return -EACCES;
}
@@ -1962,23 +2037,30 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* ptr &= ~3 which would reduce min_value by 3.)
*/
if (!env->allow_ptr_leaks)
- verbose("R%d bitwise operator %s on pointer prohibited\n",
+ verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
default:
/* other operators (e.g. MUL,LSH) produce non-pointer results */
if (!env->allow_ptr_leaks)
- verbose("R%d pointer arithmetic with %s operator prohibited\n",
+ verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
}
+ if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
+ return -EINVAL;
+
__update_reg_bounds(dst_reg);
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
return 0;
}
+/* WARNING: This function does calculations on 64-bit values, but the actual
+ * execution may occur on 32-bit values. Therefore, things like bitshifts
+ * need extra checks in the 32-bit case.
+ */
static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn,
struct bpf_reg_state *dst_reg,
@@ -1989,12 +2071,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
bool src_known, dst_known;
s64 smin_val, smax_val;
u64 umin_val, umax_val;
+ u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
- if (BPF_CLASS(insn->code) != BPF_ALU64) {
- /* 32-bit ALU ops are (32,32)->64 */
- coerce_reg_to_32(dst_reg);
- coerce_reg_to_32(&src_reg);
- }
smin_val = src_reg.smin_value;
smax_val = src_reg.smax_value;
umin_val = src_reg.umin_value;
@@ -2002,6 +2080,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
src_known = tnum_is_const(src_reg.var_off);
dst_known = tnum_is_const(dst_reg->var_off);
+ if (!src_known &&
+ opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
+ __mark_reg_unknown(dst_reg);
+ return 0;
+ }
+
switch (opcode) {
case BPF_ADD:
if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
@@ -2130,11 +2214,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
case BPF_LSH:
- if (umax_val > 63) {
- /* Shifts greater than 63 are undefined. This includes
- * shifts by a negative number.
+ if (umax_val >= insn_bitness) {
+ /* Shifts greater than 31 or 63 are undefined.
+ * This includes shifts by a negative number.
*/
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
/* We lose all sign bit information (except what we can pick
@@ -2158,11 +2242,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
case BPF_RSH:
- if (umax_val > 63) {
- /* Shifts greater than 63 are undefined. This includes
- * shifts by a negative number.
+ if (umax_val >= insn_bitness) {
+ /* Shifts greater than 31 or 63 are undefined.
+ * This includes shifts by a negative number.
*/
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
/* BPF_RSH is an unsigned shift. If the value in dst_reg might
@@ -2192,10 +2276,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
default:
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops are (32,32)->32 */
+ coerce_reg_to_size(dst_reg, 4);
+ coerce_reg_to_size(&src_reg, 4);
+ }
+
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
return 0;
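
The insn_bitness changes above exist because a 32-bit BPF_LSH/BPF_RSH by 32 or more is undefined, just as in C, so the verifier must discard its bounds rather than compute them. A small user-space C illustration (not part of the patch) of why such a result cannot be trusted:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t v = 1;
	volatile uint32_t s = 32;	/* volatile blocks constant folding */

	/* Undefined behaviour: on x86 the hardware masks the shift count
	 * to 5 bits, so this typically prints 1, not the 0 that naive
	 * bounds math would predict.
	 */
	printf("1u << 32 = %u\n", v << s);
	return 0;
}
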
@@ -2224,12 +2314,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* an arbitrary scalar.
*/
if (!env->allow_ptr_leaks) {
- verbose("R%d pointer %s pointer prohibited\n",
+ verbose(env, "R%d pointer %s pointer prohibited\n",
insn->dst_reg,
bpf_alu_string[opcode >> 4]);
return -EACCES;
}
- mark_reg_unknown(regs, insn->dst_reg);
+ mark_reg_unknown(env, regs, insn->dst_reg);
return 0;
} else {
/* scalar += pointer
@@ -2281,13 +2371,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: unexpected ptr_reg\n");
+ print_verifier_state(env, &env->cur_state);
+ verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(&env->cur_state);
- verbose("verifier internal error: no src_reg\n");
+ print_verifier_state(env, &env->cur_state);
+ verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
@@ -2305,14 +2395,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->off != 0 || insn->imm != 0) {
- verbose("BPF_NEG uses reserved fields\n");
+ verbose(env, "BPF_NEG uses reserved fields\n");
return -EINVAL;
}
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
(insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
BPF_CLASS(insn->code) == BPF_ALU64) {
- verbose("BPF_END uses reserved fields\n");
+ verbose(env, "BPF_END uses reserved fields\n");
return -EINVAL;
}
}
@@ -2323,7 +2413,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer arithmetic prohibited\n",
+ verbose(env, "R%d pointer arithmetic prohibited\n",
insn->dst_reg);
return -EACCES;
}
@@ -2337,7 +2427,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
@@ -2347,7 +2437,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_MOV uses reserved fields\n");
+ verbose(env, "BPF_MOV uses reserved fields\n");
return -EINVAL;
}
}
@@ -2367,15 +2457,13 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d partial copy of pointer\n",
+ verbose(env,
+ "R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
}
- mark_reg_unknown(regs, insn->dst_reg);
- /* high 32 bits are known zero. */
- regs[insn->dst_reg].var_off = tnum_cast(
- regs[insn->dst_reg].var_off, 4);
- __update_reg_bounds(&regs[insn->dst_reg]);
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ coerce_reg_to_size(&regs[insn->dst_reg], 4);
}
} else {
/* case: R = imm
@@ -2392,14 +2480,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
} else if (opcode > BPF_END) {
- verbose("invalid BPF_ALU opcode %x\n", opcode);
+ verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
return -EINVAL;
} else { /* all other ALU ops: and, sub, xor, add, ... */
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
/* check src1 operand */
@@ -2408,7 +2496,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
- verbose("BPF_ALU uses reserved fields\n");
+ verbose(env, "BPF_ALU uses reserved fields\n");
return -EINVAL;
}
}
@@ -2420,7 +2508,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
- verbose("div by zero\n");
+ verbose(env, "div by zero\n");
return -EINVAL;
}
@@ -2429,7 +2517,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
if (insn->imm < 0 || insn->imm >= size) {
- verbose("invalid shift %d\n", insn->imm);
+ verbose(env, "invalid shift %d\n", insn->imm);
return -EINVAL;
}
}
@@ -2790,13 +2878,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
int err;
if (opcode > BPF_JSLE) {
- verbose("invalid BPF_JMP opcode %x\n", opcode);
+ verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
return -EINVAL;
}
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP uses reserved fields\n");
return -EINVAL;
}
@@ -2806,13 +2894,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return err;
if (is_pointer_value(env, insn->src_reg)) {
- verbose("R%d pointer comparison prohibited\n",
+ verbose(env, "R%d pointer comparison prohibited\n",
insn->src_reg);
return -EACCES;
}
} else {
if (insn->src_reg != BPF_REG_0) {
- verbose("BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP uses reserved fields\n");
return -EINVAL;
}
}
@@ -2928,11 +3016,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* pkt_end <= pkt_data' */
find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true);
} else if (is_pointer_value(env, insn->dst_reg)) {
- verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+ verbose(env, "R%d pointer comparison prohibited\n",
+ insn->dst_reg);
return -EACCES;
}
- if (log_level)
- print_verifier_state(this_branch);
+ if (env->log.level)
+ print_verifier_state(env, this_branch);
return 0;
}
@@ -2951,11 +3040,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
- verbose("invalid BPF_LD_IMM insn\n");
+ verbose(env, "invalid BPF_LD_IMM insn\n");
return -EINVAL;
}
if (insn->off != 0) {
- verbose("BPF_LD_IMM64 uses reserved fields\n");
+ verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
return -EINVAL;
}
@@ -3013,14 +3102,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
int i, err;
if (!may_access_skb(env->prog->type)) {
- verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
+ verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
return -EINVAL;
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
- verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
+ verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
return -EINVAL;
}
@@ -3030,7 +3119,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (regs[BPF_REG_6].type != PTR_TO_CTX) {
- verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ verbose(env,
+ "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
return -EINVAL;
}
@@ -3043,7 +3133,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* reset caller saved regs to unreadable */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
- mark_reg_not_init(regs, caller_saved[i]);
+ mark_reg_not_init(env, regs, caller_saved[i]);
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
@@ -3051,7 +3141,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* the value fetched from the packet.
* Already marked as written above.
*/
- mark_reg_unknown(regs, BPF_REG_0);
+ mark_reg_unknown(env, regs, BPF_REG_0);
return 0;
}
@@ -3115,7 +3205,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
return 0;
if (w < 0 || w >= env->prog->len) {
- verbose("jump out of range from insn %d to %d\n", t, w);
+ verbose(env, "jump out of range from insn %d to %d\n", t, w);
return -EINVAL;
}
@@ -3132,13 +3222,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
insn_stack[cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
- verbose("back-edge from insn %d to %d\n", t, w);
+ verbose(env, "back-edge from insn %d to %d\n", t, w);
return -EINVAL;
} else if (insn_state[w] == EXPLORED) {
/* forward- or cross-edge */
insn_state[t] = DISCOVERED | e;
} else {
- verbose("insn state internal bug\n");
+ verbose(env, "insn state internal bug\n");
return -EFAULT;
}
return 0;
@@ -3232,7 +3322,7 @@ peek_stack:
mark_explored:
insn_state[t] = EXPLORED;
if (cur_stack-- <= 0) {
- verbose("pop stack internal bug\n");
+ verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
}
@@ -3241,7 +3331,7 @@ mark_explored:
check_state:
for (i = 0; i < insn_cnt; i++) {
if (insn_state[i] != EXPLORED) {
- verbose("unreachable insn %d\n", i);
+ verbose(env, "unreachable insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
@@ -3620,7 +3710,7 @@ static int do_check(struct bpf_verifier_env *env)
int insn_processed = 0;
bool do_print_state = false;
- init_reg_state(regs);
+ init_reg_state(env, regs);
state->parent = NULL;
insn_idx = 0;
for (;;) {
@@ -3629,7 +3719,7 @@ static int do_check(struct bpf_verifier_env *env)
int err;
if (insn_idx >= insn_cnt) {
- verbose("invalid insn idx %d insn_cnt %d\n",
+ verbose(env, "invalid insn idx %d insn_cnt %d\n",
insn_idx, insn_cnt);
return -EFAULT;
}
@@ -3638,7 +3728,8 @@ static int do_check(struct bpf_verifier_env *env)
class = BPF_CLASS(insn->code);
if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
- verbose("BPF program is too large. Processed %d insn\n",
+ verbose(env,
+ "BPF program is too large. Processed %d insn\n",
insn_processed);
return -E2BIG;
}
@@ -3648,12 +3739,12 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (err == 1) {
/* found equivalent state, can prune the search */
- if (log_level) {
+ if (env->log.level) {
if (do_print_state)
- verbose("\nfrom %d to %d: safe\n",
+ verbose(env, "\nfrom %d to %d: safe\n",
prev_insn_idx, insn_idx);
else
- verbose("%d: safe\n", insn_idx);
+ verbose(env, "%d: safe\n", insn_idx);
}
goto process_bpf_exit;
}
@@ -3661,18 +3752,18 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (log_level > 1 || (log_level && do_print_state)) {
- if (log_level > 1)
- verbose("%d:", insn_idx);
+ if (env->log.level > 1 || (env->log.level && do_print_state)) {
+ if (env->log.level > 1)
+ verbose(env, "%d:", insn_idx);
else
- verbose("\nfrom %d to %d:",
+ verbose(env, "\nfrom %d to %d:",
prev_insn_idx, insn_idx);
- print_verifier_state(&env->cur_state);
+ print_verifier_state(env, &env->cur_state);
do_print_state = false;
}
- if (log_level) {
- verbose("%d: ", insn_idx);
+ if (env->log.level) {
+ verbose(env, "%d: ", insn_idx);
print_bpf_insn(env, insn);
}
@@ -3680,6 +3771,7 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
+ env->insn_aux_data[insn_idx].seen = true;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
@@ -3729,7 +3821,7 @@ static int do_check(struct bpf_verifier_env *env)
* src_reg == stack|map in some other branch.
* Reject it.
*/
- verbose("same insn cannot be used with different pointers\n");
+ verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}
@@ -3769,14 +3861,14 @@ static int do_check(struct bpf_verifier_env *env)
} else if (dst_reg_type != *prev_dst_type &&
(dst_reg_type == PTR_TO_CTX ||
*prev_dst_type == PTR_TO_CTX)) {
- verbose("same insn cannot be used with different pointers\n");
+ verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
}
} else if (class == BPF_ST) {
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
- verbose("BPF_ST uses reserved fields\n");
+ verbose(env, "BPF_ST uses reserved fields\n");
return -EINVAL;
}
/* check src operand */
@@ -3799,7 +3891,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->off != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_CALL uses reserved fields\n");
+ verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}
@@ -3812,7 +3904,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_JA uses reserved fields\n");
+ verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
@@ -3824,7 +3916,7 @@ static int do_check(struct bpf_verifier_env *env)
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
- verbose("BPF_EXIT uses reserved fields\n");
+ verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
@@ -3839,7 +3931,7 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (is_pointer_value(env, BPF_REG_0)) {
- verbose("R0 leaks addr as return value\n");
+ verbose(env, "R0 leaks addr as return value\n");
return -EACCES;
}
@@ -3870,20 +3962,21 @@ process_bpf_exit:
return err;
insn_idx++;
+ env->insn_aux_data[insn_idx].seen = true;
} else {
- verbose("invalid BPF_LD mode\n");
+ verbose(env, "invalid BPF_LD mode\n");
return -EINVAL;
}
} else {
- verbose("unknown insn class %d\n", class);
+ verbose(env, "unknown insn class %d\n", class);
return -EINVAL;
}
insn_idx++;
}
- verbose("processed %d insns, stack depth %d\n",
- insn_processed, env->prog->aux->stack_depth);
+ verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
+ env->prog->aux->stack_depth);
return 0;
}
@@ -3895,7 +3988,8 @@ static int check_map_prealloc(struct bpf_map *map)
!(map->map_flags & BPF_F_NO_PREALLOC);
}
-static int check_map_prog_compatibility(struct bpf_map *map,
+static int check_map_prog_compatibility(struct bpf_verifier_env *env,
+ struct bpf_map *map,
struct bpf_prog *prog)
{
@@ -3906,12 +4000,12 @@ static int check_map_prog_compatibility(struct bpf_map *map,
*/
if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
if (!check_map_prealloc(map)) {
- verbose("perf_event programs can only use preallocated hash map\n");
+ verbose(env, "perf_event programs can only use preallocated hash map\n");
return -EINVAL;
}
if (map->inner_map_meta &&
!check_map_prealloc(map->inner_map_meta)) {
- verbose("perf_event programs can only use preallocated inner hash map\n");
+ verbose(env, "perf_event programs can only use preallocated inner hash map\n");
return -EINVAL;
}
}
@@ -3934,14 +4028,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
- verbose("BPF_LDX uses reserved fields\n");
+ verbose(env, "BPF_LDX uses reserved fields\n");
return -EINVAL;
}
if (BPF_CLASS(insn->code) == BPF_STX &&
((BPF_MODE(insn->code) != BPF_MEM &&
BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
- verbose("BPF_STX uses reserved fields\n");
+ verbose(env, "BPF_STX uses reserved fields\n");
return -EINVAL;
}
@@ -3952,7 +4046,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
insn[1].off != 0) {
- verbose("invalid bpf_ld_imm64 insn\n");
+ verbose(env, "invalid bpf_ld_imm64 insn\n");
return -EINVAL;
}
@@ -3961,19 +4055,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
goto next_insn;
if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
- verbose("unrecognized bpf_ld_imm64 insn\n");
+ verbose(env,
+ "unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
f = fdget(insn->imm);
map = __bpf_map_get(f);
if (IS_ERR(map)) {
- verbose("fd %d is not pointing to valid bpf_map\n",
+ verbose(env, "fd %d is not pointing to valid bpf_map\n",
insn->imm);
return PTR_ERR(map);
}
- err = check_map_prog_compatibility(map, env->prog);
+ err = check_map_prog_compatibility(env, map, env->prog);
if (err) {
fdput(f);
return err;
@@ -4050,6 +4145,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
u32 off, u32 cnt)
{
struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ int i;
if (cnt == 1)
return 0;
@@ -4059,6 +4155,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ for (i = off; i < off + cnt - 1; i++)
+ new_data[i].seen = true;
env->insn_aux_data = new_data;
vfree(old_data);
return 0;
@@ -4077,23 +4175,44 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
return new_prog;
}
+/* The verifier does more data flow analysis than llvm and will not explore
+ * branches that are dead at run time. Malicious programs can have dead code
+ * too. Therefore, replace all code that is dead at run time with nops.
+ */
+static void sanitize_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (aux_data[i].seen)
+ continue;
+ memcpy(insn + i, &nop, sizeof(nop));
+ }
+}
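
A user-space sketch of the loop above, under the assumption that per-instruction "seen" flags are tracked exactly as in insn_aux_data: every instruction the analysis never reached is overwritten in place, so no unverified bytes survive into the image handed to the JIT.

#include <stdbool.h>
#include <string.h>

struct toy_insn { unsigned char code; unsigned char regs; int imm; };

static void toy_sanitize_dead_code(struct toy_insn *insns, const bool *seen,
				   int insn_cnt, const struct toy_insn *nop)
{
	for (int i = 0; i < insn_cnt; i++) {
		if (seen[i])
			continue;
		/* equivalent of "r0 = r0" in BPF terms: no effect */
		memcpy(&insns[i], nop, sizeof(*nop));
	}
}
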
+
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+ int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
struct bpf_prog *new_prog;
enum bpf_access_type type;
- int i, cnt, off, size, ctx_field_size, converted_op_size, is_narrower_load, delta = 0;
+ bool is_narrower_load;
+ u32 target_size;
if (ops->gen_prologue) {
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
env->prog);
if (cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
} else if (cnt) {
new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
@@ -4127,39 +4246,51 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
- off = insn->off;
- size = bpf_size_to_bytes(BPF_SIZE(insn->code));
ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
- converted_op_size = env->insn_aux_data[i + delta].converted_op_size;
- is_narrower_load = type == BPF_READ && size < ctx_field_size;
+ size = BPF_LDST_BYTES(insn);
/* If the read access is a narrower load of the field,
* convert to a 4/8-byte load, to minimize program-type-specific
* convert_ctx_access changes. If conversion is successful,
* we will apply proper mask to the result.
*/
+ is_narrower_load = size < ctx_field_size;
if (is_narrower_load) {
- int size_code = BPF_H;
+ u32 off = insn->off;
+ u8 size_code;
+
+ if (type == BPF_WRITE) {
+ verbose(env,
+ "bpf verifier narrow ctx access misconfigured\n");
+ return -EINVAL;
+ }
+ size_code = BPF_H;
if (ctx_field_size == 4)
size_code = BPF_W;
else if (ctx_field_size == 8)
size_code = BPF_DW;
+
insn->off = off & ~(ctx_field_size - 1);
insn->code = BPF_LDX | BPF_MEM | size_code;
}
- cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+
+ target_size = 0;
+ cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
+ &target_size);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
+ (ctx_field_size && !target_size)) {
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
- if (is_narrower_load && size < converted_op_size) {
+
+ if (is_narrower_load && size < target_size) {
if (ctx_field_size <= 4)
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
- (1 << size * 8) - 1);
+ (1 << size * 8) - 1);
else
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
- (1 << size * 8) - 1);
+ (1 << size * 8) - 1);
}
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
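
For the narrow-load path above, a user-space sketch of the rewrite's net effect, assuming a 1- or 2-byte read at the start of a 4-byte context field: the load is widened to the full field, and the BPF_AND emitted above keeps only the low "size" bytes.

#include <stdint.h>

/* field_word is what the widened 4-byte load fetched; size is the
 * original narrow access size in bytes (1 or 2 here).
 */
static uint32_t toy_narrow_read(uint32_t field_word, unsigned int size)
{
	return field_word & ((1u << (size * 8)) - 1);
}
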
@@ -4225,7 +4356,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
map_ptr = env->insn_aux_data[i + delta].map_ptr;
if (map_ptr == BPF_MAP_PTR_POISON) {
- verbose("tail_call obusing map_ptr\n");
+ verbose(env, "tail_call obusing map_ptr\n");
return -EINVAL;
}
if (!map_ptr->unpriv_array)
@@ -4260,7 +4391,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
- verbose("bpf verifier is misconfigured\n");
+ verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@@ -4283,7 +4414,8 @@ patch_call_imm:
* programs to call them, must be real in-kernel functions
*/
if (!fn->func) {
- verbose("kernel subsystem misconfigured func %s#%d\n",
+ verbose(env,
+ "kernel subsystem misconfigured func %s#%d\n",
func_id_name(insn->imm), insn->imm);
return -EFAULT;
}
@@ -4317,8 +4449,8 @@ static void free_states(struct bpf_verifier_env *env)
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
- char __user *log_ubuf = NULL;
struct bpf_verifier_env *env;
+ struct bpf_verifer_log *log;
int ret = -EINVAL;
/* 'struct bpf_verifier_env' can be global, but since it's not small,
@@ -4327,6 +4459,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
+ log = &env->log;
env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
(*prog)->len);
@@ -4342,23 +4475,20 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
/* user requested verbose verifier output
* and supplied buffer to store the verification trace
*/
- log_level = attr->log_level;
- log_ubuf = (char __user *) (unsigned long) attr->log_buf;
- log_size = attr->log_size;
- log_len = 0;
+ log->level = attr->log_level;
+ log->ubuf = (char __user *) (unsigned long) attr->log_buf;
+ log->len_total = attr->log_size;
ret = -EINVAL;
- /* log_* values have to be sane */
- if (log_size < 128 || log_size > UINT_MAX >> 8 ||
- log_level == 0 || log_ubuf == NULL)
+ /* log attributes have to be sane */
+ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+ !log->level || !log->ubuf)
goto err_unlock;
ret = -ENOMEM;
- log_buf = vmalloc(log_size);
- if (!log_buf)
+ log->kbuf = vmalloc(log->len_total);
+ if (!log->kbuf)
goto err_unlock;
- } else {
- log_level = 0;
}
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
@@ -4389,21 +4519,25 @@ skip_full_check:
free_states(env);
if (ret == 0)
+ sanitize_dead_code(env);
+
+ if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
if (ret == 0)
ret = fixup_bpf_calls(env);
- if (log_level && log_len >= log_size - 1) {
- BUG_ON(log_len >= log_size);
+ if (log->level && bpf_verifier_log_full(log)) {
+ BUG_ON(log->len_used >= log->len_total);
/* verifier log exceeded user supplied buffer */
ret = -ENOSPC;
/* fall through to return what was recorded */
}
/* copy verifier log back to user space including trailing zero */
- if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ if (log->level && copy_to_user(log->ubuf, log->kbuf,
+ log->len_used + 1) != 0) {
ret = -EFAULT;
goto free_log_buf;
}
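
From user space the contract enforced here is unchanged: the loader supplies log_buf/log_size/log_level in bpf_attr, the buffer must be at least 128 bytes, and -ENOSPC signals a truncated but otherwise valid log. A minimal loader-side sketch (an assumed wrapper, not from this patch):

#include <linux/bpf.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static long toy_prog_load(union bpf_attr *attr, char *log, uint32_t log_size)
{
	attr->log_level = 1;
	attr->log_buf = (uint64_t)(uintptr_t)log;
	attr->log_size = log_size;	/* must be >= 128, see check above */

	/* -1 with errno == ENOSPC means truncated log, not invalid prog */
	return syscall(__NR_bpf, BPF_PROG_LOAD, attr, sizeof(*attr));
}
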
@@ -4430,8 +4564,8 @@ skip_full_check:
}
free_log_buf:
- if (log_level)
- vfree(log_buf);
+ if (log->level)
+ vfree(log->kbuf);
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_bpf_prog_info() will release them.
@@ -4468,8 +4602,6 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);
- log_level = 0;
-
env->strict_alignment = false;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
diff --git a/kernel/fork.c b/kernel/fork.c
index 3e9e598756fc..49dda670889e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -705,8 +705,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
goto out;
}
/* a new mm has just been created */
- arch_dup_mmap(oldmm, mm);
- retval = 0;
+ retval = arch_dup_mmap(oldmm, mm);
out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
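
The fork.c change lets an architecture's arch_dup_mmap() veto the fork. A hypothetical x86-flavoured hook illustrating why a return value is now needed; the ldt_dup_context naming is an assumption based on the mmu_context changes elsewhere in this merge, not shown in this hunk:

/* Sketch: duplicating per-mm LDT state can fail on allocation, and
 * dup_mmap() must now see that failure instead of ignoring it.
 */
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	return ldt_dup_context(oldmm, mm);	/* may return -ENOMEM */
}
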
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ed3a2cbdc93d..12be432001e6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -607,7 +607,8 @@ const struct bpf_verifier_ops tracepoint_prog_ops = {
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
- int sample_period_off;
+ const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data,
+ sample_period);
if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
return false;
@@ -616,43 +617,35 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
if (off % size != 0)
return false;
- /* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */
- sample_period_off = offsetof(struct bpf_perf_event_data, sample_period);
- if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) {
- int allowed;
-
-#ifdef __LITTLE_ENDIAN
- allowed = (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0;
-#else
- allowed = ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0;
-#endif
- if (!allowed)
+ switch (off) {
+ case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
+ bpf_ctx_record_field_size(info, size_sp);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_sp))
return false;
- info->ctx_field_size = 8;
- info->converted_op_size = 8;
- } else {
+ break;
+ default:
if (size != sizeof(long))
return false;
}
+
return true;
}
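
The bpf_ctx_narrow_access_ok() helper used above replaces the open-coded endianness checks that were deleted. A rough user-space sketch of the little-endian case, offered as an assumption (the real helper also handles big-endian offsets, as the removed #else branch did):

#include <stdbool.h>
#include <stdint.h>

static bool toy_narrow_access_ok_le(uint32_t off, uint32_t size,
				    uint32_t size_default)
{
	return size <= size_default &&		/* no wider than the field */
	       (size & (size - 1)) == 0 &&	/* power-of-two access */
	       (off & (size_default - 1)) == 0;	/* aligned to field start */
}
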
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
switch (si->off) {
case offsetof(struct bpf_perf_event_data, sample_period):
- BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
-
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
data), si->dst_reg, si->src_reg,
offsetof(struct bpf_perf_event_data_kern, data));
*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
- offsetof(struct perf_sample_data, period));
+ bpf_target_off(struct perf_sample_data, period, 8,
+ target_size));
break;
default:
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
diff --git a/net/core/filter.c b/net/core/filter.c
index e036ea912742..18fe4465c0ad 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2866,38 +2866,11 @@ lwt_xmit_func_proto(enum bpf_func_id func_id)
}
}
-static void __set_access_aux_info(int off, struct bpf_insn_access_aux *info)
+static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
{
- info->ctx_field_size = 4;
- switch (off) {
- case offsetof(struct __sk_buff, pkt_type) ...
- offsetof(struct __sk_buff, pkt_type) + sizeof(__u32) - 1:
- case offsetof(struct __sk_buff, vlan_present) ...
- offsetof(struct __sk_buff, vlan_present) + sizeof(__u32) - 1:
- info->converted_op_size = 1;
- break;
- case offsetof(struct __sk_buff, queue_mapping) ...
- offsetof(struct __sk_buff, queue_mapping) + sizeof(__u32) - 1:
- case offsetof(struct __sk_buff, protocol) ...
- offsetof(struct __sk_buff, protocol) + sizeof(__u32) - 1:
- case offsetof(struct __sk_buff, vlan_tci) ...
- offsetof(struct __sk_buff, vlan_tci) + sizeof(__u32) - 1:
- case offsetof(struct __sk_buff, vlan_proto) ...
- offsetof(struct __sk_buff, vlan_proto) + sizeof(__u32) - 1:
- case offsetof(struct __sk_buff, tc_index) ...
- offsetof(struct __sk_buff, tc_index) + sizeof(__u32) - 1:
- case offsetof(struct __sk_buff, tc_classid) ...
- offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1:
- info->converted_op_size = 2;
- break;
- default:
- info->converted_op_size = 4;
- }
-}
+ const int size_default = sizeof(__u32);
-static bool __is_valid_access(int off, int size, enum bpf_access_type type,
- struct bpf_insn_access_aux *info)
-{
if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
@@ -2906,40 +2879,24 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type,
return false;
switch (off) {
- case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
- if (off + size >
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32))
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ if (off + size > offsetofend(struct __sk_buff, cb[4]))
return false;
break;
- case offsetof(struct __sk_buff, data) ...
- offsetof(struct __sk_buff, data) + sizeof(__u32) - 1:
- if (size != sizeof(__u32))
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ if (size != size_default)
return false;
- info->reg_type = PTR_TO_PACKET;
- break;
- case offsetof(struct __sk_buff, data_end) ...
- offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1:
- if (size != sizeof(__u32))
- return false;
- info->reg_type = PTR_TO_PACKET_END;
break;
default:
+ /* Only narrow read access allowed for now. */
if (type == BPF_WRITE) {
- if (size != sizeof(__u32))
+ if (size != size_default)
return false;
} else {
- int allowed;
-
- /* permit narrower load for not cb/data/data_end fields */
-#ifdef __LITTLE_ENDIAN
- allowed = (off & 0x3) == 0 && size <= 4 && (size & (size - 1)) == 0;
-#else
- allowed = (off & 0x3) + size == 4 && size <= 4 && (size & (size - 1)) == 0;
-#endif
- if (!allowed)
+ bpf_ctx_record_field_size(info, size_default);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_default))
return false;
- __set_access_aux_info(off, info);
}
}
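
The bpf_ctx_range()/bpf_ctx_range_till() macros that replace the hand-written offsetof ranges are, roughly, case-range builders over whole fields. A sketch of their shape, as an assumption (the real definitions live in the include/linux/filter.h part of this merge; the "..." case-range syntax is the GCC extension the kernel already uses here):

#include <stddef.h>	/* offsetof */

#define toy_ctx_range(TYPE, MEMBER)					\
	offsetof(TYPE, MEMBER) ...					\
	offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER) - 1

#define toy_ctx_range_till(TYPE, FIRST, LAST)				\
	offsetof(TYPE, FIRST) ...					\
	offsetof(TYPE, LAST) + sizeof(((TYPE *)0)->LAST) - 1
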
@@ -2951,26 +2908,22 @@ static bool sk_filter_is_valid_access(int off, int size,
struct bpf_insn_access_aux *info)
{
switch (off) {
- case offsetof(struct __sk_buff, tc_classid) ...
- offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1:
- case offsetof(struct __sk_buff, data) ...
- offsetof(struct __sk_buff, data) + sizeof(__u32) - 1:
- case offsetof(struct __sk_buff, data_end) ...
- offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1:
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_end):
return false;
}
if (type == BPF_WRITE) {
switch (off) {
- case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
default:
return false;
}
}
- return __is_valid_access(off, size, type, info);
+ return bpf_skb_is_valid_access(off, size, type, info);
}
static bool lwt_is_valid_access(int off, int size,
@@ -2978,24 +2931,31 @@ static bool lwt_is_valid_access(int off, int size,
struct bpf_insn_access_aux *info)
{
switch (off) {
- case offsetof(struct __sk_buff, tc_classid) ...
- offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1:
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
return false;
}
if (type == BPF_WRITE) {
switch (off) {
- case offsetof(struct __sk_buff, mark):
- case offsetof(struct __sk_buff, priority):
- case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
default:
return false;
}
}
- return __is_valid_access(off, size, type, info);
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, info);
}
static bool sock_filter_is_valid_access(int off, int size,
@@ -3067,19 +3027,27 @@ static bool tc_cls_act_is_valid_access(int off, int size,
{
if (type == BPF_WRITE) {
switch (off) {
- case offsetof(struct __sk_buff, mark):
- case offsetof(struct __sk_buff, tc_index):
- case offsetof(struct __sk_buff, priority):
- case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
- case offsetof(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, tc_index):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
default:
return false;
}
}
- return __is_valid_access(off, size, type, info);
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, info);
}
static bool __is_valid_xdp_access(int off, int size)
@@ -3122,98 +3090,108 @@ EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
int off;
switch (si->off) {
case offsetof(struct __sk_buff, len):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
-
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, len));
+ bpf_target_off(struct sk_buff, len, 4,
+ target_size));
break;
case offsetof(struct __sk_buff, protocol):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
-
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, protocol));
+ bpf_target_off(struct sk_buff, protocol, 2,
+ target_size));
break;
case offsetof(struct __sk_buff, vlan_proto):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
-
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, vlan_proto));
+ bpf_target_off(struct sk_buff, vlan_proto, 2,
+ target_size));
break;
case offsetof(struct __sk_buff, priority):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
-
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, priority));
+ bpf_target_off(struct sk_buff, priority, 4,
+ target_size));
else
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, priority));
+ bpf_target_off(struct sk_buff, priority, 4,
+ target_size));
break;
case offsetof(struct __sk_buff, ingress_ifindex):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
-
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, skb_iif));
+ bpf_target_off(struct sk_buff, skb_iif, 4,
+ target_size));
break;
case offsetof(struct __sk_buff, ifindex):
- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
-
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
si->dst_reg, si->src_reg,
offsetof(struct sk_buff, dev));
*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
- offsetof(struct net_device, ifindex));
+ bpf_target_off(struct net_device, ifindex, 4,
+ target_size));
break;
case offsetof(struct __sk_buff, hash):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
-
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, hash));
+ bpf_target_off(struct sk_buff, hash, 4,
+ target_size));
break;
case offsetof(struct __sk_buff, mark):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
-
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, mark));
+ bpf_target_off(struct sk_buff, mark, 4,
+ target_size));
else
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, mark));
+ bpf_target_off(struct sk_buff, mark, 4,
+ target_size));
break;
case offsetof(struct __sk_buff, pkt_type):
- return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg,
- si->src_reg, insn);
+ *target_size = 1;
+ *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
+ PKT_TYPE_OFFSET());
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
+#endif
+ break;
case offsetof(struct __sk_buff, queue_mapping):
- return convert_skb_access(SKF_AD_QUEUE, si->dst_reg,
- si->src_reg, insn);
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, queue_mapping, 2,
+ target_size));
+ break;
case offsetof(struct __sk_buff, vlan_present):
- return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
- si->dst_reg, si->src_reg, insn);
-
case offsetof(struct __sk_buff, vlan_tci):
- return convert_skb_access(SKF_AD_VLAN_TAG,
- si->dst_reg, si->src_reg, insn);
+ BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff, vlan_tci, 2,
+ target_size));
+ if (si->off == offsetof(struct __sk_buff, vlan_tci)) {
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg,
+ ~VLAN_TAG_PRESENT);
+ } else {
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1);
+ }
+ break;
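
A user-space sketch of what the two inlined rewrites above compute from the same 16-bit vlan_tci word, assuming VLAN_TAG_PRESENT is bit 12 (0x1000), as the BUILD_BUG_ON asserts:

#include <stdint.h>

#define TOY_VLAN_TAG_PRESENT 0x1000

static uint32_t toy_vlan_tci(uint16_t tci)
{
	return tci & ~TOY_VLAN_TAG_PRESENT;	/* strip the present bit */
}

static uint32_t toy_vlan_present(uint16_t tci)
{
	return (tci >> 12) & 1;			/* isolate the present bit */
}
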
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+ offsetofend(struct __sk_buff, cb[4]) - 1:
BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
offsetof(struct qdisc_skb_cb, data)) %
@@ -3239,6 +3217,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
off -= offsetof(struct __sk_buff, tc_classid);
off += offsetof(struct sk_buff, cb);
off += offsetof(struct qdisc_skb_cb, tc_classid);
+ *target_size = 2;
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
si->src_reg, off);
@@ -3264,15 +3243,16 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
-
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, tc_index));
+ bpf_target_off(struct sk_buff, tc_index, 2,
+ target_size));
else
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, tc_index));
+ bpf_target_off(struct sk_buff, tc_index, 2,
+ target_size));
#else
+ *target_size = 2;
if (type == BPF_WRITE)
*insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
else
@@ -3282,13 +3262,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, napi_id):
#if defined(CONFIG_NET_RX_BUSY_POLL)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, napi_id) != 4);
-
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, napi_id));
+ bpf_target_off(struct sk_buff, napi_id, 4,
+ target_size));
*insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#else
+ *target_size = 4;
*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
break;
@@ -3300,7 +3280,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
@@ -3344,22 +3324,22 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
switch (si->off) {
case offsetof(struct __sk_buff, ifindex):
- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
-
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
si->dst_reg, si->src_reg,
offsetof(struct sk_buff, dev));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
- offsetof(struct net_device, ifindex));
+ bpf_target_off(struct net_device, ifindex, 4,
+ target_size));
break;
default:
- return bpf_convert_ctx_access(type, si, insn_buf, prog);
+ return bpf_convert_ctx_access(type, si, insn_buf, prog,
+ target_size);
}
return insn - insn_buf;
@@ -3368,7 +3348,7 @@ static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
static u32 xdp_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c
index 45ef5915462c..16580a82e1c8 100644
--- a/sound/core/seq/seq_clientmgr.c
+++ b/sound/core/seq/seq_clientmgr.c
@@ -221,6 +221,7 @@ static struct snd_seq_client *seq_create_client1(int client_index, int poolsize)
rwlock_init(&client->ports_lock);
mutex_init(&client->ports_mutex);
INIT_LIST_HEAD(&client->ports_list_head);
+ mutex_init(&client->ioctl_mutex);
/* find free slot in the client table */
spin_lock_irqsave(&clients_lock, flags);
@@ -2127,7 +2128,9 @@ static long snd_seq_ioctl(struct file *file, unsigned int cmd,
return -EFAULT;
}
+ mutex_lock(&client->ioctl_mutex);
err = handler->func(client, &buf);
+ mutex_unlock(&client->ioctl_mutex);
if (err >= 0) {
/* Some commands include a bug in the 'dir' field. */
if (handler->cmd == SNDRV_SEQ_IOCTL_SET_QUEUE_CLIENT ||
diff --git a/sound/core/seq/seq_clientmgr.h b/sound/core/seq/seq_clientmgr.h
index c6614254ef8a..0611e1e0ed5b 100644
--- a/sound/core/seq/seq_clientmgr.h
+++ b/sound/core/seq/seq_clientmgr.h
@@ -61,6 +61,7 @@ struct snd_seq_client {
struct list_head ports_list_head;
rwlock_t ports_lock;
struct mutex ports_mutex;
+ struct mutex ioctl_mutex;
int convert32; /* convert 32->64bit */
/* output pool */
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index b9a22f18566a..064ed289fced 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -574,13 +574,10 @@ static void do_multicpu_tests(void)
static int finish_exec_test(void)
{
/*
- * In a sensible world, this would be check_invalid_segment(0, 1);
- * For better or for worse, though, the LDT is inherited across exec.
- * We can probably change this safely, but for now we test it.
+ * Older kernel versions did inherit the LDT on exec(), which is
+ * wrong because exec() starts from a clean state.
*/
- check_valid_segment(0, 1,
- AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB,
- 42, true);
+ check_invalid_segment(0, 1);
return nerrs ? 1 : 0;
}