Home Home > GIT Browse
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohannes Thumshirn <jthumshirn@suse.de>2018-09-14 15:33:21 +0200
committerJohannes Thumshirn <jthumshirn@suse.de>2018-09-14 15:45:20 +0200
commitf5b5c89346353601d60584d4081450902e2eeac9 (patch)
tree2845c88f5b0907702cd882fe15ec6dc4204ec56e
parent0b3d56e94e961ab38aa829fd38199a7e80124017 (diff)
parentc65af74e155789d436ba36f2c1319f6ddfd30afd (diff)
Merge remote-tracking branch 'origin/SLE15' into SLE12-SP4
Conflicts: patches.arch/0001-x86-speculation-l1tf-Increase-l1tf-memory-limit-for-.patch patches.kabi/0001-x86-kabi-speculation-l1tf-Increase-l1tf-memory-limit-for-.patch series.conf suse-commit: 6fa8cb9fff7b9fdfa5278e27ca73eadcfedd5baf
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu-hash.h3
-rw-r--r--arch/powerpc/kernel/mce_power.c26
-rw-r--r--arch/powerpc/lib/code-patching.c18
-rw-r--r--arch/powerpc/mm/slb.c41
-rw-r--r--arch/powerpc/platforms/pseries/ras.c4
-rw-r--r--arch/powerpc/platforms/pseries/setup.c25
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/events/intel/ds.c8
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/mm/kasan_init_64.c153
-rw-r--r--arch/x86/xen/efi.c45
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_display.c7
-rw-r--r--drivers/net/ethernet/ibm/ibmvnic.c12
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_ethtool.c2
-rw-r--r--drivers/net/ethernet/ti/davinci_cpdma.c2
-rw-r--r--drivers/pci/quirks.c6
-rw-r--r--drivers/vfio/pci/vfio_pci_config.c31
-rw-r--r--drivers/vfio/vfio_iommu_type1.c73
-rw-r--r--drivers/vhost/scsi.c5
-rw-r--r--drivers/vhost/vhost.c8
-rw-r--r--drivers/xen/cpu_hotplug.c2
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/dev-replace.c7
-rw-r--r--fs/btrfs/disk-io.c160
-rw-r--r--fs/btrfs/extent-tree.c86
-rw-r--r--fs/btrfs/tree-checker.c699
-rw-r--r--fs/btrfs/tree-checker.h39
-rw-r--r--fs/btrfs/volumes.c186
-rw-r--r--fs/btrfs/volumes.h4
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c5
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c24
-rw-r--r--fs/xfs/xfs_icache.c73
-rw-r--r--include/uapi/linux/btrfs_tree.h1
-rw-r--r--kernel/sched/fair.c194
-rw-r--r--kernel/tracepoint.c4
-rw-r--r--mm/memory_hotplug.c3
-rw-r--r--mm/page_alloc.c4
-rw-r--r--net/irda/af_irda.c13
38 files changed, 1547 insertions, 433 deletions
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 3382e4560c07..21498155ff69 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -487,6 +487,9 @@ extern void hpte_init_native(void);
extern void slb_initialize(void);
extern void slb_flush_and_rebolt(void);
+void slb_flush_all_realmode(void);
+void __slb_restore_bolted_realmode(void);
+void slb_restore_bolted_realmode(void);
extern void slb_vmalloc_update(void);
extern void slb_set_size(u16 size);
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index e2f834d0cb0d..10be4c8d3682 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -32,11 +32,8 @@
#ifdef CONFIG_PPC_BOOK3S_64
static void flush_and_reload_slb(void)
{
- struct slb_shadow *slb;
- unsigned long i, n;
-
/* Invalidate all SLBs */
- asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+ slb_flush_all_realmode();
#ifdef CONFIG_KVM_BOOK3S_HANDLER
/*
@@ -46,22 +43,17 @@ static void flush_and_reload_slb(void)
if (get_paca()->kvm_hstate.in_guest)
return;
#endif
-
- /* For host kernel, reload the SLBs from shadow SLB buffer. */
- slb = get_slb_shadow();
- if (!slb)
+ if (early_radix_enabled())
return;
- n = min_t(u32, be32_to_cpu(slb->persistent), SLB_MIN_SIZE);
-
- /* Load up the SLB entries from shadow SLB */
- for (i = 0; i < n; i++) {
- unsigned long rb = be64_to_cpu(slb->save_area[i].esid);
- unsigned long rs = be64_to_cpu(slb->save_area[i].vsid);
+ /*
+ * This probably shouldn't happen, but it may be possible it's
+ * called in early boot before SLB shadows are allocated.
+ */
+ if (!get_slb_shadow())
+ return;
- rb = (rb & ~0xFFFul) | i;
- asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
- }
+ slb_restore_bolted_realmode();
}
#endif
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 3dfe98786ab6..99ee07a05a2f 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -13,15 +13,33 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <asm/page.h>
+#include <asm/sections.h>
#include <asm/code-patching.h>
#include <linux/uaccess.h>
#include <linux/kprobes.h>
+static inline bool in_init_section(unsigned int *patch_addr)
+{
+ if (patch_addr < (unsigned int *)__init_begin)
+ return false;
+ if (patch_addr >= (unsigned int *)__init_end)
+ return false;
+ return true;
+}
+
+static inline bool init_freed(void)
+{
+ return (system_state >= SYSTEM_RUNNING);
+}
int patch_instruction(unsigned int *addr, unsigned int instr)
{
int err;
+ /* Make sure we aren't patching a freed init section */
+ if (in_init_section(addr) && init_freed())
+ return 0;
+
__put_user_size(instr, addr, 4, err);
if (err)
return err;
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 66d8a4d2a14d..532382b2149f 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -77,7 +77,7 @@ static inline void slb_shadow_update(unsigned long ea, int ssize,
static inline void slb_shadow_clear(enum slb_index index)
{
- WRITE_ONCE(get_slb_shadow()->save_area[index].esid, 0);
+ WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
}
static inline void create_shadowed_slbe(unsigned long ea, int ssize,
@@ -97,6 +97,45 @@ static inline void create_shadowed_slbe(unsigned long ea, int ssize,
: "memory" );
}
+/*
+ * Insert bolted entries into SLB (which may not be empty, so don't clear
+ * slb_cache_ptr).
+ */
+void __slb_restore_bolted_realmode(void)
+{
+ struct slb_shadow *p = get_slb_shadow();
+ enum slb_index index;
+
+ /* No isync needed because realmode. */
+ for (index = 0; index < SLB_NUM_BOLTED; index++) {
+ asm volatile("slbmte %0,%1" :
+ : "r" (be64_to_cpu(p->save_area[index].vsid)),
+ "r" (be64_to_cpu(p->save_area[index].esid)));
+ }
+}
+
+/*
+ * Insert the bolted entries into an empty SLB.
+ * This is not the same as rebolt because the bolted segments are not
+ * changed, just loaded from the shadow area.
+ */
+void slb_restore_bolted_realmode(void)
+{
+ __slb_restore_bolted_realmode();
+ get_paca()->slb_cache_ptr = 0;
+}
+
+/*
+ * This flushes all SLB entries including 0, so it must be realmode.
+ */
+void slb_flush_all_realmode(void)
+{
+ /*
+ * This flushes all SLB entries including 0, so it must be realmode.
+ */
+ asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+}
+
static void __slb_flush_and_rebolt(void)
{
/* If you change this make sure you change SLB_NUM_BOLTED
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 950400979aad..b7cb59425430 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -360,7 +360,7 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
}
savep = __va(regs->gpr[3]);
- regs->gpr[3] = savep[0]; /* restore original r3 */
+ regs->gpr[3] = be64_to_cpu(savep[0]); /* restore original r3 */
/* If it isn't an extended log we can use the per cpu 64bit buffer */
h = (struct rtas_error_log *)&savep[1];
@@ -371,7 +371,7 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
int len, error_log_length;
error_log_length = 8 + rtas_error_extended_log_length(h);
- len = max_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
+ len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
memcpy(global_mce_data_buf, h, len);
errhdr = (struct rtas_error_log *)global_mce_data_buf;
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 16d26cde2beb..c295699217a6 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -646,6 +646,15 @@ void of_pci_parse_iov_addrs(struct pci_dev *dev, const int *indexes)
}
}
+static void pseries_disable_sriov_resources(struct pci_dev *pdev)
+{
+ int i;
+
+ dev_warn(&pdev->dev, "No hypervisor support for SR-IOV on this device, IOV BARs disabled.\n");
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
+ pdev->resource[i + PCI_IOV_RESOURCES].flags = 0;
+}
+
static void pseries_pci_fixup_resources(struct pci_dev *pdev)
{
const int *indexes;
@@ -653,10 +662,10 @@ static void pseries_pci_fixup_resources(struct pci_dev *pdev)
/*Firmware must support open sriov otherwise dont configure*/
indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
- if (!indexes)
- return;
- /* Assign the addresses from device tree*/
- of_pci_set_vf_bar_size(pdev, indexes);
+ if (indexes)
+ of_pci_set_vf_bar_size(pdev, indexes);
+ else
+ pseries_disable_sriov_resources(pdev);
}
static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
@@ -668,10 +677,10 @@ static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
return;
/*Firmware must support open sriov otherwise dont configure*/
indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
- if (!indexes)
- return;
- /* Assign the addresses from device tree*/
- of_pci_parse_iov_addrs(pdev, indexes);
+ if (indexes)
+ of_pci_parse_iov_addrs(pdev, indexes);
+ else
+ pseries_disable_sriov_resources(pdev);
}
static resource_size_t pseries_pci_iov_resource_alignment(struct pci_dev *pdev,
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 722813b7c725..a6d010ab2fb2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -101,7 +101,7 @@ config X86
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
select HAVE_ARCH_JUMP_LABEL
- select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
+ select HAVE_ARCH_KASAN if X86_64
select HAVE_ARCH_KGDB
select HAVE_ARCH_KMEMCHECK
select HAVE_ARCH_MMAP_RND_BITS if MMU
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 2824735ea724..1c0d462c7967 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -412,9 +412,11 @@ static int alloc_bts_buffer(int cpu)
ds->bts_buffer_base = (unsigned long) cea;
ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
ds->bts_index = ds->bts_buffer_base;
- max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
- ds->bts_absolute_maximum = ds->bts_buffer_base + max;
- ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
+ max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+ ds->bts_absolute_maximum = ds->bts_buffer_base +
+ max * BTS_RECORD_SIZE;
+ ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+ (max / 16) * BTS_RECORD_SIZE;
return 0;
}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 2da40dbf6ac3..f7c7bc91afd2 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -149,6 +149,9 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_HYPERVISOR))
return false;
+ if (c->x86 != 6)
+ return false;
+
for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
if (c->x86_model == spectre_bad_microcodes[i].model &&
c->x86_mask == spectre_bad_microcodes[i].stepping)
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index b9c879ed4339..30be90a289da 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -3,12 +3,14 @@
#include <linux/bootmem.h>
#include <linux/kasan.h>
#include <linux/kdebug.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/vmalloc.h>
#include <asm/e820/types.h>
+#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
@@ -16,7 +18,134 @@
extern struct range pfn_mapped[E820_MAX_ENTRIES];
-static int __init map_range(struct range *range)
+static __init void *early_alloc(size_t size, int nid)
+{
+ return memblock_virt_alloc_try_nid_nopanic(size, size,
+ __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
+}
+
+static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
+ unsigned long end, int nid)
+{
+ pte_t *pte;
+
+ if (pmd_none(*pmd)) {
+ void *p;
+
+ if (boot_cpu_has(X86_FEATURE_PSE) &&
+ ((end - addr) == PMD_SIZE) &&
+ IS_ALIGNED(addr, PMD_SIZE)) {
+ p = early_alloc(PMD_SIZE, nid);
+ if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
+ return;
+ else if (p)
+ memblock_free(__pa(p), PMD_SIZE);
+ }
+
+ p = early_alloc(PAGE_SIZE, nid);
+ pmd_populate_kernel(&init_mm, pmd, p);
+ }
+
+ pte = pte_offset_kernel(pmd, addr);
+ do {
+ pte_t entry;
+ void *p;
+
+ if (!pte_none(*pte))
+ continue;
+
+ p = early_alloc(PAGE_SIZE, nid);
+ entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
+ unsigned long end, int nid)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ if (pud_none(*pud)) {
+ void *p;
+
+ if (boot_cpu_has(X86_FEATURE_GBPAGES) &&
+ ((end - addr) == PUD_SIZE) &&
+ IS_ALIGNED(addr, PUD_SIZE)) {
+ p = early_alloc(PUD_SIZE, nid);
+ if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
+ return;
+ else if (p)
+ memblock_free(__pa(p), PUD_SIZE);
+ }
+
+ p = early_alloc(PAGE_SIZE, nid);
+ pud_populate(&init_mm, pud, p);
+ }
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (!pmd_large(*pmd))
+ kasan_populate_pmd(pmd, addr, next, nid);
+ } while (pmd++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
+ unsigned long end, int nid)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ if (p4d_none(*p4d)) {
+ void *p = early_alloc(PAGE_SIZE, nid);
+
+ p4d_populate(&init_mm, p4d, p);
+ }
+
+ pud = pud_offset(p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (!pud_large(*pud))
+ kasan_populate_pud(pud, addr, next, nid);
+ } while (pud++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr,
+ unsigned long end, int nid)
+{
+ void *p;
+ p4d_t *p4d;
+ unsigned long next;
+
+ if (pgd_none(*pgd)) {
+ p = early_alloc(PAGE_SIZE, nid);
+ pgd_populate(&init_mm, pgd, p);
+ }
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ kasan_populate_p4d(p4d, addr, next, nid);
+ } while (p4d++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_shadow(unsigned long addr, unsigned long end,
+ int nid)
+{
+ pgd_t *pgd;
+ unsigned long next;
+
+ addr = addr & PAGE_MASK;
+ end = round_up(end, PAGE_SIZE);
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ kasan_populate_pgd(pgd, addr, next, nid);
+ } while (pgd++, addr = next, addr != end);
+}
+
+static void __init map_range(struct range *range)
{
unsigned long start;
unsigned long end;
@@ -29,7 +158,7 @@ static int __init map_range(struct range *range)
* to slightly speed up fastpath. In some rare cases we could cross
* boundary of mapped shadow, so we just map some more here.
*/
- return vmemmap_populate(start, end + 1, NUMA_NO_NODE);
+ kasan_populate_shadow(start, end + 1, early_pfn_to_nid(range->start));
}
static void __init clear_pgds(unsigned long start,
@@ -117,6 +246,7 @@ void __init kasan_early_init(void)
void __init kasan_init(void)
{
int i;
+ void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
#ifdef CONFIG_KASAN_INLINE
register_die_notifier(&kasan_die_notifier);
@@ -135,13 +265,27 @@ void __init kasan_init(void)
if (pfn_mapped[i].end == 0)
break;
- if (map_range(&pfn_mapped[i]))
- panic("kasan: unable to allocate shadow!");
+ map_range(&pfn_mapped[i]);
}
+
+ shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
+ shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
+ shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
+ PAGE_SIZE);
+
+ shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
+ CPU_ENTRY_AREA_MAP_SIZE);
+ shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
+ shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
+ PAGE_SIZE);
+
kasan_populate_zero_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
shadow_cpu_entry_begin);
+ kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
+ (unsigned long)shadow_cpu_entry_end, 0);
+
kasan_populate_zero_shadow(shadow_cpu_entry_end,
kasan_mem_to_shadow((void *)__START_KERNEL_map));
@@ -151,6 +295,7 @@ void __init kasan_init(void)
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
+
load_cr3(init_top_pgt);
__flush_tlb_all();
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index 30bb2e80cfe7..a18703be9ead 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -54,38 +54,6 @@ static efi_system_table_t efi_systab_xen __initdata = {
.tables = EFI_INVALID_TABLE_ADDR /* Initialized later. */
};
-static const struct efi efi_xen __initconst = {
- .systab = NULL, /* Initialized later. */
- .runtime_version = 0, /* Initialized later. */
- .mps = EFI_INVALID_TABLE_ADDR,
- .acpi = EFI_INVALID_TABLE_ADDR,
- .acpi20 = EFI_INVALID_TABLE_ADDR,
- .smbios = EFI_INVALID_TABLE_ADDR,
- .smbios3 = EFI_INVALID_TABLE_ADDR,
- .sal_systab = EFI_INVALID_TABLE_ADDR,
- .boot_info = EFI_INVALID_TABLE_ADDR,
- .hcdp = EFI_INVALID_TABLE_ADDR,
- .uga = EFI_INVALID_TABLE_ADDR,
- .uv_systab = EFI_INVALID_TABLE_ADDR,
- .fw_vendor = EFI_INVALID_TABLE_ADDR,
- .runtime = EFI_INVALID_TABLE_ADDR,
- .config_table = EFI_INVALID_TABLE_ADDR,
- .get_time = xen_efi_get_time,
- .set_time = xen_efi_set_time,
- .get_wakeup_time = xen_efi_get_wakeup_time,
- .set_wakeup_time = xen_efi_set_wakeup_time,
- .get_variable = xen_efi_get_variable,
- .get_next_variable = xen_efi_get_next_variable,
- .set_variable = xen_efi_set_variable,
- .query_variable_info = xen_efi_query_variable_info,
- .update_capsule = xen_efi_update_capsule,
- .query_capsule_caps = xen_efi_query_capsule_caps,
- .get_next_high_mono_count = xen_efi_get_next_high_mono_count,
- .reset_system = xen_efi_reset_system,
- .set_virtual_address_map = NULL, /* Not used under Xen. */
- .flags = 0 /* Initialized later. */
-};
-
static efi_system_table_t __init *xen_efi_probe(void)
{
struct xen_platform_op op = {
@@ -102,7 +70,18 @@ static efi_system_table_t __init *xen_efi_probe(void)
/* Here we know that Xen runs on EFI platform. */
- efi = efi_xen;
+ efi.get_time = xen_efi_get_time;
+ efi.set_time = xen_efi_set_time;
+ efi.get_wakeup_time = xen_efi_get_wakeup_time;
+ efi.set_wakeup_time = xen_efi_set_wakeup_time;
+ efi.get_variable = xen_efi_get_variable;
+ efi.get_next_variable = xen_efi_get_next_variable;
+ efi.set_variable = xen_efi_set_variable;
+ efi.query_variable_info = xen_efi_query_variable_info;
+ efi.update_capsule = xen_efi_update_capsule;
+ efi.query_capsule_caps = xen_efi_query_capsule_caps;
+ efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count;
+ efi.reset_system = xen_efi_reset_system;
efi_systab_xen.tables = info->cfg.addr;
efi_systab_xen.nr_tables = info->cfg.nent;
diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c
index caf53503c0f7..787495c33e84 100644
--- a/drivers/gpu/drm/nouveau/nouveau_display.c
+++ b/drivers/gpu/drm/nouveau/nouveau_display.c
@@ -356,8 +356,6 @@ nouveau_display_hpd_work(struct work_struct *work)
pm_runtime_get_sync(drm->dev->dev);
drm_helper_hpd_irq_event(drm->dev);
- /* enable polling for external displays */
- drm_kms_helper_poll_enable(drm->dev);
pm_runtime_mark_last_busy(drm->dev->dev);
pm_runtime_put_sync(drm->dev->dev);
@@ -412,6 +410,11 @@ nouveau_display_init(struct drm_device *dev)
if (ret)
return ret;
+ /* enable connector detection and polling for connectors without HPD
+ * support
+ */
+ drm_kms_helper_poll_enable(dev);
+
/* enable hotplug interrupts */
drm_connector_list_iter_begin(dev, &conn_iter);
nouveau_for_each_non_mst_connector_iter(connector, &conn_iter) {
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 401d0060d1e1..3146329392af 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1823,11 +1823,17 @@ static int do_reset(struct ibmvnic_adapter *adapter,
adapter->map_id = 1;
release_rx_pools(adapter);
release_tx_pools(adapter);
- init_rx_pools(netdev);
- init_tx_pools(netdev);
+ rc = init_rx_pools(netdev);
+ if (rc)
+ return rc;
+ rc = init_tx_pools(netdev);
+ if (rc)
+ return rc;
release_napi(adapter);
- init_napi(adapter);
+ rc = init_napi(adapter);
+ if (rc)
+ return rc;
} else {
rc = reset_tx_pools(adapter);
if (rc)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 6f72223d086e..26e657be1c0a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1923,7 +1923,7 @@ static void i40e_get_stat_strings(struct net_device *netdev, u8 *data)
data += ETH_GSTRING_LEN;
}
- WARN_ONCE(p - data != i40e_get_stats_count(netdev) * ETH_GSTRING_LEN,
+ WARN_ONCE(data - p != i40e_get_stats_count(netdev) * ETH_GSTRING_LEN,
"stat strings count mismatch!");
}
diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
index 7025f86a6732..7b710bc4ccba 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -191,7 +191,7 @@ static void cpdma_desc_pool_destroy(struct cpdma_ctlr *ctlr)
return;
WARN(gen_pool_size(pool->gen_pool) != gen_pool_avail(pool->gen_pool),
- "cpdma_desc_pool size %d != avail %d",
+ "cpdma_desc_pool size %zd != avail %zd",
gen_pool_size(pool->gen_pool),
gen_pool_avail(pool->gen_pool));
if (pool->cpumap)
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index b26aceae5ad8..9de8119eb2e8 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -4403,11 +4403,6 @@ static int pci_quirk_qcom_rp_acs(struct pci_dev *dev, u16 acs_flags)
*
* 0x9d10-0x9d1b PCI Express Root port #{1-12}
*
- * The 300 series chipset suffers from the same bug so include those root
- * ports here as well.
- *
- * 0xa32c-0xa343 PCI Express Root port #{0-24}
- *
* [1] http://www.intel.com/content/www/us/en/chipsets/100-series-chipset-datasheet-vol-2.html
* [2] http://www.intel.com/content/www/us/en/chipsets/100-series-chipset-datasheet-vol-1.html
* [3] http://www.intel.com/content/www/us/en/chipsets/100-series-chipset-spec-update.html
@@ -4425,7 +4420,6 @@ static bool pci_quirk_intel_spt_pch_acs_match(struct pci_dev *dev)
case 0xa110 ... 0xa11f: case 0xa167 ... 0xa16a: /* Sunrise Point */
case 0xa290 ... 0xa29f: case 0xa2e7 ... 0xa2ee: /* Union Point */
case 0x9d10 ... 0x9d1b: /* 7th & 8th Gen Mobile */
- case 0xa32c ... 0xa343: /* 300 series */
return true;
}
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 330a57024cbc..a5754ad61550 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -808,6 +808,7 @@ static int vfio_exp_config_write(struct vfio_pci_device *vdev, int pos,
{
__le16 *ctrl = (__le16 *)(vdev->vconfig + pos -
offset + PCI_EXP_DEVCTL);
+ int readrq = le16_to_cpu(*ctrl) & PCI_EXP_DEVCTL_READRQ;
count = vfio_default_config_write(vdev, pos, count, perm, offset, val);
if (count < 0)
@@ -833,6 +834,27 @@ static int vfio_exp_config_write(struct vfio_pci_device *vdev, int pos,
pci_try_reset_function(vdev->pdev);
}
+ /*
+ * MPS is virtualized to the user, writes do not change the physical
+ * register since determining a proper MPS value requires a system wide
+ * device view. The MRRS is largely independent of MPS, but since the
+ * user does not have that system-wide view, they might set a safe, but
+ * inefficiently low value. Here we allow writes through to hardware,
+ * but we set the floor to the physical device MPS setting, so that
+ * we can at least use full TLPs, as defined by the MPS value.
+ *
+ * NB, if any devices actually depend on an artificially low MRRS
+ * setting, this will need to be revisited, perhaps with a quirk
+ * though pcie_set_readrq().
+ */
+ if (readrq != (le16_to_cpu(*ctrl) & PCI_EXP_DEVCTL_READRQ)) {
+ readrq = 128 <<
+ ((le16_to_cpu(*ctrl) & PCI_EXP_DEVCTL_READRQ) >> 12);
+ readrq = max(readrq, pcie_get_mps(vdev->pdev));
+
+ pcie_set_readrq(vdev->pdev, readrq);
+ }
+
return count;
}
@@ -849,11 +871,14 @@ static int __init init_pci_cap_exp_perm(struct perm_bits *perm)
/*
* Allow writes to device control fields, except devctl_phantom,
- * which could confuse IOMMU, and the ARI bit in devctl2, which
- * is set at probe time. FLR gets virtualized via our writefn.
+ * which could confuse IOMMU, MPS, which can break communication
+ * with other physical devices, and the ARI bit in devctl2, which
+ * is set at probe time. FLR and MRRS get virtualized via our
+ * writefn.
*/
p_setw(perm, PCI_EXP_DEVCTL,
- PCI_EXP_DEVCTL_BCR_FLR, ~PCI_EXP_DEVCTL_PHANTOM);
+ PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_PAYLOAD |
+ PCI_EXP_DEVCTL_READRQ, ~PCI_EXP_DEVCTL_PHANTOM);
p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI);
return 0;
}
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 32e5c237814b..f67704a6f34b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -83,6 +83,7 @@ struct vfio_dma {
size_t size; /* Map size (bytes) */
int prot; /* IOMMU_READ/WRITE */
bool iommu_mapped;
+ bool lock_cap; /* capable(CAP_IPC_LOCK) */
struct task_struct *task;
struct rb_root pfn_list; /* Ex-user pinned pfn list */
};
@@ -246,29 +247,25 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
return ret;
}
-static int vfio_lock_acct(struct task_struct *task, long npage, bool *lock_cap)
+static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
{
struct mm_struct *mm;
- bool is_current;
int ret;
if (!npage)
return 0;
- is_current = (task->mm == current->mm);
-
- mm = is_current ? task->mm : get_task_mm(task);
+ mm = async ? get_task_mm(dma->task) : dma->task->mm;
if (!mm)
return -ESRCH; /* process exited */
ret = down_write_killable(&mm->mmap_sem);
if (!ret) {
if (npage > 0) {
- if (lock_cap ? !*lock_cap :
- !has_capability(task, CAP_IPC_LOCK)) {
+ if (!dma->lock_cap) {
unsigned long limit;
- limit = task_rlimit(task,
+ limit = task_rlimit(dma->task,
RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (mm->locked_vm + npage > limit)
@@ -282,7 +279,7 @@ static int vfio_lock_acct(struct task_struct *task, long npage, bool *lock_cap)
up_write(&mm->mmap_sem);
}
- if (!is_current)
+ if (async)
mmput(mm);
return ret;
@@ -391,7 +388,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
*/
static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
long npage, unsigned long *pfn_base,
- bool lock_cap, unsigned long limit)
+ unsigned long limit)
{
unsigned long pfn = 0;
long ret, pinned = 0, lock_acct = 0;
@@ -414,7 +411,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
* pages are already counted against the user.
*/
if (!rsvd && !vfio_find_vpfn(dma, iova)) {
- if (!lock_cap && current->mm->locked_vm + 1 > limit) {
+ if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
put_pfn(*pfn_base, dma->prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
limit << PAGE_SHIFT);
@@ -440,7 +437,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
}
if (!rsvd && !vfio_find_vpfn(dma, iova)) {
- if (!lock_cap &&
+ if (!dma->lock_cap &&
current->mm->locked_vm + lock_acct + 1 > limit) {
put_pfn(pfn, dma->prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
@@ -453,7 +450,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
}
out:
- ret = vfio_lock_acct(current, lock_acct, &lock_cap);
+ ret = vfio_lock_acct(dma, lock_acct, false);
unpin_out:
if (ret) {
@@ -484,7 +481,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
}
if (do_accounting)
- vfio_lock_acct(dma->task, locked - unlocked, NULL);
+ vfio_lock_acct(dma, locked - unlocked, true);
return unlocked;
}
@@ -501,7 +498,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
- ret = vfio_lock_acct(dma->task, 1, NULL);
+ ret = vfio_lock_acct(dma, 1, true);
if (ret) {
put_pfn(*pfn_base, dma->prot);
if (ret == -ENOMEM)
@@ -528,7 +525,7 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
if (do_accounting)
- vfio_lock_acct(dma->task, -unlocked, NULL);
+ vfio_lock_acct(dma, -unlocked, true);
return unlocked;
}
@@ -723,7 +720,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
dma->iommu_mapped = false;
if (do_accounting) {
- vfio_lock_acct(dma->task, -unlocked, NULL);
+ vfio_lock_acct(dma, -unlocked, true);
return 0;
}
return unlocked;
@@ -935,14 +932,12 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
size_t size = map_size;
long npage;
unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- bool lock_cap = capable(CAP_IPC_LOCK);
int ret = 0;
while (size) {
/* Pin a contiguous chunk of memory */
npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
- size >> PAGE_SHIFT, &pfn,
- lock_cap, limit);
+ size >> PAGE_SHIFT, &pfn, limit);
if (npage <= 0) {
WARN_ON(!npage);
ret = (int)npage;
@@ -1017,8 +1012,36 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
dma->iova = iova;
dma->vaddr = vaddr;
dma->prot = prot;
- get_task_struct(current);
- dma->task = current;
+
+ /*
+ * We need to be able to both add to a task's locked memory and test
+ * against the locked memory limit and we need to be able to do both
+ * outside of this call path as pinning can be asynchronous via the
+ * external interfaces for mdev devices. RLIMIT_MEMLOCK requires a
+ * task_struct and VM locked pages requires an mm_struct, however
+ * holding an indefinite mm reference is not recommended, therefore we
+ * only hold a reference to a task. We could hold a reference to
+ * current, however QEMU uses this call path through vCPU threads,
+ * which can be killed resulting in a NULL mm and failure in the unmap
+ * path when called via a different thread. Avoid this problem by
+ * using the group_leader as threads within the same group require
+ * both CLONE_THREAD and CLONE_VM and will therefore use the same
+ * mm_struct.
+ *
+ * Previously we also used the task for testing CAP_IPC_LOCK at the
+ * time of pinning and accounting, however has_capability() makes use
+ * of real_cred, a copy-on-write field, so we can't guarantee that it
+ * matches group_leader, or in fact that it might not change by the
+ * time it's evaluated. If a process were to call MAP_DMA with
+ * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
+ * possibly see different results for an iommu_mapped vfio_dma vs
+ * externally mapped. Therefore track CAP_IPC_LOCK in vfio_dma at the
+ * time of calling MAP_DMA.
+ */
+ get_task_struct(current->group_leader);
+ dma->task = current->group_leader;
+ dma->lock_cap = capable(CAP_IPC_LOCK);
+
dma->pfn_list = RB_ROOT;
/* Insert zero-sized and grow as we map chunks of it */
@@ -1053,7 +1076,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
struct vfio_domain *d;
struct rb_node *n;
unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- bool lock_cap = capable(CAP_IPC_LOCK);
int ret;
/* Arbitrarily pick the first domain in the list for lookups */
@@ -1100,8 +1122,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
npage = vfio_pin_pages_remote(dma, vaddr,
n >> PAGE_SHIFT,
- &pfn, lock_cap,
- limit);
+ &pfn, limit);
if (npage <= 0) {
WARN_ON(!npage);
ret = (int)npage;
@@ -1370,7 +1391,7 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
if (!is_invalid_reserved_pfn(vpfn->pfn))
locked++;
}
- vfio_lock_acct(dma->task, locked - unlocked, NULL);
+ vfio_lock_acct(dma, locked - unlocked, true);
}
}
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 3a97d1d69770..d5507e10161b 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -693,6 +693,7 @@ vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write,
struct scatterlist *sg, int sg_count)
{
size_t off = iter->iov_offset;
+ struct scatterlist *p = sg;
int i, ret;
for (i = 0; i < iter->nr_segs; i++) {
@@ -701,8 +702,8 @@ vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write,
ret = vhost_scsi_map_to_sgl(cmd, base, len, sg, write);
if (ret < 0) {
- for (i = 0; i < sg_count; i++) {
- struct page *page = sg_page(&sg[i]);
+ while (p < sg) {
+ struct page *page = sg_page(p++);
if (page)
put_page(page);
}
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c2033de13473..c74d98fc68a1 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -904,7 +904,7 @@ static void vhost_dev_lock_vqs(struct vhost_dev *d)
{
int i = 0;
for (i = 0; i < d->nvqs; ++i)
- mutex_lock(&d->vqs[i]->mutex);
+ mutex_lock_nested(&d->vqs[i]->mutex, i);
}
static void vhost_dev_unlock_vqs(struct vhost_dev *d)
@@ -961,7 +961,7 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
list_for_each_entry_safe(node, n, &d->pending_list, node) {
struct vhost_iotlb_msg *vq_msg = &node->msg.iotlb;
if (msg->iova <= vq_msg->iova &&
- msg->iova + msg->size - 1 > vq_msg->iova &&
+ msg->iova + msg->size - 1 >= vq_msg->iova &&
vq_msg->type == VHOST_IOTLB_MISS) {
vhost_poll_queue(&node->vq->poll);
list_del(&node->node);
@@ -1016,6 +1016,10 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
vhost_iotlb_notify_vq(dev, msg);
break;
case VHOST_IOTLB_INVALIDATE:
+ if (!dev->iotlb) {
+ ret = -EFAULT;
+ break;
+ }
vhost_vq_meta_reset(dev);
vhost_del_umem_range(dev->iotlb, msg->iova,
msg->iova + msg->size - 1);
diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
index 0003912a8111..4d65909a1b15 100644
--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -23,6 +23,8 @@ static void disable_hotplug_cpu(int cpu)
device_offline(get_cpu_device(cpu));
unlock_device_hotplug();
}
+ if (cpu_online(cpu))
+ return;
if (cpu_present(cpu))
xen_arch_unregister_cpu(cpu);
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 128ce17a80b0..076ccfb44c28 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
- uuid-tree.o props.o hash.o free-space-tree.o
+ uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5fe1ca8abc70..9c5dfdb81b38 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -351,6 +351,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ ASSERT(0);
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
goto leave;
}
@@ -395,6 +396,10 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_dev_replace_lock(dev_replace, 1);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
goto leave;
}
@@ -416,8 +421,6 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return ret;
leave:
- dev_replace->srcdev = NULL;
- dev_replace->tgtdev = NULL;
btrfs_dev_replace_unlock(dev_replace, 1);
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
return ret;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3604410e046b..16dc401096ac 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,6 +50,7 @@
#include "sysfs.h"
#include "qgroup.h"
#include "compression.h"
+#include "tree-checker.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
@@ -602,146 +603,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
return ret;
}
-#define CORRUPT(reason, eb, root, slot) \
- btrfs_crit(root->fs_info, \
- "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \
- btrfs_header_level(eb) == 0 ? "leaf" : "node", \
- reason, btrfs_header_bytenr(eb), root->objectid, slot)
-
-static noinline int check_leaf(struct btrfs_root *root,
- struct extent_buffer *leaf)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_key key;
- struct btrfs_key leaf_key;
- u32 nritems = btrfs_header_nritems(leaf);
- int slot;
-
- /*
- * Extent buffers from a relocation tree have a owner field that
- * corresponds to the subvolume tree they are based on. So just from an
- * extent buffer alone we can not find out what is the id of the
- * corresponding subvolume tree, so we can not figure out if the extent
- * buffer corresponds to the root of the relocation tree or not. So skip
- * this check for relocation trees.
- */
- if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
- struct btrfs_root *check_root;
-
- key.objectid = btrfs_header_owner(leaf);
- key.type = BTRFS_ROOT_ITEM_KEY;
- key.offset = (u64)-1;
-
- check_root = btrfs_get_fs_root(fs_info, &key, false);
- /*
- * The only reason we also check NULL here is that during
- * open_ctree() some roots has not yet been set up.
- */
- if (!IS_ERR_OR_NULL(check_root)) {
- struct extent_buffer *eb;
-
- eb = btrfs_root_node(check_root);
- /* if leaf is the root, then it's fine */
- if (leaf != eb) {
- CORRUPT("non-root leaf's nritems is 0",
- leaf, check_root, 0);
- free_extent_buffer(eb);
- return -EIO;
- }
- free_extent_buffer(eb);
- }
- return 0;
- }
-
- if (nritems == 0)
- return 0;
-
- /* Check the 0 item */
- if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
- BTRFS_LEAF_DATA_SIZE(fs_info)) {
- CORRUPT("invalid item offset size pair", leaf, root, 0);
- return -EIO;
- }
-
- /*
- * Check to make sure each items keys are in the correct order and their
- * offsets make sense. We only have to loop through nritems-1 because
- * we check the current slot against the next slot, which verifies the
- * next slot's offset+size makes sense and that the current's slot
- * offset is correct.
- */
- for (slot = 0; slot < nritems - 1; slot++) {
- btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
- btrfs_item_key_to_cpu(leaf, &key, slot + 1);
-
- /* Make sure the keys are in the right order */
- if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
- CORRUPT("bad key order", leaf, root, slot);
- return -EIO;
- }
-
- /*
- * Make sure the offset and ends are right, remember that the
- * item data starts at the end of the leaf and grows towards the
- * front.
- */
- if (btrfs_item_offset_nr(leaf, slot) !=
- btrfs_item_end_nr(leaf, slot + 1)) {
- CORRUPT("slot offset bad", leaf, root, slot);
- return -EIO;
- }
-
- /*
- * Check to make sure that we don't point outside of the leaf,
- * just in case all the items are consistent to each other, but
- * all point outside of the leaf.
- */
- if (btrfs_item_end_nr(leaf, slot) >
- BTRFS_LEAF_DATA_SIZE(fs_info)) {
- CORRUPT("slot end outside of leaf", leaf, root, slot);
- return -EIO;
- }
- }
-
- return 0;
-}
-
-static int check_node(struct btrfs_root *root, struct extent_buffer *node)
-{
- unsigned long nr = btrfs_header_nritems(node);
- struct btrfs_key key, next_key;
- int slot;
- u64 bytenr;
- int ret = 0;
-
- if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) {
- btrfs_crit(root->fs_info,
- "corrupt node: block %llu root %llu nritems %lu",
- node->start, root->objectid, nr);
- return -EIO;
- }
-
- for (slot = 0; slot < nr - 1; slot++) {
- bytenr = btrfs_node_blockptr(node, slot);
- btrfs_node_key_to_cpu(node, &key, slot);
- btrfs_node_key_to_cpu(node, &next_key, slot + 1);
-
- if (!bytenr) {
- CORRUPT("invalid item slot", node, root, slot);
- ret = -EIO;
- goto out;
- }
-
- if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
- CORRUPT("bad key order", node, root, slot);
- ret = -EIO;
- goto out;
- }
- }
-out:
- return ret;
-}
-
static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
u64 phy_offset, struct page *page,
u64 start, u64 end, int mirror)
@@ -807,12 +668,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
* that we don't try and read the other copies of this block, just
* return -EIO.
*/
- if (found_level == 0 && check_leaf(root, eb)) {
+ if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
- if (found_level > 0 && check_node(root, eb))
+ if (found_level > 0 && btrfs_check_node(fs_info, eb))
ret = -EIO;
if (!ret)
@@ -3080,6 +2941,13 @@ retry_root_backup:
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
+ ret = btrfs_verify_dev_extents(fs_info);
+ if (ret) {
+ btrfs_err(fs_info,
+ "failed to verify dev extents against chunks: %d",
+ ret);
+ goto fail_block_groups;
+ }
ret = btrfs_recover_balance(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to recover balance: %d", ret);
@@ -4145,7 +4013,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
buf->len,
fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
+ /*
+ * Since btrfs_mark_buffer_dirty() can be called with item pointer set
+ * but item data not updated.
+ * So here we should only check item pointers, not item data.
+ */
+ if (btrfs_header_level(buf) == 0 &&
+ btrfs_check_leaf_relaxed(fs_info, buf)) {
btrfs_print_leaf(fs_info, buf);
ASSERT(0);
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cd7b64a339bf..a9192148e574 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9722,6 +9722,8 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
int ret = 0;
struct btrfs_key found_key;
struct extent_buffer *leaf;
+ struct btrfs_block_group_item bg;
+ u64 flags;
int slot;
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
@@ -9756,8 +9758,32 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
"logical %llu len %llu found bg but no related chunk",
found_key.objectid, found_key.offset);
ret = -ENOENT;
+ } else if (em->start != found_key.objectid ||
+ em->len != found_key.offset) {
+ btrfs_err(fs_info,
+ "block group %llu len %llu mismatch with chunk %llu len %llu",
+ found_key.objectid, found_key.offset,
+ em->start, em->len);
+ ret = -EUCLEAN;
} else {
- ret = 0;
+ read_extent_buffer(leaf, &bg,
+ btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bg));
+ flags = btrfs_block_group_flags(&bg) &
+ BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+ if (flags != (em->map_lookup->type &
+ BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
+ found_key.objectid,
+ found_key.offset, flags,
+ (BTRFS_BLOCK_GROUP_TYPE_MASK &
+ em->map_lookup->type));
+ ret = -EUCLEAN;
+ } else {
+ ret = 0;
+ }
}
free_extent_map(em);
goto out;
@@ -10015,6 +10041,62 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
return cache;
}
+
+/*
+ * Iterate all chunks and verify that each of them has the corresponding block
+ * group
+ */
+static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct btrfs_block_group_cache *bg;
+ u64 start = 0;
+ int ret = 0;
+
+ while (1) {
+ read_lock(&map_tree->map_tree.lock);
+ /*
+ * lookup_extent_mapping will return the first extent map
+ * intersecting the range, so setting @len to 1 is enough to
+ * get the first chunk.
+ */
+ em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
+ read_unlock(&map_tree->map_tree.lock);
+ if (!em)
+ break;
+
+ bg = btrfs_lookup_block_group(fs_info, em->start);
+ if (!bg) {
+ btrfs_err(fs_info,
+ "chunk start=%llu len=%llu doesn't have corresponding block group",
+ em->start, em->len);
+ ret = -EUCLEAN;
+ free_extent_map(em);
+ break;
+ }
+ if (bg->key.objectid != em->start ||
+ bg->key.offset != em->len ||
+ (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+ (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ btrfs_err(fs_info,
+"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
+ em->start, em->len,
+ em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+ bg->key.objectid, bg->key.offset,
+ bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+ ret = -EUCLEAN;
+ free_extent_map(em);
+ btrfs_put_block_group(bg);
+ break;
+ }
+ start = em->start + em->len;
+ free_extent_map(em);
+ btrfs_put_block_group(bg);
+ }
+ return ret;
+}
+
int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_path *path;
@@ -10188,7 +10270,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
btrfs_add_raid_kobjects(info);
init_global_block_rsv(info);
- ret = 0;
+ ret = check_chunk_block_group_mappings(info);
error:
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
new file mode 100644
index 000000000000..482e0a2dd21e
--- /dev/null
+++ b/fs/btrfs/tree-checker.c
@@ -0,0 +1,699 @@
+/*
+ * Copyright (C) Qu Wenruo 2017. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program.
+ */
+
+/*
+ * The module is used to catch unexpected/corrupted tree block data.
+ * Such behavior can be caused either by a fuzzed image or bugs.
+ *
+ * The objective is to do leaf/node validation checks when tree block is read
+ * from disk, and check *every* possible member, so other code won't
+ * need to checking them again.
+ *
+ * Due to the potential and unwanted damage, every checker needs to be
+ * carefully reviewed otherwise so it does not prevent mount of valid images.
+ */
+
+#include "ctree.h"
+#include "tree-checker.h"
+#include "disk-io.h"
+#include "compression.h"
+#include "hash.h"
+#include "volumes.h"
+
+/*
+ * Error message should follow the following format:
+ * corrupt <type>: <identifier>, <reason>[, <bad_value>]
+ *
+ * @type: leaf or node
+ * @identifier: the necessary info to locate the leaf/node.
+ * It's recommened to decode key.objecitd/offset if it's
+ * meaningful.
+ * @reason: describe the error
+ * @bad_value: optional, it's recommened to output bad value and its
+ * expected value (range).
+ *
+ * Since comma is used to separate the components, only space is allowed
+ * inside each component.
+ */
+
+/*
+ * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt.
+ * Allows callers to customize the output.
+ */
+__printf(4, 5)
+static void generic_err(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, &vaf);
+ va_end(args);
+}
+
+/*
+ * Customized reporter for extent data item, since its key objectid and
+ * offset has its own meaning.
+ */
+__printf(4, 5)
+static void file_extent_err(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, key.offset, &vaf);
+ va_end(args);
+}
+
+/*
+ * Return 0 if the btrfs_file_extent_##name is aligned to @alignment
+ * Else return 1
+ */
+#define CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, name, alignment) \
+({ \
+ if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \
+ file_extent_err((fs_info), (leaf), (slot), \
+ "invalid %s for file extent, have %llu, should be aligned to %u", \
+ (#name), btrfs_file_extent_##name((leaf), (fi)), \
+ (alignment)); \
+ (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \
+})
+
+static int check_extent_data_item(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_file_extent_item *fi;
+ u32 sectorsize = fs_info->sectorsize;
+ u32 item_size = btrfs_item_size_nr(leaf, slot);
+
+ if (!IS_ALIGNED(key->offset, sectorsize)) {
+ file_extent_err(fs_info, leaf, slot,
+"unaligned file_offset for file extent, have %llu should be aligned to %u",
+ key->offset, sectorsize);
+ return -EUCLEAN;
+ }
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) {
+ file_extent_err(fs_info, leaf, slot,
+ "invalid type for file extent, have %u expect range [0, %u]",
+ btrfs_file_extent_type(leaf, fi),
+ BTRFS_FILE_EXTENT_TYPES);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Support for new compression/encrption must introduce incompat flag,
+ * and must be caught in open_ctree().
+ */
+ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
+ file_extent_err(fs_info, leaf, slot,
+ "invalid compression for file extent, have %u expect range [0, %u]",
+ btrfs_file_extent_compression(leaf, fi),
+ BTRFS_COMPRESS_TYPES);
+ return -EUCLEAN;
+ }
+ if (btrfs_file_extent_encryption(leaf, fi)) {
+ file_extent_err(fs_info, leaf, slot,
+ "invalid encryption for file extent, have %u expect 0",
+ btrfs_file_extent_encryption(leaf, fi));
+ return -EUCLEAN;
+ }
+ if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
+ /* Inline extent must have 0 as key offset */
+ if (key->offset) {
+ file_extent_err(fs_info, leaf, slot,
+ "invalid file_offset for inline file extent, have %llu expect 0",
+ key->offset);
+ return -EUCLEAN;
+ }
+
+ /* Compressed inline extent has no on-disk size, skip it */
+ if (btrfs_file_extent_compression(leaf, fi) !=
+ BTRFS_COMPRESS_NONE)
+ return 0;
+
+ /* Uncompressed inline extent size must match item size */
+ if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
+ btrfs_file_extent_ram_bytes(leaf, fi)) {
+ file_extent_err(fs_info, leaf, slot,
+ "invalid ram_bytes for uncompressed inline extent, have %u expect %llu",
+ item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START +
+ btrfs_file_extent_ram_bytes(leaf, fi));
+ return -EUCLEAN;
+ }
+ return 0;
+ }
+
+ /* Regular or preallocated extent has fixed item size */
+ if (item_size != sizeof(*fi)) {
+ file_extent_err(fs_info, leaf, slot,
+ "invalid item size for reg/prealloc file extent, have %u expect %zu",
+ item_size, sizeof(*fi));
+ return -EUCLEAN;
+ }
+ if (CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, ram_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_bytenr, sectorsize) ||
+ CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_num_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, offset, sectorsize) ||
+ CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, num_bytes, sectorsize))
+ return -EUCLEAN;
+ return 0;
+}
+
+static int check_csum_item(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot)
+{
+ u32 sectorsize = fs_info->sectorsize;
+ u32 csumsize = btrfs_super_csum_size(fs_info->super_copy);
+
+ if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) {
+ generic_err(fs_info, leaf, slot,
+ "invalid key objectid for csum item, have %llu expect %llu",
+ key->objectid, BTRFS_EXTENT_CSUM_OBJECTID);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(key->offset, sectorsize)) {
+ generic_err(fs_info, leaf, slot,
+ "unaligned key offset for csum item, have %llu should be aligned to %u",
+ key->offset, sectorsize);
+ return -EUCLEAN;
+ }
+ if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) {
+ generic_err(fs_info, leaf, slot,
+ "unaligned item size for csum item, have %u should be aligned to %u",
+ btrfs_item_size_nr(leaf, slot), csumsize);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+/*
+ * Customized reported for dir_item, only important new info is key->objectid,
+ * which represents inode number
+ */
+__printf(4, 5)
+static void dir_item_err(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, &vaf);
+ va_end(args);
+}
+
+static int check_dir_item(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_dir_item *di;
+ u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u32 cur = 0;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ while (cur < item_size) {
+ char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)];
+ u32 name_len;
+ u32 data_len;
+ u32 max_name_len;
+ u32 total_size;
+ u32 name_hash;
+ u8 dir_type;
+
+ /* header itself should not cross item boundary */
+ if (cur + sizeof(*di) > item_size) {
+ dir_item_err(fs_info, leaf, slot,
+ "dir item header crosses item boundary, have %zu boundary %u",
+ cur + sizeof(*di), item_size);
+ return -EUCLEAN;
+ }
+
+ /* dir type check */
+ dir_type = btrfs_dir_type(leaf, di);
+ if (dir_type >= BTRFS_FT_MAX) {
+ dir_item_err(fs_info, leaf, slot,
+ "invalid dir item type, have %u expect [0, %u)",
+ dir_type, BTRFS_FT_MAX);
+ return -EUCLEAN;
+ }
+
+ if (key->type == BTRFS_XATTR_ITEM_KEY &&
+ dir_type != BTRFS_FT_XATTR) {
+ dir_item_err(fs_info, leaf, slot,
+ "invalid dir item type for XATTR key, have %u expect %u",
+ dir_type, BTRFS_FT_XATTR);
+ return -EUCLEAN;
+ }
+ if (dir_type == BTRFS_FT_XATTR &&
+ key->type != BTRFS_XATTR_ITEM_KEY) {
+ dir_item_err(fs_info, leaf, slot,
+ "xattr dir type found for non-XATTR key");
+ return -EUCLEAN;
+ }
+ if (dir_type == BTRFS_FT_XATTR)
+ max_name_len = XATTR_NAME_MAX;
+ else
+ max_name_len = BTRFS_NAME_LEN;
+
+ /* Name/data length check */
+ name_len = btrfs_dir_name_len(leaf, di);
+ data_len = btrfs_dir_data_len(leaf, di);
+ if (name_len > max_name_len) {
+ dir_item_err(fs_info, leaf, slot,
+ "dir item name len too long, have %u max %u",
+ name_len, max_name_len);
+ return -EUCLEAN;
+ }
+ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) {
+ dir_item_err(fs_info, leaf, slot,
+ "dir item name and data len too long, have %u max %u",
+ name_len + data_len,
+ BTRFS_MAX_XATTR_SIZE(fs_info));
+ return -EUCLEAN;
+ }
+
+ if (data_len && dir_type != BTRFS_FT_XATTR) {
+ dir_item_err(fs_info, leaf, slot,
+ "dir item with invalid data len, have %u expect 0",
+ data_len);
+ return -EUCLEAN;
+ }
+
+ total_size = sizeof(*di) + name_len + data_len;
+
+ /* header and name/data should not cross item boundary */
+ if (cur + total_size > item_size) {
+ dir_item_err(fs_info, leaf, slot,
+ "dir item data crosses item boundary, have %u boundary %u",
+ cur + total_size, item_size);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Special check for XATTR/DIR_ITEM, as key->offset is name
+ * hash, should match its name
+ */
+ if (key->type == BTRFS_DIR_ITEM_KEY ||
+ key->type == BTRFS_XATTR_ITEM_KEY) {
+ read_extent_buffer(leaf, namebuf,
+ (unsigned long)(di + 1), name_len);
+ name_hash = btrfs_name_hash(namebuf, name_len);
+ if (key->offset != name_hash) {
+ dir_item_err(fs_info, leaf, slot,
+ "name hash mismatch with key, have 0x%016x expect 0x%016llx",
+ name_hash, key->offset);
+ return -EUCLEAN;
+ }
+ }
+ cur += total_size;
+ di = (struct btrfs_dir_item *)((void *)di + total_size);
+ }
+ return 0;
+}
+
+__printf(4, 5)
+__cold
+static void block_group_err(const struct btrfs_fs_info *fs_info,
+ const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d bg_start=%llu bg_len=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, key.offset, &vaf);
+ va_end(args);
+}
+
+static int check_block_group_item(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ struct btrfs_block_group_item bgi;
+ u32 item_size = btrfs_item_size_nr(leaf, slot);
+ u64 flags;
+ u64 type;
+
+ /*
+ * Here we don't really care about alignment since extent allocator can
+ * handle it. We care more about the size, as if one block group is
+ * larger than maximum size, it's must be some obvious corruption.
+ */
+ if (key->offset > BTRFS_MAX_DATA_CHUNK_SIZE || key->offset == 0) {
+ block_group_err(fs_info, leaf, slot,
+ "invalid block group size, have %llu expect (0, %llu]",
+ key->offset, BTRFS_MAX_DATA_CHUNK_SIZE);
+ return -EUCLEAN;
+ }
+
+ if (item_size != sizeof(bgi)) {
+ block_group_err(fs_info, leaf, slot,
+ "invalid item size, have %u expect %zu",
+ item_size, sizeof(bgi));
+ return -EUCLEAN;
+ }
+
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bgi));
+ if (btrfs_block_group_chunk_objectid(&bgi) !=
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID) {
+ block_group_err(fs_info, leaf, slot,
+ "invalid block group chunk objectid, have %llu expect %llu",
+ btrfs_block_group_chunk_objectid(&bgi),
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ return -EUCLEAN;
+ }
+
+ if (btrfs_block_group_used(&bgi) > key->offset) {
+ block_group_err(fs_info, leaf, slot,
+ "invalid block group used, have %llu expect [0, %llu)",
+ btrfs_block_group_used(&bgi), key->offset);
+ return -EUCLEAN;
+ }
+
+ flags = btrfs_block_group_flags(&bgi);
+ if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
+ block_group_err(fs_info, leaf, slot,
+"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
+ flags & BTRFS_BLOCK_GROUP_PROFILE_MASK,
+ hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK));
+ return -EUCLEAN;
+ }
+
+ type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
+ if (type != BTRFS_BLOCK_GROUP_DATA &&
+ type != BTRFS_BLOCK_GROUP_METADATA &&
+ type != BTRFS_BLOCK_GROUP_SYSTEM &&
+ type != (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_DATA)) {
+ block_group_err(fs_info, leaf, slot,
+"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llu or 0x%llx",
+ type, hweight64(type),
+ BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
+ BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA);
+ return -EUCLEAN;
+ }
+ return 0;
+}
+
+/*
+ * Common point to switch the item-specific validation.
+ */
+static int check_leaf_item(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot)
+{
+ int ret = 0;
+
+ switch (key->type) {
+ case BTRFS_EXTENT_DATA_KEY:
+ ret = check_extent_data_item(fs_info, leaf, key, slot);
+ break;
+ case BTRFS_EXTENT_CSUM_KEY:
+ ret = check_csum_item(fs_info, leaf, key, slot);
+ break;
+ case BTRFS_DIR_ITEM_KEY:
+ case BTRFS_DIR_INDEX_KEY:
+ case BTRFS_XATTR_ITEM_KEY:
+ ret = check_dir_item(fs_info, leaf, key, slot);
+ break;
+ case BTRFS_BLOCK_GROUP_ITEM_KEY:
+ ret = check_block_group_item(fs_info, leaf, key, slot);
+ break;
+ }
+ return ret;
+}
+
+static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
+ bool check_item_data)
+{
+ /* No valid key type is 0, so all key should be larger than this key */
+ struct btrfs_key prev_key = {0, 0, 0};
+ struct btrfs_key key;
+ u32 nritems = btrfs_header_nritems(leaf);
+ int slot;
+
+ /*
+ * Extent buffers from a relocation tree have a owner field that
+ * corresponds to the subvolume tree they are based on. So just from an
+ * extent buffer alone we can not find out what is the id of the
+ * corresponding subvolume tree, so we can not figure out if the extent
+ * buffer corresponds to the root of the relocation tree or not. So
+ * skip this check for relocation trees.
+ */
+ if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
+ u64 owner = btrfs_header_owner(leaf);
+ struct btrfs_root *check_root;
+
+ /* These trees must never be empty */
+ if (owner == BTRFS_ROOT_TREE_OBJECTID ||
+ owner == BTRFS_CHUNK_TREE_OBJECTID ||
+ owner == BTRFS_EXTENT_TREE_OBJECTID ||
+ owner == BTRFS_DEV_TREE_OBJECTID ||
+ owner == BTRFS_FS_TREE_OBJECTID ||
+ owner == BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ generic_err(fs_info, leaf, 0,
+ "invalid root, root %llu must never be empty",
+ owner);
+ return -EUCLEAN;
+ }
+ key.objectid = owner;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ check_root = btrfs_get_fs_root(fs_info, &key, false);
+ /*
+ * The only reason we also check NULL here is that during
+ * open_ctree() some roots has not yet been set up.
+ */
+ if (!IS_ERR_OR_NULL(check_root)) {
+ struct extent_buffer *eb;
+
+ eb = btrfs_root_node(check_root);
+ /* if leaf is the root, then it's fine */
+ if (leaf != eb) {
+ generic_err(fs_info, leaf, 0,
+ "invalid nritems, have %u should not be 0 for non-root leaf",
+ nritems);
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+ free_extent_buffer(eb);
+ }
+ return 0;
+ }
+
+ if (nritems == 0)
+ return 0;
+
+ /*
+ * Check the following things to make sure this is a good leaf, and
+ * leaf users won't need to bother with similar sanity checks:
+ *
+ * 1) key ordering
+ * 2) item offset and size
+ * No overlap, no hole, all inside the leaf.
+ * 3) item content
+ * If possible, do comprehensive sanity check.
+ * NOTE: All checks must only rely on the item data itself.
+ */
+ for (slot = 0; slot < nritems; slot++) {
+ u32 item_end_expected;
+ int ret;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ /* Make sure the keys are in the right order */
+ if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) {
+ generic_err(fs_info, leaf, slot,
+ "bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
+ prev_key.objectid, prev_key.type,
+ prev_key.offset, key.objectid, key.type,
+ key.offset);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Make sure the offset and ends are right, remember that the
+ * item data starts at the end of the leaf and grows towards the
+ * front.
+ */
+ if (slot == 0)
+ item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info);
+ else
+ item_end_expected = btrfs_item_offset_nr(leaf,
+ slot - 1);
+ if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
+ generic_err(fs_info, leaf, slot,
+ "unexpected item end, have %u expect %u",
+ btrfs_item_end_nr(leaf, slot),
+ item_end_expected);
+ return -EUCLEAN;
+ }
+
+ /*
+ * Check to make sure that we don't point outside of the leaf,
+ * just in case all the items are consistent to each other, but
+ * all point outside of the leaf.
+ */
+ if (btrfs_item_end_nr(leaf, slot) >
+ BTRFS_LEAF_DATA_SIZE(fs_info)) {
+ generic_err(fs_info, leaf, slot,
+ "slot end outside of leaf, have %u expect range [0, %u]",
+ btrfs_item_end_nr(leaf, slot),
+ BTRFS_LEAF_DATA_SIZE(fs_info));
+ return -EUCLEAN;
+ }
+
+ /* Also check if the item pointer overlaps with btrfs item. */
+ if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
+ btrfs_item_ptr_offset(leaf, slot)) {
+ generic_err(fs_info, leaf, slot,
+ "slot overlaps with its data, item end %lu data start %lu",
+ btrfs_item_nr_offset(slot) +
+ sizeof(struct btrfs_item),
+ btrfs_item_ptr_offset(leaf, slot));
+ return -EUCLEAN;
+ }
+
+ if (check_item_data) {
+ /*
+ * Check if the item size and content meet other
+ * criteria
+ */
+ ret = check_leaf_item(fs_info, leaf, &key, slot);
+ if (ret < 0)
+ return ret;
+ }
+
+ prev_key.objectid = key.objectid;
+ prev_key.type = key.type;
+ prev_key.offset = key.offset;
+ }
+
+ return 0;
+}
+
+int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf)
+{
+ return check_leaf(fs_info, leaf, true);
+}
+
+int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf)
+{
+ return check_leaf(fs_info, leaf, false);
+}
+
+int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
+{
+ unsigned long nr = btrfs_header_nritems(node);
+ struct btrfs_key key, next_key;
+ int slot;
+ u64 bytenr;
+ int ret = 0;
+
+ if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) {
+ btrfs_crit(fs_info,
+"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
+ btrfs_header_owner(node), node->start,
+ nr == 0 ? "small" : "large", nr,
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+ return -EUCLEAN;
+ }
+
+ for (slot = 0; slot < nr - 1; slot++) {
+ bytenr = btrfs_node_blockptr(node, slot);
+ btrfs_node_key_to_cpu(node, &key, slot);
+ btrfs_node_key_to_cpu(node, &next_key, slot + 1);
+
+ if (!bytenr) {
+ generic_err(fs_info, node, slot,
+ "invalid NULL node pointer");
+ ret = -EUCLEAN;
+ goto out;
+ }
+ if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) {
+ generic_err(fs_info, node, slot,
+ "unaligned pointer, have %llu should be aligned to %u",
+ bytenr, fs_info->sectorsize);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
+ generic_err(fs_info, node, slot,
+ "bad key order, current (%llu %u %llu) next (%llu %u %llu)",
+ key.objectid, key.type, key.offset,
+ next_key.objectid, next_key.type,
+ next_key.offset);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+out:
+ return ret;
+}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
new file mode 100644
index 000000000000..aba542755710
--- /dev/null
+++ b/fs/btrfs/tree-checker.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) Qu Wenruo 2017. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program.
+ */
+
+#ifndef __BTRFS_TREE_CHECKER__
+#define __BTRFS_TREE_CHECKER__
+
+#include "ctree.h"
+#include "extent_io.h"
+
+/*
+ * Comprehensive leaf checker.
+ * Will check not only the item pointers, but also every possible member
+ * in item data.
+ */
+int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf);
+
+/*
+ * Less strict leaf checker.
+ * Will only check item pointers, not reading item data.
+ */
+int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf);
+int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node);
+
+#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 08692491c023..94326471bae6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4728,7 +4728,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = SZ_1G;
- max_chunk_size = 10 * max_stripe_size;
+ max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
@@ -6563,6 +6563,7 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
map->type = btrfs_chunk_type(leaf, chunk);
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+ map->verified_stripes = 0;
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -7316,3 +7317,186 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
fs_devices = fs_devices->seed;
}
}
+
+
+static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
+{
+ int index = __get_raid_index(type);
+ int ncopies = btrfs_raid_array[index].ncopies;
+ int data_stripes;
+
+ switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case BTRFS_BLOCK_GROUP_RAID5:
+ data_stripes = num_stripes - 1;
+ break;
+ case BTRFS_BLOCK_GROUP_RAID6:
+ data_stripes = num_stripes - 2;
+ break;
+ default:
+ data_stripes = num_stripes / ncopies;
+ break;
+ }
+ return div_u64(chunk_len, data_stripes);
+}
+
+static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
+ u64 chunk_offset, u64 devid,
+ u64 physical_offset, u64 physical_len)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 stripe_len;
+ bool found = false;
+ int ret = 0;
+ int i;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+
+ if (!em) {
+ btrfs_err(fs_info,
+"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
+ physical_offset, devid);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ map = em->map_lookup;
+ stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
+ if (physical_len != stripe_len) {
+ btrfs_err(fs_info,
+"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
+ physical_offset, devid, em->start, physical_len,
+ stripe_len);
+ ret = -EUCLEAN;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ if (map->stripes[i].dev->devid == devid &&
+ map->stripes[i].physical == physical_offset) {
+ found = true;
+ if (map->verified_stripes >= map->num_stripes) {
+ btrfs_err(fs_info,
+ "too many dev extents for chunk %llu found",
+ em->start);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ map->verified_stripes++;
+ break;
+ }
+ }
+ if (!found) {
+ btrfs_err(fs_info,
+ "dev extent physical offset %llu devid %llu has no corresponding chunk",
+ physical_offset, devid);
+ ret = -EUCLEAN;
+ }
+out:
+ free_extent_map(em);
+ return ret;
+}
+
+static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct rb_node *node;
+ int ret = 0;
+
+ read_lock(&em_tree->lock);
+ for (node = rb_first(&em_tree->map); node; node = rb_next(node)) {
+ em = rb_entry(node, struct extent_map, rb_node);
+ if (em->map_lookup->num_stripes !=
+ em->map_lookup->verified_stripes) {
+ btrfs_err(fs_info,
+ "chunk %llu has missing dev extent, have %d expect %d",
+ em->start, em->map_lookup->verified_stripes,
+ em->map_lookup->num_stripes);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+out:
+ read_unlock(&em_tree->lock);
+ return ret;
+}
+
+/*
+ * Ensure that all dev extents are mapped to correct chunk, otherwise
+ * later chunk allocation/free would cause unexpected behavior.
+ *
+ * NOTE: This will iterate through the whole device tree, which should be of
+ * the same size level as the chunk tree. This slightly increases mount time.
+ */
+int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = fs_info->dev_root;
+ struct btrfs_key key;
+ int ret = 0;
+
+ key.objectid = 1;
+ key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = READA_FORWARD;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ goto out;
+ /* No dev extents at all? Not good */
+ if (ret > 0) {
+ ret = -EUCLEAN;
+ goto out;
+ }
+ }
+ while (1) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_dev_extent *dext;
+ int slot = path->slots[0];
+ u64 chunk_offset;
+ u64 physical_offset;
+ u64 physical_len;
+ u64 devid;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.type != BTRFS_DEV_EXTENT_KEY)
+ break;
+ devid = key.objectid;
+ physical_offset = key.offset;
+
+ dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
+ chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
+ physical_len = btrfs_dev_extent_length(leaf, dext);
+
+ ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
+ physical_offset, physical_len);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+
+ /* Ensure all chunks have corresponding dev extents */
+ ret = verify_chunk_dev_extent_mapping(fs_info);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2d6da952ba4a..6d1b48929104 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,6 +24,8 @@
#include <linux/btrfs.h>
#include "async-thread.h"
+#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
+
extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN SZ_64K
@@ -350,6 +352,7 @@ struct map_lookup {
int sector_size;
int num_stripes;
int sub_stripes;
+ int verified_stripes; /* For mount time dev extent verification */
struct btrfs_bio_stripe stripes[];
};
@@ -537,4 +540,5 @@ struct list_head *btrfs_get_fs_uuids(void);
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
+int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2852521fc8ec..dc1b3678525b 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -785,9 +785,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
ASSERT(blkno == 0);
error = xfs_attr3_leaf_create(args, blkno, &bp);
if (error) {
- error = xfs_da_shrink_inode(args, 0, bp);
- bp = NULL;
- if (error)
+ /* xfs_attr3_leaf_create may not have instantiated a block */
+ if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0))
goto out;
xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 6306dc6d056f..9b28e9058b14 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -773,19 +773,14 @@ xfs_bmap_extents_to_btree(
args.wasdel = wasdel;
*logflagsp = 0;
if ((error = xfs_alloc_vextent(&args))) {
- xfs_iroot_realloc(ip, -1, whichfork);
ASSERT(ifp->if_broot == NULL);
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return error;
+ goto err1;
}
if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
- xfs_iroot_realloc(ip, -1, whichfork);
ASSERT(ifp->if_broot == NULL);
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return -ENOSPC;
+ error = -ENOSPC;
+ goto err1;
}
/*
* Allocation can't fail, the space was reserved.
@@ -797,6 +792,10 @@ xfs_bmap_extents_to_btree(
ip->i_d.di_nblocks++;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+ if (!abp) {
+ error = -ENOSPC;
+ goto err2;
+ }
/*
* Fill in the child block.
*/
@@ -839,6 +838,15 @@ xfs_bmap_extents_to_btree(
*curp = cur;
*logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
return 0;
+
+err2:
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+err1:
+ xfs_iroot_realloc(ip, -1, whichfork);
+ XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+
+ return error;
}
/*
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 05bc96bc0ac4..ab9a20e16469 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -307,6 +307,46 @@ xfs_reinit_inode(
}
/*
+ * If we are allocating a new inode, then check what was returned is
+ * actually a free, empty inode. If we are not allocating an inode,
+ * then check we didn't find a free inode.
+ *
+ * Returns:
+ * 0 if the inode free state matches the lookup context
+ * -ENOENT if the inode is free and we are not allocating
+ * -EFSCORRUPTED if there is any state mismatch at all
+ */
+static int
+xfs_iget_check_free_state(
+ struct xfs_inode *ip,
+ int flags)
+{
+ if (flags & XFS_IGET_CREATE) {
+ /* should be a free inode */
+ if (VFS_I(ip)->i_mode != 0) {
+ xfs_warn(ip->i_mount,
+"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
+ ip->i_ino, VFS_I(ip)->i_mode);
+ return -EFSCORRUPTED;
+ }
+
+ if (ip->i_d.di_nblocks != 0) {
+ xfs_warn(ip->i_mount,
+"Corruption detected! Free inode 0x%llx has blocks allocated!",
+ ip->i_ino);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+ }
+
+ /* should be an allocated inode */
+ if (VFS_I(ip)->i_mode == 0)
+ return -ENOENT;
+
+ return 0;
+}
+
+/*
* Check the validity of the inode we just found it the cache
*/
static int
@@ -355,12 +395,12 @@ xfs_iget_cache_hit(
}
/*
- * If lookup is racing with unlink return an error immediately.
+ * Check the inode free state is valid. This also detects lookup
+ * racing with unlinks.
*/
- if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = -ENOENT;
+ error = xfs_iget_check_free_state(ip, flags);
+ if (error)
goto out_error;
- }
/*
* If IRECLAIMABLE is set, we've torn down the VFS inode already.
@@ -478,29 +518,12 @@ xfs_iget_cache_miss(
/*
- * If we are allocating a new inode, then check what was returned is
- * actually a free, empty inode. If we are not allocating an inode,
- * the check we didn't find a free inode.
+ * Check the inode free state is valid. This also detects lookup
+ * racing with unlinks.
*/
- if (flags & XFS_IGET_CREATE) {
- if (VFS_I(ip)->i_mode != 0) {
- xfs_warn(mp,
-"Corruption detected! Free inode 0x%llx not marked free on disk",
- ino);
- error = -EFSCORRUPTED;
- goto out_destroy;
- }
- if (ip->i_d.di_nblocks != 0) {
- xfs_warn(mp,
-"Corruption detected! Free inode 0x%llx has blocks allocated!",
- ino);
- error = -EFSCORRUPTED;
- goto out_destroy;
- }
- } else if (VFS_I(ip)->i_mode == 0) {
- error = -ENOENT;
+ error = xfs_iget_check_free_state(ip, flags);
+ if (error)
goto out_destroy;
- }
/*
* Preload the radix tree so we can insert safely under the
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 10689e1fdf11..3142645a27f5 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -732,6 +732,7 @@ struct btrfs_balance_item {
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
+#define BTRFS_FILE_EXTENT_TYPES 2
struct btrfs_file_extent_item {
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 266cadec8bab..3dfdf4e03229 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1478,15 +1478,10 @@ static unsigned long capacity_of(int cpu);
/* Cached statistics for all CPUs within a node */
struct numa_stats {
- unsigned long nr_running;
unsigned long load;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
-
- /* Approximate capacity in terms of runnable tasks on a node */
- unsigned long task_capacity;
- int has_free_capacity;
};
/*
@@ -1494,38 +1489,16 @@ struct numa_stats {
*/
static void update_numa_stats(struct numa_stats *ns, int nid)
{
- int smt, cpu, cpus = 0;
- unsigned long capacity;
+ int cpu;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
- ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(rq);
ns->compute_capacity += capacity_of(cpu);
-
- cpus++;
}
- /*
- * If we raced with hotplug and there are no CPUs left in our mask
- * the @ns structure is NULL'ed and task_numa_compare() will
- * not find this node attractive.
- *
- * We'll either bail at !has_free_capacity, or we'll detect a huge
- * imbalance and bail there.
- */
- if (!cpus)
- return;
-
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
- capacity = cpus / smt; /* cores */
-
- ns->task_capacity = min_t(unsigned, capacity,
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
- ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
struct task_numa_env {
@@ -1591,10 +1564,9 @@ static bool load_too_imbalanced(long src_load, long dst_load,
* into account that it might be best if task running on the dst_cpu should
* be exchanged with the source task
*/
-static void task_numa_compare(struct task_numa_env *env,
- long taskimp, long groupimp)
+static bool task_numa_compare(struct task_numa_env *env,
+ long taskimp, long groupimp, bool maymove)
{
- struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
long src_load, dst_load;
@@ -1602,6 +1574,7 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
+ bool dst_idle = false;
rcu_read_lock();
cur = task_rcu_dereference(&dst_rq->curr);
@@ -1615,97 +1588,73 @@ static void task_numa_compare(struct task_numa_env *env,
if (cur == env->p)
goto unlock;
+ if (!cur) {
+ if (maymove)
+ goto assign;
+ else
+ goto unlock;
+ }
+
/*
* "imp" is the fault differential for the source task between the
* source and destination node. Calculate the total differential for
* the source task and potential destination task. The more negative
- * the value is, the more rmeote accesses that would be expected to
+ * the value is, the more remote accesses that would be expected to
* be incurred if the tasks were swapped.
*/
- if (cur) {
- /* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
- goto unlock;
+ /* Skip this swap candidate if cannot move to the source cpu */
+ if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
+ goto unlock;
+ /*
+ * If dst and source tasks are in the same NUMA group, or not
+ * in any group then look only at task weights.
+ */
+ if (cur->numa_group == env->p->numa_group) {
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
/*
- * If dst and source tasks are in the same NUMA group, or not
- * in any group then look only at task weights.
+ * Add some hysteresis to prevent swapping the
+ * tasks within a group over tiny differences.
*/
- if (cur->numa_group == env->p->numa_group) {
- imp = taskimp + task_weight(cur, env->src_nid, dist) -
- task_weight(cur, env->dst_nid, dist);
- /*
- * Add some hysteresis to prevent swapping the
- * tasks within a group over tiny differences.
- */
- if (cur->numa_group)
- imp -= imp/16;
- } else {
- /*
- * Compare the group weights. If a task is all by
- * itself (not part of a group), use the task weight
- * instead.
- */
- if (cur->numa_group)
- imp += group_weight(cur, env->src_nid, dist) -
- group_weight(cur, env->dst_nid, dist);
- else
- imp += task_weight(cur, env->src_nid, dist) -
- task_weight(cur, env->dst_nid, dist);
- }
+ if (cur->numa_group)
+ imp -= imp / 16;
+ } else {
+ /*
+ * Compare the group weights. If a task is all by itself
+ * (not part of a group), use the task weight instead.
+ */
+ if (cur->numa_group && env->p->numa_group)
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
+ else
+ imp += task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
}
- if (imp <= env->best_imp && moveimp <= env->best_imp)
+ if (imp <= env->best_imp)
goto unlock;
- if (!cur) {
- /* Is there capacity at our destination? */
- if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
- !env->dst_stats.has_free_capacity)
- goto unlock;
-
- goto balance;
- }
-
- /* Balance doesn't matter much if we're running a task per cpu */
- if (imp > env->best_imp && src_rq->nr_running == 1 &&
- dst_rq->nr_running == 1)
+ if (maymove && moveimp > imp && moveimp > env->best_imp) {
+ imp = moveimp - 1;
+ cur = NULL;
goto assign;
+ }
/*
* In the overloaded case, try and keep the load balanced.
*/
-balance:
- load = task_h_load(env->p);
+ load = task_h_load(env->p) - task_h_load(cur);
+ if (!load)
+ goto assign;
+
dst_load = env->dst_stats.load + load;
src_load = env->src_stats.load - load;
- if (moveimp > imp && moveimp > env->best_imp) {
- /*
- * If the improvement from just moving env->p direction is
- * better than swapping tasks around, check if a move is
- * possible. Store a slightly smaller score than moveimp,
- * so an actually idle CPU will win.
- */
- if (!load_too_imbalanced(src_load, dst_load, env)) {
- imp = moveimp - 1;
- cur = NULL;
- goto assign;
- }
- }
-
- if (imp <= env->best_imp)
- goto unlock;
-
- if (cur) {
- load = task_h_load(cur);
- dst_load -= load;
- src_load += load;
- }
-
if (load_too_imbalanced(src_load, dst_load, env))
goto unlock;
+assign:
/*
* One idle CPU per node is evaluated for a task numa move.
* Call select_idle_sibling to maybe find a better one.
@@ -1719,54 +1668,43 @@ balance:
env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
env->dst_cpu);
local_irq_enable();
+ dst_idle = true;
}
-assign:
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
+ return dst_idle;
}
static void task_numa_find_cpu(struct task_numa_env *env,
long taskimp, long groupimp)
{
+ long src_load, dst_load, load;
+ bool maymove = false;
int cpu;
+ load = task_h_load(env->p);
+ dst_load = env->dst_stats.load + load;
+ src_load = env->src_stats.load - load;
+
+ /*
+ * If the improvement from just moving env->p direction is better
+ * than swapping tasks around, check if a move is possible.
+ */
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
+
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
continue;
env->dst_cpu = cpu;
- task_numa_compare(env, taskimp, groupimp);
+ if (task_numa_compare(env, taskimp, groupimp, maymove))
+ break;
}
}
-/* Only move tasks to a NUMA node less busy than the current node. */
-static bool numa_has_capacity(struct task_numa_env *env)
-{
- struct numa_stats *src = &env->src_stats;
- struct numa_stats *dst = &env->dst_stats;
-
- if (src->has_free_capacity && !dst->has_free_capacity)
- return false;
-
- /*
- * Only consider a task move if the source has a higher load
- * than the destination, corrected for CPU capacity on each node.
- *
- * src->load dst->load
- * --------------------- vs ---------------------
- * src->compute_capacity dst->compute_capacity
- */
- if (src->load * dst->compute_capacity * env->imbalance_pct >
-
- dst->load * src->compute_capacity * 100)
- return true;
-
- return false;
-}
-
static int task_numa_migrate(struct task_struct *p)
{
struct task_numa_env env = {
@@ -1821,8 +1759,7 @@ static int task_numa_migrate(struct task_struct *p)
update_numa_stats(&env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
- if (numa_has_capacity(&env))
- task_numa_find_cpu(&env, taskimp, groupimp);
+ task_numa_find_cpu(&env, taskimp, groupimp);
/*
* Look at other nodes in these cases:
@@ -1852,8 +1789,7 @@ static int task_numa_migrate(struct task_struct *p)
env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
- if (numa_has_capacity(&env))
- task_numa_find_cpu(&env, taskimp, groupimp);
+ task_numa_find_cpu(&env, taskimp, groupimp);
}
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 6752c25564bf..17b64374f542 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -207,7 +207,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
lockdep_is_held(&tracepoints_mutex));
old = func_add(&tp_funcs, func, prio);
if (IS_ERR(old)) {
- WARN_ON_ONCE(1);
+ WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
return PTR_ERR(old);
}
@@ -240,7 +240,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
lockdep_is_held(&tracepoints_mutex));
old = func_remove(&tp_funcs, func);
if (IS_ERR(old)) {
- WARN_ON_ONCE(1);
+ WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
return PTR_ERR(old);
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 46520b00aee7..2e979e7afb0e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1441,7 +1441,8 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
if (__PageMovable(page))
return pfn;
if (PageHuge(page)) {
- if (page_huge_active(page))
+ if (hugepage_migration_supported(page_hstate(page)) &&
+ page_huge_active(page))
return pfn;
else
pfn = round_up(pfn + 1,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 996216e1b4f8..159e1821fa0e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7455,6 +7455,10 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* handle each tail page individually in migration.
*/
if (PageHuge(page)) {
+
+ if (!hugepage_migration_supported(page_hstate(page)))
+ return true;
+
iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
continue;
}
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 4cadc29f547c..40a68ee09d42 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -775,6 +775,13 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
return -EINVAL;
lock_sock(sk);
+
+ /* Ensure that the socket is not already bound */
+ if (self->ias_obj) {
+ err = -EINVAL;
+ goto out;
+ }
+
#ifdef CONFIG_IRDA_ULTRA
/* Special care for Ultra sockets */
if ((sk->sk_type == SOCK_DGRAM) &&
@@ -2018,7 +2025,11 @@ static int irda_setsockopt(struct socket *sock, int level, int optname,
err = -EINVAL;
goto out;
}
- irias_insert_object(ias_obj);
+
+ /* Only insert newly allocated objects */
+ if (free_ias)
+ irias_insert_object(ias_obj);
+
kfree(ias_opt);
break;
case IRLMP_IAS_DEL: