Home Home > GIT Browse > vanilla
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohannes Thumshirn <jthumshirn@suse.de>2018-09-14 16:35:13 +0200
committerJohannes Thumshirn <jthumshirn@suse.de>2018-09-14 16:35:13 +0200
commite0e199a58f607aa5f111c2b95900564c67dda52e (patch)
tree0377608c5be2133adf02c4dfa04133eeb3c92347
parentf5b5c89346353601d60584d4081450902e2eeac9 (diff)
parentc0bf4383d323c55e485d1ee5853325f1e9f92da3 (diff)
Merge remote-tracking branch 'origin/users/jack/SLE12-SP4/for-next' into SLE12-SP4
Pull nvdimm fixes from Jan Kara Conflicts: series.conf suse-commit: 728b1ddbe1d7a562ef7ff89d1179efdd81c186a6
-rw-r--r--arch/x86/include/asm/set_memory.h44
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c48
-rw-r--r--arch/x86/mm/pageattr.c25
-rw-r--r--arch/x86/mm/pat.c16
-rw-r--r--drivers/dax/device.c63
-rw-r--r--drivers/nvdimm/pmem.c26
-rw-r--r--drivers/nvdimm/pmem.h13
-rw-r--r--fs/dax.c125
-rw-r--r--include/asm-generic/set_memory.h4
-rw-r--r--include/linux/dax.h16
-rw-r--r--include/linux/device-mapper.h5
-rw-r--r--include/linux/mm.h1
-rw-r--r--mm/madvise.c16
-rw-r--r--mm/memory-failure.c209
15 files changed, 494 insertions, 132 deletions
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index cd71273ec49d..c2ac6c75b83d 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -87,4 +87,48 @@ extern int kernel_set_to_readonly;
void set_kernel_text_rw(void);
void set_kernel_text_ro(void);
+#ifdef CONFIG_X86_64
+#undef set_mce_nospec
+static inline int set_mce_nospec(unsigned long pfn)
+{
+ unsigned long decoy_addr;
+ int rc;
+
+ /*
+ * Mark the linear address as UC to make sure we don't log more
+ * errors because of speculative access to the page.
+ * We would like to just call:
+ * set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1);
+ * but doing that would radically increase the odds of a
+ * speculative access to the poison page because we'd have
+ * the virtual address of the kernel 1:1 mapping sitting
+ * around in registers.
+ * Instead we get tricky. We create a non-canonical address
+ * that looks just like the one we want, but has bit 63 flipped.
+ * This relies on set_memory_uc() properly sanitizing any __pa()
+ * results with __PHYSICAL_MASK or PTE_PFN_MASK.
+ */
+ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+
+ rc = set_memory_uc(decoy_addr, 1);
+ if (rc)
+ pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+ return rc;
+}
+#define set_mce_nospec set_mce_nospec
+
+#undef clear_mce_nospec
+/* Restore full speculative operation to the pfn. */
+static inline int clear_mce_nospec(unsigned long pfn)
+{
+ return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
+}
+#define clear_mce_nospec clear_mce_nospec
+#else
+/*
+ * Few people would run a 32-bit kernel on a machine that supports
+ * recoverable errors because they have too much memory to boot 32-bit.
+ */
+#endif
+
#endif /* _ASM_X86_SET_MEMORY_H */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 0759dd2fd7a5..654ad0668d72 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -13,21 +13,6 @@ enum severity_level {
MCE_PANIC_SEVERITY,
};
-#ifndef CONFIG_X86_64
-/*
- * On 32-bit systems it would be difficult to safely unmap a poison page
- * from the kernel 1:1 map because there are no non-canonical addresses that
- * we can use to refer to the address without risking a speculative access.
- * However, this isn't much of an issue because:
- * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which
- * are only mapped into the kernel as needed
- * 2) Few people would run a 32-bit kernel on a machine that supports
- * recoverable errors because they have too much memory to boot 32-bit.
- */
-static inline void mce_unmap_kpfn(unsigned long pfn) {}
-#define mce_unmap_kpfn mce_unmap_kpfn
-#endif
-
extern struct blocking_notifier_head x86_mce_decoder_chain;
#define ATTR_LEN 16
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 36ec1fe8512f..93ddf647f318 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -106,10 +106,6 @@ static struct irq_work mce_irq_work;
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
-#ifndef mce_unmap_kpfn
-static void mce_unmap_kpfn(unsigned long pfn);
-#endif
-
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
@@ -587,7 +583,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
pfn = mce->addr >> PAGE_SHIFT;
if (!memory_failure(pfn, MCE_VECTOR, 0))
- mce_unmap_kpfn(pfn);
+ set_mce_nospec(pfn);
}
return NOTIFY_OK;
@@ -1060,50 +1056,10 @@ static int do_memory_failure(struct mce *m)
if (ret)
pr_err("Memory error not recovered");
else
- mce_unmap_kpfn(m->addr >> PAGE_SHIFT);
+ set_mce_nospec(m->addr >> PAGE_SHIFT);
return ret;
}
-#ifndef mce_unmap_kpfn
-static void mce_unmap_kpfn(unsigned long pfn)
-{
- unsigned long decoy_addr;
-
- /*
- * Unmap this page from the kernel 1:1 mappings to make sure
- * we don't log more errors because of speculative access to
- * the page.
- * We would like to just call:
- * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
- * but doing that would radically increase the odds of a
- * speculative access to the poison page because we'd have
- * the virtual address of the kernel 1:1 mapping sitting
- * around in registers.
- * Instead we get tricky. We create a non-canonical address
- * that looks just like the one we want, but has bit 63 flipped.
- * This relies on set_memory_np() not checking whether we passed
- * a legal address.
- */
-
-/*
- * Build time check to see if we have a spare virtual bit. Don't want
- * to leave this until run time because most developers don't have a
- * system that can exercise this code path. This will only become a
- * problem if/when we move beyond 5-level page tables.
- *
- * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
- */
-#if PGDIR_SHIFT + 9 < 63
- decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
-#else
-#error "no unused virtual bit available"
-#endif
-
- if (set_memory_np(decoy_addr, 1))
- pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
-}
-#endif
-
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3fe68483463c..b3874957a0be 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1416,6 +1416,29 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
return 0;
}
+/*
+ * Machine check recovery code needs to change cache mode of poisoned
+ * pages to UC to avoid speculative access logging another error. But
+ * passing the address of the 1:1 mapping to set_memory_uc() is a fine
+ * way to encourage a speculative access. So we cheat and flip the top
+ * bit of the address. This works fine for the code that updates the
+ * page tables. But at the end of the process we need to flush the cache
+ * and the non-canonical address causes a #GP fault when used by the
+ * CLFLUSH instruction.
+ *
+ * But in the common case we already have a canonical address. This code
+ * will fix the top bit if needed and is a no-op otherwise.
+ */
+static inline unsigned long make_addr_canonical_again(unsigned long addr)
+{
+#ifdef CONFIG_X86_64
+ return (long)(addr << 1) >> 1;
+#else
+ return addr;
+#endif
+}
+
+
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
int force_split, int in_flag,
@@ -1461,7 +1484,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
* Save address for cache flush. *addr is modified in the call
* to __change_page_attr_set_clr() below.
*/
- baddr = *addr;
+ baddr = make_addr_canonical_again(*addr);
}
/* Must avoid aliasing mappings in the highmem code */
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 1555bd7d3449..3d0c83ef6aab 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -512,6 +512,17 @@ static int free_ram_pages_type(u64 start, u64 end)
return 0;
}
+static u64 sanitize_phys(u64 address)
+{
+ /*
+ * When changing the memtype for pages containing poison allow
+ * for a "decoy" virtual address (bit 63 clear) passed to
+ * set_memory_X(). __pa() on a "decoy" address results in a
+ * physical address with bit 63 set.
+ */
+ return address & __PHYSICAL_MASK;
+}
+
/*
* req_type typically has one of the:
* - _PAGE_CACHE_MODE_WB
@@ -533,6 +544,8 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
int is_range_ram;
int err = 0;
+ start = sanitize_phys(start);
+ end = sanitize_phys(end);
BUG_ON(start >= end); /* end is exclusive */
if (!pat_enabled()) {
@@ -609,6 +622,9 @@ int free_memtype(u64 start, u64 end)
if (!pat_enabled())
return 0;
+ start = sanitize_phys(start);
+ end = sanitize_phys(end);
+
/* Low ISA region is always mapped WB. No need to track */
if (x86_platform.is_untracked_pat_range(start, end))
return 0;
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 861de4f8c6db..36e05b8119cd 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -243,13 +243,13 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
return -1;
}
-static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf,
+ pfn_t *pfn)
{
struct device *dev = &dev_dax->dev;
struct dax_region *dax_region;
int rc = VM_FAULT_SIGBUS;
phys_addr_t phys;
- pfn_t pfn;
unsigned int fault_size = PAGE_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__))
@@ -272,9 +272,9 @@ static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
- pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+ *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
- rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
+ rc = vm_insert_mixed(vmf->vma, vmf->address, *pfn);
if (rc == -ENOMEM)
return VM_FAULT_OOM;
@@ -284,14 +284,14 @@ static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
return VM_FAULT_NOPAGE;
}
-static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf,
+ pfn_t *pfn)
{
unsigned long pmd_addr = vmf->address & PMD_MASK;
struct device *dev = &dev_dax->dev;
struct dax_region *dax_region;
phys_addr_t phys;
pgoff_t pgoff;
- pfn_t pfn;
unsigned int fault_size = PMD_SIZE;
if (check_vma(dev_dax, vmf->vma, __func__))
@@ -328,21 +328,21 @@ static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
- pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+ *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
- return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
+ return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, *pfn,
vmf->flags & FAULT_FLAG_WRITE);
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf,
+ pfn_t *pfn)
{
unsigned long pud_addr = vmf->address & PUD_MASK;
struct device *dev = &dev_dax->dev;
struct dax_region *dax_region;
phys_addr_t phys;
pgoff_t pgoff;
- pfn_t pfn;
unsigned int fault_size = PUD_SIZE;
@@ -380,13 +380,14 @@ static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
- pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+ *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
- return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn,
+ return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, *pfn,
vmf->flags & FAULT_FLAG_WRITE);
}
#else
-static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf,
+ pfn_t *pfn)
{
return VM_FAULT_FALLBACK;
}
@@ -395,8 +396,10 @@ static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
static int dev_dax_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size)
{
- int rc, id;
struct file *filp = vmf->vma->vm_file;
+ unsigned long fault_size;
+ int rc, id;
+ pfn_t pfn;
struct dev_dax *dev_dax = filp->private_data;
dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__,
@@ -407,17 +410,43 @@ static int dev_dax_huge_fault(struct vm_fault *vmf,
id = dax_read_lock();
switch (pe_size) {
case PE_SIZE_PTE:
- rc = __dev_dax_pte_fault(dev_dax, vmf);
+ fault_size = PAGE_SIZE;
+ rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
break;
case PE_SIZE_PMD:
- rc = __dev_dax_pmd_fault(dev_dax, vmf);
+ fault_size = PMD_SIZE;
+ rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
break;
case PE_SIZE_PUD:
- rc = __dev_dax_pud_fault(dev_dax, vmf);
+ fault_size = PUD_SIZE;
+ rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
break;
default:
rc = VM_FAULT_SIGBUS;
}
+
+ if (rc == VM_FAULT_NOPAGE) {
+ unsigned long i;
+ pgoff_t pgoff;
+
+ /*
+ * In the device-dax case the only possibility for a
+ * VM_FAULT_NOPAGE result is when device-dax capacity is
+ * mapped. No need to consider the zero page, or racing
+ * conflicting mappings.
+ */
+ pgoff = linear_page_index(vmf->vma, vmf->address
+ & ~(fault_size - 1));
+ for (i = 0; i < fault_size / PAGE_SIZE; i++) {
+ struct page *page;
+
+ page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
+ if (page->mapping)
+ continue;
+ page->mapping = filp->f_mapping;
+ page->index = pgoff + i;
+ }
+ }
dax_read_unlock(id);
return rc;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 76824f2a9031..b24eabab63fd 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -20,6 +20,7 @@
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
+#include <asm/set_memory.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
@@ -51,6 +52,30 @@ static struct nd_region *to_region(struct pmem_device *pmem)
return to_nd_region(to_dev(pmem)->parent);
}
+static void hwpoison_clear(struct pmem_device *pmem,
+ phys_addr_t phys, unsigned int len)
+{
+ unsigned long pfn_start, pfn_end, pfn;
+
+ /* only pmem in the linear map supports HWPoison */
+ if (is_vmalloc_addr(pmem->virt_addr))
+ return;
+
+ pfn_start = PHYS_PFN(phys);
+ pfn_end = pfn_start + PHYS_PFN(len);
+ for (pfn = pfn_start; pfn < pfn_end; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ /*
+ * Note, no need to hold a get_dev_pagemap() reference
+ * here since we're in the driver I/O path and
+ * outstanding I/O requests pin the dev_pagemap.
+ */
+ if (test_and_clear_pmem_poison(page))
+ clear_mce_nospec(pfn);
+ }
+}
+
static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
phys_addr_t offset, unsigned int len)
{
@@ -65,6 +90,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
if (cleared < len)
rc = BLK_STS_IOERR;
if (cleared > 0 && cleared / 512) {
+ hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
cleared /= 512;
dev_dbg(dev, "%s: %#llx clear %ld sector%s\n", __func__,
(unsigned long long) sector, cleared,
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index 8db88b9fb5e9..0cb7a78560d9 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -1,5 +1,6 @@
#ifndef __NVDIMM_PMEM_H__
#define __NVDIMM_PMEM_H__
+#include <linux/page-flags.h>
#include <linux/badblocks.h>
#include <linux/types.h>
#include <linux/pfn_t.h>
@@ -26,4 +27,16 @@ struct pmem_device {
long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn);
+
+#ifdef CONFIG_MEMORY_FAILURE
+static inline bool test_and_clear_pmem_poison(struct page *page)
+{
+ return TestClearPageHWPoison(page);
+}
+#else
+static inline bool test_and_clear_pmem_poison(struct page *page)
+{
+ return false;
+}
+#endif
#endif /* __NVDIMM_PMEM_H__ */
diff --git a/fs/dax.c b/fs/dax.c
index 6aafa1c745ef..9b9e95f8cf7f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -230,8 +230,8 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
*
* The function must be called with mapping->tree_lock held.
*/
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void ***slotp)
+static void *__get_unlocked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void ***slotp, bool (*wait_fn)(void))
{
void *entry, **slot;
struct wait_exceptional_entry_queue ewait;
@@ -241,6 +241,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
ewait.wait.func = wake_exceptional_entry_func;
for (;;) {
+ bool revalidate;
+
entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
&slot);
if (!entry ||
@@ -255,14 +257,31 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&mapping->tree_lock);
- schedule();
+ revalidate = wait_fn();
finish_wait(wq, &ewait.wait);
spin_lock_irq(&mapping->tree_lock);
+ if (revalidate)
+ return ERR_PTR(-EAGAIN);
}
}
-static void dax_unlock_mapping_entry(struct address_space *mapping,
- pgoff_t index)
+static bool entry_wait(void)
+{
+ schedule();
+ /*
+ * Never return an ERR_PTR() from
+ * __get_unlocked_mapping_entry(), just keep looping.
+ */
+ return false;
+}
+
+static void *get_unlocked_mapping_entry(struct address_space *mapping,
+ pgoff_t index, void ***slotp)
+{
+ return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
+}
+
+static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
{
void *entry, **slot;
@@ -281,7 +300,7 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
static void put_locked_mapping_entry(struct address_space *mapping,
pgoff_t index)
{
- dax_unlock_mapping_entry(mapping, index);
+ unlock_mapping_entry(mapping, index);
}
/*
@@ -323,18 +342,27 @@ static unsigned long dax_radix_end_pfn(void *entry)
for (pfn = dax_radix_pfn(entry); \
pfn < dax_radix_end_pfn(entry); pfn++)
-static void dax_associate_entry(void *entry, struct address_space *mapping)
+/*
+ * TODO: for reflink+dax we need a way to associate a single page with
+ * multiple address_space instances at different linear_page_index()
+ * offsets.
+ */
+static void dax_associate_entry(void *entry, struct address_space *mapping,
+ struct vm_area_struct *vma, unsigned long address)
{
- unsigned long pfn;
+ unsigned long size = dax_entry_size(entry), pfn, index;
+ int i = 0;
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return;
+ index = linear_page_index(vma, address & ~(size - 1));
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
WARN_ON_ONCE(page->mapping);
page->mapping = mapping;
+ page->index = index + i++;
}
}
@@ -352,6 +380,7 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
page->mapping = NULL;
+ page->index = 0;
}
}
@@ -368,6 +397,84 @@ static struct page *dax_busy_page(void *entry)
return NULL;
}
+static bool entry_wait_revalidate(void)
+{
+ rcu_read_unlock();
+ schedule();
+ rcu_read_lock();
+
+ /*
+ * Tell __get_unlocked_mapping_entry() to take a break, we need
+ * to revalidate page->mapping after dropping locks
+ */
+ return true;
+}
+
+bool dax_lock_mapping_entry(struct page *page)
+{
+ pgoff_t index;
+ struct inode *inode;
+ bool did_lock = false;
+ void *entry = NULL, **slot;
+ struct address_space *mapping;
+
+ rcu_read_lock();
+ for (;;) {
+ mapping = READ_ONCE(page->mapping);
+
+ if (!dax_mapping(mapping))
+ break;
+
+ /*
+ * In the device-dax case there's no need to lock, a
+ * struct dev_pagemap pin is sufficient to keep the
+ * inode alive, and we assume we have dev_pagemap pin
+ * otherwise we would not have a valid pfn_to_page()
+ * translation.
+ */
+ inode = mapping->host;
+ if (S_ISCHR(inode->i_mode)) {
+ did_lock = true;
+ break;
+ }
+
+ spin_lock_irq(&mapping->tree_lock);
+ if (mapping != page->mapping) {
+ spin_unlock_irq(&mapping->tree_lock);
+ continue;
+ }
+ index = page->index;
+
+ entry = __get_unlocked_mapping_entry(mapping, index, &slot,
+ entry_wait_revalidate);
+ if (!entry) {
+ spin_unlock_irq(&mapping->tree_lock);
+ break;
+ } else if (IS_ERR(entry)) {
+ WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
+ continue;
+ }
+ lock_slot(mapping, slot);
+ did_lock = true;
+ spin_unlock_irq(&mapping->tree_lock);
+ break;
+ }
+ rcu_read_unlock();
+
+ return did_lock;
+}
+
+void dax_unlock_mapping_entry(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+
+ if (S_ISCHR(inode->i_mode))
+ return;
+
+ unlock_mapping_entry(mapping, page->index);
+}
+
/*
* Find radix tree entry at given index. If it points to an exceptional entry,
* return it with the radix tree entry locked. If the radix tree doesn't
@@ -715,7 +822,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
new_entry = dax_radix_locked_entry(pfn, flags);
if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
dax_disassociate_entry(entry, mapping, false);
- dax_associate_entry(new_entry, mapping);
+ dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
}
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
diff --git a/include/asm-generic/set_memory.h b/include/asm-generic/set_memory.h
index 83e81f8996b2..8aac9fc1d86b 100644
--- a/include/asm-generic/set_memory.h
+++ b/include/asm-generic/set_memory.h
@@ -9,4 +9,8 @@ int set_memory_rw(unsigned long addr, int numpages);
int set_memory_x(unsigned long addr, int numpages);
int set_memory_nx(unsigned long addr, int numpages);
+/* Stubs to be redefined by arch if it wants */
+#define set_mce_nospec(pfn) (0)
+#define clear_mce_nospec(pfn) (0)
+
#endif
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 0bbd582476d6..ae2d680c03bf 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -19,6 +19,9 @@ struct dax_operations {
/* copy_from_iter: dax-driver override for default copy_from_iter */
size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
struct iov_iter *);
+ /* copy_to_iter: required operation for fs-dax direct-i/o */
+ size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t,
+ struct iov_iter *);
};
extern struct attribute_group dax_attribute_group;
@@ -61,6 +64,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc);
struct page *dax_layout_busy_page(struct address_space *mapping);
+bool dax_lock_mapping_entry(struct page *page);
+void dax_unlock_mapping_entry(struct page *page);
#else
static inline bool bdev_dax_supported(struct block_device *bdev,
int blocksize)
@@ -92,6 +97,17 @@ static inline int dax_writeback_mapping_range(struct address_space *mapping,
{
return -EOPNOTSUPP;
}
+
+static inline bool dax_lock_mapping_entry(struct page *page)
+{
+ if (IS_DAX(page->mapping->host))
+ return true;
+ return false;
+}
+
+static inline void dax_unlock_mapping_entry(struct page *page)
+{
+}
#endif
int dax_read_lock(void);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index d010aec164d5..4356b198fb55 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -132,7 +132,7 @@ typedef int (*dm_busy_fn) (struct dm_target *ti);
*/
typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn);
-typedef size_t (*dm_dax_copy_from_iter_fn)(struct dm_target *ti, pgoff_t pgoff,
+typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i);
#define PAGE_SECTORS (PAGE_SIZE / 512)
@@ -183,7 +183,8 @@ struct target_type {
dm_iterate_devices_fn iterate_devices;
dm_io_hints_fn io_hints;
dm_dax_direct_access_fn direct_access;
- dm_dax_copy_from_iter_fn dax_copy_from_iter;
+ dm_dax_copy_iter_fn dax_copy_from_iter;
+ dm_dax_copy_iter_fn dax_copy_to_iter;
/* For internal device-mapper use. */
struct list_head list;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fca222355c89..86ddf82c56bd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2607,6 +2607,7 @@ enum mf_action_page_type {
MF_MSG_TRUNCATED_LRU,
MF_MSG_BUDDY,
MF_MSG_BUDDY_2ND,
+ MF_MSG_DAX,
MF_MSG_UNKNOWN,
};
diff --git a/mm/madvise.c b/mm/madvise.c
index d676b7f1489e..3147813b6c36 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -615,11 +615,13 @@ static int madvise_inject_error(int behavior,
for (; start < end; start += PAGE_SIZE <<
compound_order(compound_head(page))) {
+ unsigned long pfn;
int ret;
ret = get_user_pages_fast(start, 1, 0, &page);
if (ret != 1)
return ret;
+ pfn = page_to_pfn(page);
if (PageHWPoison(page)) {
put_page(page);
@@ -628,17 +630,25 @@ static int madvise_inject_error(int behavior,
if (behavior == MADV_SOFT_OFFLINE) {
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
- page_to_pfn(page), start);
+ pfn, start);
ret = soft_offline_page(page, MF_COUNT_INCREASED);
if (ret)
return ret;
continue;
}
+
pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
- page_to_pfn(page), start);
+ pfn, start);
- ret = memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED);
+ /*
+ * Drop the page reference taken by get_user_pages_fast(). In
+ * the absence of MF_COUNT_INCREASED the memory_failure()
+ * routine is responsible for pinning the page to prevent it
+ * from being released back to the page allocator.
+ */
+ put_page(page);
+ ret = memory_failure(pfn, 0, 0);
if (ret)
return ret;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c801e6890207..e86cdcec0b7d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -56,6 +56,7 @@
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
+#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include "internal.h"
@@ -175,25 +176,56 @@ int hwpoison_filter(struct page *p)
EXPORT_SYMBOL_GPL(hwpoison_filter);
/*
+ * Kill all processes that have a poisoned page mapped and then isolate
+ * the page.
+ *
+ * General strategy:
+ * Find all processes having the page mapped and kill them.
+ * But we keep a page reference around so that the page is not
+ * actually freed yet.
+ * Then stash the page away
+ *
+ * There's no convenient way to get back to mapped processes
+ * from the VMAs. So do a brute-force search over all
+ * running processes.
+ *
+ * Remember that machine checks are not common (or rather
+ * if they are common you have other problems), so this shouldn't
+ * be a performance issue.
+ *
+ * Also there are some races possible while we get from the
+ * error detection to actually handle it.
+ */
+
+struct to_kill {
+ struct list_head nd;
+ struct task_struct *tsk;
+ unsigned long addr;
+ short size_shift;
+ char addr_valid;
+};
+
+/*
* Send all the processes who have the page mapped a signal.
* ``action optional'' if they are not immediately affected by the error
* ``action required'' if error happened in current execution context
*/
-static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
- unsigned long pfn, struct page *page, int flags)
+static int kill_proc(struct to_kill *tk, int trapno, unsigned long pfn,
+ int flags)
{
struct siginfo si;
+ struct task_struct *t = tk->tsk;
int ret;
pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
si.si_signo = SIGBUS;
si.si_errno = 0;
- si.si_addr = (void *)addr;
+ si.si_addr = (void *)tk->addr;
#ifdef __ARCH_SI_TRAPNO
- si.si_trapno = trapno;
+ si.si_trapno = tk->trapno;
#endif
- si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
+ si.si_addr_lsb = tk->size_shift;
if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
si.si_code = BUS_MCEERR_AR;
@@ -241,34 +273,39 @@ void shake_page(struct page *p, int access)
}
EXPORT_SYMBOL_GPL(shake_page);
-/*
- * Kill all processes that have a poisoned page mapped and then isolate
- * the page.
- *
- * General strategy:
- * Find all processes having the page mapped and kill them.
- * But we keep a page reference around so that the page is not
- * actually freed yet.
- * Then stash the page away
- *
- * There's no convenient way to get back to mapped processes
- * from the VMAs. So do a brute-force search over all
- * running processes.
- *
- * Remember that machine checks are not common (or rather
- * if they are common you have other problems), so this shouldn't
- * be a performance issue.
- *
- * Also there are some races possible while we get from the
- * error detection to actually handle it.
- */
-
-struct to_kill {
- struct list_head nd;
- struct task_struct *tsk;
- unsigned long addr;
- char addr_valid;
-};
+static unsigned long dev_pagemap_mapping_shift(struct page *page,
+ struct vm_area_struct *vma)
+{
+ unsigned long address = vma_address(page, vma);
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(vma->vm_mm, address);
+ if (!pgd_present(*pgd))
+ return 0;
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ return 0;
+ pud = pud_offset(p4d, address);
+ if (!pud_present(*pud))
+ return 0;
+ if (pud_devmap(*pud))
+ return PUD_SHIFT;
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return 0;
+ if (pmd_devmap(*pmd))
+ return PMD_SHIFT;
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte))
+ return 0;
+ if (pte_devmap(*pte))
+ return PAGE_SHIFT;
+ return 0;
+}
/*
* Failure handling: if we can't find or can't kill a process there's
@@ -299,6 +336,10 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
}
tk->addr = page_address_in_vma(p, vma);
tk->addr_valid = 1;
+ if (is_zone_device_page(p))
+ tk->size_shift = dev_pagemap_mapping_shift(p, vma);
+ else
+ tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
/*
* In theory we don't have to kill when the page was
@@ -306,7 +347,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* likely very rare kill anyways just out of paranoia, but use
* a SIGKILL because the error is not contained anymore.
*/
- if (tk->addr == -EFAULT) {
+ if (tk->addr == -EFAULT || tk->size_shift == 0) {
pr_info("Memory failure: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
tk->addr_valid = 0;
@@ -325,8 +366,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* wrong earlier.
*/
static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
- bool fail, struct page *page, unsigned long pfn,
- int flags)
+ bool fail, unsigned long pfn, int flags)
{
struct to_kill *tk, *next;
@@ -349,8 +389,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
* check for that, but we need to tell the
* process anyways.
*/
- else if (kill_proc(tk->tsk, tk->addr, trapno,
- pfn, page, flags) < 0)
+ else if (kill_proc(tk, trapno, pfn, flags) < 0)
pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
}
@@ -521,6 +560,7 @@ static const char * const action_page_types[] = {
[MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
+ [MF_MSG_DAX] = "dax page",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -1005,7 +1045,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* any accesses to the poisoned memory.
*/
forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
- kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags);
+ kill_procs(&tokill, forcekill, trapno, !unmap_success, pfn, flags);
return unmap_success;
}
@@ -1026,6 +1066,84 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)
ClearPageHWPoison(hpage + i);
}
+static int memory_failure_dev_pagemap(unsigned long pfn, int trapno, int flags,
+ struct dev_pagemap *pgmap)
+{
+ struct page *page = pfn_to_page(pfn);
+ const bool unmap_success = true;
+ unsigned long size = 0;
+ struct to_kill *tk;
+ LIST_HEAD(tokill);
+ int rc = -EBUSY;
+ loff_t start;
+
+ /*
+ * Prevent the inode from being freed while we are interrogating
+ * the address_space, typically this would be handled by
+ * lock_page(), but dax pages do not use the page lock. This
+ * also prevents changes to the mapping of this pfn until
+ * poison signaling is complete.
+ */
+ if (!dax_lock_mapping_entry(page))
+ goto out;
+
+ if (hwpoison_filter(page)) {
+ rc = 0;
+ goto unlock;
+ }
+
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_PUBLIC:
+ /*
+ * TODO: Handle HMM pages which may need coordination
+ * with device-side memory.
+ */
+ goto unlock;
+ default:
+ break;
+ }
+
+ /*
+ * Use this flag as an indication that the dax page has been
+ * remapped UC to prevent speculative consumption of poison.
+ */
+ SetPageHWPoison(page);
+
+ /*
+ * Unlike System-RAM there is no possibility to swap in a
+ * different physical page at a given virtual address, so all
+ * userspace consumption of ZONE_DEVICE memory necessitates
+ * SIGBUS (i.e. MF_MUST_KILL)
+ */
+ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+ collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+
+ list_for_each_entry(tk, &tokill, nd)
+ if (tk->size_shift)
+ size = max(size, 1UL << tk->size_shift);
+ if (size) {
+ /*
+ * Unmap the largest mapping to avoid breaking up
+ * device-dax mappings which are constant size. The
+ * actual size of the mapping being torn down is
+ * communicated in siginfo, see kill_proc()
+ */
+ start = (page->index << PAGE_SHIFT) & ~(size - 1);
+ unmap_mapping_range(page->mapping, start, start + size, 0);
+ }
+ kill_procs(&tokill, flags & MF_MUST_KILL, trapno, !unmap_success, pfn,
+ flags);
+ rc = 0;
+unlock:
+ dax_unlock_mapping_entry(page);
+out:
+ /* drop pgmap ref acquired in caller */
+ put_dev_pagemap(pgmap);
+ action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
+ return rc;
+}
+
/**
* memory_failure - Handle memory failure of a page.
* @pfn: Page Number of the corrupted page
@@ -1050,6 +1168,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
struct page *p;
struct page *hpage;
struct page *orig_head;
+ struct dev_pagemap *pgmap;
int res;
unsigned int nr_pages;
unsigned long page_flags;
@@ -1063,6 +1182,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return -ENXIO;
}
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (pgmap)
+ return memory_failure_dev_pagemap(pfn, trapno, flags, pgmap);
+
p = pfn_to_page(pfn);
orig_head = hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
@@ -1766,6 +1889,14 @@ int soft_offline_page(struct page *page, int flags)
int ret;
unsigned long pfn = page_to_pfn(page);
+ if (is_zone_device_page(page)) {
+ pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
+ pfn);
+ if (flags & MF_COUNT_INCREASED)
+ put_page(page);
+ return -EIO;
+ }
+
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
if (flags & MF_COUNT_INCREASED)