Home Home > GIT Browse > stable-xen
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOlaf Hering <ohering@suse.de>2019-10-07 12:53:34 +0200
committerOlaf Hering <ohering@suse.de>2019-10-07 12:53:34 +0200
commit83592698a44914a69bf5aff9884e9560d978c258 (patch)
tree5f141160bbe2ebd9e851d349efe31fb254cc2492
parente3882dd8b8bc50d7e30cf0d4df1853e7508b2599 (diff)
parent37817a44490652c8984e1c5eb7e45f975b2b75d0 (diff)
Merge remote-tracking branch 'kerncvs/SLE12-SP4' into SLE12-SP4-AZURErpm-4.12.14-6.26--sle12-sp4-updatesrpm-4.12.14-6.26
-rw-r--r--arch/powerpc/Kconfig3
-rw-r--r--arch/powerpc/include/asm/asm-prototypes.h11
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush-radix.h7
-rw-r--r--arch/powerpc/include/asm/cputable.h5
-rw-r--r--arch/powerpc/include/asm/hugetlb.h6
-rw-r--r--arch/powerpc/include/asm/opal.h2
-rw-r--r--arch/powerpc/include/asm/page.h1
-rw-r--r--arch/powerpc/kernel/dt_cpu_ftrs.c32
-rw-r--r--arch/powerpc/kernel/irq.c7
-rw-r--r--arch/powerpc/kernel/module_64.c2
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c2
-rw-r--r--arch/powerpc/kvm/book3s_hv.c15
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c42
-rw-r--r--arch/powerpc/mm/hash_native_64.c31
-rw-r--r--arch/powerpc/mm/mem.c49
-rw-r--r--arch/powerpc/mm/tlb-radix.c693
-rw-r--r--arch/powerpc/platforms/powernv/Makefile5
-rw-r--r--arch/powerpc/platforms/powernv/npu-dma.c4
-rw-r--r--arch/powerpc/platforms/powernv/opal-call.c274
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S332
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-memory.c81
-rw-r--r--arch/powerpc/platforms/pseries/lpar.c163
-rw-r--r--arch/powerpc/platforms/pseries/pseries.h1
-rw-r--r--arch/powerpc/platforms/pseries/setup.c1
-rw-r--r--arch/powerpc/sysdev/xive/native.c11
-rw-r--r--arch/x86/kernel/ftrace.c3
-rw-r--r--drivers/char/mem.c6
-rw-r--r--drivers/misc/eeprom/at24.c2
-rw-r--r--drivers/net/ethernet/ibm/ibmvnic.c262
-rw-r--r--drivers/net/ethernet/ibm/ibmvnic.h6
-rw-r--r--drivers/net/ppp/ppp_generic.c2
-rw-r--r--drivers/nvmem/core.c15
-rw-r--r--drivers/scsi/device_handler/scsi_dh_rdac.c2
-rw-r--r--fs/btrfs/qgroup.c5
-rw-r--r--fs/btrfs/relocation.c9
-rw-r--r--fs/ceph/inode.c10
-rw-r--r--fs/ceph/super.c1
-rw-r--r--fs/ceph/super.h1
-rw-r--r--fs/nfs/callback_xdr.c2
-rw-r--r--fs/nfs/delegation.c25
-rw-r--r--fs/nfs/delegation.h2
-rw-r--r--fs/nfs/dir.c295
-rw-r--r--fs/nfs/direct.c27
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c2
-rw-r--r--fs/nfs/fscache.c7
-rw-r--r--fs/nfs/fscache.h2
-rw-r--r--fs/nfs/inode.c1
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4client.c5
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c310
-rw-r--r--fs/nfs/nfs4state.c31
-rw-r--r--fs/nfs/pagelist.c17
-rw-r--r--fs/nfs/pnfs_nfs.c14
-rw-r--r--fs/nfs/super.c4
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/nfsd/nfs4callback.c8
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--include/linux/sunrpc/svc.h2
-rw-r--r--kernel/livepatch/core.c6
-rw-r--r--kernel/resource.c4
-rw-r--r--net/sunrpc/clnt.c3
-rw-r--r--net/sunrpc/svc.c27
64 files changed, 1957 insertions, 952 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d8a598601a1a..871e6719fd3d 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -496,9 +496,6 @@ config ARCH_CPU_PROBE_RELEASE
config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y
-config ARCH_HAS_WALK_MEMORY
- def_bool y
-
config ARCH_ENABLE_MEMORY_HOTREMOVE
def_bool y
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 59d72a25131c..5c18655ed23e 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -38,13 +38,10 @@ extern struct static_key hcall_tracepoint_key;
void __trace_hcall_entry(unsigned long opcode, unsigned long *args);
void __trace_hcall_exit(long opcode, unsigned long retval,
unsigned long *retbuf);
-/* OPAL tracing */
-#ifdef HAVE_JUMP_LABEL
-extern struct static_key opal_tracepoint_key;
-#endif
-
-void __trace_opal_entry(unsigned long opcode, unsigned long *args);
-void __trace_opal_exit(long opcode, unsigned long retval);
+/* OPAL */
+int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
+ int64_t a4, int64_t a5, int64_t a6, int64_t a7,
+ int64_t opcode, uint64_t msr);
/* VMX copying */
int enter_vmx_usercopy(void);
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index fba8c4b2015c..f6262d28dad9 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -44,4 +44,11 @@ extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr);
extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr);
extern void radix__flush_tlb_all(void);
+extern void radix__flush_tlb_lpid_page(unsigned int lpid,
+ unsigned long addr,
+ unsigned long page_size);
+extern void radix__flush_pwc_lpid(unsigned int lpid);
+extern void radix__local_flush_tlb_lpid(unsigned int lpid);
+extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
+
#endif
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index e4527faf3853..a5d3dba5109c 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -212,7 +212,8 @@ static inline void cpu_feature_keys_init(void) { }
#define CPU_FTR_POWER9_DD2_1 LONG_ASM_CONST(0x0000080000000000)
#define CPU_FTR_P9_TM_HV_ASSIST LONG_ASM_CONST(0x0000100000000000)
#define CPU_FTR_P9_TM_XER_SO_BUG LONG_ASM_CONST(0x0000200000000000)
-#define CPU_FTR_P9_TLBIE_BUG LONG_ASM_CONST(0x0000400000000000)
+#define CPU_FTR_P9_TLBIE_STQ_BUG LONG_ASM_CONST(0x0000400000000000)
+#define CPU_FTR_P9_TLBIE_ERAT_BUG LONG_ASM_CONST(0x0001000000000000)
#ifndef __ASSEMBLY__
@@ -460,7 +461,7 @@ static inline void cpu_feature_keys_init(void) { }
CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \
CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | CPU_FTR_PKEY | \
- CPU_FTR_P9_TLBIE_BUG)
+ CPU_FTR_P9_TLBIE_STQ_BUG | CPU_FTR_P9_TLBIE_ERAT_BUG)
#define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9
#define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1)
#define CPU_FTRS_POWER9_DD2_2 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1 | \
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index ecbcafb59714..4aff6ac0ec7b 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -40,12 +40,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma,
return radix__flush_hugetlb_page(vma, vmaddr);
}
-static inline void __local_flush_hugetlb_page(struct vm_area_struct *vma,
- unsigned long vmaddr)
-{
- if (radix_enabled())
- return radix__local_flush_hugetlb_page(vma, vmaddr);
-}
#else
static inline pte_t *hugepd_page(hugepd_t hpd)
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 8eb3ebca02df..163970c56e2f 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -266,7 +266,7 @@ int64_t opal_xive_get_vp_info(uint64_t vp,
int64_t opal_xive_set_vp_info(uint64_t vp,
uint64_t flags,
uint64_t report_cl_pair);
-int64_t opal_xive_allocate_irq(uint32_t chip_id);
+int64_t opal_xive_allocate_irq_raw(uint32_t chip_id);
int64_t opal_xive_free_irq(uint32_t girq);
int64_t opal_xive_sync(uint32_t type, uint32_t id);
int64_t opal_xive_dump(uint32_t type, uint32_t id);
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 40aee9396556..aae13946dbd2 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -319,7 +319,6 @@ struct page;
extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
extern void copy_user_page(void *to, void *from, unsigned long vaddr,
struct page *p);
-extern int page_is_ram(unsigned long pfn);
extern int devmem_is_allowed(unsigned long pfn);
#ifdef CONFIG_PPC_SMLPAR
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 7719896657a7..78e20016bc25 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -690,9 +690,37 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f)
return true;
}
+/*
+ * Handle POWER9 broadcast tlbie invalidation issue using
+ * cpu feature flag.
+ */
+static __init void update_tlbie_feature_flag(unsigned long pvr)
+{
+ if (PVR_VER(pvr) == PVR_POWER9) {
+ /*
+ * Set the tlbie feature flag for anything below
+ * Nimbus DD 2.3 and Cumulus DD 1.3
+ */
+ if ((pvr & 0xe000) == 0) {
+ /* Nimbus */
+ if ((pvr & 0xfff) < 0x203)
+ cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
+ } else if ((pvr & 0xc000) == 0) {
+ /* Cumulus */
+ if ((pvr & 0xfff) < 0x103)
+ cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
+ } else {
+ WARN_ONCE(1, "Unknown PVR");
+ cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG;
+ }
+
+ cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_ERAT_BUG;
+ }
+}
+
static __init void cpufeatures_cpu_quirks(void)
{
- int version = mfspr(SPRN_PVR);
+ unsigned long version = mfspr(SPRN_PVR);
/*
* Not all quirks can be derived from the cpufeatures device tree.
@@ -711,9 +739,9 @@ static __init void cpufeatures_cpu_quirks(void)
if ((version & 0xffff0000) == 0x004e0000) {
cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR);
- cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG;
}
+ update_tlbie_feature_flag(version);
/*
* PKEY was not in the initial base or feature node
* specification, but it should become optional in the next
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 48fda67fb092..505433ea630f 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -265,7 +265,7 @@ notrace void arch_local_irq_restore(unsigned long en)
* warn if we are wrong. Only do that when IRQ tracing
* is enabled as mfmsr() can be costly.
*/
- if (WARN_ON(mfmsr() & MSR_EE))
+ if (WARN_ON_ONCE(mfmsr() & MSR_EE))
__hard_irq_disable();
}
#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -775,11 +775,6 @@ int irq_choose_cpu(const struct cpumask *mask)
}
#endif
-int arch_early_irq_init(void)
-{
- return 0;
-}
-
#ifdef CONFIG_PPC64
static int __init setup_noirqdistrib(char *str)
{
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 20dd82d565e2..96e344ad27f9 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -692,7 +692,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
/*
* If found, replace it with:
* addis r2, r12, (.TOC.-func)@ha
- * addi r2, r12, (.TOC.-func)@l
+ * addi r2, r2, (.TOC.-func)@l
*/
((uint32_t *)location)[0] = 0x3c4c0000 + PPC_HA(value);
((uint32_t *)location)[1] = 0x38420000 + PPC_LO(value);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 5e97d4f836ca..2a42c8e6e522 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -156,7 +156,7 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
asm volatile("ptesync": : :"memory");
asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1)
: : "r" (addr), "r" (kvm->arch.lpid) : "memory");
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG))
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG))
asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1)
: : "r" (addr), "r" (kvm->arch.lpid) : "memory");
asm volatile("eieio ; tlbsync ; ptesync": : :"memory");
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f350c20f5d43..17c684545b85 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2974,25 +2974,26 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
for (sub = 0; sub < core_info.n_subcores; ++sub)
spin_unlock(&core_info.vc[sub]->lock);
- /*
- * Interrupts will be enabled once we get into the guest,
- * so tell lockdep that we're about to enable interrupts.
- */
- trace_hardirqs_on();
-
guest_enter_irqoff();
srcu_idx = srcu_read_lock(&vc->kvm->srcu);
this_cpu_disable_ftrace();
+ /*
+ * Interrupts will be enabled once we get into the guest,
+ * so tell lockdep that we're about to enable interrupts.
+ */
+ trace_hardirqs_on();
+
trap = __kvmppc_vcore_entry();
+ trace_hardirqs_off();
+
this_cpu_enable_ftrace();
srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
- trace_hardirqs_off();
set_irq_happened(trap);
spin_lock(&vc->lock);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 19b1d08dbc90..5e1e62206f1d 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -435,6 +435,37 @@ static inline int is_mmio_hpte(unsigned long v, unsigned long r)
(HPTE_R_KEY_HI | HPTE_R_KEY_LO));
}
+static inline void fixup_tlbie_lpid(unsigned long rb_value, unsigned long lpid)
+{
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ /* Radix flush for a hash guest */
+
+ unsigned long rb,rs,prs,r,ric;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rs = 0; /* lpid = 0 */
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+ ric = 0; /* RIC_FLSUH_TLB */
+
+ /*
+ * Need the extra ptesync to make sure we don't
+ * re-order the tlbie
+ */
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs),
+ "i"(ric), "r"(rs) : "memory");
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
+ "r" (rb_value), "r" (lpid));
+ }
+}
+
static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
long npages, int global, bool need_sync)
{
@@ -453,16 +484,7 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
"r" (rbvalues[i]), "r" (kvm->arch.lpid));
}
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
- /*
- * Need the extra ptesync to make sure we don't
- * re-order the tlbie
- */
- asm volatile("ptesync": : :"memory");
- asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
- "r" (rbvalues[0]), "r" (kvm->arch.lpid));
- }
-
+ fixup_tlbie_lpid(rbvalues[i - 1], kvm->arch.lpid);
asm volatile("eieio; tlbsync; ptesync" : : : "memory");
} else {
if (need_sync)
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 1fd21f191e4c..c433851cfd32 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -201,9 +201,32 @@ static inline unsigned long ___tlbie(unsigned long vpn, int psize,
return va;
}
-static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+static inline void fixup_tlbie_vpn(unsigned long vpn, int psize,
+ int apsize, int ssize)
{
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ /* Radix flush for a hash guest */
+
+ unsigned long rb,rs,prs,r,ric;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rs = 0; /* lpid = 0 */
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+ ric = 0; /* RIC_FLSUH_TLB */
+
+ /*
+ * Need the extra ptesync to make sure we don't
+ * re-order the tlbie
+ */
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs),
+ "i"(ric), "r"(rs) : "memory");
+ }
+
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
/* Need the extra ptesync to ensure we don't reorder tlbie*/
asm volatile("ptesync": : :"memory");
___tlbie(vpn, psize, apsize, ssize);
@@ -287,7 +310,7 @@ static inline void tlbie(unsigned long vpn, int psize, int apsize,
asm volatile("ptesync": : :"memory");
} else {
__tlbie(vpn, psize, apsize, ssize);
- fixup_tlbie(vpn, psize, apsize, ssize);
+ fixup_tlbie_vpn(vpn, psize, apsize, ssize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
if (lock_tlbie && !use_local)
@@ -824,7 +847,7 @@ static void native_flush_hash_range(unsigned long number, int local)
/*
* Just do one more with the last used values.
*/
- fixup_tlbie(vpn, psize, psize, ssize);
+ fixup_tlbie_vpn(vpn, psize, psize, ssize);
asm volatile("eieio; tlbsync; ptesync":::"memory");
if (lock_tlbie)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 4bc55e5ea93c..63ae402dbabc 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -80,21 +80,6 @@ static inline pte_t *virt_to_kpte(unsigned long vaddr)
#define TOP_ZONE ZONE_NORMAL
#endif
-int page_is_ram(unsigned long pfn)
-{
-#ifndef CONFIG_PPC64 /* XXX for now */
- return pfn < max_pfn;
-#else
- unsigned long paddr = (pfn << PAGE_SHIFT);
- struct memblock_region *reg;
-
- for_each_memblock(memory, reg)
- if (paddr >= reg->base && paddr < (reg->base + reg->size))
- return 1;
- return 0;
-#endif
-}
-
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
@@ -177,34 +162,6 @@ int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
-/*
- * walk_memory_resource() needs to make sure there is no holes in a given
- * memory range. PPC64 does not maintain the memory layout in /proc/iomem.
- * Instead it maintains it in memblock.memory structures. Walk through the
- * memory regions, find holes and callback for contiguous regions.
- */
-int
-walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
- void *arg, int (*func)(unsigned long, unsigned long, void *))
-{
- struct memblock_region *reg;
- unsigned long end_pfn = start_pfn + nr_pages;
- unsigned long tstart, tend;
- int ret = -1;
-
- for_each_memblock(memory, reg) {
- tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
- tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
- if (tstart >= tend)
- continue;
- ret = (*func)(tstart, tend - tstart, arg);
- if (ret)
- break;
- }
- return ret;
-}
-EXPORT_SYMBOL_GPL(walk_system_ram_range);
-
#ifndef CONFIG_NEED_MULTIPLE_NODES
void __init initmem_init(void)
{
@@ -592,3 +549,9 @@ int devmem_is_allowed(unsigned long pfn)
return 0;
}
#endif /* CONFIG_STRICT_DEVMEM */
+
+/*
+ * This is defined in kernel/resource.c but only powerpc needs to export it, for
+ * the EHEA driver. Drop this when drivers/net/ethernet/ibm/ehea is removed.
+ */
+EXPORT_SYMBOL_GPL(walk_system_ram_range);
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 672730f4665b..9004bc5bee5a 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -104,6 +104,67 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
trace_tlbie(0, 1, rb, rs, ric, prs, r);
}
+static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rs = pid << PPC_BITLSHIFT(31);
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbiel_lpid(unsigned long lpid, int set,
+ unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rb |= set << PPC_BITLSHIFT(51);
+ rs = 0; /* LPID comes from LPIDR */
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rs = lpid;
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
+ unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rb |= set << PPC_BITLSHIFT(51);
+ rs = 0; /* LPID comes from LPIDR */
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
+
static inline void __tlbiel_va(unsigned long va, unsigned long pid,
unsigned long ap, unsigned long ric)
{
@@ -136,17 +197,104 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid,
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
-static inline void fixup_tlbie(void)
+static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = lpid;
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+
+static inline void fixup_tlbie_va(unsigned long va, unsigned long pid,
+ unsigned long ap)
{
- unsigned long pid = 0;
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, 0, ap, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, pid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_pid(0, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, pid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void fixup_tlbie_pid(unsigned long pid)
+{
+ /*
+ * We can use any address for the invalidation, pick one which is
+ * probably unused as an optimisation.
+ */
unsigned long va = ((1UL << 52) - 1);
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_pid(0, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
asm volatile("ptesync": : :"memory");
__tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
}
}
+
+static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, 0, ap, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void fixup_tlbie_lpid(unsigned long lpid)
+{
+ /*
+ * We can use any address for the invalidation, pick one which is
+ * probably unused as an optimisation.
+ */
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid(0, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+ }
+}
+
/*
* We use 128 set in radix mode and 256 set in hpt mode.
*/
@@ -178,38 +326,187 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
{
- unsigned long rb,rs,prs,r;
+ asm volatile("ptesync": : :"memory");
- rb = PPC_BIT(53); /* IS = 1 */
- rs = pid << PPC_BITLSHIFT(31);
- prs = 1; /* process scoped */
- r = 1; /* radix format */
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time contraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_pid(pid, RIC_FLUSH_TLB);
+ fixup_tlbie_pid(pid);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_pid(pid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_pid(pid, RIC_FLUSH_ALL);
+ fixup_tlbie_pid(pid);
+ }
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
+{
+ int set;
+
+ VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
asm volatile("ptesync": : :"memory");
- asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- fixup_tlbie();
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
+ */
+ __tlbiel_lpid(lpid, 0, ric);
+
+ /* For PWC, only one flush is needed */
+ if (ric == RIC_FLUSH_PWC) {
+ asm volatile("ptesync": : :"memory");
+ return;
+ }
+
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time contraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_lpid(lpid, RIC_FLUSH_TLB);
+ fixup_tlbie_lpid(lpid);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_lpid(lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_lpid(lpid, RIC_FLUSH_ALL);
+ fixup_tlbie_lpid(lpid);
+ }
asm volatile("eieio; tlbsync; ptesync": : :"memory");
- trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
+static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
+{
+ int set;
+
+ VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
+ */
+ __tlbiel_lpid_guest(lpid, 0, ric);
+
+ /* For PWC, only one flush is needed */
+ if (ric == RIC_FLUSH_PWC) {
+ asm volatile("ptesync": : :"memory");
+ return;
+ }
+
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB);
+
+ asm volatile("ptesync": : :"memory");
+}
+
+
+static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
}
static inline void _tlbiel_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
+ unsigned long psize, unsigned long ric)
{
+ unsigned long ap = mmu_get_ap(psize);
+
asm volatile("ptesync": : :"memory");
__tlbiel_va(va, pid, ap, ric);
asm volatile("ptesync": : :"memory");
}
+static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ asm volatile("ptesync": : :"memory");
+ if (also_pwc)
+ __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+ __tlbiel_va_range(start, end, pid, page_size, psize);
+ asm volatile("ptesync": : :"memory");
+}
+
+static inline void __tlbie_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+
+ fixup_tlbie_va_range(addr - page_size, pid, ap);
+}
+
static inline void _tlbie_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
+ unsigned long psize, unsigned long ric)
{
+ unsigned long ap = mmu_get_ap(psize);
+
asm volatile("ptesync": : :"memory");
__tlbie_va(va, pid, ap, ric);
- fixup_tlbie();
+ fixup_tlbie_va(va, pid, ap);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long psize, unsigned long ric)
+{
+ unsigned long ap = mmu_get_ap(psize);
+
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, ap, ric);
+ fixup_tlbie_lpid_va(va, lpid, ap);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
+static inline void _tlbie_va_range(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ asm volatile("ptesync": : :"memory");
+ if (also_pwc)
+ __tlbie_pid(pid, RIC_FLUSH_PWC);
+ __tlbie_va_range(start, end, pid, page_size, psize);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
/*
* Base TLB flushing operations:
@@ -252,12 +549,11 @@ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmadd
int psize)
{
unsigned long pid;
- unsigned long ap = mmu_get_ap(psize);
preempt_disable();
- pid = mm ? mm->context.id : 0;
+ pid = mm->context.id;
if (pid != MMU_NO_CONTEXT)
- _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
preempt_enable();
}
@@ -265,11 +561,10 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
{
#ifdef CONFIG_HUGETLB_PAGE
/* need the return fix for nohash.c */
- if (vma && is_vm_hugetlb_page(vma))
- return __local_flush_hugetlb_page(vma, vmaddr);
+ if (is_vm_hugetlb_page(vma))
+ return radix__local_flush_hugetlb_page(vma, vmaddr);
#endif
- radix__local_flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr,
- mmu_virtual_psize);
+ radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
}
EXPORT_SYMBOL(radix__local_flush_tlb_page);
@@ -288,11 +583,11 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
{
unsigned long pid;
- preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
- goto no_context;
+ return;
+ preempt_disable();
if (!mm_is_thread_local(mm)) {
if (mm_needs_flush_escalation(mm))
_tlbie_pid(pid, RIC_FLUSH_ALL);
@@ -300,7 +595,6 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
_tlbie_pid(pid, RIC_FLUSH_TLB);
} else
_tlbiel_pid(pid, RIC_FLUSH_TLB);
-no_context:
preempt_enable();
}
EXPORT_SYMBOL(radix__flush_tlb_mm);
@@ -309,16 +603,15 @@ void radix__flush_all_mm(struct mm_struct *mm)
{
unsigned long pid;
- preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
- goto no_context;
+ return;
+ preempt_disable();
if (!mm_is_thread_local(mm))
_tlbie_pid(pid, RIC_FLUSH_ALL);
else
_tlbiel_pid(pid, RIC_FLUSH_ALL);
-no_context:
preempt_enable();
}
EXPORT_SYMBOL(radix__flush_all_mm);
@@ -333,28 +626,26 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
int psize)
{
unsigned long pid;
- unsigned long ap = mmu_get_ap(psize);
- preempt_disable();
- pid = mm ? mm->context.id : 0;
+ pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
- goto bail;
+ return;
+
+ preempt_disable();
if (!mm_is_thread_local(mm))
- _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+ _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
else
- _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
-bail:
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
preempt_enable();
}
void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
{
#ifdef CONFIG_HUGETLB_PAGE
- if (vma && is_vm_hugetlb_page(vma))
- return flush_hugetlb_page(vma, vmaddr);
+ if (is_vm_hugetlb_page(vma))
+ return radix__flush_hugetlb_page(vma, vmaddr);
#endif
- radix__flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr,
- mmu_virtual_psize);
+ radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
}
EXPORT_SYMBOL(radix__flush_tlb_page);
@@ -368,17 +659,113 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
}
EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
+#define TLB_FLUSH_ALL -1UL
+
/*
- * Currently, for range flushing, we just do a full mm flush. Because
- * we use this in code path where we don' track the page size.
+ * Number of pages above which we invalidate the entire PID rather than
+ * flush individual pages, for local and global flushes respectively.
+ *
+ * tlbie goes out to the interconnect and individual ops are more costly.
+ * It also does not iterate over sets like the local tlbiel variant when
+ * invalidating a full PID, so it has a far lower threshold to change from
+ * individual page flushes to full-pid flushes.
*/
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
+
+static inline void __radix__flush_tlb_range(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ bool flush_all_sizes)
+
+{
+ unsigned long pid;
+ unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
+ unsigned long page_size = 1UL << page_shift;
+ unsigned long nr_pages = (end - start) >> page_shift;
+ bool local, full;
+
+ pid = mm->context.id;
+ if (unlikely(pid == MMU_NO_CONTEXT))
+ return;
+
+ preempt_disable();
+ if (mm_is_thread_local(mm)) {
+ local = true;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_local_single_page_flush_ceiling);
+ } else {
+ local = false;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_single_page_flush_ceiling);
+ }
+
+ if (full) {
+ if (local) {
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ } else {
+ if (mm_needs_flush_escalation(mm))
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
+ else
+ _tlbie_pid(pid, RIC_FLUSH_TLB);
+ }
+ } else {
+ bool hflush = flush_all_sizes;
+ bool gflush = flush_all_sizes;
+ unsigned long hstart, hend;
+ unsigned long gstart, gend;
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ hflush = true;
+
+ if (hflush) {
+ hstart = (start + PMD_SIZE - 1) & PMD_MASK;
+ hend = end & PMD_MASK;
+ if (hstart == hend)
+ hflush = false;
+ }
+
+ if (gflush) {
+ gstart = (start + PUD_SIZE - 1) & PUD_MASK;
+ gend = end & PUD_MASK;
+ if (gstart == gend)
+ gflush = false;
+ }
+
+ asm volatile("ptesync": : :"memory");
+ if (local) {
+ __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
+ if (hflush)
+ __tlbiel_va_range(hstart, hend, pid,
+ PMD_SIZE, MMU_PAGE_2M);
+ if (gflush)
+ __tlbiel_va_range(gstart, gend, pid,
+ PUD_SIZE, MMU_PAGE_1G);
+ asm volatile("ptesync": : :"memory");
+ } else {
+ __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
+ if (hflush)
+ __tlbie_va_range(hstart, hend, pid,
+ PMD_SIZE, MMU_PAGE_2M);
+ if (gflush)
+ __tlbie_va_range(gstart, gend, pid,
+ PUD_SIZE, MMU_PAGE_1G);
+
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+ }
+ }
+ preempt_enable();
+}
+
void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
- struct mm_struct *mm = vma->vm_mm;
+#ifdef CONFIG_HUGETLB_PAGE
+ if (is_vm_hugetlb_page(vma))
+ return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
- radix__flush_tlb_mm(mm);
+ __radix__flush_tlb_range(vma->vm_mm, start, end, false);
}
EXPORT_SYMBOL(radix__flush_tlb_range);
@@ -397,13 +784,60 @@ static int radix_get_mmu_psize(int page_size)
return psize;
}
+/*
+ * Flush partition scoped LPID address translation for all CPUs.
+ */
+void radix__flush_tlb_lpid_page(unsigned int lpid,
+ unsigned long addr,
+ unsigned long page_size)
+{
+ int psize = radix_get_mmu_psize(page_size);
+
+ _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page);
+
+/*
+ * Flush partition scoped PWC from LPID for all CPUs.
+ */
+void radix__flush_pwc_lpid(unsigned int lpid)
+{
+ _tlbie_lpid(lpid, RIC_FLUSH_PWC);
+}
+EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__local_flush_tlb_lpid(unsigned int lpid)
+{
+ _tlbiel_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
+
+/*
+ * Flush process scoped translations from LPID (=LPIDR).
+ * Important difference, the guest normally manages its own translations,
+ * but some cases e.g., vCPU CPU migration require KVM to flush.
+ */
+void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
+{
+ _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest);
+
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize);
+
void radix__tlb_flush(struct mmu_gather *tlb)
{
int psize = 0;
struct mm_struct *mm = tlb->mm;
int page_size = tlb->page_size;
+ unsigned long start = tlb->start;
+ unsigned long end = tlb->end;
- psize = radix_get_mmu_psize(page_size);
/*
* if page size is not something we understand, do a full mm flush
*
@@ -411,108 +845,135 @@ void radix__tlb_flush(struct mmu_gather *tlb)
* that flushes the process table entry cache upon process teardown.
* See the comment for radix in arch_exit_mmap().
*/
- if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
- radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize);
- else if (tlb->fullmm || tlb->need_flush_all) {
- tlb->need_flush_all = 0;
+ if (tlb->fullmm) {
radix__flush_all_mm(mm);
- } else
- radix__flush_tlb_mm(mm);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+ } else if (mm_tlb_flush_nested(mm)) {
+ /*
+ * If there is a concurrent invalidation that is clearing ptes,
+ * then it's possible this invalidation will miss one of those
+ * cleared ptes and miss flushing the TLB. If this invalidate
+ * returns before the other one flushes TLBs, that can result
+ * in it returning while there are still valid TLBs inside the
+ * range to be invalidated.
+ *
+ * See mm/memory.c:tlb_finish_mmu() for more details.
+ *
+ * The solution to this is ensure the entire range is always
+ * flushed here. The problem for powerpc is that the flushes
+ * are page size specific, so this "forced flush" would not
+ * do the right thing if there are a mix of page sizes in
+ * the range to be invalidated. So use __flush_tlb_range
+ * which invalidates all possible page sizes in the range.
+ *
+ * PWC flush probably is not be required because the core code
+ * shouldn't free page tables in this path, but accounting
+ * for the possibility makes us a bit more robust.
+ *
+ * need_flush_all is an uncommon case because page table
+ * teardown should be done with exclusive locks held (but
+ * after locks are dropped another invalidate could come
+ * in), it could be optimized further if necessary.
+ */
+ if (!tlb->need_flush_all)
+ __radix__flush_tlb_range(mm, start, end, true);
+ else
+ radix__flush_all_mm(mm);
+#endif
+ } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
+ if (!tlb->need_flush_all)
+ radix__flush_tlb_mm(mm);
+ else
+ radix__flush_all_mm(mm);
+ } else {
+ if (!tlb->need_flush_all)
+ radix__flush_tlb_range_psize(mm, start, end, psize);
+ else
+ radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
+ }
+ tlb->need_flush_all = 0;
}
-#define TLB_FLUSH_ALL -1UL
-/*
- * Number of pages above which we will do a bcast tlbie. Just a
- * number at this point copied from x86
- */
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
-
-void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
- unsigned long end, int psize)
+static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ int psize, bool also_pwc)
{
unsigned long pid;
- unsigned long addr;
- int local = mm_is_thread_local(mm);
- unsigned long ap = mmu_get_ap(psize);
- unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
-
+ unsigned int page_shift = mmu_psize_defs[psize].shift;
+ unsigned long page_size = 1UL << page_shift;
+ unsigned long nr_pages = (end - start) >> page_shift;
+ bool local, full;
- preempt_disable();
- pid = mm ? mm->context.id : 0;
+ pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
- goto err_out;
+ return;
- if (end == TLB_FLUSH_ALL ||
- (end - start) > tlb_single_page_flush_ceiling * page_size) {
- if (local) {
- _tlbiel_pid(pid, RIC_FLUSH_TLB);
- } else {
- if (mm_needs_flush_escalation(mm))
- _tlbie_pid(pid, RIC_FLUSH_ALL);
- else
- _tlbie_pid(pid, RIC_FLUSH_TLB);
- }
- goto err_out;
+ preempt_disable();
+ if (mm_is_thread_local(mm)) {
+ local = true;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_local_single_page_flush_ceiling);
+ } else {
+ local = false;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_single_page_flush_ceiling);
}
- asm volatile("ptesync": : :"memory");
- for (addr = start; addr < end; addr += page_size) {
+
+ if (full) {
+ if (!local && mm_needs_flush_escalation(mm))
+ also_pwc = true;
+
if (local)
- __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
- else {
- __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
- fixup_tlbie();
- }
+ _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+ else
+ _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB);
+ } else {
+ if (local)
+ _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
+ else
+ _tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
}
- if (local)
- asm volatile("ptesync": : :"memory");
- else
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
-err_out:
preempt_enable();
}
+void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize)
+{
+ return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
+}
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize)
+{
+ __radix__flush_tlb_range_psize(mm, start, end, psize, true);
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
{
- int local = mm_is_thread_local(mm);
- unsigned long ap = mmu_get_ap(mmu_virtual_psize);
unsigned long pid, end;
-
- pid = mm ? mm->context.id : 0;
- preempt_disable();
+ pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
- goto no_context;
+ return;
/* 4k page size, just blow the world */
if (PAGE_SIZE == 0x1000) {
radix__flush_all_mm(mm);
- preempt_enable();
return;
}
- /* Otherwise first do the PWC */
- if (local)
- _tlbiel_pid(pid, RIC_FLUSH_PWC);
- else
- _tlbie_pid(pid, RIC_FLUSH_PWC);
-
- /* Then iterate the pages */
- asm volatile("ptesync": : :"memory");
end = addr + HPAGE_PMD_SIZE;
- for (; addr < end; addr += PAGE_SIZE) {
- if (local)
- _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
- else
- _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
- }
- if (local)
- asm volatile("ptesync": : :"memory");
- else
- asm volatile("eieio; tlbsync; ptesync": : :"memory");
+ /* Otherwise first do the PWC, then iterate the pages. */
+ preempt_disable();
+
+ if (mm_is_thread_local(mm)) {
+ _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ } else {
+ _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ }
-no_context:
preempt_enable();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index be975390c4a6..67b4950825ec 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,5 +1,5 @@
-obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o
-obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
+obj-y += setup.o opal-call.o opal-wrappers.o opal.o opal-async.o
+obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
@@ -10,6 +10,5 @@ obj-$(CONFIG_CXL_BASE) += pci-cxl.o
obj-$(CONFIG_EEH) += eeh-powernv.o
obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
-obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o
obj-$(CONFIG_OPAL_PRD) += opal-prd.o
obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 8b9042376819..cf3428bf9604 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -46,10 +46,6 @@ static DEFINE_SPINLOCK(npu_context_lock);
*/
#define ATSD_THRESHOLD (2*1024*1024)
-/*
- * Other types of TCE cache invalidation are not functional in the
- * hardware.
- */
static struct pci_dev *get_pci_dev(struct device_node *dn)
{
return PCI_DN(dn)->pcidev;
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
new file mode 100644
index 000000000000..5297676a903f
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/percpu.h>
+#include <linux/jump_label.h>
+#include <asm/opal-api.h>
+#include <asm/trace.h>
+#include <asm/asm-prototypes.h>
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Since the tracing code might execute OPAL calls we need to guard against
+ * recursion.
+ */
+static DEFINE_PER_CPU(unsigned int, opal_trace_depth);
+
+static void __trace_opal_entry(s64 a0, s64 a1, s64 a2, s64 a3,
+ s64 a4, s64 a5, s64 a6, s64 a7,
+ unsigned long opcode)
+{
+ unsigned int *depth;
+ unsigned long args[8];
+
+ depth = this_cpu_ptr(&opal_trace_depth);
+
+ if (*depth)
+ return;
+
+ args[0] = a0;
+ args[1] = a1;
+ args[2] = a2;
+ args[3] = a3;
+ args[4] = a4;
+ args[5] = a5;
+ args[6] = a6;
+ args[7] = a7;
+
+ (*depth)++;
+ trace_opal_entry(opcode, &args[0]);
+ (*depth)--;
+}
+
+static void __trace_opal_exit(unsigned long opcode, unsigned long retval)
+{
+ unsigned int *depth;
+
+ depth = this_cpu_ptr(&opal_trace_depth);
+
+ if (*depth)
+ return;
+
+ (*depth)++;
+ trace_opal_exit(opcode, retval);
+ (*depth)--;
+}
+
+static DEFINE_STATIC_KEY_FALSE(opal_tracepoint_key);
+
+int opal_tracepoint_regfunc(void)
+{
+ static_branch_inc(&opal_tracepoint_key);
+ return 0;
+}
+
+void opal_tracepoint_unregfunc(void)
+{
+ static_branch_dec(&opal_tracepoint_key);
+}
+
+static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3,
+ s64 a4, s64 a5, s64 a6, s64 a7,
+ unsigned long opcode, unsigned long msr)
+{
+ s64 ret;
+
+ __trace_opal_entry(a0, a1, a2, a3, a4, a5, a6, a7, opcode);
+ ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+ __trace_opal_exit(opcode, ret);
+
+ return ret;
+}
+
+#define DO_TRACE (static_branch_unlikely(&opal_tracepoint_key))
+
+#else /* CONFIG_TRACEPOINTS */
+
+static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3,
+ s64 a4, s64 a5, s64 a6, s64 a7,
+ unsigned long opcode, unsigned long msr)
+{
+ return 0;
+}
+
+#define DO_TRACE false
+#endif /* CONFIG_TRACEPOINTS */
+
+static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
+ int64_t a4, int64_t a5, int64_t a6, int64_t a7, int64_t opcode)
+{
+ unsigned long flags;
+ unsigned long msr = mfmsr();
+ bool mmu = (msr & (MSR_IR|MSR_DR));
+ int64_t ret;
+
+ msr &= ~MSR_EE;
+
+ if (unlikely(!mmu))
+ return __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+
+ local_save_flags(flags);
+ hard_irq_disable();
+
+ if (DO_TRACE) {
+ ret = __opal_call_trace(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+ } else {
+ ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+ }
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+#define OPAL_CALL(name, opcode) \
+int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \
+ int64_t a4, int64_t a5, int64_t a6, int64_t a7) \
+{ \
+ return opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode); \
+}
+
+OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL);
+OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE);
+OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ);
+OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE);
+OPAL_CALL(opal_rtc_read, OPAL_RTC_READ);
+OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE);
+OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN);
+OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT);
+OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2);
+OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM);
+OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM);
+OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT);
+OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS);
+OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY);
+OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY);
+OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE);
+OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD);
+OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD);
+OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE);
+OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD);
+OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD);
+OPAL_CALL(opal_set_xive, OPAL_SET_XIVE);
+OPAL_CALL(opal_get_xive, OPAL_GET_XIVE);
+OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
+OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS);
+OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR);
+OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET);
+OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT);
+OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC);
+OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE);
+OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW);
+OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW);
+OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY);
+OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE);
+OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV);
+OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE);
+OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE);
+OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE);
+OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE);
+OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32);
+OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64);
+OPAL_CALL(opal_start_cpu, OPAL_START_CPU);
+OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS);
+OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL);
+OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW);
+OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
+OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET);
+OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA);
+OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA);
+OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB);
+OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT);
+OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR);
+OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS);
+OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS);
+OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS);
+OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED);
+OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR);
+OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL);
+OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI);
+OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2);
+OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ);
+OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE);
+OPAL_CALL(opal_lpc_read, OPAL_LPC_READ);
+OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE);
+OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU);
+OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS);
+OPAL_CALL(opal_read_elog, OPAL_ELOG_READ);
+OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK);
+OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE);
+OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND);
+OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE);
+OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE);
+OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE);
+OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE);
+OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN);
+OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT);
+OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO);
+OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2);
+OPAL_CALL(opal_dump_read, OPAL_DUMP_READ);
+OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK);
+OPAL_CALL(opal_get_msg, OPAL_GET_MSG);
+OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC);
+OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION);
+OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND);
+OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT);
+OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ);
+OPAL_CALL(opal_get_param, OPAL_GET_PARAM);
+OPAL_CALL(opal_set_param, OPAL_SET_PARAM);
+OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI);
+OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE);
+OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG);
+OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION);
+OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION);
+OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE);
+OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO);
+OPAL_CALL(opal_tpo_read, OPAL_READ_TPO);
+OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND);
+OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV);
+OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST);
+OPAL_CALL(opal_flash_read, OPAL_FLASH_READ);
+OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE);
+OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE);
+OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG);
+OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR);
+OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR);
+OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH);
+OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE);
+OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE);
+OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE);
+OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE);
+OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR);
+OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR);
+OPAL_CALL(opal_int_eoi, OPAL_INT_EOI);
+OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR);
+OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL);
+OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR);
+OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET);
+OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO);
+OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO);
+OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
+OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
+OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
+OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
+OPAL_CALL(opal_xive_allocate_irq_raw, OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
+OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
+OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
+OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC);
+OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP);
+OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT);
+OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT);
+OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
+OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT);
+OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START);
+OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP);
+OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P);
+OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP);
+OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP);
+OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO);
+OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO);
+OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR);
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index a4abedc85b60..7cf3a64951b2 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -15,307 +15,53 @@
#include <asm/asm-offsets.h>
#include <asm/opal.h>
- .section ".text"
-
-#ifdef CONFIG_TRACEPOINTS
-#ifdef HAVE_JUMP_LABEL
-#define OPAL_BRANCH(LABEL) \
- ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key)
-#else
-
- .section ".toc","aw"
-
- .globl opal_tracepoint_refcount
-opal_tracepoint_refcount:
- .llong 0
-
- .section ".text"
+ .section ".text"
/*
- * We branch around this in early init by using an unconditional cpu
- * feature.
+ * r3-r10 - OPAL call arguments
+ * STK_PARAM(R11) - OPAL opcode
+ * STK_PARAM(R12) - MSR to restore
*/
-#define OPAL_BRANCH(LABEL) \
-BEGIN_FTR_SECTION; \
- b 1f; \
-END_FTR_SECTION(0, 1); \
- ld r11,opal_tracepoint_refcount@toc(r2); \
- cmpdi r11,0; \
- bne- LABEL; \
-1:
-
-#endif
-
-#else
-#define OPAL_BRANCH(LABEL)
-#endif
-
-/*
- * DO_OPAL_CALL assumes:
- * r0 = opal call token
- * r12 = msr
- * LR has been saved
- */
-#define DO_OPAL_CALL() \
- mfcr r11; \
- stw r11,8(r1); \
- li r11,0; \
- ori r11,r11,MSR_EE; \
- std r12,PACASAVEDMSR(r13); \
- andc r12,r12,r11; \
- mtmsrd r12,1; \
- LOAD_REG_ADDR(r11,opal_return); \
- mtlr r11; \
- li r11,MSR_DR|MSR_IR|MSR_LE;\
- andc r12,r12,r11; \
- mtspr SPRN_HSRR1,r12; \
- LOAD_REG_ADDR(r11,opal); \
- ld r12,8(r11); \
- ld r2,0(r11); \
- mtspr SPRN_HSRR0,r12; \
+_GLOBAL_TOC(__opal_call)
+ mflr r0
+ std r0,PPC_LR_STKOFF(r1)
+ ld r12,STK_PARAM(R12)(r1)
+ li r0,MSR_IR|MSR_DR|MSR_LE
+ andc r12,r12,r0
+ LOAD_REG_ADDR(r11, opal_return)
+ mtlr r11
+ LOAD_REG_ADDR(r11, opal)
+ ld r2,0(r11)
+ ld r11,8(r11)
+ mtspr SPRN_HSRR0,r11
+ mtspr SPRN_HSRR1,r12
+ /* set token to r0 */
+ ld r0,STK_PARAM(R11)(r1)
hrfid
-#define OPAL_CALL(name, token) \
- _GLOBAL_TOC(name); \
- mfmsr r12; \
- mflr r0; \
- andi. r11,r12,MSR_IR|MSR_DR; \
- std r0,PPC_LR_STKOFF(r1); \
- li r0,token; \
- beq opal_real_call; \
- OPAL_BRANCH(opal_tracepoint_entry) \
- DO_OPAL_CALL()
-
-
opal_return:
/*
- * Fixup endian on OPAL return... we should be able to simplify
- * this by instead converting the below trampoline to a set of
- * bytes (always BE) since MSR:LE will end up fixed up as a side
- * effect of the rfid.
+ * Restore MSR on OPAL return. The MSR is set to big-endian.
*/
- FIXUP_ENDIAN
- ld r2,PACATOC(r13);
- lwz r4,8(r1);
- ld r5,PPC_LR_STKOFF(r1);
- ld r6,PACASAVEDMSR(r13);
- mtspr SPRN_SRR0,r5;
- mtspr SPRN_SRR1,r6;
- mtcr r4;
- rfid
-
-opal_real_call:
- mfcr r11
- stw r11,8(r1)
- /* Set opal return address */
- LOAD_REG_ADDR(r11, opal_return_realmode)
- mtlr r11
- li r11,MSR_LE
- andc r12,r12,r11
- mtspr SPRN_HSRR1,r12
- LOAD_REG_ADDR(r11,opal)
- ld r12,8(r11)
- ld r2,0(r11)
- mtspr SPRN_HSRR0,r12
- hrfid
-
-opal_return_realmode:
- FIXUP_ENDIAN
- ld r2,PACATOC(r13);
- lwz r11,8(r1);
- ld r12,PPC_LR_STKOFF(r1)
- mtcr r11;
- mtlr r12
- blr
-
-#ifdef CONFIG_TRACEPOINTS
-opal_tracepoint_entry:
- stdu r1,-STACKFRAMESIZE(r1)
- std r0,STK_REG(R23)(r1)
- std r3,STK_REG(R24)(r1)
- std r4,STK_REG(R25)(r1)
- std r5,STK_REG(R26)(r1)
- std r6,STK_REG(R27)(r1)
- std r7,STK_REG(R28)(r1)
- std r8,STK_REG(R29)(r1)
- std r9,STK_REG(R30)(r1)
- std r10,STK_REG(R31)(r1)
- mr r3,r0
- addi r4,r1,STK_REG(R24)
- bl __trace_opal_entry
- ld r0,STK_REG(R23)(r1)
- ld r3,STK_REG(R24)(r1)
- ld r4,STK_REG(R25)(r1)
- ld r5,STK_REG(R26)(r1)
- ld r6,STK_REG(R27)(r1)
- ld r7,STK_REG(R28)(r1)
- ld r8,STK_REG(R29)(r1)
- ld r9,STK_REG(R30)(r1)
- ld r10,STK_REG(R31)(r1)
-
- /* setup LR so we return via tracepoint_return */
- LOAD_REG_ADDR(r11,opal_tracepoint_return)
- std r11,16(r1)
-
- mfmsr r12
- DO_OPAL_CALL()
-
-opal_tracepoint_return:
- std r3,STK_REG(R31)(r1)
- mr r4,r3
- ld r3,STK_REG(R23)(r1)
- bl __trace_opal_exit
- ld r3,STK_REG(R31)(r1)
- addi r1,r1,STACKFRAMESIZE
- ld r0,16(r1)
+#ifdef __BIG_ENDIAN__
+ ld r11,STK_PARAM(R12)(r1)
+ mtmsrd r11
+#else
+ /* Endian can only be switched with rfi, must byte reverse MSR load */
+ .short 0x4039 /* li r10,STK_PARAM(R12) */
+ .byte (STK_PARAM(R12) >> 8) & 0xff
+ .byte STK_PARAM(R12) & 0xff
+
+ .long 0x280c6a7d /* ldbrx r11,r10,r1 */
+ .long 0x05009f42 /* bcl 20,31,$+4 */
+ .long 0xa602487d /* mflr r10 */
+ .long 0x14004a39 /* addi r10,r10,20 */
+ .long 0xa64b5a7d /* mthsrr0 r10 */
+ .long 0xa64b7b7d /* mthsrr1 r11 */
+ .long 0x2402004c /* hrfid */
+#endif
+ ld r2,PACATOC(r13)
+ ld r0,PPC_LR_STKOFF(r1)
mtlr r0
blr
-#endif
-
-OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL);
-OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE);
-OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ);
-OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE);
-OPAL_CALL(opal_rtc_read, OPAL_RTC_READ);
-OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE);
-OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN);
-OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT);
-OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2);
-OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM);
-OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM);
-OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT);
-OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS);
-OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY);
-OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY);
-OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE);
-OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD);
-OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD);
-OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE);
-OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD);
-OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD);
-OPAL_CALL(opal_set_xive, OPAL_SET_XIVE);
-OPAL_CALL(opal_get_xive, OPAL_GET_XIVE);
-OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
-OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS);
-OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR);
-OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET);
-OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT);
-OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC);
-OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE);
-OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW);
-OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW);
-OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY);
-OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE);
-OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV);
-OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE);
-OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE);
-OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE);
-OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE);
-OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE);
-OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE);
-OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32);
-OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64);
-OPAL_CALL(opal_start_cpu, OPAL_START_CPU);
-OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS);
-OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL);
-OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW);
-OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
-OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET);
-OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA);
-OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA);
-OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB);
-OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT);
-OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR);
-OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS);
-OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS);
-OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS);
-OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED);
-OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR);
-OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL);
-OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI);
-OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2);
-OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ);
-OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE);
-OPAL_CALL(opal_lpc_read, OPAL_LPC_READ);
-OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE);
-OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU);
-OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS);
-OPAL_CALL(opal_read_elog, OPAL_ELOG_READ);
-OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK);
-OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE);
-OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND);
-OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE);
-OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE);
-OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE);
-OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE);
-OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE);
-OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN);
-OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT);
-OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO);
-OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2);
-OPAL_CALL(opal_dump_read, OPAL_DUMP_READ);
-OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK);
-OPAL_CALL(opal_get_msg, OPAL_GET_MSG);
-OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC);
-OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION);
-OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND);
-OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT);
-OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ);
-OPAL_CALL(opal_get_param, OPAL_GET_PARAM);
-OPAL_CALL(opal_set_param, OPAL_SET_PARAM);
-OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI);
-OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE);
-OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG);
-OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION);
-OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION);
-OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE);
-OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO);
-OPAL_CALL(opal_tpo_read, OPAL_READ_TPO);
-OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND);
-OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV);
-OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST);
-OPAL_CALL(opal_flash_read, OPAL_FLASH_READ);
-OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE);
-OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE);
-OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG);
-OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR);
-OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR);
-OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH);
-OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE);
-OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE);
-OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE);
-OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE);
-OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR);
-OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR);
-OPAL_CALL(opal_int_eoi, OPAL_INT_EOI);
-OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR);
-OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL);
-OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR);
-OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET);
-OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO);
-OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG);
-OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG);
-OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO);
-OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
-OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
-OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
-OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
-OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ);
-OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
-OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
-OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
-OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC);
-OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP);
-OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT);
-OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT);
-OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
-OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT);
-OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START);
-OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP);
-OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P);
-OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP);
-OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP);
-OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO);
-OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO);
-OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR);
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index aa82a83e4c39..79aa32aeee4e 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -101,11 +101,12 @@ static struct property *dlpar_clone_property(struct property *prop,
return new_prop;
}
-static u32 find_aa_index(struct device_node *dr_node,
- struct property *ala_prop, const u32 *lmb_assoc)
+static bool find_aa_index(struct device_node *dr_node,
+ struct property *ala_prop,
+ const u32 *lmb_assoc, u32 *aa_index)
{
- u32 *assoc_arrays;
- u32 aa_index;
+ u32 *assoc_arrays, new_prop_size;
+ struct property *new_prop;
int aa_arrays, aa_array_entries, aa_array_sz;
int i, index;
@@ -121,75 +122,68 @@ static u32 find_aa_index(struct device_node *dr_node,
aa_array_entries = be32_to_cpu(assoc_arrays[1]);
aa_array_sz = aa_array_entries * sizeof(u32);
- aa_index = -1;
for (i = 0; i < aa_arrays; i++) {
index = (i * aa_array_entries) + 2;
if (memcmp(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz))
continue;
- aa_index = i;
- break;
+ *aa_index = i;
+ return true;
}
- if (aa_index == -1) {
- struct property *new_prop;
- u32 new_prop_size;
-
- new_prop_size = ala_prop->length + aa_array_sz;
- new_prop = dlpar_clone_property(ala_prop, new_prop_size);
- if (!new_prop)
- return -1;
+ new_prop_size = ala_prop->length + aa_array_sz;
+ new_prop = dlpar_clone_property(ala_prop, new_prop_size);
+ if (!new_prop)
+ return false;
- assoc_arrays = new_prop->value;
+ assoc_arrays = new_prop->value;
- /* increment the number of entries in the lookup array */
- assoc_arrays[0] = cpu_to_be32(aa_arrays + 1);
+ /* increment the number of entries in the lookup array */
+ assoc_arrays[0] = cpu_to_be32(aa_arrays + 1);
- /* copy the new associativity into the lookup array */
- index = aa_arrays * aa_array_entries + 2;
- memcpy(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz);
+ /* copy the new associativity into the lookup array */
+ index = aa_arrays * aa_array_entries + 2;
+ memcpy(&assoc_arrays[index], &lmb_assoc[1], aa_array_sz);
- of_update_property(dr_node, new_prop);
+ of_update_property(dr_node, new_prop);
- /*
- * The associativity lookup array index for this lmb is
- * number of entries - 1 since we added its associativity
- * to the end of the lookup array.
- */
- aa_index = be32_to_cpu(assoc_arrays[0]) - 1;
- }
-
- return aa_index;
+ /*
+ * The associativity lookup array index for this lmb is
+ * number of entries - 1 since we added its associativity
+ * to the end of the lookup array.
+ */
+ *aa_index = be32_to_cpu(assoc_arrays[0]) - 1;
+ return true;
}
-static u32 lookup_lmb_associativity_index(struct drmem_lmb *lmb)
+static u32 lookup_lmb_associativity_index(struct drmem_lmb *lmb, u32 *aa_index)
{
struct device_node *parent, *lmb_node, *dr_node;
struct property *ala_prop;
const u32 *lmb_assoc;
- u32 aa_index;
+ bool found;
parent = of_find_node_by_path("/");
if (!parent)
- return -ENODEV;
+ return false;
lmb_node = dlpar_configure_connector(cpu_to_be32(lmb->drc_index),
parent);
of_node_put(parent);
if (!lmb_node)
- return -EINVAL;
+ return false;
lmb_assoc = of_get_property(lmb_node, "ibm,associativity", NULL);
if (!lmb_assoc) {
dlpar_free_cc_nodes(lmb_node);
- return -ENODEV;
+ return false;
}
dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (!dr_node) {
dlpar_free_cc_nodes(lmb_node);
- return -ENODEV;
+ return false;
}
ala_prop = of_find_property(dr_node, "ibm,associativity-lookup-arrays",
@@ -197,26 +191,27 @@ static u32 lookup_lmb_associativity_index(struct drmem_lmb *lmb)
if (!ala_prop) {
of_node_put(dr_node);
dlpar_free_cc_nodes(lmb_node);
- return -ENODEV;
+ return false;
}
- aa_index = find_aa_index(dr_node, ala_prop, lmb_assoc);
+ found = find_aa_index(dr_node, ala_prop, lmb_assoc, aa_index);
dlpar_free_cc_nodes(lmb_node);
- return aa_index;
+ return found;
}
static int dlpar_add_device_tree_lmb(struct drmem_lmb *lmb)
{
int rc, aa_index;
+ bool found;
lmb->flags |= DRCONF_MEM_ASSIGNED;
- aa_index = lookup_lmb_associativity_index(lmb);
- if (aa_index < 0) {
+ found = lookup_lmb_associativity_index(lmb, &aa_index);
+ if (!found) {
pr_err("Couldn't find associativity index for drc index %x\n",
lmb->drc_index);
- return aa_index;
+ return -ENODEV;
}
lmb->aa_index = aa_index;
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index fa3f19c5abca..ffe9b722ded3 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -64,6 +64,22 @@ EXPORT_SYMBOL(plpar_hcall);
EXPORT_SYMBOL(plpar_hcall9);
EXPORT_SYMBOL(plpar_hcall_norets);
+/*
+ * H_BLOCK_REMOVE supported block size for this page size in segment who's base
+ * page size is that page size.
+ *
+ * The first index is the segment base page size, the second one is the actual
+ * page size.
+ */
+static int hblkrm_size[MMU_PAGE_COUNT][MMU_PAGE_COUNT] __ro_after_init;
+
+/*
+ * Due to the involved complexity, and that the current hypervisor is only
+ * returning this value or 0, we are limiting the support of the H_BLOCK_REMOVE
+ * buffer size to 8 size block.
+ */
+#define HBLKRM_SUPPORTED_BLOCK_SIZE 8
+
void vpa_init(int cpu)
{
int hwcpu = get_hard_smp_processor_id(cpu);
@@ -425,6 +441,17 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
#define HBLKR_CTRL_ERRNOTFOUND 0x8800000000000000UL
#define HBLKR_CTRL_ERRBUSY 0xa000000000000000UL
+/*
+ * Returned true if we are supporting this block size for the specified segment
+ * base page size and actual page size.
+ *
+ * Currently, we only support 8 size block.
+ */
+static inline bool is_supported_hlbkrm(int bpsize, int psize)
+{
+ return (hblkrm_size[bpsize][psize] == HBLKRM_SUPPORTED_BLOCK_SIZE);
+}
+
/**
* H_BLOCK_REMOVE caller.
* @idx should point to the latest @param entry set with a PTEX.
@@ -584,7 +611,8 @@ static inline void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
if (lock_tlbie)
spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
- if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
+ /* Assuming THP size is 16M */
+ if (is_supported_hlbkrm(psize, MMU_PAGE_16M))
hugepage_block_invalidate(slot, vpn, count, psize, ssize);
else
hugepage_bulk_invalidate(slot, vpn, count, psize, ssize);
@@ -753,6 +781,137 @@ static void do_block_remove(unsigned long number, struct ppc64_tlb_batch *batch,
}
/*
+ * TLB Block Invalidate Characteristics
+ *
+ * These characteristics define the size of the block the hcall H_BLOCK_REMOVE
+ * is able to process for each couple segment base page size, actual page size.
+ *
+ * The ibm,get-system-parameter properties is returning a buffer with the
+ * following layout:
+ *
+ * [ 2 bytes size of the RTAS buffer (excluding these 2 bytes) ]
+ * -----------------
+ * TLB Block Invalidate Specifiers:
+ * [ 1 byte LOG base 2 of the TLB invalidate block size being specified ]
+ * [ 1 byte Number of page sizes (N) that are supported for the specified
+ * TLB invalidate block size ]
+ * [ 1 byte Encoded segment base page size and actual page size
+ * MSB=0 means 4k segment base page size and actual page size
+ * MSB=1 the penc value in mmu_psize_def ]
+ * ...
+ * -----------------
+ * Next TLB Block Invalidate Specifiers...
+ * -----------------
+ * [ 0 ]
+ */
+static inline void set_hblkrm_bloc_size(int bpsize, int psize,
+ unsigned int block_size)
+{
+ if (block_size > hblkrm_size[bpsize][psize])
+ hblkrm_size[bpsize][psize] = block_size;
+}
+
+/*
+ * Decode the Encoded segment base page size and actual page size.
+ * PAPR specifies:
+ * - bit 7 is the L bit
+ * - bits 0-5 are the penc value
+ * If the L bit is 0, this means 4K segment base page size and actual page size
+ * otherwise the penc value should be read.
+ */
+#define HBLKRM_L_MASK 0x80
+#define HBLKRM_PENC_MASK 0x3f
+static inline void __init check_lp_set_hblkrm(unsigned int lp,
+ unsigned int block_size)
+{
+ unsigned int bpsize, psize;
+
+ /* First, check the L bit, if not set, this means 4K */
+ if ((lp & HBLKRM_L_MASK) == 0) {
+ set_hblkrm_bloc_size(MMU_PAGE_4K, MMU_PAGE_4K, block_size);
+ return;
+ }
+
+ lp &= HBLKRM_PENC_MASK;
+ for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) {
+ struct mmu_psize_def *def = &mmu_psize_defs[bpsize];
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+ if (def->penc[psize] == lp) {
+ set_hblkrm_bloc_size(bpsize, psize, block_size);
+ return;
+ }
+ }
+ }
+}
+
+#define SPLPAR_TLB_BIC_TOKEN 50
+
+/*
+ * The size of the TLB Block Invalidate Characteristics is variable. But at the
+ * maximum it will be the number of possible page sizes *2 + 10 bytes.
+ * Currently MMU_PAGE_COUNT is 16, which means 42 bytes. Use a cache line size
+ * (128 bytes) for the buffer to get plenty of space.
+ */
+#define SPLPAR_TLB_BIC_MAXLENGTH 128
+
+void __init pseries_lpar_read_hblkrm_characteristics(void)
+{
+ unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH];
+ int call_status, len, idx, bpsize;
+
+ spin_lock(&rtas_data_buf_lock);
+ memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
+ call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+ NULL,
+ SPLPAR_TLB_BIC_TOKEN,
+ __pa(rtas_data_buf),
+ RTAS_DATA_BUF_SIZE);
+ memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH);
+ local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0';
+ spin_unlock(&rtas_data_buf_lock);
+
+ if (call_status != 0) {
+ pr_warn("%s %s Error calling get-system-parameter (0x%x)\n",
+ __FILE__, __func__, call_status);
+ return;
+ }
+
+ /*
+ * The first two (2) bytes of the data in the buffer are the length of
+ * the returned data, not counting these first two (2) bytes.
+ */
+ len = be16_to_cpu(*((u16 *)local_buffer)) + 2;
+ if (len > SPLPAR_TLB_BIC_MAXLENGTH) {
+ pr_warn("%s too large returned buffer %d", __func__, len);
+ return;
+ }
+
+ idx = 2;
+ while (idx < len) {
+ u8 block_shift = local_buffer[idx++];
+ u32 block_size;
+ unsigned int npsize;
+
+ if (!block_shift)
+ break;
+
+ block_size = 1 << block_shift;
+
+ for (npsize = local_buffer[idx++];
+ npsize > 0 && idx < len; npsize--)
+ check_lp_set_hblkrm((unsigned int) local_buffer[idx++],
+ block_size);
+ }
+
+ for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
+ for (idx = 0; idx < MMU_PAGE_COUNT; idx++)
+ if (hblkrm_size[bpsize][idx])
+ pr_info("H_BLOCK_REMOVE supports base psize:%d psize:%d block size:%d",
+ bpsize, idx, hblkrm_size[bpsize][idx]);
+}
+
+/*
* Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
* lock.
*/
@@ -771,7 +930,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
if (lock_tlbie)
spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
- if (firmware_has_feature(FW_FEATURE_BLOCK_REMOVE)) {
+ if (is_supported_hlbkrm(batch->psize, batch->psize)) {
do_block_remove(number, batch, param);
goto out;
}
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 120b132ede25..fcea35f4409d 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -103,5 +103,6 @@ static inline unsigned long cmo_get_page_size(void)
int dlpar_workqueue_init(void);
void pseries_setup_rfi_flush(void);
+void pseries_lpar_read_hblkrm_characteristics(void);
#endif /* _PSERIES_PSERIES_H */
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 076d6e6c6c99..89da2bbe1b39 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -759,6 +759,7 @@ static void __init pSeries_setup_arch(void)
pseries_setup_rfi_flush();
setup_stf_barrier();
+ pseries_lpar_read_hblkrm_characteristics();
/* By default, only probe PCI (can be overridden by rtas_pci) */
pci_add_flags(PCI_PROBE_ONLY);
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index b46030e6f49d..5d127c81d25d 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -249,6 +249,17 @@ static bool xive_native_match(struct device_node *node)
return of_device_is_compatible(node, "ibm,opal-xive-vc");
}
+static s64 opal_xive_allocate_irq(u32 chip_id)
+{
+ s64 irq = opal_xive_allocate_irq_raw(chip_id);
+
+ /*
+ * Old versions of skiboot can incorrectly return 0xffffffff to
+ * indicate no space, fix it up here.
+ */
+ return irq == 0xffffffff ? OPAL_RESOURCE : irq;
+}
+
#ifdef CONFIG_SMP
static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc)
{
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 4dd696907d3c..c98f757395a2 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -21,7 +21,6 @@
#include <linux/init.h>
#include <linux/list.h>
#include <linux/module.h>
-#include <linux/memory.h>
#include <trace/syscall.h>
@@ -35,7 +34,6 @@
int ftrace_arch_code_modify_prepare(void)
{
- mutex_lock(&text_mutex);
set_kernel_text_rw();
set_all_modules_text_rw();
return 0;
@@ -45,7 +43,6 @@ int ftrace_arch_code_modify_post_process(void)
{
set_all_modules_text_ro();
set_kernel_text_ro();
- mutex_unlock(&text_mutex);
return 0;
}
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 682b24ebb604..9ae30c60bebe 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -196,9 +196,6 @@ static ssize_t write_mem(struct file *file, const char __user *buf,
if (p != *ppos)
return -EFBIG;
- if (kernel_is_locked_down())
- return -EPERM;
-
if (!valid_phys_addr_range(p, count))
return -EFAULT;
@@ -560,9 +557,6 @@ static ssize_t write_kmem(struct file *file, const char __user *buf,
char *kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */
int err = 0;
- if (kernel_is_locked_down())
- return -EPERM;
-
if (p < (unsigned long) high_memory) {
unsigned long to_write = min_t(unsigned long, count,
(unsigned long)high_memory - p);
diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c
index 4cc0b42f2acc..7568213f3c0f 100644
--- a/drivers/misc/eeprom/at24.c
+++ b/drivers/misc/eeprom/at24.c
@@ -769,7 +769,7 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id)
at24->nvmem_config.name = dev_name(&client->dev);
at24->nvmem_config.dev = &client->dev;
at24->nvmem_config.read_only = !writable;
- at24->nvmem_config.root_only = true;
+ at24->nvmem_config.root_only = !(chip.flags & AT24_FLAG_IRUGO);
at24->nvmem_config.owner = THIS_MODULE;
at24->nvmem_config.compat = true;
at24->nvmem_config.base_dev = &client->dev;
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4c1b1ad782a4..557bf4ccdc77 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1218,7 +1218,7 @@ static void ibmvnic_cleanup(struct net_device *netdev)
struct ibmvnic_adapter *adapter = netdev_priv(netdev);
/* ensure that transmissions are stopped if called by do_reset */
- if (adapter->resetting)
+ if (test_bit(0, &adapter->resetting))
netif_tx_disable(netdev);
else
netif_tx_stop_all_queues(netdev);
@@ -1439,7 +1439,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
u8 proto = 0;
netdev_tx_t ret = NETDEV_TX_OK;
- if (adapter->resetting) {
+ if (test_bit(0, &adapter->resetting)) {
if (!netif_subqueue_stopped(netdev, skb))
netif_stop_subqueue(netdev, queue_num);
dev_kfree_skb_any(skb);
@@ -1735,6 +1735,86 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p)
}
/**
+ * do_change_param_reset returns zero if we are able to keep processing reset
+ * events, or non-zero if we hit a fatal error and must halt.
+ */
+static int do_change_param_reset(struct ibmvnic_adapter *adapter,
+ struct ibmvnic_rwi *rwi,
+ u32 reset_state)
+{
+ struct net_device *netdev = adapter->netdev;
+ int i, rc;
+
+ netdev_dbg(adapter->netdev, "Change param resetting driver (%d)\n",
+ rwi->reset_reason);
+
+ netif_carrier_off(netdev);
+ adapter->reset_reason = rwi->reset_reason;
+
+ ibmvnic_cleanup(netdev);
+
+ if (reset_state == VNIC_OPEN) {
+ rc = __ibmvnic_close(netdev);
+ if (rc)
+ return rc;
+ }
+
+ release_resources(adapter);
+ release_sub_crqs(adapter, 1);
+ release_crq_queue(adapter);
+
+ adapter->state = VNIC_PROBED;
+
+ rc = init_crq_queue(adapter);
+
+ if (rc) {
+ netdev_err(adapter->netdev,
+ "Couldn't initialize crq. rc=%d\n", rc);
+ return rc;
+ }
+
+ rc = ibmvnic_reset_init(adapter);
+ if (rc)
+ return IBMVNIC_INIT_FAILED;
+
+ /* If the adapter was in PROBE state prior to the reset,
+ * exit here.
+ */
+ if (reset_state == VNIC_PROBED)
+ return 0;
+
+ rc = ibmvnic_login(netdev);
+ if (rc) {
+ adapter->state = reset_state;
+ return rc;
+ }
+
+ rc = init_resources(adapter);
+ if (rc)
+ return rc;
+
+ ibmvnic_disable_irqs(adapter);
+
+ adapter->state = VNIC_CLOSED;
+
+ if (reset_state == VNIC_CLOSED)
+ return 0;
+
+ rc = __ibmvnic_open(netdev);
+ if (rc)
+ return IBMVNIC_OPEN_FAILED;
+
+ /* refresh device's multicast list */
+ ibmvnic_set_multi(netdev);
+
+ /* kick napi */
+ for (i = 0; i < adapter->req_rx_queues; i++)
+ napi_schedule(&adapter->napi[i]);
+
+ return 0;
+}
+
+/**
* do_reset returns zero if we are able to keep processing reset events, or
* non-zero if we hit a fatal error and must halt.
*/
@@ -1749,6 +1829,8 @@ static int do_reset(struct ibmvnic_adapter *adapter,
netdev_dbg(adapter->netdev, "Re-setting driver (%d)\n",
rwi->reset_reason);
+ rtnl_lock();
+
netif_carrier_off(netdev);
adapter->reset_reason = rwi->reset_reason;
@@ -1762,16 +1844,25 @@ static int do_reset(struct ibmvnic_adapter *adapter,
if (reset_state == VNIC_OPEN &&
adapter->reset_reason != VNIC_RESET_MOBILITY &&
adapter->reset_reason != VNIC_RESET_FAILOVER) {
- rc = __ibmvnic_close(netdev);
+ adapter->state = VNIC_CLOSING;
+
+ /* Release the RTNL lock before link state change and
+ * re-acquire after the link state change to allow
+ * linkwatch_event to grab the RTNL lock and run during
+ * a reset.
+ */
+ rtnl_unlock();
+ rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN);
+ rtnl_lock();
if (rc)
- return rc;
- }
+ goto out;
- if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM ||
- adapter->wait_for_reset) {
- release_resources(adapter);
- release_sub_crqs(adapter, 1);
- release_crq_queue(adapter);
+ if (adapter->state != VNIC_CLOSING) {
+ rc = -1;
+ goto out;
+ }
+
+ adapter->state = VNIC_CLOSED;
}
if (adapter->reset_reason != VNIC_RESET_NON_FATAL) {
@@ -1780,9 +1871,7 @@ static int do_reset(struct ibmvnic_adapter *adapter,
*/
adapter->state = VNIC_PROBED;
- if (adapter->wait_for_reset) {
- rc = init_crq_queue(adapter);
- } else if (adapter->reset_reason == VNIC_RESET_MOBILITY) {
+ if (adapter->reset_reason == VNIC_RESET_MOBILITY) {
rc = ibmvnic_reenable_crq_queue(adapter);
release_sub_crqs(adapter, 1);
} else {
@@ -1794,36 +1883,35 @@ static int do_reset(struct ibmvnic_adapter *adapter,
if (rc) {
netdev_err(adapter->netdev,
"Couldn't initialize crq. rc=%d\n", rc);
- return rc;
+ goto out;
}
rc = ibmvnic_reset_init(adapter);
- if (rc)
- return IBMVNIC_INIT_FAILED;
+ if (rc) {
+ rc = IBMVNIC_INIT_FAILED;
+ goto out;
+ }
/* If the adapter was in PROBE state prior to the reset,
* exit here.
*/
- if (reset_state == VNIC_PROBED)
- return 0;
+ if (reset_state == VNIC_PROBED) {
+ rc = 0;
+ goto out;
+ }
rc = ibmvnic_login(netdev);
if (rc) {
adapter->state = reset_state;
- return rc;
+ goto out;
}
- if (adapter->reset_reason == VNIC_RESET_CHANGE_PARAM ||
- adapter->wait_for_reset) {
- rc = init_resources(adapter);
- if (rc)
- return rc;
- } else if (adapter->req_rx_queues != old_num_rx_queues ||
- adapter->req_tx_queues != old_num_tx_queues ||
- adapter->req_rx_add_entries_per_subcrq !=
- old_num_rx_slots ||
- adapter->req_tx_entries_per_subcrq !=
- old_num_tx_slots) {
+ if (adapter->req_rx_queues != old_num_rx_queues ||
+ adapter->req_tx_queues != old_num_tx_queues ||
+ adapter->req_rx_add_entries_per_subcrq !=
+ old_num_rx_slots ||
+ adapter->req_tx_entries_per_subcrq !=
+ old_num_tx_slots) {
release_rx_pools(adapter);
release_tx_pools(adapter);
release_napi(adapter);
@@ -1831,32 +1919,30 @@ static int do_reset(struct ibmvnic_adapter *adapter,
rc = init_resources(adapter);
if (rc)
- return rc;
+ goto out;
} else {
rc = reset_tx_pools(adapter);
if (rc)
- return rc;
+ goto out;
rc = reset_rx_pools(adapter);
if (rc)
- return rc;
+ goto out;
}
ibmvnic_disable_irqs(adapter);
}
adapter->state = VNIC_CLOSED;
- if (reset_state == VNIC_CLOSED)
- return 0;
+ if (reset_state == VNIC_CLOSED) {
+ rc = 0;
+ goto out;
+ }
rc = __ibmvnic_open(netdev);
if (rc) {
- if (list_empty(&adapter->rwi_list))
- adapter->state = VNIC_CLOSED;
- else
- adapter->state = reset_state;
-
- return 0;
+ rc = IBMVNIC_OPEN_FAILED;
+ goto out;
}
/* refresh device's multicast list */
@@ -1866,11 +1952,15 @@ static int do_reset(struct ibmvnic_adapter *adapter,
for (i = 0; i < adapter->req_rx_queues; i++)
napi_schedule(&adapter->napi[i]);
- if (adapter->reset_reason != VNIC_RESET_FAILOVER &&
- adapter->reset_reason != VNIC_RESET_CHANGE_PARAM)
+ if (adapter->reset_reason != VNIC_RESET_FAILOVER)
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, netdev);
- return 0;
+ rc = 0;
+
+out:
+ rtnl_unlock();
+
+ return rc;
}
static int do_hard_reset(struct ibmvnic_adapter *adapter,
@@ -1930,14 +2020,8 @@ static int do_hard_reset(struct ibmvnic_adapter *adapter,
return 0;
rc = __ibmvnic_open(netdev);
- if (rc) {
- if (list_empty(&adapter->rwi_list))
- adapter->state = VNIC_CLOSED;
- else
- adapter->state = reset_state;
-
- return 0;
- }
+ if (rc)
+ return IBMVNIC_OPEN_FAILED;
return 0;
}
@@ -1976,20 +2060,17 @@ static void __ibmvnic_reset(struct work_struct *work)
{
struct ibmvnic_rwi *rwi;
struct ibmvnic_adapter *adapter;
- bool we_lock_rtnl = false;
u32 reset_state;
int rc = 0;
adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset);
- /* netif_set_real_num_xx_queues needs to take rtnl lock here
- * unless wait_for_reset is set, in which case the rtnl lock
- * has already been taken before initializing the reset
- */
- if (!adapter->wait_for_reset) {
- rtnl_lock();
- we_lock_rtnl = true;
+ if (test_and_set_bit_lock(0, &adapter->resetting)) {
+ schedule_delayed_work(&adapter->ibmvnic_delayed_reset,
+ IBMVNIC_RESET_DELAY);
+ return;
}
+
reset_state = adapter->state;
rwi = get_next_rwi(adapter);
@@ -2001,22 +2082,43 @@ static void __ibmvnic_reset(struct work_struct *work)
break;
}
- if (adapter->force_reset_recovery) {
- adapter->force_reset_recovery = false;
- rc = do_hard_reset(adapter, rwi, reset_state);
+ if (rwi->reset_reason == VNIC_RESET_CHANGE_PARAM) {
+ /* CHANGE_PARAM requestor holds rtnl_lock */
+ rc = do_change_param_reset(adapter, rwi, reset_state);
+ } else if (adapter->force_reset_recovery) {
+ /* Transport event occurred during previous reset */
+ if (adapter->wait_for_reset) {
+ /* Previous was CHANGE_PARAM; caller locked */
+ adapter->force_reset_recovery = false;
+ rc = do_hard_reset(adapter, rwi, reset_state);
+ } else {
+ rtnl_lock();
+ adapter->force_reset_recovery = false;
+ rc = do_hard_reset(adapter, rwi, reset_state);
+ rtnl_unlock();
+ }
} else {
rc = do_reset(adapter, rwi, reset_state);
}
kfree(rwi);
- if (rc && rc != IBMVNIC_INIT_FAILED &&
+ if (rc == IBMVNIC_OPEN_FAILED) {
+ if (list_empty(&adapter->rwi_list))
+ adapter->state = VNIC_CLOSED;
+ else
+ adapter->state = reset_state;
+ rc = 0;
+ } else if (rc && rc != IBMVNIC_INIT_FAILED &&
!adapter->force_reset_recovery)
break;
rwi = get_next_rwi(adapter);
+
+ if (rwi && (rwi->reset_reason == VNIC_RESET_FAILOVER ||
+ rwi->reset_reason == VNIC_RESET_MOBILITY))
+ adapter->force_reset_recovery = true;
}
if (adapter->wait_for_reset) {
- adapter->wait_for_reset = false;
adapter->reset_done_rc = rc;
complete(&adapter->reset_done);
}
@@ -2026,9 +2128,16 @@ static void __ibmvnic_reset(struct work_struct *work)
free_all_rwi(adapter);
}
- adapter->resetting = false;
- if (we_lock_rtnl)
- rtnl_unlock();
+ clear_bit_unlock(0, &adapter->resetting);
+}
+
+static void __ibmvnic_delayed_reset(struct work_struct *work)
+{
+ struct ibmvnic_adapter *adapter;
+
+ adapter = container_of(work, struct ibmvnic_adapter,
+ ibmvnic_delayed_reset.work);
+ __ibmvnic_reset(&adapter->ibmvnic_reset);
}
static int ibmvnic_reset(struct ibmvnic_adapter *adapter,
@@ -2083,14 +2192,11 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter,
rwi->reset_reason = reason;
list_add_tail(&rwi->list, &adapter->rwi_list);
spin_unlock_irqrestore(&adapter->rwi_lock, flags);
- adapter->resetting = true;
netdev_dbg(adapter->netdev, "Scheduling reset (reason %d)\n", reason);
schedule_work(&adapter->ibmvnic_reset);
return 0;
err:
- if (adapter->wait_for_reset)
- adapter->wait_for_reset = false;
return -ret;
}
@@ -2130,7 +2236,7 @@ restart_poll:
u16 offset;
u8 flags = 0;
- if (unlikely(adapter->resetting &&
+ if (unlikely(test_bit(0, &adapter->resetting) &&
adapter->reset_reason != VNIC_RESET_NON_FATAL)) {
enable_scrq_irq(adapter, adapter->rx_scrq[scrq_num]);
napi_complete_done(napi, frames_processed);
@@ -2781,7 +2887,7 @@ static int enable_scrq_irq(struct ibmvnic_adapter *adapter,
return 1;
}
- if (adapter->resetting &&
+ if (test_bit(0, &adapter->resetting) &&
adapter->reset_reason == VNIC_RESET_MOBILITY) {
u64 val = (0xff000000) | scrq->hw_irq;
@@ -3331,7 +3437,7 @@ static int ibmvnic_send_crq(struct ibmvnic_adapter *adapter,
if (rc) {
if (rc == H_CLOSED) {
dev_warn(dev, "CRQ Queue closed\n");
- if (adapter->resetting)
+ if (test_bit(0, &adapter->resetting))
ibmvnic_reset(adapter, VNIC_RESET_FATAL);
}
@@ -4406,7 +4512,7 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
case IBMVNIC_CRQ_XPORT_EVENT:
netif_carrier_off(netdev);
adapter->crq.active = false;
- if (adapter->resetting)
+ if (test_bit(0, &adapter->resetting))
adapter->force_reset_recovery = true;
if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) {
dev_info(dev, "Migrated, re-enabling adapter\n");
@@ -4744,7 +4850,7 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter)
return -1;
}
- if (adapter->resetting && !adapter->wait_for_reset &&
+ if (test_bit(0, &adapter->resetting) && !adapter->wait_for_reset &&
adapter->reset_reason != VNIC_RESET_MOBILITY) {
if (adapter->req_rx_queues != old_num_rx_queues ||
adapter->req_tx_queues != old_num_tx_queues) {
@@ -4856,10 +4962,12 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
spin_lock_init(&adapter->stats_lock);
INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset);
+ INIT_DELAYED_WORK(&adapter->ibmvnic_delayed_reset,
+ __ibmvnic_delayed_reset);
INIT_LIST_HEAD(&adapter->rwi_list);
spin_lock_init(&adapter->rwi_lock);
init_completion(&adapter->init_done);
- adapter->resetting = false;
+ clear_bit(0, &adapter->resetting);
do {
rc = init_crq_queue(adapter);
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index dcf2eb6d9290..ef050f948128 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -31,6 +31,7 @@
#define IBMVNIC_INVALID_MAP -1
#define IBMVNIC_STATS_TIMEOUT 1
#define IBMVNIC_INIT_FAILED 2
+#define IBMVNIC_OPEN_FAILED 3
/* basic structures plus 100 2k buffers */
#define IBMVNIC_IO_ENTITLEMENT_DEFAULT 610305
@@ -49,6 +50,8 @@
#define IBMVNIC_MAX_LTB_SIZE ((1 << (MAX_ORDER - 1)) * PAGE_SIZE)
#define IBMVNIC_BUFFER_HLEN 500
+#define IBMVNIC_RESET_DELAY 100
+
static const char ibmvnic_priv_flags[][ETH_GSTRING_LEN] = {
#define IBMVNIC_USE_SERVER_MAXES 0x1
"use-server-maxes"
@@ -1087,7 +1090,8 @@ struct ibmvnic_adapter {
spinlock_t rwi_lock;
struct list_head rwi_list;
struct work_struct ibmvnic_reset;
- bool resetting;
+ struct delayed_work ibmvnic_delayed_reset;
+ unsigned long resetting;
bool napi_enabled, from_passive_init;
bool failover_pending;
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index 937a5f577da8..fa20a36edfcd 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -1432,6 +1432,8 @@ static void __ppp_xmit_process(struct ppp *ppp, struct sk_buff *skb)
netif_wake_queue(ppp->dev);
else
netif_stop_queue(ppp->dev);
+ } else {
+ kfree_skb(skb);
}
ppp_xmit_unlock(ppp);
}
diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index 45f28dec29a0..6345a0ec6a42 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -401,10 +401,17 @@ static int nvmem_setup_compat(struct nvmem_device *nvmem,
if (!config->base_dev)
return -EINVAL;
- if (nvmem->read_only)
- nvmem->eeprom = bin_attr_ro_root_nvmem;
- else
- nvmem->eeprom = bin_attr_rw_root_nvmem;
+ if (nvmem->read_only) {
+ if (config->root_only)
+ nvmem->eeprom = bin_attr_ro_root_nvmem;
+ else
+ nvmem->eeprom = bin_attr_ro_nvmem;
+ } else {
+ if (config->root_only)
+ nvmem->eeprom = bin_attr_rw_root_nvmem;
+ else
+ nvmem->eeprom = bin_attr_rw_nvmem;
+ }
nvmem->eeprom.attr.name = "eeprom";
nvmem->eeprom.size = nvmem->size;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c
index 7af31a1247ee..fe8ac0d4dca3 100644
--- a/drivers/scsi/device_handler/scsi_dh_rdac.c
+++ b/drivers/scsi/device_handler/scsi_dh_rdac.c
@@ -546,6 +546,8 @@ static void send_mode_select(struct work_struct *work)
spin_unlock(&ctlr->ms_lock);
retry:
+ memset(cdb, 0, sizeof(cdb));
+
data_size = rdac_failover_get(ctlr, &list, cdb);
RDAC_LOG(RDAC_LOG_FAILOVER, sdev, "array %s, ctlr %d, "
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9471e0376104..6c6b4a908ca9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3452,6 +3452,9 @@ cleanup:
clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
GFP_NOFS);
+ /* Also free data bytes of already reserved one */
+ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
+ orig_reserved, BTRFS_QGROUP_RSV_DATA);
extent_changeset_release(reserved);
return ret;
}
@@ -3496,7 +3499,7 @@ static int qgroup_free_reserved_data(struct inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree,
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
free_start, free_start + free_len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 6f0f602d1013..94651898c559 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1478,6 +1478,13 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
int clear_rsv = 0;
int ret;
+ /*
+ * The subvolume has reloc tree but the swap is finished, no need to
+ * create/update the dead reloc tree
+ */
+ if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
+ return 0;
+
if (root->reloc_root) {
reloc_root = root->reloc_root;
reloc_root->last_trans = trans->transid;
@@ -2228,7 +2235,6 @@ static int clean_dirty_subvols(struct reloc_control *rc)
/* Merged subvolume, cleanup its reloc root */
struct btrfs_root *reloc_root = root->reloc_root;
- clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
list_del_init(&root->reloc_dirty_list);
root->reloc_root = NULL;
if (reloc_root) {
@@ -2237,6 +2243,7 @@ static int clean_dirty_subvols(struct reloc_control *rc)
if (ret2 < 0 && !ret)
ret = ret2;
}
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
btrfs_put_fs_root(root);
} else {
/* Orphan reloc tree, just clean it up */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 3b31de59fa50..559be941aefd 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -527,13 +527,16 @@ static void ceph_i_callback(struct rcu_head *head)
kmem_cache_free(ceph_inode_cachep, ci);
}
-void ceph_destroy_inode(struct inode *inode)
+void ceph_evict_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_frag *frag;
struct rb_node *n;
- dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+ dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
ceph_fscache_unregister_inode_cookie(ci);
@@ -575,7 +578,10 @@ void ceph_destroy_inode(struct inode *inode)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+}
+void ceph_destroy_inode(struct inode *inode)
+{
call_rcu(&inode->i_rcu, ceph_i_callback);
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9a6102a0413c..8073f6ceb3ec 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -821,6 +821,7 @@ static const struct super_operations ceph_super_ops = {
.destroy_inode = ceph_destroy_inode,
.write_inode = ceph_write_inode,
.drop_inode = ceph_drop_inode,
+ .evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
.remount_fs = ceph_remount,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b8b059016884..2a544060f016 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -854,6 +854,7 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
+extern void ceph_evict_inode(struct inode *inode);
extern void ceph_destroy_inode(struct inode *inode);
extern int ceph_drop_inode(struct inode *inode);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 2b07d3ef5a5e..1a121ccf4bb8 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -931,7 +931,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
out_invalidcred:
pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
- return rpc_autherr_badcred;
+ return svc_return_autherr(rqstp, rpc_autherr_badcred);
}
/*
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index a4d21aa24566..ee4c35ccc472 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -152,7 +152,7 @@ again:
/* Block nfs4_proc_unlck */
mutex_lock(&sp->so_delegreturn_mutex);
seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
- err = nfs4_open_delegation_recall(ctx, state, stateid, type);
+ err = nfs4_open_delegation_recall(ctx, state, stateid);
if (!err)
err = nfs_delegation_claim_locks(ctx, state, stateid);
if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
@@ -991,6 +991,22 @@ void nfs_mark_test_expired_all_delegations(struct nfs_client *clp)
rcu_read_unlock();
}
+static void
+nfs_delegation_test_free_expired(struct inode *inode,
+ nfs4_stateid *stateid,
+ struct rpc_cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops;
+ int status;
+
+ if (!cred)
+ return;
+ status = ops->test_and_free_expired(server, stateid, cred);
+ if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
+ nfs_remove_bad_delegation(inode, stateid);
+}
+
/**
* nfs_reap_expired_delegations - reap expired delegations
* @clp: nfs_client to process
@@ -1002,7 +1018,6 @@ void nfs_mark_test_expired_all_delegations(struct nfs_client *clp)
*/
void nfs_reap_expired_delegations(struct nfs_client *clp)
{
- const struct nfs4_minor_version_ops *ops = clp->cl_mvops;
struct nfs_delegation *delegation;
struct nfs_server *server;
struct inode *inode;
@@ -1033,11 +1048,7 @@ restart:
nfs4_stateid_copy(&stateid, &delegation->stateid);
clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
rcu_read_unlock();
- if (cred != NULL &&
- ops->test_and_free_expired(server, &stateid, cred) < 0) {
- nfs_revoke_delegation(inode, &stateid);
- nfs_inode_find_state_and_recover(inode, &stateid);
- }
+ nfs_delegation_test_free_expired(inode, &stateid, cred);
put_rpccred(cred);
if (nfs4_server_rebooted(clp)) {
nfs_inode_mark_test_expired_delegation(server,inode);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index a289fd424019..7a22c3f7540e 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -59,7 +59,7 @@ void nfs_reap_expired_delegations(struct nfs_client *clp);
/* NFSv4 delegation-related procedures */
int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
-int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 349e9993d9cd..4e46628b8f37 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1059,6 +1059,100 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
}
+static int
+nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
+ struct inode *inode, int error)
+{
+ switch (error) {
+ case 1:
+ dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
+ __func__, dentry);
+ return 1;
+ case 0:
+ nfs_mark_for_revalidate(dir);
+ if (inode && S_ISDIR(inode->i_mode)) {
+ /* Purge readdir caches. */
+ nfs_zap_caches(inode);
+ /*
+ * We can't d_drop the root of a disconnected tree:
+ * its d_hash is on the s_anon list and d_drop() would hide
+ * it from shrink_dcache_for_unmount(), leading to busy
+ * inodes on unmount and further oopses.
+ */
+ if (IS_ROOT(dentry))
+ return 1;
+ }
+ dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
+ __func__, dentry);
+ return 0;
+ }
+ dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
+ __func__, dentry, error);
+ return error;
+}
+
+static int
+nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ int ret = 1;
+ if (nfs_neg_need_reval(dir, dentry, flags)) {
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ ret = 0;
+ }
+ return nfs_lookup_revalidate_done(dir, dentry, NULL, ret);
+}
+
+static int
+nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
+{
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+}
+
+static int
+nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
+ struct inode *inode)
+{
+ struct nfs_fh *fhandle;
+ struct nfs_fattr *fattr;
+ struct nfs4_label *label;
+ int ret;
+
+ ret = -ENOMEM;
+ fhandle = nfs_alloc_fhandle();
+ fattr = nfs_alloc_fattr();
+ label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+ if (fhandle == NULL || fattr == NULL || IS_ERR(label))
+ goto out;
+
+ ret = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+ if (ret < 0) {
+ if (ret == -ESTALE || ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
+ ret = 0;
+ if (nfs_compare_fh(NFS_FH(inode), fhandle))
+ goto out;
+ if (nfs_refresh_inode(inode, fattr) < 0)
+ goto out;
+
+ nfs_setsecurity(inode, fattr, label);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+
+ /* set a readdirplus hint that we had a cache miss */
+ nfs_force_use_readdirplus(dir);
+ ret = 1;
+out:
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fhandle);
+ nfs4_label_free(label);
+ return nfs_lookup_revalidate_done(dir, dentry, inode, ret);
+}
+
/*
* This is called every time the dcache has a lookup hit,
* and we should check whether we can really trust that
@@ -1070,58 +1164,36 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
* If the parent directory is seen to have changed, we throw out the
* cached dentry and do a new lookup.
*/
-static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int
+nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
{
- struct inode *dir;
struct inode *inode;
- struct dentry *parent;
- struct nfs_fh *fhandle = NULL;
- struct nfs_fattr *fattr = NULL;
- struct nfs4_label *label = NULL;
int error;
- if (flags & LOOKUP_RCU) {
- parent = READ_ONCE(dentry->d_parent);
- dir = d_inode_rcu(parent);
- if (!dir)
- return -ECHILD;
- } else {
- parent = dget_parent(dentry);
- dir = d_inode(parent);
- }
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
inode = d_inode(dentry);
- if (!inode) {
- if (nfs_neg_need_reval(dir, dentry, flags)) {
- if (flags & LOOKUP_RCU)
- return -ECHILD;
- goto out_bad;
- }
- goto out_valid;
- }
+ if (!inode)
+ return nfs_lookup_revalidate_negative(dir, dentry, flags);
if (is_bad_inode(inode)) {
- if (flags & LOOKUP_RCU)
- return -ECHILD;
dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
__func__, dentry);
goto out_bad;
}
if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
- goto out_set_verifier;
+ return nfs_lookup_revalidate_delegated(dir, dentry, inode);
/* Force a full look up iff the parent directory has changed */
if (!nfs_is_exclusive_create(dir, flags) &&
nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
error = nfs_lookup_verify_inode(inode, flags);
if (error) {
- if (flags & LOOKUP_RCU)
- return -ECHILD;
if (error == -ESTALE)
- goto out_zap_parent;
- goto out_error;
+ nfs_zap_caches(dir);
+ goto out_bad;
}
nfs_advise_use_readdirplus(dir);
goto out_valid;
@@ -1133,81 +1205,45 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (NFS_STALE(inode))
goto out_bad;
- error = -ENOMEM;
- fhandle = nfs_alloc_fhandle();
- fattr = nfs_alloc_fattr();
- if (fhandle == NULL || fattr == NULL)
- goto out_error;
-
- label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
- if (IS_ERR(label))
- goto out_error;
-
trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
- error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
+ error = nfs_lookup_revalidate_dentry(dir, dentry, inode);
trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);
- if (error == -ESTALE || error == -ENOENT)
- goto out_bad;
- if (error)
- goto out_error;
- if (nfs_compare_fh(NFS_FH(inode), fhandle))
- goto out_bad;
- if ((error = nfs_refresh_inode(inode, fattr)) != 0)
- goto out_bad;
-
- nfs_setsecurity(inode, fattr, label);
-
- nfs_free_fattr(fattr);
- nfs_free_fhandle(fhandle);
- nfs4_label_free(label);
+ return error;
+out_valid:
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
+out_bad:
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ return nfs_lookup_revalidate_done(dir, dentry, inode, 0);
+}
- /* set a readdirplus hint that we had a cache miss */
- nfs_force_use_readdirplus(dir);
+static int
+__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
+ int (*reval)(struct inode *, struct dentry *, unsigned int))
+{
+ struct dentry *parent;
+ struct inode *dir;
+ int ret;
-out_set_verifier:
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- out_valid:
if (flags & LOOKUP_RCU) {
+ parent = READ_ONCE(dentry->d_parent);
+ dir = d_inode_rcu(parent);
+ if (!dir)
+ return -ECHILD;
+ ret = reval(dir, dentry, flags);
if (parent != READ_ONCE(dentry->d_parent))
return -ECHILD;
- } else
+ } else {
+ parent = dget_parent(dentry);
+ ret = reval(d_inode(parent), dentry, flags);
dput(parent);
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
- __func__, dentry);
- return 1;
-out_zap_parent:
- nfs_zap_caches(dir);
- out_bad:
- WARN_ON(flags & LOOKUP_RCU);
- nfs_free_fattr(fattr);
- nfs_free_fhandle(fhandle);
- nfs4_label_free(label);
- nfs_mark_for_revalidate(dir);
- if (inode && S_ISDIR(inode->i_mode)) {
- /* Purge readdir caches. */
- nfs_zap_caches(inode);
- /*
- * We can't d_drop the root of a disconnected tree:
- * its d_hash is on the s_anon list and d_drop() would hide
- * it from shrink_dcache_for_unmount(), leading to busy
- * inodes on unmount and further oopses.
- */
- if (IS_ROOT(dentry))
- goto out_valid;
}
- dput(parent);
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
- __func__, dentry);
- return 0;
-out_error:
- WARN_ON(flags & LOOKUP_RCU);
- nfs_free_fattr(fattr);
- nfs_free_fhandle(fhandle);
- nfs4_label_free(label);
- dput(parent);
- dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
- __func__, dentry, error);
- return error;
+ return ret;
+}
+
+static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate);
}
/*
@@ -1558,62 +1594,55 @@ no_open:
}
EXPORT_SYMBOL_GPL(nfs_atomic_open);
-static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+static int
+nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
{
struct inode *inode;
- int ret = 0;
if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
- goto no_open;
+ goto full_reval;
if (d_mountpoint(dentry))
- goto no_open;
- if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1)
- goto no_open;
+ goto full_reval;
inode = d_inode(dentry);
/* We can't create new files in nfs_open_revalidate(), so we
* optimize away revalidation of negative dentries.
*/
- if (inode == NULL) {
- struct dentry *parent;
- struct inode *dir;
-
- if (flags & LOOKUP_RCU) {
- parent = READ_ONCE(dentry->d_parent);
- dir = d_inode_rcu(parent);
- if (!dir)
- return -ECHILD;
- } else {
- parent = dget_parent(dentry);
- dir = d_inode(parent);
- }
- if (!nfs_neg_need_reval(dir, dentry, flags))
- ret = 1;
- else if (flags & LOOKUP_RCU)
- ret = -ECHILD;
- if (!(flags & LOOKUP_RCU))
- dput(parent);
- else if (parent != READ_ONCE(dentry->d_parent))
- return -ECHILD;
- goto out;
- }
+ if (inode == NULL)
+ goto full_reval;
+
+ if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ))
+ return nfs_lookup_revalidate_delegated(dir, dentry, inode);
/* NFS only supports OPEN on regular files */
if (!S_ISREG(inode->i_mode))
- goto no_open;
+ goto full_reval;
+
/* We cannot do exclusive creation on a positive dentry */
- if (flags & LOOKUP_EXCL)
- goto no_open;
+ if (flags & (LOOKUP_EXCL | LOOKUP_REVAL))
+ goto reval_dentry;
+
+ /* Check if the directory changed */
+ if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU))
+ goto reval_dentry;
/* Let f_op->open() actually open (and revalidate) the file */
- ret = 1;
+ return 1;
+reval_dentry:
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+ return nfs_lookup_revalidate_dentry(dir, dentry, inode);
-out:
- return ret;
+full_reval:
+ return nfs_do_lookup_revalidate(dir, dentry, flags);
+}
-no_open:
- return nfs_lookup_revalidate(dentry, flags);
+static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ return __nfs_lookup_revalidate(dentry, flags,
+ nfs4_do_lookup_revalidate);
}
#endif /* CONFIG_NFSV4 */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f6e68e953967..03fed7e18e75 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -397,15 +397,21 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
unsigned long bytes = 0;
struct nfs_direct_req *dreq = hdr->dreq;
- if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
- goto out_put;
-
spin_lock(&dreq->lock);
- if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
+ if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
dreq->error = hdr->error;
- else
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ spin_unlock(&dreq->lock);
+ goto out_put;
+ }
+
+ if (hdr->good_bytes != 0)
nfs_direct_good_bytes(dreq, hdr);
+ if (test_bit(NFS_IOHDR_EOF, &hdr->flags))
+ dreq->error = 0;
+
spin_unlock(&dreq->lock);
while (!list_empty(&hdr->pages)) {
@@ -768,16 +774,19 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
bool request_commit = false;
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
- if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
- goto out_put;
-
nfs_init_cinfo_from_dreq(&cinfo, dreq);
spin_lock(&dreq->lock);
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
dreq->error = hdr->error;
- if (dreq->error == 0) {
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ spin_unlock(&dreq->lock);
+ goto out_put;
+ }
+
+ if (hdr->good_bytes != 0) {
nfs_direct_good_bytes(dreq, hdr);
if (nfs_write_need_commit(hdr)) {
if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index ca0b0e03619f..f37fbc2dbbae 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -932,7 +932,7 @@ fl_pnfs_update_layout(struct inode *ino,
status = filelayout_check_deviceid(lo, fl, gfp_flags);
if (status) {
pnfs_put_lseg(lseg);
- lseg = ERR_PTR(status);
+ lseg = NULL;
}
out:
return lseg;
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index f32c58bbe556..37cbe316d224 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -305,7 +305,7 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
if (status == 0)
return 0;
- if (mirror->mirror_ds == NULL)
+ if (IS_ERR_OR_NULL(mirror->mirror_ds))
return -EINVAL;
dserr = kmalloc(sizeof(*dserr), gfp_flags);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index d63bea8bbfbb..6367e7fd530c 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -71,6 +71,10 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int
struct rb_node **p, *parent;
int diff;
+ nfss->fscache_key = NULL;
+ nfss->fscache = NULL;
+ if (!(nfss->options & NFS_OPTION_FSCACHE))
+ return;
if (!uniq) {
uniq = "";
ulen = 1;
@@ -180,10 +184,11 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
*/
void nfs_fscache_init_inode(struct inode *inode)
{
+ struct nfs_server *nfss = NFS_SERVER(inode);
struct nfs_inode *nfsi = NFS_I(inode);
nfsi->fscache = NULL;
- if (!S_ISREG(inode->i_mode))
+ if (!(nfss->fscache && S_ISREG(inode->i_mode)))
return;
nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
&nfs_fscache_inode_object_def,
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index d7fe3e799f2f..7ead2158fe63 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -171,7 +171,7 @@ static inline void nfs_fscache_wait_on_invalidate(struct inode *inode)
*/
static inline const char *nfs_server_fscache_state(struct nfs_server *server)
{
- if (server->fscache && (server->options & NFS_OPTION_FSCACHE))
+ if (server->fscache)
return "yes";
return "no ";
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e906c5bcd01b..a9094c1c543b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1006,6 +1006,7 @@ int nfs_open(struct inode *inode, struct file *filp)
nfs_fscache_open_file(inode, filp);
return 0;
}
+EXPORT_SYMBOL_GPL(nfs_open);
/*
* This function is called whenever some part of NFS notices that
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c96280495e6e..808a1fe88cec 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -161,6 +161,7 @@ enum {
NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */
NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */
NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */
+ NFS_STATE_CHANGE_WAIT, /* A state changing operation is outstanding */
};
struct nfs4_state {
@@ -432,7 +433,8 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session,
extern struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *, gfp_t);
extern void nfs4_put_state_owner(struct nfs4_state_owner *);
-extern void nfs4_purge_state_owners(struct nfs_server *);
+extern void nfs4_purge_state_owners(struct nfs_server *, struct list_head *);
+extern void nfs4_free_state_owners(struct list_head *head);
extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
extern void nfs4_put_open_state(struct nfs4_state *);
extern void nfs4_close_state(struct nfs4_state *, fmode_t);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index c5d3c0b4e54c..cd24080d13ba 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -744,9 +744,12 @@ out:
static void nfs4_destroy_server(struct nfs_server *server)
{
+ LIST_HEAD(freeme);
+
nfs_server_return_all_delegations(server);
unset_pnfs_layoutdriver(server);
- nfs4_purge_state_owners(server);
+ nfs4_purge_state_owners(server, &freeme);
+ nfs4_free_state_owners(&freeme);
}
/*
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 0efba77789b9..298df1db77a4 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -49,7 +49,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
return err;
if ((openflags & O_ACCMODE) == 3)
- openflags--;
+ return nfs_open(inode, filp);
/* We can't create new files here */
openflags &= ~(O_CREAT|O_EXCL);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 672875b37460..7c8d66c489ae 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1396,6 +1396,25 @@ static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state)
}
#endif /* CONFIG_NFS_V4_1 */
+static void nfs_state_log_update_open_stateid(struct nfs4_state *state)
+{
+ if (test_and_clear_bit(NFS_STATE_CHANGE_WAIT, &state->flags))
+ wake_up_bit(&state->flags, NFS_STATE_CHANGE_WAIT);
+}
+
+static void nfs_state_log_out_of_order_open_stateid(struct nfs4_state *state,
+ const nfs4_stateid *stateid)
+{
+ u32 state_seqid = be32_to_cpu(state->open_stateid.seqid);
+ u32 stateid_seqid = be32_to_cpu(stateid->seqid);
+
+ if (stateid_seqid == state_seqid + 1U ||
+ (stateid_seqid == 1U && state_seqid == 0xffffffffU))
+ nfs_state_log_update_open_stateid(state);
+ else
+ set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
+}
+
static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
{
struct nfs_client *clp = state->owner->so_server->nfs_client;
@@ -1411,18 +1430,32 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
nfs4_state_mark_reclaim_nograce(clp, state);
}
+/*
+ * Check for whether or not the caller may update the open stateid
+ * to the value passed in by stateid.
+ *
+ * Note: This function relies heavily on the server implementing
+ * RFC7530 Section 9.1.4.2, and RFC5661 Section 8.2.2
+ * correctly.
+ * i.e. The stateid seqids have to be initialised to 1, and
+ * are then incremented on every state transition.
+ */
static bool nfs_need_update_open_stateid(struct nfs4_state *state,
- const nfs4_stateid *stateid, nfs4_stateid *freeme)
+ const nfs4_stateid *stateid)
{
- if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
- return true;
- if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
- nfs4_stateid_copy(freeme, &state->open_stateid);
- nfs_test_and_clear_all_open_stateid(state);
+ if (test_bit(NFS_OPEN_STATE, &state->flags) == 0 ||
+ !nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ if (stateid->seqid == cpu_to_be32(1))
+ nfs_state_log_update_open_stateid(state);
+ else
+ set_bit(NFS_STATE_CHANGE_WAIT, &state->flags);
return true;
}
- if (nfs4_stateid_is_newer(stateid, &state->open_stateid))
+
+ if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+ nfs_state_log_out_of_order_open_stateid(state, stateid);
return true;
+ }
return false;
}
@@ -1461,11 +1494,13 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
if (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
!nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
nfs_resync_open_stateid_locked(state);
- return;
+ goto out;
}
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
nfs4_stateid_copy(&state->stateid, stateid);
nfs4_stateid_copy(&state->open_stateid, stateid);
+out:
+ nfs_state_log_update_open_stateid(state);
}
static void nfs_clear_open_stateid(struct nfs4_state *state,
@@ -1482,29 +1517,67 @@ static void nfs_clear_open_stateid(struct nfs4_state *state,
}
static void nfs_set_open_stateid_locked(struct nfs4_state *state,
- const nfs4_stateid *stateid, fmode_t fmode,
- nfs4_stateid *freeme)
+ const nfs4_stateid *stateid, nfs4_stateid *freeme)
{
- switch (fmode) {
- case FMODE_READ:
- set_bit(NFS_O_RDONLY_STATE, &state->flags);
+ struct wait_queue_head *wq_head = bit_waitqueue(&state->flags,
+ NFS_STATE_CHANGE_WAIT);
+ DEFINE_WAIT_BIT(wbq_entry, &state->flags, NFS_STATE_CHANGE_WAIT);
+ int status = 0;
+ for (;;) {
+
+ if (!nfs_need_update_open_stateid(state, stateid))
+ return;
+ if (!test_bit(NFS_STATE_CHANGE_WAIT, &state->flags))
break;
- case FMODE_WRITE:
- set_bit(NFS_O_WRONLY_STATE, &state->flags);
+ if (status)
break;
- case FMODE_READ|FMODE_WRITE:
- set_bit(NFS_O_RDWR_STATE, &state->flags);
+ /* Rely on seqids for serialisation with NFSv4.0 */
+ if (!nfs4_has_session(NFS_SERVER(state->inode)->nfs_client))
+ break;
+
+ prepare_to_wait(wq_head, &wbq_entry.wq_entry, TASK_KILLABLE);
+ /*
+ * Ensure we process the state changes in the same order
+ * in which the server processed them by delaying the
+ * update of the stateid until we are in sequence.
+ */
+ write_sequnlock(&state->seqlock);
+ spin_unlock(&state->owner->so_lock);
+ rcu_read_unlock();
+ if (!signal_pending(current)) {
+ if (schedule_timeout(5*HZ) == 0)
+ status = -EAGAIN;
+ else
+ status = 0;
+ } else
+ status = -EINTR;
+ finish_wait(wq_head, &wbq_entry.wq_entry);
+ rcu_read_lock();
+ spin_lock(&state->owner->so_lock);
+ write_seqlock(&state->seqlock);
}
- if (!nfs_need_update_open_stateid(state, stateid, freeme))
- return;
+
+ if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ nfs4_stateid_copy(freeme, &state->open_stateid);
+ nfs_test_and_clear_all_open_stateid(state);
+ }
+
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
nfs4_stateid_copy(&state->stateid, stateid);
nfs4_stateid_copy(&state->open_stateid, stateid);
+ nfs_state_log_update_open_stateid(state);
+}
+
+static void nfs_state_clear_open_state_flags(struct nfs4_state *state)
+{
+ clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ clear_bit(NFS_OPEN_STATE, &state->flags);
}
-static void __update_open_stateid(struct nfs4_state *state,
+static void nfs_state_set_open_stateid(struct nfs4_state *state,
const nfs4_stateid *open_stateid,
- const nfs4_stateid *deleg_stateid,
fmode_t fmode,
nfs4_stateid *freeme)
{
@@ -1512,17 +1585,42 @@ static void __update_open_stateid(struct nfs4_state *state,
* Protect the call to nfs4_state_set_mode_locked and
* serialise the stateid update
*/
- spin_lock(&state->owner->so_lock);
write_seqlock(&state->seqlock);
- if (deleg_stateid != NULL) {
- nfs4_stateid_copy(&state->stateid, deleg_stateid);
- set_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs_set_open_stateid_locked(state, open_stateid, freeme);
+ switch (fmode) {
+ case FMODE_READ:
+ set_bit(NFS_O_RDONLY_STATE, &state->flags);
+ break;
+ case FMODE_WRITE:
+ set_bit(NFS_O_WRONLY_STATE, &state->flags);
+ break;
+ case FMODE_READ|FMODE_WRITE:
+ set_bit(NFS_O_RDWR_STATE, &state->flags);
}
- if (open_stateid != NULL)
- nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme);
+ set_bit(NFS_OPEN_STATE, &state->flags);
+ write_sequnlock(&state->seqlock);
+}
+
+static void nfs_state_set_delegation(struct nfs4_state *state,
+ const nfs4_stateid *deleg_stateid,
+ fmode_t fmode)
+{
+ /*
+ * Protect the call to nfs4_state_set_mode_locked and
+ * serialise the stateid update
+ */
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, deleg_stateid);
+ set_bit(NFS_DELEGATED_STATE, &state->flags);
+ write_sequnlock(&state->seqlock);
+}
+
+static void nfs_state_clear_delegation(struct nfs4_state *state)
+{
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
write_sequnlock(&state->seqlock);
- update_open_stateflags(state, fmode);
- spin_unlock(&state->owner->so_lock);
}
static int update_open_stateid(struct nfs4_state *state,
@@ -1540,6 +1638,12 @@ static int update_open_stateid(struct nfs4_state *state,
fmode &= (FMODE_READ|FMODE_WRITE);
rcu_read_lock();
+ spin_lock(&state->owner->so_lock);
+ if (open_stateid != NULL) {
+ nfs_state_set_open_stateid(state, open_stateid, fmode, &freeme);
+ ret = 1;
+ }
+
deleg_cur = rcu_dereference(nfsi->delegation);
if (deleg_cur == NULL)
goto no_delegation;
@@ -1556,18 +1660,16 @@ static int update_open_stateid(struct nfs4_state *state,
goto no_delegation_unlock;
nfs_mark_delegation_referenced(deleg_cur);
- __update_open_stateid(state, open_stateid, &deleg_cur->stateid,
- fmode, &freeme);
+ nfs_state_set_delegation(state, &deleg_cur->stateid, fmode);
ret = 1;
no_delegation_unlock:
spin_unlock(&deleg_cur->lock);
no_delegation:
+ if (ret)
+ update_open_stateflags(state, fmode);
+ spin_unlock(&state->owner->so_lock);
rcu_read_unlock();
- if (!ret && open_stateid != NULL) {
- __update_open_stateid(state, open_stateid, NULL, fmode, &freeme);
- ret = 1;
- }
if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
nfs4_schedule_state_manager(clp);
if (freeme.type != 0)
@@ -1714,8 +1816,9 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
if (data->o_res.delegation_type != 0)
nfs4_opendata_check_deleg(data, state);
update:
- update_open_stateid(state, &data->o_res.stateid, NULL,
- data->o_arg.fmode);
+ if (!update_open_stateid(state, &data->o_res.stateid,
+ NULL, data->o_arg.fmode))
+ return ERR_PTR(-EAGAIN);
atomic_inc(&state->count);
return state;
@@ -1747,9 +1850,11 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
goto err_put_inode;
if (data->o_res.delegation_type != 0)
nfs4_opendata_check_deleg(data, state);
- update_open_stateid(state, &data->o_res.stateid, NULL,
- data->o_arg.fmode);
- iput(inode);
+ if (!update_open_stateid(state, &data->o_res.stateid,
+ NULL, data->o_arg.fmode)) {
+ nfs4_put_open_state(state);
+ state = ERR_PTR(-EAGAIN);
+ }
out:
nfs_release_seqid(data->o_arg.seqid);
return state;
@@ -1835,13 +1940,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
{
int ret;
- /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */
- clear_bit(NFS_O_RDWR_STATE, &state->flags);
- clear_bit(NFS_O_WRONLY_STATE, &state->flags);
- clear_bit(NFS_O_RDONLY_STATE, &state->flags);
/* memory barrier prior to reading state->n_* */
- clear_bit(NFS_DELEGATED_STATE, &state->flags);
- clear_bit(NFS_OPEN_STATE, &state->flags);
smp_rmb();
ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE);
if (ret != 0)
@@ -1917,6 +2016,8 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta
ctx = nfs4_state_find_open_context(state);
if (IS_ERR(ctx))
return -EAGAIN;
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs_state_clear_open_state_flags(state);
ret = nfs4_do_open_reclaim(ctx, state);
put_nfs_open_context(ctx);
return ret;
@@ -1938,12 +2039,10 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
case -NFS4ERR_BAD_HIGH_SLOT:
case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
case -NFS4ERR_DEADSESSION:
- set_bit(NFS_DELEGATED_STATE, &state->flags);
nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
return -EAGAIN;
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_STALE_STATEID:
- set_bit(NFS_DELEGATED_STATE, &state->flags);
/* Don't recall a delegation if it was lost */
nfs4_schedule_lease_recovery(server->nfs_client);
return -EAGAIN;
@@ -1964,7 +2063,6 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
return -EAGAIN;
case -NFS4ERR_DELAY:
case -NFS4ERR_GRACE:
- set_bit(NFS_DELEGATED_STATE, &state->flags);
ssleep(1);
return -EAGAIN;
case -ENOMEM:
@@ -1980,8 +2078,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
}
int nfs4_open_delegation_recall(struct nfs_open_context *ctx,
- struct nfs4_state *state, const nfs4_stateid *stateid,
- fmode_t type)
+ struct nfs4_state *state, const nfs4_stateid *stateid)
{
struct nfs_server *server = NFS_SERVER(state->inode);
struct nfs4_opendata *opendata;
@@ -1992,22 +2089,23 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx,
if (IS_ERR(opendata))
return PTR_ERR(opendata);
nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
- write_seqlock(&state->seqlock);
- nfs4_stateid_copy(&state->stateid, &state->open_stateid);
- write_sequnlock(&state->seqlock);
- clear_bit(NFS_DELEGATED_STATE, &state->flags);
- switch (type & (FMODE_READ|FMODE_WRITE)) {
- case FMODE_READ|FMODE_WRITE:
- case FMODE_WRITE:
+ if (!test_bit(NFS_O_RDWR_STATE, &state->flags)) {
err = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE);
if (err)
- break;
+ goto out;
+ }
+ if (!test_bit(NFS_O_WRONLY_STATE, &state->flags)) {
err = nfs4_open_recover_helper(opendata, FMODE_WRITE);
if (err)
- break;
- case FMODE_READ:
+ goto out;
+ }
+ if (!test_bit(NFS_O_RDONLY_STATE, &state->flags)) {
err = nfs4_open_recover_helper(opendata, FMODE_READ);
+ if (err)
+ goto out;
}
+ nfs_state_clear_delegation(state);
+out:
nfs4_opendata_put(opendata);
return nfs4_handle_delegation_recall_error(server, state, stateid, NULL, err);
}
@@ -2441,10 +2539,7 @@ static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state,
const nfs4_stateid *stateid)
{
nfs_remove_bad_delegation(state->inode, stateid);
- write_seqlock(&state->seqlock);
- nfs4_stateid_copy(&state->stateid, &state->open_stateid);
- write_sequnlock(&state->seqlock);
- clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs_state_clear_delegation(state);
}
static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
@@ -2457,6 +2552,7 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
{
/* NFSv4.0 doesn't allow for delegation recovery on open expire */
nfs40_clear_delegation_stateid(state);
+ nfs_state_clear_open_state_flags(state);
return nfs4_open_expired(sp, state);
}
@@ -2499,33 +2595,29 @@ out_free:
return -NFS4ERR_EXPIRED;
}
-static void nfs41_check_delegation_stateid(struct nfs4_state *state)
+static int nfs41_check_delegation_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
nfs4_stateid stateid;
struct nfs_delegation *delegation;
struct rpc_cred *cred;
- int status;
+ int status, ret = NFS_OK;
/* Get the delegation credential for use by test/free_stateid */
rcu_read_lock();
delegation = rcu_dereference(NFS_I(state->inode)->delegation);
if (delegation == NULL) {
rcu_read_unlock();
- return;
+ nfs_state_clear_delegation(state);
+ return NFS_OK;
}
nfs4_stateid_copy(&stateid, &delegation->stateid);
- if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
- rcu_read_unlock();
- nfs_finish_clear_delegation_stateid(state, &stateid);
- return;
- }
if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED,
&delegation->flags)) {
rcu_read_unlock();
- return;
+ return NFS_OK;
}
cred = get_rpccred(delegation->cred);
@@ -2534,8 +2626,24 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state)
trace_nfs4_test_delegation_stateid(state, NULL, status);
if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
nfs_finish_clear_delegation_stateid(state, &stateid);
+ else
+ ret = status;
put_rpccred(cred);
+ return ret;
+}
+
+static void nfs41_delegation_recover_stateid(struct nfs4_state *state)
+{
+ nfs4_stateid tmp;
+
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) &&
+ nfs4_copy_delegation_stateid(state->inode, state->state,
+ &tmp, NULL) &&
+ nfs4_stateid_match_other(&state->stateid, &tmp))
+ nfs_state_set_delegation(state, &tmp, state->state);
+ else
+ nfs_state_clear_delegation(state);
}
/**
@@ -2605,21 +2713,12 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
struct rpc_cred *cred = state->owner->so_cred;
int status;
- if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) {
- if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) {
- if (nfs4_have_delegation(state->inode, state->state))
- return NFS_OK;
- return -NFS4ERR_OPENMODE;
- }
+ if (test_bit(NFS_OPEN_STATE, &state->flags) == 0)
return -NFS4ERR_BAD_STATEID;
- }
status = nfs41_test_and_free_expired_stateid(server, stateid, cred);
trace_nfs4_test_open_stateid(state, NULL, status);
if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) {
- clear_bit(NFS_O_RDONLY_STATE, &state->flags);
- clear_bit(NFS_O_WRONLY_STATE, &state->flags);
- clear_bit(NFS_O_RDWR_STATE, &state->flags);
- clear_bit(NFS_OPEN_STATE, &state->flags);
+ nfs_state_clear_open_state_flags(state);
stateid->type = NFS4_INVALID_STATEID_TYPE;
}
if (status != NFS_OK)
@@ -2633,7 +2732,11 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
{
int status;
- nfs41_check_delegation_stateid(state);
+ status = nfs41_check_delegation_stateid(state);
+ if (status != NFS_OK)
+ return status;
+ nfs41_delegation_recover_stateid(state);
+
status = nfs41_check_expired_locks(state);
if (status != NFS_OK)
return status;
@@ -2914,7 +3017,6 @@ static int _nfs4_do_setattr(struct inode *inode,
};
struct rpc_cred *delegation_cred = NULL;
unsigned long timestamp = jiffies;
- fmode_t fmode;
bool truncate;
int status;
@@ -2922,11 +3024,12 @@ static int _nfs4_do_setattr(struct inode *inode,
/* Servers should only apply open mode checks for file size changes */
truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
- fmode = truncate ? FMODE_WRITE : FMODE_READ;
+ if (!truncate)
+ goto zero_stateid;
- if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) {
+ if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) {
/* Use that stateid */
- } else if (truncate && ctx != NULL) {
+ } else if (ctx != NULL && ctx->state) {
struct nfs_lock_context *l_ctx;
if (!nfs4_valid_open_stateid(ctx->state))
return -EBADF;
@@ -2938,8 +3041,10 @@ static int _nfs4_do_setattr(struct inode *inode,
nfs_put_lock_context(l_ctx);
if (status == -EIO)
return -EBADF;
- } else
+ } else {
+zero_stateid:
nfs4_stateid_copy(&arg->stateid, &zero_stateid);
+ }
if (delegation_cred)
msg.rpc_cred = delegation_cred;
@@ -6464,7 +6569,6 @@ struct nfs4_lock_waiter {
struct task_struct *task;
struct inode *inode;
struct nfs_lowner *owner;
- bool notified;
};
static int
@@ -6486,11 +6590,11 @@ nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, vo
if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh))
return 0;
- waiter->notified = true;
-
/* override "private" so we can use default_wake_function */
wait->private = waiter->task;
- ret = autoremove_wake_function(wait, mode, flags, key);
+ ret = woken_wake_function(wait, mode, flags, key);
+ if (ret)
+ list_del_init(&wait->entry);
wait->private = waiter;
return ret;
}
@@ -6499,7 +6603,6 @@ static int
nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
{
int status = -ERESTARTSYS;
- unsigned long flags;
struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
struct nfs_server *server = NFS_SERVER(state->inode);
struct nfs_client *clp = server->nfs_client;
@@ -6509,8 +6612,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
.s_dev = server->s_dev };
struct nfs4_lock_waiter waiter = { .task = current,
.inode = state->inode,
- .owner = &owner,
- .notified = false };
+ .owner = &owner};
wait_queue_entry_t wait;
/* Don't bother with waitqueue if we don't expect a callback */
@@ -6528,15 +6630,9 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
break;
status = -ERESTARTSYS;
- spin_lock_irqsave(&q->lock, flags);
- if (waiter.notified) {
- spin_unlock_irqrestore(&q->lock, flags);
- continue;
- }
- set_current_state(TASK_INTERRUPTIBLE);
- spin_unlock_irqrestore(&q->lock, flags);
-
- freezable_schedule_timeout(NFS4_LOCK_MAXTIMEOUT);
+ freezer_do_not_count();
+ wait_woken(&wait, TASK_INTERRUPTIBLE, NFS4_LOCK_MAXTIMEOUT);
+ freezer_count();
}
finish_wait(q, &wait);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0c78980b26ce..5dd473d00110 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -143,6 +143,10 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
/* Sustain the lease, even if it's empty. If the clientid4
* goes stale it's of no use for trunking discovery. */
nfs4_schedule_state_renewal(*result);
+
+ /* If the client state need to recover, do it. */
+ if (clp->cl_state)
+ nfs4_schedule_state_manager(clp);
}
out:
return status;
@@ -604,24 +608,39 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
/**
* nfs4_purge_state_owners - Release all cached state owners
* @server: nfs_server with cached state owners to release
+ * @head: resulting list of state owners
*
* Called at umount time. Remaining state owners will be on
* the LRU with ref count of zero.
+ * Note that the state owners are not freed, but are added
+ * to the list @head, which can later be used as an argument
+ * to nfs4_free_state_owners.
*/
-void nfs4_purge_state_owners(struct nfs_server *server)
+void nfs4_purge_state_owners(struct nfs_server *server, struct list_head *head)
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state_owner *sp, *tmp;
- LIST_HEAD(doomed);
spin_lock(&clp->cl_lock);
list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
- list_move(&sp->so_lru, &doomed);
+ list_move(&sp->so_lru, head);
nfs4_remove_state_owner_locked(sp);
}
spin_unlock(&clp->cl_lock);
+}
- list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+/**
+ * nfs4_purge_state_owners - Release all cached state owners
+ * @head: resulting list of state owners
+ *
+ * Frees a list of state owners that was generated by
+ * nfs4_purge_state_owners
+ */
+void nfs4_free_state_owners(struct list_head *head)
+{
+ struct nfs4_state_owner *sp, *tmp;
+
+ list_for_each_entry_safe(sp, tmp, head, so_lru) {
list_del(&sp->so_lru);
nfs4_free_state_owner(sp);
}
@@ -1772,12 +1791,13 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
struct nfs4_state_owner *sp;
struct nfs_server *server;
struct rb_node *pos;
+ LIST_HEAD(freeme);
int status = 0;
restart:
rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- nfs4_purge_state_owners(server);
+ nfs4_purge_state_owners(server, &freeme);
spin_lock(&clp->cl_lock);
for (pos = rb_first(&server->state_owners);
pos != NULL;
@@ -1806,6 +1826,7 @@ restart:
spin_unlock(&clp->cl_lock);
}
rcu_read_unlock();
+ nfs4_free_state_owners(&freeme);
return 0;
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 827922e94a8f..68f65b73e19b 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1262,21 +1262,24 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr)
{
- LIST_HEAD(failed);
+ LIST_HEAD(pages);
*pg_io_completion(desc) = hdr->io_completion;
desc->pg_dreq = hdr->dreq;
- while (!list_empty(&hdr->pages)) {
- struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+ list_splice_init(&hdr->pages, &pages);
+ while (!list_empty(&pages)) {
+ struct nfs_page *req = nfs_list_entry(pages.next);
nfs_list_remove_request(req);
if (!nfs_pageio_add_request(desc, req))
- nfs_list_add_request(req, &failed);
+ break;
}
nfs_pageio_complete(desc);
- if (!list_empty(&failed)) {
- list_move(&failed, &hdr->pages);
- return desc->pg_error < 0 ? desc->pg_error : -EIO;
+ if (!list_empty(&pages)) {
+ int err = desc->pg_error < 0 ? desc->pg_error : -EIO;
+ hdr->completion_ops->error_cleanup(&pages);
+ nfs_set_pgio_error(hdr, err, hdr->io_start);
+ return err;
}
return 0;
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index d40755a0984b..11474aac91bc 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -650,11 +650,15 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
/* Add this address as an alias */
rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
rpc_clnt_test_and_add_xprt, NULL);
- } else
- clp = get_v3_ds_connect(mds_srv,
- (struct sockaddr *)&da->da_addr,
- da->da_addrlen, IPPROTO_TCP,
- timeo, retrans);
+ continue;
+ }
+ clp = get_v3_ds_connect(mds_srv,
+ (struct sockaddr *)&da->da_addr,
+ da->da_addrlen, IPPROTO_TCP,
+ timeo, retrans);
+ if (IS_ERR(clp))
+ continue;
+ clp->cl_rpcclient->cl_softrtry = 0;
}
if (IS_ERR(clp)) {
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e483d906de13..7d1f0140ff5a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2091,7 +2091,8 @@ static int nfs23_validate_mount_data(void *options,
memcpy(sap, &data->addr, sizeof(data->addr));
args->nfs_server.addrlen = sizeof(data->addr);
args->nfs_server.port = ntohs(data->addr.sin_port);
- if (!nfs_verify_server_address(sap))
+ if (sap->sa_family != AF_INET ||
+ !nfs_verify_server_address(sap))
goto out_no_address;
if (!(data->flags & NFS_MOUNT_TCP))
@@ -2277,6 +2278,7 @@ nfs_compare_remount_data(struct nfs_server *nfss,
data->acdirmin != nfss->acdirmin / HZ ||
data->acdirmax != nfss->acdirmax / HZ ||
data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
+ (data->options & NFS_OPTION_FSCACHE) != (nfss->options & NFS_OPTION_FSCACHE) ||
data->nfs_server.port != nfss->port ||
data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
!rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 74915e17581a..cc422964732d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -668,7 +668,7 @@ out:
return ret;
out_launder:
nfs_write_error_remove_page(req);
- return ret;
+ return 0;
}
static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 0274db6e65d0..c28e37d942ab 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -931,8 +931,9 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
cb->cb_seq_status = 1;
cb->cb_status = 0;
if (minorversion) {
- if (!nfsd41_cb_get_slot(clp, task))
+ if (!cb->cb_holds_slot && !nfsd41_cb_get_slot(clp, task))
return;
+ cb->cb_holds_slot = true;
}
rpc_call_start(task);
}
@@ -959,6 +960,9 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
return true;
}
+ if (!cb->cb_holds_slot)
+ goto need_restart;
+
switch (cb->cb_seq_status) {
case 0:
/*
@@ -996,6 +1000,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
cb->cb_seq_status);
}
+ cb->cb_holds_slot = false;
clear_bit(0, &clp->cl_cb_slot_busy);
rpc_wake_up_next(&clp->cl_cb_waitq);
dprintk("%s: freed slot, new seqid=%d\n", __func__,
@@ -1203,6 +1208,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
cb->cb_seq_status = 1;
cb->cb_status = 0;
cb->cb_need_restart = false;
+ cb->cb_holds_slot = false;
}
void nfsd4_run_cb(struct nfsd4_callback *cb)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 86aa92d200e1..133d8bf62a5c 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -69,6 +69,7 @@ struct nfsd4_callback {
int cb_seq_status;
int cb_status;
bool cb_need_restart;
+ bool cb_holds_slot;
};
struct nfsd4_callback_ops {
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 34d1e291fe95..93efb7704856 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -269,6 +269,7 @@ struct svc_rqst {
#define RQ_VICTIM (5) /* about to be shut down */
#define RQ_BUSY (6) /* request is busy */
#define RQ_DATA (7) /* request has data */
+#define RQ_AUTHERR (8) /* Request status is auth error */
unsigned long rq_flags; /* flags field */
void * rq_argp; /* decoded arguments */
@@ -493,6 +494,7 @@ void svc_wake_up(struct svc_serv *);
void svc_reserve(struct svc_rqst *rqstp, int space);
struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu);
char * svc_print_addr(struct svc_rqst *, char *, size_t);
+__be32 svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err);
#define RPC_MAX_ADDRBUFLEN (63U)
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 4f94a8ecf36f..b8070b6c7ca0 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -30,7 +30,6 @@
#include <linux/elf.h>
#include <linux/moduleloader.h>
#include <linux/completion.h>
-#include <linux/memory.h>
#include <asm/cacheflush.h>
#include "core.h"
#include "patch.h"
@@ -730,21 +729,16 @@ static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_func *func;
int ret;
- mutex_lock(&text_mutex);
-
module_disable_ro(patch->mod);
ret = klp_write_object_relocations(patch->mod, obj);
if (ret) {
module_enable_ro(patch->mod, true);
- mutex_unlock(&text_mutex);
return ret;
}
arch_klp_init_object_loaded(patch, obj);
module_enable_ro(patch->mod, true);
- mutex_unlock(&text_mutex);
-
klp_for_each_func(obj, func) {
ret = klp_find_object_symbol(obj->name, func->old_name,
func->old_sympos,
diff --git a/kernel/resource.c b/kernel/resource.c
index 54b59151f81c..7063ef16427f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -486,8 +486,6 @@ int walk_mem_res(u64 start, u64 end, void *arg,
arg, func);
}
-#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
-
/*
* This function calls the @func callback against all memory ranges of type
* System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
@@ -519,8 +517,6 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
return ret;
}
-#endif
-
static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
{
return 1;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 60395aa7af33..8137077d9012 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2175,11 +2175,12 @@ call_status(struct rpc_task *task)
case -ECONNREFUSED:
case -ECONNRESET:
case -ECONNABORTED:
+ case -ENOTCONN:
rpc_force_rebind(clnt);
+ break;
case -EADDRINUSE:
rpc_delay(task, 3*HZ);
case -EPIPE:
- case -ENOTCONN:
task->tk_action = call_bind;
break;
case -ENOBUFS:
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 1e589aa63ddf..83686d8fba26 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1144,6 +1144,22 @@ void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...)
static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {}
#endif
+__be32
+svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err)
+{
+ set_bit(RQ_AUTHERR, &rqstp->rq_flags);
+ return auth_err;
+}
+EXPORT_SYMBOL_GPL(svc_return_autherr);
+
+static __be32
+svc_get_autherr(struct svc_rqst *rqstp, __be32 *statp)
+{
+ if (test_and_clear_bit(RQ_AUTHERR, &rqstp->rq_flags))
+ return *statp;
+ return rpc_auth_ok;
+}
+
/*
* Common routine for processing the RPC request.
*/
@@ -1295,11 +1311,9 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
procp->pc_release(rqstp, NULL, rqstp->rq_resp);
goto dropit;
}
- if (*statp == rpc_autherr_badcred) {
- if (procp->pc_release)
- procp->pc_release(rqstp, NULL, rqstp->rq_resp);
- goto err_bad_auth;
- }
+ auth_stat = svc_get_autherr(rqstp, statp);
+ if (auth_stat != rpc_auth_ok)
+ goto err_release_bad_auth;
if (*statp == rpc_success &&
(xdr = procp->pc_encode) &&
!xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {
@@ -1357,6 +1371,9 @@ err_bad_rpc:
svc_putnl(resv, 2);
goto sendit;
+err_release_bad_auth:
+ if (procp->pc_release)
+ procp->pc_release(rqstp, NULL, rqstp->rq_resp);
err_bad_auth:
dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat));
serv->sv_stats->rpcbadauth++;