author     Jiri Kosina <jkosina@suse.cz>  2017-10-20 13:22:03 +0200
committer  Jiri Kosina <jkosina@suse.cz>  2017-10-20 13:22:03 +0200
commit     f4c3286dae100b21daa3e02573894d68c43a72f5 (patch)
tree       a4206e50b5f90b8c0c999ac08f914dd045517231
parent     b0c30e4605e34d6450a73048bdd6aef734dec675 (diff)
parent     11cb57a26ad71f55aa601896848d66ac91282463 (diff)
Merge remote-tracking branch 'origin/users/ohering/SLE15/for-next' into SLE15 (tags: rpm-4.12.14-2--SLE-15-Packages-Beta1, rpm-4.12.14-2)
suse-commit: 845773a994638565570c3c346dfdcfed50716171
31 files changed, 1209 insertions, 973 deletions
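Reviewer note before the hunks: the series below removes the exported, out-of-line hv_do_hypercall() and replaces it with inline helpers in mshyperv.h, adding hv_do_fast_hypercall8() and hv_do_rep_hypercall(). The rep variant packs the hypercall code, variable-header size, rep count, and rep start index into one 64-bit control word. A minimal user-space sketch of that packing, using the bit layout from the mshyperv.h hunk below (GENMASK_ULL/BIT expanded by hand so it compiles standalone; hv_build_rep_control() and the sample values are hypothetical, for illustration only):

#include <stdint.h>
#include <stdio.h>

/* Control-word layout, copied from the mshyperv.h hunk in this diff. */
#define HV_HYPERCALL_RESULT_MASK      0xffffULL        /* bits 15:0  */
#define HV_HYPERCALL_FAST_BIT         (1ULL << 16)
#define HV_HYPERCALL_VARHEAD_OFFSET   17
#define HV_HYPERCALL_REP_COMP_OFFSET  32               /* bits 43:32 */
#define HV_HYPERCALL_REP_COMP_MASK    (0xfffULL << 32)
#define HV_HYPERCALL_REP_START_OFFSET 48               /* bits 59:48 */
#define HV_HYPERCALL_REP_START_MASK   (0xfffULL << 48)

/* Assemble the control word the way hv_do_rep_hypercall() does. */
static uint64_t hv_build_rep_control(uint16_t code, uint16_t rep_count,
				     uint16_t varhead_size, uint16_t rep_start)
{
	uint64_t control = code;

	control |= (uint64_t)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
	control |= (uint64_t)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
	control |= (uint64_t)rep_start << HV_HYPERCALL_REP_START_OFFSET;
	return control;
}

int main(void)
{
	/* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST (0x0003), 5 reps, no varhead */
	uint64_t control = hv_build_rep_control(0x0003, 5, 0, 0);

	printf("control = %#llx\n", (unsigned long long)control);
	return 0;
}

After each invocation, hv_do_rep_hypercall() extracts the "reps completed" field (bits 43:32 of the returned status) and writes it back into the rep-start field of the control word before retrying, which is why both fields share the same 12-bit width.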
diff --git a/MAINTAINERS b/MAINTAINERS index fc6b40328f45..336eec0f140e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6157,6 +6157,7 @@ M: Stephen Hemminger <sthemmin@microsoft.com> L: devel@linuxdriverproject.org S: Maintained F: arch/x86/include/asm/mshyperv.h +F: arch/x86/include/asm/trace/hyperv.h F: arch/x86/include/uapi/asm/hyperv.h F: arch/x86/kernel/cpu/mshyperv.c F: arch/x86/hyperv diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 586b786b3edf..3e6f64073005 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -8,7 +8,7 @@ obj-$(CONFIG_KVM) += kvm/ obj-$(CONFIG_XEN) += xen/ # Hyper-V paravirtualization support -obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/ +obj-$(subst m,y,$(CONFIG_HYPERV)) += hyperv/ # lguest paravirtualization support obj-$(CONFIG_LGUEST_GUEST) += lguest/ diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 171ae09864d7..367a8203cfcf 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1 +1 @@ -obj-y := hv_init.o +obj-y := hv_init.o mmu.o diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 21f16a9f7004..211a43334219 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -26,6 +26,8 @@ #include <linux/mm.h> #include <linux/clockchips.h> #include <linux/hyperv.h> +#include <linux/slab.h> +#include <linux/cpuhotplug.h> #ifdef CONFIG_HYPERV_TSCPAGE @@ -75,10 +77,30 @@ static struct clocksource hyperv_cs_msr = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static void *hypercall_pg; +void *hv_hypercall_pg; +EXPORT_SYMBOL_GPL(hv_hypercall_pg); struct clocksource *hyperv_cs; EXPORT_SYMBOL_GPL(hyperv_cs); +u32 *hv_vp_index; +EXPORT_SYMBOL_GPL(hv_vp_index); + +u32 hv_max_vp_index; + +static int hv_cpu_init(unsigned int cpu) +{ + u64 msr_vp_index; + + hv_get_vp_index(msr_vp_index); + + hv_vp_index[smp_processor_id()] = msr_vp_index; + + if (msr_vp_index > hv_max_vp_index) + hv_max_vp_index = msr_vp_index; + + return 0; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -96,6 +118,16 @@ void hyperv_init(void) if (x86_hyper != &x86_hyper_ms_hyperv) return; + /* Allocate percpu VP index */ + hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), + GFP_KERNEL); + if (!hv_vp_index) + return; + + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + hv_cpu_init, NULL) < 0) + goto free_vp_index; + /* * Setup the hypercall page and enable hypercalls. * 1. Register the guest ID @@ -104,17 +136,19 @@ void hyperv_init(void) guest_id = generate_guest_id(d1, LINUX_VERSION_CODE, d2); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); - if (hypercall_pg == NULL) { + hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - return; + goto free_vp_index; } rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - hypercall_msr.guest_physical_address = vmalloc_to_pfn(hypercall_pg); + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hyper_alloc_mmu(); + /* * Register Hyper-V specific clocksource. 
*/ @@ -150,6 +184,12 @@ register_msr_cs: hyperv_cs = &hyperv_cs_msr; if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); + + return; + +free_vp_index: + kfree(hv_vp_index); + hv_vp_index = NULL; } /* @@ -172,51 +212,6 @@ void hyperv_cleanup(void) } EXPORT_SYMBOL_GPL(hyperv_cleanup); -/* - * hv_do_hypercall- Invoke the specified hypercall - */ -u64 hv_do_hypercall(u64 control, void *input, void *output) -{ - u64 input_address = (input) ? virt_to_phys(input) : 0; - u64 output_address = (output) ? virt_to_phys(output) : 0; -#ifdef CONFIG_X86_64 - u64 hv_status = 0; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); - __asm__ __volatile__("call *%3" : "=a" (hv_status) : - "c" (control), "d" (input_address), - "m" (hypercall_pg)); - - return hv_status; - -#else - - u32 control_hi = control >> 32; - u32 control_lo = control & 0xFFFFFFFF; - u32 hv_status_hi = 1; - u32 hv_status_lo = 1; - u32 input_address_hi = input_address >> 32; - u32 input_address_lo = input_address & 0xFFFFFFFF; - u32 output_address_hi = output_address >> 32; - u32 output_address_lo = output_address & 0xFFFFFFFF; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), - "=a"(hv_status_lo) : "d" (control_hi), - "a" (control_lo), "b" (input_address_hi), - "c" (input_address_lo), "D"(output_address_hi), - "S"(output_address_lo), "m" (hypercall_pg)); - - return hv_status_lo | ((u64)hv_status_hi << 32); -#endif /* !x86_64 */ -} -EXPORT_SYMBOL_GPL(hv_do_hypercall); - void hyperv_report_panic(struct pt_regs *regs) { static bool panic_reported; diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c new file mode 100644 index 000000000000..1f87627d9909 --- /dev/null +++ b/arch/x86/hyperv/mmu.c @@ -0,0 +1,301 @@ +#define pr_fmt(fmt) "Hyper-V: " fmt + +#include <linux/hyperv.h> +#include <linux/log2.h> +#include <linux/slab.h> +#include <linux/types.h> + +#include <asm/fpu/api.h> +#include <asm/mshyperv.h> +#include <asm/msr.h> +#include <asm/tlbflush.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/hyperv.h> + +/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ +struct hv_flush_pcpu { + u64 address_space; + u64 flags; + u64 processor_mask; + u64 gva_list[]; +}; + +/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ +struct hv_flush_pcpu_ex { + u64 address_space; + u64 flags; + struct { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[]; + } hv_vp_set; + u64 gva_list[]; +}; + +/* Each gva in gva_list encodes up to 4096 pages to flush */ +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE) + +static struct hv_flush_pcpu __percpu **pcpu_flush; + +static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex; + +/* + * Fills in gva_list starting from offset. Returns the number of items added. + */ +static inline int fill_gva_list(u64 gva_list[], int offset, + unsigned long start, unsigned long end) +{ + int gva_n = offset; + unsigned long cur = start, diff; + + do { + diff = end > cur ? end - cur : 0; + + gva_list[gva_n] = cur & PAGE_MASK; + /* + * Lower 12 bits encode the number of additional + * pages to flush (in addition to the 'cur' page). 
+ */ + if (diff >= HV_TLB_FLUSH_UNIT) + gva_list[gva_n] |= ~PAGE_MASK; + else if (diff) + gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT; + + cur += HV_TLB_FLUSH_UNIT; + gva_n++; + + } while (cur < end); + + return gva_n - offset; +} + +/* Return the number of banks in the resulting vp_set */ +static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush, + const struct cpumask *cpus) +{ + int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1; + + /* valid_bank_mask can represent up to 64 banks */ + if (hv_max_vp_index / 64 >= 64) + return 0; + + /* + * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex + * structs are not cleared between calls, we risk flushing unneeded + * vCPUs otherwise. + */ + for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++) + flush->hv_vp_set.bank_contents[vcpu_bank] = 0; + + /* + * Some banks may end up being empty but this is acceptable. + */ + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + vcpu_bank = vcpu / 64; + vcpu_offset = vcpu % 64; + __set_bit(vcpu_offset, (unsigned long *) + &flush->hv_vp_set.bank_contents[vcpu_bank]); + if (vcpu_bank >= nr_bank) + nr_bank = vcpu_bank + 1; + } + flush->hv_vp_set.valid_bank_mask = GENMASK_ULL(nr_bank - 1, 0); + + return nr_bank; +} + +static void hyperv_flush_tlb_others(const struct cpumask *cpus, +struct mm_struct *mm, unsigned long start, unsigned long end) +{ + int cpu, vcpu, gva_n, max_gvas; + struct hv_flush_pcpu **flush_pcpu; + struct hv_flush_pcpu *flush; + u64 status = U64_MAX; + unsigned long flags; + + trace_hyperv_mmu_flush_tlb_others(cpus, mm, start, end); + + if (!pcpu_flush || !hv_hypercall_pg) + goto do_native; + + if (cpumask_empty(cpus)) + return; + + local_irq_save(flags); + + flush_pcpu = this_cpu_ptr(pcpu_flush); + + if (unlikely(!*flush_pcpu)) + *flush_pcpu = page_address(alloc_page(GFP_ATOMIC)); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto do_native; + } + + if (mm) { + flush->address_space = virt_to_phys(mm->pgd); + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->processor_mask = 0; + if (cpumask_equal(cpus, cpu_present_mask)) { + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } else { + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + if (vcpu >= 64) + goto do_native; + + __set_bit(vcpu, (unsigned long *) + &flush->processor_mask); + } + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. 
+ */ + max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]); + + if (end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else if (end && + ((end - start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, 0, + start, end); + status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, + gva_n, 0, flush, NULL); + } + + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + return; +do_native: + native_flush_tlb_others(cpus, mm, start, end); +} + +static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, +struct mm_struct *mm, unsigned long start, unsigned long end) +{ + int nr_bank = 0, max_gvas, gva_n; + struct hv_flush_pcpu_ex **flush_pcpu; + struct hv_flush_pcpu_ex *flush; + u64 status = U64_MAX; + unsigned long flags; + + trace_hyperv_mmu_flush_tlb_others(cpus, mm, start, end); + + if (!pcpu_flush_ex || !hv_hypercall_pg) + goto do_native; + + if (cpumask_empty(cpus)) + return; + + local_irq_save(flags); + + flush_pcpu = this_cpu_ptr(pcpu_flush_ex); + + if (unlikely(!*flush_pcpu)) + *flush_pcpu = page_address(alloc_page(GFP_ATOMIC)); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto do_native; + } + + if (mm) { + flush->address_space = virt_to_phys(mm->pgd); + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->hv_vp_set.valid_bank_mask = 0; + + if (!cpumask_equal(cpus, cpu_present_mask)) { + flush->hv_vp_set.format = HV_GENERIC_SET_SPARCE_4K; + nr_bank = cpumask_to_vp_set(flush, cpus); + } + + if (!nr_bank) { + flush->hv_vp_set.format = HV_GENERIC_SET_ALL; + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. 
+ */ + max_gvas = + (PAGE_SIZE - sizeof(*flush) - nr_bank * + sizeof(flush->hv_vp_set.bank_contents[0])) / + sizeof(flush->gva_list[0]); + + if (end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank, flush, NULL); + } else if (end && + ((end - start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank, flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, nr_bank, + start, end); + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, + gva_n, nr_bank, flush, NULL); + } + + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + return; +do_native: + native_flush_tlb_others(cpus, mm, start, end); +} + +void hyperv_setup_mmu_ops(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) { + pr_info("Using hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; + } else { + pr_info("Using ext hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others_ex; + } +} + +void hyper_alloc_mmu(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + pcpu_flush = alloc_percpu(struct hv_flush_pcpu *); + else + pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *); +} diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 2b58c8c1eeaa..91cf8d419388 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -3,6 +3,8 @@ #include <linux/types.h> #include <linux/atomic.h> +#include <linux/nmi.h> +#include <asm/io.h> #include <asm/hyperv.h> /* @@ -28,6 +30,8 @@ struct ms_hyperv_info { u32 features; u32 misc_features; u32 hints; + u32 max_vp_index; + u32 max_lp_index; }; extern struct ms_hyperv_info ms_hyperv; @@ -168,12 +172,156 @@ void hv_remove_crash_handler(void); #if IS_ENABLED(CONFIG_HYPERV) extern struct clocksource *hyperv_cs; +extern void *hv_hypercall_pg; + +static inline u64 hv_do_hypercall(u64 control, void *input, void *output) +{ + u64 input_address = input ? virt_to_phys(input) : 0; + u64 output_address = output ? 
virt_to_phys(output) : 0; + u64 hv_status; + register void *__sp asm(_ASM_SP); + +#ifdef CONFIG_X86_64 + if (!hv_hypercall_pg) + return U64_MAX; + + __asm__ __volatile__("mov %4, %%r8\n" + "call *%5" + : "=a" (hv_status), "+r" (__sp), + "+c" (control), "+d" (input_address) + : "r" (output_address), "m" (hv_hypercall_pg) + : "cc", "memory", "r8", "r9", "r10", "r11"); +#else + u32 input_address_hi = upper_32_bits(input_address); + u32 input_address_lo = lower_32_bits(input_address); + u32 output_address_hi = upper_32_bits(output_address); + u32 output_address_lo = lower_32_bits(output_address); + + if (!hv_hypercall_pg) + return U64_MAX; + + __asm__ __volatile__("call *%7" + : "=A" (hv_status), + "+c" (input_address_lo), "+r" (__sp) + : "A" (control), + "b" (input_address_hi), + "D"(output_address_hi), "S"(output_address_lo), + "m" (hv_hypercall_pg) + : "cc", "memory"); +#endif /* !x86_64 */ + return hv_status; +} + +#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) +#define HV_HYPERCALL_FAST_BIT BIT(16) +#define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_REP_COMP_OFFSET 32 +#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) +#define HV_HYPERCALL_REP_START_OFFSET 48 +#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) + +/* Fast hypercall with 8 bytes of input and no output */ +static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) +{ + u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT; + register void *__sp asm(_ASM_SP); + +#ifdef CONFIG_X86_64 + { + __asm__ __volatile__("call *%4" + : "=a" (hv_status), "+r" (__sp), + "+c" (control), "+d" (input1) + : "m" (hv_hypercall_pg) + : "cc", "r8", "r9", "r10", "r11"); + } +#else + { + u32 input1_hi = upper_32_bits(input1); + u32 input1_lo = lower_32_bits(input1); + + __asm__ __volatile__ ("call *%5" + : "=A"(hv_status), + "+c"(input1_lo), + "+r"(__sp) + : "A" (control), + "b" (input1_hi), + "m" (hv_hypercall_pg) + : "cc", "edi", "esi"); + } +#endif + return hv_status; +} + +/* + * Rep hypercalls. Callers of this functions are supposed to ensure that + * rep_count and varhead_size comply with Hyper-V hypercall definition. + */ +static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, + void *input, void *output) +{ + u64 control = code; + u64 status; + u16 rep_comp; + + control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET; + control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET; + + do { + status = hv_do_hypercall(control, input, output); + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) + return status; + + /* Bits 32-43 of status have 'Reps completed' data. */ + rep_comp = (status & HV_HYPERCALL_REP_COMP_MASK) >> + HV_HYPERCALL_REP_COMP_OFFSET; + + control &= ~HV_HYPERCALL_REP_START_MASK; + control |= (u64)rep_comp << HV_HYPERCALL_REP_START_OFFSET; + + touch_nmi_watchdog(); + } while (rep_comp < rep_count); + + return status; +} + +/* + * Hypervisor's notion of virtual processor ID is different from + * Linux' notion of CPU ID. This information can only be retrieved + * in the context of the calling CPU. Setup a map for easy access + * to this information. + */ +extern u32 *hv_vp_index; +extern u32 hv_max_vp_index; + +/** + * hv_cpu_number_to_vp_number() - Map CPU to VP. + * @cpu_number: CPU number in Linux terms + * + * This function returns the mapping between the Linux processor + * number and the hypervisor's virtual processor number, useful + * in making hypercalls and such that talk about specific + * processors. 
+ * + * Return: Virtual processor number in Hyper-V terms + */ +static inline int hv_cpu_number_to_vp_number(int cpu_number) +{ + return hv_vp_index[cpu_number]; +} void hyperv_init(void); +void hyperv_setup_mmu_ops(void); +void hyper_alloc_mmu(void); void hyperv_report_panic(struct pt_regs *regs); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); -#endif +#else /* CONFIG_HYPERV */ +static inline void hyperv_init(void) {} +static inline bool hv_is_hypercall_page_setup(void) { return false; } +static inline void hyperv_cleanup(void) {} +static inline void hyperv_setup_mmu_ops(void) {} +#endif /* CONFIG_HYPERV */ + #ifdef CONFIG_HYPERV_TSCPAGE struct ms_hyperv_tsc_page *hv_get_tsc_page(void); static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h new file mode 100644 index 000000000000..098ab4394cbd --- /dev/null +++ b/arch/x86/include/asm/trace/hyperv.h @@ -0,0 +1,40 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hyperv + +#if !defined(_TRACE_HYPERV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HYPERV_H + +#include <linux/tracepoint.h> + +#if IS_ENABLED(CONFIG_HYPERV) + +TRACE_EVENT(hyperv_mmu_flush_tlb_others, + TP_PROTO(const struct cpumask *cpus, + struct mm_struct *mm, unsigned long start, unsigned long end), + TP_ARGS(cpus, mm, start, end), + TP_STRUCT__entry( + __field(unsigned int, ncpus) + __field(struct mm_struct *, mm) + __field(unsigned long, addr) + __field(unsigned long, end) + ), + TP_fast_assign(__entry->ncpus = cpumask_weight(cpus); + __entry->mm = mm; + __entry->addr = start; + __entry->end = end; + ), + TP_printk("ncpus %d mm %p addr %lx, end %lx", + __entry->ncpus, __entry->mm, + __entry->addr, __entry->end) + ); + +#endif /* CONFIG_HYPERV */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/trace/ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hyperv +#endif /* _TRACE_HYPERV_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 127ddadee1a5..f65d12504e80 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -149,12 +149,9 @@ */ #define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9) -/* - * HV_VP_SET available - */ +/* Recommend using the newer ExProcessorMasks interface */ #define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) - /* * Crash notification flag. */ @@ -242,7 +239,11 @@ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) /* Declare the various hypercall operations. 
*/ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 #define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d @@ -259,6 +260,16 @@ #define HV_PROCESSOR_POWER_STATE_C2 2 #define HV_PROCESSOR_POWER_STATE_C3 3 +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARCE_4K, + HV_GENERIC_SET_ALL, +}; + /* hypercall status code */ #define HV_STATUS_SUCCESS 0 #define HV_STATUS_INVALID_HYPERCALL_CODE 2 diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 70e717fccdd6..42664f944cbc 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -184,9 +184,15 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("HyperV: features 0x%x, hints 0x%x\n", + pr_info("Hyper-V: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS); + ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS); + + pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", + ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); + /* * Extract host information. */ @@ -219,7 +225,7 @@ static void __init ms_hyperv_init_platform(void) rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_frequency = hv_lapic_frequency; - pr_info("HyperV: LAPIC Timer Frequency: %#x\n", + pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", lapic_timer_frequency); } @@ -249,11 +255,12 @@ static void __init ms_hyperv_init_platform(void) * Setup the hook to get control post apic initialization. */ x86_platform.apic_post_init = hyperv_init; + hyperv_setup_mmu_ops(); #endif } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { - .name = "Microsoft HyperV", + .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, .init_platform = ms_hyperv_init_platform, }; diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index c29cd5387a35..50b89ea0e60f 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -3,6 +3,7 @@ menu "Microsoft Hyper-V guest support" config HYPERV tristate "Microsoft Hyper-V client drivers" depends on X86 && ACPI && PCI && X86_LOCAL_APIC && HYPERVISOR_GUEST + select PARAVIRT help Select this option to run Linux as a Hyper-V client operating system. 
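Aside on the arch/x86/hyperv/mmu.c hunk earlier in this diff: fill_gva_list() packs a virtual-address flush range into 64-bit entries, with the page-aligned address in the upper bits and a count of *additional* pages in the low 12 bits, so one entry can cover up to 4096 pages (HV_TLB_FLUSH_UNIT). A minimal user-space sketch of that encoding (encode_gva_entry() and the sample addresses are hypothetical; the bit manipulation mirrors the hunk):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
/* Each gva_list entry can describe up to 4096 pages. */
#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)

/* Encode one gva_list entry for the range [cur, end). */
static uint64_t encode_gva_entry(unsigned long cur, unsigned long end)
{
	unsigned long diff = end > cur ? end - cur : 0;
	uint64_t entry = cur & PAGE_MASK;

	/* The low 12 bits hold the count of additional pages. */
	if (diff >= HV_TLB_FLUSH_UNIT)
		entry |= ~PAGE_MASK;		/* 4095 extra pages */
	else if (diff)
		entry |= (diff - 1) >> PAGE_SHIFT;

	return entry;
}

int main(void)
{
	/* Flush 3 pages starting at 0x7f0000001000: the entry encodes
	 * the base page plus 2 additional pages in its low bits. */
	uint64_t e = encode_gva_entry(0x7f0000001000UL, 0x7f0000004000UL);

	printf("entry = %#llx (extra pages = %llu)\n",
	       (unsigned long long)e,
	       (unsigned long long)(e & ~PAGE_MASK));
	return 0;
}

This is why hyperv_flush_tlb_others() falls back to flushing the whole address space when (end - start) / HV_TLB_FLUSH_UNIT exceeds max_gvas: the entries no longer fit in the single-page hypercall input buffer.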
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index e57cc40cb768..894b67ac2cae 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -177,6 +177,11 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, &vmbus_connection.chn_msg_list); spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + if (newchannel->rescind) { + err = -ENODEV; + goto error_free_gpadl; + } + ret = vmbus_post_msg(open_msg, sizeof(struct vmbus_channel_open_channel), true); @@ -421,6 +426,11 @@ int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer, spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + if (channel->rescind) { + ret = -ENODEV; + goto cleanup; + } + ret = vmbus_post_msg(gpadlmsg, msginfo->msgsize - sizeof(*msginfo), true); if (ret != 0) @@ -494,6 +504,10 @@ int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle) list_add_tail(&info->msglistentry, &vmbus_connection.chn_msg_list); spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + + if (channel->rescind) + goto post_msg_err; + ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_gpadl_teardown), true); @@ -626,6 +640,7 @@ void vmbus_close(struct vmbus_channel *channel) */ return; } + mutex_lock(&vmbus_connection.channel_mutex); /* * Close all the sub-channels first and then close the * primary channel. @@ -634,22 +649,35 @@ void vmbus_close(struct vmbus_channel *channel) cur_channel = list_entry(cur, struct vmbus_channel, sc_list); vmbus_close_internal(cur_channel); if (cur_channel->rescind) { - mutex_lock(&vmbus_connection.channel_mutex); - hv_process_channel_removal(cur_channel, + hv_process_channel_removal( cur_channel->offermsg.child_relid); - mutex_unlock(&vmbus_connection.channel_mutex); } } /* * Now close the primary. */ vmbus_close_internal(channel); + mutex_unlock(&vmbus_connection.channel_mutex); } EXPORT_SYMBOL_GPL(vmbus_close); -int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer, - u32 bufferlen, u64 requestid, - enum vmbus_packet_type type, u32 flags) +/** + * vmbus_sendpacket() - Send the specified buffer on the given channel + * @channel: Pointer to vmbus_channel structure. + * @buffer: Pointer to the buffer you want to receive the data into. + * @bufferlen: Maximum size of what the the buffer will hold + * @requestid: Identifier of the request + * @type: Type of packet that is being send e.g. negotiate, time + * packet etc. + * + * Sends data in @buffer directly to hyper-v via the vmbus + * This will send the data unparsed to hyper-v. + * + * Mainly used by Hyper-V drivers. + */ +int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, + u32 bufferlen, u64 requestid, + enum vmbus_packet_type type, u32 flags) { struct vmpacket_descriptor desc; u32 packetlen = sizeof(struct vmpacket_descriptor) + bufferlen; @@ -676,42 +704,19 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer, return hv_ringbuffer_write(channel, bufferlist, num_vecs); } -EXPORT_SYMBOL(vmbus_sendpacket_ctl); - -/** - * vmbus_sendpacket() - Send the specified buffer on the given channel - * @channel: Pointer to vmbus_channel structure. - * @buffer: Pointer to the buffer you want to receive the data into. - * @bufferlen: Maximum size of what the the buffer will hold - * @requestid: Identifier of the request - * @type: Type of packet that is being send e.g. negotiate, time - * packet etc. - * - * Sends data in @buffer directly to hyper-v via the vmbus - * This will send the data unparsed to hyper-v. 
- * - * Mainly used by Hyper-V drivers. - */ -int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, - u32 bufferlen, u64 requestid, - enum vmbus_packet_type type, u32 flags) -{ - return vmbus_sendpacket_ctl(channel, buffer, bufferlen, requestid, - type, flags); -} EXPORT_SYMBOL(vmbus_sendpacket); /* - * vmbus_sendpacket_pagebuffer_ctl - Send a range of single-page buffer + * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer * packets using a GPADL Direct packet type. This interface allows you * to control notifying the host. This will be useful for sending * batched data. Also the sender can control the send flags * explicitly. */ -int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, void *buffer, u32 bufferlen, - u64 requestid, u32 flags) +int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, + struct hv_page_buffer pagebuffers[], + u32 pagecount, void *buffer, u32 bufferlen, + u64 requestid) { int i; struct vmbus_channel_packet_page_buffer desc; @@ -736,7 +741,7 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel, /* Setup the descriptor */ desc.type = VM_PKT_DATA_USING_GPA_DIRECT; - desc.flags = flags; + desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */ desc.length8 = (u16)(packetlen_aligned >> 3); desc.transactionid = requestid; @@ -757,24 +762,6 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel, return hv_ringbuffer_write(channel, bufferlist, 3); } -EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer_ctl); - -/* - * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer - * packets using a GPADL Direct packet type. - */ -int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, void *buffer, u32 bufferlen, - u64 requestid) -{ - u32 flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; - - return vmbus_sendpacket_pagebuffer_ctl(channel, pagebuffers, pagecount, - buffer, bufferlen, - requestid, flags); - -} EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); /* @@ -814,62 +801,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, } EXPORT_SYMBOL_GPL(vmbus_sendpacket_mpb_desc); -/* - * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet - * using a GPADL Direct packet type. 
- */ -int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel, - struct hv_multipage_buffer *multi_pagebuffer, - void *buffer, u32 bufferlen, u64 requestid) -{ - struct vmbus_channel_packet_multipage_buffer desc; - u32 descsize; - u32 packetlen; - u32 packetlen_aligned; - struct kvec bufferlist[3]; - u64 aligned_data = 0; - u32 pfncount = NUM_PAGES_SPANNED(multi_pagebuffer->offset, - multi_pagebuffer->len); - - if (pfncount > MAX_MULTIPAGE_BUFFER_COUNT) - return -EINVAL; - - /* - * Adjust the size down since vmbus_channel_packet_multipage_buffer is - * the largest size we support - */ - descsize = sizeof(struct vmbus_channel_packet_multipage_buffer) - - ((MAX_MULTIPAGE_BUFFER_COUNT - pfncount) * - sizeof(u64)); - packetlen = descsize + bufferlen; - packetlen_aligned = ALIGN(packetlen, sizeof(u64)); - - - /* Setup the descriptor */ - desc.type = VM_PKT_DATA_USING_GPA_DIRECT; - desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; - desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */ - desc.length8 = (u16)(packetlen_aligned >> 3); - desc.transactionid = requestid; - desc.rangecount = 1; - - desc.range.len = multi_pagebuffer->len; - desc.range.offset = multi_pagebuffer->offset; - - memcpy(desc.range.pfn_array, multi_pagebuffer->pfn_array, - pfncount * sizeof(u64)); - - bufferlist[0].iov_base = &desc; - bufferlist[0].iov_len = descsize; - bufferlist[1].iov_base = buffer; - bufferlist[1].iov_len = bufferlen; - bufferlist[2].iov_base = &aligned_data; - bufferlist[2].iov_len = (packetlen_aligned - packetlen); - - return hv_ringbuffer_write(channel, bufferlist, 3); -} -EXPORT_SYMBOL_GPL(vmbus_sendpacket_multipagebuffer); - /** * vmbus_recvpacket() - Retrieve the user packet on the specified channel * @channel: Pointer to vmbus_channel structure. diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 4bbb8dea4727..018d2e0f8ec5 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -159,7 +159,7 @@ static void vmbus_rescind_cleanup(struct vmbus_channel *channel) spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); - + channel->rescind = true; list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, msglistentry) { @@ -381,14 +381,21 @@ static void vmbus_release_relid(u32 relid) true); } -void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) +void hv_process_channel_removal(u32 relid) { unsigned long flags; - struct vmbus_channel *primary_channel; + struct vmbus_channel *primary_channel, *channel; - BUG_ON(!channel->rescind); BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); + /* + * Make sure channel is valid as we may have raced. + */ + channel = relid2channel(relid); + if (!channel) + return; + + BUG_ON(!channel->rescind); if (channel->target_cpu != get_cpu()) { put_cpu(); smp_call_function_single(channel->target_cpu, @@ -451,6 +458,12 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) /* Make sure this is a new offer */ mutex_lock(&vmbus_connection.channel_mutex); + /* + * Now that we have acquired the channel_mutex, + * we can release the potentially racing rescind thread. 
+ */ + atomic_dec(&vmbus_connection.offer_in_progress); + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { if (!uuid_le_cmp(channel->offermsg.offer.if_type, newchannel->offermsg.offer.if_type) && @@ -481,7 +494,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) channel->num_sc++; spin_unlock_irqrestore(&channel->lock, flags); } else { - atomic_dec(&vmbus_connection.offer_in_progress); goto err_free_chan; } } @@ -510,7 +522,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) if (!fnew) { if (channel->sc_creation_callback != NULL) channel->sc_creation_callback(newchannel); - atomic_dec(&vmbus_connection.offer_in_progress); + newchannel->probe_done = true; return; } @@ -541,7 +553,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) goto err_deq_chan; } - atomic_dec(&vmbus_connection.offer_in_progress); + newchannel->probe_done = true; return; err_deq_chan: @@ -599,7 +611,7 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) */ channel->numa_node = 0; channel->target_cpu = 0; - channel->target_vp = hv_context.vp_index[0]; + channel->target_vp = hv_cpu_number_to_vp_number(0); return; } @@ -683,7 +695,7 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) } channel->target_cpu = cur_cpu; - channel->target_vp = hv_context.vp_index[cur_cpu]; + channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu); } static void vmbus_wait_for_unload(void) @@ -805,21 +817,12 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) /* * Setup state for signalling the host. */ - newchannel->sig_event = (struct hv_input_signal_event *) - (ALIGN((unsigned long) - &newchannel->sig_buf, - HV_HYPERCALL_PARAM_ALIGN)); - - newchannel->sig_event->connectionid.asu32 = 0; - newchannel->sig_event->connectionid.u.id = VMBUS_EVENT_CONNECTION_ID; - newchannel->sig_event->flag_number = 0; - newchannel->sig_event->rsvdz = 0; + newchannel->sig_event = VMBUS_EVENT_CONNECTION_ID; if (vmbus_proto_version != VERSION_WS2008) { newchannel->is_dedicated_interrupt = (offer->is_dedicated_interrupt != 0); - newchannel->sig_event->connectionid.u.id = - offer->connection_id; + newchannel->sig_event = offer->connection_id; } memcpy(&newchannel->offermsg, offer, @@ -839,7 +842,6 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) { struct vmbus_channel_rescind_offer *rescind; struct vmbus_channel *channel; - unsigned long flags; struct device *dev; rescind = (struct vmbus_channel_rescind_offer *)hdr; @@ -878,15 +880,25 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) return; } - spin_lock_irqsave(&channel->lock, flags); - channel->rescind = true; - spin_unlock_irqrestore(&channel->lock, flags); + /* + * Now wait for offer handling to complete. + */ + while (READ_ONCE(channel->probe_done) == false) { + /* + * We wait here until any channel offer is currently + * being processed. + */ + msleep(1); + } - vmbus_rescind_cleanup(channel); + /* + * At this point, the rescind handling can proceed safely. 
+ */ if (channel->device_obj) { if (channel->chn_rescind_callback) { channel->chn_rescind_callback(channel); + vmbus_rescind_cleanup(channel); return; } /* @@ -895,6 +907,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) */ dev = get_device(&channel->device_obj->device); if (dev) { + vmbus_rescind_cleanup(channel); vmbus_device_unregister(channel->device_obj); put_device(dev); } @@ -907,29 +920,25 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) * 1. Close all sub-channels first * 2. Then close the primary channel. */ + mutex_lock(&vmbus_connection.channel_mutex); + vmbus_rescind_cleanup(channel); if (channel->state == CHANNEL_OPEN_STATE) { /* * The channel is currently not open; * it is safe for us to cleanup the channel. */ - mutex_lock(&vmbus_connection.channel_mutex); - hv_process_channel_removal(channel, - channel->offermsg.child_relid); - mutex_unlock(&vmbus_connection.channel_mutex); + hv_process_channel_removal(rescind->child_relid); } + mutex_unlock(&vmbus_connection.channel_mutex); } } void vmbus_hvsock_device_unregister(struct vmbus_channel *channel) { - mutex_lock(&vmbus_connection.channel_mutex); - BUG_ON(!is_hvsock_channel(channel)); channel->rescind = true; vmbus_device_unregister(channel->device_obj); - - mutex_unlock(&vmbus_connection.channel_mutex); } EXPORT_SYMBOL_GPL(vmbus_hvsock_device_unregister); @@ -1228,8 +1237,7 @@ struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary) return outgoing_channel; } - cur_cpu = hv_context.vp_index[get_cpu()]; - put_cpu(); + cur_cpu = hv_cpu_number_to_vp_number(smp_processor_id()); list_for_each_safe(cur, tmp, &primary->sc_list) { cur_channel = list_entry(cur, struct vmbus_channel, sc_list); if (cur_channel->state != CHANNEL_OPENED_STATE) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 59c11ff90d12..f41901f80b64 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -32,6 +32,8 @@ #include <linux/hyperv.h> #include <linux/export.h> #include <asm/hyperv.h> +#include <asm/mshyperv.h> + #include "hyperv_vmbus.h" @@ -94,7 +96,8 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, * the CPU attempting to connect may not be CPU 0. */ if (version >= VERSION_WIN8_1) { - msg->target_vcpu = hv_context.vp_index[smp_processor_id()]; + msg->target_vcpu = + hv_cpu_number_to_vp_number(smp_processor_id()); vmbus_connection.connect_cpu = smp_processor_id(); } else { msg->target_vcpu = 0; @@ -406,6 +409,6 @@ void vmbus_set_event(struct vmbus_channel *channel) if (!channel->is_dedicated_interrupt) vmbus_send_interrupt(child_relid); - hv_do_hypercall(HVCALL_SIGNAL_EVENT, channel->sig_event, NULL); + hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, channel->sig_event); } EXPORT_SYMBOL_GPL(vmbus_set_event); diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 2ea12207caa0..8267439dd1ee 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -234,7 +234,6 @@ int hv_synic_init(unsigned int cpu) union hv_synic_siefp siefp; union hv_synic_sint shared_sint; union hv_synic_scontrol sctrl; - u64 vp_index; /* Setup the Synic's message page */ hv_get_simp(simp.as_uint64); @@ -276,14 +275,6 @@ int hv_synic_init(unsigned int cpu) hv_context.synic_initialized = true; /* - * Setup the mapping between Hyper-V's notion - * of cpuid and Linux' notion of cpuid. - * This array will be indexed using Linux cpuid. - */ - hv_get_vp_index(vp_index); - hv_context.vp_index[cpu] = (u32)vp_index; - - /* * Register the per-cpu clockevent source. 
*/ if (ms_hyperv.features & HV_X64_MSR_SYNTIMER_AVAILABLE) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index f5728deff893..db0e6652d7ef 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -584,10 +584,6 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, switch (val) { case MEM_ONLINE: - spin_lock_irqsave(&dm_device.ha_lock, flags); - dm_device.num_pages_onlined += mem->nr_pages; - spin_unlock_irqrestore(&dm_device.ha_lock, flags); - /* Fall through */ case MEM_CANCEL_ONLINE: if (dm_device.ha_waiting) { dm_device.ha_waiting = false; @@ -644,6 +640,9 @@ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) __online_page_set_limits(pg); __online_page_increment_counters(pg); __online_page_free(pg); + + WARN_ON_ONCE(!spin_is_locked(&dm_device.ha_lock)); + dm_device.num_pages_onlined++; } static void hv_bring_pgs_online(struct hv_hotadd_state *has, @@ -1036,8 +1035,8 @@ static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg) if (info_hdr->data_size == sizeof(__u64)) { __u64 *max_page_count = (__u64 *)&info_hdr[1]; - pr_info("INFO_TYPE_MAX_PAGE_CNT = %llu\n", - *max_page_count); + pr_info("Max. dynamic memory size: %llu MB\n", + (*max_page_count) >> (20 - PAGE_SHIFT)); } break; @@ -1656,6 +1655,7 @@ static int balloon_probe(struct hv_device *dev, } dm_device.state = DM_INITIALIZED; + last_post_time = jiffies; return 0; diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index d4e4e56c8fb1..002227d1ba9c 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -171,6 +171,10 @@ static void fcopy_send_data(struct work_struct *dummy) out_src = smsg_out; break; + case WRITE_TO_FILE: + out_src = fcopy_transaction.fcopy_msg; + out_len = sizeof(struct hv_do_fcopy); + break; default: out_src = fcopy_transaction.fcopy_msg; out_len = fcopy_transaction.recv_len; diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index eefc72a49a5f..e1d5debcb0ed 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -319,7 +319,7 @@ static int process_ob_ipinfo(void *in_msg, void *out_msg, int op) strlen((char *)in->body.kvp_ip_val.adapter_id), UTF16_HOST_ENDIAN, (wchar_t *)out->kvp_ip_val.adapter_id, - MAX_IP_ADDR_SIZE); + MAX_ADAPTER_ID_SIZE); if (len < 0) return len; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 252191b1fa4d..85db40f036dc 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -229,17 +229,6 @@ struct hv_context { struct hv_per_cpu_context __percpu *cpu_context; /* - * Hypervisor's notion of virtual processor ID is different from - * Linux' notion of CPU ID. This information can only be retrieved - * in the context of the calling CPU. Setup a map for easy access - * to this information: - * - * vp_index[a] is the Hyper-V's processor ID corresponding to - * Linux cpuid 'a'. - */ - u32 vp_index[NR_CPUS]; - - /* * To manage allocations in a NUMA node. * Array indexed by numa node ID. */ diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 1f450c39a9b0..12eb8caa4263 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -29,6 +29,7 @@ #include <linux/uio.h> #include <linux/vmalloc.h> #include <linux/slab.h> +#include <linux/prefetch.h> #include "hyperv_vmbus.h" @@ -94,30 +95,6 @@ hv_set_next_write_location(struct hv_ring_buffer_info *ring_info, ring_info->ring_buffer->write_index = next_write_location; } -/* Get the next read location for the specified ring buffer. 
*/ -static inline u32 -hv_get_next_read_location(const struct hv_ring_buffer_info *ring_info) -{ - return ring_info->ring_buffer->read_index; -} - -/* - * Get the next read location + offset for the specified ring buffer. - * This allows the caller to skip. - */ -static inline u32 -hv_get_next_readlocation_withoffset(const struct hv_ring_buffer_info *ring_info, - u32 offset) -{ - u32 next = ring_info->ring_buffer->read_index; - - next += offset; - if (next >= ring_info->ring_datasize) - next -= ring_info->ring_datasize; - - return next; -} - /* Set the next read location for the specified ring buffer. */ static inline void hv_set_next_read_location(struct hv_ring_buffer_info *ring_info, @@ -142,29 +119,6 @@ hv_get_ring_bufferindices(struct hv_ring_buffer_info *ring_info) } /* - * Helper routine to copy to source from ring buffer. - * Assume there is enough room. Handles wrap-around in src case only!! - */ -static u32 hv_copyfrom_ringbuffer( - const struct hv_ring_buffer_info *ring_info, - void *dest, - u32 destlen, - u32 start_read_offset) -{ - void *ring_buffer = hv_get_ring_buffer(ring_info); - u32 ring_buffer_size = hv_get_ring_buffersize(ring_info); - - memcpy(dest, ring_buffer + start_read_offset, destlen); - - start_read_offset += destlen; - if (start_read_offset >= ring_buffer_size) - start_read_offset -= ring_buffer_size; - - return start_read_offset; -} - - -/* * Helper routine to copy from source to ring buffer. * Assume there is enough room. Handles wrap-around in dest case only!! */ @@ -334,33 +288,22 @@ int hv_ringbuffer_write(struct vmbus_channel *channel, return 0; } -static inline void -init_cached_read_index(struct hv_ring_buffer_info *rbi) -{ - rbi->cached_read_index = rbi->ring_buffer->read_index; -} - int hv_ringbuffer_read(struct vmbus_channel *channel, void *buffer, u32 buflen, u32 *buffer_actual_len, u64 *requestid, bool raw) { - u32 bytes_avail_toread; - u32 next_read_location; - u64 prev_indices = 0; - struct vmpacket_descriptor desc; - u32 offset; - u32 packetlen; - struct hv_ring_buffer_info *inring_info = &channel->inbound; - - if (buflen <= 0) + struct vmpacket_descriptor *desc; + u32 packetlen, offset; + + if (unlikely(buflen == 0)) return -EINVAL; *buffer_actual_len = 0; *requestid = 0; - bytes_avail_toread = hv_get_bytes_to_read(inring_info); /* Make sure there is something to read */ - if (bytes_avail_toread < sizeof(desc)) { + desc = hv_pkt_iter_first(channel); + if (desc == NULL) { /* * No error is set when there is even no header, drivers are * supposed to analyze buffer_actual_len. @@ -368,48 +311,22 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, return 0; } - init_cached_read_index(inring_info); - - next_read_location = hv_get_next_read_location(inring_info); - next_read_location = hv_copyfrom_ringbuffer(inring_info, &desc, - sizeof(desc), - next_read_location); - - offset = raw ? 0 : (desc.offset8 << 3); - packetlen = (desc.len8 << 3) - offset; + offset = raw ? 
0 : (desc->offset8 << 3); + packetlen = (desc->len8 << 3) - offset; *buffer_actual_len = packetlen; - *requestid = desc.trans_id; - - if (bytes_avail_toread < packetlen + offset) - return -EAGAIN; + *requestid = desc->trans_id; - if (packetlen > buflen) + if (unlikely(packetlen > buflen)) return -ENOBUFS; - next_read_location = - hv_get_next_readlocation_withoffset(inring_info, offset); + /* since ring is double mapped, only one copy is necessary */ + memcpy(buffer, (const char *)desc + offset, packetlen); - next_read_location = hv_copyfrom_ringbuffer(inring_info, - buffer, - packetlen, - next_read_location); + /* Advance ring index to next packet descriptor */ + __hv_pkt_iter_next(channel, desc); - next_read_location = hv_copyfrom_ringbuffer(inring_info, - &prev_indices, - sizeof(u64), - next_read_location); - - /* - * Make sure all reads are done before we update the read index since - * the writer may start writing to the read area once the read index - * is updated. - */ - virt_mb(); - - /* Update the read index */ - hv_set_next_read_location(inring_info, next_read_location); - - hv_signal_on_read(channel); + /* Notify host of update */ + hv_pkt_iter_close(channel); return 0; } @@ -440,14 +357,16 @@ static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi) struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel) { struct hv_ring_buffer_info *rbi = &channel->inbound; - - /* set state for later hv_signal_on_read() */ - init_cached_read_index(rbi); + struct vmpacket_descriptor *desc; if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor)) return NULL; - return hv_get_ring_buffer(rbi) + rbi->priv_read_index; + desc = hv_get_ring_buffer(rbi) + rbi->priv_read_index; + if (desc) + prefetch((char *)desc + (desc->len8 << 3)); + + return desc; } EXPORT_SYMBOL_GPL(hv_pkt_iter_first); @@ -471,10 +390,7 @@ __hv_pkt_iter_next(struct vmbus_channel *channel, rbi->priv_read_index -= dsize; /* more data? */ - if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor)) - return NULL; - else - return hv_get_ring_buffer(rbi) + rbi->priv_read_index; + return hv_pkt_iter_first(channel); } EXPORT_SYMBOL_GPL(__hv_pkt_iter_next); @@ -484,6 +400,7 @@ EXPORT_SYMBOL_GPL(__hv_pkt_iter_next); void hv_pkt_iter_close(struct vmbus_channel *channel) { struct hv_ring_buffer_info *rbi = &channel->inbound; + u32 orig_write_sz = hv_get_bytes_to_write(rbi); /* * Make sure all reads are done before we update the read index since @@ -493,6 +410,40 @@ void hv_pkt_iter_close(struct vmbus_channel *channel) virt_rmb(); rbi->ring_buffer->read_index = rbi->priv_read_index; - hv_signal_on_read(channel); + /* + * Issue a full memory barrier before making the signaling decision. + * Here is the reason for having this barrier: + * If the reading of the pend_sz (in this function) + * were to be reordered and read before we commit the new read + * index (in the calling function) we could + * have a problem. If the host were to set the pending_sz after we + * have sampled pending_sz and go to sleep before we commit the + * read index, we could miss sending the interrupt. Issue a full + * memory barrier to address this. + */ + virt_mb(); + + /* If host has disabled notifications then skip */ + if (rbi->ring_buffer->interrupt_mask) + return; + + if (rbi->ring_buffer->feature_bits.feat_pending_send_sz) { + u32 pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz); + + /* + * If there was space before we began iteration, + * then host was not blocked. 
Also handles case where + * pending_sz is zero then host has nothing pending + * and does not need to be signaled. + */ + if (orig_write_sz > pending_sz) + return; + + /* If pending write will not fit, don't give false hope. */ + if (hv_get_bytes_to_write(rbi) < pending_sz) + return; + } + + vmbus_setevent(channel); } EXPORT_SYMBOL_GPL(hv_pkt_iter_close); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index ed84e96715a0..937801ac2fe0 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -768,8 +768,7 @@ static void vmbus_device_release(struct device *device) struct vmbus_channel *channel = hv_dev->channel; mutex_lock(&vmbus_connection.channel_mutex); - hv_process_channel_removal(channel, - channel->offermsg.child_relid); + hv_process_channel_removal(channel->offermsg.child_relid); mutex_unlock(&vmbus_connection.channel_mutex); kfree(hv_dev); @@ -940,6 +939,9 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (channel->offermsg.child_relid != relid) continue; + if (channel->rescind) + continue; + switch (channel->callback_mode) { case HV_CALL_ISR: vmbus_channel_isr(channel); @@ -1451,23 +1453,6 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size) } EXPORT_SYMBOL_GPL(vmbus_free_mmio); -/** - * vmbus_cpu_number_to_vp_number() - Map CPU to VP. - * @cpu_number: CPU number in Linux terms - * - * This function returns the mapping between the Linux processor - * number and the hypervisor's virtual processor number, useful - * in making hypercalls and such that talk about specific - * processors. - * - * Return: Virtual processor number in Hyper-V terms - */ -int vmbus_cpu_number_to_vp_number(int cpu_number) -{ - return hv_context.vp_index[cpu_number]; -} -EXPORT_SYMBOL_GPL(vmbus_cpu_number_to_vp_number); - static int vmbus_acpi_add(struct acpi_device *device) { acpi_status result; diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index d1ea99a12cf2..5176be76ca7d 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -148,6 +148,10 @@ struct netvsc_device_info { unsigned char mac_adr[ETH_ALEN]; int ring_size; u32 num_chn; + u32 send_sections; + u32 recv_sections; + u32 send_section_size; + u32 recv_section_size; }; enum rndis_device_state { @@ -202,6 +206,8 @@ int netvsc_recv_callback(struct net_device *net, const struct ndis_pkt_8021q_info *vlan); void netvsc_channel_cb(void *context); int netvsc_poll(struct napi_struct *napi, int budget); + +void rndis_set_subchannel(struct work_struct *w); bool rndis_filter_opened(const struct netvsc_device *nvdev); int rndis_filter_open(struct netvsc_device *nvdev); int rndis_filter_close(struct netvsc_device *nvdev); @@ -211,7 +217,7 @@ void rndis_filter_update(struct netvsc_device *nvdev); void rndis_filter_device_remove(struct hv_device *dev, struct netvsc_device *nvdev); int rndis_filter_set_rss_param(struct rndis_device *rdev, - const u8 *key, int num_queue); + const u8 *key); int rndis_filter_receive(struct net_device *ndev, struct netvsc_device *net_dev, struct hv_device *dev, @@ -634,12 +640,12 @@ struct nvsp_message { #define NETVSC_SEND_BUFFER_SIZE (1024 * 1024 * 15) /* 15MB */ #define NETVSC_INVALID_INDEX -1 +#define NETVSC_SEND_SECTION_SIZE 6144 +#define NETVSC_RECV_SECTION_SIZE 1728 #define NETVSC_RECEIVE_BUFFER_ID 0xcafe #define NETVSC_SEND_BUFFER_ID 0 -#define NETVSC_PACKET_SIZE 4096 - #define VRSS_SEND_TAB_SIZE 16 /* must be power of 2 */ #define VRSS_CHANNEL_MAX 64 #define VRSS_CHANNEL_DEFAULT 8 @@ -678,6 +684,8 @@ struct 
netvsc_ethtool_stats { unsigned long tx_no_space; unsigned long tx_too_big; unsigned long tx_busy; + unsigned long tx_send_full; + unsigned long rx_comp_busy; }; struct netvsc_vf_pcpu_stats { @@ -716,6 +724,8 @@ struct net_device_context { u32 tx_send_table[VRSS_SEND_TAB_SIZE]; /* Ethtool settings */ + bool udp4_l4_hash; + bool udp6_l4_hash; u8 duplex; u32 speed; struct netvsc_ethtool_stats eth_stats; @@ -723,7 +733,7 @@ struct net_device_context { /* State to manage the associated VF interface. */ struct net_device __rcu *vf_netdev; struct netvsc_vf_pcpu_stats __percpu *vf_stats; - struct work_struct vf_takeover; + struct delayed_work vf_takeover; /* 1: allocated, serial number is valid. 0: not allocated */ u32 vf_alloc; @@ -754,14 +764,13 @@ struct netvsc_device { /* Receive buffer allocated by us but manages by NetVSP */ void *recv_buf; - u32 recv_buf_size; u32 recv_buf_gpadl_handle; u32 recv_section_cnt; + u32 recv_section_size; u32 recv_completion_cnt; /* Send buffer allocated by us */ void *send_buf; - u32 send_buf_size; u32 send_buf_gpadl_handle; u32 send_section_cnt; u32 send_section_size; @@ -776,7 +785,9 @@ struct netvsc_device { u32 max_chn; u32 num_chn; - refcount_t sc_offered; + atomic_t open_chn; + struct work_struct subchan_work; + wait_queue_head_t subchan_open; struct rndis_device *extension; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 208f03aa83de..8d5077fb0492 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -75,7 +75,10 @@ static struct netvsc_device *alloc_net_device(void) atomic_set(&net_device->open_cnt, 0); net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT; net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT; + init_completion(&net_device->channel_init_wait); + init_waitqueue_head(&net_device->subchan_open); + INIT_WORK(&net_device->subchan_work, rndis_set_subchannel); return net_device; } @@ -142,6 +145,7 @@ static void netvsc_destroy_buf(struct hv_device *device) "revoke receive buffer to netvsp\n"); return; } + net_device->recv_section_cnt = 0; } /* Teardown the gpadl on the vsp end */ @@ -172,7 +176,7 @@ static void netvsc_destroy_buf(struct hv_device *device) * NVSP_MSG1_TYPE_SEND_SEND_BUF msg) therefore, we need * to send a revoke msg here */ - if (net_device->send_section_size) { + if (net_device->send_section_cnt) { /* Send the revoke receive buffer */ revoke_packet = &net_device->revoke_packet; memset(revoke_packet, 0, sizeof(struct nvsp_message)); @@ -204,6 +208,7 @@ static void netvsc_destroy_buf(struct hv_device *device) "revoke send buffer to netvsp\n"); return; } + net_device->send_section_cnt = 0; } /* Teardown the gpadl on the vsp end */ if (net_device->send_buf_gpadl_handle) { @@ -243,25 +248,25 @@ int netvsc_alloc_recv_comp_ring(struct netvsc_device *net_device, u32 q_idx) } static int netvsc_init_buf(struct hv_device *device, - struct netvsc_device *net_device) + struct netvsc_device *net_device, + const struct netvsc_device_info *device_info) { - int ret = 0; - struct nvsp_message *init_packet; struct nvsp_1_message_send_receive_buffer_complete *resp; - struct net_device *ndev; + struct net_device *ndev = hv_get_drvdata(device); + struct nvsp_message *init_packet; + unsigned int buf_size; size_t map_words; - int node; - - ndev = hv_get_drvdata(device); + int ret = 0; - node = cpu_to_node(device->channel->target_cpu); - net_device->recv_buf = vzalloc_node(net_device->recv_buf_size, node); - if (!net_device->recv_buf) - net_device->recv_buf = vzalloc(net_device->recv_buf_size); + /* Get receive 
buffer area. */ + buf_size = device_info->recv_sections * device_info->recv_section_size; + buf_size = roundup(buf_size, PAGE_SIZE); + net_device->recv_buf = vzalloc(buf_size); if (!net_device->recv_buf) { - netdev_err(ndev, "unable to allocate receive " - "buffer of size %d\n", net_device->recv_buf_size); + netdev_err(ndev, + "unable to allocate receive buffer of size %u\n", + buf_size); ret = -ENOMEM; goto cleanup; } @@ -272,7 +277,7 @@ static int netvsc_init_buf(struct hv_device *device, * than the channel to establish the gpadl handle. */ ret = vmbus_establish_gpadl(device->channel, net_device->recv_buf, - net_device->recv_buf_size, + buf_size, &net_device->recv_buf_gpadl_handle); if (ret != 0) { netdev_err(ndev, @@ -318,33 +323,31 @@ static int netvsc_init_buf(struct hv_device *device, resp->num_sections, resp->sections[0].sub_alloc_size, resp->sections[0].num_sub_allocs); - net_device->recv_section_cnt = resp->num_sections; - - /* - * For 1st release, there should only be 1 section that represents the - * entire receive buffer - */ - if (net_device->recv_section_cnt != 1 || - resp->sections[0].offset != 0) { + /* There should only be one section for the entire receive buffer */ + if (resp->num_sections != 1 || resp->sections[0].offset != 0) { ret = -EINVAL; goto cleanup; } + net_device->recv_section_size = resp->sections[0].sub_alloc_size; + net_device->recv_section_cnt = resp->sections[0].num_sub_allocs; + /* Setup receive completion ring */ net_device->recv_completion_cnt - = round_up(resp->sections[0].num_sub_allocs + 1, + = round_up(net_device->recv_section_cnt + 1, PAGE_SIZE / sizeof(u64)); ret = netvsc_alloc_recv_comp_ring(net_device, 0); if (ret) goto cleanup; /* Now setup the send buffer. */ - net_device->send_buf = vzalloc_node(net_device->send_buf_size, node); - if (!net_device->send_buf) - net_device->send_buf = vzalloc(net_device->send_buf_size); + buf_size = device_info->send_sections * device_info->send_section_size; + buf_size = round_up(buf_size, PAGE_SIZE); + + net_device->send_buf = vzalloc(buf_size); if (!net_device->send_buf) { - netdev_err(ndev, "unable to allocate send " - "buffer of size %d\n", net_device->send_buf_size); + netdev_err(ndev, "unable to allocate send buffer of size %u\n", + buf_size); ret = -ENOMEM; goto cleanup; } @@ -354,7 +357,7 @@ static int netvsc_init_buf(struct hv_device *device, * than the channel to establish the gpadl handle. */ ret = vmbus_establish_gpadl(device->channel, net_device->send_buf, - net_device->send_buf_size, + buf_size, &net_device->send_buf_gpadl_handle); if (ret != 0) { netdev_err(ndev, @@ -399,10 +402,8 @@ static int netvsc_init_buf(struct hv_device *device, net_device->send_section_size = init_packet->msg. v1_msg.send_send_buf_complete.section_size; - /* Section count is simply the size divided by the section size. - */ - net_device->send_section_cnt = - net_device->send_buf_size / net_device->send_section_size; + /* Section count is simply the size divided by the section size. 
*/ + net_device->send_section_cnt = buf_size / net_device->send_section_size; netdev_dbg(ndev, "Send section size: %d, Section count:%d\n", net_device->send_section_size, net_device->send_section_cnt); @@ -480,7 +481,8 @@ static int negotiate_nvsp_ver(struct hv_device *device, } static int netvsc_connect_vsp(struct hv_device *device, - struct netvsc_device *net_device) + struct netvsc_device *net_device, + const struct netvsc_device_info *device_info) { const u32 ver_list[] = { NVSP_PROTOCOL_VERSION_1, NVSP_PROTOCOL_VERSION_2, @@ -530,14 +532,8 @@ static int netvsc_connect_vsp(struct hv_device *device, if (ret != 0) goto cleanup; - /* Post the big receive buffer to NetVSP */ - if (net_device->nvsp_version <= NVSP_PROTOCOL_VERSION_2) - net_device->recv_buf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY; - else - net_device->recv_buf_size = NETVSC_RECEIVE_BUFFER_SIZE; - net_device->send_buf_size = NETVSC_SEND_BUFFER_SIZE; - ret = netvsc_init_buf(device, net_device); + ret = netvsc_init_buf(device, net_device, device_info); cleanup: return ret; @@ -559,6 +555,8 @@ void netvsc_device_remove(struct hv_device *device) = rtnl_dereference(net_device_ctx->nvdev); int i; + cancel_work_sync(&net_device->subchan_work); + netvsc_disconnect_vsp(device); RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); @@ -777,18 +775,15 @@ static inline int netvsc_send_pkt( if (packet->cp_partial) pb += packet->rmsg_pgcnt; - ret = vmbus_sendpacket_pagebuffer_ctl(out_channel, - pb, packet->page_buf_cnt, - &nvmsg, - sizeof(struct nvsp_message), - req_id, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + ret = vmbus_sendpacket_pagebuffer(out_channel, + pb, packet->page_buf_cnt, + &nvmsg, sizeof(nvmsg), + req_id); } else { - ret = vmbus_sendpacket_ctl(out_channel, &nvmsg, - sizeof(struct nvsp_message), - req_id, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + ret = vmbus_sendpacket(out_channel, + &nvmsg, sizeof(nvmsg), + req_id, VM_PKT_DATA_INBAND, + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); } if (ret == 0) { @@ -885,7 +880,9 @@ int netvsc_send(struct net_device_context *ndev_ctx, } else if (pktlen + net_device->pkt_align < net_device->send_section_size) { section_index = netvsc_get_next_send_section(net_device); - if (section_index != NETVSC_INVALID_INDEX) { + if (unlikely(section_index == NETVSC_INVALID_INDEX)) { + ++ndev_ctx->eth_stats.tx_send_full; + } else { move_pkt_msd(&msd_send, &msd_skb, msdp); msd_len = 0; } @@ -951,9 +948,10 @@ send_now: } /* Send pending recv completions */ -static int send_recv_completions(struct netvsc_channel *nvchan) +static int send_recv_completions(struct net_device *ndev, + struct netvsc_device *nvdev, + struct netvsc_channel *nvchan) { - struct netvsc_device *nvdev = nvchan->net_device; struct multi_recv_comp *mrc = &nvchan->mrc; struct recv_comp_msg { struct nvsp_message_header hdr; @@ -971,8 +969,12 @@ static int send_recv_completions(struct netvsc_channel *nvchan) msg.status = rcd->status; ret = vmbus_sendpacket(nvchan->channel, &msg, sizeof(msg), rcd->tid, VM_PKT_COMP, 0); - if (unlikely(ret)) + if (unlikely(ret)) { + struct net_device_context *ndev_ctx = netdev_priv(ndev); + + ++ndev_ctx->eth_stats.rx_comp_busy; return ret; + } if (++mrc->first == nvdev->recv_completion_cnt) mrc->first = 0; @@ -1013,7 +1015,7 @@ static void enq_receive_complete(struct net_device *ndev, recv_comp_slot_avail(nvdev, mrc, &filled, &avail); if (unlikely(filled > NAPI_POLL_WEIGHT)) { - send_recv_completions(nvchan); + send_recv_completions(ndev, nvdev, nvchan); 
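/* The flush just above can fail when the host-bound VMBus ring is
 * full (send_recv_completions() counts that case in the new
 * rx_comp_busy statistic and returns early), so the free-slot
 * accounting is re-read here before this completion is queued. */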
recv_comp_slot_avail(nvdev, mrc, &filled, &avail); } @@ -1190,17 +1192,13 @@ int netvsc_poll(struct napi_struct *napi, int budget) nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc); } - /* if ring is empty, signal host */ - if (!nvchan->desc) - hv_pkt_iter_close(channel); - /* If send of pending receive completions suceeded * and did not exhaust NAPI budget this time * and not doing busy poll * then re-enable host interrupts * and reschedule if ring is not empty. */ - if (send_recv_completions(nvchan) == 0 && + if (send_recv_completions(ndev, net_device, nvchan) == 0 && work_done < budget && napi_complete_done(napi, work_done) && hv_end_read(&channel->inbound)) { @@ -1268,6 +1266,8 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, nvchan->channel = device->channel; nvchan->net_device = net_device; + u64_stats_init(&nvchan->tx_stats.syncp); + u64_stats_init(&nvchan->rx_stats.syncp); } /* Enable NAPI handler before init callbacks */ @@ -1297,7 +1297,7 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, rcu_assign_pointer(net_device_ctx->nvdev, net_device); /* Connect with the NetVsp */ - ret = netvsc_connect_vsp(device, net_device); + ret = netvsc_connect_vsp(device, net_device, device_info); if (ret != 0) { netdev_err(ndev, "unable to connect to NetVSP - %d\n", ret); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d5d63184f65b..a32ae02e1b6c 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -45,8 +45,14 @@ #include "hyperv_net.h" -#define RING_SIZE_MIN 64 +#define RING_SIZE_MIN 64 +#define NETVSC_MIN_TX_SECTIONS 10 +#define NETVSC_DEFAULT_TX 192 /* ~1M */ +#define NETVSC_MIN_RX_SECTIONS 10 /* ~64K */ +#define NETVSC_DEFAULT_RX 10485 /* Max ~16M */ + #define LINKCHANGE_INT (2 * HZ) +#define VF_TAKEOVER_INT (HZ / 10) static int ring_size = 128; module_param(ring_size, int, S_IRUGO); @@ -113,12 +119,16 @@ static int netvsc_close(struct net_device *net) struct net_device *vf_netdev = rtnl_dereference(net_device_ctx->vf_netdev); struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev); - int ret; + int ret = 0; u32 aread, i, msec = 10, retry = 0, retry_max = 20; struct vmbus_channel *chn; netif_tx_disable(net); + /* No need to close rndis filter if it is removed already */ + if (!nvdev) + goto out; + ret = rndis_filter_close(nvdev); if (ret != 0) { netdev_err(net, "unable to close device (ret %d).\n", ret); @@ -157,6 +167,7 @@ static int netvsc_close(struct net_device *net) ret = -ETIMEDOUT; } +out: if (vf_netdev) dev_close(vf_netdev); @@ -164,7 +175,7 @@ static int netvsc_close(struct net_device *net) } static void *init_ppi_data(struct rndis_message *msg, u32 ppi_size, - int pkt_type) + int pkt_type) { struct rndis_packet *rndis_pkt; struct rndis_per_packet_info *ppi; @@ -184,10 +195,12 @@ static void *init_ppi_data(struct rndis_message *msg, u32 ppi_size, return ppi; } -/* Azure hosts don't support non-TCP port numbers in hashing yet. We compute - * hash for non-TCP traffic with only IP numbers. +/* Azure hosts don't support non-TCP port numbers in hashing for fragmented + * packets. We can use ethtool to change UDP hash level when necessary. 
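 * For example, assuming an interface named eth0 and a stock ethtool
 * build, UDP L4 hashing can be toggled at run time with:
 *   ethtool -N eth0 rx-flow-hash udp4 sdfn  (hash on addresses + ports)
 *   ethtool -N eth0 rx-flow-hash udp4 sd    (hash on addresses only)
 * (udp6 is handled analogously via the udp6_l4_hash flag.)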
*/ -static inline u32 netvsc_get_hash(struct sk_buff *skb, struct sock *sk) +static inline u32 netvsc_get_hash( + struct sk_buff *skb, + const struct net_device_context *ndc) { struct flow_keys flow; u32 hash; @@ -198,7 +211,11 @@ static inline u32 netvsc_get_hash(struct sk_buff *skb, struct sock *sk) if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) return 0; - if (flow.basic.ip_proto == IPPROTO_TCP) { + if (flow.basic.ip_proto == IPPROTO_TCP || + (flow.basic.ip_proto == IPPROTO_UDP && + ((flow.basic.n_proto == htons(ETH_P_IP) && ndc->udp4_l4_hash) || + (flow.basic.n_proto == htons(ETH_P_IPV6) && + ndc->udp6_l4_hash)))) { return skb_get_hash(skb); } else { if (flow.basic.n_proto == htons(ETH_P_IP)) @@ -221,7 +238,7 @@ static inline int netvsc_get_tx_queue(struct net_device *ndev, struct sock *sk = skb->sk; int q_idx; - q_idx = ndc->tx_send_table[netvsc_get_hash(skb, sk) & + q_idx = ndc->tx_send_table[netvsc_get_hash(skb, ndc) & (VRSS_SEND_TAB_SIZE - 1)]; /* If queue index changed record the new value */ @@ -285,7 +302,7 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, } static u32 fill_pg_buf(struct page *page, u32 offset, u32 len, - struct hv_page_buffer *pb) + struct hv_page_buffer *pb) { int j = 0; @@ -332,10 +349,9 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, * 2. skb linear data * 3. skb fragment data */ - if (hdr != NULL) - slots_used += fill_pg_buf(virt_to_page(hdr), - offset_in_page(hdr), - len, &pb[slots_used]); + slots_used += fill_pg_buf(virt_to_page(hdr), + offset_in_page(hdr), + len, &pb[slots_used]); packet->rmsg_size = len; packet->rmsg_pgcnt = slots_used; @@ -522,9 +538,9 @@ static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net) rndis_msg_size += NDIS_VLAN_PPI_SIZE; ppi = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE, - IEEE_8021Q_INFO); - vlan = (struct ndis_pkt_8021q_info *)((void *)ppi + - ppi->ppi_offset); + IEEE_8021Q_INFO); + + vlan = (void *)ppi + ppi->ppi_offset; vlan->vlanid = skb->vlan_tci & VLAN_VID_MASK; vlan->pri = (skb->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; @@ -537,8 +553,7 @@ static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net) ppi = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE, TCP_LARGESEND_PKTINFO); - lso_info = (struct ndis_tcp_lso_info *)((void *)ppi + - ppi->ppi_offset); + lso_info = (void *)ppi + ppi->ppi_offset; lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; if (skb->protocol == htons(ETH_P_IP)) { @@ -625,6 +640,7 @@ no_memory: ++net_device_ctx->eth_stats.tx_no_memory; goto drop; } + /* * netvsc_linkstatus_callback - Link up/down notification */ @@ -648,8 +664,8 @@ void netvsc_linkstatus_callback(struct hv_device *device_obj, if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) { u32 speed; - speed = *(u32 *)((void *)indicate + indicate-> - status_buf_offset) / 10000; + speed = *(u32 *)((void *)indicate + + indicate->status_buf_offset) / 10000; ndev_ctx->speed = speed; return; } @@ -814,9 +830,6 @@ static int netvsc_set_channels(struct net_device *net, channels->rx_count || channels->tx_count || channels->other_count) return -EINVAL; - if (count > net->num_tx_queues || count > VRSS_CHANNEL_MAX) - return -EINVAL; - if (!nvdev || nvdev->destroy) return -ENODEV; @@ -831,20 +844,27 @@ static int netvsc_set_channels(struct net_device *net, if (was_opened) rndis_filter_close(nvdev); - rndis_filter_device_remove(dev, nvdev); - memset(&device_info, 0, sizeof(device_info)); device_info.num_chn = count; device_info.ring_size = 
ring_size; + device_info.send_sections = nvdev->send_section_cnt; + device_info.send_section_size = nvdev->send_section_size; + device_info.recv_sections = nvdev->recv_section_cnt; + device_info.recv_section_size = nvdev->recv_section_size; + + rndis_filter_device_remove(dev, nvdev); nvdev = rndis_filter_device_add(dev, &device_info); - if (!IS_ERR(nvdev)) { - netif_set_real_num_tx_queues(net, nvdev->num_chn); - netif_set_real_num_rx_queues(net, nvdev->num_chn); - } else { + if (IS_ERR(nvdev)) { ret = PTR_ERR(nvdev); device_info.num_chn = orig; - rndis_filter_device_add(dev, &device_info); + nvdev = rndis_filter_device_add(dev, &device_info); + + if (IS_ERR(nvdev)) { + netdev_err(net, "restoring channel setting failed: %ld\n", + PTR_ERR(nvdev)); + return ret; + } } if (was_opened) @@ -878,6 +898,9 @@ static void netvsc_init_settings(struct net_device *dev) { struct net_device_context *ndc = netdev_priv(dev); + ndc->udp4_l4_hash = true; + ndc->udp6_l4_hash = true; + ndc->speed = SPEED_UNKNOWN; ndc->duplex = DUPLEX_FULL; } @@ -941,6 +964,10 @@ static int netvsc_change_mtu(struct net_device *ndev, int mtu) memset(&device_info, 0, sizeof(device_info)); device_info.ring_size = ring_size; device_info.num_chn = nvdev->num_chn; + device_info.send_sections = nvdev->send_section_cnt; + device_info.send_section_size = nvdev->send_section_size; + device_info.recv_sections = nvdev->recv_section_cnt; + device_info.recv_section_size = nvdev->recv_section_size; rndis_filter_device_remove(hdev, nvdev); @@ -952,10 +979,16 @@ static int netvsc_change_mtu(struct net_device *ndev, int mtu) /* Attempt rollback to original MTU */ ndev->mtu = orig_mtu; - rndis_filter_device_add(hdev, &device_info); + nvdev = rndis_filter_device_add(hdev, &device_info); if (vf_netdev) dev_set_mtu(vf_netdev, orig_mtu); + + if (IS_ERR(nvdev)) { + netdev_err(ndev, "restoring mtu failed: %ld\n", + PTR_ERR(nvdev)); + return ret; + } } if (was_opened) @@ -1005,7 +1038,7 @@ static void netvsc_get_stats64(struct net_device *net, struct net_device_context *ndev_ctx = netdev_priv(net); struct netvsc_device *nvdev = rcu_dereference_rtnl(ndev_ctx->nvdev); struct netvsc_vf_pcpu_stats vf_tot; - int i; + int i; if (!nvdev) return; @@ -1052,27 +1085,31 @@ static void netvsc_get_stats64(struct net_device *net, static int netvsc_set_mac_addr(struct net_device *ndev, void *p) { struct net_device_context *ndc = netdev_priv(ndev); + struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev); struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev); struct sockaddr *addr = p; - char save_adr[ETH_ALEN]; - unsigned char save_aatype; int err; - memcpy(save_adr, ndev->dev_addr, ETH_ALEN); - save_aatype = ndev->addr_assign_type; - - err = eth_mac_addr(ndev, p); - if (err != 0) + err = eth_prepare_mac_addr_change(ndev, p); + if (err) return err; if (!nvdev) return -ENODEV; + if (vf_netdev) { + err = dev_set_mac_address(vf_netdev, addr); + if (err) + return err; + } + err = rndis_filter_set_device_mac(nvdev, addr->sa_data); - if (err != 0) { - /* roll back to saved MAC */ - memcpy(ndev->dev_addr, save_adr, ETH_ALEN); - ndev->addr_assign_type = save_aatype; + if (!err) { + eth_commit_mac_addr_change(ndev, p); + } else if (vf_netdev) { + /* rollback change on VF */ + memcpy(addr->sa_data, ndev->dev_addr, ETH_ALEN); + dev_set_mac_address(vf_netdev, addr); } return err; @@ -1087,6 +1124,8 @@ static const struct { { "tx_no_space", offsetof(struct netvsc_ethtool_stats, tx_no_space) }, { "tx_too_big", offsetof(struct netvsc_ethtool_stats, tx_too_big) }, { 
"tx_busy", offsetof(struct netvsc_ethtool_stats, tx_busy) }, + { "tx_send_full", offsetof(struct netvsc_ethtool_stats, tx_send_full) }, + { "rx_comp_busy", offsetof(struct netvsc_ethtool_stats, rx_comp_busy) }, }, vf_stats[] = { { "vf_rx_packets", offsetof(struct netvsc_vf_pcpu_stats, rx_packets) }, { "vf_rx_bytes", offsetof(struct netvsc_vf_pcpu_stats, rx_bytes) }, @@ -1201,7 +1240,7 @@ static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data) } static int -netvsc_get_rss_hash_opts(struct netvsc_device *nvdev, +netvsc_get_rss_hash_opts(struct net_device_context *ndc, struct ethtool_rxnfc *info) { info->data = RXH_IP_SRC | RXH_IP_DST; @@ -1210,9 +1249,20 @@ netvsc_get_rss_hash_opts(struct netvsc_device *nvdev, case TCP_V4_FLOW: case TCP_V6_FLOW: info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; - /* fallthrough */ + break; + case UDP_V4_FLOW: + if (ndc->udp4_l4_hash) + info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + + break; + case UDP_V6_FLOW: + if (ndc->udp6_l4_hash) + info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + + break; + case IPV4_FLOW: case IPV6_FLOW: break; @@ -1240,11 +1290,51 @@ netvsc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, return 0; case ETHTOOL_GRXFH: - return netvsc_get_rss_hash_opts(nvdev, info); + return netvsc_get_rss_hash_opts(ndc, info); } return -EOPNOTSUPP; } +static int netvsc_set_rss_hash_opts(struct net_device_context *ndc, + struct ethtool_rxnfc *info) +{ + if (info->data == (RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3)) { + if (info->flow_type == UDP_V4_FLOW) + ndc->udp4_l4_hash = true; + else if (info->flow_type == UDP_V6_FLOW) + ndc->udp6_l4_hash = true; + else + return -EOPNOTSUPP; + + return 0; + } + + if (info->data == (RXH_IP_SRC | RXH_IP_DST)) { + if (info->flow_type == UDP_V4_FLOW) + ndc->udp4_l4_hash = false; + else if (info->flow_type == UDP_V6_FLOW) + ndc->udp6_l4_hash = false; + else + return -EOPNOTSUPP; + + return 0; + } + + return -EOPNOTSUPP; +} + +static int +netvsc_set_rxnfc(struct net_device *ndev, struct ethtool_rxnfc *info) +{ + struct net_device_context *ndc = netdev_priv(ndev); + + if (info->cmd == ETHTOOL_SRXFH) + return netvsc_set_rss_hash_opts(ndc, info); + + return -EOPNOTSUPP; +} + #ifdef CONFIG_NET_POLL_CONTROLLER static void netvsc_poll_controller(struct net_device *dev) { @@ -1318,7 +1408,7 @@ static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir, rndis_dev = ndev->extension; if (indir) { for (i = 0; i < ITAB_NUM; i++) - if (indir[i] >= VRSS_CHANNEL_MAX) + if (indir[i] >= ndev->num_chn) return -EINVAL; for (i = 0; i < ITAB_NUM; i++) @@ -1332,7 +1422,107 @@ static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir, key = rndis_dev->rss_key; } - return rndis_filter_set_rss_param(rndis_dev, key, ndev->num_chn); + return rndis_filter_set_rss_param(rndis_dev, key); +} + +/* Hyper-V RNDIS protocol does not have ring in the HW sense. + * It does have pre-allocated receive area which is divided into sections. 
+ */ +static void __netvsc_get_ringparam(struct netvsc_device *nvdev, + struct ethtool_ringparam *ring) +{ + u32 max_buf_size; + + ring->rx_pending = nvdev->recv_section_cnt; + ring->tx_pending = nvdev->send_section_cnt; + + if (nvdev->nvsp_version <= NVSP_PROTOCOL_VERSION_2) + max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY; + else + max_buf_size = NETVSC_RECEIVE_BUFFER_SIZE; + + ring->rx_max_pending = max_buf_size / nvdev->recv_section_size; + ring->tx_max_pending = NETVSC_SEND_BUFFER_SIZE + / nvdev->send_section_size; +} + +static void netvsc_get_ringparam(struct net_device *ndev, + struct ethtool_ringparam *ring) +{ + struct net_device_context *ndevctx = netdev_priv(ndev); + struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev); + + if (!nvdev) + return; + + __netvsc_get_ringparam(nvdev, ring); +} + +static int netvsc_set_ringparam(struct net_device *ndev, + struct ethtool_ringparam *ring) +{ + struct net_device_context *ndevctx = netdev_priv(ndev); + struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev); + struct hv_device *hdev = ndevctx->device_ctx; + struct netvsc_device_info device_info; + struct ethtool_ringparam orig; + u32 new_tx, new_rx; + bool was_opened; + int ret = 0; + + if (!nvdev || nvdev->destroy) + return -ENODEV; + + memset(&orig, 0, sizeof(orig)); + __netvsc_get_ringparam(nvdev, &orig); + + new_tx = clamp_t(u32, ring->tx_pending, + NETVSC_MIN_TX_SECTIONS, orig.tx_max_pending); + new_rx = clamp_t(u32, ring->rx_pending, + NETVSC_MIN_RX_SECTIONS, orig.rx_max_pending); + + if (new_tx == orig.tx_pending && + new_rx == orig.rx_pending) + return 0; /* no change */ + + memset(&device_info, 0, sizeof(device_info)); + device_info.num_chn = nvdev->num_chn; + device_info.ring_size = ring_size; + device_info.send_sections = new_tx; + device_info.send_section_size = nvdev->send_section_size; + device_info.recv_sections = new_rx; + device_info.recv_section_size = nvdev->recv_section_size; + + netif_device_detach(ndev); + was_opened = rndis_filter_opened(nvdev); + if (was_opened) + rndis_filter_close(nvdev); + + rndis_filter_device_remove(hdev, nvdev); + + nvdev = rndis_filter_device_add(hdev, &device_info); + if (IS_ERR(nvdev)) { + ret = PTR_ERR(nvdev); + + device_info.send_sections = orig.tx_pending; + device_info.recv_sections = orig.rx_pending; + nvdev = rndis_filter_device_add(hdev, &device_info); + if (IS_ERR(nvdev)) { + netdev_err(ndev, "restoring ringparam failed: %ld\n", + PTR_ERR(nvdev)); + return ret; + } + } + + if (was_opened) + rndis_filter_open(nvdev); + netif_device_attach(ndev); + + /* We may have missed link change notifications */ + ndevctx->last_reconfig = 0; + schedule_delayed_work(&ndevctx->dwork, 0); + + return ret; } static const struct ethtool_ops ethtool_ops = { @@ -1345,12 +1535,15 @@ static const struct ethtool_ops ethtool_ops = { .set_channels = netvsc_set_channels, .get_ts_info = ethtool_op_get_ts_info, .get_rxnfc = netvsc_get_rxnfc, + .set_rxnfc = netvsc_set_rxnfc, .get_rxfh_key_size = netvsc_get_rxfh_key_size, .get_rxfh_indir_size = netvsc_rss_indir_size, .get_rxfh = netvsc_get_rxfh, .set_rxfh = netvsc_set_rxfh, .get_link_ksettings = netvsc_get_link_ksettings, .set_link_ksettings = netvsc_set_link_ksettings, + .get_ringparam = netvsc_get_ringparam, + .set_ringparam = netvsc_set_ringparam, }; static const struct net_device_ops device_ops = { @@ -1564,7 +1757,9 @@ static int netvsc_vf_join(struct net_device *vf_netdev, /* set slave flag before open to prevent IPv6 addrconf */ vf_netdev->flags |= IFF_SLAVE; - 
schedule_work(&ndev_ctx->vf_takeover); + schedule_delayed_work(&ndev_ctx->vf_takeover, VF_TAKEOVER_INT); + + call_netdevice_notifiers(NETDEV_JOIN, vf_netdev); netdev_info(vf_netdev, "joined to %s\n", ndev->name); return 0; @@ -1580,8 +1775,6 @@ static void __netvsc_vf_setup(struct net_device *ndev, { int ret; - call_netdevice_notifiers(NETDEV_JOIN, vf_netdev); - /* Align MTU of VF with master */ ret = dev_set_mtu(vf_netdev, ndev->mtu); if (ret) @@ -1602,12 +1795,12 @@ static void __netvsc_vf_setup(struct net_device *ndev, static void netvsc_vf_setup(struct work_struct *w) { struct net_device_context *ndev_ctx - = container_of(w, struct net_device_context, vf_takeover); + = container_of(w, struct net_device_context, vf_takeover.work); struct net_device *ndev = hv_get_drvdata(ndev_ctx->device_ctx); struct net_device *vf_netdev; if (!rtnl_trylock()) { - schedule_work(w); + schedule_delayed_work(&ndev_ctx->vf_takeover, 0); return; } @@ -1646,44 +1839,18 @@ static int netvsc_register_vf(struct net_device *vf_netdev) netdev_info(ndev, "VF registering: %s\n", vf_netdev->name); - /* Prevent this module from being unloaded while VF is registered */ - try_module_get(THIS_MODULE); - dev_hold(vf_netdev); rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev); return NOTIFY_OK; } -static int netvsc_vf_up(struct net_device *vf_netdev) -{ - struct net_device_context *net_device_ctx; - struct netvsc_device *netvsc_dev; - struct net_device *ndev; - - ndev = get_netvsc_byref(vf_netdev); - if (!ndev) - return NOTIFY_DONE; - - net_device_ctx = netdev_priv(ndev); - netvsc_dev = rtnl_dereference(net_device_ctx->nvdev); - if (!netvsc_dev) - return NOTIFY_DONE; - - /* Bump refcount when datapath is acvive - Why? */ - rndis_filter_open(netvsc_dev); - - /* notify the host to switch the data path. */ - netvsc_switch_datapath(ndev, true); - netdev_info(ndev, "Data path switched to VF: %s\n", vf_netdev->name); - - return NOTIFY_OK; -} - -static int netvsc_vf_down(struct net_device *vf_netdev) +/* VF up/down change detected, schedule to change data path */ +static int netvsc_vf_changed(struct net_device *vf_netdev) { struct net_device_context *net_device_ctx; struct netvsc_device *netvsc_dev; struct net_device *ndev; + bool vf_is_up = netif_running(vf_netdev); ndev = get_netvsc_byref(vf_netdev); if (!ndev) @@ -1694,9 +1861,9 @@ static int netvsc_vf_down(struct net_device *vf_netdev) if (!netvsc_dev) return NOTIFY_DONE; - netvsc_switch_datapath(ndev, false); - netdev_info(ndev, "Data path switched from VF: %s\n", vf_netdev->name); - rndis_filter_close(netvsc_dev); + netvsc_switch_datapath(ndev, vf_is_up); + netdev_info(ndev, "Data path switched %s VF: %s\n", + vf_is_up ? 
"to" : "from", vf_netdev->name); return NOTIFY_OK; } @@ -1711,14 +1878,15 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev) return NOTIFY_DONE; net_device_ctx = netdev_priv(ndev); - cancel_work_sync(&net_device_ctx->vf_takeover); + cancel_delayed_work_sync(&net_device_ctx->vf_takeover); netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name); + netdev_rx_handler_unregister(vf_netdev); netdev_upper_dev_unlink(vf_netdev, ndev); RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL); dev_put(vf_netdev); - module_put(THIS_MODULE); + return NOTIFY_OK; } @@ -1753,7 +1921,7 @@ static int netvsc_probe(struct hv_device *dev, spin_lock_init(&net_device_ctx->lock); INIT_LIST_HEAD(&net_device_ctx->reconfig_events); - INIT_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup); + INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup); net_device_ctx->vf_stats = netdev_alloc_pcpu_stats(struct netvsc_vf_pcpu_stats); @@ -1771,6 +1939,10 @@ static int netvsc_probe(struct hv_device *dev, memset(&device_info, 0, sizeof(device_info)); device_info.ring_size = ring_size; device_info.num_chn = VRSS_CHANNEL_DEFAULT; + device_info.send_sections = NETVSC_DEFAULT_TX; + device_info.send_section_size = NETVSC_SEND_SECTION_SIZE; + device_info.recv_sections = NETVSC_DEFAULT_RX; + device_info.recv_section_size = NETVSC_RECV_SECTION_SIZE; nvdev = rndis_filter_device_add(dev, &device_info); if (IS_ERR(nvdev)) { @@ -1787,9 +1959,6 @@ static int netvsc_probe(struct hv_device *dev, NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; net->vlan_features = net->features; - netif_set_real_num_tx_queues(net, nvdev->num_chn); - netif_set_real_num_rx_queues(net, nvdev->num_chn); - netdev_lockdep_set_classes(net); /* MTU range: 68 - 1500 or 65521 */ @@ -1820,11 +1989,11 @@ no_net: static int netvsc_remove(struct hv_device *dev) { - struct net_device *net; struct net_device_context *ndev_ctx; + struct net_device *vf_netdev; + struct net_device *net; net = hv_get_drvdata(dev); - if (net == NULL) { dev_err(&dev->device, "No net device to remove\n"); return 0; @@ -1841,12 +2010,16 @@ static int netvsc_remove(struct hv_device *dev) * removed. Also blocks mtu and channel changes. 
*/ rtnl_lock(); + vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev); + if (vf_netdev) + netvsc_unregister_vf(vf_netdev); + + unregister_netdevice(net); + rndis_filter_device_remove(dev, rtnl_dereference(ndev_ctx->nvdev)); rtnl_unlock(); - unregister_netdev(net); - hv_set_drvdata(dev, NULL); free_percpu(ndev_ctx->vf_stats); @@ -1904,9 +2077,8 @@ static int netvsc_netdev_event(struct notifier_block *this, case NETDEV_UNREGISTER: return netvsc_unregister_vf(event_dev); case NETDEV_UP: - return netvsc_vf_up(event_dev); case NETDEV_DOWN: - return netvsc_vf_down(event_dev); + return netvsc_vf_changed(event_dev); default: return NOTIFY_DONE; } diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 44165fe328a4..065b204d8e17 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -717,7 +717,7 @@ cleanup: } int rndis_filter_set_rss_param(struct rndis_device *rdev, - const u8 *rss_key, int num_queue) + const u8 *rss_key) { struct net_device *ndev = rdev->ndev; struct rndis_request *request; @@ -1039,8 +1039,6 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) /* Set the channel before opening.*/ nvchan->channel = new_sc; - netif_napi_add(ndev, &nvchan->napi, - netvsc_poll, NAPI_POLL_WEIGHT); ret = vmbus_open(new_sc, nvscdev->ring_size * PAGE_SIZE, nvscdev->ring_size * PAGE_SIZE, NULL, 0, @@ -1048,10 +1046,86 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) if (ret == 0) napi_enable(&nvchan->napi); else - netif_napi_del(&nvchan->napi); + netdev_notice(ndev, "sub channel open failed: %d\n", ret); - if (refcount_dec_and_test(&nvscdev->sc_offered)) - complete(&nvscdev->channel_init_wait); + if (atomic_inc_return(&nvscdev->open_chn) == nvscdev->num_chn) + wake_up(&nvscdev->subchan_open); +} + +/* Open sub-channels after completing the handling of the device probe. + * This breaks overlap of processing the host message for the + * new primary channel with the initialization of sub-channels. 
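 * The work item is queued from rndis_filter_device_add() once more
 * than one channel has been granted; because it needs the RTNL lock
 * it uses rtnl_trylock() and reschedules itself when the lock is
 * contended instead of blocking a workqueue thread.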
+ */ +void rndis_set_subchannel(struct work_struct *w) +{ + struct netvsc_device *nvdev + = container_of(w, struct netvsc_device, subchan_work); + struct nvsp_message *init_packet = &nvdev->channel_init_pkt; + struct net_device_context *ndev_ctx; + struct rndis_device *rdev; + struct net_device *ndev; + struct hv_device *hv_dev; + int i, ret; + + if (!rtnl_trylock()) { + schedule_work(w); + return; + } + + rdev = nvdev->extension; + if (!rdev) + goto unlock; /* device was removed */ + + ndev = rdev->ndev; + ndev_ctx = netdev_priv(ndev); + hv_dev = ndev_ctx->device_ctx; + + memset(init_packet, 0, sizeof(struct nvsp_message)); + init_packet->hdr.msg_type = NVSP_MSG5_TYPE_SUBCHANNEL; + init_packet->msg.v5_msg.subchn_req.op = NVSP_SUBCHANNEL_ALLOCATE; + init_packet->msg.v5_msg.subchn_req.num_subchannels = + nvdev->num_chn - 1; + ret = vmbus_sendpacket(hv_dev->channel, init_packet, + sizeof(struct nvsp_message), + (unsigned long)init_packet, + VM_PKT_DATA_INBAND, + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + if (ret) { + netdev_err(ndev, "sub channel allocate send failed: %d\n", ret); + goto failed; + } + + wait_for_completion(&nvdev->channel_init_wait); + if (init_packet->msg.v5_msg.subchn_comp.status != NVSP_STAT_SUCCESS) { + netdev_err(ndev, "sub channel request failed\n"); + goto failed; + } + + nvdev->num_chn = 1 + + init_packet->msg.v5_msg.subchn_comp.num_subchannels; + + /* wait for all sub channels to open */ + wait_event(nvdev->subchan_open, + atomic_read(&nvdev->open_chn) == nvdev->num_chn); + + /* ignore failues from setting rss parameters, still have channels */ + rndis_filter_set_rss_param(rdev, netvsc_hash_key); + + netif_set_real_num_tx_queues(ndev, nvdev->num_chn); + netif_set_real_num_rx_queues(ndev, nvdev->num_chn); + + rtnl_unlock(); + return; + +failed: + /* fallback to only primary channel */ + for (i = 1; i < nvdev->num_chn; i++) + netif_napi_del(&nvdev->chan_table[i].napi); + + nvdev->max_chn = 1; + nvdev->num_chn = 1; +unlock: + rtnl_unlock(); } struct netvsc_device *rndis_filter_device_add(struct hv_device *dev, @@ -1063,11 +1137,10 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev, struct rndis_device *rndis_device; struct ndis_offload hwcaps; struct ndis_offload_params offloads; - struct nvsp_message *init_packet; struct ndis_recv_scale_cap rsscap; u32 rsscap_size = sizeof(struct ndis_recv_scale_cap); unsigned int gso_max_size = GSO_MAX_SIZE; - u32 mtu, size, num_rss_qs; + u32 mtu, size; const struct cpumask *node_cpu_mask; u32 num_possible_rss_qs; int i, ret; @@ -1091,8 +1164,6 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev, net_device->max_chn = 1; net_device->num_chn = 1; - refcount_set(&net_device->sc_offered, 0); - net_device->extension = rndis_device; rndis_device->ndev = net; @@ -1216,9 +1287,8 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev, rndis_device->ind_table[i] = ethtool_rxfh_indir_default(i, net_device->num_chn); - num_rss_qs = net_device->num_chn - 1; - if (num_rss_qs == 0) - return net_device; + atomic_set(&net_device->open_chn, 1); + vmbus_set_sc_create_callback(dev->channel, netvsc_sc_open); for (i = 1; i < net_device->num_chn; i++) { ret = netvsc_alloc_recv_comp_ring(net_device, i); @@ -1229,36 +1299,15 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev, } } - refcount_set(&net_device->sc_offered, num_rss_qs); - vmbus_set_sc_create_callback(dev->channel, netvsc_sc_open); - - init_packet = &net_device->channel_init_pkt; - memset(init_packet, 0, sizeof(struct 
nvsp_message)); - init_packet->hdr.msg_type = NVSP_MSG5_TYPE_SUBCHANNEL; - init_packet->msg.v5_msg.subchn_req.op = NVSP_SUBCHANNEL_ALLOCATE; - init_packet->msg.v5_msg.subchn_req.num_subchannels = - net_device->num_chn - 1; - ret = vmbus_sendpacket(dev->channel, init_packet, - sizeof(struct nvsp_message), - (unsigned long)init_packet, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if (ret) - goto out; - - if (init_packet->msg.v5_msg.subchn_comp.status != NVSP_STAT_SUCCESS) { - ret = -ENODEV; - goto out; - } - wait_for_completion(&net_device->channel_init_wait); + for (i = 1; i < net_device->num_chn; i++) + netif_napi_add(net, &net_device->chan_table[i].napi, + netvsc_poll, NAPI_POLL_WEIGHT); - net_device->num_chn = 1 + - init_packet->msg.v5_msg.subchn_comp.num_subchannels; + if (net_device->num_chn > 1) + schedule_work(&net_device->subchan_work); - /* ignore failues from setting rss parameters, still have channels */ - rndis_filter_set_rss_param(rndis_device, netvsc_hash_key, - net_device->num_chn); out: + /* if unavailable, just proceed with one queue */ if (ret) { net_device->max_chn = 1; net_device->num_chn = 1; @@ -1279,10 +1328,10 @@ void rndis_filter_device_remove(struct hv_device *dev, /* Halt and release the rndis device */ rndis_filter_halt_device(rndis_dev); - kfree(rndis_dev); net_dev->extension = NULL; netvsc_device_remove(dev); + kfree(rndis_dev); } int rndis_filter_open(struct netvsc_device *nvdev) diff --git a/drivers/pci/host/pci-hyperv.c b/drivers/pci/host/pci-hyperv.c index 415dcc69a502..0fe3ea164ee5 100644 --- a/drivers/pci/host/pci-hyperv.c +++ b/drivers/pci/host/pci-hyperv.c @@ -50,6 +50,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/pci.h> +#include <linux/delay.h> #include <linux/semaphore.h> #include <linux/irqdomain.h> #include <asm/irqdomain.h> @@ -562,52 +563,6 @@ static void put_pcichild(struct hv_pci_dev *hv_pcidev, static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus); static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus); - -/* - * Temporary CPU to vCPU mapping to address transitioning - * vmbus_cpu_number_to_vp_number() being migrated to - * hv_cpu_number_to_vp_number() in a separate patch. Once that patch - * has been picked up in the main line, remove this code here and use - * the official code. - */ -static struct hv_tmpcpumap -{ - bool initialized; - u32 vp_index[NR_CPUS]; -} hv_tmpcpumap; - -static void hv_tmpcpumap_init_cpu(void *_unused) -{ - int cpu = smp_processor_id(); - u64 vp_index; - - hv_get_vp_index(vp_index); - - hv_tmpcpumap.vp_index[cpu] = vp_index; -} - -static void hv_tmpcpumap_init(void) -{ - if (hv_tmpcpumap.initialized) - return; - - memset(hv_tmpcpumap.vp_index, -1, sizeof(hv_tmpcpumap.vp_index)); - on_each_cpu(hv_tmpcpumap_init_cpu, NULL, true); - hv_tmpcpumap.initialized = true; -} - -/** - * hv_tmp_cpu_nr_to_vp_nr() - Convert Linux CPU nr to Hyper-V vCPU nr - * - * Remove once vmbus_cpu_number_to_vp_number() has been converted to - * hv_cpu_number_to_vp_number() and replace callers appropriately. 
- */ -static u32 hv_tmp_cpu_nr_to_vp_nr(int cpu) -{ - return hv_tmpcpumap.vp_index[cpu]; -} - - /** * devfn_to_wslot() - Convert from Linux PCI slot to Windows * @devfn: The Linux representation of PCI slot @@ -971,7 +926,7 @@ static void hv_irq_unmask(struct irq_data *data) var_size = 1 + HV_VP_SET_BANK_COUNT_MAX; for_each_cpu_and(cpu, dest, cpu_online_mask) { - cpu_vmbus = hv_tmp_cpu_nr_to_vp_nr(cpu); + cpu_vmbus = hv_cpu_number_to_vp_number(cpu); if (cpu_vmbus >= HV_VP_SET_BANK_COUNT_MAX * 64) { dev_err(&hbus->hdev->device, @@ -986,7 +941,7 @@ static void hv_irq_unmask(struct irq_data *data) } else { for_each_cpu_and(cpu, dest, cpu_online_mask) { params->int_target.vp_mask |= - (1ULL << hv_tmp_cpu_nr_to_vp_nr(cpu)); + (1ULL << hv_cpu_number_to_vp_number(cpu)); } } @@ -1063,7 +1018,7 @@ static u32 hv_compose_msi_req_v2( */ cpu = cpumask_first_and(affinity, cpu_online_mask); int_pkt->int_desc.processor_array[0] = - hv_tmp_cpu_nr_to_vp_nr(cpu); + hv_cpu_number_to_vp_number(cpu); int_pkt->int_desc.processor_count = 1; return sizeof(*int_pkt); @@ -1159,7 +1114,12 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) goto free_int_desc; } - wait_for_completion(&comp.comp_pkt.host_event); + /* + * Since this function is called with IRQ locks held, can't + * do normal wait for completion; instead poll. + */ + while (!try_wait_for_completion(&comp.comp_pkt.host_event)) + udelay(100); if (comp.comp_pkt.completion_status < 0) { dev_err(&hbus->hdev->device, @@ -2490,8 +2450,6 @@ static int hv_pci_probe(struct hv_device *hdev, return -ENOMEM; hbus->state = hv_pcibus_init; - hv_tmpcpumap_init(); - /* * The PCI bus "domain" is what is called "segment" in ACPI and * other specs. Pull it from the instance ID, to get something diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 3cc8d67783a1..5e7200f05873 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1640,6 +1640,8 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) put_cpu(); if (ret == -EAGAIN) { + if (payload_sz > sizeof(cmd_request->mpb)) + kfree(payload); /* no more space */ return SCSI_MLQUEUE_DEVICE_BUSY; } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b7d7bbec74e0..6431087816ba 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -124,10 +124,7 @@ struct hv_ring_buffer_info { spinlock_t ring_lock; u32 ring_datasize; /* < ring_size */ - u32 ring_data_startoffset; - u32 priv_write_index; u32 priv_read_index; - u32 cached_read_index; }; /* @@ -180,19 +177,6 @@ static inline u32 hv_get_bytes_to_write(const struct hv_ring_buffer_info *rbi) return write; } -static inline u32 hv_get_cached_bytes_to_write( - const struct hv_ring_buffer_info *rbi) -{ - u32 read_loc, write_loc, dsize, write; - - dsize = rbi->ring_datasize; - read_loc = rbi->cached_read_index; - write_loc = rbi->ring_buffer->write_index; - - write = write_loc >= read_loc ? dsize - (write_loc - read_loc) : - read_loc - write_loc; - return write; -} /* * VMBUS version is 32 bit entity broken up into * two 16 bit quantities: major_number. minor_number. @@ -677,18 +661,6 @@ union hv_connection_id { } u; }; -/* Definition of the hv_signal_event hypercall input structure. 
*/ -struct hv_input_signal_event { - union hv_connection_id connectionid; - u16 flag_number; - u16 rsvdz; -}; - -struct hv_input_signal_event_buffer { - u64 align8; - struct hv_input_signal_event event; -}; - enum hv_numa_policy { HV_BALANCED = 0, HV_LOCALIZED, @@ -770,8 +742,7 @@ struct vmbus_channel { } callback_mode; bool is_dedicated_interrupt; - struct hv_input_signal_event_buffer sig_buf; - struct hv_input_signal_event *sig_event; + u64 sig_event; /* * Starting with win8, this field will be used to specify @@ -895,6 +866,8 @@ struct vmbus_channel { */ enum hv_numa_policy affinity_policy; + bool probe_done; + }; static inline bool is_hvsock_channel(const struct vmbus_channel *c) @@ -1030,13 +1003,6 @@ extern int vmbus_sendpacket(struct vmbus_channel *channel, enum vmbus_packet_type type, u32 flags); -extern int vmbus_sendpacket_ctl(struct vmbus_channel *channel, - void *buffer, - u32 bufferLen, - u64 requestid, - enum vmbus_packet_type type, - u32 flags); - extern int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, struct hv_page_buffer pagebuffers[], u32 pagecount, @@ -1044,20 +1010,6 @@ extern int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, u32 bufferlen, u64 requestid); -extern int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, - void *buffer, - u32 bufferlen, - u64 requestid, - u32 flags); - -extern int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel, - struct hv_multipage_buffer *mpb, - void *buffer, - u32 bufferlen, - u64 requestid); - extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *mpb, u32 desc_size, @@ -1186,8 +1138,6 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, resource_size_t size, resource_size_t align, bool fb_overlap_ok); void vmbus_free_mmio(resource_size_t start, resource_size_t size); -int vmbus_cpu_number_to_vp_number(int cpu_number); -u64 hv_do_hypercall(u64 control, void *input, void *output); /* * GUID definitions of various offer types - services offered to the guest. @@ -1453,7 +1403,7 @@ extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, const int *srv_version, int srv_vercnt, int *nego_fw_version, int *nego_srv_version); -void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid); +void hv_process_channel_removal(u32 relid); void vmbus_setevent(struct vmbus_channel *channel); /* @@ -1474,55 +1424,6 @@ hv_get_ring_buffer(const struct hv_ring_buffer_info *ring_info) } /* - * To optimize the flow management on the send-side, - * when the sender is blocked because of lack of - * sufficient space in the ring buffer, potential the - * consumer of the ring buffer can signal the producer. - * This is controlled by the following parameters: - * - * 1. pending_send_sz: This is the size in bytes that the - * producer is trying to send. - * 2. The feature bit feat_pending_send_sz set to indicate if - * the consumer of the ring will signal when the ring - * state transitions from being full to a state where - * there is room for the producer to send the pending packet. - */ - -static inline void hv_signal_on_read(struct vmbus_channel *channel) -{ - u32 cur_write_sz, cached_write_sz; - u32 pending_sz; - struct hv_ring_buffer_info *rbi = &channel->inbound; - - /* - * Issue a full memory barrier before making the signaling decision. 
- * Here is the reason for having this barrier: - * If the reading of the pend_sz (in this function) - * were to be reordered and read before we commit the new read - * index (in the calling function) we could - * have a problem. If the host were to set the pending_sz after we - * have sampled pending_sz and go to sleep before we commit the - * read index, we could miss sending the interrupt. Issue a full - * memory barrier to address this. - */ - virt_mb(); - - pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz); - /* If the other end is not blocked on write don't bother. */ - if (pending_sz == 0) - return; - - cur_write_sz = hv_get_bytes_to_write(rbi); - - if (cur_write_sz < pending_sz) - return; - - cached_write_sz = hv_get_cached_bytes_to_write(rbi); - if (cached_write_sz < pending_sz) - vmbus_setevent(channel); -} - -/* * Mask off host interrupt callback notifications */ static inline void hv_begin_read(struct hv_ring_buffer_info *rbi) diff --git a/tools/hv/bondvf.sh b/tools/hv/bondvf.sh deleted file mode 100755 index 89b25068cd98..000000000000 --- a/tools/hv/bondvf.sh +++ /dev/null @@ -1,232 +0,0 @@ -#!/bin/bash - -# This example script creates bonding network devices based on synthetic NIC -# (the virtual network adapter usually provided by Hyper-V) and the matching -# VF NIC (SRIOV virtual function). So the synthetic NIC and VF NIC can -# function as one network device, and fail over to the synthetic NIC if VF is -# down. -# -# Usage: -# - After configured vSwitch and vNIC with SRIOV, start Linux virtual -# machine (VM) -# - Run this scripts on the VM. It will create configuration files in -# distro specific directory. -# - Reboot the VM, so that the bonding config are enabled. -# -# The config files are DHCP by default. You may edit them if you need to change -# to Static IP or change other settings. 
-# - -sysdir=/sys/class/net -netvsc_cls={f8615163-df3e-46c5-913f-f2d2f965ed0e} -bondcnt=0 - -# Detect Distro -if [ -f /etc/redhat-release ]; -then - cfgdir=/etc/sysconfig/network-scripts - distro=redhat -elif grep -q 'Ubuntu' /etc/issue -then - cfgdir=/etc/network - distro=ubuntu -elif grep -q 'SUSE' /etc/issue -then - cfgdir=/etc/sysconfig/network - distro=suse -else - echo "Unsupported Distro" - exit 1 -fi - -echo Detected Distro: $distro, or compatible - -# Get a list of ethernet names -list_eth=(`cd $sysdir && ls -d */ | cut -d/ -f1 | grep -v bond`) -eth_cnt=${#list_eth[@]} - -echo List of net devices: - -# Get the MAC addresses -for (( i=0; i < $eth_cnt; i++ )) -do - list_mac[$i]=`cat $sysdir/${list_eth[$i]}/address` - echo ${list_eth[$i]}, ${list_mac[$i]} -done - -# Find NIC with matching MAC -for (( i=0; i < $eth_cnt-1; i++ )) -do - for (( j=i+1; j < $eth_cnt; j++ )) - do - if [ "${list_mac[$i]}" = "${list_mac[$j]}" ] - then - list_match[$i]=${list_eth[$j]} - break - fi - done -done - -function create_eth_cfg_redhat { - local fn=$cfgdir/ifcfg-$1 - - rm -f $fn - echo DEVICE=$1 >>$fn - echo TYPE=Ethernet >>$fn - echo BOOTPROTO=none >>$fn - echo UUID=`uuidgen` >>$fn - echo ONBOOT=yes >>$fn - echo PEERDNS=yes >>$fn - echo IPV6INIT=yes >>$fn - echo MASTER=$2 >>$fn - echo SLAVE=yes >>$fn -} - -function create_eth_cfg_pri_redhat { - create_eth_cfg_redhat $1 $2 -} - -function create_bond_cfg_redhat { - local fn=$cfgdir/ifcfg-$1 - - rm -f $fn - echo DEVICE=$1 >>$fn - echo TYPE=Bond >>$fn - echo BOOTPROTO=dhcp >>$fn - echo UUID=`uuidgen` >>$fn - echo ONBOOT=yes >>$fn - echo PEERDNS=yes >>$fn - echo IPV6INIT=yes >>$fn - echo BONDING_MASTER=yes >>$fn - echo BONDING_OPTS=\"mode=active-backup miimon=100 primary=$2\" >>$fn -} - -function del_eth_cfg_ubuntu { - local mainfn=$cfgdir/interfaces - local fnlist=( $mainfn ) - - local dirlist=(`awk '/^[ \t]*source/{print $2}' $mainfn`) - - local i - for i in "${dirlist[@]}" - do - fnlist+=(`ls $i 2>/dev/null`) - done - - local tmpfl=$(mktemp) - - local nic_start='^[ \t]*(auto|iface|mapping|allow-.*)[ \t]+'$1 - local nic_end='^[ \t]*(auto|iface|mapping|allow-.*|source)' - - local fn - for fn in "${fnlist[@]}" - do - awk "/$nic_end/{x=0} x{next} /$nic_start/{x=1;next} 1" \ - $fn >$tmpfl - - cp $tmpfl $fn - done - - rm $tmpfl -} - -function create_eth_cfg_ubuntu { - local fn=$cfgdir/interfaces - - del_eth_cfg_ubuntu $1 - echo $'\n'auto $1 >>$fn - echo iface $1 inet manual >>$fn - echo bond-master $2 >>$fn -} - -function create_eth_cfg_pri_ubuntu { - local fn=$cfgdir/interfaces - - del_eth_cfg_ubuntu $1 - echo $'\n'allow-hotplug $1 >>$fn - echo iface $1 inet manual >>$fn - echo bond-master $2 >>$fn - echo bond-primary $1 >>$fn -} - -function create_bond_cfg_ubuntu { - local fn=$cfgdir/interfaces - - del_eth_cfg_ubuntu $1 - - echo $'\n'auto $1 >>$fn - echo iface $1 inet dhcp >>$fn - echo bond-mode active-backup >>$fn - echo bond-miimon 100 >>$fn - echo bond-slaves none >>$fn -} - -function create_eth_cfg_suse { - local fn=$cfgdir/ifcfg-$1 - - rm -f $fn - echo BOOTPROTO=none >>$fn - echo STARTMODE=auto >>$fn -} - -function create_eth_cfg_pri_suse { - local fn=$cfgdir/ifcfg-$1 - - rm -f $fn - echo BOOTPROTO=none >>$fn - echo STARTMODE=hotplug >>$fn -} - -function create_bond_cfg_suse { - local fn=$cfgdir/ifcfg-$1 - - rm -f $fn - echo BOOTPROTO=dhcp >>$fn - echo STARTMODE=auto >>$fn - echo BONDING_MASTER=yes >>$fn - echo BONDING_SLAVE_0=$2 >>$fn - echo BONDING_SLAVE_1=$3 >>$fn - echo BONDING_MODULE_OPTS=\'mode=active-backup miimon=100 primary=$2\' >>$fn -} - 
-function create_bond { - local bondname=bond$bondcnt - local primary - local secondary - - local class_id1=`cat $sysdir/$1/device/class_id 2>/dev/null` - local class_id2=`cat $sysdir/$2/device/class_id 2>/dev/null` - - if [ "$class_id1" = "$netvsc_cls" ] - then - primary=$2 - secondary=$1 - elif [ "$class_id2" = "$netvsc_cls" ] - then - primary=$1 - secondary=$2 - else - return 0 - fi - - echo $'\nBond name:' $bondname - - echo configuring $primary - create_eth_cfg_pri_$distro $primary $bondname - - echo configuring $secondary - create_eth_cfg_$distro $secondary $bondname - - echo creating: $bondname with primary slave: $primary - create_bond_cfg_$distro $bondname $primary $secondary - - let bondcnt=bondcnt+1 -} - -for (( i=0; i < $eth_cnt-1; i++ )) -do - if [ -n "${list_match[$i]}" ] - then - create_bond ${list_eth[$i]} ${list_match[$i]} - fi -done diff --git a/tools/hv/hv_fcopy_daemon.c b/tools/hv/hv_fcopy_daemon.c index 26ae609a9448..457a1521f32f 100644 --- a/tools/hv/hv_fcopy_daemon.c +++ b/tools/hv/hv_fcopy_daemon.c @@ -138,14 +138,17 @@ void print_usage(char *argv[]) int main(int argc, char *argv[]) { - int fcopy_fd, len; + int fcopy_fd; int error; int daemonize = 1, long_index = 0, opt; int version = FCOPY_CURRENT_VERSION; - char *buffer[4096 * 2]; - struct hv_fcopy_hdr *in_msg; + union { + struct hv_fcopy_hdr hdr; + struct hv_start_fcopy start; + struct hv_do_fcopy copy; + __u32 kernel_modver; + } buffer = { }; int in_handshake = 1; - __u32 kernel_modver; static struct option long_options[] = { {"help", no_argument, 0, 'h' }, @@ -195,32 +198,31 @@ int main(int argc, char *argv[]) * In this loop we process fcopy messages after the * handshake is complete. */ - len = pread(fcopy_fd, buffer, (4096 * 2), 0); + ssize_t len; + + len = pread(fcopy_fd, &buffer, sizeof(buffer), 0); if (len < 0) { syslog(LOG_ERR, "pread failed: %s", strerror(errno)); exit(EXIT_FAILURE); } if (in_handshake) { - if (len != sizeof(kernel_modver)) { + if (len != sizeof(buffer.kernel_modver)) { syslog(LOG_ERR, "invalid version negotiation"); exit(EXIT_FAILURE); } - kernel_modver = *(__u32 *)buffer; in_handshake = 0; - syslog(LOG_INFO, "kernel module version: %d", - kernel_modver); + syslog(LOG_INFO, "kernel module version: %u", + buffer.kernel_modver); continue; } - in_msg = (struct hv_fcopy_hdr *)buffer; - - switch (in_msg->operation) { + switch (buffer.hdr.operation) { case START_FILE_COPY: - error = hv_start_fcopy((struct hv_start_fcopy *)in_msg); + error = hv_start_fcopy(&buffer.start); break; case WRITE_TO_FILE: - error = hv_copy_data((struct hv_do_fcopy *)in_msg); + error = hv_copy_data(&buffer.copy); break; case COMPLETE_FCOPY: error = hv_copy_finished(); @@ -231,7 +233,7 @@ int main(int argc, char *argv[]) default: syslog(LOG_ERR, "Unknown operation: %d", - in_msg->operation); + buffer.hdr.operation); } diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 88b20e007c05..eaa3bec273c8 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -1136,7 +1136,7 @@ static int process_ip_string(FILE *f, char *ip_string, int type) int i = 0; int j = 0; char str[256]; - char sub_str[10]; + char sub_str[13]; int offset = 0; memset(addr, 0, sizeof(addr)); diff --git a/tools/hv/hv_vss_daemon.c b/tools/hv/hv_vss_daemon.c index 7ba54195934c..b2b4ebffab8c 100644 --- a/tools/hv/hv_vss_daemon.c +++ b/tools/hv/hv_vss_daemon.c @@ -21,6 +21,7 @@ #include <sys/types.h> #include <sys/poll.h> #include <sys/ioctl.h> +#include <sys/stat.h> #include <fcntl.h> #include <stdio.h> #include <mntent.h> @@ 
-30,6 +31,7 @@ #include <ctype.h> #include <errno.h> #include <linux/fs.h> +#include <linux/major.h> #include <linux/hyperv.h> #include <syslog.h> #include <getopt.h> @@ -70,6 +72,7 @@ static int vss_operate(int operation) char match[] = "/dev/"; FILE *mounts; struct mntent *ent; + struct stat sb; char errdir[1024] = {0}; unsigned int cmd; int error = 0, root_seen = 0, save_errno = 0; @@ -92,6 +95,10 @@ while ((ent = getmntent(mounts))) { if (strncmp(ent->mnt_fsname, match, strlen(match))) continue; + if (stat(ent->mnt_fsname, &sb) == -1) + continue; + if (S_ISBLK(sb.st_mode) && major(sb.st_rdev) == LOOP_MAJOR) + continue; if (hasmntopt(ent, MNTOPT_RO) != NULL) continue; if (strcmp(ent->mnt_type, "vfat") == 0)
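
The final hunk makes the VSS daemon skip loop-backed mounts when freezing filesystems. A minimal stand-alone sketch of the check it adds, with "/dev/loop0" used purely as an illustrative device name:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>	/* major() on glibc */
#include <linux/major.h>	/* LOOP_MAJOR */

/* Mirror of the vss_operate() test above: a mount whose source is a
 * loop block device is skipped, as is one whose source cannot be
 * stat()ed at all. */
static int should_skip(const char *mnt_fsname)
{
	struct stat sb;

	if (stat(mnt_fsname, &sb) == -1)
		return 1;
	return S_ISBLK(sb.st_mode) && major(sb.st_rdev) == LOOP_MAJOR;
}

int main(void)
{
	printf("skip /dev/loop0: %d\n", should_skip("/dev/loop0"));
	return 0;
}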