author    Kernel Build Daemon <kbuild@suse.de>    2018-01-13 07:01:11 +0100
committer Kernel Build Daemon <kbuild@suse.de>    2018-01-13 07:01:11 +0100
commit d56ea9356f814c40c827673bbf54f82e7d173a01 (patch)
tree 42e9d0502de59979480e332d5e1b7a13e319666a
parent a473c88a502025cbe9753beeac4b00c262bc2c3e (diff)
parent 5f23d516277274e9350da376a403d82157c1576f (diff)
Merge branch 'SLE15' into openSUSE-15.0
-rw-r--r-- Documentation/vm/hmm.txt 384
-rw-r--r-- MAINTAINERS 7
-rw-r--r-- arch/powerpc/kernel/ptrace.c 2
-rw-r--r-- arch/powerpc/kernel/setup_64.c 3
-rw-r--r-- arch/powerpc/kernel/sysfs.c 46
-rw-r--r-- arch/powerpc/platforms/pseries/mobility.c 3
-rw-r--r-- arch/powerpc/platforms/pseries/pseries.h 2
-rw-r--r-- arch/powerpc/platforms/pseries/setup.c 2
-rw-r--r-- arch/s390/include/asm/switch_to.h 2
-rw-r--r-- arch/s390/kernel/dis.c 3
-rw-r--r-- arch/s390/kernel/early.c 4
-rw-r--r-- arch/s390/kernel/process.c 1
-rw-r--r-- arch/s390/pci/pci_dma.c 21
-rw-r--r-- arch/s390/pci/pci_insn.c 3
-rw-r--r-- arch/x86/Kconfig 4
-rw-r--r-- arch/x86/entry/entry_32.S 6
-rw-r--r-- arch/x86/entry/entry_64.S 202
-rw-r--r-- arch/x86/entry/entry_64_compat.S 14
-rw-r--r-- arch/x86/hyperv/hv_init.c 2
-rw-r--r-- arch/x86/include/asm/cpufeature.h 2
-rw-r--r-- arch/x86/include/asm/desc.h 11
-rw-r--r-- arch/x86/include/asm/fixmap.h 68
-rw-r--r-- arch/x86/include/asm/hypervisor.h 53
-rw-r--r-- arch/x86/include/asm/irqflags.h 3
-rw-r--r-- arch/x86/include/asm/kdebug.h 1
-rw-r--r-- arch/x86/include/asm/paravirt.h 9
-rw-r--r-- arch/x86/include/asm/processor.h 59
-rw-r--r-- arch/x86/include/asm/stacktrace.h 3
-rw-r--r-- arch/x86/include/asm/switch_to.h 8
-rw-r--r-- arch/x86/include/asm/thread_info.h 2
-rw-r--r-- arch/x86/include/asm/traps.h 1
-rw-r--r-- arch/x86/include/asm/unwind.h 20
-rw-r--r-- arch/x86/include/asm/x86_init.h 24
-rw-r--r-- arch/x86/kernel/apic/apic.c 2
-rw-r--r-- arch/x86/kernel/asm-offsets.c 6
-rw-r--r-- arch/x86/kernel/asm-offsets_32.c 9
-rw-r--r-- arch/x86/kernel/asm-offsets_64.c 4
-rw-r--r-- arch/x86/kernel/cpu/common.c 170
-rw-r--r-- arch/x86/kernel/cpu/hypervisor.c 64
-rw-r--r-- arch/x86/kernel/cpu/mshyperv.c 6
-rw-r--r-- arch/x86/kernel/cpu/vmware.c 8
-rw-r--r-- arch/x86/kernel/doublefault.c 36
-rw-r--r-- arch/x86/kernel/dumpstack.c 88
-rw-r--r-- arch/x86/kernel/dumpstack_32.c 6
-rw-r--r-- arch/x86/kernel/dumpstack_64.c 6
-rw-r--r-- arch/x86/kernel/early-quirks.c 1
-rw-r--r-- arch/x86/kernel/ioport.c 2
-rw-r--r-- arch/x86/kernel/irq_64.c 4
-rw-r--r-- arch/x86/kernel/kvm.c 6
-rw-r--r-- arch/x86/kernel/paravirt_patch_64.c 2
-rw-r--r-- arch/x86/kernel/process.c 19
-rw-r--r-- arch/x86/kernel/process_32.c 2
-rw-r--r-- arch/x86/kernel/process_64.c 15
-rw-r--r-- arch/x86/kernel/stacktrace.c 2
-rw-r--r-- arch/x86/kernel/traps.c 69
-rw-r--r-- arch/x86/kernel/unwind_orc.c 88
-rw-r--r-- arch/x86/kernel/vmlinux.lds.S 9
-rw-r--r-- arch/x86/kernel/x86_init.c 9
-rw-r--r-- arch/x86/kvm/vmx.c 2
-rw-r--r-- arch/x86/lib/delay.c 4
-rw-r--r-- arch/x86/mm/init.c 2
-rw-r--r-- arch/x86/mm/init_64.c 22
-rw-r--r-- arch/x86/power/cpu.c 16
-rw-r--r-- arch/x86/xen/enlighten_hvm.c 12
-rw-r--r-- arch/x86/xen/enlighten_pv.c 8
-rw-r--r-- arch/x86/xen/mmu_pv.c 2
-rw-r--r-- arch/x86/xen/xen-asm_64.S 41
-rw-r--r-- drivers/cpufreq/intel_pstate.c 10
-rw-r--r-- drivers/gpu/drm/i915/i915_drv.h 20
-rw-r--r-- drivers/gpu/drm/i915/i915_pci.c 300
-rw-r--r-- drivers/gpu/drm/i915/i915_reg.h 5
-rw-r--r-- drivers/gpu/drm/i915/intel_bios.c 27
-rw-r--r-- drivers/gpu/drm/i915/intel_cdclk.c 35
-rw-r--r-- drivers/gpu/drm/i915/intel_ddi.c 53
-rw-r--r-- drivers/gpu/drm/i915/intel_pm.c 13
-rw-r--r-- drivers/gpu/drm/i915/intel_runtime_pm.c 10
-rw-r--r-- drivers/gpu/drm/i915/intel_vbt_defs.h 8
-rw-r--r-- drivers/hv/vmbus_drv.c 2
-rw-r--r-- drivers/infiniband/hw/cxgb4/qp.c 8
-rw-r--r-- drivers/input/mouse/vmmouse.c 10
-rw-r--r-- drivers/md/dm-mpath.c 19
-rw-r--r-- drivers/misc/vmw_balloon.c 2
-rw-r--r-- drivers/net/ethernet/cisco/enic/enic.h 2
-rw-r--r-- drivers/net/ethernet/cisco/enic/enic_ethtool.c 77
-rw-r--r-- drivers/net/ethernet/cisco/enic/vnic_rq.c 16
-rw-r--r-- drivers/net/ethernet/ibm/ibmvnic.c 6
-rw-r--r-- drivers/nvme/host/core.c 28
-rw-r--r-- drivers/nvme/host/fc.c 6
-rw-r--r-- drivers/nvme/host/multipath.c 44
-rw-r--r-- drivers/nvme/host/nvme.h 5
-rw-r--r-- drivers/s390/net/qeth_l3_main.c 6
-rw-r--r-- fs/aio.c 8
-rw-r--r-- fs/f2fs/data.c 5
-rw-r--r-- fs/hugetlbfs/inode.c 5
-rw-r--r-- fs/proc/task_mmu.c 9
-rw-r--r-- fs/ubifs/file.c 5
-rw-r--r-- include/drm/i915_pciids.h 182
-rw-r--r-- include/linux/blk_types.h 28
-rw-r--r-- include/linux/bpf.h 2
-rw-r--r-- include/linux/hmm.h 520
-rw-r--r-- include/linux/hypervisor.h 8
-rw-r--r-- include/linux/ioport.h 2
-rw-r--r-- include/linux/memory_hotplug.h 11
-rw-r--r-- include/linux/memremap.h 99
-rw-r--r-- include/linux/migrate.h 137
-rw-r--r-- include/linux/migrate_mode.h 5
-rw-r--r-- include/linux/mm.h 41
-rw-r--r-- include/linux/mm_types.h 6
-rw-r--r-- include/linux/swap.h 24
-rw-r--r-- include/linux/swapops.h 68
-rw-r--r-- kernel/bpf/arraymap.c 39
-rw-r--r-- kernel/bpf/core.c 3
-rw-r--r-- kernel/bpf/verifier.c 36
-rw-r--r-- kernel/fork.c 3
-rw-r--r-- kernel/memremap.c 60
-rw-r--r-- mm/Kconfig 51
-rw-r--r-- mm/Makefile 1
-rw-r--r-- mm/balloon_compaction.c 8
-rw-r--r-- mm/gup.c 7
-rw-r--r-- mm/hmm.c 1257
-rw-r--r-- mm/madvise.c 2
-rw-r--r-- mm/memcontrol.c 222
-rw-r--r-- mm/memory.c 107
-rw-r--r-- mm/memory_hotplug.c 10
-rw-r--r-- mm/migrate.c 931
-rw-r--r-- mm/mprotect.c 14
-rw-r--r-- mm/page_vma_mapped.c 10
-rw-r--r-- mm/rmap.c 26
-rw-r--r-- mm/swap.c 11
-rw-r--r-- mm/zsmalloc.c 8
130 files changed, 5549 insertions, 851 deletions
diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.txt
new file mode 100644
index 000000000000..4d3aac9f4a5d
--- /dev/null
+++ b/Documentation/vm/hmm.txt
@@ -0,0 +1,384 @@
+Heterogeneous Memory Management (HMM)
+
+HMM transparently allows any component of a program to use any memory region
+of that program with a device, without going through a device-specific memory
+allocator. This is becoming a requirement to simplify the use of advanced
+heterogeneous computing, where GPUs, DSPs or FPGAs are used to perform various
+computations.
+
+This document is divided as follows: the first section describes the problems
+related to using a device-specific allocator. The second section describes the
+hardware limitations that are inherent to many platforms. The third section
+gives an overview of the HMM design. The fourth section explains how CPU
+page-table mirroring works and what HMM's purpose is in this context. The
+fifth section deals with how device memory is represented inside the kernel.
+Finally, the last section presents the new migration helpers that let a driver
+leverage the device DMA engine.
+
+
+1) Problems of using a device-specific memory allocator
+2) System bus, device memory characteristics
+3) Shared address space and migration
+4) Address space mirroring implementation and API
+5) Representing and managing device memory from the core kernel point of view
+6) Migrating to and from device memory
+7) Memory cgroup (memcg) and rss accounting
+
+
+-------------------------------------------------------------------------------
+
+1) Problems of using a device-specific memory allocator
+
+Devices with a large amount of on-board memory (several gigabytes), like GPUs,
+have historically managed their memory through a dedicated, driver-specific
+API. This creates a disconnect between memory allocated and managed by the
+device driver and regular application memory (private anonymous memory, shared
+memory or regular file-backed memory). From here on, this aspect is referred
+to as split address space; shared address space refers to the opposite
+situation, i.e. one in which any memory region can be used by the device
+transparently.
+
+The address space is split because the device can only access memory allocated
+through the device-specific API. This implies that all memory objects in a
+program are not equal from the device's point of view, which complicates large
+programs that rely on a wide set of libraries.
+
+Concretely, this means that code that wants to leverage a device like a GPU
+needs to copy objects between generically allocated memory (malloc, private or
+shared mmap) and memory allocated through the device driver API (which still
+ends up being an mmap, but of the device file).
+
+For flat data sets (array, grid, image, ...) this isn't too hard to achieve,
+but complex data sets (list, tree, ...) are hard to get right. Duplicating a
+complex data set requires re-mapping all the pointer relations between each of
+its elements. This is error prone, and programs get harder to debug because of
+the duplicated data set.
+
+A split address space also means that libraries cannot transparently use data
+they get from the core program or from other libraries, and thus each library
+might have to duplicate its input data set using the device-specific allocator.
+Large projects suffer from this and waste resources because of the various
+memory copies.
+
+Duplicating each library API to accept as input or output memory allocated by
+each device-specific allocator is not a viable option. It would lead to a
+combinatorial explosion in the library entry points.
+
+Finally, with the advance of high-level language constructs (in C++ but in
+other languages too), it is now possible for the compiler to leverage GPUs or
+other devices without the programmer even knowing. Some compiler-identified
+patterns are only doable with a shared address space. It is also more
+reasonable to use a shared address space for all the other patterns.
+
+
+-------------------------------------------------------------------------------
+
+2) System bus, device memory characteristics
+
+System buses cripple shared address spaces due to a few limitations. Most
+system buses only allow basic memory access from the device to main memory;
+even cache coherency is often optional. Access to device memory from the CPU
+is even more limited; more often than not it is not cache coherent.
+
+If we only consider the PCIe bus, then a device can access main memory (often
+through an IOMMU) and be cache coherent with the CPUs. However, it only allows
+a limited set of atomic operations from the device on main memory. This is
+worse in the other direction: the CPUs can only access a limited range of the
+device memory and cannot perform atomic operations on it. Thus device memory
+cannot be considered the same as regular memory from the kernel's point of
+view.
+
+Another crippling factor is the limited bandwidth (~32 GB/s with PCIe 4.0 and
+16 lanes). This is 33 times less than the fastest GPU memory (1 TB/s). The
+final limitation is latency: access to main memory from the device has an
+order of magnitude higher latency than when the device accesses its own
+memory.
+
+Some platforms are developing new system buses or additions/modifications to
+PCIe to address some of those limitations (OpenCAPI, CCIX). They mainly allow
+two-way cache coherency between the CPU and the device, and allow all the
+atomic operations the architecture supports. Sadly, not all platforms are
+following this trend, and some major architectures are left without hardware
+solutions to those problems.
+
+So for a shared address space to make sense, not only must we allow the device
+to access any memory, but we must also permit any memory to be migrated to
+device memory while the device is using it (blocking CPU access while that
+happens).
+
+
+-------------------------------------------------------------------------------
+
+3) Shared address space and migration
+
+HMM intends to provide two main features. The first one is to share the
+address space by duplicating the CPU page table into the device page table, so
+that the same address points to the same memory, for any valid main memory
+address in the process address space.
+
+To achieve this, HMM offers a set of helpers to populate the device page table
+while keeping track of CPU page table updates. Device page table updates are
+not as easy as CPU page table updates. To update the device page table, you
+must allocate a buffer (or use a pool of pre-allocated buffers) and write
+GPU-specific commands into it to perform the update (unmap, cache invalidation
+and flush, ...). This cannot be done through common code for all devices,
+which is why HMM provides helpers to factor out everything that can be, while
+leaving the gory details to the device driver.
+
+The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
+allows allocating a struct page for each page of the device memory. Those
+pages are special because the CPU cannot map them. However, they allow
+migrating main memory to device memory using the existing migration mechanism,
+and from the CPU's point of view everything looks as if the page had been
+swapped out to disk. Using a struct page gives the easiest and cleanest
+integration with existing mm mechanisms. Again, HMM only provides helpers,
+first to hotplug new ZONE_DEVICE memory for the device memory and second to
+perform the migration. Policy decisions about what and when to migrate are
+left to the device driver.
+
+Note that any CPU access to a device page triggers a page fault and a
+migration back to main memory, i.e. when a page backing a given address A is
+migrated from a main memory page to a device page, then any CPU access to
+address A triggers a page fault and initiates a migration back to main memory.
+
+
+With these two features, HMM not only allows a device to mirror a process
+address space, keeping the CPU and device page tables synchronized, but also
+allows leveraging device memory by migrating the parts of a data set that are
+actively used by a device.
+
+
+-------------------------------------------------------------------------------
+
+4) Address space mirroring implementation and API
+
+The main objective of address space mirroring is to allow duplicating a range
+of the CPU page table into a device page table, with HMM helping to keep both
+synchronized. A device driver that wants to mirror a process address space
+must start with the registration of an hmm_mirror struct:
+
+ int hmm_mirror_register(struct hmm_mirror *mirror,
+ struct mm_struct *mm);
+ int hmm_mirror_register_locked(struct hmm_mirror *mirror,
+ struct mm_struct *mm);
+
+The locked variant is to be used when the driver is already holding the
+mmap_sem of the mm in write mode. The mirror struct has a set of callbacks
+that are used to propagate CPU page table updates:
+
+ struct hmm_mirror_ops {
+ /* sync_cpu_device_pagetables() - synchronize page tables
+ *
+ * @mirror: pointer to struct hmm_mirror
+ * @update_type: type of update that occurred to the CPU page table
+ * @start: virtual start address of the range to update
+ * @end: virtual end address of the range to update
+ *
+ * This callback ultimately originates from mmu_notifiers when the CPU
+ * page table is updated. The device driver must update its page table
+ * in response to this callback. The update argument tells what action
+ * to perform.
+ *
+ * The device driver must not return from this callback until the device
+ * page tables are completely updated (TLBs flushed, etc); this is a
+ * synchronous call.
+ */
+ void (*update)(struct hmm_mirror *mirror,
+ enum hmm_update action,
+ unsigned long start,
+ unsigned long end);
+ };
+
+The device driver must update the range according to the action (turn the
+range read only, fully unmap it, ...). Once the driver callback returns, the
+device must be done with the update.
+
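+For illustration, a minimal driver skeleton might look like the sketch below.
+The dummy_* names, the dev_invalidate_range() helper and the take_lock()/
+release_lock() pseudo-locking are hypothetical driver-side pieces, not part of
+HMM; only hmm_mirror_register() and the hmm_mirror_ops layout above come from
+the API:
+
+   static void dummy_update(struct hmm_mirror *mirror,
+                            enum hmm_update action,
+                            unsigned long start,
+                            unsigned long end)
+   {
+       struct dummy_device *ddev;
+
+       ddev = container_of(mirror, struct dummy_device, mirror);
+       take_lock(ddev->update);
+       // Tear down or write protect the device mappings for [start, end)
+       dev_invalidate_range(ddev, action, start, end);
+       release_lock(ddev->update);
+   }
+
+   static const struct hmm_mirror_ops dummy_mirror_ops = {
+       .update = dummy_update,
+   };
+
+   int dummy_mirror_process(struct dummy_device *ddev, struct mm_struct *mm)
+   {
+       ddev->mirror.ops = &dummy_mirror_ops;
+       // Use hmm_mirror_register_locked() instead if mmap_sem is held
+       return hmm_mirror_register(&ddev->mirror, mm);
+   }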
+
+When the device driver wants to populate a range of virtual addresses, it can
+use either:
+ int hmm_vma_get_pfns(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns);
+ int hmm_vma_fault(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns,
+ bool write,
+ bool block);
+
+The first one (hmm_vma_get_pfns()) will only fetch present CPU page table
+entries and will not trigger a page fault on missing or non-present entries.
+The second one does trigger a page fault on missing entries, and on read-only
+entries if the write parameter is true. Page faults use the generic mm page
+fault code path, just like a CPU page fault.
+
+Both functions copy CPU page table entries into their pfns array argument.
+Each entry in that array corresponds to an address in the virtual range. HMM
+provides a set of flags to help the driver identify special CPU page table
+entries.
+
+Locking within the update() callback is the most important aspect the driver
+must respect in order to keep things properly synchronized. The usage pattern
+is:
+
+ int driver_populate_range(...)
+ {
+ struct hmm_range range;
+ ...
+ again:
+ ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
+ if (ret)
+ return ret;
+ take_lock(driver->update);
+ if (!hmm_vma_range_done(vma, &range)) {
+ release_lock(driver->update);
+ goto again;
+ }
+
+ // Use pfns array content to update device page table
+
+ release_lock(driver->update);
+ return 0;
+ }
+
+The driver->update lock is the same lock that the driver takes inside its
+update() callback. That lock must be held before calling hmm_vma_range_done()
+to avoid any race with a concurrent CPU page table update.
+
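+To make the "// Use pfns array content to update device page table" step in
+the example above a little more concrete, the loop below shows its general
+shape. This is pseudo-code only: the dev_pte_set()/dev_pte_clear() helpers are
+hypothetical, and the exact HMM_PFN_* flag names and pfn-to-page helper should
+be taken from include/linux/hmm.h rather than from this sketch:
+
+   for (i = 0, addr = start; addr < end; i++, addr += PAGE_SIZE) {
+       if (!(pfns[i] & HMM_PFN_VALID)) {
+           // Hole, swapped out or otherwise non-present entry
+           dev_pte_clear(ddev, addr);
+           continue;
+       }
+       // Map the page, read-only unless the CPU entry is writable
+       dev_pte_set(ddev, addr, hmm_pfn_t_to_page(pfns[i]),
+                   pfns[i] & HMM_PFN_WRITE);
+   }
+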
+HMM implements all of this on top of the mmu_notifier API because we wanted a
+simpler API and also to be able to perform optimizations later on, like doing
+concurrent device updates in a multi-device scenario.
+
+HMM also serves as an impedance-mismatch layer between how CPU page table
+updates are done (by the CPU writing to the page table and flushing TLBs) and
+how devices update their own page tables. A device update is a multi-step
+process: first, appropriate commands are written to a buffer, then this buffer
+is scheduled for execution on the device. It is only once the device has
+executed the commands in the buffer that the update is done. Creating and
+scheduling the update command buffers can happen concurrently for multiple
+devices. Waiting for each device to report the commands as executed is
+serialized (there is no point in doing this concurrently).
+
+
+-------------------------------------------------------------------------------
+
+5) Representing and managing device memory from the core kernel point of view
+
+Several different designs were tried to support device memory. The first one
+used a device-specific data structure to keep information about migrated
+memory, and HMM hooked itself into various places in the mm code to handle any
+access to addresses that were backed by device memory. It turned out that this
+ended up replicating most of the fields of struct page and also required many
+kernel code paths to be updated to understand this new kind of memory.
+
+The thing is, most kernel code paths never try to access the memory behind a
+page but only care about the struct page contents. Because of this, HMM
+switched to directly using struct page for device memory, which left most
+kernel code paths unaware of the difference. We only need to make sure that no
+one ever tries to map those pages from the CPU side.
+
+HMM provides a set of helpers to register and hotplug device memory as a new
+region needing struct page. This is offered through a very simple API:
+
+ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ unsigned long size);
+ void hmm_devmem_remove(struct hmm_devmem *devmem);
+
+The hmm_devmem_ops structure is where most of the important things are:
+
+ struct hmm_devmem_ops {
+ void (*free)(struct hmm_devmem *devmem, struct page *page);
+ int (*fault)(struct hmm_devmem *devmem,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ struct page *page,
+ unsigned flags,
+ pmd_t *pmdp);
+ };
+
+The first callback (free()) happens when the last reference on a device page
+is dropped. This means the device page is now free and no longer used by
+anyone. The second callback happens whenever the CPU tries to access a device
+page, which it cannot do. This second callback must trigger a migration back
+to system memory.
+
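+As an illustration of how these pieces fit together, a driver could hotplug
+its memory at probe time roughly as in the sketch below. The dummy_* callbacks
+and fields are hypothetical; only hmm_devmem_add() and the hmm_devmem_ops
+layout above come from the API (the error handling assumes an ERR_PTR style
+return on failure):
+
+   static const struct hmm_devmem_ops dummy_devmem_ops = {
+       .free  = dummy_devmem_free,   // put the device page back on a free list
+       .fault = dummy_devmem_fault,  // migrate the page back to main memory
+   };
+
+   int dummy_hotplug_memory(struct dummy_device *ddev)
+   {
+       struct hmm_devmem *devmem;
+
+       devmem = hmm_devmem_add(&dummy_devmem_ops, ddev->dev, ddev->vram_size);
+       if (IS_ERR(devmem))
+           return PTR_ERR(devmem);
+       ddev->devmem = devmem;
+       return 0;
+   }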
+
+-------------------------------------------------------------------------------
+
+6) Migrating to and from device memory
+
+Because the CPU cannot access device memory, migration must use the device DMA
+engine to perform copies from and to device memory. For this we need a new
+migration helper:
+
+ int migrate_vma(const struct migrate_vma_ops *ops,
+ struct vm_area_struct *vma,
+ unsigned long mentries,
+ unsigned long start,
+ unsigned long end,
+ unsigned long *src,
+ unsigned long *dst,
+ void *private);
+
+Unlike other migration functions, it works on a range of virtual addresses.
+There are two reasons for that. First, device DMA copies have a high setup
+overhead, so batching multiple pages is needed, as otherwise the migration
+overhead makes the whole exercise pointless. The second reason is that drivers
+trigger such migrations based on the range of addresses the device is actively
+accessing.
+
+The migrate_vma_ops struct defines two callbacks. The first one
+(alloc_and_copy()) controls destination memory allocation and the copy
+operation. The second one is there to allow the device driver to perform
+cleanup operations after the migration.
+
+ struct migrate_vma_ops {
+ void (*alloc_and_copy)(struct vm_area_struct *vma,
+ const unsigned long *src,
+ unsigned long *dst,
+ unsigned long start,
+ unsigned long end,
+ void *private);
+ void (*finalize_and_map)(struct vm_area_struct *vma,
+ const unsigned long *src,
+ const unsigned long *dst,
+ unsigned long start,
+ unsigned long end,
+ void *private);
+ };
+
+It is important to stress that these migration helpers allow for holes in the
+virtual address range. Some pages in the range might not be migrated for all
+the usual reasons (page is pinned, page is locked, ...). The helper does not
+fail in that case; it just skips over those pages.
+
+The alloc_and_copy() callback might also decide not to migrate all pages in
+the range (for reasons under the callback's control). For those pages, the
+callback just has to leave the corresponding dst entry empty.
+
+Finally, the migration of a struct page might fail (for file-backed pages) for
+various reasons (failure to freeze the reference, failure to update the page
+cache, ...). If that happens, then finalize_and_map() can catch any pages that
+were not migrated. Note that those pages were still copied to new pages, so we
+wasted bandwidth, but this is considered a rare event and a price that we are
+willing to pay to keep the code simpler.
+
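+Putting it together, a driver-initiated migration to device memory could be
+driven roughly as in the sketch below, following the migrate_vma() signature
+shown above. The dummy_* callbacks, the fixed-size src/dst arrays and the
+assumption that the range fits in one call are all illustrative
+simplifications of what a real driver would do:
+
+   static const struct migrate_vma_ops dummy_migrate_ops = {
+       .alloc_and_copy   = dummy_alloc_and_copy,
+       .finalize_and_map = dummy_finalize_and_map,
+   };
+
+   int dummy_migrate_to_device(struct dummy_device *ddev,
+                               struct vm_area_struct *vma,
+                               unsigned long start, unsigned long end)
+   {
+       unsigned long npages = (end - start) >> PAGE_SHIFT;
+       unsigned long src[64], dst[64];   // one entry per page, npages <= 64
+
+       // Pages that are skipped simply keep an empty dst entry
+       return migrate_vma(&dummy_migrate_ops, vma, npages,
+                          start, end, src, dst, ddev);
+   }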
+
+-------------------------------------------------------------------------------
+
+7) Memory cgroup (memcg) and rss accounting
+
+For now, device memory is accounted as any regular page in the rss counters
+(anonymous if the device page is used for anonymous memory, file if the device
+page is used for file-backed pages, or shmem if the device page is used for
+shared memory). This is a deliberate choice so that existing applications,
+which might start using device memory without knowing about it, keep running
+unimpacted.
+
+The drawback is that the OOM killer might kill an application using a lot of
+device memory and not a lot of regular system memory, and thus not free much
+system memory. We want to gather more real-world experience on how
+applications and the system react under memory pressure in the presence of
+device memory before deciding to account device memory differently.
+
+
+The same decision was made for memory cgroups. Device memory pages are
+accounted against the same memory cgroup that a regular page would be
+accounted to. This does simplify migration to and from device memory. This
+also means that migration back from device memory to regular memory cannot
+fail because it would go above the memory cgroup limit. We might revisit this
+choice later on, once we get more experience with how device memory is used
+and its impact on memory resource control.
+
+
+Note that device memory can never be pinned, neither by a device driver nor
+through GUP, and thus such memory is always freed upon process exit, or when
+the last reference is dropped in the case of shared memory or file-backed
+memory.
diff --git a/MAINTAINERS b/MAINTAINERS
index 521fe1f094b1..976e2d586637 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7562,6 +7562,13 @@ F: include/linux/nd.h
F: include/linux/libnvdimm.h
F: include/uapi/linux/ndctl.h
+HMM - Heterogeneous Memory Management
+M: Jérôme Glisse <jglisse@redhat.com>
+L: linux-mm@kvack.org
+S: Maintained
+F: mm/hmm*
+F: include/linux/hmm*
+
LIBNVDIMM BLK: MMIO-APERTURE DRIVER
M: Ross Zwisler <ross.zwisler@linux.intel.com>
L: linux-nvdimm@lists.01.org
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 660ed39e9c9a..b8d4f07f332c 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -131,7 +131,7 @@ static void flush_tmregs_to_thread(struct task_struct *tsk)
* in the appropriate thread structures from live.
*/
- if (tsk != current)
+ if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current))
return;
if (MSR_TM_SUSPENDED(mfmsr())) {
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 7e34cbdd5080..525649400965 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -807,9 +807,6 @@ static void do_nothing(void *unused)
void rfi_flush_enable(bool enable)
{
- if (rfi_flush == enable)
- return;
-
if (enable) {
do_rfi_flush_fixups(enabled_flush_types);
on_each_cpu(do_nothing, NULL, 1);
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 4437c70c7c2b..cba34571027e 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -496,6 +496,49 @@ static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
static DEVICE_ATTR(purr, 0400, show_purr, store_purr);
static DEVICE_ATTR(pir, 0400, show_pir, NULL);
+#ifdef CONFIG_PPC_BOOK3S_64
+extern bool rfi_flush;
+static ssize_t show_rfi_flush(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", rfi_flush ? 1 : 0);
+}
+
+static ssize_t __used store_rfi_flush(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ bool enable;
+ int val;
+ int ret = 0;
+
+ ret = sscanf(buf, "%d", &val);
+ if (ret != 1)
+ return -EINVAL;
+
+ if (val == 1)
+ enable = true;
+ else if (val == 0)
+ enable = false;
+ else
+ return -EINVAL;
+
+ /* Only do anything if we're changing state */
+ if (enable != rfi_flush)
+ rfi_flush_enable(enable);
+
+ return count;
+}
+
+static DEVICE_ATTR(rfi_flush, 0600,
+ show_rfi_flush, store_rfi_flush);
+
+static void sysfs_create_rfi_flush(void)
+{
+ device_create_file(cpu_subsys.dev_root, &dev_attr_rfi_flush);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
/*
* This is the system wide DSCR register default value. Any
* change to this default value through the sysfs interface
@@ -1036,6 +1079,9 @@ static int __init topology_init(void)
WARN_ON(r < 0);
#ifdef CONFIG_PPC64
sysfs_create_dscr_default();
+#ifdef CONFIG_PPC_BOOK3S
+ sysfs_create_rfi_flush();
+#endif
#endif /* CONFIG_PPC64 */
return 0;
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index 74efac61aae9..ac98e83a5eb2 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -348,6 +348,9 @@ void post_mobility_fixup(void)
printk(KERN_ERR "Post-mobility device tree update "
"failed: %d\n", rc);
+ /* Possibly switch to a new RFI flush type */
+ pseries_setup_rfi_flush();
+
return;
}
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 72610bf58c9f..f46132c434f6 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -79,6 +79,8 @@ extern struct pci_controller_ops pseries_pci_controller_ops;
unsigned long pseries_memory_block_size(void);
+void pseries_setup_rfi_flush(void);
+
extern int CMO_PrPSP;
extern int CMO_SecPSP;
extern unsigned long CMO_PageSize;
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index ae4f596273b5..6ce54c17ca17 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -459,7 +459,7 @@ static void __init find_and_init_phbs(void)
of_pci_check_probe_only();
}
-static void pseries_setup_rfi_flush(void)
+void pseries_setup_rfi_flush(void)
{
struct h_cpu_char_result result;
enum l1d_flush_type types;
diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h
index f6c2b5814ab0..8e6b07609ff4 100644
--- a/arch/s390/include/asm/switch_to.h
+++ b/arch/s390/include/asm/switch_to.h
@@ -36,8 +36,8 @@ static inline void restore_access_regs(unsigned int *acrs)
save_ri_cb(prev->thread.ri_cb); \
save_gs_cb(prev->thread.gs_cb); \
} \
+ update_cr_regs(next); \
if (next->mm) { \
- update_cr_regs(next); \
set_cpu_flag(CIF_FPU); \
restore_access_regs(&next->thread.acrs[0]); \
restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); \
diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c
index f7e82302a71e..ee08127b6caa 100644
--- a/arch/s390/kernel/dis.c
+++ b/arch/s390/kernel/dis.c
@@ -1548,6 +1548,7 @@ static struct s390_insn opcode_e7[] = {
{ "vfsq", 0xce, INSTR_VRR_VV000MM },
{ "vfs", 0xe2, INSTR_VRR_VVV00MM },
{ "vftci", 0x4a, INSTR_VRI_VVIMM },
+ { "", 0, INSTR_INVALID }
};
static struct s390_insn opcode_eb[] = {
@@ -1953,7 +1954,7 @@ void show_code(struct pt_regs *regs)
{
char *mode = user_mode(regs) ? "User" : "Krnl";
unsigned char code[64];
- char buffer[64], *ptr;
+ char buffer[128], *ptr;
mm_segment_t old_fs;
unsigned long addr;
int start, end, opsize, hops, i;
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index fd45cf0ce857..0756ead405a1 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -373,8 +373,10 @@ static __init void detect_machine_facilities(void)
S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE;
if (test_facility(40))
S390_lowcore.machine_flags |= MACHINE_FLAG_LPP;
- if (test_facility(50) && test_facility(73))
+ if (test_facility(50) && test_facility(73)) {
S390_lowcore.machine_flags |= MACHINE_FLAG_TE;
+ __ctl_set_bit(0, 55);
+ }
if (test_facility(51))
S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC;
if (test_facility(129)) {
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index bb32b8618bf6..0bc4af232359 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -99,6 +99,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long new_stackp,
memset(&p->thread.per_user, 0, sizeof(p->thread.per_user));
memset(&p->thread.per_event, 0, sizeof(p->thread.per_event));
clear_tsk_thread_flag(p, TIF_SINGLE_STEP);
+ p->thread.per_flags = 0;
/* Initialize per thread user and system timer values */
p->thread.user_timer = 0;
p->thread.guest_timer = 0;
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 8eb1cc341dab..005bfb1152b3 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -178,6 +178,9 @@ out_unlock:
static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr,
size_t size, int flags)
{
+ unsigned long irqflags;
+ int ret;
+
/*
* With zdev->tlb_refresh == 0, rpcit is not required to establish new
* translations when previously invalid translation-table entries are
@@ -193,8 +196,22 @@ static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr,
return 0;
}
- return zpci_refresh_trans((u64) zdev->fh << 32, dma_addr,
- PAGE_ALIGN(size));
+ ret = zpci_refresh_trans((u64) zdev->fh << 32, dma_addr,
+ PAGE_ALIGN(size));
+ if (ret == -ENOMEM && !s390_iommu_strict) {
+ /* enable the hypervisor to free some resources */
+ if (zpci_refresh_global(zdev))
+ goto out;
+
+ spin_lock_irqsave(&zdev->iommu_bitmap_lock, irqflags);
+ bitmap_andnot(zdev->iommu_bitmap, zdev->iommu_bitmap,
+ zdev->lazy_bitmap, zdev->iommu_pages);
+ bitmap_zero(zdev->lazy_bitmap, zdev->iommu_pages);
+ spin_unlock_irqrestore(&zdev->iommu_bitmap_lock, irqflags);
+ ret = 0;
+ }
+out:
+ return ret;
}
static int dma_update_trans(struct zpci_dev *zdev, unsigned long pa,
diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c
index ea34086c8674..b00702105763 100644
--- a/arch/s390/pci/pci_insn.c
+++ b/arch/s390/pci/pci_insn.c
@@ -87,6 +87,9 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range)
if (cc)
zpci_err_insn(cc, status, addr, range);
+ if (cc == 1 && (status == 4 || status == 16))
+ return -ENOMEM;
+
return (cc) ? -EIO : 0;
}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 34da1c5ffd72..fdad3038c904 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2305,6 +2305,10 @@ source "kernel/livepatch/Kconfig"
endmenu
+config ARCH_HAS_ADD_PAGES
+ def_bool y
+ depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
+
config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y
depends on X86_64 || (X86_32 && HIGHMEM)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 71c7cf4e7c11..91a2ef302c7f 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -956,7 +956,8 @@ ENTRY(debug)
movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */
- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx
jb .Ldebug_from_sysenter_stack
@@ -999,7 +1000,8 @@ ENTRY(nmi)
movl %esp, %eax # pt_regs pointer
/* Are we currently on the SYSENTER stack? */
- PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
cmpl $SIZEOF_SYSENTER_stack, %ecx
jb .Lnmi_from_sysenter_stack
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index bbc14b5379d3..1371359c2f5e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -135,6 +135,64 @@ END(native_usergs_sysret64)
* with them due to bugs in both AMD and Intel CPUs.
*/
+ .pushsection .entry_trampoline, "ax"
+
+/*
+ * The code in here gets remapped into cpu_entry_area's trampoline. This means
+ * that the assembler and linker have the wrong idea as to where this code
+ * lives (and, in fact, it's mapped more than once, so it's not even at a
+ * fixed address). So we can't reference any symbols outside the entry
+ * trampoline and expect it to work.
+ *
+ * Instead, we carefully abuse %rip-relative addressing.
+ * _entry_trampoline(%rip) refers to the start of the remapped entry
+ * trampoline. We can thus find cpu_entry_area with this macro:
+ */
+
+#define CPU_ENTRY_AREA \
+ _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+#define RSP_SCRATCH CPU_ENTRY_AREA_SYSENTER_stack + \
+ SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
+
+ENTRY(entry_SYSCALL_64_trampoline)
+ UNWIND_HINT_EMPTY
+ swapgs
+
+ /* Stash the user RSP. */
+ movq %rsp, RSP_SCRATCH
+
+ /* Load the top of the task stack into RSP */
+ movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
+
+ /* Start building the simulated IRET frame. */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq RSP_SCRATCH /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+
+ /*
+ * x86 lacks a near absolute jump, and we can't jump to the real
+ * entry text with a relative jump. We could push the target
+ * address and then use retq, but this destroys the pipeline on
+ * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
+ * spill RDI and restore it in a second-stage trampoline.
+ */
+ pushq %rdi
+ movq $entry_SYSCALL_64_stage2, %rdi
+ jmp *%rdi
+END(entry_SYSCALL_64_trampoline)
+
+ .popsection
+
+ENTRY(entry_SYSCALL_64_stage2)
+ UNWIND_HINT_EMPTY
+ popq %rdi
+ jmp entry_SYSCALL_64_after_hwframe
+END(entry_SYSCALL_64_stage2)
+
ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
/*
@@ -142,25 +200,18 @@ ENTRY(entry_SYSCALL_64)
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
- SWAPGS_UNSAFE_STACK
- /*
- * A hypervisor implementation might want to use a label
- * after the swapgs, so that it can do the swapgs
- * for the guest and jump here on syscall.
- */
-GLOBAL(entry_SYSCALL_64_after_swapgs)
+ swapgs
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- TRACE_IRQS_OFF
-
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+GLOBAL(entry_SYSCALL_64_after_hwframe)
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -174,6 +225,8 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
UNWIND_HINT_REGS extra=0
+ TRACE_IRQS_OFF
+
/*
* If we need to do entry work or if we guess we'll need to do
* exit work, go straight to the slow path.
@@ -329,8 +382,24 @@ syscall_return_via_sysret:
popq %rsi /* skip rcx */
popq %rdx
popq %rsi
+
+ /*
+ * Now all regs are restored except RSP and RDI.
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+ pushq RSP-RDI(%rdi) /* RSP */
+ pushq (%rdi) /* RDI */
+
+ /*
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+
popq %rdi
- movq RSP-ORIG_RAX(%rsp), %rsp
+ popq %rsp
USERGS_SYSRET64
END(entry_SYSCALL_64)
@@ -465,12 +534,13 @@ END(irq_entries_start)
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
- pushfq
- testl $X86_EFLAGS_IF, (%rsp)
+ pushq %rax
+ SAVE_FLAGS(CLBR_RAX)
+ testl $X86_EFLAGS_IF, %eax
jz .Lokay_\@
ud2
.Lokay_\@:
- addq $8, %rsp
+ popq %rax
#endif
.endm
@@ -562,6 +632,13 @@ END(irq_entries_start)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
cld
+
+ testb $3, CS-ORIG_RAX(%rsp)
+ jz 1f
+ SWAPGS
+ call switch_to_thread_stack
+1:
+
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
@@ -571,12 +648,8 @@ END(irq_entries_start)
jz 1f
/*
- * IRQ from user mode. Switch to kernel gsbase and inform context
- * tracking that we're in kernel mode.
- */
- SWAPGS
-
- /*
+ * IRQ from user mode.
+ *
* We need to tell lockdep that IRQs are off. We can't do this until
* we fix gsbase, and we should do it before enter_from_user_mode
* (which can take locks). Since TRACE_IRQS_OFF idempotent,
@@ -629,10 +702,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
ud2
1:
#endif
- SWAPGS
POP_EXTRA_REGS
- POP_C_REGS
- addq $8, %rsp /* skip regs->orig_ax */
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+
+ /*
+ * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+ /* Copy the IRET frame to the trampoline stack. */
+ pushq 6*8(%rdi) /* SS */
+ pushq 5*8(%rdi) /* RSP */
+ pushq 4*8(%rdi) /* EFLAGS */
+ pushq 3*8(%rdi) /* CS */
+ pushq 2*8(%rdi) /* RIP */
+
+ /* Push user RDI on the trampoline stack. */
+ pushq (%rdi)
+
+ /*
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+
+ /* Restore RDI. */
+ popq %rdi
+ SWAPGS
INTERRUPT_RETURN
@@ -845,7 +949,33 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
/*
* Exception entry points.
*/
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+
+/*
+ * Switch to the thread stack. This is called with the IRET frame and
+ * orig_ax on the stack. (That is, RDI..R12 are not on the stack and
+ * space has not been allocated for them.)
+ */
+ENTRY(switch_to_thread_stack)
+ UNWIND_HINT_FUNC
+
+ pushq %rdi
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
+
+ pushq 7*8(%rdi) /* regs->ss */
+ pushq 6*8(%rdi) /* regs->rsp */
+ pushq 5*8(%rdi) /* regs->eflags */
+ pushq 4*8(%rdi) /* regs->cs */
+ pushq 3*8(%rdi) /* regs->ip */
+ pushq 2*8(%rdi) /* regs->orig_ax */
+ pushq 8(%rdi) /* return address */
+ UNWIND_HINT_FUNC
+
+ movq (%rdi), %rdi
+ ret
+END(switch_to_thread_stack)
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
@@ -864,11 +994,12 @@ ENTRY(\sym)
ALLOC_PT_GPREGS_ON_STACK
- .if \paranoid
- .if \paranoid == 1
+ .if \paranoid < 2
testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
- jnz 1f
+ jnz .Lfrom_usermode_switch_stack_\@
.endif
+
+ .if \paranoid
call paranoid_entry
.else
call error_entry
@@ -910,20 +1041,15 @@ ENTRY(\sym)
jmp error_exit
.endif
- .if \paranoid == 1
+ .if \paranoid < 2
/*
- * Paranoid entry from userspace. Switch stacks and treat it
+ * Entry from userspace. Switch stacks and treat it
* as a normal entry. This means that paranoid handlers
* run in real process context if user_mode(regs).
*/
-1:
+.Lfrom_usermode_switch_stack_\@:
call error_entry
-
- movq %rsp, %rdi /* pt_regs pointer */
- call sync_regs
- movq %rax, %rsp /* switch stack */
-
movq %rsp, %rdi /* pt_regs pointer */
.if \has_error_code
@@ -1195,6 +1321,14 @@ ENTRY(error_entry)
SWAPGS
.Lerror_entry_from_usermode_after_swapgs:
+ /* Put us onto the real thread stack. */
+ popq %r12 /* save return addr in %12 */
+ movq %rsp, %rdi /* arg0 = pt_regs pointer */
+ call sync_regs
+ movq %rax, %rsp /* switch stack */
+ ENCODE_FRAME_POINTER
+ pushq %r12
+
/*
* We need to tell lockdep that IRQs are off. We can't do this until
* we fix gsbase, and we should do it before enter_from_user_mode
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1c771a55b68..2270601b6218 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -47,7 +47,7 @@
*/
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
- SWAPGS_UNSAFE_STACK
+ SWAPGS
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@@ -183,21 +183,20 @@ ENDPROC(entry_SYSENTER_compat)
*/
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
- SWAPGS_UNSAFE_STACK
+ swapgs
/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- /* Zero-extending 32-bit regs, do not remove */
- movl %eax, %eax
-
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
pushq %r8 /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER32_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
+GLOBAL(entry_SYSCALL_compat_after_hwframe)
+ movl %eax, %eax /* discard orig_ax high bits */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
@@ -306,8 +305,11 @@ ENTRY(entry_INT80_compat)
*/
movl %eax, %eax
- /* Construct struct pt_regs on stack (iret frame is already on stack) */
pushq %rax /* pt_regs->orig_ax */
+
+ /* switch to thread stack expects orig_ax to be pushed */
+ call switch_to_thread_stack
+
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 885457aa2a15..935d347c47a5 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -115,7 +115,7 @@ void hyperv_init(void)
__u8 d1 = 0x10; /* SuSE */
__u16 d2 = 0x0; /* -d of a.b.c-d */
- if (x86_hyper != &x86_hyper_ms_hyperv)
+ if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return;
/* Allocate percpu VP index */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index d59c15c3defd..638ebbf482c0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -135,6 +135,8 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0)
+#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
+
#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
/*
* Static testing of CPU features. Used the same as boot_cpu_has().
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index d0a21b12dd58..4c730d1caa3c 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -58,17 +58,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
return this_cpu_ptr(&gdt_page)->gdt;
}
-/* Get the fixmap index for a specific processor */
-static inline unsigned int get_cpu_gdt_ro_index(int cpu)
-{
- return FIX_GDT_REMAP_BEGIN + cpu;
-}
-
/* Provide the fixmap address of the remapped GDT */
static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
{
- unsigned int idx = get_cpu_gdt_ro_index(cpu);
- return (struct desc_struct *)__fix_to_virt(idx);
+ return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
}
/* Provide the current read-only GDT */
@@ -205,7 +198,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t
#endif
}
-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
{
struct desc_struct *d = get_cpu_gdt_rw(cpu);
tss_desc tss;
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index dcd9fb55e679..4c6d4a199ba1 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -44,6 +44,45 @@ extern unsigned long __FIXADDR_TOP;
PAGE_SIZE)
#endif
+/*
+ * cpu_entry_area is a percpu region in the fixmap that contains things
+ * needed by the CPU and early entry/exit code. Real types aren't used
+ * for all fields here to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+ char gdt[PAGE_SIZE];
+
+ /*
+ * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
+ * a read-only guard page.
+ */
+ struct SYSENTER_stack_page SYSENTER_stack_page;
+
+ /*
+ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
+ * we need task switches to work, and task switches write to the TSS.
+ */
+ struct tss_struct tss;
+
+ char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+ /*
+ * Exception stacks used for IST entries.
+ *
+ * In the future, this should have a separate slot for each stack
+ * with guard pages between them.
+ */
+ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+};
+
+#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
+
+extern void setup_cpu_entry_areas(void);
/*
* Here we define all the compile-time 'special' virtual
@@ -101,8 +140,8 @@ enum fixed_addresses {
FIX_LNW_VRTC,
#endif
/* Fixmap entries to remap the GDTs, one per processor. */
- FIX_GDT_REMAP_BEGIN,
- FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+ FIX_CPU_ENTRY_AREA_TOP,
+ FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
__end_of_permanent_fixed_addresses,
@@ -185,5 +224,30 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
void __early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags);
+static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
+{
+ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+ return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
+}
+
+#define __get_cpu_entry_area_offset_index(cpu, offset) ({ \
+ BUILD_BUG_ON(offset % PAGE_SIZE != 0); \
+ __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE); \
+ })
+
+#define get_cpu_entry_area_index(cpu, field) \
+ __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
+
+static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+ return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
+}
+
+static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
+{
+ return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
+}
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 0ead9dbb9130..96aa6b9884dc 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,14 +20,22 @@
#ifndef _ASM_X86_HYPERVISOR_H
#define _ASM_X86_HYPERVISOR_H
+/* x86 hypervisor types */
+enum x86_hypervisor_type {
+ X86_HYPER_NATIVE = 0,
+ X86_HYPER_VMWARE,
+ X86_HYPER_MS_HYPERV,
+ X86_HYPER_XEN_PV,
+ X86_HYPER_XEN_HVM,
+ X86_HYPER_KVM,
+};
+
#ifdef CONFIG_HYPERVISOR_GUEST
#include <asm/kvm_para.h>
+#include <asm/x86_init.h>
#include <asm/xen/hypervisor.h>
-/*
- * x86 hypervisor information
- */
struct hypervisor_x86 {
/* Hypervisor name */
const char *name;
@@ -35,40 +43,27 @@ struct hypervisor_x86 {
/* Detection routine */
uint32_t (*detect)(void);
- /* Platform setup (run once per boot) */
- void (*init_platform)(void);
-
- /* X2APIC detection (run once per boot) */
- bool (*x2apic_available)(void);
+ /* Hypervisor type */
+ enum x86_hypervisor_type type;
- /* pin current vcpu to specified physical cpu (run rarely) */
- void (*pin_vcpu)(int);
+ /* init time callbacks */
+ struct x86_hyper_init init;
- /* called during init_mem_mapping() to setup early mappings. */
- void (*init_mem_mapping)(void);
+ /* runtime callbacks */
+ struct x86_hyper_runtime runtime;
};
-extern const struct hypervisor_x86 *x86_hyper;
-
-/* Recognized hypervisors */
-extern const struct hypervisor_x86 x86_hyper_vmware;
-extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
-extern const struct hypervisor_x86 x86_hyper_xen_pv;
-extern const struct hypervisor_x86 x86_hyper_xen_hvm;
-extern const struct hypervisor_x86 x86_hyper_kvm;
-
+extern enum x86_hypervisor_type x86_hyper_type;
extern void init_hypervisor_platform(void);
-extern bool hypervisor_x2apic_available(void);
-extern void hypervisor_pin_vcpu(int cpu);
-
-static inline void hypervisor_init_mem_mapping(void)
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
{
- if (x86_hyper && x86_hyper->init_mem_mapping)
- x86_hyper->init_mem_mapping();
+ return x86_hyper_type == type;
}
#else
static inline void init_hypervisor_platform(void) { }
-static inline bool hypervisor_x2apic_available(void) { return false; }
-static inline void hypervisor_init_mem_mapping(void) { }
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+ return type == X86_HYPER_NATIVE;
+}
#endif /* CONFIG_HYPERVISOR_GUEST */
#endif /* _ASM_X86_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index ac7692dcfa2e..d937781e1047 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -141,6 +141,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
swapgs; \
sysretl
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(x) pushfq; popq %rax
+#endif
#else
#define INTERRUPT_RETURN iret
#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 29a594a3b82a..2a7769dd8fa2 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -25,6 +25,7 @@ extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_stack_regs(struct pt_regs *regs);
extern void __show_regs(struct pt_regs *regs, int all);
+extern void show_iret_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 3ffd9fa96ce8..811bdd289475 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -965,6 +965,15 @@ extern void default_banner(void);
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(clobbers) \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
+ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \
+ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+#endif
+
#endif /* CONFIG_X86_32 */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 0a6597d057d3..7a3718b76dad 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -161,9 +161,9 @@ enum cpuid_regs_idx {
extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;
-extern struct tss_struct doublefault_tss;
-extern __u32 cpu_caps_cleared[NCAPINTS];
-extern __u32 cpu_caps_set[NCAPINTS];
+extern struct x86_hw_tss doublefault_tss;
+extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
#ifdef CONFIG_SMP
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
@@ -251,6 +251,11 @@ static inline void load_cr3(pgd_t *pgdir)
write_cr3(__sme_pa(pgdir));
}
+/*
+ * Note that while the legacy 'TSS' name comes from 'Task State Segment',
+ * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
+ * unrelated to the task-switch mechanism:
+ */
#ifdef CONFIG_X86_32
/* This is the TSS defined by the hardware. */
struct x86_hw_tss {
@@ -303,7 +308,13 @@ struct x86_hw_tss {
struct x86_hw_tss {
u32 reserved1;
u64 sp0;
+
+ /*
+ * We store cpu_current_top_of_stack in sp1 so it's always accessible.
+ * Linux does not use ring 1, so sp1 is not otherwise needed.
+ */
u64 sp1;
+
u64 sp2;
u64 reserved2;
u64 ist[7];
@@ -321,12 +332,22 @@ struct x86_hw_tss {
#define IO_BITMAP_BITS 65536
#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
+#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
#define INVALID_IO_BITMAP_OFFSET 0x8000
+struct SYSENTER_stack {
+ unsigned long words[64];
+};
+
+struct SYSENTER_stack_page {
+ struct SYSENTER_stack stack;
+} __aligned(PAGE_SIZE);
+
struct tss_struct {
/*
- * The hardware state:
+ * The fixed hardware portion. This must not cross a page boundary
+ * at risk of violating the SDM's advice and potentially triggering
+ * errata.
*/
struct x86_hw_tss x86_tss;
@@ -337,18 +358,9 @@ struct tss_struct {
* be within the limit.
*/
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+} __aligned(PAGE_SIZE);
-#ifdef CONFIG_X86_32
- /*
- * Space for the temporary SYSENTER stack.
- */
- unsigned long SYSENTER_stack_canary;
- unsigned long SYSENTER_stack[64];
-#endif
-
-} ____cacheline_aligned;
-
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
/*
* sizeof(unsigned long) coming from an extra "long" at the end
@@ -362,6 +374,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#else
+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
#endif
/*
@@ -521,7 +536,7 @@ static inline void native_set_iopl_mask(unsigned mask)
static inline void
native_load_sp0(unsigned long sp0)
{
- this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
static inline void native_swapgs(void)
@@ -533,12 +548,12 @@ static inline void native_swapgs(void)
static inline unsigned long current_top_of_stack(void)
{
-#ifdef CONFIG_X86_64
- return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
-#else
- /* sp0 on x86_32 is special in and around vm86 mode. */
+ /*
+ * We can't read directly from tss.sp0: sp0 on x86_32 is special in
+ * and around vm86 mode and sp0 on x86_64 is special because of the
+ * entry trampoline.
+ */
return this_cpu_read_stable(cpu_current_top_of_stack);
-#endif
}
static inline bool on_thread_stack(void)
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 2e41c50ddf47..95f999576131 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -15,6 +15,7 @@ enum stack_type {
STACK_TYPE_TASK,
STACK_TYPE_IRQ,
STACK_TYPE_SOFTIRQ,
+ STACK_TYPE_SYSENTER,
STACK_TYPE_EXCEPTION,
STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
};
@@ -27,6 +28,8 @@ struct stack_info {
bool in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info);
+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
+
int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask);
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 010cd6e4eafc..cfb6dfe4c457 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -78,10 +78,10 @@ do { \
static inline void refresh_sysenter_cs(struct thread_struct *thread)
{
/* Only happens when SEP is enabled, no need to test "SEP"arately: */
- if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+ if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
return;
- this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+ this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}
#endif
@@ -89,10 +89,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
/* This is used when switching tasks or entering/exiting vm86 mode. */
static inline void update_sp0(struct task_struct *task)
{
+ /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
#ifdef CONFIG_X86_32
load_sp0(task->thread.sp0);
#else
- load_sp0(task_top_of_stack(task));
+ if (static_cpu_has(X86_FEATURE_XENPV))
+ load_sp0(task_top_of_stack(task));
#endif
}
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 90e1f9b84534..49cb02d0597f 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -203,7 +203,7 @@ static inline int arch_within_stack_frames(const void * const stack,
#else /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
#endif
#endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 9fc6a94cfea6..1ba3381f5534 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -92,7 +92,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
dotraplinkage void do_stack_segment(struct pt_regs *, long);
#ifdef CONFIG_X86_64
dotraplinkage void do_double_fault(struct pt_regs *, long);
-asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
#endif
dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 35d67dc7b69f..e1c1cb5019bc 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -6,6 +6,9 @@
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
+#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
+#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
+
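For reference, the hardware iret frame is the trailing five fields of struct pt_regs (ip, cs, flags, sp, ss), so on x86_64 IRET_FRAME_SIZE works out to 5 * 8 = 40 bytes starting at offsetof(struct pt_regs, ip). A compile-time sanity sketch of that arithmetic (not part of the patch):

static inline void iret_frame_layout_check(void)
{
	/* ip, cs, flags, sp, ss: five word-sized slots at the end of pt_regs */
	BUILD_BUG_ON(IRET_FRAME_SIZE != 5 * sizeof(unsigned long));
}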
struct unwind_state {
struct stack_info stack_info;
unsigned long stack_mask;
@@ -51,15 +54,28 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
}
#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+/*
+ * If 'partial' returns true, only the iret frame registers are valid.
+ */
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+ bool *partial)
{
if (unwind_done(state))
return NULL;
+ if (partial) {
+#ifdef CONFIG_UNWINDER_ORC
+ *partial = !state->full_regs;
+#else
+ *partial = false;
+#endif
+ }
+
return state->regs;
}
#else
-static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+ bool *partial)
{
return NULL;
}
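A hedged sketch of how a caller is meant to consume the new 'partial' flag; the helpers named below are the ones used by dumpstack.c further down in this series:

bool partial;
struct pt_regs *regs = unwind_get_entry_regs(&state, &partial);

if (regs) {
	if (partial)
		show_iret_regs(regs);	/* only ip/cs/flags/sp/ss are valid */
	else
		__show_regs(regs, 0);
}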
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 7ba7e90a9ad6..4d95e5a13c0b 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -114,6 +114,18 @@ struct x86_init_pci {
};
/**
+ * struct x86_hyper_init - x86 hypervisor init functions
+ * @init_platform: platform setup
+ * @x2apic_available: X2APIC detection
+ * @init_mem_mapping: setup early mappings during init_mem_mapping()
+ */
+struct x86_hyper_init {
+ void (*init_platform)(void);
+ bool (*x2apic_available)(void);
+ void (*init_mem_mapping)(void);
+};
+
+/**
* struct x86_init_ops - functions for platform specific setup
*
*/
@@ -126,6 +138,7 @@ struct x86_init_ops {
struct x86_init_timers timers;
struct x86_init_iommu iommu;
struct x86_init_pci pci;
+ struct x86_hyper_init hyper;
};
/**
@@ -199,6 +212,15 @@ struct x86_legacy_features {
};
/**
+ * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks
+ *
+ * @pin_vcpu: pin current vcpu to specified physical cpu (run rarely)
+ */
+struct x86_hyper_runtime {
+ void (*pin_vcpu)(int cpu);
+};
+
+/**
* struct x86_platform_ops - platform specific runtime functions
* @calibrate_cpu: calibrate CPU
* @calibrate_tsc: calibrate TSC, if different from CPU
@@ -217,6 +239,7 @@ struct x86_legacy_features {
* possible in x86_early_init_platform_quirks() by
* only using the current x86_hardware_subarch
* semantics.
+ * @hyper: x86 hypervisor specific runtime callbacks
*/
struct x86_platform_ops {
unsigned long (*calibrate_cpu)(void);
@@ -232,6 +255,7 @@ struct x86_platform_ops {
void (*apic_post_init)(void);
struct x86_legacy_features legacy;
void (*set_legacy_features)(void);
+ struct x86_hyper_runtime hyper;
};
struct pci_dev;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2d75faf743f2..b7651a44bee8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1573,7 +1573,7 @@ static __init void try_to_enable_x2apic(int remap_mode)
* under KVM
*/
if (max_physical_apicid > 255 ||
- !hypervisor_x2apic_available()) {
+ !x86_init.hyper.x2apic_available()) {
pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
x2apic_disable();
return;
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index de827d6ac8c2..40c3fab107ac 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -92,4 +92,10 @@ void common(void) {
BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+ /* Layout info for cpu_entry_area */
+ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+ OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
+ DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 880aa093268d..c4f23da7a0f0 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -49,13 +49,8 @@ void foo(void)
BLANK();
/* Offset from the sysenter stack to tss.sp0 */
- DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
- offsetofend(struct tss_struct, SYSENTER_stack));
-
- /* Offset from cpu_tss to SYSENTER_stack */
- OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
- /* Size of SYSENTER_stack */
- DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+ DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+ offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
#ifdef CONFIG_CC_STACKPROTECTOR
BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index cf42206926af..048f68ff3396 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -22,6 +22,9 @@ int main(void)
#ifdef CONFIG_PARAVIRT
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+#ifdef CONFIG_DEBUG_ENTRY
+ OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
+#endif
BLANK();
#endif
@@ -62,6 +65,7 @@ int main(void)
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+ OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
BLANK();
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 841199ee801f..c5e21716f033 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -434,8 +434,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
-__u32 cpu_caps_cleared[NCAPINTS];
-__u32 cpu_caps_set[NCAPINTS];
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
void load_percpu_segment(int cpu)
{
@@ -448,27 +448,116 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}
-/* Setup the fixmap mapping only once per-processor */
-static inline void setup_fixmap_gdt(int cpu)
+#ifdef CONFIG_X86_32
+/* The 32-bit entry code needs to find cpu_entry_area. */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+#endif
+
+#ifdef CONFIG_X86_64
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+};
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
+ SYSENTER_stack_storage);
+
+static void __init
+set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
+{
+ for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
+ __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+/* Setup the fixmap mappings only once per-processor */
+static void __init setup_cpu_entry_area(int cpu)
{
#ifdef CONFIG_X86_64
- /* On 64-bit systems, we use a read-only fixmap GDT. */
- pgprot_t prot = PAGE_KERNEL_RO;
+ extern char _entry_trampoline[];
+
+ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+ pgprot_t gdt_prot = PAGE_KERNEL_RO;
+ pgprot_t tss_prot = PAGE_KERNEL_RO;
#else
/*
* On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
- * a task gate needs to change an available TSS to busy. If the GDT
- * is read-only, that will triple fault.
+ * a task gate needs to change an available TSS to busy. If the
+ * GDT is read-only, that will triple fault. The TSS cannot be
+ * read-only because the CPU writes to it on task switches.
*
- * On Xen PV, the GDT must be read-only because the hypervisor requires
- * it.
+ * On Xen PV, the GDT must be read-only because the hypervisor
+ * requires it.
*/
- pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL;
+ pgprot_t tss_prot = PAGE_KERNEL;
#endif
- __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
+ __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
+ per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
+ PAGE_KERNEL);
+
+ /*
+ * The Intel SDM says (Volume 3, 7.2.1):
+ *
+ * Avoid placing a page boundary in the part of the TSS that the
+ * processor reads during a task switch (the first 104 bytes). The
+ * processor may not correctly perform address translations if a
+ * boundary occurs in this area. During a task switch, the processor
+ * reads and writes into the first 104 bytes of each TSS (using
+ * contiguous physical addresses beginning with the physical address
+ * of the first byte of the TSS). So, after TSS access begins, if
+ * part of the 104 bytes is not physically contiguous, the processor
+ * will access incorrect information without generating a page-fault
+ * exception.
+ *
+ * There are also a lot of errata involving the TSS spanning a page
+ * boundary. Assert that we're not doing that.
+ */
+ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
+ &per_cpu(cpu_tss_rw, cpu),
+ sizeof(struct tss_struct) / PAGE_SIZE,
+ tss_prot);
+
+#ifdef CONFIG_X86_32
+ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+ BUILD_BUG_ON(sizeof(exception_stacks) !=
+ sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+ set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
+ &per_cpu(exception_stacks, cpu),
+ sizeof(exception_stacks) / PAGE_SIZE,
+ PAGE_KERNEL);
+
+ __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
+ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ setup_cpu_entry_area(cpu);
}
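The BUILD_BUG_ON() pair earlier in setup_cpu_entry_area() relies on a compact page-boundary test: two offsets lie on the same page exactly when XORing them leaves no bits set in PAGE_MASK. A standalone sketch of the same idiom (crosses_page() is a hypothetical helper, not part of the patch):

/* Nonzero when start and end fall on different pages; applied to
 * offsetof()/offsetofend() of x86_tss this is the check used above. */
static inline unsigned long crosses_page(unsigned long start, unsigned long end)
{
	return (start ^ end) & PAGE_MASK;
}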
/* Load the original GDT from the per-cpu structure */
@@ -705,7 +794,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
{
int i;
- for (i = 0; i < NCAPINTS; i++) {
+ for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
c->x86_capability[i] &= ~cpu_caps_cleared[i];
c->x86_capability[i] |= cpu_caps_set[i];
}
@@ -1199,7 +1288,7 @@ void enable_sep_cpu(void)
return;
cpu = get_cpu();
- tss = &per_cpu(cpu_tss, cpu);
+ tss = &per_cpu(cpu_tss_rw, cpu);
/*
* We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1208,11 +1297,7 @@ void enable_sep_cpu(void)
tss->x86_tss.ss1 = __KERNEL_CS;
wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
-
- wrmsr(MSR_IA32_SYSENTER_ESP,
- (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
- 0);
-
+ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
put_cpu();
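The '+ 1' in the MSR_IA32_SYSENTER_ESP write above is ordinary struct pointer arithmetic: cpu_SYSENTER_stack() returns a struct SYSENTER_stack *, so adding one steps past all 64 words and yields the initial (highest) stack address. Spelled out:

struct SYSENTER_stack *ss = cpu_SYSENTER_stack(cpu);
unsigned long sysenter_sp = (unsigned long)(ss + 1);
/* sysenter_sp == (unsigned long)ss + sizeof(*ss), i.e. one byte past the
 * end of the stack, which is where the CPU starts pushing on SYSENTER. */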
@@ -1317,25 +1402,19 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
-/*
- * Special IST stacks which the CPU switches to when it calls
- * an IST-marked descriptor entry. Up to 7 stacks (hardware
- * limit), all of them are 4K, except the debug stack which
- * is 8K.
- */
-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
- [DEBUG_STACK - 1] = DEBUG_STKSZ
-};
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
-
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
+ extern char _entry_trampoline[];
+ extern char entry_SYSCALL_64_trampoline[];
+
+ int cpu = smp_processor_id();
+ unsigned long SYSCALL64_entry_trampoline =
+ (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
+ (entry_SYSCALL_64_trampoline - _entry_trampoline);
+
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@@ -1346,7 +1425,7 @@ void syscall_init(void)
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
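The SYSCALL64_entry_trampoline computation in syscall_init() above is offset arithmetic between two aliases of the same page: the symbol's offset within the trampoline page in the kernel image is added to that page's per-CPU alias in the cpu_entry_area. A worked example with made-up addresses (the externs are the ones declared in syscall_init()):

/* Illustrative numbers only:
 *   _entry_trampoline (image)           = 0xffffffff81a00000
 *   entry_SYSCALL_64_trampoline (image) = 0xffffffff81a00040  -> offset 0x40
 *   per-CPU alias of that page          = 0xffffffffff578000
 *   value written to MSR_LSTAR          = 0xffffffffff578040
 */
unsigned long offset = entry_SYSCALL_64_trampoline - _entry_trampoline;
unsigned long lstar  = (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + offset;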
@@ -1490,7 +1569,7 @@ void cpu_init(void)
if (cpu)
load_ucode_ap();
- t = &per_cpu(cpu_tss, cpu);
+ t = &per_cpu(cpu_tss_rw, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
@@ -1529,7 +1608,7 @@ void cpu_init(void)
* set up and load the per-CPU TSS
*/
if (!oist->ist[0]) {
- char *estacks = per_cpu(exception_stacks, cpu);
+ char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
estacks += exception_stack_sizes[v];
@@ -1540,7 +1619,7 @@ void cpu_init(void)
}
}
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
/*
* <= is required because the CPU will access up to
@@ -1555,11 +1634,12 @@ void cpu_init(void)
enter_lazy_tlb(&init_mm, me);
/*
- * Initialize the TSS. Don't bother initializing sp0, as the initial
- * task never enters user mode.
+ * Initialize the TSS. sp0 points to the entry trampoline stack
+ * regardless of what task is running.
*/
- set_tss_desc(cpu, t);
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
+ load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
load_mm_ldt(&init_mm);
@@ -1571,7 +1651,6 @@ void cpu_init(void)
if (is_uv_system())
uv_cpu_init();
- setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
@@ -1581,7 +1660,7 @@ void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(cpu_tss, cpu);
+ struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
wait_for_master_cpu(cpu);
@@ -1615,12 +1694,12 @@ void cpu_init(void)
* Initialize the TSS. Don't bother initializing sp0, as the initial
* task never enters user mode.
*/
- set_tss_desc(cpu, t);
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
load_mm_ldt(&init_mm);
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
#ifdef CONFIG_DOUBLEFAULT
/* Set up doublefault TSS pointer in the GDT */
@@ -1632,7 +1711,6 @@ void cpu_init(void)
fpu__init_cpu();
- setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
#endif
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 4fa90006ac68..bea8d3e24f50 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -26,6 +26,12 @@
#include <asm/processor.h>
#include <asm/hypervisor.h>
+extern const struct hypervisor_x86 x86_hyper_vmware;
+extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+extern const struct hypervisor_x86 x86_hyper_xen_pv;
+extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+extern const struct hypervisor_x86 x86_hyper_kvm;
+
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
#ifdef CONFIG_XEN_PV
@@ -41,54 +47,52 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
#endif
};
-const struct hypervisor_x86 *x86_hyper;
-EXPORT_SYMBOL(x86_hyper);
+enum x86_hypervisor_type x86_hyper_type;
+EXPORT_SYMBOL(x86_hyper_type);
-static inline void __init
+static inline const struct hypervisor_x86 * __init
detect_hypervisor_vendor(void)
{
- const struct hypervisor_x86 *h, * const *p;
+ const struct hypervisor_x86 *h = NULL, * const *p;
uint32_t pri, max_pri = 0;
for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
- h = *p;
- pri = h->detect();
- if (pri != 0 && pri > max_pri) {
+ pri = (*p)->detect();
+ if (pri > max_pri) {
max_pri = pri;
- x86_hyper = h;
+ h = *p;
}
}
- if (max_pri)
- pr_info("Hypervisor detected: %s\n", x86_hyper->name);
+ if (h)
+ pr_info("Hypervisor detected: %s\n", h->name);
+
+ return h;
}
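For context, each detect() callback returns 0 when its hypervisor is absent and otherwise a priority (conventionally the CPUID leaf base), and the loop above keeps the highest bidder. A hedged sketch of such a callback, using a made-up 12-character signature:

static uint32_t __init example_detect(void)
{
	if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
		return 0;

	/* Returns the base leaf (e.g. 0x40000000), or 0 if not found. */
	return hypervisor_cpuid_base("ExampleHVSig", 0);
}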
-void __init init_hypervisor_platform(void)
+static void __init copy_array(const void *src, void *target, unsigned int size)
{
+ unsigned int i, n = size / sizeof(void *);
+ const void * const *from = (const void * const *)src;
+ const void **to = (const void **)target;
- detect_hypervisor_vendor();
-
- if (!x86_hyper)
- return;
-
- if (x86_hyper->init_platform)
- x86_hyper->init_platform();
+ for (i = 0; i < n; i++)
+ if (from[i])
+ to[i] = from[i];
}
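copy_array() deliberately skips NULL source slots so the no-op defaults installed in x86_init.c survive for any callback a hypervisor does not provide. A standalone sketch of that behaviour (default_a, default_b and custom_b are hypothetical):

static void default_a(void) { }
static void default_b(void) { }
static void custom_b(void) { }

static void __init copy_array_demo(void)
{
	void (*dst[2])(void) = { default_a, default_b };
	void (*src[2])(void) = { NULL, custom_b };

	copy_array(src, dst, sizeof(src));
	/* dst[0] is still default_a (NULL was skipped); dst[1] is now custom_b. */
}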
-bool __init hypervisor_x2apic_available(void)
+void __init init_hypervisor_platform(void)
{
- return x86_hyper &&
- x86_hyper->x2apic_available &&
- x86_hyper->x2apic_available();
-}
+ const struct hypervisor_x86 *h;
-void hypervisor_pin_vcpu(int cpu)
-{
- if (!x86_hyper)
+ h = detect_hypervisor_vendor();
+
+ if (!h)
return;
- if (x86_hyper->pin_vcpu)
- x86_hyper->pin_vcpu(cpu);
- else
- WARN_ONCE(1, "vcpu pinning requested but not supported!\n");
+ copy_array(&h->init, &x86_init.hyper, sizeof(h->init));
+ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime));
+
+ x86_hyper_type = h->type;
+ x86_init.hyper.init_platform();
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 42664f944cbc..1feb22440acb 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -259,9 +259,9 @@ static void __init ms_hyperv_init_platform(void)
#endif
}
-const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
- .init_platform = ms_hyperv_init_platform,
+ .type = X86_HYPER_MS_HYPERV,
+ .init.init_platform = ms_hyperv_init_platform,
};
-EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 40ed26852ebd..8e005329648b 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -205,10 +205,10 @@ static bool __init vmware_legacy_x2apic_available(void)
(eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
}
-const __refconst struct hypervisor_x86 x86_hyper_vmware = {
+const __initconst struct hypervisor_x86 x86_hyper_vmware = {
.name = "VMware",
.detect = vmware_platform,
- .init_platform = vmware_platform_setup,
- .x2apic_available = vmware_legacy_x2apic_available,
+ .type = X86_HYPER_VMWARE,
+ .init.init_platform = vmware_platform_setup,
+ .init.x2apic_available = vmware_legacy_x2apic_available,
};
-EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index f9c324e08d85..a9fe79d49d39 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -49,25 +49,23 @@ static void doublefault_fn(void)
cpu_relax();
}
-struct tss_struct doublefault_tss __cacheline_aligned = {
- .x86_tss = {
- .sp0 = STACK_START,
- .ss0 = __KERNEL_DS,
- .ldt = 0,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
-
- .ip = (unsigned long) doublefault_fn,
- /* 0x2 bit is always set */
- .flags = X86_EFLAGS_SF | 0x2,
- .sp = STACK_START,
- .es = __USER_DS,
- .cs = __KERNEL_CS,
- .ss = __KERNEL_DS,
- .ds = __USER_DS,
- .fs = __KERNEL_PERCPU,
-
- .__cr3 = __pa_nodebug(swapper_pg_dir),
- }
+struct x86_hw_tss doublefault_tss __cacheline_aligned = {
+ .sp0 = STACK_START,
+ .ss0 = __KERNEL_DS,
+ .ldt = 0,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+
+ .ip = (unsigned long) doublefault_fn,
+ /* 0x2 bit is always set */
+ .flags = X86_EFLAGS_SF | 0x2,
+ .sp = STACK_START,
+ .es = __USER_DS,
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ds = __USER_DS,
+ .fs = __KERNEL_PERCPU,
+
+ .__cr3 = __pa_nodebug(swapper_pg_dir),
};
/* dummy for do_double_fault() call */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index bd265a4cf108..8271bbf2a4c3 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -43,6 +43,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true;
}
+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
+{
+ struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
+
+ void *begin = ss;
+ void *end = ss + 1;
+
+ if ((void *)stack < begin || (void *)stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_SYSENTER;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = NULL;
+
+ return true;
+}
+
static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl)
{
@@ -50,6 +68,39 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
+void show_iret_regs(struct pt_regs *regs)
+{
+ printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
+ printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
+ regs->sp, regs->flags);
+}
+
+static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
+ bool partial)
+{
+ /*
+ * These on_stack() checks aren't strictly necessary: the unwind code
+ * has already validated the 'regs' pointer. The checks are done for
+ * ordering reasons: if the registers are on the next stack, we don't
+ * want to print them out yet. Otherwise they'll be shown as part of
+ * the wrong stack. Later, when show_trace_log_lvl() switches to the
+ * next stack, this function will be called again with the same regs so
+ * they can be printed in the right context.
+ */
+ if (!partial && on_stack(info, regs, sizeof(*regs))) {
+ __show_regs(regs, 0);
+
+ } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+ IRET_FRAME_SIZE)) {
+ /*
+ * When an interrupt or exception occurs in entry code, the
+ * full pt_regs might not have been saved yet. In that case
+ * just print the iret frame.
+ */
+ show_iret_regs(regs);
+ }
+}
+
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl)
{
@@ -57,6 +108,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
struct stack_info stack_info = {0};
unsigned long visit_mask = 0;
int graph_idx = 0;
+ bool partial;
printk("%sCall Trace:\n", log_lvl);
@@ -71,31 +123,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - task stack
* - interrupt stack
* - HW exception stacks (double fault, nmi, debug, mce)
+ * - SYSENTER stack
*
- * x86-32 can have up to three stacks:
+ * x86-32 can have up to four stacks:
* - task stack
* - softirq stack
* - hardirq stack
+ * - SYSENTER stack
*/
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name;
- /*
- * If we overflowed the task stack into a guard page, jump back
- * to the bottom of the usable stack.
- */
- if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
- stack = task_stack_page(task);
-
- if (get_stack_info(stack, task, &stack_info, &visit_mask))
- break;
+ if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
+ if (get_stack_info(stack, task, &stack_info, &visit_mask))
+ break;
+ }
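The PAGE_ALIGN() in the fallback above rounds the overflowed pointer up to the next page boundary, which for a stack that ran into its guard page is the lowest address of the stack proper. Worked through with hypothetical addresses:

/* Illustrative only: task stack = [0xffffc90000004000, 0xffffc90000008000),
 * guard page below it.  After an overflow, stack == 0xffffc90000003ff8, so
 * PAGE_ALIGN(stack) == 0xffffc90000004000 and the retried get_stack_info()
 * can still classify it as the task stack. */
stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);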
stack_name = stack_type_name(stack_info.type);
if (stack_name)
printk("%s <%s>\n", log_lvl, stack_name);
- if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
- __show_regs(regs, 0);
+ if (regs)
+ show_regs_if_on_stack(&stack_info, regs, partial);
/*
* Scan the stack, printing any text addresses we find. At the
@@ -119,7 +175,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/*
* Don't print regs->ip again if it was already printed
- * by __show_regs() below.
+ * by show_regs_if_on_stack().
*/
if (regs && stack == &regs->ip)
goto next;
@@ -154,9 +210,9 @@ next:
unwind_next_frame(&state);
/* if the frame has entry regs, print them */
- regs = unwind_get_entry_regs(&state);
- if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
- __show_regs(regs, 0);
+ regs = unwind_get_entry_regs(&state, &partial);
+ if (regs)
+ show_regs_if_on_stack(&stack_info, regs, partial);
}
if (stack_name)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 4f0481474903..c35d54f38ccc 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -25,6 +25,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_SOFTIRQ)
return "SOFTIRQ";
+ if (type == STACK_TYPE_SYSENTER)
+ return "SYSENTER";
+
return NULL;
}
@@ -92,6 +95,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (task != current)
goto unknown;
+ if (in_sysenter_stack(stack, info))
+ goto recursion_check;
+
if (in_hardirq_stack(stack, info))
goto recursion_check;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 225af4184f06..16ceab1132e8 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -36,6 +36,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_IRQ)
return "IRQ";
+ if (type == STACK_TYPE_SYSENTER)
+ return "SYSENTER";
+
if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
return exception_stack_names[type - STACK_TYPE_EXCEPTION];
@@ -114,6 +117,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_irq_stack(stack, info))
goto recursion_check;
+ if (in_sysenter_stack(stack, info))
+ goto recursion_check;
+
goto unknown;
recursion_check:
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a4516ca4c4f3..3224a4297175 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -526,6 +526,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_SKL_IDS(&gen9_early_ops),
INTEL_BXT_IDS(&gen9_early_ops),
INTEL_KBL_IDS(&gen9_early_ops),
+ INTEL_CFL_IDS(&gen9_early_ops),
INTEL_GLK_IDS(&gen9_early_ops),
INTEL_CNL_IDS(&gen9_early_ops),
};
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 4a613fed94b6..d13777d49d8b 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -66,7 +66,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
- tss = &per_cpu(cpu_tss, get_cpu());
+ tss = &per_cpu(cpu_tss_rw, get_cpu());
if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 3be74fbdeff2..feca14980e32 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -56,10 +56,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (regs->sp >= estack_top && regs->sp <= estack_bottom)
return;
- WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
+ WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
current->comm, curbase, regs->sp,
irq_stack_top, irq_stack_bottom,
- estack_top, estack_bottom);
+ estack_top, estack_bottom, (void *)regs->ip);
if (sysctl_panic_on_stackoverflow)
panic("low stack detected by irq handler - check messages\n");
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index dcc0154f6871..a83d4bbaff83 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -566,12 +566,12 @@ static uint32_t __init kvm_detect(void)
return kvm_cpuid_base();
}
-const struct hypervisor_x86 x86_hyper_kvm __refconst = {
+const __initconst struct hypervisor_x86 x86_hyper_kvm = {
.name = "KVM",
.detect = kvm_detect,
- .x2apic_available = kvm_para_available,
+ .type = X86_HYPER_KVM,
+ .init.x2apic_available = kvm_para_available,
};
-EXPORT_SYMBOL_GPL(x86_hyper_kvm);
static __init int activate_jump_labels(void)
{
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 11aaf1eaa0e4..c354833342bd 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff8a9acbcf8b..0a17cbcdf64c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -46,7 +46,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
.x86_tss = {
/*
* .sp0 is only used when entering ring 0 from a lower
@@ -55,6 +55,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
* Poison it.
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
+
+#ifdef CONFIG_X86_64
+ /*
+ * .sp1 is cpu_current_top_of_stack. The init task never
+ * runs user code, but cpu_current_top_of_stack should still
+ * be well defined before the first context switch.
+ */
+ .sp1 = TOP_OF_INIT_STACK,
+#endif
+
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
@@ -70,11 +80,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
*/
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
-#ifdef CONFIG_X86_32
- .SYSENTER_stack_canary = STACK_END_MAGIC,
-#endif
};
-EXPORT_PER_CPU_SYMBOL(cpu_tss);
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -103,7 +110,7 @@ void exit_thread(struct task_struct *tsk)
struct fpu *fpu = &t->fpu;
if (bp) {
- struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c0d60420466c..784ff9147172 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b08b9b6c40eb..c75466232016 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -69,10 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
- printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
- (void *)regs->ip);
- printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
- regs->sp, regs->flags);
+ show_iret_regs(regs);
+
if (regs->orig_ax != -1)
pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
else
@@ -89,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
regs->r13, regs->r14, regs->r15);
+ if (!all)
+ return;
+
asm("movl %%ds,%0" : "=r" (ds));
asm("movl %%cs,%0" : "=r" (cs));
asm("movl %%es,%0" : "=r" (es));
@@ -99,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
rdmsrl(MSR_GS_BASE, gs);
rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
- if (!all)
- return;
-
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = __read_cr3();
@@ -401,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
@@ -463,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Switch the PDA and FPU contexts.
*/
this_cpu_write(current_task, next_p);
+ this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
/* Reload sp0. */
update_sp0(next_p);
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index b9ba5b972a47..ae90592118d1 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -102,7 +102,7 @@ __save_stack_trace_reliable(struct stack_trace *trace,
for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state);
unwind_next_frame(&state)) {
- regs = unwind_get_entry_regs(&state);
+ regs = unwind_get_entry_regs(&state, NULL);
if (regs) {
/* Success path for user tasks */
if (user_mode(regs))
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 0347ed41c92d..976694564c07 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -356,9 +356,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
/*
* If IRET takes a non-IST fault on the espfix64 stack, then we
- * end up promoting it to a doublefault. In that case, modify
- * the stack to make it look like we just entered the #GP
- * handler from user space, similar to bad_iret.
+ * end up promoting it to a doublefault. In that case, take
+ * advantage of the fact that we're not using the normal (TSS.sp0)
+ * stack right now. We can write a fake #GP(0) frame at TSS.sp0
+ * and then modify our own IRET frame so that, when we return,
+ * we land directly at the #GP(0) vector with the stack already
+ * set up according to its expectations.
+ *
+ * The net result is that our #GP handler will think that we
+ * entered from usermode with the bad user context.
*
* No need for ist_enter here because we don't use RCU.
*/
@@ -366,13 +372,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
regs->cs == __KERNEL_CS &&
regs->ip == (unsigned long)native_irq_return_iret)
{
- struct pt_regs *normal_regs = task_pt_regs(current);
+ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
- /* Fake a #GP(0) from userspace. */
- memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
- normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
+ /*
+ * regs->sp points to the failing IRET frame on the
+ * ESPFIX64 stack. Copy it to the entry stack. This fills
+ * in gpregs->ss through gpregs->ip.
+ */
+ memmove(&gpregs->ip, (void *)regs->sp, 5*8);
+ gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
+
+ /*
+ * Adjust our frame so that we return straight to the #GP
+ * vector with the expected RSP value. This is safe because
+ * we won't enable interrupts or schedule before we invoke
+ * general_protection, so nothing will clobber the stack
+ * frame we just set up.
+ */
regs->ip = (unsigned long)general_protection;
- regs->sp = (unsigned long)&normal_regs->orig_ax;
+ regs->sp = (unsigned long)&gpregs->orig_ax;
return;
}
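For reference, the 5*8 bytes moved by the memmove() calls in this file are exactly the hardware iret frame; its architectural layout at regs->sp, lowest address first, is:

/*
 *   regs->sp + 0x00: RIP
 *   regs->sp + 0x08: CS
 *   regs->sp + 0x10: RFLAGS
 *   regs->sp + 0x18: RSP
 *   regs->sp + 0x20: SS
 *
 * Copying these five slots to &gpregs->ip fills gpregs->ip..gpregs->ss;
 * gpregs->orig_ax is then zeroed to stand in for the lost #GP error code.
 */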
@@ -397,7 +416,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
*
* Processors update CR2 whenever a page fault is detected. If a
* second page fault occurs while an earlier page fault is being
- * deliv- ered, the faulting linear address of the second fault will
+ * delivered, the faulting linear address of the second fault will
* overwrite the contents of CR2 (replacing the previous
* address). These updates to CR2 occur even if the page fault
* results in a double fault or occurs during the delivery of a
@@ -608,14 +627,15 @@ NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
- * Help handler running on IST stack to switch off the IST stack if the
- * interrupted code was in user mode. The actual stack switch is done in
- * entry_64.S
+ * Help handler running on a per-cpu (IST or entry trampoline) stack
+ * to switch to the normal thread stack if the interrupted code was in
+ * user mode. The actual stack switch is done in entry_64.S
*/
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
- struct pt_regs *regs = task_pt_regs(current);
- *regs = *eregs;
+ struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
+ if (regs != eregs)
+ *regs = *eregs;
return regs;
}
NOKPROBE_SYMBOL(sync_regs);
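The pointer arithmetic in sync_regs() is the usual "top of stack, minus one pt_regs" idiom: cpu_current_top_of_stack is the highest usable address of the thread stack, so casting it and subtracting one lands on the register frame slot. A spelled-out sketch:

unsigned long top = this_cpu_read(cpu_current_top_of_stack);
struct pt_regs *regs = (struct pt_regs *)top - 1;
/* regs == top - sizeof(struct pt_regs); the 'if (regs != eregs)' check above
 * avoids a pointless self-copy when the fault already arrived there. */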
@@ -631,13 +651,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
/*
* This is called from entry_64.S early in handling a fault
* caused by a bad iret to user mode. To handle the fault
- * correctly, we want move our stack frame to task_pt_regs
- * and we want to pretend that the exception came from the
- * iret target.
+ * correctly, we want to move our stack frame to where it would
+ * be had we entered directly on the entry stack (rather than
+ * just below the IRET frame) and we want to pretend that the
+ * exception came from the IRET target.
*/
struct bad_iret_stack *new_stack =
- container_of(task_pt_regs(current),
- struct bad_iret_stack, regs);
+ (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Copy the IRET target to the new stack. */
memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
@@ -802,14 +822,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec();
exit:
-#if defined(CONFIG_X86_32)
- /*
- * This is the most likely code path that involves non-trivial use
- * of the SYSENTER stack. Check that we haven't overrun it.
- */
- WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
- "Overran or corrupted SYSENTER stack\n");
-#endif
ist_exit(regs);
}
NOKPROBE_SYMBOL(do_debug);
@@ -977,6 +989,9 @@ void __init trap_init(void)
early_iounmap(p, 4);
#endif
+ /* Init cpu_entry_area before IST entries are set up */
+ setup_cpu_entry_areas();
+
set_intr_gate(X86_TRAP_DE, divide_error);
set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
/* int4 can be called from all */
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index a3f973b2c97a..be86a865087a 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
return NULL;
}
-static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
+static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
size_t len)
{
struct stack_info *info = &state->stack_info;
+ void *addr = (void *)_addr;
- /*
- * If the address isn't on the current stack, switch to the next one.
- *
- * We may have to traverse multiple stacks to deal with the possibility
- * that info->next_sp could point to an empty stack and the address
- * could be on a subsequent stack.
- */
- while (!on_stack(info, (void *)addr, len))
- if (get_stack_info(info->next_sp, state->task, info,
- &state->stack_mask))
- return false;
+ if (!on_stack(info, addr, len) &&
+ (get_stack_info(addr, state->task, info, &state->stack_mask)))
+ return false;
return true;
}
@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
return true;
}
-#define REGS_SIZE (sizeof(struct pt_regs))
-#define SP_OFFSET (offsetof(struct pt_regs, sp))
-#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
-#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
-
static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
- unsigned long *ip, unsigned long *sp, bool full)
+ unsigned long *ip, unsigned long *sp)
{
- size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
- size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
- struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
-
- if (IS_ENABLED(CONFIG_X86_64)) {
- if (!stack_access_ok(state, addr, regs_size))
- return false;
+ struct pt_regs *regs = (struct pt_regs *)addr;
- *ip = regs->ip;
- *sp = regs->sp;
+ /* x86-32 support will be more complicated due to the &regs->sp hack */
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
- return true;
- }
-
- if (!stack_access_ok(state, addr, sp_offset))
+ if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
return false;
*ip = regs->ip;
+ *sp = regs->sp;
+ return true;
+}
- if (user_mode(regs)) {
- if (!stack_access_ok(state, addr + sp_offset,
- REGS_SIZE - SP_OFFSET))
- return false;
+static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
+ unsigned long *ip, unsigned long *sp)
+{
+ struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
- *sp = regs->sp;
- } else
- *sp = (unsigned long)&regs->sp;
+ if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
+ return false;
+ *ip = regs->ip;
+ *sp = regs->sp;
return true;
}
@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state)
unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
enum stack_type prev_type = state->stack_info.type;
struct orc_entry *orc;
- struct pt_regs *ptregs;
bool indirect = false;
if (unwind_done(state))
@@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_TYPE_REGS:
- if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_TYPE_REGS_IRET:
- if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
+ if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference iret registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
goto done;
}
- ptregs = container_of((void *)sp, struct pt_regs, ip);
- if ((unsigned long)ptregs >= prev_sp &&
- on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
- state->regs = ptregs;
- state->full_regs = false;
- } else
- state->regs = NULL;
-
+ state->regs = (void *)sp - IRET_FRAME_OFFSET;
+ state->full_regs = false;
state->signal = true;
break;
@@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
}
if (get_stack_info((unsigned long *)state->sp, state->task,
- &state->stack_info, &state->stack_mask))
- return;
+ &state->stack_info, &state->stack_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
+ if (get_stack_info(next_page, state->task, &state->stack_info,
+ &state->stack_mask))
+ return;
+ }
/*
* The caller can provide the address of the first frame directly
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f05f00acac89..423aa36f0150 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -106,6 +106,15 @@ SECTIONS
SOFTIRQENTRY_TEXT
*(.fixup)
*(.gnu.warning)
+
+#ifdef CONFIG_X86_64
+ . = ALIGN(PAGE_SIZE);
+ _entry_trampoline = .;
+ *(.entry_trampoline)
+ . = ALIGN(PAGE_SIZE);
+ ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
+#endif
+
/* End of text section */
_etext = .;
} :text = 0x9090
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a088b2c47f73..5b2d10c1973a 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -28,6 +28,8 @@ void x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
int __init iommu_init_noop(void) { return 0; }
void iommu_shutdown_noop(void) { }
+bool __init bool_x86_init_noop(void) { return false; }
+void x86_op_int_noop(int cpu) { }
/*
* The platform setup functions are preset with the default functions
@@ -81,6 +83,12 @@ struct x86_init_ops x86_init __initdata = {
.init_irq = x86_default_pci_init_irq,
.fixup_irqs = x86_default_pci_fixup_irqs,
},
+
+ .hyper = {
+ .init_platform = x86_init_noop,
+ .x2apic_available = bool_x86_init_noop,
+ .init_mem_mapping = x86_init_noop,
+ },
};
struct x86_cpuinit_ops x86_cpuinit = {
@@ -101,6 +109,7 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.get_nmi_reason = default_get_nmi_reason,
.save_sched_clock_state = tsc_save_sched_clock_state,
.restore_sched_clock_state = tsc_restore_sched_clock_state,
+ .hyper.pin_vcpu = x86_op_int_noop,
};
EXPORT_SYMBOL_GPL(x86_platform);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1a0c301859d7..c1613dc5b764 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2274,7 +2274,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* processors. See 22.2.4.
*/
vmcs_writel(HOST_TR_BASE,
- (unsigned long)this_cpu_ptr(&cpu_tss));
+ (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
/*
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 29df077cb089..cf2ac227c2ac 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -106,10 +106,10 @@ static void delay_mwaitx(unsigned long __loops)
delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
/*
- * Use cpu_tss as a cacheline-aligned, seldomly
+ * Use cpu_tss_rw as a cacheline-aligned, seldom
* accessed per-cpu variable as the monitor target.
*/
- __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
+ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
/*
* AMD, like Intel, supports the EAX hint and EAX=0xf
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c6cb03eedab7..98f1ada68976 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -637,7 +637,7 @@ void __init init_mem_mapping(void)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
- hypervisor_init_mem_mapping();
+ x86_init.hyper.init_mem_mapping();
early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 11d4da6a25a5..480b23a002d6 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -671,7 +671,7 @@ void __init paging_init(void)
* After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
* updating.
*/
-static void update_end_of_memory_vars(u64 start, u64 size)
+static void update_end_of_memory_vars(u64 start, u64 size)
{
unsigned long end_pfn = PFN_UP(start + size);
@@ -682,22 +682,30 @@ static void update_end_of_memory_vars(u64 start, u64 size)
}
}
-int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+int add_pages(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, bool want_memblock)
{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- init_memory_mapping(start, start + size);
-
ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
WARN_ON_ONCE(ret);
/* update max_pfn, max_low_pfn and high_memory */
- update_end_of_memory_vars(start, size);
+ update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
+ nr_pages << PAGE_SHIFT);
return ret;
}
+
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ init_memory_mapping(start, start + size);
+
+ return add_pages(nid, start_pfn, nr_pages, want_memblock);
+}
EXPORT_SYMBOL_GPL(arch_add_memory);
#define PAGE_INUSE 0xFD
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 78459a6d455a..2a717e023c9f 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -160,17 +160,19 @@ static void do_fpu_end(void)
static void fix_processor_context(void)
{
int cpu = smp_processor_id();
- struct tss_struct *t = &per_cpu(cpu_tss, cpu);
#ifdef CONFIG_X86_64
struct desc_struct *desc = get_cpu_gdt_rw(cpu);
tss_desc tss;
#endif
- set_tss_desc(cpu, t); /*
- * This just modifies memory; should not be
- * necessary. But... This is necessary, because
- * 386 hardware has concept of busy TSS or some
- * similar stupidity.
- */
+
+ /*
+ * We need to reload TR, which requires that we change the
+ * GDT entry to indicate "available" first.
+ *
+ * XXX: This could probably all be replaced by a call to
+ * force_reload_TR().
+ */
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
#ifdef CONFIG_X86_64
memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index d1d37631d9c0..e25de9276ce6 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -235,12 +235,12 @@ static uint32_t __init xen_platform_hvm(void)
return xen_cpuid_base();
}
-const struct hypervisor_x86 x86_hyper_xen_hvm = {
+const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = {
.name = "Xen HVM",
.detect = xen_platform_hvm,
- .init_platform = xen_hvm_guest_init,
- .pin_vcpu = xen_pin_vcpu,
- .x2apic_available = xen_x2apic_para_available,
- .init_mem_mapping = xen_hvm_init_mem_mapping,
+ .type = X86_HYPER_XEN_HVM,
+ .init.init_platform = xen_hvm_guest_init,
+ .init.x2apic_available = xen_x2apic_para_available,
+ .init.init_mem_mapping = xen_hvm_init_mem_mapping,
+ .runtime.pin_vcpu = xen_pin_vcpu,
};
-EXPORT_SYMBOL(x86_hyper_xen_hvm);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index fe5b9b2e653d..2a909d0cba1b 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -840,7 +840,7 @@ static void xen_load_sp0(unsigned long sp0)
mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
- this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
void xen_set_iopl_mask(unsigned mask)
@@ -1518,9 +1518,9 @@ static uint32_t __init xen_platform_pv(void)
return 0;
}
-const struct hypervisor_x86 x86_hyper_xen_pv = {
+const __initconst struct hypervisor_x86 x86_hyper_xen_pv = {
.name = "Xen PV",
.detect = xen_platform_pv,
- .pin_vcpu = xen_pin_vcpu,
+ .type = X86_HYPER_XEN_PV,
+ .runtime.pin_vcpu = xen_pin_vcpu,
};
-EXPORT_SYMBOL(x86_hyper_xen_pv);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 13a085df737f..e3ffdd656726 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2342,7 +2342,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
- case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
+ case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 41d3e7fcc914..e1a90f98790c 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -113,34 +113,47 @@ RELOC(xen_sysret64, 1b+1)
* rip
* r11
* rsp->rcx
- *
- * In all the entrypoints, we undo all that to make it look like a
- * CPU-generated syscall/sysenter and jump to the normal entrypoint.
*/
-.macro undo_xen_syscall
- mov 0*8(%rsp), %rcx
- mov 1*8(%rsp), %r11
- mov 5*8(%rsp), %rsp
-.endm
-
/* Normal 64-bit system call target */
ENTRY(xen_syscall_target)
- undo_xen_syscall
- jmp entry_SYSCALL_64_after_swapgs
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER_DS and __USER_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER_DS, 4*8(%rsp)
+ movq $__USER_CS, 1*8(%rsp)
+
+ jmp entry_SYSCALL_64_after_hwframe
ENDPROC(xen_syscall_target)
#ifdef CONFIG_IA32_EMULATION
/* 32-bit compat syscall target */
ENTRY(xen_syscall32_target)
- undo_xen_syscall
- jmp entry_SYSCALL_compat
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER32_CS, 1*8(%rsp)
+
+ jmp entry_SYSCALL_compat_after_hwframe
ENDPROC(xen_syscall32_target)
/* 32-bit compat sysenter target */
ENTRY(xen_sysenter_target)
- undo_xen_syscall
+ mov 0*8(%rsp), %rcx
+ mov 1*8(%rsp), %r11
+ mov 5*8(%rsp), %rsp
jmp entry_SYSENTER_compat
ENDPROC(xen_sysenter_target)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index e6fd82d56748..31bd90345f69 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1645,9 +1645,12 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu)
/* Exit from long idle boosting */
if (cpu->idle_boost &&
pid_params.setpoint == CPUFREQ_SERVER_DEFAULT_SETPOINT) {
- int32_t min_scaled = int_tofp(CPUFREQ_SERVER_DEFAULT_SETPOINT);
+ boost = max_t(int32_t, pid_params.setpoint, cpu->idle_boost);
cpu->idle_boost >>= 1;
- sample->busy_scaled = max(sample->busy_scaled, min_scaled);
+ if (busy_frac < boost && !is_idle_task(current)) {
+ busy_frac = boost;
+ sample->busy_scaled = boost * 100;
+ }
}
max_target = global.no_turbo || global.turbo_disabled ?
@@ -1805,7 +1808,8 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time,
if (cpu->iowait_boost)
cpu->iowait_boost = 0;
- cpu->idle_boost = int_tofp(1);
+ if (!is_idle_task(current))
+ cpu->idle_boost = int_tofp(1);
}
}
cpu->last_update = time;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b2beaa1150f1..1c114815bbd8 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -929,6 +929,7 @@ struct intel_device_info {
u8 gen;
u16 gen_mask;
enum intel_platform platform;
+ u8 gt; /* GT number, 0 if undefined */
u8 ring_mask; /* Rings supported by the HW */
u8 num_rings;
#define DEFINE_FLAG(name) u8 name:1
@@ -2770,9 +2771,8 @@ intel_info(const struct drm_i915_private *dev_priv)
#define IS_G33(dev_priv) ((dev_priv)->info.platform == INTEL_G33)
#define IS_IRONLAKE_M(dev_priv) (INTEL_DEVID(dev_priv) == 0x0046)
#define IS_IVYBRIDGE(dev_priv) ((dev_priv)->info.platform == INTEL_IVYBRIDGE)
-#define IS_IVB_GT1(dev_priv) (INTEL_DEVID(dev_priv) == 0x0156 || \
- INTEL_DEVID(dev_priv) == 0x0152 || \
- INTEL_DEVID(dev_priv) == 0x015a)
+#define IS_IVB_GT1(dev_priv) (IS_IVYBRIDGE(dev_priv) && \
+ (dev_priv)->info.gt == 1)
#define IS_VALLEYVIEW(dev_priv) ((dev_priv)->info.platform == INTEL_VALLEYVIEW)
#define IS_CHERRYVIEW(dev_priv) ((dev_priv)->info.platform == INTEL_CHERRYVIEW)
#define IS_HASWELL(dev_priv) ((dev_priv)->info.platform == INTEL_HASWELL)
@@ -2794,11 +2794,11 @@ intel_info(const struct drm_i915_private *dev_priv)
#define IS_BDW_ULX(dev_priv) (IS_BROADWELL(dev_priv) && \
(INTEL_DEVID(dev_priv) & 0xf) == 0xe)
#define IS_BDW_GT3(dev_priv) (IS_BROADWELL(dev_priv) && \
- (INTEL_DEVID(dev_priv) & 0x00F0) == 0x0020)
+ (dev_priv)->info.gt == 3)
#define IS_HSW_ULT(dev_priv) (IS_HASWELL(dev_priv) && \
(INTEL_DEVID(dev_priv) & 0xFF00) == 0x0A00)
#define IS_HSW_GT3(dev_priv) (IS_HASWELL(dev_priv) && \
- (INTEL_DEVID(dev_priv) & 0x00F0) == 0x0020)
+ (dev_priv)->info.gt == 3)
/* ULX machines are also considered ULT. */
#define IS_HSW_ULX(dev_priv) (INTEL_DEVID(dev_priv) == 0x0A0E || \
INTEL_DEVID(dev_priv) == 0x0A1E)
@@ -2818,10 +2818,16 @@ intel_info(const struct drm_i915_private *dev_priv)
#define IS_KBL_ULX(dev_priv) (INTEL_DEVID(dev_priv) == 0x590E || \
INTEL_DEVID(dev_priv) == 0x5915 || \
INTEL_DEVID(dev_priv) == 0x591E)
+#define IS_SKL_GT2(dev_priv) (IS_SKYLAKE(dev_priv) && \
+ (dev_priv)->info.gt == 2)
#define IS_SKL_GT3(dev_priv) (IS_SKYLAKE(dev_priv) && \
- (INTEL_DEVID(dev_priv) & 0x00F0) == 0x0020)
+ (dev_priv)->info.gt == 3)
#define IS_SKL_GT4(dev_priv) (IS_SKYLAKE(dev_priv) && \
- (INTEL_DEVID(dev_priv) & 0x00F0) == 0x0030)
+ (dev_priv)->info.gt == 4)
+#define IS_KBL_GT2(dev_priv) (IS_KABYLAKE(dev_priv) && \
+ (dev_priv)->info.gt == 2)
+#define IS_KBL_GT3(dev_priv) (IS_KABYLAKE(dev_priv) && \
+ (dev_priv)->info.gt == 3)
#define IS_CFL_ULT(dev_priv) (IS_COFFEELAKE(dev_priv) && \
(INTEL_DEVID(dev_priv) & 0x00F0) == 0x00A0)
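A minimal usage sketch, assuming only the macros introduced above: with the new per-device .gt field, GT-dependent code can branch on these helpers instead of decoding PCI device IDs by hand. The predicate below is hypothetical and only illustrates the pattern.

static inline bool example_needs_gt3_plus_tuning(struct drm_i915_private *dev_priv)
{
	/* Hypothetical GT-level check built from the macros added above. */
	return IS_SKL_GT3(dev_priv) || IS_SKL_GT4(dev_priv) ||
	       IS_KBL_GT3(dev_priv);
}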
diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index decffc360b6e..3111c638c839 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -66,19 +66,19 @@
GEN_DEFAULT_PIPEOFFSETS, \
CURSOR_OFFSETS
-static const struct intel_device_info intel_i830_info = {
+static const struct intel_device_info intel_i830_info __initconst = {
GEN2_FEATURES,
.platform = INTEL_I830,
.is_mobile = 1, .cursor_needs_physical = 1,
.num_pipes = 2, /* legal, last one wins */
};
-static const struct intel_device_info intel_i845g_info = {
+static const struct intel_device_info intel_i845g_info __initconst = {
GEN2_FEATURES,
.platform = INTEL_I845G,
};
-static const struct intel_device_info intel_i85x_info = {
+static const struct intel_device_info intel_i85x_info __initconst = {
GEN2_FEATURES,
.platform = INTEL_I85X, .is_mobile = 1,
.num_pipes = 2, /* legal, last one wins */
@@ -86,7 +86,7 @@ static const struct intel_device_info intel_i85x_info = {
.has_fbc = 1,
};
-static const struct intel_device_info intel_i865g_info = {
+static const struct intel_device_info intel_i865g_info __initconst = {
GEN2_FEATURES,
.platform = INTEL_I865G,
};
@@ -98,7 +98,7 @@ static const struct intel_device_info intel_i865g_info = {
GEN_DEFAULT_PIPEOFFSETS, \
CURSOR_OFFSETS
-static const struct intel_device_info intel_i915g_info = {
+static const struct intel_device_info intel_i915g_info __initconst = {
GEN3_FEATURES,
.platform = INTEL_I915G, .cursor_needs_physical = 1,
.has_overlay = 1, .overlay_needs_physical = 1,
@@ -106,7 +106,7 @@ static const struct intel_device_info intel_i915g_info = {
.unfenced_needs_alignment = 1,
};
-static const struct intel_device_info intel_i915gm_info = {
+static const struct intel_device_info intel_i915gm_info __initconst = {
GEN3_FEATURES,
.platform = INTEL_I915GM,
.is_mobile = 1,
@@ -118,7 +118,7 @@ static const struct intel_device_info intel_i915gm_info = {
.unfenced_needs_alignment = 1,
};
-static const struct intel_device_info intel_i945g_info = {
+static const struct intel_device_info intel_i945g_info __initconst = {
GEN3_FEATURES,
.platform = INTEL_I945G,
.has_hotplug = 1, .cursor_needs_physical = 1,
@@ -127,7 +127,7 @@ static const struct intel_device_info intel_i945g_info = {
.unfenced_needs_alignment = 1,
};
-static const struct intel_device_info intel_i945gm_info = {
+static const struct intel_device_info intel_i945gm_info __initconst = {
GEN3_FEATURES,
.platform = INTEL_I945GM, .is_mobile = 1,
.has_hotplug = 1, .cursor_needs_physical = 1,
@@ -138,14 +138,14 @@ static const struct intel_device_info intel_i945gm_info = {
.unfenced_needs_alignment = 1,
};
-static const struct intel_device_info intel_g33_info = {
+static const struct intel_device_info intel_g33_info __initconst = {
GEN3_FEATURES,
.platform = INTEL_G33,
.has_hotplug = 1,
.has_overlay = 1,
};
-static const struct intel_device_info intel_pineview_info = {
+static const struct intel_device_info intel_pineview_info __initconst = {
GEN3_FEATURES,
.platform = INTEL_PINEVIEW, .is_mobile = 1,
.has_hotplug = 1,
@@ -160,14 +160,14 @@ static const struct intel_device_info intel_pineview_info = {
GEN_DEFAULT_PIPEOFFSETS, \
CURSOR_OFFSETS
-static const struct intel_device_info intel_i965g_info = {
+static const struct intel_device_info intel_i965g_info __initconst = {
GEN4_FEATURES,
.platform = INTEL_I965G,
.has_overlay = 1,
.hws_needs_physical = 1,
};
-static const struct intel_device_info intel_i965gm_info = {
+static const struct intel_device_info intel_i965gm_info __initconst = {
GEN4_FEATURES,
.platform = INTEL_I965GM,
.is_mobile = 1, .has_fbc = 1,
@@ -176,14 +176,14 @@ static const struct intel_device_info intel_i965gm_info = {
.hws_needs_physical = 1,
};
-static const struct intel_device_info intel_g45_info = {
+static const struct intel_device_info intel_g45_info __initconst = {
GEN4_FEATURES,
.platform = INTEL_G45,
.has_pipe_cxsr = 1,
.ring_mask = RENDER_RING | BSD_RING,
};
-static const struct intel_device_info intel_gm45_info = {
+static const struct intel_device_info intel_gm45_info __initconst = {
GEN4_FEATURES,
.platform = INTEL_GM45,
.is_mobile = 1, .has_fbc = 1,
@@ -200,12 +200,12 @@ static const struct intel_device_info intel_gm45_info = {
GEN_DEFAULT_PIPEOFFSETS, \
CURSOR_OFFSETS
-static const struct intel_device_info intel_ironlake_d_info = {
+static const struct intel_device_info intel_ironlake_d_info __initconst = {
GEN5_FEATURES,
.platform = INTEL_IRONLAKE,
};
-static const struct intel_device_info intel_ironlake_m_info = {
+static const struct intel_device_info intel_ironlake_m_info __initconst = {
GEN5_FEATURES,
.platform = INTEL_IRONLAKE,
.is_mobile = 1, .has_fbc = 1,
@@ -225,15 +225,34 @@ static const struct intel_device_info intel_ironlake_m_info = {
GEN_DEFAULT_PIPEOFFSETS, \
CURSOR_OFFSETS
-static const struct intel_device_info intel_sandybridge_d_info = {
- GEN6_FEATURES,
- .platform = INTEL_SANDYBRIDGE,
+#define SNB_D_PLATFORM \
+ GEN6_FEATURES, \
+ .platform = INTEL_SANDYBRIDGE
+
+static const struct intel_device_info intel_sandybridge_d_gt1_info __initconst = {
+ SNB_D_PLATFORM,
+ .gt = 1,
};
-static const struct intel_device_info intel_sandybridge_m_info = {
- GEN6_FEATURES,
- .platform = INTEL_SANDYBRIDGE,
- .is_mobile = 1,
+static const struct intel_device_info intel_sandybridge_d_gt2_info __initconst = {
+ SNB_D_PLATFORM,
+ .gt = 2,
+};
+
+#define SNB_M_PLATFORM \
+ GEN6_FEATURES, \
+ .platform = INTEL_SANDYBRIDGE, \
+ .is_mobile = 1
+
+
+static const struct intel_device_info intel_sandybridge_m_gt1_info __initconst = {
+ SNB_M_PLATFORM,
+ .gt = 1,
+};
+
+static const struct intel_device_info intel_sandybridge_m_gt2_info __initconst = {
+ SNB_M_PLATFORM,
+ .gt = 2,
};
#define GEN7_FEATURES \
@@ -251,27 +270,46 @@ static const struct intel_device_info intel_sandybridge_m_info = {
GEN_DEFAULT_PIPEOFFSETS, \
IVB_CURSOR_OFFSETS
-static const struct intel_device_info intel_ivybridge_d_info = {
- GEN7_FEATURES,
- .platform = INTEL_IVYBRIDGE,
- .has_l3_dpf = 1,
+#define IVB_D_PLATFORM \
+ GEN7_FEATURES, \
+ .platform = INTEL_IVYBRIDGE, \
+ .has_l3_dpf = 1
+
+static const struct intel_device_info intel_ivybridge_d_gt1_info __initconst = {
+ IVB_D_PLATFORM,
+ .gt = 1,
};
-static const struct intel_device_info intel_ivybridge_m_info = {
- GEN7_FEATURES,
- .platform = INTEL_IVYBRIDGE,
- .is_mobile = 1,
- .has_l3_dpf = 1,
+static const struct intel_device_info intel_ivybridge_d_gt2_info __initconst = {
+ IVB_D_PLATFORM,
+ .gt = 2,
+};
+
+#define IVB_M_PLATFORM \
+ GEN7_FEATURES, \
+ .platform = INTEL_IVYBRIDGE, \
+ .is_mobile = 1, \
+ .has_l3_dpf = 1
+
+static const struct intel_device_info intel_ivybridge_m_gt1_info __initconst = {
+ IVB_M_PLATFORM,
+ .gt = 1,
+};
+
+static const struct intel_device_info intel_ivybridge_m_gt2_info __initconst = {
+ IVB_M_PLATFORM,
+ .gt = 2,
};
-static const struct intel_device_info intel_ivybridge_q_info = {
+static const struct intel_device_info intel_ivybridge_q_info __initconst = {
GEN7_FEATURES,
.platform = INTEL_IVYBRIDGE,
+ .gt = 2,
.num_pipes = 0, /* legal, last one wins */
.has_l3_dpf = 1,
};
-static const struct intel_device_info intel_valleyview_info = {
+static const struct intel_device_info intel_valleyview_info __initconst = {
.platform = INTEL_VALLEYVIEW,
.gen = 7,
.is_lp = 1,
@@ -302,10 +340,24 @@ static const struct intel_device_info intel_valleyview_info = {
.has_rc6p = 0 /* RC6p removed-by HSW */, \
.has_runtime_pm = 1
-static const struct intel_device_info intel_haswell_info = {
- HSW_FEATURES,
- .platform = INTEL_HASWELL,
- .has_l3_dpf = 1,
+#define HSW_PLATFORM \
+ HSW_FEATURES, \
+ .platform = INTEL_HASWELL, \
+ .has_l3_dpf = 1
+
+static const struct intel_device_info intel_haswell_gt1_info __initconst = {
+ HSW_PLATFORM,
+ .gt = 1,
+};
+
+static const struct intel_device_info intel_haswell_gt2_info __initconst = {
+ HSW_PLATFORM,
+ .gt = 2,
+};
+
+static const struct intel_device_info intel_haswell_gt3_info __initconst = {
+ HSW_PLATFORM,
+ .gt = 3,
};
#define BDW_FEATURES \
@@ -315,20 +367,36 @@ static const struct intel_device_info intel_haswell_info = {
.has_full_48bit_ppgtt = 1, \
.has_64bit_reloc = 1
-static const struct intel_device_info intel_broadwell_info = {
- BDW_FEATURES,
- .gen = 8,
- .platform = INTEL_BROADWELL,
+#define BDW_PLATFORM \
+ BDW_FEATURES, \
+ .gen = 8, \
+ .platform = INTEL_BROADWELL
+
+static const struct intel_device_info intel_broadwell_gt1_info __initconst = {
+ BDW_PLATFORM,
+ .gt = 1,
};
-static const struct intel_device_info intel_broadwell_gt3_info = {
- BDW_FEATURES,
- .gen = 8,
- .platform = INTEL_BROADWELL,
+static const struct intel_device_info intel_broadwell_gt2_info __initconst = {
+ BDW_PLATFORM,
+ .gt = 2,
+};
+
+static const struct intel_device_info intel_broadwell_rsvd_info __initconst = {
+ BDW_PLATFORM,
+ .gt = 3,
+ /* According to the device ID those devices are GT3, they were
+ * previously treated as not GT3, keep it like that.
+ */
+};
+
+static const struct intel_device_info intel_broadwell_gt3_info __initconst = {
+ BDW_PLATFORM,
+ .gt = 3,
.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING | BSD2_RING,
};
-static const struct intel_device_info intel_cherryview_info = {
+static const struct intel_device_info intel_cherryview_info __initconst = {
.gen = 8, .num_pipes = 3,
.has_hotplug = 1,
.is_lp = 1,
@@ -351,23 +419,37 @@ static const struct intel_device_info intel_cherryview_info = {
CHV_COLORS,
};
-static const struct intel_device_info intel_skylake_info = {
- BDW_FEATURES,
- .platform = INTEL_SKYLAKE,
- .gen = 9,
- .has_csr = 1,
- .has_guc = 1,
- .ddb_size = 896,
+#define SKL_PLATFORM \
+ BDW_FEATURES, \
+ .gen = 9, \
+ .platform = INTEL_SKYLAKE, \
+ .has_csr = 1, \
+ .has_guc = 1, \
+ .ddb_size = 896
+
+static const struct intel_device_info intel_skylake_gt1_info __initconst = {
+ SKL_PLATFORM,
+ .gt = 1,
};
-static const struct intel_device_info intel_skylake_gt3_info = {
- BDW_FEATURES,
- .platform = INTEL_SKYLAKE,
- .gen = 9,
- .has_csr = 1,
- .has_guc = 1,
- .ddb_size = 896,
- .ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING | BSD2_RING,
+static const struct intel_device_info intel_skylake_gt2_info __initconst = {
+ SKL_PLATFORM,
+ .gt = 2,
+};
+
+#define SKL_GT3_PLUS_PLATFORM \
+ SKL_PLATFORM, \
+ .ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING | BSD2_RING
+
+
+static const struct intel_device_info intel_skylake_gt3_info __initconst = {
+ SKL_GT3_PLUS_PLATFORM,
+ .gt = 3,
+};
+
+static const struct intel_device_info intel_skylake_gt4_info __initconst = {
+ SKL_GT3_PLUS_PLATFORM,
+ .gt = 4,
};
#define GEN9_LP_FEATURES \
@@ -397,35 +479,40 @@ static const struct intel_device_info intel_skylake_gt3_info = {
IVB_CURSOR_OFFSETS, \
BDW_COLORS
-static const struct intel_device_info intel_broxton_info = {
+static const struct intel_device_info intel_broxton_info __initconst = {
GEN9_LP_FEATURES,
.platform = INTEL_BROXTON,
.ddb_size = 512,
};
-static const struct intel_device_info intel_geminilake_info = {
+static const struct intel_device_info intel_geminilake_info __initconst = {
GEN9_LP_FEATURES,
.platform = INTEL_GEMINILAKE,
.ddb_size = 1024,
.color = { .degamma_lut_size = 0, .gamma_lut_size = 1024 }
};
-static const struct intel_device_info intel_kabylake_info = {
- BDW_FEATURES,
- .platform = INTEL_KABYLAKE,
- .gen = 9,
- .has_csr = 1,
- .has_guc = 1,
- .ddb_size = 896,
+#define KBL_PLATFORM \
+ BDW_FEATURES, \
+ .gen = 9, \
+ .platform = INTEL_KABYLAKE, \
+ .has_csr = 1, \
+ .has_guc = 1, \
+ .ddb_size = 896
+
+static const struct intel_device_info intel_kabylake_gt1_info __initconst = {
+ KBL_PLATFORM,
+ .gt = 1,
};
-static const struct intel_device_info intel_kabylake_gt3_info = {
- BDW_FEATURES,
- .platform = INTEL_KABYLAKE,
- .gen = 9,
- .has_csr = 1,
- .has_guc = 1,
- .ddb_size = 896,
+static const struct intel_device_info intel_kabylake_gt2_info __initconst = {
+ KBL_PLATFORM,
+ .gt = 2,
+};
+
+static const struct intel_device_info intel_kabylake_gt3_info __initconst = {
+ KBL_PLATFORM,
+ .gt = 3,
.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING | BSD2_RING,
};
@@ -437,20 +524,28 @@ static const struct intel_device_info intel_kabylake_gt3_info = {
.has_guc = 1, \
.ddb_size = 896
-static const struct intel_device_info intel_coffeelake_info = {
+static const struct intel_device_info intel_coffeelake_gt1_info __initconst = {
+ CFL_PLATFORM,
+ .gt = 1,
+};
+
+static const struct intel_device_info intel_coffeelake_gt2_info __initconst = {
CFL_PLATFORM,
+ .gt = 2,
};
-static const struct intel_device_info intel_coffeelake_gt3_info = {
+static const struct intel_device_info intel_coffeelake_gt3_info __initconst = {
CFL_PLATFORM,
+ .gt = 3,
.ring_mask = RENDER_RING | BSD_RING | BLT_RING | VEBOX_RING | BSD2_RING,
};
-static const struct intel_device_info intel_cannonlake_info = {
+static const struct intel_device_info intel_cannonlake_gt2_info __initconst = {
BDW_FEATURES,
.is_alpha_support = 1,
.platform = INTEL_CANNONLAKE,
.gen = 10,
+ .gt = 2,
.ddb_size = 1024,
.has_csr = 1,
.color = { .degamma_lut_size = 0, .gamma_lut_size = 1024 }
@@ -479,31 +574,42 @@ static const struct pci_device_id pciidlist[] = {
INTEL_PINEVIEW_IDS(&intel_pineview_info),
INTEL_IRONLAKE_D_IDS(&intel_ironlake_d_info),
INTEL_IRONLAKE_M_IDS(&intel_ironlake_m_info),
- INTEL_SNB_D_IDS(&intel_sandybridge_d_info),
- INTEL_SNB_M_IDS(&intel_sandybridge_m_info),
+ INTEL_SNB_D_GT1_IDS(&intel_sandybridge_d_gt1_info),
+ INTEL_SNB_D_GT2_IDS(&intel_sandybridge_d_gt2_info),
+ INTEL_SNB_M_GT1_IDS(&intel_sandybridge_m_gt1_info),
+ INTEL_SNB_M_GT2_IDS(&intel_sandybridge_m_gt2_info),
INTEL_IVB_Q_IDS(&intel_ivybridge_q_info), /* must be first IVB */
- INTEL_IVB_M_IDS(&intel_ivybridge_m_info),
- INTEL_IVB_D_IDS(&intel_ivybridge_d_info),
- INTEL_HSW_IDS(&intel_haswell_info),
+ INTEL_IVB_M_GT1_IDS(&intel_ivybridge_m_gt1_info),
+ INTEL_IVB_M_GT2_IDS(&intel_ivybridge_m_gt2_info),
+ INTEL_IVB_D_GT1_IDS(&intel_ivybridge_d_gt1_info),
+ INTEL_IVB_D_GT2_IDS(&intel_ivybridge_d_gt2_info),
+ INTEL_HSW_GT1_IDS(&intel_haswell_gt1_info),
+ INTEL_HSW_GT2_IDS(&intel_haswell_gt2_info),
+ INTEL_HSW_GT3_IDS(&intel_haswell_gt3_info),
INTEL_VLV_IDS(&intel_valleyview_info),
- INTEL_BDW_GT12_IDS(&intel_broadwell_info),
+ INTEL_BDW_GT1_IDS(&intel_broadwell_gt1_info),
+ INTEL_BDW_GT2_IDS(&intel_broadwell_gt2_info),
INTEL_BDW_GT3_IDS(&intel_broadwell_gt3_info),
- INTEL_BDW_RSVD_IDS(&intel_broadwell_info),
+ INTEL_BDW_RSVD_IDS(&intel_broadwell_rsvd_info),
INTEL_CHV_IDS(&intel_cherryview_info),
- INTEL_SKL_GT1_IDS(&intel_skylake_info),
- INTEL_SKL_GT2_IDS(&intel_skylake_info),
+ INTEL_SKL_GT1_IDS(&intel_skylake_gt1_info),
+ INTEL_SKL_GT2_IDS(&intel_skylake_gt2_info),
INTEL_SKL_GT3_IDS(&intel_skylake_gt3_info),
- INTEL_SKL_GT4_IDS(&intel_skylake_gt3_info),
+ INTEL_SKL_GT4_IDS(&intel_skylake_gt4_info),
INTEL_BXT_IDS(&intel_broxton_info),
INTEL_GLK_IDS(&intel_geminilake_info),
- INTEL_KBL_GT1_IDS(&intel_kabylake_info),
- INTEL_KBL_GT2_IDS(&intel_kabylake_info),
+ INTEL_KBL_GT1_IDS(&intel_kabylake_gt1_info),
+ INTEL_KBL_GT2_IDS(&intel_kabylake_gt2_info),
INTEL_KBL_GT3_IDS(&intel_kabylake_gt3_info),
INTEL_KBL_GT4_IDS(&intel_kabylake_gt3_info),
- INTEL_CFL_S_IDS(&intel_coffeelake_info),
- INTEL_CFL_H_IDS(&intel_coffeelake_info),
- INTEL_CFL_U_IDS(&intel_coffeelake_gt3_info),
- INTEL_CNL_IDS(&intel_cannonlake_info),
+ INTEL_CFL_S_GT1_IDS(&intel_coffeelake_gt1_info),
+ INTEL_CFL_S_GT2_IDS(&intel_coffeelake_gt2_info),
+ INTEL_CFL_H_GT2_IDS(&intel_coffeelake_gt2_info),
+ INTEL_CFL_U_GT1_IDS(&intel_coffeelake_gt1_info),
+ INTEL_CFL_U_GT2_IDS(&intel_coffeelake_gt2_info),
+ INTEL_CFL_U_GT3_IDS(&intel_coffeelake_gt3_info),
+ INTEL_CNL_U_GT2_IDS(&intel_cannonlake_gt2_info),
+ INTEL_CNL_Y_GT2_IDS(&intel_cannonlake_gt2_info),
{0, 0, 0}
};
MODULE_DEVICE_TABLE(pci, pciidlist);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 5e4529ce3586..1f6ce99f84d7 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2738,9 +2738,6 @@ enum skl_disp_power_wells {
#define ILK_DPFC_CHICKEN _MMIO(0x43224)
#define ILK_DPFC_DISABLE_DUMMY0 (1<<8)
#define ILK_DPFC_NUKE_ON_ANY_MODIFICATION (1<<23)
-#define GLK_SKIP_SEG_EN (1<<12)
-#define GLK_SKIP_SEG_COUNT_MASK (3<<10)
-#define GLK_SKIP_SEG_COUNT(x) ((x)<<10)
#define ILK_FBC_RT_BASE _MMIO(0x2128)
#define ILK_FBC_RT_VALID (1<<0)
#define SNB_FBC_FRONT_BUFFER (1<<1)
@@ -6748,6 +6745,7 @@ enum {
#define RESET_PCH_HANDSHAKE_ENABLE (1<<4)
#define GEN8_CHICKEN_DCPR_1 _MMIO(0x46430)
+#define SKL_SELECT_ALTERNATE_DC_EXIT (1<<30)
#define MASK_WAKEMEM (1<<13)
#define SKL_DFSM _MMIO(0x51000)
@@ -8267,6 +8265,7 @@ enum {
#define BXT_CDCLK_CD2X_DIV_SEL_2 (2<<22)
#define BXT_CDCLK_CD2X_DIV_SEL_4 (3<<22)
#define BXT_CDCLK_CD2X_PIPE(pipe) ((pipe)<<20)
+#define CDCLK_DIVMUX_CD_OVERRIDE (1<<19)
#define BXT_CDCLK_CD2X_PIPE_NONE BXT_CDCLK_CD2X_PIPE(3)
#define BXT_CDCLK_SSA_PRECHARGE_ENABLE (1<<16)
#define CDCLK_FREQ_DECIMAL_MASK (0x7ff)
diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c
index 183e87e8ea31..6feec382813e 100644
--- a/drivers/gpu/drm/i915/intel_bios.c
+++ b/drivers/gpu/drm/i915/intel_bios.c
@@ -1110,6 +1110,22 @@ static void sanitize_aux_ch(struct drm_i915_private *dev_priv,
}
}
+static const u8 cnp_ddc_pin_map[] = {
+ [DDC_BUS_DDI_B] = GMBUS_PIN_1_BXT,
+ [DDC_BUS_DDI_C] = GMBUS_PIN_2_BXT,
+ [DDC_BUS_DDI_D] = GMBUS_PIN_4_CNP, /* sic */
+ [DDC_BUS_DDI_F] = GMBUS_PIN_3_BXT, /* sic */
+};
+
+static u8 map_ddc_pin(struct drm_i915_private *dev_priv, u8 vbt_pin)
+{
+ if (HAS_PCH_CNP(dev_priv) &&
+ vbt_pin > 0 && vbt_pin < ARRAY_SIZE(cnp_ddc_pin_map))
+ return cnp_ddc_pin_map[vbt_pin];
+
+ return vbt_pin;
+}
+
static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port,
const struct bdb_header *bdb)
{
@@ -1188,16 +1204,7 @@ static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port,
DRM_DEBUG_KMS("Port %c is internal DP\n", port_name(port));
if (is_dvi) {
- info->alternate_ddc_pin = ddc_pin;
-
- /*
- * All VBTs that we got so far for B Stepping has this
- * information wrong for Port D. So, let's just ignore for now.
- */
- if (IS_CNL_REVID(dev_priv, CNL_REVID_B0, CNL_REVID_B0) &&
- port == PORT_D) {
- info->alternate_ddc_pin = 0;
- }
+ info->alternate_ddc_pin = map_ddc_pin(dev_priv, ddc_pin);
sanitize_ddc_pin(dev_priv, port);
}
diff --git a/drivers/gpu/drm/i915/intel_cdclk.c b/drivers/gpu/drm/i915/intel_cdclk.c
index 1241e5891b29..26a8dcd2c549 100644
--- a/drivers/gpu/drm/i915/intel_cdclk.c
+++ b/drivers/gpu/drm/i915/intel_cdclk.c
@@ -859,16 +859,10 @@ static void skl_set_preferred_cdclk_vco(struct drm_i915_private *dev_priv,
static void skl_dpll0_enable(struct drm_i915_private *dev_priv, int vco)
{
- int min_cdclk = skl_calc_cdclk(0, vco);
u32 val;
WARN_ON(vco != 8100000 && vco != 8640000);
- /* select the minimum CDCLK before enabling DPLL 0 */
- val = CDCLK_FREQ_337_308 | skl_cdclk_decimal(min_cdclk);
- I915_WRITE(CDCLK_CTL, val);
- POSTING_READ(CDCLK_CTL);
-
/*
* We always enable DPLL0 with the lowest link rate possible, but still
* taking into account the VCO required to operate the eDP panel at the
@@ -922,7 +916,7 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv,
{
int cdclk = cdclk_state->cdclk;
int vco = cdclk_state->vco;
- u32 freq_select, pcu_ack;
+ u32 freq_select, pcu_ack, cdclk_ctl;
int ret;
WARN_ON((cdclk == 24000) != (vco == 0));
@@ -939,7 +933,7 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv,
return;
}
- /* set CDCLK_CTL */
+ /* Choose frequency for this cdclk */
switch (cdclk) {
case 450000:
case 432000:
@@ -967,10 +961,33 @@ static void skl_set_cdclk(struct drm_i915_private *dev_priv,
dev_priv->cdclk.hw.vco != vco)
skl_dpll0_disable(dev_priv);
+ cdclk_ctl = I915_READ(CDCLK_CTL);
+
+ if (dev_priv->cdclk.hw.vco != vco) {
+ /* Wa Display #1183: skl,kbl,cfl */
+ cdclk_ctl &= ~(CDCLK_FREQ_SEL_MASK | CDCLK_FREQ_DECIMAL_MASK);
+ cdclk_ctl |= freq_select | skl_cdclk_decimal(cdclk);
+ I915_WRITE(CDCLK_CTL, cdclk_ctl);
+ }
+
+ /* Wa Display #1183: skl,kbl,cfl */
+ cdclk_ctl |= CDCLK_DIVMUX_CD_OVERRIDE;
+ I915_WRITE(CDCLK_CTL, cdclk_ctl);
+ POSTING_READ(CDCLK_CTL);
+
if (dev_priv->cdclk.hw.vco != vco)
skl_dpll0_enable(dev_priv, vco);
- I915_WRITE(CDCLK_CTL, freq_select | skl_cdclk_decimal(cdclk));
+ /* Wa Display #1183: skl,kbl,cfl */
+ cdclk_ctl &= ~(CDCLK_FREQ_SEL_MASK | CDCLK_FREQ_DECIMAL_MASK);
+ I915_WRITE(CDCLK_CTL, cdclk_ctl);
+
+ cdclk_ctl |= freq_select | skl_cdclk_decimal(cdclk);
+ I915_WRITE(CDCLK_CTL, cdclk_ctl);
+
+ /* Wa Display #1183: skl,kbl,cfl */
+ cdclk_ctl &= ~CDCLK_DIVMUX_CD_OVERRIDE;
+ I915_WRITE(CDCLK_CTL, cdclk_ctl);
POSTING_READ(CDCLK_CTL);
/* inform PCU of the change */
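The Wa Display #1183 sequence above is an open-coded read-modify-write of CDCLK_CTL: CDCLK_DIVMUX_CD_OVERRIDE is asserted before the frequency/decimal fields change and cleared again afterwards. A minimal sketch of one such RMW step, not the driver's helper; I915_READ/I915_WRITE/POSTING_READ are the accessors already used in the hunks above:

static void example_cdclk_ctl_rmw(struct drm_i915_private *dev_priv,
				  u32 clear, u32 set)
{
	u32 val = I915_READ(CDCLK_CTL);

	val &= ~clear;	/* e.g. CDCLK_FREQ_SEL_MASK | CDCLK_FREQ_DECIMAL_MASK */
	val |= set;	/* e.g. freq_select | skl_cdclk_decimal(cdclk) */
	I915_WRITE(CDCLK_CTL, val);
	POSTING_READ(CDCLK_CTL);
}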
diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c
index 19eb888b9f95..96f8be2d2898 100644
--- a/drivers/gpu/drm/i915/intel_ddi.c
+++ b/drivers/gpu/drm/i915/intel_ddi.c
@@ -1907,20 +1907,21 @@ u8 intel_ddi_dp_voltage_max(struct intel_encoder *encoder)
DP_TRAIN_VOLTAGE_SWING_MASK;
}
-static void cnl_ddi_vswing_program(struct drm_i915_private *dev_priv,
- u32 level, enum port port, int type)
+static void cnl_ddi_vswing_program(struct intel_encoder *encoder,
+ int level, enum intel_output_type type)
{
- const struct cnl_ddi_buf_trans *ddi_translations = NULL;
- u32 n_entries, val;
- int ln;
+ struct drm_i915_private *dev_priv = to_i915(encoder->base.dev);
+ enum port port = intel_ddi_get_encoder_port(encoder);
+ const struct cnl_ddi_buf_trans *ddi_translations;
+ int n_entries, ln;
+ u32 val;
- if (type == INTEL_OUTPUT_HDMI) {
+ if (type == INTEL_OUTPUT_HDMI)
ddi_translations = cnl_get_buf_trans_hdmi(dev_priv, &n_entries);
- } else if (type == INTEL_OUTPUT_DP) {
- ddi_translations = cnl_get_buf_trans_dp(dev_priv, &n_entries);
- } else if (type == INTEL_OUTPUT_EDP) {
+ else if (type == INTEL_OUTPUT_EDP)
ddi_translations = cnl_get_buf_trans_edp(dev_priv, &n_entries);
- }
+ else
+ ddi_translations = cnl_get_buf_trans_dp(dev_priv, &n_entries);
if (WARN_ON(ddi_translations == NULL))
return;
@@ -1973,26 +1974,22 @@ static void cnl_ddi_vswing_program(struct drm_i915_private *dev_priv,
I915_WRITE(CNL_PORT_TX_DW7_GRP(port), val);
}
-static void cnl_ddi_vswing_sequence(struct intel_encoder *encoder, u32 level)
+static void cnl_ddi_vswing_sequence(struct intel_encoder *encoder,
+ int level, enum intel_output_type type)
{
struct drm_i915_private *dev_priv = to_i915(encoder->base.dev);
- struct intel_dp *intel_dp = enc_to_intel_dp(&encoder->base);
enum port port = intel_ddi_get_encoder_port(encoder);
- int type = encoder->type;
- int width = 0;
- int rate = 0;
+ int width, rate, ln;
u32 val;
- int ln = 0;
- if ((intel_dp) && (type == INTEL_OUTPUT_EDP || type == INTEL_OUTPUT_DP)) {
- width = intel_dp->lane_count;
- rate = intel_dp->link_rate;
- } else if (type == INTEL_OUTPUT_HDMI) {
+ if (type == INTEL_OUTPUT_HDMI) {
width = 4;
- /* Rate is always < than 6GHz for HDMI */
+ rate = 0; /* Rate is always < than 6GHz for HDMI */
} else {
- MISSING_CASE(type);
- return;
+ struct intel_dp *intel_dp = enc_to_intel_dp(&encoder->base);
+
+ width = intel_dp->lane_count;
+ rate = intel_dp->link_rate;
}
/*
@@ -2001,7 +1998,7 @@ static void cnl_ddi_vswing_sequence(struct intel_encoder *encoder, u32 level)
* else clear to 0b.
*/
val = I915_READ(CNL_PORT_PCS_DW1_LN0(port));
- if (type == INTEL_OUTPUT_EDP || type == INTEL_OUTPUT_DP)
+ if (type != INTEL_OUTPUT_HDMI)
val |= COMMON_KEEPER_EN;
else
val &= ~COMMON_KEEPER_EN;
@@ -2036,7 +2033,7 @@ static void cnl_ddi_vswing_sequence(struct intel_encoder *encoder, u32 level)
I915_WRITE(CNL_PORT_TX_DW5_GRP(port), val);
/* 5. Program swing and de-emphasis */
- cnl_ddi_vswing_program(dev_priv, level, port, type);
+ cnl_ddi_vswing_program(encoder, level, type);
/* 6. Set training enable to trigger update */
val = I915_READ(CNL_PORT_TX_DW5_LN0(port));
@@ -2077,7 +2074,7 @@ u32 bxt_signal_levels(struct intel_dp *intel_dp)
u32 level = intel_ddi_dp_level(intel_dp);
if (IS_CANNONLAKE(dev_priv))
- cnl_ddi_vswing_sequence(encoder, level);
+ cnl_ddi_vswing_sequence(encoder, level, encoder->type);
else
bxt_ddi_vswing_sequence(dev_priv, level, port, encoder->type);
@@ -2165,7 +2162,7 @@ static void intel_ddi_pre_enable_dp(struct intel_encoder *encoder,
intel_display_power_get(dev_priv, dig_port->ddi_io_power_domain);
if (IS_CANNONLAKE(dev_priv))
- cnl_ddi_vswing_sequence(encoder, level);
+ cnl_ddi_vswing_sequence(encoder, level, encoder->type);
else if (IS_GEN9_LP(dev_priv))
bxt_ddi_vswing_sequence(dev_priv, level, port, encoder->type);
else
@@ -2197,7 +2194,7 @@ static void intel_ddi_pre_enable_hdmi(struct intel_encoder *encoder,
intel_display_power_get(dev_priv, dig_port->ddi_io_power_domain);
if (IS_CANNONLAKE(dev_priv))
- cnl_ddi_vswing_sequence(encoder, level);
+ cnl_ddi_vswing_sequence(encoder, level, INTEL_OUTPUT_HDMI);
else if (IS_GEN9_LP(dev_priv))
bxt_ddi_vswing_sequence(dev_priv, level, port,
INTEL_OUTPUT_HDMI);
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 98e570d9c7fd..58af05e9cf12 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -105,7 +105,6 @@ static void bxt_init_clock_gating(struct drm_i915_private *dev_priv)
static void glk_init_clock_gating(struct drm_i915_private *dev_priv)
{
- u32 val;
gen9_init_clock_gating(dev_priv);
/*
@@ -125,11 +124,6 @@ static void glk_init_clock_gating(struct drm_i915_private *dev_priv)
I915_WRITE(CHICKEN_MISC_2, val);
}
- /* Display WA #1133: WaFbcSkipSegments:glk */
- val = I915_READ(ILK_DPFC_CHICKEN);
- val &= ~GLK_SKIP_SEG_COUNT_MASK;
- val |= GLK_SKIP_SEG_EN | GLK_SKIP_SEG_COUNT(1);
- I915_WRITE(ILK_DPFC_CHICKEN, val);
}
static void i915_pineview_get_mem_freq(struct drm_i915_private *dev_priv)
@@ -7524,7 +7518,6 @@ static void cnp_init_clock_gating(struct drm_i915_private *dev_priv)
static void cnl_init_clock_gating(struct drm_i915_private *dev_priv)
{
- u32 val;
cnp_init_clock_gating(dev_priv);
/* This is not an Wa. Enable for better image quality */
@@ -7544,12 +7537,6 @@ static void cnl_init_clock_gating(struct drm_i915_private *dev_priv)
I915_WRITE(SLICE_UNIT_LEVEL_CLKGATE,
I915_READ(SLICE_UNIT_LEVEL_CLKGATE) |
SARBUNIT_CLKGATE_DIS);
-
- /* Display WA #1133: WaFbcSkipSegments:cnl */
- val = I915_READ(ILK_DPFC_CHICKEN);
- val &= ~GLK_SKIP_SEG_COUNT_MASK;
- val |= GLK_SKIP_SEG_EN | GLK_SKIP_SEG_COUNT(1);
- I915_WRITE(ILK_DPFC_CHICKEN, val);
}
static void cfl_init_clock_gating(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c
index 2fb4d2ffad57..dadc189062b1 100644
--- a/drivers/gpu/drm/i915/intel_runtime_pm.c
+++ b/drivers/gpu/drm/i915/intel_runtime_pm.c
@@ -712,6 +712,11 @@ void gen9_enable_dc5(struct drm_i915_private *dev_priv)
DRM_DEBUG_KMS("Enabling DC5\n");
+ /* Wa Display #1183: skl,kbl,cfl */
+ if (IS_GEN9_BC(dev_priv))
+ I915_WRITE(GEN8_CHICKEN_DCPR_1, I915_READ(GEN8_CHICKEN_DCPR_1) |
+ SKL_SELECT_ALTERNATE_DC_EXIT);
+
gen9_set_dc_state(dev_priv, DC_STATE_EN_UPTO_DC5);
}
@@ -739,6 +744,11 @@ void skl_disable_dc6(struct drm_i915_private *dev_priv)
{
DRM_DEBUG_KMS("Disabling DC6\n");
+ /* Wa Display #1183: skl,kbl,cfl */
+ if (IS_GEN9_BC(dev_priv))
+ I915_WRITE(GEN8_CHICKEN_DCPR_1, I915_READ(GEN8_CHICKEN_DCPR_1) |
+ SKL_SELECT_ALTERNATE_DC_EXIT);
+
gen9_set_dc_state(dev_priv, DC_STATE_DISABLE);
}
diff --git a/drivers/gpu/drm/i915/intel_vbt_defs.h b/drivers/gpu/drm/i915/intel_vbt_defs.h
index a92e7762f596..907b00dec1ee 100644
--- a/drivers/gpu/drm/i915/intel_vbt_defs.h
+++ b/drivers/gpu/drm/i915/intel_vbt_defs.h
@@ -230,6 +230,14 @@ struct bdb_general_features {
#define DEVICE_PORT_DVOB 0x01
#define DEVICE_PORT_DVOC 0x02
+/* DDC Bus DDI Type 155+ */
+enum vbt_gmbus_ddi {
+ DDC_BUS_DDI_B = 0x1,
+ DDC_BUS_DDI_C,
+ DDC_BUS_DDI_D,
+ DDC_BUS_DDI_F,
+};
+
/*
* We used to keep this struct but without any version control. We should avoid
* using it in the future, but it should be safe to keep using it in the old
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 4f3faf513ba3..610223f0e945 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1717,7 +1717,7 @@ static int __init hv_acpi_init(void)
{
int ret, t;
- if (x86_hyper != &x86_hyper_ms_hyperv)
+ if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return -ENODEV;
init_completion(&probe_event);
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 8a8aef721a64..d65e37b48c1f 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -1357,21 +1357,21 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp,
spin_unlock_irqrestore(&rchp->lock, flag);
if (schp == rchp) {
- if (t4_clear_cq_armed(&rchp->cq) &&
- (rq_flushed || sq_flushed)) {
+ if ((rq_flushed || sq_flushed) &&
+ t4_clear_cq_armed(&rchp->cq)) {
spin_lock_irqsave(&rchp->comp_handler_lock, flag);
(*rchp->ibcq.comp_handler)(&rchp->ibcq,
rchp->ibcq.cq_context);
spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
}
} else {
- if (t4_clear_cq_armed(&rchp->cq) && rq_flushed) {
+ if (rq_flushed && t4_clear_cq_armed(&rchp->cq)) {
spin_lock_irqsave(&rchp->comp_handler_lock, flag);
(*rchp->ibcq.comp_handler)(&rchp->ibcq,
rchp->ibcq.cq_context);
spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
}
- if (t4_clear_cq_armed(&schp->cq) && sq_flushed) {
+ if (sq_flushed && t4_clear_cq_armed(&schp->cq)) {
spin_lock_irqsave(&schp->comp_handler_lock, flag);
(*schp->ibcq.comp_handler)(&schp->ibcq,
schp->ibcq.cq_context);
diff --git a/drivers/input/mouse/vmmouse.c b/drivers/input/mouse/vmmouse.c
index 0f586780ceb4..1ae5c1ef3f5b 100644
--- a/drivers/input/mouse/vmmouse.c
+++ b/drivers/input/mouse/vmmouse.c
@@ -316,11 +316,9 @@ static int vmmouse_enable(struct psmouse *psmouse)
/*
* Array of supported hypervisors.
*/
-static const struct hypervisor_x86 *vmmouse_supported_hypervisors[] = {
- &x86_hyper_vmware,
-#ifdef CONFIG_KVM_GUEST
- &x86_hyper_kvm,
-#endif
+static enum x86_hypervisor_type vmmouse_supported_hypervisors[] = {
+ X86_HYPER_VMWARE,
+ X86_HYPER_KVM,
};
/**
@@ -331,7 +329,7 @@ static bool vmmouse_check_hypervisor(void)
int i;
for (i = 0; i < ARRAY_SIZE(vmmouse_supported_hypervisors); i++)
- if (vmmouse_supported_hypervisors[i] == x86_hyper)
+ if (vmmouse_supported_hypervisors[i] == x86_hyper_type)
return true;
return false;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index d112569de84a..f8c8f27148db 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1503,21 +1503,6 @@ static void activate_path_work(struct work_struct *work)
activate_or_offline_path(pgpath);
}
-static int noretry_error(blk_status_t error)
-{
- switch (error) {
- case BLK_STS_NOTSUPP:
- case BLK_STS_NOSPC:
- case BLK_STS_TARGET:
- case BLK_STS_NEXUS:
- case BLK_STS_MEDIUM:
- return 1;
- }
-
- /* Anything else could be a path failure, so should be retried */
- return 0;
-}
-
static int multipath_end_io(struct dm_target *ti, struct request *clone,
blk_status_t error, union map_info *map_context)
{
@@ -1536,7 +1521,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
* request into dm core, which will remake a clone request and
* clone bios for it and resubmit it later.
*/
- if (error && !noretry_error(error)) {
+ if (error && blk_path_error(error)) {
struct multipath *m = ti->private;
r = DM_ENDIO_REQUEUE;
@@ -1572,7 +1557,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
unsigned long flags;
int r = DM_ENDIO_DONE;
- if (!*error || noretry_error(*error))
+ if (!*error || !blk_path_error(*error))
goto done;
if (pgpath)
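Both the request-based and the bio-based completion paths above now delegate the "is this worth retrying on another path?" decision to blk_path_error(), whose declaration appears in the blk_types.h hunk at the end of this patch. A minimal sketch of the resulting completion-handler pattern; example_requeue_on_other_path() is a hypothetical helper, the other calls are existing block-layer API:

static void example_end_io(struct request *rq, blk_status_t error)
{
	if (error && blk_path_error(error))
		example_requeue_on_other_path(rq);	/* hypothetical: retry elsewhere */
	else
		blk_mq_end_request(rq, error);		/* terminal status, complete it */
}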
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 1e688bfec567..9047c0a529b2 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -1271,7 +1271,7 @@ static int __init vmballoon_init(void)
* Check if we are running on VMware's hypervisor and bail out
* if we are not.
*/
- if (x86_hyper != &x86_hyper_vmware)
+ if (x86_hyper_type != X86_HYPER_VMWARE)
return -ENODEV;
for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
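The vmbus_drv.c, vmmouse.c and vmw_balloon.c hunks above all switch from comparing the old x86_hyper ops pointer to comparing the exported x86_hyper_type enum. A minimal sketch of the detection pattern, assuming only the identifiers visible in those hunks:

static bool example_on_vmware_or_kvm(void)
{
	/* Guest-side hypervisor check via the enum, not the ops pointer. */
	return x86_hyper_type == X86_HYPER_VMWARE ||
	       x86_hyper_type == X86_HYPER_KVM;
}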
diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h
index ba032ac9ae86..6a9527004cb1 100644
--- a/drivers/net/ethernet/cisco/enic/enic.h
+++ b/drivers/net/ethernet/cisco/enic/enic.h
@@ -33,7 +33,7 @@
#define DRV_NAME "enic"
#define DRV_DESCRIPTION "Cisco VIC Ethernet NIC Driver"
-#define DRV_VERSION "2.3.0.42"
+#define DRV_VERSION "2.3.0.45"
#define DRV_COPYRIGHT "Copyright 2008-2013 Cisco Systems, Inc"
#define ENIC_BARS_MAX 6
diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
index fd3980cc1e34..462d0ce51240 100644
--- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c
+++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
@@ -176,6 +176,81 @@ static void enic_get_strings(struct net_device *netdev, u32 stringset,
}
}
+static void enic_get_ringparam(struct net_device *netdev,
+ struct ethtool_ringparam *ring)
+{
+ struct enic *enic = netdev_priv(netdev);
+ struct vnic_enet_config *c = &enic->config;
+
+ ring->rx_max_pending = ENIC_MAX_RQ_DESCS;
+ ring->rx_pending = c->rq_desc_count;
+ ring->tx_max_pending = ENIC_MAX_WQ_DESCS;
+ ring->tx_pending = c->wq_desc_count;
+}
+
+static int enic_set_ringparam(struct net_device *netdev,
+ struct ethtool_ringparam *ring)
+{
+ struct enic *enic = netdev_priv(netdev);
+ struct vnic_enet_config *c = &enic->config;
+ int running = netif_running(netdev);
+ unsigned int rx_pending;
+ unsigned int tx_pending;
+ int err = 0;
+
+ if (ring->rx_mini_max_pending || ring->rx_mini_pending) {
+ netdev_info(netdev,
+ "modifying mini ring params is not supported");
+ return -EINVAL;
+ }
+ if (ring->rx_jumbo_max_pending || ring->rx_jumbo_pending) {
+ netdev_info(netdev,
+ "modifying jumbo ring params is not supported");
+ return -EINVAL;
+ }
+ rx_pending = c->rq_desc_count;
+ tx_pending = c->wq_desc_count;
+ if (ring->rx_pending > ENIC_MAX_RQ_DESCS ||
+ ring->rx_pending < ENIC_MIN_RQ_DESCS) {
+ netdev_info(netdev, "rx pending (%u) not in range [%u,%u]",
+ ring->rx_pending, ENIC_MIN_RQ_DESCS,
+ ENIC_MAX_RQ_DESCS);
+ return -EINVAL;
+ }
+ if (ring->tx_pending > ENIC_MAX_WQ_DESCS ||
+ ring->tx_pending < ENIC_MIN_WQ_DESCS) {
+ netdev_info(netdev, "tx pending (%u) not in range [%u,%u]",
+ ring->tx_pending, ENIC_MIN_WQ_DESCS,
+ ENIC_MAX_WQ_DESCS);
+ return -EINVAL;
+ }
+ if (running)
+ dev_close(netdev);
+ c->rq_desc_count =
+ ring->rx_pending & 0xffffffe0; /* must be aligned to groups of 32 */
+ c->wq_desc_count =
+ ring->tx_pending & 0xffffffe0; /* must be aligned to groups of 32 */
+ enic_free_vnic_resources(enic);
+ err = enic_alloc_vnic_resources(enic);
+ if (err) {
+ netdev_err(netdev,
+ "Failed to alloc vNIC resources, aborting\n");
+ enic_free_vnic_resources(enic);
+ goto err_out;
+ }
+ enic_init_vnic_resources(enic);
+ if (running) {
+ err = dev_open(netdev);
+ if (err)
+ goto err_out;
+ }
+ return 0;
+err_out:
+ c->rq_desc_count = rx_pending;
+ c->wq_desc_count = tx_pending;
+ return err;
+}
+
static int enic_get_sset_count(struct net_device *netdev, int sset)
{
switch (sset) {
@@ -509,6 +584,8 @@ static const struct ethtool_ops enic_ethtool_ops = {
.set_msglevel = enic_set_msglevel,
.get_link = ethtool_op_get_link,
.get_strings = enic_get_strings,
+ .get_ringparam = enic_get_ringparam,
+ .set_ringparam = enic_set_ringparam,
.get_sset_count = enic_get_sset_count,
.get_ethtool_stats = enic_get_ethtool_stats,
.get_coalesce = enic_get_coalesce,
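enic_set_ringparam() above rejects out-of-range descriptor counts and then rounds the accepted values down to a multiple of 32 with the 0xffffffe0 mask before re-allocating the vNIC resources; user space would typically reach this path through the standard ethtool ring interface (ethtool -G). A tiny sketch of just the alignment step, with a hypothetical helper name:

static unsigned int example_ring_round_down_32(unsigned int requested)
{
	/* Same masking as in the hunk above: keep whole groups of 32. */
	return requested & 0xffffffe0;
}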
diff --git a/drivers/net/ethernet/cisco/enic/vnic_rq.c b/drivers/net/ethernet/cisco/enic/vnic_rq.c
index 36bc2c71fba9..f8aa326d1d58 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_rq.c
+++ b/drivers/net/ethernet/cisco/enic/vnic_rq.c
@@ -139,20 +139,8 @@ void vnic_rq_init(struct vnic_rq *rq, unsigned int cq_index,
unsigned int error_interrupt_enable,
unsigned int error_interrupt_offset)
{
- u32 fetch_index = 0;
-
- /* Use current fetch_index as the ring starting point */
- fetch_index = ioread32(&rq->ctrl->fetch_index);
-
- if (fetch_index == 0xFFFFFFFF) { /* check for hardware gone */
- /* Hardware surprise removal: reset fetch_index */
- fetch_index = 0;
- }
-
- vnic_rq_init_start(rq, cq_index,
- fetch_index, fetch_index,
- error_interrupt_enable,
- error_interrupt_offset);
+ vnic_rq_init_start(rq, cq_index, 0, 0, error_interrupt_enable,
+ error_interrupt_offset);
}
unsigned int vnic_rq_error_status(struct vnic_rq *rq)
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index b676fa9d4714..736df59c16f5 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -2454,6 +2454,12 @@ static irqreturn_t ibmvnic_interrupt_rx(int irq, void *instance)
struct ibmvnic_sub_crq_queue *scrq = instance;
struct ibmvnic_adapter *adapter = scrq->adapter;
+ /* When booting a kdump kernel we can hit pending interrupts
+ * prior to completing driver initialization.
+ */
+ if (unlikely(adapter->state != VNIC_OPEN))
+ return IRQ_NONE;
+
adapter->rx_stats_buffers[scrq->scrq_num].interrupts++;
if (napi_schedule_prep(&adapter->napi[scrq->scrq_num])) {
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 349e6d2b0d9f..54f13f34e18b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -157,13 +157,20 @@ static blk_status_t nvme_error_status(struct request *req)
return BLK_STS_OK;
case NVME_SC_CAP_EXCEEDED:
return BLK_STS_NOSPC;
+ case NVME_SC_LBA_RANGE:
+ return BLK_STS_TARGET;
+ case NVME_SC_BAD_ATTRIBUTES:
case NVME_SC_ONCS_NOT_SUPPORTED:
+ case NVME_SC_INVALID_OPCODE:
+ case NVME_SC_INVALID_FIELD:
+ case NVME_SC_INVALID_NS:
return BLK_STS_NOTSUPP;
case NVME_SC_WRITE_FAULT:
case NVME_SC_READ_ERROR:
case NVME_SC_UNWRITTEN_BLOCK:
case NVME_SC_ACCESS_DENIED:
case NVME_SC_READ_ONLY:
+ case NVME_SC_COMPARE_FAILED:
return BLK_STS_MEDIUM;
case NVME_SC_GUARD_CHECK:
case NVME_SC_APPTAG_CHECK:
@@ -190,8 +197,10 @@ static inline bool nvme_req_needs_retry(struct request *req)
void nvme_complete_rq(struct request *req)
{
- if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
- if (nvme_req_needs_failover(req)) {
+ blk_status_t status = nvme_error_status(req);
+
+ if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
+ if (nvme_req_needs_failover(req, status)) {
nvme_failover_req(req);
return;
}
@@ -202,8 +211,7 @@ void nvme_complete_rq(struct request *req)
return;
}
}
-
- blk_mq_end_request(req, nvme_error_status(req));
+ blk_mq_end_request(req, status);
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
@@ -1449,19 +1457,19 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
int srcu_idx, ret;
u8 data[16] = { 0, };
+ ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+ if (unlikely(!ns))
+ return -EWOULDBLOCK;
+
put_unaligned_le64(key, &data[0]);
put_unaligned_le64(sa_key, &data[8]);
memset(&c, 0, sizeof(c));
c.common.opcode = op;
- c.common.nsid = cpu_to_le32(head->ns_id);
+ c.common.nsid = cpu_to_le32(ns->head->ns_id);
c.common.cdw10[0] = cpu_to_le32(cdw10);
- ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
- if (unlikely(!ns))
- ret = -EWOULDBLOCK;
- else
- ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
+ ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
nvme_put_ns_from_disk(head, srcu_idx);
return ret;
}
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 794e66e4aa20..aa916a475aa7 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2921,6 +2921,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
nvme_fc_free_queue(&ctrl->queues[0]);
+ /* re-enable the admin_q so anything new can fast fail */
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+
nvme_fc_ctlr_inactive_on_rport(ctrl);
}
@@ -2935,6 +2938,9 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
* waiting for io to terminate
*/
nvme_fc_delete_association(ctrl);
+
+ /* resume the io queues so that things will fast fail */
+ nvme_start_queues(nctrl);
}
static void
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 78d92151a904..933c9c0243ea 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -33,51 +33,11 @@ void nvme_failover_req(struct request *req)
kblockd_schedule_work(&ns->head->requeue_work);
}
-bool nvme_req_needs_failover(struct request *req)
+bool nvme_req_needs_failover(struct request *req, blk_status_t error)
{
if (!(req->cmd_flags & REQ_NVME_MPATH))
return false;
-
- switch (nvme_req(req)->status & 0x7ff) {
- /*
- * Generic command status:
- */
- case NVME_SC_INVALID_OPCODE:
- case NVME_SC_INVALID_FIELD:
- case NVME_SC_INVALID_NS:
- case NVME_SC_LBA_RANGE:
- case NVME_SC_CAP_EXCEEDED:
- case NVME_SC_RESERVATION_CONFLICT:
- return false;
-
- /*
- * I/O command set specific error. Unfortunately these values are
- * reused for fabrics commands, but those should never get here.
- */
- case NVME_SC_BAD_ATTRIBUTES:
- case NVME_SC_INVALID_PI:
- case NVME_SC_READ_ONLY:
- case NVME_SC_ONCS_NOT_SUPPORTED:
- WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode ==
- nvme_fabrics_command);
- return false;
-
- /*
- * Media and Data Integrity Errors:
- */
- case NVME_SC_WRITE_FAULT:
- case NVME_SC_READ_ERROR:
- case NVME_SC_GUARD_CHECK:
- case NVME_SC_APPTAG_CHECK:
- case NVME_SC_REFTAG_CHECK:
- case NVME_SC_COMPARE_FAILED:
- case NVME_SC_ACCESS_DENIED:
- case NVME_SC_UNWRITTEN_BLOCK:
- return false;
- }
-
- /* Everything else could be a path failure, so should be retried */
- return true;
+ return blk_path_error(error);
}
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 527f52b9cfe1..9b68aa09c7cd 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -400,7 +400,7 @@ extern const struct block_device_operations nvme_ns_head_ops;
#ifdef CONFIG_NVME_MULTIPATH
void nvme_failover_req(struct request *req);
-bool nvme_req_needs_failover(struct request *req);
+bool nvme_req_needs_failover(struct request *req, blk_status_t error);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_disk(struct nvme_ns_head *head);
@@ -429,7 +429,8 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
static inline void nvme_failover_req(struct request *req)
{
}
-static inline bool nvme_req_needs_failover(struct request *req)
+static inline bool nvme_req_needs_failover(struct request *req,
+ blk_status_t error)
{
return false;
}
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index d34b3d24df93..36dee176f8e2 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -1580,7 +1580,7 @@ static void qeth_l3_free_vlan_addresses4(struct qeth_card *card,
addr = qeth_l3_get_addr_buffer(QETH_PROT_IPV4);
if (!addr)
- return;
+ goto out;
spin_lock_bh(&card->ip_lock);
@@ -1594,6 +1594,7 @@ static void qeth_l3_free_vlan_addresses4(struct qeth_card *card,
spin_unlock_bh(&card->ip_lock);
kfree(addr);
+out:
in_dev_put(in_dev);
}
@@ -1618,7 +1619,7 @@ static void qeth_l3_free_vlan_addresses6(struct qeth_card *card,
addr = qeth_l3_get_addr_buffer(QETH_PROT_IPV6);
if (!addr)
- return;
+ goto out;
spin_lock_bh(&card->ip_lock);
@@ -1633,6 +1634,7 @@ static void qeth_l3_free_vlan_addresses6(struct qeth_card *card,
spin_unlock_bh(&card->ip_lock);
kfree(addr);
+out:
in6_dev_put(in6_dev);
#endif /* CONFIG_QETH_IPV6 */
}
diff --git a/fs/aio.c b/fs/aio.c
index 34027b67e2f4..e908a30a1c8a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -373,6 +373,14 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
pgoff_t idx;
int rc;
+ /*
+ * We cannot support the _NO_COPY case here, because copy needs to
+ * happen under the ctx->completion_lock. That does not work with the
+ * migration workflow of MIGRATE_SYNC_NO_COPY.
+ */
+ if (mode == MIGRATE_SYNC_NO_COPY)
+ return -EINVAL;
+
rc = 0;
/* mapping->private_lock here protects against the kioctx teardown. */
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 6a478c0de2f1..5c215c1d15fb 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2186,7 +2186,10 @@ int f2fs_migrate_page(struct address_space *mapping,
SetPagePrivate(newpage);
set_page_private(newpage, page_private(page));
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e5d05ccba8b0..34a7375b702d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -846,7 +846,10 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
rc = migrate_huge_page_move_mapping(mapping, newpage, page);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 403b8bfa9c41..882d62b5ab94 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -539,6 +539,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
}
} else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+ else if (is_device_private_entry(swpent))
+ page = device_private_entry_to_page(swpent);
} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
&& pte_none(*pte))) {
page = find_get_entry(vma->vm_file->f_mapping,
@@ -702,6 +704,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+ else if (is_device_private_entry(swpent))
+ page = device_private_entry_to_page(swpent);
}
if (page) {
int mapcount = page_mapcount(page);
@@ -1183,7 +1187,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
if (pm->show_pfn)
frame = pte_pfn(pte);
flags |= PM_PRESENT;
- page = vm_normal_page(vma, addr, pte);
+ page = _vm_normal_page(vma, addr, pte, true);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
@@ -1196,6 +1200,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
+
+ if (is_device_private_entry(entry))
+ page = device_private_entry_to_page(entry);
}
if (page && !PageAnon(page))
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 3a6bdbb1cd27..ddca0990476b 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1482,7 +1482,10 @@ static int ubifs_migrate_page(struct address_space *mapping,
SetPagePrivate(newpage);
}
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
#endif
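The f2fs, hugetlbfs and ubifs hunks above all gain the same branch: for MIGRATE_SYNC_NO_COPY the page contents are not copied here (the caller is expected to perform the copy itself, e.g. for device memory), so only the page state is carried over. A minimal sketch of that shared step, factored into a hypothetical helper:

static void example_migrate_copy_or_states(struct page *newpage,
					   struct page *page,
					   enum migrate_mode mode)
{
	if (mode != MIGRATE_SYNC_NO_COPY)
		migrate_page_copy(newpage, page);	/* copy data + state */
	else
		migrate_page_states(newpage, page);	/* state only */
}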
diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h
index 34c8f5600ce0..5db0458dd832 100644
--- a/include/drm/i915_pciids.h
+++ b/include/drm/i915_pciids.h
@@ -118,92 +118,125 @@
#define INTEL_IRONLAKE_M_IDS(info) \
INTEL_VGA_DEVICE(0x0046, info)
-#define INTEL_SNB_D_IDS(info) \
+#define INTEL_SNB_D_GT1_IDS(info) \
INTEL_VGA_DEVICE(0x0102, info), \
- INTEL_VGA_DEVICE(0x0112, info), \
- INTEL_VGA_DEVICE(0x0122, info), \
INTEL_VGA_DEVICE(0x010A, info)
-#define INTEL_SNB_M_IDS(info) \
- INTEL_VGA_DEVICE(0x0106, info), \
+#define INTEL_SNB_D_GT2_IDS(info) \
+ INTEL_VGA_DEVICE(0x0112, info), \
+ INTEL_VGA_DEVICE(0x0122, info)
+
+#define INTEL_SNB_D_IDS(info) \
+ INTEL_SNB_D_GT1_IDS(info), \
+ INTEL_SNB_D_GT2_IDS(info)
+
+#define INTEL_SNB_M_GT1_IDS(info) \
+ INTEL_VGA_DEVICE(0x0106, info)
+
+#define INTEL_SNB_M_GT2_IDS(info) \
INTEL_VGA_DEVICE(0x0116, info), \
INTEL_VGA_DEVICE(0x0126, info)
+#define INTEL_SNB_M_IDS(info) \
+ INTEL_SNB_M_GT1_IDS(info), \
+ INTEL_SNB_M_GT2_IDS(info)
+
+#define INTEL_IVB_M_GT1_IDS(info) \
+ INTEL_VGA_DEVICE(0x0156, info) /* GT1 mobile */
+
+#define INTEL_IVB_M_GT2_IDS(info) \
+ INTEL_VGA_DEVICE(0x0166, info) /* GT2 mobile */
+
#define INTEL_IVB_M_IDS(info) \
- INTEL_VGA_DEVICE(0x0156, info), /* GT1 mobile */ \
- INTEL_VGA_DEVICE(0x0166, info) /* GT2 mobile */
+ INTEL_IVB_M_GT1_IDS(info), \
+ INTEL_IVB_M_GT2_IDS(info)
-#define INTEL_IVB_D_IDS(info) \
+#define INTEL_IVB_D_GT1_IDS(info) \
INTEL_VGA_DEVICE(0x0152, info), /* GT1 desktop */ \
+ INTEL_VGA_DEVICE(0x015a, info) /* GT1 server */
+
+#define INTEL_IVB_D_GT2_IDS(info) \
INTEL_VGA_DEVICE(0x0162, info), /* GT2 desktop */ \
- INTEL_VGA_DEVICE(0x015a, info), /* GT1 server */ \
INTEL_VGA_DEVICE(0x016a, info) /* GT2 server */
+#define INTEL_IVB_D_IDS(info) \
+ INTEL_IVB_D_GT1_IDS(info), \
+ INTEL_IVB_D_GT2_IDS(info)
+
#define INTEL_IVB_Q_IDS(info) \
INTEL_QUANTA_VGA_DEVICE(info) /* Quanta transcode */
-#define INTEL_HSW_IDS(info) \
+#define INTEL_HSW_GT1_IDS(info) \
INTEL_VGA_DEVICE(0x0402, info), /* GT1 desktop */ \
- INTEL_VGA_DEVICE(0x0412, info), /* GT2 desktop */ \
- INTEL_VGA_DEVICE(0x0422, info), /* GT3 desktop */ \
INTEL_VGA_DEVICE(0x040a, info), /* GT1 server */ \
- INTEL_VGA_DEVICE(0x041a, info), /* GT2 server */ \
- INTEL_VGA_DEVICE(0x042a, info), /* GT3 server */ \
INTEL_VGA_DEVICE(0x040B, info), /* GT1 reserved */ \
- INTEL_VGA_DEVICE(0x041B, info), /* GT2 reserved */ \
- INTEL_VGA_DEVICE(0x042B, info), /* GT3 reserved */ \
INTEL_VGA_DEVICE(0x040E, info), /* GT1 reserved */ \
- INTEL_VGA_DEVICE(0x041E, info), /* GT2 reserved */ \
- INTEL_VGA_DEVICE(0x042E, info), /* GT3 reserved */ \
INTEL_VGA_DEVICE(0x0C02, info), /* SDV GT1 desktop */ \
- INTEL_VGA_DEVICE(0x0C12, info), /* SDV GT2 desktop */ \
- INTEL_VGA_DEVICE(0x0C22, info), /* SDV GT3 desktop */ \
INTEL_VGA_DEVICE(0x0C0A, info), /* SDV GT1 server */ \
- INTEL_VGA_DEVICE(0x0C1A, info), /* SDV GT2 server */ \
- INTEL_VGA_DEVICE(0x0C2A, info), /* SDV GT3 server */ \
INTEL_VGA_DEVICE(0x0C0B, info), /* SDV GT1 reserved */ \
- INTEL_VGA_DEVICE(0x0C1B, info), /* SDV GT2 reserved */ \
- INTEL_VGA_DEVICE(0x0C2B, info), /* SDV GT3 reserved */ \
INTEL_VGA_DEVICE(0x0C0E, info), /* SDV GT1 reserved */ \
- INTEL_VGA_DEVICE(0x0C1E, info), /* SDV GT2 reserved */ \
- INTEL_VGA_DEVICE(0x0C2E, info), /* SDV GT3 reserved */ \
INTEL_VGA_DEVICE(0x0A02, info), /* ULT GT1 desktop */ \
- INTEL_VGA_DEVICE(0x0A12, info), /* ULT GT2 desktop */ \
- INTEL_VGA_DEVICE(0x0A22, info), /* ULT GT3 desktop */ \
INTEL_VGA_DEVICE(0x0A0A, info), /* ULT GT1 server */ \
- INTEL_VGA_DEVICE(0x0A1A, info), /* ULT GT2 server */ \
- INTEL_VGA_DEVICE(0x0A2A, info), /* ULT GT3 server */ \
INTEL_VGA_DEVICE(0x0A0B, info), /* ULT GT1 reserved */ \
- INTEL_VGA_DEVICE(0x0A1B, info), /* ULT GT2 reserved */ \
- INTEL_VGA_DEVICE(0x0A2B, info), /* ULT GT3 reserved */ \
INTEL_VGA_DEVICE(0x0D02, info), /* CRW GT1 desktop */ \
- INTEL_VGA_DEVICE(0x0D12, info), /* CRW GT2 desktop */ \
- INTEL_VGA_DEVICE(0x0D22, info), /* CRW GT3 desktop */ \
INTEL_VGA_DEVICE(0x0D0A, info), /* CRW GT1 server */ \
- INTEL_VGA_DEVICE(0x0D1A, info), /* CRW GT2 server */ \
- INTEL_VGA_DEVICE(0x0D2A, info), /* CRW GT3 server */ \
INTEL_VGA_DEVICE(0x0D0B, info), /* CRW GT1 reserved */ \
- INTEL_VGA_DEVICE(0x0D1B, info), /* CRW GT2 reserved */ \
- INTEL_VGA_DEVICE(0x0D2B, info), /* CRW GT3 reserved */ \
INTEL_VGA_DEVICE(0x0D0E, info), /* CRW GT1 reserved */ \
- INTEL_VGA_DEVICE(0x0D1E, info), /* CRW GT2 reserved */ \
- INTEL_VGA_DEVICE(0x0D2E, info), /* CRW GT3 reserved */ \
INTEL_VGA_DEVICE(0x0406, info), /* GT1 mobile */ \
+ INTEL_VGA_DEVICE(0x0C06, info), /* SDV GT1 mobile */ \
+ INTEL_VGA_DEVICE(0x0A06, info), /* ULT GT1 mobile */ \
+ INTEL_VGA_DEVICE(0x0A0E, info), /* ULX GT1 mobile */ \
+ INTEL_VGA_DEVICE(0x0D06, info) /* CRW GT1 mobile */
+
+#define INTEL_HSW_GT2_IDS(info) \
+ INTEL_VGA_DEVICE(0x0412, info), /* GT2 desktop */ \
+ INTEL_VGA_DEVICE(0x041a, info), /* GT2 server */ \
+ INTEL_VGA_DEVICE(0x041B, info), /* GT2 reserved */ \
+ INTEL_VGA_DEVICE(0x041E, info), /* GT2 reserved */ \
+ INTEL_VGA_DEVICE(0x0C12, info), /* SDV GT2 desktop */ \
+ INTEL_VGA_DEVICE(0x0C1A, info), /* SDV GT2 server */ \
+ INTEL_VGA_DEVICE(0x0C1B, info), /* SDV GT2 reserved */ \
+ INTEL_VGA_DEVICE(0x0C1E, info), /* SDV GT2 reserved */ \
+ INTEL_VGA_DEVICE(0x0A12, info), /* ULT GT2 desktop */ \
+ INTEL_VGA_DEVICE(0x0A1A, info), /* ULT GT2 server */ \
+ INTEL_VGA_DEVICE(0x0A1B, info), /* ULT GT2 reserved */ \
+ INTEL_VGA_DEVICE(0x0D12, info), /* CRW GT2 desktop */ \
+ INTEL_VGA_DEVICE(0x0D1A, info), /* CRW GT2 server */ \
+ INTEL_VGA_DEVICE(0x0D1B, info), /* CRW GT2 reserved */ \
+ INTEL_VGA_DEVICE(0x0D1E, info), /* CRW GT2 reserved */ \
INTEL_VGA_DEVICE(0x0416, info), /* GT2 mobile */ \
INTEL_VGA_DEVICE(0x0426, info), /* GT2 mobile */ \
- INTEL_VGA_DEVICE(0x0C06, info), /* SDV GT1 mobile */ \
INTEL_VGA_DEVICE(0x0C16, info), /* SDV GT2 mobile */ \
- INTEL_VGA_DEVICE(0x0C26, info), /* SDV GT3 mobile */ \
- INTEL_VGA_DEVICE(0x0A06, info), /* ULT GT1 mobile */ \
INTEL_VGA_DEVICE(0x0A16, info), /* ULT GT2 mobile */ \
- INTEL_VGA_DEVICE(0x0A26, info), /* ULT GT3 mobile */ \
- INTEL_VGA_DEVICE(0x0A0E, info), /* ULX GT1 mobile */ \
INTEL_VGA_DEVICE(0x0A1E, info), /* ULX GT2 mobile */ \
+ INTEL_VGA_DEVICE(0x0D16, info) /* CRW GT2 mobile */
+
+#define INTEL_HSW_GT3_IDS(info) \
+ INTEL_VGA_DEVICE(0x0422, info), /* GT3 desktop */ \
+ INTEL_VGA_DEVICE(0x042a, info), /* GT3 server */ \
+ INTEL_VGA_DEVICE(0x042B, info), /* GT3 reserved */ \
+ INTEL_VGA_DEVICE(0x042E, info), /* GT3 reserved */ \
+ INTEL_VGA_DEVICE(0x0C22, info), /* SDV GT3 desktop */ \
+ INTEL_VGA_DEVICE(0x0C2A, info), /* SDV GT3 server */ \
+ INTEL_VGA_DEVICE(0x0C2B, info), /* SDV GT3 reserved */ \
+ INTEL_VGA_DEVICE(0x0C2E, info), /* SDV GT3 reserved */ \
+ INTEL_VGA_DEVICE(0x0A22, info), /* ULT GT3 desktop */ \
+ INTEL_VGA_DEVICE(0x0A2A, info), /* ULT GT3 server */ \
+ INTEL_VGA_DEVICE(0x0A2B, info), /* ULT GT3 reserved */ \
+ INTEL_VGA_DEVICE(0x0D22, info), /* CRW GT3 desktop */ \
+ INTEL_VGA_DEVICE(0x0D2A, info), /* CRW GT3 server */ \
+ INTEL_VGA_DEVICE(0x0D2B, info), /* CRW GT3 reserved */ \
+ INTEL_VGA_DEVICE(0x0D2E, info), /* CRW GT3 reserved */ \
+ INTEL_VGA_DEVICE(0x0C26, info), /* SDV GT3 mobile */ \
+ INTEL_VGA_DEVICE(0x0A26, info), /* ULT GT3 mobile */ \
INTEL_VGA_DEVICE(0x0A2E, info), /* ULT GT3 reserved */ \
- INTEL_VGA_DEVICE(0x0D06, info), /* CRW GT1 mobile */ \
- INTEL_VGA_DEVICE(0x0D16, info), /* CRW GT2 mobile */ \
INTEL_VGA_DEVICE(0x0D26, info) /* CRW GT3 mobile */
+#define INTEL_HSW_IDS(info) \
+ INTEL_HSW_GT1_IDS(info), \
+ INTEL_HSW_GT2_IDS(info), \
+ INTEL_HSW_GT3_IDS(info)
+
#define INTEL_VLV_IDS(info) \
INTEL_VGA_DEVICE(0x0f30, info), \
INTEL_VGA_DEVICE(0x0f31, info), \
@@ -212,17 +245,19 @@
INTEL_VGA_DEVICE(0x0157, info), \
INTEL_VGA_DEVICE(0x0155, info)
-#define INTEL_BDW_GT12_IDS(info) \
+#define INTEL_BDW_GT1_IDS(info) \
INTEL_VGA_DEVICE(0x1602, info), /* GT1 ULT */ \
INTEL_VGA_DEVICE(0x1606, info), /* GT1 ULT */ \
INTEL_VGA_DEVICE(0x160B, info), /* GT1 Iris */ \
INTEL_VGA_DEVICE(0x160E, info), /* GT1 ULX */ \
- INTEL_VGA_DEVICE(0x1612, info), /* GT2 Halo */ \
+ INTEL_VGA_DEVICE(0x160A, info), /* GT1 Server */ \
+ INTEL_VGA_DEVICE(0x160D, info) /* GT1 Workstation */
+
+#define INTEL_BDW_GT2_IDS(info) \
+ INTEL_VGA_DEVICE(0x1612, info), /* GT2 Halo */ \
INTEL_VGA_DEVICE(0x1616, info), /* GT2 ULT */ \
INTEL_VGA_DEVICE(0x161B, info), /* GT2 ULT */ \
- INTEL_VGA_DEVICE(0x161E, info), /* GT2 ULX */ \
- INTEL_VGA_DEVICE(0x160A, info), /* GT1 Server */ \
- INTEL_VGA_DEVICE(0x160D, info), /* GT1 Workstation */ \
+ INTEL_VGA_DEVICE(0x161E, info), /* GT2 ULX */ \
INTEL_VGA_DEVICE(0x161A, info), /* GT2 Server */ \
INTEL_VGA_DEVICE(0x161D, info) /* GT2 Workstation */
@@ -243,7 +278,8 @@
INTEL_VGA_DEVICE(0x163D, info) /* Workstation */
#define INTEL_BDW_IDS(info) \
- INTEL_BDW_GT12_IDS(info), \
+ INTEL_BDW_GT1_IDS(info), \
+ INTEL_BDW_GT2_IDS(info), \
INTEL_BDW_GT3_IDS(info), \
INTEL_BDW_RSVD_IDS(info)
@@ -303,7 +339,6 @@
#define INTEL_KBL_GT1_IDS(info) \
INTEL_VGA_DEVICE(0x5913, info), /* ULT GT1.5 */ \
INTEL_VGA_DEVICE(0x5915, info), /* ULX GT1.5 */ \
- INTEL_VGA_DEVICE(0x5917, info), /* DT GT1.5 */ \
INTEL_VGA_DEVICE(0x5906, info), /* ULT GT1 */ \
INTEL_VGA_DEVICE(0x590E, info), /* ULX GT1 */ \
INTEL_VGA_DEVICE(0x5902, info), /* DT GT1 */ \
@@ -313,6 +348,7 @@
#define INTEL_KBL_GT2_IDS(info) \
INTEL_VGA_DEVICE(0x5916, info), /* ULT GT2 */ \
+ INTEL_VGA_DEVICE(0x5917, info), /* Mobile GT2 */ \
INTEL_VGA_DEVICE(0x5921, info), /* ULT GT2F */ \
INTEL_VGA_DEVICE(0x591E, info), /* ULX GT2 */ \
INTEL_VGA_DEVICE(0x5912, info), /* DT GT2 */ \
@@ -335,24 +371,48 @@
INTEL_KBL_GT4_IDS(info)
/* CFL S */
-#define INTEL_CFL_S_IDS(info) \
+#define INTEL_CFL_S_GT1_IDS(info) \
INTEL_VGA_DEVICE(0x3E90, info), /* SRV GT1 */ \
INTEL_VGA_DEVICE(0x3E93, info), /* SRV GT1 */ \
+ INTEL_VGA_DEVICE(0x3E99, info) /* SRV GT1 */
+
+#define INTEL_CFL_S_GT2_IDS(info) \
INTEL_VGA_DEVICE(0x3E91, info), /* SRV GT2 */ \
INTEL_VGA_DEVICE(0x3E92, info), /* SRV GT2 */ \
- INTEL_VGA_DEVICE(0x3E96, info) /* SRV GT2 */
+ INTEL_VGA_DEVICE(0x3E96, info), /* SRV GT2 */ \
+ INTEL_VGA_DEVICE(0x3E9A, info) /* SRV GT2 */
/* CFL H */
-#define INTEL_CFL_H_IDS(info) \
+#define INTEL_CFL_H_GT2_IDS(info) \
INTEL_VGA_DEVICE(0x3E9B, info), /* Halo GT2 */ \
INTEL_VGA_DEVICE(0x3E94, info) /* Halo GT2 */
-/* CFL U */
-#define INTEL_CFL_U_IDS(info) \
+/* CFL U GT1 */
+#define INTEL_CFL_U_GT1_IDS(info) \
+ INTEL_VGA_DEVICE(0x3EA1, info), \
+ INTEL_VGA_DEVICE(0x3EA4, info)
+
+/* CFL U GT2 */
+#define INTEL_CFL_U_GT2_IDS(info) \
+ INTEL_VGA_DEVICE(0x3EA0, info), \
+ INTEL_VGA_DEVICE(0x3EA3, info), \
+ INTEL_VGA_DEVICE(0x3EA9, info)
+
+/* CFL U GT3 */
+#define INTEL_CFL_U_GT3_IDS(info) \
+ INTEL_VGA_DEVICE(0x3EA2, info), /* ULT GT3 */ \
+ INTEL_VGA_DEVICE(0x3EA5, info), /* ULT GT3 */ \
INTEL_VGA_DEVICE(0x3EA6, info), /* ULT GT3 */ \
INTEL_VGA_DEVICE(0x3EA7, info), /* ULT GT3 */ \
- INTEL_VGA_DEVICE(0x3EA8, info), /* ULT GT3 */ \
- INTEL_VGA_DEVICE(0x3EA5, info) /* ULT GT3 */
+ INTEL_VGA_DEVICE(0x3EA8, info) /* ULT GT3 */
+
+#define INTEL_CFL_IDS(info) \
+ INTEL_CFL_S_GT1_IDS(info), \
+ INTEL_CFL_S_GT2_IDS(info), \
+ INTEL_CFL_H_GT2_IDS(info), \
+ INTEL_CFL_U_GT1_IDS(info), \
+ INTEL_CFL_U_GT2_IDS(info), \
+ INTEL_CFL_U_GT3_IDS(info)
/* CNL U 2+2 */
#define INTEL_CNL_U_GT2_IDS(info) \
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 7ec2ed097a8a..da01189249db 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -38,6 +38,34 @@ typedef u8 __bitwise blk_status_t;
#define BLK_STS_AGAIN ((__force blk_status_t)12)
+/**
+ * blk_path_error - returns true if error may be path related
+ * @error: status the request was completed with
+ *
+ * Description:
+ * This classifies block error status into non-retryable errors and ones
+ * that may be successful if retried on a failover path.
+ *
+ * Return:
+ * %false - retrying failover path will not help
+ * %true - may succeed if retried
+ */
+static inline bool blk_path_error(blk_status_t error)
+{
+ switch (error) {
+ case BLK_STS_NOTSUPP:
+ case BLK_STS_NOSPC:
+ case BLK_STS_TARGET:
+ case BLK_STS_NEXUS:
+ case BLK_STS_MEDIUM:
+ case BLK_STS_PROTECTION:
+ return false;
+ }
+
+ /* Anything else could be a path failure, so should be retried */
+ return true;
+}
+
struct blk_issue_stat {
u64 stat;
};
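As a usage illustration of the blk_path_error() helper added above, here is a minimal sketch of a multipath-style completion handler. It is a hedged example, not part of this patch: my_mpath_end_io() and my_mpath_requeue() are invented driver names; only blk_path_error() and blk_mq_end_request() are existing kernel APIs.

#include <linux/blk_types.h>
#include <linux/blk-mq.h>

static void my_mpath_requeue(struct request *rq);      /* assumed driver helper */

/* Retry only errors that may be path related; report terminal errors as-is. */
static void my_mpath_end_io(struct request *rq, blk_status_t status)
{
        if (status && blk_path_error(status)) {
                my_mpath_requeue(rq);           /* may succeed on a failover path */
                return;
        }
        blk_mq_end_request(rq, status);         /* not a path error: complete it */
}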
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b196006e3211..59066a08d558 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -47,6 +47,7 @@ struct bpf_map {
u32 max_entries;
u32 map_flags;
u32 pages;
+ bool unpriv_array;
u32 id;
struct user_struct *user;
const struct bpf_map_ops *ops;
@@ -189,6 +190,7 @@ struct bpf_prog_aux {
struct bpf_array {
struct bpf_map map;
u32 elem_size;
+ u32 index_mask;
/* 'ownership' of prog_array is claimed by the first program that
* is going to use this map or by the first program which FD is stored
* in the map to make sure that all callers and callees have the same
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
new file mode 100644
index 000000000000..96e69979f84d
--- /dev/null
+++ b/include/linux/hmm.h
@@ -0,0 +1,520 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse <jglisse@redhat.com>
+ */
+/*
+ * Heterogeneous Memory Management (HMM)
+ *
+ * See Documentation/vm/hmm.txt for reasons and an overview of what HMM is and
+ * what it is for. Here we focus on the HMM API description, with some explanation of
+ * the underlying implementation.
+ *
+ * Short description: HMM provides a set of helpers to share a virtual address
+ * space between CPU and a device, so that the device can access any valid
+ * address of the process (while still obeying memory protection). HMM also
+ * provides helpers to migrate process memory to device memory, and back. Each
+ * set of functionality (address space mirroring, and migration to and from
+ * device memory) can be used independently of the other.
+ *
+ *
+ * HMM address space mirroring API:
+ *
+ * Use HMM address space mirroring if you want to mirror a range of the CPU page
+ * table of a process into a device page table. Here, "mirror" means "keep
+ * synchronized". Prerequisites: the device must provide the ability to write-
+ * protect its page tables (at PAGE_SIZE granularity), and must be able to
+ * recover from the resulting potential page faults.
+ *
+ * HMM guarantees that at any point in time, a given virtual address points to
+ * either the same memory in both CPU and device page tables (that is: CPU and
+ * device page tables each point to the same pages), or that one page table (CPU
+ * or device) points to no entry, while the other still points to the old page
+ * for the address. The latter case happens when the CPU page table update
+ * happens first, and then the update is mirrored over to the device page table.
+ * This does not cause any issue, because the CPU page table cannot start
+ * pointing to a new page until the device page table is invalidated.
+ *
+ * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
+ * updates to each device driver that has registered a mirror. It also provides
+ * some API calls to help with taking a snapshot of the CPU page table, and to
+ * synchronize with any updates that might happen concurrently.
+ *
+ *
+ * HMM migration to and from device memory:
+ *
+ * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
+ * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
+ * of the device memory, and allows the device driver to manage its memory
+ * using those struct pages. Having struct pages for device memory makes
+ * migration easier. Because that memory is not addressable by the CPU it must
+ * never be pinned to the device; in other words, any CPU page fault can always
+ * cause the device memory to be migrated (copied/moved) back to regular memory.
+ *
+ * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that
+ * allows use of a device DMA engine to perform the copy operation between
+ * regular system memory and device memory.
+ */
+#ifndef LINUX_HMM_H
+#define LINUX_HMM_H
+
+#include <linux/kconfig.h>
+
+#if IS_ENABLED(CONFIG_HMM)
+
+#include <linux/device.h>
+#include <linux/migrate.h>
+#include <linux/memremap.h>
+#include <linux/completion.h>
+
+struct hmm;
+
+/*
+ * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page
+ *
+ * Flags:
+ * HMM_PFN_VALID: pfn is valid
+ * HMM_PFN_READ: CPU page table has read permission set
+ * HMM_PFN_WRITE: CPU page table has write permission set
+ * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
+ * HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none()
+ * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
+ * result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not
+ * be mirrored by a device, because the entry will never have HMM_PFN_VALID
+ * set and the pfn value is undefined.
+ * HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE)
+ */
+typedef unsigned long hmm_pfn_t;
+
+#define HMM_PFN_VALID (1 << 0)
+#define HMM_PFN_READ (1 << 1)
+#define HMM_PFN_WRITE (1 << 2)
+#define HMM_PFN_ERROR (1 << 3)
+#define HMM_PFN_EMPTY (1 << 4)
+#define HMM_PFN_SPECIAL (1 << 5)
+#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6)
+#define HMM_PFN_SHIFT 7
+
+/*
+ * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t
+ * @pfn: hmm_pfn_t to convert to struct page
+ * Returns: struct page pointer if pfn is a valid hmm_pfn_t, NULL otherwise
+ *
+ * If the hmm_pfn_t is valid (ie valid flag set) then return the struct page
+ * matching the pfn value stored in the hmm_pfn_t. Otherwise return NULL.
+ */
+static inline struct page *hmm_pfn_t_to_page(hmm_pfn_t pfn)
+{
+ if (!(pfn & HMM_PFN_VALID))
+ return NULL;
+ return pfn_to_page(pfn >> HMM_PFN_SHIFT);
+}
+
+/*
+ * hmm_pfn_t_to_pfn() - return the pfn value stored in an hmm_pfn_t
+ * @pfn: hmm_pfn_t to extract pfn from
+ * Returns: pfn value if hmm_pfn_t is valid, -1UL otherwise
+ */
+static inline unsigned long hmm_pfn_t_to_pfn(hmm_pfn_t pfn)
+{
+ if (!(pfn & HMM_PFN_VALID))
+ return -1UL;
+ return (pfn >> HMM_PFN_SHIFT);
+}
+
+/*
+ * hmm_pfn_t_from_page() - create a valid hmm_pfn_t value from struct page
+ * @page: struct page pointer for which to create the hmm_pfn_t
+ * Returns: valid hmm_pfn_t for the page
+ */
+static inline hmm_pfn_t hmm_pfn_t_from_page(struct page *page)
+{
+ return (page_to_pfn(page) << HMM_PFN_SHIFT) | HMM_PFN_VALID;
+}
+
+/*
+ * hmm_pfn_t_from_pfn() - create a valid hmm_pfn_t value from pfn
+ * @pfn: pfn value for which to create the hmm_pfn_t
+ * Returns: valid hmm_pfn_t for the pfn
+ */
+static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn)
+{
+ return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID;
+}
+
+
+#if IS_ENABLED(CONFIG_HMM_MIRROR)
+/*
+ * Mirroring: how to synchronize device page table with CPU page table.
+ *
+ * A device driver that is participating in HMM mirroring must always
+ * synchronize with CPU page table updates. For this, device drivers can either
+ * directly use mmu_notifier APIs or they can use the hmm_mirror API. Device
+ * drivers can decide to register one mirror per device per process, or just
+ * one mirror per process for a group of devices. The pattern is:
+ *
+ * int device_bind_address_space(..., struct mm_struct *mm, ...)
+ * {
+ * struct device_address_space *das;
+ *
+ * // Device driver specific initialization, and allocation of das
+ * // which contains an hmm_mirror struct as one of its fields.
+ * ...
+ *
+ * das->mirror.ops = &device_mirror_ops;
+ * ret = hmm_mirror_register(&das->mirror, mm);
+ * if (ret) {
+ * // Cleanup on error
+ * return ret;
+ * }
+ *
+ * // Other device driver specific initialization
+ * ...
+ * }
+ *
+ * Once an hmm_mirror is registered for an address space, the device driver
+ * will get callbacks through the sync_cpu_device_pagetables() operation (see
+ * the hmm_mirror_ops struct).
+ *
+ * The device driver must not free the struct containing the hmm_mirror struct
+ * before calling hmm_mirror_unregister(). The expected usage is to do that when
+ * the device driver is unbinding from an address space.
+ *
+ *
+ * void device_unbind_address_space(struct device_address_space *das)
+ * {
+ * // Device driver specific cleanup
+ * ...
+ *
+ * hmm_mirror_unregister(&das->mirror);
+ *
+ * // Other device driver specific cleanup, and now das can be freed
+ * ...
+ * }
+ */
+
+struct hmm_mirror;
+
+/*
+ * enum hmm_update_type - type of update
+ * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
+ */
+enum hmm_update_type {
+ HMM_UPDATE_INVALIDATE,
+};
+
+/*
+ * struct hmm_mirror_ops - HMM mirror device operations callback
+ *
+ * @update: callback to update range on a device
+ */
+struct hmm_mirror_ops {
+ /* sync_cpu_device_pagetables() - synchronize page tables
+ *
+ * @mirror: pointer to struct hmm_mirror
+ * @update_type: type of update that occurred to the CPU page table
+ * @start: virtual start address of the range to update
+ * @end: virtual end address of the range to update
+ *
+ * This callback ultimately originates from mmu_notifiers when the CPU
+ * page table is updated. The device driver must update its page table
+ * in response to this callback. The update argument tells what action
+ * to perform.
+ *
+ * The device driver must not return from this callback until the device
+ * page tables are completely updated (TLBs flushed, etc); this is a
+ * synchronous call.
+ */
+ void (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
+ enum hmm_update_type update_type,
+ unsigned long start,
+ unsigned long end);
+};
+
+/*
+ * struct hmm_mirror - mirror struct for a device driver
+ *
+ * @hmm: pointer to struct hmm (which is unique per mm_struct)
+ * @ops: device driver callback for HMM mirror operations
+ * @list: for list of mirrors of a given mm
+ *
+ * Each address space (mm_struct) being mirrored by a device must register one
+ * instance of an hmm_mirror struct with HMM. HMM will track the list of all
+ * mirrors for each mm_struct.
+ */
+struct hmm_mirror {
+ struct hmm *hmm;
+ const struct hmm_mirror_ops *ops;
+ struct list_head list;
+};
+
+int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
+void hmm_mirror_unregister(struct hmm_mirror *mirror);
+
+
+/*
+ * struct hmm_range - track invalidation lock on virtual address range
+ *
+ * @list: all range locks are on a list
+ * @start: range virtual start address (inclusive)
+ * @end: range virtual end address (exclusive)
+ * @pfns: array of pfns (big enough for the range)
+ * @valid: pfns array did not change since it was filled by an HMM function
+ */
+struct hmm_range {
+ struct list_head list;
+ unsigned long start;
+ unsigned long end;
+ hmm_pfn_t *pfns;
+ bool valid;
+};
+
+/*
+ * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
+ * driver lock that serializes device page table updates, then call
+ * hmm_vma_range_done(), to check if the snapshot is still valid. The same
+ * device driver page table update lock must also be used in the
+ * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
+ * table invalidation serializes on it.
+ *
+ * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
+ * hmm_vma_get_pfns() WITHOUT ERROR !
+ *
+ * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
+ */
+int hmm_vma_get_pfns(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns);
+bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range);
+
+
+/*
+ * Fault memory on behalf of a device driver. Unlike handle_mm_fault(), this will
+ * not migrate any device memory back to system memory. The hmm_pfn_t array will
+ * be updated with the fault result and current snapshot of the CPU page table
+ * for the range.
+ *
+ * The mmap_sem must be taken in read mode before entering and it might be
+ * dropped by the function if the block argument is false. In that case, the
+ * function returns -EAGAIN.
+ *
+ * Return value does not reflect if the fault was successful for every single
+ * address or not. Therefore, the caller must inspect the hmm_pfn_t array to
+ * determine fault status for each address.
+ *
+ * Trying to fault inside an invalid vma will result in -EINVAL.
+ *
+ * See the function description in mm/hmm.c for further documentation.
+ */
+int hmm_vma_fault(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns,
+ bool write,
+ bool block);
+#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+struct hmm_devmem;
+
+struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
+ unsigned long addr);
+
+/*
+ * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events
+ *
+ * @free: called when the refcount on a page reaches 1, i.e. the page is no longer used
+ * @fault: called when there is a CPU page fault on unaddressable memory
+ *
+ * Both callbacks are invoked from the page_free() and page_fault() callbacks of
+ * struct dev_pagemap, respectively. See include/linux/memremap.h for more
+ * details on those.
+ *
+ * The hmm_devmem_ops callbacks exist to provide a coherent and uniform API to
+ * device drivers; a device driver should not register its own page_free() or
+ * page_fault() but should rely on the hmm_devmem_ops callbacks instead.
+ */
+struct hmm_devmem_ops {
+ /*
+ * free() - free a device page
+ * @devmem: device memory structure (see struct hmm_devmem)
+ * @page: pointer to struct page being freed
+ *
+ * The callback occurs whenever a device page refcount reaches 1, which
+ * means that no one is holding any reference on the page anymore
+ * (ZONE_DEVICE pages have an elevated refcount of 1 by default so
+ * that they are not released to the general page allocator).
+ *
+ * Note that the callback has exclusive ownership of the page (as no
+ * one is holding any reference).
+ */
+ void (*free)(struct hmm_devmem *devmem, struct page *page);
+ /*
+ * fault() - CPU page fault or get user page (GUP)
+ * @devmem: device memory structure (see struct hmm_devmem)
+ * @vma: virtual memory area containing the virtual address
+ * @addr: virtual address that faulted or for which there is a GUP
+ * @page: pointer to struct page backing virtual address (unreliable)
+ * @flags: FAULT_FLAG_* (see include/linux/mm.h)
+ * @pmdp: page middle directory
+ * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
+ * on error
+ *
+ * The callback occurs whenever there is a CPU page fault or GUP on a
+ * virtual address. This means that the device driver must migrate the
+ * page back to regular memory (CPU accessible).
+ *
+ * The device driver is free to migrate more than one page from the
+ * fault() callback as an optimization. However, if the device decides
+ * to migrate more than one page, it must always prioritize the faulting
+ * address over the others.
+ *
+ * The struct page pointer is only given as a hint to allow quick
+ * lookup of internal device driver data. A concurrent migration
+ * might have already freed that page, and the virtual address might
+ * no longer be backed by it, so the page should not be modified by
+ * the callback.
+ *
+ * Note that the mmap semaphore is held at least in read mode when this
+ * callback occurs, hence the vma is valid upon callback entry.
+ */
+ int (*fault)(struct hmm_devmem *devmem,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ const struct page *page,
+ unsigned int flags,
+ pmd_t *pmdp);
+};
+
+/*
+ * struct hmm_devmem - track device memory
+ *
+ * @completion: completion object for device memory
+ * @pfn_first: first pfn for this resource (set by hmm_devmem_add())
+ * @pfn_last: last pfn for this resource (set by hmm_devmem_add())
+ * @resource: IO resource reserved for this chunk of memory
+ * @pagemap: device page map for that chunk
+ * @device: device to bind resource to
+ * @ops: memory operations callback
+ * @ref: per CPU refcount
+ *
+ * This is a helper structure for device drivers that do not wish to implement
+ * the gory details related to hotplugging new memory and allocating struct
+ * pages.
+ *
+ * Device drivers can directly use ZONE_DEVICE memory on their own if they
+ * wish to do so.
+ */
+struct hmm_devmem {
+ struct completion completion;
+ unsigned long pfn_first;
+ unsigned long pfn_last;
+ struct resource *resource;
+ struct device *device;
+ struct dev_pagemap pagemap;
+ const struct hmm_devmem_ops *ops;
+ struct percpu_ref ref;
+};
+
+/*
+ * To add (hotplug) device memory, HMM assumes that there is no real resource
+ * that reserves a range in the physical address space (this is intended to be
+ * used by unaddressable device memory). It will reserve a physical range big
+ * enough and allocate struct pages for it.
+ *
+ * The device driver can wrap the hmm_devmem struct inside a private device
+ * driver struct. The device driver must call hmm_devmem_remove() before the
+ * device goes away and before freeing the hmm_devmem struct memory.
+ */
+struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ unsigned long size);
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ struct resource *res);
+void hmm_devmem_remove(struct hmm_devmem *devmem);
+
+/*
+ * hmm_devmem_page_set_drvdata - set per-page driver data field
+ *
+ * @page: pointer to struct page
+ * @data: driver data value to set
+ *
+ * Because the page cannot be on an LRU list, we have an unsigned long that the
+ * driver can use to store a per-page value. This is just a simple helper to do that.
+ */
+static inline void hmm_devmem_page_set_drvdata(struct page *page,
+ unsigned long data)
+{
+ unsigned long *drvdata = (unsigned long *)&page->pgmap;
+
+ drvdata[1] = data;
+}
+
+/*
+ * hmm_devmem_page_get_drvdata - get per page driver data field
+ *
+ * @page: pointer to struct page
+ * Return: driver data value
+ */
+static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page)
+{
+ unsigned long *drvdata = (unsigned long *)&page->pgmap;
+
+ return drvdata[1];
+}
+
+
+/*
+ * struct hmm_device - fake device to hang device memory onto
+ *
+ * @device: device struct
+ * @minor: device minor number
+ */
+struct hmm_device {
+ struct device device;
+ unsigned int minor;
+};
+
+/*
+ * A device driver that wants to handle the memory of multiple devices through a
+ * single fake device can use hmm_device to do so. This is purely a helper; it is
+ * not strictly needed in order to make use of any HMM functionality.
+ */
+struct hmm_device *hmm_device_new(void *drvdata);
+void hmm_device_put(struct hmm_device *hmm_device);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+#endif /* IS_ENABLED(CONFIG_HMM) */
+
+/* Below are for HMM internal use only! Not to be used by device driver! */
+#if IS_ENABLED(CONFIG_HMM_MIRROR)
+void hmm_mm_destroy(struct mm_struct *mm);
+
+static inline void hmm_mm_init(struct mm_struct *mm)
+{
+ mm->hmm = NULL;
+}
+#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+static inline void hmm_mm_destroy(struct mm_struct *mm) {}
+static inline void hmm_mm_init(struct mm_struct *mm) {}
+#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+
+#else /* IS_ENABLED(CONFIG_HMM) */
+static inline void hmm_mm_destroy(struct mm_struct *mm) {}
+static inline void hmm_mm_init(struct mm_struct *mm) {}
+#endif /* LINUX_HMM_H */
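To illustrate how the mirroring API above fits together, here is a minimal sketch of the snapshot protocol documented at hmm_vma_get_pfns(). It is an assumption-laden example: struct my_mirror, the dev_pt_lock mutex and all my_* functions are hypothetical driver code; only the hmm_* types and calls come from this header.

#include <linux/hmm.h>
#include <linux/mm.h>
#include <linux/mutex.h>

/* Hypothetical per-process driver state; only the HMM-related bits are shown. */
struct my_mirror {
        struct hmm_mirror mirror;
        struct mutex dev_pt_lock;       /* serializes device page table updates */
};

static void my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
                                          enum hmm_update_type update_type,
                                          unsigned long start,
                                          unsigned long end)
{
        struct my_mirror *m = container_of(mirror, struct my_mirror, mirror);

        mutex_lock(&m->dev_pt_lock);
        /* invalidate [start, end) in the device page table here */
        mutex_unlock(&m->dev_pt_lock);
}

static const struct hmm_mirror_ops my_mirror_ops = {
        .sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
};

static int my_bind(struct my_mirror *m, struct mm_struct *mm)
{
        mutex_init(&m->dev_pt_lock);
        m->mirror.ops = &my_mirror_ops;
        return hmm_mirror_register(&m->mirror, mm);
}

/* Snapshot protocol from the hmm_vma_get_pfns() comment above. */
static int my_snapshot(struct my_mirror *m, struct vm_area_struct *vma,
                       unsigned long start, unsigned long end,
                       struct hmm_range *range, hmm_pfn_t *pfns)
{
        int ret;

again:
        ret = hmm_vma_get_pfns(vma, range, start, end, pfns);
        if (ret)
                return ret;

        mutex_lock(&m->dev_pt_lock);
        if (!hmm_vma_range_done(vma, range)) {
                /* a concurrent CPU page table update invalidated the snapshot */
                mutex_unlock(&m->dev_pt_lock);
                goto again;
        }
        /* populate the device page table from pfns[] under the same lock */
        mutex_unlock(&m->dev_pt_lock);
        return 0;
}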
diff --git a/include/linux/hypervisor.h b/include/linux/hypervisor.h
index 3fa5ef2b3759..35e170ca87a8 100644
--- a/include/linux/hypervisor.h
+++ b/include/linux/hypervisor.h
@@ -6,8 +6,12 @@
* Juergen Gross <jgross@suse.com>
*/
-#ifdef CONFIG_HYPERVISOR_GUEST
-#include <asm/hypervisor.h>
+#ifdef CONFIG_X86
+#include <asm/x86_init.h>
+static inline void hypervisor_pin_vcpu(int cpu)
+{
+ x86_platform.hyper.pin_vcpu(cpu);
+}
#else
static inline void hypervisor_pin_vcpu(int cpu)
{
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 297f5b8e8bfd..c04d584ab5a1 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -130,6 +130,8 @@ enum {
IORES_DESC_ACPI_NV_STORAGE = 3,
IORES_DESC_PERSISTENT_MEMORY = 4,
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
+ IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
+ IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
};
/* helpers to define resources */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index ed167541e4fc..f79ebfd43a62 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -127,6 +127,17 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
extern int __add_pages(int nid, unsigned long start_pfn,
unsigned long nr_pages, bool want_memblock);
+#ifndef CONFIG_ARCH_HAS_ADD_PAGES
+static inline int add_pages(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, bool want_memblock)
+{
+ return __add_pages(nid, start_pfn, nr_pages, want_memblock);
+}
+#else /* ARCH_HAS_ADD_PAGES */
+int add_pages(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, bool want_memblock);
+#endif /* ARCH_HAS_ADD_PAGES */
+
#ifdef CONFIG_NUMA
extern int memory_add_physaddr_to_nid(u64 start);
#else
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 93416196ba64..79f8ba7c3894 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -4,6 +4,8 @@
#include <linux/ioport.h>
#include <linux/percpu-refcount.h>
+#include <asm/pgtable.h>
+
struct resource;
struct device;
@@ -35,24 +37,107 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
}
#endif
+/*
+ * ZONE_DEVICE memory is specialized into multiple types, each having a
+ * different usage.
+ *
+ * MEMORY_DEVICE_HOST:
+ * Persistent device memory (pmem): struct page might be allocated in a different
+ * memory region, and the architecture might want to perform special actions. It is similar
+ * to regular memory, in that the CPU can access it transparently. However,
+ * it is likely to have different bandwidth and latency than regular memory.
+ * See Documentation/nvdimm/nvdimm.txt for more information.
+ *
+ * MEMORY_DEVICE_PRIVATE:
+ * Device memory that is not directly addressable by the CPU: CPU can neither
+ * read nor write private memory. In this case, we do still have struct pages
+ * backing the device memory. Doing so simplifies the implementation, but it is
+ * important to remember that there are certain points at which the struct page
+ * must be treated as an opaque object, rather than a "normal" struct page.
+ *
+ * A more complete discussion of unaddressable memory may be found in
+ * include/linux/hmm.h and Documentation/vm/hmm.txt.
+ *
+ * MEMORY_DEVICE_PUBLIC:
+ * Device memory that is cache coherent from both the device's and the CPU's
+ * point of view. This is used on platforms that have an advanced system bus
+ * (like CAPI or CCIX). A driver can hotplug device memory using ZONE_DEVICE
+ * with this memory type. Any page of a process can be migrated to such memory.
+ * However, no one should be allowed to pin such memory so that it can always
+ * be evicted.
+ */
+enum memory_type {
+ MEMORY_DEVICE_HOST = 0,
+ MEMORY_DEVICE_PRIVATE,
+ MEMORY_DEVICE_PUBLIC,
+};
+
+/*
+ * For MEMORY_DEVICE_PRIVATE we use ZONE_DEVICE and extend it with two
+ * callbacks:
+ * page_fault()
+ * page_free()
+ *
+ * Additional notes about MEMORY_DEVICE_PRIVATE may be found in
+ * include/linux/hmm.h and Documentation/vm/hmm.txt. There is also a brief
+ * explanation in include/linux/memory_hotplug.h.
+ *
+ * The page_fault() callback must migrate the page back, from device memory to
+ * system memory, so that the CPU can access it. This might fail for various
+ * reasons (device issues, the device has been unplugged, ...). When such error
+ * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and
+ * set the CPU page table entry to "poisoned".
+ *
+ * Note that because memory cgroup charges are transferred to the device memory,
+ * this should never fail due to memory restrictions. However, allocation
+ * of a regular system page might still fail because we are out of memory. If
+ * that happens, the page_fault() callback must return VM_FAULT_OOM.
+ *
+ * The page_fault() callback can also try to migrate back multiple pages in one
+ * chunk, as an optimization. It must, however, prioritize the faulting address
+ * over all the others.
+ *
+ *
+ * The page_free() callback is called once the page refcount reaches 1
+ * (ZONE_DEVICE pages never reach a 0 refcount unless there is a refcount bug;
+ * this allows the device driver to implement its own memory management).
+ *
+ * For MEMORY_DEVICE_PUBLIC only the page_free() callback matters.
+ */
+typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
+ unsigned long addr,
+ const struct page *page,
+ unsigned int flags,
+ pmd_t *pmdp);
+typedef void (*dev_page_free_t)(struct page *page, void *data);
+
/**
* struct dev_pagemap - metadata for ZONE_DEVICE mappings
+ * @page_fault: callback when the CPU faults on an unaddressable device page
+ * @page_free: free page callback when page refcount reaches 1
* @altmap: pre-allocated/reserved memory for vmemmap allocations
* @res: physical address range covered by @ref
* @ref: reference count that pins the devm_memremap_pages() mapping
* @dev: host device of the mapping for debug
+ * @data: private data pointer for page_free()
+ * @type: memory type: see MEMORY_* in memory_hotplug.h
*/
struct dev_pagemap {
+ dev_page_fault_t page_fault;
+ dev_page_free_t page_free;
struct vmem_altmap *altmap;
const struct resource *res;
struct percpu_ref *ref;
struct device *dev;
+ void *data;
+ enum memory_type type;
};
#ifdef CONFIG_ZONE_DEVICE
void *devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap);
struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
+
+static inline bool is_zone_device_page(const struct page *page);
#else
static inline void *devm_memremap_pages(struct device *dev,
struct resource *res, struct percpu_ref *ref,
@@ -73,6 +158,20 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
}
#endif
+#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
+static inline bool is_device_private_page(const struct page *page)
+{
+ return is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_PRIVATE;
+}
+
+static inline bool is_device_public_page(const struct page *page)
+{
+ return is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_PUBLIC;
+}
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+
/**
* get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
* @pfn: page frame number to lookup page_map
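A small hedged sketch of how the is_device_private_page() helper added above might be used by a caller; my_cpu_can_access() is a hypothetical name, not part of the patch.

#include <linux/memremap.h>
#include <linux/mm.h>

/*
 * Decide whether the CPU may touch a page directly. Device private pages are
 * never CPU-addressable and must first be migrated back to system memory;
 * device public pages are cache coherent and behave like regular pages here.
 */
static bool my_cpu_can_access(struct page *page)
{
        if (is_device_private_page(page))
                return false;   /* migrate back before CPU access */
        return true;            /* regular, pmem or device public page */
}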
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 99449646c97a..9fef5d8a617e 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -61,6 +61,7 @@ extern void putback_movable_page(struct page *page);
extern int migrate_prep(void);
extern int migrate_prep_local(void);
+extern void migrate_page_states(struct page *newpage, struct page *page);
extern void migrate_page_copy(struct page *newpage, struct page *page);
extern int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page);
@@ -81,6 +82,10 @@ static inline int isolate_movable_page(struct page *page, isolate_mode_t mode)
static inline int migrate_prep(void) { return -ENOSYS; }
static inline int migrate_prep_local(void) { return -ENOSYS; }
+static inline void migrate_page_states(struct page *newpage, struct page *page)
+{
+}
+
static inline void migrate_page_copy(struct page *newpage,
struct page *page) {}
@@ -140,4 +145,136 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
}
#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
+
+#ifdef CONFIG_MIGRATION
+
+/*
+ * Watch out for the PAE architecture, where an unsigned long might not have
+ * enough bits to store all physical addresses and flags. So far we have
+ * enough room for all our flags.
+ */
+#define MIGRATE_PFN_VALID (1UL << 0)
+#define MIGRATE_PFN_MIGRATE (1UL << 1)
+#define MIGRATE_PFN_LOCKED (1UL << 2)
+#define MIGRATE_PFN_WRITE (1UL << 3)
+#define MIGRATE_PFN_DEVICE (1UL << 4)
+#define MIGRATE_PFN_ERROR (1UL << 5)
+#define MIGRATE_PFN_SHIFT 6
+
+static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
+{
+ if (!(mpfn & MIGRATE_PFN_VALID))
+ return NULL;
+ return pfn_to_page(mpfn >> MIGRATE_PFN_SHIFT);
+}
+
+static inline unsigned long migrate_pfn(unsigned long pfn)
+{
+ return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
+}
+
+/*
+ * struct migrate_vma_ops - migrate operation callback
+ *
+ * @alloc_and_copy: alloc destination memory and copy source memory to it
+ * @finalize_and_map: allow caller to map the successfully migrated pages
+ *
+ *
+ * The alloc_and_copy() callback happens once all source pages have been locked,
+ * unmapped and checked (checked whether pinned or not). All pages that can be
+ * migrated will have an entry in the src array set with the pfn value of the
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set (other
+ * flags might be set but should be ignored by the callback).
+ *
+ * The alloc_and_copy() callback can then allocate destination memory and copy
+ * source memory to it for all those entries (ie with MIGRATE_PFN_VALID and
+ * MIGRATE_PFN_MIGRATE flag set). Once these are allocated and copied, the
+ * callback must update each corresponding entry in the dst array with the pfn
+ * value of the destination page and with the MIGRATE_PFN_VALID and
+ * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages
+ * locked, via lock_page()).
+ *
+ * At this point the alloc_and_copy() callback is done and returns.
+ *
+ * Note that the callback does not have to migrate all the pages that are
+ * marked with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration
+ * from device memory to system memory (ie the MIGRATE_PFN_DEVICE flag is also
+ * set in the src array entry). If the device driver cannot migrate a device
+ * page back to system memory, then it must set the corresponding dst array
+ * entry to MIGRATE_PFN_ERROR. This will trigger a SIGBUS if CPU tries to
+ * access any of the virtual addresses originally backed by this page. Because
+ * a SIGBUS is such a severe result for the userspace process, the device
+ * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an
+ * unrecoverable state.
+ *
+ * For empty entries inside the CPU page table (where pte_none() or pmd_none()
+ * is true) we do set the MIGRATE_PFN_MIGRATE flag inside the corresponding
+ * source array entry, thus allowing the device driver to allocate device
+ * memory for those unbacked virtual addresses. For this, the device driver
+ * simply has to allocate device memory and properly set the destination entry,
+ * just as for regular migration. Note that this can still fail, so the device
+ * driver must check inside the finalize_and_map() callback whether the
+ * migration was successful for those entries, just like for regular migration.
+ *
+ * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES
+ * OR BAD THINGS WILL HAPPEN !
+ *
+ *
+ * The finalize_and_map() callback happens after struct page migration from
+ * source to destination (destination struct pages are the struct pages for the
+ * memory allocated by the alloc_and_copy() callback). Migration can fail, and
+ * thus the finalize_and_map() allows the driver to inspect which pages were
+ * successfully migrated, and which were not. Successfully migrated pages will
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
+ *
+ * It is safe to update device page table from within the finalize_and_map()
+ * callback because both destination and source page are still locked, and the
+ * mmap_sem is held in read mode (hence no one can unmap the range being
+ * migrated).
+ *
+ * Once callback is done cleaning up things and updating its page table (if it
+ * chose to do so, this is not an obligation) then it returns. At this point,
+ * the HMM core will finish up the final steps, and the migration is complete.
+ *
+ * THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY
+ * ENTRIES OR BAD THINGS WILL HAPPEN !
+ */
+struct migrate_vma_ops {
+ void (*alloc_and_copy)(struct vm_area_struct *vma,
+ const unsigned long *src,
+ unsigned long *dst,
+ unsigned long start,
+ unsigned long end,
+ void *private);
+ void (*finalize_and_map)(struct vm_area_struct *vma,
+ const unsigned long *src,
+ const unsigned long *dst,
+ unsigned long start,
+ unsigned long end,
+ void *private);
+};
+
+#if defined(CONFIG_MIGRATE_VMA_HELPER)
+int migrate_vma(const struct migrate_vma_ops *ops,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long *src,
+ unsigned long *dst,
+ void *private);
+#else
+static inline int migrate_vma(const struct migrate_vma_ops *ops,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long *src,
+ unsigned long *dst,
+ void *private)
+{
+ return -EINVAL;
+}
+#endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
+
+#endif /* CONFIG_MIGRATION */
+
#endif /* _LINUX_MIGRATE_H */
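A hedged sketch of driving migrate_vma() with the two callbacks documented above, migrating a small, page-aligned range into device memory. Every my_* name (including the allocation and DMA-copy helpers) is a hypothetical driver function, the MY_NPAGES bound is an assumption, and extra flags such as MIGRATE_PFN_DEVICE are omitted for brevity.

#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

#define MY_NPAGES 16    /* assumption: caller never migrates more than 16 pages */

static struct page *my_alloc_device_page(void *private);                    /* assumed */
static void my_dma_copy(void *private, struct page *dst, struct page *src); /* assumed */

static void my_alloc_and_copy(struct vm_area_struct *vma,
                              const unsigned long *src, unsigned long *dst,
                              unsigned long start, unsigned long end,
                              void *private)
{
        unsigned long i, npages = (end - start) >> PAGE_SHIFT;

        for (i = 0; i < npages; i++) {
                struct page *spage = migrate_pfn_to_page(src[i]);
                struct page *dpage;

                if (!(src[i] & MIGRATE_PFN_MIGRATE))
                        continue;
                dpage = my_alloc_device_page(private);
                if (!dpage)
                        continue;       /* dst[i] stays 0: this page is skipped */
                lock_page(dpage);
                if (spage)
                        my_dma_copy(private, dpage, spage);
                dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
        }
}

static void my_finalize_and_map(struct vm_area_struct *vma,
                                const unsigned long *src,
                                const unsigned long *dst,
                                unsigned long start, unsigned long end,
                                void *private)
{
        /* check src[i] & MIGRATE_PFN_MIGRATE to see which pages really moved */
}

static const struct migrate_vma_ops my_migrate_ops = {
        .alloc_and_copy         = my_alloc_and_copy,
        .finalize_and_map       = my_finalize_and_map,
};

static int my_migrate_range(struct vm_area_struct *vma, unsigned long start,
                            unsigned long end, void *private)
{
        unsigned long src[MY_NPAGES] = {}, dst[MY_NPAGES] = {};

        return migrate_vma(&my_migrate_ops, vma, start, end, src, dst, private);
}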
diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index ebf3d89a3919..bdf66af9b937 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -6,11 +6,16 @@
* on most operations but not ->writepage as the potential stall time
* is too significant
* MIGRATE_SYNC will block when migrating pages
+ * MIGRATE_SYNC_NO_COPY will block when migrating pages but will not copy pages
+ * with the CPU. Instead, the page copy happens outside the migratepage()
+ * callback and is likely done using a DMA engine. See migrate_vma() and HMM
+ * (mm/hmm.c) for users of this mode.
*/
enum migrate_mode {
MIGRATE_ASYNC,
MIGRATE_SYNC_LIGHT,
MIGRATE_SYNC,
+ MIGRATE_SYNC_NO_COPY,
};
#endif /* MIGRATE_MODE_H_INCLUDED */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c9ac2d2e5f03..c63f81c35248 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -23,6 +23,7 @@
#include <linux/page_ext.h>
#include <linux/err.h>
#include <linux/page_ref.h>
+#include <linux/memremap.h>
struct mempolicy;
struct anon_vma;
@@ -797,6 +798,28 @@ static inline bool is_zone_device_page(const struct page *page)
}
#endif
+#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
+void put_zone_device_private_or_public_page(struct page *page);
+DECLARE_STATIC_KEY_FALSE(device_private_key);
+#define IS_HMM_ENABLED static_branch_unlikely(&device_private_key)
+static inline bool is_device_private_page(const struct page *page);
+static inline bool is_device_public_page(const struct page *page);
+#else /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+static inline void put_zone_device_private_or_public_page(struct page *page)
+{
+}
+#define IS_HMM_ENABLED 0
+static inline bool is_device_private_page(const struct page *page)
+{
+ return false;
+}
+static inline bool is_device_public_page(const struct page *page)
+{
+ return false;
+}
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+
+
static inline void get_page(struct page *page)
{
page = compound_head(page);
@@ -812,6 +835,18 @@ static inline void put_page(struct page *page)
{
page = compound_head(page);
+ /*
+ * For device private pages we need to catch the refcount transition from
+ * 2 to 1: when the refcount reaches one it means the device private page
+ * is free and we need to inform the device driver through a callback. See
+ * include/linux/memremap.h and HMM for details.
+ */
+ if (IS_HMM_ENABLED && unlikely(is_device_private_page(page) ||
+ unlikely(is_device_public_page(page)))) {
+ put_zone_device_private_or_public_page(page);
+ return;
+ }
+
if (put_page_testzero(page))
__put_page(page);
}
@@ -1199,8 +1234,10 @@ struct zap_details {
pgoff_t last_index; /* Highest page->index to unmap */
};
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte);
+struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, bool with_public_device);
+#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
+
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c3073e49ae3e..9bfad0bc6393 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -23,6 +23,7 @@
struct address_space;
struct mem_cgroup;
+struct hmm;
/*
* Each physical page in the system has a struct page associated with
@@ -503,6 +504,11 @@ struct mm_struct {
atomic_long_t hugetlb_usage;
#endif
struct work_struct async_put_work;
+
+#if IS_ENABLED(CONFIG_HMM)
+ /* HMM needs to track a few things per mm */
+ struct hmm *hmm;
+#endif
};
extern struct mm_struct init_mm;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index e95dd4f0af37..33a5dd079595 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -51,6 +51,23 @@ static inline int current_is_kswapd(void)
*/
/*
+ * Unaddressable device memory support. See include/linux/hmm.h and
+ * Documentation/vm/hmm.txt. The short description is that we need struct pages
+ * for device memory that is unaddressable (inaccessible) by the CPU, so that we
+ * can migrate part of a process's memory to device memory.
+ *
+ * When a page is migrated from CPU to device, we set the CPU page table entry
+ * to a special SWP_DEVICE_* entry.
+ */
+#ifdef CONFIG_DEVICE_PRIVATE
+#define SWP_DEVICE_NUM 2
+#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
+#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
+#else
+#define SWP_DEVICE_NUM 0
+#endif
+
+/*
* NUMA node memory migration support
*/
#ifdef CONFIG_MIGRATION
@@ -72,7 +89,8 @@ static inline int current_is_kswapd(void)
#endif
#define MAX_SWAPFILES \
- ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
+ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
/*
* Magic header for a swap area. The first part of the union is
@@ -443,8 +461,8 @@ static inline void show_swap_cache_info(void)
{
}
-#define free_swap_and_cache(swp) is_migration_entry(swp)
-#define swapcache_prepare(swp) is_migration_entry(swp)
+#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
+#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
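A worked example of how the swap type space is carved up by the definitions above; this is an illustration under assumed config values, not part of the patch.

/*
 * Assuming MAX_SWAPFILES_SHIFT == 5 (32 swap types), CONFIG_MIGRATION and
 * CONFIG_MEMORY_FAILURE enabled (SWP_MIGRATION_NUM == 2, SWP_HWPOISON_NUM == 1)
 * and CONFIG_DEVICE_PRIVATE enabled (SWP_DEVICE_NUM == 2):
 *
 *   MAX_SWAPFILES    = 32 - 2 - 2 - 1 = 27    (types 0..26 for real swap areas)
 *   SWP_DEVICE_WRITE = 27 + 1 + 2     = 30
 *   SWP_DEVICE_READ  = 27 + 1 + 2 + 1 = 31
 *
 * The two device entry types are taken from the top of the type space, which
 * is why MAX_SWAPFILES shrinks by SWP_DEVICE_NUM.
 */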
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 5c3a5f3e7eec..361090cef69e 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -100,6 +100,74 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
}
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
+{
+ return swp_entry(write ? SWP_DEVICE_WRITE : SWP_DEVICE_READ,
+ page_to_pfn(page));
+}
+
+static inline bool is_device_private_entry(swp_entry_t entry)
+{
+ int type = swp_type(entry);
+ return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE;
+}
+
+static inline void make_device_private_entry_read(swp_entry_t *entry)
+{
+ *entry = swp_entry(SWP_DEVICE_READ, swp_offset(*entry));
+}
+
+static inline bool is_write_device_private_entry(swp_entry_t entry)
+{
+ return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
+}
+
+static inline struct page *device_private_entry_to_page(swp_entry_t entry)
+{
+ return pfn_to_page(swp_offset(entry));
+}
+
+int device_private_entry_fault(struct vm_area_struct *vma,
+ unsigned long addr,
+ swp_entry_t entry,
+ unsigned int flags,
+ pmd_t *pmdp);
+#else /* CONFIG_DEVICE_PRIVATE */
+static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
+{
+ return swp_entry(0, 0);
+}
+
+static inline void make_device_private_entry_read(swp_entry_t *entry)
+{
+}
+
+static inline bool is_device_private_entry(swp_entry_t entry)
+{
+ return false;
+}
+
+static inline bool is_write_device_private_entry(swp_entry_t entry)
+{
+ return false;
+}
+
+static inline struct page *device_private_entry_to_page(swp_entry_t entry)
+{
+ return NULL;
+}
+
+static inline int device_private_entry_fault(struct vm_area_struct *vma,
+ unsigned long addr,
+ swp_entry_t entry,
+ unsigned int flags,
+ pmd_t *pmdp)
+{
+ return VM_FAULT_SIGBUS;
+}
+#endif /* CONFIG_DEVICE_PRIVATE */
+
#ifdef CONFIG_MIGRATION
static inline swp_entry_t make_migration_entry(struct page *page, int write)
{
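To show where these helpers slot in, here is a minimal sketch of a swap-fault dispatch. It is hypothetical: the real dispatch lives in mm/memory.c, and my_handle_swap_pte() is an invented name; only the swapops.h helpers are real.

#include <linux/swapops.h>
#include <linux/mm.h>

static int my_handle_swap_pte(struct vm_area_struct *vma, unsigned long addr,
                              pte_t pte, unsigned int flags, pmd_t *pmdp)
{
        swp_entry_t entry = pte_to_swp_entry(pte);

        /* Device private entries are handed back to the owning driver. */
        if (is_device_private_entry(entry))
                return device_private_entry_fault(vma, addr, entry, flags, pmdp);

        /* migration entries, hwpoison and real swap would be handled here */
        return VM_FAULT_SIGBUS;         /* placeholder for the other cases */
}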
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 185ba058693d..91198cb3b8dc 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,9 +49,10 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ u32 elem_size, index_mask, max_entries;
+ bool unpriv = !capable(CAP_SYS_ADMIN);
struct bpf_array *array;
u64 array_size;
- u32 elem_size;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -66,11 +67,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
elem_size = round_up(attr->value_size, 8);
+ max_entries = attr->max_entries;
+ index_mask = roundup_pow_of_two(max_entries) - 1;
+
+ if (unpriv)
+ /* round up array size to nearest power of 2,
+ * since cpu will speculate within index_mask limits
+ */
+ max_entries = index_mask + 1;
+
array_size = sizeof(*array);
if (percpu)
- array_size += (u64) attr->max_entries * sizeof(void *);
+ array_size += (u64) max_entries * sizeof(void *);
else
- array_size += (u64) attr->max_entries * elem_size;
+ array_size += (u64) max_entries * elem_size;
/* make sure there is no u32 overflow later in round_up() */
if (array_size >= U32_MAX - PAGE_SIZE)
@@ -80,6 +90,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array = bpf_map_area_alloc(array_size);
if (!array)
return ERR_PTR(-ENOMEM);
+ array->index_mask = index_mask;
+ array->map.unpriv_array = unpriv;
/* copy mandatory map attributes */
array->map.map_type = attr->map_type;
@@ -114,12 +126,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return array->value + array->elem_size * index;
+ return array->value + array->elem_size * (index & array->index_mask);
}
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
u32 elem_size = round_up(map->value_size, 8);
const int ret = BPF_REG_0;
@@ -128,7 +141,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ if (map->unpriv_array) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ }
if (is_power_of_2(elem_size)) {
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
@@ -150,7 +168,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return this_cpu_ptr(array->pptrs[index]);
+ return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
@@ -170,7 +188,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
- pptr = array->pptrs[index];
+ pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
off += size;
@@ -218,10 +236,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EEXIST;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
- memcpy(this_cpu_ptr(array->pptrs[index]),
+ memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
value, map->value_size);
else
- memcpy(array->value + array->elem_size * index,
+ memcpy(array->value +
+ array->elem_size * (index & array->index_mask),
value, map->value_size);
return 0;
}
@@ -255,7 +274,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
- pptr = array->pptrs[index];
+ pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
off += size;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 79634da4ead3..217ad8514c1b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -33,7 +33,6 @@
#include <linux/rcupdate.h>
#include <asm/unaligned.h>
-#include <asm/barrier.h>
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
@@ -933,7 +932,6 @@ select_insn:
DST = IMM;
CONT;
LD_IMM_DW:
- osb();
DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
insn++;
CONT;
@@ -1195,7 +1193,6 @@ out:
*(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
CONT; \
LDX_MEM_##SIZEOP: \
- osb(); \
DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
CONT;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7aacfc5e9b40..0876aed11b07 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1675,6 +1675,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
if (err)
return err;
+ if (func_id == BPF_FUNC_tail_call) {
+ if (meta.map_ptr == NULL) {
+ verbose("verifier bug\n");
+ return -EINVAL;
+ }
+ env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
+ }
err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
if (err)
return err;
@@ -4209,6 +4216,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
insn->imm = 0;
insn->code = BPF_JMP | BPF_TAIL_CALL;
+
+ /* instead of changing every JIT dealing with tail_call
+ * emit two extra insns:
+ * if (index >= max_entries) goto out;
+ * index &= array->index_mask;
+ * to avoid out-of-bounds cpu speculation
+ */
+ map_ptr = env->insn_aux_data[i + delta].map_ptr;
+ if (map_ptr == BPF_MAP_PTR_POISON) {
+ verbose("tail_call obusing map_ptr\n");
+ return -EINVAL;
+ }
+ if (!map_ptr->unpriv_array)
+ continue;
+ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+ map_ptr->max_entries, 2);
+ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+ container_of(map_ptr,
+ struct bpf_array,
+ map)->index_mask);
+ insn_buf[2] = *insn;
+ cnt = 3;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
continue;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 870029b2ef1b..3e9e598756fc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,6 +37,7 @@
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
+#include <linux/hmm.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
@@ -812,6 +813,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_owner(mm, p);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
+ hmm_mm_init(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
@@ -891,6 +893,7 @@ void __mmdrop(struct mm_struct *mm)
BUG_ON(mm == &init_mm);
mm_free_pgd(mm);
destroy_context(mm);
+ hmm_mm_destroy(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 45a6a61a9c1d..403ab9cdb949 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -11,13 +11,14 @@
* General Public License for more details.
*/
#include <linux/radix-tree.h>
-#include <linux/memremap.h>
#include <linux/device.h>
#include <linux/types.h>
#include <linux/pfn_t.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#ifndef ioremap_cache
/* temporary while we convert existing ioremap_cache users to memremap */
@@ -219,6 +220,34 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff)
for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \
pgoff += 1UL << order, order = order_at((res), pgoff))
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+int device_private_entry_fault(struct vm_area_struct *vma,
+ unsigned long addr,
+ swp_entry_t entry,
+ unsigned int flags,
+ pmd_t *pmdp)
+{
+ struct page *page = device_private_entry_to_page(entry);
+
+ /*
+ * The page_fault() callback must migrate the page back to system memory
+ * so that the CPU can access it. This might fail for various reasons
+ * (device issue, device was unsafely unplugged, ...). When such
+ * error conditions happen, the callback must return VM_FAULT_SIGBUS.
+ *
+ * Note that because memory cgroup charges are accounted to the device
+ * memory, this should never fail because of memory restrictions (but
+ * allocation of a regular system page might still fail because we are
+ * out of memory).
+ *
+ * There is a more in-depth description of what that callback can and
+ * cannot do in include/linux/memremap.h.
+ */
+ return page->pgmap->page_fault(vma, addr, page, flags, pmdp);
+}
+EXPORT_SYMBOL(device_private_entry_fault);
+#endif /* CONFIG_DEVICE_PRIVATE */
+
static void pgmap_radix_release(struct resource *res)
{
unsigned long pgoff, order;
@@ -356,6 +385,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
}
pgmap->ref = ref;
pgmap->res = &page_map->res;
+ pgmap->type = MEMORY_DEVICE_HOST;
+ pgmap->page_fault = NULL;
+ pgmap->page_free = NULL;
+ pgmap->data = NULL;
mutex_lock(&pgmap_lock);
error = 0;
@@ -468,3 +501,28 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
return pgmap ? pgmap->altmap : NULL;
}
#endif /* CONFIG_ZONE_DEVICE */
+
+
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+void put_zone_device_private_or_public_page(struct page *page)
+{
+ int count = page_ref_dec_return(page);
+
+ /*
+ * If the refcount is 1 then the page is free and the refcount is stable
+ * as nobody holds a reference on the page.
+ */
+ if (count == 1) {
+ /* Clear Active bit in case of parallel mark_page_accessed */
+ __ClearPageActive(page);
+ __ClearPageWaiters(page);
+
+ page->mapping = NULL;
+ mem_cgroup_uncharge(page);
+
+ page->pgmap->page_free(page, page->pgmap->data);
+ } else if (!count)
+ __put_page(page);
+}
+EXPORT_SYMBOL(put_zone_device_private_or_public_page);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
diff --git a/mm/Kconfig b/mm/Kconfig
index 679d9ce6a596..a3f66adf2e35 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -684,7 +684,7 @@ config IDLE_PAGE_TRACKING
See Documentation/vm/idle_page_tracking.txt for more details.
config ZONE_DEVICE
- bool "Device memory (pmem, etc...) hotplug support"
+ bool "Device memory (pmem, HMM, etc...) hotplug support"
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
@@ -700,6 +700,55 @@ config ZONE_DEVICE
If FS_DAX is enabled, then say Y.
+config ARCH_HAS_HMM
+ bool
+ default y
+ depends on (X86_64 || PPC64)
+ depends on ZONE_DEVICE
+ depends on MMU && 64BIT
+ depends on MEMORY_HOTPLUG
+ depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
+
+config MIGRATE_VMA_HELPER
+ bool
+
+config HMM
+ bool
+ select MIGRATE_VMA_HELPER
+
+config HMM_MIRROR
+ bool "HMM mirror CPU page table into a device page table"
+ depends on ARCH_HAS_HMM
+ select MMU_NOTIFIER
+ select HMM
+ help
+ Select HMM_MIRROR if you want to mirror a range of the CPU page table of a
+ process into a device page table. Here, mirror means "keep synchronized".
+ Prerequisites: the device must provide the ability to write-protect its
+ page tables (at PAGE_SIZE granularity), and must be able to recover from
+ the resulting potential page faults.
+
+config DEVICE_PRIVATE
+ bool "Unaddressable device memory (GPU memory, ...)"
+ depends on ARCH_HAS_HMM
+ select HMM
+
+ help
+ Allows creation of struct pages to represent unaddressable device
+ memory; i.e., memory that is only accessible from the device (or
+ group of devices). You likely also want to select HMM_MIRROR.
+
+config DEVICE_PUBLIC
+ bool "Addressable device memory (like GPU memory)"
+ depends on ARCH_HAS_HMM
+ select HMM
+
+ help
+ Allows creation of struct pages to represent addressable device
+ memory; i.e., memory that is accessible from both the device and
+ the CPU.
+
config FRAME_VECTOR
bool
diff --git a/mm/Makefile b/mm/Makefile
index 026f6a828a50..71f1f6c72be6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -103,3 +103,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
+obj-$(CONFIG_HMM) += hmm.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index da91df50ba31..145b903eb023 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -139,6 +139,14 @@ int balloon_page_migrate(struct address_space *mapping,
{
struct balloon_dev_info *balloon = balloon_page_device(page);
+ /*
+ * We cannot easily support the no-copy case here, so ignore it, as it
+ * is unlikely to be used with balloon pages. See include/linux/hmm.h for
+ * users of the MIGRATE_SYNC_NO_COPY mode.
+ */
+ if (mode == MIGRATE_SYNC_NO_COPY)
+ return -EINVAL;
+
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
diff --git a/mm/gup.c b/mm/gup.c
index 1a88cb99462a..4c57a6ee3519 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -364,6 +364,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
goto unmap;
*page = pte_page(*pte);
+
+ /*
+ * This should never happen (a device public page in the gate
+ * area).
+ */
+ if (is_device_public_page(*page))
+ goto unmap;
}
get_page(*page);
out:
diff --git a/mm/hmm.c b/mm/hmm.c
new file mode 100644
index 000000000000..a88a847bccba
--- /dev/null
+++ b/mm/hmm.c
@@ -0,0 +1,1257 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse <jglisse@redhat.com>
+ */
+/*
+ * Refer to include/linux/hmm.h for information about heterogeneous memory
+ * management or HMM for short.
+ */
+#include <linux/mm.h>
+#include <linux/hmm.h>
+#include <linux/init.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/swapops.h>
+#include <linux/hugetlb.h>
+#include <linux/memremap.h>
+#include <linux/jump_label.h>
+#include <linux/mmu_notifier.h>
+#include <linux/memory_hotplug.h>
+
+#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
+
+#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
+/*
+ * Device private memory: see HMM (Documentation/vm/hmm.txt) or hmm.h
+ */
+DEFINE_STATIC_KEY_FALSE(device_private_key);
+EXPORT_SYMBOL(device_private_key);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+
+
+#if IS_ENABLED(CONFIG_HMM_MIRROR)
+static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
+
+/*
+ * struct hmm - HMM per mm struct
+ *
+ * @mm: mm struct this HMM struct is bound to
+ * @lock: lock protecting ranges list
+ * @sequence: we track updates to the CPU page table with a sequence number
+ * @ranges: list of ranges being snapshotted
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
+ */
+struct hmm {
+ struct mm_struct *mm;
+ spinlock_t lock;
+ atomic_t sequence;
+ struct list_head ranges;
+ struct list_head mirrors;
+ struct mmu_notifier mmu_notifier;
+ struct rw_semaphore mirrors_sem;
+};
+
+/*
+ * hmm_register - register HMM against an mm (HMM internal)
+ *
+ * @mm: mm struct to attach to
+ *
+ * This is not intended to be used directly by device drivers. It allocates an
+ * HMM struct if mm does not have one, and initializes it.
+ */
+static struct hmm *hmm_register(struct mm_struct *mm)
+{
+ struct hmm *hmm = READ_ONCE(mm->hmm);
+ bool cleanup = false;
+
+ /*
+ * The hmm struct can only be freed once the mm_struct goes away,
+ * hence we should always have pre-allocated a new hmm struct
+ * above.
+ */
+ if (hmm)
+ return hmm;
+
+ hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
+ if (!hmm)
+ return NULL;
+ INIT_LIST_HEAD(&hmm->mirrors);
+ init_rwsem(&hmm->mirrors_sem);
+ atomic_set(&hmm->sequence, 0);
+ hmm->mmu_notifier.ops = NULL;
+ INIT_LIST_HEAD(&hmm->ranges);
+ spin_lock_init(&hmm->lock);
+ hmm->mm = mm;
+
+ /*
+ * We should only get here if we hold the mmap_sem in write mode, i.e. on
+ * registration of the first mirror through hmm_mirror_register()
+ */
+ hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
+ if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
+ kfree(hmm);
+ return NULL;
+ }
+
+ spin_lock(&mm->page_table_lock);
+ if (!mm->hmm)
+ mm->hmm = hmm;
+ else
+ cleanup = true;
+ spin_unlock(&mm->page_table_lock);
+
+ if (cleanup) {
+ mmu_notifier_unregister(&hmm->mmu_notifier, mm);
+ kfree(hmm);
+ }
+
+ return mm->hmm;
+}
+
+void hmm_mm_destroy(struct mm_struct *mm)
+{
+ kfree(mm->hmm);
+}
+
+static void hmm_invalidate_range(struct hmm *hmm,
+ enum hmm_update_type action,
+ unsigned long start,
+ unsigned long end)
+{
+ struct hmm_mirror *mirror;
+ struct hmm_range *range;
+
+ spin_lock(&hmm->lock);
+ list_for_each_entry(range, &hmm->ranges, list) {
+ unsigned long addr, idx, npages;
+
+ if (end < range->start || start >= range->end)
+ continue;
+
+ range->valid = false;
+ addr = max(start, range->start);
+ idx = (addr - range->start) >> PAGE_SHIFT;
+ npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
+ memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
+ }
+ spin_unlock(&hmm->lock);
+
+ down_read(&hmm->mirrors_sem);
+ list_for_each_entry(mirror, &hmm->mirrors, list)
+ mirror->ops->sync_cpu_device_pagetables(mirror, action,
+ start, end);
+ up_read(&hmm->mirrors_sem);
+}
+
+static void hmm_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct hmm *hmm = mm->hmm;
+
+ VM_BUG_ON(!hmm);
+
+ atomic_inc(&hmm->sequence);
+}
+
+static void hmm_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct hmm *hmm = mm->hmm;
+
+ VM_BUG_ON(!hmm);
+
+ hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
+}
+
+static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
+ .invalidate_range_start = hmm_invalidate_range_start,
+ .invalidate_range_end = hmm_invalidate_range_end,
+};
+
+/*
+ * hmm_mirror_register() - register a mirror against an mm
+ *
+ * @mirror: new mirror struct to register
+ * @mm: mm to register against
+ *
+ * To start mirroring a process address space, the device driver must register
+ * an HMM mirror struct.
+ *
+ * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
+ */
+int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
+{
+ /* Sanity check */
+ if (!mm || !mirror || !mirror->ops)
+ return -EINVAL;
+
+ mirror->hmm = hmm_register(mm);
+ if (!mirror->hmm)
+ return -ENOMEM;
+
+ down_write(&mirror->hmm->mirrors_sem);
+ list_add(&mirror->list, &mirror->hmm->mirrors);
+ up_write(&mirror->hmm->mirrors_sem);
+
+ return 0;
+}
+EXPORT_SYMBOL(hmm_mirror_register);
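A minimal driver-side sketch of mirror registration, assuming the hmm_mirror_ops layout added by this patch in include/linux/hmm.h; all my_* names are hypothetical, not part of this patch.

#include <linux/hmm.h>
#include <linux/mm.h>

struct my_mirror {
	struct hmm_mirror mirror;
	/* driver state: handle to the device page table, etc. */
};

static void my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
					  enum hmm_update_type update,
					  unsigned long start,
					  unsigned long end)
{
	/*
	 * Recover the driver structure with container_of(mirror,
	 * struct my_mirror, mirror) and invalidate the device page
	 * table for the range [start, end).
	 */
}

static const struct hmm_mirror_ops my_mirror_ops = {
	.sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
};

static int my_mirror_attach(struct my_mirror *m, struct mm_struct *mm)
{
	int ret;

	m->mirror.ops = &my_mirror_ops;

	down_write(&mm->mmap_sem);	/* hmm_mirror_register() requires it */
	ret = hmm_mirror_register(&m->mirror, mm);
	up_write(&mm->mmap_sem);

	return ret;
}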
+
+/*
+ * hmm_mirror_unregister() - unregister a mirror
+ *
+ * @mirror: mirror struct to unregister
+ *
+ * Stop mirroring a process address space, and cleanup.
+ */
+void hmm_mirror_unregister(struct hmm_mirror *mirror)
+{
+ struct hmm *hmm = mirror->hmm;
+
+ down_write(&hmm->mirrors_sem);
+ list_del(&mirror->list);
+ up_write(&hmm->mirrors_sem);
+}
+EXPORT_SYMBOL(hmm_mirror_unregister);
+
+struct hmm_vma_walk {
+ struct hmm_range *range;
+ unsigned long last;
+ bool fault;
+ bool block;
+ bool write;
+};
+
+static int hmm_vma_do_fault(struct mm_walk *walk,
+ unsigned long addr,
+ hmm_pfn_t *pfn)
+{
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ int r;
+
+ flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
+ flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
+ r = handle_mm_fault(vma, addr, flags);
+ if (r & VM_FAULT_RETRY)
+ return -EBUSY;
+ if (r & VM_FAULT_ERROR) {
+ *pfn = HMM_PFN_ERROR;
+ return -EFAULT;
+ }
+
+ return -EAGAIN;
+}
+
+static void hmm_pfns_special(hmm_pfn_t *pfns,
+ unsigned long addr,
+ unsigned long end)
+{
+ for (; addr < end; addr += PAGE_SIZE, pfns++)
+ *pfns = HMM_PFN_SPECIAL;
+}
+
+static int hmm_pfns_bad(unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_range *range = walk->private;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long i;
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ for (; addr < end; addr += PAGE_SIZE, i++)
+ pfns[i] = HMM_PFN_ERROR;
+
+ return 0;
+}
+
+static void hmm_pfns_clear(hmm_pfn_t *pfns,
+ unsigned long addr,
+ unsigned long end)
+{
+ for (; addr < end; addr += PAGE_SIZE, pfns++)
+ *pfns = 0;
+}
+
+static int hmm_vma_walk_hole(unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long i;
+
+ hmm_vma_walk->last = addr;
+ i = (addr - range->start) >> PAGE_SHIFT;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ pfns[i] = HMM_PFN_EMPTY;
+ if (hmm_vma_walk->fault) {
+ int ret;
+
+ ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+ if (ret != -EAGAIN)
+ return ret;
+ }
+ }
+
+ return hmm_vma_walk->fault ? -EAGAIN : 0;
+}
+
+static int hmm_vma_walk_clear(unsigned long addr,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long i;
+
+ hmm_vma_walk->last = addr;
+ i = (addr - range->start) >> PAGE_SHIFT;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ pfns[i] = 0;
+ if (hmm_vma_walk->fault) {
+ int ret;
+
+ ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+ if (ret != -EAGAIN)
+ return ret;
+ }
+ }
+
+ return hmm_vma_walk->fault ? -EAGAIN : 0;
+}
+
+static int hmm_vma_walk_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ hmm_pfn_t *pfns = range->pfns;
+ unsigned long addr = start, i;
+ bool write_fault;
+ hmm_pfn_t flag;
+ pte_t *ptep;
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
+ write_fault = hmm_vma_walk->fault & hmm_vma_walk->write;
+
+again:
+ if (pmd_none(*pmdp))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB)
+ return hmm_pfns_bad(start, end, walk);
+
+ if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
+ unsigned long pfn;
+ pmd_t pmd;
+
+ /*
+ * No need to take the pmd_lock here; even if some other thread
+ * is splitting the huge pmd, we will get that event through the
+ * mmu_notifier callback.
+ *
+ * So just read the pmd value and check again that it is a
+ * transparent huge or device mapping one, and compute the
+ * corresponding pfn values.
+ */
+ pmd = pmd_read_atomic(pmdp);
+ barrier();
+ if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
+ goto again;
+ if (pmd_protnone(pmd))
+ return hmm_vma_walk_clear(start, end, walk);
+
+ if (write_fault && !pmd_write(pmd))
+ return hmm_vma_walk_clear(start, end, walk);
+
+ pfn = pmd_pfn(pmd) + pte_index(addr);
+ flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
+ pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
+ return 0;
+ }
+
+ if (pmd_bad(*pmdp))
+ return hmm_pfns_bad(start, end, walk);
+
+ ptep = pte_offset_map(pmdp, addr);
+ for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
+ pte_t pte = *ptep;
+
+ pfns[i] = 0;
+
+ if (pte_none(pte)) {
+ pfns[i] = HMM_PFN_EMPTY;
+ if (hmm_vma_walk->fault)
+ goto fault;
+ continue;
+ }
+
+ if (!pte_present(pte)) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (!non_swap_entry(entry)) {
+ if (hmm_vma_walk->fault)
+ goto fault;
+ continue;
+ }
+
+ /*
+ * This is a special swap entry: use device private entries,
+ * wait on or skip migration entries, and report anything else
+ * as an error.
+ */
+ if (is_device_private_entry(entry)) {
+ pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry));
+ if (is_write_device_private_entry(entry)) {
+ pfns[i] |= HMM_PFN_WRITE;
+ } else if (write_fault)
+ goto fault;
+ pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE;
+ pfns[i] |= flag;
+ } else if (is_migration_entry(entry)) {
+ if (hmm_vma_walk->fault) {
+ pte_unmap(ptep);
+ hmm_vma_walk->last = addr;
+ migration_entry_wait(vma->vm_mm,
+ pmdp, addr);
+ return -EAGAIN;
+ }
+ continue;
+ } else {
+ /* Report error for everything else */
+ pfns[i] = HMM_PFN_ERROR;
+ }
+ continue;
+ }
+
+ if (write_fault && !pte_write(pte))
+ goto fault;
+
+ pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
+ pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
+ continue;
+
+fault:
+ pte_unmap(ptep);
+ /* Fault all pages in range */
+ return hmm_vma_walk_clear(start, end, walk);
+ }
+ pte_unmap(ptep - 1);
+
+ return 0;
+}
+
+/*
+ * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
+ * @vma: virtual memory area containing the virtual address range
+ * @range: used to track snapshot validity
+ * @start: range virtual start address (inclusive)
+ * @end: range virtual end address (exclusive)
+ * @pfns: array of hmm_pfn_t, provided by the caller and filled in by this function
+ * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success
+ *
+ * This snapshots the CPU page table for a range of virtual addresses. Snapshot
+ * validity is tracked by range struct. See hmm_vma_range_done() for further
+ * information.
+ *
+ * The range struct is initialized here. It tracks the CPU page table, but only
+ * if the function returns success (0), in which case the caller must then call
+ * hmm_vma_range_done() to stop CPU page table update tracking on this range.
+ *
+ * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
+ * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
+ */
+int hmm_vma_get_pfns(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns)
+{
+ struct hmm_vma_walk hmm_vma_walk;
+ struct mm_walk mm_walk;
+ struct hmm *hmm;
+
+ /* FIXME support hugetlb fs */
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ hmm_pfns_special(pfns, start, end);
+ return -EINVAL;
+ }
+
+ /* Sanity check, this really should not happen ! */
+ if (start < vma->vm_start || start >= vma->vm_end)
+ return -EINVAL;
+ if (end < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+
+ hmm = hmm_register(vma->vm_mm);
+ if (!hmm)
+ return -ENOMEM;
+ /* Caller must have registered a mirror, via hmm_mirror_register() ! */
+ if (!hmm->mmu_notifier.ops)
+ return -EINVAL;
+
+ /* Initialize range to track CPU page table update */
+ range->start = start;
+ range->pfns = pfns;
+ range->end = end;
+ spin_lock(&hmm->lock);
+ range->valid = true;
+ list_add_rcu(&range->list, &hmm->ranges);
+ spin_unlock(&hmm->lock);
+
+ hmm_vma_walk.fault = false;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+
+ walk_page_range(start, end, &mm_walk);
+ return 0;
+}
+EXPORT_SYMBOL(hmm_vma_get_pfns);
+
+/*
+ * hmm_vma_range_done() - stop tracking change to CPU page table over a range
+ * @vma: virtual memory area containing the virtual address range
+ * @range: range being tracked
+ * Returns: false if range data has been invalidated, true otherwise
+ *
+ * Range struct is used to track updates to the CPU page table after a call to
+ * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
+ * using the data, or wants to lock updates to the data it got from those
+ * functions, it must call the hmm_vma_range_done() function, which will then
+ * stop tracking CPU page table updates.
+ *
+ * Note that the device driver must still implement general CPU page table
+ * update tracking, either by using hmm_mirror (see hmm_mirror_register())
+ * or by using the mmu_notifier API directly.
+ *
+ * CPU page table update tracking done through hmm_range is only temporary
+ * and is meant to be used while trying to duplicate CPU page table contents
+ * for a range of virtual addresses.
+ *
+ * There are two ways to use this:
+ * again:
+ * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
+ * trans = device_build_page_table_update_transaction(pfns);
+ * device_page_table_lock();
+ * if (!hmm_vma_range_done(vma, range)) {
+ * device_page_table_unlock();
+ * goto again;
+ * }
+ * device_commit_transaction(trans);
+ * device_page_table_unlock();
+ *
+ * Or:
+ * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...);
+ * device_page_table_lock();
+ * hmm_vma_range_done(vma, range);
+ * device_update_page_table(pfns);
+ * device_page_table_unlock();
+ */
+bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
+{
+ unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
+ struct hmm *hmm;
+
+ if (range->end <= range->start) {
+ BUG();
+ return false;
+ }
+
+ hmm = hmm_register(vma->vm_mm);
+ if (!hmm) {
+ memset(range->pfns, 0, sizeof(*range->pfns) * npages);
+ return false;
+ }
+
+ spin_lock(&hmm->lock);
+ list_del_rcu(&range->list);
+ spin_unlock(&hmm->lock);
+
+ return range->valid;
+}
+EXPORT_SYMBOL(hmm_vma_range_done);
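A compilable sketch of the first pattern above; my_mirror_snapshot and my_devpt_lock are hypothetical names, and the caller is assumed to hold mm->mmap_sem for read and to have registered an hmm_mirror.

#include <linux/hmm.h>
#include <linux/mm.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(my_devpt_lock);	/* hypothetical device page table lock */

static int my_mirror_snapshot(struct vm_area_struct *vma,
			      unsigned long start, unsigned long end)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	struct hmm_range range;
	hmm_pfn_t *pfns;
	int ret;

	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
	if (!pfns)
		return -ENOMEM;

again:
	ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
	if (ret)
		goto out;

	/* Translate pfns[] into a pending device page table update here. */

	mutex_lock(&my_devpt_lock);
	if (!hmm_vma_range_done(vma, &range)) {
		/* The CPU page table changed under us; snapshot again. */
		mutex_unlock(&my_devpt_lock);
		goto again;
	}
	/* Commit the pending update to the device page table here. */
	mutex_unlock(&my_devpt_lock);
out:
	kvfree(pfns);
	return ret;
}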
+
+/*
+ * hmm_vma_fault() - try to fault some address in a virtual address range
+ * @vma: virtual memory area containing the virtual address range
+ * @range: used to track pfns array content validity
+ * @start: fault range virtual start address (inclusive)
+ * @end: fault range virtual end address (exclusive)
+ * @pfns: array of hmm_pfn_t; only entries with the fault flag set will be faulted
+ * @write: is it a write fault
+ * @block: allow blocking on fault (if true it sleeps and does not drop mmap_sem)
+ * Returns: 0 on success, error otherwise (-EAGAIN means mmap_sem has been dropped)
+ *
+ * This is similar to a regular CPU page fault except that it will not trigger
+ * any memory migration if the memory being faulted is not accessible by CPUs.
+ *
+ * On error, for one virtual address in the range, the function will set the
+ * hmm_pfn_t error flag for the corresponding pfn entry.
+ *
+ * Expected use pattern:
+ * retry:
+ * down_read(&mm->mmap_sem);
+ * // Find vma and address device wants to fault, initialize hmm_pfn_t
+ * // array accordingly
+ * ret = hmm_vma_fault(vma, range, start, end, pfns, write, block);
+ * switch (ret) {
+ * case -EAGAIN:
+ * hmm_vma_range_done(vma, range);
+ * // You might want to rate limit or yield to play nicely; you may
+ * // also commit any valid pfn in the array, assuming that you are
+ * // getting true from hmm_vma_range_done()
+ * goto retry;
+ * case 0:
+ * break;
+ * default:
+ * // Handle error !
+ * up_read(&mm->mmap_sem)
+ * return;
+ * }
+ * // Take device driver lock that serialize device page table update
+ * driver_lock_device_page_table_update();
+ * hmm_vma_range_done(vma, range);
+ * // Commit pfns we got from hmm_vma_fault()
+ * driver_unlock_device_page_table_update();
+ * up_read(&mm->mmap_sem)
+ *
+ * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
+ * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
+ *
+ * YOU HAVE BEEN WARNED !
+ */
+int hmm_vma_fault(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns,
+ bool write,
+ bool block)
+{
+ struct hmm_vma_walk hmm_vma_walk;
+ struct mm_walk mm_walk;
+ struct hmm *hmm;
+ int ret;
+
+ /* Sanity check, this really should not happen ! */
+ if (start < vma->vm_start || start >= vma->vm_end)
+ return -EINVAL;
+ if (end < vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+
+ hmm = hmm_register(vma->vm_mm);
+ if (!hmm) {
+ hmm_pfns_clear(pfns, start, end);
+ return -ENOMEM;
+ }
+ /* Caller must have registered a mirror using hmm_mirror_register() */
+ if (!hmm->mmu_notifier.ops)
+ return -EINVAL;
+
+ /* Initialize range to track CPU page table update */
+ range->start = start;
+ range->pfns = pfns;
+ range->end = end;
+ spin_lock(&hmm->lock);
+ range->valid = true;
+ list_add_rcu(&range->list, &hmm->ranges);
+ spin_unlock(&hmm->lock);
+
+ /* FIXME support hugetlb fs */
+ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+ hmm_pfns_special(pfns, start, end);
+ return 0;
+ }
+
+ hmm_vma_walk.fault = true;
+ hmm_vma_walk.write = write;
+ hmm_vma_walk.block = block;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+ hmm_vma_walk.last = range->start;
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+
+ do {
+ ret = walk_page_range(start, end, &mm_walk);
+ start = hmm_vma_walk.last;
+ } while (ret == -EAGAIN);
+
+ if (ret) {
+ unsigned long i;
+
+ i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+ hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
+ hmm_vma_range_done(vma, range);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(hmm_vma_fault);
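Rendering the expected-use pattern above as real code might look like the sketch below; my_devpt_lock is the same hypothetical device page table lock as in the earlier sketch, and a real driver would also re-look-up the vma whenever the mmap_sem has been dropped.

#include <linux/hmm.h>
#include <linux/mm.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(my_devpt_lock);	/* hypothetical device page table lock */

static int my_mirror_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			   struct hmm_range *range, unsigned long start,
			   unsigned long end, hmm_pfn_t *pfns, bool write)
{
	int ret;

retry:
	down_read(&mm->mmap_sem);
	ret = hmm_vma_fault(vma, range, start, end, pfns, write, true);
	switch (ret) {
	case -EAGAIN:
		/*
		 * mmap_sem was dropped; as suggested above, consider rate
		 * limiting or yielding before trying again.
		 */
		goto retry;
	case 0:
		break;
	default:
		up_read(&mm->mmap_sem);
		return ret;
	}

	mutex_lock(&my_devpt_lock);
	hmm_vma_range_done(vma, range);
	/* Commit pfns[] to the device page table here. */
	mutex_unlock(&my_devpt_lock);
	up_read(&mm->mmap_sem);

	return 0;
}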
+#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
+
+
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
+struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *page;
+
+ page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
+ if (!page)
+ return NULL;
+ lock_page(page);
+ return page;
+}
+EXPORT_SYMBOL(hmm_vma_alloc_locked_page);
+
+
+static void hmm_devmem_ref_release(struct percpu_ref *ref)
+{
+ struct hmm_devmem *devmem;
+
+ devmem = container_of(ref, struct hmm_devmem, ref);
+ complete(&devmem->completion);
+}
+
+static void hmm_devmem_ref_exit(void *data)
+{
+ struct percpu_ref *ref = data;
+ struct hmm_devmem *devmem;
+
+ devmem = container_of(ref, struct hmm_devmem, ref);
+ percpu_ref_exit(ref);
+ devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
+}
+
+static void hmm_devmem_ref_kill(void *data)
+{
+ struct percpu_ref *ref = data;
+ struct hmm_devmem *devmem;
+
+ devmem = container_of(ref, struct hmm_devmem, ref);
+ percpu_ref_kill(ref);
+ wait_for_completion(&devmem->completion);
+ devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
+}
+
+static int hmm_devmem_fault(struct vm_area_struct *vma,
+ unsigned long addr,
+ const struct page *page,
+ unsigned int flags,
+ pmd_t *pmdp)
+{
+ struct hmm_devmem *devmem = page->pgmap->data;
+
+ return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
+}
+
+static void hmm_devmem_free(struct page *page, void *data)
+{
+ struct hmm_devmem *devmem = data;
+
+ devmem->ops->free(devmem, page);
+}
+
+static DEFINE_MUTEX(hmm_devmem_lock);
+static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
+
+static void hmm_devmem_radix_release(struct resource *resource)
+{
+ resource_size_t key, align_start, align_size, align_end;
+
+ align_start = resource->start & ~(PA_SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
+ align_end = align_start + align_size - 1;
+
+ mutex_lock(&hmm_devmem_lock);
+ for (key = resource->start;
+ key <= resource->end;
+ key += PA_SECTION_SIZE)
+ radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
+ mutex_unlock(&hmm_devmem_lock);
+}
+
+static void hmm_devmem_release(struct device *dev, void *data)
+{
+ struct hmm_devmem *devmem = data;
+ struct resource *resource = devmem->resource;
+ unsigned long start_pfn, npages;
+ struct zone *zone;
+ struct page *page;
+
+ if (percpu_ref_tryget_live(&devmem->ref)) {
+ dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
+ percpu_ref_put(&devmem->ref);
+ }
+
+ /* pages are dead and unused, undo the arch mapping */
+ start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
+ npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;
+
+ page = pfn_to_page(start_pfn);
+ zone = page_zone(page);
+
+ mem_hotplug_begin();
+ if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
+ __remove_pages(zone, start_pfn, npages);
+ else
+ arch_remove_memory(start_pfn << PAGE_SHIFT,
+ npages << PAGE_SHIFT);
+ mem_hotplug_done();
+
+ hmm_devmem_radix_release(resource);
+}
+
+static struct hmm_devmem *hmm_devmem_find(resource_size_t phys)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT);
+}
+
+static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
+{
+ resource_size_t key, align_start, align_size, align_end;
+ struct device *device = devmem->device;
+ int ret, nid, is_ram;
+ unsigned long pfn;
+
+ align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
+ align_size = ALIGN(devmem->resource->start +
+ resource_size(devmem->resource),
+ PA_SECTION_SIZE) - align_start;
+
+ is_ram = region_intersects(align_start, align_size,
+ IORESOURCE_SYSTEM_RAM,
+ IORES_DESC_NONE);
+ if (is_ram == REGION_MIXED) {
+ WARN_ONCE(1, "%s attempted on mixed region %pr\n",
+ __func__, devmem->resource);
+ return -ENXIO;
+ }
+ if (is_ram == REGION_INTERSECTS)
+ return -ENXIO;
+
+ if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
+ devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
+ else
+ devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+
+ devmem->pagemap.res = devmem->resource;
+ devmem->pagemap.page_fault = hmm_devmem_fault;
+ devmem->pagemap.page_free = hmm_devmem_free;
+ devmem->pagemap.dev = devmem->device;
+ devmem->pagemap.ref = &devmem->ref;
+ devmem->pagemap.data = devmem;
+
+ mutex_lock(&hmm_devmem_lock);
+ align_end = align_start + align_size - 1;
+ for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
+ struct hmm_devmem *dup;
+
+ rcu_read_lock();
+ dup = hmm_devmem_find(key);
+ rcu_read_unlock();
+ if (dup) {
+ dev_err(device, "%s: collides with mapping for %s\n",
+ __func__, dev_name(dup->device));
+ mutex_unlock(&hmm_devmem_lock);
+ ret = -EBUSY;
+ goto error;
+ }
+ ret = radix_tree_insert(&hmm_devmem_radix,
+ key >> PA_SECTION_SHIFT,
+ devmem);
+ if (ret) {
+ dev_err(device, "%s: failed: %d\n", __func__, ret);
+ mutex_unlock(&hmm_devmem_lock);
+ goto error_radix;
+ }
+ }
+ mutex_unlock(&hmm_devmem_lock);
+
+ nid = dev_to_node(device);
+ if (nid < 0)
+ nid = numa_mem_id();
+
+ mem_hotplug_begin();
+ /*
+ * For device private memory we call add_pages() as we only need to
+ * allocate and initialize struct page for the device memory. Moreover,
+ * the device memory is inaccessible and thus we do not want to create
+ * a linear mapping for the memory like arch_add_memory() would do.
+ *
+ * For device public memory, which is accessible by the CPU, we do
+ * want the linear mapping and thus use arch_add_memory().
+ */
+ if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
+ ret = arch_add_memory(nid, align_start, align_size, false);
+ else
+ ret = add_pages(nid, align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT, false);
+ if (ret) {
+ mem_hotplug_done();
+ goto error_add_memory;
+ }
+ move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT);
+ mem_hotplug_done();
+
+ for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ page->pgmap = &devmem->pagemap;
+ }
+ return 0;
+
+error_add_memory:
+ untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
+error_radix:
+ hmm_devmem_radix_release(devmem->resource);
+error:
+ return ret;
+}
+
+static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
+{
+ struct hmm_devmem *devmem = data;
+
+ return devmem->resource == match_data;
+}
+
+static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
+{
+ devres_release(devmem->device, &hmm_devmem_release,
+ &hmm_devmem_match, devmem->resource);
+}
+
+/*
+ * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
+ *
+ * @ops: memory event device driver callback (see struct hmm_devmem_ops)
+ * @device: device struct to bind the resource to
+ * @size: size in bytes of the device memory to add
+ * Returns: pointer to new hmm_devmem struct, ERR_PTR otherwise
+ *
+ * This function first finds an empty range of physical address big enough to
+ * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
+ * in turn allocates struct pages. It does not do anything beyond that; all
+ * events affecting the memory will go through the various callbacks provided
+ * by hmm_devmem_ops struct.
+ *
+ * The device driver should call this function during device initialization
+ * and is then responsible for memory management. HMM only provides helpers.
+ */
+struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ unsigned long size)
+{
+ struct hmm_devmem *devmem;
+ resource_size_t addr;
+ int ret;
+
+ static_branch_enable(&device_private_key);
+
+ devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
+ GFP_KERNEL, dev_to_node(device));
+ if (!devmem)
+ return ERR_PTR(-ENOMEM);
+
+ init_completion(&devmem->completion);
+ devmem->pfn_first = -1UL;
+ devmem->pfn_last = -1UL;
+ devmem->resource = NULL;
+ devmem->device = device;
+ devmem->ops = ops;
+
+ ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
+ 0, GFP_KERNEL);
+ if (ret)
+ goto error_percpu_ref;
+
+ ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+ if (ret)
+ goto error_devm_add_action;
+
+ size = ALIGN(size, PA_SECTION_SIZE);
+ addr = min((unsigned long)iomem_resource.end,
+ (1UL << MAX_PHYSMEM_BITS) - 1);
+ addr = addr - size + 1UL;
+
+ /*
+ * FIXME add a new helper to quickly walk resource tree and find free
+ * range
+ *
+ * FIXME what about ioport_resource resource ?
+ */
+ for (; addr > size && addr >= iomem_resource.start; addr -= size) {
+ ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
+ if (ret != REGION_DISJOINT)
+ continue;
+
+ devmem->resource = devm_request_mem_region(device, addr, size,
+ dev_name(device));
+ if (!devmem->resource) {
+ ret = -ENOMEM;
+ goto error_no_resource;
+ }
+ break;
+ }
+ if (!devmem->resource) {
+ ret = -ERANGE;
+ goto error_no_resource;
+ }
+
+ devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+ devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+ devmem->pfn_last = devmem->pfn_first +
+ (resource_size(devmem->resource) >> PAGE_SHIFT);
+
+ ret = hmm_devmem_pages_create(devmem);
+ if (ret)
+ goto error_pages;
+
+ devres_add(device, devmem);
+
+ ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
+ if (ret) {
+ hmm_devmem_remove(devmem);
+ return ERR_PTR(ret);
+ }
+
+ return devmem;
+
+error_pages:
+ devm_release_mem_region(device, devmem->resource->start,
+ resource_size(devmem->resource));
+error_no_resource:
+error_devm_add_action:
+ hmm_devmem_ref_kill(&devmem->ref);
+ hmm_devmem_ref_exit(&devmem->ref);
+error_percpu_ref:
+ devres_free(devmem);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(hmm_devmem_add);
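A minimal sketch of how a driver might wire this up; every my_* name and MY_DEVMEM_SIZE are hypothetical placeholders, not part of this patch.

#include <linux/hmm.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/device.h>

#define MY_DEVMEM_SIZE	(64UL << 20)	/* hypothetical: 64MB of device memory */

/* Hypothetical driver helpers. */
int my_migrate_to_ram(struct vm_area_struct *vma, unsigned long addr,
		      const struct page *page, pmd_t *pmdp);
void my_free_device_page(struct hmm_devmem *devmem, struct page *page);

static int my_devmem_fault(struct hmm_devmem *devmem,
			   struct vm_area_struct *vma,
			   unsigned long addr,
			   const struct page *page,
			   unsigned int flags,
			   pmd_t *pmdp)
{
	/* Migrate the data back to a system page, per the callback contract. */
	if (my_migrate_to_ram(vma, addr, page, pmdp))
		return VM_FAULT_SIGBUS;
	return 0;
}

static void my_devmem_free(struct hmm_devmem *devmem, struct page *page)
{
	/* Return the backing device memory to the driver's allocator. */
	my_free_device_page(devmem, page);
}

static const struct hmm_devmem_ops my_devmem_ops = {
	.free	= my_devmem_free,
	.fault	= my_devmem_fault,
};

static int my_probe(struct device *device)
{
	struct hmm_devmem *devmem;

	devmem = hmm_devmem_add(&my_devmem_ops, device, MY_DEVMEM_SIZE);
	if (IS_ERR(devmem))
		return PTR_ERR(devmem);

	/* Device pages span devmem->pfn_first to devmem->pfn_last - 1. */
	return 0;
}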
+
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+ struct device *device,
+ struct resource *res)
+{
+ struct hmm_devmem *devmem;
+ int ret;
+
+ if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
+ return ERR_PTR(-EINVAL);
+
+ static_branch_enable(&device_private_key);
+
+ devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
+ GFP_KERNEL, dev_to_node(device));
+ if (!devmem)
+ return ERR_PTR(-ENOMEM);
+
+ init_completion(&devmem->completion);
+ devmem->pfn_first = -1UL;
+ devmem->pfn_last = -1UL;
+ devmem->resource = res;
+ devmem->device = device;
+ devmem->ops = ops;
+
+ ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
+ 0, GFP_KERNEL);
+ if (ret)
+ goto error_percpu_ref;
+
+ ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+ if (ret)
+ goto error_devm_add_action;
+
+
+ devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+ devmem->pfn_last = devmem->pfn_first +
+ (resource_size(devmem->resource) >> PAGE_SHIFT);
+
+ ret = hmm_devmem_pages_create(devmem);
+ if (ret)
+ goto error_devm_add_action;
+
+ devres_add(device, devmem);
+
+ ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
+ if (ret) {
+ hmm_devmem_remove(devmem);
+ return ERR_PTR(ret);
+ }
+
+ return devmem;
+
+error_devm_add_action:
+ hmm_devmem_ref_kill(&devmem->ref);
+ hmm_devmem_ref_exit(&devmem->ref);
+error_percpu_ref:
+ devres_free(devmem);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(hmm_devmem_add_resource);
+
+/*
+ * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
+ *
+ * @devmem: hmm_devmem struct used to track and manage the ZONE_DEVICE memory
+ *
+ * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf
+ * of the device driver. It will free struct page and remove the resource that
+ * reserved the physical address range for this device memory.
+ */
+void hmm_devmem_remove(struct hmm_devmem *devmem)
+{
+ resource_size_t start, size;
+ struct device *device;
+ bool cdm = false;
+
+ if (!devmem)
+ return;
+
+ device = devmem->device;
+ start = devmem->resource->start;
+ size = resource_size(devmem->resource);
+
+ cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
+ hmm_devmem_ref_kill(&devmem->ref);
+ hmm_devmem_ref_exit(&devmem->ref);
+ hmm_devmem_pages_remove(devmem);
+
+ if (!cdm)
+ devm_release_mem_region(device, start, size);
+}
+EXPORT_SYMBOL(hmm_devmem_remove);
+
+/*
+ * A device driver that wants to handle the memory of multiple devices through
+ * a single fake device can use hmm_device to do so. This is purely a helper;
+ * it is not needed in order to use any HMM functionality.
+ */
+#define HMM_DEVICE_MAX 256
+
+static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
+static DEFINE_SPINLOCK(hmm_device_lock);
+static struct class *hmm_device_class;
+static dev_t hmm_device_devt;
+
+static void hmm_device_release(struct device *device)
+{
+ struct hmm_device *hmm_device;
+
+ hmm_device = container_of(device, struct hmm_device, device);
+ spin_lock(&hmm_device_lock);
+ clear_bit(hmm_device->minor, hmm_device_mask);
+ spin_unlock(&hmm_device_lock);
+
+ kfree(hmm_device);
+}
+
+struct hmm_device *hmm_device_new(void *drvdata)
+{
+ struct hmm_device *hmm_device;
+
+ hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
+ if (!hmm_device)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock(&hmm_device_lock);
+ hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
+ if (hmm_device->minor >= HMM_DEVICE_MAX) {
+ spin_unlock(&hmm_device_lock);
+ kfree(hmm_device);
+ return ERR_PTR(-EBUSY);
+ }
+ set_bit(hmm_device->minor, hmm_device_mask);
+ spin_unlock(&hmm_device_lock);
+
+ dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
+ hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
+ hmm_device->minor);
+ hmm_device->device.release = hmm_device_release;
+ dev_set_drvdata(&hmm_device->device, drvdata);
+ hmm_device->device.class = hmm_device_class;
+ device_initialize(&hmm_device->device);
+
+ return hmm_device;
+}
+EXPORT_SYMBOL(hmm_device_new);
+
+void hmm_device_put(struct hmm_device *hmm_device)
+{
+ put_device(&hmm_device->device);
+}
+EXPORT_SYMBOL(hmm_device_put);
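A short sketch of the intended use, with hypothetical names:

#include <linux/hmm.h>
#include <linux/err.h>

static struct hmm_device *my_hmm_device;	/* hypothetical driver state */

static int my_driver_init(void)
{
	my_hmm_device = hmm_device_new(NULL);
	if (IS_ERR(my_hmm_device))
		return PTR_ERR(my_hmm_device);

	/*
	 * Use &my_hmm_device->device as the struct device owning the
	 * ZONE_DEVICE ranges of every physical device this driver
	 * manages, e.g. by passing it to hmm_devmem_add().
	 */
	return 0;
}

static void my_driver_exit(void)
{
	hmm_device_put(my_hmm_device);
}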
+
+static int __init hmm_init(void)
+{
+ int ret;
+
+ ret = alloc_chrdev_region(&hmm_device_devt, 0,
+ HMM_DEVICE_MAX,
+ "hmm_device");
+ if (ret)
+ return ret;
+
+ hmm_device_class = class_create(THIS_MODULE, "hmm_device");
+ if (IS_ERR(hmm_device_class)) {
+ unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
+ return PTR_ERR(hmm_device_class);
+ }
+ return 0;
+}
+
+device_initcall(hmm_init);
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
diff --git a/mm/madvise.c b/mm/madvise.c
index 6c5bfab49826..d676b7f1489e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -342,7 +342,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
}
- page = vm_normal_page(vma, addr, ptent);
+ page = _vm_normal_page(vma, addr, ptent, true);
if (!page)
continue;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 72d4cd82f3a6..0ab95f5a823a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4417,12 +4417,13 @@ enum mc_target_type {
MC_TARGET_NONE = 0,
MC_TARGET_PAGE,
MC_TARGET_SWAP,
+ MC_TARGET_DEVICE,
};
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
- struct page *page = vm_normal_page(vma, addr, ptent);
+ struct page *page = _vm_normal_page(vma, addr, ptent, true);
if (!page || !page_mapped(page))
return NULL;
@@ -4439,7 +4440,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
return page;
}
-#ifdef CONFIG_SWAP
+#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
pte_t ptent, swp_entry_t *entry)
{
@@ -4448,6 +4449,23 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
return NULL;
+
+ /*
+ * Handle MEMORY_DEVICE_PRIVATE pages, which are ZONE_DEVICE pages belonging
+ * to a device; because they are not accessible by the CPU they are stored
+ * as special swap entries in the CPU page table.
+ */
+ if (is_device_private_entry(ent)) {
+ page = device_private_entry_to_page(ent);
+ /*
+ * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
+ * a refcount of 1 when free (unlike a normal page)
+ */
+ if (!page_ref_add_unless(page, 1, 1))
+ return NULL;
+ return page;
+ }
+
/*
* Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
@@ -4608,6 +4626,13 @@ out:
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is MEMORY_DEVICE_PUBLIC
+ * or MEMORY_DEVICE_PRIVATE (so a ZONE_DEVICE page and thus not on the lru).
+ * For now such a page is charged like a regular page would be, as for all
+ * intents and purposes it is just special memory taking the place of a
+ * regular page.
+ *
+ * See Documentation/vm/hmm.txt and include/linux/hmm.h
*
* Called with pte lock held.
*/
@@ -4636,6 +4661,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
+ if (is_device_private_page(page) ||
+ is_device_public_page(page))
+ ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
}
@@ -4695,6 +4723,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
+ /*
+ * Note there cannot be MC_TARGET_DEVICE for now, as we do not
+ * support transparent huge pages with MEMORY_DEVICE_PUBLIC or
+ * MEMORY_DEVICE_PRIVATE, but this might change.
+ */
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
spin_unlock(ptl);
@@ -4910,6 +4943,14 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
putback_lru_page(page);
}
put_page(page);
+ } else if (target_type == MC_TARGET_DEVICE) {
+ page = target.page;
+ if (!mem_cgroup_move_account(page, true,
+ mc.from, mc.to)) {
+ mc.precharge -= HPAGE_PMD_NR;
+ mc.moved_charge += HPAGE_PMD_NR;
+ }
+ put_page(page);
}
spin_unlock(ptl);
return 0;
@@ -4921,12 +4962,16 @@ retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
+ bool device = false;
swp_entry_t ent;
if (!mc.precharge)
break;
switch (get_mctgt_type(vma, addr, ptent, &target)) {
+ case MC_TARGET_DEVICE:
+ device = true;
+ /* fall through */
case MC_TARGET_PAGE:
page = target.page;
/*
@@ -4937,7 +4982,7 @@ retry:
*/
if (PageTransCompound(page))
goto put;
- if (isolate_lru_page(page))
+ if (!device && isolate_lru_page(page))
goto put;
if (!mem_cgroup_move_account(page, false,
mc.from, mc.to)) {
@@ -4945,7 +4990,8 @@ retry:
/* we uncharge from mc.from later. */
mc.moved_charge++;
}
- putback_lru_page(page);
+ if (!device)
+ putback_lru_page(page);
put: /* get_mctgt_type() gets the page */
put_page(page);
break;
@@ -5510,48 +5556,102 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
cancel_charge(memcg, nr_pages);
}
-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
- unsigned long nr_anon, unsigned long nr_file,
- unsigned long nr_kmem, unsigned long nr_huge,
- unsigned long nr_shmem, struct page *dummy_page)
+struct uncharge_gather {
+ struct mem_cgroup *memcg;
+ unsigned long pgpgout;
+ unsigned long nr_anon;
+ unsigned long nr_file;
+ unsigned long nr_kmem;
+ unsigned long nr_huge;
+ unsigned long nr_shmem;
+ struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
+{
+ memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
{
- unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
+ unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
unsigned long flags;
- if (!mem_cgroup_is_root(memcg)) {
- page_counter_uncharge(&memcg->memory, nr_pages);
+ if (!mem_cgroup_is_root(ug->memcg)) {
+ page_counter_uncharge(&ug->memcg->memory, nr_pages);
if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
- page_counter_uncharge(&memcg->kmem, nr_kmem);
- memcg_oom_recover(memcg);
+ page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+ page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+ memcg_oom_recover(ug->memcg);
}
local_irq_save(flags);
- __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
- __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
- __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
- __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
- __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
- __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
- memcg_check_events(memcg, dummy_page);
+ __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
+ __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
+ __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
+ __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
+ __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
+ __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+ memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
- if (!mem_cgroup_is_root(memcg))
- css_put_many(&memcg->css, nr_pages);
+ if (!mem_cgroup_is_root(ug->memcg))
+ css_put_many(&ug->memcg->css, nr_pages);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+
+ if (!page->mem_cgroup)
+ return;
+
+ /*
+ * Nobody should be changing or seriously looking at
+ * page->mem_cgroup at this point, we have fully
+ * exclusive access to the page.
+ */
+
+ if (ug->memcg != page->mem_cgroup) {
+ if (ug->memcg) {
+ uncharge_batch(ug);
+ uncharge_gather_clear(ug);
+ }
+ ug->memcg = page->mem_cgroup;
+ }
+
+ if (!PageKmemcg(page)) {
+ unsigned int nr_pages = 1;
+
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ ug->nr_huge += nr_pages;
+ }
+ if (PageAnon(page))
+ ug->nr_anon += nr_pages;
+ else {
+ ug->nr_file += nr_pages;
+ if (PageSwapBacked(page))
+ ug->nr_shmem += nr_pages;
+ }
+ ug->pgpgout++;
+ } else {
+ ug->nr_kmem += 1 << compound_order(page);
+ __ClearPageKmemcg(page);
+ }
+
+ ug->dummy_page = page;
+ page->mem_cgroup = NULL;
}
static void uncharge_list(struct list_head *page_list)
{
- struct mem_cgroup *memcg = NULL;
- unsigned long nr_shmem = 0;
- unsigned long nr_anon = 0;
- unsigned long nr_file = 0;
- unsigned long nr_huge = 0;
- unsigned long nr_kmem = 0;
- unsigned long pgpgout = 0;
+ struct uncharge_gather ug;
struct list_head *next;
- struct page *page;
+
+ uncharge_gather_clear(&ug);
/*
* Note that the list can be a single page->lru; hence the
@@ -5559,57 +5659,16 @@ static void uncharge_list(struct list_head *page_list)
*/
next = page_list->next;
do {
+ struct page *page;
+
page = list_entry(next, struct page, lru);
next = page->lru.next;
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
-
- if (!page->mem_cgroup)
- continue;
-
- /*
- * Nobody should be changing or seriously looking at
- * page->mem_cgroup at this point, we have fully
- * exclusive access to the page.
- */
-
- if (memcg != page->mem_cgroup) {
- if (memcg) {
- uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_kmem, nr_huge, nr_shmem, page);
- pgpgout = nr_anon = nr_file = nr_kmem = 0;
- nr_huge = nr_shmem = 0;
- }
- memcg = page->mem_cgroup;
- }
-
- if (!PageKmemcg(page)) {
- unsigned int nr_pages = 1;
-
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- nr_huge += nr_pages;
- }
- if (PageAnon(page))
- nr_anon += nr_pages;
- else {
- nr_file += nr_pages;
- if (PageSwapBacked(page))
- nr_shmem += nr_pages;
- }
- pgpgout++;
- } else {
- nr_kmem += 1 << compound_order(page);
- __ClearPageKmemcg(page);
- }
-
- page->mem_cgroup = NULL;
+ uncharge_page(page, &ug);
} while (next != page_list);
- if (memcg)
- uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_kmem, nr_huge, nr_shmem, page);
+ if (ug.memcg)
+ uncharge_batch(&ug);
}
/**
@@ -5621,6 +5680,8 @@ static void uncharge_list(struct list_head *page_list)
*/
void mem_cgroup_uncharge(struct page *page)
{
+ struct uncharge_gather ug;
+
if (mem_cgroup_disabled())
return;
@@ -5628,8 +5689,9 @@ void mem_cgroup_uncharge(struct page *page)
if (!page->mem_cgroup)
return;
- INIT_LIST_HEAD(&page->lru);
- uncharge_list(&page->lru);
+ uncharge_gather_clear(&ug);
+ uncharge_page(page, &ug);
+ uncharge_batch(&ug);
}
/**
diff --git a/mm/memory.c b/mm/memory.c
index f0ea21eae3b8..eff55090b176 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -49,6 +49,7 @@
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/memremap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@@ -817,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
#else
# define HAVE_PTE_SPECIAL 0
#endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, bool with_public_device)
{
unsigned long pfn = pte_pfn(pte);
@@ -829,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return vma->vm_ops->find_special_page(vma, addr);
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
- if (!is_zero_pfn(pfn))
- print_bad_pte(vma, addr, pte, NULL);
+ if (is_zero_pfn(pfn))
+ return NULL;
+
+ /*
+ * Device public pages are special pages (they are ZONE_DEVICE
+ * pages but different from persistent memory). They behave
+ * almost like normal pages. The difference is that they are
+ * not on the lru and thus should never be involved with
+ * anything that involves lru manipulation (mlock, numa
+ * balancing, ...).
+ *
+ * This is why we still want to return NULL for such pages from
+ * vm_normal_page(), so that we do not have to special case every
+ * call site of vm_normal_page().
+ */
+ if (likely(pfn <= highest_memmap_pfn)) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (is_device_public_page(page)) {
+ if (with_public_device)
+ return page;
+ return NULL;
+ }
+ }
+ print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
@@ -956,6 +980,35 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = pte_swp_mksoft_dirty(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
+ } else if (is_device_private_entry(entry)) {
+ page = device_private_entry_to_page(entry);
+
+ /*
+ * Update the rss count even for unaddressable pages, as
+ * they should be treated just like normal pages in this
+ * respect.
+ *
+ * We will likely want to have some new rss counters
+ * for unaddressable pages, at some point. But for now
+ * keep things as they are.
+ */
+ get_page(page);
+ rss[mm_counter(page)]++;
+ page_dup_rmap(page, false);
+
+ /*
+ * We do not preserve soft-dirty information, because so
+ * far, checkpoint/restore is the only feature that
+ * requires that. And checkpoint/restore does not work
+ * when a device driver is involved (you cannot easily
+ * save and restore device driver state).
+ */
+ if (is_write_device_private_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ make_device_private_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
}
goto out_set_pte;
}
@@ -982,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
+ } else if (pte_devmap(pte)) {
+ page = pte_page(pte);
+
+ /*
+ * Cache coherent device memory behaves like a regular page and
+ * not like a persistent memory page. For more information see
+ * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
+ */
+ if (is_device_public_page(page)) {
+ get_page(page);
+ page_dup_rmap(page, false);
+ rss[mm_counter(page)]++;
+ }
}
out_set_pte:
@@ -1236,7 +1302,7 @@ again:
if (pte_present(ptent)) {
struct page *page;
- page = vm_normal_page(vma, addr, ptent);
+ page = _vm_normal_page(vma, addr, ptent, true);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
@@ -1273,6 +1339,29 @@ again:
}
continue;
}
+
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+ struct page *page = device_private_entry_to_page(entry);
+
+ if (unlikely(details && details->check_mapping)) {
+ /*
+ * unmap_shared_mapping_pages() wants to
+ * invalidate cache without truncating:
+ * unmap shared but keep private pages.
+ */
+ if (details->check_mapping !=
+ page_rmapping(page))
+ continue;
+ }
+
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ rss[mm_counter(page)]--;
+ page_remove_rmap(page, false);
+ put_page(page);
+ continue;
+ }
+
/* If details->check_mapping, we leave swap entries. */
if (unlikely(details))
continue;
@@ -2756,6 +2845,14 @@ int do_swap_page(struct vm_fault *vmf)
if (is_migration_entry(entry)) {
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
+ } else if (is_device_private_entry(entry)) {
+ /*
+ * For unaddressable device memory we call the pgmap
+ * fault handler callback. The callback must migrate
+ * the page back to some CPU-accessible page.
+ */
+ ret = device_private_entry_fault(vma, vmf->address, entry,
+ vmf->flags, vmf->pmd);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index db7d18237720..6a4ca07fef50 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -156,7 +156,7 @@ void mem_hotplug_done(void)
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
- struct resource *res;
+ struct resource *res, *conflict;
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
if (!res)
return ERR_PTR(-ENOMEM);
@@ -165,7 +165,13 @@ static struct resource *register_memory_resource(u64 start, u64 size)
res->start = start;
res->end = start + size - 1;
res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- if (request_resource(&iomem_resource, res) < 0) {
+ conflict = request_resource_conflict(&iomem_resource, res);
+ if (conflict) {
+ if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
+ pr_debug("Device unaddressable memory block "
+ "memory hotplug at %#010llx !\n",
+ (unsigned long long)start);
+ }
pr_debug("System RAM resource %pR cannot be added\n", res);
kfree(res);
return ERR_PTR(-EEXIST);
diff --git a/mm/migrate.c b/mm/migrate.c
index 9bb22a4a5132..bf8d9db006e1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,6 +36,9 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
+#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
@@ -228,13 +231,23 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
+ if (unlikely(is_zone_device_page(new))) {
+ if (is_device_private_page(new)) {
+ entry = make_device_private_entry(new, pte_write(pte));
+ pte = swp_entry_to_pte(entry);
+ } else if (is_device_public_page(new)) {
+ pte = pte_mkdevmap(pte);
+ flush_dcache_page(new);
+ }
+ } else
+ flush_dcache_page(new);
+
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
pte = arch_make_huge_pte(pte, vma, new, 0);
}
#endif
- flush_dcache_page(new);
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
if (PageHuge(new)) {
@@ -397,6 +410,13 @@ int migrate_page_move_mapping(struct address_space *mapping,
int expected_count = 1 + extra_count;
void **pslot;
+ /*
+ * Device public or private pages have an extra refcount as they are
+ * ZONE_DEVICE pages.
+ */
+ expected_count += is_device_private_page(page);
+ expected_count += is_device_public_page(page);
+
if (!mapping) {
/* Anonymous page without mapping */
if (page_count(page) != expected_count)
@@ -603,15 +623,10 @@ static void copy_huge_page(struct page *dst, struct page *src)
/*
* Copy the page to its new location
*/
-void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_states(struct page *newpage, struct page *page)
{
int cpupid;
- if (PageHuge(page) || PageTransHuge(page))
- copy_huge_page(newpage, page);
- else
- copy_highpage(newpage, page);
-
if (PageError(page))
SetPageError(newpage);
if (PageReferenced(page))
@@ -665,6 +680,17 @@ void migrate_page_copy(struct page *newpage, struct page *page)
mem_cgroup_migrate(page, newpage);
}
+EXPORT_SYMBOL(migrate_page_states);
+
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+ if (PageHuge(page) || PageTransHuge(page))
+ copy_huge_page(newpage, page);
+ else
+ copy_highpage(newpage, page);
+
+ migrate_page_states(newpage, page);
+}
EXPORT_SYMBOL(migrate_page_copy);
/************************************************************
@@ -690,7 +716,10 @@ int migrate_page(struct address_space *mapping,
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);
@@ -740,12 +769,15 @@ int buffer_migrate_page(struct address_space *mapping,
SetPagePrivate(newpage);
- migrate_page_copy(newpage, page);
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ migrate_page_copy(newpage, page);
+ else
+ migrate_page_states(newpage, page);
bh = head;
do {
unlock_buffer(bh);
- put_bh(bh);
+ put_bh(bh);
bh = bh->b_this_page;
} while (bh != head);
@@ -804,8 +836,13 @@ static int fallback_migrate_page(struct address_space *mapping,
{
if (PageDirty(page)) {
/* Only writeback pages in full synchronous migration */
- if (mode != MIGRATE_SYNC)
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
return -EBUSY;
+ }
return writeout(mapping, page);
}
@@ -942,7 +979,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* the retry loop is too short and in the sync-light case,
* the overhead of stalling is too much
*/
- if (mode != MIGRATE_SYNC) {
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
rc = -EBUSY;
goto out_unlock;
}
@@ -1212,8 +1253,15 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
return -ENOMEM;
if (!trylock_page(hpage)) {
- if (!force || mode != MIGRATE_SYNC)
+ if (!force)
+ goto out;
+ switch (mode) {
+ case MIGRATE_SYNC:
+ case MIGRATE_SYNC_NO_COPY:
+ break;
+ default:
goto out;
+ }
lock_page(hpage);
}
@@ -2037,3 +2085,860 @@ out_unlock:
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_NUMA */
+
+#if defined(CONFIG_MIGRATE_VMA_HELPER)
+struct migrate_vma {
+ struct vm_area_struct *vma;
+ unsigned long *dst;
+ unsigned long *src;
+ unsigned long cpages;
+ unsigned long npages;
+ unsigned long start;
+ unsigned long end;
+};
+
+static int migrate_vma_collect_hole(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ migrate->cpages++;
+ }
+
+ return 0;
+}
+
+static int migrate_vma_collect_skip(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = 0;
+ }
+
+ return 0;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+again:
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end, walk);
+
+ if (pmd_trans_huge(*pmdp)) {
+ struct page *page;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (unlikely(!pmd_trans_huge(*pmdp))) {
+ spin_unlock(ptl);
+ goto again;
+ }
+
+ page = pmd_page(*pmdp);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmdp, addr);
+ if (pmd_trans_unstable(pmdp))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ } else {
+ int ret;
+
+ get_page(page);
+ spin_unlock(ptl);
+ if (unlikely(!trylock_page(page)))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end,
+ walk);
+ }
+ }
+
+ if (unlikely(pmd_bad(*pmdp)))
+ return migrate_vma_collect_skip(start, end, walk);
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+
+ for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ unsigned long mpfn, pfn;
+ struct page *page;
+ swp_entry_t entry;
+ pte_t pte;
+
+ pte = *ptep;
+ pfn = pte_pfn(pte);
+
+ if (pte_none(pte)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ pfn = 0;
+ goto next;
+ }
+
+ if (!pte_present(pte)) {
+ mpfn = pfn = 0;
+
+ /*
+ * We only care about the special page table entries of
+ * unaddressable device pages. Other special swap entries are
+ * not migratable, and regular swapped pages are ignored.
+ */
+ entry = pte_to_swp_entry(pte);
+ if (!is_device_private_entry(entry))
+ goto next;
+
+ page = device_private_entry_to_page(entry);
+ mpfn = migrate_pfn(page_to_pfn(page))|
+ MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+ if (is_write_device_private_entry(entry))
+ mpfn |= MIGRATE_PFN_WRITE;
+ } else {
+ if (is_zero_pfn(pfn)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ pfn = 0;
+ goto next;
+ }
+ page = _vm_normal_page(migrate->vma, addr, pte, true);
+ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+ }
+
+ /* FIXME support THP */
+ if (!page || !page->mapping || PageTransCompound(page)) {
+ mpfn = pfn = 0;
+ goto next;
+ }
+ pfn = page_to_pfn(page);
+
+ /*
+ * By getting a reference on the page we pin it, and that blocks
+ * any kind of migration. A side effect is that it "freezes" the
+ * pte.
+ *
+ * We drop this reference after isolating the page from the lru
+ * for non-device pages (device pages are not on the lru and thus
+ * cannot be dropped from it).
+ */
+ get_page(page);
+ migrate->cpages++;
+
+ /*
+ * Optimize for the common case where page is only mapped once
+ * in one process. If we can lock the page, then we can safely
+ * set up a special migration page table entry now.
+ */
+ if (trylock_page(page)) {
+ pte_t swp_pte;
+
+ mpfn |= MIGRATE_PFN_LOCKED;
+ ptep_get_and_clear(mm, addr, ptep);
+
+ /* Setup special migration page table entry */
+ entry = make_migration_entry(page, pte_write(pte));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, addr, ptep, swp_pte);
+
+ /*
+ * This is like regular unmap: we remove the rmap and
+ * drop page refcount. Page won't be freed, as we took
+ * a reference just above.
+ */
+ page_remove_rmap(page, false);
+ put_page(page);
+
+ if (pte_present(pte))
+ unmapped++;
+ }
+
+next:
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = mpfn;
+ }
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
+ /* Only flush the TLB if we actually modified any entries */
+ if (unmapped)
+ flush_tlb_range(walk->vma, start, end);
+
+ return 0;
+}
+
+/*
+ * migrate_vma_collect() - collect pages over a range of virtual addresses
+ * @migrate: migrate struct containing all migration information
+ *
+ * This will walk the CPU page table. For each virtual address backed by a
+ * valid page, it updates the src array and takes a reference on the page, in
+ * order to pin the page until we lock it and unmap it.
+ */
+static void migrate_vma_collect(struct migrate_vma *migrate)
+{
+ struct mm_walk mm_walk;
+
+ mm_walk.pmd_entry = migrate_vma_collect_pmd;
+ mm_walk.pte_entry = NULL;
+ mm_walk.pte_hole = migrate_vma_collect_hole;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.vma = migrate->vma;
+ mm_walk.mm = migrate->vma->vm_mm;
+ mm_walk.private = migrate;
+
+ mmu_notifier_invalidate_range_start(mm_walk.mm,
+ migrate->start,
+ migrate->end);
+ walk_page_range(migrate->start, migrate->end, &mm_walk);
+ mmu_notifier_invalidate_range_end(mm_walk.mm,
+ migrate->start,
+ migrate->end);
+
+ migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
+}
+
+/*
+ * migrate_vma_check_page() - check if page is pinned or not
+ * @page: struct page to check
+ *
+ * Pinned pages cannot be migrated. This is the same test as in
+ * migrate_page_move_mapping(), except that here we allow migration of a
+ * ZONE_DEVICE page.
+ */
+static bool migrate_vma_check_page(struct page *page)
+{
+ /*
+ * One extra ref because caller holds an extra reference, either from
+ * isolate_lru_page() for a regular page, or migrate_vma_collect() for
+ * a device page.
+ */
+ int extra = 1;
+
+ /*
+ * FIXME: support THP (transparent huge pages). They are a bit more
+ * complex to check than regular pages, because they can be mapped
+ * with a pmd or with a pte (split pte mapping).
+ */
+ if (PageCompound(page))
+ return false;
+
+ /* Pages from ZONE_DEVICE have one extra reference */
+ if (is_zone_device_page(page)) {
+ /*
+ * Private pages can never be pinned, as they have no valid pte and
+ * GUP will fail for them. Yet if there is a pending migration, a
+ * thread might try to wait on the pte migration entry and will bump
+ * the page reference count. Sadly there is no way to differentiate a
+ * regular pin from a migration wait. Hence, to avoid two racing
+ * threads (each trying to migrate back to the CPU) entering an
+ * infinite loop, one stopping migration because the other is waiting
+ * on the pte migration entry, we always return true here.
+ *
+ * FIXME: the proper solution is to rework migration_entry_wait() so
+ * it does not need to take a reference on the page.
+ */
+ if (is_device_private_page(page))
+ return true;
+
+ /*
+ * Only allow device public pages to be migrated, and account for
+ * the extra reference count implied by ZONE_DEVICE pages.
+ */
+ if (!is_device_public_page(page))
+ return false;
+ extra++;
+ }
+
+ /* For file-backed pages */
+ if (page_mapping(page))
+ extra += 1 + page_has_private(page);
+
+ if ((page_count(page) - extra) > page_mapcount(page))
+ return false;
+
+ return true;
+}
+
+/*
+ * migrate_vma_prepare() - lock pages and isolate them from the lru
+ * @migrate: migrate struct containing all migration information
+ *
+ * This locks pages that have been collected by migrate_vma_collect(). Once each
+ * page is locked it is isolated from the lru (for non-device pages). Finally,
+ * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
+ * migrated by concurrent kernel threads.
+ */
+static void migrate_vma_prepare(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ unsigned long addr, i, restore = 0;
+ bool allow_drain = true;
+
+ lru_add_drain();
+
+ for (i = 0; (i < npages) && migrate->cpages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ bool remap = true;
+
+ if (!page)
+ continue;
+
+ if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
+ /*
+ * Because we are migrating several pages, there can be a
+ * deadlock between two concurrent migrations, each waiting on
+ * the other's page locks.
+ *
+ * Make migrate_vma() a best-effort thing and back off for any
+ * page we cannot lock right away.
+ */
+ if (!trylock_page(page)) {
+ migrate->src[i] = 0;
+ migrate->cpages--;
+ put_page(page);
+ continue;
+ }
+ remap = false;
+ migrate->src[i] |= MIGRATE_PFN_LOCKED;
+ }
+
+ /* ZONE_DEVICE pages are not on LRU */
+ if (!is_zone_device_page(page)) {
+ if (!PageLRU(page) && allow_drain) {
+ /* Drain CPU's pagevec */
+ lru_add_drain_all();
+ allow_drain = false;
+ }
+
+ if (isolate_lru_page(page)) {
+ if (remap) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ } else {
+ migrate->src[i] = 0;
+ unlock_page(page);
+ migrate->cpages--;
+ put_page(page);
+ }
+ continue;
+ }
+
+ /* Drop the reference we took in collect */
+ put_page(page);
+ }
+
+ if (!migrate_vma_check_page(page)) {
+ if (remap) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+
+ if (!is_zone_device_page(page)) {
+ get_page(page);
+ putback_lru_page(page);
+ }
+ } else {
+ migrate->src[i] = 0;
+ unlock_page(page);
+ migrate->cpages--;
+
+ if (!is_zone_device_page(page))
+ putback_lru_page(page);
+ else
+ put_page(page);
+ }
+ }
+ }
+
+ for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ remove_migration_pte(page, migrate->vma, addr, page);
+
+ migrate->src[i] = 0;
+ unlock_page(page);
+ put_page(page);
+ restore--;
+ }
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Replace page mapping (CPU page table pte) with a special migration pte entry
+ * and check again if it has been pinned. Pinned pages are restored because we
+ * cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy the contents of the original page over to the
+ * new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ unsigned long addr, i, restore = 0;
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ if (page_mapped(page)) {
+ try_to_unmap(page, flags);
+ if (page_mapped(page))
+ goto restore;
+ }
+
+ if (migrate_vma_check_page(page))
+ continue;
+
+restore:
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ }
+
+ for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ remove_migration_ptes(page, page, false);
+
+ migrate->src[i] = 0;
+ unlock_page(page);
+ restore--;
+
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
+ }
+}
+
+static void migrate_vma_insert_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ unsigned long *dst)
+{
+ struct vm_area_struct *vma = migrate->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mem_cgroup *memcg;
+ bool flush = false;
+ spinlock_t *ptl;
+ pte_t entry;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ /* Only allow populating anonymous memory */
+ if (!vma_is_anonymous(vma))
+ goto abort;
+
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ goto abort;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ goto abort;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ goto abort;
+
+ if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
+ goto abort;
+
+ /*
+ * Use pte_alloc() instead of pte_alloc_map(). We can't run
+ * pte_offset_map() on pmds where a huge pmd might be created
+ * from a different thread.
+ *
+ * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+ * parallel threads are excluded by other means.
+ *
+ * Here we only have down_read(mmap_sem).
+ */
+ if (pte_alloc(mm, pmdp, addr))
+ goto abort;
+
+ /* See the comment in pte_alloc_one_map() */
+ if (unlikely(pmd_trans_unstable(pmdp)))
+ goto abort;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ goto abort;
+ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ goto abort;
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ if (is_zone_device_page(page)) {
+ if (is_device_private_page(page)) {
+ swp_entry_t swp_entry;
+
+ swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
+ entry = swp_entry_to_pte(swp_entry);
+ } else if (is_device_public_page(page)) {
+ entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ entry = pte_mkdevmap(entry);
+ }
+ } else {
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ }
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+
+ if (pte_present(*ptep)) {
+ unsigned long pfn = pte_pfn(*ptep);
+
+ if (!is_zero_pfn(pfn)) {
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+ flush = true;
+ } else if (!pte_none(*ptep)) {
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+
+ /*
+ * Check for userfaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto abort;
+ }
+
+ inc_mm_counter(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
+ if (!is_zone_device_page(page))
+ lru_cache_add_active_or_unevictable(page, vma);
+ get_page(page);
+
+ if (flush) {
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush_notify(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ } else {
+ /* No need to invalidate - it was non-present before */
+ set_pte_at(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ }
+
+ pte_unmap_unlock(ptep, ptl);
+ *src = MIGRATE_PFN_MIGRATE;
+ return;
+
+abort:
+ *src &= ~MIGRATE_PFN_MIGRATE;
+}
+
+/*
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
+ * @migrate: migrate struct containing all migration information
+ *
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+static void migrate_vma_pages(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ struct vm_area_struct *vma = migrate->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr, i, mmu_start;
+ bool notified = false;
+
+ for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct address_space *mapping;
+ int r;
+
+ if (!newpage) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+
+ if (!page) {
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
+ continue;
+ }
+ if (!notified) {
+ mmu_start = addr;
+ notified = true;
+ mmu_notifier_invalidate_range_start(mm,
+ mmu_start,
+ migrate->end);
+ }
+ migrate_vma_insert_page(migrate, addr, newpage,
+ &migrate->src[i],
+ &migrate->dst[i]);
+ continue;
+ }
+
+ mapping = page_mapping(page);
+
+ if (is_zone_device_page(newpage)) {
+ if (is_device_private_page(newpage)) {
+ /*
+ * For now, only private anonymous memory is supported when
+ * migrating to unaddressable device memory.
+ */
+ if (mapping) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ } else if (!is_device_public_page(newpage)) {
+ /*
+ * Other types of ZONE_DEVICE page are not
+ * supported.
+ */
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ }
+
+ r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
+ if (r != MIGRATEPAGE_SUCCESS)
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
+
+ if (notified)
+ mmu_notifier_invalidate_range_end(mm, mmu_start,
+ migrate->end);
+}
+
+/*
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages.
+ */
+static void migrate_vma_finalize(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ unsigned long i;
+
+ for (i = 0; i < npages; i++) {
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ continue;
+ }
+
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ newpage = page;
+ }
+
+ remove_migration_ptes(page, newpage, false);
+ unlock_page(page);
+ migrate->cpages--;
+
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
+
+ if (newpage != page) {
+ unlock_page(newpage);
+ if (is_zone_device_page(newpage))
+ put_page(newpage);
+ else
+ putback_lru_page(newpage);
+ }
+ }
+}
+
+/*
+ * migrate_vma() - migrate a range of memory inside vma
+ *
+ * @ops: migration callback for allocating destination memory and copying
+ * @vma: virtual memory area containing the range to be migrated
+ * @start: start address of the range to migrate (inclusive)
+ * @end: end address of the range to migrate (exclusive)
+ * @src: array of unsigned long holding the source migrate pfns
+ * @dst: array of unsigned long holding the destination migrate pfns
+ * @private: pointer passed back to each of the callbacks
+ * Returns: 0 on success, error code otherwise
+ *
+ * This function tries to migrate a virtual address range, using
+ * callbacks to allocate and copy memory from source to destination. First it
+ * collects all the pages backing each virtual address in the range, saving them
+ * inside the src array. Then it locks those pages and unmaps them. Once the pages
+ * are locked and unmapped, it checks whether each page is pinned or not. Pages
+ * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
+ * in the corresponding src array entry. It then restores any pages that are
+ * pinned, by remapping and unlocking those pages.
+ *
+ * At this point it calls the alloc_and_copy() callback. For documentation on
+ * what is expected from that callback, see struct migrate_vma_ops comments in
+ * include/linux/migrate.h
+ *
+ * After the alloc_and_copy() callback, this function goes over each entry in
+ * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flags
+ * set. If the corresponding entry in the dst array has the MIGRATE_PFN_VALID flag set,
+ * then the function tries to migrate struct page information from the source
+ * struct page to the destination struct page. If it fails to migrate the struct
+ * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
+ * array.
+ *
+ * At this point, all successfully migrated pages have an entry in the src
+ * array with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flags set, and a
+ * dst array entry with the MIGRATE_PFN_VALID flag set.
+ *
+ * It then calls the finalize_and_map() callback. See comments for "struct
+ * migrate_vma_ops", in include/linux/migrate.h for details about
+ * finalize_and_map() behavior.
+ *
+ * After the finalize_and_map() callback, for successfully migrated pages, this
+ * function updates the CPU page table to point to new pages, otherwise it
+ * restores the CPU page table to point to the original source pages.
+ *
+ * The function returns 0 after the above steps, even if no pages were migrated
+ * (it only returns an error if any of the arguments are invalid).
+ *
+ * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
+ * unsigned long entries.
+ */
+int migrate_vma(const struct migrate_vma_ops *ops,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long *src,
+ unsigned long *dst,
+ void *private)
+{
+ struct migrate_vma migrate;
+
+ /* Sanity check the arguments */
+ start &= PAGE_MASK;
+ end &= PAGE_MASK;
+ if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
+ return -EINVAL;
+ if (start < vma->vm_start || start >= vma->vm_end)
+ return -EINVAL;
+ if (end <= vma->vm_start || end > vma->vm_end)
+ return -EINVAL;
+ if (!ops || !src || !dst || start >= end)
+ return -EINVAL;
+
+ memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
+ migrate.src = src;
+ migrate.dst = dst;
+ migrate.start = start;
+ migrate.npages = 0;
+ migrate.cpages = 0;
+ migrate.end = end;
+ migrate.vma = vma;
+
+ /* Collect, and try to unmap source pages */
+ migrate_vma_collect(&migrate);
+ if (!migrate.cpages)
+ return 0;
+
+ /* Lock and isolate page */
+ migrate_vma_prepare(&migrate);
+ if (!migrate.cpages)
+ return 0;
+
+ /* Unmap pages */
+ migrate_vma_unmap(&migrate);
+ if (!migrate.cpages)
+ return 0;
+
+ /*
+ * At this point pages are locked and unmapped, and thus they have
+ * stable content and can safely be copied to destination memory that
+ * is allocated by the callback.
+ *
+ * Note that migration can still fail in migrate_vma_pages() for each
+ * individual page.
+ */
+ ops->alloc_and_copy(vma, src, dst, start, end, private);
+
+ /* This does the real migration of struct page */
+ migrate_vma_pages(&migrate);
+
+ ops->finalize_and_map(vma, src, dst, start, end, private);
+
+ /* Unlock and remap pages */
+ migrate_vma_finalize(&migrate);
+
+ return 0;
+}
+EXPORT_SYMBOL(migrate_vma);
+#endif /* defined(CONFIG_MIGRATE_VMA_HELPER) */
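To make the calling convention described in the migrate_vma() comment concrete, here is a minimal, hypothetical driver-side sketch (not part of this patch). All example_* names are invented, the destination pages are ordinary system pages rather than real device memory, and error handling is elided; the point is only the contract: src/dst arrays of (end - start) >> PAGE_SHIFT entries, destination pages allocated and locked in alloc_and_copy() and flagged with MIGRATE_PFN_LOCKED, and the caller holding down_read(mmap_sem).

static void example_alloc_and_copy(struct vm_area_struct *vma,
				   const unsigned long *src,
				   unsigned long *dst,
				   unsigned long start,
				   unsigned long end,
				   void *private)
{
	unsigned long addr, i;

	for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
		struct page *spage = migrate_pfn_to_page(src[i]);
		struct page *dpage;

		dst[i] = 0;
		if (!(src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		/* A real driver would allocate device memory here. */
		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
		if (!dpage)
			continue;

		/* spage is NULL for holes (pte_none or zero page). */
		if (spage)
			copy_highpage(dpage, spage);
		else
			clear_highpage(dpage);

		lock_page(dpage);
		dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
	}
}

static void example_finalize_and_map(struct vm_area_struct *vma,
				     const unsigned long *src,
				     const unsigned long *dst,
				     unsigned long start,
				     unsigned long end,
				     void *private)
{
	/* Entries that lost MIGRATE_PFN_MIGRATE in src[] were not migrated. */
}

static const struct migrate_vma_ops example_migrate_ops = {
	.alloc_and_copy		= example_alloc_and_copy,
	.finalize_and_map	= example_finalize_and_map,
};

static int example_migrate_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long *src, *dst;
	int ret = -ENOMEM;

	src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
	dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
	if (!src || !dst)
		goto out;

	/* Caller must hold down_read(&vma->vm_mm->mmap_sem). */
	ret = migrate_vma(&example_migrate_ops, vma, start, end, src, dst, NULL);
out:
	kfree(dst);
	kfree(src);
	return ret;
}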
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6963f90ccd2a..31a1080ab62e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -127,6 +127,20 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pages++;
}
+
+ if (is_write_device_private_entry(entry)) {
+ pte_t newpte;
+
+ /*
+ * We do not preserve soft-dirtiness. See
+ * copy_one_pte() for explanation.
+ */
+ make_device_private_entry_read(&entry);
+ newpte = swp_entry_to_pte(entry);
+ set_pte_at(mm, addr, pte, newpte);
+
+ pages++;
+ }
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 193e5c447847..07afcdc97ada 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -37,6 +37,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
if (!is_swap_pte(*pvmw->pte))
return false;
entry = pte_to_swp_entry(*pvmw->pte);
+
if (!is_migration_entry(entry))
return false;
if (migration_entry_to_page(entry) - pvmw->page >=
@@ -49,6 +50,15 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
WARN_ON_ONCE(1);
#endif
} else {
+ if (is_swap_pte(*pvmw->pte)) {
+ swp_entry_t entry;
+
+ entry = pte_to_swp_entry(*pvmw->pte);
+ if (is_device_private_entry(entry) &&
+ device_private_entry_to_page(entry) == pvmw->page)
+ return true;
+ }
+
if (!pte_present(*pvmw->pte))
return false;
diff --git a/mm/rmap.c b/mm/rmap.c
index c82751a324d2..1230490da314 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -63,6 +63,7 @@
#include <linux/hugetlb.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
+#include <linux/memremap.h>
#include <asm/tlbflush.h>
@@ -1344,6 +1345,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
return true;
+ if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
+ is_zone_device_page(page) && !is_device_private_page(page))
+ return true;
+
if (flags & TTU_SPLIT_HUGE_PMD) {
split_huge_pmd_address(vma, address,
flags & TTU_MIGRATION, page);
@@ -1380,6 +1385,27 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
address = pvmw.address;
+ if (IS_ENABLED(CONFIG_MIGRATION) &&
+ (flags & TTU_MIGRATION) &&
+ is_zone_device_page(page)) {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = make_migration_entry(page, 0);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ goto discard;
+ }
+
if (!(flags & TTU_IGNORE_ACCESS)) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
diff --git a/mm/swap.c b/mm/swap.c
index 0411c6e50e93..44db9572ca4e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -759,6 +759,17 @@ void release_pages(struct page **pages, int nr)
if (is_huge_zero_page(page))
continue;
+ /* Device public pages cannot be huge pages */
+ if (is_device_public_page(page)) {
+ if (locked_pgdat) {
+ spin_unlock_irqrestore(&locked_pgdat->lru_lock,
+ flags);
+ locked_pgdat = NULL;
+ }
+ put_zone_device_private_or_public_page(page);
+ continue;
+ }
+
page = compound_head(page);
if (!put_page_testzero(page))
continue;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d41edd28298b..aeea3a5e2381 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1983,6 +1983,14 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
unsigned int obj_idx;
int ret = -EAGAIN;
+ /*
+ * We cannot support the _NO_COPY case here, because copy needs to
+ * happen under the zs lock, which does not work with
+ * MIGRATE_SYNC_NO_COPY workflow.
+ */
+ if (mode == MIGRATE_SYNC_NO_COPY)
+ return -EINVAL;
+
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);