Merge branch 'next' of https://git.kernel.org/pub/scm/linux/kernel/git/scottwood/linux into next

Freescale updates from Scott:

"Highlights include elimination of legacy clock bindings use from dts
 files, an 83xx watchdog handler, fixes to old dts interrupt errors, and
 some minor cleanup."
diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt
index bdd344a..18c5fee 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -113,7 +113,15 @@
 size (see Fig. 1). This area is *not* released: this region will
 be kept permanently reserved, so that it can act as a receptacle
 for a copy of the boot memory content in addition to CPU state
-and HPTE region, in the case a crash does occur.
+and HPTE region, in the case a crash does occur. Since this reserved
+memory area is used only after the system crash, there is no point in
+blocking this significant chunk of memory from production kernel.
+Hence, the implementation uses the Linux kernel's Contiguous Memory
+Allocator (CMA) for memory reservation if CMA is configured for kernel.
+With CMA reservation this memory will be available for applications to
+use it, while kernel is prevented from using it. With this fadump will
+still be able to capture all of the kernel memory and most of the user
+space memory except the user pages that were present in CMA region.
 
   o Memory Reservation during first kernel
 
@@ -162,6 +170,9 @@
 
 1. Set config option CONFIG_FA_DUMP=y and build kernel.
 2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
+   By default, fadump reserved memory will be initialized as CMA area.
+   Alternatively, user can boot linux kernel with 'fadump=nocma' to
+   prevent fadump to use CMA.
 3. Optionally, user can also set 'crashkernel=' kernel cmdline
    to specify size of the memory to reserve for boot memory dump
    preservation.
@@ -172,6 +183,10 @@
       2. If firmware-assisted dump fails to reserve memory then it
          will fallback to existing kdump mechanism if 'crashkernel='
          option is set at kernel cmdline.
+      3. if user wants to capture all of user space memory and ok with
+         reserved memory not available to production system, then
+         'fadump=nocma' kernel parameter can be used to fallback to
+         old behaviour.
 
 Sysfs/debugfs files:
 ------------
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6d6e1ff..a1e858e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1143,7 +1143,7 @@
 
 config PIN_TLB_IMMR
 	bool "Pinned TLB for IMMR"
-	depends on PIN_TLB
+	depends on PIN_TLB || PPC_EARLY_DEBUG_CPM
 	default y
 
 config PIN_TLB_TEXT
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index fa28e84..4c4e0eb 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -417,6 +417,9 @@
 
 archprepare: checkbin
 
+archheaders:
+	$(Q)$(MAKE) $(build)=arch/powerpc/kernel/syscalls all
+
 ifdef CONFIG_STACKPROTECTOR
 prepare: stack_protector_prepare
 
diff --git a/arch/powerpc/boot/serial.c b/arch/powerpc/boot/serial.c
index f045f84..b0491b8 100644
--- a/arch/powerpc/boot/serial.c
+++ b/arch/powerpc/boot/serial.c
@@ -93,7 +93,8 @@ static void *serial_get_stdout_devp(void)
 	if (devp == NULL)
 		goto err_out;
 
-	if (getprop(devp, "linux,stdout-path", path, MAX_PATH_LEN) > 0) {
+	if (getprop(devp, "linux,stdout-path", path, MAX_PATH_LEN) > 0 ||
+		getprop(devp, "stdout-path", path, MAX_PATH_LEN) > 0) {
 		devp = finddevice(path);
 		if (devp == NULL)
 			goto err_out;
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 3196d22..77ff7fb 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -1,3 +1,7 @@
+generated-y += syscall_table_32.h
+generated-y += syscall_table_64.h
+generated-y += syscall_table_c32.h
+generated-y += syscall_table_spu.h
 generic-y += div64.h
 generic-y += export.h
 generic-y += irq_regs.h
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 15bc16b..cf5ba52 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -1,11 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _ASM_POWERPC_BOOK3S_64_HASH_4K_H
 #define _ASM_POWERPC_BOOK3S_64_HASH_4K_H
-/*
- * Entries per page directory level.  The PTE level must use a 64b record
- * for each page table entry.  The PMD and PGD level use a 32b record for
- * each entry by assuming that each entry is page aligned.
- */
+
 #define H_PTE_INDEX_SIZE  9
 #define H_PMD_INDEX_SIZE  7
 #define H_PUD_INDEX_SIZE  9
diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
index 1e7a335..188776b 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -48,6 +48,10 @@
 
 #define memblock_num_regions(memblock_type)	(memblock.memblock_type.cnt)
 
+/* Alignement per CMA requirement. */
+#define FADUMP_CMA_ALIGNMENT	(PAGE_SIZE <<				\
+			max_t(unsigned long, MAX_ORDER - 1, pageblock_order))
+
 /* Firmware provided dump sections */
 #define FADUMP_CPU_STATE_DATA	0x0001
 #define FADUMP_HPTE_REGION	0x0002
@@ -141,6 +145,7 @@ struct fw_dump {
 	unsigned long	fadump_supported:1;
 	unsigned long	dump_active:1;
 	unsigned long	dump_registered:1;
+	unsigned long	nocma:1;
 };
 
 /*
@@ -200,7 +205,7 @@ struct fad_crash_memory_ranges {
 	unsigned long long	size;
 };
 
-extern int is_fadump_boot_memory_area(u64 addr, ulong size);
+extern int is_fadump_memory_area(u64 addr, ulong size);
 extern int early_init_dt_scan_fw_dump(unsigned long node,
 		const char *uname, int depth, void *data);
 extern int fadump_reserve_mem(void);
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 35db0cb..e847ff6 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -215,11 +215,12 @@ struct iommu_table_group {
 
 extern void iommu_register_group(struct iommu_table_group *table_group,
 				 int pci_domain_number, unsigned long pe_num);
-extern int iommu_add_device(struct device *dev);
+extern int iommu_add_device(struct iommu_table_group *table_group,
+		struct device *dev);
 extern void iommu_del_device(struct device *dev);
-extern int __init tce_iommu_bus_notifier_init(void);
-extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
-		unsigned long *hpa, enum dma_data_direction *direction);
+extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
+		unsigned long entry, unsigned long *hpa,
+		enum dma_data_direction *direction);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
 					int pci_domain_number,
@@ -227,7 +228,8 @@ static inline void iommu_register_group(struct iommu_table_group *table_group,
 {
 }
 
-static inline int iommu_add_device(struct device *dev)
+static inline int iommu_add_device(struct iommu_table_group *table_group,
+		struct device *dev)
 {
 	return 0;
 }
@@ -235,11 +237,6 @@ static inline int iommu_add_device(struct device *dev)
 static inline void iommu_del_device(struct device *dev)
 {
 }
-
-static inline int __init tce_iommu_bus_notifier_init(void)
-{
-        return 0;
-}
 #endif /* !CONFIG_IOMMU_API */
 
 int dma_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr);
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index c05efd2..6ee8195 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -21,9 +21,12 @@ struct mm_iommu_table_group_mem_t;
 
 extern int isolate_lru_page(struct page *page);	/* from internal.h */
 extern bool mm_iommu_preregistered(struct mm_struct *mm);
-extern long mm_iommu_get(struct mm_struct *mm,
+extern long mm_iommu_new(struct mm_struct *mm,
 		unsigned long ua, unsigned long entries,
 		struct mm_iommu_table_group_mem_t **pmem);
+extern long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
+		unsigned long entries, unsigned long dev_hpa,
+		struct mm_iommu_table_group_mem_t **pmem);
 extern long mm_iommu_put(struct mm_struct *mm,
 		struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_init(struct mm_struct *mm);
@@ -32,15 +35,23 @@ extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
 		unsigned long ua, unsigned long size);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
 		struct mm_struct *mm, unsigned long ua, unsigned long size);
-extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
+extern struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
 		unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
 		unsigned long ua, unsigned int pageshift, unsigned long *hpa);
 extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
 		unsigned long ua, unsigned int pageshift, unsigned long *hpa);
 extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua);
+extern bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
+		unsigned int pageshift, unsigned long *size);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
+#else
+static inline bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
+		unsigned int pageshift, unsigned long *size)
+{
+	return false;
+}
 #endif
 extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
 extern void set_context(unsigned long id, pgd_t *pgd);
@@ -217,12 +228,6 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 #endif
 }
 
-static inline int arch_dup_mmap(struct mm_struct *oldmm,
-				struct mm_struct *mm)
-{
-	return 0;
-}
-
 #ifdef CONFIG_PPC_BOOK3E_64
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
@@ -247,6 +252,7 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 #ifdef CONFIG_PPC_MEM_KEYS
 bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
 			       bool execute, bool foreign);
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm);
 #else /* CONFIG_PPC_MEM_KEYS */
 static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 		bool write, bool execute, bool foreign)
@@ -259,6 +265,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 #define thread_pkey_regs_save(thread)
 #define thread_pkey_regs_restore(new_thread, old_thread)
 #define thread_pkey_regs_init(thread)
+#define arch_dup_pkeys(oldmm, mm)
 
 static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
 {
@@ -267,5 +274,12 @@ static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
 
 #endif /* CONFIG_PPC_MEM_KEYS */
 
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+				struct mm_struct *mm)
+{
+	arch_dup_pkeys(oldmm, mm);
+	return 0;
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index ff38664..a55b01c 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -347,6 +347,7 @@ extern int opal_async_comp_init(void);
 extern int opal_sensor_init(void);
 extern int opal_hmi_handler_init(void);
 extern int opal_event_init(void);
+int opal_power_control_init(void);
 
 extern int opal_machine_check(struct pt_regs *regs);
 extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 94d4490..aee4fcc 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -129,6 +129,7 @@ struct pci_controller {
 #endif	/* CONFIG_PPC64 */
 
 	void *private_data;
+	struct npu *npu;
 };
 
 /* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 2af9ded..0c72f18 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -129,5 +129,9 @@ extern void pcibios_scan_phb(struct pci_controller *hose);
 
 extern struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev);
 extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index);
+extern int pnv_npu2_init(struct pci_controller *hose);
+extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
+		unsigned long msr);
+extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev);
 
 #endif /* __ASM_POWERPC_PCI_H */
diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h
index ab9f3f0..1a0e7a8 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -18,9 +18,8 @@
 #include <linux/thread_info.h>
 
 /* ftrace syscalls requires exporting the sys_call_table */
-#ifdef CONFIG_FTRACE_SYSCALLS
 extern const unsigned long sys_call_table[];
-#endif /* CONFIG_FTRACE_SYSCALLS */
+extern const unsigned long compat_sys_call_table[];
 
 static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
 {
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
deleted file mode 100644
index 01b5171..0000000
--- a/arch/powerpc/include/asm/systbl.h
+++ /dev/null
@@ -1,396 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * List of powerpc syscalls. For the meaning of the _SPU suffix see
- * arch/powerpc/platforms/cell/spu_callbacks.c
- */
-
-SYSCALL(restart_syscall)
-SYSCALL(exit)
-PPC_SYS(fork)
-SYSCALL_SPU(read)
-SYSCALL_SPU(write)
-COMPAT_SYS_SPU(open)
-SYSCALL_SPU(close)
-SYSCALL_SPU(waitpid)
-SYSCALL_SPU(creat)
-SYSCALL_SPU(link)
-SYSCALL_SPU(unlink)
-COMPAT_SYS(execve)
-SYSCALL_SPU(chdir)
-COMPAT_SYS_SPU(time)
-SYSCALL_SPU(mknod)
-SYSCALL_SPU(chmod)
-SYSCALL_SPU(lchown)
-SYSCALL(ni_syscall)
-OLDSYS(stat)
-COMPAT_SYS_SPU(lseek)
-SYSCALL_SPU(getpid)
-COMPAT_SYS(mount)
-SYSX(sys_ni_syscall,sys_oldumount,sys_oldumount)
-SYSCALL_SPU(setuid)
-SYSCALL_SPU(getuid)
-COMPAT_SYS_SPU(stime)
-COMPAT_SYS(ptrace)
-SYSCALL_SPU(alarm)
-OLDSYS(fstat)
-SYSCALL(pause)
-COMPAT_SYS(utime)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(access)
-SYSCALL_SPU(nice)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(sync)
-SYSCALL_SPU(kill)
-SYSCALL_SPU(rename)
-SYSCALL_SPU(mkdir)
-SYSCALL_SPU(rmdir)
-SYSCALL_SPU(dup)
-SYSCALL_SPU(pipe)
-COMPAT_SYS_SPU(times)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(brk)
-SYSCALL_SPU(setgid)
-SYSCALL_SPU(getgid)
-SYSCALL(signal)
-SYSCALL_SPU(geteuid)
-SYSCALL_SPU(getegid)
-SYSCALL(acct)
-SYSCALL(umount)
-SYSCALL(ni_syscall)
-COMPAT_SYS_SPU(ioctl)
-COMPAT_SYS_SPU(fcntl)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(setpgid)
-SYSCALL(ni_syscall)
-SYSX(sys_ni_syscall,sys_olduname,sys_olduname)
-SYSCALL_SPU(umask)
-SYSCALL_SPU(chroot)
-COMPAT_SYS(ustat)
-SYSCALL_SPU(dup2)
-SYSCALL_SPU(getppid)
-SYSCALL_SPU(getpgrp)
-SYSCALL_SPU(setsid)
-SYS32ONLY(sigaction)
-SYSCALL_SPU(sgetmask)
-SYSCALL_SPU(ssetmask)
-SYSCALL_SPU(setreuid)
-SYSCALL_SPU(setregid)
-#define compat_sys_sigsuspend sys_sigsuspend
-SYS32ONLY(sigsuspend)
-SYSX(sys_ni_syscall,compat_sys_sigpending,sys_sigpending)
-SYSCALL_SPU(sethostname)
-COMPAT_SYS_SPU(setrlimit)
-SYSX(sys_ni_syscall,compat_sys_old_getrlimit,sys_old_getrlimit)
-COMPAT_SYS_SPU(getrusage)
-COMPAT_SYS_SPU(gettimeofday)
-COMPAT_SYS_SPU(settimeofday)
-SYSCALL_SPU(getgroups)
-SYSCALL_SPU(setgroups)
-SYSX(sys_ni_syscall,sys_ni_syscall,ppc_select)
-SYSCALL_SPU(symlink)
-OLDSYS(lstat)
-SYSCALL_SPU(readlink)
-SYSCALL(uselib)
-SYSCALL(swapon)
-SYSCALL(reboot)
-SYSX(sys_ni_syscall,compat_sys_old_readdir,sys_old_readdir)
-SYSCALL_SPU(mmap)
-SYSCALL_SPU(munmap)
-COMPAT_SYS_SPU(truncate)
-COMPAT_SYS_SPU(ftruncate)
-SYSCALL_SPU(fchmod)
-SYSCALL_SPU(fchown)
-SYSCALL_SPU(getpriority)
-SYSCALL_SPU(setpriority)
-SYSCALL(ni_syscall)
-COMPAT_SYS(statfs)
-COMPAT_SYS(fstatfs)
-SYSCALL(ni_syscall)
-COMPAT_SYS_SPU(socketcall)
-SYSCALL_SPU(syslog)
-COMPAT_SYS_SPU(setitimer)
-COMPAT_SYS_SPU(getitimer)
-COMPAT_SYS_SPU(newstat)
-COMPAT_SYS_SPU(newlstat)
-COMPAT_SYS_SPU(newfstat)
-SYSX(sys_ni_syscall,sys_uname,sys_uname)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(vhangup)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-COMPAT_SYS_SPU(wait4)
-SYSCALL(swapoff)
-COMPAT_SYS_SPU(sysinfo)
-COMPAT_SYS(ipc)
-SYSCALL_SPU(fsync)
-SYS32ONLY(sigreturn)
-PPC_SYS(clone)
-SYSCALL_SPU(setdomainname)
-SYSCALL_SPU(newuname)
-SYSCALL(ni_syscall)
-COMPAT_SYS_SPU(adjtimex)
-SYSCALL_SPU(mprotect)
-SYSX(sys_ni_syscall,compat_sys_sigprocmask,sys_sigprocmask)
-SYSCALL(ni_syscall)
-SYSCALL(init_module)
-SYSCALL(delete_module)
-SYSCALL(ni_syscall)
-SYSCALL(quotactl)
-SYSCALL_SPU(getpgid)
-SYSCALL_SPU(fchdir)
-SYSCALL_SPU(bdflush)
-SYSCALL_SPU(sysfs)
-SYSX_SPU(ppc64_personality,ppc64_personality,sys_personality)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(setfsuid)
-SYSCALL_SPU(setfsgid)
-SYSCALL_SPU(llseek)
-COMPAT_SYS_SPU(getdents)
-COMPAT_SPU_NEW(select)
-SYSCALL_SPU(flock)
-SYSCALL_SPU(msync)
-COMPAT_SYS_SPU(readv)
-COMPAT_SYS_SPU(writev)
-SYSCALL_SPU(getsid)
-SYSCALL_SPU(fdatasync)
-COMPAT_SYS(sysctl)
-SYSCALL_SPU(mlock)
-SYSCALL_SPU(munlock)
-SYSCALL_SPU(mlockall)
-SYSCALL_SPU(munlockall)
-SYSCALL_SPU(sched_setparam)
-SYSCALL_SPU(sched_getparam)
-SYSCALL_SPU(sched_setscheduler)
-SYSCALL_SPU(sched_getscheduler)
-SYSCALL_SPU(sched_yield)
-SYSCALL_SPU(sched_get_priority_max)
-SYSCALL_SPU(sched_get_priority_min)
-COMPAT_SYS_SPU(sched_rr_get_interval)
-COMPAT_SYS_SPU(nanosleep)
-SYSCALL_SPU(mremap)
-SYSCALL_SPU(setresuid)
-SYSCALL_SPU(getresuid)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(poll)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(setresgid)
-SYSCALL_SPU(getresgid)
-SYSCALL_SPU(prctl)
-COMPAT_SYS(rt_sigreturn)
-COMPAT_SYS(rt_sigaction)
-COMPAT_SYS(rt_sigprocmask)
-COMPAT_SYS(rt_sigpending)
-COMPAT_SYS(rt_sigtimedwait)
-COMPAT_SYS(rt_sigqueueinfo)
-COMPAT_SYS(rt_sigsuspend)
-COMPAT_SYS_SPU(pread64)
-COMPAT_SYS_SPU(pwrite64)
-SYSCALL_SPU(chown)
-SYSCALL_SPU(getcwd)
-SYSCALL_SPU(capget)
-SYSCALL_SPU(capset)
-COMPAT_SYS(sigaltstack)
-SYSX_SPU(sys_sendfile64,compat_sys_sendfile,sys_sendfile)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-PPC_SYS(vfork)
-COMPAT_SYS_SPU(getrlimit)
-COMPAT_SYS_SPU(readahead)
-SYS32ONLY(mmap2)
-SYS32ONLY(truncate64)
-SYS32ONLY(ftruncate64)
-SYSX(sys_ni_syscall,sys_stat64,sys_stat64)
-SYSX(sys_ni_syscall,sys_lstat64,sys_lstat64)
-SYSX(sys_ni_syscall,sys_fstat64,sys_fstat64)
-SYSCALL(pciconfig_read)
-SYSCALL(pciconfig_write)
-SYSCALL(pciconfig_iobase)
-SYSCALL(ni_syscall)
-SYSCALL_SPU(getdents64)
-SYSCALL_SPU(pivot_root)
-SYSX(sys_ni_syscall,compat_sys_fcntl64,sys_fcntl64)
-SYSCALL_SPU(madvise)
-SYSCALL_SPU(mincore)
-SYSCALL_SPU(gettid)
-SYSCALL_SPU(tkill)
-SYSCALL_SPU(setxattr)
-SYSCALL_SPU(lsetxattr)
-SYSCALL_SPU(fsetxattr)
-SYSCALL_SPU(getxattr)
-SYSCALL_SPU(lgetxattr)
-SYSCALL_SPU(fgetxattr)
-SYSCALL_SPU(listxattr)
-SYSCALL_SPU(llistxattr)
-SYSCALL_SPU(flistxattr)
-SYSCALL_SPU(removexattr)
-SYSCALL_SPU(lremovexattr)
-SYSCALL_SPU(fremovexattr)
-COMPAT_SYS_SPU(futex)
-COMPAT_SYS_SPU(sched_setaffinity)
-COMPAT_SYS_SPU(sched_getaffinity)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYS32ONLY(sendfile64)
-COMPAT_SYS_SPU(io_setup)
-SYSCALL_SPU(io_destroy)
-COMPAT_SYS_SPU(io_getevents)
-COMPAT_SYS_SPU(io_submit)
-SYSCALL_SPU(io_cancel)
-SYSCALL(set_tid_address)
-SYSX_SPU(sys_fadvise64,ppc32_fadvise64,sys_fadvise64)
-SYSCALL(exit_group)
-COMPAT_SYS(lookup_dcookie)
-SYSCALL_SPU(epoll_create)
-SYSCALL_SPU(epoll_ctl)
-SYSCALL_SPU(epoll_wait)
-SYSCALL_SPU(remap_file_pages)
-COMPAT_SYS_SPU(timer_create)
-COMPAT_SYS_SPU(timer_settime)
-COMPAT_SYS_SPU(timer_gettime)
-SYSCALL_SPU(timer_getoverrun)
-SYSCALL_SPU(timer_delete)
-COMPAT_SYS_SPU(clock_settime)
-COMPAT_SYS_SPU(clock_gettime)
-COMPAT_SYS_SPU(clock_getres)
-COMPAT_SYS_SPU(clock_nanosleep)
-SYSX(ppc64_swapcontext,ppc32_swapcontext,ppc_swapcontext)
-SYSCALL_SPU(tgkill)
-COMPAT_SYS_SPU(utimes)
-COMPAT_SYS_SPU(statfs64)
-COMPAT_SYS_SPU(fstatfs64)
-SYSX(sys_ni_syscall,ppc_fadvise64_64,ppc_fadvise64_64)
-SYSCALL_SPU(rtas)
-OLDSYS(debug_setcontext)
-SYSCALL(ni_syscall)
-COMPAT_SYS(migrate_pages)
-COMPAT_SYS(mbind)
-COMPAT_SYS(get_mempolicy)
-COMPAT_SYS(set_mempolicy)
-COMPAT_SYS(mq_open)
-SYSCALL(mq_unlink)
-COMPAT_SYS(mq_timedsend)
-COMPAT_SYS(mq_timedreceive)
-COMPAT_SYS(mq_notify)
-COMPAT_SYS(mq_getsetattr)
-COMPAT_SYS(kexec_load)
-SYSCALL(add_key)
-SYSCALL(request_key)
-COMPAT_SYS(keyctl)
-COMPAT_SYS(waitid)
-SYSCALL(ioprio_set)
-SYSCALL(ioprio_get)
-SYSCALL(inotify_init)
-SYSCALL(inotify_add_watch)
-SYSCALL(inotify_rm_watch)
-SYSCALL(spu_run)
-SYSCALL(spu_create)
-COMPAT_SYS(pselect6)
-COMPAT_SYS(ppoll)
-SYSCALL_SPU(unshare)
-SYSCALL_SPU(splice)
-SYSCALL_SPU(tee)
-COMPAT_SYS_SPU(vmsplice)
-COMPAT_SYS_SPU(openat)
-SYSCALL_SPU(mkdirat)
-SYSCALL_SPU(mknodat)
-SYSCALL_SPU(fchownat)
-COMPAT_SYS_SPU(futimesat)
-SYSX_SPU(sys_newfstatat,sys_fstatat64,sys_fstatat64)
-SYSCALL_SPU(unlinkat)
-SYSCALL_SPU(renameat)
-SYSCALL_SPU(linkat)
-SYSCALL_SPU(symlinkat)
-SYSCALL_SPU(readlinkat)
-SYSCALL_SPU(fchmodat)
-SYSCALL_SPU(faccessat)
-COMPAT_SYS_SPU(get_robust_list)
-COMPAT_SYS_SPU(set_robust_list)
-COMPAT_SYS_SPU(move_pages)
-SYSCALL_SPU(getcpu)
-COMPAT_SYS(epoll_pwait)
-COMPAT_SYS_SPU(utimensat)
-COMPAT_SYS_SPU(signalfd)
-SYSCALL_SPU(timerfd_create)
-SYSCALL_SPU(eventfd)
-COMPAT_SYS_SPU(sync_file_range2)
-COMPAT_SYS(fallocate)
-SYSCALL(subpage_prot)
-COMPAT_SYS_SPU(timerfd_settime)
-COMPAT_SYS_SPU(timerfd_gettime)
-COMPAT_SYS_SPU(signalfd4)
-SYSCALL_SPU(eventfd2)
-SYSCALL_SPU(epoll_create1)
-SYSCALL_SPU(dup3)
-SYSCALL_SPU(pipe2)
-SYSCALL(inotify_init1)
-SYSCALL_SPU(perf_event_open)
-COMPAT_SYS_SPU(preadv)
-COMPAT_SYS_SPU(pwritev)
-COMPAT_SYS(rt_tgsigqueueinfo)
-SYSCALL(fanotify_init)
-COMPAT_SYS(fanotify_mark)
-SYSCALL_SPU(prlimit64)
-SYSCALL_SPU(socket)
-SYSCALL_SPU(bind)
-SYSCALL_SPU(connect)
-SYSCALL_SPU(listen)
-SYSCALL_SPU(accept)
-SYSCALL_SPU(getsockname)
-SYSCALL_SPU(getpeername)
-SYSCALL_SPU(socketpair)
-SYSCALL_SPU(send)
-SYSCALL_SPU(sendto)
-COMPAT_SYS_SPU(recv)
-COMPAT_SYS_SPU(recvfrom)
-SYSCALL_SPU(shutdown)
-COMPAT_SYS_SPU(setsockopt)
-COMPAT_SYS_SPU(getsockopt)
-COMPAT_SYS_SPU(sendmsg)
-COMPAT_SYS_SPU(recvmsg)
-COMPAT_SYS_SPU(recvmmsg)
-SYSCALL_SPU(accept4)
-SYSCALL_SPU(name_to_handle_at)
-COMPAT_SYS_SPU(open_by_handle_at)
-COMPAT_SYS_SPU(clock_adjtime)
-SYSCALL_SPU(syncfs)
-COMPAT_SYS_SPU(sendmmsg)
-SYSCALL_SPU(setns)
-COMPAT_SYS(process_vm_readv)
-COMPAT_SYS(process_vm_writev)
-SYSCALL(finit_module)
-SYSCALL(kcmp) /* sys_kcmp */
-SYSCALL_SPU(sched_setattr)
-SYSCALL_SPU(sched_getattr)
-SYSCALL_SPU(renameat2)
-SYSCALL_SPU(seccomp)
-SYSCALL_SPU(getrandom)
-SYSCALL_SPU(memfd_create)
-SYSCALL_SPU(bpf)
-COMPAT_SYS(execveat)
-PPC64ONLY(switch_endian)
-SYSCALL_SPU(userfaultfd)
-SYSCALL_SPU(membarrier)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL(ni_syscall)
-SYSCALL(mlock2)
-SYSCALL(copy_file_range)
-COMPAT_SYS_SPU(preadv2)
-COMPAT_SYS_SPU(pwritev2)
-SYSCALL(kexec_file_load)
-SYSCALL(statx)
-SYSCALL(pkey_alloc)
-SYSCALL(pkey_free)
-SYSCALL(pkey_mprotect)
-SYSCALL(rseq)
-COMPAT_SYS(io_pgetevents)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index b0de85b..a3c35e6 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -11,8 +11,7 @@
 
 #include <uapi/asm/unistd.h>
 
-
-#define NR_syscalls		389
+#define NR_syscalls	__NR_syscalls
 
 #define __NR__exit __NR_exit
 
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index 3712152..8ab8ba1 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -1,6 +1,8 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
+generated-y += unistd_32.h
+generated-y += unistd_64.h
 generic-y += param.h
 generic-y += poll.h
 generic-y += resource.h
diff --git a/arch/powerpc/include/uapi/asm/unistd.h b/arch/powerpc/include/uapi/asm/unistd.h
index 985534d..5f84e3d 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -10,395 +10,10 @@
 #ifndef _UAPI_ASM_POWERPC_UNISTD_H_
 #define _UAPI_ASM_POWERPC_UNISTD_H_
 
-
-#define __NR_restart_syscall	  0
-#define __NR_exit		  1
-#define __NR_fork		  2
-#define __NR_read		  3
-#define __NR_write		  4
-#define __NR_open		  5
-#define __NR_close		  6
-#define __NR_waitpid		  7
-#define __NR_creat		  8
-#define __NR_link		  9
-#define __NR_unlink		 10
-#define __NR_execve		 11
-#define __NR_chdir		 12
-#define __NR_time		 13
-#define __NR_mknod		 14
-#define __NR_chmod		 15
-#define __NR_lchown		 16
-#define __NR_break		 17
-#define __NR_oldstat		 18
-#define __NR_lseek		 19
-#define __NR_getpid		 20
-#define __NR_mount		 21
-#define __NR_umount		 22
-#define __NR_setuid		 23
-#define __NR_getuid		 24
-#define __NR_stime		 25
-#define __NR_ptrace		 26
-#define __NR_alarm		 27
-#define __NR_oldfstat		 28
-#define __NR_pause		 29
-#define __NR_utime		 30
-#define __NR_stty		 31
-#define __NR_gtty		 32
-#define __NR_access		 33
-#define __NR_nice		 34
-#define __NR_ftime		 35
-#define __NR_sync		 36
-#define __NR_kill		 37
-#define __NR_rename		 38
-#define __NR_mkdir		 39
-#define __NR_rmdir		 40
-#define __NR_dup		 41
-#define __NR_pipe		 42
-#define __NR_times		 43
-#define __NR_prof		 44
-#define __NR_brk		 45
-#define __NR_setgid		 46
-#define __NR_getgid		 47
-#define __NR_signal		 48
-#define __NR_geteuid		 49
-#define __NR_getegid		 50
-#define __NR_acct		 51
-#define __NR_umount2		 52
-#define __NR_lock		 53
-#define __NR_ioctl		 54
-#define __NR_fcntl		 55
-#define __NR_mpx		 56
-#define __NR_setpgid		 57
-#define __NR_ulimit		 58
-#define __NR_oldolduname	 59
-#define __NR_umask		 60
-#define __NR_chroot		 61
-#define __NR_ustat		 62
-#define __NR_dup2		 63
-#define __NR_getppid		 64
-#define __NR_getpgrp		 65
-#define __NR_setsid		 66
-#define __NR_sigaction		 67
-#define __NR_sgetmask		 68
-#define __NR_ssetmask		 69
-#define __NR_setreuid		 70
-#define __NR_setregid		 71
-#define __NR_sigsuspend		 72
-#define __NR_sigpending		 73
-#define __NR_sethostname	 74
-#define __NR_setrlimit		 75
-#define __NR_getrlimit		 76
-#define __NR_getrusage		 77
-#define __NR_gettimeofday	 78
-#define __NR_settimeofday	 79
-#define __NR_getgroups		 80
-#define __NR_setgroups		 81
-#define __NR_select		 82
-#define __NR_symlink		 83
-#define __NR_oldlstat		 84
-#define __NR_readlink		 85
-#define __NR_uselib		 86
-#define __NR_swapon		 87
-#define __NR_reboot		 88
-#define __NR_readdir		 89
-#define __NR_mmap		 90
-#define __NR_munmap		 91
-#define __NR_truncate		 92
-#define __NR_ftruncate		 93
-#define __NR_fchmod		 94
-#define __NR_fchown		 95
-#define __NR_getpriority	 96
-#define __NR_setpriority	 97
-#define __NR_profil		 98
-#define __NR_statfs		 99
-#define __NR_fstatfs		100
-#define __NR_ioperm		101
-#define __NR_socketcall		102
-#define __NR_syslog		103
-#define __NR_setitimer		104
-#define __NR_getitimer		105
-#define __NR_stat		106
-#define __NR_lstat		107
-#define __NR_fstat		108
-#define __NR_olduname		109
-#define __NR_iopl		110
-#define __NR_vhangup		111
-#define __NR_idle		112
-#define __NR_vm86		113
-#define __NR_wait4		114
-#define __NR_swapoff		115
-#define __NR_sysinfo		116
-#define __NR_ipc		117
-#define __NR_fsync		118
-#define __NR_sigreturn		119
-#define __NR_clone		120
-#define __NR_setdomainname	121
-#define __NR_uname		122
-#define __NR_modify_ldt		123
-#define __NR_adjtimex		124
-#define __NR_mprotect		125
-#define __NR_sigprocmask	126
-#define __NR_create_module	127
-#define __NR_init_module	128
-#define __NR_delete_module	129
-#define __NR_get_kernel_syms	130
-#define __NR_quotactl		131
-#define __NR_getpgid		132
-#define __NR_fchdir		133
-#define __NR_bdflush		134
-#define __NR_sysfs		135
-#define __NR_personality	136
-#define __NR_afs_syscall	137 /* Syscall for Andrew File System */
-#define __NR_setfsuid		138
-#define __NR_setfsgid		139
-#define __NR__llseek		140
-#define __NR_getdents		141
-#define __NR__newselect		142
-#define __NR_flock		143
-#define __NR_msync		144
-#define __NR_readv		145
-#define __NR_writev		146
-#define __NR_getsid		147
-#define __NR_fdatasync		148
-#define __NR__sysctl		149
-#define __NR_mlock		150
-#define __NR_munlock		151
-#define __NR_mlockall		152
-#define __NR_munlockall		153
-#define __NR_sched_setparam		154
-#define __NR_sched_getparam		155
-#define __NR_sched_setscheduler		156
-#define __NR_sched_getscheduler		157
-#define __NR_sched_yield		158
-#define __NR_sched_get_priority_max	159
-#define __NR_sched_get_priority_min	160
-#define __NR_sched_rr_get_interval	161
-#define __NR_nanosleep		162
-#define __NR_mremap		163
-#define __NR_setresuid		164
-#define __NR_getresuid		165
-#define __NR_query_module	166
-#define __NR_poll		167
-#define __NR_nfsservctl		168
-#define __NR_setresgid		169
-#define __NR_getresgid		170
-#define __NR_prctl		171
-#define __NR_rt_sigreturn	172
-#define __NR_rt_sigaction	173
-#define __NR_rt_sigprocmask	174
-#define __NR_rt_sigpending	175
-#define __NR_rt_sigtimedwait	176
-#define __NR_rt_sigqueueinfo	177
-#define __NR_rt_sigsuspend	178
-#define __NR_pread64		179
-#define __NR_pwrite64		180
-#define __NR_chown		181
-#define __NR_getcwd		182
-#define __NR_capget		183
-#define __NR_capset		184
-#define __NR_sigaltstack	185
-#define __NR_sendfile		186
-#define __NR_getpmsg		187	/* some people actually want streams */
-#define __NR_putpmsg		188	/* some people actually want streams */
-#define __NR_vfork		189
-#define __NR_ugetrlimit		190	/* SuS compliant getrlimit */
-#define __NR_readahead		191
-#ifndef __powerpc64__			/* these are 32-bit only */
-#define __NR_mmap2		192
-#define __NR_truncate64		193
-#define __NR_ftruncate64	194
-#define __NR_stat64		195
-#define __NR_lstat64		196
-#define __NR_fstat64		197
-#endif
-#define __NR_pciconfig_read	198
-#define __NR_pciconfig_write	199
-#define __NR_pciconfig_iobase	200
-#define __NR_multiplexer	201
-#define __NR_getdents64		202
-#define __NR_pivot_root		203
 #ifndef __powerpc64__
-#define __NR_fcntl64		204
-#endif
-#define __NR_madvise		205
-#define __NR_mincore		206
-#define __NR_gettid		207
-#define __NR_tkill		208
-#define __NR_setxattr		209
-#define __NR_lsetxattr		210
-#define __NR_fsetxattr		211
-#define __NR_getxattr		212
-#define __NR_lgetxattr		213
-#define __NR_fgetxattr		214
-#define __NR_listxattr		215
-#define __NR_llistxattr		216
-#define __NR_flistxattr		217
-#define __NR_removexattr	218
-#define __NR_lremovexattr	219
-#define __NR_fremovexattr	220
-#define __NR_futex		221
-#define __NR_sched_setaffinity	222
-#define __NR_sched_getaffinity	223
-/* 224 currently unused */
-#define __NR_tuxcall		225
-#ifndef __powerpc64__
-#define __NR_sendfile64		226
-#endif
-#define __NR_io_setup		227
-#define __NR_io_destroy		228
-#define __NR_io_getevents	229
-#define __NR_io_submit		230
-#define __NR_io_cancel		231
-#define __NR_set_tid_address	232
-#define __NR_fadvise64		233
-#define __NR_exit_group		234
-#define __NR_lookup_dcookie	235
-#define __NR_epoll_create	236
-#define __NR_epoll_ctl		237
-#define __NR_epoll_wait		238
-#define __NR_remap_file_pages	239
-#define __NR_timer_create	240
-#define __NR_timer_settime	241
-#define __NR_timer_gettime	242
-#define __NR_timer_getoverrun	243
-#define __NR_timer_delete	244
-#define __NR_clock_settime	245
-#define __NR_clock_gettime	246
-#define __NR_clock_getres	247
-#define __NR_clock_nanosleep	248
-#define __NR_swapcontext	249
-#define __NR_tgkill		250
-#define __NR_utimes		251
-#define __NR_statfs64		252
-#define __NR_fstatfs64		253
-#ifndef __powerpc64__
-#define __NR_fadvise64_64	254
-#endif
-#define __NR_rtas		255
-#define __NR_sys_debug_setcontext 256
-/* Number 257 is reserved for vserver */
-#define __NR_migrate_pages	258
-#define __NR_mbind		259
-#define __NR_get_mempolicy	260
-#define __NR_set_mempolicy	261
-#define __NR_mq_open		262
-#define __NR_mq_unlink		263
-#define __NR_mq_timedsend	264
-#define __NR_mq_timedreceive	265
-#define __NR_mq_notify		266
-#define __NR_mq_getsetattr	267
-#define __NR_kexec_load		268
-#define __NR_add_key		269
-#define __NR_request_key	270
-#define __NR_keyctl		271
-#define __NR_waitid		272
-#define __NR_ioprio_set		273
-#define __NR_ioprio_get		274
-#define __NR_inotify_init	275
-#define __NR_inotify_add_watch	276
-#define __NR_inotify_rm_watch	277
-#define __NR_spu_run		278
-#define __NR_spu_create		279
-#define __NR_pselect6		280
-#define __NR_ppoll		281
-#define __NR_unshare		282
-#define __NR_splice		283
-#define __NR_tee		284
-#define __NR_vmsplice		285
-#define __NR_openat		286
-#define __NR_mkdirat		287
-#define __NR_mknodat		288
-#define __NR_fchownat		289
-#define __NR_futimesat		290
-#ifdef __powerpc64__
-#define __NR_newfstatat		291
+#include <asm/unistd_32.h>
 #else
-#define __NR_fstatat64		291
+#include <asm/unistd_64.h>
 #endif
-#define __NR_unlinkat		292
-#define __NR_renameat		293
-#define __NR_linkat		294
-#define __NR_symlinkat		295
-#define __NR_readlinkat		296
-#define __NR_fchmodat		297
-#define __NR_faccessat		298
-#define __NR_get_robust_list	299
-#define __NR_set_robust_list	300
-#define __NR_move_pages		301
-#define __NR_getcpu		302
-#define __NR_epoll_pwait	303
-#define __NR_utimensat		304
-#define __NR_signalfd		305
-#define __NR_timerfd_create	306
-#define __NR_eventfd		307
-#define __NR_sync_file_range2	308
-#define __NR_fallocate		309
-#define __NR_subpage_prot	310
-#define __NR_timerfd_settime	311
-#define __NR_timerfd_gettime	312
-#define __NR_signalfd4		313
-#define __NR_eventfd2		314
-#define __NR_epoll_create1	315
-#define __NR_dup3		316
-#define __NR_pipe2		317
-#define __NR_inotify_init1	318
-#define __NR_perf_event_open	319
-#define __NR_preadv		320
-#define __NR_pwritev		321
-#define __NR_rt_tgsigqueueinfo	322
-#define __NR_fanotify_init	323
-#define __NR_fanotify_mark	324
-#define __NR_prlimit64		325
-#define __NR_socket		326
-#define __NR_bind		327
-#define __NR_connect		328
-#define __NR_listen		329
-#define __NR_accept		330
-#define __NR_getsockname	331
-#define __NR_getpeername	332
-#define __NR_socketpair		333
-#define __NR_send		334
-#define __NR_sendto		335
-#define __NR_recv		336
-#define __NR_recvfrom		337
-#define __NR_shutdown		338
-#define __NR_setsockopt		339
-#define __NR_getsockopt		340
-#define __NR_sendmsg		341
-#define __NR_recvmsg		342
-#define __NR_recvmmsg		343
-#define __NR_accept4		344
-#define __NR_name_to_handle_at	345
-#define __NR_open_by_handle_at	346
-#define __NR_clock_adjtime	347
-#define __NR_syncfs		348
-#define __NR_sendmmsg		349
-#define __NR_setns		350
-#define __NR_process_vm_readv	351
-#define __NR_process_vm_writev	352
-#define __NR_finit_module	353
-#define __NR_kcmp		354
-#define __NR_sched_setattr	355
-#define __NR_sched_getattr	356
-#define __NR_renameat2		357
-#define __NR_seccomp		358
-#define __NR_getrandom		359
-#define __NR_memfd_create	360
-#define __NR_bpf		361
-#define __NR_execveat		362
-#define __NR_switch_endian	363
-#define __NR_userfaultfd	364
-#define __NR_membarrier		365
-#define __NR_mlock2		378
-#define __NR_copy_file_range	379
-#define __NR_preadv2		380
-#define __NR_pwritev2		381
-#define __NR_kexec_file_load	382
-#define __NR_statx		383
-#define __NR_pkey_alloc		384
-#define __NR_pkey_free		385
-#define __NR_pkey_mprotect	386
-#define __NR_rseq		387
-#define __NR_io_pgetevents	388
 
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index a5a6a24..cb7f0bb 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -160,16 +160,6 @@
 extra-$(CONFIG_PPC64)		+= entry_64.o
 extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE)	+= prom_init.o
 
-extra-y				+= systbl_chk.i
-$(obj)/systbl.o:		systbl_chk
-
-quiet_cmd_systbl_chk = CALL    $<
-      cmd_systbl_chk = $(CONFIG_SHELL) $< $(obj)/systbl_chk.i
-
-PHONY += systbl_chk
-systbl_chk: $(src)/systbl_chk.sh $(obj)/systbl_chk.i
-	$(call cmd,systbl_chk)
-
 ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE
 $(obj)/built-in.a:		prom_init_check
 
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 7c2032e..435927f 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -54,6 +54,9 @@
 SYS_CALL_TABLE:
 	.tc sys_call_table[TC],sys_call_table
 
+COMPAT_SYS_CALL_TABLE:
+	.tc compat_sys_call_table[TC],compat_sys_call_table
+
 /* This value is used to mark exception frames on the stack. */
 exception_marker:
 	.tc	ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER
@@ -178,7 +181,7 @@
 	ld	r11,SYS_CALL_TABLE@toc(2)
 	andis.	r10,r10,_TIF_32BIT@h
 	beq	15f
-	addi	r11,r11,8	/* use 32-bit syscall entries */
+	ld	r11,COMPAT_SYS_CALL_TABLE@toc(2)
 	clrldi	r3,r3,32
 	clrldi	r4,r4,32
 	clrldi	r5,r5,32
@@ -186,7 +189,7 @@
 	clrldi	r7,r7,32
 	clrldi	r8,r8,32
 15:
-	slwi	r0,r0,4
+	slwi	r0,r0,3
 
 	barrier_nospec_asm
 	/*
@@ -291,6 +294,10 @@
 	HMT_MEDIUM_LOW
 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	std	r8, PACATMSCRATCH(r13)
+#endif
+
 	ld	r13,GPR13(r1)	/* only restore r13 if returning to usermode */
 	ld	r2,GPR2(r1)
 	ld	r1,GPR1(r1)
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 89d32bb..ed3d6fa 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1031,7 +1031,7 @@
 	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
 	EXCEPTION_PROLOG_COMMON_3(0xe60)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	BRANCH_LINK_TO_FAR(hmi_exception_realmode) /* Function call ABI */
+	BRANCH_LINK_TO_FAR(DOTSYM(hmi_exception_realmode)) /* Function call ABI */
 	cmpdi	cr0,r3,0
 
 	/* Windup the stack. */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 7a0da83..45a8d0b 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -35,6 +35,7 @@
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/cma.h>
 
 #include <asm/debugfs.h>
 #include <asm/page.h>
@@ -46,6 +47,9 @@
 static struct fw_dump fw_dump;
 static struct fadump_mem_struct fdm;
 static const struct fadump_mem_struct *fdm_active;
+#ifdef CONFIG_CMA
+static struct cma *fadump_cma;
+#endif
 
 static DEFINE_MUTEX(fadump_mutex);
 struct fad_crash_memory_ranges *crash_memory_ranges;
@@ -53,6 +57,67 @@ int crash_memory_ranges_size;
 int crash_mem_ranges;
 int max_crash_mem_ranges;
 
+#ifdef CONFIG_CMA
+/*
+ * fadump_cma_init() - Initialize CMA area from a fadump reserved memory
+ *
+ * This function initializes CMA area from fadump reserved memory.
+ * The total size of fadump reserved memory covers for boot memory size
+ * + cpu data size + hpte size and metadata.
+ * Initialize only the area equivalent to boot memory size for CMA use.
+ * The reamining portion of fadump reserved memory will be not given
+ * to CMA and pages for thoes will stay reserved. boot memory size is
+ * aligned per CMA requirement to satisy cma_init_reserved_mem() call.
+ * But for some reason even if it fails we still have the memory reservation
+ * with us and we can still continue doing fadump.
+ */
+int __init fadump_cma_init(void)
+{
+	unsigned long long base, size;
+	int rc;
+
+	if (!fw_dump.fadump_enabled)
+		return 0;
+
+	/*
+	 * Do not use CMA if user has provided fadump=nocma kernel parameter.
+	 * Return 1 to continue with fadump old behaviour.
+	 */
+	if (fw_dump.nocma)
+		return 1;
+
+	base = fw_dump.reserve_dump_area_start;
+	size = fw_dump.boot_memory_size;
+
+	if (!size)
+		return 0;
+
+	rc = cma_init_reserved_mem(base, size, 0, "fadump_cma", &fadump_cma);
+	if (rc) {
+		pr_err("Failed to init cma area for firmware-assisted dump,%d\n", rc);
+		/*
+		 * Though the CMA init has failed we still have memory
+		 * reservation with us. The reserved memory will be
+		 * blocked from production system usage.  Hence return 1,
+		 * so that we can continue with fadump.
+		 */
+		return 1;
+	}
+
+	/*
+	 * So we now have successfully initialized cma area for fadump.
+	 */
+	pr_info("Initialized 0x%lx bytes cma area at %ldMB from 0x%lx "
+		"bytes of memory reserved for firmware-assisted dump\n",
+		cma_get_size(fadump_cma),
+		(unsigned long)cma_get_base(fadump_cma) >> 20,
+		fw_dump.reserve_dump_area_size);
+	return 1;
+}
+#else
+static int __init fadump_cma_init(void) { return 1; }
+#endif /* CONFIG_CMA */
+
 /* Scan the Firmware Assisted dump configuration details. */
 int __init early_init_dt_scan_fw_dump(unsigned long node,
 			const char *uname, int depth, void *data)
@@ -118,13 +183,19 @@ int __init early_init_dt_scan_fw_dump(unsigned long node,
 
 /*
  * If fadump is registered, check if the memory provided
- * falls within boot memory area.
+ * falls within boot memory area and reserved memory area.
  */
-int is_fadump_boot_memory_area(u64 addr, ulong size)
+int is_fadump_memory_area(u64 addr, ulong size)
 {
+	u64 d_start = fw_dump.reserve_dump_area_start;
+	u64 d_end = d_start + fw_dump.reserve_dump_area_size;
+
 	if (!fw_dump.dump_registered)
 		return 0;
 
+	if (((addr + size) > d_start) && (addr <= d_end))
+		return 1;
+
 	return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size;
 }
 
@@ -172,6 +243,35 @@ static int is_boot_memory_area_contiguous(void)
 	return ret;
 }
 
+/*
+ * Returns true, if there are no holes in reserved memory area,
+ * false otherwise.
+ */
+static bool is_reserved_memory_area_contiguous(void)
+{
+	struct memblock_region *reg;
+	unsigned long start, end;
+	unsigned long d_start = fw_dump.reserve_dump_area_start;
+	unsigned long d_end = d_start + fw_dump.reserve_dump_area_size;
+
+	for_each_memblock(memory, reg) {
+		start = max(d_start, (unsigned long)reg->base);
+		end = min(d_end, (unsigned long)(reg->base + reg->size));
+		if (d_start < end) {
+			/* Memory hole from d_start to start */
+			if (start > d_start)
+				break;
+
+			if (end == d_end)
+				return true;
+
+			d_start = end + 1;
+		}
+	}
+
+	return false;
+}
+
 /* Print firmware assisted dump configurations for debugging purpose. */
 static void fadump_show_config(void)
 {
@@ -378,8 +478,15 @@ int __init fadump_reserve_mem(void)
 	 */
 	if (fdm_active)
 		fw_dump.boot_memory_size = be64_to_cpu(fdm_active->rmr_region.source_len);
-	else
+	else {
 		fw_dump.boot_memory_size = fadump_calculate_reserve_size();
+#ifdef CONFIG_CMA
+		if (!fw_dump.nocma)
+			fw_dump.boot_memory_size =
+				ALIGN(fw_dump.boot_memory_size,
+							FADUMP_CMA_ALIGNMENT);
+#endif
+	}
 
 	/*
 	 * Calculate the memory boundary.
@@ -426,8 +533,9 @@ int __init fadump_reserve_mem(void)
 		fw_dump.fadumphdr_addr =
 				be64_to_cpu(fdm_active->rmr_region.destination_address) +
 				be64_to_cpu(fdm_active->rmr_region.source_len);
-		pr_debug("fadumphdr_addr = %p\n",
-				(void *) fw_dump.fadumphdr_addr);
+		pr_debug("fadumphdr_addr = %pa\n", &fw_dump.fadumphdr_addr);
+		fw_dump.reserve_dump_area_start = base;
+		fw_dump.reserve_dump_area_size = size;
 	} else {
 		size = get_fadump_area_size();
 
@@ -455,10 +563,11 @@ int __init fadump_reserve_mem(void)
 			(unsigned long)(size >> 20),
 			(unsigned long)(base >> 20),
 			(unsigned long)(memblock_phys_mem_size() >> 20));
-	}
 
-	fw_dump.reserve_dump_area_start = base;
-	fw_dump.reserve_dump_area_size = size;
+		fw_dump.reserve_dump_area_start = base;
+		fw_dump.reserve_dump_area_size = size;
+		return fadump_cma_init();
+	}
 	return 1;
 }
 
@@ -477,6 +586,10 @@ static int __init early_fadump_param(char *p)
 		fw_dump.fadump_enabled = 1;
 	else if (strncmp(p, "off", 3) == 0)
 		fw_dump.fadump_enabled = 0;
+	else if (strncmp(p, "nocma", 5) == 0) {
+		fw_dump.fadump_enabled = 1;
+		fw_dump.nocma = 1;
+	}
 
 	return 0;
 }
@@ -525,8 +638,10 @@ static int register_fw_dump(struct fadump_mem_struct *fdm)
 		break;
 	case -3:
 		if (!is_boot_memory_area_contiguous())
-			pr_err("Can't have holes in boot memory area while "
-			       "registering fadump\n");
+			pr_err("Can't have holes in boot memory area while registering fadump\n");
+		else if (!is_reserved_memory_area_contiguous())
+			pr_err("Can't have holes in reserved memory area while"
+			       " registering fadump\n");
 
 		printk(KERN_ERR "Failed to register firmware-assisted kernel"
 			" dump. Parameter Error(%d).\n", rc);
@@ -1229,7 +1344,7 @@ static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
 	return 0;
 }
 
-static int fadump_invalidate_dump(struct fadump_mem_struct *fdm)
+static int fadump_invalidate_dump(const struct fadump_mem_struct *fdm)
 {
 	int rc = 0;
 	unsigned int wait_time;
@@ -1260,9 +1375,8 @@ void fadump_cleanup(void)
 {
 	/* Invalidate the registration only if dump is active. */
 	if (fw_dump.dump_active) {
-		init_fadump_mem_struct(&fdm,
-			be64_to_cpu(fdm_active->cpu_state_data.destination_address));
-		fadump_invalidate_dump(&fdm);
+		/* pass the same memory dump structure provided by platform */
+		fadump_invalidate_dump(fdm_active);
 	} else if (fw_dump.dump_registered) {
 		/* Un-register Firmware-assisted dump if it was registered. */
 		fadump_unregister_dump(&fdm);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index f0dc680..9d5d109 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -47,6 +47,7 @@
 #include <asm/fadump.h>
 #include <asm/vio.h>
 #include <asm/tce.h>
+#include <asm/mmu_context.h>
 
 #define DBG(...)
 
@@ -993,15 +994,19 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
 }
 EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
 
-long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
-		unsigned long *hpa, enum dma_data_direction *direction)
+long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
+		unsigned long entry, unsigned long *hpa,
+		enum dma_data_direction *direction)
 {
 	long ret;
+	unsigned long size = 0;
 
 	ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
 
 	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
-			(*direction == DMA_BIDIRECTIONAL)))
+			(*direction == DMA_BIDIRECTIONAL)) &&
+			!mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift,
+					&size))
 		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
 
 	/* if (unlikely(ret))
@@ -1073,11 +1078,8 @@ void iommu_release_ownership(struct iommu_table *tbl)
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
-int iommu_add_device(struct device *dev)
+int iommu_add_device(struct iommu_table_group *table_group, struct device *dev)
 {
-	struct iommu_table *tbl;
-	struct iommu_table_group_link *tgl;
-
 	/*
 	 * The sysfs entries should be populated before
 	 * binding IOMMU group. If sysfs entries isn't
@@ -1093,32 +1095,10 @@ int iommu_add_device(struct device *dev)
 		return -EBUSY;
 	}
 
-	tbl = get_iommu_table_base(dev);
-	if (!tbl) {
-		pr_debug("%s: Skipping device %s with no tbl\n",
-			 __func__, dev_name(dev));
-		return 0;
-	}
-
-	tgl = list_first_entry_or_null(&tbl->it_group_list,
-			struct iommu_table_group_link, next);
-	if (!tgl) {
-		pr_debug("%s: Skipping device %s with no group\n",
-			 __func__, dev_name(dev));
-		return 0;
-	}
 	pr_debug("%s: Adding %s to iommu group %d\n",
-		 __func__, dev_name(dev),
-		 iommu_group_id(tgl->table_group->group));
+		 __func__, dev_name(dev),  iommu_group_id(table_group->group));
 
-	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
-		pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
-		       __func__, IOMMU_PAGE_SIZE(tbl),
-		       PAGE_SIZE, dev_name(dev));
-		return -EINVAL;
-	}
-
-	return iommu_group_add_device(tgl->table_group->group, dev);
+	return iommu_group_add_device(table_group->group, dev);
 }
 EXPORT_SYMBOL_GPL(iommu_add_device);
 
@@ -1138,31 +1118,4 @@ void iommu_del_device(struct device *dev)
 	iommu_group_remove_device(dev);
 }
 EXPORT_SYMBOL_GPL(iommu_del_device);
-
-static int tce_iommu_bus_notifier(struct notifier_block *nb,
-                unsigned long action, void *data)
-{
-        struct device *dev = data;
-
-        switch (action) {
-        case BUS_NOTIFY_ADD_DEVICE:
-                return iommu_add_device(dev);
-        case BUS_NOTIFY_DEL_DEVICE:
-                if (dev->iommu_group)
-                        iommu_del_device(dev);
-                return 0;
-        default:
-                return 0;
-        }
-}
-
-static struct notifier_block tce_iommu_bus_nb = {
-        .notifier_call = tce_iommu_bus_notifier,
-};
-
-int __init tce_iommu_bus_notifier_init(void)
-{
-        bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
-        return 0;
-}
 #endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c
index fde6323..7cea597 100644
--- a/arch/powerpc/kernel/legacy_serial.c
+++ b/arch/powerpc/kernel/legacy_serial.c
@@ -400,8 +400,7 @@ void __init find_legacy_serial_ports(void)
 	/* Next, fill our array with ISA ports */
 	for_each_node_by_type(np, "serial") {
 		struct device_node *isa = of_get_parent(np);
-		if (isa && (!strcmp(isa->name, "isa") ||
-			    !strcmp(isa->name, "lpc"))) {
+		if (of_node_name_eq(isa, "isa") || of_node_name_eq(isa, "lpc")) {
 			if (of_device_is_available(np)) {
 				index = add_legacy_isa_port(np, isa);
 				if (index >= 0 && np == stdout)
@@ -415,11 +414,11 @@ void __init find_legacy_serial_ports(void)
 	/* Next, try to locate PCI ports */
 	for (np = NULL; (np = of_find_all_nodes(np));) {
 		struct device_node *pci, *parent = of_get_parent(np);
-		if (parent && !strcmp(parent->name, "isa")) {
+		if (of_node_name_eq(parent, "isa")) {
 			of_node_put(parent);
 			continue;
 		}
-		if (strcmp(np->name, "serial") &&
+		if (!of_node_name_eq(np, "serial") &&
 		    !of_node_is_type(np, "serial")) {
 			of_node_put(parent);
 			continue;
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 9d39e0e..2d47cc7 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -848,7 +848,23 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 	/* If TM bits are set to the reserved value, it's an invalid context */
 	if (MSR_TM_RESV(msr_hi))
 		return 1;
-	/* Pull in the MSR TM bits from the user context */
+
+	/*
+	 * Disabling preemption, since it is unsafe to be preempted
+	 * with MSR[TS] set without recheckpointing.
+	 */
+	preempt_disable();
+
+	/*
+	 * CAUTION:
+	 * After regs->MSR[TS] being updated, make sure that get_user(),
+	 * put_user() or similar functions are *not* called. These
+	 * functions can generate page faults which will cause the process
+	 * to be de-scheduled with MSR[TS] set but without calling
+	 * tm_recheckpoint(). This can cause a bug.
+	 *
+	 * Pull in the MSR TM bits from the user context
+	 */
 	regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr_hi & MSR_TS_MASK);
 	/* Now, recheckpoint.  This loads up all of the checkpointed (older)
 	 * registers, including FP and V[S]Rs.  After recheckpointing, the
@@ -873,6 +889,8 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 	}
 #endif
 
+	preempt_enable();
+
 	return 0;
 }
 #endif
@@ -1140,11 +1158,11 @@ SYSCALL_DEFINE0(rt_sigreturn)
 {
 	struct rt_sigframe __user *rt_sf;
 	struct pt_regs *regs = current_pt_regs();
+	int tm_restore = 0;
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	struct ucontext __user *uc_transact;
 	unsigned long msr_hi;
 	unsigned long tmp;
-	int tm_restore = 0;
 #endif
 	/* Always make any pending restarted system calls return -EINTR */
 	current->restart_block.fn = do_no_restart_syscall;
@@ -1192,11 +1210,19 @@ SYSCALL_DEFINE0(rt_sigreturn)
 				goto bad;
 		}
 	}
-	if (!tm_restore)
-		/* Fall through, for non-TM restore */
+	if (!tm_restore) {
+		/*
+		 * Unset regs->msr because ucontext MSR TS is not
+		 * set, and recheckpoint was not called. This avoid
+		 * hitting a TM Bad thing at RFID
+		 */
+		regs->msr &= ~MSR_TS_MASK;
+	}
+	/* Fall through, for non-TM restore */
 #endif
-	if (do_setcontext(&rt_sf->uc, regs, 1))
-		goto bad;
+	if (!tm_restore)
+		if (do_setcontext(&rt_sf->uc, regs, 1))
+			goto bad;
 
 	/*
 	 * It's not clear whether or why it is desirable to save the
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index e53ad11..0935fe6 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -467,20 +467,6 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
 	if (MSR_TM_RESV(msr))
 		return -EINVAL;
 
-	/* pull in MSR TS bits from user context */
-	regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
-
-	/*
-	 * Ensure that TM is enabled in regs->msr before we leave the signal
-	 * handler. It could be the case that (a) user disabled the TM bit
-	 * through the manipulation of the MSR bits in uc_mcontext or (b) the
-	 * TM bit was disabled because a sufficient number of context switches
-	 * happened whilst in the signal handler and load_tm overflowed,
-	 * disabling the TM bit. In either case we can end up with an illegal
-	 * TM state leading to a TM Bad Thing when we return to userspace.
-	 */
-	regs->msr |= MSR_TM;
-
 	/* pull in MSR LE from user context */
 	regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
 
@@ -572,6 +558,34 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
 	tm_enable();
 	/* Make sure the transaction is marked as failed */
 	tsk->thread.tm_texasr |= TEXASR_FS;
+
+	/*
+	 * Disabling preemption, since it is unsafe to be preempted
+	 * with MSR[TS] set without recheckpointing.
+	 */
+	preempt_disable();
+
+	/* pull in MSR TS bits from user context */
+	regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
+
+	/*
+	 * Ensure that TM is enabled in regs->msr before we leave the signal
+	 * handler. It could be the case that (a) user disabled the TM bit
+	 * through the manipulation of the MSR bits in uc_mcontext or (b) the
+	 * TM bit was disabled because a sufficient number of context switches
+	 * happened whilst in the signal handler and load_tm overflowed,
+	 * disabling the TM bit. In either case we can end up with an illegal
+	 * TM state leading to a TM Bad Thing when we return to userspace.
+	 *
+	 * CAUTION:
+	 * After regs->MSR[TS] being updated, make sure that get_user(),
+	 * put_user() or similar functions are *not* called. These
+	 * functions can generate page faults which will cause the process
+	 * to be de-scheduled with MSR[TS] set but without calling
+	 * tm_recheckpoint(). This can cause a bug.
+	 */
+	regs->msr |= MSR_TM;
+
 	/* This loads the checkpointed FP/VEC state, if used */
 	tm_recheckpoint(&tsk->thread);
 
@@ -585,6 +599,8 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
 		regs->msr |= MSR_VEC;
 	}
 
+	preempt_enable();
+
 	return err;
 }
 #endif
@@ -741,11 +757,23 @@ SYSCALL_DEFINE0(rt_sigreturn)
 					   &uc_transact->uc_mcontext))
 			goto badframe;
 	}
-	else
-	/* Fall through, for non-TM restore */
 #endif
-	if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
-		goto badframe;
+	/* Fall through, for non-TM restore */
+	if (!MSR_TM_ACTIVE(msr)) {
+		/*
+		 * Unset MSR[TS] on the thread regs since MSR from user
+		 * context does not have MSR active, and recheckpoint was
+		 * not called since restore_tm_sigcontexts() was not called
+		 * also.
+		 *
+		 * If not unsetting it, the code can RFID to userspace with
+		 * MSR[TS] set, but without CPU in the proper state,
+		 * causing a TM bad thing.
+		 */
+		current->thread.regs->msr &= ~MSR_TS_MASK;
+		if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
+			goto badframe;
+	}
 
 	if (restore_altstack(&uc->uc_stack))
 		goto badframe;
diff --git a/arch/powerpc/kernel/syscalls/Makefile b/arch/powerpc/kernel/syscalls/Makefile
new file mode 100644
index 0000000..27b4895
--- /dev/null
+++ b/arch/powerpc/kernel/syscalls/Makefile
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0
+kapi := arch/$(SRCARCH)/include/generated/asm
+uapi := arch/$(SRCARCH)/include/generated/uapi/asm
+
+_dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)')	\
+	  $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)')
+
+syscall := $(srctree)/$(src)/syscall.tbl
+syshdr := $(srctree)/$(src)/syscallhdr.sh
+systbl := $(srctree)/$(src)/syscalltbl.sh
+
+quiet_cmd_syshdr = SYSHDR  $@
+      cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@'	\
+		   '$(syshdr_abis_$(basetarget))'		\
+		   '$(syshdr_pfx_$(basetarget))'		\
+		   '$(syshdr_offset_$(basetarget))'
+
+quiet_cmd_systbl = SYSTBL  $@
+      cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@'	\
+		   '$(systbl_abis_$(basetarget))'		\
+		   '$(systbl_abi_$(basetarget))'		\
+		   '$(systbl_offset_$(basetarget))'
+
+syshdr_abis_unistd_32 := common,nospu,32
+$(uapi)/unistd_32.h: $(syscall) $(syshdr)
+	$(call if_changed,syshdr)
+
+syshdr_abis_unistd_64 := common,nospu,64
+$(uapi)/unistd_64.h: $(syscall) $(syshdr)
+	$(call if_changed,syshdr)
+
+systbl_abis_syscall_table_32 := common,nospu,32
+systbl_abi_syscall_table_32 := 32
+$(kapi)/syscall_table_32.h: $(syscall) $(systbl)
+	$(call if_changed,systbl)
+
+systbl_abis_syscall_table_64 := common,nospu,64
+systbl_abi_syscall_table_64 := 64
+$(kapi)/syscall_table_64.h: $(syscall) $(systbl)
+	$(call if_changed,systbl)
+
+systbl_abis_syscall_table_c32 := common,nospu,32
+systbl_abi_syscall_table_c32 := c32
+$(kapi)/syscall_table_c32.h: $(syscall) $(systbl)
+	$(call if_changed,systbl)
+
+systbl_abis_syscall_table_spu := common,spu
+systbl_abi_syscall_table_spu := spu
+$(kapi)/syscall_table_spu.h: $(syscall) $(systbl)
+	$(call if_changed,systbl)
+
+uapisyshdr-y		+= unistd_32.h unistd_64.h
+kapisyshdr-y		+= syscall_table_32.h		\
+			   syscall_table_64.h		\
+			   syscall_table_c32.h		\
+			   syscall_table_spu.h
+
+targets	+= $(uapisyshdr-y) $(kapisyshdr-y)
+
+PHONY += all
+all: $(addprefix $(uapi)/,$(uapisyshdr-y))
+all: $(addprefix $(kapi)/,$(kapisyshdr-y))
+	@:
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
new file mode 100644
index 0000000..db3bbb8
--- /dev/null
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -0,0 +1,427 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#
+# system call numbers and entry vectors for powerpc
+#
+# The format is:
+# <number> <abi> <name> <entry point> <compat entry point>
+#
+# The <abi> can be common, spu, nospu, 64, or 32 for this file.
+#
+0	nospu	restart_syscall			sys_restart_syscall
+1	nospu	exit				sys_exit
+2	nospu	fork				ppc_fork
+3	common	read				sys_read
+4	common	write				sys_write
+5	common	open				sys_open			compat_sys_open
+6	common	close				sys_close
+7	common	waitpid				sys_waitpid
+8	common	creat				sys_creat
+9	common	link				sys_link
+10	common	unlink				sys_unlink
+11	nospu	execve				sys_execve			compat_sys_execve
+12	common	chdir				sys_chdir
+13	common	time				sys_time			compat_sys_time
+14	common	mknod				sys_mknod
+15	common	chmod				sys_chmod
+16	common	lchown				sys_lchown
+17	common	break				sys_ni_syscall
+18	32	oldstat				sys_stat			sys_ni_syscall
+18	64	oldstat				sys_ni_syscall
+18	spu	oldstat				sys_ni_syscall
+19	common	lseek				sys_lseek			compat_sys_lseek
+20	common	getpid				sys_getpid
+21	nospu	mount				sys_mount			compat_sys_mount
+22	32	umount				sys_oldumount
+22	64	umount				sys_ni_syscall
+22	spu	umount				sys_ni_syscall
+23	common	setuid				sys_setuid
+24	common	getuid				sys_getuid
+25	common	stime				sys_stime			compat_sys_stime
+26	nospu	ptrace				sys_ptrace			compat_sys_ptrace
+27	common	alarm				sys_alarm
+28	32	oldfstat			sys_fstat			sys_ni_syscall
+28	64	oldfstat			sys_ni_syscall
+28	spu	oldfstat			sys_ni_syscall
+29	nospu	pause				sys_pause
+30	nospu	utime				sys_utime			compat_sys_utime
+31	common	stty				sys_ni_syscall
+32	common	gtty				sys_ni_syscall
+33	common	access				sys_access
+34	common	nice				sys_nice
+35	common	ftime				sys_ni_syscall
+36	common	sync				sys_sync
+37	common	kill				sys_kill
+38	common	rename				sys_rename
+39	common	mkdir				sys_mkdir
+40	common	rmdir				sys_rmdir
+41	common	dup				sys_dup
+42	common	pipe				sys_pipe
+43	common	times				sys_times			compat_sys_times
+44	common	prof				sys_ni_syscall
+45	common	brk				sys_brk
+46	common	setgid				sys_setgid
+47	common	getgid				sys_getgid
+48	nospu	signal				sys_signal
+49	common	geteuid				sys_geteuid
+50	common	getegid				sys_getegid
+51	nospu	acct				sys_acct
+52	nospu	umount2				sys_umount
+53	common	lock				sys_ni_syscall
+54	common	ioctl				sys_ioctl			compat_sys_ioctl
+55	common	fcntl				sys_fcntl			compat_sys_fcntl
+56	common	mpx				sys_ni_syscall
+57	common	setpgid				sys_setpgid
+58	common	ulimit				sys_ni_syscall
+59	32	oldolduname			sys_olduname
+59	64	oldolduname			sys_ni_syscall
+59	spu	oldolduname			sys_ni_syscall
+60	common	umask				sys_umask
+61	common	chroot				sys_chroot
+62	nospu	ustat				sys_ustat			compat_sys_ustat
+63	common	dup2				sys_dup2
+64	common	getppid				sys_getppid
+65	common	getpgrp				sys_getpgrp
+66	common	setsid				sys_setsid
+67	32	sigaction			sys_sigaction			compat_sys_sigaction
+67	64	sigaction			sys_ni_syscall
+67	spu	sigaction			sys_ni_syscall
+68	common	sgetmask			sys_sgetmask
+69	common	ssetmask			sys_ssetmask
+70	common	setreuid			sys_setreuid
+71	common	setregid			sys_setregid
+72	32	sigsuspend			sys_sigsuspend
+72	64	sigsuspend			sys_ni_syscall
+72	spu	sigsuspend			sys_ni_syscall
+73	32	sigpending			sys_sigpending			compat_sys_sigpending
+73	64	sigpending			sys_ni_syscall
+73	spu	sigpending			sys_ni_syscall
+74	common	sethostname			sys_sethostname
+75	common	setrlimit			sys_setrlimit			compat_sys_setrlimit
+76	32	getrlimit			sys_old_getrlimit		compat_sys_old_getrlimit
+76	64	getrlimit			sys_ni_syscall
+76	spu	getrlimit			sys_ni_syscall
+77	common	getrusage			sys_getrusage			compat_sys_getrusage
+78	common	gettimeofday			sys_gettimeofday		compat_sys_gettimeofday
+79	common	settimeofday			sys_settimeofday		compat_sys_settimeofday
+80	common	getgroups			sys_getgroups
+81	common	setgroups			sys_setgroups
+82	32	select				ppc_select			sys_ni_syscall
+82	64	select				sys_ni_syscall
+82	spu	select				sys_ni_syscall
+83	common	symlink				sys_symlink
+84	32	oldlstat			sys_lstat			sys_ni_syscall
+84	64	oldlstat			sys_ni_syscall
+84	spu	oldlstat			sys_ni_syscall
+85	common	readlink			sys_readlink
+86	nospu	uselib				sys_uselib
+87	nospu	swapon				sys_swapon
+88	nospu	reboot				sys_reboot
+89	32	readdir				sys_old_readdir			compat_sys_old_readdir
+89	64	readdir				sys_ni_syscall
+89	spu	readdir				sys_ni_syscall
+90	common	mmap				sys_mmap
+91	common	munmap				sys_munmap
+92	common	truncate			sys_truncate			compat_sys_truncate
+93	common	ftruncate			sys_ftruncate			compat_sys_ftruncate
+94	common	fchmod				sys_fchmod
+95	common	fchown				sys_fchown
+96	common	getpriority			sys_getpriority
+97	common	setpriority			sys_setpriority
+98	common	profil				sys_ni_syscall
+99	nospu	statfs				sys_statfs			compat_sys_statfs
+100	nospu	fstatfs				sys_fstatfs			compat_sys_fstatfs
+101	common	ioperm				sys_ni_syscall
+102	common	socketcall			sys_socketcall			compat_sys_socketcall
+103	common	syslog				sys_syslog
+104	common	setitimer			sys_setitimer			compat_sys_setitimer
+105	common	getitimer			sys_getitimer			compat_sys_getitimer
+106	common	stat				sys_newstat			compat_sys_newstat
+107	common	lstat				sys_newlstat			compat_sys_newlstat
+108	common	fstat				sys_newfstat			compat_sys_newfstat
+109	32	olduname			sys_uname
+109	64	olduname			sys_ni_syscall
+109	spu	olduname			sys_ni_syscall
+110	common	iopl				sys_ni_syscall
+111	common	vhangup				sys_vhangup
+112	common	idle				sys_ni_syscall
+113	common	vm86				sys_ni_syscall
+114	common	wait4				sys_wait4			compat_sys_wait4
+115	nospu	swapoff				sys_swapoff
+116	common	sysinfo				sys_sysinfo			compat_sys_sysinfo
+117	nospu	ipc				sys_ipc				compat_sys_ipc
+118	common	fsync				sys_fsync
+119	32	sigreturn			sys_sigreturn			compat_sys_sigreturn
+119	64	sigreturn			sys_ni_syscall
+119	spu	sigreturn			sys_ni_syscall
+120	nospu	clone				ppc_clone
+121	common	setdomainname			sys_setdomainname
+122	common	uname				sys_newuname
+123	common	modify_ldt			sys_ni_syscall
+124	common	adjtimex			sys_adjtimex			compat_sys_adjtimex
+125	common	mprotect			sys_mprotect
+126	32	sigprocmask			sys_sigprocmask			compat_sys_sigprocmask
+126	64	sigprocmask			sys_ni_syscall
+126	spu	sigprocmask			sys_ni_syscall
+127	common	create_module			sys_ni_syscall
+128	nospu	init_module			sys_init_module
+129	nospu	delete_module			sys_delete_module
+130	common	get_kernel_syms			sys_ni_syscall
+131	nospu	quotactl			sys_quotactl
+132	common	getpgid				sys_getpgid
+133	common	fchdir				sys_fchdir
+134	common	bdflush				sys_bdflush
+135	common	sysfs				sys_sysfs
+136	32	personality			sys_personality			ppc64_personality
+136	64	personality			ppc64_personality
+136	spu	personality			ppc64_personality
+137	common	afs_syscall			sys_ni_syscall
+138	common	setfsuid			sys_setfsuid
+139	common	setfsgid			sys_setfsgid
+140	common	_llseek				sys_llseek
+141	common	getdents			sys_getdents			compat_sys_getdents
+142	common	_newselect			sys_select			compat_sys_select
+143	common	flock				sys_flock
+144	common	msync				sys_msync
+145	common	readv				sys_readv			compat_sys_readv
+146	common	writev				sys_writev			compat_sys_writev
+147	common	getsid				sys_getsid
+148	common	fdatasync			sys_fdatasync
+149	nospu	_sysctl				sys_sysctl			compat_sys_sysctl
+150	common	mlock				sys_mlock
+151	common	munlock				sys_munlock
+152	common	mlockall			sys_mlockall
+153	common	munlockall			sys_munlockall
+154	common	sched_setparam			sys_sched_setparam
+155	common	sched_getparam			sys_sched_getparam
+156	common	sched_setscheduler		sys_sched_setscheduler
+157	common	sched_getscheduler		sys_sched_getscheduler
+158	common	sched_yield			sys_sched_yield
+159	common	sched_get_priority_max		sys_sched_get_priority_max
+160	common	sched_get_priority_min		sys_sched_get_priority_min
+161	common	sched_rr_get_interval		sys_sched_rr_get_interval	compat_sys_sched_rr_get_interval
+162	common	nanosleep			sys_nanosleep			compat_sys_nanosleep
+163	common	mremap				sys_mremap
+164	common	setresuid			sys_setresuid
+165	common	getresuid			sys_getresuid
+166	common	query_module			sys_ni_syscall
+167	common	poll				sys_poll
+168	common	nfsservctl			sys_ni_syscall
+169	common	setresgid			sys_setresgid
+170	common	getresgid			sys_getresgid
+171	common	prctl				sys_prctl
+172	nospu	rt_sigreturn			sys_rt_sigreturn		compat_sys_rt_sigreturn
+173	nospu	rt_sigaction			sys_rt_sigaction		compat_sys_rt_sigaction
+174	nospu	rt_sigprocmask			sys_rt_sigprocmask		compat_sys_rt_sigprocmask
+175	nospu	rt_sigpending			sys_rt_sigpending		compat_sys_rt_sigpending
+176	nospu	rt_sigtimedwait			sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
+177	nospu 	rt_sigqueueinfo			sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
+178	nospu 	rt_sigsuspend			sys_rt_sigsuspend		compat_sys_rt_sigsuspend
+179	common	pread64				sys_pread64			compat_sys_pread64
+180	common	pwrite64			sys_pwrite64			compat_sys_pwrite64
+181	common	chown				sys_chown
+182	common	getcwd				sys_getcwd
+183	common	capget				sys_capget
+184	common	capset				sys_capset
+185	nospu	sigaltstack			sys_sigaltstack			compat_sys_sigaltstack
+186	32	sendfile			sys_sendfile			compat_sys_sendfile
+186	64	sendfile			sys_sendfile64
+186	spu	sendfile			sys_sendfile64
+187	common	getpmsg				sys_ni_syscall
+188	common 	putpmsg				sys_ni_syscall
+189	nospu	vfork				ppc_vfork
+190	common	ugetrlimit			sys_getrlimit			compat_sys_getrlimit
+191	common	readahead			sys_readahead			compat_sys_readahead
+192	32	mmap2				sys_mmap2			compat_sys_mmap2
+193	32	truncate64			sys_truncate64			compat_sys_truncate64
+194	32	ftruncate64			sys_ftruncate64			compat_sys_ftruncate64
+195	32	stat64				sys_stat64
+196	32	lstat64				sys_lstat64
+197	32	fstat64				sys_fstat64
+198	nospu 	pciconfig_read			sys_pciconfig_read
+199	nospu 	pciconfig_write			sys_pciconfig_write
+200	nospu 	pciconfig_iobase		sys_pciconfig_iobase
+201	common 	multiplexer			sys_ni_syscall
+202	common	getdents64			sys_getdents64
+203	common	pivot_root			sys_pivot_root
+204	32	fcntl64				sys_fcntl64			compat_sys_fcntl64
+205	common	madvise				sys_madvise
+206	common	mincore				sys_mincore
+207	common	gettid				sys_gettid
+208	common	tkill				sys_tkill
+209	common	setxattr			sys_setxattr
+210	common	lsetxattr			sys_lsetxattr
+211	common	fsetxattr			sys_fsetxattr
+212	common	getxattr			sys_getxattr
+213	common	lgetxattr			sys_lgetxattr
+214	common	fgetxattr			sys_fgetxattr
+215	common	listxattr			sys_listxattr
+216	common	llistxattr			sys_llistxattr
+217	common	flistxattr			sys_flistxattr
+218	common	removexattr			sys_removexattr
+219	common	lremovexattr			sys_lremovexattr
+220	common	fremovexattr			sys_fremovexattr
+221	common	futex				sys_futex			compat_sys_futex
+222	common	sched_setaffinity		sys_sched_setaffinity		compat_sys_sched_setaffinity
+223	common	sched_getaffinity		sys_sched_getaffinity		compat_sys_sched_getaffinity
+# 224 unused
+225	common	tuxcall				sys_ni_syscall
+226	32	sendfile64			sys_sendfile64			compat_sys_sendfile64
+227	common	io_setup			sys_io_setup			compat_sys_io_setup
+228	common	io_destroy			sys_io_destroy
+229	common	io_getevents			sys_io_getevents		compat_sys_io_getevents
+230	common	io_submit			sys_io_submit			compat_sys_io_submit
+231	common	io_cancel			sys_io_cancel
+232	nospu	set_tid_address			sys_set_tid_address
+233	common	fadvise64			sys_fadvise64			ppc32_fadvise64
+234	nospu	exit_group			sys_exit_group
+235	nospu	lookup_dcookie			sys_lookup_dcookie		compat_sys_lookup_dcookie
+236	common	epoll_create			sys_epoll_create
+237	common	epoll_ctl			sys_epoll_ctl
+238	common	epoll_wait			sys_epoll_wait
+239	common	remap_file_pages		sys_remap_file_pages
+240	common	timer_create			sys_timer_create		compat_sys_timer_create
+241	common	timer_settime			sys_timer_settime		compat_sys_timer_settime
+242	common	timer_gettime			sys_timer_gettime		compat_sys_timer_gettime
+243	common	timer_getoverrun		sys_timer_getoverrun
+244	common	timer_delete			sys_timer_delete
+245	common	clock_settime			sys_clock_settime		compat_sys_clock_settime
+246	common	clock_gettime			sys_clock_gettime		compat_sys_clock_gettime
+247	common	clock_getres			sys_clock_getres		compat_sys_clock_getres
+248	common	clock_nanosleep			sys_clock_nanosleep		compat_sys_clock_nanosleep
+249	32	swapcontext			ppc_swapcontext			ppc32_swapcontext
+249	64	swapcontext			ppc64_swapcontext
+249	spu	swapcontext			sys_ni_syscall
+250	common	tgkill				sys_tgkill
+251	common	utimes				sys_utimes			compat_sys_utimes
+252	common	statfs64			sys_statfs64			compat_sys_statfs64
+253	common	fstatfs64			sys_fstatfs64			compat_sys_fstatfs64
+254	32	fadvise64_64			ppc_fadvise64_64
+254	spu	fadvise64_64			sys_ni_syscall
+255	common	rtas				sys_rtas
+256	32	sys_debug_setcontext		sys_debug_setcontext		sys_ni_syscall
+256	64	sys_debug_setcontext		sys_ni_syscall
+256	spu	sys_debug_setcontext		sys_ni_syscall
+# 257 reserved for vserver
+258	nospu	migrate_pages			sys_migrate_pages		compat_sys_migrate_pages
+259	nospu	mbind				sys_mbind			compat_sys_mbind
+260	nospu	get_mempolicy			sys_get_mempolicy		compat_sys_get_mempolicy
+261	nospu	set_mempolicy			sys_set_mempolicy		compat_sys_set_mempolicy
+262	nospu	mq_open				sys_mq_open			compat_sys_mq_open
+263	nospu	mq_unlink			sys_mq_unlink
+264	nospu	mq_timedsend			sys_mq_timedsend		compat_sys_mq_timedsend
+265	nospu	mq_timedreceive			sys_mq_timedreceive		compat_sys_mq_timedreceive
+266	nospu	mq_notify			sys_mq_notify			compat_sys_mq_notify
+267	nospu	mq_getsetattr			sys_mq_getsetattr		compat_sys_mq_getsetattr
+268	nospu	kexec_load			sys_kexec_load			compat_sys_kexec_load
+269	nospu	add_key				sys_add_key
+270	nospu	request_key			sys_request_key
+271	nospu	keyctl				sys_keyctl			compat_sys_keyctl
+272	nospu	waitid				sys_waitid			compat_sys_waitid
+273	nospu	ioprio_set			sys_ioprio_set
+274	nospu	ioprio_get			sys_ioprio_get
+275	nospu	inotify_init			sys_inotify_init
+276	nospu	inotify_add_watch		sys_inotify_add_watch
+277	nospu	inotify_rm_watch		sys_inotify_rm_watch
+278	nospu	spu_run				sys_spu_run
+279	nospu	spu_create			sys_spu_create
+280	nospu	pselect6			sys_pselect6			compat_sys_pselect6
+281	nospu	ppoll				sys_ppoll			compat_sys_ppoll
+282	common	unshare				sys_unshare
+283	common	splice				sys_splice
+284	common	tee				sys_tee
+285	common	vmsplice			sys_vmsplice			compat_sys_vmsplice
+286	common	openat				sys_openat			compat_sys_openat
+287	common	mkdirat				sys_mkdirat
+288	common	mknodat				sys_mknodat
+289	common	fchownat			sys_fchownat
+290	common	futimesat			sys_futimesat			compat_sys_futimesat
+291	32	fstatat64			sys_fstatat64
+291	64	newfstatat			sys_newfstatat
+291	spu	newfstatat			sys_newfstatat
+292	common	unlinkat			sys_unlinkat
+293	common	renameat			sys_renameat
+294	common	linkat				sys_linkat
+295	common	symlinkat			sys_symlinkat
+296	common	readlinkat			sys_readlinkat
+297	common	fchmodat			sys_fchmodat
+298	common	faccessat			sys_faccessat
+299	common	get_robust_list			sys_get_robust_list		compat_sys_get_robust_list
+300	common	set_robust_list			sys_set_robust_list		compat_sys_set_robust_list
+301	common	move_pages			sys_move_pages			compat_sys_move_pages
+302	common	getcpu				sys_getcpu
+303	nospu	epoll_pwait			sys_epoll_pwait			compat_sys_epoll_pwait
+304	common	utimensat			sys_utimensat			compat_sys_utimensat
+305	common	signalfd			sys_signalfd			compat_sys_signalfd
+306	common	timerfd_create			sys_timerfd_create
+307	common	eventfd				sys_eventfd
+308	common	sync_file_range2		sys_sync_file_range2		compat_sys_sync_file_range2
+309	nospu	fallocate			sys_fallocate			compat_sys_fallocate
+310	nospu	subpage_prot			sys_subpage_prot
+311	common	timerfd_settime			sys_timerfd_settime		compat_sys_timerfd_settime
+312	common	timerfd_gettime			sys_timerfd_gettime		compat_sys_timerfd_gettime
+313	common	signalfd4			sys_signalfd4			compat_sys_signalfd4
+314	common	eventfd2			sys_eventfd2
+315	common	epoll_create1			sys_epoll_create1
+316	common	dup3				sys_dup3
+317	common	pipe2				sys_pipe2
+318	nospu	inotify_init1			sys_inotify_init1
+319	common	perf_event_open			sys_perf_event_open
+320	common	preadv				sys_preadv			compat_sys_preadv
+321	common	pwritev				sys_pwritev			compat_sys_pwritev
+322	nospu	rt_tgsigqueueinfo		sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
+323	nospu	fanotify_init			sys_fanotify_init
+324	nospu	fanotify_mark			sys_fanotify_mark		compat_sys_fanotify_mark
+325	common	prlimit64			sys_prlimit64
+326	common	socket				sys_socket
+327	common	bind				sys_bind
+328	common	connect				sys_connect
+329	common	listen				sys_listen
+330	common	accept				sys_accept
+331	common	getsockname			sys_getsockname
+332	common	getpeername			sys_getpeername
+333	common	socketpair			sys_socketpair
+334	common	send				sys_send
+335	common	sendto				sys_sendto
+336	common	recv				sys_recv			compat_sys_recv
+337	common	recvfrom			sys_recvfrom			compat_sys_recvfrom
+338	common	shutdown			sys_shutdown
+339	common	setsockopt			sys_setsockopt			compat_sys_setsockopt
+340	common	getsockopt			sys_getsockopt			compat_sys_getsockopt
+341	common	sendmsg				sys_sendmsg			compat_sys_sendmsg
+342	common	recvmsg				sys_recvmsg			compat_sys_recvmsg
+343	common	recvmmsg			sys_recvmmsg			compat_sys_recvmmsg
+344	common	accept4				sys_accept4
+345	common	name_to_handle_at		sys_name_to_handle_at
+346	common	open_by_handle_at		sys_open_by_handle_at		compat_sys_open_by_handle_at
+347	common	clock_adjtime			sys_clock_adjtime		compat_sys_clock_adjtime
+348	common	syncfs				sys_syncfs
+349	common	sendmmsg			sys_sendmmsg			compat_sys_sendmmsg
+350	common	setns				sys_setns
+351	nospu	process_vm_readv		sys_process_vm_readv		compat_sys_process_vm_readv
+352	nospu	process_vm_writev		sys_process_vm_writev		compat_sys_process_vm_writev
+353	nospu	finit_module			sys_finit_module
+354	nospu	kcmp				sys_kcmp
+355	common	sched_setattr			sys_sched_setattr
+356	common	sched_getattr			sys_sched_getattr
+357	common	renameat2			sys_renameat2
+358	common	seccomp				sys_seccomp
+359	common	getrandom			sys_getrandom
+360	common	memfd_create			sys_memfd_create
+361	common	bpf				sys_bpf
+362	nospu	execveat			sys_execveat			compat_sys_execveat
+363	32	switch_endian			sys_ni_syscall
+363	64	switch_endian			ppc_switch_endian
+363	spu	switch_endian			sys_ni_syscall
+364	common	userfaultfd			sys_userfaultfd
+365	common	membarrier			sys_membarrier
+378	nospu	mlock2				sys_mlock2
+379	nospu	copy_file_range			sys_copy_file_range
+380	common	preadv2				sys_preadv2			compat_sys_preadv2
+381	common	pwritev2			sys_pwritev2			compat_sys_pwritev2
+382	nospu	kexec_file_load			sys_kexec_file_load
+383	nospu	statx				sys_statx
+384	nospu	pkey_alloc			sys_pkey_alloc
+385	nospu	pkey_free			sys_pkey_free
+386	nospu	pkey_mprotect			sys_pkey_mprotect
+387	nospu	rseq				sys_rseq
+388	nospu	io_pgetevents			sys_io_pgetevents		compat_sys_io_pgetevents
diff --git a/arch/powerpc/kernel/syscalls/syscallhdr.sh b/arch/powerpc/kernel/syscalls/syscallhdr.sh
new file mode 100644
index 0000000..c0a9a32
--- /dev/null
+++ b/arch/powerpc/kernel/syscalls/syscallhdr.sh
@@ -0,0 +1,37 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+in="$1"
+out="$2"
+my_abis=`echo "($3)" | tr ',' '|'`
+prefix="$4"
+offset="$5"
+
+fileguard=_UAPI_ASM_POWERPC_`basename "$out" | sed \
+	-e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
+	-e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
+grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
+	printf "#ifndef %s\n" "${fileguard}"
+	printf "#define %s\n" "${fileguard}"
+	printf "\n"
+
+	nxt=0
+	while read nr abi name entry compat ; do
+		if [ -z "$offset" ]; then
+			printf "#define __NR_%s%s\t%s\n" \
+				"${prefix}" "${name}" "${nr}"
+		else
+			printf "#define __NR_%s%s\t(%s + %s)\n" \
+				"${prefix}" "${name}" "${offset}" "${nr}"
+		fi
+		nxt=$((nr+1))
+	done
+
+	printf "\n"
+	printf "#ifdef __KERNEL__\n"
+	printf "#define __NR_syscalls\t%s\n" "${nxt}"
+	printf "#endif\n"
+	printf "\n"
+	printf "#endif /* %s */" "${fileguard}"
+	printf "\n"
+) > "$out"
diff --git a/arch/powerpc/kernel/syscalls/syscalltbl.sh b/arch/powerpc/kernel/syscalls/syscalltbl.sh
new file mode 100644
index 0000000..fd62049
--- /dev/null
+++ b/arch/powerpc/kernel/syscalls/syscalltbl.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+in="$1"
+out="$2"
+my_abis=`echo "($3)" | tr ',' '|'`
+my_abi="$4"
+offset="$5"
+
+emit() {
+	t_nxt="$1"
+	t_nr="$2"
+	t_entry="$3"
+
+	while [ $t_nxt -lt $t_nr ]; do
+		printf "__SYSCALL(%s,sys_ni_syscall, )\n" "${t_nxt}"
+		t_nxt=$((t_nxt+1))
+	done
+	printf "__SYSCALL(%s,%s, )\n" "${t_nxt}" "${t_entry}"
+}
+
+grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
+	nxt=0
+	if [ -z "$offset" ]; then
+		offset=0
+	fi
+
+	while read nr abi name entry compat ; do
+		if [ "$my_abi" = "c32" ] && [ ! -z "$compat" ]; then
+			emit $((nxt+offset)) $((nr+offset)) $compat
+		else
+			emit $((nxt+offset)) $((nr+offset)) $entry
+		fi
+		nxt=$((nr+1))
+	done
+) > "$out"
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index 919a327..23265a2 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -16,28 +16,6 @@
 
 #include <asm/ppc_asm.h>
 
-#ifdef CONFIG_PPC64
-#define SYSCALL(func)		.8byte	DOTSYM(sys_##func),DOTSYM(sys_##func)
-#define COMPAT_SYS(func)	.8byte	DOTSYM(sys_##func),DOTSYM(compat_sys_##func)
-#define PPC_SYS(func)		.8byte	DOTSYM(ppc_##func),DOTSYM(ppc_##func)
-#define OLDSYS(func)		.8byte	DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall)
-#define SYS32ONLY(func)		.8byte	DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func)
-#define PPC64ONLY(func)		.8byte	DOTSYM(ppc_##func),DOTSYM(sys_ni_syscall)
-#define SYSX(f, f3264, f32)	.8byte	DOTSYM(f),DOTSYM(f3264)
-#else
-#define SYSCALL(func)		.long	sys_##func
-#define COMPAT_SYS(func)	.long	sys_##func
-#define PPC_SYS(func)		.long	ppc_##func
-#define OLDSYS(func)		.long	sys_##func
-#define SYS32ONLY(func)		.long	sys_##func
-#define PPC64ONLY(func)		.long	sys_ni_syscall
-#define SYSX(f, f3264, f32)	.long	f32
-#endif
-#define SYSCALL_SPU(func)	SYSCALL(func)
-#define COMPAT_SYS_SPU(func)	COMPAT_SYS(func)
-#define COMPAT_SPU_NEW(func)	COMPAT_SYS(func)
-#define SYSX_SPU(f, f3264, f32)	SYSX(f, f3264, f32)
-
 .section .rodata,"a"
 
 #ifdef CONFIG_PPC64
@@ -46,5 +24,21 @@
 
 .globl sys_call_table
 sys_call_table:
+#ifdef CONFIG_PPC64
+#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry)
+#include <asm/syscall_table_64.h>
+#undef __SYSCALL
+#else
+#define __SYSCALL(nr, entry, nargs) .long entry
+#include <asm/syscall_table_32.h>
+#undef __SYSCALL
+#endif
 
-#include <asm/systbl.h>
+#ifdef CONFIG_COMPAT
+.globl compat_sys_call_table
+compat_sys_call_table:
+#define compat_sys_sigsuspend	sys_sigsuspend
+#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry)
+#include <asm/syscall_table_c32.h>
+#undef __SYSCALL
+#endif
diff --git a/arch/powerpc/kernel/systbl_chk.c b/arch/powerpc/kernel/systbl_chk.c
deleted file mode 100644
index 4653258..0000000
--- a/arch/powerpc/kernel/systbl_chk.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * This file, when run through CPP produces a list of syscall numbers
- * in the order of systbl.h.  That way we can check for gaps and syscalls
- * that are out of order.
- *
- * Unfortunately, we cannot check for the correct ordering of entries
- * using SYSX().
- *
- * Copyright © IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <asm/unistd.h>
-
-#define SYSCALL(func)		__NR_##func
-#define COMPAT_SYS(func)	__NR_##func
-#define PPC_SYS(func)		__NR_##func
-#ifdef CONFIG_PPC64
-#define OLDSYS(func)		-1
-#define SYS32ONLY(func)		-1
-#define PPC64ONLY(func)		__NR_##func
-#else
-#define OLDSYS(func)		__NR_old##func
-#define SYS32ONLY(func)		__NR_##func
-#define PPC64ONLY(func)		-1
-#endif
-#define SYSX(f, f3264, f32)	-1
-
-#define SYSCALL_SPU(func)	SYSCALL(func)
-#define COMPAT_SYS_SPU(func)	COMPAT_SYS(func)
-#define COMPAT_SPU_NEW(func)	COMPAT_SYS(_new##func)
-#define SYSX_SPU(f, f3264, f32)	SYSX(f, f3264, f32)
-
-/* Just insert a marker for ni_syscalls */
-#define	__NR_ni_syscall		-1
-
-/*
- * These are the known exceptions.
- * Hopefully, there will be no more.
- */
-#define	__NR_llseek		__NR__llseek
-#undef	__NR_umount
-#define	__NR_umount		__NR_umount2
-#define	__NR_old_getrlimit	__NR_getrlimit
-#define	__NR_newstat		__NR_stat
-#define	__NR_newlstat		__NR_lstat
-#define	__NR_newfstat		__NR_fstat
-#define	__NR_newuname		__NR_uname
-#define	__NR_sysctl		__NR__sysctl
-#define __NR_olddebug_setcontext	__NR_sys_debug_setcontext
-
-/* We call sys_ugetrlimit for syscall number __NR_getrlimit */
-#define getrlimit		ugetrlimit
-
-START_TABLE
-#include <asm/systbl.h>
-END_TABLE NR_syscalls
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 9a86572..00af2c4 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1434,7 +1434,8 @@ void program_check_exception(struct pt_regs *regs)
 			goto bail;
 		} else {
 			printk(KERN_EMERG "Unexpected TM Bad Thing exception "
-			       "at %lx (msr 0x%lx)\n", regs->nip, regs->msr);
+			       "at %lx (msr 0x%lx) tm_scratch=%llx\n",
+			       regs->nip, regs->msr, get_paca()->tm_scratch);
 			die("Unrecoverable exception", regs, SIGABRT);
 		}
 	}
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 65b3bdb..7725a97 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -671,15 +671,18 @@ static void __init vdso_setup_syscall_map(void)
 {
 	unsigned int i;
 	extern unsigned long *sys_call_table;
+#ifdef CONFIG_PPC64
+	extern unsigned long *compat_sys_call_table;
+#endif
 	extern unsigned long sys_ni_syscall;
 
 
 	for (i = 0; i < NR_syscalls; i++) {
 #ifdef CONFIG_PPC64
-		if (sys_call_table[i*2] != sys_ni_syscall)
+		if (sys_call_table[i] != sys_ni_syscall)
 			vdso_data->syscall_map_64[i >> 5] |=
 				0x80000000UL >> (i & 0x1f);
-		if (sys_call_table[i*2+1] != sys_ni_syscall)
+		if (compat_sys_call_table[i] != sys_ni_syscall)
 			vdso_data->syscall_map_32[i >> 5] |=
 				0x80000000UL >> (i & 0x1f);
 #else /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 62a8d03..532ab797 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -397,12 +397,13 @@ static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
 	return H_SUCCESS;
 }
 
-static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
+static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl,
+		unsigned long entry)
 {
 	unsigned long hpa = 0;
 	enum dma_data_direction dir = DMA_NONE;
 
-	iommu_tce_xchg(tbl, entry, &hpa, &dir);
+	iommu_tce_xchg(mm, tbl, entry, &hpa, &dir);
 }
 
 static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
@@ -433,7 +434,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
 	unsigned long hpa = 0;
 	long ret;
 
-	if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
+	if (WARN_ON_ONCE(iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir)))
 		return H_TOO_HARD;
 
 	if (dir == DMA_NONE)
@@ -441,7 +442,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
 
 	ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
 	if (ret != H_SUCCESS)
-		iommu_tce_xchg(tbl, entry, &hpa, &dir);
+		iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir);
 
 	return ret;
 }
@@ -487,7 +488,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
 	if (mm_iommu_mapped_inc(mem))
 		return H_TOO_HARD;
 
-	ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
+	ret = iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir);
 	if (WARN_ON_ONCE(ret)) {
 		mm_iommu_mapped_dec(mem);
 		return H_TOO_HARD;
@@ -566,7 +567,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 					entry, ua, dir);
 
 		if (ret != H_SUCCESS) {
-			kvmppc_clear_tce(stit->tbl, entry);
+			kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
 			goto unlock_exit;
 		}
 	}
@@ -655,7 +656,8 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 					iommu_tce_direction(tce));
 
 			if (ret != H_SUCCESS) {
-				kvmppc_clear_tce(stit->tbl, entry);
+				kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl,
+						entry);
 				goto unlock_exit;
 			}
 		}
@@ -704,7 +706,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
 				return ret;
 
 			WARN_ON_ONCE(1);
-			kvmppc_clear_tce(stit->tbl, entry);
+			kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
 		}
 	}
 
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index c866d9a..056b750 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -226,7 +226,9 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr,
 static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
 			     unsigned long address)
 {
-	if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT))) {
+	/* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on others */
+	if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT |
+				      DSISR_PROTFAULT))) {
 		printk_ratelimited(KERN_CRIT "kernel tried to execute"
 				   " exec-protected page (%lx) -"
 				   "exploit attempt? (uid: %d)\n",
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 56c2234..a712a65 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -36,6 +36,8 @@ struct mm_iommu_table_group_mem_t {
 	u64 ua;			/* userspace address */
 	u64 entries;		/* number of entries in hpas[] */
 	u64 *hpas;		/* vmalloc'ed */
+#define MM_IOMMU_TABLE_INVALID_HPA	((uint64_t)-1)
+	u64 dev_hpa;		/* Device memory base address */
 };
 
 static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
@@ -126,7 +128,8 @@ static int mm_iommu_move_page_from_cma(struct page *page)
 	return 0;
 }
 
-long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
+static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
+		unsigned long entries, unsigned long dev_hpa,
 		struct mm_iommu_table_group_mem_t **pmem)
 {
 	struct mm_iommu_table_group_mem_t *mem;
@@ -140,12 +143,6 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 
 	list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
 			next) {
-		if ((mem->ua == ua) && (mem->entries == entries)) {
-			++mem->used;
-			*pmem = mem;
-			goto unlock_exit;
-		}
-
 		/* Overlap? */
 		if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
 				(ua < (mem->ua +
@@ -156,11 +153,13 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 
 	}
 
-	ret = mm_iommu_adjust_locked_vm(mm, entries, true);
-	if (ret)
-		goto unlock_exit;
+	if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
+		ret = mm_iommu_adjust_locked_vm(mm, entries, true);
+		if (ret)
+			goto unlock_exit;
 
-	locked_entries = entries;
+		locked_entries = entries;
+	}
 
 	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
 	if (!mem) {
@@ -168,6 +167,13 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 		goto unlock_exit;
 	}
 
+	if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) {
+		mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
+		mem->dev_hpa = dev_hpa;
+		goto good_exit;
+	}
+	mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA;
+
 	/*
 	 * For a starting point for a maximum page size calculation
 	 * we use @ua and @entries natural alignment to allow IOMMU pages
@@ -236,6 +242,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
 	}
 
+good_exit:
 	atomic64_set(&mem->mapped, 1);
 	mem->used = 1;
 	mem->ua = ua;
@@ -252,13 +259,31 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(mm_iommu_get);
+
+long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
+		struct mm_iommu_table_group_mem_t **pmem)
+{
+	return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA,
+			pmem);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_new);
+
+long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
+		unsigned long entries, unsigned long dev_hpa,
+		struct mm_iommu_table_group_mem_t **pmem)
+{
+	return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_newdev);
 
 static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
 {
 	long i;
 	struct page *page = NULL;
 
+	if (!mem->hpas)
+		return;
+
 	for (i = 0; i < mem->entries; ++i) {
 		if (!mem->hpas[i])
 			continue;
@@ -300,6 +325,7 @@ static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
 long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
 {
 	long ret = 0;
+	unsigned long entries, dev_hpa;
 
 	mutex_lock(&mem_list_mutex);
 
@@ -321,9 +347,12 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
 	}
 
 	/* @mapped became 0 so now mappings are disabled, release the region */
+	entries = mem->entries;
+	dev_hpa = mem->dev_hpa;
 	mm_iommu_release(mem);
 
-	mm_iommu_adjust_locked_vm(mm, mem->entries, false);
+	if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+		mm_iommu_adjust_locked_vm(mm, entries, false);
 
 unlock_exit:
 	mutex_unlock(&mem_list_mutex);
@@ -368,27 +397,32 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
 	return ret;
 }
 
-struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
+struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
 		unsigned long ua, unsigned long entries)
 {
 	struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
 
+	mutex_lock(&mem_list_mutex);
+
 	list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
 		if ((mem->ua == ua) && (mem->entries == entries)) {
 			ret = mem;
+			++mem->used;
 			break;
 		}
 	}
 
+	mutex_unlock(&mem_list_mutex);
+
 	return ret;
 }
-EXPORT_SYMBOL_GPL(mm_iommu_find);
+EXPORT_SYMBOL_GPL(mm_iommu_get);
 
 long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
 		unsigned long ua, unsigned int pageshift, unsigned long *hpa)
 {
 	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
-	u64 *va = &mem->hpas[entry];
+	u64 *va;
 
 	if (entry >= mem->entries)
 		return -EFAULT;
@@ -396,6 +430,12 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
 	if (pageshift > mem->pageshift)
 		return -EFAULT;
 
+	if (!mem->hpas) {
+		*hpa = mem->dev_hpa + (ua - mem->ua);
+		return 0;
+	}
+
+	va = &mem->hpas[entry];
 	*hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
 
 	return 0;
@@ -406,7 +446,6 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
 		unsigned long ua, unsigned int pageshift, unsigned long *hpa)
 {
 	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
-	void *va = &mem->hpas[entry];
 	unsigned long *pa;
 
 	if (entry >= mem->entries)
@@ -415,7 +454,12 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
 	if (pageshift > mem->pageshift)
 		return -EFAULT;
 
-	pa = (void *) vmalloc_to_phys(va);
+	if (!mem->hpas) {
+		*hpa = mem->dev_hpa + (ua - mem->ua);
+		return 0;
+	}
+
+	pa = (void *) vmalloc_to_phys(&mem->hpas[entry]);
 	if (!pa)
 		return -EFAULT;
 
@@ -435,6 +479,9 @@ extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
 	if (!mem)
 		return;
 
+	if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA)
+		return;
+
 	entry = (ua - mem->ua) >> PAGE_SHIFT;
 	va = &mem->hpas[entry];
 
@@ -445,6 +492,33 @@ extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
 	*pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
 }
 
+bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
+		unsigned int pageshift, unsigned long *size)
+{
+	struct mm_iommu_table_group_mem_t *mem;
+	unsigned long end;
+
+	list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+		if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+			continue;
+
+		end = mem->dev_hpa + (mem->entries << PAGE_SHIFT);
+		if ((mem->dev_hpa <= hpa) && (hpa < end)) {
+			/*
+			 * Since the IOMMU page size might be bigger than
+			 * PAGE_SIZE, the amount of preregistered memory
+			 * starting from @hpa might be smaller than 1<<pageshift
+			 * and the caller needs to distinguish this situation.
+			 */
+			*size = min(1UL << pageshift, end - hpa);
+			return true;
+		}
+	}
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_is_devmem);
+
 long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
 {
 	if (atomic64_inc_not_zero(&mem->mapped))
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 04b60a8..5878077 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -415,3 +415,13 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
 
 	return pkey_access_permitted(vma_pkey(vma), write, execute);
 }
+
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+	if (static_branch_likely(&pkey_disabled))
+		return;
+
+	/* Duplicate the oldmm pkey state in mm: */
+	mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
+	mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+}
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 383cc36..b072300 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -10,6 +10,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/perf_event.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
@@ -2166,7 +2167,7 @@ static bool pmc_overflow(unsigned long val)
 /*
  * Performance monitor interrupt stuff
  */
-static void perf_event_interrupt(struct pt_regs *regs)
+static void __perf_event_interrupt(struct pt_regs *regs)
 {
 	int i, j;
 	struct cpu_hw_events *cpuhw = this_cpu_ptr(&cpu_hw_events);
@@ -2250,6 +2251,14 @@ static void perf_event_interrupt(struct pt_regs *regs)
 		irq_exit();
 }
 
+static void perf_event_interrupt(struct pt_regs *regs)
+{
+	u64 start_clock = sched_clock();
+
+	__perf_event_interrupt(regs);
+	perf_sample_event_took(sched_clock() - start_clock);
+}
+
 static int power_pmu_prepare_cpu(unsigned int cpu)
 {
 	struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c
index 7e4f8ca..f467247 100644
--- a/arch/powerpc/platforms/44x/warp.c
+++ b/arch/powerpc/platforms/44x/warp.c
@@ -179,9 +179,9 @@ static int pika_setup_leds(void)
 	}
 
 	for_each_child_of_node(np, child)
-		if (strcmp(child->name, "green") == 0)
+		if (of_node_name_eq(child, "green"))
 			green_led = of_get_gpio(child, 0);
-		else if (strcmp(child->name, "red") == 0)
+		else if (of_node_name_eq(child, "red"))
 			red_led = of_get_gpio(child, 0);
 
 	of_node_put(np);
diff --git a/arch/powerpc/platforms/4xx/ocm.c b/arch/powerpc/platforms/4xx/ocm.c
index f5bbd45..f2610a0 100644
--- a/arch/powerpc/platforms/4xx/ocm.c
+++ b/arch/powerpc/platforms/4xx/ocm.c
@@ -223,8 +223,6 @@ static void __init ocm_init_node(int count, struct device_node *node)
 	INIT_LIST_HEAD(&ocm->c.list);
 
 	ocm->ready = 1;
-
-	return;
 }
 
 static int ocm_debugfs_show(struct seq_file *m, void *v)
@@ -242,9 +240,7 @@ static int ocm_debugfs_show(struct seq_file *m, void *v)
 		seq_printf(m, "PhysAddr     : 0x%llx\n", ocm->phys);
 		seq_printf(m, "MemTotal     : %d Bytes\n", ocm->memtotal);
 		seq_printf(m, "MemTotal(NC) : %d Bytes\n", ocm->nc.memtotal);
-		seq_printf(m, "MemTotal(C)  : %d Bytes\n", ocm->c.memtotal);
-
-		seq_printf(m, "\n");
+		seq_printf(m, "MemTotal(C)  : %d Bytes\n\n", ocm->c.memtotal);
 
 		seq_printf(m, "NC.PhysAddr  : 0x%llx\n", ocm->nc.phys);
 		seq_printf(m, "NC.VirtAddr  : 0x%p\n", ocm->nc.virt);
@@ -256,9 +252,7 @@ static int ocm_debugfs_show(struct seq_file *m, void *v)
 							blk->size, blk->owner);
 		}
 
-		seq_printf(m, "\n");
-
-		seq_printf(m, "C.PhysAddr   : 0x%llx\n", ocm->c.phys);
+		seq_printf(m, "\nC.PhysAddr   : 0x%llx\n", ocm->c.phys);
 		seq_printf(m, "C.VirtAddr   : 0x%p\n", ocm->c.virt);
 		seq_printf(m, "C.MemTotal   : %d Bytes\n", ocm->c.memtotal);
 		seq_printf(m, "C.MemFree    : %d Bytes\n", ocm->c.memfree);
@@ -268,7 +262,7 @@ static int ocm_debugfs_show(struct seq_file *m, void *v)
 						blk->size, blk->owner);
 		}
 
-		seq_printf(m, "\n");
+		seq_putc(m, '\n');
 	}
 
 	return 0;
@@ -338,7 +332,6 @@ void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align,
 
 		ocm_blk = kzalloc(sizeof(*ocm_blk), GFP_KERNEL);
 		if (!ocm_blk) {
-			printk(KERN_ERR "PPC4XX OCM: could not allocate ocm block");
 			rh_free(ocm_reg->rh, offset);
 			break;
 		}
@@ -392,10 +385,8 @@ static int __init ppc4xx_ocm_init(void)
 		return 0;
 
 	ocm_nodes = kzalloc((count * sizeof(struct ocm_info)), GFP_KERNEL);
-	if (!ocm_nodes) {
-		printk(KERN_ERR "PPC4XX OCM: failed to allocate OCM nodes!\n");
+	if (!ocm_nodes)
 		return -ENOMEM;
-	}
 
 	ocm_count = count;
 	count = 0;
diff --git a/arch/powerpc/platforms/4xx/pci.c b/arch/powerpc/platforms/4xx/pci.c
index d0825f5..e6e2adc 100644
--- a/arch/powerpc/platforms/4xx/pci.c
+++ b/arch/powerpc/platforms/4xx/pci.c
@@ -1399,7 +1399,6 @@ static void __init ppc_476fpe_pciex_check_link(struct ppc4xx_pciex_port *port)
 		printk(KERN_WARNING "PCIE%d: Link up failed\n", port->index);
 
 	iounmap(mbase);
-	return;
 }
 
 static struct ppc4xx_pciex_hwops ppc_476fpe_pcie_hwops __initdata =
diff --git a/arch/powerpc/platforms/52xx/efika.c b/arch/powerpc/platforms/52xx/efika.c
index 1ecbf17..6153886 100644
--- a/arch/powerpc/platforms/52xx/efika.c
+++ b/arch/powerpc/platforms/52xx/efika.c
@@ -82,11 +82,9 @@ static void __init efika_pcisetup(void)
 		return;
 	}
 
-	for (pcictrl = NULL;;) {
-		pcictrl = of_get_next_child(root, pcictrl);
-		if ((pcictrl == NULL) || (strcmp(pcictrl->name, "pci") == 0))
+	for_each_child_of_node(root, pcictrl)
+		if (of_node_name_eq(pcictrl, "pci"))
 			break;
-	}
 
 	of_node_put(root);
 
diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c
index 902d527..e2e1371 100644
--- a/arch/powerpc/platforms/cell/setup.c
+++ b/arch/powerpc/platforms/cell/setup.c
@@ -131,7 +131,7 @@ static int cell_setup_phb(struct pci_controller *phb)
 
 	np = phb->dn;
 	model = of_get_property(np, "model", NULL);
-	if (model == NULL || strcmp(np->name, "pci"))
+	if (model == NULL || !of_node_name_eq(np, "pci"))
 		return 0;
 
 	/* Setup workarounds for spider */
diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c
index 8ae8620..125f2a5 100644
--- a/arch/powerpc/platforms/cell/spu_callbacks.c
+++ b/arch/powerpc/platforms/cell/spu_callbacks.c
@@ -34,20 +34,9 @@
  */
 
 static void *spu_syscall_table[] = {
-#define SYSCALL(func)		sys_ni_syscall,
-#define COMPAT_SYS(func)	sys_ni_syscall,
-#define PPC_SYS(func)		sys_ni_syscall,
-#define OLDSYS(func)		sys_ni_syscall,
-#define SYS32ONLY(func)		sys_ni_syscall,
-#define PPC64ONLY(func)		sys_ni_syscall,
-#define SYSX(f, f3264, f32)	sys_ni_syscall,
-
-#define SYSCALL_SPU(func)	sys_##func,
-#define COMPAT_SYS_SPU(func)	sys_##func,
-#define COMPAT_SPU_NEW(func)	sys_##func,
-#define SYSX_SPU(f, f3264, f32)	f,
-
-#include <asm/systbl.h>
+#define __SYSCALL(nr, entry, nargs) entry,
+#include <asm/syscall_table_spu.h>
+#undef __SYSCALL
 };
 
 long spu_sys_callback(struct spu_syscall_block *s)
diff --git a/arch/powerpc/platforms/cell/spu_manage.c b/arch/powerpc/platforms/cell/spu_manage.c
index f7e3637..bed935c 100644
--- a/arch/powerpc/platforms/cell/spu_manage.c
+++ b/arch/powerpc/platforms/cell/spu_manage.c
@@ -458,7 +458,6 @@ static void init_affinity_node(int cbe)
 	struct device_node *vic_dn, *last_spu_dn;
 	phandle avoid_ph;
 	const phandle *vic_handles;
-	const char *name;
 	int lenp, i, added;
 
 	last_spu = list_first_entry(&cbe_spu_info[cbe].spus, struct spu,
@@ -480,12 +479,7 @@ static void init_affinity_node(int cbe)
 			if (!vic_dn)
 				continue;
 
-			/* a neighbour might be spe, mic-tm, or bif0 */
-			name = of_get_property(vic_dn, "name", NULL);
-			if (!name)
-				continue;
-
-			if (strcmp(name, "spe") == 0) {
+			if (of_node_name_eq(vic_dn, "spe") ) {
 				spu = devnode_spu(cbe, vic_dn);
 				avoid_ph = last_spu_dn->phandle;
 			} else {
@@ -498,7 +492,7 @@ static void init_affinity_node(int cbe)
 				spu = neighbour_spu(cbe, vic_dn, last_spu_dn);
 				if (!spu)
 					continue;
-				if (!strcmp(name, "mic-tm")) {
+				if (of_node_name_eq(vic_dn, "mic-tm")) {
 					last_spu->has_mem_affinity = 1;
 					spu->has_mem_affinity = 1;
 				}
diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c
index 3c21f04..e66644e 100644
--- a/arch/powerpc/platforms/chrp/setup.c
+++ b/arch/powerpc/platforms/chrp/setup.c
@@ -287,10 +287,7 @@ static __init void chrp_init(void)
 	 * or /pci@80000000/isa@C/serial@i2F8
 	 * The optional graphics card has also type 'serial' in VGA mode.
 	 */
-	property = of_get_property(node, "name", NULL);
-	if (!property)
-		goto out_put;
-	if (!strcmp(property, "failsafe") || !strcmp(property, "serial"))
+	if (of_node_name_eq(node, "failsafe") || of_node_name_eq(node, "serial"))
 		add_preferred_console("ttyS", 0, NULL);
 out_put:
 	of_node_put(node);
diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c
index bdb33163..c3e5ee8 100644
--- a/arch/powerpc/platforms/powermac/feature.c
+++ b/arch/powerpc/platforms/powermac/feature.c
@@ -173,9 +173,9 @@ static long ohare_htw_scc_enable(struct device_node *node, long param,
 	macio = macio_find(node, 0);
 	if (!macio)
 		return -ENODEV;
-	if (!strcmp(node->name, "ch-a"))
+	if (of_node_name_eq(node, "ch-a"))
 		chan_mask = MACIO_FLAG_SCCA_ON;
-	else if (!strcmp(node->name, "ch-b"))
+	else if (of_node_name_eq(node, "ch-b"))
 		chan_mask = MACIO_FLAG_SCCB_ON;
 	else
 		return -ENODEV;
@@ -610,9 +610,9 @@ static long core99_scc_enable(struct device_node *node, long param, long value)
 	macio = macio_find(node, 0);
 	if (!macio)
 		return -ENODEV;
-	if (!strcmp(node->name, "ch-a"))
+	if (of_node_name_eq(node, "ch-a"))
 		chan_mask = MACIO_FLAG_SCCA_ON;
-	else if (!strcmp(node->name, "ch-b"))
+	else if (of_node_name_eq(node, "ch-b"))
 		chan_mask = MACIO_FLAG_SCCB_ON;
 	else
 		return -ENODEV;
@@ -1392,8 +1392,7 @@ static long g5_mpic_enable(struct device_node *node, long param, long value)
 
 	if (parent == NULL)
 		return 0;
-	is_u3 = strcmp(parent->name, "u3") == 0 ||
-		strcmp(parent->name, "u4") == 0;
+	is_u3 = of_node_name_eq(parent, "u3") || of_node_name_eq(parent, "u4");
 	of_node_put(parent);
 	if (!is_u3)
 		return 0;
diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c
index 84bace3..4de058a 100644
--- a/arch/powerpc/platforms/powermac/low_i2c.c
+++ b/arch/powerpc/platforms/powermac/low_i2c.c
@@ -617,7 +617,7 @@ static void __init kw_i2c_probe(void)
 		 * but not for now
 		 */
 		child = of_get_next_child(np, NULL);
-		multibus = !child || strcmp(child->name, "i2c-bus");
+		multibus = !of_node_name_eq(child, "i2c-bus");
 		of_node_put(child);
 
 		/* For a multibus setup, we get the bus count based on the
@@ -1205,7 +1205,7 @@ static void pmac_i2c_devscan(void (*callback)(struct device_node *dev,
 				if (bus != pmac_i2c_find_bus(np))
 					continue;
 			for (p = whitelist; p->name != NULL; p++) {
-				if (strcmp(np->name, p->name))
+				if (!of_node_name_eq(np, p->name))
 					continue;
 				if (p->compatible &&
 				    !of_device_is_compatible(np, p->compatible))
diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c
index 04527d1..3d74205 100644
--- a/arch/powerpc/platforms/powermac/pci.c
+++ b/arch/powerpc/platforms/powermac/pci.c
@@ -501,9 +501,7 @@ static void __init init_p2pbridge(void)
 	/* XXX it would be better here to identify the specific
 	   PCI-PCI bridge chip we have. */
 	p2pbridge = of_find_node_by_name(NULL, "pci-bridge");
-	if (p2pbridge == NULL
-	    || p2pbridge->parent == NULL
-	    || strcmp(p2pbridge->parent->name, "pci") != 0)
+	if (p2pbridge == NULL || !of_node_name_eq(p2pbridge->parent, "pci"))
 		goto done;
 	if (pci_device_from_OF_node(p2pbridge, &bus, &devfn) < 0) {
 		DBG("Can't find PCI infos for PCI<->PCI bridge\n");
@@ -828,14 +826,14 @@ static int __init pmac_add_bridge(struct device_node *dev)
 	if (of_device_is_compatible(dev, "uni-north")) {
 		primary = setup_uninorth(hose, &rsrc);
 		disp_name = "UniNorth";
-	} else if (strcmp(dev->name, "pci") == 0) {
+	} else if (of_node_name_eq(dev, "pci")) {
 		/* XXX assume this is a mpc106 (grackle) */
 		setup_grackle(hose);
 		disp_name = "Grackle (MPC106)";
-	} else if (strcmp(dev->name, "bandit") == 0) {
+	} else if (of_node_name_eq(dev, "bandit")) {
 		setup_bandit(hose, &rsrc);
 		disp_name = "Bandit";
-	} else if (strcmp(dev->name, "chaos") == 0) {
+	} else if (of_node_name_eq(dev, "chaos")) {
 		setup_chaos(hose, &rsrc);
 		disp_name = "Chaos";
 		primary = 0;
@@ -914,16 +912,14 @@ void __init pmac_pci_init(void)
 		       "of device tree\n");
 		return;
 	}
-	for (np = NULL; (np = of_get_next_child(root, np)) != NULL;) {
-		if (np->name == NULL)
-			continue;
-		if (strcmp(np->name, "bandit") == 0
-		    || strcmp(np->name, "chaos") == 0
-		    || strcmp(np->name, "pci") == 0) {
+	for_each_child_of_node(root, np) {
+		if (of_node_name_eq(np, "bandit")
+		    || of_node_name_eq(np, "chaos")
+		    || of_node_name_eq(np, "pci")) {
 			if (pmac_add_bridge(np) == 0)
 				of_node_get(np);
 		}
-		if (strcmp(np->name, "ht") == 0) {
+		if (of_node_name_eq(np, "ht")) {
 			of_node_get(np);
 			ht = np;
 		}
@@ -983,7 +979,7 @@ static bool pmac_pci_enable_device_hook(struct pci_dev *dev)
 	/* Firewire & GMAC were disabled after PCI probe, the driver is
 	 * claiming them, we must re-enable them now.
 	 */
-	if (uninorth_child && !strcmp(node->name, "firewire") &&
+	if (uninorth_child && of_node_name_eq(node, "firewire") &&
 	    (of_device_is_compatible(node, "pci106b,18") ||
 	     of_device_is_compatible(node, "pci106b,30") ||
 	     of_device_is_compatible(node, "pci11c1,5811"))) {
@@ -991,7 +987,7 @@ static bool pmac_pci_enable_device_hook(struct pci_dev *dev)
 		pmac_call_feature(PMAC_FTR_1394_ENABLE, node, 0, 1);
 		updatecfg = 1;
 	}
-	if (uninorth_child && !strcmp(node->name, "ethernet") &&
+	if (uninorth_child && of_node_name_eq(node, "ethernet") &&
 	    of_device_is_compatible(node, "gmac")) {
 		pmac_call_feature(PMAC_FTR_GMAC_ENABLE, node, 0, 1);
 		updatecfg = 1;
@@ -1262,4 +1258,3 @@ struct pci_controller_ops pmac_pci_controller_ops = {
 	.enable_device_hook	= pmac_pci_enable_device_hook,
 #endif
 };
-
diff --git a/arch/powerpc/platforms/powermac/pfunc_base.c b/arch/powerpc/platforms/powermac/pfunc_base.c
index fd2e210..62311e8 100644
--- a/arch/powerpc/platforms/powermac/pfunc_base.c
+++ b/arch/powerpc/platforms/powermac/pfunc_base.c
@@ -101,9 +101,8 @@ static void macio_gpio_init_one(struct macio_chip *macio)
 	 * Find the "gpio" parent node
 	 */
 
-	for (gparent = NULL;
-	     (gparent = of_get_next_child(macio->of_node, gparent)) != NULL;)
-		if (strcmp(gparent->name, "gpio") == 0)
+	for_each_child_of_node(macio->of_node, gparent)
+		if (of_node_name_eq(gparent, "gpio"))
 			break;
 	if (gparent == NULL)
 		return;
@@ -313,7 +312,7 @@ static void uninorth_install_pfunc(void)
 	 * Install handlers for the hwclock child if any
 	 */
 	for (np = NULL; (np = of_get_next_child(uninorth_node, np)) != NULL;)
-		if (strcmp(np->name, "hw-clock") == 0) {
+		if (of_node_name_eq(np, "hw-clock")) {
 			unin_hwclock = np;
 			break;
 		}
diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c
index 5a71395..c292ffa 100644
--- a/arch/powerpc/platforms/powermac/pic.c
+++ b/arch/powerpc/platforms/powermac/pic.c
@@ -553,13 +553,13 @@ void __init pmac_pic_init(void)
 
 		for_each_node_with_property(np, "interrupt-controller") {
 			/* Skip /chosen/interrupt-controller */
-			if (strcmp(np->name, "chosen") == 0)
+			if (of_node_name_eq(np, "chosen"))
 				continue;
 			/* It seems like at least one person wants
 			 * to use BootX on a machine with an AppleKiwi
 			 * controller which happens to pretend to be an
 			 * interrupt controller too. */
-			if (strcmp(np->name, "AppleKiwi") == 0)
+			if (of_node_name_eq(np, "AppleKiwi"))
 				continue;
 			/* I think we found one ! */
 			of_irq_dflt_pic = np;
diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c
index 2f00e3d..2e8221e 100644
--- a/arch/powerpc/platforms/powermac/setup.c
+++ b/arch/powerpc/platforms/powermac/setup.c
@@ -560,15 +560,9 @@ static int __init check_pmac_serial_console(void)
 	}
 	pr_debug("stdout is %pOF\n", prom_stdout);
 
-	name = of_get_property(prom_stdout, "name", NULL);
-	if (!name) {
-		pr_debug(" stdout package has no name !\n");
-		goto not_found;
-	}
-
-	if (strcmp(name, "ch-a") == 0)
+	if (of_node_name_eq(prom_stdout, "ch-a"))
 		offset = 0;
-	else if (strcmp(name, "ch-b") == 0)
+	else if (of_node_name_eq(prom_stdout, "ch-b"))
 		offset = 1;
 	else
 		goto not_found;
diff --git a/arch/powerpc/platforms/powermac/udbg_scc.c b/arch/powerpc/platforms/powermac/udbg_scc.c
index 8901973..415b74d 100644
--- a/arch/powerpc/platforms/powermac/udbg_scc.c
+++ b/arch/powerpc/platforms/powermac/udbg_scc.c
@@ -87,7 +87,7 @@ void udbg_scc_init(int force_scc)
 	for (ch = NULL; (ch = of_get_next_child(escc, ch)) != NULL;) {
 		if (ch == stdout)
 			ch_def = of_node_get(ch);
-		if (strcmp(ch->name, "ch-a") == 0)
+		if (of_node_name_eq(ch, "ch-a"))
 			ch_a = of_node_get(ch);
 	}
 	if (ch_def == NULL && !force_scc)
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index abc0be7..f380789 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -564,8 +564,8 @@ static void pnv_eeh_get_phb_diag(struct eeh_pe *pe)
 static int pnv_eeh_get_phb_state(struct eeh_pe *pe)
 {
 	struct pnv_phb *phb = pe->phb->private_data;
-	u8 fstate;
-	__be16 pcierr;
+	u8 fstate = 0;
+	__be16 pcierr = 0;
 	s64 rc;
 	int result = 0;
 
@@ -603,8 +603,8 @@ static int pnv_eeh_get_phb_state(struct eeh_pe *pe)
 static int pnv_eeh_get_pe_state(struct eeh_pe *pe)
 {
 	struct pnv_phb *phb = pe->phb->private_data;
-	u8 fstate;
-	__be16 pcierr;
+	u8 fstate = 0;
+	__be16 pcierr = 0;
 	s64 rc;
 	int result;
 
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 75b9352..d7f742e 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -9,32 +9,19 @@
  * License as published by the Free Software Foundation.
  */
 
-#include <linux/slab.h>
 #include <linux/mmu_notifier.h>
 #include <linux/mmu_context.h>
 #include <linux/of.h>
-#include <linux/export.h>
 #include <linux/pci.h>
 #include <linux/memblock.h>
-#include <linux/iommu.h>
 #include <linux/sizes.h>
 
 #include <asm/debugfs.h>
-#include <asm/tlb.h>
 #include <asm/powernv.h>
-#include <asm/reg.h>
-#include <asm/opal.h>
-#include <asm/io.h>
-#include <asm/iommu.h>
-#include <asm/pnv-pci.h>
-#include <asm/msi_bitmap.h>
 #include <asm/opal.h>
 
-#include "powernv.h"
 #include "pci.h"
 
-#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
-
 /*
  * spinlock to protect initialisation of an npu_context for a particular
  * mm_struct.
@@ -133,15 +120,25 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
 	return pe;
 }
 
-long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
+static long pnv_npu_unset_window(struct iommu_table_group *table_group,
+		int num);
+
+static long pnv_npu_set_window(struct iommu_table_group *table_group, int num,
 		struct iommu_table *tbl)
 {
+	struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
 	struct pnv_phb *phb = npe->phb;
 	int64_t rc;
 	const unsigned long size = tbl->it_indirect_levels ?
 		tbl->it_level_size : tbl->it_size;
 	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
 	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+	int num2 = (num == 0) ? 1 : 0;
+
+	/* NPU has just one TVE so if there is another table, remove it first */
+	if (npe->table_group.tables[num2])
+		pnv_npu_unset_window(&npe->table_group, num2);
 
 	pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
 			start_addr, start_addr + win_size - 1,
@@ -167,11 +164,16 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
 	return 0;
 }
 
-long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
+static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
 {
+	struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
 	struct pnv_phb *phb = npe->phb;
 	int64_t rc;
 
+	if (!npe->table_group.tables[num])
+		return 0;
+
 	pe_info(npe, "Removing DMA window\n");
 
 	rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
@@ -210,7 +212,8 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
 	if (!gpe)
 		return;
 
-	rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
+	rc = pnv_npu_set_window(&npe->table_group, 0,
+			gpe->table_group.tables[0]);
 
 	/*
 	 * NVLink devices use the same TCE table configuration as
@@ -235,7 +238,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
 	if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
 		return -EINVAL;
 
-	rc = pnv_npu_unset_window(npe, 0);
+	rc = pnv_npu_unset_window(&npe->table_group, 0);
 	if (rc != OPAL_SUCCESS)
 		return rc;
 
@@ -288,11 +291,15 @@ void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
 	}
 }
 
+#ifdef CONFIG_IOMMU_API
 /* Switch ownership from platform code to external user (e.g. VFIO) */
-void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
+static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
 {
+	struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
 	struct pnv_phb *phb = npe->phb;
 	int64_t rc;
+	struct pci_dev *gpdev = NULL;
 
 	/*
 	 * Note: NPU has just a single TVE in the hardware which means that
@@ -301,7 +308,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
 	 * if it was enabled at the moment of ownership change.
 	 */
 	if (npe->table_group.tables[0]) {
-		pnv_npu_unset_window(npe, 0);
+		pnv_npu_unset_window(&npe->table_group, 0);
 		return;
 	}
 
@@ -314,31 +321,316 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
 		return;
 	}
 	pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
+
+	get_gpu_pci_dev_and_pe(npe, &gpdev);
+	if (gpdev)
+		pnv_npu2_unmap_lpar_dev(gpdev);
 }
 
-struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
+static void pnv_npu_release_ownership(struct iommu_table_group *table_group)
 {
-	struct pnv_phb *phb = npe->phb;
-	struct pci_bus *pbus = phb->hose->bus;
-	struct pci_dev *npdev, *gpdev = NULL, *gptmp;
-	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
+	struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pci_dev *gpdev = NULL;
 
-	if (!gpe || !gpdev)
-		return NULL;
+	get_gpu_pci_dev_and_pe(npe, &gpdev);
+	if (gpdev)
+		pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV);
+}
 
-	list_for_each_entry(npdev, &pbus->devices, bus_list) {
-		gptmp = pnv_pci_get_gpu_dev(npdev);
+static struct iommu_table_group_ops pnv_pci_npu_ops = {
+	.set_window = pnv_npu_set_window,
+	.unset_window = pnv_npu_unset_window,
+	.take_ownership = pnv_npu_take_ownership,
+	.release_ownership = pnv_npu_release_ownership,
+};
+#endif /* !CONFIG_IOMMU_API */
 
-		if (gptmp != gpdev)
+/*
+ * NPU2 ATS
+ */
+/* Maximum possible number of ATSD MMIO registers per NPU */
+#define NV_NMMU_ATSD_REGS 8
+#define NV_NPU_MAX_PE_NUM	16
+
+/*
+ * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
+ * up to 3 x (GPU + 2xNPUs) (POWER9).
+ */
+struct npu_comp {
+	struct iommu_table_group table_group;
+	int pe_num;
+	struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
+};
+
+/* An NPU descriptor, valid for POWER9 only */
+struct npu {
+	int index;
+	__be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
+	unsigned int mmio_atsd_count;
+
+	/* Bitmask for MMIO register usage */
+	unsigned long mmio_atsd_usage;
+
+	/* Do we need to explicitly flush the nest mmu? */
+	bool nmmu_flush;
+
+	struct npu_comp npucomp;
+};
+
+#ifdef CONFIG_IOMMU_API
+static long pnv_npu_peers_create_table_userspace(
+		struct iommu_table_group *table_group,
+		int num, __u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table **ptbl)
+{
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	if (!npucomp->pe_num || !npucomp->pe[0] ||
+			!npucomp->pe[0]->table_group.ops ||
+			!npucomp->pe[0]->table_group.ops->create_table)
+		return -EFAULT;
+
+	return npucomp->pe[0]->table_group.ops->create_table(
+			&npucomp->pe[0]->table_group, num, page_shift,
+			window_size, levels, ptbl);
+}
+
+static long pnv_npu_peers_set_window(struct iommu_table_group *table_group,
+		int num, struct iommu_table *tbl)
+{
+	int i, j;
+	long ret = 0;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		if (!pe->table_group.ops->set_window)
 			continue;
 
-		pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
-		iommu_group_add_device(gpe->table_group.group, &npdev->dev);
+		ret = pe->table_group.ops->set_window(&pe->table_group,
+				num, tbl);
+		if (ret)
+			break;
 	}
 
-	return gpe;
+	if (ret) {
+		for (j = 0; j < i; ++j) {
+			struct pnv_ioda_pe *pe = npucomp->pe[j];
+
+			if (!pe->table_group.ops->unset_window)
+				continue;
+
+			ret = pe->table_group.ops->unset_window(
+					&pe->table_group, num);
+			if (ret)
+				break;
+		}
+	} else {
+		table_group->tables[num] = iommu_tce_table_get(tbl);
+	}
+
+	return ret;
 }
 
+static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group,
+		int num)
+{
+	int i, j;
+	long ret = 0;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		WARN_ON(npucomp->table_group.tables[num] !=
+				table_group->tables[num]);
+		if (!npucomp->table_group.tables[num])
+			continue;
+
+		if (!pe->table_group.ops->unset_window)
+			continue;
+
+		ret = pe->table_group.ops->unset_window(&pe->table_group, num);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		for (j = 0; j < i; ++j) {
+			struct pnv_ioda_pe *pe = npucomp->pe[j];
+
+			if (!npucomp->table_group.tables[num])
+				continue;
+
+			if (!pe->table_group.ops->set_window)
+				continue;
+
+			ret = pe->table_group.ops->set_window(&pe->table_group,
+					num, table_group->tables[num]);
+			if (ret)
+				break;
+		}
+	} else if (table_group->tables[num]) {
+		iommu_tce_table_put(table_group->tables[num]);
+		table_group->tables[num] = NULL;
+	}
+
+	return ret;
+}
+
+static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group)
+{
+	int i;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		if (!pe->table_group.ops->take_ownership)
+			continue;
+		pe->table_group.ops->take_ownership(&pe->table_group);
+	}
+}
+
+static void pnv_npu_peers_release_ownership(
+		struct iommu_table_group *table_group)
+{
+	int i;
+	struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
+			table_group);
+
+	for (i = 0; i < npucomp->pe_num; ++i) {
+		struct pnv_ioda_pe *pe = npucomp->pe[i];
+
+		if (!pe->table_group.ops->release_ownership)
+			continue;
+		pe->table_group.ops->release_ownership(&pe->table_group);
+	}
+}
+
+static struct iommu_table_group_ops pnv_npu_peers_ops = {
+	.get_table_size = pnv_pci_ioda2_get_table_size,
+	.create_table = pnv_npu_peers_create_table_userspace,
+	.set_window = pnv_npu_peers_set_window,
+	.unset_window = pnv_npu_peers_unset_window,
+	.take_ownership = pnv_npu_peers_take_ownership,
+	.release_ownership = pnv_npu_peers_release_ownership,
+};
+
+static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
+		struct pnv_ioda_pe *pe)
+{
+	if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
+		return;
+
+	npucomp->pe[npucomp->pe_num] = pe;
+	++npucomp->pe_num;
+}
+
+struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table_group *table_group;
+	struct npu_comp *npucomp;
+	struct pci_dev *gpdev = NULL;
+	struct pci_controller *hose;
+	struct pci_dev *npdev = NULL;
+
+	list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
+		npdev = pnv_pci_get_npu_dev(gpdev, 0);
+		if (npdev)
+			break;
+	}
+
+	if (!npdev)
+		/* It is not an NPU attached device, skip */
+		return NULL;
+
+	hose = pci_bus_to_host(npdev->bus);
+
+	if (hose->npu) {
+		table_group = &hose->npu->npucomp.table_group;
+
+		if (!table_group->group) {
+			table_group->ops = &pnv_npu_peers_ops;
+			iommu_register_group(table_group,
+					hose->global_number,
+					pe->pe_number);
+		}
+	} else {
+		/* Create a group for 1 GPU and attached NPUs for POWER8 */
+		pe->npucomp = kzalloc(sizeof(pe->npucomp), GFP_KERNEL);
+		table_group = &pe->npucomp->table_group;
+		table_group->ops = &pnv_npu_peers_ops;
+		iommu_register_group(table_group, hose->global_number,
+				pe->pe_number);
+	}
+
+	/* Steal capabilities from a GPU PE */
+	table_group->max_dynamic_windows_supported =
+		pe->table_group.max_dynamic_windows_supported;
+	table_group->tce32_start = pe->table_group.tce32_start;
+	table_group->tce32_size = pe->table_group.tce32_size;
+	table_group->max_levels = pe->table_group.max_levels;
+	if (!table_group->pgsizes)
+		table_group->pgsizes = pe->table_group.pgsizes;
+
+	npucomp = container_of(table_group, struct npu_comp, table_group);
+	pnv_comp_attach_table_group(npucomp, pe);
+
+	return table_group;
+}
+
+struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table_group *table_group;
+	struct npu_comp *npucomp;
+	struct pci_dev *gpdev = NULL;
+	struct pci_dev *npdev;
+	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev);
+
+	WARN_ON(!(pe->flags & PNV_IODA_PE_DEV));
+	if (!gpe)
+		return NULL;
+
+	/*
+	 * IODA2 bridges get this set up from pci_controller_ops::setup_bridge
+	 * but NPU bridges do not have this hook defined so we do it here.
+	 * We do not setup other table group parameters as they won't be used
+	 * anyway - NVLink bridges are subordinate PEs.
+	 */
+	pe->table_group.ops = &pnv_pci_npu_ops;
+
+	table_group = iommu_group_get_iommudata(
+			iommu_group_get(&gpdev->dev));
+
+	/*
+	 * On P9 NPU PHB and PCI PHB support different page sizes,
+	 * keep only matching. We expect here that NVLink bridge PE pgsizes is
+	 * initialized by the caller.
+	 */
+	table_group->pgsizes &= pe->table_group.pgsizes;
+	npucomp = container_of(table_group, struct npu_comp, table_group);
+	pnv_comp_attach_table_group(npucomp, pe);
+
+	list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) {
+		struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev);
+
+		if (gpdevtmp != gpdev)
+			continue;
+
+		iommu_add_device(table_group, &npdev->dev);
+	}
+
+	return table_group;
+}
+#endif /* CONFIG_IOMMU_API */
+
 /* Maximum number of nvlinks per npu */
 #define NV_MAX_LINKS 6
 
@@ -490,7 +782,6 @@ static void acquire_atsd_reg(struct npu_context *npu_context,
 	int i, j;
 	struct npu *npu;
 	struct pci_dev *npdev;
-	struct pnv_phb *nphb;
 
 	for (i = 0; i <= max_npu2_index; i++) {
 		mmio_atsd_reg[i].reg = -1;
@@ -505,8 +796,10 @@ static void acquire_atsd_reg(struct npu_context *npu_context,
 			if (!npdev)
 				continue;
 
-			nphb = pci_bus_to_host(npdev->bus)->private_data;
-			npu = &nphb->npu;
+			npu = pci_bus_to_host(npdev->bus)->npu;
+			if (!npu)
+				continue;
+
 			mmio_atsd_reg[i].npu = npu;
 			mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
 			while (mmio_atsd_reg[i].reg < 0) {
@@ -671,9 +964,9 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 	u32 nvlink_index;
 	struct device_node *nvlink_dn;
 	struct mm_struct *mm = current->mm;
-	struct pnv_phb *nphb;
 	struct npu *npu;
 	struct npu_context *npu_context;
+	struct pci_controller *hose;
 
 	/*
 	 * At present we don't support GPUs connected to multiple NPUs and I'm
@@ -681,13 +974,14 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 	 */
 	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
 
-	if (!firmware_has_feature(FW_FEATURE_OPAL))
-		return ERR_PTR(-ENODEV);
-
 	if (!npdev)
 		/* No nvlink associated with this GPU device */
 		return ERR_PTR(-ENODEV);
 
+	/* We only support DR/PR/HV in pnv_npu2_map_lpar_dev() */
+	if (flags & ~(MSR_DR | MSR_PR | MSR_HV))
+		return ERR_PTR(-EINVAL);
+
 	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
 	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
 							&nvlink_index)))
@@ -701,20 +995,10 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 		return ERR_PTR(-EINVAL);
 	}
 
-	nphb = pci_bus_to_host(npdev->bus)->private_data;
-	npu = &nphb->npu;
-
-	/*
-	 * Setup the NPU context table for a particular GPU. These need to be
-	 * per-GPU as we need the tables to filter ATSDs when there are no
-	 * active contexts on a particular GPU. It is safe for these to be
-	 * called concurrently with destroy as the OPAL call takes appropriate
-	 * locks and refcounts on init/destroy.
-	 */
-	rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
-				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
-	if (rc < 0)
-		return ERR_PTR(-ENOSPC);
+	hose = pci_bus_to_host(npdev->bus);
+	npu = hose->npu;
+	if (!npu)
+		return ERR_PTR(-ENODEV);
 
 	/*
 	 * We store the npu pci device so we can more easily get at the
@@ -726,9 +1010,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 		if (npu_context->release_cb != cb ||
 			npu_context->priv != priv) {
 			spin_unlock(&npu_context_lock);
-			opal_npu_destroy_context(nphb->opal_id, mm->context.id,
-						PCI_DEVID(gpdev->bus->number,
-							gpdev->devfn));
 			return ERR_PTR(-EINVAL);
 		}
 
@@ -754,9 +1035,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 
 		if (rc) {
 			kfree(npu_context);
-			opal_npu_destroy_context(nphb->opal_id, mm->context.id,
-					PCI_DEVID(gpdev->bus->number,
-						gpdev->devfn));
 			return ERR_PTR(rc);
 		}
 
@@ -776,7 +1054,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 	 */
 	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
 
-	if (!nphb->npu.nmmu_flush) {
+	if (!npu->nmmu_flush) {
 		/*
 		 * If we're not explicitly flushing ourselves we need to mark
 		 * the thread for global flushes
@@ -809,27 +1087,24 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context,
 			struct pci_dev *gpdev)
 {
 	int removed;
-	struct pnv_phb *nphb;
 	struct npu *npu;
 	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
 	struct device_node *nvlink_dn;
 	u32 nvlink_index;
+	struct pci_controller *hose;
 
 	if (WARN_ON(!npdev))
 		return;
 
-	if (!firmware_has_feature(FW_FEATURE_OPAL))
+	hose = pci_bus_to_host(npdev->bus);
+	npu = hose->npu;
+	if (!npu)
 		return;
-
-	nphb = pci_bus_to_host(npdev->bus)->private_data;
-	npu = &nphb->npu;
 	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
 	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
 							&nvlink_index)))
 		return;
 	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
-	opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
-				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
 	spin_lock(&npu_context_lock);
 	removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
 	spin_unlock(&npu_context_lock);
@@ -857,13 +1132,12 @@ int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
 	u64 rc = 0, result = 0;
 	int i, is_write;
 	struct page *page[1];
+	const char __user *u;
+	char c;
 
 	/* mmap_sem should be held so the struct_mm must be present */
 	struct mm_struct *mm = context->mm;
 
-	if (!firmware_has_feature(FW_FEATURE_OPAL))
-		return -ENODEV;
-
 	WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
 
 	for (i = 0; i < count; i++) {
@@ -872,18 +1146,17 @@ int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
 					is_write ? FOLL_WRITE : 0,
 					page, NULL, NULL);
 
-		/*
-		 * To support virtualised environments we will have to do an
-		 * access to the page to ensure it gets faulted into the
-		 * hypervisor. For the moment virtualisation is not supported in
-		 * other areas so leave the access out.
-		 */
 		if (rc != 1) {
 			status[i] = rc;
 			result = -EFAULT;
 			continue;
 		}
 
+		/* Make sure partition scoped tree gets a pte */
+		u = page_address(page[0]);
+		if (__get_user(c, u))
+			result = -EFAULT;
+
 		status[i] = 0;
 		put_page(page[0]);
 	}
@@ -892,42 +1165,127 @@ int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
 }
 EXPORT_SYMBOL(pnv_npu2_handle_fault);
 
-int pnv_npu2_init(struct pnv_phb *phb)
+int pnv_npu2_init(struct pci_controller *hose)
 {
 	unsigned int i;
 	u64 mmio_atsd;
-	struct device_node *dn;
-	struct pci_dev *gpdev;
 	static int npu_index;
-	uint64_t rc = 0;
+	struct npu *npu;
+	int ret;
 
-	phb->npu.nmmu_flush =
-		of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush");
-	for_each_child_of_node(phb->hose->dn, dn) {
-		gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
-		if (gpdev) {
-			rc = opal_npu_map_lpar(phb->opal_id,
-				PCI_DEVID(gpdev->bus->number, gpdev->devfn),
-				0, 0);
-			if (rc)
-				dev_err(&gpdev->dev,
-					"Error %lld mapping device to LPAR\n",
-					rc);
-		}
+	npu = kzalloc(sizeof(*npu), GFP_KERNEL);
+	if (!npu)
+		return -ENOMEM;
+
+	npu->nmmu_flush = of_property_read_bool(hose->dn, "ibm,nmmu-flush");
+
+	for (i = 0; i < ARRAY_SIZE(npu->mmio_atsd_regs) &&
+			!of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
+				i, &mmio_atsd); i++)
+		npu->mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
+
+	pr_info("NPU%d: Found %d MMIO ATSD registers", hose->global_number, i);
+	npu->mmio_atsd_count = i;
+	npu->mmio_atsd_usage = 0;
+	npu_index++;
+	if (WARN_ON(npu_index >= NV_MAX_NPUS)) {
+		ret = -ENOSPC;
+		goto fail_exit;
+	}
+	max_npu2_index = npu_index;
+	npu->index = npu_index;
+	hose->npu = npu;
+
+	return 0;
+
+fail_exit:
+	for (i = 0; i < npu->mmio_atsd_count; ++i)
+		iounmap(npu->mmio_atsd_regs[i]);
+
+	kfree(npu);
+
+	return ret;
+}
+
+int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
+		unsigned long msr)
+{
+	int ret;
+	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+	struct pci_controller *hose;
+	struct pnv_phb *nphb;
+
+	if (!npdev)
+		return -ENODEV;
+
+	hose = pci_bus_to_host(npdev->bus);
+	nphb = hose->private_data;
+
+	dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n",
+			nphb->opal_id, lparid);
+	/*
+	 * Currently we only support radix and non-zero LPCR only makes sense
+	 * for hash tables so skiboot expects the LPCR parameter to be a zero.
+	 */
+	ret = opal_npu_map_lpar(nphb->opal_id,
+			PCI_DEVID(gpdev->bus->number, gpdev->devfn), lparid,
+			0 /* LPCR bits */);
+	if (ret) {
+		dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
+		return ret;
 	}
 
-	for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
-							i, &mmio_atsd); i++)
-		phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
-
-	pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i);
-	phb->npu.mmio_atsd_count = i;
-	phb->npu.mmio_atsd_usage = 0;
-	npu_index++;
-	if (WARN_ON(npu_index >= NV_MAX_NPUS))
-		return -ENOSPC;
-	max_npu2_index = npu_index;
-	phb->npu.index = npu_index;
+	dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n",
+			nphb->opal_id, msr);
+	ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr,
+			PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+	if (ret < 0)
+		dev_err(&gpdev->dev, "Failed to init context: %d\n", ret);
+	else
+		ret = 0;
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(pnv_npu2_map_lpar_dev);
+
+void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr)
+{
+	struct pci_dev *gpdev;
+
+	list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list)
+		pnv_npu2_map_lpar_dev(gpdev, 0, msr);
+}
+
+int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev)
+{
+	int ret;
+	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+	struct pci_controller *hose;
+	struct pnv_phb *nphb;
+
+	if (!npdev)
+		return -ENODEV;
+
+	hose = pci_bus_to_host(npdev->bus);
+	nphb = hose->private_data;
+
+	dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n",
+			nphb->opal_id);
+	ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/,
+			PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+	if (ret < 0) {
+		dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret);
+		return ret;
+	}
+
+	/* Set LPID to 0 anyway, just to be safe */
+	dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id);
+	ret = opal_npu_map_lpar(nphb->opal_id,
+			PCI_DEVID(gpdev->bus->number, gpdev->devfn), 0 /*LPID*/,
+			0 /* LPCR bits */);
+	if (ret)
+		dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev);
diff --git a/arch/powerpc/platforms/powernv/opal-power.c b/arch/powerpc/platforms/powernv/opal-power.c
index 58dc330..89ab1da 100644
--- a/arch/powerpc/platforms/powernv/opal-power.c
+++ b/arch/powerpc/platforms/powernv/opal-power.c
@@ -138,7 +138,7 @@ static struct notifier_block opal_power_control_nb = {
 	.priority	= 0,
 };
 
-static int __init opal_power_control_init(void)
+int __init opal_power_control_init(void)
 {
 	int ret, supported = 0;
 	struct device_node *np;
@@ -176,4 +176,3 @@ static int __init opal_power_control_init(void)
 
 	return 0;
 }
-machine_subsys_initcall(powernv, opal_power_control_init);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index beed86f..79586f1 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -877,7 +877,7 @@ static int __init opal_init(void)
 	consoles = of_find_node_by_path("/ibm,opal/consoles");
 	if (consoles) {
 		for_each_child_of_node(consoles, np) {
-			if (strcmp(np->name, "serial"))
+			if (!of_node_name_eq(np, "serial"))
 				continue;
 			of_platform_device_create(np, NULL, NULL);
 		}
@@ -960,6 +960,9 @@ static int __init opal_init(void)
 	/* Initialise OPAL sensor groups */
 	opal_sensor_groups_init();
 
+	/* Initialise OPAL Power control interface */
+	opal_power_control_init();
+
 	return 0;
 }
 machine_subsys_initcall(powernv, opal_init);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index fe96910..697449a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -299,7 +299,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	if (alloc_userspace_copy) {
 		offset = 0;
 		uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
-				levels, tce_table_size, &offset,
+				tmplevels, tce_table_size, &offset,
 				&total_allocated_uas);
 		if (!uas)
 			goto free_tces_exit;
@@ -368,6 +368,7 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
 	found = false;
 	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 		if (table_group->tables[i] == tbl) {
+			iommu_tce_table_put(tbl);
 			table_group->tables[i] = NULL;
 			found = true;
 			break;
@@ -393,7 +394,7 @@ long pnv_pci_link_table_and_group(int node, int num,
 	tgl->table_group = table_group;
 	list_add_rcu(&tgl->next, &tbl->it_group_list);
 
-	table_group->tables[num] = tbl;
+	table_group->tables[num] = iommu_tce_table_get(tbl);
 
 	return 0;
 }
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index dd80744..1d6406a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -190,7 +190,8 @@ static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
 	unsigned int pe_num = pe->pe_number;
 
 	WARN_ON(pe->pdev);
-
+	WARN_ON(pe->npucomp); /* NPUs are not supposed to be freed */
+	kfree(pe->npucomp);
 	memset(pe, 0, sizeof(struct pnv_ioda_pe));
 	clear_bit(pe_num, phb->ioda.pe_alloc);
 }
@@ -517,8 +518,6 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
 		phb->init_m64 = pnv_ioda1_init_m64;
 	else
 		phb->init_m64 = pnv_ioda2_init_m64;
-	phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe;
-	phb->pick_m64_pe = pnv_ioda_pick_m64_pe;
 }
 
 static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
@@ -604,8 +603,8 @@ static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt)
 static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
 {
 	struct pnv_ioda_pe *slave, *pe;
-	u8 fstate, state;
-	__be16 pcierr;
+	u8 fstate = 0, state;
+	__be16 pcierr = 0;
 	s64 rc;
 
 	/* Sanity check on PE number */
@@ -663,10 +662,6 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
 	return state;
 }
 
-/* Currently those 2 are only used when MSIs are enabled, this will change
- * but in the meantime, we need to protect them to avoid warnings
- */
-#ifdef CONFIG_PCI_MSI
 struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
 {
 	struct pci_controller *hose = pci_bus_to_host(dev->bus);
@@ -679,7 +674,6 @@ struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
 		return NULL;
 	return &phb->ioda.pe_array[pdn->pe_number];
 }
-#endif /* CONFIG_PCI_MSI */
 
 static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
 				  struct pnv_ioda_pe *parent,
@@ -1160,8 +1154,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
 		pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];
 
 	/* Check if PE is determined by M64 */
-	if (!pe && phb->pick_m64_pe)
-		pe = phb->pick_m64_pe(bus, all);
+	if (!pe)
+		pe = pnv_ioda_pick_m64_pe(bus, all);
 
 	/* The PE number isn't pinned by M64 */
 	if (!pe)
@@ -1273,19 +1267,20 @@ static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
 
 static void pnv_pci_ioda_setup_PEs(void)
 {
-	struct pci_controller *hose, *tmp;
+	struct pci_controller *hose;
 	struct pnv_phb *phb;
 	struct pci_bus *bus;
 	struct pci_dev *pdev;
+	struct pnv_ioda_pe *pe;
 
-	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+	list_for_each_entry(hose, &hose_list, list_node) {
 		phb = hose->private_data;
 		if (phb->type == PNV_PHB_NPU_NVLINK) {
 			/* PE#0 is needed for error reporting */
 			pnv_ioda_reserve_pe(phb, 0);
 			pnv_ioda_setup_npu_PEs(hose->bus);
 			if (phb->model == PNV_PHB_MODEL_NPU2)
-				pnv_npu2_init(phb);
+				WARN_ON_ONCE(pnv_npu2_init(hose));
 		}
 		if (phb->type == PNV_PHB_NPU_OCAPI) {
 			bus = hose->bus;
@@ -1293,6 +1288,14 @@ static void pnv_pci_ioda_setup_PEs(void)
 				pnv_ioda_setup_dev_PE(pdev);
 		}
 	}
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+		if (phb->type != PNV_PHB_IODA2)
+			continue;
+
+		list_for_each_entry(pe, &phb->ioda.pe_list, list)
+			pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
+	}
 }
 
 #ifdef CONFIG_PCI_IOV
@@ -1531,6 +1534,11 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
 
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 				       struct pnv_ioda_pe *pe);
+#ifdef CONFIG_IOMMU_API
+static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
+		struct iommu_table_group *table_group, struct pci_bus *bus);
+
+#endif
 static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 {
 	struct pci_bus        *bus;
@@ -1584,6 +1592,9 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 		mutex_unlock(&phb->ioda.pe_list_mutex);
 
 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
+#ifdef CONFIG_IOMMU_API
+		pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL);
+#endif
 	}
 }
 
@@ -1923,21 +1934,16 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev)
 	return mask;
 }
 
-static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
-				   struct pci_bus *bus,
-				   bool add_to_group)
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
 		set_dma_offset(&dev->dev, pe->tce_bypass_base);
-		if (add_to_group)
-			iommu_add_device(&dev->dev);
 
 		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
-			pnv_ioda_setup_bus_dma(pe, dev->subordinate,
-					add_to_group);
+			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
 	}
 }
 
@@ -2366,16 +2372,8 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
 	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
 	iommu_init_table(tbl, phb->hose->node);
 
-	if (pe->flags & PNV_IODA_PE_DEV) {
-		/*
-		 * Setting table base here only for carrying iommu_group
-		 * further down to let iommu_add_device() do the job.
-		 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
-		 */
-		set_iommu_table_base(&pe->pdev->dev, tbl);
-		iommu_add_device(&pe->pdev->dev);
-	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
-		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+	if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 
 	return;
  fail:
@@ -2527,14 +2525,6 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	if (!pnv_iommu_bypass_disabled)
 		pnv_pci_ioda2_set_bypass(pe, true);
 
-	/*
-	 * Setting table base here only for carrying iommu_group
-	 * further down to let iommu_add_device() do the job.
-	 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
-	 */
-	if (pe->flags & PNV_IODA_PE_DEV)
-		set_iommu_table_base(&pe->pdev->dev, tbl);
-
 	return 0;
 }
 
@@ -2565,7 +2555,7 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
 #endif
 
 #ifdef CONFIG_IOMMU_API
-static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
 		__u64 window_size, __u32 levels)
 {
 	unsigned long bytes = 0;
@@ -2616,7 +2606,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 	pnv_pci_ioda2_set_bypass(pe, false);
 	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 	if (pe->pbus)
-		pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 	iommu_tce_table_put(tbl);
 }
 
@@ -2627,7 +2617,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
 
 	pnv_pci_ioda2_setup_default_config(pe);
 	if (pe->pbus)
-		pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 }
 
 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
@@ -2639,131 +2629,100 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
 	.release_ownership = pnv_ioda2_release_ownership,
 };
 
-static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
-{
-	struct pci_controller *hose;
-	struct pnv_phb *phb;
-	struct pnv_ioda_pe **ptmppe = opaque;
-	struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
-	struct pci_dn *pdn = pci_get_pdn(pdev);
-
-	if (!pdn || pdn->pe_number == IODA_INVALID_PE)
-		return 0;
-
-	hose = pci_bus_to_host(pdev->bus);
-	phb = hose->private_data;
-	if (phb->type != PNV_PHB_NPU_NVLINK)
-		return 0;
-
-	*ptmppe = &phb->ioda.pe_array[pdn->pe_number];
-
-	return 1;
-}
-
-/*
- * This returns PE of associated NPU.
- * This assumes that NPU is in the same IOMMU group with GPU and there is
- * no other PEs.
- */
-static struct pnv_ioda_pe *gpe_table_group_to_npe(
-		struct iommu_table_group *table_group)
-{
-	struct pnv_ioda_pe *npe = NULL;
-	int ret = iommu_group_for_each_dev(table_group->group, &npe,
-			gpe_table_group_to_npe_cb);
-
-	BUG_ON(!ret || !npe);
-
-	return npe;
-}
-
-static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
-		int num, struct iommu_table *tbl)
-{
-	struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
-	int num2 = (num == 0) ? 1 : 0;
-	long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
-
-	if (ret)
-		return ret;
-
-	if (table_group->tables[num2])
-		pnv_npu_unset_window(npe, num2);
-
-	ret = pnv_npu_set_window(npe, num, tbl);
-	if (ret) {
-		pnv_pci_ioda2_unset_window(table_group, num);
-		if (table_group->tables[num2])
-			pnv_npu_set_window(npe, num2,
-					table_group->tables[num2]);
-	}
-
-	return ret;
-}
-
-static long pnv_pci_ioda2_npu_unset_window(
+static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe,
 		struct iommu_table_group *table_group,
-		int num)
+		struct pci_bus *bus)
 {
-	struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
-	int num2 = (num == 0) ? 1 : 0;
-	long ret = pnv_pci_ioda2_unset_window(table_group, num);
+	struct pci_dev *dev;
 
-	if (ret)
-		return ret;
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		iommu_add_device(table_group, &dev->dev);
 
-	if (!npe->table_group.tables[num])
-		return 0;
-
-	ret = pnv_npu_unset_window(npe, num);
-	if (ret)
-		return ret;
-
-	if (table_group->tables[num2])
-		ret = pnv_npu_set_window(npe, num2, table_group->tables[num2]);
-
-	return ret;
+		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+			pnv_ioda_setup_bus_iommu_group_add_devices(pe,
+					table_group, dev->subordinate);
+	}
 }
 
-static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
+static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
+		struct iommu_table_group *table_group, struct pci_bus *bus)
 {
-	/*
-	 * Detach NPU first as pnv_ioda2_take_ownership() will destroy
-	 * the iommu_table if 32bit DMA is enabled.
-	 */
-	pnv_npu_take_ownership(gpe_table_group_to_npe(table_group));
-	pnv_ioda2_take_ownership(table_group);
+
+	if (pe->flags & PNV_IODA_PE_DEV)
+		iommu_add_device(table_group, &pe->pdev->dev);
+
+	if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus)
+		pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group,
+				bus);
 }
 
-static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
-	.get_table_size = pnv_pci_ioda2_get_table_size,
-	.create_table = pnv_pci_ioda2_create_table_userspace,
-	.set_window = pnv_pci_ioda2_npu_set_window,
-	.unset_window = pnv_pci_ioda2_npu_unset_window,
-	.take_ownership = pnv_ioda2_npu_take_ownership,
-	.release_ownership = pnv_ioda2_release_ownership,
-};
+static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
 
 static void pnv_pci_ioda_setup_iommu_api(void)
 {
-	struct pci_controller *hose, *tmp;
+	struct pci_controller *hose;
 	struct pnv_phb *phb;
-	struct pnv_ioda_pe *pe, *gpe;
+	struct pnv_ioda_pe *pe;
+
+	/*
+	 * There are 4 types of PEs:
+	 * - PNV_IODA_PE_BUS: a downstream port with an adapter,
+	 *   created from pnv_pci_setup_bridge();
+	 * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it,
+	 *   created from pnv_pci_setup_bridge();
+	 * - PNV_IODA_PE_VF: a SRIOV virtual function,
+	 *   created from pnv_pcibios_sriov_enable();
+	 * - PNV_IODA_PE_DEV: an NPU or OCAPI device,
+	 *   created from pnv_pci_ioda_fixup().
+	 *
+	 * Normally a PE is represented by an IOMMU group, however for
+	 * devices with side channels the groups need to be more strict.
+	 */
+	list_for_each_entry(hose, &hose_list, list_node) {
+		phb = hose->private_data;
+
+		if (phb->type == PNV_PHB_NPU_NVLINK)
+			continue;
+
+		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+			struct iommu_table_group *table_group;
+
+			table_group = pnv_try_setup_npu_table_group(pe);
+			if (!table_group) {
+				if (!pnv_pci_ioda_pe_dma_weight(pe))
+					continue;
+
+				table_group = &pe->table_group;
+				iommu_register_group(&pe->table_group,
+						pe->phb->hose->global_number,
+						pe->pe_number);
+			}
+			pnv_ioda_setup_bus_iommu_group(pe, table_group,
+					pe->pbus);
+		}
+	}
 
 	/*
 	 * Now we have all PHBs discovered, time to add NPU devices to
 	 * the corresponding IOMMU groups.
 	 */
-	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+	list_for_each_entry(hose, &hose_list, list_node) {
+		unsigned long  pgsizes;
+
 		phb = hose->private_data;
 
 		if (phb->type != PNV_PHB_NPU_NVLINK)
 			continue;
 
+		pgsizes = pnv_ioda_parse_tce_sizes(phb);
 		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
-			gpe = pnv_pci_npu_setup_iommu(pe);
-			if (gpe)
-				gpe->table_group.ops = &pnv_pci_ioda2_npu_ops;
+			/*
+			 * IODA2 bridges get this set up from
+			 * pci_controller_ops::setup_bridge but NPU bridges
+			 * do not have this hook defined so we do it here.
+			 */
+			pe->table_group.pgsizes = pgsizes;
+			pnv_npu_compound_attach(pe);
 		}
 	}
 }
@@ -2810,9 +2769,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	/* TVE #1 is selected by PCI address bit 59 */
 	pe->tce_bypass_base = 1ull << 59;
 
-	iommu_register_group(&pe->table_group, phb->hose->global_number,
-			pe->pe_number);
-
 	/* The PE will reserve all possible 32-bits space */
 	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
 		phb->ioda.m32_pci_base);
@@ -2833,10 +2789,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 		return;
 
 	if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
-		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 }
 
-#ifdef CONFIG_PCI_MSI
 int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
 {
 	struct pnv_phb *phb = container_of(chip, struct pnv_phb,
@@ -2982,9 +2937,6 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
 	pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
 		count, phb->msi_base);
 }
-#else
-static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
-#endif /* CONFIG_PCI_MSI */
 
 #ifdef CONFIG_PCI_IOV
 static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
@@ -3402,8 +3354,7 @@ static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
 		return;
 
 	/* Reserve PEs according to used M64 resources */
-	if (phb->reserve_m64_pe)
-		phb->reserve_m64_pe(bus, NULL, all);
+	pnv_ioda_reserve_m64_pe(bus, NULL, all);
 
 	/*
 	 * Assign PE. We might run here because of partial hotplug.
@@ -3687,6 +3638,15 @@ static void pnv_pci_release_device(struct pci_dev *pdev)
 		pnv_ioda_release_pe(pe);
 }
 
+static void pnv_npu_disable_device(struct pci_dev *pdev)
+{
+	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+	struct eeh_pe *eehpe = edev ? edev->pe : NULL;
+
+	if (eehpe && eeh_ops && eeh_ops->reset)
+		eeh_ops->reset(eehpe, EEH_RESET_HOT);
+}
+
 static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 {
 	struct pnv_phb *phb = hose->private_data;
@@ -3698,10 +3658,8 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
 	.dma_dev_setup		= pnv_pci_dma_dev_setup,
 	.dma_bus_setup		= pnv_pci_dma_bus_setup,
-#ifdef CONFIG_PCI_MSI
 	.setup_msi_irqs		= pnv_setup_msi_irqs,
 	.teardown_msi_irqs	= pnv_teardown_msi_irqs,
-#endif
 	.enable_device_hook	= pnv_pci_enable_device_hook,
 	.release_device		= pnv_pci_release_device,
 	.window_alignment	= pnv_pci_window_alignment,
@@ -3722,15 +3680,14 @@ static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
 
 static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
 	.dma_dev_setup		= pnv_pci_dma_dev_setup,
-#ifdef CONFIG_PCI_MSI
 	.setup_msi_irqs		= pnv_setup_msi_irqs,
 	.teardown_msi_irqs	= pnv_teardown_msi_irqs,
-#endif
 	.enable_device_hook	= pnv_pci_enable_device_hook,
 	.window_alignment	= pnv_pci_window_alignment,
 	.reset_secondary_bus	= pnv_pci_reset_secondary_bus,
 	.dma_set_mask		= pnv_npu_dma_set_mask,
 	.shutdown		= pnv_pci_ioda_shutdown,
+	.disable_device		= pnv_npu_disable_device,
 };
 
 static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 13aef23..45fb70b 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -160,7 +160,6 @@ int pnv_pci_set_power_state(uint64_t id, uint8_t state, struct opal_msg *msg)
 }
 EXPORT_SYMBOL_GPL(pnv_pci_set_power_state);
 
-#ifdef CONFIG_PCI_MSI
 int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
@@ -229,7 +228,6 @@ void pnv_teardown_msi_irqs(struct pci_dev *pdev)
 		msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, 1);
 	}
 }
-#endif /* CONFIG_PCI_MSI */
 
 /* Nicely print the contents of the PE State Tables (PEST). */
 static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size)
@@ -602,8 +600,8 @@ static void pnv_pci_handle_eeh_config(struct pnv_phb *phb, u32 pe_no)
 static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
 {
 	struct pnv_phb *phb = pdn->phb->private_data;
-	u8	fstate;
-	__be16	pcierr;
+	u8	fstate = 0;
+	__be16	pcierr = 0;
 	unsigned int pe_no;
 	s64	rc;
 
@@ -1127,4 +1125,45 @@ void __init pnv_pci_init(void)
 	set_pci_dma_ops(&dma_iommu_ops);
 }
 
-machine_subsys_initcall_sync(powernv, tce_iommu_bus_notifier_init);
+static int pnv_tce_iommu_bus_notifier(struct notifier_block *nb,
+		unsigned long action, void *data)
+{
+	struct device *dev = data;
+	struct pci_dev *pdev;
+	struct pci_dn *pdn;
+	struct pnv_ioda_pe *pe;
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		pdev = to_pci_dev(dev);
+		pdn = pci_get_pdn(pdev);
+		hose = pci_bus_to_host(pdev->bus);
+		phb = hose->private_data;
+
+		WARN_ON_ONCE(!phb);
+		if (!pdn || pdn->pe_number == IODA_INVALID_PE || !phb)
+			return 0;
+
+		pe = &phb->ioda.pe_array[pdn->pe_number];
+		iommu_add_device(&pe->table_group, dev);
+		return 0;
+	case BUS_NOTIFY_DEL_DEVICE:
+		iommu_del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block pnv_tce_iommu_bus_nb = {
+	.notifier_call = pnv_tce_iommu_bus_notifier,
+};
+
+static int __init pnv_tce_iommu_bus_notifier_init(void)
+{
+	bus_register_notifier(&pci_bus_type, &pnv_tce_iommu_bus_nb);
+	return 0;
+}
+machine_subsys_initcall_sync(powernv, pnv_tce_iommu_bus_notifier_init);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 8b37b28..8e36da3 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -8,9 +8,6 @@
 
 struct pci_dn;
 
-/* Maximum possible number of ATSD MMIO registers per NPU */
-#define NV_NMMU_ATSD_REGS 8
-
 enum pnv_phb_type {
 	PNV_PHB_IODA1		= 0,
 	PNV_PHB_IODA2		= 1,
@@ -65,6 +62,7 @@ struct pnv_ioda_pe {
 
 	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
 	struct iommu_table_group table_group;
+	struct npu_comp		*npucomp;
 
 	/* 64-bit TCE bypass region */
 	bool			tce_bypass_enabled;
@@ -106,20 +104,14 @@ struct pnv_phb {
 	struct dentry		*dbgfs;
 #endif
 
-#ifdef CONFIG_PCI_MSI
 	unsigned int		msi_base;
 	unsigned int		msi32_support;
 	struct msi_bitmap	msi_bmp;
-#endif
 	int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
 			 unsigned int hwirq, unsigned int virq,
 			 unsigned int is_64, struct msi_msg *msg);
 	void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
-	void (*fixup_phb)(struct pci_controller *hose);
 	int (*init_m64)(struct pnv_phb *phb);
-	void (*reserve_m64_pe)(struct pci_bus *bus,
-			       unsigned long *pe_bitmap, bool all);
-	struct pnv_ioda_pe *(*pick_m64_pe)(struct pci_bus *bus, bool all);
 	int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
 	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
 	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
@@ -180,19 +172,6 @@ struct pnv_phb {
 	unsigned int		diag_data_size;
 	u8			*diag_data;
 
-	/* Nvlink2 data */
-	struct npu {
-		int index;
-		__be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
-		unsigned int mmio_atsd_count;
-
-		/* Bitmask for MMIO register usage */
-		unsigned long mmio_atsd_usage;
-
-		/* Do we need to explicitly flush the nest mmu? */
-		bool nmmu_flush;
-	} npu;
-
 	int p2p_target_count;
 };
 
@@ -210,6 +189,7 @@ extern void pnv_pci_init_ioda_hub(struct device_node *np);
 extern void pnv_pci_init_ioda2_phb(struct device_node *np);
 extern void pnv_pci_init_npu_phb(struct device_node *np);
 extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np);
+extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr);
 extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
@@ -220,6 +200,8 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
 extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
 extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
 extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+		__u64 window_size, __u32 levels);
 extern int pnv_eeh_post_init(void);
 
 extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
@@ -235,12 +217,10 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
 extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
 extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
-extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
-		struct iommu_table *tbl);
-extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
-extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
-extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
-extern int pnv_npu2_init(struct pnv_phb *phb);
+extern struct iommu_table_group *pnv_try_setup_npu_table_group(
+		struct pnv_ioda_pe *pe);
+extern struct iommu_table_group *pnv_npu_compound_attach(
+		struct pnv_ioda_pe *pe);
 
 /* pci-ioda-tce.c */
 #define POWERNV_IOMMU_DEFAULT_LEVELS	1
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 3b881ac..d291b61 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -197,6 +197,7 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
 
 	found = find_aa_index(dr_node, ala_prop, lmb_assoc, &aa_index);
 
+	of_node_put(dr_node);
 	dlpar_free_cc_nodes(lmb_node);
 
 	if (!found) {
@@ -353,8 +354,11 @@ static bool lmb_is_removable(struct drmem_lmb *lmb)
 	phys_addr = lmb->base_addr;
 
 #ifdef CONFIG_FA_DUMP
-	/* Don't hot-remove memory that falls in fadump boot memory area */
-	if (is_fadump_boot_memory_area(phys_addr, block_sz))
+	/*
+	 * Don't hot-remove memory that falls in fadump boot memory area
+	 * and memory that is reserved for capturing old kernel memory.
+	 */
+	if (is_fadump_memory_area(phys_addr, block_sz))
 		return false;
 #endif
 
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 06f0296..8fc8fe0 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -57,7 +57,6 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
 	struct iommu_table_group *table_group;
 	struct iommu_table *tbl;
-	struct iommu_table_group_link *tgl;
 
 	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
 			   node);
@@ -68,22 +67,13 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 	if (!tbl)
 		goto free_group;
 
-	tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
-			node);
-	if (!tgl)
-		goto free_table;
-
 	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
 	kref_init(&tbl->it_kref);
-	tgl->table_group = table_group;
-	list_add_rcu(&tgl->next, &tbl->it_group_list);
 
 	table_group->tables[0] = tbl;
 
 	return table_group;
 
-free_table:
-	kfree(tbl);
 free_group:
 	kfree(table_group);
 	return NULL;
@@ -93,23 +83,12 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
 		const char *node_name)
 {
 	struct iommu_table *tbl;
-#ifdef CONFIG_IOMMU_API
-	struct iommu_table_group_link *tgl;
-#endif
 
 	if (!table_group)
 		return;
 
 	tbl = table_group->tables[0];
 #ifdef CONFIG_IOMMU_API
-	tgl = list_first_entry_or_null(&tbl->it_group_list,
-			struct iommu_table_group_link, next);
-
-	WARN_ON_ONCE(!tgl);
-	if (tgl) {
-		list_del_rcu(&tgl->next);
-		kfree(tgl);
-	}
 	if (table_group->group) {
 		iommu_group_put(table_group->group);
 		BUG_ON(table_group->group);
@@ -645,7 +624,6 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	iommu_table_setparms(pci->phb, dn, tbl);
 	tbl->it_ops = &iommu_table_pseries_ops;
 	iommu_init_table(tbl, pci->phb->node);
-	iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
 
 	/* Divide the rest (1.75GB) among the children */
 	pci->phb->dma_window_size = 0x80000000ul;
@@ -756,10 +734,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		iommu_table_setparms(phb, dn, tbl);
 		tbl->it_ops = &iommu_table_pseries_ops;
 		iommu_init_table(tbl, phb->node);
-		iommu_register_group(PCI_DN(dn)->table_group,
-				pci_domain_nr(phb->bus), 0);
 		set_iommu_table_base(&dev->dev, tbl);
-		iommu_add_device(&dev->dev);
 		return;
 	}
 
@@ -770,11 +745,10 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 	while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
 		dn = dn->parent;
 
-	if (dn && PCI_DN(dn)) {
+	if (dn && PCI_DN(dn))
 		set_iommu_table_base(&dev->dev,
 				PCI_DN(dn)->table_group->tables[0]);
-		iommu_add_device(&dev->dev);
-	} else
+	else
 		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
 		       pci_name(dev));
 }
@@ -964,6 +938,37 @@ struct failed_ddw_pdn {
 
 static LIST_HEAD(failed_ddw_pdn_list);
 
+static phys_addr_t ddw_memory_hotplug_max(void)
+{
+	phys_addr_t max_addr = memory_hotplug_max();
+	struct device_node *memory;
+
+	for_each_node_by_type(memory, "memory") {
+		unsigned long start, size;
+		int ranges, n_mem_addr_cells, n_mem_size_cells, len;
+		const __be32 *memcell_buf;
+
+		memcell_buf = of_get_property(memory, "reg", &len);
+		if (!memcell_buf || len <= 0)
+			continue;
+
+		n_mem_addr_cells = of_n_addr_cells(memory);
+		n_mem_size_cells = of_n_size_cells(memory);
+
+		/* ranges in cell */
+		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
+
+		start = of_read_number(memcell_buf, n_mem_addr_cells);
+		memcell_buf += n_mem_addr_cells;
+		size = of_read_number(memcell_buf, n_mem_size_cells);
+		memcell_buf += n_mem_size_cells;
+
+		max_addr = max_t(phys_addr_t, max_addr, start + size);
+	}
+
+	return max_addr;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1053,7 +1058,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	}
 	/* verify the window * number of ptes will map the partition */
 	/* check largest block * page size > max memory hotplug addr */
-	max_addr = memory_hotplug_max();
+	max_addr = ddw_memory_hotplug_max();
 	if (query.largest_available_block < (max_addr >> page_shift)) {
 		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
 			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
@@ -1190,7 +1195,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	}
 
 	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
-	iommu_add_device(&dev->dev);
+	iommu_add_device(pci->table_group, &dev->dev);
 }
 
 static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
@@ -1395,4 +1400,27 @@ static int __init disable_multitce(char *str)
 
 __setup("multitce=", disable_multitce);
 
+static int tce_iommu_bus_notifier(struct notifier_block *nb,
+		unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_DEL_DEVICE:
+		iommu_del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = tce_iommu_bus_notifier,
+};
+
+static int __init tce_iommu_bus_notifier_init(void)
+{
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+	return 0;
+}
 machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
index 41d8a4d..7725825 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -29,6 +29,7 @@
 #include <asm/pci-bridge.h>
 #include <asm/prom.h>
 #include <asm/ppc-pci.h>
+#include <asm/pci.h>
 #include "pseries.h"
 
 #if 0
@@ -237,6 +238,8 @@ static void __init pSeries_request_regions(void)
 
 void __init pSeries_final_fixup(void)
 {
+	struct pci_controller *hose;
+
 	pSeries_request_regions();
 
 	eeh_probe_devices();
@@ -246,6 +249,25 @@ void __init pSeries_final_fixup(void)
 	ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable;
 	ppc_md.pcibios_sriov_disable = pseries_pcibios_sriov_disable;
 #endif
+	list_for_each_entry(hose, &hose_list, list_node) {
+		struct device_node *dn = hose->dn, *nvdn;
+
+		while (1) {
+			dn = of_find_all_nodes(dn);
+			if (!dn)
+				break;
+			nvdn = of_parse_phandle(dn, "ibm,nvlink", 0);
+			if (!nvdn)
+				continue;
+			if (!of_device_is_compatible(nvdn, "ibm,npu-link"))
+				continue;
+			if (!of_device_is_compatible(nvdn->parent,
+						"ibm,power9-npu"))
+				continue;
+			WARN_ON_ONCE(pnv_npu2_init(hose));
+			break;
+		}
+	}
 }
 
 /*
diff --git a/arch/powerpc/platforms/pseries/pmem.c b/arch/powerpc/platforms/pseries/pmem.c
index a27f40e..27f0a91 100644
--- a/arch/powerpc/platforms/pseries/pmem.c
+++ b/arch/powerpc/platforms/pseries/pmem.c
@@ -52,8 +52,8 @@ static ssize_t pmem_drc_add_node(u32 drc_index)
 	/* NB: The of reconfig notifier creates platform device from the node */
 	rc = dlpar_attach_node(dn, pmem_node);
 	if (rc) {
-		pr_err("Failed to attach node %s, rc: %d, drc index: %x\n",
-			dn->name, rc, drc_index);
+		pr_err("Failed to attach node %pOF, rc: %d, drc index: %x\n",
+			dn, rc, drc_index);
 
 		if (dlpar_release_drc(drc_index))
 			dlpar_free_cc_nodes(dn);
@@ -93,8 +93,8 @@ static ssize_t pmem_drc_remove_node(u32 drc_index)
 
 	rc = dlpar_release_drc(drc_index);
 	if (rc) {
-		pr_err("Failed to release drc (%x) for CPU %s, rc: %d\n",
-			drc_index, dn->name, rc);
+		pr_err("Failed to release drc (%x) for CPU %pOFn, rc: %d\n",
+			drc_index, dn, rc);
 		dlpar_attach_node(dn, pmem_node);
 		return rc;
 	}
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 4078a05..41f62ca2 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -190,7 +190,7 @@ static void __init pseries_setup_i8259_cascade(void)
 		of_node_put(old);
 		if (np == NULL)
 			break;
-		if (strcmp(np->name, "pci") != 0)
+		if (!of_node_name_eq(np, "pci"))
 			continue;
 		addrp = of_get_property(np, "8259-interrupt-acknowledge", NULL);
 		if (addrp == NULL)
diff --git a/drivers/ide/pmac.c b/drivers/ide/pmac.c
index c5b902b..ec46aa8 100644
--- a/drivers/ide/pmac.c
+++ b/drivers/ide/pmac.c
@@ -1045,7 +1045,7 @@ static int pmac_ide_setup_device(pmac_ide_hwif_t *pmif, struct ide_hw *hw)
 		d.port_ops = &pmac_ide_ata4_port_ops;
 		d.udma_mask = ATA_UDMA5;
 	} else if (of_device_is_compatible(np, "keylargo-ata")) {
-		if (strcmp(np->name, "ata-4") == 0) {
+		if (of_node_name_eq(np, "ata-4")) {
 			pmif->kind = controller_kl_ata4;
 			d.port_ops = &pmac_ide_ata4_port_ops;
 			d.udma_mask = ATA_UDMA4;
diff --git a/drivers/macintosh/ans-lcd.c b/drivers/macintosh/ans-lcd.c
index c8e078b..ef0c236 100644
--- a/drivers/macintosh/ans-lcd.c
+++ b/drivers/macintosh/ans-lcd.c
@@ -160,7 +160,7 @@ anslcd_init(void)
 	struct device_node* node;
 
 	node = of_find_node_by_name(NULL, "lcd");
-	if (!node || !node->parent || strcmp(node->parent->name, "gc")) {
+	if (!node || !of_node_name_eq(node->parent, "gc")) {
 		of_node_put(node);
 		return -ENODEV;
 	}
diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c
index bc84188..3543a82 100644
--- a/drivers/macintosh/macio_asic.c
+++ b/drivers/macintosh/macio_asic.c
@@ -190,11 +190,11 @@ static int macio_resource_quirks(struct device_node *np, struct resource *res,
 		return 0;
 
 	/* Grand Central has too large resource 0 on some machines */
-	if (index == 0 && !strcmp(np->name, "gc"))
+	if (index == 0 && of_node_name_eq(np, "gc"))
 		res->end = res->start + 0x1ffff;
 
 	/* Airport has bogus resource 2 */
-	if (index >= 2 && !strcmp(np->name, "radio"))
+	if (index >= 2 && of_node_name_eq(np, "radio"))
 		return 1;
 
 #ifndef CONFIG_PPC64
@@ -207,20 +207,20 @@ static int macio_resource_quirks(struct device_node *np, struct resource *res,
 	 * level of hierarchy, but I don't really feel the need
 	 * for it
 	 */
-	if (!strcmp(np->name, "escc"))
+	if (of_node_name_eq(np, "escc"))
 		return 1;
 
 	/* ESCC has bogus resources >= 3 */
-	if (index >= 3 && !(strcmp(np->name, "ch-a") &&
-			    strcmp(np->name, "ch-b")))
+	if (index >= 3 && (of_node_name_eq(np, "ch-a") ||
+			   of_node_name_eq(np, "ch-b")))
 		return 1;
 
 	/* Media bay has too many resources, keep only first one */
-	if (index > 0 && !strcmp(np->name, "media-bay"))
+	if (index > 0 && of_node_name_eq(np, "media-bay"))
 		return 1;
 
 	/* Some older IDE resources have bogus sizes */
-	if (!strcmp(np->name, "IDE") || !strcmp(np->name, "ATA") ||
+	if (of_node_name_eq(np, "IDE") || of_node_name_eq(np, "ATA") ||
 	    of_node_is_type(np, "ide") || of_node_is_type(np, "ata")) {
 		if (index == 0 && (res->end - res->start) > 0xfff)
 			res->end = res->start + 0xfff;
@@ -260,7 +260,7 @@ static void macio_add_missing_resources(struct macio_dev *dev)
 	irq_base = 64;
 
 	/* Fix SCC */
-	if (strcmp(np->name, "ch-a") == 0) {
+	if (of_node_name_eq(np, "ch-a")) {
 		macio_create_fixup_irq(dev, 0, 15 + irq_base);
 		macio_create_fixup_irq(dev, 1,  4 + irq_base);
 		macio_create_fixup_irq(dev, 2,  5 + irq_base);
@@ -268,18 +268,18 @@ static void macio_add_missing_resources(struct macio_dev *dev)
 	}
 
 	/* Fix media-bay */
-	if (strcmp(np->name, "media-bay") == 0) {
+	if (of_node_name_eq(np, "media-bay")) {
 		macio_create_fixup_irq(dev, 0, 29 + irq_base);
 		printk(KERN_INFO "macio: fixed media-bay irq on gatwick\n");
 	}
 
 	/* Fix left media bay childs */
-	if (dev->media_bay != NULL && strcmp(np->name, "floppy") == 0) {
+	if (dev->media_bay != NULL && of_node_name_eq(np, "floppy")) {
 		macio_create_fixup_irq(dev, 0, 19 + irq_base);
 		macio_create_fixup_irq(dev, 1,  1 + irq_base);
 		printk(KERN_INFO "macio: fixed left floppy irqs\n");
 	}
-	if (dev->media_bay != NULL && strcasecmp(np->name, "ata4") == 0) {
+	if (dev->media_bay != NULL && of_node_name_eq(np, "ata4")) {
 		macio_create_fixup_irq(dev, 0, 14 + irq_base);
 		macio_create_fixup_irq(dev, 0,  3 + irq_base);
 		printk(KERN_INFO "macio: fixed left ide irqs\n");
@@ -438,11 +438,8 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip,
 
 static int macio_skip_device(struct device_node *np)
 {
-	if (strncmp(np->name, "battery", 7) == 0)
-		return 1;
-	if (strncmp(np->name, "escc-legacy", 11) == 0)
-		return 1;
-	return 0;
+	return of_node_name_prefix(np, "battery") ||
+	       of_node_name_prefix(np, "escc-legacy");
 }
 
 /**
@@ -489,9 +486,9 @@ static void macio_pci_add_devices(struct macio_chip *chip)
 					    root_res);
 		if (mdev == NULL)
 			of_node_put(np);
-		else if (strncmp(np->name, "media-bay", 9) == 0)
+		else if (of_node_name_prefix(np, "media-bay"))
 			mbdev = mdev;
-		else if (strncmp(np->name, "escc", 4) == 0)
+		else if (of_node_name_prefix(np, "escc"))
 			sdev = mdev;
 	}
 
diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c
index 1f29d24..3940e2a 100644
--- a/drivers/macintosh/rack-meter.c
+++ b/drivers/macintosh/rack-meter.c
@@ -376,18 +376,19 @@ static int rackmeter_probe(struct macio_dev* mdev,
 	pr_debug("rackmeter_probe()\n");
 
 	/* Get i2s-a node */
-	while ((i2s = of_get_next_child(mdev->ofdev.dev.of_node, i2s)) != NULL)
-	       if (strcmp(i2s->name, "i2s-a") == 0)
-		       break;
+	for_each_child_of_node(mdev->ofdev.dev.of_node, i2s)
+		if (of_node_name_eq(i2s, "i2s-a"))
+			break;
+
 	if (i2s == NULL) {
 		pr_debug("  i2s-a child not found\n");
 		goto bail;
 	}
 	/* Get lightshow or virtual sound */
-	while ((np = of_get_next_child(i2s, np)) != NULL) {
-	       if (strcmp(np->name, "lightshow") == 0)
+	for_each_child_of_node(i2s, np) {
+	       if (of_node_name_eq(np, "lightshow"))
 		       break;
-	       if ((strcmp(np->name, "sound") == 0) &&
+	       if (of_node_name_eq(np, "sound") &&
 		   of_get_property(np, "virtual", NULL) != NULL)
 		       break;
 	}
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 60f57e2..ac0cf37 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -318,8 +318,8 @@ int __init find_via_pmu(void)
 			PMU_INT_ADB |
 			PMU_INT_TICK;
 	
-	if (vias->parent->name && ((strcmp(vias->parent->name, "ohare") == 0)
-	    || of_device_is_compatible(vias->parent, "ohare")))
+	if (of_node_name_eq(vias->parent, "ohare") ||
+	    of_device_is_compatible(vias->parent, "ohare"))
 		pmu_kind = PMU_OHARE_BASED;
 	else if (of_device_is_compatible(vias->parent, "paddington"))
 		pmu_kind = PMU_PADDINGTON_BASED;
diff --git a/drivers/macintosh/windfarm_lm87_sensor.c b/drivers/macintosh/windfarm_lm87_sensor.c
index 35aa571..09724ac 100644
--- a/drivers/macintosh/windfarm_lm87_sensor.c
+++ b/drivers/macintosh/windfarm_lm87_sensor.c
@@ -110,8 +110,8 @@ static int wf_lm87_probe(struct i2c_client *client,
 	 * the Xserve G5 has several lm87's. However, for now we only
 	 * care about the internal temperature sensor
 	 */
-	while ((np = of_get_next_child(client->dev.of_node, np)) != NULL) {
-		if (strcmp(np->name, "int-temp"))
+	for_each_child_of_node(client->dev.of_node, np) {
+		if (!of_node_name_eq(np, "int-temp"))
 			continue;
 		loc = of_get_property(np, "location", NULL);
 		if (!loc)
diff --git a/drivers/macintosh/windfarm_smu_controls.c b/drivers/macintosh/windfarm_smu_controls.c
index 86d6546..2cb9652 100644
--- a/drivers/macintosh/windfarm_smu_controls.c
+++ b/drivers/macintosh/windfarm_smu_controls.c
@@ -267,7 +267,7 @@ static int __init smu_controls_init(void)
 
 	/* Look for RPM fans */
 	for (fans = NULL; (fans = of_get_next_child(smu, fans)) != NULL;)
-		if (!strcmp(fans->name, "rpm-fans") ||
+		if (of_node_name_eq(fans, "rpm-fans") ||
 		    of_device_is_compatible(fans, "smu-rpm-fans"))
 			break;
 	for (fan = NULL;
@@ -287,7 +287,7 @@ static int __init smu_controls_init(void)
 
 	/* Look for PWM fans */
 	for (fans = NULL; (fans = of_get_next_child(smu, fans)) != NULL;)
-		if (!strcmp(fans->name, "pwm-fans"))
+		if (of_node_name_eq(fans, "pwm-fans"))
 			break;
 	for (fan = NULL;
 	     fans && (fan = of_get_next_child(fans, fan)) != NULL;) {
diff --git a/drivers/macintosh/windfarm_smu_sensors.c b/drivers/macintosh/windfarm_smu_sensors.c
index 1ba86de..a58f673 100644
--- a/drivers/macintosh/windfarm_smu_sensors.c
+++ b/drivers/macintosh/windfarm_smu_sensors.c
@@ -424,7 +424,7 @@ static int __init smu_sensors_init(void)
 	/* Look for sensors subdir */
 	for (sensors = NULL;
 	     (sensors = of_get_next_child(smu, sensors)) != NULL;)
-		if (!strcmp(sensors->name, "sensors"))
+		if (of_node_name_eq(sensors, "sensors"))
 			break;
 
 	of_node_put(smu);
diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
index 57a6bb1..8f2c5d8 100644
--- a/drivers/misc/ocxl/config.c
+++ b/drivers/misc/ocxl/config.c
@@ -318,7 +318,7 @@ static int read_afu_name(struct pci_dev *dev, struct ocxl_fn_config *fn,
 		if (rc)
 			return rc;
 		ptr = (u32 *) &afu->name[i];
-		*ptr = val;
+		*ptr = le32_to_cpu((__force __le32) val);
 	}
 	afu->name[OCXL_AFU_NAME_SZ - 1] = '\0'; /* play safe */
 	return 0;
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 42dc1d3..d0f8e4f 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -38,3 +38,9 @@
 	  and LPC bridge config space.
 
 	  To enable Intel IGD assignment through vfio-pci, say Y.
+
+config VFIO_PCI_NVLINK2
+	def_bool y
+	depends on VFIO_PCI && PPC_POWERNV
+	help
+	  VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 76d8ec0..9662c06 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,5 +1,6 @@
 
 vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
+vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
 
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/trace.h b/drivers/vfio/pci/trace.h
new file mode 100644
index 0000000..228ccdb
--- /dev/null
+++ b/drivers/vfio/pci/trace.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * VFIO PCI mmap/mmap_fault tracepoints
+ *
+ * Copyright (C) 2018 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vfio_pci
+
+#if !defined(_TRACE_VFIO_PCI_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VFIO_PCI_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(vfio_pci_nvgpu_mmap_fault,
+	TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+			vm_fault_t ret),
+	TP_ARGS(pdev, hpa, ua, ret),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, hpa)
+		__field(unsigned long, ua)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->name = dev_name(&pdev->dev),
+		__entry->hpa = hpa;
+		__entry->ua = ua;
+		__entry->ret = ret;
+	),
+
+	TP_printk("%s: %lx -> %lx ret=%d", __entry->name, __entry->hpa,
+			__entry->ua, __entry->ret)
+);
+
+TRACE_EVENT(vfio_pci_nvgpu_mmap,
+	TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+			unsigned long size, int ret),
+	TP_ARGS(pdev, hpa, ua, size, ret),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, hpa)
+		__field(unsigned long, ua)
+		__field(unsigned long, size)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->name = dev_name(&pdev->dev),
+		__entry->hpa = hpa;
+		__entry->ua = ua;
+		__entry->size = size;
+		__entry->ret = ret;
+	),
+
+	TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa,
+			__entry->ua, __entry->size, __entry->ret)
+);
+
+TRACE_EVENT(vfio_pci_npu2_mmap,
+	TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+			unsigned long size, int ret),
+	TP_ARGS(pdev, hpa, ua, size, ret),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, hpa)
+		__field(unsigned long, ua)
+		__field(unsigned long, size)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->name = dev_name(&pdev->dev),
+		__entry->hpa = hpa;
+		__entry->ua = ua;
+		__entry->size = size;
+		__entry->ret = ret;
+	),
+
+	TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa,
+			__entry->ua, __entry->size, __entry->ret)
+);
+
+#endif /* _TRACE_VFIO_PCI_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 50cdedf..a89fa5d 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -289,14 +289,37 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
 		if (ret) {
 			dev_warn(&vdev->pdev->dev,
 				 "Failed to setup Intel IGD regions\n");
-			vfio_pci_disable(vdev);
-			return ret;
+			goto disable_exit;
+		}
+	}
+
+	if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
+		ret = vfio_pci_nvdia_v100_nvlink2_init(vdev);
+		if (ret && ret != -ENODEV) {
+			dev_warn(&vdev->pdev->dev,
+				 "Failed to setup NVIDIA NV2 RAM region\n");
+			goto disable_exit;
+		}
+	}
+
+	if (pdev->vendor == PCI_VENDOR_ID_IBM &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
+		ret = vfio_pci_ibm_npu2_init(vdev);
+		if (ret && ret != -ENODEV) {
+			dev_warn(&vdev->pdev->dev,
+					"Failed to setup NVIDIA NV2 ATSD region\n");
+			goto disable_exit;
 		}
 	}
 
 	vfio_pci_probe_mmaps(vdev);
 
 	return 0;
+
+disable_exit:
+	vfio_pci_disable(vdev);
+	return ret;
 }
 
 static void vfio_pci_disable(struct vfio_pci_device *vdev)
@@ -750,6 +773,12 @@ static long vfio_pci_ioctl(void *device_data,
 			if (ret)
 				return ret;
 
+			if (vdev->region[i].ops->add_capability) {
+				ret = vdev->region[i].ops->add_capability(vdev,
+						&vdev->region[i], &caps);
+				if (ret)
+					return ret;
+			}
 		}
 		}
 
@@ -1117,6 +1146,15 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
 		return -EINVAL;
 	if ((vma->vm_flags & VM_SHARED) == 0)
 		return -EINVAL;
+	if (index >= VFIO_PCI_NUM_REGIONS) {
+		int regnum = index - VFIO_PCI_NUM_REGIONS;
+		struct vfio_pci_region *region = vdev->region + regnum;
+
+		if (region && region->ops && region->ops->mmap &&
+		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+			return region->ops->mmap(vdev, region, vma);
+		return -EINVAL;
+	}
 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
 		return -EINVAL;
 	if (!vdev->bar_mmap_supported[index])
diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c
new file mode 100644
index 0000000..054a2cf
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_nvlink2.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2.
+ *
+ * Copyright (C) 2018 IBM Corp.  All rights reserved.
+ *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Register an on-GPU RAM region for cacheable access.
+ *
+ * Derived from original vfio_pci_igd.c:
+ * Copyright (C) 2016 Red Hat, Inc.  All rights reserved.
+ *	Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/sched/mm.h>
+#include <linux/mmu_context.h>
+#include <asm/kvm_ppc.h>
+#include "vfio_pci_private.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
+
+struct vfio_pci_nvgpu_data {
+	unsigned long gpu_hpa; /* GPU RAM physical address */
+	unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
+	unsigned long useraddr; /* GPU RAM userspace address */
+	unsigned long size; /* Size of the GPU RAM window (usually 128GB) */
+	struct mm_struct *mm;
+	struct mm_iommu_table_group_mem_t *mem; /* Pre-registered RAM descr. */
+	struct pci_dev *gpdev;
+	struct notifier_block group_notifier;
+};
+
+static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev,
+		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct vfio_pci_nvgpu_data *data = vdev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+	loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK;
+	size_t sizealigned;
+	void __iomem *ptr;
+
+	if (pos >= vdev->region[i].size)
+		return -EINVAL;
+
+	count = min(count, (size_t)(vdev->region[i].size - pos));
+
+	/*
+	 * We map only a bit of GPU RAM for a short time instead of mapping it
+	 * for the guest lifetime as:
+	 *
+	 * 1) we do not know GPU RAM size, only aperture which is 4-8 times
+	 *    bigger than actual RAM size (16/32GB RAM vs. 128GB aperture);
+	 * 2) mapping GPU RAM allows CPU to prefetch and if this happens
+	 *    before NVLink bridge is reset (which fences GPU RAM),
+	 *    hardware management interrupts (HMI) might happen, this
+	 *    will freeze NVLink bridge.
+	 *
+	 * This is not fast path anyway.
+	 */
+	sizealigned = _ALIGN_UP(posoff + count, PAGE_SIZE);
+	ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned);
+	if (!ptr)
+		return -EFAULT;
+
+	if (iswrite) {
+		if (copy_from_user(ptr + posoff, buf, count))
+			count = -EFAULT;
+		else
+			*ppos += count;
+	} else {
+		if (copy_to_user(buf, ptr + posoff, count))
+			count = -EFAULT;
+		else
+			*ppos += count;
+	}
+
+	iounmap(ptr);
+
+	return count;
+}
+
+static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region)
+{
+	struct vfio_pci_nvgpu_data *data = region->data;
+	long ret;
+
+	/* If there were any mappings at all... */
+	if (data->mm) {
+		ret = mm_iommu_put(data->mm, data->mem);
+		WARN_ON(ret);
+
+		mmdrop(data->mm);
+	}
+
+	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
+			&data->group_notifier);
+
+	pnv_npu2_unmap_lpar_dev(data->gpdev);
+
+	kfree(data);
+}
+
+static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf)
+{
+	vm_fault_t ret;
+	struct vm_area_struct *vma = vmf->vma;
+	struct vfio_pci_region *region = vma->vm_private_data;
+	struct vfio_pci_nvgpu_data *data = region->data;
+	unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
+	unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT;
+	unsigned long vm_pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	unsigned long pfn = nv2pg + vm_pgoff + vmf_off;
+
+	ret = vmf_insert_pfn(vma, vmf->address, pfn);
+	trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT,
+			vmf->address, ret);
+
+	return ret;
+}
+
+static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = {
+	.fault = vfio_pci_nvgpu_mmap_fault,
+};
+
+static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region, struct vm_area_struct *vma)
+{
+	int ret;
+	struct vfio_pci_nvgpu_data *data = region->data;
+
+	if (data->useraddr)
+		return -EPERM;
+
+	if (vma->vm_end - vma->vm_start > data->size)
+		return -EINVAL;
+
+	vma->vm_private_data = region;
+	vma->vm_flags |= VM_PFNMAP;
+	vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops;
+
+	/*
+	 * Calling mm_iommu_newdev() here once as the region is not
+	 * registered yet and therefore right initialization will happen now.
+	 * Other places will use mm_iommu_find() which returns
+	 * registered @mem and does not go gup().
+	 */
+	data->useraddr = vma->vm_start;
+	data->mm = current->mm;
+
+	atomic_inc(&data->mm->mm_count);
+	ret = (int) mm_iommu_newdev(data->mm, data->useraddr,
+			(vma->vm_end - vma->vm_start) >> PAGE_SHIFT,
+			data->gpu_hpa, &data->mem);
+
+	trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr,
+			vma->vm_end - vma->vm_start, ret);
+
+	return ret;
+}
+
+static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region, struct vfio_info_cap *caps)
+{
+	struct vfio_pci_nvgpu_data *data = region->data;
+	struct vfio_region_info_cap_nvlink2_ssatgt cap = { 0 };
+
+	cap.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT;
+	cap.header.version = 1;
+	cap.tgt = data->gpu_tgt;
+
+	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+static const struct vfio_pci_regops vfio_pci_nvgpu_regops = {
+	.rw = vfio_pci_nvgpu_rw,
+	.release = vfio_pci_nvgpu_release,
+	.mmap = vfio_pci_nvgpu_mmap,
+	.add_capability = vfio_pci_nvgpu_add_capability,
+};
+
+static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb,
+		unsigned long action, void *opaque)
+{
+	struct kvm *kvm = opaque;
+	struct vfio_pci_nvgpu_data *data = container_of(nb,
+			struct vfio_pci_nvgpu_data,
+			group_notifier);
+
+	if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm &&
+			pnv_npu2_map_lpar_dev(data->gpdev,
+				kvm->arch.lpid, MSR_DR | MSR_PR))
+		return NOTIFY_BAD;
+
+	return NOTIFY_OK;
+}
+
+int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
+{
+	int ret;
+	u64 reg[2];
+	u64 tgt = 0;
+	struct device_node *npu_node, *mem_node;
+	struct pci_dev *npu_dev;
+	struct vfio_pci_nvgpu_data *data;
+	uint32_t mem_phandle = 0;
+	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;
+
+	/*
+	 * PCI config space does not tell us about NVLink presense but
+	 * platform does, use this.
+	 */
+	npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0);
+	if (!npu_dev)
+		return -ENODEV;
+
+	npu_node = pci_device_to_OF_node(npu_dev);
+	if (!npu_node)
+		return -EINVAL;
+
+	if (of_property_read_u32(npu_node, "memory-region", &mem_phandle))
+		return -EINVAL;
+
+	mem_node = of_find_node_by_phandle(mem_phandle);
+	if (!mem_node)
+		return -EINVAL;
+
+	if (of_property_read_variable_u64_array(mem_node, "reg", reg,
+				ARRAY_SIZE(reg), ARRAY_SIZE(reg)) !=
+			ARRAY_SIZE(reg))
+		return -EINVAL;
+
+	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
+		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
+		return -EFAULT;
+	}
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->gpu_hpa = reg[0];
+	data->gpu_tgt = tgt;
+	data->size = reg[1];
+
+	dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa,
+			data->gpu_hpa + data->size - 1);
+
+	data->gpdev = vdev->pdev;
+	data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier;
+
+	ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
+			&events, &data->group_notifier);
+	if (ret)
+		goto free_exit;
+
+	/*
+	 * We have just set KVM, we do not need the listener anymore.
+	 * Also, keeping it registered means that if more than one GPU is
+	 * assigned, we will get several similar notifiers notifying about
+	 * the same device again which does not help with anything.
+	 */
+	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
+			&data->group_notifier);
+
+	ret = vfio_pci_register_dev_region(vdev,
+			PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+			VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
+			&vfio_pci_nvgpu_regops,
+			data->size,
+			VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE |
+			VFIO_REGION_INFO_FLAG_MMAP,
+			data);
+	if (ret)
+		goto free_exit;
+
+	return 0;
+free_exit:
+	kfree(data);
+
+	return ret;
+}
+
+/*
+ * IBM NPU2 bridge
+ */
+struct vfio_pci_npu2_data {
+	void *base; /* ATSD register virtual address, for emulated access */
+	unsigned long mmio_atsd; /* ATSD physical address */
+	unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
+	unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */
+};
+
+static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev,
+		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct vfio_pci_npu2_data *data = vdev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+	if (pos >= vdev->region[i].size)
+		return -EINVAL;
+
+	count = min(count, (size_t)(vdev->region[i].size - pos));
+
+	if (iswrite) {
+		if (copy_from_user(data->base + pos, buf, count))
+			return -EFAULT;
+	} else {
+		if (copy_to_user(buf, data->base + pos, count))
+			return -EFAULT;
+	}
+	*ppos += count;
+
+	return count;
+}
+
+static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region, struct vm_area_struct *vma)
+{
+	int ret;
+	struct vfio_pci_npu2_data *data = region->data;
+	unsigned long req_len = vma->vm_end - vma->vm_start;
+
+	if (req_len != PAGE_SIZE)
+		return -EINVAL;
+
+	vma->vm_flags |= VM_PFNMAP;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT,
+			req_len, vma->vm_page_prot);
+	trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start,
+			vma->vm_end - vma->vm_start, ret);
+
+	return ret;
+}
+
+static void vfio_pci_npu2_release(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region)
+{
+	struct vfio_pci_npu2_data *data = region->data;
+
+	memunmap(data->base);
+	kfree(data);
+}
+
+static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev,
+		struct vfio_pci_region *region, struct vfio_info_cap *caps)
+{
+	struct vfio_pci_npu2_data *data = region->data;
+	struct vfio_region_info_cap_nvlink2_ssatgt captgt = { 0 };
+	struct vfio_region_info_cap_nvlink2_lnkspd capspd = { 0 };
+	int ret;
+
+	captgt.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT;
+	captgt.header.version = 1;
+	captgt.tgt = data->gpu_tgt;
+
+	capspd.header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD;
+	capspd.header.version = 1;
+	capspd.link_speed = data->link_speed;
+
+	ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt));
+	if (ret)
+		return ret;
+
+	return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd));
+}
+
+static const struct vfio_pci_regops vfio_pci_npu2_regops = {
+	.rw = vfio_pci_npu2_rw,
+	.mmap = vfio_pci_npu2_mmap,
+	.release = vfio_pci_npu2_release,
+	.add_capability = vfio_pci_npu2_add_capability,
+};
+
+int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
+{
+	int ret;
+	struct vfio_pci_npu2_data *data;
+	struct device_node *nvlink_dn;
+	u32 nvlink_index = 0;
+	struct pci_dev *npdev = vdev->pdev;
+	struct device_node *npu_node = pci_device_to_OF_node(npdev);
+	struct pci_controller *hose = pci_bus_to_host(npdev->bus);
+	u64 mmio_atsd = 0;
+	u64 tgt = 0;
+	u32 link_speed = 0xff;
+
+	/*
+	 * PCI config space does not tell us about NVLink presense but
+	 * platform does, use this.
+	 */
+	if (!pnv_pci_get_gpu_dev(vdev->pdev))
+		return -ENODEV;
+
+	/*
+	 * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links
+	 * so we can allocate one register per link, using nvlink index as
+	 * a key.
+	 * There is always at least one ATSD register so as long as at least
+	 * NVLink bridge #0 is passed to the guest, ATSD will be available.
+	 */
+	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+			&nvlink_index)))
+		return -ENODEV;
+
+	if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index,
+			&mmio_atsd)) {
+		dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
+		mmio_atsd = 0;
+	}
+
+	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
+		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
+		return -EFAULT;
+	}
+
+	if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) {
+		dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n");
+		return -EFAULT;
+	}
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->mmio_atsd = mmio_atsd;
+	data->gpu_tgt = tgt;
+	data->link_speed = link_speed;
+	if (data->mmio_atsd) {
+		data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT);
+		if (!data->base) {
+			ret = -ENOMEM;
+			goto free_exit;
+		}
+	}
+
+	/*
+	 * We want to expose the capability even if this specific NVLink
+	 * did not get its own ATSD register because capabilities
+	 * belong to VFIO regions and normally there will be ATSD register
+	 * assigned to the NVLink bridge.
+	 */
+	ret = vfio_pci_register_dev_region(vdev,
+			PCI_VENDOR_ID_IBM |
+			VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+			VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
+			&vfio_pci_npu2_regops,
+			data->mmio_atsd ? PAGE_SIZE : 0,
+			VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE |
+			VFIO_REGION_INFO_FLAG_MMAP,
+			data);
+	if (ret)
+		goto free_exit;
+
+	return 0;
+
+free_exit:
+	kfree(data);
+
+	return ret;
+}
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index cde3b5d..127071b 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -59,6 +59,12 @@ struct vfio_pci_regops {
 		      size_t count, loff_t *ppos, bool iswrite);
 	void	(*release)(struct vfio_pci_device *vdev,
 			   struct vfio_pci_region *region);
+	int	(*mmap)(struct vfio_pci_device *vdev,
+			struct vfio_pci_region *region,
+			struct vm_area_struct *vma);
+	int	(*add_capability)(struct vfio_pci_device *vdev,
+				  struct vfio_pci_region *region,
+				  struct vfio_info_cap *caps);
 };
 
 struct vfio_pci_region {
@@ -157,4 +163,18 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev)
 	return -ENODEV;
 }
 #endif
+#ifdef CONFIG_VFIO_PCI_NVLINK2
+extern int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev);
+extern int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev);
+#else
+static inline int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
+{
+	return -ENODEV;
+}
+
+static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
+{
+	return -ENODEV;
+}
+#endif
 #endif /* VFIO_PCI_PRIVATE_H */
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index b30926e..c424913 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -152,11 +152,12 @@ static long tce_iommu_unregister_pages(struct tce_container *container,
 	struct mm_iommu_table_group_mem_t *mem;
 	struct tce_iommu_prereg *tcemem;
 	bool found = false;
+	long ret;
 
 	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
 		return -EINVAL;
 
-	mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
+	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
 	if (!mem)
 		return -ENOENT;
 
@@ -168,9 +169,13 @@ static long tce_iommu_unregister_pages(struct tce_container *container,
 	}
 
 	if (!found)
-		return -ENOENT;
+		ret = -ENOENT;
+	else
+		ret = tce_iommu_prereg_free(container, tcemem);
 
-	return tce_iommu_prereg_free(container, tcemem);
+	mm_iommu_put(container->mm, mem);
+
+	return ret;
 }
 
 static long tce_iommu_register_pages(struct tce_container *container,
@@ -185,22 +190,24 @@ static long tce_iommu_register_pages(struct tce_container *container,
 			((vaddr + size) < vaddr))
 		return -EINVAL;
 
-	mem = mm_iommu_find(container->mm, vaddr, entries);
+	mem = mm_iommu_get(container->mm, vaddr, entries);
 	if (mem) {
 		list_for_each_entry(tcemem, &container->prereg_list, next) {
-			if (tcemem->mem == mem)
-				return -EBUSY;
+			if (tcemem->mem == mem) {
+				ret = -EBUSY;
+				goto put_exit;
+			}
 		}
+	} else {
+		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
+		if (ret)
+			return ret;
 	}
 
-	ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
-	if (ret)
-		return ret;
-
 	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
 	if (!tcemem) {
-		mm_iommu_put(container->mm, mem);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto put_exit;
 	}
 
 	tcemem->mem = mem;
@@ -209,10 +216,22 @@ static long tce_iommu_register_pages(struct tce_container *container,
 	container->enabled = true;
 
 	return 0;
+
+put_exit:
+	mm_iommu_put(container->mm, mem);
+	return ret;
 }
 
-static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
+		unsigned int page_shift)
 {
+	struct page *page;
+	unsigned long size = 0;
+
+	if (mm_iommu_is_devmem(mm, hpa, page_shift, &size))
+		return size == (1UL << page_shift);
+
+	page = pfn_to_page(hpa >> PAGE_SHIFT);
 	/*
 	 * Check that the TCE table granularity is not bigger than the size of
 	 * a page we just found. Otherwise the hardware can get access to
@@ -371,6 +390,7 @@ static void tce_iommu_release(void *iommu_data)
 {
 	struct tce_container *container = iommu_data;
 	struct tce_iommu_group *tcegrp;
+	struct tce_iommu_prereg *tcemem, *tmtmp;
 	long i;
 
 	while (tce_groups_attached(container)) {
@@ -393,13 +413,8 @@ static void tce_iommu_release(void *iommu_data)
 		tce_iommu_free_table(container, tbl);
 	}
 
-	while (!list_empty(&container->prereg_list)) {
-		struct tce_iommu_prereg *tcemem;
-
-		tcemem = list_first_entry(&container->prereg_list,
-				struct tce_iommu_prereg, next);
-		WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
-	}
+	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
+		WARN_ON(tce_iommu_prereg_free(container, tcemem));
 
 	tce_iommu_disable(container);
 	if (container->mm)
@@ -492,7 +507,8 @@ static int tce_iommu_clear(struct tce_container *container,
 
 		direction = DMA_NONE;
 		oldhpa = 0;
-		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
+		ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa,
+				&direction);
 		if (ret)
 			continue;
 
@@ -530,7 +546,6 @@ static long tce_iommu_build(struct tce_container *container,
 		enum dma_data_direction direction)
 {
 	long i, ret = 0;
-	struct page *page;
 	unsigned long hpa;
 	enum dma_data_direction dirtmp;
 
@@ -541,15 +556,16 @@ static long tce_iommu_build(struct tce_container *container,
 		if (ret)
 			break;
 
-		page = pfn_to_page(hpa >> PAGE_SHIFT);
-		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+		if (!tce_page_is_contained(container->mm, hpa,
+				tbl->it_page_shift)) {
 			ret = -EPERM;
 			break;
 		}
 
 		hpa |= offset;
 		dirtmp = direction;
-		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
+		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
+				&dirtmp);
 		if (ret) {
 			tce_iommu_unuse_page(container, hpa);
 			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
@@ -576,7 +592,6 @@ static long tce_iommu_build_v2(struct tce_container *container,
 		enum dma_data_direction direction)
 {
 	long i, ret = 0;
-	struct page *page;
 	unsigned long hpa;
 	enum dma_data_direction dirtmp;
 
@@ -589,8 +604,8 @@ static long tce_iommu_build_v2(struct tce_container *container,
 		if (ret)
 			break;
 
-		page = pfn_to_page(hpa >> PAGE_SHIFT);
-		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+		if (!tce_page_is_contained(container->mm, hpa,
+				tbl->it_page_shift)) {
 			ret = -EPERM;
 			break;
 		}
@@ -603,7 +618,8 @@ static long tce_iommu_build_v2(struct tce_container *container,
 		if (mm_iommu_mapped_inc(mem))
 			break;
 
-		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
+		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
+				&dirtmp);
 		if (ret) {
 			/* dirtmp cannot be DMA_NONE here */
 			tce_iommu_unuse_page_v2(container, tbl, entry + i);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 8131028..02bb7ad 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -354,6 +354,21 @@ struct vfio_region_gfx_edid {
 };
 
 /*
+ * 10de vendor sub-type
+ *
+ * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space.
+ */
+#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM	(1)
+
+/*
+ * 1014 vendor sub-type
+ *
+ * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU
+ * to do TLB invalidation on a GPU.
+ */
+#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD	(1)
+
+/*
  * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
  * which allows direct access to non-MSIX registers which happened to be within
  * the same system page.
@@ -363,6 +378,33 @@ struct vfio_region_gfx_edid {
  */
 #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
 
+/*
+ * Capability with compressed real address (aka SSA - small system address)
+ * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing
+ * and by the userspace to associate a NVLink bridge with a GPU.
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT	4
+
+struct vfio_region_info_cap_nvlink2_ssatgt {
+	struct vfio_info_cap_header header;
+	__u64 tgt;
+};
+
+/*
+ * Capability with an NVLink link speed. The value is read by
+ * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed"
+ * property in the device tree. The value is fixed in the hardware
+ * and failing to provide the correct value results in the link
+ * not working with no indication from the driver why.
+ */
+#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD	5
+
+struct vfio_region_info_cap_nvlink2_lnkspd {
+	struct vfio_info_cap_header header;
+	__u32 link_speed;
+	__u32 __pad;
+};
+
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
  *				    struct vfio_irq_info)
diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore
index c3ee839..208452a 100644
--- a/tools/testing/selftests/powerpc/tm/.gitignore
+++ b/tools/testing/selftests/powerpc/tm/.gitignore
@@ -11,6 +11,7 @@
 tm-signal-context-chk-gpr
 tm-signal-context-chk-vmx
 tm-signal-context-chk-vsx
+tm-signal-sigreturn-nt
 tm-vmx-unavail
 tm-unavailable
 tm-trap
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index 9fc2cf6..75a6853 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -4,7 +4,7 @@
 
 TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \
 	tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \
-	$(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn
+	$(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt
 
 top_srcdir = ../../../../..
 include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c
new file mode 100644
index 0000000..56fbf9f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018, Breno Leitao, Gustavo Romero, IBM Corp.
+ *
+ * A test case that creates a signal and starts a suspended transaction
+ * inside the signal handler.
+ *
+ * It returns from the signal handler with the CPU at suspended state, but
+ * without setting usercontext MSR Transaction State (TS) fields.
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <signal.h>
+
+#include "utils.h"
+
+void trap_signal_handler(int signo, siginfo_t *si, void *uc)
+{
+	ucontext_t *ucp = (ucontext_t *) uc;
+
+	asm("tbegin.; tsuspend.;");
+
+	/* Skip 'trap' instruction if it succeed */
+	ucp->uc_mcontext.regs->nip += 4;
+}
+
+int tm_signal_sigreturn_nt(void)
+{
+	struct sigaction trap_sa;
+
+	trap_sa.sa_flags = SA_SIGINFO;
+	trap_sa.sa_sigaction = trap_signal_handler;
+
+	sigaction(SIGTRAP, &trap_sa, NULL);
+
+	raise(SIGTRAP);
+
+	return EXIT_SUCCESS;
+}
+
+int main(int argc, char **argv)
+{
+	test_harness(tm_signal_sigreturn_nt, "tm_signal_sigreturn_nt");
+}
+