Skip to content

Commit 3fa6bed

Browse files
committed
Merge: Backport huge pfnmap support for significantly faster large mmap'd PCI device BARs
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/6638 Backport huge pfnmap support for significantly faster large mmap'd PCI device BARs JIRA: https://issues.redhat.com/browse/RHEL-73613 Mapping and unmapping PCI BARs into VM va-space is a common VFIO function when assigning a PCIe device to a VM. Currently, the mapping requires setting up the BAR-mappings at a native pagesize level, e.g., 4K on x86, 64K possible on ARM64. This is done using PFNMAP support in the kernel, as these device pages are not backed by the standard page-struct that system memory is mapped under. For relatively small to medium sized BARs, the mapping performance isn't too noticeable; but with vGPUs and BARs in the gigabyte-size ranges, the map time can become a significant amount of the VM startup time (minutes). Upstream resolved this issue by providing huge PFNMAP support -- superpage PFNMAPs; 2MB, and even 1GB (on x86) pages vs 4K page mappings yields three to ten orders of magnitude speed up in the mapping operation, which is quite visible to the VM user of assigned GPUs in the guest VM. This series backports the core of what upstream did: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ along with a couple of prior upstream commits which allow these to apply cleanly to RHEL9, as well as a couple of bug fixes that patchreview identified, and one related commit the kernel-mm team requested. The code can be tested by doing the following (thanks to AlexW for providing this info): # echo "func vfio_pci_mmap_huge_fault +p" > /proc/dynamic_debug/control Then you'll see things in dmesg like: vfio-pci 0000:5e:00.0: vfio_pci_mmap_huge_fault(,order = 9) BAR 0 page offset 0x0: 0x100 vfio-pci 0000:5e:00.0: vfio_pci_mmap_huge_fault(,order = 9) BAR 0 page offset 0x200: 0x100 vfio-pci 0000:5e:00.0: vfio_pci_mmap_huge_fault(,order = 9) BAR 0 page offset 0x400: 0x100 Here we know order 9 is a 2MB PMD mapping on x86. 
BAR0 on this device is 16MB, so 2MB mappings are the best we can get. If you have a device with at least a 1GB BAR, you should also see: vfio-pci 0000:5e:00.0: vfio_pci_mmap_huge_fault(,order = 18) BAR 1 page offset 0x240000: 0x100 vfio-pci 0000:5e:00.0: vfio_pci_mmap_huge_fault(,order = 18) BAR 1 page offset 0x280000: 0x100 vfio-pci 0000:5e:00.0: vfio_pci_mmap_huge_fault(,order = 18) BAR 1 page offset 0x2c0000: 0x100 Again here we know order 18 is 1GB PUD mappings and BAR1 of this device is 32GB (NVIDIA A10). You'll need to be running at least QEMU 9.2 to get reliable alignment for PUD mappings (which neither RHEL-9 nor RHEL-10 has at the moment; they have QEMU-9.1). If you see order = 0 mappings for BARs that are at least 2MB, something is wrong. Omitted-fix: 7223769 ("mm/memory.c: simplify pfnmap_lockdep_assert") Signed-off-by: Donald Dutile <ddutile@redhat.com> Approved-by: Rafael Aquini <raquini@redhat.com> Approved-by: David Arcari <darcari@redhat.com> Approved-by: Alex Williamson <alex.williamson@redhat.com> Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com> Approved-by: David Hildenbrand <david@redhat.com> Merged-by: Augusto Caringi <acaringi@redhat.com>
2 parents 70b67ef + 06aa84b commit 3fa6bed

File tree

21 files changed

+485
-442
lines changed

21 files changed

+485
-442
lines changed

arch/arm64/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ config ARM64
9797
select ARCH_SUPPORTS_NUMA_BALANCING
9898
select ARCH_SUPPORTS_PAGE_TABLE_CHECK
9999
select ARCH_SUPPORTS_PER_VMA_LOCK
100+
select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE
100101
select ARCH_SUPPORTS_RT
101102
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
102103
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT

arch/arm64/include/asm/pgtable.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,7 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages)
354354
/*
355355
* Select all bits except the pfn
356356
*/
357+
#define pte_pgprot pte_pgprot
357358
static inline pgprot_t pte_pgprot(pte_t pte)
358359
{
359360
unsigned long pfn = pte_pfn(pte);
@@ -527,6 +528,14 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
527528
return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP)));
528529
}
529530

531+
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
532+
#define pmd_special(pte) (!!((pmd_val(pte) & PTE_SPECIAL)))
533+
static inline pmd_t pmd_mkspecial(pmd_t pmd)
534+
{
535+
return set_pmd_bit(pmd, __pgprot(PTE_SPECIAL));
536+
}
537+
#endif
538+
530539
#define __pmd_to_phys(pmd) __pte_to_phys(pmd_pte(pmd))
531540
#define __phys_to_pmd_val(phys) __phys_to_pte_val(phys)
532541
#define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT)
@@ -544,6 +553,27 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
544553
#define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
545554
#define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
546555

556+
#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
557+
/*
 * Test the special bit of a PUD entry by viewing it as a PTE.
 * The macro parameter must be named "pud": the expansion refers to "pud",
 * so any other parameter name would ignore the argument and silently
 * capture a variable called "pud" from the call site instead.
 */
#define pud_special(pud) pte_special(pud_pte(pud))
558+
/*
 * Return a copy of the PUD entry with the special bit set, going through
 * the PTE helpers. The macro parameter must be named "pud": the expansion
 * refers to "pud", so any other parameter name would ignore the argument
 * and silently capture a variable called "pud" from the call site instead.
 */
#define pud_mkspecial(pud) pte_pud(pte_mkspecial(pud_pte(pud)))
559+
#endif
560+
561+
#define pmd_pgprot pmd_pgprot
562+
static inline pgprot_t pmd_pgprot(pmd_t pmd)
563+
{
564+
unsigned long pfn = pmd_pfn(pmd);
565+
566+
return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd));
567+
}
568+
569+
#define pud_pgprot pud_pgprot
570+
static inline pgprot_t pud_pgprot(pud_t pud)
571+
{
572+
unsigned long pfn = pud_pfn(pud);
573+
574+
return __pgprot(pud_val(pfn_pud(pfn, __pgprot(0))) ^ pud_val(pud));
575+
}
576+
547577
static inline void __set_pte_at(struct mm_struct *mm,
548578
unsigned long __always_unused addr,
549579
pte_t *ptep, pte_t pte, unsigned int nr)

arch/powerpc/include/asm/pgtable.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ static inline unsigned long pte_pfn(pte_t pte)
4444
/*
4545
* Select all bits except the pfn
4646
*/
47+
#define pte_pgprot pte_pgprot
4748
static inline pgprot_t pte_pgprot(pte_t pte)
4849
{
4950
unsigned long pte_flags;

arch/s390/include/asm/pgtable.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -941,6 +941,7 @@ static inline int pte_unused(pte_t pte)
941941
* young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID
942942
* must not be set.
943943
*/
944+
#define pte_pgprot pte_pgprot
944945
static inline pgprot_t pte_pgprot(pte_t pte)
945946
{
946947
unsigned long pte_flags = pte_val(pte) & _PAGE_CHG_MASK;

arch/s390/pci/pci_mmio.c

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,11 @@ static inline int __memcpy_toio_inuser(void __iomem *dst,
118118
SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
119119
const void __user *, user_buffer, size_t, length)
120120
{
121+
struct follow_pfnmap_args args = { };
121122
u8 local_buf[64];
122123
void __iomem *io_addr;
123124
void *buf;
124125
struct vm_area_struct *vma;
125-
pte_t *ptep;
126-
spinlock_t *ptl;
127126
long ret;
128127

129128
if (!zpci_is_enabled())
@@ -169,19 +168,21 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
169168
if (!(vma->vm_flags & VM_WRITE))
170169
goto out_unlock_mmap;
171170

172-
ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
171+
args.address = mmio_addr;
172+
args.vma = vma;
173+
ret = follow_pfnmap_start(&args);
173174
if (ret)
174175
goto out_unlock_mmap;
175176

176-
io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
177+
io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) |
177178
(mmio_addr & ~PAGE_MASK));
178179

179180
if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE)
180181
goto out_unlock_pt;
181182

182183
ret = zpci_memcpy_toio(io_addr, buf, length);
183184
out_unlock_pt:
184-
pte_unmap_unlock(ptep, ptl);
185+
follow_pfnmap_end(&args);
185186
out_unlock_mmap:
186187
mmap_read_unlock(current->mm);
187188
out_free:
@@ -260,12 +261,11 @@ static inline int __memcpy_fromio_inuser(void __user *dst,
260261
SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
261262
void __user *, user_buffer, size_t, length)
262263
{
264+
struct follow_pfnmap_args args = { };
263265
u8 local_buf[64];
264266
void __iomem *io_addr;
265267
void *buf;
266268
struct vm_area_struct *vma;
267-
pte_t *ptep;
268-
spinlock_t *ptl;
269269
long ret;
270270

271271
if (!zpci_is_enabled())
@@ -308,11 +308,13 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
308308
if (!(vma->vm_flags & VM_WRITE))
309309
goto out_unlock_mmap;
310310

311-
ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
311+
args.vma = vma;
312+
args.address = mmio_addr;
313+
ret = follow_pfnmap_start(&args);
312314
if (ret)
313315
goto out_unlock_mmap;
314316

315-
io_addr = (void __iomem *)((pte_pfn(*ptep) << PAGE_SHIFT) |
317+
io_addr = (void __iomem *)((args.pfn << PAGE_SHIFT) |
316318
(mmio_addr & ~PAGE_MASK));
317319

318320
if ((unsigned long) io_addr < ZPCI_IOMAP_ADDR_BASE) {
@@ -322,7 +324,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
322324
ret = zpci_memcpy_fromio(buf, io_addr, length);
323325

324326
out_unlock_pt:
325-
pte_unmap_unlock(ptep, ptl);
327+
follow_pfnmap_end(&args);
326328
out_unlock_mmap:
327329
mmap_read_unlock(current->mm);
328330

arch/sparc/include/asm/pgtable_64.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -782,6 +782,7 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
782782
return __pmd(pte_val(pte));
783783
}
784784

785+
#define pmd_pgprot pmd_pgprot
785786
static inline pgprot_t pmd_pgprot(pmd_t entry)
786787
{
787788
unsigned long val = pmd_val(entry);

arch/x86/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ config X86_64
2828
select ARCH_HAS_GIGANTIC_PAGE
2929
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
3030
select ARCH_SUPPORTS_PER_VMA_LOCK
31+
select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE
3132
select ARCH_SUPPORTS_RT
3233
select HAVE_ARCH_SOFT_DIRTY
3334
select MODULES_USE_ELF_RELA

arch/x86/include/asm/pgtable.h

Lines changed: 52 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,34 @@ extern pmdval_t early_pmd_flags;
121121
#define arch_end_context_switch(prev) do {} while(0)
122122
#endif /* CONFIG_PARAVIRT_XXL */
123123

124+
static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
125+
{
126+
pmdval_t v = native_pmd_val(pmd);
127+
128+
return native_make_pmd(v | set);
129+
}
130+
131+
static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
132+
{
133+
pmdval_t v = native_pmd_val(pmd);
134+
135+
return native_make_pmd(v & ~clear);
136+
}
137+
138+
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
139+
{
140+
pudval_t v = native_pud_val(pud);
141+
142+
return native_make_pud(v | set);
143+
}
144+
145+
static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
146+
{
147+
pudval_t v = native_pud_val(pud);
148+
149+
return native_make_pud(v & ~clear);
150+
}
151+
124152
/*
125153
* The following only work if pte_present() is true.
126154
* Undefined behaviour if not..
@@ -310,6 +338,30 @@ static inline int pud_devmap(pud_t pud)
310338
}
311339
#endif
312340

341+
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
342+
static inline bool pmd_special(pmd_t pmd)
343+
{
344+
return pmd_flags(pmd) & _PAGE_SPECIAL;
345+
}
346+
347+
static inline pmd_t pmd_mkspecial(pmd_t pmd)
348+
{
349+
return pmd_set_flags(pmd, _PAGE_SPECIAL);
350+
}
351+
#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */
352+
353+
#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
354+
static inline bool pud_special(pud_t pud)
355+
{
356+
return pud_flags(pud) & _PAGE_SPECIAL;
357+
}
358+
359+
static inline pud_t pud_mkspecial(pud_t pud)
360+
{
361+
return pud_set_flags(pud, _PAGE_SPECIAL);
362+
}
363+
#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
364+
313365
static inline int pgd_devmap(pgd_t pgd)
314366
{
315367
return 0;
@@ -480,20 +532,6 @@ static inline pte_t pte_mkdevmap(pte_t pte)
480532
return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
481533
}
482534

483-
static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
484-
{
485-
pmdval_t v = native_pmd_val(pmd);
486-
487-
return native_make_pmd(v | set);
488-
}
489-
490-
static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
491-
{
492-
pmdval_t v = native_pmd_val(pmd);
493-
494-
return native_make_pmd(v & ~clear);
495-
}
496-
497535
/* See comments above mksaveddirty_shift() */
498536
static inline pmd_t pmd_mksaveddirty(pmd_t pmd)
499537
{
@@ -588,20 +626,6 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
588626
pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
589627
#define pmd_mkwrite pmd_mkwrite
590628

591-
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
592-
{
593-
pudval_t v = native_pud_val(pud);
594-
595-
return native_make_pud(v | set);
596-
}
597-
598-
static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
599-
{
600-
pudval_t v = native_pud_val(pud);
601-
602-
return native_make_pud(v & ~clear);
603-
}
604-
605629
/* See comments above mksaveddirty_shift() */
606630
static inline pud_t pud_mksaveddirty(pud_t pud)
607631
{

arch/x86/mm/pat/memtype.c

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
#include <linux/pfn_t.h>
4040
#include <linux/slab.h>
4141
#include <linux/mm.h>
42+
#include <linux/highmem.h>
4243
#include <linux/fs.h>
4344
#include <linux/rbtree.h>
4445

@@ -947,6 +948,26 @@ static void free_pfn_range(u64 paddr, unsigned long size)
947948
memtype_free(paddr, paddr + size);
948949
}
949950

951+
static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
952+
resource_size_t *phys)
953+
{
954+
struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start };
955+
956+
if (follow_pfnmap_start(&args))
957+
return -EINVAL;
958+
959+
/* Never return PFNs of anon folios in COW mappings. */
960+
if (!args.special) {
961+
follow_pfnmap_end(&args);
962+
return -EINVAL;
963+
}
964+
965+
*prot = pgprot_val(args.pgprot);
966+
*phys = (resource_size_t)args.pfn << PAGE_SHIFT;
967+
follow_pfnmap_end(&args);
968+
return 0;
969+
}
970+
950971
static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
951972
pgprot_t *pgprot)
952973
{
@@ -964,7 +985,7 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
964985
* detect the PFN. If we need the cachemode as well, we're out of luck
965986
* for now and have to fail fork().
966987
*/
967-
if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) {
988+
if (!follow_phys(vma, &prot, paddr)) {
968989
if (pgprot)
969990
*pgprot = __pgprot(prot);
970991
return 0;

0 commit comments

Comments
 (0)