Commit 860cd61

Author: Alex Williamson (committed)
vfio/type1: Use vfio_batch for vaddr_get_pfns()
JIRA: https://issues.redhat.com/browse/RHEL-85587

commit eb996ee
Author: Alex Williamson <alex.williamson@redhat.com>
Date:   Tue Feb 18 15:22:03 2025 -0700

    vfio/type1: Use vfio_batch for vaddr_get_pfns()

    Passing the vfio_batch to vaddr_get_pfns() allows for greater
    distinction between page backed pfns and pfnmaps. In the case of page
    backed pfns, vfio_batch.size is set to a positive value matching the
    number of pages filled in vfio_batch.pages. For a pfnmap,
    vfio_batch.size remains zero as vfio_batch.pages are not used. In both
    cases the return value continues to indicate the number of pfns and
    the provided pfn arg is set to the initial pfn value.

    This allows us to shortcut the pfnmap case, which is detected by the
    zero vfio_batch.size. pfnmaps do not contribute to locked memory
    accounting, therefore we can update counters and continue directly,
    which also enables a future where vaddr_get_pfns() can return a value
    greater than one for consecutive pfnmaps.

    NB. Now that we're not guessing whether the initial pfn is page backed
    or pfnmap, we no longer need to special case the put_pfn() and batch
    size reset. It's safe for vfio_batch_unpin() to handle this case.

    Reviewed-by: Peter Xu <peterx@redhat.com>
    Reviewed-by: Mitchell Augustin <mitchell.augustin@canonical.com>
    Tested-by: Mitchell Augustin <mitchell.augustin@canonical.com>
    Link: https://lore.kernel.org/r/20250218222209.1382449-4-alex.williamson@redhat.com
    Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
1 parent e8ccfb8 commit 860cd61
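
For readers skimming the change, the contract the commit message describes can be modeled in a few lines of ordinary C. The sketch below is illustrative only: struct batch_model, model_vaddr_get_pfns(), BATCH_CAPACITY and MODEL_PAGE_SIZE are hypothetical userspace stand-ins for the kernel's struct vfio_batch and vaddr_get_pfns(), invented here to show the one behavior that matters: a page-backed fill sets batch.size and batch.offset, a pfnmap leaves batch.size at zero, and in both cases the return value is the pfn count and *pfn holds the initial pfn.

/* Hypothetical userspace model of the vaddr_get_pfns() contract.
 * Not kernel code; compile with: cc -o batch_model batch_model.c */
#include <stdio.h>
#include <stdbool.h>

#define BATCH_CAPACITY 8
#define MODEL_PAGE_SIZE 4096UL

/* Stand-in for the kernel's struct vfio_batch */
struct batch_model {
        unsigned long pages[BATCH_CAPACITY]; /* stands in for struct page ** */
        long size;                           /* pages filled; stays 0 for pfnmap */
        long offset;                         /* first valid index in pages[] */
        long capacity;
};

static long model_vaddr_get_pfns(unsigned long vaddr, long npages,
                                 bool is_pfnmap, unsigned long *pfn,
                                 struct batch_model *batch)
{
        unsigned long base = vaddr / MODEL_PAGE_SIZE;

        *pfn = base;                 /* initial pfn always reported */
        if (is_pfnmap)
                return npages;       /* batch->size stays 0; pfns contiguous */

        /* page-backed: fill the batch and describe it via size/offset */
        long fill = npages < batch->capacity ? npages : batch->capacity;
        for (long i = 0; i < fill; i++)
                batch->pages[i] = base + i;
        batch->size = fill;
        batch->offset = 0;
        return fill;
}

int main(void)
{
        struct batch_model batch = { .capacity = BATCH_CAPACITY };
        unsigned long pfn;
        long ret;

        ret = model_vaddr_get_pfns(0x10000, 4, false, &pfn, &batch);
        printf("page-backed: ret=%ld batch.size=%ld pfn=%#lx\n",
               ret, batch.size, pfn);

        batch.size = 0;  /* reset between calls, as the real loop does */
        ret = model_vaddr_get_pfns(0x20000, 4, true, &pfn, &batch);
        printf("pfnmap:      ret=%ld batch.size=%ld pfn=%#lx\n",
               ret, batch.size, pfn);
        return 0;
}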

1 file changed

drivers/vfio/vfio_iommu_type1.c

Lines changed: 35 additions & 28 deletions
@@ -556,12 +556,16 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
 
 /*
  * Returns the positive number of pfns successfully obtained or a negative
- * error code.
+ * error code. The initial pfn is stored in the pfn arg. For page-backed
+ * pfns, the provided batch is also updated to indicate the filled pages and
+ * initial offset. For VM_PFNMAP pfns, only the returned number of pfns and
+ * returned initial pfn are provided; subsequent pfns are contiguous.
  */
 static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
                           long npages, int prot, unsigned long *pfn,
-                          struct page **pages)
+                          struct vfio_batch *batch)
 {
+        long pin_pages = min_t(long, npages, batch->capacity);
         struct vm_area_struct *vma;
         unsigned int flags = 0;
         int ret;
@@ -570,10 +574,12 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
                 flags |= FOLL_WRITE;
 
         mmap_read_lock(mm);
-        ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
-                                    pages, NULL);
+        ret = pin_user_pages_remote(mm, vaddr, pin_pages, flags | FOLL_LONGTERM,
+                                    batch->pages, NULL);
         if (ret > 0) {
-                *pfn = page_to_pfn(pages[0]);
+                *pfn = page_to_pfn(batch->pages[0]);
+                batch->size = ret;
+                batch->offset = 0;
                 goto done;
         } else if (!ret) {
                 ret = -EFAULT;
@@ -629,32 +635,42 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
                 *pfn_base = 0;
         }
 
+        if (unlikely(disable_hugepages))
+                npage = 1;
+
         while (npage) {
                 if (!batch->size) {
                         /* Empty batch, so refill it. */
-                        long req_pages = min_t(long, npage, batch->capacity);
-
-                        ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
-                                             &pfn, batch->pages);
+                        ret = vaddr_get_pfns(mm, vaddr, npage, dma->prot,
+                                             &pfn, batch);
                         if (ret < 0)
                                 goto unpin_out;
 
-                        batch->size = ret;
-                        batch->offset = 0;
-
                         if (!*pfn_base) {
                                 *pfn_base = pfn;
                                 rsvd = is_invalid_reserved_pfn(*pfn_base);
                         }
+
+                        /* Handle pfnmap */
+                        if (!batch->size) {
+                                if (pfn != *pfn_base + pinned || !rsvd)
+                                        goto out;
+
+                                pinned += ret;
+                                npage -= ret;
+                                vaddr += (PAGE_SIZE * ret);
+                                iova += (PAGE_SIZE * ret);
+                                continue;
+                        }
                 }
 
                 /*
-                 * pfn is preset for the first iteration of this inner loop and
-                 * updated at the end to handle a VM_PFNMAP pfn. In that case,
-                 * batch->pages isn't valid (there's no struct page), so allow
-                 * batch->pages to be touched only when there's more than one
-                 * pfn to check, which guarantees the pfns are from a
-                 * !VM_PFNMAP vma.
+                 * pfn is preset for the first iteration of this inner loop
+                 * due to the fact that vaddr_get_pfns() needs to provide the
+                 * initial pfn for pfnmaps. Therefore to reduce redundancy,
+                 * the next pfn is fetched at the end of the loop.
+                 * A PageReserved() page could still qualify as page backed
+                 * and rsvd here, and therefore continues to use the batch.
                  */
                 while (true) {
                         if (pfn != *pfn_base + pinned ||
@@ -689,21 +705,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 
                         pfn = page_to_pfn(batch->pages[batch->offset]);
                 }
-
-                if (unlikely(disable_hugepages))
-                        break;
         }
 
 out:
         ret = vfio_lock_acct(dma, lock_acct, false);
 
 unpin_out:
-        if (batch->size == 1 && !batch->offset) {
-                /* May be a VM_PFNMAP pfn, which the batch can't remember. */
-                put_pfn(pfn, dma->prot);
-                batch->size = 0;
-        }
-
         if (ret < 0) {
                 if (pinned && !rsvd) {
                         for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
@@ -751,7 +758,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
 
         vfio_batch_init_single(&batch);
 
-        ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, batch.pages);
+        ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, &batch);
         if (ret != 1)
                 goto out;
 
