
Commit dc1f1ce

RDMA/core: Convert UMEM ODP DMA mapping to caching IOVA and page linkage
JIRA: https://issues.redhat.com/browse/RHEL-110100

Conflicts:
- Drop the rdma_rxe hunk; the driver is unmaintained.
- Minor context diff due to the inclusion of the following commit:
  -- 9a0e6f1 ("RDMA/core: Silence oversized kvmalloc() warning")

commit 1efe8c0
Author: Leon Romanovsky <leon@kernel.org>
Date:   Mon Apr 28 12:22:19 2025 +0300

    RDMA/core: Convert UMEM ODP DMA mapping to caching IOVA and page linkage

    Reuse the newly added DMA API to cache the IOVA and only link/unlink
    pages in the fast path for the UMEM ODP flow.

    Tested-by: Jens Axboe <axboe@kernel.dk>
    Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
    Signed-off-by: Leon Romanovsky <leonro@nvidia.com>

Signed-off-by: Kamal Heib <kheib@redhat.com>
1 parent 5dce9ba commit dc1f1ce
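For orientation before the diffs: the sketch below is not part of the patch. It strings together, in call order, the hmm-dma helpers this backport switches UMEM ODP to, namely hmm_dma_map_alloc() at MR setup, hmm_dma_map_pfn() in the fault path, hmm_dma_unmap_pfn() on invalidation, and hmm_dma_map_free() at release. The function names and argument order are taken from the hunks below; the device/map/index parameters are placeholders, and the real code splits these calls across ib_init_umem_odp(), populate_mtt(), ib_umem_odp_unmap_dma_pages(), and ib_umem_odp_release(), with the locking and notifier wiring omitted here.

```c
/*
 * Illustrative sketch only -- not part of the patch. It shows the hmm-dma
 * call pattern adopted by this commit; 'dma_dev', 'map', 'npfns',
 * 'page_size', and 'idx' are placeholder parameters.
 */
#include <linux/dma-mapping.h>
#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/pci-p2pdma.h>

static int odp_hmm_dma_sketch(struct device *dma_dev, struct hmm_dma_map *map,
			      size_t npfns, size_t page_size, size_t idx)
{
	struct pci_p2pdma_map_state p2pdma_state = {};
	dma_addr_t dma_addr;
	int ret;

	/* MR setup: one allocation replaces the old pfn_list/dma_list pair. */
	ret = hmm_dma_map_alloc(dma_dev, map, npfns, page_size);
	if (ret)
		return ret;

	/* Fault path: link one faulted pfn; the IOVA stays cached in 'map'. */
	dma_addr = hmm_dma_map_pfn(dma_dev, map, idx, &p2pdma_state);
	if (dma_mapping_error(dma_dev, dma_addr)) {
		ret = -EFAULT;
		goto out_free;
	}

	/* Invalidation: unlink the pfn; returns false if it was never mapped. */
	hmm_dma_unmap_pfn(dma_dev, map, idx);

out_free:
	/* MR release: drop the cached IOVA and the pfn storage. */
	hmm_dma_map_free(dma_dev, map);
	return ret;
}
```

Only allocation and teardown remain per-MR work; the per-page fast path is reduced to linking and unlinking pages against the cached IOVA, which is the point of the conversion.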

5 files changed: +69 −113 lines

drivers/infiniband/core/umem_odp.c

Lines changed: 22 additions & 84 deletions
@@ -41,6 +41,7 @@
 #include <linux/hugetlb.h>
 #include <linux/interval_tree.h>
 #include <linux/hmm.h>
+#include <linux/hmm-dma.h>
 #include <linux/pagemap.h>
 
 #include <rdma/ib_umem_odp.h>
@@ -50,6 +51,7 @@
 static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
                                    const struct mmu_interval_notifier_ops *ops)
 {
+        struct ib_device *dev = umem_odp->umem.ibdev;
         int ret;
 
         umem_odp->umem.is_odp = 1;
@@ -59,7 +61,6 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
                 size_t page_size = 1UL << umem_odp->page_shift;
                 unsigned long start;
                 unsigned long end;
-                size_t ndmas, npfns;
 
                 start = ALIGN_DOWN(umem_odp->umem.address, page_size);
                 if (check_add_overflow(umem_odp->umem.address,
@@ -70,38 +71,23 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
                 if (unlikely(end < page_size))
                         return -EOVERFLOW;
 
-                ndmas = (end - start) >> umem_odp->page_shift;
-                if (!ndmas)
-                        return -EINVAL;
-
-                npfns = (end - start) >> PAGE_SHIFT;
-                umem_odp->pfn_list = kvcalloc(
-                        npfns, sizeof(*umem_odp->pfn_list),
-                        GFP_KERNEL | __GFP_NOWARN);
-                if (!umem_odp->pfn_list)
-                        return -ENOMEM;
-
-                umem_odp->dma_list = kvcalloc(
-                        ndmas, sizeof(*umem_odp->dma_list),
-                        GFP_KERNEL | __GFP_NOWARN);
-                if (!umem_odp->dma_list) {
-                        ret = -ENOMEM;
-                        goto out_pfn_list;
-                }
+                ret = hmm_dma_map_alloc(dev->dma_device, &umem_odp->map,
+                                        (end - start) >> PAGE_SHIFT,
+                                        1 << umem_odp->page_shift);
+                if (ret)
+                        return ret;
 
                 ret = mmu_interval_notifier_insert(&umem_odp->notifier,
                                                    umem_odp->umem.owning_mm,
                                                    start, end - start, ops);
                 if (ret)
-                        goto out_dma_list;
+                        goto out_free_map;
         }
 
         return 0;
 
-out_dma_list:
-        kvfree(umem_odp->dma_list);
-out_pfn_list:
-        kvfree(umem_odp->pfn_list);
+out_free_map:
+        hmm_dma_map_free(dev->dma_device, &umem_odp->map);
         return ret;
 }
 
@@ -264,6 +250,8 @@ EXPORT_SYMBOL(ib_umem_odp_get);
 
 void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
 {
+        struct ib_device *dev = umem_odp->umem.ibdev;
+
         /*
          * Ensure that no more pages are mapped in the umem.
          *
@@ -276,48 +264,17 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
                                             ib_umem_end(umem_odp));
                 mutex_unlock(&umem_odp->umem_mutex);
                 mmu_interval_notifier_remove(&umem_odp->notifier);
-                kvfree(umem_odp->dma_list);
-                kvfree(umem_odp->pfn_list);
+                hmm_dma_map_free(dev->dma_device, &umem_odp->map);
         }
         put_pid(umem_odp->tgid);
         kfree(umem_odp);
 }
 EXPORT_SYMBOL(ib_umem_odp_release);
 
-/*
- * Map for DMA and insert a single page into the on-demand paging page tables.
- *
- * @umem: the umem to insert the page to.
- * @dma_index: index in the umem to add the dma to.
- * @page: the page struct to map and add.
- * @access_mask: access permissions needed for this page.
- *
- * The function returns -EFAULT if the DMA mapping operation fails.
- *
- */
-static int ib_umem_odp_map_dma_single_page(
-                struct ib_umem_odp *umem_odp,
-                unsigned int dma_index,
-                struct page *page)
-{
-        struct ib_device *dev = umem_odp->umem.ibdev;
-        dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
-
-        *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
-                                    DMA_BIDIRECTIONAL);
-        if (ib_dma_mapping_error(dev, *dma_addr)) {
-                *dma_addr = 0;
-                return -EFAULT;
-        }
-        umem_odp->npages++;
-        return 0;
-}
-
 /**
  * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
  *
  * Maps the range passed in the argument to DMA addresses.
- * The DMA addresses of the mapped pages is updated in umem_odp->dma_list.
  * Upon success the ODP MR will be locked to let caller complete its device
  * page table update.
  *
@@ -374,7 +331,7 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
                 range.default_flags |= HMM_PFN_REQ_WRITE;
         }
 
-        range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
+        range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
         timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
 
 retry:
@@ -425,16 +382,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
                                   __func__, hmm_order, page_shift);
                         break;
                 }
-
-                ret = ib_umem_odp_map_dma_single_page(
-                                umem_odp, dma_index,
-                                hmm_pfn_to_page(range.hmm_pfns[pfn_index]));
-                if (ret < 0) {
-                        ibdev_dbg(umem_odp->umem.ibdev,
-                                  "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
-                        break;
-                }
-                range.hmm_pfns[pfn_index] |= HMM_PFN_DMA_MAPPED;
         }
         /* upon success lock should stay on hold for the callee */
         if (!ret)
@@ -454,32 +401,23 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                                  u64 bound)
 {
-        dma_addr_t dma;
-        int idx;
-        u64 addr;
         struct ib_device *dev = umem_odp->umem.ibdev;
+        u64 addr;
 
         lockdep_assert_held(&umem_odp->umem_mutex);
 
         virt = max_t(u64, virt, ib_umem_start(umem_odp));
         bound = min_t(u64, bound, ib_umem_end(umem_odp));
         for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
-                unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >>
-                                        PAGE_SHIFT;
-                struct page *page =
-                        hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
-
-                idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
-                dma = umem_odp->dma_list[idx];
+                u64 offset = addr - ib_umem_start(umem_odp);
+                size_t idx = offset >> umem_odp->page_shift;
+                unsigned long pfn = umem_odp->map.pfn_list[idx];
 
-                if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID))
-                        goto clear;
-                if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_DMA_MAPPED))
+                if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
                         goto clear;
 
-                ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift),
-                                  DMA_BIDIRECTIONAL);
-                if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) {
+                if (pfn & HMM_PFN_WRITE) {
+                        struct page *page = hmm_pfn_to_page(pfn);
                         struct page *head_page = compound_head(page);
                         /*
                          * set_page_dirty prefers being called with
@@ -494,7 +432,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                 }
                 umem_odp->npages--;
 clear:
-                umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_FLAGS;
+                umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
         }
 }
 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 7 additions & 4 deletions
@@ -1470,8 +1470,8 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
 int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev);
-void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
-                           struct mlx5_ib_mr *mr, int flags);
+int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+                          struct mlx5_ib_mr *mr, int flags);
 
 int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
                                enum ib_uverbs_advise_mr_advice advice,
@@ -1492,8 +1492,11 @@ static inline int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
 {
         return 0;
 }
-static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
-                                         struct mlx5_ib_mr *mr, int flags) {}
+static inline int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+                                        struct mlx5_ib_mr *mr, int flags)
+{
+        return -EOPNOTSUPP;
+}
 
 static inline int
 mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,

drivers/infiniband/hw/mlx5/odp.c

Lines changed: 26 additions & 14 deletions
@@ -35,6 +35,8 @@
 #include <linux/dma-buf.h>
 #include <linux/dma-resv.h>
 #include <linux/hmm.h>
+#include <linux/hmm-dma.h>
+#include <linux/pci-p2pdma.h>
 
 #include "mlx5_ib.h"
 #include "cmd.h"
@@ -159,40 +161,50 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
         }
 }
 
-static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
-                         struct mlx5_ib_mr *mr, int flags)
+static int populate_mtt(__be64 *pas, size_t start, size_t nentries,
+                        struct mlx5_ib_mr *mr, int flags)
 {
         struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
         bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
-        unsigned long pfn;
-        dma_addr_t pa;
+        struct pci_p2pdma_map_state p2pdma_state = {};
+        struct ib_device *dev = odp->umem.ibdev;
         size_t i;
 
         if (flags & MLX5_IB_UPD_XLT_ZAP)
-                return;
+                return 0;
 
         for (i = 0; i < nentries; i++) {
-                pfn = odp->pfn_list[idx + i];
+                unsigned long pfn = odp->map.pfn_list[start + i];
+                dma_addr_t dma_addr;
+
+                pfn = odp->map.pfn_list[start + i];
                 if (!(pfn & HMM_PFN_VALID))
                         /* ODP initialization */
                         continue;
 
-                pa = odp->dma_list[idx + i];
-                pa |= MLX5_IB_MTT_READ;
+                dma_addr = hmm_dma_map_pfn(dev->dma_device, &odp->map,
                                            start + i, &p2pdma_state);
+                if (ib_dma_mapping_error(dev, dma_addr))
+                        return -EFAULT;
+
+                dma_addr |= MLX5_IB_MTT_READ;
                 if ((pfn & HMM_PFN_WRITE) && !downgrade)
-                        pa |= MLX5_IB_MTT_WRITE;
+                        dma_addr |= MLX5_IB_MTT_WRITE;
 
-                pas[i] = cpu_to_be64(pa);
+                pas[i] = cpu_to_be64(dma_addr);
+                odp->npages++;
         }
+        return 0;
 }
 
-void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
-                           struct mlx5_ib_mr *mr, int flags)
+int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+                          struct mlx5_ib_mr *mr, int flags)
 {
         if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
                 populate_klm(xlt, idx, nentries, mr, flags);
+                return 0;
         } else {
-                populate_mtt(xlt, idx, nentries, mr, flags);
+                return populate_mtt(xlt, idx, nentries, mr, flags);
         }
 }
 
@@ -303,7 +315,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
                  * estimate the cost of another UMR vs. the cost of bigger
                  * UMR.
                  */
-                if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) {
+                if (umem_odp->map.pfn_list[idx] & HMM_PFN_VALID) {
                         if (!in_block) {
                                 blk_start_idx = idx;
                                 in_block = 1;

drivers/infiniband/hw/mlx5/umr.c

Lines changed: 11 additions & 1 deletion
@@ -840,7 +840,17 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
                 size_to_map = npages * desc_size;
                 dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
                                         DMA_TO_DEVICE);
-                mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+                /*
+                 * npages is the maximum number of pages to map, but we
+                 * can't guarantee that all pages are actually mapped.
+                 *
+                 * For example, if page is p2p of type which is not supported
+                 * for mapping, the number of pages mapped will be less than
+                 * requested.
+                 */
+                err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+                if (err)
+                        return err;
                 dma_sync_single_for_device(ddev, sg.addr, sg.length,
                                            DMA_TO_DEVICE);
                 sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);

include/rdma/ib_umem_odp.h

Lines changed: 3 additions & 10 deletions
@@ -8,24 +8,17 @@
 
 #include <rdma/ib_umem.h>
 #include <rdma/ib_verbs.h>
-#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
 
 struct ib_umem_odp {
         struct ib_umem umem;
         struct mmu_interval_notifier notifier;
         struct pid *tgid;
 
-        /* An array of the pfns included in the on-demand paging umem. */
-        unsigned long *pfn_list;
+        struct hmm_dma_map map;
 
         /*
-         * An array with DMA addresses mapped for pfns in pfn_list.
-         * The lower two bits designate access permissions.
-         * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT.
-         */
-        dma_addr_t *dma_list;
-        /*
-         * The umem_mutex protects the page_list and dma_list fields of an ODP
+         * The umem_mutex protects the page_list field of an ODP
          * umem, allowing only a single thread to map/unmap pages. The mutex
          * also protects access to the mmu notifier counters.
          */
