
Commit 5dce9ba

RDMA/umem: Store ODP access mask information in PFN
JIRA: https://issues.redhat.com/browse/RHEL-110100
Conflicts: Drop the rdma_rxe hunk, the driver is unmaintained.

commit eedd5b1
Author: Leon Romanovsky <leon@kernel.org>
Date:   Mon Apr 28 12:22:18 2025 +0300

    RDMA/umem: Store ODP access mask information in PFN

    As a preparation to remove dma_list, store access mask in PFN pointer
    and not in dma_addr_t.

    Tested-by: Jens Axboe <axboe@kernel.dk>
    Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
    Signed-off-by: Leon Romanovsky <leonro@nvidia.com>

Signed-off-by: Kamal Heib <kheib@redhat.com>
1 parent 2aa81e5 commit 5dce9ba
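
The underlying change is an encoding switch: the old code stole the two low bits of each dma_addr_t in dma_list for ODP_READ_ALLOWED_BIT / ODP_WRITE_ALLOWED_BIT, while the new code keeps access and mapping state in the hmm pfn entries of pfn_list (HMM_PFN_VALID, HMM_PFN_WRITE, HMM_PFN_DMA_MAPPED) and stores the DMA address unmasked. The following is a minimal userspace sketch of that before/after encoding, not kernel code; the PFN_* bit positions below are mock values chosen for the illustration, not the kernel's actual HMM_PFN_* layout.

/*
 * Illustrative userspace sketch only. The ODP_* constants mirror the
 * definitions removed from ib_umem_odp.h; the PFN_* flags are mock
 * stand-ins for the HMM_PFN_* bits from <linux/hmm.h>.
 */
#include <stdint.h>
#include <stdio.h>

/* old encoding: access flags live in the low bits of the DMA address */
#define ODP_READ_ALLOWED_BIT  (1ULL << 0)
#define ODP_WRITE_ALLOWED_BIT (1ULL << 1)
#define ODP_DMA_ADDR_MASK     (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))

/* new encoding: flags live in the pfn entry (mock bit positions) */
#define PFN_VALID      (1ULL << 63)
#define PFN_WRITE      (1ULL << 62)
#define PFN_DMA_MAPPED (1ULL << 61)

int main(void)
{
	/* old scheme: one word carries both the address and the permissions */
	uint64_t dma_old = 0x100000ULL | ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT;

	/* new scheme: the DMA address stays clean, state rides in the pfn entry */
	uint64_t dma_new = 0x100000ULL;
	uint64_t pfn_new = 0x1234ULL | PFN_VALID | PFN_WRITE | PFN_DMA_MAPPED;

	printf("old: dma=0x%llx writable=%d\n",
	       (unsigned long long)(dma_old & ODP_DMA_ADDR_MASK),
	       (int)!!(dma_old & ODP_WRITE_ALLOWED_BIT));
	printf("new: dma=0x%llx writable=%d mapped=%d\n",
	       (unsigned long long)dma_new,
	       (int)!!(pfn_new & PFN_WRITE),
	       (int)!!(pfn_new & PFN_DMA_MAPPED));
	return 0;
}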

File tree

4 files changed: +64, -91 lines


drivers/infiniband/core/umem_odp.c

Lines changed: 43 additions & 60 deletions

@@ -298,30 +298,18 @@ EXPORT_SYMBOL(ib_umem_odp_release);
 static int ib_umem_odp_map_dma_single_page(
 		struct ib_umem_odp *umem_odp,
 		unsigned int dma_index,
-		struct page *page,
-		u64 access_mask)
+		struct page *page)
 {
 	struct ib_device *dev = umem_odp->umem.ibdev;
 	dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
 
-	if (*dma_addr) {
-		/*
-		 * If the page is already dma mapped it means it went through
-		 * a non-invalidating trasition, like read-only to writable.
-		 * Resync the flags.
-		 */
-		*dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
-		return 0;
-	}
-
 	*dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
 				    DMA_BIDIRECTIONAL);
 	if (ib_dma_mapping_error(dev, *dma_addr)) {
 		*dma_addr = 0;
 		return -EFAULT;
 	}
 	umem_odp->npages++;
-	*dma_addr |= access_mask;
 	return 0;
 }
 
@@ -357,9 +345,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
 	struct hmm_range range = {};
 	unsigned long timeout;
 
-	if (access_mask == 0)
-		return -EINVAL;
-
 	if (user_virt < ib_umem_start(umem_odp) ||
 	    user_virt + bcnt > ib_umem_end(umem_odp))
 		return -EFAULT;
@@ -385,7 +370,7 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
 	if (fault) {
 		range.default_flags = HMM_PFN_REQ_FAULT;
 
-		if (access_mask & ODP_WRITE_ALLOWED_BIT)
+		if (access_mask & HMM_PFN_WRITE)
 			range.default_flags |= HMM_PFN_REQ_WRITE;
 	}
 
@@ -417,22 +402,17 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
 	for (pfn_index = 0; pfn_index < num_pfns;
 	     pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
 
-		if (fault) {
-			/*
-			 * Since we asked for hmm_range_fault() to populate
-			 * pages it shouldn't return an error entry on success.
-			 */
-			WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
-			WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
-		} else {
-			if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
-				WARN_ON(umem_odp->dma_list[dma_index]);
-				continue;
-			}
-			access_mask = ODP_READ_ALLOWED_BIT;
-			if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
-				access_mask |= ODP_WRITE_ALLOWED_BIT;
-		}
+		/*
+		 * Since we asked for hmm_range_fault() to populate
+		 * pages it shouldn't return an error entry on success.
+		 */
+		WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
+		WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
+		if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
+			continue;
+
+		if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
+			continue;
 
 		hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
 		/* If a hugepage was detected and ODP wasn't set for, the umem
@@ -447,13 +427,14 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
 		}
 
 		ret = ib_umem_odp_map_dma_single_page(
-			umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
-			access_mask);
+			umem_odp, dma_index,
+			hmm_pfn_to_page(range.hmm_pfns[pfn_index]));
 		if (ret < 0) {
 			ibdev_dbg(umem_odp->umem.ibdev,
 				  "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
 			break;
 		}
+		range.hmm_pfns[pfn_index] |= HMM_PFN_DMA_MAPPED;
 	}
 	/* upon success lock should stay on hold for the callee */
 	if (!ret)
@@ -473,7 +454,6 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 				 u64 bound)
 {
-	dma_addr_t dma_addr;
 	dma_addr_t dma;
 	int idx;
 	u64 addr;
@@ -484,34 +464,37 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 	virt = max_t(u64, virt, ib_umem_start(umem_odp));
 	bound = min_t(u64, bound, ib_umem_end(umem_odp));
 	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
+		unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >>
+					PAGE_SHIFT;
+		struct page *page =
+			hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
+
 		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 		dma = umem_odp->dma_list[idx];
 
-		/* The access flags guaranteed a valid DMA address in case was NULL */
-		if (dma) {
-			unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
-			struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
-
-			dma_addr = dma & ODP_DMA_ADDR_MASK;
-			ib_dma_unmap_page(dev, dma_addr,
-					  BIT(umem_odp->page_shift),
-					  DMA_BIDIRECTIONAL);
-			if (dma & ODP_WRITE_ALLOWED_BIT) {
-				struct page *head_page = compound_head(page);
-				/*
-				 * set_page_dirty prefers being called with
-				 * the page lock. However, MMU notifiers are
-				 * called sometimes with and sometimes without
-				 * the lock. We rely on the umem_mutex instead
-				 * to prevent other mmu notifiers from
-				 * continuing and allowing the page mapping to
-				 * be removed.
-				 */
-				set_page_dirty(head_page);
-			}
-			umem_odp->dma_list[idx] = 0;
-			umem_odp->npages--;
+		if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID))
+			goto clear;
+		if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_DMA_MAPPED))
+			goto clear;
+
+		ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift),
+				  DMA_BIDIRECTIONAL);
+		if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) {
+			struct page *head_page = compound_head(page);
+			/*
+			 * set_page_dirty prefers being called with
+			 * the page lock. However, MMU notifiers are
+			 * called sometimes with and sometimes without
+			 * the lock. We rely on the umem_mutex instead
+			 * to prevent other mmu notifiers from
+			 * continuing and allowing the page mapping to
+			 * be removed.
+			 */
+			set_page_dirty(head_page);
 		}
+		umem_odp->npages--;
+clear:
+		umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_FLAGS;
 	}
 }
 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 1 addition & 0 deletions

@@ -347,6 +347,7 @@ struct mlx5_ib_flow_db {
 #define MLX5_IB_UPD_XLT_PD	      BIT(4)
 #define MLX5_IB_UPD_XLT_ACCESS	      BIT(5)
 #define MLX5_IB_UPD_XLT_INDIRECT      BIT(6)
+#define MLX5_IB_UPD_XLT_DOWNGRADE     BIT(7)
 
 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
  *

drivers/infiniband/hw/mlx5/odp.c

Lines changed: 19 additions & 18 deletions

@@ -34,6 +34,7 @@
 #include <linux/kernel.h>
 #include <linux/dma-buf.h>
 #include <linux/dma-resv.h>
+#include <linux/hmm.h>
 
 #include "mlx5_ib.h"
 #include "cmd.h"
@@ -158,31 +159,30 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 	}
 }
 
-static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
-{
-	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
-
-	if (umem_dma & ODP_READ_ALLOWED_BIT)
-		mtt_entry |= MLX5_IB_MTT_READ;
-	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
-		mtt_entry |= MLX5_IB_MTT_WRITE;
-
-	return mtt_entry;
-}
-
 static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
 			 struct mlx5_ib_mr *mr, int flags)
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
+	unsigned long pfn;
 	dma_addr_t pa;
 	size_t i;
 
 	if (flags & MLX5_IB_UPD_XLT_ZAP)
 		return;
 
 	for (i = 0; i < nentries; i++) {
+		pfn = odp->pfn_list[idx + i];
+		if (!(pfn & HMM_PFN_VALID))
+			/* ODP initialization */
+			continue;
+
 		pa = odp->dma_list[idx + i];
-		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+		pa |= MLX5_IB_MTT_READ;
+		if ((pfn & HMM_PFN_WRITE) && !downgrade)
+			pa |= MLX5_IB_MTT_WRITE;
+
+		pas[i] = cpu_to_be64(pa);
 	}
 }
 
@@ -303,8 +303,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
 		 * estimate the cost of another UMR vs. the cost of bigger
 		 * UMR.
 		 */
-		if (umem_odp->dma_list[idx] &
-		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+		if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) {
 			if (!in_block) {
 				blk_start_idx = idx;
 				in_block = 1;
@@ -687,20 +686,22 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 {
 	int page_shift, ret, np;
 	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
-	u64 access_mask;
+	u64 access_mask = 0;
 	u64 start_idx;
 	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
 	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
 
 	if (flags & MLX5_PF_FLAGS_ENABLE)
 		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
 
+	if (flags & MLX5_PF_FLAGS_DOWNGRADE)
+		xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE;
+
 	page_shift = odp->page_shift;
 	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
-	access_mask = ODP_READ_ALLOWED_BIT;
 
 	if (odp->umem.writable && !downgrade)
-		access_mask |= ODP_WRITE_ALLOWED_BIT;
+		access_mask |= HMM_PFN_WRITE;
 
 	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
 	if (np < 0)

include/rdma/ib_umem_odp.h

Lines changed: 1 addition & 13 deletions

@@ -8,6 +8,7 @@
 
 #include <rdma/ib_umem.h>
 #include <rdma/ib_verbs.h>
+#include <linux/hmm.h>
 
 struct ib_umem_odp {
 	struct ib_umem umem;
@@ -67,19 +68,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
 	       umem_odp->page_shift;
 }
 
-/*
- * The lower 2 bits of the DMA address signal the R/W permissions for
- * the entry. To upgrade the permissions, provide the appropriate
- * bitmask to the map_dma_pages function.
- *
- * Be aware that upgrading a mapped address might result in change of
- * the DMA address for the page.
- */
-#define ODP_READ_ALLOWED_BIT  (1<<0ULL)
-#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
-
-#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
-
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 
 struct ib_umem_odp *

0 commit comments
