Skip to content

Commit 9de1daf

Browse files
author
Herton R. Krzesinski
committed
Merge: RDMA: Add support of RDMA dmabuf for mlx5 driver
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/1940
Upstream status: v6.1.
Bugzilla: https://bugzilla.redhat.com/2123401
Tested: cuda pyverbs tests passed.

Add support for DMABUF FDs when creating a devx umem in the RDMA mlx5 driver. This allows applications to create work queues directly on GPU memory where the GPU fully controls the data flow out of the RDMA NIC.

Signed-off-by: Kamal Heib <kheib@redhat.com>
Approved-by: Íñigo Huguet <ihuguet@redhat.com>
Approved-by: José Ignacio Tornos Martínez <jtornosm@redhat.com>
Approved-by: Jonathan Toppins <jtoppins@redhat.com>
Signed-off-by: Herton R. Krzesinski <herton@redhat.com>
2 parents ec26642 + ee8da65 commit 9de1daf

File tree

7 files changed

+109
-20
lines changed

7 files changed

+109
-20
lines changed

drivers/infiniband/core/uverbs_ioctl.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,14 @@ static int uverbs_process_attr(struct bundle_priv *pbundle,
337337

338338
break;
339339

340+
case UVERBS_ATTR_TYPE_RAW_FD:
341+
if (uattr->attr_data.reserved || uattr->len != 0 ||
342+
uattr->data_s64 < INT_MIN || uattr->data_s64 > INT_MAX)
343+
return -EINVAL;
344+
/* _uverbs_get_const_signed() is the accessor */
345+
e->ptr_attr.data = uattr->data_s64;
346+
break;
347+
340348
case UVERBS_ATTR_TYPE_IDRS_ARRAY:
341349
return uverbs_process_idrs_array(pbundle, attr_uapi,
342350
&e->objs_arr_attr, uattr,

drivers/infiniband/hw/mlx5/devx.c

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2158,32 +2158,39 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)(
21582158

21592159
static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
21602160
struct uverbs_attr_bundle *attrs,
2161-
struct devx_umem *obj)
2161+
struct devx_umem *obj, u32 access_flags)
21622162
{
21632163
u64 addr;
21642164
size_t size;
2165-
u32 access;
21662165
int err;
21672166

21682167
if (uverbs_copy_from(&addr, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR) ||
21692168
uverbs_copy_from(&size, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_LEN))
21702169
return -EFAULT;
21712170

2172-
err = uverbs_get_flags32(&access, attrs,
2173-
MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS,
2174-
IB_ACCESS_LOCAL_WRITE |
2175-
IB_ACCESS_REMOTE_WRITE |
2176-
IB_ACCESS_REMOTE_READ);
2171+
err = ib_check_mr_access(&dev->ib_dev, access_flags);
21772172
if (err)
21782173
return err;
21792174

2180-
err = ib_check_mr_access(&dev->ib_dev, access);
2181-
if (err)
2182-
return err;
2175+
if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_DMABUF_FD)) {
2176+
struct ib_umem_dmabuf *umem_dmabuf;
2177+
int dmabuf_fd;
2178+
2179+
err = uverbs_get_raw_fd(&dmabuf_fd, attrs,
2180+
MLX5_IB_ATTR_DEVX_UMEM_REG_DMABUF_FD);
2181+
if (err)
2182+
return -EFAULT;
21832183

2184-
obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access);
2185-
if (IS_ERR(obj->umem))
2186-
return PTR_ERR(obj->umem);
2184+
umem_dmabuf = ib_umem_dmabuf_get_pinned(
2185+
&dev->ib_dev, addr, size, dmabuf_fd, access_flags);
2186+
if (IS_ERR(umem_dmabuf))
2187+
return PTR_ERR(umem_dmabuf);
2188+
obj->umem = &umem_dmabuf->umem;
2189+
} else {
2190+
obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access_flags);
2191+
if (IS_ERR(obj->umem))
2192+
return PTR_ERR(obj->umem);
2193+
}
21872194
return 0;
21882195
}
21892196

@@ -2222,7 +2229,8 @@ static unsigned int devx_umem_find_best_pgsize(struct ib_umem *umem,
22222229
static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev,
22232230
struct uverbs_attr_bundle *attrs,
22242231
struct devx_umem *obj,
2225-
struct devx_umem_reg_cmd *cmd)
2232+
struct devx_umem_reg_cmd *cmd,
2233+
int access)
22262234
{
22272235
unsigned long pgsz_bitmap;
22282236
unsigned int page_size;
@@ -2271,6 +2279,9 @@ static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev,
22712279
MLX5_SET(umem, umem, page_offset,
22722280
ib_umem_dma_offset(obj->umem, page_size));
22732281

2282+
if (mlx5_umem_needs_ats(dev, obj->umem, access))
2283+
MLX5_SET(umem, umem, ats, 1);
2284+
22742285
mlx5_ib_populate_pas(obj->umem, page_size, mtt,
22752286
(obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) |
22762287
MLX5_IB_MTT_READ);
@@ -2288,20 +2299,30 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
22882299
struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
22892300
&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
22902301
struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
2302+
int access_flags;
22912303
int err;
22922304

22932305
if (!c->devx_uid)
22942306
return -EINVAL;
22952307

2308+
err = uverbs_get_flags32(&access_flags, attrs,
2309+
MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS,
2310+
IB_ACCESS_LOCAL_WRITE |
2311+
IB_ACCESS_REMOTE_WRITE |
2312+
IB_ACCESS_REMOTE_READ |
2313+
IB_ACCESS_RELAXED_ORDERING);
2314+
if (err)
2315+
return err;
2316+
22962317
obj = kzalloc(sizeof(struct devx_umem), GFP_KERNEL);
22972318
if (!obj)
22982319
return -ENOMEM;
22992320

2300-
err = devx_umem_get(dev, &c->ibucontext, attrs, obj);
2321+
err = devx_umem_get(dev, &c->ibucontext, attrs, obj, access_flags);
23012322
if (err)
23022323
goto err_obj_free;
23032324

2304-
err = devx_umem_reg_cmd_alloc(dev, attrs, obj, &cmd);
2325+
err = devx_umem_reg_cmd_alloc(dev, attrs, obj, &cmd, access_flags);
23052326
if (err)
23062327
goto err_umem_release;
23072328

@@ -2833,6 +2854,8 @@ DECLARE_UVERBS_NAMED_METHOD(
28332854
UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_LEN,
28342855
UVERBS_ATTR_TYPE(u64),
28352856
UA_MANDATORY),
2857+
UVERBS_ATTR_RAW_FD(MLX5_IB_ATTR_DEVX_UMEM_REG_DMABUF_FD,
2858+
UA_OPTIONAL),
28362859
UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS,
28372860
enum ib_access_flags),
28382861
UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP,

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1551,4 +1551,40 @@ static inline bool rt_supported(int ts_cap)
15511551
return ts_cap == MLX5_TIMESTAMP_FORMAT_CAP_REAL_TIME ||
15521552
ts_cap == MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME;
15531553
}
1554+
1555+
/*
1556+
* PCI Peer to Peer is a trainwreck. If no switch is present then things
1557+
* sometimes work, depending on the pci_distance_p2p logic for excluding broken
1558+
* root complexes. However if a switch is present in the path, then things get
1559+
* really ugly depending on how the switch is set up. This table assumes that the
1560+
* root complex is strict and is validating that all req/reps are matched
1561+
* perfectly - so any scenario where it sees only half the transaction is a
1562+
* failure.
1563+
*
1564+
* CR/RR/DT ATS RO P2P
1565+
* 00X X X OK
1566+
* 010 X X fails (request is routed to root but root never sees comp)
1567+
* 011 0 X fails (request is routed to root but root never sees comp)
1568+
* 011 1 X OK
1569+
* 10X X 1 OK
1570+
* 101 X 0 fails (completion is routed to root but root didn't see req)
1571+
* 110 X 0 SLOW
1572+
* 111 0 0 SLOW
1573+
* 111 1 0 fails (completion is routed to root but root didn't see req)
1574+
* 111 1 1 OK
1575+
*
1576+
* Unfortunately we cannot reliably know if a switch is present or what the
1577+
* CR/RR/DT ACS settings are, as in a VM that is all hidden. Assume that
1578+
* CR/RR/DT is 111 if the ATS cap is enabled and follow the last three rows.
1579+
*
1580+
* For now assume if the umem is a dma_buf then it is P2P.
1581+
*/
1582+
static inline bool mlx5_umem_needs_ats(struct mlx5_ib_dev *dev,
1583+
struct ib_umem *umem, int access_flags)
1584+
{
1585+
if (!MLX5_CAP_GEN(dev->mdev, ats) || !umem->is_dmabuf)
1586+
return false;
1587+
return access_flags & IB_ACCESS_RELAXED_ORDERING;
1588+
}
1589+
15541590
#endif /* MLX5_IB_H */

drivers/infiniband/hw/mlx5/mr.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -937,7 +937,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
937937
* cache then synchronously create an uncached one.
938938
*/
939939
if (!ent || ent->limit == 0 ||
940-
!mlx5r_umr_can_reconfig(dev, 0, access_flags)) {
940+
!mlx5r_umr_can_reconfig(dev, 0, access_flags) ||
941+
mlx5_umem_needs_ats(dev, umem, access_flags)) {
941942
mutex_lock(&dev->slow_path_mutex);
942943
mr = reg_create(pd, umem, iova, access_flags, page_size, false);
943944
mutex_unlock(&dev->slow_path_mutex);
@@ -1018,6 +1019,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
10181019
MLX5_SET(mkc, mkc, translations_octword_size,
10191020
get_octo_len(iova, umem->length, mr->page_shift));
10201021
MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1022+
if (mlx5_umem_needs_ats(dev, umem, access_flags))
1023+
MLX5_SET(mkc, mkc, ma_translation_mode, 1);
10211024
if (populate) {
10221025
MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
10231026
get_octo_len(iova, umem->length, mr->page_shift));

include/linux/mlx5/mlx5_ifc.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1707,7 +1707,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
17071707
u8 steering_format_version[0x4];
17081708
u8 create_qp_start_hint[0x18];
17091709

1710-
u8 reserved_at_460[0x3];
1710+
u8 reserved_at_460[0x1];
1711+
u8 ats[0x1];
1712+
u8 reserved_at_462[0x1];
17111713
u8 log_max_uctx[0x5];
17121714
u8 reserved_at_468[0x2];
17131715
u8 ipsec_offload[0x1];
@@ -3873,7 +3875,9 @@ struct mlx5_ifc_mkc_bits {
38733875
u8 lw[0x1];
38743876
u8 lr[0x1];
38753877
u8 access_mode_1_0[0x2];
3876-
u8 reserved_at_18[0x8];
3878+
u8 reserved_at_18[0x2];
3879+
u8 ma_translation_mode[0x2];
3880+
u8 reserved_at_1c[0x4];
38773881

38783882
u8 qpn[0x18];
38793883
u8 mkey_7_0[0x8];
@@ -11134,7 +11138,8 @@ struct mlx5_ifc_dealloc_memic_out_bits {
1113411138
struct mlx5_ifc_umem_bits {
1113511139
u8 reserved_at_0[0x80];
1113611140

11137-
u8 reserved_at_80[0x1b];
11141+
u8 ats[0x1];
11142+
u8 reserved_at_81[0x1a];
1113811143
u8 log_page_size[0x5];
1113911144

1114011145
u8 page_offset[0x20];

include/rdma/uverbs_ioctl.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ enum uverbs_attr_type {
2424
UVERBS_ATTR_TYPE_PTR_OUT,
2525
UVERBS_ATTR_TYPE_IDR,
2626
UVERBS_ATTR_TYPE_FD,
27+
UVERBS_ATTR_TYPE_RAW_FD,
2728
UVERBS_ATTR_TYPE_ENUM_IN,
2829
UVERBS_ATTR_TYPE_IDRS_ARRAY,
2930
};
@@ -521,6 +522,11 @@ struct uapi_definition {
521522
.u.obj.access = _access, \
522523
__VA_ARGS__ } })
523524

525+
#define UVERBS_ATTR_RAW_FD(_attr_id, ...) \
526+
(&(const struct uverbs_attr_def){ \
527+
.id = (_attr_id), \
528+
.attr = { .type = UVERBS_ATTR_TYPE_RAW_FD, __VA_ARGS__ } })
529+
524530
#define UVERBS_ATTR_PTR_IN(_attr_id, _type, ...) \
525531
(&(const struct uverbs_attr_def){ \
526532
.id = _attr_id, \
@@ -999,4 +1005,11 @@ _uverbs_get_const_unsigned(u64 *to,
9991005
uverbs_get_const_default_unsigned(_to, _attrs_bundle, _idx, \
10001006
_default))
10011007

1008+
static inline int
1009+
uverbs_get_raw_fd(int *to, const struct uverbs_attr_bundle *attrs_bundle,
1010+
size_t idx)
1011+
{
1012+
return uverbs_get_const_signed(to, attrs_bundle, idx);
1013+
}
1014+
10021015
#endif

include/uapi/rdma/mlx5_user_ioctl_cmds.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ enum mlx5_ib_devx_umem_reg_attrs {
174174
MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS,
175175
MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID,
176176
MLX5_IB_ATTR_DEVX_UMEM_REG_PGSZ_BITMAP,
177+
MLX5_IB_ATTR_DEVX_UMEM_REG_DMABUF_FD,
177178
};
178179

179180
enum mlx5_ib_devx_umem_dereg_attrs {

0 commit comments

Comments
 (0)