Skip to content

Commit ee8da65

Browse files
committed
RDMA/mlx5: Enable ATS support for MRs and umems
Bugzilla: https://bugzilla.redhat.com/2123401

commit 72b2f76
Author: Jason Gunthorpe <jgg@ziepe.ca>
Date:   Thu Sep 1 11:20:56 2022 -0300

    RDMA/mlx5: Enable ATS support for MRs and umems

    For mlx5, if ATS is enabled in the PCI config, then the device will use
    ATS requests for only certain DMA operations. This has to be opted in
    by the SW side based on the mkey or umem settings.

    ATS slows down the PCI performance, so it should only be set in cases
    when it is needed. All of these cases revolve around optimizing PCI P2P
    transfers and avoiding bad cases where the bus just doesn't work.

    Link: https://lore.kernel.org/r/4-v1-bd147097458e+ede-umem_dmabuf_jgg@nvidia.com
    Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

Signed-off-by: Kamal Heib <kheib@redhat.com>
1 parent 7cdfecb commit ee8da65

File tree

3 files changed

+61
-17
lines changed

3 files changed

+61
-17
lines changed

drivers/infiniband/hw/mlx5/devx.c

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2158,26 +2158,17 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_SUBSCRIBE_EVENT)(
21582158

21592159
static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
21602160
struct uverbs_attr_bundle *attrs,
2161-
struct devx_umem *obj)
2161+
struct devx_umem *obj, u32 access_flags)
21622162
{
21632163
u64 addr;
21642164
size_t size;
2165-
u32 access;
21662165
int err;
21672166

21682167
if (uverbs_copy_from(&addr, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR) ||
21692168
uverbs_copy_from(&size, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_LEN))
21702169
return -EFAULT;
21712170

2172-
err = uverbs_get_flags32(&access, attrs,
2173-
MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS,
2174-
IB_ACCESS_LOCAL_WRITE |
2175-
IB_ACCESS_REMOTE_WRITE |
2176-
IB_ACCESS_REMOTE_READ);
2177-
if (err)
2178-
return err;
2179-
2180-
err = ib_check_mr_access(&dev->ib_dev, access);
2171+
err = ib_check_mr_access(&dev->ib_dev, access_flags);
21812172
if (err)
21822173
return err;
21832174

@@ -2191,12 +2182,12 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
21912182
return -EFAULT;
21922183

21932184
umem_dmabuf = ib_umem_dmabuf_get_pinned(
2194-
&dev->ib_dev, addr, size, dmabuf_fd, access);
2185+
&dev->ib_dev, addr, size, dmabuf_fd, access_flags);
21952186
if (IS_ERR(umem_dmabuf))
21962187
return PTR_ERR(umem_dmabuf);
21972188
obj->umem = &umem_dmabuf->umem;
21982189
} else {
2199-
obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access);
2190+
obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access_flags);
22002191
if (IS_ERR(obj->umem))
22012192
return PTR_ERR(obj->umem);
22022193
}
@@ -2238,7 +2229,8 @@ static unsigned int devx_umem_find_best_pgsize(struct ib_umem *umem,
22382229
static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev,
22392230
struct uverbs_attr_bundle *attrs,
22402231
struct devx_umem *obj,
2241-
struct devx_umem_reg_cmd *cmd)
2232+
struct devx_umem_reg_cmd *cmd,
2233+
int access)
22422234
{
22432235
unsigned long pgsz_bitmap;
22442236
unsigned int page_size;
@@ -2287,6 +2279,9 @@ static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev,
22872279
MLX5_SET(umem, umem, page_offset,
22882280
ib_umem_dma_offset(obj->umem, page_size));
22892281

2282+
if (mlx5_umem_needs_ats(dev, obj->umem, access))
2283+
MLX5_SET(umem, umem, ats, 1);
2284+
22902285
mlx5_ib_populate_pas(obj->umem, page_size, mtt,
22912286
(obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) |
22922287
MLX5_IB_MTT_READ);
@@ -2304,20 +2299,30 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
23042299
struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context(
23052300
&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
23062301
struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device);
2302+
int access_flags;
23072303
int err;
23082304

23092305
if (!c->devx_uid)
23102306
return -EINVAL;
23112307

2308+
err = uverbs_get_flags32(&access_flags, attrs,
2309+
MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS,
2310+
IB_ACCESS_LOCAL_WRITE |
2311+
IB_ACCESS_REMOTE_WRITE |
2312+
IB_ACCESS_REMOTE_READ |
2313+
IB_ACCESS_RELAXED_ORDERING);
2314+
if (err)
2315+
return err;
2316+
23122317
obj = kzalloc(sizeof(struct devx_umem), GFP_KERNEL);
23132318
if (!obj)
23142319
return -ENOMEM;
23152320

2316-
err = devx_umem_get(dev, &c->ibucontext, attrs, obj);
2321+
err = devx_umem_get(dev, &c->ibucontext, attrs, obj, access_flags);
23172322
if (err)
23182323
goto err_obj_free;
23192324

2320-
err = devx_umem_reg_cmd_alloc(dev, attrs, obj, &cmd);
2325+
err = devx_umem_reg_cmd_alloc(dev, attrs, obj, &cmd, access_flags);
23212326
if (err)
23222327
goto err_umem_release;
23232328

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1549,4 +1549,40 @@ static inline bool rt_supported(int ts_cap)
15491549
return ts_cap == MLX5_TIMESTAMP_FORMAT_CAP_REAL_TIME ||
15501550
ts_cap == MLX5_TIMESTAMP_FORMAT_CAP_FREE_RUNNING_AND_REAL_TIME;
15511551
}
1552+
1553+
/*
1554+
* PCI Peer to Peer is a trainwreck. If no switch is present then things
1555+
* sometimes work, depending on the pci_distance_p2p logic for excluding broken
1556+
* root complexes. However if a switch is present in the path, then things get
1557+
* really ugly depending on how the switch is setup. This table assumes that the
1558+
* root complex is strict and is validating that all req/reps are matches
1559+
* perfectly - so any scenario where it sees only half the transaction is a
1560+
* failure.
1561+
*
1562+
* CR/RR/DT ATS RO P2P
1563+
* 00X X X OK
1564+
* 010 X X fails (request is routed to root but root never sees comp)
1565+
* 011 0 X fails (request is routed to root but root never sees comp)
1566+
* 011 1 X OK
1567+
* 10X X 1 OK
1568+
* 101 X 0 fails (completion is routed to root but root didn't see req)
1569+
* 110 X 0 SLOW
1570+
* 111 0 0 SLOW
1571+
* 111 1 0 fails (completion is routed to root but root didn't see req)
1572+
* 111 1 1 OK
1573+
*
1574+
* Unfortunately we cannot reliably know if a switch is present or what the
1575+
* CR/RR/DT ACS settings are, as in a VM that is all hidden. Assume that
1576+
* CR/RR/DT is 111 if the ATS cap is enabled and follow the last three rows.
1577+
*
1578+
* For now assume if the umem is a dma_buf then it is P2P.
1579+
*/
1580+
static inline bool mlx5_umem_needs_ats(struct mlx5_ib_dev *dev,
1581+
struct ib_umem *umem, int access_flags)
1582+
{
1583+
if (!MLX5_CAP_GEN(dev->mdev, ats) || !umem->is_dmabuf)
1584+
return false;
1585+
return access_flags & IB_ACCESS_RELAXED_ORDERING;
1586+
}
1587+
15521588
#endif /* MLX5_IB_H */

drivers/infiniband/hw/mlx5/mr.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -895,7 +895,8 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
895895
* cache then synchronously create an uncached one.
896896
*/
897897
if (!ent || ent->limit == 0 ||
898-
!mlx5r_umr_can_reconfig(dev, 0, access_flags)) {
898+
!mlx5r_umr_can_reconfig(dev, 0, access_flags) ||
899+
mlx5_umem_needs_ats(dev, umem, access_flags)) {
899900
mutex_lock(&dev->slow_path_mutex);
900901
mr = reg_create(pd, umem, iova, access_flags, page_size, false);
901902
mutex_unlock(&dev->slow_path_mutex);
@@ -976,6 +977,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
976977
MLX5_SET(mkc, mkc, translations_octword_size,
977978
get_octo_len(iova, umem->length, mr->page_shift));
978979
MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
980+
if (mlx5_umem_needs_ats(dev, umem, access_flags))
981+
MLX5_SET(mkc, mkc, ma_translation_mode, 1);
979982
if (populate) {
980983
MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
981984
get_octo_len(iova, umem->length, mr->page_shift));

0 commit comments

Comments
 (0)