Skip to content

Commit 29b595f

Browse files
author
CKI Backport Bot
committed
RDMA/mlx5: Move events notifier registration to be after device registration
JIRA: https://issues.redhat.com/browse/RHEL-72349
CVE: CVE-2024-53224

commit ede132a
Author: Patrisious Haddad <phaddad@nvidia.com>
Date: Wed Nov 13 13:23:19 2024 +0200

RDMA/mlx5: Move events notifier registration to be after device registration

Move pkey change work initialization and cleanup from device resources stage to notifier stage, since this is the stage which handles this work events.

Fix a race between the device deregistration and pkey change work by moving MLX5_IB_STAGE_DEVICE_NOTIFIER to be after MLX5_IB_STAGE_IB_REG in order to ensure that the notifier is deregistered before the device during cleanup. Which ensures there are no works that are being executed after the device has already unregistered which can cause the panic below.

BUG: kernel NULL pointer dereference, address: 0000000000000000
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 1 PID: 630071 Comm: kworker/1:2 Kdump: loaded Tainted: G W OE --------- --- 5.14.0-162.6.1.el9_1.x86_64 #1
Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090008 02/27/2023
Workqueue: events pkey_change_handler [mlx5_ib]
RIP: 0010:setup_qp+0x38/0x1f0 [mlx5_ib]
Code: ee 41 54 45 31 e4 55 89 f5 53 48 89 fb 48 83 ec 20 8b 77 08 65 48 8b 04 25 28 00 00 00 48 89 44 24 18 48 8b 07 48 8d 4c 24 16 <4c> 8b 38 49 8b 87 80 0b 00 00 4c 89 ff 48 8b 80 08 05 00 00 8b 40
RSP: 0018:ffffbcc54068be20 EFLAGS: 00010282
RAX: 0000000000000000 RBX: ffff954054494128 RCX: ffffbcc54068be36
RDX: ffff954004934000 RSI: 0000000000000001 RDI: ffff954054494128
RBP: 0000000000000023 R08: ffff954001be2c20 R09: 0000000000000001
R10: ffff954001be2c20 R11: ffff9540260133c0 R12: 0000000000000000
R13: 0000000000000023 R14: 0000000000000000 R15: ffff9540ffcb0905
FS: 0000000000000000(0000) GS:ffff9540ffc80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 000000010625c001 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
mlx5_ib_gsi_pkey_change+0x20/0x40 [mlx5_ib]
process_one_work+0x1e8/0x3c0
worker_thread+0x50/0x3b0
? rescuer_thread+0x380/0x380
kthread+0x149/0x170
? set_kthread_struct+0x50/0x50
ret_from_fork+0x22/0x30
Modules linked in: rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_ib(OE) mlx5_fwctl(OE) fwctl(OE) ib_uverbs(OE) mlx5_core(OE) mlxdevm(OE) ib_core(OE) mlx_compat(OE) psample mlxfw(OE) tls knem(OE) netconsole nfsv3 nfs_acl nfs lockd grace fscache netfs qrtr rfkill sunrpc intel_rapl_msr intel_rapl_common rapl hv_balloon hv_utils i2c_piix4 pcspkr joydev fuse ext4 mbcache jbd2 sr_mod sd_mod cdrom t10_pi sg ata_generic pci_hyperv pci_hyperv_intf hyperv_drm drm_shmem_helper drm_kms_helper hv_storvsc syscopyarea hv_netvsc sysfillrect sysimgblt hid_hyperv fb_sys_fops scsi_transport_fc hyperv_keyboard drm ata_piix crct10dif_pclmul crc32_pclmul crc32c_intel libata ghash_clmulni_intel hv_vmbus serio_raw [last unloaded: ib_core]
CR2: 0000000000000000
---[ end trace f6f8be4eae12f7bc ]---

Fixes: 7722f47 ("IB/mlx5: Create GSI transmission QPs when P_Key table is changed")
Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/d271ceeff0c08431b3cbbbb3e2d416f09b6d1621.1731496944.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: CKI Backport Bot <cki-ci-bot+cki-gitlab-backport-bot@redhat.com>
1 parent 8987e6b commit 29b595f

File tree

2 files changed

+20
-22
lines changed

2 files changed

+20
-22
lines changed

drivers/infiniband/hw/mlx5/main.c

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2997,7 +2997,6 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev)
29972997
static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
29982998
{
29992999
struct mlx5_ib_resources *devr = &dev->devr;
3000-
int port;
30013000
int ret;
30023001

30033002
if (!MLX5_CAP_GEN(dev->mdev, xrc))
@@ -3013,10 +3012,6 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
30133012
return ret;
30143013
}
30153014

3016-
for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
3017-
INIT_WORK(&devr->ports[port].pkey_change_work,
3018-
pkey_change_handler);
3019-
30203015
mutex_init(&devr->cq_lock);
30213016
mutex_init(&devr->srq_lock);
30223017

@@ -3026,16 +3021,6 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
30263021
static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev)
30273022
{
30283023
struct mlx5_ib_resources *devr = &dev->devr;
3029-
int port;
3030-
3031-
/*
3032-
* Make sure no change P_Key work items are still executing.
3033-
*
3034-
* At this stage, the mlx5_ib_event should be unregistered
3035-
* and it ensures that no new works are added.
3036-
*/
3037-
for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
3038-
cancel_work_sync(&devr->ports[port].pkey_change_work);
30393024

30403025
/* After s0/s1 init, they are not unset during the device lifetime. */
30413026
if (devr->s1) {
@@ -4470,6 +4455,13 @@ static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
44704455

44714456
static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
44724457
{
4458+
struct mlx5_ib_resources *devr = &dev->devr;
4459+
int port;
4460+
4461+
for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
4462+
INIT_WORK(&devr->ports[port].pkey_change_work,
4463+
pkey_change_handler);
4464+
44734465
dev->mdev_events.notifier_call = mlx5_ib_event;
44744466
mlx5_notifier_register(dev->mdev, &dev->mdev_events);
44754467

@@ -4480,8 +4472,14 @@ static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
44804472

44814473
static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
44824474
{
4475+
struct mlx5_ib_resources *devr = &dev->devr;
4476+
int port;
4477+
44834478
mlx5r_macsec_event_unregister(dev);
44844479
mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
4480+
4481+
for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
4482+
cancel_work_sync(&devr->ports[port].pkey_change_work);
44854483
}
44864484

44874485
void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev,
@@ -4571,9 +4569,6 @@ static const struct mlx5_ib_profile pf_profile = {
45714569
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
45724570
mlx5_ib_dev_res_init,
45734571
mlx5_ib_dev_res_cleanup),
4574-
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
4575-
mlx5_ib_stage_dev_notifier_init,
4576-
mlx5_ib_stage_dev_notifier_cleanup),
45774572
STAGE_CREATE(MLX5_IB_STAGE_ODP,
45784573
mlx5_ib_odp_init_one,
45794574
mlx5_ib_odp_cleanup_one),
@@ -4598,6 +4593,9 @@ static const struct mlx5_ib_profile pf_profile = {
45984593
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
45994594
mlx5_ib_stage_ib_reg_init,
46004595
mlx5_ib_stage_ib_reg_cleanup),
4596+
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
4597+
mlx5_ib_stage_dev_notifier_init,
4598+
mlx5_ib_stage_dev_notifier_cleanup),
46014599
STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
46024600
mlx5_ib_stage_post_ib_reg_umr_init,
46034601
NULL),
@@ -4634,9 +4632,6 @@ const struct mlx5_ib_profile raw_eth_profile = {
46344632
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
46354633
mlx5_ib_dev_res_init,
46364634
mlx5_ib_dev_res_cleanup),
4637-
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
4638-
mlx5_ib_stage_dev_notifier_init,
4639-
mlx5_ib_stage_dev_notifier_cleanup),
46404635
STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
46414636
mlx5_ib_counters_init,
46424637
mlx5_ib_counters_cleanup),
@@ -4658,6 +4653,9 @@ const struct mlx5_ib_profile raw_eth_profile = {
46584653
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
46594654
mlx5_ib_stage_ib_reg_init,
46604655
mlx5_ib_stage_ib_reg_cleanup),
4656+
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
4657+
mlx5_ib_stage_dev_notifier_init,
4658+
mlx5_ib_stage_dev_notifier_cleanup),
46614659
STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
46624660
mlx5_ib_stage_post_ib_reg_umr_init,
46634661
NULL),

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -968,7 +968,6 @@ enum mlx5_ib_stages {
968968
MLX5_IB_STAGE_QP,
969969
MLX5_IB_STAGE_SRQ,
970970
MLX5_IB_STAGE_DEVICE_RESOURCES,
971-
MLX5_IB_STAGE_DEVICE_NOTIFIER,
972971
MLX5_IB_STAGE_ODP,
973972
MLX5_IB_STAGE_COUNTERS,
974973
MLX5_IB_STAGE_CONG_DEBUGFS,
@@ -977,6 +976,7 @@ enum mlx5_ib_stages {
977976
MLX5_IB_STAGE_PRE_IB_REG_UMR,
978977
MLX5_IB_STAGE_WHITELIST_UID,
979978
MLX5_IB_STAGE_IB_REG,
979+
MLX5_IB_STAGE_DEVICE_NOTIFIER,
980980
MLX5_IB_STAGE_POST_IB_REG_UMR,
981981
MLX5_IB_STAGE_DELAY_DROP,
982982
MLX5_IB_STAGE_RESTRACK,

0 commit comments

Comments
 (0)