Skip to content

Commit 7bc4f3e

Browse files
committed
net/mlx5e: Fix netif state handling
jira LE-1907 cve CVE-2024-38608 Rebuild_History Non-Buildable kernel-5.14.0-427.33.1.el9_4 commit-author Shay Drory <shayd@nvidia.com> commit 3d59184 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-5.14.0-427.33.1.el9_4/3d591847.failed mlx5e_suspend cleans resources only if netif_device_present() returns true. However, mlx5e_resume changes the state of netif, via mlx5e_nic_enable, only if reg_state == NETREG_REGISTERED. In the below case, the above leads to NULL-ptr Oops[1] and memory leaks: mlx5e_probe _mlx5e_resume mlx5e_attach_netdev mlx5e_nic_enable <-- netdev not reg, not calling netif_device_attach() register_netdev <-- failed for some reason. ERROR_FLOW: _mlx5e_suspend <-- netif_device_present return false, resources aren't freed :( Hence, clean resources in this case as well. [1] BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0010 [#1] SMP CPU: 2 PID: 9345 Comm: test-ovs-ct-gen Not tainted 6.5.0_for_upstream_min_debug_2023_09_05_16_01 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:0x0 Code: Unable to access opcode bytes at0xffffffffffffffd6. RSP: 0018:ffff888178aaf758 EFLAGS: 00010246 Call Trace: <TASK> ? __die+0x20/0x60 ? page_fault_oops+0x14c/0x3c0 ? exc_page_fault+0x75/0x140 ? asm_exc_page_fault+0x22/0x30 notifier_call_chain+0x35/0xb0 blocking_notifier_call_chain+0x3d/0x60 mlx5_blocking_notifier_call_chain+0x22/0x30 [mlx5_core] mlx5_core_uplink_netdev_event_replay+0x3e/0x60 [mlx5_core] mlx5_mdev_netdev_track+0x53/0x60 [mlx5_ib] mlx5_ib_roce_init+0xc3/0x340 [mlx5_ib] __mlx5_ib_add+0x34/0xd0 [mlx5_ib] mlx5r_probe+0xe1/0x210 [mlx5_ib] ? auxiliary_match_id+0x6a/0x90 auxiliary_bus_probe+0x38/0x80 ? driver_sysfs_add+0x51/0x80 really_probe+0xc9/0x3e0 ? 
driver_probe_device+0x90/0x90 __driver_probe_device+0x80/0x160 driver_probe_device+0x1e/0x90 __device_attach_driver+0x7d/0x100 bus_for_each_drv+0x80/0xd0 __device_attach+0xbc/0x1f0 bus_probe_device+0x86/0xa0 device_add+0x637/0x840 __auxiliary_device_add+0x3b/0xa0 add_adev+0xc9/0x140 [mlx5_core] mlx5_rescan_drivers_locked+0x22a/0x310 [mlx5_core] mlx5_register_device+0x53/0xa0 [mlx5_core] mlx5_init_one_devl_locked+0x5c4/0x9c0 [mlx5_core] mlx5_init_one+0x3b/0x60 [mlx5_core] probe_one+0x44c/0x730 [mlx5_core] local_pci_probe+0x3e/0x90 pci_device_probe+0xbf/0x210 ? kernfs_create_link+0x5d/0xa0 ? sysfs_do_create_link_sd+0x60/0xc0 really_probe+0xc9/0x3e0 ? driver_probe_device+0x90/0x90 __driver_probe_device+0x80/0x160 driver_probe_device+0x1e/0x90 __device_attach_driver+0x7d/0x100 bus_for_each_drv+0x80/0xd0 __device_attach+0xbc/0x1f0 pci_bus_add_device+0x54/0x80 pci_iov_add_virtfn+0x2e6/0x320 sriov_enable+0x208/0x420 mlx5_core_sriov_configure+0x9e/0x200 [mlx5_core] sriov_numvfs_store+0xae/0x1a0 kernfs_fop_write_iter+0x10c/0x1a0 vfs_write+0x291/0x3c0 ksys_write+0x5f/0xe0 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 CR2: 0000000000000000 ---[ end trace 0000000000000000 ]--- Fixes: 2c3b5be ("net/mlx5e: More generic netdev management API") Signed-off-by: Shay Drory <shayd@nvidia.com> Signed-off-by: Tariq Toukan <tariqt@nvidia.com> Reviewed-by: Simon Horman <horms@kernel.org> Link: https://lore.kernel.org/r/20240509112951.590184-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> (cherry picked from commit 3d59184) Signed-off-by: Jonathan Maple <jmaple@ciq.com> # Conflicts: # drivers/net/ethernet/mellanox/mlx5/core/en_main.c
1 parent fb928ac commit 7bc4f3e

File tree

1 file changed

+184
-0
lines changed

1 file changed

+184
-0
lines changed
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
net/mlx5e: Fix netif state handling
2+
3+
jira LE-1907
4+
cve CVE-2024-38608
5+
Rebuild_History Non-Buildable kernel-5.14.0-427.33.1.el9_4
6+
commit-author Shay Drory <shayd@nvidia.com>
7+
commit 3d5918477f94e4c2f064567875c475468e264644
8+
Empty-Commit: Cherry-Pick Conflicts during history rebuild.
9+
Will be included in final tarball splat. Ref for failed cherry-pick at:
10+
ciq/ciq_backports/kernel-5.14.0-427.33.1.el9_4/3d591847.failed
11+
12+
mlx5e_suspend cleans resources only if netif_device_present() returns
13+
true. However, mlx5e_resume changes the state of netif, via
14+
mlx5e_nic_enable, only if reg_state == NETREG_REGISTERED.
15+
In the below case, the above leads to NULL-ptr Oops[1] and memory
16+
leaks:
17+
18+
mlx5e_probe
19+
_mlx5e_resume
20+
mlx5e_attach_netdev
21+
mlx5e_nic_enable <-- netdev not registered yet, so netif_device_attach() is not called
22+
register_netdev <-- failed for some reason.
23+
ERROR_FLOW:
24+
_mlx5e_suspend <-- netif_device_present() returns false, so resources aren't freed :(
25+
26+
Hence, clean resources in this case as well.
27+
28+
[1]
29+
BUG: kernel NULL pointer dereference, address: 0000000000000000
30+
PGD 0 P4D 0
31+
Oops: 0010 [#1] SMP
32+
CPU: 2 PID: 9345 Comm: test-ovs-ct-gen Not tainted 6.5.0_for_upstream_min_debug_2023_09_05_16_01 #1
33+
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
34+
RIP: 0010:0x0
35+
Code: Unable to access opcode bytes at 0xffffffffffffffd6.
36+
RSP: 0018:ffff888178aaf758 EFLAGS: 00010246
37+
Call Trace:
38+
<TASK>
39+
? __die+0x20/0x60
40+
? page_fault_oops+0x14c/0x3c0
41+
? exc_page_fault+0x75/0x140
42+
? asm_exc_page_fault+0x22/0x30
43+
notifier_call_chain+0x35/0xb0
44+
blocking_notifier_call_chain+0x3d/0x60
45+
mlx5_blocking_notifier_call_chain+0x22/0x30 [mlx5_core]
46+
mlx5_core_uplink_netdev_event_replay+0x3e/0x60 [mlx5_core]
47+
mlx5_mdev_netdev_track+0x53/0x60 [mlx5_ib]
48+
mlx5_ib_roce_init+0xc3/0x340 [mlx5_ib]
49+
__mlx5_ib_add+0x34/0xd0 [mlx5_ib]
50+
mlx5r_probe+0xe1/0x210 [mlx5_ib]
51+
? auxiliary_match_id+0x6a/0x90
52+
auxiliary_bus_probe+0x38/0x80
53+
? driver_sysfs_add+0x51/0x80
54+
really_probe+0xc9/0x3e0
55+
? driver_probe_device+0x90/0x90
56+
__driver_probe_device+0x80/0x160
57+
driver_probe_device+0x1e/0x90
58+
__device_attach_driver+0x7d/0x100
59+
bus_for_each_drv+0x80/0xd0
60+
__device_attach+0xbc/0x1f0
61+
bus_probe_device+0x86/0xa0
62+
device_add+0x637/0x840
63+
__auxiliary_device_add+0x3b/0xa0
64+
add_adev+0xc9/0x140 [mlx5_core]
65+
mlx5_rescan_drivers_locked+0x22a/0x310 [mlx5_core]
66+
mlx5_register_device+0x53/0xa0 [mlx5_core]
67+
mlx5_init_one_devl_locked+0x5c4/0x9c0 [mlx5_core]
68+
mlx5_init_one+0x3b/0x60 [mlx5_core]
69+
probe_one+0x44c/0x730 [mlx5_core]
70+
local_pci_probe+0x3e/0x90
71+
pci_device_probe+0xbf/0x210
72+
? kernfs_create_link+0x5d/0xa0
73+
? sysfs_do_create_link_sd+0x60/0xc0
74+
really_probe+0xc9/0x3e0
75+
? driver_probe_device+0x90/0x90
76+
__driver_probe_device+0x80/0x160
77+
driver_probe_device+0x1e/0x90
78+
__device_attach_driver+0x7d/0x100
79+
bus_for_each_drv+0x80/0xd0
80+
__device_attach+0xbc/0x1f0
81+
pci_bus_add_device+0x54/0x80
82+
pci_iov_add_virtfn+0x2e6/0x320
83+
sriov_enable+0x208/0x420
84+
mlx5_core_sriov_configure+0x9e/0x200 [mlx5_core]
85+
sriov_numvfs_store+0xae/0x1a0
86+
kernfs_fop_write_iter+0x10c/0x1a0
87+
vfs_write+0x291/0x3c0
88+
ksys_write+0x5f/0xe0
89+
do_syscall_64+0x3d/0x90
90+
entry_SYSCALL_64_after_hwframe+0x46/0xb0
91+
CR2: 0000000000000000
92+
---[ end trace 0000000000000000 ]---
93+
94+
Fixes: 2c3b5beec46a ("net/mlx5e: More generic netdev management API")
95+
Signed-off-by: Shay Drory <shayd@nvidia.com>
96+
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
97+
Reviewed-by: Simon Horman <horms@kernel.org>
98+
Link: https://lore.kernel.org/r/20240509112951.590184-2-tariqt@nvidia.com
99+
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
100+
(cherry picked from commit 3d5918477f94e4c2f064567875c475468e264644)
101+
Signed-off-by: Jonathan Maple <jmaple@ciq.com>
102+
103+
# Conflicts:
104+
# drivers/net/ethernet/mellanox/mlx5/core/en_main.c
105+
diff --cc drivers/net/ethernet/mellanox/mlx5/core/en_main.c
106+
index 53343da28517,64497b6eebd3..000000000000
107+
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
108+
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
109+
@@@ -6011,10 -6064,13 +6011,10 @@@ static int _mlx5e_suspend(struct auxili
110+
struct mlx5e_priv *priv = mlx5e_dev->priv;
111+
struct net_device *netdev = priv->netdev;
112+
struct mlx5_core_dev *mdev = priv->mdev;
113+
- struct mlx5_core_dev *pos;
114+
- int i;
115+
116+
- if (!netif_device_present(netdev)) {
117+
+ if (!pre_netdev_reg && !netif_device_present(netdev)) {
118+
if (test_bit(MLX5E_STATE_DESTROYING, &priv->state))
119+
- mlx5_sd_for_each_dev(i, mdev, pos)
120+
- mlx5e_destroy_mdev_resources(pos);
121+
+ mlx5e_destroy_mdev_resources(mdev);
122+
return -ENODEV;
123+
}
124+
125+
@@@ -6025,7 -6083,17 +6025,21 @@@
126+
127+
static int mlx5e_suspend(struct auxiliary_device *adev, pm_message_t state)
128+
{
129+
++<<<<<<< HEAD
130+
+ return _mlx5e_suspend(adev);
131+
++=======
132+
+ struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
133+
+ struct mlx5_core_dev *mdev = edev->mdev;
134+
+ struct auxiliary_device *actual_adev;
135+
+ int err = 0;
136+
+
137+
+ actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx);
138+
+ if (actual_adev)
139+
+ err = _mlx5e_suspend(actual_adev, false);
140+
+
141+
+ mlx5_sd_cleanup(mdev);
142+
+ return err;
143+
++>>>>>>> 3d5918477f94 (net/mlx5e: Fix netif state handling)
144+
}
145+
146+
static int _mlx5e_probe(struct auxiliary_device *adev)
147+
@@@ -6104,18 -6172,32 +6118,18 @@@ err_devlink_unregister
148+
static int mlx5e_probe(struct auxiliary_device *adev,
149+
const struct auxiliary_device_id *id)
150+
{
151+
- struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
152+
- struct mlx5_core_dev *mdev = edev->mdev;
153+
- struct auxiliary_device *actual_adev;
154+
- int err;
155+
-
156+
- err = mlx5_sd_init(mdev);
157+
- if (err)
158+
- return err;
159+
-
160+
- actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx);
161+
- if (actual_adev)
162+
- return _mlx5e_probe(actual_adev);
163+
- return 0;
164+
+ return _mlx5e_probe(adev);
165+
}
166+
167+
-static void _mlx5e_remove(struct auxiliary_device *adev)
168+
+static void mlx5e_remove(struct auxiliary_device *adev)
169+
{
170+
- struct mlx5_adev *edev = container_of(adev, struct mlx5_adev, adev);
171+
struct mlx5e_dev *mlx5e_dev = auxiliary_get_drvdata(adev);
172+
struct mlx5e_priv *priv = mlx5e_dev->priv;
173+
- struct mlx5_core_dev *mdev = edev->mdev;
174+
175+
- mlx5_core_uplink_netdev_set(mdev, NULL);
176+
+ mlx5_core_uplink_netdev_set(priv->mdev, NULL);
177+
mlx5e_dcbnl_delete_app(priv);
178+
unregister_netdev(priv->netdev);
179+
- _mlx5e_suspend(adev);
180+
+ _mlx5e_suspend(adev, false);
181+
priv->profile->cleanup(priv);
182+
mlx5e_destroy_netdev(priv);
183+
mlx5e_devlink_port_unregister(mlx5e_dev);
184+
* Unmerged path drivers/net/ethernet/mellanox/mlx5/core/en_main.c

0 commit comments

Comments
 (0)