Skip to content

Commit 72c2311

Browse files
author
Maxim Levitsky
committed
net: mana: Handle Reset Request from MANA NIC
JIRA: https://issues.redhat.com/browse/RHEL-109580 commit fbe346c Author: Haiyang Zhang <haiyangz@microsoft.com> Date: Fri Jun 27 13:26:23 2025 -0700 net: mana: Handle Reset Request from MANA NIC Upon receiving the Reset Request, pause the connection and clean up queues, wait for the specified period, then resume the NIC. In the cleanup phase, the HWC is no longer responding, so set hwc_timeout to zero to skip waiting on the response. Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com> Link: https://patch.msgid.link/1751055983-29760-1-git-send-email-haiyangz@linux.microsoft.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
1 parent c305b5f commit 72c2311

File tree

4 files changed

+143
-35
lines changed

4 files changed

+143
-35
lines changed

drivers/net/ethernet/microsoft/mana/gdma_main.c

Lines changed: 103 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
#include <linux/irqdomain.h>
1212

1313
#include <net/mana/mana.h>
14+
#include <net/mana/hw_channel.h>
1415

1516
struct dentry *mana_debugfs_root;
1617

@@ -69,6 +70,24 @@ static void mana_gd_init_registers(struct pci_dev *pdev)
6970
mana_gd_init_vf_regs(pdev);
7071
}
7172

73+
/* Suppress logging when we set timeout to zero */
74+
bool mana_need_log(struct gdma_context *gc, int err)
75+
{
76+
struct hw_channel_context *hwc;
77+
78+
if (err != -ETIMEDOUT)
79+
return true;
80+
81+
if (!gc)
82+
return true;
83+
84+
hwc = gc->hwc.driver_data;
85+
if (hwc && hwc->hwc_timeout == 0)
86+
return false;
87+
88+
return true;
89+
}
90+
7291
static int mana_gd_query_max_resources(struct pci_dev *pdev)
7392
{
7493
struct gdma_context *gc = pci_get_drvdata(pdev);
@@ -279,8 +298,9 @@ static int mana_gd_disable_queue(struct gdma_queue *queue)
279298

280299
err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
281300
if (err || resp.hdr.status) {
282-
dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err,
283-
resp.hdr.status);
301+
if (mana_need_log(gc, err))
302+
dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err,
303+
resp.hdr.status);
284304
return err ? err : -EPROTO;
285305
}
286306

@@ -367,25 +387,12 @@ EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA");
367387

368388
#define MANA_SERVICE_PERIOD 10
369389

370-
struct mana_serv_work {
371-
struct work_struct serv_work;
372-
struct pci_dev *pdev;
373-
};
374-
375-
static void mana_serv_func(struct work_struct *w)
390+
static void mana_serv_fpga(struct pci_dev *pdev)
376391
{
377-
struct mana_serv_work *mns_wk;
378392
struct pci_bus *bus, *parent;
379-
struct pci_dev *pdev;
380-
381-
mns_wk = container_of(w, struct mana_serv_work, serv_work);
382-
pdev = mns_wk->pdev;
383393

384394
pci_lock_rescan_remove();
385395

386-
if (!pdev)
387-
goto out;
388-
389396
bus = pdev->bus;
390397
if (!bus) {
391398
dev_err(&pdev->dev, "MANA service: no bus\n");
@@ -406,7 +413,74 @@ static void mana_serv_func(struct work_struct *w)
406413

407414
out:
408415
pci_unlock_rescan_remove();
416+
}
417+
418+
static void mana_serv_reset(struct pci_dev *pdev)
419+
{
420+
struct gdma_context *gc = pci_get_drvdata(pdev);
421+
struct hw_channel_context *hwc;
422+
423+
if (!gc) {
424+
dev_err(&pdev->dev, "MANA service: no GC\n");
425+
return;
426+
}
427+
428+
hwc = gc->hwc.driver_data;
429+
if (!hwc) {
430+
dev_err(&pdev->dev, "MANA service: no HWC\n");
431+
goto out;
432+
}
433+
434+
/* HWC is not responding in this case, so don't wait */
435+
hwc->hwc_timeout = 0;
436+
437+
dev_info(&pdev->dev, "MANA reset cycle start\n");
409438

439+
mana_gd_suspend(pdev, PMSG_SUSPEND);
440+
441+
msleep(MANA_SERVICE_PERIOD * 1000);
442+
443+
mana_gd_resume(pdev);
444+
445+
dev_info(&pdev->dev, "MANA reset cycle completed\n");
446+
447+
out:
448+
gc->in_service = false;
449+
}
450+
451+
struct mana_serv_work {
452+
struct work_struct serv_work;
453+
struct pci_dev *pdev;
454+
enum gdma_eqe_type type;
455+
};
456+
457+
static void mana_serv_func(struct work_struct *w)
458+
{
459+
struct mana_serv_work *mns_wk;
460+
struct pci_dev *pdev;
461+
462+
mns_wk = container_of(w, struct mana_serv_work, serv_work);
463+
pdev = mns_wk->pdev;
464+
465+
if (!pdev)
466+
goto out;
467+
468+
switch (mns_wk->type) {
469+
case GDMA_EQE_HWC_FPGA_RECONFIG:
470+
mana_serv_fpga(pdev);
471+
break;
472+
473+
case GDMA_EQE_HWC_RESET_REQUEST:
474+
mana_serv_reset(pdev);
475+
break;
476+
477+
default:
478+
dev_err(&pdev->dev, "MANA service: unknown type %d\n",
479+
mns_wk->type);
480+
break;
481+
}
482+
483+
out:
410484
pci_dev_put(pdev);
411485
kfree(mns_wk);
412486
module_put(THIS_MODULE);
@@ -463,6 +537,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
463537
break;
464538

465539
case GDMA_EQE_HWC_FPGA_RECONFIG:
540+
case GDMA_EQE_HWC_RESET_REQUEST:
466541
dev_info(gc->dev, "Recv MANA service type:%d\n", type);
467542

468543
if (gc->in_service) {
@@ -484,6 +559,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
484559
dev_info(gc->dev, "Start MANA service type:%d\n", type);
485560
gc->in_service = true;
486561
mns_wk->pdev = to_pci_dev(gc->dev);
562+
mns_wk->type = type;
487563
pci_dev_get(mns_wk->pdev);
488564
INIT_WORK(&mns_wk->serv_work, mana_serv_func);
489565
schedule_work(&mns_wk->serv_work);
@@ -635,7 +711,8 @@ int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq)
635711

636712
err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
637713
if (err) {
638-
dev_err(dev, "test_eq failed: %d\n", err);
714+
if (mana_need_log(gc, err))
715+
dev_err(dev, "test_eq failed: %d\n", err);
639716
goto out;
640717
}
641718

@@ -670,7 +747,7 @@ static void mana_gd_destroy_eq(struct gdma_context *gc, bool flush_evenets,
670747

671748
if (flush_evenets) {
672749
err = mana_gd_test_eq(gc, queue);
673-
if (err)
750+
if (err && mana_need_log(gc, err))
674751
dev_warn(gc->dev, "Failed to flush EQ: %d\n", err);
675752
}
676753

@@ -816,8 +893,9 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle)
816893

817894
err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
818895
if (err || resp.hdr.status) {
819-
dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n",
820-
err, resp.hdr.status);
896+
if (mana_need_log(gc, err))
897+
dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n",
898+
err, resp.hdr.status);
821899
return -EPROTO;
822900
}
823901

@@ -1117,8 +1195,9 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
11171195

11181196
err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
11191197
if (err || resp.hdr.status) {
1120-
dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
1121-
err, resp.hdr.status);
1198+
if (mana_need_log(gc, err))
1199+
dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
1200+
err, resp.hdr.status);
11221201
if (!err)
11231202
err = -EPROTO;
11241203
}
@@ -1916,7 +1995,7 @@ static void mana_gd_remove(struct pci_dev *pdev)
19161995
}
19171996

19181997
/* The 'state' parameter is not used. */
1919-
static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
1998+
int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
19201999
{
19212000
struct gdma_context *gc = pci_get_drvdata(pdev);
19222001

@@ -1932,7 +2011,7 @@ static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
19322011
* fail -- if this happens, it's safer to just report an error than try to undo
19332012
* what has been done.
19342013
*/
1935-
static int mana_gd_resume(struct pci_dev *pdev)
2014+
int mana_gd_resume(struct pci_dev *pdev)
19362015
{
19372016
struct gdma_context *gc = pci_get_drvdata(pdev);
19382017
int err;

drivers/net/ethernet/microsoft/mana/hw_channel.c

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -880,7 +880,9 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
880880

881881
if (!wait_for_completion_timeout(&ctx->comp_event,
882882
(msecs_to_jiffies(hwc->hwc_timeout)))) {
883-
dev_err(hwc->dev, "HWC: Request timed out!\n");
883+
if (hwc->hwc_timeout != 0)
884+
dev_err(hwc->dev, "HWC: Request timed out!\n");
885+
884886
err = -ETIMEDOUT;
885887
goto out;
886888
}

drivers/net/ethernet/microsoft/mana/mana_en.c

Lines changed: 27 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,15 @@ static const struct file_operations mana_dbg_q_fops = {
4747
.read = mana_dbg_q_read,
4848
};
4949

50+
static bool mana_en_need_log(struct mana_port_context *apc, int err)
51+
{
52+
if (apc && apc->ac && apc->ac->gdma_dev &&
53+
apc->ac->gdma_dev->gdma_context)
54+
return mana_need_log(apc->ac->gdma_dev->gdma_context, err);
55+
else
56+
return true;
57+
}
58+
5059
/* Microsoft Azure Network Adapter (MANA) functions */
5160

5261
static int mana_open(struct net_device *ndev)
@@ -781,7 +790,8 @@ static int mana_send_request(struct mana_context *ac, void *in_buf,
781790
if (err == -EOPNOTSUPP)
782791
return err;
783792

784-
if (req->req.msg_type != MANA_QUERY_PHY_STAT)
793+
if (req->req.msg_type != MANA_QUERY_PHY_STAT &&
794+
mana_need_log(gc, err))
785795
dev_err(dev, "Failed to send mana message: %d, 0x%x\n",
786796
err, resp->status);
787797
return err ? err : -EPROTO;
@@ -858,8 +868,10 @@ static void mana_pf_deregister_hw_vport(struct mana_port_context *apc)
858868
err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
859869
sizeof(resp));
860870
if (err) {
861-
netdev_err(apc->ndev, "Failed to unregister hw vPort: %d\n",
862-
err);
871+
if (mana_en_need_log(apc, err))
872+
netdev_err(apc->ndev, "Failed to unregister hw vPort: %d\n",
873+
err);
874+
863875
return;
864876
}
865877

@@ -914,8 +926,10 @@ static void mana_pf_deregister_filter(struct mana_port_context *apc)
914926
err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
915927
sizeof(resp));
916928
if (err) {
917-
netdev_err(apc->ndev, "Failed to unregister filter: %d\n",
918-
err);
929+
if (mana_en_need_log(apc, err))
930+
netdev_err(apc->ndev, "Failed to unregister filter: %d\n",
931+
err);
932+
919933
return;
920934
}
921935

@@ -1145,7 +1159,9 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc,
11451159
err = mana_send_request(apc->ac, req, req_buf_size, &resp,
11461160
sizeof(resp));
11471161
if (err) {
1148-
netdev_err(ndev, "Failed to configure vPort RX: %d\n", err);
1162+
if (mana_en_need_log(apc, err))
1163+
netdev_err(ndev, "Failed to configure vPort RX: %d\n", err);
1164+
11491165
goto out;
11501166
}
11511167

@@ -1240,7 +1256,9 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type,
12401256
err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
12411257
sizeof(resp));
12421258
if (err) {
1243-
netdev_err(ndev, "Failed to destroy WQ object: %d\n", err);
1259+
if (mana_en_need_log(apc, err))
1260+
netdev_err(ndev, "Failed to destroy WQ object: %d\n", err);
1261+
12441262
return;
12451263
}
12461264

@@ -2895,11 +2913,10 @@ static int mana_dealloc_queues(struct net_device *ndev)
28952913

28962914
apc->rss_state = TRI_STATE_FALSE;
28972915
err = mana_config_rss(apc, TRI_STATE_FALSE, false, false);
2898-
if (err) {
2916+
if (err && mana_en_need_log(apc, err))
28992917
netdev_err(ndev, "Failed to disable vPort: %d\n", err);
2900-
return err;
2901-
}
29022918

2919+
/* Even in err case, still need to cleanup the vPort */
29032920
mana_destroy_vport(apc);
29042921

29052922
return 0;

include/net/mana/gdma.h

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,7 @@ enum gdma_eqe_type {
6262
GDMA_EQE_HWC_FPGA_RECONFIG = 132,
6363
GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133,
6464
GDMA_EQE_HWC_SOC_SERVICE = 134,
65+
GDMA_EQE_HWC_RESET_REQUEST = 135,
6566
GDMA_EQE_RNIC_QP_FATAL = 176,
6667
};
6768

@@ -584,6 +585,9 @@ enum {
584585
/* Driver supports dynamic MSI-X vector allocation */
585586
#define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13)
586587

588+
/* Driver can self reset on EQE notification */
589+
#define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14)
590+
587591
/* Driver can self reset on FPGA Reconfig EQE notification */
588592
#define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17)
589593

@@ -594,6 +598,7 @@ enum {
594598
GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \
595599
GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \
596600
GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \
601+
GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \
597602
GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE)
598603

599604
#define GDMA_DRV_CAP_FLAGS2 0
@@ -921,4 +926,9 @@ void mana_unregister_debugfs(void);
921926

922927
int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event);
923928

929+
int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state);
930+
int mana_gd_resume(struct pci_dev *pdev);
931+
932+
bool mana_need_log(struct gdma_context *gc, int err);
933+
924934
#endif /* _GDMA_H */

0 commit comments

Comments (0)