Commit a19e910

Committed by: Maxim Levitsky
net: mana: Handle Reset Request from MANA NIC
JIRA: https://issues.redhat.com/browse/RHEL-109583

commit fbe346c
Author: Haiyang Zhang <haiyangz@microsoft.com>
Date:   Fri Jun 27 13:26:23 2025 -0700

    net: mana: Handle Reset Request from MANA NIC

    Upon receiving the Reset Request, pause the connection and clean up
    queues, wait for the specified period, then resume the NIC. In the
    cleanup phase, the HWC is no longer responding, so set hwc_timeout to
    zero to skip waiting on the response.

    Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
    Link: https://patch.msgid.link/1751055983-29760-1-git-send-email-haiyangz@linux.microsoft.com
    Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
1 parent c55102f commit a19e910
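
In outline, the patch defers the actual reset to process context: the EQE handler only schedules a work item, and the work item suspends the device, sleeps out the service period, and resumes it. A condensed sketch of that flow, simplified from the gdma_main.c diff below (NULL checks, logging, and the FPGA-reconfig path omitted):

/* Condensed illustration, not the verbatim patch: the reset runs in a
 * workqueue because the work function may sleep, and suspend/resume
 * plus the 10-second msleep() all block.
 */
static void mana_serv_reset(struct pci_dev *pdev)
{
	struct gdma_context *gc = pci_get_drvdata(pdev);
	struct hw_channel_context *hwc = gc->hwc.driver_data;

	/* HWC is no longer responding; zero timeout skips reply waits */
	hwc->hwc_timeout = 0;

	mana_gd_suspend(pdev, PMSG_SUSPEND);	/* pause and clean up queues */
	msleep(MANA_SERVICE_PERIOD * 1000);	/* wait the specified period */
	mana_gd_resume(pdev);			/* bring the NIC back up */

	gc->in_service = false;
}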

File tree: 4 files changed (+143, −35 lines)

drivers/net/ethernet/microsoft/mana/gdma_main.c

Lines changed: 103 additions & 24 deletions
@@ -11,6 +11,7 @@
 #include <linux/irqdomain.h>
 
 #include <net/mana/mana.h>
+#include <net/mana/hw_channel.h>
 
 #include <linux/cpu.h>
 struct dentry *mana_debugfs_root;
@@ -70,6 +71,24 @@ static void mana_gd_init_registers(struct pci_dev *pdev)
 	mana_gd_init_vf_regs(pdev);
 }
 
+/* Suppress logging when we set timeout to zero */
+bool mana_need_log(struct gdma_context *gc, int err)
+{
+	struct hw_channel_context *hwc;
+
+	if (err != -ETIMEDOUT)
+		return true;
+
+	if (!gc)
+		return true;
+
+	hwc = gc->hwc.driver_data;
+	if (hwc && hwc->hwc_timeout == 0)
+		return false;
+
+	return true;
+}
+
 static int mana_gd_query_max_resources(struct pci_dev *pdev)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
@@ -280,8 +299,9 @@ static int mana_gd_disable_queue(struct gdma_queue *queue)
 
 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
 	if (err || resp.hdr.status) {
-		dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err,
-			resp.hdr.status);
+		if (mana_need_log(gc, err))
+			dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err,
+				resp.hdr.status);
 		return err ? err : -EPROTO;
 	}
 
@@ -368,25 +388,12 @@ EXPORT_SYMBOL_NS(mana_gd_ring_cq, NET_MANA);
 
 #define MANA_SERVICE_PERIOD 10
 
-struct mana_serv_work {
-	struct work_struct serv_work;
-	struct pci_dev *pdev;
-};
-
-static void mana_serv_func(struct work_struct *w)
+static void mana_serv_fpga(struct pci_dev *pdev)
 {
-	struct mana_serv_work *mns_wk;
 	struct pci_bus *bus, *parent;
-	struct pci_dev *pdev;
-
-	mns_wk = container_of(w, struct mana_serv_work, serv_work);
-	pdev = mns_wk->pdev;
 
 	pci_lock_rescan_remove();
 
-	if (!pdev)
-		goto out;
-
 	bus = pdev->bus;
 	if (!bus) {
 		dev_err(&pdev->dev, "MANA service: no bus\n");
@@ -407,7 +414,74 @@ static void mana_serv_func(struct work_struct *w)
 
 out:
 	pci_unlock_rescan_remove();
+}
+
+static void mana_serv_reset(struct pci_dev *pdev)
+{
+	struct gdma_context *gc = pci_get_drvdata(pdev);
+	struct hw_channel_context *hwc;
+
+	if (!gc) {
+		dev_err(&pdev->dev, "MANA service: no GC\n");
+		return;
+	}
+
+	hwc = gc->hwc.driver_data;
+	if (!hwc) {
+		dev_err(&pdev->dev, "MANA service: no HWC\n");
+		goto out;
+	}
+
+	/* HWC is not responding in this case, so don't wait */
+	hwc->hwc_timeout = 0;
+
+	dev_info(&pdev->dev, "MANA reset cycle start\n");
 
+	mana_gd_suspend(pdev, PMSG_SUSPEND);
+
+	msleep(MANA_SERVICE_PERIOD * 1000);
+
+	mana_gd_resume(pdev);
+
+	dev_info(&pdev->dev, "MANA reset cycle completed\n");
+
+out:
+	gc->in_service = false;
+}
+
+struct mana_serv_work {
+	struct work_struct serv_work;
+	struct pci_dev *pdev;
+	enum gdma_eqe_type type;
+};
+
+static void mana_serv_func(struct work_struct *w)
+{
+	struct mana_serv_work *mns_wk;
+	struct pci_dev *pdev;
+
+	mns_wk = container_of(w, struct mana_serv_work, serv_work);
+	pdev = mns_wk->pdev;
+
+	if (!pdev)
+		goto out;
+
+	switch (mns_wk->type) {
+	case GDMA_EQE_HWC_FPGA_RECONFIG:
+		mana_serv_fpga(pdev);
+		break;
+
+	case GDMA_EQE_HWC_RESET_REQUEST:
+		mana_serv_reset(pdev);
+		break;
+
+	default:
+		dev_err(&pdev->dev, "MANA service: unknown type %d\n",
+			mns_wk->type);
+		break;
+	}
+
+out:
 	pci_dev_put(pdev);
 	kfree(mns_wk);
 	module_put(THIS_MODULE);
@@ -464,6 +538,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
 		break;
 
 	case GDMA_EQE_HWC_FPGA_RECONFIG:
+	case GDMA_EQE_HWC_RESET_REQUEST:
 		dev_info(gc->dev, "Recv MANA service type:%d\n", type);
 
 		if (gc->in_service) {
@@ -485,6 +560,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
 		dev_info(gc->dev, "Start MANA service type:%d\n", type);
 		gc->in_service = true;
 		mns_wk->pdev = to_pci_dev(gc->dev);
+		mns_wk->type = type;
 		pci_dev_get(mns_wk->pdev);
 		INIT_WORK(&mns_wk->serv_work, mana_serv_func);
 		schedule_work(&mns_wk->serv_work);
@@ -636,7 +712,8 @@ int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq)
 
 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
 	if (err) {
-		dev_err(dev, "test_eq failed: %d\n", err);
+		if (mana_need_log(gc, err))
+			dev_err(dev, "test_eq failed: %d\n", err);
 		goto out;
 	}
 
@@ -671,7 +748,7 @@ static void mana_gd_destroy_eq(struct gdma_context *gc, bool flush_evenets,
 
 	if (flush_evenets) {
 		err = mana_gd_test_eq(gc, queue);
-		if (err)
+		if (err && mana_need_log(gc, err))
 			dev_warn(gc->dev, "Failed to flush EQ: %d\n", err);
 	}
 
@@ -817,8 +894,9 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle)
 
 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
 	if (err || resp.hdr.status) {
-		dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n",
-			err, resp.hdr.status);
+		if (mana_need_log(gc, err))
+			dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n",
+				err, resp.hdr.status);
 		return -EPROTO;
 	}
 
@@ -1118,8 +1196,9 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
 
 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
 	if (err || resp.hdr.status) {
-		dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
-			err, resp.hdr.status);
+		if (mana_need_log(gc, err))
+			dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
+				err, resp.hdr.status);
 		if (!err)
 			err = -EPROTO;
 	}
@@ -1917,7 +1996,7 @@ static void mana_gd_remove(struct pci_dev *pdev)
 }
 
 /* The 'state' parameter is not used. */
-static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
+int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 
@@ -1933,7 +2012,7 @@ static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
  * fail -- if this happens, it's safer to just report an error than try to undo
  * what has been done.
  */
-static int mana_gd_resume(struct pci_dev *pdev)
+int mana_gd_resume(struct pci_dev *pdev)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 	int err;
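
The same gating pattern repeats at every teardown call site in this file: the error still propagates, but the log line is skipped when mana_need_log() returns false, which happens only for -ETIMEDOUT while hwc_timeout is zero. A minimal illustrative call site (example_teardown() is hypothetical, not part of the patch):

/* Hypothetical caller showing the suppression pattern: during a reset
 * cycle every HWC request fails with -ETIMEDOUT, so the gate keeps the
 * cleanup paths from flooding the log while the error still returns.
 */
static int example_teardown(struct gdma_context *gc, int err)
{
	if (err) {
		if (mana_need_log(gc, err))
			dev_err(gc->dev, "example teardown failed: %d\n", err);
		return err;	/* propagated either way, just quietly */
	}

	return 0;
}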

drivers/net/ethernet/microsoft/mana/hw_channel.c

Lines changed: 3 additions & 1 deletion
@@ -879,7 +879,9 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
 
 	if (!wait_for_completion_timeout(&ctx->comp_event,
 					 (msecs_to_jiffies(hwc->hwc_timeout)))) {
-		dev_err(hwc->dev, "HWC: Request timed out!\n");
+		if (hwc->hwc_timeout != 0)
+			dev_err(hwc->dev, "HWC: Request timed out!\n");
+
 		err = -ETIMEDOUT;
 		goto out;
 	}
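
The hunk above relies on standard completion-API semantics: with hwc_timeout forced to zero, msecs_to_jiffies(0) is 0, and wait_for_completion_timeout() returns 0 essentially immediately when the completion has not already fired, so each in-flight request falls through to the -ETIMEDOUT path without blocking and without logging. A minimal sketch of that behavior (example_wait() is illustrative only):

/* Illustrative helper: a zero timeout means "don't wait" -- the call
 * returns 0 right away if the completion hasn't fired yet.
 */
static int example_wait(struct completion *comp, u32 timeout_ms)
{
	if (!wait_for_completion_timeout(comp, msecs_to_jiffies(timeout_ms)))
		return -ETIMEDOUT;	/* immediate when timeout_ms == 0 */

	return 0;
}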

drivers/net/ethernet/microsoft/mana/mana_en.c

Lines changed: 27 additions & 10 deletions
@@ -46,6 +46,15 @@ static const struct file_operations mana_dbg_q_fops = {
 	.read = mana_dbg_q_read,
 };
 
+static bool mana_en_need_log(struct mana_port_context *apc, int err)
+{
+	if (apc && apc->ac && apc->ac->gdma_dev &&
+	    apc->ac->gdma_dev->gdma_context)
+		return mana_need_log(apc->ac->gdma_dev->gdma_context, err);
+	else
+		return true;
+}
+
 /* Microsoft Azure Network Adapter (MANA) functions */
 
 static int mana_open(struct net_device *ndev)
@@ -780,7 +789,8 @@ static int mana_send_request(struct mana_context *ac, void *in_buf,
 	if (err == -EOPNOTSUPP)
 		return err;
 
-	if (req->req.msg_type != MANA_QUERY_PHY_STAT)
+	if (req->req.msg_type != MANA_QUERY_PHY_STAT &&
+	    mana_need_log(gc, err))
 		dev_err(dev, "Failed to send mana message: %d, 0x%x\n",
 			err, resp->status);
 	return err ? err : -EPROTO;
@@ -857,8 +867,10 @@ static void mana_pf_deregister_hw_vport(struct mana_port_context *apc)
 	err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
 				sizeof(resp));
 	if (err) {
-		netdev_err(apc->ndev, "Failed to unregister hw vPort: %d\n",
-			   err);
+		if (mana_en_need_log(apc, err))
+			netdev_err(apc->ndev, "Failed to unregister hw vPort: %d\n",
+				   err);
+
 		return;
 	}
 
@@ -913,8 +925,10 @@ static void mana_pf_deregister_filter(struct mana_port_context *apc)
 	err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
 				sizeof(resp));
 	if (err) {
-		netdev_err(apc->ndev, "Failed to unregister filter: %d\n",
-			   err);
+		if (mana_en_need_log(apc, err))
+			netdev_err(apc->ndev, "Failed to unregister filter: %d\n",
+				   err);
+
 		return;
 	}
 
@@ -1144,7 +1158,9 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc,
 	err = mana_send_request(apc->ac, req, req_buf_size, &resp,
 				sizeof(resp));
 	if (err) {
-		netdev_err(ndev, "Failed to configure vPort RX: %d\n", err);
+		if (mana_en_need_log(apc, err))
+			netdev_err(ndev, "Failed to configure vPort RX: %d\n", err);
+
 		goto out;
 	}
 
@@ -1239,7 +1255,9 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type,
 	err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
 				sizeof(resp));
 	if (err) {
-		netdev_err(ndev, "Failed to destroy WQ object: %d\n", err);
+		if (mana_en_need_log(apc, err))
+			netdev_err(ndev, "Failed to destroy WQ object: %d\n", err);
+
 		return;
 	}
 
@@ -2894,11 +2912,10 @@ static int mana_dealloc_queues(struct net_device *ndev)
 
 	apc->rss_state = TRI_STATE_FALSE;
 	err = mana_config_rss(apc, TRI_STATE_FALSE, false, false);
-	if (err) {
+	if (err && mana_en_need_log(apc, err))
 		netdev_err(ndev, "Failed to disable vPort: %d\n", err);
-		return err;
-	}
 
+	/* Even in err case, still need to cleanup the vPort */
 	mana_destroy_vport(apc);
 
 	return 0;

include/net/mana/gdma.h

Lines changed: 10 additions & 0 deletions
@@ -62,6 +62,7 @@ enum gdma_eqe_type {
 	GDMA_EQE_HWC_FPGA_RECONFIG	= 132,
 	GDMA_EQE_HWC_SOC_RECONFIG_DATA	= 133,
 	GDMA_EQE_HWC_SOC_SERVICE	= 134,
+	GDMA_EQE_HWC_RESET_REQUEST	= 135,
 	GDMA_EQE_RNIC_QP_FATAL		= 176,
 };
 
@@ -584,6 +585,9 @@ enum {
 /* Driver supports dynamic MSI-X vector allocation */
 #define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13)
 
+/* Driver can self reset on EQE notification */
+#define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14)
+
 /* Driver can self reset on FPGA Reconfig EQE notification */
 #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17)
 
@@ -594,6 +598,7 @@ enum {
 	GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \
 	GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \
 	GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \
+	GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \
 	GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE)
 
 #define GDMA_DRV_CAP_FLAGS2 0
@@ -921,4 +926,9 @@ void mana_unregister_debugfs(void);
 
 int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event);
 
+int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state);
+int mana_gd_resume(struct pci_dev *pdev);
+
+bool mana_need_log(struct gdma_context *gc, int err);
+
 #endif /* _GDMA_H */
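
The new capability bit only has effect once it is advertised to the device. In the upstream driver the cap-flag words are carried in the verify-version handshake; a hedged sketch of that advertisement (assuming the upstream struct gdma_verify_ver_req field names, shown for illustration only):

/* Illustration (assumed upstream field names): after this patch the
 * GDMA_DRV_CAP_FLAGS1 mask includes GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE
 * (BIT(14)), telling the NIC it may send GDMA_EQE_HWC_RESET_REQUEST.
 */
static void example_fill_caps(struct gdma_verify_ver_req *req)
{
	req->gd_drv_cap_flags1 = GDMA_DRV_CAP_FLAGS1;	/* now includes BIT(14) */
	req->gd_drv_cap_flags2 = GDMA_DRV_CAP_FLAGS2;
}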
