Skip to content

Commit 65ba18c

Browse files
Ranjan Kumar authored and gregkh committed
scsi: mpi3mr: Synchronous access b/w reset and tm thread for reply queue
[ Upstream commit f195fc0 ]

When the task management thread processes reply queues while the reset thread resets them, the task management thread accesses an invalid queue ID (0xFFFF), set by the reset thread, which points to unallocated memory, causing a crash.

Add flag 'io_admin_reset_sync' to synchronize access between the reset, I/O, and admin threads. Before a reset, the reset handler sets this flag to block I/O and admin processing threads. If any thread bypasses the initial check, the reset thread waits up to 10 seconds for processing to finish. If the wait exceeds 10 seconds, the controller is marked as unrecoverable.

Signed-off-by: Sumit Saxena <sumit.saxena@broadcom.com>
Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
Link: https://lore.kernel.org/r/20250129100850.25430-4-ranjan.kumar@broadcom.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent 6a35449 commit 65ba18c

File tree

2 files changed

+66
-3
lines changed

2 files changed

+66
-3
lines changed

drivers/scsi/mpi3mr/mpi3mr.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1097,6 +1097,7 @@ struct scmd_priv {
10971097
* @ts_update_interval: Timestamp update interval
10981098
* @reset_in_progress: Reset in progress flag
10991099
* @unrecoverable: Controller unrecoverable flag
1100+
* @io_admin_reset_sync: Flag set by the reset handler to block I/O and admin reply queue processing while a reset is in progress
11001101
* @prev_reset_result: Result of previous reset
11011102
* @reset_mutex: Controller reset mutex
11021103
* @reset_waitq: Controller reset wait queue
@@ -1285,6 +1286,7 @@ struct mpi3mr_ioc {
12851286
u16 ts_update_interval;
12861287
u8 reset_in_progress;
12871288
u8 unrecoverable;
1289+
u8 io_admin_reset_sync;
12881290
int prev_reset_result;
12891291
struct mutex reset_mutex;
12901292
wait_queue_head_t reset_waitq;

drivers/scsi/mpi3mr/mpi3mr_fw.c

Lines changed: 64 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ static void mpi3mr_process_factsdata(struct mpi3mr_ioc *mrioc,
1717
struct mpi3_ioc_facts_data *facts_data);
1818
static void mpi3mr_pel_wait_complete(struct mpi3mr_ioc *mrioc,
1919
struct mpi3mr_drv_cmd *drv_cmd);
20-
20+
static int mpi3mr_check_op_admin_proc(struct mpi3mr_ioc *mrioc);
2121
static int poll_queues;
2222
module_param(poll_queues, int, 0444);
2323
MODULE_PARM_DESC(poll_queues, "Number of queues for io_uring poll mode. (Range 1 - 126)");
@@ -459,7 +459,7 @@ int mpi3mr_process_admin_reply_q(struct mpi3mr_ioc *mrioc)
459459
}
460460

461461
do {
462-
if (mrioc->unrecoverable)
462+
if (mrioc->unrecoverable || mrioc->io_admin_reset_sync)
463463
break;
464464

465465
mrioc->admin_req_ci = le16_to_cpu(reply_desc->request_queue_ci);
@@ -554,7 +554,7 @@ int mpi3mr_process_op_reply_q(struct mpi3mr_ioc *mrioc,
554554
}
555555

556556
do {
557-
if (mrioc->unrecoverable)
557+
if (mrioc->unrecoverable || mrioc->io_admin_reset_sync)
558558
break;
559559

560560
req_q_idx = le16_to_cpu(reply_desc->request_queue_id) - 1;
@@ -4394,6 +4394,7 @@ int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume)
43944394
goto out_failed_noretry;
43954395
}
43964396

4397+
mrioc->io_admin_reset_sync = 0;
43974398
if (is_resume || mrioc->block_on_pci_err) {
43984399
dprint_reset(mrioc, "setting up single ISR\n");
43994400
retval = mpi3mr_setup_isr(mrioc, 1);
@@ -5252,6 +5253,55 @@ void mpi3mr_pel_get_seqnum_complete(struct mpi3mr_ioc *mrioc,
52525253
drv_cmd->retry_count = 0;
52535254
}
52545255

5256+
/**
5257+
* mpi3mr_check_op_admin_proc -
5258+
* @mrioc: Adapter instance reference
5259+
*
5260+
* Check if any of the operation reply queues
5261+
* or the admin reply queue are currently in use.
5262+
* If any queue is in use, this function waits for
5263+
* a maximum of 10 seconds for them to become available.
5264+
*
5265+
* Return: 0 on success, non-zero on failure.
5266+
*/
5267+
static int mpi3mr_check_op_admin_proc(struct mpi3mr_ioc *mrioc)
5268+
{
5269+
5270+
u16 timeout = 10 * 10;
5271+
u16 elapsed_time = 0;
5272+
bool op_admin_in_use = false;
5273+
5274+
do {
5275+
op_admin_in_use = false;
5276+
5277+
/* Check admin_reply queue first to exit early */
5278+
if (atomic_read(&mrioc->admin_reply_q_in_use) == 1)
5279+
op_admin_in_use = true;
5280+
else {
5281+
/* Check op_reply queues */
5282+
int i;
5283+
5284+
for (i = 0; i < mrioc->num_queues; i++) {
5285+
if (atomic_read(&mrioc->op_reply_qinfo[i].in_use) == 1) {
5286+
op_admin_in_use = true;
5287+
break;
5288+
}
5289+
}
5290+
}
5291+
5292+
if (!op_admin_in_use)
5293+
break;
5294+
5295+
msleep(100);
5296+
5297+
} while (++elapsed_time < timeout);
5298+
5299+
if (op_admin_in_use)
5300+
return 1;
5301+
5302+
return 0;
5303+
}
5304+
52555305
/**
52565306
* mpi3mr_soft_reset_handler - Reset the controller
52575307
* @mrioc: Adapter instance reference
@@ -5332,6 +5382,7 @@ int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc,
53325382
mpi3mr_wait_for_host_io(mrioc, MPI3MR_RESET_HOST_IOWAIT_TIMEOUT);
53335383

53345384
mpi3mr_ioc_disable_intr(mrioc);
5385+
mrioc->io_admin_reset_sync = 1;
53355386

53365387
if (snapdump) {
53375388
mpi3mr_set_diagsave(mrioc);
@@ -5359,6 +5410,16 @@ int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc,
53595410
ioc_err(mrioc, "Failed to issue soft reset to the ioc\n");
53605411
goto out;
53615412
}
5413+
5414+
retval = mpi3mr_check_op_admin_proc(mrioc);
5415+
if (retval) {
5416+
ioc_err(mrioc, "Soft reset failed due to an Admin or I/O queue polling\n"
5417+
"thread still processing replies even after a 10 second\n"
5418+
"timeout. Marking the controller as unrecoverable!\n");
5419+
5420+
goto out;
5421+
}
5422+
53625423
if (mrioc->num_io_throttle_group !=
53635424
mrioc->facts.max_io_throttle_group) {
53645425
ioc_err(mrioc,

0 commit comments

Comments
 (0)