habanalabs: reset device in case of sync error
As the F/wW is the first to detect out of sync event, a new event is
added to notify the driver on such event. In which case the driver
performs hard reset.
Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 9152242..c1f0023 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7097,6 +7097,15 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
}
}
+static void gaudi_print_out_of_sync_info(struct hl_device *hdev,
+ struct cpucp_pkt_sync_err *sync_err)
+{
+ struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI_QUEUE_ID_CPU_PQ];
+
+ dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n",
+ sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci));
+}
+
static int gaudi_soft_reset_late_init(struct hl_device *hdev)
{
struct gaudi_device *gaudi = hdev->asic_specific;
@@ -7552,6 +7561,15 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
event_type, cause);
break;
+ case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC:
+ gaudi_print_irq_info(hdev, event_type, false);
+ gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
+ if (hdev->hard_reset_on_fw_events)
+ hl_device_reset(hdev, true, false);
+ else
+ hl_fw_unmask_irq(hdev, event_type);
+ break;
+
default:
dev_err(hdev->dev, "Received invalid H/W interrupt %d\n",
event_type);