msm: kgsl: Recover from GMU fault
A GMU fault caused by GMU firmware bugs or memory corruption can be
recovered from. However, if the firmware is stuck waiting for an RPMh
response, GMU recovery cannot work. This change improves GMU stability.
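
Roughly, the recovery flow added here is (a sketch of the paths this
patch touches; the dispatcher plumbing is pre-existing driver code):

    /* On a GMU fault (e.g. a failed DCVS request), flag the fault
     * and let the dispatcher drive recovery instead of snapshotting
     * inline:
     */
    adreno_set_gpu_fault(adreno_dev, ADRENO_GMU_FAULT);
    adreno_dispatcher_schedule(device);

    /* Recovery then reboots the GMU from the RESET state:
     * gmu_start() sees GMU_FAULT set, fully suspends the GMU via
     * gmu_suspend(), and cold boots the firmware again.
     */
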
CRs-Fixed: 2085877
Change-Id: Ifad1f190c30a13cebf931be3756712a4fbc8355c
Signed-off-by: George Shen <sqiao@codeaurora.org>
diff --git a/drivers/gpu/msm/adreno_a6xx.c b/drivers/gpu/msm/adreno_a6xx.c
index 5f070bc..30e5c66 100644
--- a/drivers/gpu/msm/adreno_a6xx.c
+++ b/drivers/gpu/msm/adreno_a6xx.c
@@ -1943,9 +1943,7 @@ static int a6xx_gmu_suspend(struct kgsl_device *device)
/* If SPTP_RAC is on, turn off SPTP_RAC HS */
a6xx_sptprac_disable(adreno_dev);
- /* Disconnect GPU from BUS. Clear and reconnected after reset */
- adreno_vbif_clear_pending_transactions(device);
- /* Unnecessary: a6xx_soft_reset(adreno_dev); */
+ /* No need to disconnect GPU from bus if CX GDSC goes off later */
/* Check no outstanding RPMh voting */
a6xx_complete_rpmh_votes(device);
diff --git a/drivers/gpu/msm/kgsl_gmu.c b/drivers/gpu/msm/kgsl_gmu.c
index 2ddb082..0c49f27 100644
--- a/drivers/gpu/msm/kgsl_gmu.c
+++ b/drivers/gpu/msm/kgsl_gmu.c
@@ -456,11 +456,12 @@ int gmu_dcvs_set(struct gmu_device *gmu,
GMU_DCVS_NOHFI, perf_idx, bw_idx);
if (ret) {
- dev_err(&gmu->pdev->dev,
+ dev_err_ratelimited(&gmu->pdev->dev,
"Failed to set GPU perf idx %d, bw idx %d\n",
perf_idx, bw_idx);
- gmu_snapshot(device);
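+ /* Defer recovery to the dispatcher instead of snapshotting inline */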
+ adreno_set_gpu_fault(adreno_dev, ADRENO_GMU_FAULT);
+ adreno_dispatcher_schedule(device);
}
return ret;
@@ -1341,6 +1342,7 @@ static int gmu_suspend(struct kgsl_device *device)
gmu_disable_clks(gmu);
gmu_disable_gdsc(gmu);
+ dev_err(&gmu->pdev->dev, "Suspended GMU\n");
return 0;
}
@@ -1349,7 +1351,7 @@ static void gmu_snapshot(struct kgsl_device *device)
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
struct gmu_device *gmu = &device->gmu;
- if (!test_and_set_bit(GMU_FAULT, &gmu->flags)) {
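+ /* Only the first GMU fault takes the NMI-based snapshot path */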
+ if (!gmu->fault_count) {
/* Mask so there's no interrupt caused by NMI */
adreno_write_gmureg(adreno_dev,
ADRENO_REG_GMU_GMU2HOST_INTR_MASK, 0xFFFFFFFF);
@@ -1372,6 +1374,8 @@ static void gmu_snapshot(struct kgsl_device *device)
ADRENO_REG_GMU_GMU2HOST_INTR_MASK,
(unsigned int) ~HFI_IRQ_MASK);
}
+
+ gmu->fault_count++;
}
/* To be called to power on both GPU and GMU */
@@ -1389,32 +1393,29 @@ int gmu_start(struct kgsl_device *device)
WARN_ON(test_bit(GMU_CLK_ON, &gmu->flags));
gmu_enable_gdsc(gmu);
gmu_enable_clks(gmu);
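+ /* Enable GMU interrupts before FW start so boot-time faults are caught */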
+ gmu_irq_enable(device);
/* Vote for 300MHz DDR for GMU to init */
ret = msm_bus_scale_client_update_request(gmu->pcl,
pwr->pwrlevels[pwr->default_pwrlevel].bus_freq);
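+ /* A failed bus vote is logged but no longer aborts GMU boot */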
- if (ret) {
+ if (ret)
dev_err(&gmu->pdev->dev,
- "Failed to allocate gmu b/w\n");
- goto error_clks;
- }
+ "Failed to allocate gmu b/w: %d\n", ret);
ret = gpudev->rpmh_gpu_pwrctrl(adreno_dev, GMU_FW_START,
GMU_COLD_BOOT, 0);
if (ret)
- goto error_bus;
-
- gmu_irq_enable(device);
+ goto error_gmu;
ret = hfi_start(gmu, GMU_COLD_BOOT);
if (ret)
- goto error_gpu;
+ goto error_gmu;
/* Send default DCVS level */
ret = gmu_dcvs_set(gmu, pwr->default_pwrlevel,
pwr->pwrlevels[pwr->default_pwrlevel].bus_freq);
if (ret)
- goto error_gpu;
+ goto error_gmu;
msm_bus_scale_client_update_request(gmu->pcl, 0);
break;
@@ -1423,49 +1424,49 @@ int gmu_start(struct kgsl_device *device)
WARN_ON(test_bit(GMU_CLK_ON, &gmu->flags));
gmu_enable_gdsc(gmu);
gmu_enable_clks(gmu);
+ gmu_irq_enable(device);
ret = gpudev->rpmh_gpu_pwrctrl(adreno_dev, GMU_FW_START,
GMU_WARM_BOOT, 0);
if (ret)
- goto error_clks;
-
- gmu_irq_enable(device);
+ goto error_gmu;
ret = hfi_start(gmu, GMU_WARM_BOOT);
if (ret)
- goto error_gpu;
+ goto error_gmu;
ret = gmu_dcvs_set(gmu, gmu->wakeup_pwrlevel,
pwr->pwrlevels[gmu->wakeup_pwrlevel].bus_freq);
if (ret)
- goto error_gpu;
+ goto error_gmu;
gmu->wakeup_pwrlevel = pwr->default_pwrlevel;
break;
case KGSL_STATE_RESET:
- if (test_bit(ADRENO_DEVICE_HARD_RESET, &adreno_dev->priv)) {
+ if (test_bit(ADRENO_DEVICE_HARD_RESET, &adreno_dev->priv) ||
+ test_bit(GMU_FAULT, &gmu->flags)) {
gmu_suspend(device);
gmu_enable_gdsc(gmu);
gmu_enable_clks(gmu);
+ gmu_irq_enable(device);
ret = gpudev->rpmh_gpu_pwrctrl(
adreno_dev, GMU_FW_START, GMU_RESET, 0);
if (ret)
- goto error_clks;
+ goto error_gmu;
- gmu_irq_enable(device);
ret = hfi_start(gmu, GMU_COLD_BOOT);
if (ret)
- goto error_gpu;
+ goto error_gmu;
/* Send DCVS level prior to reset*/
ret = gmu_dcvs_set(gmu, pwr->active_pwrlevel,
pwr->pwrlevels[pwr->active_pwrlevel]
.bus_freq);
if (ret)
- goto error_gpu;
+ goto error_gmu;
ret = gpudev->oob_set(adreno_dev,
OOB_CPINIT_SET_MASK,
@@ -1485,20 +1486,8 @@ int gmu_start(struct kgsl_device *device)
return ret;
-error_gpu:
+error_gmu:
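+ /* Snapshot only; teardown happens via gmu_suspend() on the recovery path */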
gmu_snapshot(device);
- hfi_stop(gmu);
- gmu_irq_disable(device);
- if (ADRENO_QUIRK(adreno_dev, ADRENO_QUIRK_HFI_USE_REG))
- gpudev->oob_clear(adreno_dev,
- OOB_BOOT_SLUMBER_CLEAR_MASK);
- gpudev->rpmh_gpu_pwrctrl(adreno_dev, GMU_FW_STOP, 0, 0);
-error_bus:
- msm_bus_scale_client_update_request(gmu->pcl, 0);
-error_clks:
- gmu_snapshot(device);
- gmu_disable_clks(gmu);
- gmu_disable_gdsc(gmu);
return ret;
}
@@ -1535,18 +1524,30 @@ void gmu_stop(struct kgsl_device *device)
if (!idle || (gpudev->wait_for_gmu_idle &&
gpudev->wait_for_gmu_idle(adreno_dev))) {
dev_err(&gmu->pdev->dev, "Stopping GMU before it is idle\n");
+ idle = false;
+ } else {
+ idle = true;
}
- /* Pending message in all queues are abandoned */
- hfi_stop(gmu);
- clear_bit(GMU_HFI_ON, &gmu->flags);
- gmu_irq_disable(device);
+ if (idle) {
+ /* Pending messages in all queues are abandoned */
+ hfi_stop(gmu);
+ clear_bit(GMU_HFI_ON, &gmu->flags);
+ gmu_irq_disable(device);
- gpudev->rpmh_gpu_pwrctrl(adreno_dev, GMU_FW_STOP, 0, 0);
- gmu_disable_clks(gmu);
- gmu_disable_gdsc(gmu);
-
- /* TODO: Vote CX, MX retention off */
+ gpudev->rpmh_gpu_pwrctrl(adreno_dev, GMU_FW_STOP, 0, 0);
+ gmu_disable_clks(gmu);
+ gmu_disable_gdsc(gmu);
+ } else {
+ /*
+ * The power controller will change state to SLUMBER anyway.
+ * Set the GMU_FAULT flag to indicate to the power controller
+ * that hang recovery is needed to power on the GPU.
+ */
+ set_bit(GMU_FAULT, &gmu->flags);
+ gmu_snapshot(device);
+ }
msm_bus_scale_client_update_request(gmu->pcl, 0);
}
diff --git a/drivers/gpu/msm/kgsl_gmu.h b/drivers/gpu/msm/kgsl_gmu.h
index 63ca028..ff65f66 100644
--- a/drivers/gpu/msm/kgsl_gmu.h
+++ b/drivers/gpu/msm/kgsl_gmu.h
@@ -193,6 +193,7 @@ enum gpu_idle_level {
* @pcl: GPU BW scaling client
* @ccl: CNOC BW scaling client
* @idle_level: Minimal GPU idle power level
+ * @fault_count: GMU fault count
*/
struct gmu_device {
unsigned int ver;
@@ -226,6 +227,7 @@ struct gmu_device {
unsigned int pcl;
unsigned int ccl;
unsigned int idle_level;
+ unsigned int fault_count;
};
bool kgsl_gmu_isenabled(struct kgsl_device *device);
diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c
index 6e55395..007121c 100644
--- a/drivers/gpu/msm/kgsl_pwrctrl.c
+++ b/drivers/gpu/msm/kgsl_pwrctrl.c
@@ -2629,6 +2629,7 @@ static int
_aware(struct kgsl_device *device)
{
int status = 0;
+ struct gmu_device *gmu = &device->gmu;
switch (device->state) {
case KGSL_STATE_RESET:
@@ -2648,15 +2649,42 @@ _aware(struct kgsl_device *device)
kgsl_pwrscale_midframe_timer_cancel(device);
break;
case KGSL_STATE_SLUMBER:
+ /* If the GMU is already in FAULT, fail here and recover below */
+ if (kgsl_gmu_isenabled(device) &&
+ test_bit(GMU_FAULT, &gmu->flags)) {
+ status = -EINVAL;
+ break;
+ }
+
status = kgsl_pwrctrl_enable(device);
break;
default:
status = -EINVAL;
}
- if (status)
+
+ if (status) {
+ if (kgsl_gmu_isenabled(device)) {
+ /* GMU hang recovery: reboot the GMU through a RESET transition */
+ kgsl_pwrctrl_set_state(device, KGSL_STATE_RESET);
+ set_bit(GMU_FAULT, &gmu->flags);
+ status = kgsl_pwrctrl_enable(device);
+ if (status) {
+ /*
+ * Cannot recover from the GMU failure;
+ * the GPU will not be powered on.
+ */
+ WARN_ONCE(1, "Failed to recover GMU\n");
+ }
+
+ clear_bit(GMU_FAULT, &gmu->flags);
+ kgsl_pwrctrl_set_state(device, KGSL_STATE_AWARE);
+ return status;
+ }
+
kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
- else
+ } else {
kgsl_pwrctrl_set_state(device, KGSL_STATE_AWARE);
+ }
return status;
}