blob: 5871162a84425ef895852296a1b7b44b02ef166b [file] [log] [blame]
Oded Gabbayc4d66342019-02-16 00:39:11 +02001// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * Copyright 2016-2019 HabanaLabs, Ltd.
5 * All Rights Reserved.
6 */
7
Tomer Tayare00dac32019-04-10 15:18:46 +03008#define pr_fmt(fmt) "habanalabs: " fmt
9
Oded Gabbayc4d66342019-02-16 00:39:11 +020010#include "habanalabs.h"
11
12#include <linux/pci.h>
Oded Gabbayd91389b2019-02-16 00:39:19 +020013#include <linux/hwmon.h>
Dalit Ben Zooraa957082019-03-24 10:15:44 +020014#include <uapi/misc/habanalabs.h>
Oded Gabbayc4d66342019-02-16 00:39:11 +020015
Dalit Ben Zooraa957082019-03-24 10:15:44 +020016enum hl_device_status hl_device_status(struct hl_device *hdev)
17{
18 enum hl_device_status status;
19
20 if (hdev->disabled)
21 status = HL_DEVICE_STATUS_MALFUNCTION;
22 else if (atomic_read(&hdev->in_reset))
23 status = HL_DEVICE_STATUS_IN_RESET;
Ofir Bitton66a76402020-10-05 14:40:10 +030024 else if (hdev->needs_reset)
25 status = HL_DEVICE_STATUS_NEEDS_RESET;
Dalit Ben Zooraa957082019-03-24 10:15:44 +020026 else
27 status = HL_DEVICE_STATUS_OPERATIONAL;
28
29 return status;
Oded Gabbay7491c032020-01-07 23:44:32 +020030}
Dalit Ben Zooraa957082019-03-24 10:15:44 +020031
Ofir Bitton66a76402020-10-05 14:40:10 +030032bool hl_device_operational(struct hl_device *hdev,
33 enum hl_device_status *status)
34{
35 enum hl_device_status current_status;
36
37 current_status = hl_device_status(hdev);
38 if (status)
39 *status = current_status;
40
41 switch (current_status) {
42 case HL_DEVICE_STATUS_IN_RESET:
43 case HL_DEVICE_STATUS_MALFUNCTION:
44 case HL_DEVICE_STATUS_NEEDS_RESET:
45 return false;
46 case HL_DEVICE_STATUS_OPERATIONAL:
47 default:
48 return true;
49 }
50}
51
/*
 * hpriv_release - kref release callback for a file-private structure.
 *
 * @ref: embedded refcount of the hl_fpriv being released
 *
 * Runs when the last reference to the per-process private data is dropped.
 * Releases the pid reference, removes the debugfs entry, unlinks the entry
 * from the device's fpriv list under the list lock and finally frees it.
 */
static void hpriv_release(struct kref *ref)
{
	struct hl_fpriv *hpriv;
	struct hl_device *hdev;

	hpriv = container_of(ref, struct hl_fpriv, refcount);

	hdev = hpriv->hdev;

	put_pid(hpriv->taskpid);

	hl_debugfs_remove_file(hpriv);

	mutex_destroy(&hpriv->restore_phase_mutex);

	/* Must hold the list lock while unlinking - other threads walk
	 * hdev->fpriv_list (e.g. device_kill_open_processes())
	 */
	mutex_lock(&hdev->fpriv_list_lock);
	list_del(&hpriv->dev_node);
	/* The owning process is gone, so no compute context is active */
	hdev->compute_ctx = NULL;
	mutex_unlock(&hdev->fpriv_list_lock);

	kfree(hpriv);
}
74
/* Take an additional reference on a file-private structure */
void hl_hpriv_get(struct hl_fpriv *hpriv)
{
	kref_get(&hpriv->refcount);
}
79
/* Drop a reference on a file-private structure; frees it via
 * hpriv_release() when the count reaches zero
 */
void hl_hpriv_put(struct hl_fpriv *hpriv)
{
	kref_put(&hpriv->refcount, hpriv_release);
}
84
/*
 * hl_device_release - release function for habanalabs device
 *
 * @inode: pointer to inode structure
 * @filp: pointer to file structure
 *
 * Called when a process closes the main habanalabs device node. Tears down
 * the process' command-buffer and context managers, then drops the
 * file-private reference; the structure itself is freed in hpriv_release()
 * once the refcount hits zero.
 */
static int hl_device_release(struct inode *inode, struct file *filp)
{
	struct hl_fpriv *hpriv = filp->private_data;

	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);

	filp->private_data = NULL;

	/* May free hpriv if this was the last reference */
	hl_hpriv_put(hpriv);

	return 0;
}
106
Oded Gabbay4d6a7752019-07-30 09:10:50 +0300107static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
108{
109 struct hl_fpriv *hpriv = filp->private_data;
110 struct hl_device *hdev;
111
112 filp->private_data = NULL;
113
114 hdev = hpriv->hdev;
115
116 mutex_lock(&hdev->fpriv_list_lock);
117 list_del(&hpriv->dev_node);
118 mutex_unlock(&hdev->fpriv_list_lock);
119
120 kfree(hpriv);
121
122 return 0;
123}
124
Oded Gabbaybe5d9262019-02-16 00:39:15 +0200125/*
126 * hl_mmap - mmap function for habanalabs device
127 *
128 * @*filp: pointer to file structure
129 * @*vma: pointer to vm_area_struct of the process
130 *
131 * Called when process does an mmap on habanalabs device. Call the device's mmap
132 * function at the end of the common code.
133 */
134static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
135{
136 struct hl_fpriv *hpriv = filp->private_data;
Oded Gabbay3174ac92020-08-29 11:51:39 +0300137 unsigned long vm_pgoff;
Oded Gabbaybe5d9262019-02-16 00:39:15 +0200138
Oded Gabbay3174ac92020-08-29 11:51:39 +0300139 vm_pgoff = vma->vm_pgoff;
140 vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
141
142 switch (vm_pgoff & HL_MMAP_TYPE_MASK) {
143 case HL_MMAP_TYPE_CB:
Oded Gabbaybe5d9262019-02-16 00:39:15 +0200144 return hl_cb_mmap(hpriv, vma);
145 }
146
Oded Gabbay5e6e0232019-02-27 12:15:16 +0200147 return -EINVAL;
Oded Gabbaybe5d9262019-02-16 00:39:15 +0200148}
149
/* File operations installed on the main (compute) device char node */
static const struct file_operations hl_ops = {
	.owner = THIS_MODULE,
	.open = hl_device_open,
	.release = hl_device_release,
	.mmap = hl_mmap,
	.unlocked_ioctl = hl_ioctl,
	.compat_ioctl = hl_ioctl
};
158
/* File operations installed on the control device char node - no mmap,
 * and a restricted ioctl set
 */
static const struct file_operations hl_ctrl_ops = {
	.owner = THIS_MODULE,
	.open = hl_device_open_ctrl,
	.release = hl_device_release_ctrl,
	.unlocked_ioctl = hl_ioctl_control,
	.compat_ioctl = hl_ioctl_control
};
166
/* Release callback for the struct device objects allocated in
 * device_init_cdev(); invoked by the driver core on final put_device()
 */
static void device_release_func(struct device *dev)
{
	kfree(dev);
}
171
/*
 * device_init_cdev - Initialize cdev and device for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 * @hclass: pointer to the class object of the device
 * @minor: minor number of the specific device
 * @fops: file operations to install for this device
 * @name: name of the device as it will appear in the filesystem
 * @cdev: pointer to the char device object that will be initialized
 * @dev: pointer to the device object that will be initialized
 *
 * Initialize a cdev and a Linux device for habanalabs's device. The device
 * object is allocated here and owned by the driver core afterwards; it is
 * freed by device_release_func() on the final put_device().
 *
 * Return: 0 on success, -ENOMEM if the device object allocation fails.
 */
static int device_init_cdev(struct hl_device *hdev, struct class *hclass,
				int minor, const struct file_operations *fops,
				char *name, struct cdev *cdev,
				struct device **dev)
{
	cdev_init(cdev, fops);
	cdev->owner = THIS_MODULE;

	*dev = kzalloc(sizeof(**dev), GFP_KERNEL);
	if (!*dev)
		return -ENOMEM;

	device_initialize(*dev);
	(*dev)->devt = MKDEV(hdev->major, minor);
	(*dev)->class = hclass;
	(*dev)->release = device_release_func;
	dev_set_drvdata(*dev, hdev);
	dev_set_name(*dev, "%s", name);

	return 0;
}
206
/*
 * device_cdev_sysfs_add - register the char devices and sysfs nodes.
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Adds the compute and control char devices to the system and then creates
 * the sysfs entries. On any failure, previously added devices are removed
 * in reverse order.
 *
 * Return: 0 on success, negative errno otherwise.
 */
static int device_cdev_sysfs_add(struct hl_device *hdev)
{
	int rc;

	rc = cdev_device_add(&hdev->cdev, hdev->dev);
	if (rc) {
		dev_err(hdev->dev,
			"failed to add a char device to the system\n");
		return rc;
	}

	rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl);
	if (rc) {
		dev_err(hdev->dev,
			"failed to add a control char device to the system\n");
		goto delete_cdev_device;
	}

	/* hl_sysfs_init() must be done after adding the device to the system */
	rc = hl_sysfs_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize sysfs\n");
		goto delete_ctrl_cdev_device;
	}

	/* Remembered so device_cdev_sysfs_del() knows what to undo */
	hdev->cdev_sysfs_created = true;

	return 0;

delete_ctrl_cdev_device:
	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
delete_cdev_device:
	cdev_device_del(&hdev->cdev, hdev->dev);
	return rc;
}
242
/*
 * device_cdev_sysfs_del - undo device_cdev_sysfs_add().
 *
 * @hdev: pointer to habanalabs device structure
 *
 * The device objects were allocated in device_init_cdev() and must be
 * put even if the cdev/sysfs registration never completed, hence the
 * unconditional put_device() calls at the end.
 */
static void device_cdev_sysfs_del(struct hl_device *hdev)
{
	if (!hdev->cdev_sysfs_created)
		goto put_devices;

	hl_sysfs_fini(hdev);
	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
	cdev_device_del(&hdev->cdev, hdev->dev);

put_devices:
	put_device(hdev->dev);
	put_device(hdev->dev_ctrl);
}
256
/*
 * device_hard_reset_pending - delayed-work handler that performs a hard reset.
 *
 * @work: embedded delayed work of the hl_device_reset_work structure
 *
 * If the reset cannot be performed yet (-EBUSY, e.g. user processes still
 * hold the device) and the driver is not being removed, the work re-queues
 * itself to retry after HL_PENDING_RESET_PER_SEC seconds.
 */
static void device_hard_reset_pending(struct work_struct *work)
{
	struct hl_device_reset_work *device_reset_work =
		container_of(work, struct hl_device_reset_work,
				reset_work.work);
	struct hl_device *hdev = device_reset_work->hdev;
	int rc;

	rc = hl_device_reset(hdev, true, true);
	if ((rc == -EBUSY) && !hdev->device_fini_pending) {
		dev_info(hdev->dev,
			"Could not reset device. will try again in %u seconds",
			HL_PENDING_RESET_PER_SEC);

		queue_delayed_work(device_reset_work->wq,
			&device_reset_work->reset_work,
			msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000));
	}
}
276
Oded Gabbayc4d66342019-02-16 00:39:11 +0200277/*
278 * device_early_init - do some early initialization for the habanalabs device
279 *
280 * @hdev: pointer to habanalabs device structure
281 *
282 * Install the relevant function pointers and call the early_init function,
283 * if such a function exists
284 */
285static int device_early_init(struct hl_device *hdev)
286{
Ofir Bitton5574cb22020-07-05 13:35:51 +0300287 int i, rc;
288 char workq_name[32];
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200289
Oded Gabbayc4d66342019-02-16 00:39:11 +0200290 switch (hdev->asic_type) {
291 case ASIC_GOYA:
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200292 goya_set_asic_funcs(hdev);
Oded Gabbayc4d66342019-02-16 00:39:11 +0200293 strlcpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
294 break;
Oded Gabbayaf57cb82020-05-11 10:47:05 +0300295 case ASIC_GAUDI:
296 gaudi_set_asic_funcs(hdev);
297 sprintf(hdev->asic_name, "GAUDI");
298 break;
Oded Gabbayc4d66342019-02-16 00:39:11 +0200299 default:
300 dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
301 hdev->asic_type);
302 return -EINVAL;
303 }
304
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200305 rc = hdev->asic_funcs->early_init(hdev);
306 if (rc)
307 return rc;
308
Oded Gabbay0861e412019-02-16 00:39:14 +0200309 rc = hl_asid_init(hdev);
310 if (rc)
311 goto early_fini;
312
Ofir Bitton5574cb22020-07-05 13:35:51 +0300313 if (hdev->asic_prop.completion_queues_count) {
314 hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count,
315 sizeof(*hdev->cq_wq),
316 GFP_ATOMIC);
317 if (!hdev->cq_wq) {
318 rc = -ENOMEM;
319 goto asid_fini;
320 }
321 }
322
323 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
Oded Gabbayf907af12020-08-12 10:15:27 +0300324 snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i);
Ofir Bitton5574cb22020-07-05 13:35:51 +0300325 hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
Colin Ian King804d0572020-07-30 09:20:22 +0100326 if (hdev->cq_wq[i] == NULL) {
Ofir Bitton5574cb22020-07-05 13:35:51 +0300327 dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
328 rc = -ENOMEM;
329 goto free_cq_wq;
330 }
Oded Gabbay9494a8d2019-02-16 00:39:17 +0200331 }
332
Oded Gabbay1251f232019-02-16 00:39:18 +0200333 hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
334 if (hdev->eq_wq == NULL) {
335 dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
336 rc = -ENOMEM;
337 goto free_cq_wq;
338 }
339
Oded Gabbayd91389b2019-02-16 00:39:19 +0200340 hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
341 GFP_KERNEL);
342 if (!hdev->hl_chip_info) {
343 rc = -ENOMEM;
344 goto free_eq_wq;
345 }
346
Oded Gabbay75b3cb22019-08-28 17:32:04 +0300347 hdev->idle_busy_ts_arr = kmalloc_array(HL_IDLE_BUSY_TS_ARR_SIZE,
348 sizeof(struct hl_device_idle_busy_ts),
349 (GFP_KERNEL | __GFP_ZERO));
350 if (!hdev->idle_busy_ts_arr) {
351 rc = -ENOMEM;
352 goto free_chip_info;
353 }
354
Moti Haimovskid83fe662020-08-12 13:33:44 +0300355 rc = hl_mmu_if_set_funcs(hdev);
356 if (rc)
357 goto free_idle_busy_ts_arr;
358
Oded Gabbaybe5d9262019-02-16 00:39:15 +0200359 hl_cb_mgr_init(&hdev->kernel_cb_mgr);
360
Ofir Bittonadb51292020-10-08 10:27:42 +0300361 hdev->device_reset_work.wq =
362 create_singlethread_workqueue("hl_device_reset");
363 if (!hdev->device_reset_work.wq) {
364 rc = -ENOMEM;
365 dev_err(hdev->dev, "Failed to create device reset WQ\n");
366 goto free_cb_mgr;
367 }
368
369 INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work,
370 device_hard_reset_pending);
371 hdev->device_reset_work.hdev = hdev;
372 hdev->device_fini_pending = 0;
373
Oded Gabbay9494a8d2019-02-16 00:39:17 +0200374 mutex_init(&hdev->send_cpu_message_lock);
Oded Gabbay19734972019-05-04 17:36:06 +0300375 mutex_init(&hdev->debug_lock);
Tomer Tayar8d45f1de2019-05-13 12:13:39 +0300376 mutex_init(&hdev->mmu_cache_lock);
Tomer Tayar804a72272020-10-30 11:16:23 +0200377 INIT_LIST_HEAD(&hdev->cs_mirror_list);
378 spin_lock_init(&hdev->cs_mirror_lock);
Oded Gabbayeb7caf82019-07-30 11:56:09 +0300379 INIT_LIST_HEAD(&hdev->fpriv_list);
380 mutex_init(&hdev->fpriv_list_lock);
Oded Gabbayf8c8c7d52019-02-16 00:39:20 +0200381 atomic_set(&hdev->in_reset, 0);
Oded Gabbay0861e412019-02-16 00:39:14 +0200382
Oded Gabbayc4d66342019-02-16 00:39:11 +0200383 return 0;
Oded Gabbay0861e412019-02-16 00:39:14 +0200384
Ofir Bittonadb51292020-10-08 10:27:42 +0300385free_cb_mgr:
386 hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
Moti Haimovskid83fe662020-08-12 13:33:44 +0300387free_idle_busy_ts_arr:
388 kfree(hdev->idle_busy_ts_arr);
Oded Gabbay75b3cb22019-08-28 17:32:04 +0300389free_chip_info:
390 kfree(hdev->hl_chip_info);
Oded Gabbayd91389b2019-02-16 00:39:19 +0200391free_eq_wq:
392 destroy_workqueue(hdev->eq_wq);
Oded Gabbay1251f232019-02-16 00:39:18 +0200393free_cq_wq:
Ofir Bitton5574cb22020-07-05 13:35:51 +0300394 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
395 if (hdev->cq_wq[i])
396 destroy_workqueue(hdev->cq_wq[i]);
397 kfree(hdev->cq_wq);
Oded Gabbay9494a8d2019-02-16 00:39:17 +0200398asid_fini:
399 hl_asid_fini(hdev);
Oded Gabbay0861e412019-02-16 00:39:14 +0200400early_fini:
401 if (hdev->asic_funcs->early_fini)
402 hdev->asic_funcs->early_fini(hdev);
403
404 return rc;
Oded Gabbayc4d66342019-02-16 00:39:11 +0200405}
406
/*
 * device_early_fini - finalize all that was done in device_early_init
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Teardown happens roughly in reverse order of initialization: locks are
 * destroyed first, then the CB manager, the allocated arrays, the
 * workqueues, the ASID pool, and finally the ASIC-specific early_fini.
 */
static void device_early_fini(struct hl_device *hdev)
{
	int i;

	mutex_destroy(&hdev->mmu_cache_lock);
	mutex_destroy(&hdev->debug_lock);
	mutex_destroy(&hdev->send_cpu_message_lock);

	mutex_destroy(&hdev->fpriv_list_lock);

	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);

	kfree(hdev->idle_busy_ts_arr);
	kfree(hdev->hl_chip_info);

	destroy_workqueue(hdev->eq_wq);
	destroy_workqueue(hdev->device_reset_work.wq);

	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		destroy_workqueue(hdev->cq_wq[i]);
	kfree(hdev->cq_wq);

	hl_asid_fini(hdev);

	if (hdev->asic_funcs->early_fini)
		hdev->asic_funcs->early_fini(hdev);
}
440
/*
 * set_freq_to_low_job - periodic work that drops the PLL to low frequency.
 *
 * @work: embedded delayed work of the device's work_freq
 *
 * Drops to the low PLL profile only while no compute context is active;
 * the fpriv list lock guards the compute_ctx check against concurrent
 * open/close. The work always re-arms itself.
 */
static void set_freq_to_low_job(struct work_struct *work)
{
	struct hl_device *hdev = container_of(work, struct hl_device,
						work_freq.work);

	mutex_lock(&hdev->fpriv_list_lock);

	if (!hdev->compute_ctx)
		hl_device_set_frequency(hdev, PLL_LOW);

	mutex_unlock(&hdev->fpriv_list_lock);

	schedule_delayed_work(&hdev->work_freq,
			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
}
456
Oded Gabbayf8c8c7d52019-02-16 00:39:20 +0200457static void hl_device_heartbeat(struct work_struct *work)
458{
459 struct hl_device *hdev = container_of(work, struct hl_device,
460 work_heartbeat.work);
461
Ofir Bitton66a76402020-10-05 14:40:10 +0300462 if (!hl_device_operational(hdev, NULL))
Oded Gabbayf8c8c7d52019-02-16 00:39:20 +0200463 goto reschedule;
464
465 if (!hdev->asic_funcs->send_heartbeat(hdev))
466 goto reschedule;
467
468 dev_err(hdev->dev, "Device heartbeat failed!\n");
469 hl_device_reset(hdev, true, false);
470
471 return;
472
473reschedule:
474 schedule_delayed_work(&hdev->work_heartbeat,
475 usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
476}
477
/*
 * device_late_init - do late stuff initialization for the habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Do stuff that either needs the device H/W queues to be active or needs
 * to happen after all the rest of the initialization is finished. Starts
 * the frequency-management and (optionally) heartbeat delayed works.
 *
 * Return: 0 on success, negative errno if the ASIC late_init fails.
 */
static int device_late_init(struct hl_device *hdev)
{
	int rc;

	if (hdev->asic_funcs->late_init) {
		rc = hdev->asic_funcs->late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"failed late initialization for the H/W\n");
			return rc;
		}
	}

	hdev->high_pll = hdev->asic_prop.high_pll;

	/* force setting to low frequency */
	hdev->curr_pll_profile = PLL_LOW;

	/* In manual PM mode, PLL_LAST restores the user-selected profile */
	if (hdev->pm_mng_profile == PM_AUTO)
		hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW);
	else
		hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST);

	INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job);
	schedule_delayed_work(&hdev->work_freq,
	usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));

	if (hdev->heartbeat) {
		INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
		schedule_delayed_work(&hdev->work_heartbeat,
				usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
	}

	/* Lets device_late_fini() know there is something to undo */
	hdev->late_init_done = true;

	return 0;
}
523
/*
 * device_late_fini - finalize all that was done in device_late_init
 *
 * @hdev: pointer to habanalabs device structure
 *
 * No-op unless device_late_init() completed. Cancels the periodic works
 * (waiting for in-flight executions) and calls the ASIC late_fini.
 */
static void device_late_fini(struct hl_device *hdev)
{
	if (!hdev->late_init_done)
		return;

	cancel_delayed_work_sync(&hdev->work_freq);
	if (hdev->heartbeat)
		cancel_delayed_work_sync(&hdev->work_heartbeat);

	if (hdev->asic_funcs->late_fini)
		hdev->asic_funcs->late_fini(hdev);

	hdev->late_init_done = false;
}
544
/*
 * hl_device_utilization - compute device busy percentage over a period.
 *
 * @hdev: pointer to habanalabs device structure
 * @period_ms: length of the sampling window, in milliseconds, ending now
 *
 * Walks the idle/busy timestamp ring buffer backwards from the current
 * index, accumulating busy time that overlaps the window. last_index is
 * unsigned, so after decrementing 0 it wraps to a huge value; the
 * "> HL_IDLE_BUSY_TS_ARR_SIZE" checks detect that wraparound and move the
 * index to the end of the array.
 *
 * Return: busy percentage (0-100) of the requested period.
 */
uint32_t hl_device_utilization(struct hl_device *hdev, uint32_t period_ms)
{
	struct hl_device_idle_busy_ts *ts;
	ktime_t zero_ktime, curr = ktime_get();
	u32 overlap_cnt = 0, last_index = hdev->idle_busy_ts_idx;
	s64 period_us, last_start_us, last_end_us, last_busy_time_us,
		total_busy_time_us = 0, total_busy_time_ms;

	zero_ktime = ktime_set(0, 0);
	period_us = period_ms * USEC_PER_MSEC;
	ts = &hdev->idle_busy_ts_arr[last_index];

	/* check case that device is currently in idle */
	if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime) &&
			!ktime_compare(ts->idle_to_busy_ts, zero_ktime)) {

		last_index--;
		/* Handle case idle_busy_ts_idx was 0 */
		if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE)
			last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1;

		ts = &hdev->idle_busy_ts_arr[last_index];
	}

	while (overlap_cnt < HL_IDLE_BUSY_TS_ARR_SIZE) {
		/* Check if we are in last sample case. i.e. if the sample
		 * begun before the sampling period. This could be a real
		 * sample or 0 so need to handle both cases
		 */
		last_start_us = ktime_to_us(
				ktime_sub(curr, ts->idle_to_busy_ts));

		if (last_start_us > period_us) {

			/* First check two cases:
			 * 1. If the device is currently busy
			 * 2. If the device was idle during the whole sampling
			 *    period
			 */

			if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime)) {
				/* Check if the device is currently busy */
				if (ktime_compare(ts->idle_to_busy_ts,
						zero_ktime))
					return 100;

				/* We either didn't have any activity or we
				 * reached an entry which is 0. Either way,
				 * exit and return what was accumulated so far
				 */
				break;
			}

			/* If sample has finished, check it is relevant */
			last_end_us = ktime_to_us(
					ktime_sub(curr, ts->busy_to_idle_ts));

			if (last_end_us > period_us)
				break;

			/* It is relevant so add it but with adjustment */
			last_busy_time_us = ktime_to_us(
						ktime_sub(ts->busy_to_idle_ts,
						ts->idle_to_busy_ts));
			total_busy_time_us += last_busy_time_us -
					(last_start_us - period_us);
			break;
		}

		/* Check if the sample is finished or still open */
		if (ktime_compare(ts->busy_to_idle_ts, zero_ktime))
			last_busy_time_us = ktime_to_us(
						ktime_sub(ts->busy_to_idle_ts,
						ts->idle_to_busy_ts));
		else
			last_busy_time_us = ktime_to_us(
					ktime_sub(curr, ts->idle_to_busy_ts));

		total_busy_time_us += last_busy_time_us;

		last_index--;
		/* Handle case idle_busy_ts_idx was 0 */
		if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE)
			last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1;

		ts = &hdev->idle_busy_ts_arr[last_index];

		overlap_cnt++;
	}

	total_busy_time_ms = DIV_ROUND_UP_ULL(total_busy_time_us,
						USEC_PER_MSEC);

	return DIV_ROUND_UP_ULL(total_busy_time_ms * 100, period_ms);
}
640
Oded Gabbayd91389b2019-02-16 00:39:19 +0200641/*
642 * hl_device_set_frequency - set the frequency of the device
643 *
644 * @hdev: pointer to habanalabs device structure
645 * @freq: the new frequency value
646 *
Oded Gabbayeb7caf82019-07-30 11:56:09 +0300647 * Change the frequency if needed. This function has no protection against
648 * concurrency, therefore it is assumed that the calling function has protected
649 * itself against the case of calling this function from multiple threads with
650 * different values
651 *
652 * Returns 0 if no change was done, otherwise returns 1
Oded Gabbayd91389b2019-02-16 00:39:19 +0200653 */
654int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq)
655{
Oded Gabbayeb7caf82019-07-30 11:56:09 +0300656 if ((hdev->pm_mng_profile == PM_MANUAL) ||
657 (hdev->curr_pll_profile == freq))
Oded Gabbayd91389b2019-02-16 00:39:19 +0200658 return 0;
659
Oded Gabbayd91389b2019-02-16 00:39:19 +0200660 dev_dbg(hdev->dev, "Changing device frequency to %s\n",
661 freq == PLL_HIGH ? "high" : "low");
662
663 hdev->asic_funcs->set_pll_profile(hdev, freq);
664
Oded Gabbayeb7caf82019-07-30 11:56:09 +0300665 hdev->curr_pll_profile = freq;
666
Oded Gabbayd91389b2019-02-16 00:39:19 +0200667 return 1;
668}
669
/*
 * hl_device_set_debug_mode - enable/disable the device debug mode.
 *
 * @hdev: pointer to habanalabs device structure
 * @enable: true to enter debug mode, false to leave it
 *
 * Debug mode disables clock gating so coresight components can be used;
 * leaving it halts coresight and restores clock gating. Both transitions
 * are skipped while a hard reset is pending, since the H/W will be
 * reinitialized anyway.
 *
 * Return: 0 on success, -EFAULT if the requested transition does not match
 * the current debug state.
 */
int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
{
	int rc = 0;

	mutex_lock(&hdev->debug_lock);

	if (!enable) {
		if (!hdev->in_debug) {
			dev_err(hdev->dev,
				"Failed to disable debug mode because device was not in debug mode\n");
			rc = -EFAULT;
			goto out;
		}

		if (!hdev->hard_reset_pending)
			hdev->asic_funcs->halt_coresight(hdev);

		hdev->in_debug = 0;

		/* Restore clock gating that was disabled on debug entry */
		if (!hdev->hard_reset_pending)
			hdev->asic_funcs->set_clock_gating(hdev);

		goto out;
	}

	if (hdev->in_debug) {
		dev_err(hdev->dev,
			"Failed to enable debug mode because device is already in debug mode\n");
		rc = -EFAULT;
		goto out;
	}

	hdev->asic_funcs->disable_clock_gating(hdev);
	hdev->in_debug = 1;

out:
	mutex_unlock(&hdev->debug_lock);

	return rc;
}
710
/*
 * hl_device_suspend - initiate device suspend
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Puts the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
int hl_device_suspend(struct hl_device *hdev)
{
	int rc;

	pci_save_state(hdev->pdev);

	/* Block future CS/VM/JOB completion operations */
	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
	if (rc) {
		dev_err(hdev->dev, "Can't suspend while in reset\n");
		return -EIO;
	}

	/* This blocks all other stuff that is not blocked by in_reset */
	hdev->disabled = true;

	/*
	 * Flush anyone that is inside the critical section of enqueue
	 * jobs to the H/W
	 */
	hdev->asic_funcs->hw_queues_lock(hdev);
	hdev->asic_funcs->hw_queues_unlock(hdev);

	/* Flush processes that are sending message to CPU */
	mutex_lock(&hdev->send_cpu_message_lock);
	mutex_unlock(&hdev->send_cpu_message_lock);

	/* A suspend failure is only logged - the PCI-level shutdown below
	 * proceeds regardless
	 */
	rc = hdev->asic_funcs->suspend(hdev);
	if (rc)
		dev_err(hdev->dev,
			"Failed to disable PCI access of device CPU\n");

	/* Shut down the device */
	pci_disable_device(hdev->pdev);
	pci_set_power_state(hdev->pdev, PCI_D3hot);

	return 0;
}
758
/*
 * hl_device_resume - initiate device resume
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
int hl_device_resume(struct hl_device *hdev)
{
	int rc;

	pci_set_power_state(hdev->pdev, PCI_D0);
	pci_restore_state(hdev->pdev);
	rc = pci_enable_device_mem(hdev->pdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to enable PCI device in resume\n");
		return rc;
	}

	pci_set_master(hdev->pdev);

	rc = hdev->asic_funcs->resume(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to resume device after suspend\n");
		goto disable_device;
	}


	/* Clear the flags set by hl_device_suspend() before resetting */
	hdev->disabled = false;
	atomic_set(&hdev->in_reset, 0);

	/* A hard reset brings the device back to a fully working state */
	rc = hl_device_reset(hdev, true, false);
	if (rc) {
		dev_err(hdev->dev, "Failed to reset device during resume\n");
		goto disable_device;
	}

	return 0;

disable_device:
	pci_clear_master(hdev->pdev);
	pci_disable_device(hdev->pdev);

	return rc;
}
807
/*
 * device_kill_open_processes - kill user processes that hold the device open.
 *
 * @hdev: pointer to habanalabs device structure
 * @timeout: seconds to wait for processes to close after the kill; 0 means
 *           use the default trial-based protocol
 *
 * May be called multiple times from the reset-retry path: on re-entry
 * (process_kill_trial_cnt != 0 with timeout 0) the kill phase is skipped
 * and only the wait is performed.
 *
 * Return: 0 when all processes have closed the device, -EBUSY when some
 * are still open (caller should retry), -ETIME when the maximum number of
 * trials was exhausted.
 */
static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
{
	struct hl_fpriv *hpriv;
	struct task_struct *task = NULL;
	u32 pending_cnt;


	/* Giving time for user to close FD, and for processes that are inside
	 * hl_device_open to finish
	 */
	if (!list_empty(&hdev->fpriv_list))
		ssleep(1);

	if (timeout) {
		pending_cnt = timeout;
	} else {
		if (hdev->process_kill_trial_cnt) {
			/* Processes have been already killed */
			pending_cnt = 1;
			goto wait_for_processes;
		} else {
			/* Wait a small period after process kill */
			pending_cnt = HL_PENDING_RESET_PER_SEC;
		}
	}

	mutex_lock(&hdev->fpriv_list_lock);

	/* This section must be protected because we are dereferencing
	 * pointers that are freed if the process exits
	 */
	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) {
		task = get_pid_task(hpriv->taskpid, PIDTYPE_PID);
		if (task) {
			dev_info(hdev->dev, "Killing user process pid=%d\n",
				task_pid_nr(task));
			send_sig(SIGKILL, task, 1);
			usleep_range(1000, 10000);

			put_task_struct(task);
		}
	}

	mutex_unlock(&hdev->fpriv_list_lock);

	/*
	 * We killed the open users, but that doesn't mean they are closed.
	 * It could be that they are running a long cleanup phase in the driver
	 * e.g. MMU unmappings, or running other long teardown flow even before
	 * our cleanup.
	 * Therefore we need to wait again to make sure they are closed before
	 * continuing with the reset.
	 */

wait_for_processes:
	while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) {
		dev_dbg(hdev->dev,
			"Waiting for all unmap operations to finish before hard reset\n");

		pending_cnt--;

		ssleep(1);
	}

	/* All processes exited successfully */
	if (list_empty(&hdev->fpriv_list))
		return 0;

	/* Give up waiting for processes to exit */
	if (hdev->process_kill_trial_cnt == HL_PENDING_RESET_MAX_TRIALS)
		return -ETIME;

	hdev->process_kill_trial_cnt++;

	return -EBUSY;
}
884
/*
 * hl_device_reset - reset the device
 *
 * @hdev: pointer to habanalabs device structure
 * @hard_reset: should we do hard reset to all engines or just reset the
 *              compute/dma engines
 * @from_hard_reset_thread: is the caller the hard-reset thread
 *
 * Block future CS and wait for pending CS to be enqueued
 * Call ASIC H/W fini
 * Flush all completions
 * Re-initialize all internal data structures
 * Call ASIC H/W init, late_init
 * Test queues
 * Enable device
 *
 * Returns 0 for success or an error on failure.
 * Note: a hard reset requested from a context that is not the dedicated
 * reset thread returns 0 immediately after scheduling the reset work, and
 * -EBUSY is propagated back to the reset thread when open processes have
 * not exited yet, so the thread can reschedule itself and retry.
 */
int hl_device_reset(struct hl_device *hdev, bool hard_reset,
			bool from_hard_reset_thread)
{
	int i, rc;

	if (!hdev->init_done) {
		dev_err(hdev->dev,
			"Can't reset before initialization is done\n");
		return 0;
	}

	/* ASICs that don't support soft-reset are always hard-reset */
	if ((!hard_reset) && (!hdev->supports_soft_reset)) {
		dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
		hard_reset = true;
	}

	/* Re-entry of reset thread: a previous trial already performed the
	 * teardown steps below, so jump straight to killing user processes
	 */
	if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
		goto kill_processes;

	/*
	 * Prevent concurrency in this function - only one reset should be
	 * done at any given time. Only need to perform this if we didn't
	 * get from the dedicated hard reset thread
	 */
	if (!from_hard_reset_thread) {
		/* Block future CS/VM/JOB completion operations */
		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
		if (rc)
			return 0;

		if (hard_reset) {
			/* Disable PCI access from device F/W so he won't send
			 * us additional interrupts. We disable MSI/MSI-X at
			 * the halt_engines function and we can't have the F/W
			 * sending us interrupts after that. We need to disable
			 * the access here because if the device is marked
			 * disable, the message won't be send. Also, in case
			 * of heartbeat, the device CPU is marked as disable
			 * so this message won't be sent
			 */
			if (hl_fw_send_pci_access_msg(hdev,
					CPUCP_PACKET_DISABLE_PCI_ACCESS))
				dev_warn(hdev->dev,
					"Failed to disable PCI access by F/W\n");
		}

		/* This also blocks future CS/VM/JOB completion operations */
		hdev->disabled = true;

		/* Flush anyone that is inside the critical section of enqueue
		 * jobs to the H/W (empty lock/unlock pair acts as a barrier)
		 */
		hdev->asic_funcs->hw_queues_lock(hdev);
		hdev->asic_funcs->hw_queues_unlock(hdev);

		/* Flush anyone that is inside device open */
		mutex_lock(&hdev->fpriv_list_lock);
		mutex_unlock(&hdev->fpriv_list_lock);

		dev_err(hdev->dev, "Going to RESET device!\n");
	}

again:
	if ((hard_reset) && (!from_hard_reset_thread)) {
		hdev->hard_reset_pending = true;

		/* Fresh reset request - restart the process-kill trials */
		hdev->process_kill_trial_cnt = 0;

		/*
		 * Because the reset function can't run from interrupt or
		 * from heartbeat work, we need to call the reset function
		 * from a dedicated work
		 */
		queue_delayed_work(hdev->device_reset_work.wq,
				&hdev->device_reset_work.reset_work, 0);

		return 0;
	}

	if (hard_reset) {
		device_late_fini(hdev);

		/*
		 * Now that the heartbeat thread is closed, flush processes
		 * which are sending messages to CPU
		 */
		mutex_lock(&hdev->send_cpu_message_lock);
		mutex_unlock(&hdev->send_cpu_message_lock);
	}

	/*
	 * Halt the engines and disable interrupts so we won't get any more
	 * completions from H/W and we won't have any accesses from the
	 * H/W to the host machine
	 */
	hdev->asic_funcs->halt_engines(hdev, hard_reset);

	/* Go over all the queues, release all CS and their jobs */
	hl_cs_rollback_all(hdev);

kill_processes:
	if (hard_reset) {
		/* Kill processes here after CS rollback. This is because the
		 * process can't really exit until all its CSs are done, which
		 * is what we do in cs rollback
		 */
		rc = device_kill_open_processes(hdev, 0);

		if (rc == -EBUSY) {
			if (hdev->device_fini_pending) {
				/* Device removal is in progress - give up */
				dev_crit(hdev->dev,
					"Failed to kill all open processes, stopping hard reset\n");
				goto out_err;
			}

			/* signal reset thread to reschedule */
			return rc;
		}

		if (rc) {
			dev_crit(hdev->dev,
				"Failed to kill all open processes, stopping hard reset\n");
			goto out_err;
		}

		/* Flush the Event queue workers to make sure no other thread is
		 * reading or writing to registers during the reset
		 */
		flush_workqueue(hdev->eq_wq);
	}

	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, hard_reset);

	if (hard_reset) {
		/* Release kernel context (NULLed only when the last reference
		 * was ours, so the sanity check below can detect leaks)
		 */
		if (hl_ctx_put(hdev->kernel_ctx) == 1)
			hdev->kernel_ctx = NULL;
		hl_vm_fini(hdev);
		hl_mmu_fini(hdev);
		hl_eq_reset(hdev, &hdev->event_queue);
	}

	/* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
	hl_hw_queue_reset(hdev, hard_reset);
	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_reset(hdev, &hdev->completion_queue[i]);

	/* Restart idle/busy accounting from slot 0 */
	hdev->idle_busy_ts_idx = 0;
	hdev->idle_busy_ts_arr[0].busy_to_idle_ts = ktime_set(0, 0);
	hdev->idle_busy_ts_arr[0].idle_to_busy_ts = ktime_set(0, 0);

	if (hdev->cs_active_cnt)
		dev_crit(hdev->dev, "CS active cnt %d is not 0 during reset\n",
			hdev->cs_active_cnt);

	mutex_lock(&hdev->fpriv_list_lock);

	/* Make sure the context switch phase will run again */
	if (hdev->compute_ctx) {
		atomic_set(&hdev->compute_ctx->thread_ctx_switch_token, 1);
		hdev->compute_ctx->thread_ctx_switch_wait_token = 0;
	}

	mutex_unlock(&hdev->fpriv_list_lock);

	/* Finished tear-down, starting to re-initialize */

	if (hard_reset) {
		hdev->device_cpu_disabled = false;
		hdev->hard_reset_pending = false;

		/* kernel_ctx should have been released and NULLed above */
		if (hdev->kernel_ctx) {
			dev_crit(hdev->dev,
				"kernel ctx was alive during hard reset, something is terribly wrong\n");
			rc = -EBUSY;
			goto out_err;
		}

		rc = hl_mmu_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed to initialize MMU S/W after hard reset\n");
			goto out_err;
		}

		/* Allocate the kernel context */
		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
						GFP_KERNEL);
		if (!hdev->kernel_ctx) {
			rc = -ENOMEM;
			goto out_err;
		}

		hdev->compute_ctx = NULL;

		rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
		if (rc) {
			dev_err(hdev->dev,
				"failed to init kernel ctx in hard reset\n");
			kfree(hdev->kernel_ctx);
			hdev->kernel_ctx = NULL;
			goto out_err;
		}
	}

	/* Device is now enabled as part of the initialization requires
	 * communication with the device firmware to get information that
	 * is required for the initialization itself
	 */
	hdev->disabled = false;

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"failed to initialize the H/W after reset\n");
		goto out_err;
	}

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to detect if device is alive after reset\n");
		goto out_err;
	}

	if (hard_reset) {
		rc = device_late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed late init after hard reset\n");
			goto out_err;
		}

		rc = hl_vm_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed to init memory module after hard reset\n");
			goto out_err;
		}

		hl_set_max_power(hdev);
	} else {
		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed late init after soft reset\n");
			goto out_err;
		}
	}

	atomic_set(&hdev->in_reset, 0);
	hdev->needs_reset = false;

	if (hard_reset)
		hdev->hard_reset_cnt++;
	else
		hdev->soft_reset_cnt++;

	dev_warn(hdev->dev, "Successfully finished resetting the device\n");

	return 0;

out_err:
	hdev->disabled = true;

	if (hard_reset) {
		dev_err(hdev->dev,
			"Failed to reset! Device is NOT usable\n");
		hdev->hard_reset_cnt++;
	} else {
		/* A failed soft-reset is escalated to a hard reset */
		dev_err(hdev->dev,
			"Failed to do soft-reset, trying hard reset\n");
		hdev->soft_reset_cnt++;
		hard_reset = true;
		goto again;
	}

	atomic_set(&hdev->in_reset, 0);

	return rc;
}
1187
/*
 * hl_device_init - main initialization function for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 * @hclass: pointer to the class object of the habanalabs char devices
 *
 * Allocate an id for the device, do early initialization and then call the
 * ASIC specific initialization functions. Finally, create the cdev and the
 * Linux device to expose it to the user.
 *
 * Returns 0 on success or a negative error code. Note that failures after
 * H/W init deliberately overwrite rc with 0 and fall through to
 * out_disabled, leaving a registered-but-disabled device for debugging.
 */
int hl_device_init(struct hl_device *hdev, struct class *hclass)
{
	int i, rc, cq_cnt, cq_ready_cnt;
	char *name;
	bool add_cdev_sysfs_on_err = false;

	/* hdev->id counts both compute and control minors, hence the /2 */
	name = kasprintf(GFP_KERNEL, "hl%d", hdev->id / 2);
	if (!name) {
		rc = -ENOMEM;
		goto out_disabled;
	}

	/* Initialize cdev and device structures */
	rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name,
				&hdev->cdev, &hdev->dev);

	kfree(name);

	if (rc)
		goto out_disabled;

	name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->id / 2);
	if (!name) {
		rc = -ENOMEM;
		goto free_dev;
	}

	/* Initialize cdev and device structures for control device */
	rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops,
				name, &hdev->cdev_ctrl, &hdev->dev_ctrl);

	kfree(name);

	if (rc)
		goto free_dev;

	/* Initialize ASIC function pointers and perform early init */
	rc = device_early_init(hdev);
	if (rc)
		goto free_dev_ctrl;

	/*
	 * Start calling ASIC initialization. First S/W then H/W and finally
	 * late init
	 */
	rc = hdev->asic_funcs->sw_init(hdev);
	if (rc)
		goto early_fini;

	/*
	 * Initialize the H/W queues. Must be done before hw_init, because
	 * there the addresses of the kernel queue are being written to the
	 * registers of the device
	 */
	rc = hl_hw_queues_create(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize kernel queues\n");
		goto sw_fini;
	}

	cq_cnt = hdev->asic_prop.completion_queues_count;

	/*
	 * Initialize the completion queues. Must be done before hw_init,
	 * because there the addresses of the completion queues are being
	 * passed as arguments to request_irq
	 */
	if (cq_cnt) {
		hdev->completion_queue = kcalloc(cq_cnt,
				sizeof(*hdev->completion_queue),
				GFP_KERNEL);

		if (!hdev->completion_queue) {
			dev_err(hdev->dev,
				"failed to allocate completion queues\n");
			rc = -ENOMEM;
			goto hw_queues_destroy;
		}
	}

	/* cq_ready_cnt tracks how many CQs were initialized, so the error
	 * path only tears down the ones that actually succeeded
	 */
	for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) {
		rc = hl_cq_init(hdev, &hdev->completion_queue[i],
				hdev->asic_funcs->get_queue_id_for_cq(hdev, i));
		if (rc) {
			dev_err(hdev->dev,
				"failed to initialize completion queue\n");
			goto cq_fini;
		}
		hdev->completion_queue[i].cq_idx = i;
	}

	/*
	 * Initialize the event queue. Must be done before hw_init,
	 * because there the address of the event queue is being
	 * passed as argument to request_irq
	 */
	rc = hl_eq_init(hdev, &hdev->event_queue);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize event queue\n");
		goto cq_fini;
	}

	/* MMU S/W must be initialized before kernel context is created */
	rc = hl_mmu_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
		goto eq_fini;
	}

	/* Allocate the kernel context */
	hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
	if (!hdev->kernel_ctx) {
		rc = -ENOMEM;
		goto mmu_fini;
	}

	hdev->compute_ctx = NULL;

	rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize kernel context\n");
		kfree(hdev->kernel_ctx);
		goto mmu_fini;
	}

	rc = hl_cb_pool_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize CB pool\n");
		goto release_ctx;
	}

	hl_debugfs_add_device(hdev);

	/*
	 * From this point, in case of an error, add char devices and create
	 * sysfs nodes as part of the error flow, to allow debugging.
	 */
	add_cdev_sysfs_on_err = true;

	/* Device is now enabled as part of the initialization requires
	 * communication with the device firmware to get information that
	 * is required for the initialization itself
	 */
	hdev->disabled = false;

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize the H/W\n");
		rc = 0;
		goto out_disabled;
	}

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to detect if device is alive\n");
		rc = 0;
		goto out_disabled;
	}

	rc = device_late_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed late initialization\n");
		rc = 0;
		goto out_disabled;
	}

	dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
		hdev->asic_name,
		hdev->asic_prop.dram_size / 1024 / 1024 / 1024);

	rc = hl_vm_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize memory module\n");
		rc = 0;
		goto out_disabled;
	}

	/*
	 * Expose devices and sysfs nodes to user.
	 * From here there is no need to add char devices and create sysfs nodes
	 * in case of an error.
	 */
	add_cdev_sysfs_on_err = false;
	rc = device_cdev_sysfs_add(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to add char devices and sysfs nodes\n");
		rc = 0;
		goto out_disabled;
	}

	/* Need to call this again because the max power might change,
	 * depending on card type for certain ASICs
	 */
	hl_set_max_power(hdev);

	/*
	 * hl_hwmon_init() must be called after device_late_init(), because only
	 * there we get the information from the device about which
	 * hwmon-related sensors the device supports.
	 * Furthermore, it must be done after adding the device to the system.
	 */
	rc = hl_hwmon_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize hwmon\n");
		rc = 0;
		goto out_disabled;
	}

	dev_notice(hdev->dev,
		"Successfully added device to habanalabs driver\n");

	hdev->init_done = true;

	return 0;

	/* Error labels mirror the acquisition order above, newest first */
release_ctx:
	if (hl_ctx_put(hdev->kernel_ctx) != 1)
		dev_err(hdev->dev,
			"kernel ctx is still alive on initialization failure\n");
mmu_fini:
	hl_mmu_fini(hdev);
eq_fini:
	hl_eq_fini(hdev, &hdev->event_queue);
cq_fini:
	for (i = 0 ; i < cq_ready_cnt ; i++)
		hl_cq_fini(hdev, &hdev->completion_queue[i]);
	kfree(hdev->completion_queue);
hw_queues_destroy:
	hl_hw_queues_destroy(hdev);
sw_fini:
	hdev->asic_funcs->sw_fini(hdev);
early_fini:
	device_early_fini(hdev);
free_dev_ctrl:
	put_device(hdev->dev_ctrl);
free_dev:
	put_device(hdev->dev);
out_disabled:
	hdev->disabled = true;
	if (add_cdev_sysfs_on_err)
		device_cdev_sysfs_add(hdev);
	if (hdev->pdev)
		dev_err(&hdev->pdev->dev,
			"Failed to initialize hl%d. Device is NOT usable !\n",
			hdev->id / 2);
	else
		pr_err("Failed to initialize hl%d. Device is NOT usable !\n",
			hdev->id / 2);

	return rc;
}
1450
/*
 * hl_device_fini - main tear-down function for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Destroy the device, call ASIC fini functions and release the id.
 * Tear-down is performed in the reverse order of hl_device_init(), after
 * first winning the in_reset atomic so it cannot race a concurrent reset.
 */
void hl_device_fini(struct hl_device *hdev)
{
	ktime_t timeout;
	int i, rc;

	dev_info(hdev->dev, "Removing device\n");

	/* Make a pending reset thread give up instead of rescheduling, then
	 * wait for any queued reset work to drain
	 */
	hdev->device_fini_pending = 1;
	flush_delayed_work(&hdev->device_reset_work.reset_work);

	/*
	 * This function is competing with the reset function, so try to
	 * take the reset atomic and if we are already in middle of reset,
	 * wait until reset function is finished. Reset function is designed
	 * to always finish. However, in Gaudi, because of all the network
	 * ports, the hard reset could take between 10-30 seconds
	 */

	timeout = ktime_add_us(ktime_get(),
				HL_HARD_RESET_MAX_TIMEOUT * 1000 * 1000);
	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
	while (rc) {
		usleep_range(50, 200);
		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
		if (ktime_compare(ktime_get(), timeout) > 0) {
			WARN(1, "Failed to remove device because reset function did not finish\n");
			return;
		}
	}

	/* Mark device as disabled */
	hdev->disabled = true;

	/* Flush anyone that is inside the critical section of enqueue
	 * jobs to the H/W (empty lock/unlock pair acts as a barrier)
	 */
	hdev->asic_funcs->hw_queues_lock(hdev);
	hdev->asic_funcs->hw_queues_unlock(hdev);

	/* Flush anyone that is inside device open */
	mutex_lock(&hdev->fpriv_list_lock);
	mutex_unlock(&hdev->fpriv_list_lock);

	hdev->hard_reset_pending = true;

	hl_hwmon_fini(hdev);

	device_late_fini(hdev);

	hl_debugfs_remove_device(hdev);

	/*
	 * Halt the engines and disable interrupts so we won't get any more
	 * completions from H/W and we won't have any accesses from the
	 * H/W to the host machine
	 */
	hdev->asic_funcs->halt_engines(hdev, true);

	/* Go over all the queues, release all CS and their jobs */
	hl_cs_rollback_all(hdev);

	/* Kill processes here after CS rollback. This is because the process
	 * can't really exit until all its CSs are done, which is what we
	 * do in cs rollback
	 */
	dev_info(hdev->dev,
		"Waiting for all processes to exit (timeout of %u seconds)",
		HL_PENDING_RESET_LONG_SEC);

	rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC);
	if (rc)
		dev_crit(hdev->dev, "Failed to kill all open processes\n");

	hl_cb_pool_fini(hdev);

	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, true);

	/* Release kernel context */
	if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
		dev_err(hdev->dev, "kernel ctx is still alive\n");

	hl_vm_fini(hdev);

	hl_mmu_fini(hdev);

	hl_eq_fini(hdev, &hdev->event_queue);

	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_fini(hdev, &hdev->completion_queue[i]);
	kfree(hdev->completion_queue);

	hl_hw_queues_destroy(hdev);

	/* Call ASIC S/W finalize function */
	hdev->asic_funcs->sw_fini(hdev);

	device_early_fini(hdev);

	/* Hide devices and sysfs nodes from user */
	device_cdev_sysfs_del(hdev);

	pr_info("removed device successfully\n");
}
1562
1563/*
Oded Gabbay99b9d7b2019-02-16 00:39:13 +02001564 * MMIO register access helper functions.
1565 */
1566
1567/*
1568 * hl_rreg - Read an MMIO register
1569 *
1570 * @hdev: pointer to habanalabs device structure
1571 * @reg: MMIO register offset (in bytes)
1572 *
1573 * Returns the value of the MMIO register we are asked to read
1574 *
1575 */
1576inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
1577{
1578 return readl(hdev->rmmio + reg);
1579}
1580
/*
 * hl_wreg - Write to an MMIO register
 *
 * @hdev: pointer to habanalabs device structure
 * @reg: MMIO register offset (in bytes)
 * @val: 32-bit value
 *
 * Writes the 32-bit value into the MMIO register
 *
 */
inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
{
	/* 32-bit write to the device's register space at the given offset */
	writel(val, hdev->rmmio + reg);
}