blob: de46aa6ed1542438c5d5952ff77c9cc17dadc5a6 [file] [log] [blame]
Oded Gabbayc4d66342019-02-16 00:39:11 +02001// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * Copyright 2016-2019 HabanaLabs, Ltd.
5 * All Rights Reserved.
6 */
7
8#include "habanalabs.h"
9
10#include <linux/pci.h>
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +020011#include <linux/sched/signal.h>
Oded Gabbayd91389b2019-02-16 00:39:19 +020012#include <linux/hwmon.h>
Oded Gabbayc4d66342019-02-16 00:39:11 +020013
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +020014bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
15{
16 if ((hdev->disabled) || (atomic_read(&hdev->in_reset)))
17 return true;
18 else
19 return false;
20}
21
Oded Gabbayc4d66342019-02-16 00:39:11 +020022static void hpriv_release(struct kref *ref)
23{
24 struct hl_fpriv *hpriv;
25 struct hl_device *hdev;
26
27 hpriv = container_of(ref, struct hl_fpriv, refcount);
28
29 hdev = hpriv->hdev;
30
31 put_pid(hpriv->taskpid);
32
Oded Gabbayc2164772019-02-16 00:39:24 +020033 hl_debugfs_remove_file(hpriv);
34
Oded Gabbayeff6f4a2019-02-16 00:39:21 +020035 mutex_destroy(&hpriv->restore_phase_mutex);
36
Oded Gabbayc4d66342019-02-16 00:39:11 +020037 kfree(hpriv);
Oded Gabbay0861e412019-02-16 00:39:14 +020038
39 /* Now the FD is really closed */
40 atomic_dec(&hdev->fd_open_cnt);
41
42 /* This allows a new user context to open the device */
43 hdev->user_ctx = NULL;
Oded Gabbayc4d66342019-02-16 00:39:11 +020044}
45
46void hl_hpriv_get(struct hl_fpriv *hpriv)
47{
48 kref_get(&hpriv->refcount);
49}
50
51void hl_hpriv_put(struct hl_fpriv *hpriv)
52{
53 kref_put(&hpriv->refcount, hpriv_release);
54}
55
56/*
57 * hl_device_release - release function for habanalabs device
58 *
59 * @inode: pointer to inode structure
60 * @filp: pointer to file structure
61 *
62 * Called when process closes an habanalabs device
63 */
64static int hl_device_release(struct inode *inode, struct file *filp)
65{
66 struct hl_fpriv *hpriv = filp->private_data;
67
Oded Gabbaybe5d9262019-02-16 00:39:15 +020068 hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
Oded Gabbay0861e412019-02-16 00:39:14 +020069 hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
70
Oded Gabbayc4d66342019-02-16 00:39:11 +020071 filp->private_data = NULL;
72
73 hl_hpriv_put(hpriv);
74
75 return 0;
76}
77
Oded Gabbaybe5d9262019-02-16 00:39:15 +020078/*
79 * hl_mmap - mmap function for habanalabs device
80 *
81 * @*filp: pointer to file structure
82 * @*vma: pointer to vm_area_struct of the process
83 *
84 * Called when process does an mmap on habanalabs device. Call the device's mmap
85 * function at the end of the common code.
86 */
87static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
88{
89 struct hl_fpriv *hpriv = filp->private_data;
90
91 if ((vma->vm_pgoff & HL_MMAP_CB_MASK) == HL_MMAP_CB_MASK) {
92 vma->vm_pgoff ^= HL_MMAP_CB_MASK;
93 return hl_cb_mmap(hpriv, vma);
94 }
95
Oded Gabbay5e6e0232019-02-27 12:15:16 +020096 return -EINVAL;
Oded Gabbaybe5d9262019-02-16 00:39:15 +020097}
98
Oded Gabbayc4d66342019-02-16 00:39:11 +020099static const struct file_operations hl_ops = {
100 .owner = THIS_MODULE,
101 .open = hl_device_open,
Oded Gabbaybe5d9262019-02-16 00:39:15 +0200102 .release = hl_device_release,
103 .mmap = hl_mmap,
104 .unlocked_ioctl = hl_ioctl,
105 .compat_ioctl = hl_ioctl
Oded Gabbayc4d66342019-02-16 00:39:11 +0200106};
107
108/*
109 * device_setup_cdev - setup cdev and device for habanalabs device
110 *
111 * @hdev: pointer to habanalabs device structure
112 * @hclass: pointer to the class object of the device
113 * @minor: minor number of the specific device
114 * @fpos : file operations to install for this device
115 *
116 * Create a cdev and a Linux device for habanalabs's device. Need to be
117 * called at the end of the habanalabs device initialization process,
118 * because this function exposes the device to the user
119 */
120static int device_setup_cdev(struct hl_device *hdev, struct class *hclass,
121 int minor, const struct file_operations *fops)
122{
123 int err, devno = MKDEV(hdev->major, minor);
124 struct cdev *hdev_cdev = &hdev->cdev;
125 char *name;
126
127 name = kasprintf(GFP_KERNEL, "hl%d", hdev->id);
128 if (!name)
129 return -ENOMEM;
130
131 cdev_init(hdev_cdev, fops);
132 hdev_cdev->owner = THIS_MODULE;
133 err = cdev_add(hdev_cdev, devno, 1);
134 if (err) {
135 pr_err("Failed to add char device %s\n", name);
136 goto err_cdev_add;
137 }
138
139 hdev->dev = device_create(hclass, NULL, devno, NULL, "%s", name);
140 if (IS_ERR(hdev->dev)) {
141 pr_err("Failed to create device %s\n", name);
142 err = PTR_ERR(hdev->dev);
143 goto err_device_create;
144 }
145
146 dev_set_drvdata(hdev->dev, hdev);
147
148 kfree(name);
149
150 return 0;
151
152err_device_create:
153 cdev_del(hdev_cdev);
154err_cdev_add:
155 kfree(name);
156 return err;
157}
158
159/*
160 * device_early_init - do some early initialization for the habanalabs device
161 *
162 * @hdev: pointer to habanalabs device structure
163 *
164 * Install the relevant function pointers and call the early_init function,
165 * if such a function exists
166 */
167static int device_early_init(struct hl_device *hdev)
168{
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200169 int rc;
170
Oded Gabbayc4d66342019-02-16 00:39:11 +0200171 switch (hdev->asic_type) {
172 case ASIC_GOYA:
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200173 goya_set_asic_funcs(hdev);
Oded Gabbayc4d66342019-02-16 00:39:11 +0200174 strlcpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
175 break;
176 default:
177 dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
178 hdev->asic_type);
179 return -EINVAL;
180 }
181
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200182 rc = hdev->asic_funcs->early_init(hdev);
183 if (rc)
184 return rc;
185
Oded Gabbay0861e412019-02-16 00:39:14 +0200186 rc = hl_asid_init(hdev);
187 if (rc)
188 goto early_fini;
189
Oded Gabbay9494a8d2019-02-16 00:39:17 +0200190 hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0);
191 if (hdev->cq_wq == NULL) {
192 dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
193 rc = -ENOMEM;
194 goto asid_fini;
195 }
196
Oded Gabbay1251f232019-02-16 00:39:18 +0200197 hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
198 if (hdev->eq_wq == NULL) {
199 dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
200 rc = -ENOMEM;
201 goto free_cq_wq;
202 }
203
Oded Gabbayd91389b2019-02-16 00:39:19 +0200204 hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
205 GFP_KERNEL);
206 if (!hdev->hl_chip_info) {
207 rc = -ENOMEM;
208 goto free_eq_wq;
209 }
210
Oded Gabbaybe5d9262019-02-16 00:39:15 +0200211 hl_cb_mgr_init(&hdev->kernel_cb_mgr);
212
Oded Gabbay0861e412019-02-16 00:39:14 +0200213 mutex_init(&hdev->fd_open_cnt_lock);
Oded Gabbay9494a8d2019-02-16 00:39:17 +0200214 mutex_init(&hdev->send_cpu_message_lock);
Oded Gabbayeff6f4a2019-02-16 00:39:21 +0200215 INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
216 spin_lock_init(&hdev->hw_queues_mirror_lock);
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200217 atomic_set(&hdev->in_reset, 0);
Oded Gabbay0861e412019-02-16 00:39:14 +0200218 atomic_set(&hdev->fd_open_cnt, 0);
219
Oded Gabbayc4d66342019-02-16 00:39:11 +0200220 return 0;
Oded Gabbay0861e412019-02-16 00:39:14 +0200221
Oded Gabbayd91389b2019-02-16 00:39:19 +0200222free_eq_wq:
223 destroy_workqueue(hdev->eq_wq);
Oded Gabbay1251f232019-02-16 00:39:18 +0200224free_cq_wq:
225 destroy_workqueue(hdev->cq_wq);
Oded Gabbay9494a8d2019-02-16 00:39:17 +0200226asid_fini:
227 hl_asid_fini(hdev);
Oded Gabbay0861e412019-02-16 00:39:14 +0200228early_fini:
229 if (hdev->asic_funcs->early_fini)
230 hdev->asic_funcs->early_fini(hdev);
231
232 return rc;
Oded Gabbayc4d66342019-02-16 00:39:11 +0200233}
234
235/*
236 * device_early_fini - finalize all that was done in device_early_init
237 *
238 * @hdev: pointer to habanalabs device structure
239 *
240 */
241static void device_early_fini(struct hl_device *hdev)
242{
Oded Gabbay9494a8d2019-02-16 00:39:17 +0200243 mutex_destroy(&hdev->send_cpu_message_lock);
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200244
Oded Gabbaybe5d9262019-02-16 00:39:15 +0200245 hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
246
Oded Gabbayd91389b2019-02-16 00:39:19 +0200247 kfree(hdev->hl_chip_info);
248
Oded Gabbay1251f232019-02-16 00:39:18 +0200249 destroy_workqueue(hdev->eq_wq);
Oded Gabbay9494a8d2019-02-16 00:39:17 +0200250 destroy_workqueue(hdev->cq_wq);
251
Oded Gabbay0861e412019-02-16 00:39:14 +0200252 hl_asid_fini(hdev);
253
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200254 if (hdev->asic_funcs->early_fini)
255 hdev->asic_funcs->early_fini(hdev);
256
Oded Gabbay0861e412019-02-16 00:39:14 +0200257 mutex_destroy(&hdev->fd_open_cnt_lock);
Oded Gabbayc4d66342019-02-16 00:39:11 +0200258}
259
Oded Gabbayd91389b2019-02-16 00:39:19 +0200260static void set_freq_to_low_job(struct work_struct *work)
261{
262 struct hl_device *hdev = container_of(work, struct hl_device,
263 work_freq.work);
264
265 if (atomic_read(&hdev->fd_open_cnt) == 0)
266 hl_device_set_frequency(hdev, PLL_LOW);
267
268 schedule_delayed_work(&hdev->work_freq,
269 usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
270}
271
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200272static void hl_device_heartbeat(struct work_struct *work)
273{
274 struct hl_device *hdev = container_of(work, struct hl_device,
275 work_heartbeat.work);
276
277 if (hl_device_disabled_or_in_reset(hdev))
278 goto reschedule;
279
280 if (!hdev->asic_funcs->send_heartbeat(hdev))
281 goto reschedule;
282
283 dev_err(hdev->dev, "Device heartbeat failed!\n");
284 hl_device_reset(hdev, true, false);
285
286 return;
287
288reschedule:
289 schedule_delayed_work(&hdev->work_heartbeat,
290 usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
291}
292
Oded Gabbayd91389b2019-02-16 00:39:19 +0200293/*
294 * device_late_init - do late stuff initialization for the habanalabs device
295 *
296 * @hdev: pointer to habanalabs device structure
297 *
298 * Do stuff that either needs the device H/W queues to be active or needs
299 * to happen after all the rest of the initialization is finished
300 */
301static int device_late_init(struct hl_device *hdev)
302{
303 int rc;
304
305 INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job);
306 hdev->high_pll = hdev->asic_prop.high_pll;
307
308 /* force setting to low frequency */
309 atomic_set(&hdev->curr_pll_profile, PLL_LOW);
310
311 if (hdev->pm_mng_profile == PM_AUTO)
312 hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW);
313 else
314 hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST);
315
316 if (hdev->asic_funcs->late_init) {
317 rc = hdev->asic_funcs->late_init(hdev);
318 if (rc) {
319 dev_err(hdev->dev,
320 "failed late initialization for the H/W\n");
321 return rc;
322 }
323 }
324
325 schedule_delayed_work(&hdev->work_freq,
326 usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
327
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200328 if (hdev->heartbeat) {
329 INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
330 schedule_delayed_work(&hdev->work_heartbeat,
331 usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
332 }
333
Oded Gabbayd91389b2019-02-16 00:39:19 +0200334 hdev->late_init_done = true;
335
336 return 0;
337}
338
339/*
340 * device_late_fini - finalize all that was done in device_late_init
341 *
342 * @hdev: pointer to habanalabs device structure
343 *
344 */
345static void device_late_fini(struct hl_device *hdev)
346{
347 if (!hdev->late_init_done)
348 return;
349
350 cancel_delayed_work_sync(&hdev->work_freq);
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200351 if (hdev->heartbeat)
352 cancel_delayed_work_sync(&hdev->work_heartbeat);
Oded Gabbayd91389b2019-02-16 00:39:19 +0200353
354 if (hdev->asic_funcs->late_fini)
355 hdev->asic_funcs->late_fini(hdev);
356
357 hdev->late_init_done = false;
358}
359
360/*
361 * hl_device_set_frequency - set the frequency of the device
362 *
363 * @hdev: pointer to habanalabs device structure
364 * @freq: the new frequency value
365 *
366 * Change the frequency if needed.
367 * We allose to set PLL to low only if there is no user process
368 * Returns 0 if no change was done, otherwise returns 1;
369 */
370int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq)
371{
372 enum hl_pll_frequency old_freq =
373 (freq == PLL_HIGH) ? PLL_LOW : PLL_HIGH;
374 int ret;
375
376 if (hdev->pm_mng_profile == PM_MANUAL)
377 return 0;
378
379 ret = atomic_cmpxchg(&hdev->curr_pll_profile, old_freq, freq);
380 if (ret == freq)
381 return 0;
382
383 /*
384 * in case we want to lower frequency, check if device is not
385 * opened. We must have a check here to workaround race condition with
386 * hl_device_open
387 */
388 if ((freq == PLL_LOW) && (atomic_read(&hdev->fd_open_cnt) > 0)) {
389 atomic_set(&hdev->curr_pll_profile, PLL_HIGH);
390 return 0;
391 }
392
393 dev_dbg(hdev->dev, "Changing device frequency to %s\n",
394 freq == PLL_HIGH ? "high" : "low");
395
396 hdev->asic_funcs->set_pll_profile(hdev, freq);
397
398 return 1;
399}
400
Oded Gabbayc4d66342019-02-16 00:39:11 +0200401/*
402 * hl_device_suspend - initiate device suspend
403 *
404 * @hdev: pointer to habanalabs device structure
405 *
406 * Puts the hw in the suspend state (all asics).
407 * Returns 0 for success or an error on failure.
408 * Called at driver suspend.
409 */
410int hl_device_suspend(struct hl_device *hdev)
411{
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200412 int rc;
413
Oded Gabbayc4d66342019-02-16 00:39:11 +0200414 pci_save_state(hdev->pdev);
415
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200416 rc = hdev->asic_funcs->suspend(hdev);
417 if (rc)
418 dev_err(hdev->dev,
419 "Failed to disable PCI access of device CPU\n");
420
Oded Gabbayc4d66342019-02-16 00:39:11 +0200421 /* Shut down the device */
422 pci_disable_device(hdev->pdev);
423 pci_set_power_state(hdev->pdev, PCI_D3hot);
424
425 return 0;
426}
427
428/*
429 * hl_device_resume - initiate device resume
430 *
431 * @hdev: pointer to habanalabs device structure
432 *
433 * Bring the hw back to operating state (all asics).
434 * Returns 0 for success or an error on failure.
435 * Called at driver resume.
436 */
437int hl_device_resume(struct hl_device *hdev)
438{
439 int rc;
440
441 pci_set_power_state(hdev->pdev, PCI_D0);
442 pci_restore_state(hdev->pdev);
443 rc = pci_enable_device(hdev->pdev);
444 if (rc) {
445 dev_err(hdev->dev,
446 "Failed to enable PCI device in resume\n");
447 return rc;
448 }
449
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200450 rc = hdev->asic_funcs->resume(hdev);
451 if (rc) {
452 dev_err(hdev->dev,
453 "Failed to enable PCI access from device CPU\n");
454 return rc;
455 }
456
Oded Gabbayc4d66342019-02-16 00:39:11 +0200457 return 0;
458}
459
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200460static void hl_device_hard_reset_pending(struct work_struct *work)
461{
462 struct hl_device_reset_work *device_reset_work =
463 container_of(work, struct hl_device_reset_work, reset_work);
464 struct hl_device *hdev = device_reset_work->hdev;
465 u16 pending_cnt = HL_PENDING_RESET_PER_SEC;
466 struct task_struct *task = NULL;
467
468 /* Flush all processes that are inside hl_open */
469 mutex_lock(&hdev->fd_open_cnt_lock);
470
471 while ((atomic_read(&hdev->fd_open_cnt)) && (pending_cnt)) {
472
473 pending_cnt--;
474
475 dev_info(hdev->dev,
476 "Can't HARD reset, waiting for user to close FD\n");
477 ssleep(1);
478 }
479
480 if (atomic_read(&hdev->fd_open_cnt)) {
481 task = get_pid_task(hdev->user_ctx->hpriv->taskpid,
482 PIDTYPE_PID);
483 if (task) {
484 dev_info(hdev->dev, "Killing user processes\n");
485 send_sig(SIGKILL, task, 1);
486 msleep(100);
487
488 put_task_struct(task);
489 }
490 }
491
492 mutex_unlock(&hdev->fd_open_cnt_lock);
493
494 hl_device_reset(hdev, true, true);
495
496 kfree(device_reset_work);
497}
498
499/*
500 * hl_device_reset - reset the device
501 *
502 * @hdev: pointer to habanalabs device structure
503 * @hard_reset: should we do hard reset to all engines or just reset the
504 * compute/dma engines
505 *
506 * Block future CS and wait for pending CS to be enqueued
507 * Call ASIC H/W fini
508 * Flush all completions
509 * Re-initialize all internal data structures
510 * Call ASIC H/W init, late_init
511 * Test queues
512 * Enable device
513 *
514 * Returns 0 for success or an error on failure.
515 */
516int hl_device_reset(struct hl_device *hdev, bool hard_reset,
517 bool from_hard_reset_thread)
518{
519 int i, rc;
520
521 if (!hdev->init_done) {
522 dev_err(hdev->dev,
523 "Can't reset before initialization is done\n");
524 return 0;
525 }
526
527 /*
528 * Prevent concurrency in this function - only one reset should be
529 * done at any given time. Only need to perform this if we didn't
530 * get from the dedicated hard reset thread
531 */
532 if (!from_hard_reset_thread) {
533 /* Block future CS/VM/JOB completion operations */
534 rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
535 if (rc)
536 return 0;
537
538 /* This also blocks future CS/VM/JOB completion operations */
539 hdev->disabled = true;
540
541 /*
542 * Flush anyone that is inside the critical section of enqueue
543 * jobs to the H/W
544 */
545 hdev->asic_funcs->hw_queues_lock(hdev);
546 hdev->asic_funcs->hw_queues_unlock(hdev);
547
548 dev_err(hdev->dev, "Going to RESET device!\n");
549 }
550
551again:
552 if ((hard_reset) && (!from_hard_reset_thread)) {
553 struct hl_device_reset_work *device_reset_work;
554
555 if (!hdev->pdev) {
556 dev_err(hdev->dev,
557 "Reset action is NOT supported in simulator\n");
558 rc = -EINVAL;
559 goto out_err;
560 }
561
562 hdev->hard_reset_pending = true;
563
564 device_reset_work = kzalloc(sizeof(*device_reset_work),
565 GFP_ATOMIC);
566 if (!device_reset_work) {
567 rc = -ENOMEM;
568 goto out_err;
569 }
570
571 /*
572 * Because the reset function can't run from interrupt or
573 * from heartbeat work, we need to call the reset function
574 * from a dedicated work
575 */
576 INIT_WORK(&device_reset_work->reset_work,
577 hl_device_hard_reset_pending);
578 device_reset_work->hdev = hdev;
579 schedule_work(&device_reset_work->reset_work);
580
581 return 0;
582 }
583
584 if (hard_reset) {
585 device_late_fini(hdev);
586
587 /*
588 * Now that the heartbeat thread is closed, flush processes
589 * which are sending messages to CPU
590 */
591 mutex_lock(&hdev->send_cpu_message_lock);
592 mutex_unlock(&hdev->send_cpu_message_lock);
593 }
594
595 /*
596 * Halt the engines and disable interrupts so we won't get any more
597 * completions from H/W and we won't have any accesses from the
598 * H/W to the host machine
599 */
600 hdev->asic_funcs->halt_engines(hdev, hard_reset);
601
Oded Gabbayeff6f4a2019-02-16 00:39:21 +0200602 /* Go over all the queues, release all CS and their jobs */
603 hl_cs_rollback_all(hdev);
604
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200605 if (hard_reset) {
606 /* Release kernel context */
607 if (hl_ctx_put(hdev->kernel_ctx) != 1) {
608 dev_err(hdev->dev,
609 "kernel ctx is alive during hard reset\n");
610 rc = -EBUSY;
611 goto out_err;
612 }
613
614 hdev->kernel_ctx = NULL;
615 }
616
617 /* Reset the H/W. It will be in idle state after this returns */
618 hdev->asic_funcs->hw_fini(hdev, hard_reset);
619
Omer Shpigelman0feaf862019-02-16 00:39:22 +0200620 if (hard_reset) {
621 hl_vm_fini(hdev);
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200622 hl_eq_reset(hdev, &hdev->event_queue);
Omer Shpigelman0feaf862019-02-16 00:39:22 +0200623 }
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200624
625 /* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
626 hl_hw_queue_reset(hdev, hard_reset);
627 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
628 hl_cq_reset(hdev, &hdev->completion_queue[i]);
629
Oded Gabbayeff6f4a2019-02-16 00:39:21 +0200630 /* Make sure the setup phase for the user context will run again */
631 if (hdev->user_ctx) {
632 atomic_set(&hdev->user_ctx->thread_restore_token, 1);
633 hdev->user_ctx->thread_restore_wait_token = 0;
634 }
635
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200636 /* Finished tear-down, starting to re-initialize */
637
638 if (hard_reset) {
Oded Gabbaya28ce422019-02-28 10:46:12 +0200639 hdev->device_cpu_disabled = false;
640
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200641 /* Allocate the kernel context */
642 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
643 GFP_KERNEL);
644 if (!hdev->kernel_ctx) {
645 rc = -ENOMEM;
646 goto out_err;
647 }
648
649 hdev->user_ctx = NULL;
650
651 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
652 if (rc) {
653 dev_err(hdev->dev,
654 "failed to init kernel ctx in hard reset\n");
655 kfree(hdev->kernel_ctx);
656 hdev->kernel_ctx = NULL;
657 goto out_err;
658 }
659 }
660
661 rc = hdev->asic_funcs->hw_init(hdev);
662 if (rc) {
663 dev_err(hdev->dev,
664 "failed to initialize the H/W after reset\n");
665 goto out_err;
666 }
667
668 hdev->disabled = false;
669
670 /* Check that the communication with the device is working */
671 rc = hdev->asic_funcs->test_queues(hdev);
672 if (rc) {
673 dev_err(hdev->dev,
674 "Failed to detect if device is alive after reset\n");
675 goto out_err;
676 }
677
678 if (hard_reset) {
679 rc = device_late_init(hdev);
680 if (rc) {
681 dev_err(hdev->dev,
682 "Failed late init after hard reset\n");
683 goto out_err;
684 }
685
Omer Shpigelman0feaf862019-02-16 00:39:22 +0200686 rc = hl_vm_init(hdev);
687 if (rc) {
688 dev_err(hdev->dev,
689 "Failed to init memory module after hard reset\n");
690 goto out_err;
691 }
692
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200693 hl_set_max_power(hdev, hdev->max_power);
694
695 hdev->hard_reset_pending = false;
696 } else {
697 rc = hdev->asic_funcs->soft_reset_late_init(hdev);
698 if (rc) {
699 dev_err(hdev->dev,
700 "Failed late init after soft reset\n");
701 goto out_err;
702 }
703 }
704
705 atomic_set(&hdev->in_reset, 0);
706
707 if (hard_reset)
708 hdev->hard_reset_cnt++;
709 else
710 hdev->soft_reset_cnt++;
711
712 return 0;
713
714out_err:
715 hdev->disabled = true;
716
717 if (hard_reset) {
718 dev_err(hdev->dev,
719 "Failed to reset! Device is NOT usable\n");
720 hdev->hard_reset_cnt++;
721 } else {
722 dev_err(hdev->dev,
723 "Failed to do soft-reset, trying hard reset\n");
724 hdev->soft_reset_cnt++;
725 hard_reset = true;
726 goto again;
727 }
728
729 atomic_set(&hdev->in_reset, 0);
730
731 return rc;
732}
733
Oded Gabbayc4d66342019-02-16 00:39:11 +0200734/*
735 * hl_device_init - main initialization function for habanalabs device
736 *
737 * @hdev: pointer to habanalabs device structure
738 *
739 * Allocate an id for the device, do early initialization and then call the
740 * ASIC specific initialization functions. Finally, create the cdev and the
741 * Linux device to expose it to the user
742 */
743int hl_device_init(struct hl_device *hdev, struct class *hclass)
744{
Oded Gabbay9494a8d2019-02-16 00:39:17 +0200745 int i, rc, cq_ready_cnt;
Oded Gabbayc4d66342019-02-16 00:39:11 +0200746
747 /* Create device */
748 rc = device_setup_cdev(hdev, hclass, hdev->id, &hl_ops);
749
750 if (rc)
751 goto out_disabled;
752
753 /* Initialize ASIC function pointers and perform early init */
754 rc = device_early_init(hdev);
755 if (rc)
756 goto release_device;
757
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200758 /*
759 * Start calling ASIC initialization. First S/W then H/W and finally
760 * late init
761 */
762 rc = hdev->asic_funcs->sw_init(hdev);
763 if (rc)
764 goto early_fini;
765
Oded Gabbay9494a8d2019-02-16 00:39:17 +0200766 /*
767 * Initialize the H/W queues. Must be done before hw_init, because
768 * there the addresses of the kernel queue are being written to the
769 * registers of the device
770 */
771 rc = hl_hw_queues_create(hdev);
772 if (rc) {
773 dev_err(hdev->dev, "failed to initialize kernel queues\n");
774 goto sw_fini;
775 }
776
777 /*
778 * Initialize the completion queues. Must be done before hw_init,
779 * because there the addresses of the completion queues are being
780 * passed as arguments to request_irq
781 */
782 hdev->completion_queue =
783 kcalloc(hdev->asic_prop.completion_queues_count,
784 sizeof(*hdev->completion_queue), GFP_KERNEL);
785
786 if (!hdev->completion_queue) {
787 dev_err(hdev->dev, "failed to allocate completion queues\n");
788 rc = -ENOMEM;
789 goto hw_queues_destroy;
790 }
791
792 for (i = 0, cq_ready_cnt = 0;
793 i < hdev->asic_prop.completion_queues_count;
794 i++, cq_ready_cnt++) {
795 rc = hl_cq_init(hdev, &hdev->completion_queue[i], i);
796 if (rc) {
797 dev_err(hdev->dev,
798 "failed to initialize completion queue\n");
799 goto cq_fini;
800 }
801 }
802
Oded Gabbay1251f232019-02-16 00:39:18 +0200803 /*
804 * Initialize the event queue. Must be done before hw_init,
805 * because there the address of the event queue is being
806 * passed as argument to request_irq
807 */
808 rc = hl_eq_init(hdev, &hdev->event_queue);
809 if (rc) {
810 dev_err(hdev->dev, "failed to initialize event queue\n");
811 goto cq_fini;
812 }
813
Oded Gabbay0861e412019-02-16 00:39:14 +0200814 /* Allocate the kernel context */
815 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
816 if (!hdev->kernel_ctx) {
817 rc = -ENOMEM;
Oded Gabbay1251f232019-02-16 00:39:18 +0200818 goto eq_fini;
Oded Gabbay0861e412019-02-16 00:39:14 +0200819 }
820
821 hdev->user_ctx = NULL;
822
823 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
824 if (rc) {
825 dev_err(hdev->dev, "failed to initialize kernel context\n");
826 goto free_ctx;
827 }
828
Oded Gabbaybe5d9262019-02-16 00:39:15 +0200829 rc = hl_cb_pool_init(hdev);
830 if (rc) {
831 dev_err(hdev->dev, "failed to initialize CB pool\n");
832 goto release_ctx;
833 }
834
Oded Gabbayd91389b2019-02-16 00:39:19 +0200835 rc = hl_sysfs_init(hdev);
836 if (rc) {
837 dev_err(hdev->dev, "failed to initialize sysfs\n");
838 goto free_cb_pool;
839 }
840
Oded Gabbayc2164772019-02-16 00:39:24 +0200841 hl_debugfs_add_device(hdev);
842
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200843 if (hdev->asic_funcs->get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
844 dev_info(hdev->dev,
845 "H/W state is dirty, must reset before initializing\n");
846 hdev->asic_funcs->hw_fini(hdev, true);
847 }
848
Oded Gabbay839c4802019-02-16 00:39:16 +0200849 rc = hdev->asic_funcs->hw_init(hdev);
850 if (rc) {
851 dev_err(hdev->dev, "failed to initialize the H/W\n");
852 rc = 0;
853 goto out_disabled;
854 }
855
856 hdev->disabled = false;
857
Oded Gabbay9494a8d2019-02-16 00:39:17 +0200858 /* Check that the communication with the device is working */
859 rc = hdev->asic_funcs->test_queues(hdev);
860 if (rc) {
861 dev_err(hdev->dev, "Failed to detect if device is alive\n");
862 rc = 0;
863 goto out_disabled;
864 }
865
Oded Gabbayd91389b2019-02-16 00:39:19 +0200866 /* After test_queues, KMD can start sending messages to device CPU */
867
868 rc = device_late_init(hdev);
869 if (rc) {
870 dev_err(hdev->dev, "Failed late initialization\n");
871 rc = 0;
872 goto out_disabled;
873 }
874
875 dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
876 hdev->asic_name,
877 hdev->asic_prop.dram_size / 1024 / 1024 / 1024);
878
Omer Shpigelman0feaf862019-02-16 00:39:22 +0200879 rc = hl_vm_init(hdev);
880 if (rc) {
881 dev_err(hdev->dev, "Failed to initialize memory module\n");
882 rc = 0;
883 goto out_disabled;
884 }
885
Oded Gabbayd91389b2019-02-16 00:39:19 +0200886 /*
887 * hl_hwmon_init must be called after device_late_init, because only
888 * there we get the information from the device about which
889 * hwmon-related sensors the device supports
890 */
891 rc = hl_hwmon_init(hdev);
892 if (rc) {
893 dev_err(hdev->dev, "Failed to initialize hwmon\n");
894 rc = 0;
895 goto out_disabled;
896 }
897
Oded Gabbayc4d66342019-02-16 00:39:11 +0200898 dev_notice(hdev->dev,
899 "Successfully added device to habanalabs driver\n");
900
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200901 hdev->init_done = true;
902
Oded Gabbayc4d66342019-02-16 00:39:11 +0200903 return 0;
904
Oded Gabbayd91389b2019-02-16 00:39:19 +0200905free_cb_pool:
906 hl_cb_pool_fini(hdev);
Oded Gabbaybe5d9262019-02-16 00:39:15 +0200907release_ctx:
908 if (hl_ctx_put(hdev->kernel_ctx) != 1)
909 dev_err(hdev->dev,
910 "kernel ctx is still alive on initialization failure\n");
Oded Gabbay0861e412019-02-16 00:39:14 +0200911free_ctx:
912 kfree(hdev->kernel_ctx);
Oded Gabbay1251f232019-02-16 00:39:18 +0200913eq_fini:
914 hl_eq_fini(hdev, &hdev->event_queue);
Oded Gabbay9494a8d2019-02-16 00:39:17 +0200915cq_fini:
916 for (i = 0 ; i < cq_ready_cnt ; i++)
917 hl_cq_fini(hdev, &hdev->completion_queue[i]);
918 kfree(hdev->completion_queue);
919hw_queues_destroy:
920 hl_hw_queues_destroy(hdev);
Oded Gabbay0861e412019-02-16 00:39:14 +0200921sw_fini:
922 hdev->asic_funcs->sw_fini(hdev);
Oded Gabbay99b9d7b2019-02-16 00:39:13 +0200923early_fini:
924 device_early_fini(hdev);
Oded Gabbayc4d66342019-02-16 00:39:11 +0200925release_device:
926 device_destroy(hclass, hdev->dev->devt);
927 cdev_del(&hdev->cdev);
928out_disabled:
929 hdev->disabled = true;
930 if (hdev->pdev)
931 dev_err(&hdev->pdev->dev,
932 "Failed to initialize hl%d. Device is NOT usable !\n",
933 hdev->id);
934 else
935 pr_err("Failed to initialize hl%d. Device is NOT usable !\n",
936 hdev->id);
937
938 return rc;
939}
940
941/*
942 * hl_device_fini - main tear-down function for habanalabs device
943 *
944 * @hdev: pointer to habanalabs device structure
945 *
946 * Destroy the device, call ASIC fini functions and release the id
947 */
948void hl_device_fini(struct hl_device *hdev)
949{
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200950 int i, rc;
951 ktime_t timeout;
952
Oded Gabbayc4d66342019-02-16 00:39:11 +0200953 dev_info(hdev->dev, "Removing device\n");
954
Oded Gabbayf8c8c7d2019-02-16 00:39:20 +0200955 /*
956 * This function is competing with the reset function, so try to
957 * take the reset atomic and if we are already in middle of reset,
958 * wait until reset function is finished. Reset function is designed
959 * to always finish (could take up to a few seconds in worst case).
960 */
961
962 timeout = ktime_add_us(ktime_get(),
963 HL_PENDING_RESET_PER_SEC * 1000 * 1000 * 4);
964 rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
965 while (rc) {
966 usleep_range(50, 200);
967 rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
968 if (ktime_compare(ktime_get(), timeout) > 0) {
969 WARN(1, "Failed to remove device because reset function did not finish\n");
970 return;
971 }
972 };
973
Oded Gabbayc4d66342019-02-16 00:39:11 +0200974 /* Mark device as disabled */
975 hdev->disabled = true;
976
Oded Gabbayd91389b2019-02-16 00:39:19 +0200977 hl_hwmon_fini(hdev);
978
979 device_late_fini(hdev);
980
Oded Gabbayc2164772019-02-16 00:39:24 +0200981 hl_debugfs_remove_device(hdev);
982
Oded Gabbayd91389b2019-02-16 00:39:19 +0200983 hl_sysfs_fini(hdev);
984
Oded Gabbay1251f232019-02-16 00:39:18 +0200985 /*
986 * Halt the engines and disable interrupts so we won't get any more
987 * completions from H/W and we won't have any accesses from the
988 * H/W to the host machine
989 */
990 hdev->asic_funcs->halt_engines(hdev, true);
991
Oded Gabbayeff6f4a2019-02-16 00:39:21 +0200992 /* Go over all the queues, release all CS and their jobs */
993 hl_cs_rollback_all(hdev);
994
Oded Gabbaybe5d9262019-02-16 00:39:15 +0200995 hl_cb_pool_fini(hdev);
996
Oded Gabbay0861e412019-02-16 00:39:14 +0200997 /* Release kernel context */
998 if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
999 dev_err(hdev->dev, "kernel ctx is still alive\n");
1000
Oded Gabbay839c4802019-02-16 00:39:16 +02001001 /* Reset the H/W. It will be in idle state after this returns */
1002 hdev->asic_funcs->hw_fini(hdev, true);
1003
Omer Shpigelman0feaf862019-02-16 00:39:22 +02001004 hl_vm_fini(hdev);
1005
Oded Gabbay1251f232019-02-16 00:39:18 +02001006 hl_eq_fini(hdev, &hdev->event_queue);
1007
Oded Gabbay9494a8d2019-02-16 00:39:17 +02001008 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
1009 hl_cq_fini(hdev, &hdev->completion_queue[i]);
1010 kfree(hdev->completion_queue);
1011
1012 hl_hw_queues_destroy(hdev);
1013
Oded Gabbay99b9d7b2019-02-16 00:39:13 +02001014 /* Call ASIC S/W finalize function */
1015 hdev->asic_funcs->sw_fini(hdev);
1016
Oded Gabbayc4d66342019-02-16 00:39:11 +02001017 device_early_fini(hdev);
1018
1019 /* Hide device from user */
1020 device_destroy(hdev->dev->class, hdev->dev->devt);
1021 cdev_del(&hdev->cdev);
1022
1023 pr_info("removed device successfully\n");
1024}
1025
1026/*
1027 * hl_poll_timeout_memory - Periodically poll a host memory address
1028 * until it is not zero or a timeout occurs
1029 * @hdev: pointer to habanalabs device structure
1030 * @addr: Address to poll
1031 * @timeout_us: timeout in us
1032 * @val: Variable to read the value into
1033 *
1034 * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
1035 * case, the last read value at @addr is stored in @val. Must not
1036 * be called from atomic context if sleep_us or timeout_us are used.
1037 *
1038 * The function sleeps for 100us with timeout value of
1039 * timeout_us
1040 */
1041int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr,
1042 u32 timeout_us, u32 *val)
1043{
1044 /*
1045 * address in this function points always to a memory location in the
1046 * host's (server's) memory. That location is updated asynchronously
1047 * either by the direct access of the device or by another core
1048 */
1049 u32 *paddr = (u32 *) (uintptr_t) addr;
1050 ktime_t timeout = ktime_add_us(ktime_get(), timeout_us);
1051
1052 might_sleep();
1053
1054 for (;;) {
1055 /*
1056 * Flush CPU read/write buffers to make sure we read updates
1057 * done by other cores or by the device
1058 */
1059 mb();
1060 *val = *paddr;
1061 if (*val)
1062 break;
1063 if (ktime_compare(ktime_get(), timeout) > 0) {
1064 *val = *paddr;
1065 break;
1066 }
1067 usleep_range((100 >> 2) + 1, 100);
1068 }
1069
1070 return *val ? 0 : -ETIMEDOUT;
1071}
1072
1073/*
1074 * hl_poll_timeout_devicememory - Periodically poll a device memory address
1075 * until it is not zero or a timeout occurs
1076 * @hdev: pointer to habanalabs device structure
1077 * @addr: Device address to poll
1078 * @timeout_us: timeout in us
1079 * @val: Variable to read the value into
1080 *
1081 * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
1082 * case, the last read value at @addr is stored in @val. Must not
1083 * be called from atomic context if sleep_us or timeout_us are used.
1084 *
1085 * The function sleeps for 100us with timeout value of
1086 * timeout_us
1087 */
1088int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr,
1089 u32 timeout_us, u32 *val)
1090{
1091 ktime_t timeout = ktime_add_us(ktime_get(), timeout_us);
1092
1093 might_sleep();
1094
1095 for (;;) {
1096 *val = readl(addr);
1097 if (*val)
1098 break;
1099 if (ktime_compare(ktime_get(), timeout) > 0) {
1100 *val = readl(addr);
1101 break;
1102 }
1103 usleep_range((100 >> 2) + 1, 100);
1104 }
1105
1106 return *val ? 0 : -ETIMEDOUT;
1107}
Oded Gabbay99b9d7b2019-02-16 00:39:13 +02001108
1109/*
1110 * MMIO register access helper functions.
1111 */
1112
1113/*
1114 * hl_rreg - Read an MMIO register
1115 *
1116 * @hdev: pointer to habanalabs device structure
1117 * @reg: MMIO register offset (in bytes)
1118 *
1119 * Returns the value of the MMIO register we are asked to read
1120 *
1121 */
1122inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
1123{
1124 return readl(hdev->rmmio + reg);
1125}
1126
1127/*
1128 * hl_wreg - Write to an MMIO register
1129 *
1130 * @hdev: pointer to habanalabs device structure
1131 * @reg: MMIO register offset (in bytes)
1132 * @val: 32-bit value
1133 *
1134 * Writes the 32-bit value into the MMIO register
1135 *
1136 */
1137inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
1138{
1139 writel(val, hdev->rmmio + reg);
1140}