// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2019 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"

#include <linux/pci.h>
#include <linux/sched/signal.h>
#include <linux/hwmon.h>

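/*
 * hl_device_disabled_or_in_reset - check whether the device can be accessed
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Returns true if the device is disabled or if a reset is currently in
 * progress, false otherwise.
 */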
bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
{
	if ((hdev->disabled) || (atomic_read(&hdev->in_reset)))
		return true;
	else
		return false;
}

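/*
 * hpriv_release - release function for the file private data
 *
 * @ref: pointer to the kref field of the file private data
 *
 * Called by kref_put() when the last reference to the file private data is
 * dropped. Releases the per-process resources and allows a new user context
 * to open the device.
 */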
static void hpriv_release(struct kref *ref)
{
	struct hl_fpriv *hpriv;
	struct hl_device *hdev;

	hpriv = container_of(ref, struct hl_fpriv, refcount);

	hdev = hpriv->hdev;

	put_pid(hpriv->taskpid);

	mutex_destroy(&hpriv->restore_phase_mutex);

	kfree(hpriv);

	/* Now the FD is really closed */
	atomic_dec(&hdev->fd_open_cnt);

	/* This allows a new user context to open the device */
	hdev->user_ctx = NULL;
}

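/*
 * hl_hpriv_get - increment the reference count of the file private data
 *
 * @hpriv: pointer to the file private data structure
 */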
void hl_hpriv_get(struct hl_fpriv *hpriv)
{
	kref_get(&hpriv->refcount);
}

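/*
 * hl_hpriv_put - decrement the reference count of the file private data
 *
 * @hpriv: pointer to the file private data structure
 *
 * When the reference count reaches zero, hpriv_release() is called to free
 * the structure.
 */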
void hl_hpriv_put(struct hl_fpriv *hpriv)
{
	kref_put(&hpriv->refcount, hpriv_release);
}

/*
 * hl_device_release - release function for habanalabs device
 *
 * @inode: pointer to inode structure
 * @filp: pointer to file structure
 *
 * Called when a process closes a habanalabs device
 */
static int hl_device_release(struct inode *inode, struct file *filp)
{
	struct hl_fpriv *hpriv = filp->private_data;

	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);

	filp->private_data = NULL;

	hl_hpriv_put(hpriv);

	return 0;
}

/*
 * hl_mmap - mmap function for habanalabs device
 *
 * @filp: pointer to file structure
 * @vma: pointer to vm_area_struct of the process
 *
 * Called when a process does an mmap on a habanalabs device. Call the
 * device's mmap function at the end of the common code.
 */
static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct hl_fpriv *hpriv = filp->private_data;

	if ((vma->vm_pgoff & HL_MMAP_CB_MASK) == HL_MMAP_CB_MASK) {
		vma->vm_pgoff ^= HL_MMAP_CB_MASK;
		return hl_cb_mmap(hpriv, vma);
	}

	return hpriv->hdev->asic_funcs->mmap(hpriv, vma);
}

static const struct file_operations hl_ops = {
	.owner = THIS_MODULE,
	.open = hl_device_open,
	.release = hl_device_release,
	.mmap = hl_mmap,
	.unlocked_ioctl = hl_ioctl,
	.compat_ioctl = hl_ioctl
};

/*
 * device_setup_cdev - setup cdev and device for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 * @hclass: pointer to the class object of the device
 * @minor: minor number of the specific device
 * @fops: file operations to install for this device
 *
 * Create a cdev and a Linux device for the habanalabs device. Must be
 * called at the end of the habanalabs device initialization process,
 * because this function exposes the device to the user.
 */
static int device_setup_cdev(struct hl_device *hdev, struct class *hclass,
				int minor, const struct file_operations *fops)
{
	int err, devno = MKDEV(hdev->major, minor);
	struct cdev *hdev_cdev = &hdev->cdev;
	char *name;

	name = kasprintf(GFP_KERNEL, "hl%d", hdev->id);
	if (!name)
		return -ENOMEM;

	cdev_init(hdev_cdev, fops);
	hdev_cdev->owner = THIS_MODULE;
	err = cdev_add(hdev_cdev, devno, 1);
	if (err) {
		pr_err("Failed to add char device %s\n", name);
		goto err_cdev_add;
	}

	hdev->dev = device_create(hclass, NULL, devno, NULL, "%s", name);
	if (IS_ERR(hdev->dev)) {
		pr_err("Failed to create device %s\n", name);
		err = PTR_ERR(hdev->dev);
		goto err_device_create;
	}

	dev_set_drvdata(hdev->dev, hdev);

	kfree(name);

	return 0;

err_device_create:
	cdev_del(hdev_cdev);
err_cdev_add:
	kfree(name);
	return err;
}

/*
 * device_early_init - do some early initialization for the habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Install the relevant function pointers, call the ASIC's early_init function
 * and set up the early driver resources (ASID pool, workqueues, locks)
 */
static int device_early_init(struct hl_device *hdev)
{
	int rc;

	switch (hdev->asic_type) {
	case ASIC_GOYA:
		goya_set_asic_funcs(hdev);
		strlcpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
		break;
	default:
		dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
			hdev->asic_type);
		return -EINVAL;
	}

	rc = hdev->asic_funcs->early_init(hdev);
	if (rc)
		return rc;

	rc = hl_asid_init(hdev);
	if (rc)
		goto early_fini;

	hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0);
	if (hdev->cq_wq == NULL) {
		dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
		rc = -ENOMEM;
		goto asid_fini;
	}

	hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
	if (hdev->eq_wq == NULL) {
		dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
		rc = -ENOMEM;
		goto free_cq_wq;
	}

	hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
					GFP_KERNEL);
	if (!hdev->hl_chip_info) {
		rc = -ENOMEM;
		goto free_eq_wq;
	}

	hl_cb_mgr_init(&hdev->kernel_cb_mgr);

	mutex_init(&hdev->fd_open_cnt_lock);
	mutex_init(&hdev->send_cpu_message_lock);
	INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
	spin_lock_init(&hdev->hw_queues_mirror_lock);
	atomic_set(&hdev->in_reset, 0);
	atomic_set(&hdev->fd_open_cnt, 0);

	return 0;

free_eq_wq:
	destroy_workqueue(hdev->eq_wq);
free_cq_wq:
	destroy_workqueue(hdev->cq_wq);
asid_fini:
	hl_asid_fini(hdev);
early_fini:
	if (hdev->asic_funcs->early_fini)
		hdev->asic_funcs->early_fini(hdev);

	return rc;
}

/*
 * device_early_fini - finalize all that was done in device_early_init
 *
 * @hdev: pointer to habanalabs device structure
 *
 */
static void device_early_fini(struct hl_device *hdev)
{
	mutex_destroy(&hdev->send_cpu_message_lock);

	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);

	kfree(hdev->hl_chip_info);

	destroy_workqueue(hdev->eq_wq);
	destroy_workqueue(hdev->cq_wq);

	hl_asid_fini(hdev);

	if (hdev->asic_funcs->early_fini)
		hdev->asic_funcs->early_fini(hdev);

	mutex_destroy(&hdev->fd_open_cnt_lock);
}

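/*
 * set_freq_to_low_job - periodic work that lowers the device frequency
 *
 * @work: pointer to the delayed work structure
 *
 * If no user process has the device open, switch the PLL profile to low.
 * The work re-arms itself to run again after HL_PLL_LOW_JOB_FREQ_USEC.
 */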
static void set_freq_to_low_job(struct work_struct *work)
{
	struct hl_device *hdev = container_of(work, struct hl_device,
						work_freq.work);

	if (atomic_read(&hdev->fd_open_cnt) == 0)
		hl_device_set_frequency(hdev, PLL_LOW);

	schedule_delayed_work(&hdev->work_freq,
			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
}

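/*
 * hl_device_heartbeat - periodic work that checks the device is alive
 *
 * @work: pointer to the delayed work structure
 *
 * Send a heartbeat message to the device CPU. On failure, trigger a hard
 * reset; otherwise re-arm the work to run again after HL_HEARTBEAT_PER_USEC.
 */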
static void hl_device_heartbeat(struct work_struct *work)
{
	struct hl_device *hdev = container_of(work, struct hl_device,
						work_heartbeat.work);

	if (hl_device_disabled_or_in_reset(hdev))
		goto reschedule;

	if (!hdev->asic_funcs->send_heartbeat(hdev))
		goto reschedule;

	dev_err(hdev->dev, "Device heartbeat failed!\n");
	hl_device_reset(hdev, true, false);

	return;

reschedule:
	schedule_delayed_work(&hdev->work_heartbeat,
			usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
}

/*
 * device_late_init - do late initialization for the habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Do the initialization steps that either need the device H/W queues to be
 * active or need to happen after all the rest of the initialization is
 * finished
 */
static int device_late_init(struct hl_device *hdev)
{
	int rc;

	INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job);
	hdev->high_pll = hdev->asic_prop.high_pll;

	/* force setting to low frequency */
	atomic_set(&hdev->curr_pll_profile, PLL_LOW);

	if (hdev->pm_mng_profile == PM_AUTO)
		hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW);
	else
		hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST);

	if (hdev->asic_funcs->late_init) {
		rc = hdev->asic_funcs->late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"failed late initialization for the H/W\n");
			return rc;
		}
	}

	schedule_delayed_work(&hdev->work_freq,
			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));

	if (hdev->heartbeat) {
		INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
		schedule_delayed_work(&hdev->work_heartbeat,
				usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
	}

	hdev->late_init_done = true;

	return 0;
}

/*
 * device_late_fini - finalize all that was done in device_late_init
 *
 * @hdev: pointer to habanalabs device structure
 *
 */
static void device_late_fini(struct hl_device *hdev)
{
	if (!hdev->late_init_done)
		return;

	cancel_delayed_work_sync(&hdev->work_freq);
	if (hdev->heartbeat)
		cancel_delayed_work_sync(&hdev->work_heartbeat);

	if (hdev->asic_funcs->late_fini)
		hdev->asic_funcs->late_fini(hdev);

	hdev->late_init_done = false;
}

/*
 * hl_device_set_frequency - set the frequency of the device
 *
 * @hdev: pointer to habanalabs device structure
 * @freq: the new frequency value
 *
 * Change the frequency if needed. We allow setting the PLL to low only if
 * there is no open user process.
 * Returns 0 if no change was done, otherwise returns 1.
 */
int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq)
{
	enum hl_pll_frequency old_freq =
			(freq == PLL_HIGH) ? PLL_LOW : PLL_HIGH;
	int ret;

	if (hdev->pm_mng_profile == PM_MANUAL)
		return 0;

	ret = atomic_cmpxchg(&hdev->curr_pll_profile, old_freq, freq);
	if (ret == freq)
		return 0;

	/*
	 * In case we want to lower the frequency, check that the device is
	 * not open. We must check here to work around a race condition with
	 * hl_device_open
	 */
	if ((freq == PLL_LOW) && (atomic_read(&hdev->fd_open_cnt) > 0)) {
		atomic_set(&hdev->curr_pll_profile, PLL_HIGH);
		return 0;
	}

	dev_dbg(hdev->dev, "Changing device frequency to %s\n",
		freq == PLL_HIGH ? "high" : "low");

	hdev->asic_funcs->set_pll_profile(hdev, freq);

	return 1;
}

/*
 * hl_device_suspend - initiate device suspend
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Puts the hw in the suspend state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
int hl_device_suspend(struct hl_device *hdev)
{
	int rc;

	pci_save_state(hdev->pdev);

	rc = hdev->asic_funcs->suspend(hdev);
	if (rc)
		dev_err(hdev->dev,
			"Failed to disable PCI access of device CPU\n");

	/* Shut down the device */
	pci_disable_device(hdev->pdev);
	pci_set_power_state(hdev->pdev, PCI_D3hot);

	return 0;
}

/*
 * hl_device_resume - initiate device resume
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
int hl_device_resume(struct hl_device *hdev)
{
	int rc;

	pci_set_power_state(hdev->pdev, PCI_D0);
	pci_restore_state(hdev->pdev);
	rc = pci_enable_device(hdev->pdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to enable PCI device in resume\n");
		return rc;
	}

	rc = hdev->asic_funcs->resume(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to enable PCI access from device CPU\n");
		return rc;
	}

	return 0;
}

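/*
 * hl_device_hard_reset_pending - hard reset work for the dedicated thread
 *
 * @work: pointer to the work structure
 *
 * Wait (up to HL_PENDING_RESET_PER_SEC seconds) for user processes to close
 * the device, kill them if they don't, and then perform a hard reset.
 */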
static void hl_device_hard_reset_pending(struct work_struct *work)
{
	struct hl_device_reset_work *device_reset_work =
		container_of(work, struct hl_device_reset_work, reset_work);
	struct hl_device *hdev = device_reset_work->hdev;
	u16 pending_cnt = HL_PENDING_RESET_PER_SEC;
	struct task_struct *task = NULL;

	/* Flush all processes that are inside hl_open */
	mutex_lock(&hdev->fd_open_cnt_lock);

	while ((atomic_read(&hdev->fd_open_cnt)) && (pending_cnt)) {

		pending_cnt--;

		dev_info(hdev->dev,
			"Can't HARD reset, waiting for user to close FD\n");
		ssleep(1);
	}

	if (atomic_read(&hdev->fd_open_cnt)) {
		task = get_pid_task(hdev->user_ctx->hpriv->taskpid,
					PIDTYPE_PID);
		if (task) {
			dev_info(hdev->dev, "Killing user processes\n");
			send_sig(SIGKILL, task, 1);
			msleep(100);

			put_task_struct(task);
		}
	}

	mutex_unlock(&hdev->fd_open_cnt_lock);

	hl_device_reset(hdev, true, true);

	kfree(device_reset_work);
}

/*
 * hl_device_reset - reset the device
 *
 * @hdev: pointer to habanalabs device structure
 * @hard_reset: should we do hard reset to all engines or just reset the
 *              compute/dma engines
 * @from_hard_reset_thread: is the caller the dedicated hard reset work thread
 *
 * Block future CS and wait for pending CS to be enqueued
 * Call ASIC H/W fini
 * Flush all completions
 * Re-initialize all internal data structures
 * Call ASIC H/W init, late_init
 * Test queues
 * Enable device
 *
 * Returns 0 for success or an error on failure.
 */
int hl_device_reset(struct hl_device *hdev, bool hard_reset,
			bool from_hard_reset_thread)
{
	int i, rc;

	if (!hdev->init_done) {
		dev_err(hdev->dev,
			"Can't reset before initialization is done\n");
		return 0;
	}

	/*
	 * Prevent concurrency in this function - only one reset should be
	 * done at any given time. We only need to perform this if we didn't
	 * get here from the dedicated hard reset thread
	 */
	if (!from_hard_reset_thread) {
		/* Block future CS/VM/JOB completion operations */
		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
		if (rc)
			return 0;

		/* This also blocks future CS/VM/JOB completion operations */
		hdev->disabled = true;

		/*
		 * Flush anyone that is inside the critical section of
		 * enqueuing jobs to the H/W
		 */
		hdev->asic_funcs->hw_queues_lock(hdev);
		hdev->asic_funcs->hw_queues_unlock(hdev);

		dev_err(hdev->dev, "Going to RESET device!\n");
	}

again:
	if ((hard_reset) && (!from_hard_reset_thread)) {
		struct hl_device_reset_work *device_reset_work;

		if (!hdev->pdev) {
			dev_err(hdev->dev,
				"Reset action is NOT supported in simulator\n");
			rc = -EINVAL;
			goto out_err;
		}

		hdev->hard_reset_pending = true;

		device_reset_work = kzalloc(sizeof(*device_reset_work),
						GFP_ATOMIC);
		if (!device_reset_work) {
			rc = -ENOMEM;
			goto out_err;
		}

		/*
		 * Because the reset function can't run from interrupt or
		 * from heartbeat work, we need to call the reset function
		 * from a dedicated work
		 */
		INIT_WORK(&device_reset_work->reset_work,
				hl_device_hard_reset_pending);
		device_reset_work->hdev = hdev;
		schedule_work(&device_reset_work->reset_work);

		return 0;
	}

	if (hard_reset) {
		device_late_fini(hdev);

		/*
		 * Now that the heartbeat thread is closed, flush processes
		 * which are sending messages to CPU
		 */
		mutex_lock(&hdev->send_cpu_message_lock);
		mutex_unlock(&hdev->send_cpu_message_lock);
	}

	/*
	 * Halt the engines and disable interrupts so we won't get any more
	 * completions from H/W and we won't have any accesses from the
	 * H/W to the host machine
	 */
	hdev->asic_funcs->halt_engines(hdev, hard_reset);

	/* Go over all the queues, release all CS and their jobs */
	hl_cs_rollback_all(hdev);

	if (hard_reset) {
		/* Release kernel context */
		if (hl_ctx_put(hdev->kernel_ctx) != 1) {
			dev_err(hdev->dev,
				"kernel ctx is alive during hard reset\n");
			rc = -EBUSY;
			goto out_err;
		}

		hdev->kernel_ctx = NULL;
	}

	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, hard_reset);

	if (hard_reset) {
		hl_vm_fini(hdev);
		hl_eq_reset(hdev, &hdev->event_queue);
	}

	/* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
	hl_hw_queue_reset(hdev, hard_reset);
	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_reset(hdev, &hdev->completion_queue[i]);

	/* Make sure the setup phase for the user context will run again */
	if (hdev->user_ctx) {
		atomic_set(&hdev->user_ctx->thread_restore_token, 1);
		hdev->user_ctx->thread_restore_wait_token = 0;
	}

	/* Finished tear-down, starting to re-initialize */

	if (hard_reset) {
		/* Allocate the kernel context */
		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
						GFP_KERNEL);
		if (!hdev->kernel_ctx) {
			rc = -ENOMEM;
			goto out_err;
		}

		hdev->user_ctx = NULL;

		rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
		if (rc) {
			dev_err(hdev->dev,
				"failed to init kernel ctx in hard reset\n");
			kfree(hdev->kernel_ctx);
			hdev->kernel_ctx = NULL;
			goto out_err;
		}
	}

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"failed to initialize the H/W after reset\n");
		goto out_err;
	}

	hdev->disabled = false;

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to detect if device is alive after reset\n");
		goto out_err;
	}

	if (hard_reset) {
		rc = device_late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed late init after hard reset\n");
			goto out_err;
		}

		rc = hl_vm_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed to init memory module after hard reset\n");
			goto out_err;
		}

		hl_set_max_power(hdev, hdev->max_power);

		hdev->hard_reset_pending = false;
	} else {
		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed late init after soft reset\n");
			goto out_err;
		}
	}

	atomic_set(&hdev->in_reset, 0);

	if (hard_reset)
		hdev->hard_reset_cnt++;
	else
		hdev->soft_reset_cnt++;

	return 0;

out_err:
	hdev->disabled = true;

	if (hard_reset) {
		dev_err(hdev->dev,
			"Failed to reset! Device is NOT usable\n");
		hdev->hard_reset_cnt++;
	} else {
		dev_err(hdev->dev,
			"Failed to do soft-reset, trying hard reset\n");
		hdev->soft_reset_cnt++;
		hard_reset = true;
		goto again;
	}

	atomic_set(&hdev->in_reset, 0);

	return rc;
}

/*
 * hl_device_init - main initialization function for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 * @hclass: pointer to the class object of the device
 *
 * Create the cdev and the Linux device, do early initialization and then
 * call the ASIC specific initialization functions to bring the device to
 * an operational state
 */
int hl_device_init(struct hl_device *hdev, struct class *hclass)
{
	int i, rc, cq_ready_cnt;

	/* Create device */
	rc = device_setup_cdev(hdev, hclass, hdev->id, &hl_ops);

	if (rc)
		goto out_disabled;

	/* Initialize ASIC function pointers and perform early init */
	rc = device_early_init(hdev);
	if (rc)
		goto release_device;

	/*
	 * Start calling ASIC initialization. First S/W then H/W and finally
	 * late init
	 */
	rc = hdev->asic_funcs->sw_init(hdev);
	if (rc)
		goto early_fini;

	/*
	 * Initialize the H/W queues. Must be done before hw_init, because
	 * there the addresses of the kernel queue are being written to the
	 * registers of the device
	 */
	rc = hl_hw_queues_create(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize kernel queues\n");
		goto sw_fini;
	}

	/*
	 * Initialize the completion queues. Must be done before hw_init,
	 * because there the addresses of the completion queues are being
	 * passed as arguments to request_irq
	 */
	hdev->completion_queue =
			kcalloc(hdev->asic_prop.completion_queues_count,
				sizeof(*hdev->completion_queue), GFP_KERNEL);

	if (!hdev->completion_queue) {
		dev_err(hdev->dev, "failed to allocate completion queues\n");
		rc = -ENOMEM;
		goto hw_queues_destroy;
	}

	for (i = 0, cq_ready_cnt = 0;
			i < hdev->asic_prop.completion_queues_count;
			i++, cq_ready_cnt++) {
		rc = hl_cq_init(hdev, &hdev->completion_queue[i], i);
		if (rc) {
			dev_err(hdev->dev,
				"failed to initialize completion queue\n");
			goto cq_fini;
		}
	}

	/*
	 * Initialize the event queue. Must be done before hw_init,
	 * because there the address of the event queue is being
	 * passed as argument to request_irq
	 */
	rc = hl_eq_init(hdev, &hdev->event_queue);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize event queue\n");
		goto cq_fini;
	}

	/* Allocate the kernel context */
	hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
	if (!hdev->kernel_ctx) {
		rc = -ENOMEM;
		goto eq_fini;
	}

	hdev->user_ctx = NULL;

	rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize kernel context\n");
		goto free_ctx;
	}

	rc = hl_cb_pool_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize CB pool\n");
		goto release_ctx;
	}

	rc = hl_sysfs_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize sysfs\n");
		goto free_cb_pool;
	}

	if (hdev->asic_funcs->get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
		dev_info(hdev->dev,
			"H/W state is dirty, must reset before initializing\n");
		hdev->asic_funcs->hw_fini(hdev, true);
	}

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize the H/W\n");
		rc = 0;
		goto out_disabled;
	}

	hdev->disabled = false;

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to detect if device is alive\n");
		rc = 0;
		goto out_disabled;
	}

	/* After test_queues, KMD can start sending messages to device CPU */

	rc = device_late_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed late initialization\n");
		rc = 0;
		goto out_disabled;
	}

	dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
		hdev->asic_name,
		hdev->asic_prop.dram_size / 1024 / 1024 / 1024);

	rc = hl_vm_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize memory module\n");
		rc = 0;
		goto out_disabled;
	}

	/*
	 * hl_hwmon_init must be called after device_late_init, because only
	 * there we get the information from the device about which
	 * hwmon-related sensors the device supports
	 */
	rc = hl_hwmon_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize hwmon\n");
		rc = 0;
		goto out_disabled;
	}

	dev_notice(hdev->dev,
		"Successfully added device to habanalabs driver\n");

	hdev->init_done = true;

	return 0;

free_cb_pool:
	hl_cb_pool_fini(hdev);
release_ctx:
	if (hl_ctx_put(hdev->kernel_ctx) != 1)
		dev_err(hdev->dev,
			"kernel ctx is still alive on initialization failure\n");
free_ctx:
	kfree(hdev->kernel_ctx);
eq_fini:
	hl_eq_fini(hdev, &hdev->event_queue);
cq_fini:
	for (i = 0 ; i < cq_ready_cnt ; i++)
		hl_cq_fini(hdev, &hdev->completion_queue[i]);
	kfree(hdev->completion_queue);
hw_queues_destroy:
	hl_hw_queues_destroy(hdev);
sw_fini:
	hdev->asic_funcs->sw_fini(hdev);
early_fini:
	device_early_fini(hdev);
release_device:
	device_destroy(hclass, hdev->dev->devt);
	cdev_del(&hdev->cdev);
out_disabled:
	hdev->disabled = true;
	if (hdev->pdev)
		dev_err(&hdev->pdev->dev,
			"Failed to initialize hl%d. Device is NOT usable !\n",
			hdev->id);
	else
		pr_err("Failed to initialize hl%d. Device is NOT usable !\n",
			hdev->id);

	return rc;
}

/*
 * hl_device_fini - main tear-down function for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Destroy the device, call ASIC fini functions and release the id
 */
void hl_device_fini(struct hl_device *hdev)
{
	int i, rc;
	ktime_t timeout;

	dev_info(hdev->dev, "Removing device\n");

	/*
	 * This function competes with the reset function, so try to take the
	 * reset atomic and, if we are already in the middle of a reset, wait
	 * until the reset function is finished. The reset function is
	 * designed to always finish (it could take up to a few seconds in
	 * the worst case).
	 */

	timeout = ktime_add_us(ktime_get(),
				HL_PENDING_RESET_PER_SEC * 1000 * 1000 * 4);
	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
	while (rc) {
		usleep_range(50, 200);
		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
		if (ktime_compare(ktime_get(), timeout) > 0) {
			WARN(1, "Failed to remove device because reset function did not finish\n");
			return;
		}
	}

	/* Mark device as disabled */
	hdev->disabled = true;

	hl_hwmon_fini(hdev);

	device_late_fini(hdev);

	hl_sysfs_fini(hdev);

	/*
	 * Halt the engines and disable interrupts so we won't get any more
	 * completions from H/W and we won't have any accesses from the
	 * H/W to the host machine
	 */
	hdev->asic_funcs->halt_engines(hdev, true);

	/* Go over all the queues, release all CS and their jobs */
	hl_cs_rollback_all(hdev);

	hl_cb_pool_fini(hdev);

	/* Release kernel context */
	if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
		dev_err(hdev->dev, "kernel ctx is still alive\n");

	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, true);

	hl_vm_fini(hdev);

	hl_eq_fini(hdev, &hdev->event_queue);

	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_fini(hdev, &hdev->completion_queue[i]);
	kfree(hdev->completion_queue);

	hl_hw_queues_destroy(hdev);

	/* Call ASIC S/W finalize function */
	hdev->asic_funcs->sw_fini(hdev);

	device_early_fini(hdev);

	/* Hide device from user */
	device_destroy(hdev->dev->class, hdev->dev->devt);
	cdev_del(&hdev->cdev);

	pr_info("removed device successfully\n");
}

/*
 * hl_poll_timeout_memory - Periodically poll a host memory address
 *                          until it is not zero or a timeout occurs
 * @hdev: pointer to habanalabs device structure
 * @addr: Address to poll
 * @timeout_us: timeout in us
 * @val: Variable to read the value into
 *
 * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
 * case, the last read value at @addr is stored in @val. Must not
 * be called from atomic context.
 *
 * The function sleeps for up to 100us between reads, until the timeout
 * value of timeout_us expires.
 */
int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr,
				u32 timeout_us, u32 *val)
{
	/*
	 * The address in this function always points to a memory location in
	 * the host's (server's) memory. That location is updated
	 * asynchronously, either by direct access of the device or by
	 * another core
	 */
	u32 *paddr = (u32 *) (uintptr_t) addr;
	ktime_t timeout = ktime_add_us(ktime_get(), timeout_us);

	might_sleep();

	for (;;) {
		/*
		 * Flush CPU read/write buffers to make sure we read updates
		 * done by other cores or by the device
		 */
		mb();
		*val = *paddr;
		if (*val)
			break;
		if (ktime_compare(ktime_get(), timeout) > 0) {
			*val = *paddr;
			break;
		}
		usleep_range((100 >> 2) + 1, 100);
	}

	return *val ? 0 : -ETIMEDOUT;
}
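
/*
 * Illustrative usage only (hypothetical caller, fence_ptr variable and
 * timeout): poll a completion flag that the device writes into host memory,
 * waiting up to 100 milliseconds:
 *
 *	u32 status;
 *	int rc;
 *
 *	rc = hl_poll_timeout_memory(hdev, (u64) (uintptr_t) fence_ptr,
 *					100000, &status);
 *	if (rc == -ETIMEDOUT)
 *		dev_err(hdev->dev, "timeout waiting for fence\n");
 */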

/*
 * hl_poll_timeout_device_memory - Periodically poll a device memory address
 *                                 until it is not zero or a timeout occurs
 * @hdev: pointer to habanalabs device structure
 * @addr: Device address to poll
 * @timeout_us: timeout in us
 * @val: Variable to read the value into
 *
 * Returns 0 on success and -ETIMEDOUT upon a timeout. In either
 * case, the last read value at @addr is stored in @val. Must not
 * be called from atomic context.
 *
 * The function sleeps for up to 100us between reads, until the timeout
 * value of timeout_us expires.
 */
int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr,
					u32 timeout_us, u32 *val)
{
	ktime_t timeout = ktime_add_us(ktime_get(), timeout_us);

	might_sleep();

	for (;;) {
		*val = readl(addr);
		if (*val)
			break;
		if (ktime_compare(ktime_get(), timeout) > 0) {
			*val = readl(addr);
			break;
		}
		usleep_range((100 >> 2) + 1, 100);
	}

	return *val ? 0 : -ETIMEDOUT;
}

/*
 * MMIO register access helper functions.
 */

/*
 * hl_rreg - Read an MMIO register
 *
 * @hdev: pointer to habanalabs device structure
 * @reg: MMIO register offset (in bytes)
 *
 * Returns the value of the MMIO register we are asked to read
 *
 */
inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
{
	return readl(hdev->rmmio + reg);
}

/*
 * hl_wreg - Write to an MMIO register
 *
 * @hdev: pointer to habanalabs device structure
 * @reg: MMIO register offset (in bytes)
 * @val: 32-bit value
 *
 * Writes the 32-bit value into the MMIO register
 *
 */
inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
{
	writel(val, hdev->rmmio + reg);
}